Demo entry 6339814

py

   

Submitted by anonymous on Dec 26, 2016 at 03:45
Language: Python. Code size: 6.5 kB.

#-*- coding:utf-8 -*-

from __future__ import division 
import csv
import random
import numpy as np

# Normal (Gaussian) distribution probability density.
def gauss(eta, sigma):
    """Return a function computing the N(eta, sigma**2) pdf at x.

    Args:
        eta: mean of the distribution.
        sigma: standard deviation of the distribution.

    Returns:
        gaussX(x) -> float, the normal density at x.

    Fix: the exponent previously divided by sigma**2 instead of
    2*sigma**2, so the returned values were not the normal pdf.
    """
    def gaussX(x):
        return (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(x - eta) ** 2 / (2 * sigma ** 2))
    return gaussX

# Round a number to a fixed decimal precision.
def numfmt(num, n=4):
    """Return *num* rounded to *n* decimal places (default: 4)."""
    rounded = round(num, n)
    return rounded

# Randomly split a data set into train and test subsets.
def split_set(total_set, split_rate=0.8):
    """Shuffle indices and split *total_set* into (train_set, test_set).

    Args:
        total_set: sequence of samples.
        split_rate: fraction of samples (by count, rounded) for training.

    Returns:
        (train_set, test_set) tuple of lists.

    Fixes:
        * range() returns a lazy object in Python 3, which
          random.shuffle cannot shuffle in place -- materialize a list.
        * the old `i <= size` comparison put size+1 samples in the
          train set (off-by-one); slicing at `size` is exact.
    """
    size = int(round(len(total_set) * split_rate))
    indices = list(range(len(total_set)))
    random.shuffle(indices)
    train_set = [total_set[j] for j in indices[:size]]
    test_set = [total_set[j] for j in indices[size:]]
    return train_set, test_set

# Naive Bayes classifier -- continuous features (Gaussian likelihoods)
class NaiveBayes(object):
    """Gaussian Naive Bayes for 3-class data.

    Expects rows whose first element is the class label (1, 2 or 3)
    and whose remaining elements are continuous feature values.
    """

    def __init__(self):
        # label: feature names (index 0 is the class column).
        self.label = list()
        # data: training rows, each [class, feature1, feature2, ...].
        self.data = list()
        # argdict: learned priors and per-class Gaussian parameters.
        self.argdict = dict()

    def set_label(self, labels):
        self.label = labels

    def set_data(self, datas):
        self.data = datas

    # Compute the mean and spread of every feature column for one class.
    def __calcu_klass(self, klasses):
        """Return one dict per feature column (1..n) with keys:

        'eta'   -- column mean (rounded via numfmt),
        'sigma' -- np.cov of the column; for a 1-D sequence this is the
                   sample VARIANCE as a 0-d array, unwrapped to a float
                   by .tolist().  NOTE(review): gauss() appears to treat
                   its second argument as a standard deviation, so this
                   looks like a variance/std-dev mix-up -- confirm.
        'gauss' -- the pdf frozen at (eta, sigma).
        """
        kls1 = []
        for k1 in range(1, len(klasses[0])):
                # Values of column k1 across every row of this class.
                types = [klasses[i][k1] for i in range(len(klasses))]
                eta = numfmt(np.mean(types))
                sigma = numfmt(np.cov(types).tolist())
                kls1.append({'eta': eta, 'sigma': sigma, 'gauss': gauss(eta, sigma)})
        return kls1

    def train(self):
        """Estimate class priors and per-class Gaussian parameters."""
        # Partition the training rows by class label (classes 1-3 are
        # hard-coded to match the wine data set).
        klass1 = [elem for elem in self.data if elem[0]==1]
        klass2 = [elem for elem in self.data if elem[0]==2]
        klass3 = [elem for elem in self.data if elem[0]==3]

        # Prior probability of each class.
        total = len(klass1) + len(klass2) + len(klass3)
        self.argdict['p1'] = len(klass1)/total
        self.argdict['p2'] = len(klass2)/total
        self.argdict['p3'] = len(klass3)/total

        # Per-class mean/spread/pdf for every feature column.
        self.argdict['klass1'] = self.__calcu_klass(klass1)
        self.argdict['klass2'] = self.__calcu_klass(klass2)
        self.argdict['klass3'] = self.__calcu_klass(klass3)

    def classify(self,data_test):
        """Score data_test against all three classes in log space and
        return True if the arg-max class equals data_test[0].

        The 0.0001 added to each likelihood guards against log(0) for
        feature values far from the class mean.
        """
        p1 = np.log(self.argdict['p1'])
        p2 = np.log(self.argdict['p2'])
        p3 = np.log(self.argdict['p3'])
        for i in range(1,len(data_test)):
                p3 = p3 + np.log(0.0001 + self.argdict['klass3'][i-1]['gauss'](data_test[i]))
                p2 = p2 + np.log(0.0001 + self.argdict['klass2'][i-1]['gauss'](data_test[i]))
                p1 = p1 + np.log(0.0001 + self.argdict['klass1'][i-1]['gauss'](data_test[i]))
        ps = [p1, p2, p3]
        # arg-max index + 1 maps back to the 1-based class label.
        return (ps.index(max(ps))+1) == data_test[0]

# Naive Bayes classifier -- discrete (categorical) features
class NaiveBayes2(object):
    """Naive Bayes for categorical data.

    Expects rows whose LAST element is the class label and whose other
    elements are categorical feature values.

    Fixes for Python 3: dict.has_key() was removed (replaced with
    dict.get()), and dict.items() no longer returns an indexable list
    (the manual arg-max scan is replaced with max()).
    """

    def __init__(self):
        self.label = list()    # feature names, one per attribute column
        self.data = list()     # training rows; elem[-1] is the class label
        self.klasses = list()  # the set of possible class labels
        self.argdict = dict()  # learned parameters, filled by train()

    def set_label(self, labels):
        self.label = labels

    def set_data(self, datas):
        self.data = datas

    def set_klass(self, klasses):
        self.klasses = klasses

    def train(self):
        """Estimate the prior ('rate') and per-feature value frequencies
        for every class in self.klasses."""
        args = {}
        total = len(self.data)
        for klass in self.klasses:
            # Rows belonging to this class.
            kdata = [elem for elem in self.data if elem[-1]==klass]
            klass_args = {}
            klass_args['rate'] = len(kdata)/total  # prior P(class)
            for i in range(len(self.label)):
                kds = [kd[i] for kd in kdata]
                kd_total = len(kds)
                # P(value | class) for every value observed in column i.
                klass_args[self.label[i]] = {
                    ktype: kds.count(ktype)/kd_total for ktype in set(kds)
                }
            args[klass] = klass_args
        self.argdict = args

    def classify(self, data_test):
        """Return True if the max-probability class equals data_test[-1].

        A feature value never observed for a class contributes
        probability 0 (no smoothing), matching the original behaviour.
        """
        ps = {}
        for klass in self.klasses:
            arg_klass = self.argdict[klass]
            p_klass = arg_klass['rate']
            for i in range(len(self.label)):
                # .get() replaces the removed has_key(); default 0 keeps
                # the "unseen value -> zero probability" behaviour.
                p_klass *= arg_klass[self.label[i]].get(data_test[i], 0)
            ps[klass] = p_klass
        # First class (in self.klasses order) with the maximal score,
        # matching the original strict-greater scan.
        best = max(self.klasses, key=lambda k: ps[k])
        return best == data_test[-1]

# Continuous-variable demo
# http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
# wine.data.txt
def test1(split_rate=0.8):
    """Train and evaluate the Gaussian Naive Bayes on the wine data.

    Reads 'wine.data.txt' (CSV, class label in column 0), splits it
    into train/test sets, trains NaiveBayes, prints and returns the
    test accuracy.

    Fix: the file was opened in binary mode ('rb'), which breaks csv
    under Python 3 (csv wants text mode with newline=''), and it was
    never closed -- a context manager handles both.
    """
    with open('wine.data.txt', newline='') as f:
        datas = [[float(x) for x in line] for line in csv.reader(f)]
    train_set, test_set = split_set(datas, split_rate)
    label_str = 'klass,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline'
    bayes = NaiveBayes()
    bayes.set_label(label_str.split(','))
    bayes.set_data(train_set)
    bayes.train()
    total = 0
    succ = 0
    print('======连续变量======')
    for test in test_set:
        if bayes.classify(test):
            succ += 1
        total += 1
    print('测试样本数 = %d' % total)
    print('分类正确数 = %d' % succ)
    print('分类准确率 = %.2f%%' % (100*succ/total))
    return succ/total

# Discrete-variable demo
# http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data
# car.data.txt
def test2(split_rate=0.8):
    """Train and evaluate the discrete Naive Bayes on the car data.

    Reads 'car.data.txt' (CSV, class label in the last column), splits
    it into train/test sets, trains NaiveBayes2, prints and returns the
    test accuracy.

    Fixes: the file was opened in binary mode ('rb'), which breaks csv
    under Python 3, and was never closed (context manager now); a stray
    classify() call whose result was discarded has been removed.
    """
    with open('car.data.txt', newline='') as f:
        datas = list(csv.reader(f))
    train_set, test_set = split_set(datas, split_rate)
    # NOTE(review): 'presons' looks like a typo for 'persons', but the
    # name is only used as an internal dict key, so it is kept as-is.
    label_str = 'buying,maint,doors,presons,lug_boot,safety'
    bayes2 = NaiveBayes2()
    bayes2.set_label(label_str.split(','))
    bayes2.set_data(train_set)
    bayes2.set_klass('unacc,acc,good,vgood'.split(','))
    bayes2.train()
    total = 0
    succ = 0
    print('======离散变量======')
    for test in test_set:
        if bayes2.classify(test):
            succ += 1
        total += 1
    print('测试样本数 = %d' % total)
    print('分类正确数 = %d' % succ)
    print('分类准确率 = %.2f%%' % (100*succ/total))
    return succ/total

# Script entry point: run both demos -- Gaussian Naive Bayes on the
# wine data set, then discrete Naive Bayes on the car data set.
if __name__ == '__main__':
    test1()
    test2()

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).