Demo entry 6717084

python

   

Submitted by anonymous on Feb 23, 2018 at 01:35
Language: Python. Code size: 7.0 kB.

# This code includes ID3 and C4.5 algorithm 
# set the modelName to C4.5
# modelName selects the split criterion used during tree construction:
# 'C4.5' -> GainRatio, 'ID3' (or any other value) -> Information Gain.
modelName = 'C4.5' 
# number of feature columns in train.txt / test.txt
# (row layout: 3 leading columns, then nFeature feature columns, then the label)
nFeature = 3 

print '**********'

# human-readable feature names, filled from the train.txt header row below
featureNames = []

# read in training dataset 
# Each instance is stored as [featureValues (list of str), label (str)].
# `with` guarantees the file handle is closed even if a row fails to parse
# (the original fr.close() was skipped on any exception).
trainingInstances = []
with open('train.txt','rb') as fr:
    # header row: feature names live in columns 3 .. 3+nFeature-1
    arr = fr.readline().strip('\r\n').split('\t')
    for i in range(nFeature):
        featureNames.append(arr[i+3])
    # remaining rows: slice out the feature values and the label column
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        featureValues = arr[3:nFeature+3]
        label = arr[nFeature+3]
        trainingInstances.append([featureValues,label])

# show the data format that is using later
# (1-based row index followed by [featureValues, label])
print 'Training data:'
# nTrainingInstance is reused later to build the root instance-ID set and to
# number the test instances in the evaluation printout.
nTrainingInstance = len(trainingInstances)
for i in range(nTrainingInstance):
    print i+1,trainingInstances[i]

print '**********'

import math

# Entropy Function
def Entropy(ns):
    """Shannon entropy (base 2) of a distribution given as raw counts."""
    total = sum(ns)
    result = 0.0
    for count in ns:
        p = 1.0 * count / total
        result -= p * math.log(p, 2)
    return result

# Information Gain --> ID3 choose attribute with maximum Information Gain for each level
def IG(instanceIDs,featureID,instances=None):
    """Information gain H(Y) - H(Y|X) of feature `featureID` over `instanceIDs`.

    instances defaults to the module-level trainingInstances; passing an
    explicit list of [featureValues, label] pairs makes the function usable
    on any dataset (backward-compatible generalization).
    """
    if instances is None:
        instances = trainingInstances
    label2count = {}
    value2label2count = {}
    for instanceID in instanceIDs:
        featureValues,label = instances[instanceID]
        label2count[label] = label2count.get(label,0) + 1
        featureValue = featureValues[featureID]
        # per-feature-value label counts (for the conditional entropy)
        perValueCounts = value2label2count.setdefault(featureValue,{})
        perValueCounts[label] = perValueCounts.get(label,0) + 1
    N = len(instanceIDs)
    # H(Y): entropy of the label distribution at this node
    H_Y = Entropy(list(label2count.values()))
    # H(Y|X): expected entropy after splitting on the feature, weighted by
    # the fraction of instances that take each value
    H_Y_X = 0.0
    for perValueCounts in value2label2count.values():
        ns = list(perValueCounts.values())
        H_Y_X += 1.0*sum(ns)/N*Entropy(ns)
    return H_Y-H_Y_X

# Gain Ratio --> C4.5 choose attribute with maximum Gain Ratio for each level
def GainRatio(instanceIDs,featureID):
    """C4.5 criterion: information gain normalized by the split information."""
    gain = IG(instanceIDs,featureID)
    # zero gain also means zero split benefit; avoids a possible 0/0 below
    if gain == 0: return gain
    # count how many instances take each value of the feature
    value2count = {}
    for instanceID in instanceIDs:
        featureValue = trainingInstances[instanceID][0][featureID]
        value2count[featureValue] = value2count.get(featureValue,0) + 1
    # split information = entropy of the feature-value distribution itself
    splitInfo = Entropy(list(value2count.values()))
    return gain/splitInfo

# instanceIDs,featureIDs,level,subTree,bestFeatureID,majorityLabel,parentMajorityLabel
def ConstructDecisionTree(tree):
    """Recursively grow the decision tree in place.

    `tree` is a 7-slot list:
      [0] instanceIDs          set of indices into trainingInstances at this node
      [1] featureIDs           set of feature indices still available to split on
      [2] level                depth of this node (root = 0)
      [3] subTree              dict: feature value -> child tree (same 7-slot shape)
      [4] bestFeatureID        filled in here; stays None for leaf nodes
      [5] majorityLabel        filled in here
      [6] parentMajorityLabel  tie-breaker inherited from the parent node
    """

    instanceIDs,featureIDs,level,_,_,_,parentMajorityLabel = tree

    # count the labels among this node's instances
    label2count = {}
    for instanceID in instanceIDs:
        label = trainingInstances[instanceID][1]
        if not label in label2count:
            label2count[label] = 0
        label2count[label] += 1
    # sort labels by descending count; on an exact tie between the top two,
    # fall back to the parent's majority label
    label_count = sorted(label2count.items(),key=lambda x:-x[1])
    majorityLabel = label_count[0][0]
    if len(label_count) > 1 and label_count[0][1] == label_count[1][1]:
        majorityLabel = parentMajorityLabel
    tree[5] = majorityLabel
    print '',majorityLabel,label_count
    # stop: node is pure, or there are no features left to split on
    if len(label_count) == 1: return
    if len(featureIDs) == 0: return

    # score every remaining feature with the configured criterion
    featureID2criterionScore = {}
    for featureID in featureIDs:
        if modelName == 'ID3':
            criterionScore = IG(instanceIDs,featureID)
        elif modelName == 'C4.5':
            criterionScore = GainRatio(instanceIDs,featureID)            
        else:
            # unrecognized model name: default to Information Gain
            criterionScore = IG(instanceIDs,featureID)
        featureID2criterionScore[featureID] = criterionScore
    featureID_criterionScore = sorted(featureID2criterionScore.items(),key=lambda x:-x[1])
    bestFeatureID,bestCriterionScore = featureID_criterionScore[0][0],featureID_criterionScore[0][1]
    # stop: no feature provides any information at this node
    if bestCriterionScore == 0.0: return
    tree[4] = bestFeatureID

    # children may split on every feature except the one consumed here
    subFeatureIDs = set()
    for featureID in featureIDs:
        if featureID == bestFeatureID: continue
        subFeatureIDs.add(featureID)
    level += 1
    # partition this node's instances by their value of the chosen feature
    value2subInstanceIDs = {}
    for instanceID in instanceIDs:
        value = trainingInstances[instanceID][0][bestFeatureID]
        if not value in value2subInstanceIDs:
            value2subInstanceIDs[value] = set()
        value2subInstanceIDs[value].add(instanceID)
    # create one child per observed value and recurse into it
    for [value,subInstanceIDs] in value2subInstanceIDs.items():
        tree[3][value] = [subInstanceIDs,subFeatureIDs,level,{},None,None,majorityLabel]
        print level,featureNames[bestFeatureID],bestCriterionScore,value
        ConstructDecisionTree(tree[3][value])

print 'Construction:'
# root node: all training instances, all features, depth 0, no parent label
tree = [set(range(nTrainingInstance)),set(range(nFeature)),0,{},None,None,None]
print 0
ConstructDecisionTree(tree)

print '**********'

# traverse constructed tree in top to bottom format
def Traverse(tree,_tab):
    instanceIDs,featureIDs,level,subTree,bestFeatureID,majorityLabel,parentMajorityLabel = tree
    print _tab,level,'(',majorityLabel,')'
    if tree[4] == None: return
    _tab += '  '
    print _tab,featureNames[bestFeatureID]
    _tab += '  '    
    for [value,branch] in sorted(subTree.items(),key=lambda x:x[0]):
        print _tab,value
        Traverse(branch,_tab)

print 'Traverse:'
# pretty-print the finished tree, starting with no indentation
Traverse(tree,'')

print '**********'

# Evaluation Process start
def Predict(tree,featureValues):
    """Classify one instance by walking the tree from `tree` downward.

    Returns the majority label of the deepest node reached: either a leaf,
    or the node where featureValues takes a value never seen in training.
    """
    majorityLabel = tree[5]
    bestFeatureID = tree[4]
    # identity check against None is the correct idiom (was `== None`)
    if bestFeatureID is None: return majorityLabel
    # follow the branch for this instance's value of the splitting feature;
    # subtree values are always 7-slot lists, so None means "value unseen"
    branch = tree[3].get(featureValues[bestFeatureID])
    if branch is None: return majorityLabel
    return Predict(branch,featureValues)

# read in test dataset (same layout as train.txt: 3 leading columns, then
# nFeature feature columns, then the label column).
# `with` guarantees the handle is closed even if a row fails to parse.
testInstances = []
with open('test.txt','rb') as fr:
    fr.readline()  # skip the header row
    for line in fr:
        arr = line.strip('\r\n').split('\t')
        featureValues = arr[3:nFeature+3]
        label = arr[nFeature+3]
        testInstances.append([featureValues,label])

# confusion-matrix counters; 'Win' is treated as the positive class
TP,FP,TN,FN = 0,0,0,0

print 'Test data:'
nTestInstance = len(testInstances)
for i in range(nTestInstance):
    featureValues = testInstances[i][0]
    truthLabel = testInstances[i][1]
    predictLabel = Predict(tree,featureValues)
    # continue numbering after the training instances
    print i+1+nTrainingInstance,featureValues,'ground-truth:',truthLabel,'predict:',predictLabel
    # NOTE(review): label strings 'Win'/'Lose' are hard-coded here; instances
    # with any other label are silently left out of all four counters.
    if truthLabel == 'Win' and predictLabel == 'Win': TP += 1
    if truthLabel == 'Lose' and predictLabel == 'Win': FP += 1
    if truthLabel == 'Win' and predictLabel == 'Lose': FN += 1
    if truthLabel == 'Lose' and predictLabel == 'Lose': TN += 1

print '**********'

print 'Confusion matrix:'
print '---------------'
print '| TP',TP,'|','FN',FN,'|'
print '| FP',FP,'|','TN',TN,'|'
print '---------------'

accuracy = 1.0*(TP+TN)/nTestInstance
precision = 0.0
if TP+FP > 0: precision = 1.0*TP/(TP+FP)
recall = 0.0
if TP+FN > 0: recall = 1.0*TP/(TP+FN)
f1 = 0.0
if precision > 0.0 and recall > 0.0: f1 = 2*precision*recall/(precision+recall)

print 'Accuracy:',accuracy
print 'Precision:',precision
print 'Recall:',recall
print 'F1:',f1

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).