Demo entry 6627008

123

   

Submitted by anonymous on Jun 27, 2017 at 16:45
Language: Python 3. Code size: 6.9 kB.

import os
import pandas as pd
import numpy as np
import time
import jieba
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
import string 

# Input variables
read = 'CAC'   # dataset to load: 'CAC' or 'Online'
classifier = 'Naive' # model to train: 'Naive', 'SVC', 'DT' or 'knn'
Tier = 'QFS_Tier1'  # label column: 'QFS_Tier1' or 'QFS_Tier3'
feature = 'tfv' # text vectorizer: 'cv', 'tfv' or 'hv'

# t1 is reset after each stage to time it; t2 times the whole run.
t1 = time.time()
t2 = time.time()


def _load_labelled_rows(csv_path, label_column):
    """Read a UTF-8 CSV and keep only the rows that carry a label.

    Args:
        csv_path: path of the CSV file to read.
        label_column: name of the column holding the class label.

    Returns:
        A DataFrame in which every NaN has been replaced by the string
        'Error' and rows whose label_column equals 'Error' are dropped.
    """
    # Plain 'r' already gives universal newlines in Python 3; the old 'rU'
    # mode is rejected by Python 3.11+. The with-block closes the handle.
    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        # NOTE(review): newer pandas releases spell this option
        # on_bad_lines='skip'; switch once the pinned pandas version allows.
        frame = pd.read_csv(csv_file, error_bad_lines=False)
    frame = frame.replace(np.nan, 'Error', regex=True)
    return frame.loc[(frame[label_column] != 'Error')]


if read == 'CAC':
    data_read = _load_labelled_rows('CAC Mastersheet LV2.csv', Tier)
    data_read = data_read[:1000]  # only the first 1000 labelled rows are used
    X_read = data_read['Description'].tolist()
    y_read = data_read[Tier].tolist()
elif read == 'Online':
    data_read = _load_labelled_rows('MB Online master sheet LV2.csv', Tier)
    # The text of an online record is its description followed by its title.
    X_read = (data_read['Description'] + data_read['Title']).tolist()
    y_read = data_read[Tier].tolist()
else:
    # Fail here instead of with a NameError on X_read further down.
    raise ValueError("read must be 'CAC' or 'Online', got %r" % (read,))


### Added content
# Domain terms registered with jieba through jieba.add_word.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which silently fuses two terms into one.
add_words = ['跳档','跳挡','挂档','挂挡','倒档','倒挡','异响','卡顿','故障',
                'C档','S档','D档','c档','s档','d档',
                'C挡','S挡','D挡','c挡','s挡','d挡','跳枪','跳抢','碳罐','炭罐','助力泵',
                '故障灯']
for word in add_words:
    jieba.add_word(word)


# Maps a canonical token (key) to the spellings that cut_words_shouci
# rewrites to it. The '/' entry turns the listed punctuation and the space
# into the separator on which cut_words_shouci later splits the text.
# Every entry needs its own comma: adjacent string literals are concatenated.
# NOTE(review): ',', '!' and ':' each appeared twice in this list; presumably
# one copy of each was the full-width form lost in transcoding, so both the
# ASCII and the full-width forms are listed here - confirm.
unification_list = {'/':[',',',','!','!','。','.',' ',':',':'],
                    '变速箱':['跳档','跳挡','挂档','挂挡','倒档','倒挡','C档','S档','D档','c档','s档','d档',
                        'C挡','S挡','D挡','c挡','s挡','d挡',],
                    '碳罐':['跳枪','跳抢','炭罐']
                     }
def cut_words(test_contents):
    """Segment each text with jieba and join its tokens with single spaces.

    Args:
        test_contents: iterable of texts to segment.

    Returns:
        A list with one string per input text, in input order. A text that
        cannot be segmented yields the placeholder string "Error".
    """
    target = []
    for content in test_contents:
        try:
            tokens = " ".join(jieba.cut(content))
        except Exception:
            # Best effort per item: one bad value must not abort the batch.
            # The placeholder is a plain string (not a list) so that every
            # element of the result has the same type, matching the fallback
            # used by cut_words_shouci.
            tokens = "Error"
        target.append(tokens)
    return target

def cut_words_shouci(test_contents):
    """Segment texts, keeping only the part from the last '首次' clause on.

    For every text, each spelling listed in unification_list is replaced by
    its key, the text is split on '/', and every clause is segmented with
    jieba. The clauses from the last one whose tokens include '首次' through
    the final clause are joined with spaces; when no clause contains that
    token, all clauses are kept.

    Args:
        test_contents: iterable of texts to process.

    Returns:
        A list with one string per input text, in input order. A clause that
        cannot be segmented contributes "Error"; an input that is not a
        string yields "Error" as a whole.
    """
    target_quality = []
    for content in test_contents:
        if not isinstance(content, str):
            # str.replace below would raise; use the same placeholder as the
            # per-clause fallback instead of aborting the batch.
            target_quality.append("Error")
            continue
        for canonical, variants in unification_list.items():
            for variant in variants:
                content = content.replace(variant, canonical)
        clauses = []
        start = 0  # index of the last clause that contains '首次'
        for i, sentence in enumerate(content.split('/')):
            try:
                tokens = list(jieba.cut(sentence))
            except Exception:
                clauses.append("Error")
                continue
            if '首次' in tokens:
                start = i
            clauses.append(" ".join(tokens))
        target_quality.append(" ".join(clauses[start:]))
    return target_quality

def QFS_Tier_cleansing(data):
    """Collapse raw tier labels into a smaller set of canonical labels.

    Each label is first normalised with string.capwords (every
    whitespace-separated word becomes "Xxxx") and then compared against an
    ordered list of substring rules; the first matching rule decides the
    canonical label. A label that matches no rule is returned in its
    capwords form.

    Args:
        data: iterable of label strings.

    Returns:
        A list of cleaned labels, one per input label, in input order.
    """
    # (substrings, canonical label) pairs, checked in order; first hit wins.
    # All patterns must be written the way capwords spells them, which is why
    # the rules use 'Eco' and 'Rpm Sensor': 'ECO' and 'Rpm sensor' can never
    # occur in a capwords-normalised string.
    # The rules for 'Das' and 'Lamp' were disabled and stay disabled.
    rules = (
        (('ngine',), 'Engine'),
        (('ifting',), 'Window lifting'),
        (('heel',), 'Wheel'),
        (('key', 'Key'), 'Key'),
        (('attery',), 'Battery'),
        (('ransmission',), 'Transmission'),
        (('Eco',), 'ECO start/stop function'),
        (('Tire Pressure',), 'Tpm Sensor'),
        (('Rpm Sensor',), 'Rpm'),
        (('Camera',), 'Camera'),
        (('Mmc',), 'Mb Connect'),
        (('Trim',), 'Trim'),
    )
    QFS_Tier = []
    for item in data:
        item = string.capwords(item)
        for needles, label in rules:
            if any(needle in item for needle in needles):
                new_item = label
                break
        else:
            new_item = item
        QFS_Tier.append(new_item)  # replace the tier-1 category
    return QFS_Tier
# Collapse the raw tier labels into canonical categories before they are
# label-encoded further down.
y_read = QFS_Tier_cleansing(y_read)

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import grid_search
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier

# One instance of every supported vectorizer; `feature` picks the one used.
tfv = TfidfVectorizer(ngram_range=(1, 2))
hv = HashingVectorizer()
cv = CountVectorizer()

# One space-joined token string per document.
X_cut = cut_words(X_read)


if feature == 'tfv':
    X_features = tfv.fit_transform(X_cut)
elif feature == 'hv':
    # NOTE(review): HashingVectorizer can emit negative feature values by
    # default, which MultinomialNB rejects - confirm before combining
    # feature='hv' with classifier='Naive'.
    X_features = hv.fit_transform(X_cut)
elif feature == 'cv':
    X_features = cv.fit_transform(X_cut)
else:
    raise ValueError("feature must be 'cv', 'tfv' or 'hv', got %r" % (feature,))

# X_features is used as selected above; refitting tfv here unconditionally
# would make the `feature` switch a no-op.
X = X_features

# Encode the label strings as integer class ids.
encoding = LabelEncoder()
y = encoding.fit_transform(y_read)

print("features finished in: %.3fs" % (time.time() - t1))
t1 = time.time()
# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3,)

# One instance of every supported model; `classifier` picks the one trained.
svm = SVC(C=1, gamma='auto', kernel='linear')
MNB = MultinomialNB(alpha=0.01)
dt = tree.DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=3)

models = {'Naive': MNB, 'SVC': svm, 'DT': dt, 'knn': knn}
if classifier not in models:
    # Fail here instead of with a NameError on y_predict further down.
    raise ValueError(
        "classifier must be 'Naive', 'SVC', 'DT' or 'knn', got %r" % (classifier,))

model = models[classifier]
model.fit(X_train,y_train)
y_predict = model.predict(X_test)

# The reported time covers both fitting and predicting on the test split.
print("Trained finished in: %.3fs" % (time.time() - t1))


from sklearn.metrics import classification_report
# Turn the integer class ids back into their label strings, for the true
# labels of the test split as well as for the predictions.
y_test_inverse = encoding.inverse_transform(y_test)
predict_value = encoding.inverse_transform(y_predict.tolist())

# Share of test samples whose predicted class id equals the true one.
print("Accuracy:{0:.2f}%".format((y_predict == y_test).mean() * 100))
print("totally finished in: %.3fs" % (time.time() - t2))


def error_report(predict_value,y_test_inverse,):
    """Print how often each (predicted, true) label confusion occurred.

    Prints the number of misclassified samples, followed by a table of at
    most 30 distinct "Predict: ... ^^ True: ..." pairs with their counts, as
    ordered by pandas' value_counts.

    Args:
        predict_value: predicted labels; must provide .tolist().
        y_test_inverse: true labels in the same order; must provide .tolist().

    Returns:
        None.
    """
    predicted = predict_value.tolist()
    actual = y_test_inverse.tolist()
    error = [
        "Predict: %s ^^ True: %s" % (guess, truth)
        for guess, truth in zip(predicted, actual)
        if guess != truth
    ]

    # Named `report` so the local does not shadow this function's own name.
    report = pd.DataFrame(pd.Series(error).value_counts())
    report.columns = ['Counts']
    print('Error couted: %s' % len(error))
    print(report.head(30))

# Final summary: confusion counts, per-class metrics, then overall accuracy.
error_report(predict_value, y_test_inverse)
print(classification_report(y_test_inverse, predict_value))
print("Accuracy:{0:.2f}%".format((y_predict == y_test).mean() * 100))

This snippet took 0.02 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).