Demo entry 6627018

123

   

Submitted by anonymous on Jun 27, 2017 at 17:00
Language: Python 3. Code size: 5.9 kB.

import pickle
import os
import pandas as pd
import numpy as np
import time
import gensim 
import jieba.analyse
import codecs 
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
import string 

#from __future__ import absolute_import
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent  import SimpleRNN, GRU, LSTM
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Which master sheet to train on: 'CAC' (call-centre complaints) or
# 'Online' (online posts).  The label column name is given by Tier.
read = 'CAC'   #'CAC' or 'Online'
Tier = 'QFS_Tier3'
if read == 'CAC':
    # NOTE(review): the 'rU' open mode and pandas' error_bad_lines= are
    # both deprecated in newer versions; kept as-is for this legacy script.
    data_read = pd.read_csv(open('CAC Mastersheet LV2.csv',
        'rU',encoding='utf-8'),error_bad_lines=False)
    # Missing cells would otherwise be NaN floats; replace with a sentinel
    # string so downstream text processing only ever sees strings.
    data_read = data_read.replace(np.nan,'Error', regex=True)
    data_read = data_read[:]  # slice of all rows (effectively a copy)
    X_read = data_read['Description']
    y_read = data_read[Tier]
elif read == 'Online':
    data_read = pd.read_csv(open('MB Online master sheet LV2.csv',
        'rU',encoding='utf-8'),error_bad_lines=False)
    data_read = data_read.replace(np.nan,'Error', regex=True)
    data_read = data_read[:]
    # For online posts the title carries signal too, so concatenate it
    # with the description to form the model's input text.
    X_read = data_read['Description'] + data_read['Title']
    y_read = data_read[Tier]    

# Domain-specific automotive terms (gear jumping/slipping, stalling,
# faults, gear positions C/S/D, fuel-nozzle cutoff, carbon canister,
# power-steering pump, warning lamp) registered with jieba so each is
# kept as a single token during segmentation.
# Bug fix: the original list was missing a comma after '故障', which made
# Python silently concatenate it with 'C档' into one bogus word '故障C档'
# and dropped both intended entries from the dictionary.
add_words = ['跳档', '跳挡', '挂档', '挂挡', '倒档', '倒挡', '异响', '卡顿', '故障',
             'C档', 'S档', 'D档', 'c档', 's档', 'd档',
             'C挡', 'S挡', 'D挡', 'c挡', 's挡', 'd挡', '跳枪', '跳抢', '碳罐', '炭罐', '助力泵',
             '故障灯']
for word in add_words:
    jieba.add_word(word)

def cut_words(test_contents):
    """Tokenise each document with jieba and join the tokens with spaces.

    Parameters
    ----------
    test_contents : iterable of str
        Raw documents (e.g. the complaint descriptions).

    Returns
    -------
    list of str
        One space-joined token string per input document; the literal
        string "Error" for any document that could not be segmented.
    """
    target = []
    for content in test_contents:
        try:
            tokens = list(jieba.cut(content))
            target.append(" ".join(tokens))
        except Exception:
            # Bug fix: the original appended the *list* ["Error"], giving
            # failed rows a different element type from successful ones,
            # which would break the downstream keras Tokenizer.  Append the
            # plain string sentinel instead (matching the CSV cleansing).
            target.append("Error")
    return target

# Segment every input text into a space-joined token string for Keras.
X_cut = cut_words(X_read)

def QFS_Tier_cleansing(data):
    """Normalise free-text QFS tier labels into canonical category names.

    Each label is first title-cased with string.capwords, then matched
    against known substrings in order; the first matching rule wins.
    Labels matching no rule are returned title-cased but otherwise
    unchanged.

    Parameters
    ----------
    data : iterable of str
        Raw tier labels.

    Returns
    -------
    list of str
        One canonical category name per input label.
    """
    # Ordered (substrings, canonical name) rules; first hit wins.
    # Truncated needles like 'ngine' deliberately match regardless of the
    # (title-cased) first letter, e.g. both 'Engine' and '...engine'.
    rules = [
        (('ngine',), 'Engine'),
        (('nterior',), 'Interior_Odor'),
        (('ifting',), 'Window lifting'),
        (('heel',), 'Wheel'),
        (('key', 'Key'), 'Key'),
        (('attery',), 'Battery'),
        (('Dvd', 'Cd'), 'DVD/CD driver'),
        (('ransmission',), 'Transmission'),
        # Bug fix: string.capwords turns 'ECO' into 'Eco', so the original
        # 'ECO' test was a dead branch; match the title-cased form.
        (('Eco',), 'ECO start/stop function'),
        (('Tire Pressure',), 'Tpm Sensor'),
        # Bug fix: string.capwords yields 'Rpm Sensor', so the original
        # 'Rpm sensor' test was a dead branch; match the title-cased form.
        (('Rpm Sensor',), 'Rpm'),
        (('Camera',), 'Camera'),
        (('Mmc',), 'Mb Connect'),
        (('Trim',), 'Trim'),
    ]
    QFS_Tier = []
    for item in data:
        item = string.capwords(item)
        for needles, canonical in rules:
            if any(n in item for n in needles):
                item = canonical
                break
        QFS_Tier.append(item)  # replace with the top-level category
    return QFS_Tier


# Clean the raw tier labels, then encode them as integer class ids.
encoding = LabelEncoder()
y_read = QFS_Tier_cleansing(y_read)
y = encoding.fit_transform(y_read)

textraw = X_cut
#textraw = [line.encode('utf-8') for line in textraw] # must be stored as str to be usable by keras


maxfeatures = 50000 # keep only the most important (frequent) words
from keras.preprocessing.text import Tokenizer
token = Tokenizer(nb_words=maxfeatures)
token.fit_on_texts(textraw) # for a large corpus a text stream could be used instead
text_seq = token.texts_to_sequences(textraw)
# NOTE(review): this median of sequence lengths is computed but discarded;
# presumably it was inspected interactively to choose maxlen below.
np.median([len(x) for x in text_seq])

y_label = y_read # the (cleaned) labels
nb_classes = len(np.unique(y_label))
print(nb_classes)


maxlen = 400 # maximum text length in tokens; longer texts are truncated
batch_size = 100 # mini-batch size
word_dim = 100 # word-embedding dimension
nb_filter = 200  # number of convolution filters
filter_length = 5 # convolution window size
hidden_dims = 100  # hidden-layer units
nb_epoch = 10      # number of training epochs
pool_length = 3   # pooling window size
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# modern code imports train_test_split from sklearn.model_selection.
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(text_seq, y , 
    train_size=0.7, random_state=1)

# Pad/truncate every sequence at the end ('post') to exactly maxlen tokens.
X_train = sequence.pad_sequences(train_X, maxlen=maxlen,padding='post', truncating='post')
X_test = sequence.pad_sequences(test_X, maxlen=maxlen,padding='post', truncating='post')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Bug fix: removed the leftover debug statement `assert 3<4` — it is always
# true, has no effect, and would be stripped under `python -O` anyway.
print('Build model...')
model = Sequential()

# Word-embedding layer: vocabulary size, embedding dim, input text length.
model.add(Embedding(maxfeatures, word_dim,input_length=maxlen)) 
model.add(Dropout(0.25))
# 1-D convolution over the token sequence.
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode="valid",
                        activation="relu"))
# Max-pooling layer.
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Flatten())
# Fully connected classifier head.
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=['accuracy'])

# One-hot encode the integer labels for the categorical-crossentropy loss.
Y_train = np_utils.to_categorical(train_y, nb_classes)
Y_test = np_utils.to_categorical(test_y, nb_classes)

# Stop training as soon as validation loss stops improving.
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
result = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, 
            validation_split=0.1, show_accuracy=True,callbacks=[earlystop])

#score = earlystop.model.evaluate(X_test, Y_test, batch_size=batch_size)
#print('Test score:', score)
classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
acc = np_utils.accuracy(classes, test_y) # must use the y from *before* one-hot conversion
print('Test accuracy:', acc)
from keras.utils import plot_model
plot_model(model, to_file='model.png')

# model.save('model.h5')

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).