Demo entry 6627024

123

   

Submitted by anonymous on Jun 27, 2017 at 17:10
Language: Python 3. Code size: 5.5 kB.

import pickle
import os
import pandas as pd
import numpy as np
import time
import gensim 
import jieba.analyse
import pickle 
import codecs

# Select which master sheet to process: 'CAC' or 'Online'.
read = 'Online'

# Load the space-separated quality keywords produced by the training run.
# Context manager guarantees the handle is closed even if readline() raises.
with codecs.open('Key_words_trained.txt', 'r', 'utf-8') as fp:
    new_key_words = fp.readline().split(' ')

def _load_sheet(path):
    """Read one master sheet CSV, skipping malformed rows and replacing NaN with 'Error'.

    NOTE(review): the 'rU' open mode and the error_bad_lines kwarg are
    deprecated (removed in Python 3.11 / pandas 2.x respectively); kept
    here for compatibility with the original environment — migrate to
    newline='' and on_bad_lines='skip' when upgrading.
    """
    frame = pd.read_csv(open(path, 'rU', encoding='utf-8'),
                        error_bad_lines=False)
    return frame.replace(np.nan, 'Error', regex=True)


if read == 'CAC':
    data_read = _load_sheet('CAC Mastersheet LV.csv')
    X_read = data_read['Description'].tolist()
elif read == 'Online':
    data_read = _load_sheet('MB Online master sheet LV.csv')
    # Online sheets concatenate Description and Title per row.
    X_read = (data_read['Description'] + data_read['Title']).tolist()


# Downstream tokenization always runs on the Description column only
# (X_read above is computed but unused later — preserved as-is).
X = data_read["Description"]

# Domain-specific terms jieba must treat as single tokens.
# BUG FIX: the original list was missing a comma after '故障', so Python's
# implicit string concatenation produced the single token '故障C档' and
# neither '故障' nor 'C档' was registered with jieba.
add_words = ['跳档', '跳挡', '挂档', '挂挡', '倒档', '倒挡', '异响', '卡顿', '故障',
             'C档', 'S档', 'D档', 'c档', 's档', 'd档',
             'C挡', 'S挡', 'D挡', 'c挡', 's挡', 'd挡',
             '跳枪', '跳抢', '碳罐', '炭罐', '助力泵', '故障灯']
for word in add_words:
    jieba.add_word(word)


# Canonicalization map: each variant string (value) is rewritten to its
# canonical key.  Punctuation becomes the sentence separator '/'; gearbox
# symptom spellings collapse to '变速箱'; charcoal-canister variants to '碳罐'.
# BUG FIX: two missing commas caused implicit string concatenation in the
# original — ' '':' became ' :' (so lone spaces and half-width colons were
# never normalized) and '倒挡''C档' became '倒挡C档' (so neither variant matched).
unification_list = {'/': [',', ',', '!', '!', '。', '.', ' ', ':', ':'],
                    '变速箱': ['跳档', '跳挡', '挂档', '挂挡', '倒档', '倒挡',
                            'C档', 'S档', 'D档', 'c档', 's档', 'd档',
                            'C挡', 'S挡', 'D挡', 'c挡', 's挡', 'd挡'],
                    '碳罐': ['跳枪', '跳抢', '炭罐'],
                    }

def unification(content):
    """Return *content* with every variant token in unification_list replaced
    by its canonical key (punctuation -> '/', symptom spellings -> canonical).

    BUG FIX: the original called content.replace(v, k, regex=False) —
    str.replace() has no ``regex`` keyword (that belongs to pandas
    Series.replace), so this raised TypeError on first use; it also printed
    an undefined name ``content_old``. Both defects removed.
    """
    for canonical, variants in unification_list.items():
        for variant in variants:
            content = content.replace(variant, canonical)
    return content


def cut_words_intolist(test_contents):
    """Tokenize each description string with jieba.

    Each string is first normalized via unification_list (variant tokens
    replaced by canonical keys, punctuation unified to '/'), then split on
    '/' into sentences, and each sentence is cut into a word list.  If any
    sentence contains '首次' ("first time"), only sentences from the last
    such sentence onward are kept, dropping the boilerplate prefix.

    Parameters:
        test_contents: iterable of raw description strings.

    Returns:
        list with one entry per input string; each entry is a list of word
        lists (the string "Error" stands in for a sentence jieba could not
        cut, keeping sentence indices aligned).
    """
    target_quality = []
    for content in test_contents:
        # Canonicalize variant spellings and unify punctuation into '/'.
        for canonical, variants in unification_list.items():
            for variant in variants:
                content = content.replace(variant, canonical)
        sentences = content.split('/')

        part_target_quality = []
        start = 0  # index of the (last) sentence containing '首次'
        for i, sentence in enumerate(sentences):
            try:
                words = list(jieba.cut(sentence))
                if '首次' in words:
                    start = i
                part_target_quality.append(words)
            except Exception:
                # Un-cuttable content (e.g. non-string): keep a marker so
                # positions stay aligned with the sentence indices above.
                part_target_quality.append("Error")
        target_quality.append(part_target_quality[start:])
    return target_quality

t1 = time.time()
X_cut = cut_words_intolist(X)

# Diagnostic: how many descriptions produced no tokens at all.
n = 0
for tokenized in X_cut:
    if tokenized == [[]]:
        n += 1

print("cuted in: %.3fs" % (time.time() - t1))

def quality_cleansing(test_contents, keywords_list):
    """Split tokenized contents into quality-related and unrelated sentences.

    Parameters:
        test_contents: list of contents; each content is a list of
            sentences, each sentence a list of word tokens (output of
            cut_words_intolist).
        keywords_list: iterable of quality keyword strings.

    Returns:
        (quality_related, quality_none_related, key_words) — per content:
        quality_related holds space-joined sentences containing at least
        one keyword, quality_none_related holds the remaining joined
        sentences, and key_words holds the unique joined-hit strings found.
    """
    # Build the membership set once: O(1) lookups instead of an O(n) list
    # scan per token.
    keywords = set(keywords_list)
    quality_related = []
    quality_none_related = []
    key_words = []
    for content in test_contents:
        part_quality_related = []
        part_none_quality_related = []
        part_key_words = []
        for sentence in content:
            hits = [token for token in sentence if token in keywords]
            # The original tested `list_test != []` twice in a row; the two
            # branches are mutually exclusive, so one if/else suffices.
            if hits:
                part_key_words.append(' '.join(hits))
                part_quality_related.append(' '.join(sentence))
            else:
                part_none_quality_related.append(' '.join(sentence))
        quality_related.append(part_quality_related)
        quality_none_related.append(part_none_quality_related)
        key_words.append(list(set(part_key_words)))
    return quality_related, quality_none_related, key_words
t1 = time.time()
X_quality, X_none_quality, keywords = quality_cleansing(X_cut, new_key_words)
print("quality cleansed in: %.3fs" % (time.time() - t1))
t1 = time.time()
import pickle  # NOTE(review): redundant — pickle is already imported at the top of the file


# Persist the intermediate results for the current ("try") run.  Earlier
# revisions wrote *_Nov.txt (testing) or unsuffixed (training) file names;
# switch the names below to reproduce those runs.
# BUG FIX: the original opened four handles without context managers and
# never closed X_cut_data at all; `with` closes every file even on error.
# (Filenames — including the historical 'cleasing' typo — are unchanged so
# downstream consumers keep working.)
with open('X_quality_cleansing_try.txt', 'wb') as f_X_quality, \
     open('X_none_quality_cleansing_try.txt', 'wb') as f_X_none_quality, \
     open('keywords_cleasing_try.txt', 'wb') as f_keywords, \
     open('X_cut_try.txt', 'wb') as X_cut_data:
    pickle.dump(X_quality, f_X_quality)
    pickle.dump(X_none_quality, f_X_none_quality)
    pickle.dump(keywords, f_keywords)
    pickle.dump(X_cut, X_cut_data)

print("saved in: %.3fs" % (time.time() - t1))
#k= pickle.load(open('keywords.pkl','rb'))

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).