Demo entry 6747591

   

Submitted by anonymous on Jun 04, 2018 at 12:46
Language: Python. Code size: 8.8 kB.

# _*_ coding: utf-8 _*_
'''
Created on 2018年5月28日

@author: chizh
'''

import sys
import csv
import uniout
import db_interface
import re
# Raise the csv module's per-field size limit as far as it will accept.
maxInt = sys.maxsize
csv.field_size_limit(500 * 1024 * 1024)
decrement = True

# Try the current candidate; when csv.field_size_limit() rejects it with
# OverflowError, shrink the candidate by a factor of 10 and try again.
while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
    else:
        # The limit was accepted, so no further shrinking is needed.
        decrement = False

class csv_processor_xcar(object):
    """Import xcar forum posts from a fixed set of CSV exports into a database.

    Constructing an instance does all the work: it prompts for the folder
    that holds the CSV files, parses every data row and passes the cleaned
    fields to ``db.save_post``.

    Expected CSV columns (the first row of each file is a header):
        0  user name
        1  user level
        2  floor (reply number)
        3  page title
        4  page URL
        5  post content (HTML)
        6  time
        7  whether the post is marked as featured
    """

    # CSV exports read from the user-supplied folder, in this order.
    _SOURCE_FILES = (
        'xcar(8).csv', 'xcar(9).csv', 'xcar(10).csv', 'xcar(11).csv',
        'xcar(12).csv', 'xcar(13).csv', 'xcar(14).csv', 'xcar(15).csv',
    )

    # Exact markup of the "open quoted floor" icon, dropped before any regex runs.
    _QUOTE_BACK_IMG = '<img src="http://www.xcar.com.cn/bbs/images/common/back.gif" title="新窗口中打开引用楼层" onload="if(this.width>760) {this.resized=true; this.width=760; this.alt=\'Click here to open new window\';}" onmouseover="if(this.width>760) {this.resized=true; this.width=760; this.style.cursor=\'hand\'; this.alt=\'Click here to open new window\';}" onclick="if(!this.resized) {return true;} else {window.open(\'http://www.xcar.com.cn/bbs/images/common/back.gif\');}" onmousewheel="return imgzoom(this);" alt="" border="0">'

    # (pattern, replacement) pairs applied in order by get_postContent.
    # Compiled once here instead of once per CSV row.
    # NOTE: most opening-tag patterns use "[^>]+", so a bare tag such as
    # "<p>" or "<div>" is left untouched; that behaviour is kept as it was.
    _HTML_SUBSTITUTIONS = (
        # The original patterns used doubled backslashes inside raw strings
        # ("[\\s\\S]", "<\\/style>"), which only matched a literal "<\/style>"
        # and therefore never removed real <style>/<script> elements.
        (re.compile(r'<style[^>]*?>[\s\S]*?</style>'), ''),
        (re.compile(r'<script[^>]*?>[\s\S]*?</script>'), ''),
        (re.compile(r'<!--[^>]+-->'), ''),
        (re.compile(r'<div[^>]+>'), ''),
        (re.compile(r'</div>'), ''),
        (re.compile(r'<p[^>]+>'), ''),
        (re.compile(r'</p>'), ''),
        (re.compile(r'<font[^>]+>'), ''),
        (re.compile(r'</font>'), ''),
        (re.compile(r'<span[^>]+>'), ''),
        (re.compile(r'</span>'), ''),
        (re.compile(r'<input[^>]+>'), ''),
        (re.compile(r'<a[^>]+>'), ''),
        (re.compile(r'</a>'), ''),
        (re.compile(r'<i>'), ''),
        (re.compile(r'</i>'), ''),
        (re.compile(r'<table[^>]+>'), ''),
        (re.compile(r'</table>'), ''),
        (re.compile(r'<em[^>]+>'), ' '),
        (re.compile(r'</em>'), ' '),
        (re.compile(r'<br[^>]+>'), ' '),
        (re.compile(r'<br>'), ' '),
        # Images are kept as a bare placeholder tag.
        (re.compile(r'<img[^>]+>'), '<img>'),
        # Collapse a space followed by further whitespace into one space.
        (re.compile(r' \s+'), ' '),
    )

    # Title segment of the form "第...页" (a page-number marker).
    _PAGE_MARKER = re.compile('^第.*页$')

    # (title substring, numeric level) pairs checked in order; the first hit
    # wins, so '实习版主' must stay ahead of the shorter '版主'.
    _LEVEL_BY_TITLE = (
        ('社区乞丐', 0),
        ('用户被禁止发言', 0),
        ('游客', 0),
        ('新手上路', 1),
        ('初级会员', 2),
        ('三星会员', 3),
        ('支柱会员', 4),
        ('青铜长老', 5),
        ('白银长老', 6),
        ('黄金长老', 7),
        ('白金长老', 8),
        ('本站元老', 9),
        ('青铜元老', 10),
        ('白银元老', 11),
        ('黄金元老', 12),
        ('白金元老', 13),
        ('钻石元老', 14),
        ('版务助理', 15),
        ('实习版主', 16),
        ('版主', 17),
        ('离休干部', 17),
    )

    def __init__(self, db):
        """Prompt for the data folder and import every source file into ``db``.

        :param db: object exposing ``save_post`` taking, in order: user name,
            source URL, reply id, post title, post content, featured flag,
            year, month, day, source forum and user level.
        """
        # Data preparation: ask for the folder and make sure it ends with a
        # backslash. An empty answer is left empty, i.e. the file names are
        # used as given (previously this raised IndexError).
        filepath = raw_input('>>> 请输入数据源文件夹的路径:')
        if filepath and not filepath.endswith('\\'):
            filepath = filepath + '\\'

        print('>>>即将读取')
        for name in self._SOURCE_FILES:
            print('\t' + filepath + name)

        # Read the CSV files.
        for name in self._SOURCE_FILES:
            self._import_file(db, filepath + name)

    def _import_file(self, db, filename):
        """Parse one CSV export and save every usable row through ``db``."""
        # The Python 2 csv module requires the file to be opened in binary
        # mode; text mode can corrupt or truncate data on Windows.
        with open(filename, 'rb') as currentfile:
            reader = csv.reader(currentfile)
            next(reader, None)  # the first row is the header
            # Stream rows instead of materialising the whole file in memory.
            for entry in reader:
                # Skip blank rows, rows without a user name, and closed threads.
                if not entry or not entry[0] or entry[3] == '此贴已关闭':
                    continue
                userName = entry[0]
                sourceURL = self.get_sourceURL(entry[4])
                replyID = self.get_replyID(entry[2])
                postTitle = self.get_postTitle(entry[3])
                postContent = self.get_postContent(entry[5])
                isBest = self.get_isBest(entry[7])
                date_year = self.get_date_year(entry[6])
                date_month = self.get_date_month(entry[6])
                date_day = self.get_date_day(entry[6])
                source_forum = self.get_source_forum(entry[3])
                level = self.get_level(entry[1])
                db.save_post(userName, sourceURL, replyID, postTitle,
                             postContent, isBest, date_year, date_month,
                             date_day, source_forum, level)

    def get_sourceURL(self, entry):
        """Return the page URL with everything from '&page=' onwards removed."""
        return entry.split('&page=')[0]

    def get_replyID(self, entry):
        """Return the floor number as an int after removing every '楼'.

        :raises ValueError: if the remaining text is not an integer.
        """
        return int(entry.replace('楼', ''))

    def get_postTitle(self, entry):
        """Return the part of the page title before the first underscore."""
        return entry.split('_', 1)[0]

    def get_postContent(self, entry):
        """Return the post HTML reduced to mostly plain text.

        Removes the quoted-floor icon, turns '&nbsp;' into a space, deletes
        style and script elements together with their contents, strips the
        tags listed in ``_HTML_SUBSTITUTIONS``, replaces every image tag with
        a bare '<img>' placeholder and finally collapses whitespace runs that
        start with a space.
        """
        result = entry.replace(self._QUOTE_BACK_IMG, '')
        result = result.replace('&nbsp;', ' ')
        for pattern, replacement in self._HTML_SUBSTITUTIONS:
            result = pattern.sub(replacement, result)
        return result

    def get_isBest(self, entry):
        """Return 0 when the field is exactly '不是精华', otherwise 1."""
        return 0 if entry == '不是精华' else 1

    # The three date helpers slice the time field at fixed offsets
    # ([10:14], [15:17], [18:20]). The string is not decoded first, so the
    # offsets count units of whatever string type is passed in.
    # NOTE(review): presumably a fixed-width prefix precedes a
    # "YYYY-MM-DD" date in the export; confirm against the data.

    def get_date_year(self, entry):
        """Return ``entry[10:14]`` as an int (the year)."""
        return int(entry[10:14])

    def get_date_month(self, entry):
        """Return ``entry[15:17]`` as an int (the month)."""
        return int(entry[15:17])

    def get_date_day(self, entry):
        """Return ``entry[18:20]`` as an int (the day)."""
        return int(entry[18:20])

    def get_source_forum(self, entry):
        """Return the forum name taken from an underscore-separated page title.

        The first segment (the post title) is dropped. If the next segment is
        a page marker of the form '第...页', the segment after it is returned;
        otherwise that next segment itself is returned.

        :raises IndexError: if the title has no underscore, or a page marker
            is its last segment.
        """
        # Drop the title, then split the remainder into its parts.
        remainder = entry.split('_', 1)[1]
        segments = remainder.split('_')
        if self._PAGE_MARKER.match(segments[0]):
            # A page marker comes first, so the forum is the next segment.
            return segments[1]
        # No page marker: the forum follows the title directly.
        return segments[0]

    def get_level(self, entry):
        """Map the user-level text to a numeric level.

        Returns the level of the first ``_LEVEL_BY_TITLE`` entry whose title
        occurs anywhere in ``entry``, or 0 when none does.
        """
        for title, level in self._LEVEL_BY_TITLE:
            if title in entry:
                return level
        return 0
    
if __name__ == '__main__':
    # Connect, run the import (constructing csv_processor_xcar does the whole
    # job), and always disconnect -- even when the import raises part-way.
    db = db_interface.DBConnector()
    db.db_connect()
    try:
        csv_processor_xcar(db)
    finally:
        db.db_disconnect()

This snippet took 0.02 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).