Demo entry 6637476

py

   

Submitted by anonymous on Sep 02, 2017 at 23:17
Language: Python. Code size: 9.5 kB.

# -*- coding: utf-8 -*-
#author LIU Bin
#date 2017-07-20


import re
import datetime
import scrapy
from scrapy.spider import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
import requests
import json
from sina.items import InformationItem, TweetsItem, FollowsItem, FansItem, InfoDetailsItem
from sina.settings import Tweets_Num, IDS


class SinaspiderSpider(CrawlSpider):
    """Crawl Sina Weibo via the m.weibo.cn JSON API.

    Starting from the seed user IDs in ``settings.IDS``, fetches each user's
    profile (InformationItem), detailed profile tab (InfoDetailsItem) and
    paginated tweets (TweetsItem), stopping after ``settings.Tweets_Num`` pages.
    """
    name = "sinaSpider"
    # BUG FIX: allowed_domains must hold bare domain names, not URLs.
    # With the scheme included, OffsiteMiddleware treats every follow-up
    # request to m.weibo.cn as off-site and drops it.
    allowed_domains = ['m.weibo.cn']
    ids = IDS
    scrawl_ID = set(ids)  # user IDs still waiting to be crawled
    finish_ID = set()     # user IDs already crawled

    def start_requests(self):
        """Seed one profile-information request per pending user ID."""
        while self.scrawl_ID:  # truthiness instead of .__len__()
            ID = self.scrawl_ID.pop()
            self.finish_ID.add(ID)
            # user profile endpoint (JSON)
            url_information0 = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s" % ID
            print(url_information0)
            yield Request(url=url_information0, meta={"ID": ID}, callback=self.parseInformation)

    def parseInformation(self, response):
        """Parse the profile JSON; yield an InformationItem plus follow-up
        requests for the tweets tab and the detailed-info tab.

        A body shorter than ~50 bytes is treated as an empty/error response.
        """
        if len(response.body) > 50:
            print("**************************")
            print("Fetch information0 Success")
            print("**************************")

            informationItems = InformationItem()
            informations = json.loads(response.body)

            if informations.get("userInfo", ""):
                user_info = informations["userInfo"]
                informationItems["_id"] = user_info["id"]
                informationItems["NickName"] = user_info["screen_name"]
                informationItems["Signature"] = user_info["description"]
                informationItems["Num_Tweets"] = user_info["statuses_count"]
                informationItems["Num_Follows"] = user_info["follow_count"]
                informationItems["Num_Fans"] = user_info["followers_count"]
                informationItems["User_Url"] = user_info["profile_url"]
                informationItems['Avatar'] = user_info["profile_image_url"]
                informationItems['LocalAvatar'] = ''
                informationItems['Cover'] = user_info['cover_image_phone']
                informationItems['LocalCover'] = ''
                informationItems['Used'] = False
                yield informationItems

            # entry point for the user's tweets tab
            # NOTE(review): assumes tabs[1] is always the weibo tab — verify
            # against the API; matching on tab_type would be more robust.
            tweets_container_id = informations["tabsInfo"]["tabs"][1]["containerid"]
            url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s" % (
                response.meta["ID"], tweets_container_id)
            yield Request(url=url_tweets,
                          meta={"ID": response.meta["ID"],
                                'owner': informations["userInfo"]["screen_name"]},
                          callback=self.parseTweets, dont_filter=True)

            # entry point for the detailed-information tab
            info_container_tabs = informations["tabsInfo"]["tabs"]
            for tab in info_container_tabs:
                if tab['tab_type'] == "profile":
                    info_container_id = tab['containerid'] + \
                        '_' + '-' + '_INFO'
                    print(info_container_id)
                    url_details = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s' % (
                        response.meta["ID"], info_container_id)
                    yield Request(url=url_details,
                                  meta={"detail_id": info_container_id, 'ID': response.meta["ID"]},
                                  callback=self.parseDetails, dont_filter=True)
        else:
            print("**************************")
            print("Fetch information0 Fail")
            print("**************************")
            return

    def parseDetails(self, response):
        """Parse the detailed-information tab into an InfoDetailsItem.

        The item_name values are the Chinese field labels returned by the
        API (nickname, tags, gender, location, intro, rank, school,
        registration time); they must stay byte-identical.
        """
        if len(response.body) > 50:
            print("**************************")
            print("Fetch InfoDetails Success")
            print("**************************")
            infos = json.loads(response.body)
            details = InfoDetailsItem()
            details['_id'] = response.meta['detail_id']
            details['ID'] = response.meta['ID']
            # dispatch table: API field label -> item field name
            field_map = {
                '昵称': 'NickName',
                '标签': 'WeiboTag',
                '性别': 'Gender',
                '所在地': 'Place',
                '简介': 'Intro',
                '等级': 'Rank',
                '学校': 'School',
                '注册时间': 'RegTime',
            }
            for card in infos.get('cards', []) or []:
                for group in card.get('card_group', []) or []:
                    item_name = group.get('item_name', '')
                    if item_name in field_map:
                        details[field_map[item_name]] = group['item_content']
            yield details

    def parseTweets(self, response):
        """Parse one page of tweets; yield a TweetsItem per mblog card and
        schedule the next page until Tweets_Num pages were fetched."""
        if len(response.body) > 50:
            print("**************************")
            print("Fetch Tweets Success")
            print("**************************")

            tweets = json.loads(response.body)
            ID = response.meta["ID"]
            Owner = response.meta["owner"]
            page = ''
            containerid = ''
            if tweets.get("cards", ""):
                cards = tweets["cards"]
                if tweets["cardlistInfo"].get("page", ""):
                    # NOTE(review): cardlistInfo.page appears to be the NEXT
                    # page number returned by the API — confirm before changing.
                    page = str(tweets["cardlistInfo"]["page"])
                else:
                    return
                if tweets["cardlistInfo"].get("containerid", ""):
                    containerid = tweets["cardlistInfo"]["containerid"]
                for card in cards:
                    mblog = card.get('mblog', '')
                    if mblog:
                        tweetsItems = TweetsItem()
                        tweetsItems["_id"] = card["itemid"]
                        tweetsItems["ID"] = ID
                        tweetsItems["Owner"] = Owner
                        tweetsItems["Used"] = False
                        tweetsItems['LocalImgs'] = []
                        tweetsItems["Content"] = json.dumps(mblog).decode('unicode-escape')
                        tweetsItems["PubTime"] = mblog["created_at"]
                        tweetsItems["Like"] = mblog["attitudes_count"]
                        tweetsItems["Comment"] = mblog["comments_count"]
                        tweetsItems["Transfer"] = mblog["reposts_count"]
                        tweetsItems["TweetsText"] = mblog["text"]
                        pics = mblog.get('pics', '')
                        if pics:
                            img_urls = []
                            small_img_urls = []
                            for pic in pics:
                                img_urls.append(pic["large"]['url'])
                                small_img_urls.append(pic['url'])
                            tweetsItems["Imgs"] = img_urls
                            tweetsItems['SmallImgs'] = small_img_urls
                        else:
                            tweetsItems["Imgs"] = []
                            tweetsItems['SmallImgs'] = []
                        # BUG FIX: yield must be inside `if mblog:` — the
                        # original yielded for every card, raising NameError
                        # (or emitting a stale item) on non-mblog cards.
                        yield tweetsItems
                print("**************************")
                print("Tweetspage: " + page)
                print("**************************")
                # BUG FIX: compare as integers. In Python 2 a str is always
                # >= an int, so the original stopped after the first page.
                if int(page) >= Tweets_Num:
                    print("**************************")
                    print("Fetch Tweets Finish")
                    print("**************************")
                    return
                url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s&page=%s" % (
                    ID, containerid, page)
                # BUG FIX: carry 'owner' forward — this callback reads
                # response.meta["owner"], so omitting it crashed on page 2.
                yield Request(url=url_tweets, meta={"ID": ID, "owner": Owner},
                              callback=self.parseTweets, dont_filter=True)
            else:
                return
        else:
            print("**************************")
            print("Fetch Tweets Finish")
            print("**************************")
            return

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).