Submitted by anonymous on May 15, 2018 at 12:30
Language: Python 3. Code size: 887 Bytes.

def start_requests(self):
		"""Yield the first comments-page request for every URL in the seed file.

		Reads ``douban_top250_urls.txt`` (relative to the current working
		directory) one line at a time and appends the query string for the
		first page of comments (``start=0``, ``limit=20``) to each stripped
		line. The suffix is appended verbatim, so each line presumably ends
		with ``/`` -- TODO confirm against the seed file.

		Yields:
			Request: one request per non-blank line, with ``self.parse`` as the
			callback and ``self.cookie`` / ``self.headers`` attached.
		"""
		with open('douban_top250_urls.txt', encoding='utf-8') as f:
			for line in f:
				base_url = line.strip()
				if not base_url:
					# A blank line would turn into a scheme-less URL, which
					# Request rejects with ValueError and aborts the crawl.
					continue
				url = base_url + 'comments?start=0&limit=20&sort=new_score&status=P&percent_type=h'
				print("现在读取完成的 url 为:" + url)
				yield Request(url, callback=self.parse, cookies=self.cookie, headers=self.headers)

	def parse(self, response):
		"""Append this page's accepted comments to the output file, then follow pagination.

		Extracts the text nodes of every ``div.comment p`` element, appends the
		ones accepted by ``self.cn_text`` (defined elsewhere; presumably a
		Chinese-text check -- verify) to
		``douban_top250_reviews_pos_51-100.txt``, and, when the paginator has a
		"next" link, yields a request for that page handled by this same method.

		Args:
			response: the downloaded comments page.

		Yields:
			Request: a request for the next comments page, if there is one.
		"""
		page = Selector(response)
		comments = page.css('div.comment p::text').extract()

		file_name = "douban_top250_reviews_pos_51-100.txt"
		# Explicit UTF-8 so writing Chinese text does not depend on the
		# platform's default encoding.
		with open(file_name, 'a', encoding='utf-8') as f:
			print("正在写入文件 " + file_name)
			for c in comments:
				if self.cn_text(c):
					# NOTE(review): the original `if` had no body (a SyntaxError)
					# and nothing was ever written to the opened file. Writing
					# one stripped comment per line is the assumed intent --
					# confirm the desired output format.
					f.write(c.strip() + '\n')

		# Take the first matching href; extract_first() gives None when there is no "next" link.
		next_page = page.xpath('//div[@id = "paginator"]/a[@class = "next"]/@href').extract_first()

		if next_page:
			# The href may be relative, so resolve it against the current page URL.
			url_next_page = response.urljoin(next_page)
			yield Request(url_next_page, callback=self.parse, cookies=self.cookie, headers=self.headers)

