Demo entry 6738461

#

   

Submitted by anonymous on May 04, 2018 at 09:00
Language: Python 3. Code size: 1.4 kB.

import http.client
import urllib.request

# NOTE(review): `etree` is used below but no import for it is visible in this
# file; presumably `from lxml import etree` - confirm and add it at the top.

main_url = "https://gz.lianjia.com/xiaoqu/"  # Lianjia root URL
# One entry per district slug; each list receives the community links
# collected for that district.
area_link_list = {'tianhe':[], 'yuexiu':[], 'liwan':[],'haizhu':[],'panyu':[],'baiyun':[],'huangpugz':[],'conghua':[],'zengcheng':[],'huadou':[],'nansha':[]}

# Crawl the listing pages of every district and store the link of each
# community found on them.
for area in area_link_list:
	for page in range(1, 100):
		url = main_url + area + '/' + 'pg' + str(page) + '/'  # build the page URL
		try:
			# The context manager closes the HTTP response even if read() fails.
			with urllib.request.urlopen(url, timeout=10) as open_obj:
				html = open_obj.read()
		except (OSError, http.client.HTTPException) as exc:
			# URLError/HTTPError and socket timeouts are OSError subclasses.
			# Anything else (e.g. KeyboardInterrupt, NameError) is a real
			# problem and is deliberately allowed to propagate.
			print('collect %s pages at %s' % (str(page - 1), area))
			print('stopped at %s: %s' % (url, exc))
			break

		# An empty body cannot be parsed; treat it like a page without items.
		page_source = etree.HTML(html) if html.strip() else None

		collected = 0
		if page_source is not None:
			for item in range(1, 32):
				# href of the first <a> directly under the item-th <li>
				hrefs = page_source.xpath('//ul[@class="listContent"]/li[%s]/a/@href' % str(item))
				if not hrefs:
					# `item` is the first missing index, so item - 1 links
					# were collected on this page.
					print('collect %s items in page%s at %s' % (str(item - 1), str(page), area))
					break
				area_link_list[area].append(hrefs[0].strip())
				collected += 1

		if collected == 0:
			# A page without any link means the district is exhausted; stop
			# instead of requesting the remaining page numbers.
			print('collect %s pages at %s' % (str(page - 1), area))
			break




import pandas as pd

# Reshape the collected data for export: pd.DataFrame needs columns of equal
# length when built from a dict of lists, so every district's list is padded
# with '' up to the length of the longest one.
length = max((len(links) for links in area_link_list.values()), default=0)
for links in area_link_list.values():
	# Multiplying by a non-positive count yields [], so lists that are
	# already full length are left untouched.
	links.extend([''] * (length - len(links)))

# One column per district, one row per collected link position.
df = pd.DataFrame(area_link_list)
# Hard-coded Windows output path.
df.to_csv(r"E:\链家数据\数据.csv")

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).