Demo entry 6770402

Web crawler

   

Submitted by anonymous on Nov 13, 2018 at 10:04
Language: Python. Code size: 2.2 kB.

def _write_rows(sheet, rows, row_num, stop_at_childless=False):
    """Write the text of each row's direct ``<td>`` cells into ``sheet``.

    Each ``<tr>`` in ``rows`` that has at least one direct ``<td>`` child
    occupies one sheet row, starting at ``row_num``; rows without ``<td>``
    children are skipped and do not consume a sheet row.

    Args:
        sheet: worksheet object exposing ``write(row, col, value)``.
        rows: iterable of ``<tr>`` elements.
        row_num: index of the first sheet row to write to.
        stop_at_childless: when true, stop at the first ``<tr>`` that has
            no child nodes at all.

    Returns:
        The index of the next unused sheet row.
    """
    for tr in rows:
        # Explicit length check instead of relying on element truthiness.
        if stop_at_childless and len(tr) == 0:
            break
        cells = tr.xpath('./td')
        if not cells:
            continue
        for col_num, td in enumerate(cells):
            sheet.write(row_num, col_num,
                        etree.tostring(td, method='text', encoding='utf-8'))
        row_num += 1
    return row_num


def extract_financial(u_f,path):
    """Fetch each URL in ``u_f`` and dump two HTML tables into one workbook.

    One sheet is added per URL, named after the fourth ``/``-separated
    component of the URL. The sheet is added before the page is fetched, so
    a URL whose download fails still leaves an (empty) sheet behind. The
    workbook is saved to ``path`` after all URLs have been processed.

    Args:
        u_f: iterable of URL strings.
        path: destination passed to the workbook's ``save``.
    """
    f = xlwt.Workbook(encoding='utf-8')
    # Send a browser-like User-Agent to disguise the client so the site
    # does not block the requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36'}
    for url in u_f:
        table_name = url.split('/')[3]
        print(table_name)

        req = urllib2.Request(url, headers=headers)
        table = f.add_sheet(table_name)
        try:
            response = urllib2.urlopen(req)
        except urllib2.HTTPError as err:
            # HTTPError subclasses URLError, so it has to be caught first;
            # otherwise this handler can never run.
            print(err.code)
            continue
        except urllib2.URLError as err:
            print(err.reason)
            continue

        try:
            html = etree.HTML(response.read())
        finally:
            # Release the connection even if reading or parsing fails.
            response.close()

        # Reset per sheet so the second section never sees an undefined or
        # stale counter when the first table is absent from the page.
        row_num = 0

        result = html.xpath('//*[@id="zbCenter"]/div/span/table[3]/tr[2]/td[1]/table[2]/tr')
        if result:
            # The first row of this table is written as a single cell.
            table.write(row_num,0,etree.tostring(result[0],method='text',encoding='utf-8'))
            row_num += 1
            # The data rows are nested inside the second row, if there is one.
            if len(result) > 1:
                row_num = _write_rows(
                    table, result[1].xpath('./descendant::tr'), row_num)

        result = html.xpath('//*[@id="zbCenter"]/div/span/table[3]/tr[2]/td[1]/table[3]/tr[2]/td[1]/table/descendant::tr')
        if result:
            # Blank separator row followed by a fixed section heading.
            table.write(row_num, 0, '')
            row_num += 1
            table.write(row_num,0,'Finances - Leverage')
            row_num += 1
            _write_rows(table, result, row_num, stop_at_childless=True)
    f.save(path)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).