Demo entry 6706739

qwqw

   

Submitted by anonymous on Jan 31, 2018 at 08:10
Language: Python 3. Code size: 2.7 kB.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-23 12:42:07
# @Author  : Leo Wood (leowood@foxmail.com)
from selenium import webdriver
from bs4 import BeautifulSoup
import os


def init_driver():
    """Create and return a Selenium WebDriver for Chrome.

    The returned driver is used to load pages whose rendered source is then
    handed to BeautifulSoup. If a different browser (Firefox, IE, ...) is
    used, change the executable path and the webdriver class accordingly.
    """
    # Raw string so the backslashes in the Windows path are never
    # interpreted as escape sequences.
    chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    os.environ["webdriver.chrome.driver"] = chromedriver
    # chromedriver.exe drives Chrome, so the matching class is Chrome.
    driver = webdriver.Chrome(chromedriver)
    return driver


def get_soup(url, driver):
    """Load *url* in *driver* and return the page parsed by BeautifulSoup.

    The page source is read from the driver after navigation and parsed
    with the built-in ``html.parser`` backend.
    """
    driver.get(url)
    page_source = driver.page_source
    return BeautifulSoup(page_source, "html.parser")


def _category_link(news):
    """Return the category ``<a>`` tag of a list item, or None if absent."""
    category = news.find('div', class_='dd_lm')
    return category.find('a') if category is not None else None


def _is_text_news(news):
    """Tell whether a list item is a non-empty, non-video news entry."""
    if not news.contents:
        return False
    link = _category_link(news)
    # Items without a category link cannot be recorded, so they are skipped
    # together with the video ('视频') entries.
    return link is not None and link.string != '视频'


def get_detail(soup, year='2017'):
    """Scrape the first ten text news items of a listing page.

    For each item the article page is fetched, its paragraphs are
    concatenated, and one ``str(dict)`` line is appended to
    ``chinanews.txt``; a progress line is printed per article.

    Args:
        soup: Parsed listing page; must contain a ``div.content_list``
            holding ``li`` entries.
        year: Year prefixed to the date text read from the listing.
    """
    # Build a filtered list instead of removing from the list while
    # iterating over it, which silently skips the element after each removal.
    news_lists = [
        news
        for news in soup.find('div', class_='content_list').find_all('li')
        if _is_text_news(news)
    ]
    selected = news_lists[:10]
    with open('chinanews.txt', 'a', encoding='utf-8') as f:
        if not selected:
            return
        # One browser is reused for every article and is always closed,
        # even if parsing one of the pages raises.
        content_driver = init_driver()
        try:
            for news in selected:
                title_link = news.find('div', class_='dd_bt').find('a')
                news_dict = {}
                news_dict['title'] = title_link.string
                news_dict['comments'] = ''
                news_dict['comments_num'] = ''
                news_dict['url'] = 'http://www.chinanews.com' + title_link['href']
                news_content_soup = get_soup(news_dict['url'], content_driver)
                news_dict['content'] = ''
                contents = news_content_soup.find('div', class_='left_zw')
                if contents is not None:
                    # Paragraphs whose .string is None are skipped; the
                    # ideographic space (U+3000) is replaced by a plain space.
                    news_dict['content'] = ''.join(
                        p.string.replace('\u3000', ' ')
                        for p in contents.find_all('p')
                        if p.string is not None
                    )
                news_dict['type'] = _category_link(news).string
                news_dict['words_links'] = ''
                news_dict['time'] = year + '-' + \
                    news.find('div', class_='dd_time').string
                news_dict['hot'] = ''
                news_dict['source'] = '中国新闻网'
                f.write(str(news_dict) + '\n')
                print(news_dict['title'], '  done')
        finally:
            content_driver.quit()


def main():
    """Scrape the chinanews.com scroll-news listing for 2017-10-01."""
    date = '1001'
    url = 'http://www.chinanews.com/scroll-news/2017/{date}/news.shtml'.format(
        date=date)
    driver = init_driver()
    try:
        soup = get_soup(url, driver)
    finally:
        # The listing is fully parsed in memory at this point, so the
        # browser can be closed instead of being left running.
        driver.quit()
    get_detail(soup)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).