Selenium：根据网站每个类别的页面数量进行抓取

"""Scrape every article of every category on legorafi.fr with Selenium.

The code below walks through all the categories and extracts the data.
It definitely needs more testing and some enhanced error handling.
PS: good luck with this coding project.
"""
import time
from datetime import datetime
from random import randint

import requests
from newspaper import Article
from newspaper.article import ArticleException
from newspaper.utils import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--test-type")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument("--incognito")
# chrome_options.add_argument('--headless')
# window size as an argument is required in headless mode
# chrome_options.add_argument('window-size=1920x1080')
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)
# The implicit wait is a session-wide setting, so it is configured once here
# instead of on every category.
driver.implicitly_wait(30)

# One dict per harvested article (url, title, publish_date, text).
papers = []
# Article URLs already processed; listing pages link to the same article
# from several anchors, and this avoids downloading it more than once.
urls_set = set()


def _harvest_page(page_url):
    """Download one listing page and record every new article it links to."""
    raw_html = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(raw_html.text, 'html.parser')
    for articles_tags in soup.find_all('div', {'class': 'articles'}):
        for article_href in articles_tags.find_all('a', href=True):
            article_url = str(article_href['href'])
            if article_url.endswith('#commentaires'):
                continue
            if article_url in urls_set:
                continue
            urls_set.add(article_url)

            article = Article(article_url)
            try:
                article.download()
                article.parse()
            except ArticleException:
                # Best effort: one unreachable article must not abort the
                # whole crawl.
                continue

            published = article.publish_date
            # publish_date may be missing or timezone-aware; format the
            # datetime directly instead of round-tripping through str().
            publish_date = (published.strftime('%Y-%m-%d')
                            if isinstance(published, datetime) else None)
            papers.append({
                'url': article_url,
                'title': article.title,
                'publish_date': publish_date,
                'text': article.text.replace('\n', ''),
            })


def get_articles(link):
    """Harvest every page of a category, following the "Suivant" link.

    Args:
        link: URL of the category's first page. When falsy, the page that
            the shared ``driver`` currently displays is used as the start.

    Results are appended to the module-level ``papers`` list.
    """
    if link:
        driver.get(link)
    while True:
        # Harvest first so that the last page (which has no "Suivant" link)
        # and single-page categories are scraped too.
        _harvest_page(driver.current_url)
        try:
            next_link = driver.find_element_by_link_text("Suivant")
        except NoSuchElementException:
            return
        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        next_link.click()
        # Initiates a random wait to prevent the
        # harvesting operation from starting before
        # the page has completely loaded
        time.sleep(randint(2, 4))


legorafi_urls = {
    'monde-libre': 'http://www.legorafi.fr/category/monde-libre',
    'politique': 'http://www.legorafi.fr/category/france/politique',
    'societe': 'http://www.legorafi.fr/category/france/societe',
    'economie': 'http://www.legorafi.fr/category/france/economie',
    'culture': 'http://www.legorafi.fr/category/culture',
    'people': 'http://www.legorafi.fr/category/people',
    'sports': 'http://www.legorafi.fr/category/sports',
    'hi-tech': 'http://www.legorafi.fr/category/hi-tech',
    'sciences': 'http://www.legorafi.fr/category/sciences',
    'ledito': 'http://www.legorafi.fr/category/ledito/',
}

try:
    for category, url in legorafi_urls.items():
        if not url:
            continue
        get_articles(url)
finally:
    # Always release the browser, even when a category fails midway.
    driver.quit()

Selenium：根据网站每个类别的页面数量进行抓取

1回答