Is there a way to optimize the for loop? Selenium takes a very long time to scrape 38 pages

I am trying to scrape https://arxiv.org/search/?query=healthcare&searchtype=all with Selenium and Python. The for loop takes too long to execute. I tried scraping with a headless browser and with PhantomJS, but neither scrapes the abstract field (the abstract needs to be expanded by clicking the "More" button).

import pandas as pd
import re
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox

browser = Firefox()
url_healthcare = 'https://arxiv.org/search/?query=healthcare&searchtype=all'
browser.get(url_healthcare)

dfs = []

for i in range(1, 39):
    articles = browser.find_elements_by_css_selector('li.arxiv-result')

    for article in articles:
        title = article.find_element_by_css_selector('p.title.is-5.mathjax').text
        arxiv_id = article.find_element_by_tag_name('a').text.replace('arXiv:', '')
        arxiv_link = article.find_elements_by_tag_name('a')[0].get_attribute('href')
        pdf_link = article.find_elements_by_tag_name('a')[1].get_attribute('href')
        authors = article.find_element_by_css_selector('p.authors').text.replace('Authors:', '')

        # expand the abstract by clicking its "More" link, if present
        try:
            link1 = browser.find_element_by_link_text('▽ More')
            link1.click()
        except NoSuchElementException:
            time.sleep(0.1)

        abstract = article.find_element_by_css_selector('p.abstract.mathjax').text
        date = article.find_element_by_css_selector('p.is-size-7').text
        date = re.split(r"Submitted|;", date)[1]
        tag = article.find_element_by_css_selector('div.tags.is-inline-block').text.replace('\n', ',')

        # not every result carries a DOI tag
        try:
            doi = article.find_element_by_css_selector('div.tags.has-addons').text
            doi = re.split(r'\s', doi)[1]
        except NoSuchElementException:
            doi = 'None'

        all_combined = [title, arxiv_id, arxiv_link, pdf_link, authors, abstract, date, tag, doi]
        dfs.append(all_combined)

    print('Finished Extracting Page:', i)

    # move on to the next page of results (assumes arXiv's "Next" pagination link)
    if i < 38:
        browser.find_element_by_css_selector('a.pagination-next').click()



慕码人2483693
2 Answers

qq_花开花谢_0

The following implementation gets this done in about 16 seconds. To speed up execution, I did the following:

- Removed Selenium entirely (no clicking is needed)
- For the abstract, took BeautifulSoup's output and processed it afterwards
- Added multiprocessing to speed the process up significantly

from multiprocessing import Process, Manager
import requests
from bs4 import BeautifulSoup
import re
import time

start_time = time.time()

def get_no_of_pages(showing_text):
    no_of_results = int(re.findall(r"(\d+,*\d+) results for all", showing_text)[0].replace(',', ''))
    pages = no_of_results // 200 + 1
    print("total pages:", pages)
    return pages

def clean(text):
    return text.replace("\n", '').replace("  ", '')

def get_data_from_page(url, page_number, data):
    print("getting page", page_number)
    response = requests.get(url + "start=" + str(page_number * 200))
    soup = BeautifulSoup(response.content, "lxml")

    arxiv_results = soup.find_all("li", {"class": "arxiv-result"})
    for arxiv_result in arxiv_results:
        paper = {}
        paper["titles"] = clean(arxiv_result.find("p", {"class": "title is-5 mathjax"}).text)
        links = arxiv_result.find_all("a")
        paper["arxiv_ids"] = links[0].text.replace('arXiv:', '')
        paper["arxiv_links"] = links[0].get('href')
        paper["pdf_link"] = links[1].get('href')
        paper["authors"] = clean(arxiv_result.find("p", {"class": "authors"}).text.replace('Authors:', ''))
        # the full abstract is already in the page source after the "▽ More"
        # marker, so splitting the text replaces the click
        split_abstract = arxiv_result.find("p", {"class": "abstract mathjax"}).text.split("▽ More\n\n\n", 1)
        if len(split_abstract) == 2:
            paper["abstract"] = clean(split_abstract[1].replace("△ Less", ''))
        else:
            paper["abstract"] = clean(split_abstract[0].replace("△ Less", ''))
        paper["date"] = re.split(r"Submitted|;", arxiv_result.find("p", {"class": "is-size-7"}).text)[1]
        paper["tag"] = clean(arxiv_result.find("div", {"class": "tags is-inline-block"}).text)
        doi = arxiv_result.find("div", {"class": "tags has-addons"})
        if doi is None:
            paper["doi"] = "None"
        else:
            paper["doi"] = re.split(r'\s', doi.text)[1]
        data.append(paper)
    print(f"page {page_number} done")

if __name__ == "__main__":
    url = 'https://arxiv.org/search/?searchtype=all&query=healthcare&abstracts=show&size=200&order=-announced_date_first&'
    response = requests.get(url + "start=0")
    soup = BeautifulSoup(response.content, "lxml")
    with Manager() as manager:
        data = manager.list()  # shared list so every worker process can append results
        processes = []
        get_data_from_page(url, 0, data)
        showing_text = soup.find("h1", {"class": "title is-clearfix"}).text
        for i in range(1, get_no_of_pages(showing_text)):
            p = Process(target=get_data_from_page, args=(url, i, data))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        print("Number of entries scraped:", len(data))
        stop_time = time.time()
        print("Time taken:", stop_time - start_time, "seconds")

Output:

>>> python test.py
getting page 0
page 0 done
total pages: 10
getting page 1
getting page 4
getting page 2
getting page 6
getting page 5
getting page 3
getting page 7
getting page 9
getting page 8
page 9 done
page 4 done
page 1 done
page 6 done
page 2 done
page 7 done
page 3 done
page 5 done
page 8 done
Number of entries scraped: 1890
Time taken: 15.911492586135864 seconds
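Since the question already imports pandas, a possible follow-up (a minimal sketch, not part of the answer above: it assumes the `data` list from the `__main__` block, and `healthcare_arxiv.csv` is just an example filename) is to copy the shared list into a DataFrame after the processes have joined:

import pandas as pd

# `data` is the manager.list() of paper dicts built above;
# list(...) materializes it before the Manager shuts down
df = pd.DataFrame(list(data))
df.to_csv('healthcare_arxiv.csv', index=False)  # example output path
print(df.shape)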

白衣非少年

You can try the BeautifulSoup approach instead, as per your requirement. There is no need to click the "More" link.

from requests import get
from bs4 import BeautifulSoup

# you can change the size to retrieve all the results in one shot
url = 'https://arxiv.org/search/?query=healthcare&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=0'
response = get(url, verify=False)
soup = BeautifulSoup(response.content, "lxml")

queryresults = soup.find_all("li", attrs={"class": "arxiv-result"})
for result in queryresults:
    title = result.find("p", attrs={"class": "title is-5 mathjax"})
    print(title.text)

# if you need the full abstract content, use this (no need to click the "More" button)
for result in queryresults:
    abstractFullContent = result.find("span", attrs={"class": "abstract-full has-text-grey-dark mathjax"})
    print(abstractFullContent.text)

Output:

Interpretable Deep Learning for Automatic Diagnosis of 12-lead Electrocardiogram
Leveraging Technology for Healthcare and Retaining Access to Personal Health Data to Enhance Personal Health and Well-being
Towards new forms of particle sensing and manipulation and 3D imaging on a smartphone for healthcare applications
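If you need more than one page with this approach, paging works through the `start` query parameter, as in the first answer. A minimal sketch under that assumption (the 38-pages-of-50 figure is taken from the question; only titles are collected here for brevity):

from requests import get
from bs4 import BeautifulSoup

base = ('https://arxiv.org/search/?query=healthcare&searchtype=all'
        '&abstracts=show&order=-announced_date_first&size=50&start={}')

titles = []
for page in range(38):  # 38 pages of 50 results, as in the question
    # each request fetches one page; start = page * page_size
    soup = BeautifulSoup(get(base.format(page * 50)).content, "lxml")
    for result in soup.find_all("li", attrs={"class": "arxiv-result"}):
        titles.append(result.find("p", attrs={"class": "title is-5 mathjax"}).text.strip())

print(len(titles))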