
Web scraping text returns an empty set

When using Beautiful Soup's findAll, the code doesn't scrape any text because it returns an empty set. There are other problems in the code after this point, but at this stage I'm trying to solve this first one. I'm very new to this, so I understand the code structure may not be ideal; I come from a VBA background.

import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary  # Adds chromedriver binary to path

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\Users\mmanenica\Documents\chromedriver.exe")

#click the search button on Austenders to return all Awarded Contracts
import time

#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')

#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
    print("Search Button not found")
    exit;
print('Search Button found')
Search_Results.click()

#Pause code to prevent blocking by website
time.sleep(1)
i = 0
Awarded = []

#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        exit;
    i = i + 1
    time.sleep(2)

1 Answer

梦里花落0921

As mentioned, you are not actually feeding the html source into BeautifulSoup. So the first change is from

soup = BeautifulSoup(driver.current_url, features='lxml')

to

soup = BeautifulSoup(driver.page_source, features='lxml')

Second issue: some elements have no <a> tag with class=detail, so you cannot take an href from a NoneType. I added a try/except to skip those cases (though I'm not sure that produces the result you want). You could also drop the class and just write Details_Page = each_Contract.find('a').get('href').

Next, that href is only the path part of the url, so you need to prepend the root: driver.get('https://www.tenders.gov.au' + Details_Page).

I also can't see where class=Contact-Heading is supposed to come from. Likewise, you refer to 'class': 'list-desc-inner' in one place and 'class': 'list_desc_inner' in another; again, I don't see any class=list_desc_inner.

Next: to append a list to a list, you want Awarded.append(Combined), not Awarded.append[Combined]. I also added .strip() in there to clean up some of the whitespace in the text.

In any case, there is still a lot for you to fix and clean up, and I don't know what your expected output should be, but hopefully this gets you started. Also, as mentioned in the comments, you could just click the download button and get the results immediately, but maybe you're doing this for practice...

import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary  # Adds chromedriver binary to path

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")

#click the search button on Austenders to return all Awarded Contracts
import time

#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')

#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
    print("Search Button not found")
    raise SystemExit  # stop the script if the search button is missing
print('Search Button found')
Search_Results.click()

#Pause code to prevent blocking by website
time.sleep(1)
i = 0
Awarded = []

#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        break  # the Next button goes inactive on the last page, so stop paging
    i = i + 1
    time.sleep(2)

    #Loop through all the Detail links on the current Search Results Page
    print("Checking search results page " + str(i))
    print(driver.current_url)
    soup = BeautifulSoup(driver.page_source, features='lxml')

    #Find all Contract detail links in the current search results page
    Details = soup.findAll('div', {'class': 'list-desc-inner'})
    for each_Contract in Details:
        #Loop through each Contract details link and scrape the detailed
        #Contract information page
        try:
            Details_Page = each_Contract.find('a', {'class': 'detail'}).get('href')
            driver.get('https://www.tenders.gov.au' + Details_Page)

            #Scrape all the data in the Awarded Contract page
            #r = requests.get(driver.current_url)
            soup = BeautifulSoup(driver.page_source, features='lxml')

            #find a list of all the Contract Info (contained in the 'Contact-Heading'
            #class of the span element)
            Contract = soup.find_all('span', {'class': 'Contact-Heading'})
            Contract_Info = [span.text.strip() for span in Contract]

            #find a list of all the Summary Contract info which is in the text of
            #the 'list-desc-inner' class
            Sub = soup.find_all('div', {'class': 'list-desc-inner'})
            Sub_Info = [div.text.strip() for div in Sub]

            #Combine the lists into a unified list and append to the Awarded table
            Combined = [Contract_Info, Sub_Info]
            Awarded.append(Combined)

            #Go back to the Search Results page (from the Detailed Contract page)
            driver.back()
        except:
            continue

    #Go to the next Search Page by clicking on the Next button at the bottom of the page
    Next_Page.click()
    #time.sleep(3)

driver.close()
print(len(Awarded))  # a plain list has no .Shape attribute
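As a side note, the script imports pandas but never uses it, and the original print(Awarded.Shape) would fail because a list has no Shape attribute. If the goal is a table, a minimal sketch along these lines might help; the sample rows, the ' | ' separator, the column names, and the output filename below are all made up for illustration, based only on the [Contract_Info, Sub_Info] shape that the loop above builds:

import pandas as pd

# Hypothetical sample in the same shape the loop builds:
# each entry is [Contract_Info, Sub_Info], two lists of stripped strings.
Awarded = [
    [['Agency: Dept A', 'CN ID: CN123'], ['Contract summary text']],
    [['Agency: Dept B', 'CN ID: CN456'], ['Another summary']],
]

# One row per contract; join the scraped fragments so each cell holds a
# single string instead of a Python list.
df = pd.DataFrame(
    [(' | '.join(info), ' | '.join(sub)) for info, sub in Awarded],
    columns=['Contract_Info', 'Sub_Info'],
)
print(df.shape)  # a DataFrame does have a .shape attribute
df.to_csv('awarded_contracts.csv', index=False)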