我正在尝试构建一个网络爬虫以从 tsx 页面获取趋势股票。我目前获得了所有趋势链接,现在我正在尝试抓取各个页面上的信息。根据我的代码,当我尝试在 getStockDetails() 中输出“quote_wrapper”时,它返回一个空列表。我怀疑是因为 JavaScript 还没有在页面上呈现?不确定这是不是一回事。无论如何,我试图输出页面上的所有 HTML 进行调试,但也没有看到想要的内容。我读到“渲染”JavaScript 的唯一方法是使用 Selenium 并使用 browser.execute_script("return document.documentElement.outerHTML")。它适用于索引页面,所以我尝试在其他页面上使用它。我也在代码中对此做了注释。如果可以的话,谢谢你的帮助。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from urllib2 import urlopen as uReq
import time
import random
import requests
def getTrendingQuotes(source_code):
    """Collect the href of every trending-quote link on the index page.

    Parameters
    ----------
    source_code : str
        Full HTML of the index page, e.g. as returned by
        ``browser.execute_script("return document.documentElement.outerHTML")``.

    Returns
    -------
    list[str]
        URLs of the <a> tags inside the ``<div id="trendingQuotes">``
        container; an empty list when that container is absent.
    """
    page_soup = soup(source_code, "lxml")
    container = page_soup.find("div", {"id": "trendingQuotes"})
    if container is None:
        # Guard: the original `findAll(...)[0]` raised IndexError
        # whenever the page layout changed or the div was missing.
        return []
    return [link.get("href") for link in container.findAll("a")]
def getStockDetails(url, browser):
    """Scrape the quote widgets from a single stock page.

    The stock pages build their content with client-side JavaScript, so
    reading the DOM immediately after navigation yields an empty
    ``quoteWrapper`` list — the bug the question describes. We therefore
    wait for the page to render before grabbing the HTML.

    Parameters
    ----------
    url : str
        The stock page URL (used here for logging; the caller has
        already navigated ``browser`` to it).
    browser : selenium.webdriver
        A live Selenium driver currently showing ``url``.

    Returns
    -------
    list
        The parsed ``<div class="quoteWrapper">`` elements (may be
        empty if the page still has no such div after rendering).
    """
    print(url)
    # Give the page's JavaScript time to render. A fixed sleep is the
    # simplest fix; selenium's WebDriverWait + expected_conditions on
    # the "quoteWrapper" class would be more robust.
    time.sleep(5)
    # Note: execute_script evaluates in the page the browser is
    # currently on — you cannot address another URL's document from
    # here, which is why the `"return " + url + ".documentElement..."`
    # attempt could never work. `browser.page_source` is an
    # equivalent shortcut for this call.
    source_code = browser.execute_script(
        "return document.documentElement.outerHTML")
    page_soup = soup(source_code, "html.parser")
    quote_wrapper = page_soup.findAll("div", {"class": "quoteWrapper"})
    print(quote_wrapper)
    return quote_wrapper
def trendingBot(browser):
    """Scrape every trending quote from the index page once.

    The original ``while True: ... break`` executed its body exactly one
    time, so the loop is replaced with a single pass.

    NOTE(review): the pasted code lost its indentation, so the original
    ``break`` placement is ambiguous — this reconstruction assumes it
    closed the ``while`` after the ``for`` loop, i.e. all trends are
    visited once. Confirm against the original source.

    Parameters
    ----------
    browser : selenium.webdriver
        A live Selenium driver currently on the trending-quotes index.
    """
    source_code = browser.execute_script(
        "return document.documentElement.outerHTML")
    trending = getTrendingQuotes(source_code)
    for trend in trending:
        browser.get(trend)
        getStockDetails(trend, browser)
相关分类