我正在尝试抓取(web scrape)一个包含多个由 JavaScript 渲染的分页的网站(https://openlibrary.ecampusontario.ca/catalogue/)。我能够获取第一页的内容,但不知道如何让我的脚本点击按钮跳转到后续页面并获取其中的内容。这是我的脚本:
# Scrape the first page of the eCampus Ontario open-library catalogue and
# save one record per textbook to "js-webscrape-2.json".
#
# Relies on names set up earlier in the file: `webdriver`, `webdriver_path`,
# `chrome_options`, `soup`, `time` and `json`.
# NOTE(review): `soup` is presumably an alias for BeautifulSoup -- confirm
# against the file's imports.
# NOTE(review): recent Selenium releases deprecate the `executable_path` and
# `chrome_options` keywords -- confirm against the installed version.
browser = webdriver.Chrome(executable_path=webdriver_path,
                           chrome_options=chrome_options)

url = "https://openlibrary.ecampusontario.ca/catalogue/"

try:
    # Load the catalogue page.
    browser.get(url)
    # Fixed pause so the JavaScript-rendered results have time to appear.
    time.sleep(3)
    page_html = browser.page_source
finally:
    # Always shut the browser down, even if loading the page failed, so no
    # driver/Chrome process is left behind.
    browser.quit()

# Parse the rendered HTML; the browser is no longer needed from here on.
page_soup = soup(page_html, 'lxml')
containers = page_soup.findAll("div", {"class": "result-item tooltip"})

data = []
for container in containers:
    # Look the title heading up once; it supplies both the title text and
    # the link target.
    title_tag = container.find('h4', {'class': 'textbook-title'})
    author_tag = container.find('p', {'class': 'textbook-authors'})

    item = {}
    item['type'] = "Textbook"
    item['title'] = title_tag.text.strip()
    item['author'] = author_tag.text.strip()
    item['link'] = url + title_tag.a["href"]
    item['source'] = "eCampus Ontario"
    item['base_url'] = url
    data.append(item)  # add the item to the list

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open("js-webscrape-2.json", "w", encoding="utf-8") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
相关分类