如何抓取需要点击按钮翻页的网站

我正在尝试抓取一个包含多个 JavaScript 渲染页面的网站(https://openlibrary.ecampusontario.ca/catalogue/)。我能够获取第一页的内容,但不确定如何让脚本点击按钮,以获取后续页面的内容。这是我的脚本。


import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


# Scrape the first page of the eCampus Ontario open-library catalogue with a
# headless Chrome browser and dump the textbook entries to a JSON file.

# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'

# Catalogue landing page; relative book links are resolved against it.
url = "https://openlibrary.ecampusontario.ca/catalogue/"

# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
# Chrome documents this switch as "width,height" (comma-separated).
chrome_options.add_argument('--window-size=1920,1080')

# Fire up the headless browser.
# `options=` is the supported keyword; `chrome_options=` is deprecated.
browser = webdriver.Chrome(executable_path=webdriver_path,
                           options=chrome_options)

try:
    # Load webpage
    browser.get(url)

    # Fixed pause so the page's JavaScript has time to render the results.
    time.sleep(3)

    # Parse the rendered HTML while the browser session is still alive.
    page_soup = soup(browser.page_source, 'lxml')
finally:
    # Always shut the browser down, even when loading or parsing fails,
    # so no orphaned headless Chrome process is left behind.
    browser.quit()

containers = page_soup.findAll("div", {"class": "result-item tooltip"})

data = []
for container in containers:
    # Look the title heading up once; it supplies both the text and the link.
    title_tag = container.find('h4', {'class': 'textbook-title'})
    author_tag = container.find('p', {'class': 'textbook-authors'})

    item = {}
    item['type'] = "Textbook"
    item['title'] = title_tag.text.strip()
    item['author'] = author_tag.text.strip()
    # urljoin handles relative, root-relative and absolute hrefs alike,
    # where plain concatenation is only right for relative ones.
    item['link'] = urljoin(url, title_tag.a["href"])
    item['source'] = "eCampus Ontario"
    item['base_url'] = url
    data.append(item)  # add the item to the list

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open("js-webscrape-2.json", "w", encoding="utf-8") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)


心有法竹
浏览 190回答 2
2回答
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

JavaScript