猿问

无法使用 Selenium 抓取谷歌图片

我有以下脚本,我希望它可以抓取谷歌图片。它首先单击图像,然后单击下一个(>)按钮以切换到下一个图像。


它下载第一张图片,但是当它轮到第二张图片时,它会抛出一个错误。


Traceback (most recent call last):

  File "c:/Users/intel/Desktop/Scrappr/image_scrape.pyw", line 40, in <module>

    attribute_value = WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'n3VNCb'))).get_attribute("src")

  File "C:\Users\intel\AppData\Local\Programs\Python\Python38\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until

    raise TimeoutException(message, screen, stacktrace)

selenium.common.exceptions.TimeoutException: Message:

我的代码:


import requests

import shutil

import time

import urllib

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup as Soup

from selenium.webdriver.chrome.options import Options

from selenium import webdriver


# User-agent string handed to Chrome via the ``user-agent`` switch below.
# Parenthesised implicit concatenation is used instead of a backslash
# continuation, which breaks if a blank line ends up after the backslash.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/80.0.3987.132 Safari/537.36'
)

# Command-line switches passed to the Chrome instance started by Selenium.
options = Options()
# Uncomment to run Chrome without a visible window:
#options.add_argument("--headless")
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--disable-web-security")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--allow-cross-origin-auth-prompt")

# Start Chrome through the local chromedriver binary and open the image search.
driver = webdriver.Chrome(executable_path=r"C:\Users\intel\Downloads\setups\chromedriver.exe", options=options)
driver.get("https://www.google.com/search?q=mac+beautiful+ui&tbm=isch&ved=2ahUKEwiL3ILMveToAhWGCHIKHVPNAScQ2-cCegQIABAA&oq=mac+beautiful+ui&gs_lcp=CgNpbWcQAzoECAAQQzoCCAA6BQgAEIMBOgYIABAFEB46BggAEAgQHlDPI1iEUWCgU2gAcAB4AIAByAKIAd8dkgEHMC40LjkuM5gBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ei=Q9-TXsuuMoaRyAPTmoe4Ag&bih=657&biw=1360")

# Click the first element carrying the "rg_i" class to open the preview panel.
# find_element(By..., ...) is used because the find_element_by_* helpers were
# removed in Selenium 4; this form works on Selenium 3 as well.
driver.find_element(By.CLASS_NAME, "rg_i").click()


慕标5832272
浏览 73回答 2
2回答

慕盖茨4494581

我已经整理并重构了一些代码。最终结果能够为您选择的关键字抓取 n 个图像(请参阅 SEARCH_TERMS):

import base64
import os
import requests
import time
from io import BytesIO
from PIL import Image
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

CHROME_DRIVER_LOCATION = r'C:\Users\intel\Downloads\setups\chromedriver.exe'
SEARCH_TERMS = ['very', 'hot', 'chicks']
TARGET_SAVE_LOCATION = os.path.join(r'c:\test', '_'.join([x.capitalize() for x in SEARCH_TERMS]), r'{}.{}')
if not os.path.isdir(os.path.dirname(TARGET_SAVE_LOCATION)):
    os.makedirs(os.path.dirname(TARGET_SAVE_LOCATION))

def check_if_result_b64(source):
    possible_header = source.split(',')[0]
    if possible_header.startswith('data') and ';base64' in possible_header:
        image_type = possible_header.replace('data:image/', '').replace(';base64', '')
        return image_type
    return False

def get_driver():
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/80.0.3987.132 Safari/537.36'
    options = Options()
    #options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    new_driver = webdriver.Chrome(executable_path=CHROME_DRIVER_LOCATION, options=options)
    new_driver.get(f"https://www.google.com/search?q={'+'.join(SEARCH_TERMS)}&source=lnms&tbm=isch&sa=X")
    return new_driver

driver = get_driver()
first_search_result = driver.find_elements_by_xpath('//a/div/img')[0]
first_search_result.click()
right_panel_base = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f'''//*[@data-query="{' '.join(SEARCH_TERMS)}"]''')))
first_image = right_panel_base.find_elements_by_xpath('//*[@data-noaft="1"]')[0]
magic_class = first_image.get_attribute('class')
image_finder_xp = f'//*[@class="{magic_class}"]'
# initial wait for the first image to be loaded
# this part could be improved but I couldn't find a proper way of doing it
time.sleep(3)
# initial thumbnail for "to_be_loaded image"
thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")
for i in range(10):
    # issue 4: All image elements share the same class. Assuming that you always click "next":
    # The last element is the base64 encoded thumbnail version is of the "next image"
    # [-2] element is the element currently displayed
    target = driver.find_elements_by_xpath(image_finder_xp)[-2]
    # you need to wait until image is completely loaded:
    # first the base64 encoded thumbnail will be displayed
    # so we check if the displayed element src match the cached thumbnail src.
    # However sometimes the final result is the base64 content, so wait is capped
    # at 5 seconds.
    wait_time_start = time.time()
    while (target.get_attribute("src") == thumbnail_src) and time.time() < wait_time_start + 5:
        time.sleep(0.2)
    thumbnail_src = driver.find_elements_by_xpath(image_finder_xp)[-1].get_attribute("src")
    attribute_value = target.get_attribute("src")
    print(attribute_value)
    # issue 1: if the image is base64, requests get won't work because the src is not an url
    is_b64 = check_if_result_b64(attribute_value)
    if is_b64:
        image_format = is_b64
        content = base64.b64decode(attribute_value.split(';base64')[1])
    else:
        resp = requests.get(attribute_value, stream=True)
        temp_for_image_extension = BytesIO(resp.content)
        image = Image.open(temp_for_image_extension)
        image_format = image.format
        content = resp.content
    # issue 2: if you 'open' a file, later you have to close it. Use a "with" pattern instead
    with open(TARGET_SAVE_LOCATION.format(i, image_format), 'wb') as f:
        f.write(content)
    # issue 3: this Xpath is bad """//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""" if page layout changes, this path breaks instantly
    svg_arrows_xpath = '//div[@jscontroller]//a[contains(@jsaction, "click:trigger")]//*[@viewBox="0 0 24 24"]'
    next_arrow = driver.find_elements_by_xpath(svg_arrows_xpath)[-3]
    next_arrow.click()

米脂

免责声明:我怀疑 Google 是否允许对其搜索结果进行抓取。您应该查看 https://www.google.com/robots.txt 以找出答案。话虽如此,我认为您的 WebDriverWait 方法存在问题,尽管我不确定具体是什么问题。由于您在此之前已经用 time.sleep 让驱动程序等待,因此我只是尝试直接查找该元素,结果它有效:

i = 0
while i < 10:
    i += 1
    time.sleep(5)
    attribute_value = driver.find_element_by_css_selector("img.n3VNCb").get_attribute("src") # NEW LINE
    print(attribute_value)
    resp = requests.get(attribute_value, stream=True)
    local_file = open(r'C:/users/intel/desktop/local_image'+ str(i) + '.jpg', 'wb')
    resp.raw.decode_content = True
    shutil.copyfileobj(resp.raw, local_file)
    del resp
    driver.find_element_by_xpath("""//*[@id="Sva75c"]/div/div/div[3]/div[2]/div/div[1]/div[1]/div/div[1]/a[2]/div""").click()
随时随地看视频慕课网APP

相关分类

Python
我要回答