无法使用 BeautifulSoup 抓取图像 url

我的抓取代码如下:


from bs4 import BeautifulSoup

import re


# Selector descriptors consumed by index further down:
#   [0] tag name, [1] attribute filter passed to find()/findAll(),
#   [2] (where present) the attribute whose value is read from the matched tag.
root_tag=["article",{"class":"story"}]

# NOTE(review): the {"":""} filter is handed to find() as-is; confirm it really
# matches the intended <img> tags on the target page.
image_tag=["img",{"":""},"org-src"]

# header, news_tag and txt_data are defined here but not used in the visible snippet.
header=["h3",{"class":"story-title"}]

news_tag=["a",{"":""},"href"]

txt_data=["p",{"":""}]




import requests

# Two candidate User-Agent strings; only ua2 is placed in the request headers below.
ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'

headers = {'User-Agent': ua2,

           'Accept': 'text/html,application/xhtml+xml,application/xml;' \

                     'q=0.9,image/webp,*/*;q=0.8'}

# Issue the GET through a Session with the headers above.
# NOTE(review): "website-link" is presumably a placeholder for the real page URL.
session = requests.Session()

response = session.get("website-link", headers=headers)

# Response body that is handed to BeautifulSoup further down.
webContent = response.content



# Parse the downloaded page with the lxml parser and collect every element
# matching the root_tag descriptor (tag name + attribute filter).
bs = BeautifulSoup(webContent, 'lxml')

all_tab_data = bs.findAll(root_tag[0], root_tag[1])


# One entry is appended per matched element; the entry may be None.
output=[]

for div in all_tab_data:

    image_url = None

    # Search the element's serialized HTML for the first http(s) URL that ends
    # in a jpg/gif/png/jpeg extension.
    div_img = str(div)

    match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)

    # Debug output of the match object (or None).
    print(match)

    # match = re.search(r"([^\\s]+(\\.(?i)(jpg|png|gif|bmp))$)",div)

    if match != None:

        image_url = str(match.group(0))

    else:

        # Fallback: read the image_tag[2] attribute from the first tag found by
        # the image_tag descriptor.
        # NOTE(review): find() returns None when nothing matches, so the chained
        # .get() raises "AttributeError: 'NoneType' object has no attribute 'get'"
        # for any element without a matching tag.
        image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])

    if image_url !=None:

        # Site-relative path ("/x"): prefix it with main_url.
        # NOTE(review): main_url is not defined anywhere in the visible snippet.
        if image_url[0] == '/' and image_url[1] != '/':

            image_url = main_url + image_url

        # Protocol-relative URL ("//host/x"): replace the leading "//" with "https://".
        if image_url[0] == '/' and image_url[1] == '/':

            image_url="https://" + image_url[2:]

    output.append(image_url)

它只输出了一个 image_url,随后就报错:AttributeError: 'NoneType' object has no attribute 'get'


慕哥6287543
浏览 140回答 1
1回答

扬帆大鱼

您可能应该尝试重用解析库,而不是自己解析这些部分。考虑这种方法:

from bs4 import BeautifulSoup
import re
root_tag =  ["article", {"class":"story"}]
image_tag = ["img", {"":""}, "org-src"]
header =    ["h3", {"class":"story-title"}]
news_tag =  ["a", {"":""}, "href"]
txt_data =  ["p", {"":""}]
# import requests
# ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
# ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
# headers = {'User-Agent': ua2,
#            'Accept': 'text/html,application/xhtml+xml,application/xml;' \
#                      'q=0.9,image/webp,*/*;q=0.8'}
# session = requests.Session()
# response = session.get("https://www.reuters.com/energy-environment", headers=headers)
# webContent = response.content
# file = open('output', 'wb')
# file.write(webContent)
# file.close()
file = open('output', 'r')
webContent = file.read()
bs = BeautifulSoup(webContent, 'html.parser')
all_tab_data = bs.findAll(*root_tag)
output = []
for div in all_tab_data:
    image_url = None
    div_img = str(div)
    article_section = BeautifulSoup(div_img, 'html.parser')
    article_images = article_section.findAll(*image_tag)
    if article_images is not None:
        output.extend([i.get('org-src') for i in article_images if i and i.get('org-src') is not None])
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

Python