我的抓取代码是。
from bs4 import BeautifulSoup
import re
root_tag=["article",{"class":"story"}]
image_tag=["img",{"":""},"org-src"]
header=["h3",{"class":"story-title"}]
news_tag=["a",{"":""},"href"]
txt_data=["p",{"":""}]
import requests
ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
headers = {'User-Agent': ua2,
'Accept': 'text/html,application/xhtml+xml,application/xml;' \
'q=0.9,image/webp,*/*;q=0.8'}
session = requests.Session()
response = session.get("website-link", headers=headers)
webContent = response.content
bs = BeautifulSoup(webContent, 'lxml')
all_tab_data = bs.findAll(root_tag[0], root_tag[1])
output=[]
for div in all_tab_data:
image_url = None
div_img = str(div)
match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)
print(match)
# match = re.search(r"([^\\s]+(\\.(?i)(jpg|png|gif|bmp))$)",div)
if match != None:
image_url = str(match.group(0))
else:
image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])
if image_url !=None:
if image_url[0] == '/' and image_url[1] != '/':
image_url = main_url + image_url
if image_url[0] == '/' and image_url[1] == '/':
image_url="https://" + image_url[2:]
output.append(image_url)
它只给出一个 image_url,然后给出错误 AttributeError: 'NoneType' object has no attribute 'get'
扬帆大鱼
相关分类