import urllib2
import urllib
import re
from bs4 import BeautifulSoup
def http_web(url_1):
    """Fetch a web page and return its raw body.

    Args:
        url_1: URL of the page to download.

    Returns:
        The response body exactly as returned by ``read()``.

    Errors raised by ``urllib2.urlopen`` are not caught here and propagate
    to the caller.
    """
    request_1 = urllib2.Request(url_1)
    # Send a browser-like User-Agent header with the request.
    request_1.add_header("user-agent", "Mozilla/5.0")
    opendz = urllib2.urlopen(request_1)
    try:
        return opendz.read()
    finally:
        # Always release the connection, even if read() fails.
        opendz.close()
def screen_link_1(readdz):
    """Extract ``http://....html`` links from a page and save them.

    The links are printed and written to ``http.txt`` (one per line) in the
    current working directory. Duplicate links are written only once, in
    first-seen order.

    Args:
        readdz: Text of the downloaded page.

    Returns:
        None. The result is the content of ``http.txt``.
    """
    # The URL body excludes whitespace, quotes and angle brackets so that a
    # single match cannot run across several links sharing one line of HTML.
    found = re.findall(r'http://[^\s"\'<>]+\.html', readdz)

    # Drop duplicates while keeping the order in which links appear.
    seen = set()
    fgzl_1 = []
    for link in found:
        if link not in seen:
            seen.add(link)
            fgzl_1.append(link)

    print(fgzl_1)
    with open('http.txt', 'w') as wj:
        for i, a in enumerate(fgzl_1):
            print('%s : %s' % (i, a))
            wj.write(a + '\n')  # one link per line
def screen_link_2():
    """Expand every link in ``http.txt`` into its numbered follow-up pages.

    For each line of ``http.txt`` the part of the URL up to its last digit is
    taken as a base, and the candidates ``<base>_2.html`` ... ``<base>_49.html``
    are requested one by one. Every candidate that opens without an error is
    appended to ``http_web.txt`` (one URL per line).

    Lines whose URL contains no digit are skipped, because no base can be
    derived from them. HTTP and connection errors are reported on stdout and
    the candidate is skipped.

    Returns:
        None. The result is the content of ``http_web.txt``.
    """
    with open('http.txt', 'r') as wj, open('http_web.txt', 'w') as wj_web:
        for wjh in wj:
            tupian_http = re.findall(r'http://.+\d', wjh)
            if not tupian_http:
                # Nothing to expand; indexing [0] here would raise IndexError.
                continue
            base = tupian_http[0]
            print(base)
            for i in range(2, 50):
                url_2 = base + '_' + str(i) + '.html'
                print(url_2)
                req = urllib2.Request(url_2)
                try:
                    response = urllib2.urlopen(req)
                except urllib2.HTTPError as e:
                    print('网页错误')
                    print(e.code)
                    print(e.reason)
                except urllib2.URLError as e:
                    # Connection-level failure: there is no status code.
                    print('网页错误')
                    print(e.reason)
                else:
                    # The page exists: release the connection, keep the URL.
                    response.close()
                    wj_web.write(url_2 + '\n')
# Get the name of the data on the web page
def s_s(html_string):
    """Parse a page and look up its ``img`` element.

    Args:
        html_string: Markup of the page, parsed with the ``html.parser``
            backend and ``from_encoding='utf-8'``.

    Returns:
        The result of ``find('img')`` on the parsed document. Per the
        Beautiful Soup documentation this is the first matching tag, or
        ``None`` when the page has no such tag.
    """
    parsed_page = BeautifulSoup(
        html_string,
        'html.parser',
        from_encoding='utf-8',
    )
    return parsed_page.find('img')
# Save the file according to the data name
def p_f(string_1):
    """Download the image referenced by a tag into ``<n>.jpg``.

    The file number is taken from the module-level variable ``n``, which the
    calling script must set before this function is used.

    Args:
        string_1: The image tag as returned by ``s_s``, or ``None`` when the
            page contained no image.

    Returns:
        None. Nothing is downloaded when there is no tag or the tag has no
        ``src`` attribute.
    """
    # s_s yields None for pages without an image; skip instead of crashing.
    if string_1 is None:
        print('--未找到图片--')
        return
    src = string_1.get('src')
    if not src:
        print('--未找到图片--')
        return

    i = n  # current file number, maintained by the calling script
    print('--下载图片中--')
    print('%s %s %s' % (string_1.name, src, string_1.get_text()))
    urllib.urlretrieve(src, '%s.jpg' % i)
# Call all of the functions above together
def dywj(http_):
    """Download one page, locate its image and save it.

    Chains ``http_web`` -> ``s_s`` -> ``p_f`` for a single page URL.

    Args:
        http_: URL of the page. Surrounding whitespace is removed, because
            URLs read from a file with ``readline()`` keep their newline.

    Returns:
        None.
    """
    http_string = http_web(http_.strip())
    soup_string = s_s(http_string)
    p_f(soup_string)
# Script body: ask for the start page, collect its links, expand them into
# numbered pages, then download one image per page listed in http_web.txt.
url_http = str(raw_input('#请输入官网网址:')).strip()
a_1 = http_web(url_http)
b_1 = screen_link_1(a_1)
screen_link_2()  # expand each saved link into its numbered follow-up pages
print('----开始抓取图片---')
n = 0  # read by p_f as the number used for the output file name
with open('http_web.txt', 'r') as wj:  # file holding the page URLs
    for wjh in wj:
        wjh = wjh.strip()  # drop the trailing newline before using the URL
        if not wjh:
            continue  # ignore blank lines
        dywj(wjh)
        n += 50
print('#-----抓取图片完成------')