求大佬指点哈!Python中for循环中列表切片问题

这个程序是抽取豆瓣top250页面所有电影相关信息(名称,分数,影评人数,引用语)。
问题是在parse_page函数中,top250共十个页面,成功提取前八页的信息,但最后两页的信息提取有问题提示listindexoutofrange,但此数据在for中有显示,for之外调用时就出错。求解。
importsocket
importssl
deflog(*args,**kwargs):
print('log:',*args,**kwargs)
defparse_url(url):
#提取协议与uri
protocol=url.split('://')[0]
ifprotocol=='http':
protocol='http'
uri=url.split('://')[1]
elifprotocol=='https':
protocol='https'
uri=url.split('://')[1]
else:
uri=url
#提取主机地址
index=uri.find('/')
ifindex==-1:
host=uri
else:
host=uri.split('/')[0]
#提取端口号
http_ports={
'http':80,
'https':443,
}
ifprotocolinhttp_ports:
port=http_ports[protocol]
else:
port=uri.split(':')[1]
#提取路径
ifindex==-1:
path='/'
else:
path='/'+uri.split('/')[1]
returnprotocol,host,port,path
defsocket_by_protocol(protocol):
ifprotocol=='http':
s=socket.socket()
elifprotocol=='https':
s=ssl.wrap_socket(socket.socket())
returns
defresponse_by_socket(s):
buffer_size=1024
all_data=b''
whileTrue:
response=s.recv(buffer_size)
iflen(response)==0:
break
all_data+=response
returnall_data.decode()
defparse_response(response):
errors=''
ifresponse:
header,body=response.split('\r\n\r\n',1)
header_line=header.split('\r\n')
status_code=header_line[0].split()[1]
headers={}
forlineinheader_line[1:]:
k,v=line.split(':')
headers[k]=v
else:
errors='responseisnullvalue.'
headers={}
body=''
returnstatus_code,headers,body
defconstruct_request(host,path):
request='GET{}HTTP/1.1\r\nhost:{}\r\nconnection:close\r\n\r\n'.format(path,host)
returnrequest.encode()
defget(url,query):
protocol,host,port,path=parse_url(url)
s=socket_by_protocol(protocol)
s.connect((host,port))
cons_path='{}?{}={}'.format(path,query[1],query[0])
request=construct_request(host,cons_path)
s.send(request)
response=response_by_socket(s)
status_code,header,body=parse_response(response)
returnstatus_code,header,body
defparse_page(source=''):
mv_name=[]
mv_score=[]
mv_people=[]
mv_quot=[]
first_split=str(source.split('').pop(1))
second_split=str(first_split.split('').pop(0))
third_split=second_split.split('')
delthird_split[0]
forlineinthird_split:
line=line.split('')
delline[1]
#名称抽取
raw_single_mv_name=line[0].split('')[0].split('')[1]
single_mv_name=raw_single_mv_name.split('')[0]
mv_name.append(single_mv_name)
#分数与评价人数抽取
raw_single_mv_evaluate=line[0].split('')[1].split('')
single_mv_score=raw_single_mv_evaluate[1].split('">')[1]
mv_score.append(single_mv_score)
single_mv_people=raw_single_mv_evaluate[3].split('')[1]
mv_people.append(single_mv_people)
#引用语抽取
#log(mv_name,mv_score,mv_people,line[0])
#log(line[0].split('')[1])
raw_singe_mv_quot=line[0].split('')[1]
#log(raw_singe_mv_quot)
single_mv_quot=raw_singe_mv_quot.split('')[0]
#log(single_mv_quot)
mv_quot.append(single_mv_quot)
#此处mv_quot有值
log(mv_quot)
#为何这里mv_quot提示listindexoutofrange
log(mv_quot)
#log(len(mv_name),len(mv_score),len(mv_people),len(mv_quot))
returnmv_name,mv_score,mv_people,mv_quot
defmain():
url="https://movie.douban.com/top250"
protocol,host,port,path=parse_url(url)
log(protocol,host,port,path)
queries={}
forvin[valueforvalueinrange(250,0,-25)]:
queries[v]='start'
log(queries)
i=0
forqinqueries.items():
try:
status_code,header,body=get(url,q)
"""
ifi==8:
log(status_code,header,body)
"""
mvo_name,mvo_score,mvo_people,mvo_quot=parse_page(source=body)
#log(mvo_name)
#log(mvo_score)
#log(mvo_people)
log(mvo_quot)
i+=1
exceptExceptionase:
log(e)
continue
if__name__=='__main__':
main()
沧海一幻觉
浏览 331回答 2
2回答

四季花海

这行代码有问题raw_singe_mv_quot=line[0].split('')[1]拆开解释tmp_list=line[0].split('')raw_singe_mv_quot=tmp_list[1]tmp_list这个列表的长度可能为1,所以tmp_list[1]会报错误。具体逻辑我也没看,你自己排查吧!
打开App,查看更多内容
随时随地看视频慕课网APP

相关分类

JavaScript