from lxml import etree import requests #爬取糗事百科 page = eval(input("请输入需要爬取的总页数:")) #print(type(page)) #page = 3 data = "" x = "" for p in range(1,page+1): url = "https://www.qiushibaike.com/8hr/page/{}/".format(p) print(url) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36" } res = requests.get(url, headers = headers) tree = etree.HTML(res.text) all = tree.xpath('//div[@id="content-left"]/div') data += url+'\n' for div in all: author = div.xpath('.//h2/text()') age = div.xpath('.//div[contains(@class, "articleGender")]/text()') content = div.xpath('.//span/text()') xinbie = div.xpath('.//div[contains(@class, "articleGender")]/@class') funny = div.xpath('.//span[@class="stats-vote"]/i/text()') conment = div.xpath('.//a[@class="qiushi_comments"]/i/text()') if xinbie == 'articleGender manIcon': x ='男' elif xinbie =='articleGender womenIcon': x = '女' else: x = "性别不明" up = '作者:' + author[0].strip() + '\t性别:'+ x +'\t年龄:' + str(age) middle = content[0].strip() bottom = '好笑数:'+ funny[0] + '\t评论数:' + conment[0] data+= up + '\n' + middle +'\n'+bottom+'\n' print('author:',author[0],type(author)) print('age:', age[0],type(age)) print('content:', content[0], type(content)) print('funny:', funny[0], type(funny)) print('conment:', conment[0], type(conment)) with open('xiushibaike_spider.txt', 'w', encoding="utf-8") as f: f.write(data)
爬取了5个字段,类型都为list,为什么唯独age[0]提示 IndexError: list index out of range ?
2. xinbie的判断那里,我知道是list和字符串不能比较,怎么改才对?
产品经理不是经理