I'm scraping an HTML table and want to write the URL from the last column into Excel as a clickable hyperlink. How do I do that? Right now the extracted URL is written to Excel as plain text.
Collecting the page URLs into the link_data list:
links = soup.find_all('a', href=re.compile(r"/permitExt/xkgkAction!xkgk"))
for link in links:
    link_data.append(urlparse.urljoin(root_url, link['href']))
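For reference, urljoin simply resolves the page's relative href against root_url to produce an absolute address; with a hypothetical dataid the mapping looks like this:

import urllib.parse as urlparse
root_url = 'http://permit.mep.gov.cn/'
href = '/permitExt/xkgkAction!xkgk.action?dataid=123'  # hypothetical href for illustration
print(urlparse.urljoin(root_url, href))
# -> http://permit.mep.gov.cn/permitExt/xkgkAction!xkgk.action?dataid=123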
Writing the URLs into the Excel sheet:
for i in range(2, len(data)+2):
    sheet.cell(row=i, column=9, value=(link_data[i-2]))
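The value argument is always written verbatim as the cell's content, so a URL string stays plain text. In openpyxl a clickable link is set through the cell's hyperlink attribute, separately from the value; applying the built-in "Hyperlink" named style additionally gives the usual blue underlined look. A minimal sketch of the loop rewritten this way, keeping the URL itself as the display text:

for i in range(2, len(data) + 2):
    cell = sheet.cell(row=i, column=9, value=link_data[i - 2])  # displayed text
    cell.hyperlink = link_data[i - 2]  # actual click target
    cell.style = "Hyperlink"           # built-in style: blue + underlined

The value could just as well be a short label such as '查看' while hyperlink carries the full URL.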
The full code is below. The scraped data and links are all correct; the problem is only that the links in the Excel file come out as plain text rather than hyperlinks.
# coding=utf-8
import urllib.request
import urllib.parse as urlparse
import re
from bs4 import BeautifulSoup
from openpyxl import *


def CementSpider(end_page):
    '''
    Crawl the site's result table
    :param end_page:
    :return:
    '''
    data = []       # scraped table text
    link_data = []  # detail-page links
    root_url = 'http://permit.mep.gov.cn/'
    # iterate over the result pages
    for page_num in range(1, end_page + 1):
        page_num = str(page_num)
        url = 'http://permit.mep.gov.cn/permitExt/outside/Publicity?pageno=%s' % (page_num)
        search = "&enterName=&publishtime=&province=&city=&treadcode=C301,C3011,C3021&treadname=" + urllib.request.quote("水泥、石灰和石膏制造,水泥制造,水泥制品制造")
        resp = urllib.request.urlopen(url + search)
        if resp.getcode() == 200:
            print("Fetched page %s successfully!" % (page_num))
        else:
            print("Failed to fetch page %s /(ㄒoㄒ)/~~" % (page_num))
        html_doc = resp.read()
        soup = BeautifulSoup(html_doc, "html.parser")
        try:
            # <div class="tb-con">
            trs = soup.find('div', class_="tb-con").find_all('tr')
            # collect this page's table rows
            data1 = []
            for tr in trs:
                row_data = []
                for td in tr:
                    row_data.append(td.string)
                data1.append(row_data)
            data += data1[1:]
        except:
            print("No table data on page %s /(ㄒoㄒ)/~~" % (page_num))
        try:
            # <td class="bgcolor1">
            links = soup.find_all('a', href=re.compile(r"/permitExt/xkgkAction!xkgk"))
            for link in links:
                link_data.append(urlparse.urljoin(root_url, link['href']))
        except:
            print("No detail links on page %s /(ㄒoㄒ)/~~" % (page_num))
    return data, link_data


def write_Excel(data, link_data):
    '''
    Write the scraped data into an Excel sheet
    :param data:
    :return:
    '''
    wb = Workbook()
    sheet = wb.active
    sheet.title = "sheet1"
    # header row of the output sheet
    sheet.cell(row=1, column=1, value='编号')
    sheet.cell(row=1, column=2, value='省/直辖市')
    sheet.cell(row=1, column=3, value='地市')
    sheet.cell(row=1, column=4, value='许可证编号')
    sheet.cell(row=1, column=5, value='单位名称')
    sheet.cell(row=1, column=6, value='行业类别')
    sheet.cell(row=1, column=7, value='有效期限')
    sheet.cell(row=1, column=8, value='发证日期')
    sheet.cell(row=1, column=9, value='查看链接')
    for i in range(2, len(data) + 2):
        sheet.cell(row=i, column=1, value=i - 1)
        if data[i-2][16] == '\n':
            sheet.cell(row=i, column=9, value=(link_data[i-2]))
        for j in range(1, 8):
            try:
                sheet.cell(row=i, column=j+1, value=data[i-2][2*j-1])
            except:
                print('A cell in this row is blank /(ㄒoㄒ)/~~')
    wb.save("水泥制造.xlsx")  # save the workbook


def end_page():
    '''
    Get the index of the last page
    :return:
    '''
    url = 'http://permit.mep.gov.cn/permitExt/outside/Publicity?&enterName=&province=&city=&treadcode=C301,C3011,C3021&treadname=%E6%B0%B4%E6%B3%A5%E3%80%81%E7%9F%B3%E7%81%B0%E5%92%8C%E7%9F%B3%E8%86%8F%E5%88%B6%E9%80%A0,%E6%B0%B4%E6%B3%A5%E5%88%B6%E9%80%A0,%E6%B0%B4%E6%B3%A5%E5%88%B6%E5%93%81%E5%88%B6%E9%80%A0'
    response = urllib.request.urlopen(url)
    html_page = response.read()
    soup_page = BeautifulSoup(html_page, "html.parser")
    page_node = soup_page.find('div', class_='page')
    end_page = int(re.findall(r'\d+', page_node.get_text())[0])
    return end_page


if __name__ == "__main__":
    end_page = end_page()
    data, link_data = CementSpider(end_page)
    write_Excel(data, link_data)
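An alternative that avoids the hyperlink attribute entirely is to write an Excel HYPERLINK formula as the cell value; Excel evaluates it and renders a clickable cell. A sketch of the write loop in this style, where the '查看' display label is arbitrary:

for i in range(2, len(data) + 2):
    # Excel evaluates =HYPERLINK(target, label) into a clickable cell
    sheet.cell(row=i, column=9,
               value='=HYPERLINK("%s", "%s")' % (link_data[i - 2], '查看'))

This assumes the URLs contain no double quotes, which would break the formula string.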