#coding=utf-8
import urllib
from bs4 import BeautifulSoup
import re
import pymysql.cursors
# Scrape the English Wikipedia main page and store every article link it
# contains (link text plus absolute URL) in the `urls` table of the local
# `wikiurl` MySQL database, echoing each stored pair to stdout.

# `urlopen` lives in different modules on Python 2 and Python 3.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

WIKI_ROOT = 'https://en.wikipedia.org'
# Links ending in .jpg / .JPG are skipped rather than stored.
IMAGE_SUFFIX = re.compile(r"\.(jpg|JPG)$")

resp = urlopen(WIKI_ROOT + '/wiki/Main_Page').read().decode('utf-8')
soup = BeautifulSoup(resp, "html.parser")
# Collect every <a> tag whose href attribute starts with /wiki/
listUrls = soup.find_all("a", href=re.compile(r"^/wiki/"))

# Open the database connection once and reuse it for every row instead of
# reconnecting for each link.
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration or the environment.
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='6322004',
                             db='wikiurl',
                             charset='utf8mb4')
try:
    # Obtain a cursor for the whole batch of inserts.
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`, `urlhref`) values(%s, %s)"
        for url in listUrls:
            href = url['href']
            # Skip URLs ending in .jpg or .JPG
            if IMAGE_SUFFIX.search(href):
                continue
            name = url.get_text()
            full_url = WIKI_ROOT + href
            # Print the entry name and its URL.
            print('%s <======> %s' % (name, full_url))
            # Parameterized insert: the driver escapes both values.
            cursor.execute(sql, (name, full_url))
            # Commit per row so rows already stored survive a later failure.
            connection.commit()
finally:
    # Single close; the connection must not be closed a second time.
    connection.close()
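# Reviewer note: remove the duplicated trailing `finally: connection.close()` block, if it is still present.
# Reviewer note: is the indentation in the original post really like this (every line flush left)?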