一. 概要
1.通过python爬虫循环爬取古诗词网站古诗名句
2.落地到本地数据库
二. 页面分析
首先通过 Firebug(Firefox 调试插件)进行页面元素定位:
其次源码定位:
最终使用 lxml 的 etree,通过 XPath 定位 div 标签,代码如下:
# Excerpt of the parsing step: build an lxml tree from the downloaded page
# and walk every quote container matched by the XPath below.
tree = etree.HTML(data)
quote_xpath = '//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'
for node in tree.xpath(quote_xpath):
    # Text of the anchors directly under the container: the first one is
    # stored as the quote, the last one as its origin.
    anchor_texts = node.xpath('a/text()')
    record = {
        'content': anchor_texts[0],
        'origin': anchor_texts[-1],
        'createTime': str(date.today()),
    }
    self.db.add_new_row('mingJuSpider', record)
三. 执行结果
四. 脚本源码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Date : 2017/12/21 12:35
@Author : kaiqing.huang
@File : mingJuSpider.py
'''
from utils import MySpider, MongoBase
from datetime import date
from lxml import etree
import sys
class mingJuSpider():
    """Crawl the paged quote listing on so.gushiwen.org and store each quote.

    Pages are fetched through ``MySpider`` and rows are written through
    ``MongoBase``; both come from the project's ``utils`` module.
    """

    def __init__(self):
        """Create the storage helper and the HTTP helper used by the crawl."""
        self.db = MongoBase()
        self.spider = MySpider()

    def download(self, first_page=1, last_page=116):
        """Fetch listing pages ``first_page``..``last_page`` (inclusive) and parse them.

        The defaults reproduce the originally hard-coded ``range(1, 117)``.
        A page is skipped when ``self.spider.get`` returns a falsy value.

        :param first_page: number of the first listing page to request.
        :param last_page: number of the last listing page to request.
        """
        for page_id in range(first_page, last_page + 1):
            url = 'http://so.gushiwen.org/mingju/Default.aspx?p={}&c=&t='.format(page_id)
            # Parenthesised single-argument form prints the same text under
            # Python 2 and is also valid Python 3 syntax.
            print(url)
            data = self.spider.get(url)
            if data:
                self.parse(data)

    def parse(self, data):
        """Extract every quote from one listing page and store it.

        For each matched quote container, the text of its first child anchor
        is stored as ``content`` and the text of its last child anchor as
        ``origin``, together with today's date as ``createTime``.

        :param data: HTML source of one listing page.
        """
        response = etree.HTML(data)
        if response is None:
            # lxml may produce no tree at all for an empty document.
            return
        # The date does not depend on the row, so compute it once per page.
        created = str(date.today())
        for row in response.xpath('//div[@class="left"]/div[@class="sons"]/div[@class="cont"]'):
            texts = row.xpath('a/text()')
            if not texts:
                # A container without anchor text (e.g. an ad or a layout
                # change) must not abort the whole crawl with an IndexError.
                continue
            # First argument is presumably the target collection name --
            # confirm against MongoBase.add_new_row.
            self.db.add_new_row(
                'mingJuSpider',
                {'content': texts[0], 'origin': texts[-1], 'createTime': created},
            )
if __name__ == '__main__':
    # Script entry point: raise the interpreter's recursion limit, then run
    # the full crawl with a freshly constructed spider.
    sys.setrecursionlimit(100000)
    mingJuSpider().download()