1. Background
When writing crawlers in Python you frequently run into the contest between crawlers and anti-crawler measures. High-intensity, high-frequency scraping generally puts heavy load on the target site's servers and degrades their performance, so an IP that keeps requesting pages from the same site is likely to be banned by the administrator fairly quickly. We can therefore build our own proxy IP pool and keep rotating the IP used for crawling, so that a single blocked address does not interrupt the crawl. The scraped proxy addresses are checked, and the usable ones are stored in MySQL/Redis/MongoDB/Memcache; whenever a proxy is needed later, it is fetched directly from this private pool.
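For example, once a working proxy has been fetched back from the pool, it only needs to be passed to requests via its proxies parameter. The snippet below is a minimal sketch; the proxy address and target URL are placeholders:

import requests

# placeholder proxy in the same {"protocol": "protocol://ip:port"} form produced by spider.py below
proxies = {"http": "http://1.2.3.4:8080"}
response = requests.get("http://www.baidu.com", proxies=proxies, timeout=5)
print(response.status_code)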
2. Reference material
2.1 Python modules used
requests: sends the HTTP requests that fetch the pages
BeautifulSoup: parses the returned HTML and extracts the fields we need
configparser: reads db.conf and supplies the connection settings
pymysql: MySQL operations
redis: Redis operations
pymongo: MongoDB operations
memcache: Memcache operations
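Before running the examples below, the third-party packages have to be installed. A minimal install sketch (assuming pip; note that BeautifulSoup is published as beautifulsoup4 and the memcache module used here as python-memcached):

pip install requests beautifulsoup4 pymysql redis pymongo python-memcached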
2.2 Related links
For Redis, see: Redis-3.2 master-slave replication and cluster setup
For MongoDB, see: MongoDB basics
For Memcached, see: Memcached install script (with auto-start on boot)
For basic Python crawling, see: Using Python to search 51CTO recommended blogs and save the results to Excel
3. Code examples
3.1 GitHub repository
PROXIES
3.2 Code
a.spider.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import requests
from bs4 import BeautifulSoup
import random


class GetProxyIP:
    def __init__(self, page=10):
        self._page = page
        self.url_head = 'http://www.xicidaili.com/wt/'

    def get_ip(self):
        """
        get source proxy ip pool
        :return: res_pool list
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        res_pool = []
        for pagenum in range(1, self._page + 1):  # crawl pages 1..page
            url = self.url_head + str(pagenum)
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")
            soup_tr = soup.find_all('tr')
            for item in soup_tr:
                try:
                    soup_td = item.find_all('td')
                    # assemble the proxy entry from the table cells: protocol://ip:port
                    res_pool.append(soup_td[5].text.lower() + '://' + soup_td[1].text + ':' + soup_td[2].text)
                except IndexError:
                    pass
        return res_pool

    def right_proxies(self, res_pool):
        """
        check available ip
        :param res_pool:
        :return: right_pool list
        """
        right_pool = []
        for ip in res_pool:
            # https proxies go under the 'https' key, the rest under 'http'
            if 'https' in ip:
                proxies = {'https': ip}
            else:
                proxies = {'http': ip}
            check_urllist = ['http://www.baidu.com', 'http://www.taobao.com', 'https://cloud.tencent.com/']
            try:
                response = requests.get(random.choice(check_urllist), proxies=proxies, timeout=1)
                # keep the proxy only if the test request comes back OK
                if response.status_code == 200:
                    right_pool.append(proxies)
                    print('add ip %s' % proxies)
            except Exception:
                continue
        return right_pool


if __name__ == '__main__':
    # instantiate the class; the number of pages to crawl can be passed in
    proxyhelper = GetProxyIP(2)
    res_pool = proxyhelper.get_ip()
    proxy_ip = proxyhelper.right_proxies(res_pool)
    print(proxy_ip)
b.db.conf
[mysql]
HOST = 172.20.6.100
PORT = 3306
USER = root
PASSWD = mysqladmin
DB = pydb
TABLE = pytab
CHARSET = utf8

[redis]
HOST = 172.20.6.100
PORT = 6379
PASSWD = redisadmin

[memcache]
HOST = 172.20.6.100
PORT = 11211

[mongodb]
HOST = 172.20.6.100
PORT = 27017
DB = db1
USER = mongoadmin
PASSWD = mongopwd
c.save_mysql.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import pymysql
import configparser
import spider


class MysqlOper:
    def __init__(self, result_list):
        # read the MySQL connection settings from db.conf
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['mysql']['HOST']
        self.port = int(config['mysql']['PORT'])
        self.user = config['mysql']['USER']
        self.passwd = config['mysql']['PASSWD']
        self.db = config['mysql']['DB']
        self.table = config['mysql']['TABLE']
        self.charset = config['mysql']['CHARSET']
        self.result_list = result_list

    def mysql_save(self):
        # connect and create a cursor
        try:
            DB = pymysql.connect(host=self.host, user=self.user, password=self.passwd,
                                 database=self.db, port=self.port, charset=self.charset)
            cursor = DB.cursor()
        except Exception as e:
            print("connect dbserver fail, please see information:")
            print(e)
            exit(1)
        # check whether the table already exists, and create it if not
        cursor.execute('show tables in pydb')
        tables = cursor.fetchall()
        flag = True
        for tab in tables:
            if self.table in tab:
                flag = False
                print('%s is exist' % self.table)
        if flag:
            # create the pytab table
            cursor.execute(
                '''create table pytab (id int unsigned not null primary key auto_increment,
                   protocol varchar(10), content varchar(50))''')
        # write the collected proxy IPs into MySQL
        for values in self.result_list:
            for prot, cont in values.items():
                try:
                    cursor.execute("insert into pytab (protocol, content) values (%s, %s);", [prot, cont])
                except Exception as e:
                    print("insert into db raised an error:", e)
        DB.commit()
        DB.close()


if __name__ == "__main__":
    proxyhelper = spider.GetProxyIP(3)
    res_pool = proxyhelper.get_ip()
    proxy_ip = proxyhelper.right_proxies(res_pool)
    dbhelper = MysqlOper(proxy_ip)
    dbhelper.mysql_save()
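save_mysql.py above only writes proxies into MySQL. A minimal retrieval sketch is shown below (a hypothetical mysql_gain helper, assuming the pytab table created above; it picks one stored proxy at random and rebuilds the proxies dict):

import pymysql

def mysql_gain(host, user, passwd, db, port=3306, charset='utf8'):
    # pick one stored proxy at random from the pytab table written by MysqlOper
    conn = pymysql.connect(host=host, user=user, password=passwd,
                           database=db, port=port, charset=charset)
    cursor = conn.cursor()
    cursor.execute("select protocol, content from pytab order by rand() limit 1")
    row = cursor.fetchone()
    conn.close()
    # rebuild the {"protocol": "protocol://ip:port"} dict expected by requests
    return {row[0]: row[1]} if row else None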
d.save_redis.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import redis
import random
import configparser
import spider


class RedisOper:
    def __init__(self):
        """
        initialize Redis connection information
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['redis']['HOST']
        self.port = int(config['redis']['PORT'])
        self.passwd = config['redis']['PASSWD']
        self.pool = redis.ConnectionPool(host=self.host, port=self.port, password=self.passwd)
        self.redis_helper = redis.Redis(connection_pool=self.pool)
        self.pipe = self.redis_helper.pipeline(transaction=True)

    def redis_save(self, result_list):
        """
        save data
        :return: None
        """
        for num, cont in enumerate(result_list):
            # Redis values must be strings/bytes, so store the proxies dict as a string
            self.redis_helper.set(num, str(cont))
        self.pipe.execute()

    def redis_gain(self):
        """
        gain data
        :return: proxies
        """
        # pick a random key; keys beyond the number of saved proxies return None
        num = random.randint(0, 10)
        ip = self.redis_helper.get(num)
        self.pipe.execute()
        return ip


if __name__ == '__main__':
    proxyhelper = spider.GetProxyIP(2)
    res_pool = proxyhelper.get_ip()
    proxy_ip = proxyhelper.right_proxies(res_pool)
    dbhelper = RedisOper()
    dbhelper.redis_save(proxy_ip)
    ip = dbhelper.redis_gain()
    print(ip)
e.save_mongodb.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import configparser
import spider
from pymongo import MongoClient


class MongodbOper:
    def __init__(self):
        """
        initialize MongoDB connection information
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['mongodb']['HOST']
        self.port = config['mongodb']['PORT']
        self.db = config['mongodb']['DB']
        self.user = config['mongodb']['USER']
        self.pwd = config['mongodb']['PASSWD']
        # authenticate while connecting (the separate authenticate() call was removed in newer pymongo)
        self.client = MongoClient(self.host, int(self.port),
                                  username=self.user, password=self.pwd)
        self.DB = self.client[self.db]
        self.collection = self.DB.myset

    def mongodb_save(self, result_list):
        """
        save data
        :return: None
        """
        for values in result_list:
            # insert_one replaces the deprecated insert()
            self.collection.insert_one(values)

    def mongodb_gain(self):
        """
        gain data
        :return: proxies
        """
        ip = self.collection.find_one()
        return ip


if __name__ == '__main__':
    proxyhelper = spider.GetProxyIP(2)
    res_pool = proxyhelper.get_ip()
    proxy_ip = proxyhelper.right_proxies(res_pool)
    dbhelper = MongodbOper()
    dbhelper.mongodb_save(proxy_ip)
    ip = dbhelper.mongodb_gain()
    print(ip)
f.save_memcache.py
#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch
import memcache
import random
import configparser
import spider


class MemcacheOper:
    def __init__(self):
        """
        initialize Memcache connection information
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['memcache']['HOST']
        self.port = config['memcache']['PORT']
        self.mcoper = memcache.Client([self.host + ':' + self.port], debug=True)

    def memcache_save(self, result_list):
        """
        save data
        :return: None
        """
        for num, cont in enumerate(result_list):
            self.mcoper.set(str(num), cont)

    def memcache_gain(self):
        """
        gain data
        :return: proxies
        """
        # pick a random key; keys beyond the number of saved proxies return None
        num = random.randint(0, 10)
        ip = self.mcoper.get(str(num))
        return ip


if __name__ == '__main__':
    proxyhelper = spider.GetProxyIP(2)
    res_pool = proxyhelper.get_ip()
    proxy_ip = proxyhelper.right_proxies(res_pool)
    dbhelper = MemcacheOper()
    dbhelper.memcache_save(proxy_ip)
    ip = dbhelper.memcache_gain()
    print(ip)