这里的代码并不是最新的,请到 https://github.com/derekhe/bike-crawler 获取最新代码。
该爬虫为单车地图的 Python 演示代码,具备以下功能:
支持ofo和摩拜
多线程爬取
自动去重
按照ofo和摩拜输出对应的csv文件,存放在db/【日期】/【日期】-【时间】-【品牌】.csv文件内
运行环境:
Python3
运行前请联系微信bcdata获取token,内置的token为演示用,单车位置是真实的,ID是随机的。
运行:
pip3 install -r requirements.txt
python3 crawler.py
这里的代码并不是最新的,请到https://github.com/derekhe/bike-crawler获取最新代码
import datetime
import json
import os
import os.path
import sqlite3
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing

import numpy as np
import pandas as pd
import requests


class Crawler:
    """Crawl shared-bike positions over a lat/lng grid and export them to CSV.

    Each grid point is queried on a worker thread. Results are de-duplicated
    through a temporary SQLite database (primary key: bikeId, lat, lon) and
    finally written to one CSV file per brand under ``./db/<date>/``.
    """

    def __init__(self):
        """Prepare the output directory, file name prefix and counters."""
        # Use one timestamp so the directory and the file name can never
        # disagree when the run starts right around midnight.
        now = datetime.datetime.now()
        self.start_time = now
        self.csv_path = "./db/" + now.strftime("%Y%m%d")
        os.makedirs(self.csv_path, exist_ok=True)
        self.csv_name = self.csv_path + "/" + now.strftime("%Y%m%d-%H%M%S")
        self.db_name = "./temp.db"
        # Serializes database writes and counter updates across workers.
        self.lock = threading.Lock()
        self.total = 0
        self.done = 0
        self.bikes_count = 0

    def get_nearby_bikes(self, args):
        """Query the bikes around one grid point.

        Args:
            args: Tuple ``(lat, lng, cityid, token)``.

        Any exception is printed rather than raised: this runs inside a
        thread pool, where an unhandled exception would otherwise be
        silently stored on the discarded future.
        """
        try:
            url = "http://www.dancheditu.com:3000/bikes?lat=%s&lng=%s&cityid=%s&token=%s" % (
                args[0], args[1], args[2], args[3])

            # NOTE(review): the 'host' header names mwx.mobike.com while the
            # URL targets www.dancheditu.com; kept as in the original --
            # confirm whether the server depends on it.
            headers = {
                'charset': "utf-8",
                'platform': "4",
                'content-type': "application/x-www-form-urlencoded",
                'user-agent': "MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN",
                'host': "mwx.mobike.com",
                'connection': "Keep-Alive",
                'accept-encoding': "gzip",
                'cache-control': "no-cache"
            }

            self.request(headers, args, url)
        except Exception as ex:
            print(ex)

    def request(self, headers, args, url):
        """Fetch one URL, store the returned bikes and print progress.

        Args:
            headers: HTTP headers to send.
            args: The ``(lat, lng, cityid, token)`` tuple, used only for the
                progress message.
            url: Fully built request URL.

        Raises:
            requests.RequestException: If the HTTP request itself fails.
                Errors while parsing or storing the response are printed
                and not raised.
        """
        response = requests.request(
            "GET", url, headers=headers,
            timeout=30, verify=False
        )

        with self.lock:
            # Count every received response, even one that fails to parse,
            # so the progress figure can actually reach 100%.
            self.done += 1
            # sqlite3's own context manager only commits/rolls back; wrap it
            # in closing() so the connection is released after each response.
            with closing(sqlite3.connect(self.db_name)) as conn:
                with conn:
                    try:
                        decoded = json.loads(response.text)['msg']
                        now_ms = int(time.time()) * 1000

                        for x in decoded:
                            self.bikes_count += 1
                            # Table name comes from a fixed pair, never from
                            # the response; the values are bound parameters.
                            table = 'ofo' if x['brand'] == 'ofo' else 'mobike'
                            conn.execute(
                                "INSERT OR IGNORE INTO %s VALUES (?,?,?,?)" % table,
                                (
                                    now_ms,
                                    str(x['id']),
                                    # Six decimals matches the precision the
                                    # rows were previously stored with, which
                                    # keeps the de-duplication key stable.
                                    round(float(x['lat']), 6),
                                    round(float(x['lng']), 6),
                                ),
                            )

                        timespent = datetime.datetime.now() - self.start_time
                        percent = self.done / self.total
                        total = timespent / percent
                        print("位置 %s, 单车数量 %s, 进度 %0.2f%%, 速度 %0.2f个/分钟, 总时间 %s, 剩余时间 %s" % (
                            args, self.bikes_count, percent * 100,
                            self.done / timespent.total_seconds() * 60,
                            total, total - timespent))
                    except Exception as ex:
                        print(ex)

    def start(self, config):
        """Run a full crawl over the configured area, then export to CSV.

        Args:
            config: Mapping with keys ``left_lng``, ``right_lng``,
                ``top_lat``, ``bottom_lat``, ``offset``, ``cityid``,
                ``workers`` and ``token``.
        """
        # Always start from an empty temporary database.
        if os.path.isfile(self.db_name):
            os.remove(self.db_name)

        try:
            with closing(sqlite3.connect(self.db_name)) as conn:
                with conn:
                    conn.execute(self.generate_create_table_sql('ofo'))
                    conn.execute(self.generate_create_table_sql('mobike'))
        except sqlite3.Error as ex:
            print(ex)

        print("Start")

        # Build the whole grid first so self.total is final before any worker
        # computes a progress percentage from it.
        lat_range = np.arange(config['top_lat'], config['bottom_lat'], -config['offset'])
        lng_range = np.arange(config['left_lng'], config['right_lng'], config['offset'])
        points = [(lat, lon) for lat in lat_range for lon in lng_range]
        self.total = len(points)

        # Leaving the with-block waits for all submitted tasks to finish.
        with ThreadPoolExecutor(max_workers=config['workers']) as executor:
            for lat, lon in points:
                executor.submit(self.get_nearby_bikes, (lat, lon, config['cityid'], config['token']))

        self.group_data()

    def generate_create_table_sql(self, brand):
        """Return the CREATE TABLE statement for the given brand's table."""
        return '''CREATE TABLE {0}
            (
                "Time" DATETIME,
                "bikeId" VARCHAR(12),
                lat DOUBLE,
                lon DOUBLE,
                CONSTRAINT "{0}_bikeId_lat_lon_pk"
                    PRIMARY KEY (bikeId, lat, lon)
            );'''.format(brand)

    def group_data(self):
        """Export both brand tables from the temporary database to CSV."""
        print("正在导出数据")
        with closing(sqlite3.connect(self.db_name)) as conn:
            self.export_to_csv(conn, "mobike")
            self.export_to_csv(conn, "ofo")

    def export_to_csv(self, conn, brand):
        """Write one brand's table to ``<csv_name>-<brand>.csv``.

        Args:
            conn: Open sqlite3 connection to the temporary database.
            brand: Table name, ``"mobike"`` or ``"ofo"``.
        """
        df = pd.read_sql_query("SELECT * FROM %s" % brand, conn, parse_dates=True)
        # Stored timestamps are epoch milliseconds; convert them to local
        # (Asia/Chongqing) time for the CSV.
        df['Time'] = pd.to_datetime(df['Time'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Asia/Chongqing')
        df.to_csv(self.csv_name + "-" + brand + ".csv", header=False, index=False)


# Configuration.
# Pick coordinates with the Baidu coordinate picker:
# http://api.map.baidu.com/lbsapi/getpoint/
config = {
    # Left longitude
    "left_lng": 103.9213455517,
    # Top latitude
    "top_lat": 30.7828453209,
    # Right longitude
    "right_lng": 104.2178123382,
    # Bottom latitude
    "bottom_lat": 30.4781772402,
    # Grid step used to sweep the whole area; tune as needed (see
    # www.dancheditu.com for reference).
    # Too small: queries are too dense and return many duplicates.
    # Too large: queries are too sparse and miss bikes.
    "offset": 0.02,
    # City id; see the FAQ at http://www.dancheditu.com/
    "cityid": 75,
    # Worker thread count; keep it moderate, the server returns errors when
    # it is too high.
    "workers": 20,
    # Token; contact WeChat "bcdata" for a paid one. The demo token returns
    # real bike positions but random ids.
    "token": "demo"
}

if __name__ == "__main__":
    Crawler().start(config)
    print("完成")
作者:我是思聪
链接:https://www.jianshu.com/p/6038cfa25cf1