这是一个用 Python 3 编写的爬虫,主要用于爬取网贷之家上面的数据。实现比较粗糙,许多应当封装的地方尚未封装;如果需要每天运行,目前还需要人工操作。后期会考虑部署到服务器上定时爬取,先这么搞,剩下的以后再说。
import urllib
import requests #导入两个爬虫包,忘了是用哪个了,嘻嘻
import re,time,pymysql #导入正则表达式、时间处理、mysql连接包
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor #导入异步爬取框架
def url_get(url):
    """Fetch the platform-list page and return each platform's detail-page URL.

    Parameters:
        url: address of the listing page to scrape.

    Returns:
        list[str]: protocol-relative URLs matching the detail-page pattern
        (later prefixed with 'http:' by the caller).

    NOTE(review): the site hostname appears to have been scrubbed from this
    snippet (the original 'Host' header was empty and the regex starts with
    '///') — restore the real domain before running.
    """
    headers = {
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8'),
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # Fixed: header names/values must not contain stray spaces
        # (the pasted code had 'Pragma': 'no - cache' and 'User - Agent',
        # which urllib3 rejects as invalid headers). The empty 'Host'
        # header is dropped so requests derives it from the URL.
        'Pragma': 'no-cache',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/59.0.3071.115 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    # Fixed: the pasted code called `pile(...)` — the `re.com` prefix of
    # `re.compile` was lost. Also escape the dot before 'html' so it only
    # matches a literal '.'.
    pattern = re.compile(r'///plat-info-\d*?\.html')
    return re.findall(pattern, res.text)
def data_get(url):
    """Scrape one platform's detail page and insert its stats into MySQL.

    Parameters:
        url: absolute URL of the platform detail page. The sentinel value
             '/plat-info-0.html' is skipped.

    Side effects:
        Inserts one row into loc.wdzj_pt_detail and prints progress.

    NOTE(review): DB credentials are hard-coded; move them to config before
    deploying. The HTML is parsed with regexes, so any site redesign will
    break the field extraction — confirm the markup still matches.
    """
    if url == '/plat-info-0.html':
        return
    headers = {
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8'),
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # Fixed: the pasted code had 'Pragma': 'no - cache' and a
        # 'User - Agent' key with spaces — both invalid headers. The empty
        # 'Host' header is dropped so requests derives it from the URL.
        'Pragma': 'no-cache',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/59.0.3071.115 Safari/537.36'),
    }
    res = requests.get(url=url, headers=headers)
    txt = res.text
    # Fixed: every `pile(...)` below was `re.compile(...)` before the paste
    # stripped the `re.com` prefix.
    values = re.findall(re.compile('<div class="rate-data".*'), txt)
    labels = re.findall(re.compile('<div class="".*?</div>'), txt)
    titles = re.findall(re.compile('<h1 alt=.*?">'), txt)
    times = re.findall(re.compile('<em>更新时间.*?</em>'), txt)
    print(url)
    # Fixed: the original named this `dict`, shadowing the builtin.
    record = {}
    record['name'] = titles[0].replace('<h1 alt="', '').replace('">', '')
    record['add_time'] = times[0].replace('<em>更新时间:', '').replace('</em>', '')
    # Pair each metric value with its label; the page lists them in order.
    for raw_value, raw_label in zip(values, labels):
        k = raw_value.replace('<div class="rate-data">', '').replace('\r', '').replace(' ', '').replace(',', '')
        v = raw_label.replace('<div class="">', '').replace('</div>', '')
        record[v] = float(k)
    loc = pymysql.connect(host='localhost', user='root', passwd='1234',
                          db='loc', charset='utf8')
    try:
        with loc.cursor() as cursor:
            # Fixed: the original built the SQL with %-interpolation (SQL
            # injection risk, and it applied '%d' to a string value). Use
            # PyMySQL's parameterized placeholders instead.
            sql = ('insert into wdzj_pt_detail '
                   '(add_date,name,bid_sum,account_chage,repayment_not,'
                   'avg_apr,avg_loan_limit,bid_person,bid_avg,'
                   'repayment_person,loan_person,loan_avg,loan_count,'
                   'loan_repayment_person) values '
                   '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            cursor.execute(sql, (
                record['add_time'], record['name'], record['成交量(万元)'],
                record['资金净流入(万元)'], record['待还余额(万元)'],
                record['参考收益率(%)'], record['平均借款期限(月)'],
                record['投资人数(人)'], record['人均投资金额(万元)'],
                record['待收投资人数(人)'], record['借款人数(人)'],
                record['人均借款金额(万元)'], record['借款标数(个)'],
                record['待还借款人数(人)']))
        # Fixed: the pasted code called bare `mit()` — this must be
        # `loc.commit()` on the connection.
        loc.commit()
    finally:
        # Always release the connection, even if the insert fails.
        loc.close()
    print('Insert data of %s to loc.wdzj_pt_detail,Done!' % record['name'])
# Crawler entry page. NOTE(review): the hostname was scrubbed from this
# snippet — restore the full listing-page URL before running.
url = '/platdata-1.html'

if __name__ == '__main__':
    # Collect every platform's detail-page URL from the listing page.
    url_list = url_get(url)
    # ProcessPoolExecutor gives process-level parallelism (the original
    # comment said "multithread"; swap in ThreadPoolExecutor for threads).
    pool = ProcessPoolExecutor()
    # Fixed: the original iterated range(1000) unconditionally, raising
    # IndexError whenever fewer than 1000 links were found. Slicing caps
    # the count without assuming the list length.
    for link in url_list[:1000]:
        # The links are protocol-relative ('//host/...'); prefix a scheme.
        pool.submit(data_get, 'http:%s' % link)
        # Throttle submissions to avoid tripping the site's IP ban.
        time.sleep(1)
    pool.shutdown(wait=True)
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('数据收集完成,完成时间点为%s' % str(now))
如果觉得《【python爬虫】爬取网贷之家所有P2P平台基本数据并写入MYsql数据库》对你有帮助,请点赞、收藏,并留下你的观点哦!