
[Python Crawler] Baidu Mobile Assistant (百度手机助手) Crawler

Date: 2022-03-31 15:44:21


I. Requirements Analysis

Crawl the software applications listed on Baidu Mobile Assistant, export the results to Excel, and insert them into MySQL. The fields to capture are listed below (a sketch of one record follows the list):

1. app_name: application name
2. app_pic: application logo
3. app_score: application rating
4. app_topic: application topic
5. app_type: application category
6. app_download_num: download count
7. app_size: application size
8. app_version: application version
9. app_xiaobian: editor's note
10. app_jieshao: application description
11. create_time: crawl timestamp

Target site: /software/
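For orientation, here is a minimal sketch (not part of the original script) of what a single scraped record looks like in memory before it is exported. The variable name sample_record and every value are illustrative placeholders; only the keys and the parsing details they mention come from the crawler below.

# One crawled record, keyed by the fields listed above.
# Values are illustrative placeholders, not real scraped data.
sample_record = {
    "app_name": "application name (taken from <h1 class=\"app-name\">)",
    "app_pic": "logo image URL",
    "app_score": 4.5,                        # star-bar width percentage converted to a 0-5 rating
    "app_topic": "application topic",
    "app_type": "application category",
    "app_download_num": "download count",
    "app_size": "package size",
    "app_version": "version string",
    "app_xiaobian": "editor's note, or a fallback text when missing",
    "app_jieshao": "long description, or a fallback text when missing",
    "create_time": "2022-03-31 15:44:21",    # timestamp recorded at crawl time
}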

II. Results

III. Table Creation Statement (DDL)

CREATE TABLE `t_baidu_info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `app_name` varchar(64) NOT NULL COMMENT '应用名称',
  `app_pic` mediumtext COMMENT '应用logo',
  `app_score` varchar(64) DEFAULT NULL COMMENT '应用评分',
  `app_topic` varchar(64) DEFAULT NULL COMMENT '应用主题',
  `app_type` varchar(64) DEFAULT NULL COMMENT '应用分类',
  `app_download_num` varchar(64) DEFAULT NULL COMMENT '应用下载量',
  `app_size` varchar(64) DEFAULT NULL COMMENT '应用大小',
  `app_version` varchar(64) DEFAULT NULL COMMENT '应用版本',
  `app_xiaobian` mediumtext COMMENT '小编介绍评语',
  `app_jieshao` mediumtext COMMENT '应用介绍',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  PRIMARY KEY (`id`),
  KEY `Index 2` (`app_name`),
  KEY `Index 3` (`app_type`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='百度手机助手爬虫表';
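If you want to confirm from Python that the table was created correctly before running the crawler, a minimal sketch could look like the following. It reuses the connection settings that appear later in the script (host 127.0.0.1, database cgjr, user root, password 12345); adjust them to your own environment.

# Sanity check: list the columns of t_baidu_info before crawling.
# Connection settings mirror the ones used in the crawler below; change them to match your setup.
import pymysql

conn = pymysql.connect(host="127.0.0.1", database="cgjr", user="root",
                       password="12345", port=3306, charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("SHOW COLUMNS FROM t_baidu_info")
        for col in cursor.fetchall():
            print("%s  %s" % (col[0], col[1]))  # column name and type
finally:
    conn.close()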

IV. Python Crawler Code

# encoding: utf-8
from __future__ import division
import time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: avoid UnicodeEncodeError when printing Chinese text
time1 = time.time()

import re
import datetime
import requests
import pandas as pd
from lxml import etree

######### Lists that hold the scraped data, one list per field #########
app_name = []
app_pic = []
app_score = []
app_topic = []
app_type = []
app_download_num = []
app_size = []
app_version = []
app_xiaobian = []
app_jieshao = []
create_time = []

### Crawler entry point (the domain prefix is omitted here)
base_url = "/software/"

### Category ids
# category_num = [501, 502, 503, 504, 505, 506, 507, 508, 509, 510]
category_num = [501]

### Page numbers
# page_num = [1, 2, 3, 4, 5, 6, 7, 8]
page_num = [1]

# Build the list of category listing-page URLs
categoryPageURL_list = []
for x in category_num:
    for y in page_num:
        print base_url + str(x) + '/list_' + str(y) + '.html'
        categoryPageURL_list.append(base_url + str(x) + '/list_' + str(y) + '.html')

# Collect the URL of every app detail page
appDetailPageURL_list = []
for url_1 in categoryPageURL_list:
    # Fetch the listing page
    content = requests.get(url_1).content
    # re.S lets "." also match newlines "\n"
    pattern = re.compile('<a class="app-box" href="(.*?)" target="_blank">', re.S)
    resultStr = re.findall(pattern, content)
    for result in resultStr:
        appDetailPageURL = '/' + result
        print appDetailPageURL
        appDetailPageURL_list.append(appDetailPageURL)

################### Crawl every detail page #################
# url_2 = appDetailPageURL_list[0]
for url_2 in appDetailPageURL_list:
    try:
        html_appDetailPageURL = requests.get(url_2).content
        selector = etree.HTML(html_appDetailPageURL)

        ###### Crawl timestamp
        nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print nowTime
        create_time.append(nowTime)

        ###### Application name
        app_name_1 = re.findall('<h1 class="app-name">(.*?)</h1>', html_appDetailPageURL, re.S)
        app_name_2 = re.findall('<span>(.*?)</span>', str(app_name_1[0]), re.S)
        for each in app_name_2:
            print each
            app_name.append(each)

        ###### Application logo
        app_pic_1 = re.findall('<img src="(.*?)".*?/>', html_appDetailPageURL, re.S)
        print app_pic_1[0]
        app_pic.append(app_pic_1[0])

        ###### Application score: star-bar width percentage converted to a 0-5 rating
        app_score_1 = re.findall('<span class="star-xbig"><span class="star-percent" style="width:(.*?)"></span></span>', html_appDetailPageURL, re.S)
        app_score_2 = float(int(str(app_score_1[0]).replace("%", '')) / 100) * 5
        print app_score_2
        app_score.append(app_score_2)

        ###### Application category
        app_type_1 = selector.xpath('//*[@id="doc"]/div[1]/div/span[5]/a/text()')
        for each in app_type_1:
            print each
            app_type.append(each)

        ###### Application topic
        app_type_2 = selector.xpath('//*[@id="doc"]/div[1]/div/span[3]/a/text()')
        for each in app_type_2:
            print each
            app_topic.append(each)

        ###### Download count
        app_download_num_1 = selector.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[3]/text()')
        for each in app_download_num_1:
            print str(each).replace("下载次数: ", '')
            app_download_num.append(str(each).replace("下载次数: ", ''))

        ###### Version
        app_version_1 = selector.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[2]/text()')
        for each in app_version_1:
            print str(each).replace('版本: ', '')
            app_version.append(str(each).replace('版本: ', ''))

        ###### Size
        app_size_1 = selector.xpath('//*[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[1]/text()')
        for each in app_size_1:
            print str(each).replace('大小: ', '')
            app_size.append(str(each).replace('大小: ', ''))

        ###### Editor's note (may be missing)
        app_xiaobian_1 = selector.xpath('//*[@id="doc"]/div[2]/div/div[2]/div[1]/div[1]/span[2]/text()')
        if len(app_xiaobian_1) > 0:
            for each in app_xiaobian_1:
                print each
                app_xiaobian.append(each)
        else:
            print "无小编评语"
            app_xiaobian.append('无小编评语')

        ###### Application description (may be missing)
        app_jieshao_1 = re.findall('<p class="content content_hover">(.*?)<span class="occupied"></span></p>', html_appDetailPageURL, re.S)
        if len(app_jieshao_1) > 0:
            print app_jieshao_1[0]
            app_jieshao.append(app_jieshao_1[0])
        else:
            app_jieshao.append("无应用介绍")
    except Exception, ex:
        print Exception, ":", ex

# Sanity check: every field list should end up with the same length
print len(app_name), len(app_pic), len(app_score), len(app_topic), len(app_type), \
    len(app_download_num), len(app_size), len(app_version), len(app_xiaobian), \
    len(app_jieshao), len(create_time)

data = pd.DataFrame({"app_name": app_name, "app_pic": app_pic, "app_score": app_score,
                     "app_topic": app_topic, "app_type": app_type,
                     "app_download_num": app_download_num, "app_size": app_size,
                     "app_version": app_version, "app_xiaobian": app_xiaobian,
                     "app_jieshao": app_jieshao, "create_time": create_time})
print data

############### Write to Excel ##############
data.to_excel(u"C:\\Users\\Administrator\\Desktop\\风控模型--赖德发\\百度手机助手爬虫\\t_baidu_info.xlsx",
              header=True, encoding='gbk', index=False)

########################### Connect to the database ##########################
import pymysql
# Pass the charset parameter to avoid garbled Chinese text
dbconn = pymysql.connect(host="127.0.0.1", database="cgjr", user="root",
                         password="12345", port=3306, charset='utf8')

################################# Write into MySQL #################################
try:
    with dbconn.cursor() as cursor:
        # Insert one row per scraped application
        sql = ('INSERT INTO t_baidu_info (app_name, app_pic, app_score, app_topic, app_type, '
               'app_download_num, app_size, app_version, app_xiaobian, app_jieshao, create_time) '
               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
        # pandas sorts the dict keys alphabetically, so the iloc column indices
        # below map back to the column order expected by the INSERT statement
        for i in range(0, len(data)):
            print "正在插入数据:" + str(i)
            cursor.execute(sql, (str(data.iloc[i, 2]), str(data.iloc[i, 3]), str(data.iloc[i, 4]),
                                 data.iloc[i, 6], data.iloc[i, 7], data.iloc[i, 0],
                                 data.iloc[i, 5], data.iloc[i, 8], data.iloc[i, 9],
                                 data.iloc[i, 1], data.iloc[i, 10]))
        # Autocommit is off by default, so commit explicitly to persist the inserts
        dbconn.commit()
except dbconn.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)
finally:
    dbconn.close()
    print ('数据已插入,插入数据库成功!')

time2 = time.time()
print u'总共耗时:' + str(time2 - time1) + 's'
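The script above is written for Python 2 (print statements, reload(sys)/setdefaultencoding, the "except Exception, ex" syntax). As a rough sketch of how the listing-page step might be ported to Python 3, assuming the page markup has not changed since the article was written; the variable names here (detail_urls, list_url, cat, page, href) are new, and the domain prefix is still omitted exactly as in the original:

# Python 3 sketch of the listing-page step only; the detail-page parsing
# would be ported the same way (requests.get(...).text instead of .content).
import re
import requests

base_url = "/software/"      # prepend the site domain before running
category_num = [501]
page_num = [1]

detail_urls = []
for cat in category_num:
    for page in page_num:
        list_url = "{0}{1}/list_{2}.html".format(base_url, cat, page)
        html = requests.get(list_url).text   # .text is already a str in Python 3
        # Same pattern as the original script
        for href in re.findall(r'<a class="app-box" href="(.*?)" target="_blank">', html, re.S):
            detail_urls.append('/' + href)

print(len(detail_urls), "detail page URLs collected")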
