失眠网 > python爬虫爬取淘宝搜索页面商品信息数据

python爬虫爬取淘宝搜索页面商品信息数据

时间：2022-02-26 05:57:57

主要使用的库：

requests:爬虫请求并获取源码

re：使用正则表达式提取数据

json:使用JSON提取数据

pandas：使用pandas存储数据

以下是源代码：

#!coding=utf-8
"""Scrape product listings from a mobile search API into a CSV file.

Each result page is requested as JSON, the per-item fields are pulled out
of the ``listItem`` array, and the rows are written to ``out.csv`` inside
the configured output directory.
"""
import json
import os
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Requests below are sent with verify=False, so silence the SSL warning
# that would otherwise be printed for every page.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class tb(object):
    """Scraper for the mobile (H5) search endpoint.

    Attributes are set in ``__init__``:
        path:  directory that receives ``out.csv``.
        seach: search keyword inserted into the query string.
        s:     ``requests`` session carrying the mobile request headers.
    """

    # NOTE(review): the host in this URL (and in the 'Host' header below)
    # reads "s." -- it looks truncated; confirm the real hostname before use.
    _SEARCH_URL = (
        'https://s./search?event_submit_do_new_search_auction=1'
        '&_input_charset=utf-8&topSearch=1&atype=b&searchfrom=1'
        '&action=home%3Aredirect_app_action&from=1&q={}&sst=1&n=20'
        '&buying=buyitnow&m=api4h5&abtest=18&wlsort=18&style=list'
        '&closeModues=nav%2Cselecthot%2Conesearch&page={}'
    )

    # (CSV column header, key in each JSON item), in output column order.
    _COLUMNS = (
        ('title_名称', 'title'),
        ('sold_月销量', 'sold'),
        ('commentCount_评论量', 'commentCount'),
        ('item_id_商品ID', 'item_id'),
        ('userId_商家ID', 'userId'),
        ('nick_商家名称', 'nick'),
        ('location_商家地址', 'location'),
        ('pic_path_图片', 'pic_path'),
        ('itemNumId_商品NID', 'itemNumId'),
        ('originalPrice_原价', 'originalPrice'),
        ('price_售价', 'price'),
        ('category_类别ID', 'category'),
        ('itemurl_商品链接', 'url'),
    )

    # Keys that may be absent from an item; they are stored as ''.
    _OPTIONAL_KEYS = frozenset(('commentCount', 'category'))

    _MAX_PAGES = 100       # pages 0..99 are requested at most
    _PAGE_DELAY = 1.25     # seconds to wait before each request
    _TIMEOUT = 10          # seconds before a request is abandoned

    def __init__(self, path, seach):
        """Store the output directory and keyword and prepare the session.

        Args:
            path: directory in which ``out.csv`` is written.
            seach: search keyword.
        """
        self.path = path
        self.seach = seach
        self.s = requests.session()
        headers = {
            'Host': 's.',
            # NOTE(review): 'br' is advertised here; decoding Brotli
            # responses needs an optional brotli package -- confirm it is
            # installed or the body may not parse as JSON.
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            'Accept': 'application/json',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/16A366 Safari/605.1.15',
            'Accept-Language': 'zh-cn',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.s.headers.update(headers)

    def _fetch_page(self, page):
        """Return the decoded JSON for one result page, or None on failure.

        Network errors and non-JSON bodies both print 'err' and yield None
        so the caller can skip the page instead of crashing or reusing the
        previous page's data.
        """
        url = self._SEARCH_URL.format(self.seach, page)
        try:
            body = self.s.get(url=url, verify=False, timeout=self._TIMEOUT).text
            return json.loads(body)
        except (requests.RequestException, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            print('err')
            return None

    def _field(self, item, key):
        """Read one field; optional keys fall back to '' when missing."""
        if key in self._OPTIONAL_KEYS:
            return item.get(key, '')
        return item[key]

    def seachdata(self):
        """Fetch result pages one by one and write their items to the CSV.

        Stops at the first page whose ``listItem`` is empty or missing.
        Pages that fail to download or parse are skipped. The first page
        that is written creates/overwrites the file with a header; later
        pages are appended without one.
        """
        os.makedirs(self.path, exist_ok=True)
        out_file = os.path.join(self.path, 'out.csv')
        first_write = True

        for i in range(self._MAX_PAGES):
            time.sleep(self._PAGE_DELAY)
            print(i)

            js = self._fetch_page(i)
            if js is None:
                continue
            print(js)

            list_item = js.get('listItem') if isinstance(js, dict) else None
            if not list_item:
                break

            data = {
                header: [self._field(item, key) for item in list_item]
                for header, key in self._COLUMNS
            }
            df = pd.DataFrame(data)
            # Track the first write explicitly so the header is still
            # emitted when page 0 had to be skipped.
            df.to_csv(
                out_file,
                index=False,
                header=first_write,
                mode='w' if first_write else 'a',
                encoding="GB18030",
            )
            first_write = False


if __name__ == '__main__':
    t = tb(r'E:\taobao', '手机')
    t.seachdata()

如果觉得《python爬虫爬取淘宝搜索页面商品信息数据》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。