失眠网,内容丰富有趣,生活中的好帮手!
失眠网 > python实现京东单个商品信息抓取(标题 品牌(中/英文) 图片 型号 价格 详情)

python实现京东单个商品信息抓取(标题 品牌(中/英文) 图片 型号 价格 详情)

时间:2021-12-04 19:22:57

相关推荐

python实现京东单个商品信息抓取(标题 品牌(中/英文) 图片 型号 价格 详情)

python相关包

pip install lxml
pip install requests
pip install urllib3

代码部分

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Scrape a single JD product page.

The spider collects the title, packaging text, brand (Chinese / English),
model, gallery images, price and detail-section images for one SKU and
returns them as a plain dict.
"""
import json
import random
import re
import string

import lxml.etree as etree
import requests
import urllib3

# spider() sends its requests with verify=False; silence the warning that
# urllib3 would otherwise print for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# NOTE(review): this default header set embeds what looks like a captured
# browser session cookie (account identifiers included). It is kept verbatim
# so default behaviour does not change, but it should be moved to
# configuration and rotated -- confirm with the owner.
# NOTE(review): 'authority' is empty in this copy of the file; the value may
# have been lost together with the URL hosts below -- confirm.
_DEFAULT_HEADERS = {
    'authority': '',
    'method': 'GET',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'cookie': '__jdu=1675665312520306609640; areaId=1; PCSYCityID=CN_110000_110100_0; shshshfpa=6f8ad375-c871-c406-2bc6-5357bb27518c-1675665315; shshshfpb=hkXyP4dFGBP_nNUpKxHJNTw; ipLoc-djd=1-2801-54766-0; mt_xid=V2_5VwMVVFlfVV4dSRpYBmYBE1VaWVpdGkgpDFcyBUFbXg1OCB5LGkAAb1YaTlRcUQoDQBxaVmRUE1BcX1ZZL0oYXwd7AhFOXF5DWhtCGl4OZAUiUG1YYl4dShlfDWUBF1VtXVNTGQ==; unpl=JF8EAKlnNSttURxdARoFThUWQlwDW11aHEcDbGIGUg1RSlwGGQZJRhl7XlVdXxRKFB9uYRRUWlNLUQ4fACsSEXteXV5tC0oXBW5uBV1cWUtkNRgCKxsgS1pSWloPTxUDbGMGVV9ZTFMEEwUeEyBKbVNuXg9IEgRuYAZdWFp7ZAUfBRwTE3tcZF9tSh9LAWdvB1xeFUtTAx8FHBYSS15QXVwKShAEbm8CUVxoSmQG; __jdv=122270672|norefer|t_281_0818001|cpc|_0_8f8506d6780f41cfa12427a809235ce8|1675757696357; TrackID=1RLmikjo_ekVYZD2O9X00Vuq2nsUtExey0-QYtYUAfXIF_BEx7wN3s4YSv8PpFiUrpofG1yKxVKBOm9kGx-cVVzRmfOQBtB8Pko22itVG6Ss; pinId=57kC0TMY_imlDxsL1hZtMbV9-x-f3wj7; pin=jd_6690f5fcff316; unick=jd_130011zjj; _tp=GddPg2iKimJO1ghABMLB+Fpac9UhEdJvtWgGtDwUWd4=; _pst=jd_6690f5fcff316; user-key=5728a71d-7d12-4f11-95a4-a624b0f67983; shshshfpx=6f8ad375-c871-c406-2bc6-5357bb27518c-1675665315; __jdc=122270672; shshshfp=a7f104aa8822ce82a1956a5a34c1bef3; ip_cityCode=2802; jsavif=1; jsavif=1; __jda=122270672.1675665312520306609640.1675665313.1675828511.1675834848.12; 3AB9D23F7A4B3C9B=GJNZ7FN5IDR24R66U6TOLDWZGASH24KNCSCVI3OH3JYU4JYIL3BUJYCUEZZLDSRWCOQPYRN4OZ2LWGE6FXIY2VEX3M',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
}

# Patterns are compiled once at import time instead of on every call.
_PRICE_RE = re.compile(r'"price":(.*\}),')
_DETAIL_IMG_RE = re.compile(r'(//img.*?(\.jpg|\.png|\.jpeg|\.gif))')
_MAIN_SKU_RE = re.compile(r"mainSkuId:.(\d+)")
_JSONP_RE = re.compile(r"^\s*showdesc\s*\((.*)\)\s*;?\s*$", re.DOTALL)

_PARAM_LIST_XPATH = (
    '//*[@id="detail"]/div[2]/div[1]/div[1]'
    '/ul[@class="parameter2 p-parameter-list"]/li//text()'
)


def _last_non_blank(texts):
    """Return the last non-blank text node, stripped, or "" when there is none."""
    picked = ""
    for text in texts:
        if text.strip():
            picked = text.strip()
    return picked


def _split_brand(brand):
    """Split a brand label such as ``中文(English)`` into ``(zh, en)``.

    Both ASCII and full-width parentheses are accepted. Anything that is not
    a non-empty string yields ``("", "")``.
    """
    if not brand or not isinstance(brand, str):
        return "", ""
    parts = brand.replace("\uff08", "(").replace("\uff09", ")").split("(")
    zh_brand = parts[0].replace(" ", "")
    en_brand = ""
    if len(parts) > 1:
        en_brand = parts[1].replace(")", "").replace(" ", "")
    return zh_brand, en_brand


def _clean_model(texts, zh_brand, en_brand, strip_parens=False):
    """Build a model string from the first text node in ``texts``.

    The brand names are removed first, then spaces and line breaks; with
    ``strip_parens`` any remaining parentheses are removed as well. Returns
    "" when ``texts`` is empty.
    """
    if not texts:
        return ""
    model = texts[0]
    for brand_name in (zh_brand, en_brand):
        if brand_name:
            model = model.replace(brand_name, "")
    unwanted = [" ", "\r", "\n"]
    if strip_parens:
        unwanted += ["(", ")", "\uff08", "\uff09"]
    for token in unwanted:
        model = model.replace(token, "")
    return model


def _collect_attrs(tree):
    """Collect product attributes from the parameter list and the spec table."""
    attrs = {}

    # "name:value" bullet list; the product-number and shop rows are skipped.
    for text in tree.xpath(_PARAM_LIST_XPATH):
        # Split once only, so a value that itself contains a colon is kept
        # whole. The full-width colon is treated like the ASCII one.
        name, sep, value = text.replace("\uff1a", ":").partition(":")
        if not sep or "商品编号" in name or "店铺" in name:
            continue
        attrs[name.strip()] = value.strip()

    # Spec table: one <dl class="clearfix"> per attribute.
    for table in tree.xpath("//*[@id='detail']/div/div/div/div[@class='Ptable-item']"):
        for row in table.xpath('dl/dl[@class="clearfix"]'):
            names = row.xpath("dt//text()")
            values = row.xpath("dd//text()")
            if names and values:  # a row missing either side is skipped, not fatal
                attrs[names[0].strip()] = values[-1].strip()
    return attrs


def _extract_detail_html(raw):
    """Return the ``content`` HTML of a ``showdesc(...)`` JSONP body.

    Returns None when the body is not in that shape. The payload is parsed
    with ``json.loads``; remote text is never passed to ``eval``.
    """
    wrapped = _JSONP_RE.match(raw)
    if wrapped is None:
        return None
    try:
        content = json.loads(wrapped.group(1))["content"]
    except (ValueError, KeyError, TypeError):
        return None
    return content if isinstance(content, str) else None


class jdspider:
    """Fetches one JD product page and extracts its data into a dict."""

    def __init__(self, headers=None, *, timeout=10, use_proxy=False):
        """Set up request headers, URL templates and the optional proxy.

        Args:
            headers: Request headers; a copy of the built-in defaults is used
                when omitted or empty.
            timeout: Seconds to wait on each HTTP request.
            use_proxy: Route requests through the configured tunnel proxy.
                Off by default, so requests go out directly.
        """
        self.heads = headers if headers else dict(_DEFAULT_HEADERS)
        self.timeout = timeout

        # NOTE(review): these templates (and the price URL in getprice) have
        # no host in this copy of the file -- it looks like it was stripped.
        # Restore the real hosts before running; confirm against the original.
        self.goods_url = "/{}.html"
        self.detail_url = "/description/channel?skuId={}&mainSkuId={}&charset=utf-8&cdn=2&callback=showdesc"

        # Proxy configuration.
        # NOTE(review): the tunnel host also appears to be missing, and the
        # credentials are hard-coded -- confirm and move to configuration.
        self.tunnel = ":15818"
        username = "t13020803831933"
        password = ""
        if use_proxy:
            proxy_url = "http://%s:%s@%s/" % (username, password, self.tunnel)
            self.proxies = {"http": proxy_url, "https": proxy_url}
        else:
            self.proxies = None

    def getprice(self, jd_sn):
        """Fetch the price block for a SKU.

        Returns:
            ``{"m": ..., "p": ..., "op": ...}`` taken from the ``"price"``
            object embedded in the page, or ``{}`` when it is absent or
            cannot be parsed.
        """
        res = requests.get(
            "https://item./product/{}.html".format(jd_sn),
            headers=self.heads,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        found = _PRICE_RE.findall(res.text)
        try:
            data = json.loads(found[0])
            return {"m": data["m"], "p": data["p"], "op": data["op"]}
        except (IndexError, ValueError, KeyError, TypeError):
            return {}

    def downloadimg(self, img_url):
        """Normalise an image URL: add a scheme if missing, drop ``.avif``.

        Despite the name, nothing is downloaded; the cleaned URL is returned.
        """
        if "http" not in img_url:
            img_url = "http:" + img_url
        return img_url.replace(".avif", "")

    def spider(self, sku):
        """Scrape one product and return its data.

        Args:
            sku: Product SKU id, substituted into the URL templates.

        Returns:
            dict with keys ``goods_name``, ``ware``, ``p_link``, ``jd_sn``,
            ``en_brand``, ``zh_brand``, ``model``, ``image_url``, ``price``,
            ``goods_content`` and ``attrs``.

        Raises:
            IndexError: the product page contains no ``mainSkuId`` marker.
                The exception type matches what earlier versions raised here.
        """
        # Product page HTML.
        page = requests.get(
            self.goods_url.format(sku),
            headers=self.heads,
            verify=False,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        html = page.text
        found = _MAIN_SKU_RE.search(html)
        if found is None:
            raise IndexError("mainSkuId not found in product page for sku %s" % sku)

        # Detail-section payload (JSONP).
        detail = requests.get(
            self.detail_url.format(sku, found.group(1)),
            headers=self.heads,
            verify=False,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        tree = etree.HTML(html)
        goods = {}

        # Product title: the last non-blank text node under the title container.
        goods['goods_name'] = _last_non_blank(
            tree.xpath("/html/body/div[6]/div/div[2]/div[1]//text()")
        )

        gallery = tree.xpath("//*[@id='spec-list']/ul/li/img/@src")

        # Packaging list.
        ware = tree.xpath("//*[@id='detail']/div[2]/div[2]/div[2]/p//text()")
        goods['ware'] = ware[0] if ware else ""

        # Brand: try the breadcrumb slots from mbNav-5 down to mbNav-0 and keep
        # the first that has text. (The earlier if/elif chain could never get
        # past the second slot.)
        brand = ""
        for slot in range(5, -1, -1):
            texts = tree.xpath(
                "//*[@clstag='shangpin|keycount|product|mbNav-%d']//text()" % slot
            )
            if texts:
                brand = texts[0].replace(" ", "")
                break

        goods["p_link"] = "/%s.html" % sku
        goods["jd_sn"] = sku
        zh_brand, en_brand = _split_brand(brand)
        goods['en_brand'] = en_brand
        goods['zh_brand'] = zh_brand

        # Model: ninth breadcrumb cell first, then the "item ellipsis" cell.
        model = _clean_model(
            tree.xpath("//*[@id='crumb-wrap']/div/div[1]/div[9]//text()"),
            zh_brand,
            en_brand,
        )
        if not model:
            model = _clean_model(
                tree.xpath('//*[@id="crumb-wrap"]/div/div[1]/div[@class="item ellipsis"]//text()'),
                zh_brand,
                en_brand,
                strip_parens=True,
            )
        goods["model"] = model

        attrs_dict = _collect_attrs(tree)

        # Gallery images: swap the thumbnail size/path for the larger variant.
        goods['image_url'] = []
        for src in gallery:
            larger = src.replace("s54x54", "s400x400").replace("n5", "sku")
            # Only protocol-relative sources need a scheme prepended.
            url = larger if larger.startswith("http") else "https:" + larger
            goods['image_url'].append(self.downloadimg(url))

        goods['price'] = self.getprice(sku)

        # Detail section: decode the JSONP body and pull the image URLs out of
        # its HTML; if it cannot be decoded, scan the raw response text instead.
        try:
            detail_html = _extract_detail_html(detail.content.decode())
        except UnicodeDecodeError:
            detail_html = None
        if detail_html is None:
            detail_html = detail.text
        goods['goods_content'] = [
            self.downloadimg(match[0]) for match in _DETAIL_IMG_RE.findall(detail_html)
        ]

        goods['attrs'] = attrs_dict
        return goods


if __name__ == '__main__':
    jd_spider = jdspider()
    goods = jd_spider.spider("8163617")
    # ensure_ascii=False keeps the Chinese text readable in the printed JSON.
    print(json.dumps(goods, ensure_ascii=False))

返回数据:

{'goods_name': '福临门 面粉 麦芯通用小麦粉 中筋粉 馒头、包子、烙饼等各类面食 中粮出品 十斤 5kg(新老包装随机发货)', 'ware': '面粉*1', 'p_link': '/8163617.html', 'jd_sn': '8163617', 'en_brand': '', 'zh_brand': '福临门', 'model': '面粉', 'image_url': ['/sku/jfs/t1/130368/33/29760/77107/63367b89Edeba674b/3f394e7176f87f80.jpg', '/sku/jfs/t1/184788/40/11977/83962/60dae86eE53d5f781/395d705d2a05a369.jpg', '/sku/jfs/t1/133680/36/3344/133873/5efb2132E6f5f3907/da42bef991bb8137.jpg', '/sku/jfs/t1/130722/39/3309/191088/5efb214dE798ecf85/c996260e88fbcf86.jpg', '/sku/jfs/t1/30944/25/12069/158605/5cb6d866Eb8223b63/39014969736ab6ec.jpg', '/sku/jfs/t1/30568/24/12098/206240/5cb6d866E52af5b76/0c87d72d3e6045d7.jpg', '/sku/jfs/t1/36030/2/918/137565/5cb6d866E081adb5c/3d9b1327927033bd.jpg'], 'price': {'m': '40.00', 'p': '25.90', 'op': '29.90'}, 'goods_content': ['/sku/jfs/t1/124486/31/19486/271554/60b44bd8E3af704d2/779209bd54d08613.jpg', '/sku/jfs/t1/120024/12/19494/190137/60b44bd8Eec546fe5/7ba5709ab0700d78.jpg', '/sku/jfs/t1/119935/9/0/284780/60b44bd8E0a0e3d04/897bdf1e780549b5.jpg', '/sku/jfs/t1/173493/33/12253/280326/60b44bd8E41102deb/cbf47d1715258209.jpg', '/sku/jfs/t1/183467/37/6629/284945/60b44bd8Ec3aca21b/ac2d45d6dbd3ef72.jpg', '/sku/jfs/t1/172257/5/12271/280601/60b44bd8Ec214616c/aac314443a965381.jpg', '/sku/jfs/t1/128236/26/19102/282816/60b44bd8E8509e542/167c154edc3599ab.jpg', '/sku/jfs/t1/179134/19/6490/279496/60b44bd8E403a3561/fdb2e464c68f0d2f.jpg', '/sku/jfs/t1/183555/35/6547/283235/60b44bd8Eaea13877/2e9729a6e29d53ef.jpg', '/sku/jfs/t1/195060/35/5592/286560/60b44bd8E5274b157/080d8e2295ff8eb7.jpg', '/sku/jfs/t1/176982/32/6688/285309/60b44bd8Ed75a7b10/c2ba32a5d76575f8.jpg', '/sku/jfs/t1/180719/6/6666/191040/60b44bd8E5744e5df/f687e8de0c20e1b0.jpg', '/sku/jfs/t1/110975/11/15228/278363/60b44bd8E94621ebe/49dad409ccb0ff24.jpg'], 'attrs': {'商品名称': '福临门面粉', '商品毛重': '5.1kg', '商品产地': '河南濮阳', '包装形式': '袋装', '类别': '麦芯粉', '净含量': '5000g', '保质期': '12个月', '生产许可证号': 'SC10141090200018', '产品标准号': 'GB/T 1355'}}

如果觉得《python实现京东单个商品信息抓取(标题 品牌(中/英文) 图片 型号 价格 详情)》对你有帮助,请点赞、收藏,并留下你的观点哦!

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。