
Python crawler: scraping JD product information with Selenium

Posted: 2019-07-29 12:23:19


1. First, the result screenshots (I was lazy and only scraped 4 pages).

2. JD's URL: /

3. Here I disable image loading to speed up the crawl; you could also run Chrome in headless mode so no browser window pops up (a sketch of that follows the setup code below).

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})  # do not load images
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, 50)  # set the wait timeout
url = '/'
data_list = []  # global list used to store the scraped data
keyword = "python爬虫"  # search keyword
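The headless mode mentioned in step 3 isn't shown in the setup above; as a minimal sketch (assuming Chrome and the same options object), it can be enabled with one extra argument:

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a visible window
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})  # still skip images
browser = webdriver.Chrome(options=options)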

4. First locate the search box and simulate a click with Selenium (it turns out JD lets you view product listings without logging in).

def search():
    browser.get('/')
    try:
        input = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))  # wait for the search box to load
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))  # wait until the search button is clickable
        input[0].send_keys(keyword)  # type the keyword into the search box
        submit.click()  # click search
        total = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))  # wait for the total page count to load and record it
        html = browser.page_source  # grab the page source
        prase_html(html)  # call the data-extraction function (written later)
        return total[0].text
    except TimeoutException:
        return search()

5. Once on the first page, write the page-turning function. The page has to be scrolled to the bottom before the last 30 products load; each results page holds 60 products in total.

def next_page(page_number):
    try:
        # scroll to the bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.randint(1, 3))  # random delay
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')))  # next-page button
        button.click()  # turn the page
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(30)")))  # wait for the first 30 products to load
        # scroll to the bottom again to load the last 30 products
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))  # wait until all 60 products are loaded
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_number)))  # confirm the page turned: the highlighted page number matches the requested one
        html = browser.page_source  # grab the page source
        prase_html(html)  # call the data-extraction function
    except TimeoutException:
        return next_page(page_number)

6. With paging working, the rest is much simpler: extract the product fields you need. Note that searching for different keywords changes the page layout, so the element locators may need to be rewritten.

def prase_html(html):
    html = etree.HTML(html)
    # start extracting: find all li tags under the ul tag
    try:
        lis = browser.find_elements_by_class_name('gl-item')
        # iterate over the products
        for li in lis:
            # title
            title = li.find_element_by_xpath('.//div[@class="p-name p-name-type-2"]//em').text
            # price
            price = li.find_element_by_xpath('.//div[@class="p-price"]//i').text
            # number of comments
            comment = li.find_elements_by_xpath('.//div[@class="p-commit"]//a')
            # shop name
            shop_name = li.find_elements_by_xpath('.//div[@class="p-shop"]//a')
            if comment:
                comment = comment[0].text
            else:
                comment = None
            if shop_name:
                shop_name = shop_name[0].text
            else:
                shop_name = None
            data_dict = {}  # collect the fields into a dict
            data_dict["title"] = title
            data_dict["price"] = price
            data_dict["shop_name"] = shop_name
            data_dict["comment"] = comment
            print(data_dict)
            data_list.append(data_dict)  # append to the global list
    except TimeoutException:
        prase_html(html)

7. Saving the data

def save_html():
    content = json.dumps(data_list, ensure_ascii=False, indent=2)  # serialize the global list as JSON
    with open("jingdong.json", "a+", encoding="utf-8") as f:
        f.write(content)
        print("JSON file written successfully")
    with open('jingdong.csv', 'w', encoding='utf-8', newline='') as f:
        # header row
        title = data_list[0].keys()
        # create the writer
        writer = csv.DictWriter(f, title)
        # write the header
        writer.writeheader()
        # write all rows at once
        writer.writerows(data_list)
        print('CSV file written successfully')

8. Putting it all together

def main():
    print("Page", 1, ":")
    total = int(search())
    for i in range(2, 5):  # for i in range(2, total + 1): use this loop instead if you want to crawl every page
        time.sleep(random.randint(1, 3))  # random delay
        print("Page", i, ":")
        next_page(i)
    save_html()


if __name__ == "__main__":
    main()

Pitfalls I ran into

This is the part I never managed to fix: as soon as I add the scroll-to-bottom step here, it throws an error, and I can't tell where the logic goes wrong. As a result the first page only yields 30 products, while every later page yields all 60. If anyone who has run this knows the answer, please let me know. (One untested guess at a workaround is sketched after the snippet below.)

def search():
    browser.get('/')
    try:
        input = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))
        input[0].send_keys(keyword)
        submit.click()
        total = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
        # # scroll to the bottom to load the last 30 products
        # browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wait.until(
        #     EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)"))
        # )
        html = browser.page_source
        prase_html(html)
        return total[0].text
    except TimeoutException:
        return search()
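One possible explanation (an untested guess, not from the original post) is that the scroll fires before the product list has rendered on the freshly loaded results page, so the wait for the 60th li times out. A sketch of a workaround is to scroll down in several small steps so the lazy loading has time to trigger; scroll_in_steps below is a hypothetical helper, not part of the original code:

def scroll_in_steps(driver, steps=6, pause=0.5):
    # hypothetical helper: scroll the page down gradually so lazy loading can trigger
    for i in range(1, steps + 1):
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight * arguments[0] / arguments[1]);", i, steps)
        time.sleep(pause)

# inside search(), after the results page has loaded (assumption, untested against JD):
# wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li")))
# scroll_in_steps(browser)
# wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))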

9. Full code

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import random
import json
import csv
from lxml import etree

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})  # do not load images
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, 50)  # set the wait timeout
url = '/'
data_list = []  # global list used to store the scraped data
keyword = "python爬虫"  # search keyword


def search():
    browser.get('/')
    try:
        input = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))  # wait for the search box to load
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))  # wait until the search button is clickable
        input[0].send_keys(keyword)  # type the keyword into the search box
        submit.click()  # click search
        total = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))  # wait for the total page count to load and record it
        # # scroll to the bottom to load the last 30 products
        # browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wait.until(
        #     EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)"))
        # )
        html = browser.page_source  # grab the page source
        prase_html(html)  # call the data-extraction function
        return total[0].text  # return the total number of pages
    except TimeoutException:
        return search()


def next_page(page_number):
    try:
        # scroll to the bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.randint(1, 3))  # random delay
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')))  # next-page button
        button.click()  # turn the page
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(30)")))  # wait for the first 30 products to load
        # scroll to the bottom again to load the last 30 products
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))  # wait until all 60 products are loaded
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"), str(page_number)))  # confirm the page turned: the highlighted page number matches the requested one
        html = browser.page_source  # grab the page source
        prase_html(html)  # call the data-extraction function
    except TimeoutException:
        return next_page(page_number)


def prase_html(html):
    html = etree.HTML(html)
    # start extracting: find all li tags under the ul tag
    try:
        lis = browser.find_elements_by_class_name('gl-item')
        # iterate over the products
        for li in lis:
            # title
            title = li.find_element_by_xpath('.//div[@class="p-name p-name-type-2"]//em').text
            # price
            price = li.find_element_by_xpath('.//div[@class="p-price"]//i').text
            # number of comments
            comment = li.find_elements_by_xpath('.//div[@class="p-commit"]//a')
            # shop name
            shop_name = li.find_elements_by_xpath('.//div[@class="p-shop"]//a')
            if comment:
                comment = comment[0].text
            else:
                comment = None
            if shop_name:
                shop_name = shop_name[0].text
            else:
                shop_name = None
            data_dict = {}  # collect the fields into a dict
            data_dict["title"] = title
            data_dict["price"] = price
            data_dict["shop_name"] = shop_name
            data_dict["comment"] = comment
            print(data_dict)
            data_list.append(data_dict)  # append to the global list
    except TimeoutException:
        prase_html(html)


def save_html():
    content = json.dumps(data_list, ensure_ascii=False, indent=2)  # serialize the global list as JSON
    with open("jingdong1.json", "a+", encoding="utf-8") as f:
        f.write(content)
        print("JSON file written successfully")
    with open('jingdong1.csv', 'w', encoding='utf-8', newline='') as f:
        # header row
        title = data_list[0].keys()
        # create the writer
        writer = csv.DictWriter(f, title)
        # write the header
        writer.writeheader()
        # write all rows at once
        writer.writerows(data_list)
        print('CSV file written successfully')


def main():
    print("Page", 1, ":")
    total = int(search())
    for i in range(2, 5):  # for i in range(2, total + 1): use this loop instead if you want to crawl every page
        time.sleep(random.randint(1, 3))  # random delay
        print("Page", i, ":")
        next_page(i)
    save_html()


if __name__ == "__main__":
    main()
