失眠网,内容丰富有趣,生活中的好帮手!
失眠网 > 使用python selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图 属性

使用python selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图 属性

时间:2021-12-15 03:06:03

相关推荐

使用python   selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图 属性

selenium作为一个自动化测试工具非常好用,谁用谁知道啊。

先说如何登录淘宝。淘宝现在直接用会员名和密码登录会出现滑块验证,我试了网上说的几种方法,也自己尝试了一番,效果都不太理想:实测过程中,即使滑块滑动成功了也无法登录,仍然会报错。限于自身的技术水平,这个问题暂时无法解决。但是方法总比困难多,最后改用微博账号登录。如果你要使用下文的登录方法,那就先去注册一个微博账号并绑定淘宝吧。

登录:

首先确保安装了selenium,requests

pip install selenium

pip install requests

然后安装webdriver,具体怎么安装可以网上搜索,这里不做描述。

1.登陆:

# encoding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
import time
import requests
import os
import re

# Taobao login page. login() reads this module-level name, so it must be
# defined (the published snippet had it commented out and without a host).
url = 'https://login.taobao.com/member/login.jhtml'

options = webdriver.ChromeOptions()
# Do not load images, to speed up page loads. The image URLs are still
# present in the DOM, which is all the scraping functions read.
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)
# Drop the "enable-automation" switch so sites are less likely to detect
# that the browser is driven by Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# NOTE(review): `executable_path` is the Selenium 3 calling convention; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
driver = webdriver.Chrome(
    executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver',
    options=options,
)
wait = WebDriverWait(driver, 10)


def login():
    """Log in to Taobao through the Weibo third-party login form.

    Opens the login page, switches from QR-code login to the password
    form, chooses the Weibo login option, fills in the Weibo account and
    password, and clicks the submit button. Each step waits (up to the
    timeout configured on ``wait``) for its element to appear.

    Replace the 'xxxxxx' / 'xxxxx' placeholders with real credentials.
    """
    # Open the login page.
    driver.get(url)

    # Wait for the "password login" link and switch to the password form.
    password_login = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
    password_login.click()

    # Wait for the "Weibo login" option and open it.
    weibo_login = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.weibo-login')))
    weibo_login.click()

    # Account input box: type your own Weibo account here.
    EMAIL1 = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.username> .W_input')))
    EMAIL1.send_keys('xxxxxx')

    # Password input box.
    PASSWD = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR,
         '#pl_login_logged > div > div:nth-child(3) > div > input')))
    PASSWD.send_keys('xxxxx')

    # Login button: wait until it is clickable, then submit.
    button = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR,
         '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a > span')))
    button.click()
    # Short pause before the caller continues with the next step.
    time.sleep(1)

2.定位:

以“电脑”为搜索关键词(与下面代码中实际输入的关键词一致),点击搜索结果中的第一个宝贝,并以它为爬取对象,爬取该宝贝的主图、属性图和详情图信息。

def location(keyword='电脑'):
    """Search Taobao for *keyword* and open the first item in the results.

    Args:
        keyword: Text typed into the search box. Defaults to '电脑', the
            value that was previously hard-coded.
    """
    time.sleep(1)
    # Work in the first browser window.
    driver.switch_to.window(driver.window_handles[0])

    # Type the keyword into the search box (#q) and submit with ENTER.
    search = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    search.send_keys(keyword)
    search.send_keys(Keys.ENTER)

    # Use the first item link in the result list as the test subject.
    time.sleep(1)
    index = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.pic>a')))
    # ENTER on the focused link activates it, opening the item page.
    index.send_keys(Keys.ENTER)

3.爬取宝贝主图:

def get_main_pic():
    """Download the main (gallery) images of the item page.

    Switches to the second browser window, reads the thumbnail ``src``
    attributes under ``#J_UlThumb`` and cuts each one down to the part
    ending at the first file extension after the ``O...`` file name, which
    drops any suffix that follows it. The resulting absolute URLs are
    handed to ``dowland_pic`` together with ``mainPicDir``.
    """
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[1])
    pics = driver.find_elements(By.CSS_SELECTOR, '#J_UlThumb li a img')

    # Keep the scheme and host in the match: the downloader needs an
    # absolute URL, a bare "/imgextra/..." path cannot be requested.
    # "." matches any character, "*" repeats it, "\." is a literal dot.
    pattern = re.compile(r"(?:https?:)?//[^/\s]+/imgextra/.*/.*/O.*?\..{3}")

    urls = []
    for item in pics:
        src = item.get_attribute("src")
        print(src)
        match = pattern.search(src or "")
        if match is None:
            # No src, or not an image in the expected layout: skip it
            # instead of failing on an empty match list.
            continue
        full = match.group(0)
        if full.startswith("//"):
            # Protocol-relative URL: add a scheme.
            full = "https:" + full
        print(full)
        urls.append(full)
    dowland_pic(urls, mainPicDir)

4.爬取宝贝属性图:

def get_attribute_pic():
    """Download the attribute (variant) images of the current item page.

    Reads the inline ``style`` attribute of every ``.tb-prop dd ul li``
    element, extracts the address inside ``url(...)`` and passes the
    resulting list to ``dowland_pic`` together with ``attributePicDir``.
    Elements whose style contains no ``url(...)`` are skipped.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '.tb-prop dd ul li')
    # NOTE(review): the background image may be set on a child element of
    # the <li> rather than on the <li> itself - confirm against the page.

    # The style value is CSS text, not a URL; pull the address out of
    # url(...), tolerating optional quotes around it.
    style_url = re.compile(r"""url\(\s*["']?(.*?)["']?\s*\)""")

    urls = []
    for item in detail:
        style = item.get_attribute("style")
        print(style)
        match = style_url.search(style or "")
        if match is None or not match.group(1):
            continue
        link = match.group(1)
        if link.startswith("//"):
            # Protocol-relative URL: add a scheme so it can be requested.
            link = "https:" + link
        urls.append(link)
    dowland_pic(urls, attributePicDir)

5.爬取宝贝详情图:

def get_detail_pic():
    """Download the description (detail) images of the current item page.

    Looks for images under ``#description .content p`` first and falls
    back to ``#description .content div div`` when the first selector finds
    nothing. The collected ``src`` values are passed to ``dowland_pic``
    together with ``detailPicDir``.

    Returns:
        0 when neither selector matches any image, otherwise None.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '#description .content p img')
    if not detail:
        # First layout not present: try the alternative layout.
        detail = driver.find_elements(
            By.CSS_SELECTOR, '#description .content div div img')
    if not detail:
        print('not find detail')
        return 0

    urls = []
    for item in detail:
        src = item.get_attribute("src")
        print(src)
        if src:
            # Images without a src cannot be downloaded.
            urls.append(src)
    dowland_pic(urls, detailPicDir)

说明:有些样式的宝贝详情图目前还不能爬取,还需要做一些适配性工作。另一个不足是宝贝视频还不能爬取,后期需要加入这个功能,敬请期待。

6.完整代码:

# encoding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
import time
import requests
import os
import re

# Taobao login page. login() reads this module-level name, so it must be
# defined (the published snippet had it commented out and without a host).
url = 'https://login.taobao.com/member/login.jhtml'

options = webdriver.ChromeOptions()
# Do not load images, to speed up page loads. The image URLs are still
# present in the DOM, which is all the scraping functions read.
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)
# Drop the "enable-automation" switch so sites are less likely to detect
# that the browser is driven by Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# NOTE(review): `executable_path` is the Selenium 3 calling convention; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
driver = webdriver.Chrome(
    executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver',
    options=options,
)
wait = WebDriverWait(driver, 10)

# Output directories, one per image category.
mainPicDir = './main/'
detailPicDir = './detail/'
attributePicDir = './attribute/'


def mkdir():
    """Create the three output directories if they do not exist yet."""
    os.makedirs(mainPicDir, exist_ok=True)
    os.makedirs(detailPicDir, exist_ok=True)
    os.makedirs(attributePicDir, exist_ok=True)


def dowland_pic(src, dir):
    """Download every URL in *src* into directory *dir*.

    Files are named ``<position>.jpg`` after the position of the URL in
    *src*. A URL that cannot be fetched is reported and skipped so one bad
    link does not abort the remaining downloads.

    Args:
        src: Iterable of image URLs.
        dir: Target directory path.
    """
    for count, link in enumerate(src):
        if link.startswith("//"):
            # Protocol-relative URL: add a scheme so it can be requested.
            link = "https:" + link
        try:
            # A timeout keeps a stalled connection from hanging the run.
            r = requests.get(link, timeout=30)
            r.raise_for_status()
        except requests.RequestException as err:
            print('download failed: %s (%s)' % (link, err))
            continue
        local_path = os.path.join(dir, "%s.jpg" % count)
        with open(local_path, 'wb') as f:
            f.write(r.content)


def login():
    """Log in to Taobao through the Weibo third-party login form.

    Opens the login page, switches from QR-code login to the password
    form, chooses the Weibo login option, fills in the Weibo account and
    password, and clicks the submit button.

    Replace the 'xxxxxx' / 'xxxxx' placeholders with real credentials.
    """
    # Open the login page.
    driver.get(url)

    # Wait for the "password login" link and switch to the password form.
    password_login = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
    password_login.click()

    # Wait for the "Weibo login" option and open it.
    weibo_login = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.weibo-login')))
    weibo_login.click()

    # Account input box: type your own Weibo account here.
    EMAIL1 = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.username> .W_input')))
    EMAIL1.send_keys('xxxxxx')

    # Password input box.
    PASSWD = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR,
         '#pl_login_logged > div > div:nth-child(3) > div > input')))
    PASSWD.send_keys('xxxxx')

    # Login button: wait until it is clickable, then submit.
    button = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR,
         '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a > span')))
    button.click()
    time.sleep(1)


def location(keyword='电脑'):
    """Search Taobao for *keyword* and open the first item in the results.

    Args:
        keyword: Text typed into the search box. Defaults to '电脑', the
            value that was previously hard-coded.
    """
    time.sleep(1)
    # Work in the first browser window.
    driver.switch_to.window(driver.window_handles[0])

    # Type the keyword into the search box (#q) and submit with ENTER.
    search = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    search.send_keys(keyword)
    search.send_keys(Keys.ENTER)

    # Use the first item link in the result list as the test subject.
    time.sleep(1)
    index = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.pic>a')))
    index.send_keys(Keys.ENTER)


def get_main_pic():
    """Download the main (gallery) images of the item page.

    Switches to the second browser window, reads the thumbnail ``src``
    attributes under ``#J_UlThumb`` and cuts each one down to the part
    ending at the first file extension after the ``O...`` file name, which
    drops any suffix that follows it.
    """
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[1])
    pics = driver.find_elements(By.CSS_SELECTOR, '#J_UlThumb li a img')

    # Keep the scheme and host in the match: the downloader needs an
    # absolute URL, a bare "/imgextra/..." path cannot be requested.
    pattern = re.compile(r"(?:https?:)?//[^/\s]+/imgextra/.*/.*/O.*?\..{3}")

    urls = []
    for item in pics:
        src = item.get_attribute("src")
        print(src)
        match = pattern.search(src or "")
        if match is None:
            # No src, or not an image in the expected layout: skip it.
            continue
        full = match.group(0)
        if full.startswith("//"):
            full = "https:" + full
        print(full)
        urls.append(full)
    dowland_pic(urls, mainPicDir)


def get_detail_pic():
    """Download the description (detail) images of the current item page.

    Looks for images under ``#description .content p`` first and falls
    back to ``#description .content div div`` when nothing is found.

    Returns:
        0 when neither selector matches any image, otherwise None.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '#description .content p img')
    if not detail:
        # First layout not present: try the alternative layout.
        detail = driver.find_elements(
            By.CSS_SELECTOR, '#description .content div div img')
    if not detail:
        print('not find detail')
        return 0

    urls = []
    for item in detail:
        src = item.get_attribute("src")
        print(src)
        if src:
            urls.append(src)
    dowland_pic(urls, detailPicDir)


def get_attribute_pic():
    """Download the attribute (variant) images of the current item page.

    Reads the inline ``style`` attribute of every ``.tb-prop dd ul li``
    element and extracts the address inside ``url(...)``. Elements whose
    style contains no ``url(...)`` are skipped.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '.tb-prop dd ul li')
    # NOTE(review): the background image may be set on a child element of
    # the <li> rather than on the <li> itself - confirm against the page.

    # The style value is CSS text, not a URL; pull the address out of
    # url(...), tolerating optional quotes around it.
    style_url = re.compile(r"""url\(\s*["']?(.*?)["']?\s*\)""")

    urls = []
    for item in detail:
        style = item.get_attribute("style")
        print(style)
        match = style_url.search(style or "")
        if match is None or not match.group(1):
            continue
        urls.append(match.group(1))
    dowland_pic(urls, attributePicDir)


def main():
    """Run the whole flow: prepare folders, log in, search, download."""
    try:
        print('mkdir')
        mkdir()
        print('login')
        login()
        print('location')
        location()
        print('get_main_pic')
        get_main_pic()
        print('get_detail_pic')
        get_detail_pic()
        print('get_attribute_pic')
        get_attribute_pic()
    finally:
        # Always close the browser and stop the chromedriver process.
        driver.quit()


if __name__ == "__main__":
    main()

使用python selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图 属性图和详情图等等

如果觉得《使用python selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图 属性》对你有帮助,请点赞、收藏,并留下你的观点哦!

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。