目录
request的基本使用
urllib使用
图片爬取
获取动态数据
session和cookie的处理
使用xpath解析
使用正则解析
BeautifulSoup使用
selenium自动化爬虫
其他自动化操作
实现无界面
自动化处理iframe标签
基于selenium的12306用户登录
代理的使用
验证码解析
协程的使用
同步爬虫
多线程异步爬虫的使用
线程池
异步协程
aiohttp实现任务异步协程
分布式爬虫
简单练手项目
肯德基破解
爬取简历模板
百度AI实现爬虫
好久之前做的python非框架爬虫全集笔记一直没整理,今天有空整理了一番,方便以后查看。
request的基本使用
案例一
# -*- coding: utf-8 -*-
"""Minimal requests demo: fetch a page and persist the HTML to disk."""
import requests

if __name__ == "__main__":
    # step 1: specify the url (scrubbed to '/' in the notes)
    url = '/'
    # step 2: issue the GET request
    response = requests.get(url=url)
    # step 3: .text yields the response body as a string
    page_text = response.text
    print(page_text)
    # step 4: persist to disk
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取数据结束!!!')
案例二:
# -*- coding: utf-8 -*-
"""Fetch Douban's movie listing endpoint and dump the JSON to disk."""
import requests
import json

if __name__ == "__main__":
    url = '/j/search_subjects'
    param = {
        'type': 'movie',
        'tag': "喜剧",
        'sort': 'recommend',
        'page_limit': 20,  # number of items returned per request
        'page_start': 20,  # offset: start from the N-th movie
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.json()
    # fix: the original left the file handle open; a context manager guarantees close
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('over!!!')
案例三
# -*- coding: utf-8 -*-
"""POST a word to a translation endpoint and save the JSON response."""
import requests
import json

if __name__ == "__main__":
    # 1. specify the url
    post_url = '/sug'
    # 2. UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # 3. build the request payload (same idea as GET params)
    word = input('enter a word:')
    data = {'kw': word}
    # 4. send the request
    response = requests.post(url=post_url, data=data, headers=headers)
    # 5. json() returns a Python object (only safe when the response is JSON)
    dic_obj = response.json()
    # fix: the original left the file handle open; use a context manager
    fileName = word + '.json'
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over!!!')
案例四
# -*- coding: utf-8 -*-
"""GET with query params and UA spoofing; saves the result page as HTML.

Every crawl should spoof the User-Agent (the request carrier identity)
to look like a real browser.
"""
import requests

if __name__ == "__main__":
    # UA spoofing: wrap the User-Agent into a dict passed as headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    url = '/web'
    # query parameters carried by the url go into a dict
    kw = input('enter a word:')
    param = {'query': kw}
    # requests appends/encodes the params onto the url for us
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '保存成功!!!')
urllib使用
"""Regex-scrape image URLs from a search page and download them via urllib.

Fix: the original did `import urllib` but called `urllib.request.urlretrieve`;
`import urllib` does not import the `request` submodule, so this could raise
AttributeError. Import the submodule explicitly.
"""
import requests
import re
import os
import urllib.request

dirName = "imgLab"
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = "/s?wd=%E7%8B%97&tn=98012088_5_dg&ch=11"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37',
}
response = requests.get(url=url, headers=headers)
page_text = response.text
# non-greedy capture of the src attribute inside each result card
ex = '<div class="op-img-address-divide-high">.*?<img src="(.*?)" class=.*?</div>'
img_src_list = re.findall(ex, page_text, re.S)
for src in img_src_list:
    imgPath = dirName + "/" + src.split('/')[-1]
    src = src + '&fm=26'
    urllib.request.urlretrieve(src, imgPath)
    print(imgPath, '下载成功!')
图片爬取
案例一
"""Collect image srcs with XPath and download each via urllib.

Fix: the original did `import urllib` but called `urllib.request.urlretrieve`;
the `request` submodule must be imported explicitly.
"""
from lxml import etree
import requests
import os
import urllib.request

fileName = "图片"
if not os.path.exists(fileName):
    os.mkdir(fileName)
url = "/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
arr = []
for li in li_list:
    # host prefix was scrubbed to '' in the notes — TODO restore before running
    href = '' + li.xpath(' ./a/span/img/@src')[0]
    arr.append(href)
for ar in arr:
    filePath = fileName + '/' + ar.split('/')[-1]
    urllib.request.urlretrieve(ar, filePath)
print("爬取结束!!!")
案例二
# -*- coding: utf-8 -*-
"""Download one image as raw bytes and write it to disk."""
import requests

if __name__ == "__main__":
    # how to crawl image data:
    # .text -> str, .content -> bytes, .json() -> object
    url = '/th/id/R6706ad2e7a68edabddbc1b5644707c4f?rik=u8uR%2bWe5bxIosA&riu=http%3a%2f%%2fuploads%2fpc%2fplace2%2f-09-14%2f9aab9bb7-2593-4ca6-8c5a-31355443aebc.jpg&ehk=HpOwqU6w6%2fssF4CJQMbTOshMh4lIXJONXU%2btYNsAKSI%3d&risl=1&pid=ImgRaw'
    img_data = requests.get(url=url).content
    with open('./qiutu.jpg', 'wb') as fp:
        fp.write(img_data)
获取动态数据
# -*- coding: utf-8 -*-
"""Fetch paginated company IDs, then fetch each company's detail record.

Fixes vs. the notes:
- the detail loop posted to `url` (the listing endpoint) instead of `post_url`;
- the file handle was never closed — replaced with a context manager;
- loop variable renamed so it no longer shadows the builtin `id`.
NOTE(review): both endpoint urls and the paging payload were scrubbed to
empty values in the notes — `page` is built but unused; restore the real
`data` payload (which should carry the page number) before running.
"""
import requests
import json

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # batch-fetch the id of every company
    url = ''
    id_list = []        # company ids
    all_data_list = []  # every company's detail record
    for page in range(1, 6):
        page = str(page)
        data = {}  # TODO: scrubbed — should include the page number
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    # fetch the detail record for every collected id
    post_url = ''
    for company_id in id_list:
        data = {'id': company_id}
        detail_json = requests.post(url=post_url, headers=headers, data=data).json()
        all_data_list.append(detail_json)
    # persist all_data_list
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')
session和cokkie的处理
使用xpath解析
案例一
# -*- coding: utf-8 -*-
"""XPath cheat-sheet: parse a local r.html and try various selectors."""
from lxml import etree

if __name__ == '__main__':
    # build an etree object with the source document loaded into it
    tree = etree.parse('r.html')
    # other selectors from the notes:
    #   tree.xpath('/html/body/div')                      absolute path
    #   tree.xpath('/html//div')                          descendant
    #   tree.xpath('//div')                               anywhere
    #   tree.xpath('//div[@class="song"]')                attribute filter
    #   tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    #   tree.xpath('//li[7]//text()')
    #   tree.xpath('//div[@class="tang"]//text()')
    r = tree.xpath('//div[@class="song"]/img/@src')
    print(r)
案例二
# -*- coding: utf-8 -*-
"""Requirement: crawl 58.com second-hand housing titles into 58.txt.

Fix: replaced the manual open/close pair with a `with` block so the file is
closed even if a request or parse step raises mid-loop.
"""
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = '/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # parse: each td[@class="t"] is one listing
    tree = etree.HTML(page_text)
    td_list = tree.xpath('//td[@class="t"]')
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for td in td_list:
            title = td.xpath('./a/text()')[0]
            print(title)
            fp.write(title + '\n')
案例三
# -*- coding: utf-8 -*-
"""Requirement: parse a gallery page and download every image."""
import requests
from lxml import etree
import os

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = '/4kyouxi/'
    response = requests.get(url=url, headers=headers)
    # response.encoding = 'utf-8'  # set manually if needed
    page_text = response.text
    # parse out every gallery item
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    # create the output folder once
    if not os.path.exists('./picLibs'):
        os.mkdir('./picLibs')
    for li in li_list:
        img_src = '' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # generic fix for mojibake: re-encode latin-1 bytes as gbk
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # download the image bytes and persist them
        img_data = requests.get(url=img_src, headers=headers).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
案例四
# -*- coding: utf-8 -*-
"""Requirement: parse out every city name from the air-quality history page.

Fix: removed the first-draft implementation that sat in a no-op triple-quoted
string (dead code), keeping only the final version that grabs hot cities and
all cities with a single `|`-union XPath.
"""
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    # fetch the page source
    url = '/historydata/'
    response = requests.get(url=url, headers=headers)
    # response.encoding = 'utf-8'  # set manually if needed
    page_text = response.text
    tree = etree.HTML(page_text)
    # one expression matches both the hot-city list and the full city list
    a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/div[2]/li/a')
    all_city_names = [a.xpath('./text()')[0] for a in a_list]
    print(all_city_names, len(all_city_names))
使用正则解析
案例一
# -*- coding: utf-8 -*-
"""Requirement: crawl every image in the qiushibaike picture board (regex)."""
import requests
import re
import os

if __name__ == '__main__':
    # create a folder to hold all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    url = '/imgrank/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # generic crawl: grab the whole page
    page_text = requests.get(url=url, headers=headers).text
    # focused crawl: regex out every image src
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        # complete the protocol-relative url
        src = 'https:' + src
        # fetch the raw image bytes
        img_data = requests.get(url=src, headers=headers).content
        # file name = last path segment
        img_name = src.split('/')[-1]
        imgPath = './qiutuLibs/' + img_name
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
案例二
# -*- coding: utf-8 -*-
"""Requirement: crawl qiushibaike picture board across multiple pages."""
import requests
import re
import os

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    # create a folder to hold all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    # a generic url template with the page number as a placeholder
    url = '/imgrank/page/%d/'
    for pageNum in range(1, 3):
        # url for this page
        new_url = format(url % pageNum)
        page_text = requests.get(url=new_url, headers=headers).text
        # focused crawl: regex out every image src on the page
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(ex, page_text, re.S)
        for src in img_src_list:
            # complete the protocol-relative url
            src = 'https:' + src
            # fetch the raw image bytes
            img_data = requests.get(url=src, headers=headers).content
            img_name = src.split('/')[-1]
            imgPath = './qiutuLibs/' + img_name
            with open(imgPath, 'wb') as fp:
                fp.write(img_data)
            print(img_name, '下载成功!!!')
BeautifulSoup使用
案例一
# -*- coding: utf-8 -*-
"""Crawl every chapter of a novel: titles from the index page, bodies from
each detail page, all appended into sanguo.txt.

Fix: the output file handle was opened and never closed; wrapped the loop in
a `with` block so the file is flushed/closed even if a request raises.
"""
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # crawl the index page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    url = '/book/sanguoyanyi.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'utf-8'
    page_text = page_text.text
    # 1. load the index page source into a BeautifulSoup object
    soup = BeautifulSoup(page_text, 'lxml')
    # 2. parse out chapter titles and detail-page urls
    li_list = soup.select('.book-mulu>ul>li')
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            detail_url = '' + li.a['href']
            # request the detail page and parse the chapter body
            detail_page_text = requests.get(url=detail_url, headers=headers)
            detail_page_text.encoding = 'utf-8'
            detail_page_text = detail_page_text.text
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.text
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')
案例二
"""Download the first 10 chapters of a novel, one .txt file per chapter."""
from bs4 import BeautifulSoup
import requests
import os

fileName = 'novel'
if not os.path.exists(fileName):
    os.mkdir(fileName)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37',
    'Connection': 'close'
}
url = "/book/sanguoyanyi.html"
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
title = soup.select('.book-mulu > ul > li > a')
cnt = 0
for t in title:
    # follow each chapter link (host prefix scrubbed in the notes)
    href = '' + t['href']
    response = requests.get(url=href, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    div = soup.find('div', class_='card bookmark-list')
    filePath = fileName + '/' + t.string + '.txt'
    pageTxt = div.text
    with open(filePath, 'w', encoding='utf-8') as fp:
        fp.write(pageTxt)
    print('爬取成功!!!')
    cnt += 1
    # stop after the first ten chapters
    if cnt == 10:
        break
seleium自动化爬虫
解决iframe问题
案例一
# -*- coding: utf-8 -*-
"""Requirement: simulate a QQ login via selenium (iframe + form fill)."""
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('/')
# the login form lives inside an iframe — switch scope first
bro.switch_to.frame('login_frame')
a_tag = bro.find_element_by_id("switcher_plogin")
a_tag.click()
userName_tag = bro.find_element_by_id('u')
password_tag = bro.find_element_by_id('p')
sleep(1)
userName_tag.send_keys('1292161328')
sleep(1)
password_tag.send_keys('1234567890')
sleep(1)
btn = bro.find_element_by_id('login_button')
btn.click()
sleep(3)
bro.quit()
案例二
# -*- coding: utf-8 -*-
"""Open a page with selenium and parse company names out of page_source.

Fix: the original called `bro.add_argument('-kiosk')` — WebDriver instances
have no `add_argument`; browser flags must be set on a ChromeOptions object
passed to the constructor.
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from time import sleep

# browser flags go on ChromeOptions, not on the driver
chrome_options = Options()
chrome_options.add_argument('--kiosk')
# instantiate a browser object (pass the driver executable)
bro = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
# navigate to the target url
bro.get('http://scxk.:81/xk/')
# grab the rendered page source
page_text = bro.page_source
# parse out the company names
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="gzlist"]/li')
for li in li_list:
    name = li.xpath('./dl/@title')[0]
    print(name)
sleep(5)
bro.quit()
案例三
其他自动化操作
"""Selenium interaction demo: locate, type, scroll via JS, click, navigate."""
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('/')
# locate the search box
search_input = bro.find_element_by_id('q')
# interact with the element
search_input.send_keys('Iphone')
# run a snippet of JS: scroll to the bottom of the page
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)
# click the search button
btn = bro.find_element_by_css_selector('.btn-search')
btn.click()
bro.get('')
sleep(2)
# history back
bro.back()
sleep(2)
# history forward
bro.forward()
sleep(5)
bro.quit()
案例四
实现无界面
# -*- coding: utf-8 -*-
"""Run Chrome headless and dodge basic automation detection."""
from selenium import webdriver
from time import sleep
# headless support
from selenium.webdriver.chrome.options import Options
# detection evasion
from selenium.webdriver import ChromeOptions

# headless (no visible window) configuration
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# hide the "controlled by automation" switches
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# combine both option sets (phantomJS-style headless browsing)
bro = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options, options=option)
bro.get('')
print(bro.page_source)
sleep(2)
bro.quit()
案例五
"""Search Baidu for a keyword and click through to a result."""
from selenium import webdriver
from time import sleep

# path to your browser driver; prefix with r'' to avoid escape issues
driver = webdriver.Chrome(r'./chromedriver')
# open the Baidu home page
driver.get("")
# (settings-panel walkthrough from the notes, kept for reference)
# driver.find_elements_by_link_text('设置')[0].click(); sleep(2)
# driver.find_elements_by_link_text('搜索设置')[0].click(); sleep(2)
# m = driver.find_element_by_id('nr'); sleep(2)
# m.find_element_by_xpath('//*[@id="nr"]/option[3]').click()
# m.find_element_by_xpath('.//option[3]').click(); sleep(2)
# driver.find_elements_by_class_name("prefpanelgo")[0].click(); sleep(2)
# driver.switch_to_alert().accept(); sleep(2)   # accept()/dismiss() the alert
# type the keyword into the search box
driver.find_element_by_id('kw').send_keys('美女')
sleep(2)
# click the search button
driver.find_element_by_id('su').click()
sleep(2)
# open a specific result by its link text
driver.find_elements_by_link_text('美女_海量精选高清图片_百度图片')[0].click()
sleep(3)
# close the browser
driver.quit()
案例六
自动化处理iframe标签
# -*- coding: utf-8 -*-
"""Drag an element inside an iframe using an ActionChains click-and-hold."""
from selenium import webdriver
from time import sleep
# ActionChains drives mouse gestures
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('/try/try.php?filename=jqueryui-api-droppable')
# elements inside an iframe can only be located after switching scope
bro.switch_to.frame('iframeResult')
div = bro.find_element_by_id('draggable')
# build the drag gesture
action = ActionChains(bro)
# press and hold the draggable element
action.click_and_hold(div)
for i in range(5):
    # move_by_offset(x, y): x horizontal, y vertical; perform() executes now
    action.move_by_offset(17, 0).perform()
    sleep(0.3)
# release the mouse button
action.release()
print(div)
案例七
基于selenium的12306用户登录
# -*- coding: utf-8 -*-
"""12306 login: screenshot the captcha, send it to the Chaojiying OCR
service, click the returned coordinates, then submit the login form.

Fix: repaired the comment that was garbled across a line break in the notes
("本地图片文件路径 ... 有时WIN系统须要//").
"""
import requests
from hashlib import md5


class Chaojiying_Client(object):
    """Thin HTTP client for the Chaojiying captcha-recognition API."""

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        # the API expects the md5 hex digest, not the raw password
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """im: image bytes; codetype: task type, see /price.html"""
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """im_id: image ID of a misrecognized captcha (refund request)."""
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


# open the login page with selenium
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.maximize_window()  # full screen so coordinates are stable
bro.get('/otn/resources/login.html')
time.sleep(1)
# switch to account login
bro.find_elements_by_link_text('账号登录')[0].click()
time.sleep(1)
# screenshot the whole page
bro.save_screenshot('aa.png')
# locate the captcha element to derive its crop rectangle
code_img_ele = bro.find_element_by_css_selector('#J-loginImg')
location = code_img_ele.location  # top-left corner of the captcha
print('location:', location)
size = code_img_ele.size  # width and height of the captcha element
print('size:', size)
# top-left and bottom-right corners of the crop region
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']), int(location['y'] + size['height']))
# crop the captcha out of the page screenshot
i = Image.open('./aa.png')
code_img_name = './code.png'
frame = i.crop(rangle)
frame.save(code_img_name)
# hand the captcha image to Chaojiying for recognition
chaojiying = Chaojiying_Client('1292161328', 'wuxiangnong', '915445')  # user center >> software ID
im = open('code.png', 'rb').read()  # local image path (may need adjusting on Windows)
# print(chaojiying.PostPic(im, 9004)['pic_str'])
result = chaojiying.PostPic(im, 9004)['pic_str']
# collect the click coordinates: "x1,y1|x2,y2" -> [[x1,y1],[x2,y2]]
all_list = []
if '|' in result:
    list_1 = result.split('|')
    count_1 = len(list_1)
    for i in range(count_1):
        xy_list = []
        x = int(list_1[i].split(',')[0])
        y = int(list_1[i].split(',')[1])
        xy_list.append(x)
        xy_list.append(y)
        all_list.append(xy_list)
else:
    x = int(result.split(',')[0])
    y = int(result.split(',')[1])
    xy_list = []
    xy_list.append(x)
    xy_list.append(y)
    all_list.append(xy_list)
print(all_list)
# click each returned coordinate, offset from the captcha element
for l in all_list:
    x = l[0]
    y = l[1]
    ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
    time.sleep(0.5)
bro.find_element_by_id('J-userName').send_keys('19828430139')
time.sleep(2)
bro.find_element_by_id('J-password').send_keys('wuxiangnong9595')
time.sleep(2)
bro.find_element_by_id('J-login').click()
time.sleep(3)
bro.quit()
代理的使用
案例一
# -*- coding: utf-8 -*-
"""Issue one GET through an HTTPS proxy and save the result page.

Fix: the original assigned the Response object itself to `page_text` and
passed it to `fp.write`, which raises TypeError — `.text` was missing.
"""
import requests

url = '/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
}
page_text = requests.get(url=url, headers=headers,
                         proxies={"https": "222.110.147.50:3128"}).text
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
验证码解析
# -*- coding: utf-8 -*-
"""Download a login captcha image and (via a captcha-service helper) OCR it.

Fix: `'' + tree.xpath(...)` concatenated a str with the list returned by
xpath — TypeError at runtime; the first match must be indexed with [0].
NOTE(review): `YDMHttp` is the captcha-service SDK class imported elsewhere
in the original notes; it is not defined in this snippet.
"""
import requests
from lxml import etree


def getCodeText(imgPath, codeType):
    """Send a local captcha image to the YunDaMa service; return the text."""
    username = 'bobo328410948'          # ordinary-user account
    password = 'bobo328410948'          # ordinary-user password
    appid = 6003                        # developer software ID
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'  # developer software key
    filename = imgPath                  # path of the captcha image
    # captcha type, e.g. 1004 = 4 alphanumeric chars; see /price.html
    codetype = codeType
    timeout = 20                        # seconds
    result = None
    if (username == 'username'):
        print('请设置好相关参数再测试')
    else:
        # initialize the SDK client
        yundama = YDMHttp(username, password, appid, appkey)
        uid = yundama.login()
        print('uid: %s' % uid)
        balance = yundama.balance()
        print('balance: %s' % balance)
        # recognize: image path, type ID, timeout -> (cid, text)
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
    return result


if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
    }
    url = '/user/login.aspx'
    page_text = requests.get(url=url, headers=headers).text
    # parse the src attribute of the captcha <img>
    tree = etree.HTML(page_text)
    code_img_src = '' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    img_data = requests.get(url=code_img_src, headers=headers).content
    # save the captcha image locally
    with open('./code.jpg', 'wb') as fp:
        fp.write(img_data)
    # hand off to the captcha service here
协程的使用
案例一
# -*- coding: utf-8 -*-
"""asyncio basics: coroutine object, Task, and a done-callback.

Fixes vs. the notes:
- `print(task.result)` printed the bound method; it must be called: `task.result()`;
- the same coroutine object was wrapped into two different Tasks (create_task
  then ensure_future), which raises at runtime — each Task now gets a fresh
  coroutine object;
- an explicit new_event_loop() replaces the deprecated implicit get_event_loop().
"""
import asyncio


async def request(url):
    # an async-def function returns a coroutine object when called
    print('正在请求的url是', url)
    print('请求成功', url)
    return url


# task usage: wrap a coroutine in a Task and run it
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
task = loop.create_task(request(''))
print(task)
loop.run_until_complete(task)


def callback_func(task):
    # result() returns the wrapped coroutine's return value
    print(task.result())


# bind a callback that fires when the task completes
task = asyncio.ensure_future(request(''), loop=loop)
task.add_done_callback(callback_func)
loop.run_until_complete(task)
同步爬虫
"""Baseline synchronous crawler: fetch each archive one after another."""
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76'
}
urls = {
    'http://xmdx./Files/DownLoad/jianli/04/jianli10231.rar',
    'http://zjlt./Files/DownLoad/jianli/04/jianli10229.rar',
    'http://xmdx./Files/DownLoad/jianli/04/jianli10231.rar'
}


def get_content(url):
    """Fetch one url; return its body bytes on HTTP 200, else None."""
    print('正在爬取:', url)
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        return response.content


def parse_content(content):
    """Placeholder parse step: just report the payload size."""
    print('响应数据的长度为:', len(content))


for url in urls:
    parse_content(get_content(url))
多线程异步爬虫的使用
案例一
# -*- coding: utf-8 -*-
"""Three concurrent coroutines: total wall time ~2s instead of 6s."""
import asyncio
import time


async def request(url):
    print('正在下载', url)
    # sync calls (e.g. time.sleep) would block the loop and kill concurrency;
    # blocking work must be awaited so the loop can switch tasks
    await asyncio.sleep(2)
    print('下载完毕', url)


start = time.time()
urls = {'', '', ''}
# build one Task per url
stasks = [asyncio.ensure_future(request(u)) for u in urls]
loop = asyncio.get_event_loop()
# the task list must be wrapped in asyncio.wait
loop.run_until_complete(asyncio.wait(stasks))
print(time.time() - start)
案例二
# -*- coding: utf-8 -*-
"""Teaching example: requests.get inside a coroutine is still synchronous,
so these three fetches do NOT overlap (use aiohttp for true async I/O)."""
import requests
import asyncio
import time

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom'
]


async def get_page(url):
    print('正在下载', url)
    # requests is sync/blocking — an async network module is required here
    response = requests.get(url=url)
    print(response.text)


tasks = [asyncio.ensure_future(get_page(u)) for u in urls]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print('耗时', end - start)
线程池
案例三
# -*- coding: utf-8 -*-
"""Single-thread serial baseline: four 2-second jobs take ~8 seconds."""
import time


def get_page(str):
    print("正在下载:", str)
    time.sleep(2)
    print('下载成功:', str)


name_list = ['xiaozi', 'aa', 'bb', 'cc']
start_time = time.time()
for name in name_list:
    get_page(name)
end_time = time.time()
print('%d second' % (end_time - start_time))
案例四
# -*- coding: utf-8 -*-
"""Thread-pool version: four 2-second jobs run in parallel, ~2 seconds."""
import time
# Pool from multiprocessing.dummy is a thread pool, not a process pool
from multiprocessing.dummy import Pool

start_time = time.time()


def get_page(str):
    print("正在下载:", str)
    time.sleep(2)
    print('下载成功:', str)


name_list = ['xiaozi', 'aa', 'bb', 'cc']
# a pool of 4 worker threads
pool = Pool(4)
# map each list element onto get_page across the pool
pool.map(get_page, name_list)
end_time = time.time()
print(end_time - start_time)
异步协程
案例五
aiohttp实现任务异步协程
# -*- coding: utf-8 -*-
"""True async fetching with aiohttp: a session + awaited response reads."""
import time
import asyncio
import aiohttp

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom'
]


async def get_page(url):
    async with aiohttp.ClientSession() as session:
        # get()/post() accept headers, params/data, proxy='http://ip:port'
        async with session.get(url) as response:
            # text() -> str, read() -> bytes, json() -> object;
            # every body read must be awaited (manual suspension point)
            page_text = await response.text()
            print(page_text)


tasks = [asyncio.ensure_future(get_page(u)) for u in urls]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print('耗时', end - start)
分布式爬虫
简单练手项目
肯德基破解
# -*- coding: utf-8 -*-
"""Query the KFC store-locator endpoint and save the raw response.

Fix: replaced the manual open/write/close with a `with` block so the file is
closed even if the write raises.
"""
import requests

if __name__ == "__main__":
    url = '/kfccda/ashx/GetStoreList.ashx?op=keyword'
    param = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.75'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.text
    with open('./KFC.text', 'w', encoding='utf-8') as fp:
        fp.write(list_data)
    print('over!!!')
爬取简历模板
"""Crawl free resume templates: list page -> detail page -> first download link.

Fix: removed the redundant `fp.close()` that followed the `with` block —
the context manager already closes the file.
"""
import requests
import os
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/53'
    }
    url = '/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    # create the output folder
    if not os.path.exists('./new'):
        os.mkdir('./new')
    # parse the listing page
    tree = etree.HTML(page_text)
    a_lists = tree.xpath('//div[@id="container"]/div/a')
    for a in a_lists:
        href = a.xpath('./@href')[0]
        src = 'https:' + href
        # follow the detail page and take its first download link
        page_text_detail = requests.get(url=src, headers=headers).text
        treeDetail = etree.HTML(page_text_detail)
        a_lists_products = treeDetail.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')[0]
        href2 = a_lists_products.xpath('./a/@href')[0]
        # file name = trailing 7 chars of the download url
        products_name = href2[-7:]
        response = requests.get(url=href2, headers=headers)
        data_products = response.content
        data_path = 'new/' + products_name
        with open(data_path, 'wb') as fp:
            fp.write(data_products)
        print(products_name, "下载成功!!!")
百度AI实现爬虫
如果觉得《Python 爬虫总结——案例代码》对你有帮助,请点赞、收藏,并留下你的观点哦!