失眠网 > python 爬取5566图库图片

python 爬取5566图库图片

时间：2023-06-25 10:32:54

相关推荐

python 爬取5566图库图片

import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Patterns that pull the quoted value out of href="..." / src="..."
# attributes. Compiled once here because they are reused for every page.
_REGEX_FLAGS = re.I | re.S | re.M
_HREF_RE = re.compile(
    r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", _REGEX_FLAGS)
_SRC_RE = re.compile(
    r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')", _REGEX_FLAGS)


class GetGirlsPhoto(object):
    """Crawl one gallery index page and download the pictures it links to.

    The crawl runs in three stages, each calling the previous one:
    ``get_url_list`` collects the pager links of the index page,
    ``get_pic_link`` collects the series entry links found on those pages,
    and ``get_pic`` walks every series and writes one image per detail page
    into ``<repository_name>/<series title>/``.

    Args:
        head_url: URL of the gallery index page. It is passed to
            ``requests.get`` as-is and is also used as the prefix for the
            pager links.
        repository_name: Directory under which the series folders are
            created.
    """

    # Seconds to wait on any single HTTP request; without a timeout a
    # stalled connection would block the whole crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, head_url, repository_name):
        self.url = head_url
        self.list_url = []            # pager page URLs
        self.list_pic_url = dict()    # series title -> series entry URL
        self.header_file = 'user_agents.txt'
        self.path = repository_name

    def _get(self, url):
        """Fetch *url* with a random User-Agent and the class timeout."""
        return requests.get(url, headers=self.get_header(),
                            timeout=self.REQUEST_TIMEOUT)

    def chartset(self, rsp):
        """Fix the response encoding when the server did not declare one.

        requests reports ``ISO-8859-1`` for text responses whose
        Content-Type header carries no charset. In that case the encoding
        declared inside the document is used; if the document declares
        none, the encoding detected from the body is used instead.
        """
        _chart = requests.utils.get_encoding_from_headers(rsp.headers)
        if _chart == 'ISO-8859-1':
            # get_encodings_from_content returns a LIST of candidates;
            # Response.encoding must be a single codec name.
            declared = requests.utils.get_encodings_from_content(rsp.text)
            rsp.encoding = declared[0] if declared else rsp.apparent_encoding

    def get_header(self):
        """Return request headers with a randomly chosen User-Agent.

        The candidates are the non-blank lines of ``self.header_file``.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If the file holds no non-blank line.
        """
        with open(self.header_file, 'r') as f:
            # Skip blank lines so an empty User-Agent is never sent.
            agents = [line.strip() for line in f if line.strip()]
        return {'User-Agent': random.choice(agents)}

    def get_url_list(self):
        """Collect the pager links of the index page into ``list_url``."""
        rsp = self._get(self.url)
        self.chartset(rsp)
        soup = BeautifulSoup(rsp.text, 'lxml')
        pager_anchors = soup.find_all('a', target='_self')
        hrefs = _HREF_RE.findall(str(pager_anchors))
        # The first and the last three matches are dropped; presumably they
        # are navigation entries rather than page numbers - TODO confirm
        # against the live markup.
        for href in hrefs[1:-3]:
            self.list_url.append(self.url + href)
        print('获取\\“%s\\”子链接成功' % self.url)

    def get_pic_link(self):
        """Collect every series entry link into ``list_pic_url``.

        Runs ``get_url_list`` first, then scans each pager page for
        ``<a class="picLink">`` anchors and stores ``title -> href``.
        """
        self.get_url_list()
        for url in self.list_url:
            rsp = self._get(url)
            self.chartset(rsp)
            soup = BeautifulSoup(rsp.text, 'lxml')
            for anchor in soup.find_all('a', class_='picLink'):
                title = anchor.get('title')
                href = anchor.get('href')
                # Both values are needed later (folder name and URL); an
                # anchor missing either one cannot be downloaded.
                if not title or not href:
                    continue
                self.list_pic_url[title] = href
            time.sleep(1)  # pause between pager pages
            print('获取\\“%s\\”子链接成功！' % url)

    def _download_image(self, page_url, img_path):
        """Save the first image of a detail page to *img_path*.

        Returns:
            True if an image was written, False if the page's
            ``articleBody`` block contained no ``src`` attribute.
        """
        rsp = self._get(page_url)
        self.chartset(rsp)
        body = BeautifulSoup(rsp.text, 'lxml').find('div', class_='articleBody')
        img_urls = _SRC_RE.findall(str(body))
        if not img_urls:
            return False
        pic_rsp = self._get(img_urls[0])
        # The with-block flushes and closes the file on exit.
        with open(img_path, 'wb') as f:
            f.write(pic_rsp.content)
        return True

    def get_pic(self):
        """Download every picture of every series found from the index page.

        Runs ``get_pic_link`` first. For each series the ``pages`` block of
        its entry page lists the detail pages; each detail page contributes
        one file named ``<title><n>.jpg``.
        """
        self.get_pic_link()
        for title, url in self.list_pic_url.items():
            print('开始下载%s系列' % title)
            rsp = self._get(url)
            self.chartset(rsp)
            pager = BeautifulSoup(rsp.text, 'lxml').find('div', class_='pages')
            page_links = _HREF_RE.findall(str(pager))
            dir_path = self.path + '/' + title
            os.makedirs(dir_path, exist_ok=True)
            # Detail pages are addressed relative to the entry page's folder.
            base_url = url.rsplit('/', 1)[0]
            # The first and last pager entries are skipped; presumably they
            # are previous/next controls - TODO confirm.
            for index, href in enumerate(page_links[1:-1]):
                if href == "#":
                    # "#" stands for the page currently shown, i.e. the
                    # entry page itself.
                    page_url = url + href
                else:
                    page_url = base_url + '/' + href
                img_name = title + str(index + 1) + '.jpg'
                if self._download_image(page_url, dir_path + '/' + img_name):
                    print('%s下载完成!' % img_name)
                else:
                    print('%s未找到图片地址，已跳过' % img_name)
                time.sleep(3)  # pause between image downloads
            print("*" * 30)


def main():
    """Download each configured gallery into a folder named after it."""
    # NOTE(review): these are site-relative paths. requests needs an
    # absolute URL (scheme + host), so the site's base URL has to be
    # prepended before this can run; the host is not recoverable from this
    # file.
    urls = ['/a/Mygirl',
            '/a/Beautyleg']
    for url in urls:
        path_name = url.rsplit('/', 1)[1]
        print(url, path_name)
        downloader = GetGirlsPhoto(head_url=url, repository_name=path_name)
        downloader.get_pic()
        time.sleep(120)  # long pause between galleries


if __name__ == '__main__':
    main()

如果觉得《python 爬取5566图库图片》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。