该楼层疑似违规,已被系统折叠。(隐藏此楼 / 查看此楼)
代码如下。每次输入其他页码范围时都无法爬取对应的页面,只能爬取到第一页的图片。希望有经验的老师可以指点一二,谢谢!
#encoding=utf-8
import requests
from bs4 import BeautifulSoup
import threading
from lxml import etree
def get_html(url):
    """Fetch the raw HTML source of a web page.

    Sends a GET request to *url* with a desktop-Chrome ``User-Agent`` header
    and returns the response body as undecoded bytes (``.content``).
    """
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    page = requests.get(url=url, headers={'User-Agent': user_agent})
    return page.content
def img_get_html(html):
    """Follow every article link on a listing page and scrape its images.

    Parses *html* with BeautifulSoup (lxml backend), finds each ``<a>`` tag
    whose class is ``list-group-item``, downloads the page its ``href``
    points to via ``get_html`` and hands that page to ``img_get_url``.
    """
    listing = BeautifulSoup(html, 'lxml')
    for link in listing.find_all('a', class_='list-group-item'):
        img_get_url(get_html(link['href']))
def img_get_url(html):
    """Extract image URLs from an article page and pass them to ``img_save``.

    For every ``<div class="artile_des">`` in *html*, collects the ``src``
    attribute of the images found under ``table/tbody/tr/td/a/img`` and
    calls ``img_save`` once per div with that list of URLs.
    """
    tree = etree.HTML(html)  # parse the raw HTML into an lxml element tree
    for article in tree.xpath('//div[@class="artile_des"]'):
        img_save(article.xpath('table/tbody/tr/td/a/img/@src'))
# start_save_img(img_url)
# Running counter used to number the saved image files; shared across calls.
x = 1


def img_save(img_url_list):
    """Download every image in *img_url_list* and write it to disk.

    Each image is saved as ``../斗图/<counter><last four characters of the
    URL>``, where the counter is the module-level ``x``; ``x`` is incremented
    once per image so file names never collide within a run. The target
    directory is not created here and must already exist.

    Args:
        img_url_list: iterable of image URLs (strings). An empty iterable is
            a no-op.
    """
    global x
    for img_url in img_url_list:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('正在下载'+img_url)
        img_content = requests.get(img_url).content
        # The last four characters of the URL serve as the extension,
        # e.g. ".jpg" or ".gif".
        with open('../斗图/'+str(x)+img_url[-4:], 'wb') as f:
            x += 1
            f.write(img_content)
#def start_save_img(img_url):
# th=threading.Thread(target=img_save,args=(img_url,))
# th.start()
def main():
    """Prompt for an inclusive page range and scrape every listing page in it.

    Reads the start and end page numbers from standard input, then for each
    page number builds the listing URL, downloads it with ``get_html`` and
    passes the HTML to ``img_get_html``.

    Raises:
        ValueError: if either entered page number is not an integer.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    start_pn = int(read_line("请输入开始页码:"))
    end_pn = int(read_line("请输入结束页码:"))

    # The template needs a "{}" placeholder: without it str.format() returns
    # the string unchanged and every iteration requests the same URL, so only
    # one page is ever crawled regardless of the range entered.
    # NOTE(review): this URL has no scheme/host as written; requests needs an
    # absolute URL, so confirm and prepend the site's base address.
    page_url_template = "/article/list/?page={}"

    for pn in range(start_pn, end_pn + 1):
        listing_html = get_html(page_url_template.format(pn))
        img_get_html(listing_html)
# Start the scraper only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == '__main__':
    main()
如果觉得《python爬取百度贴吧图片只能爬取置顶帖_交流帖 爬取前两页的图片 不能爬取指定页面...》对你有帮助,请点赞、收藏,并留下你的观点哦!