
Python Crawler: Collecting WeChat Official Account Topic Tag Content and Printing to PDF

Posted: 2019-01-10 05:51:22

Scraping WeChat official account content is rather quirky: the request parameters, especially the POST parameters, take real time to work out. What is collected here is the content under a topic tag, and pdfkit is used to print the collected content out as PDF.
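For context, pdfkit is a thin wrapper around the wkhtmltopdf binary; below is a minimal sketch of the printing step used in both versions (the wkhtmltopdf path is a placeholder for wherever your local install lives):

import pdfkit

# Point pdfkit at a local wkhtmltopdf binary; the path is machine-specific.
config = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltopdf\bin\wkhtmltopdf.exe')

# Render an HTML string straight to a PDF file.
html = '<html><head><meta charset="UTF-8"></head><body><h1>demo</h1></body></html>'
pdfkit.from_string(html, 'demo.pdf', configuration=config)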

Two versions are implemented here. The first requests the page directly; its real address, the POST URL, also carries quite a few parameters that I did not try to reproduce, so it only retrieves part of the content and the result is not ideal. The second version uses a headless browser to load the page directly, grabs the rendered page source, parses it, and extracts the content we want.
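A side note on "headless": version 2 below actually opens a visible Chrome window through chromedriver. To run Chrome genuinely headless you can pass options at startup; a minimal sketch in the same Selenium 3 style as the code below (the chromedriver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')     # run Chrome without opening a window
options.add_argument('--disable-gpu')  # commonly added on Windows

# executable_path matches the Selenium 3 style used in version 2 below
browser = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe', options=options)
browser.get('https://example.com')
print(len(browser.page_source))
browser.quit()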

This scrub is pretty lazy these days, so all the code is recycled: old, ready-made stuff, copied over, lightly tweaked, and used as-is!

Version 1:

# WeChat official account content scraping and PDF printing
# by WeChat: huguo00289
# https://mp./mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
import os, re
import pdfkit

confg = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')


class Du():
    def __init__(self, furl):
        ua = UserAgent()
        self.headers = {
            "User-Agent": ua.random,
        }
        self.url = furl

    def get_urls(self):
        # fetch the topic homepage and pull article links out of the embedded JS data
        response = requests.get(self.url, headers=self.headers, timeout=8)
        html = response.content.decode('utf-8')
        req = re.findall(r'var data = {(.+?)if', html, re.S)[0]
        urls = re.findall(r',"link":"(.+?)",', req, re.S)
        urls = set(urls)  # deduplicate
        print(len(urls))
        return urls

    def get_content(self, url, category):
        response = requests.get(url, headers=self.headers, timeout=8)
        print(response.status_code)
        html = response.content.decode('utf-8')
        req = re.findall(r'<div id="img-content" class="rich_media_wrp">(.+?)var first_sceen__time', html, re.S)[0]
        # extract the title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', req, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in filenames with underscores
        print(h1)
        # extract the article body
        detail = re.findall(r'<div class="rich_media_content" id="js_content" style="visibility: hidden;">(.+?)<script nonce=".+?" type="text/javascript">', req, re.S)[0]
        data = f'<h1>{h1}</h1>\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Starting to print content!")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("Printed and saved successfully!")


if __name__ == '__main__':
    furl = "https://mp./mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output directory, named after the topic tag
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = Du(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Crawling link: {url} ..")
        try:
            data = spider.get_content(url, category)
        except Exception as e:
            print(f"Crawl error: {e}")
        datas = '%s%s%s' % (datas, '\n', data)
    spider.dypdf(category, datas, category)
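One caveat in the main loop above: if the very first page raises, `data` is referenced before it was ever assigned, and if a later page fails, the previous page's content gets appended again. A sketch of a safer accumulation loop:

# Safer variant (sketch): skip failed pages entirely instead of falling
# through to the append with a stale or undefined `data`.
datas = ''
for url in urls:
    print(f">> Crawling link: {url} ..")
    try:
        data = spider.get_content(url, category)
    except Exception as e:
        print(f"Crawl error: {e}")
        continue  # nothing gets appended for a failed page
    datas = f'{datas}\n{data}'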

Version 2:

# WeChat official account content scraping and PDF printing
# by WeChat: huguo00289
# https://mp./mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
# -*- coding: UTF-8 -*-
import requests
from selenium import webdriver
import os, re, time
import pdfkit
from bs4 import BeautifulSoup

confg = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')


class wx():
    def __init__(self, furl):
        self.url = furl
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'  # location of the chromedriver binary
        self.browser = webdriver.Chrome(executable_path=self.chrome_driver)

    def get_urls(self):
        # open the topic homepage in the browser and collect article links
        urls = []
        self.browser.get(self.url)
        hrefs = self.browser.find_elements_by_xpath("//div[@class='article_list']/a[@class='list_item js_post']")
        for href in hrefs:
            url = href.get_attribute('href')
            urls.append(url)
        print(len(urls))
        return urls

    def get_content(self, url, category):
        self.browser.get(url)
        time.sleep(5)
        # read the rendered page source via the driver's page_source attribute
        pageSource = self.browser.page_source
        soup = BeautifulSoup(pageSource, 'lxml')
        # extract the title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', pageSource, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in filenames with underscores
        print(h1)
        # extract the article body
        detail = soup.find('div', class_="rich_media_content")
        detail = str(detail)
        # strip the "follow us" banner paragraph from the article body
        del_text = """<p class="" style="margin-top: -1px;max-width: 100%;font-family: 微软雅黑;white-space: normal;min-height: 40px;visibility: visible;height: 40px;line-height: 40px;border-radius: 10px;text-align: center;box-shadow: rgb(190, 190, 190) 0px 3px 5px;color: rgb(255, 255, 255);box-sizing: border-box !important;word-wrap: break-word !important;background-image: none;background-attachment: scroll;background-color: rgb(245, 143, 198);background-position: 0% 0%;background-repeat: repeat;"><strong class="" style="max-width: 100%;box-sizing: border-box !important;word-wrap: break-word !important;"><span style="max-width: 100%;font-size: 14px;box-sizing: border-box !important;word-wrap: break-word !important;">↑点击上方<span style="max-width: 100%;box-sizing: border-box !important;word-wrap: break-word !important;">“染整百科”</span>关注我们</span></strong></p>"""
        detail = detail.replace(del_text, '')
        data = f'<h1>{h1}</h1>\n{detail}'
        self.dypdf(h1, data, category)
        return data

    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Starting to print content!")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("Printed and saved successfully!")

    def quit(self):
        self.browser.quit()


if __name__ == '__main__':
    furl = "https://mp./mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output directory, named after the topic tag
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = wx(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Crawling link: {url} ..")
        try:
            data = spider.get_content(url, category)
        except Exception as e:
            print(f"Crawl error: {e}")
        datas = '%s%s%s' % (datas, '\n', data)
    spider.quit()
    spider.dypdf(category, datas, category)
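Note that `find_elements_by_xpath` is Selenium 3 API and was removed in Selenium 4; if you run a current Selenium, the lookup in get_urls() would instead look like this:

from selenium.webdriver.common.by import By

# Selenium 4 equivalent of the XPath lookup in get_urls()
hrefs = browser.find_elements(By.XPATH, "//div[@class='article_list']/a[@class='list_item js_post']")
urls = [a.get_attribute('href') for a in hrefs]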

The code above is for reference only. Any resemblance to other code is no coincidence: this scrub definitely copied it!

WeChat official account: 二爷记

Python source code and tools shared from time to time.
