
Python crawler: collecting WeChat official account topic-tag content and printing it to PDF

Collecting WeChat official account content is rather fiddly: the request parameters, in particular the POST parameters, take time to figure out. What is collected here is the content under a topic tag, and pdfkit is used to print the collected content out as a PDF.
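pdfkit is just a thin wrapper around the wkhtmltopdf command-line tool, so the binary has to be installed separately and pointed to explicitly on Windows. A minimal sketch of the setup (the install path and the options dict here are examples, not taken from the scripts below):

# pip install pdfkit, plus the wkhtmltopdf binary from the wkhtmltopdf website
import pdfkit

# point pdfkit at the wkhtmltopdf executable (example Windows path, adjust to your install)
config = pdfkit.configuration(wkhtmltopdf=r'D:\wkhtmltox\bin\wkhtmltopdf.exe')

# render an HTML string straight to a PDF file; the encoding option keeps Chinese text intact
pdfkit.from_string('<h1>测试</h1>', 'test.pdf', configuration=config,
                   options={'encoding': 'UTF-8'})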

Two versions are implemented here. The first fetches the web page directly; its real address, i.e. the POST URL, also carries quite a few parameters that I never worked out, so it only retrieves part of the content, which is not ideal (presumably because the homepage embeds only the first batch of article links in its inline JavaScript and loads the rest asynchronously). The second version drives a headless browser to visit the page, grabs the rendered page source, parses it, and extracts the desired content.
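One note on "headless": as written, version 2 actually opens a visible Chrome window. To make it genuinely headless, something along these lines should work (a sketch assuming Selenium 3.x, matching the executable_path style used below):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')     # run Chrome without a visible window
chrome_options.add_argument('--disable-gpu')  # commonly recommended on Windows
browser = webdriver.Chrome(
    executable_path=r'C:\path\to\chromedriver.exe',  # adjust to your chromedriver location
    options=chrome_options)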

I'm feeling pretty lazy these days, so the code is all recycled: taken from earlier, ready-made scripts, copied, tweaked a bit, and used as-is!

Version 1:

# -*- coding: UTF-8 -*-
# WeChat official account content scraper with PDF output
# by WeChat: huguo00289
# Target: https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
import requests
from fake_useragent import UserAgent
import os, re
import pdfkit


config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')  # path to the wkhtmltopdf binary

class Du():
    def __init__(self, furl):
        ua = UserAgent()
        self.headers = {
            "User-Agent": ua.random,  # random User-Agent for each run
        }
        self.url = furl

    def get_urls(self):
        # Fetch the topic homepage and pull the article links out of the inline JS data blob
        response = requests.get(self.url, headers=self.headers, timeout=8)
        html = response.content.decode('utf-8')
        req = re.findall(r'var data={(.+?)if', html, re.S)[0]
        urls = re.findall(r',"link":"(.+?)",', req, re.S)

        urls = set(urls)  # dedupe; note this loses the original article order
        print(len(urls))

        return urls



    def get_content(self, url, category):
        response = requests.get(url, headers=self.headers, timeout=8)
        print(response.status_code)
        html = response.content.decode('utf-8')
        req = re.findall(r'<div id="img-content" class="rich_media_wrp">(.+?)var first_sceen__time', html, re.S)[0]

        # Get the title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', req, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in file names with underscores
        print(h1)

        # Get the article body
        detail = re.findall(r'<div class="rich_media_content " id="js_content" style="visibility: hidden;">(.+?)<script nonce=".+?" type="text/javascript">', req, re.S)[0]

        data = f'<h1>{h1}</h1>\n{detail}'

        self.dypdf(h1, data, category)

        return data




    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Printing content ...")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=config)
        print("PDF printed and saved!")




if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output directory, named after the topic
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = Du(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Crawling link: {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # append only on success, so a failed fetch can't reuse stale data
        except Exception as e:
            print(f"Crawl error: {e}")

    spider.dypdf(category, datas, category)
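One caveat in version 1: urls = set(urls) removes duplicates but also scrambles the order, so the articles land in the combined PDF in arbitrary order. If order matters, an order-preserving dedupe is a small fix (a sketch, not part of the original script):

def dedupe_keep_order(items):
    # drop duplicates while keeping first-seen order
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered

# in get_urls(), replace `urls = set(urls)` with:
# urls = dedupe_keep_order(urls)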
 

Version 2:

# -*- coding: UTF-8 -*-
# WeChat official account content scraper with PDF output
# by WeChat: huguo00289
# Target: https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
from selenium import webdriver
import os, re, time
import pdfkit
from bs4 import BeautifulSoup



config = pdfkit.configuration(
    wkhtmltopdf=r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe')  # path to the wkhtmltopdf binary

class wx():
    def __init__(self, furl):
        self.url = furl
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'  # location of the chromedriver binary
        self.browser = webdriver.Chrome(executable_path=self.chrome_driver)  # Selenium 3 style; Selenium 4 would use a Service object


    def get_urls(self):
        # Collect the article links from the rendered topic page
        urls = []
        self.browser.get(self.url)
        hrefs = self.browser.find_elements_by_xpath("//div[@class='article_list']/a[@class='list_item js_post']")
        for href in hrefs:
            url = href.get_attribute('href')
            urls.append(url)

        print(len(urls))

        return urls



    def get_content(self, url, category):
        self.browser.get(url)
        time.sleep(5)  # give the page time to render
        # Get the page source via the driver's page_source attribute
        pageSource = self.browser.page_source
        soup = BeautifulSoup(pageSource, 'lxml')

        # Get the title
        h1 = re.findall(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>', pageSource, re.S)[0]
        h1 = h1.strip()
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in file names with underscores
        print(h1)

        # Get the article body
        detail = soup.find('div', class_="rich_media_content")
        detail = str(detail)
        # Strip the "follow us" banner that every article in this account carries
        del_text = """<p class="" style="margin-top: -1px; max-width: 100%; font-family: 微软雅黑; white-space: normal; min-height: 40px; visibility: visible; height: 40px; line-height: 40px; border-radius: 10px; text-align: center; box-shadow: rgb(190, 190, 190) 0px 3px 5px; color: rgb(255, 255, 255); box-sizing: border-box !important; word-wrap: break-word !important; background-image: none; background-attachment: scroll; background-color: rgb(245, 143, 198); background-position: 0% 0%; background-repeat: repeat;"><strong class="" style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;"><span style="max-width: 100%; font-size: 14px; box-sizing: border-box !important; word-wrap: break-word !important;">↑ 点击上方<span style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;">“染整百科”</span>关注我们</span></strong></p>"""

        detail = detail.replace(del_text, '')

        data = f'<h1>{h1}</h1>\n{detail}'

        self.dypdf(h1, data, category)

        return data




    def dypdf(self, h1, data, category):
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("Printing content ...")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=config)
        print("PDF printed and saved!")

    def quit(self):
        self.browser.quit()  # close the browser when done


if __name__ == '__main__':
    furl = "https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category = "潘通色卡(电子版)"  # output directory, named after the topic
    datas = ''
    os.makedirs(f'{category}/', exist_ok=True)
    spider = wx(furl)
    urls = spider.get_urls()
    for url in urls:
        print(f">> Crawling link: {url} ..")
        try:
            data = spider.get_content(url, category)
            datas = '%s%s%s' % (datas, '\n', data)  # append only on success, so a failed fetch can't reuse stale data
        except Exception as e:
            print(f"Crawl error: {e}")

    spider.quit()
    spider.dypdf(category, datas, category)
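Also worth knowing: WeChat articles normally lazy-load images, keeping the real URL in a data-src attribute while src points at a placeholder, so wkhtmltopdf tends to print blank images. A rough workaround (an assumption about the markup, verify against the actual page source) is to copy the attribute over before printing:

from bs4 import BeautifulSoup

def fix_lazy_images(detail_html):
    # copy the real URL from data-src into src so wkhtmltopdf can fetch the images
    soup = BeautifulSoup(detail_html, 'lxml')
    for img in soup.find_all('img', attrs={'data-src': True}):
        img['src'] = img['data-src']
    return str(soup)

# e.g. in get_content(), before building `data`:
# detail = fix_lazy_images(detail)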

 

The code above is for reference only; if it resembles anyone else's, I definitely copied it!


WeChat official account: 二爷记

Sharing Python source code and tools from time to time.
