看效果:
不扯没用的,直接上代码:
# author   : sunzd
# date     : 2019/9/01
# position : beijing
"""Fetch CSDN blog list pages and request every article linked from them.

NOTE(review): this text was reconstructed from a whitespace-mangled paste in
which every space had become '?' and line breaks were lost. Literal '?'
characters were kept only where they are syntactically meaningful (the lazy
regex quantifier and the trailing '?' of the list URL) -- confirm against the
author's original if it is available.
"""
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
import re
import time

# One article entry on a list page. The caller strips newlines before matching.
ARTICLE_PATTERN = re.compile(
    '<div class="article-item-box csdn-tracking-statistics" data(.+?)</div>',
    re.S,
)
# The "next page" pager item in its disabled state.
LAST_PAGE_PATTERN = re.compile(
    '<li class="js-page-next js-page-action ui-pager ui-pager-disabled">'
)
# Highest list page visited before the driver wraps back to page 1.
LAST_PAGE = 6


def html_request(url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address to fetch, or None.

    Returns:
        The decoded page text, or None when *url* is None or the request
        fails with ``urllib.error.URLError`` (the error's ``code`` and
        ``reason``, when present, are printed).
    """
    if url is None:
        return None
    print("download html is :{0}".format(url))
    # NOTE: a URL containing non-ASCII (e.g. Chinese) characters would have to
    # be percent-encoded before it is passed in.
    # Imitate a browser. The header must be spelled 'User-Agent'; with the
    # previous key 'UserAgent' the server never saw the random agent string.
    headers = {'User-Agent': str(UserAgent().random)}
    req = request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None


def html_parser(url, html):
    """Print every article found in a list page and request each article URL.

    Args:
        url: Address the page was downloaded from; only checked for None.
        html: Page source as text, or None.

    Returns:
        None when either argument is None, otherwise 0.
    """
    if url is None or html is None:
        return None
    articles = ARTICLE_PATTERN.findall(html.replace('\n', ''))
    print(len(articles))
    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')
        title = soup.find('a', attrs={'target': '_blank'})
        if title is None:
            # Entry without a title link: nothing to print or request.
            continue
        article_type = title.span.text if title.span is not None else ''
        # Strip blanks and the "原"/"转" type badge characters from the title.
        # NOTE(review): the first replace() target is assumed to be a space
        # (it was '?' in the mangled paste) -- confirm.
        print("文章题目:{0}\n文章类型:{1}".format(
            title.text.replace(' ', '').replace("原", "").replace("转", ""),
            article_type))
        link = title.attrs.get('href')
        print("文章链接:{0}".format(link))
        html_request(link)
    last_page_marks = LAST_PAGE_PATTERN.findall(html)
    print("是否为最后一页:{0}----{1}".format(len(last_page_marks), last_page_marks))
    # NOTE(review): the original returned 0 from both branches of an if/else
    # on len(last_page_marks), so callers cannot detect the last page from the
    # return value. Kept as 0 for compatibility.
    return 0


def _list_url(name, page):
    """Build the URL of list page number *page* of blog *name*."""
    return "https://blog.csdn.net/" + name + "/article/list/" + str(page) + '?'


if __name__ == '__main__':
    name = '你自己的名称'
    page = 1
    # The original condition `while page < 7` could never become false because
    # page wraps back to 1 after LAST_PAGE; the endless loop is now explicit.
    # Stop the script with Ctrl-C.
    # NOTE(review): requests are issued back-to-back with no delay; `time` is
    # imported but unused -- consider time.sleep() between pages.
    while True:
        url = _list_url(name, page)
        html_parser(url, html_request(url))
        page = page + 1 if page < LAST_PAGE else 1
查看更多关于Python入门学习之:10分钟1500访问量的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did162544