bs4 requests
<html> <head> </head> <body> <p id="one"><a></a></p> <p id="two"><a href="#" rel="external nofollow">abc</a></p> <p id="three"><a href="#" rel="external nofollow">three a</a><a href="#" rel="external nofollow">three a</a><a href="#" rel="external nofollow">three a</a></p> <p id="four"><a href="#" rel="external nofollow">four<p>four p</p><p>four p</p><p>four p</p> a</a></p> </body> </html>
import io
import re

import lxml
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Demo of BeautifulSoup lookups on the local sample file: find() is
    # printed first, then find_all(), for the bare <p> tag and then for each
    # of the four ids used in test.html.
    # The file is opened in a `with` block so the handle is closed as soon as
    # BeautifulSoup has consumed it (the original never closed it).
    with open('test.html') as html_file:
        s = BeautifulSoup(html_file, 'lxml')

    separator = "------------------------------"

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(s.prettify())
    print(separator)
    print(s.find('p'))
    print(s.find_all('p'))
    for paragraph_id in ('one', 'two', 'three', 'four'):
        print(separator)
        print(s.find('p', id=paragraph_id))
        print(s.find_all('p', id=paragraph_id))
if __name__ == '__main__':
    # Closing separator line of the BeautifulSoup find()/find_all() demo that
    # precedes this section of the file.
    print("------------------------------")


# Root of the scraped site, without a trailing slash.  The scraped copy of
# this code had "www." garbled into "HdhCmsTest".
BASE_URL = 'https://www.sanwen.net'

# Listing pages 1..LAST_PAGE are requested for every section.
LAST_PAGE = 9

# (section path, progress banner) pairs, in crawl order.  The banner strings
# are kept exactly as the original printed them.
# NOTE(review): the 'rizhi' banner says "ruzhi" -- looks like a typo, left as-is.
SECTION_BANNERS = (
    ('sanwen', "running sanwen -----------------------------"),
    ('shige', "running shige ------------------------------"),
    ('zawen', 'running zawen -------------------------------'),
    ('suibi', 'running suibi -------------------------------'),
    ('rizhi', 'running ruzhi -------------------------------'),
    ('novel', 'running xiaoxiaoshuo -------------------------'),
)

# Matches "//" or "/" so they can be replaced in file names.
_SEPARATOR_RE = re.compile(r'(//)|/')


def safe_file_name(title_text):
    """Return *title_text* with every "//" or "/" replaced by the letter "a".

    Slashes are replaced because the title is used directly as a file name.
    """
    return _SEPARATOR_RE.sub('a', title_text)


def get_html():
    """Fetch the listing pages of every section and process each one.

    For each entry of ``SECTION_BANNERS`` the banner is printed, then pages
    1 through ``LAST_PAGE`` are requested with the page number passed as the
    ``p`` query parameter.  Every response with status 200 is handed to
    ``soup``; other responses are ignored.
    """
    for doc, banner in SECTION_BANNERS:
        print(banner)
        # The original advanced the counter with ``i += i`` (1, 2, 4, 8),
        # which skipped most pages; walk every page instead.
        for page in range(1, LAST_PAGE + 1):
            res = requests.get(BASE_URL + '/' + doc + '/', params={'p': page})
            if res.status_code == 200:
                soup(res.text)


def soup(html_text):
    """Parse one listing page and save every article it links to.

    Each ``<li>`` inside the ``categorylist`` container, except the one equal
    to the ``li.page`` element, is expected to hold at least three links: the
    second is used as the article title/URL and the third as the author.
    Items that do not have that shape are skipped.
    """
    s = BeautifulSoup(html_text, 'lxml')
    # NOTE(review): the container is looked up as a <p> tag; the live site may
    # well use <div> here -- confirm against the real markup.
    listing = s.find('p', class_='categorylist')
    if listing is None:
        return

    pager = s.find('li', class_='page')  # looked up once, not per item
    for item in listing.find_all('li'):
        if item == pager:
            continue
        anchors = item.find_all('a')
        if len(anchors) < 3:
            continue
        title = anchors[1]
        url = title.get('href')
        if url is None:
            continue
        author = anchors[2].text
        _save_article(safe_file_name(title.text), title.text, author, url)


def get_content(url):
    """Download an article page and return its paragraphs as one string.

    *url* is the site-relative path taken from the listing link.  Each ``<p>``
    found inside the ``content`` container contributes its text followed by a
    newline.  Returns ``None`` when the response status is not 200 or the
    container is missing.
    """
    res = requests.get(BASE_URL + url)
    if res.status_code != 200:
        return None
    page = BeautifulSoup(res.text, 'lxml')
    # NOTE(review): same <p>-vs-<div> doubt as in soup(); confirm the tag name.
    body = page.find('p', class_='content')
    if body is None:
        return None
    return ''.join(paragraph.text + '\n' for paragraph in body.find_all('p'))


def _save_article(file_name, title_text, author, url):
    """Write title, author and article body to ``<file_name>.txt``.

    The file is written as UTF-8 through ``io.open`` so non-ASCII titles and
    bodies work on both Python 2 and 3, and the ``with`` block closes the
    file even if a write or the download fails part-way.
    """
    with io.open(file_name + '.txt', 'w', encoding='utf-8') as f:
        print('running w txt' + file_name + '.txt')
        f.write(title_text + '\n')
        f.write(author + '\n')
        content = get_content(url)
        # get_content() returns None on failure; skip the body in that case.
        if content:
            f.write(content)
差点忘了效果图
可能会出现timeout现象吧,只能说上大学一定要选网好的啊!
以上就是python爬取文章实例教程的详细内容,更多请关注Gxl网其它相关文章!
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did81825