Scraping Yunhu Online (云琥在线) Course Detail Images with Python
Copywriting is one of the headaches of operations work. The web has plenty of good examples, and collecting them for reference pays off, especially when they come from the same line of business; dreaming up the copy and making the graphics all by yourself is a sure route to going bald!
# -*- coding: UTF-8 -*-
# Yunhu Online (云琥在线) image scraper
# 2019-11-20  by WeChat: huguo00289
import requests, time, re, os
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
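One caveat on fake_useragent: it downloads its User-Agent pool over the network the first time it runs, so UserAgent() can raise when that fetch fails. A minimal defensive sketch (the hard-coded UA string below is my own placeholder, not something from the original script):

try:
    ua_string = UserAgent().random  # may hit the network and fail offline
except Exception:
    # assumed fallback: any ordinary desktop browser UA string will do
    ua_string = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
headers = {"User-Agent": ua_string}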
class Yun():
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://www.yunhuzx.com/classroom/explore?orderBy=createdTime&categoryId"
        self.path = 'yunhuzx/'
    def oss(self):
        os.makedirs(self.path, exist_ok=True)
        print(">>> Created directory yunhuzx successfully!")
    def req(self, url):
        response = requests.get(url, headers=self.headers)
        print(f'>>> Page fetched, status code: {response.status_code}')
        return response.text
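The pages here are Chinese, and requests occasionally mis-guesses the charset and produces mojibake in response.text. If that happens, the usual fix is a one-liner; a small sketch (assuming the site serves UTF-8 without declaring it in the headers):

import requests

resp = requests.get('http://www.yunhuzx.com', headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = resp.apparent_encoding  # fall back to content-based charset detection
html = resp.text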
    def get_num(self):
        response = self.req(self.url)
        # grab N from the pager text "共N页/M个"
        num = re.findall(r'<p id="tony-page" style="float: left;margin: 5px 0">共(.+?)页/.+?个', response, re.S)[0]
        print(f'>>> Page count: {num}')
        return num
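The regex in get_num is tied to the site's pager markup, so here is a worked example against a made-up snippet (the HTML below is illustrative, not captured from the live site):

import re

sample = '<p id="tony-page" style="float: left;margin: 5px 0">共12页/134个</p>'
num = re.findall(r'<p id="tony-page" style="float: left;margin: 5px 0">共(.+?)页/.+?个', sample, re.S)[0]
print(num)  # -> '12'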
    def get_urllist(self, num):
        urllist = []
        for i in range(1, int(num) + 1):
            url = f'{self.url}=&page={i}'
            print(url)
            response = self.req(url)  # fetch the paginated URL, not self.url
            soup = BeautifulSoup(response, 'lxml')
            div = soup.find_all('div', class_="col-md-3 col-xs-6 xcol-lg-2")
            print(len(div))
            for a in div:
                href = a.find('div', class_="course-item").find('a')['href']
                href = f'http://www.yunhuzx.com{href}'
                urllist.append(href)
        return urllist
    def get_img(self, url):
        # e.g. url = "http://www.yunhuzx.com/classroom/30696/introduction"
        response = self.req(url)
        soup = BeautifulSoup(response, 'lxml')
        title = soup.find('li', class_="active").get_text()
        title = title.replace('\n', '').replace(' ', '')
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)  # strip characters illegal in file names
        print(title)
        img = soup.find('div', class_="class-about").find('img')
        if img:
            img_url = img['src']
            print(img_url)
            if 'jpg' in img_url:
                img_name = f'{title}.jpg'
            elif 'png' in img_url:
                img_name = f'{title}.png'
            elif 'gif' in img_url:
                img_name = f'{title}.gif'
            else:
                img_name = f'{title}.jpeg'
            print(img_name)
            self.bctp(self.path, img_url, img_name)
        else:
            print(f'>>> Course: {title} link: {url} >>> no class introduction, skipping!')
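The re.sub in get_img is there because Windows refuses the characters \ / : * ? " < > | in file names, and course titles routinely contain them. A quick worked example with an invented title:

import re

title = '新媒体运营:从0到1|实战?'  # invented title containing illegal characters
safe = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)
print(safe)  # -> '新媒体运营_从0到1_实战_'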
    # Download an image to the target folder
    def bctp(self, lj, img_url, img_name):
        print(">>> Downloading image...")
        try:
            r = requests.get(img_url, headers=self.headers, timeout=50)
            with open(f'{lj}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Downloaded {img_name} successfully!')
            time.sleep(1)
        except Exception as e:
            if "port=443): Read timed out" in str(e):
                time.sleep(2)
                try:
                    r = requests.get(img_url, headers=self.headers, timeout=100)
                    with open(f'{lj}/{img_name}', 'wb') as f:
                        f.write(r.content)
                    print(f'>>> Downloaded {img_name} successfully!')
                except Exception as e:
                    print(f'>>> Failed to download {img_name}!')
                    print(f'Error: {e}')
                    with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'Error: {e} --- failed to download image {img_url}\n')
            else:
                print(f'>>> Failed to download {img_name}!')
                print(f'Error: {e}')
                with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'Error: {e} --- failed to download image {img_url}\n')
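bctp retries a timed-out download by string-matching the exception text, which works but is brittle. For reference, a sturdier alternative (not what the script above does) is to mount urllib3's Retry on a requests Session and let it handle transient failures:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, backoff_factor=1,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
# session.get(img_url, timeout=50) now retries transient failures on its own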
if __name__ == '__main__':
    spider = Yun()
    print('>>> Starting spider, please wait...')
    spider.oss()
    num = spider.get_num()
    urllist = spider.get_urllist(num)
    print('>>> List-page links collected; scraping detail-page images...')
    for url in urllist:
        print(f'>>> Scraping detail-page images from {url} ...')
        try:
            spider.get_img(url)
        except Exception as e:
            print(f'Failed to fetch {url}! Error: {e}')
            with open('lose_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'Error: {e} --- failed to fetch {url}\n')
    print('>>> Scraping finished; the program will close in 5s!')
    print('PS: results are saved in the yunhuzx folder under the working directory!')
    time.sleep(5)