Collecting Yunhu Online (云琥在线) Course Detail Images with Python
Copy is a perennial headache in operations work. The web is full of good examples, and collecting them makes a handy reference, especially when they come from the same line of business; having to dream up both the copy and the artwork yourself is a sure way to go bald! The script below walks Yunhu Online's course catalogue page by page, follows each course to its detail page, and saves the introduction image under a filename derived from the course title.
```python
# -*- coding: UTF-8 -*-
# Yunhu Online (云琥在线) image scraper
# 2019-11-20 by WeChat: huguo00289
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class Yun:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://www.yunhuzx.com/classroom/explore?orderBy=createdTime&categoryId"
        self.path = 'yunhuzx/'

    def oss(self):
        # Create the output directory.
        os.makedirs(self.path, exist_ok=True)
        print(">>> Created output directory yunhuzx!")

    def req(self, url):
        # Fetch a page and return its HTML.
        response = requests.get(url, headers=self.headers)
        print(f'>>> Page fetched, status code: {response.status_code}')
        return response.text

    def get_num(self):
        # Read the total page count from the pagination bar ("共 N 页 / M 个").
        response = self.req(self.url)
        num = re.findall(r'<p id="tony-page" style="float: left;margin: 5px 0">共(.+?)页/.+?个', response, re.S)[0]
        print(f'>>> Number of pages: {num}')
        return num

    def get_urllist(self, num):
        # Walk every list page and collect the course detail URLs.
        urllist = []
        for i in range(1, int(num) + 1):
            url = f'{self.url}=&page={i}'
            print(url)
            response = self.req(url)  # fetch the current list page
            soup = BeautifulSoup(response, 'lxml')
            div = soup.find_all('div', class_="col-md-3 col-xs-6 xcol-lg-2")
            print(len(div))
            for a in div:
                href = a.find('div', class_="course-item").find('a')['href']
                urllist.append(f'http://www.yunhuzx.com{href}')
        return urllist

    def get_img(self, url):
        # url = "http://www.yunhuzx.com/classroom/30696/introduction"
        response = self.req(url)
        soup = BeautifulSoup(response, 'lxml')
        title = soup.find('li', class_="active").get_text()
        title = title.replace('\n', '').replace(' ', '')
        title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", title)  # strip characters illegal in filenames
        print(title)
        img = soup.find('div', class_="class-about").find('img')
        if img:
            img_url = img['src']
            print(img_url)
            if 'jpg' in img_url:
                img_name = f'{title}.jpg'
            elif 'png' in img_url:
                img_name = f'{title}.png'
            elif 'gif' in img_url:
                img_name = f'{title}.gif'
            else:
                img_name = f'{title}.jpeg'
            print(img_name)
            self.bctp(self.path, img_url, img_name)
        else:
            print(f'>>> Course: {title} URL: {url} >>> no class introduction, skipping!')

    def bctp(self, lj, img_url, img_name):
        # Download one image, retrying once with a longer timeout on a read timeout.
        print(">>> Downloading image...")
        try:
            r = requests.get(img_url, headers=self.headers, timeout=50)
            with open(f'{lj}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Downloaded {img_name} successfully!')
            time.sleep(1)
        except Exception as e:
            if "port=443): Read timed out" in str(e):
                time.sleep(2)
                try:
                    r = requests.get(img_url, headers=self.headers, timeout=100)
                    with open(f'{lj}/{img_name}', 'wb') as f:
                        f.write(r.content)
                    print(f'>>> Downloaded {img_name} successfully!')
                except Exception as e:
                    print(f'>>> Failed to download {img_name}!')
                    print(f'Error: {e}')
                    with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'Error: {e} --- failed to download {img_url}\n')
            else:
                print(f'>>> Failed to download {img_name}!')
                print(f'Error: {e}')
                with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'Error: {e} --- failed to download {img_url}\n')


if __name__ == '__main__':
    spider = Yun()
    print('>>> Spider starting, please wait...')
    spider.oss()
    num = spider.get_num()
    urllist = spider.get_urllist(num)
    print('>>> List-page links collected; scraping detail-page images...')
    for url in urllist:
        print(f'>>> Scraping detail-page images from {url} ...')
        try:
            spider.get_img(url)
        except Exception as e:
            print(f'Failed to fetch {url}! Error: {e}')
            with open('lose_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'Error: {e} --- failed to fetch {url}\n')
    print('>>> Done; the program will close in 5 s!')
    print('PS: everything is saved in the yunhuzx folder under the working directory!')
    time.sleep(5)
```
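One spot worth tightening: get_img guesses the file extension with substring tests ('jpg' in img_url and so on), which can misfire when a format name happens to appear elsewhere in the URL, e.g. in a query string. A minimal sketch of an alternative, assuming the real extension sits in the URL path (guess_ext is a hypothetical helper, not part of the original script):

```python
import os
from urllib.parse import urlparse

def guess_ext(img_url, default='.jpeg'):
    """Derive the image extension from the URL path, ignoring any query string."""
    # Hypothetical helper, not in the original script.
    ext = os.path.splitext(urlparse(img_url).path)[1].lower()
    return ext if ext in ('.jpg', '.jpeg', '.png', '.gif') else default

# Usage inside get_img would become: img_name = f'{title}{guess_ext(img_url)}'
```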
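Likewise, bctp detects a read timeout by string-matching the exception message and then retries by hand, which is brittle across requests/urllib3 versions. A sketch of the usual alternative, delegating retries to urllib3's Retry through an HTTPAdapter (the retry count and backoff_factor here are illustrative choices, not values from the original):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(headers):
    """Build a session that retries transient failures with exponential backoff."""
    # Hypothetical helper; parameter values are assumptions, not from the source.
    session = requests.Session()
    session.headers.update(headers)
    retries = Retry(total=3, backoff_factor=2,
                    status_forcelist=[429, 500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session

# e.g. in bctp: r = make_session(self.headers).get(img_url, timeout=50)
```

With retries handled at the transport layer, the nested try/except and the fragile "port=443): Read timed out" check can go away.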