现在很多网站都是异步加载的方式加载数据,大部分都是json数据,如果不知道数据的传递过程,一些参数理不清头绪的话,又想要获取数据,那就比较难搞了,尤其是对于本渣渣级选手而言。
? ? ? ?
? ? ?
目标网址
https://www.keyshot.com/gallery/
需求
获取图片信息,需高清大图
?
经过简单浏览器抓包调试,可以获取到一些信息!
? ? ? ?
? ? ?
? ? ? ?
? ? ?
?
不想努力了,想了两个笨方法,好在数据量不大!
枚举法获取图片地址,爬取图片
1.枚举获取图片地址
代码示例
# Excerpt from Get_imgs.getimgs: probe every four-digit image id.
for i in range(10000):
    # Zero-pad the counter to four digits, e.g. 3 -> "0003".
    img_id = f'{i:04d}'
    print(img_id)
    url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{img_id}.jpg"
    # A requests Response is truthy only when the status code is below 400.
    # The timeout keeps one stalled connection from hanging the whole loop.
    if requests.get(url, headers=self.random_headers, timeout=8):
        print("存在图片!")
图片链接:
https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0003.jpg
可以看到,图片编号(id)与图片链接之间存在对应关系,所以可以对 id 进行迭代,并用 if 判断该地址的图片是否存在!
2.图片下载
代码示例
def save_img(self, img_url, img_name, path):
    """Download one image and write it to ``{path}/{img_name}.jpg``.

    Args:
        img_url: Address of the image to fetch.
        img_name: File name to save under, without the ``.jpg`` extension.
        path: Target directory; created if it does not exist yet.
    """
    os.makedirs(f'{path}/', exist_ok=True)
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    # The timeout keeps a stalled connection from blocking forever.
    r = requests.get(img_url, headers=self.random_headers, timeout=8)
    with open(f'{path}/{img_name}.jpg', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")
这里需要注意的是,timeout=8 这个参数一定要加上,尤其是请求国外网站时,不然程序容易卡死!
完整代码
# -*- coding: UTF-8 -*-
# WeChat: huguo00289
import os
import random

import requests


class Httprequest(object):
    """Base class that supplies a randomized ``User-Agent`` header."""

    # Pool of browser User-Agent strings; one is picked for every request.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property  # turns the method into a read-only attribute
    def random_headers(self):
        """Return a headers dict whose User-Agent is drawn at random from ``ua_list``."""
        return {
            'User-Agent': random.choice(self.ua_list)
        }


class Get_imgs(Httprequest):
    """Enumerate four-digit gallery image ids and download every image that exists."""

    def __init__(self):
        # Directory the downloaded images are written to.
        self.path = "key"

    def getimgs(self):
        """Probe ids 0000-9999 and save each image the server answers for.

        A network failure on one id is reported and skipped so that it does
        not abort the remaining ids.
        """
        for i in range(10000):
            # Zero-pad the counter to four digits, e.g. 3 -> "0003".
            img_id = f'{i:04d}'
            print(img_id)
            url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{img_id}.jpg"
            try:
                # stream=True defers the body download: the probe only needs
                # the status, and save_img() fetches the body itself, so the
                # image is not transferred twice.
                with requests.get(url, headers=self.random_headers,
                                  timeout=8, stream=True) as resp:
                    exists = resp.ok  # True when the status code is below 400
                if exists:
                    print("存在图片!")
                    self.save_img(url, img_id, self.path)
            except requests.RequestException as exc:
                print(f">>> 请求 {img_id} 失败: {exc}")

    # Download one image.
    def save_img(self, img_url, img_name, path):
        """Download ``img_url`` and write it to ``{path}/{img_name}.jpg``.

        Args:
            img_url: Address of the image to fetch.
            img_name: File name to save under, without the ``.jpg`` extension.
            path: Target directory; created if it does not exist yet.
        """
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        # The timeout keeps a stalled connection from blocking forever.
        r = requests.get(img_url, headers=self.random_headers, timeout=8)
        with open(f'{path}/{img_name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f">>> 保存 {img_name} 图片成功")


if __name__ == '__main__':
    spider = Get_imgs()
    spider.getimgs()
手动获取json数据包,爬取图片
1.正则获取图片地址
代码示例
# Excerpt from get_imgs(): `datas` is the response text copied by hand.
# Capture the value of every href="..." attribute (non-greedy, so each match
# stops at the next double quote).
zeimg = r'href="(.+?)"'
imgs = re.findall(zeimg, datas, re.S)
# Keep only links on the gallery host; this drops the
# href="javascript:void(0);" placeholders.
img_urls = [img for img in imgs if "www.keyshot.com" in img]
2.多线程下载图片,这里使用了线程池技术
代码示例
def main():
    """Download every extracted image URL concurrently with a thread pool."""
    img_urls = get_imgs()
    # ThreadPool() with no argument starts one worker per CPU core;
    # pass a number (e.g. ThreadPool(4)) to fix the pool size instead.
    pool = ThreadPool()
    try:
        # map() blocks until every URL has been processed and re-raises an
        # exception raised inside a worker.
        pool.map(save_img, img_urls)
    except Exception as exc:
        # Report the real cause rather than blaming thread start-up.
        print(f"Error: image download failed: {exc}")
    else:
        print("采集所有图片完成!")
    finally:
        # Always release the worker threads, even after a failure.
        pool.close()
        pool.join()
完整代码
# KeyShot gallery image scraper
# -*- coding: UTF-8 -*-
# WeChat: huguo00289
import os
import random
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests

# Directory the downloaded images are written to.
SAVE_DIR = "key"

# Pool of browser User-Agent strings; one is picked for every request.
UA_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
]


def get_imgs():
    """Extract the full-size image URLs from a hand-copied response payload.

    The payload below was pasted from the browser's network inspector. Every
    ``href="..."`` value is captured and only links on the gallery host are
    kept, which drops the ``javascript:void(0);`` placeholders. Whitespace
    inside the payload is not significant to the extraction.

    Returns:
        list[str]: The image URLs, in the order they appear in the payload.
    """
    datas = """
    data: "<li id="eg-2-post-id-333312" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333312 eg-newli" data-date="1352027697" data-title="dmitrij-le"> <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121.jpg" alt="" width="700" height="1121"></div> <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade"> <a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095.jpg" data-width="923" data-height="1478" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div> <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333312" href="javascript:void(0);" target="_self"></a></div> <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Dmitrij Leppée</div> <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> </div> </div> </li>
    <li id="eg-2-post-id-333248" data-skin="keyshot-gallery" class="filterall filter-jewelry eg-keyshot-gallery-wrapper eg-post-id-333248 eg-newli" data-date="1351522438" data-title="tiho-ramov"> <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321-25x25.png" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321.png" alt="" width="700" height="321"></div> <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade"> <a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073.png" data-width="1000" data-height="458" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div> <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333248" href="javascript:void(0);" target="_self"></a></div> <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Tiho Ramovic</div> <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> </div> </div> </li>
    <li id="eg-2-post-id-333308" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333308 eg-newli" data-date="1349780210" data-title="vitaly-bul"> <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020.jpg" alt="" width="700" height="1020"></div> <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade"> <a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113.jpg" data-width="961" data-height="1400" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div> <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333308" href="javascript:void(0);" target="_self"></a></div> <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Vitaly Bulgarov</div> <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> </div> </div> </li>
    <li id="eg-2-post-id-333310" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333310 eg-newli" data-date="1345460494" data-title="maarten-ve"> <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017.jpg" alt="" width="700" height="1017"></div> <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade"> <a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094.jpg" data-width="1321" data-height="1920" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div> <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333310" href="javascript:void(0);" target="_self"></a></div> <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Maarten Verhoeven</div> <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> </div> </div> </li>
    <li id="eg-2-post-id-333207" data-skin="keyshot-gallery" class="filterall filter-engineering eg-keyshot-gallery-wrapper eg-post-id-333207 eg-newli" data-date="1334153155" data-title="philippe-v"> <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394.jpg" alt="" width="700" height="394"></div> <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade"> <a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054.jpg" data-width="1280" data-height="720" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div> <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333207" href="javascript:void(0);" target="_self"></a></div> <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Philippe Vanagt</div> <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div> </div> </div> </li> "
    message: ""
    success: true
    """
    # Non-greedy capture: each match stops at the next double quote.
    zeimg = r'href="(.+?)"'
    imgs = re.findall(zeimg, datas, re.S)
    img_urls = [img for img in imgs if "www.keyshot.com" in img]
    print(len(img_urls))
    return img_urls


# Download one image.
def save_img(img_url):
    """Download ``img_url`` into ``SAVE_DIR`` under the URL's own file name.

    Args:
        img_url: Address of the image; the text after the last ``/`` becomes
            the file name.

    Raises:
        requests.RequestException: On a timeout, a connection failure, or an
            HTTP error status, so that an error page is never saved as an
            image.
    """
    os.makedirs(f'{SAVE_DIR}/', exist_ok=True)
    img_name = img_url.split('/')[-1]
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    # The timeout keeps a stalled connection from blocking its worker forever.
    r = requests.get(img_url, headers={'User-Agent': random.choice(UA_LIST)}, timeout=8)
    r.raise_for_status()
    with open(f'{SAVE_DIR}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")


def main():
    """Download every extracted image URL concurrently with a thread pool."""
    img_urls = get_imgs()
    # ThreadPool() with no argument starts one worker per CPU core;
    # pass a number (e.g. ThreadPool(4)) to fix the pool size instead.
    pool = ThreadPool()
    try:
        # map() blocks until every URL has been processed and re-raises an
        # exception raised inside a worker.
        pool.map(save_img, img_urls)
    except Exception as exc:
        # Report the real cause rather than blaming thread start-up.
        print(f"Error: image download failed: {exc}")
    else:
        print("采集所有图片完成!")
    finally:
        # Always release the worker threads, even after a failure.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()
?? ? ?
微信公众号:二爷记
不定时分享python源码及工具
查看更多关于low版爬虫脚本,Python简单图片爬虫案例的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did126117