好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

low版爬虫脚本,Python简单图片爬虫案例

现在很多网站都是异步加载的方式加载数据,大部分都是json数据,如果不知道数据的传递过程,一些参数理不清头绪的话,又想要获取数据,那就比较难搞了,尤其是对于本渣渣级选手而言。

? ? ? ?

? ? ?

目标网址

https://www.keyshot.com/gallery/

需求

获取图片信息,需高清大图

?

经过简单浏览器抓包调试,可以获取到一些信息!

? ? ? ?

? ? ?

? ? ? ?

? ? ?

?

不想努力了,想了两个笨方法,好在数据量不大!

枚举法获取图片地址,爬取图片

1.枚举获取图片地址

代码示例

 # Excerpt from a method: probe every four-digit gallery id (0000-9999) and
 # report the ids for which the server has an image.
 # NOTE(review): the host in the URL below looks mangled by page extraction
 # (presumably www.keyshot.com) - confirm before running.
 for i in range(10000):
     # Zero-pad to four digits so that 3 becomes "0003".
     image_id = f"{i:04d}"
     print(image_id)
     url = f"https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-{image_id}.jpg"
     # A requests Response is truthy only for status codes below 400.
     # The timeout keeps a stalled connection from hanging the whole loop.
     if requests.get(url, headers=self.random_headers, timeout=8):
         print("存在图片!")
 

图片链接:

https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0003.jpg

可以看到id与图片链接是存在关系的,所以,对于id进行迭代,同时进行了if判断!

2.图片下载

代码示例

 ????def?save_img(self,?img_url,?img_name,?path):
????????os.makedirs(f'{path}/',?exist_ok=True)
????????print("开始下载图片!")
????????print(f">>>?开始保存?{img_name}?图片")
????????r?=?requests.get(img_url,?headers=self.random_headers,timeout=8)
????????with?open(f'{path}/{img_name}.jpg',?'wb')?as?f:
????????????f.write(r.content)
????????print(f">>>?保存?{img_name}?图片成功")
 

这里需要注意的是,timeout=8 这个参数一定要加上,尤其是请求国外网站的时候,否则程序很容易卡死!

完整代码

# -*- coding: UTF-8 -*-
#微信:huguo00289
import os
import random

import requests


class Httprequest(object):
    """Base class that supplies request headers with a random User-Agent."""

    # Desktop browser User-Agent strings to rotate through.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property  # decorator that exposes the method as an attribute
    def random_headers(self):
        """Return a headers dict whose User-Agent is drawn at random from ``ua_list``.

        A new choice is made on every access.
        """
        return {
            'User-Agent': random.choice(self.ua_list)
        }


class Get_imgs(Httprequest):
    """Brute-force downloader that guesses gallery image URLs by numeric id."""

    def __init__(self):
        # Directory the downloaded images are written to.
        self.path = "key"

    def getimgs(self):
        """Probe gallery ids 0000-9999 and download every image that exists.

        A failed probe (timeout, connection error) is reported and skipped so
        that a single bad request does not abort the whole scan.
        """
        # NOTE(review): the host in the URL below looks mangled by page
        # extraction (presumably www.keyshot.com) - confirm before running.
        for i in range(10000):
            # Zero-pad to four digits so that 3 becomes "0003".
            image_id = f"{i:04d}"
            print(image_id)
            url = f"https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-{image_id}.jpg"
            try:
                # stream=True defers the body download, so the probe only
                # costs the status line and headers; save_img fetches the
                # image itself.
                with requests.get(url, headers=self.random_headers,
                                  stream=True, timeout=8) as probe:
                    found = probe.ok
            except requests.RequestException as exc:
                print(f"请求失败,跳过 {image_id}:{exc}")
                continue
            if found:
                print("存在图片!")
                self.save_img(url, image_id, self.path)

    # Download a single image
    def save_img(self, img_url, img_name, path):
        """Download one image and write it to ``{path}/{img_name}.jpg``.

        Args:
            img_url: Absolute URL of the image to fetch.
            img_name: File name to save under, without the extension.
            path: Target directory; created when it does not exist yet.
        """
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        # timeout=8 keeps a slow or unreachable host from blocking forever.
        r = requests.get(img_url, headers=self.random_headers, timeout=8)
        if not r.ok:
            # Do not write an HTML error page to disk under a .jpg name.
            print(f">>> 保存 {img_name} 图片失败,状态码 {r.status_code}")
            return
        with open(f'{path}/{img_name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f">>> 保存 {img_name} 图片成功")



# Script entry point: run the brute-force scan when executed directly.
if __name__ == '__main__':
    spider = Get_imgs()
    spider.getimgs()
 

手动获取json数据包,爬取图片

1.正则获取图片地址

代码示例

 # Excerpt from get_imgs(): collect every href value in the copied response
 # text, then keep only the links that point at the gallery host.
 zeimg = r'href="(.+?)"'
 imgs = re.findall(zeimg, str(datas), re.S)
 img_urls = [img for img in imgs if "HdhCmsTestkeyshot测试数据" in img]
 

2.多线程下载图片,这里使用了线程池技术

代码示例

 def main():
     """Download every extracted image URL concurrently with a thread pool."""
     img_urls = get_imgs()

     try:
         # With no argument the pool starts one worker per CPU core.
         # The with-block releases the worker threads even when a download raises.
         with ThreadPool() as pool:
             pool.map(save_img, img_urls)
         print("采集所有图片完成!")
     except Exception as exc:
         # Any error raised by a worker surfaces here via pool.map.
         print(f"Error: unable to start thread ({exc})")
 

完整代码

 
#keyshot图片采集
# -*- coding: UTF-8 -*-
#微信:huguo00289
import os
import random
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests

def get_imgs(datas=None):
    """Extract the full-size image links from a copied gallery response.

    Args:
        datas: Text of the response captured in the browser. When omitted,
            the sample payload embedded below is parsed.

    Returns:
        list of str: every ``href`` value that contains the gallery host
        marker, in document order. The count is also printed.
    """
    if datas is None:
        datas = """
    
    
    data: "<li id="eg-2-post-id-333312" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333312 eg-newli" data-date="1352027697" data-title="dmitrij-le">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://HdhCmsTestkeyshot测试数据/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121-25x25.jpg" data-no-lazy="1" data-lazysrc="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121.jpg" alt="" width="700" height="1121"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0095.jpg"  data-width="923"  data-height="1478" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333312" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Dmitrij Leppée</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>  
          </div>    </div>  </li> <li id="eg-2-post-id-333248" data-skin="keyshot-gallery" class="filterall filter-jewelry eg-keyshot-gallery-wrapper eg-post-id-333248 eg-newli" data-date="1351522438" data-title="tiho-ramov">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://HdhCmsTestkeyshot测试数据/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321-25x25.png" data-no-lazy="1" data-lazysrc="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321.png" alt="" width="700" height="321"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0073.png"  data-width="1000"  data-height="458" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333248" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Tiho Ramovic</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidde
n;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333308" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333308 eg-newli" data-date="1349780210" data-title="vitaly-bul">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://HdhCmsTestkeyshot测试数据/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020-25x25.jpg" data-no-lazy="1" data-lazysrc="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020.jpg" alt="" width="700" height="1020"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2012/10/keyshot-gallery-0113.jpg"  data-width="961"  data-height="1400" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333308" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Vitaly Bulgarov</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height
: 5px; visibility: hidden;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333310" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333310 eg-newli" data-date="1345460494" data-title="maarten-ve">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://HdhCmsTestkeyshot测试数据/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017-25x25.jpg" data-no-lazy="1" data-lazysrc="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017.jpg" alt="" width="700" height="1017"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0094.jpg"  data-width="1321"  data-height="1920" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333310" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Maarten Verhoeven</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-gallery-element-11 esg-no
ne esg-clear" style="height: 5px; visibility: hidden;"></div>            </div>    </div>  </li> <li id="eg-2-post-id-333207" data-skin="keyshot-gallery" class="filterall filter-engineering eg-keyshot-gallery-wrapper eg-post-id-333207 eg-newli" data-date="1334153155" data-title="philippe-v">     <div class="esg-media-cover-wrapper"> <div class="esg-entry-media"><img src="https://HdhCmsTestkeyshot测试数据/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394-25x25.jpg" data-no-lazy="1" data-lazysrc="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394.jpg" alt="" width="700" height="394"></div>              <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">  <a class="eg-invisiblebutton  esgbox" href="https://HdhCmsTestkeyshot测试数据/wp-content/uploads/2016/06/keyshot-gallery-0054.jpg"  data-width="1280"  data-height="720" ></a>                <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>      <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333207" href="javascript:void(0);" target="_self"></a></div>               <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>     <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Philippe Vanagt</div>               <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>               <div class="esg-center eg-keyshot-galler
y-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>            </div>    </div>  </li> "
    message: ""
    success: true
    
    
"""
    # NOTE(review): this host marker looks mangled by page extraction
    # (presumably www.keyshot.com); it matches the sample above as-is.
    host_marker = "HdhCmsTestkeyshot测试数据"
    # re.S lets "." cross the line breaks that the copied text contains.
    hrefs = re.findall(r'href="(.+?)"', str(datas), re.S)
    # Drop placeholder links such as "javascript:void(0);".
    img_urls = [href for href in hrefs if host_marker in href]

    print(len(img_urls))

    return img_urls


#下载图片
def save_img(img_url):
    """Download one image into the ``key`` directory.

    The file keeps the last path segment of ``img_url`` as its name.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    path = "key"
    # Desktop browser User-Agent strings; one is picked at random per request.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    os.makedirs(f'{path}/', exist_ok=True)
    img_name = img_url.split('/')[-1]
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    # timeout=8 keeps a slow or unreachable host from blocking the worker.
    r = requests.get(img_url, headers={'User-Agent': random.choice(ua_list)}, timeout=8)
    if not r.ok:
        # Do not write an HTML error page to disk under an image name.
        print(f">>> 保存 {img_name} 图片失败,状态码 {r.status_code}")
        return
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")



def main():
    """Download every extracted image URL concurrently with a thread pool."""
    img_urls = get_imgs()

    try:
        # With no argument the pool starts one worker per CPU core.
        # The with-block releases the worker threads even when a download raises.
        with ThreadPool() as pool:
            pool.map(save_img, img_urls)
        print("采集所有图片完成!")
    except Exception as exc:
        # Any error raised by a worker surfaces here via pool.map.
        print(f"Error: unable to start thread ({exc})")




# Script entry point: run the threaded download when executed directly.
if __name__ == '__main__':
    main()
????
 

?? ? ?

微信公众号:二爷记

不定时分享python源码及工具

查看更多关于low版爬虫脚本,Python简单图片爬虫案例的详细内容...

  阅读:37次