亚马逊是国际知名的电商平台。访问国际站需要借助代理工具(原文此处被屏蔽为“***”),国内用户访问的则是 cn 国内站点。不同地区对应不同的站点,每个商品都有一个 id 号,但不同地区站点的商品存在差异!
采集亚马逊(amazon)商品数据有点类似于采集百度搜索结果:请求头非常重要,除了 User-Agent(ua)之外,还必须携带 Cookie 头,否则无法访问,国内、国外站点都是如此!
感觉网站存在反爬机制,尤其是英文站点:网页源码中的价格信息与前端页面实际显示的价格并不一致!
输入商品id号采集商品相关信息!
国内站(cn)采集
采集效果
附源码
# -*- coding: utf-8 -*-
# Amazon China (amazon.cn) product scraper
# 2020-02-13, by WeChat: huguo00289
"""Scrape a single amazon.cn product page by product id.

For the given id the script saves the page URL, title and price to a text
file and downloads the product gallery images, all inside a directory
named after the product title.
"""
import os
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

# Cookie header hard-coded by the original author; the page request is sent
# with it on every run.
# NOTE(review): this looks like a captured browser session (the csm-hit
# timestamp matches the 2020-02-13 date above), so it has presumably expired
# and should be replaced with your own value rather than kept in source.
_COOKIE = (
    'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; '
    'session-id=459-1321777-5720413; '
    'ubid-acbcn=459-5647010-5360714; '
    'lc-acbcn=zh_CN; '
    'i18n-prefs=CNY; '
    'session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; '
    'session-id-time=2082787201l; '
    'csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes'
)

# Characters that cannot (or should not) appear in a directory name.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def _first(values, what):
    """Return the first element of *values*; raise IndexError naming *what*."""
    if not values:
        raise IndexError(
            f"could not find {what} in the page "
            "(request blocked or page layout changed?)"
        )
    return values[0]


def ua() -> dict:
    """Build the request headers: a random User-Agent plus the fixed Cookie."""
    agent = UserAgent()
    return {
        'User-Agent': agent.random,
        'Cookie': _COOKIE,
    }


# Save product text
def tx(id, text, path) -> None:
    """Write *text* to ``{path}{id}.txt``.

    Args:
        id: Product id, used as the file name stem.
        text: Content to write.
        path: Target directory prefix, including the trailing separator.
    """
    print("正在保存商品数据..")
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (titles and prices contain non-ASCII characters).
    with open(f'{path}{id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">>>保存商品数据成功!")


# Download one image
def down(img_url, img_name, path) -> None:
    """Download *img_url* and store it as ``{path}{img_name}``.

    A non-success HTTP response is reported and skipped instead of being
    written to disk as if it were image data.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.ok:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成!")
    else:
        print(f">>>下载{img_name}图片失败! (HTTP {r.status_code})")
    # Pause between image requests.
    time.sleep(2)


def get_shopping(id) -> None:
    """Scrape the amazon.cn product page for *id* and save text and images.

    Args:
        id: Product id as used in ``https://www.amazon.cn/dp/{id}``.

    Raises:
        IndexError: If the title or price cannot be found in the page.
    """
    url = f"https://www.amazon.cn/dp/{id}"
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(3)

    # e.g. "B&O PLAY by Bang & Olufsen Beoplay P6 便携式扬声器1140026 黑色"
    title = _first(re.findall(r'"立即购买:(.+?)",', html, re.S), 'the product title')
    print(title)

    # The title becomes the output directory name, so strip characters that
    # are invalid in paths; fall back to the id if nothing usable remains.
    folder = _UNSAFE_PATH_CHARS.sub('_', title).strip() or str(id)
    path = f'{folder}/'
    os.makedirs(path, exist_ok=True)

    tree = etree.HTML(html)
    price = _first(
        tree.xpath('//span[@id="priceblock_ourprice"]/text()'),
        'the product price',
    )
    print(price)

    tx(id, '\n'.join([url, title, price]), path)

    for img in tree.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Drop everything from the first "._" and re-append ".jpg";
        # presumably this turns the thumbnail URL into the full-size one.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)


if __name__ == '__main__':
    product_id = input("请输入要采集的商品id(比如:B00C3YADSK):").strip()
    get_shopping(product_id)
美国站(com)采集(需要***!!)
比较坑爹的***
我的展示页面(国内免费***)
tx的付费***
什么鬼我的配送不到?无价格显示?
不管了,价格pass
采集效果
附上源码参考:
# -*- coding: utf-8 -*-
# Amazon US (amazon.com) product scraper
# 2020-02-13
# Sample page: https://www.amazon.com/dp/B07S3659V2
"""Scrape a single amazon.com product page by product id.

For the given id the script saves the page URL, title, price and product
description to a text file and downloads the product gallery images, all
inside a directory named after the product id.
"""
import os
import random
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


def _first(values, what):
    """Return the first element of *values*; raise IndexError naming *what*."""
    if not values:
        raise IndexError(
            f"could not find {what} in the page "
            "(request blocked or page layout changed?)"
        )
    return values[0]


def ua() -> dict:
    """Build the request headers: a random User-Agent plus a Cookie.

    The last segment of the ``session-id`` and ``ubid-acbcn`` cookie values
    is replaced with a fresh random 7-digit number on every call.
    """
    agent = UserAgent()
    suffix = str(random.randint(1111111, 9999999))
    # Cookie hard-coded by the original author, apart from the random suffix.
    # NOTE(review): this looks like a captured browser session, so the
    # session-token has presumably expired; replace it with your own value
    # rather than keeping it in source.
    cookie = (
        'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; '
        f'session-id=459-1321777-{suffix}; '
        f'ubid-acbcn=459-5647010-{suffix}; '
        'lc-acbcn=zh_CN; '
        'i18n-prefs=CNY; '
        'session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; '
        'session-id-time=2082787201l; '
        'csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes'
    )
    return {
        'User-Agent': agent.random,
        'Cookie': cookie,
    }


# Save product text
def tx(id, text, path) -> None:
    """Write *text* as UTF-8 to ``{path}{id}.txt``.

    Args:
        id: Product id, used as the file name stem.
        text: Content to write.
        path: Target directory prefix, including the trailing separator.
    """
    print("正在保存商品数据..")
    with open(f'{path}{id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">>>保存商品数据成功!")


# Download one image
def down(img_url, img_name, path) -> None:
    """Download *img_url* and store it as ``{path}{img_name}``.

    A non-success HTTP response is reported and skipped instead of being
    written to disk as if it were image data.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.ok:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成!")
    else:
        print(f">>>下载{img_name}图片失败! (HTTP {r.status_code})")
    # Pause between image requests.
    time.sleep(1)


def get_shopping(id) -> None:
    """Scrape the amazon.com product page for *id* and save text and images.

    Args:
        id: Product id as used in ``https://www.amazon.com/dp/{id}``.

    Raises:
        IndexError: If the title or price cannot be found in the page.
    """
    url = f"https://www.amazon.com/dp/{id}"
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    tree = etree.HTML(html)

    # The page <title> reads "Amazon.com: <product title>". The closing
    # </title> anchor is required: a lazy group at the very end of a pattern
    # would capture only a single character.
    title = _first(
        re.findall(r'<title>\s*Amazon\.com\s*:\s*(.+?)\s*</title>', html, re.S),
        'the product title',
    )
    print(title)

    path = f'{id}/'
    os.makedirs(path, exist_ok=True)

    # The price is taken from JSON embedded in the page source, not from the
    # priceblock_* spans.
    raw_price = _first(
        re.findall(
            r'"isPreorder":.+?,"price":(.+?),"doesMAPPolicyApply":.+?',
            html,
            re.S,
        ),
        'the product price',
    )
    price = f'${raw_price}'
    print(price)

    description = '\n'.join(
        tree.xpath('//div[@id="productDescription"]//text()')
    )
    tx(id, '\n'.join([url, title, price, description]), path)

    for img in tree.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Drop everything from the first "._" and re-append ".jpg";
        # presumably this turns the thumbnail URL into the full-size one.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)
    print(">>>下载图片完毕!")


if __name__ == '__main__':
    product_id = input("请输入要采集的商品id(比如:B07GJ2MWTZ):").strip()
    get_shopping(product_id)
查看更多关于电商商品爬虫,亚马逊amazon采集源码的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did126013