电商商品爬虫，亚马逊amazon采集源码

亚马逊是国际知名的电商平台，访问国际站需要***，而国内访问的话是cn国内站点，不同的地区有不同的站点，每个商品有一个id号，不同地区商品是存在差异的！

亚马逊amazon商品数据采集有点类似于采集百度搜索结果信息，协议头非常重要，除了ua之外，cookies头需要携带，要不然不能访问，国内国外站点一样！

感觉网站存在反爬，尤其是英文站点，因为网页源码价格信息与实际前端页面看到的价格信息存在差异！

输入商品id号采集商品相关信息！

国内站（cn）采集

采集效果

附源码

#国内亚马逊商品爬虫
#20200213 by微信：huguo00289




# -*- coding=utf-8 -*-
import os
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


def ua():
    """Build the HTTP headers sent with amazon.cn product-page requests.

    Returns:
        dict: A ``User-Agent`` picked at random by ``fake_useragent`` and a
        hard-coded ``Cookie`` header.
    """
    # NOTE(review): the cookie looks like a captured browser session and has
    # presumably expired; replace it with a fresh one before running.
    return {
        'User-Agent': UserAgent().random,
        'Cookie': 'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; session-id=459-1321777-5720413; ubid-acbcn=459-5647010-5360714; lc-acbcn=zh_CN; i18n-prefs=CNY; session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; session-id-time=2082787201l; csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes',
    }


#保存txt
def tx(id, text, path):
    """Write the scraped product text to ``<path><id>.txt``.

    Args:
        id: Product id; used as the file name stem.
        text: Text to store.
        path: Directory prefix, including its trailing separator.
    """
    print("正在保存商品数据..")
    # Explicit UTF-8 so Chinese titles are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(f'{path}{id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">>>保存商品数据成功！")


#下载图片
def down(img_url, img_name, path):
    """Download one image and store it as ``<path><img_name>``.

    Args:
        img_url: Absolute URL of the image.
        img_name: File name to save the image under.
        path: Directory prefix, including its trailing separator.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.status_code == 200:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成！")
    else:
        # Do not save an error page under an image file name.
        print(f">>>下载{img_name}图片失败（HTTP {r.status_code}）")
    # Pause between downloads, whether or not this one succeeded.
    time.sleep(2)


def get_shopping(id):
    """Scrape one amazon.cn product page and save its details and images.

    Fetches the product page for ``id``, extracts the title and price, writes
    them through ``tx`` into a directory named after the title, then passes
    every JPEG thumbnail found on the page to ``down``.

    Args:
        id: Product id as used in the product URL (e.g. ``B00C3YADSK``).

    Raises:
        IndexError: If no title is found in the page. The exception type
            matches what the unguarded ``[0]`` lookup used to raise.
    """
    url = f"https://www.amazon.cn/dp/{id}"
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(3)

    # Example match: "B&O PLAY by Bang & Olufsen Beoplay P6 ..."
    titles = re.findall(r'"立即购买：(.+?)",', html, re.S)
    if not titles:
        raise IndexError(
            f"no product title found for id {id!r}; the page may be blocked "
            "or the cookie may have expired"
        )
    title = titles[0]
    print(title)

    # Titles can contain characters that are illegal in directory names.
    dir_name = re.sub(r'[\\/:*?"<>|]', '_', title).strip(' .') or str(id)
    path = f'{dir_name}/'
    os.makedirs(path, exist_ok=True)

    req = etree.HTML(html)
    # A missing price is not fatal; the title and images are still saved.
    prices = req.xpath('//span[@id="priceblock_ourprice"]/text()')
    price = prices[0] if prices else ''
    print(price)

    tx(id, f'{url}\n{title}\n{price}', path)

    for img in req.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Drop everything after "._" (presumably the thumbnail size suffix)
        # and restore the extension.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)




if __name__ == '__main__':
    # Strip stray whitespace from pasted ids so it does not end up in the URL.
    product_id = input("请输入要采集的商品id（比如：B00C3YADSK）：").strip()
    get_shopping(product_id)

美国站（com）采集（需要***！！）

比较坑爹的***

我的展示页面（国内免费***）

tx的付费***

什么鬼我的配送不到？无价格显示？

不管了，价格pass

采集效果

附上源码参考：

#国外亚马逊商品爬虫
#20200213
#https://www.amazon.com/dp/B07S3659V2


# -*- coding=utf-8 -*-
import requests
from fake_useragent import UserAgent
import re,os,time,random
from lxml import etree


def ua():
    """Build the HTTP headers sent with amazon.com product-page requests.

    The ``session-id`` and ``ubid-acbcn`` cookie values both end in the same
    random 7-digit number, regenerated on every call.

    Returns:
        dict: A ``User-Agent`` picked at random by ``fake_useragent`` and the
        ``Cookie`` header described above.
    """
    suffix = str(random.randint(1111111, 9999999))
    # NOTE(review): apart from the random suffix, the cookie looks like a
    # captured browser session and has presumably expired.
    return {
        'User-Agent': UserAgent().random,
        'Cookie': f'x-wl-uid=1eZRN4GNhENdZSGdOrvzQEy2WvlxT/sXztd0uB1drNz9lanSFUVkDtpyWsVQQfwSjhXmvZLrY67w=; session-id=459-1321777-{suffix}; ubid-acbcn=459-5647010-{suffix}; lc-acbcn=zh_CN; i18n-prefs=CNY; session-token=g6hxLDDoHhzZLHWxd7FnNbtphW7mG7zCPY29lJB7vwUfa73azlZ8jPh8iS6M+c/4mKa3c/d/Pzgiv61e7sJx858blgOf+pmyxOtu55z5AlVE2nRoPAyWFMeG4OKmZQI3Lg5/MNhcN71PW9x2OkQWWLOeqcikSKmxqaEQL9qGyYcnTbrYggdlInP0pROsR8oz; session-id-time=2082787201l; csm-hit=tb:s-KV6TYQQV77AQ5HHBPD94|1581595664859&t:1581595666568&adb:adblk_yes',
    }


#保存txt
def tx(id, text, path):
    """Write the scraped product text to ``<path><id>.txt`` as UTF-8.

    Args:
        id: Product id; used as the file name stem.
        text: Text to store.
        path: Directory prefix, including its trailing separator.
    """
    print("正在保存商品数据..")
    with open(f'{path}{id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">>>保存商品数据成功！")


#下载图片
def down(img_url, img_name, path):
    """Download one image and store it as ``<path><img_name>``.

    Args:
        img_url: Absolute URL of the image.
        img_name: File name to save the image under.
        path: Directory prefix, including its trailing separator.
    """
    headers = {'User-Agent': UserAgent().random}
    print(f"下载{img_name}图片..")
    r = requests.get(img_url, headers=headers, timeout=10)
    if r.status_code == 200:
        with open(f'{path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">>>下载{img_name}图片完成！")
    else:
        # Do not save an error page under an image file name.
        print(f">>>下载{img_name}图片失败（HTTP {r.status_code}）")
    # Pause between downloads, whether or not this one succeeded.
    time.sleep(1)


def get_shopping(id):
    """Scrape one amazon.com product page and save its details and images.

    Fetches the product page for ``id``, extracts the title, price and
    product description, writes them through ``tx`` into a directory named
    after the id, then passes every JPEG thumbnail on the page to ``down``.

    Args:
        id: Product id as used in the product URL (e.g. ``B07GJ2MWTZ``).

    Raises:
        IndexError: If no title is found in the page. The exception type
            matches what the unguarded ``[0]`` lookup used to raise.
    """
    url = f"https://www.amazon.com/dp/{id}"
    html = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    req = etree.HTML(html)

    # The previous pattern ended in a bare lazy group, which can only ever
    # capture a single character. Anchoring on the <title> element is
    # presumably what was intended; verify against a live page.
    titles = re.findall(
        r'<title>\s*Amazon\.com\s*:\s*(.+?)\s*</title>', html, re.S
    )
    if not titles:
        raise IndexError(
            f"no product title found for id {id!r}; the page may be blocked "
            "or the cookie may have expired"
        )
    title = titles[0]
    print(title)

    path = f'{id}/'
    os.makedirs(path, exist_ok=True)

    # The price is taken from the JSON embedded in the page. A missing price
    # is not fatal; everything else is still saved.
    prices = re.findall(
        r'"isPreorder":.+?,"price":(.+?),"doesMAPPolicyApply":.+?', html, re.S
    )
    price = f'${prices[0]}' if prices else ''
    print(price)

    description = '\n'.join(
        req.xpath('//div[@id="productDescription"]//text()')
    )
    tx(id, f'{url}\n{title}\n{price}\n{description}', path)

    for img in req.xpath('//span[@class="a-button-text"]/img/@src'):
        if 'jpg' not in img:
            continue
        # Drop everything after "._" (presumably the thumbnail size suffix)
        # and restore the extension.
        img_url = f"{img.split('._')[0]}.jpg"
        img_name = img_url.split('/')[-1]
        print(img_url, img_name)
        down(img_url, img_name, path)

    print(">>>下载图片完毕！")


if __name__ == '__main__':
    # Strip stray whitespace from pasted ids so it does not end up in the URL.
    product_id = input("请输入要采集的商品id（比如：B07GJ2MWTZ）：").strip()
    get_shopping(product_id)

查看更多关于电商商品爬虫，亚马逊amazon采集源码的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://www.haodehen.cn/did126013

更新时间：2022-11-28 阅读：38次