python百度关键词相关搜索采集,链轮查询采集相关关键词工具exe
1.随机生成协议头 2.关键词相关筛选 3.关键词去重 4.链轮采集
# -*- coding: UTF-8 -*-
"""Collect Baidu "related search" keywords by chained (link-wheel) crawling.

Features: random User-Agent header, keyword filtering against a fixed
vocabulary, de-duplication, and round-by-round expansion in which every
newly accepted keyword is queried in turn.

Originally dated 2019-11-18.
"""
import re
import time

# NOTE: requests, bs4 and fake_useragent are imported inside the functions
# that use them, so the pure helpers (search, get_keywords with an injected
# fetcher) remain importable without those packages installed.

# Filter vocabulary: a related keyword is kept only if it contains one of
# these alternatives.
req = "工业|产品|外观|结构|造型|手机|犀牛|rhino|proe"


def ua():
    """Return request headers carrying a randomly chosen User-Agent."""
    from fake_useragent import UserAgent

    return {"User-Agent": UserAgent().random}


def search(req, con, n):
    """Return group *n* of the first match of pattern *req* in *con*.

    Returns the sentinel string ``'no'`` when the pattern does not match.
    """
    match = re.search(req, con)
    return match.group(n) if match else 'no'


def get_a(key):
    """Fetch the search result page for *key* and return its related links.

    Returns the ``<a>`` tags found inside ``<div id="rs">``, or an empty
    list when the page has no such element.

    Raises whatever ``requests.get`` raises on network failure or timeout.
    """
    import requests
    from bs4 import BeautifulSoup

    # The host in the published snippet was corrupted by the hosting site
    # ("HdhCmsTestbaidu测试数据"); restored to the real domain.
    # Passing the query through ``params`` lets requests URL-encode the
    # keyword, so characters such as '&' or '#' cannot break the query.
    response = requests.get(
        "https://www.baidu.com/s",
        params={"ie": "utf-8", "tn": "baidu", "wd": key},
        headers=ua(),
        timeout=5,
    )
    time.sleep(2)  # throttle between consecutive requests
    soup = BeautifulSoup(response.text, 'lxml')
    related = soup.find('div', id='rs')
    if related is None:
        # find() returns None when the element is absent.
        return []
    return related.find_all('a')


def _fetch_related(key, fetch):
    """Return the related keyword texts for *key*, retrying once on failure.

    A second failure is reported and yields an empty list, so one bad
    keyword does not abort the whole crawl.
    """
    for attempt in range(2):
        try:
            return [a.get_text() for a in fetch(key)]
        except Exception as e:  # scraper boundary: report and carry on
            print(f'错误代码:{e}')
            if attempt == 0:
                print('正在重新获取网页内容...')
                time.sleep(5)
    return []


def get_keywords(keywords, max_depth=None, fetch=None):
    """Crawl related keywords starting from the seed list *keywords*.

    Each round queries every keyword of the current frontier, keeps the
    related keywords that match the module-level ``req`` filter and have
    not been seen before, and uses those as the next frontier. The crawl
    stops when a round finds nothing new or after *max_depth* rounds.

    Args:
        keywords: seed keywords; the list is not modified.
        max_depth: maximum number of rounds, or None for no limit.
        fetch: callable taking a keyword and returning objects that have a
            ``get_text()`` method; defaults to ``get_a``.

    Returns:
        The newly collected keywords (seeds excluded), in discovery order.
    """
    if fetch is None:
        fetch = get_a
    pattern = r'(%s)' % req
    seen = set(keywords)  # O(1) de-duplication across all rounds
    collected = []
    frontier = list(keywords)  # copy: never mutate the caller's list
    depth = 0
    while frontier and (max_depth is None or depth < max_depth):
        fresh = []
        for key in frontier:
            print(">>>开始查询 %s 相关关键词!" % key)
            for keyword in _fetch_related(key, fetch):
                print(keyword)
                if search(pattern, keyword, 1) == 'no':
                    print(f'-剔除关键词 {keyword}')
                    continue
                print(f'>>获取关键词 {keyword}')
                if keyword not in seen:
                    seen.add(keyword)
                    fresh.append(keyword)
        collected.extend(fresh)
        frontier = fresh  # next round queries only the new keywords
        depth += 1
    return collected


if __name__ == '__main__':
    get_keywords(["工业设计培训"])
代码参考来源:流量贩子 《seo应用编程》
版本二
百度相关搜索关键词抓取 1.读取txt文档关键词 2.导出txt关键词 3.多线程采集关键词
# -*- coding=utf-8 -*-
"""Fetch Baidu "related search" keywords for every keyword in gjc.txt.

Keywords are read from gjc.txt (one per line), queried concurrently with a
thread pool, and every related keyword found is appended to gjcsj.txt.
"""
import re
import threading
import time
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlencode

# NOTE: requests is imported inside xgss() so the parsing and URL helpers
# remain importable without it.

SEARCH_URL = "https://www.baidu.com/s"
# Plain relative names replace ".\gjc.txt" / ".\gjcsj.txt": same files on
# Windows, and no invalid "\g" escape or Windows-only separator.
KEYWORD_FILE = "gjc.txt"
RESULT_FILE = "gjcsj.txt"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Markup this script expects around the related-search table and its links.
_RELATED_BLOCK_RE = re.compile(
    r'<div id="rs"><div class="tt">相关搜索</div><table cellpadding="0">(.+?)</table></div>',
    re.S,
)
_RELATED_LINK_RE = re.compile(r'<th><a href="(.+?)">(.+?)</a></th>', re.S)

# Worker threads all append to the same result file; serialize the writes.
_write_lock = threading.Lock()


def parse_related(html):
    """Return the related-search keyword texts found in *html*.

    Returns an empty list when the expected related-search block is absent.
    """
    keywords = []
    for block in _RELATED_BLOCK_RE.findall(html):
        keywords.extend(text for _href, text in _RELATED_LINK_RE.findall(block))
    return keywords


def build_url(keyword):
    """Return the search URL for *keyword*, stripped and URL-encoded."""
    return f"{SEARCH_URL}?{urlencode({'wd': keyword.strip()})}"


def load_keywords(path=KEYWORD_FILE):
    """Read keywords from *path*, one per line, skipping blank lines."""
    with open(path, "r", encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def xgss(url):
    """Query *url* and append the related keywords it lists to gjcsj.txt.

    Returns the keywords as one newline-terminated string, or an empty
    string when the request fails or nothing is found.
    """
    import requests

    try:
        html = requests.get(url, headers=HEADERS, timeout=10).text
    except requests.RequestException as exc:
        # Report and skip, so one failed request does not abort the batch.
        print(f"Error: {url}: {exc}")
        return ''
    found = parse_related(html)
    for keyword in found:
        print(keyword)
    gjc = ''.join(f"{keyword}\n" for keyword in found)
    # Append the keywords to the result text file.
    with _write_lock, open(RESULT_FILE, 'a', encoding='utf-8') as f:
        f.write(gjc)
    print("-----------------------------------")
    return gjc


def main():
    """Load the keyword list, query every keyword concurrently, then exit."""
    print("程序运行,正在导入关键词列表!!!")
    print("-----------------------------------")
    keywords = load_keywords()
    print("导入关键词列表成功!")
    print("-----------------------------------")
    # Turn every keyword into a search URL.
    urls = [build_url(keyword) for keyword in keywords]
    print("采集百度相关搜索关键词开启!")
    print("...................")
    try:
        # With no argument the pool size defaults to the CPU core count.
        with ThreadPool() as pool:
            pool.map(xgss, urls)
        print("采集百度相关搜索关键词完成,已保存于gjcsj.txt!")
    except Exception as exc:
        # Report the actual failure instead of a fixed, misleading message.
        print(f"Error: {exc}")
    print("8s后程序自动关闭!!!")
    time.sleep(8)


if __name__ == '__main__':
    main()
exe下载地址: 链接: https://pan.baidu.com/s/1RhmZ99dYCSIJsEe-SnlhXQ 提取码: 9sjs
查看更多关于python百度关键词相关搜索采集,链轮查询采集相关关键词工具exe的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did126035