Python与seo,百度关键词相关搜索关键词采集源码
时间:2022-07-22
本文向大家介绍 Python 与 SEO:百度关键词相关搜索关键词采集源码,主要内容包括使用实例、应用技巧、基本知识点总结和注意事项,具有一定的参考价值,需要的朋友可以参考一下。
百度关键词相关搜索关键词采集源码
#百度相关搜索
# -*- coding=utf-8 -*-
#20200714 by 微信:huguo00289
import requests,time,random
from lxml import etree
from fake_useragent import UserAgent
def get_keyword(keyword):
    """Collect Baidu "related searches" terms for *keyword*.

    Requests the Baidu result page with a random User-Agent supplied by
    ``fake_useragent`` and returns the text nodes found under
    ``<div id="rs">`` with the '相关搜索' heading removed.  If anything in
    the request/parse sequence raises, waits 5 seconds and retries once
    through ``get_ua_keyword``.

    Args:
        keyword: Search term; surrounding whitespace is stripped.

    Returns:
        List of related-search strings; whatever ``get_ua_keyword``
        returns when the first attempt fails.
    """
    data = []
    # Callers may pass raw lines read from a file; a trailing newline must
    # not end up in the query string.
    keyword = keyword.strip()
    ua = UserAgent()
    headers = {
        # NOTE(review): hard-coded session cookie (contains a BDUSS_BFESS
        # token) -- presumably captured from a browser session and likely
        # expired; consider loading it from configuration instead.
        'Cookie': 'PSTM=1558408522; BIDUPSID=BFDF2424811E5E531D933DC854B78C67; BAIDUID=BFDF2424811E5E531D933DC854B78C67:SL=0:NR=10:FG=1; MSA_WH=375_812; BD_UPN=12314353; H_WISE_SIDS=144367_142699_144157_142019_144883_141875_141744_143161_144989_144420_144134_142919_144483_136861_131246_137745_144743_138883_140259_141942_127969_144171_140065_144338_140593_143057_141808_140350_144608_144727_143923_131423_144289_142206_144220_144501_107312_143949_144105_144306_143478_144966_142911_140312_143549_143647_144239_142113_143855_136751_140842_110085; BDUSS_BFESS=1vQzN4d0pPNzB2MUQyUUQtV3d6OEZzYldhN2FWUm1RZEZ3UUVyb1Y1Mmtqc0JlSVFBQUFBJCQAAAAAAAAAAAEAAACgwJmS08W4xcTuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKQBmV6kAZleVW; MCITY=-%3A; sug=3; sugstore=0; ORIGIN=0; bdime=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=LuuOJexroG3_dMRrBfK9UG9zgmKK0gOTDYLEUamaI2AU2V4VN4vPEG0Pt_U-mEt-J8jwogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tbkD_C-MfIvhDRTvhCcjh-FSMgTBKI62aKDs2P5aBhcqJ-ovQTbrbMuwK45hB5cP3b5E0b6cWKJJ8UbeWfvp3t_D-tuH3lLHQJnp2DbKLp5nhMJmBp_VhfL3qtCOaJby523ion3vQpP-OpQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0-nDSHH-tt6De3j; delPer=0; BD_CK_SAM=1; PSINO=7; H_PS_PSSID=1457_31670_32141_32139_32046_32230_32092_32298_26350_32261; COOKIE_SESSION=7_0_4_5_9_5_0_3_2_3_0_0_1608_0_0_0_1594720087_0_1594723266%7C9%23328033_18_1594447339%7C9; H_PS_645EC=8046hkQMotVPI51%2B5I0oGWsgl5ams9mPpS71Aw1L%2FgLPGzpf4I2A6FpO8U4',
        'User-Agent': ua.random,
    }
    try:
        # `params` lets requests percent-encode the keyword, so characters
        # such as '&' or '#' cannot corrupt the query string.
        response = requests.get(
            "https://www.baidu.com/s",
            params={'wd': keyword, 'ie': 'UTF-8'},
            headers=headers,
            timeout=5,
        )
        html = response.content.decode('utf-8')
        time.sleep(2)
        req = etree.HTML(html)
        tt = req.xpath('//div[@id="rs"]//text()')
        # Raises ValueError when the heading is absent (e.g. the block was
        # not served), which deliberately routes to the fallback below.
        tt.remove('相关搜索')
        print(tt)
        data = tt
    except Exception as e:
        print(e.args)
        time.sleep(5)
        print(f">> 等待5s,正在尝试重新采集 {keyword} 相关关键词")
        # Keep the fallback's result; previously it was thrown away and the
        # function always returned [] after a retry.
        data = get_ua_keyword(keyword)
    return data
def get_ua_keyword(keyword):
    """Collect Baidu "related searches" terms using a fixed User-Agent pool.

    Requests the Baidu result page with a User-Agent picked at random from
    a built-in list.  When the page contains '相关搜索', returns the text
    nodes under ``<div id="rs">`` with that heading removed; otherwise
    returns an empty list.  Any exception during the request or parsing is
    printed and the keyword is appended to ``fail_keywords.txt``.

    Args:
        keyword: Search term; surrounding whitespace is stripped.

    Returns:
        List of related-search strings, or ``[]`` when none were found or
        the attempt failed.
    """
    data = []
    # Strip so a raw file line does not put a newline in the query or a
    # blank line in the failure log.
    keyword = keyword.strip()
    print(f'>> 正在采集 {keyword} 相关关键词..')
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    ]
    headers = {
        # NOTE(review): hard-coded session cookie (contains a BDUSS_BFESS
        # token) -- presumably captured from a browser session and likely
        # expired; consider loading it from configuration instead.
        'Cookie': 'PSTM=1558408522; BIDUPSID=BFDF2424811E5E531D933DC854B78C67; BAIDUID=BFDF2424811E5E531D933DC854B78C67:SL=0:NR=10:FG=1; MSA_WH=375_812; BD_UPN=12314353; H_WISE_SIDS=144367_142699_144157_142019_144883_141875_141744_143161_144989_144420_144134_142919_144483_136861_131246_137745_144743_138883_140259_141942_127969_144171_140065_144338_140593_143057_141808_140350_144608_144727_143923_131423_144289_142206_144220_144501_107312_143949_144105_144306_143478_144966_142911_140312_143549_143647_144239_142113_143855_136751_140842_110085; BDUSS_BFESS=1vQzN4d0pPNzB2MUQyUUQtV3d6OEZzYldhN2FWUm1RZEZ3UUVyb1Y1Mmtqc0JlSVFBQUFBJCQAAAAAAAAAAAEAAACgwJmS08W4xcTuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKQBmV6kAZleVW; MCITY=-%3A; sug=3; sugstore=0; ORIGIN=0; bdime=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID=LuuOJexroG3_dMRrBfK9UG9zgmKK0gOTDYLEUamaI2AU2V4VN4vPEG0Pt_U-mEt-J8jwogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tbkD_C-MfIvhDRTvhCcjh-FSMgTBKI62aKDs2P5aBhcqJ-ovQTbrbMuwK45hB5cP3b5E0b6cWKJJ8UbeWfvp3t_D-tuH3lLHQJnp2DbKLp5nhMJmBp_VhfL3qtCOaJby523ion3vQpP-OpQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0-nDSHH-tt6De3j; delPer=0; BD_CK_SAM=1; PSINO=7; H_PS_PSSID=1457_31670_32141_32139_32046_32230_32092_32298_26350_32261; COOKIE_SESSION=7_0_4_5_9_5_0_3_2_3_0_0_1608_0_0_0_1594720087_0_1594723266%7C9%23328033_18_1594447339%7C9; H_PS_645EC=8046hkQMotVPI51%2B5I0oGWsgl5ams9mPpS71Aw1L%2FgLPGzpf4I2A6FpO8U4',
        'User-Agent': random.choice(ua_list)
    }
    try:
        # The request lives inside the try so that timeouts and connection
        # errors are recorded as failed keywords instead of aborting the run.
        # `params` makes requests percent-encode the keyword.
        response = requests.get(
            "https://www.baidu.com/s",
            params={'wd': keyword, 'ie': 'UTF-8'},
            headers=headers,
            timeout=5,
        )
        html = response.content.decode('utf-8')
        time.sleep(2)
        if '相关搜索' in html:
            req = etree.HTML(html)
            tt = req.xpath('//div[@id="rs"]//text()')
            tt.remove('相关搜索')
            print(tt)
            data = tt
        else:
            print(f">> {keyword} 无相关关键词!! ")
            data = []
    except Exception as e:
        print(e.args)
        print(f">> 采集 {keyword} 相关关键词失败!! ")
        print('>> 正在保存失败关键词..')
        with open('fail_keywords.txt', 'a+', encoding='utf-8') as f:
            # One keyword per line; the original wrote a literal "n" here.
            f.write(f'{keyword}\n')
    return data
def lead_keywords():
    """Read the seed keyword list from ``keyss.txt`` in the working directory.

    The file is decoded as UTF-8 first (a leading BOM, if any, is dropped)
    and as GBK only when that fails.  UTF-8 must be tried first because it
    is strict, whereas GBK accepts many UTF-8 byte sequences and would
    return mojibake without raising.

    Returns:
        The file's lines as returned by ``readlines()`` (line endings are
        kept).

    Raises:
        OSError: If ``keyss.txt`` cannot be opened.
        UnicodeDecodeError: If the file is neither valid UTF-8 nor GBK.
    """
    print('>> 正在导入关键词列表..')
    try:
        with open('keyss.txt', 'r', encoding='utf-8-sig') as f:
            keywords = f.readlines()
    except UnicodeDecodeError:
        # Only a decoding failure justifies the fallback; other errors
        # (missing file, permissions) propagate unchanged.
        with open('keyss.txt', 'r', encoding='gbk') as f:
            keywords = f.readlines()
    print(keywords)
    print('>> 正在导入关键词列表成功!')
    return keywords
def save(datas):
    """Write the collected keywords to ``keywords.txt``, one per line.

    The file is created in the working directory and overwritten if it
    already exists.

    Args:
        datas: Iterable of keyword strings.
    """
    print('>> 正在保存相关关键词列表..')
    with open('keywords.txt', 'w', encoding='utf-8') as f:
        # Join with a real newline; the original joined with the letter "n",
        # which fused all keywords into a single line.
        f.write('\n'.join(datas))
    print('>> 正在保存相关关键词列表成功!')
def main():
    """Collect related-search terms for every seed keyword and save them.

    Loads the seed list with ``lead_keywords``, queries each non-blank
    entry through ``get_keyword``, and passes the combined results to
    ``save``.
    """
    datas = []
    keywords = lead_keywords()
    for raw_keyword in keywords:
        # str.strip() returns a new string; the original discarded it and
        # searched for the keyword with its trailing newline attached.
        keyword = raw_keyword.strip()
        if not keyword:
            # Blank lines in the seed file are not searchable terms.
            continue
        datas.extend(get_keyword(keyword))
    save(datas)
# Run the collector only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
- 批量导出csv文件的基本尝试(r8笔记第44天)
- Golang 中的并发限制与超时控制
- 一条简单的报警信息发现的oracle bug(r8笔记第42天)
- 一条insert语句导致的性能问题分析(一)(r8笔记第40天)
- 一条insert语句导致的性能问题分析(二)(r8笔记第43天)
- dataguard中的密码文件管理(r8笔记第39天)
- Mybatis_day02
- Golang构建HTTP服务(一)--- net/http库源码笔记
- Golang构建HTTP服务(二)--- Handler,ServeMux与中间件
- 使用Let's Encrypt的SSL证书配置HTTPS手记
- Mybatis_day01
- golang 如何验证struct字段的数据格式
- ggplot2 画ROC曲线
- 47. 访问MySql数据库实现增删改查 | 厚土Go学习笔记
- JavaScript 教程
- JavaScript 编辑工具
- JavaScript 与HTML
- JavaScript 与Java
- JavaScript 数据结构
- JavaScript 基本数据类型
- JavaScript 特殊数据类型
- JavaScript 运算符
- JavaScript typeof 运算符
- JavaScript 表达式
- JavaScript 类型转换
- JavaScript 基本语法
- JavaScript 注释
- Javascript 基本处理流程
- Javascript 选择结构
- Javascript if 语句
- Javascript if 语句的嵌套
- Javascript switch 语句
- Javascript 循环结构
- Javascript 循环结构实例
- Javascript 跳转语句
- Javascript 控制语句总结
- Javascript 函数介绍
- Javascript 函数的定义
- Javascript 函数调用
- Javascript 几种特殊的函数
- JavaScript 内置函数简介
- Javascript eval() 函数
- Javascript isFinite() 函数
- Javascript isNaN() 函数
- parseInt() 与 parseFloat()
- escape() 与 unescape()
- Javascript 字符串介绍
- Javascript length属性
- javascript 字符串函数
- Javascript 日期对象简介
- Javascript 日期对象用途
- Date 对象属性和方法
- Javascript 数组是什么
- Javascript 创建数组
- Javascript 数组赋值与取值
- Javascript 数组属性和方法
- UML类图自动生成,太爽了
- Python爬虫之mongodb介绍和安装
- 一句话说清楚 CountDownLatch 和 CyclicBarrier 的区别
- Android 功耗(8)---如何找到阻止进入deep idle SODI的元凶
- 【技术创作101训练营】三种不同场景下 vue 组件动态加载的方法及实现
- leetcode链表之回文链表
- 如何理解 Go 中的反射
- Synchronized深入分析
- Spring的一些零碎知识点整理
- CentOS7上安装并配置KVM,以及通过KVM安装CentOS系统
- 建议收藏 哭着喊着 从C语言转向C++刷算法
- Spring的事务管理
- 面向切面的Spring
- 搭建ELK日志分析平台(下)—— 搭建kibana和logstash服务器
- Spring对JDBC的模板支持——JdbcTemplate