爬虫

时间:2019-11-25
本文章向大家介绍爬虫,主要包括爬虫使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

百度首页

import requests

# Fetch the Baidu homepage and save a local copy.
res = requests.get('https://www.baidu.com')
# Baidu serves UTF-8, but requests guesses ISO-8859-1 from the response
# headers, so set the decoding explicitly before reading res.text.
res.encoding = 'utf-8'
print(res.text)
# Pass encoding= explicitly: without it open() uses the platform default
# (e.g. GBK on Chinese Windows) and f.write() can raise UnicodeEncodeError
# on characters that encoding cannot represent.
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

百度图片

import requests

# Search Baidu for '图片' with a full browser-like header set and save the
# result page. The Cookie below is a captured, account-bound session value —
# it will expire and should not be committed to source control.
res = requests.get('https://www.baidu.com/s',
                   params={'wd': '图片'},
                   headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                            'Cache-Control': 'no-cache',
                            'Connection': 'keep-alive',
                            'Cookie': 'BD_UPN=12314753; PSTM=1572350125; BAIDUID=79D0925D8720B930D1F1E5BFF612720F:FG=1; BIDUPSID=AA6E74403EED680B571512C161DCBEA9; BDUSS=EyeXBkQXJNZ1Q0QXk0dzhoTlh1ODFzUzNwa0lySWJwMFBrOVJHMS1SNn5ILTFkRVFBQUFBJCQAAAAAAAAAAAEAAACxNoeFsM3A6GZlbGzIyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL-SxV2~ksVdRE; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=2; BD_HOME=1; H_PS_PSSID=1449_21086_18560_20698_29567_29220_26350; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_645EC=2d24IwpbvK2eVobcmeLgWHGcv8LmvTpWTYgrzRwRetwbEpdCPi08ahOlrNs; COOKIE_SESSION=15438_1_7_5_14_10_0_1_3_5_39_3_72210_0_0_0_1574650244_1574491787_1574665633%7C9%233409_3_1574491763%7C2',
                            'Host': 'www.baidu.com',
                            'Pragma': 'no-cache',
                            'Sec-Fetch-Mode': 'navigate',
                            'Sec-Fetch-Site': 'none',
                            'Sec-Fetch-User': '?1',
                            'Upgrade-Insecure-Requests': '1'
                            })
# Baidu's result page is UTF-8 (the homepage snippet above decodes it as
# such); decoding it as GBK produces mojibake, so use utf-8 here too.
res.encoding = 'utf-8'
print(res.text)

# Match the file encoding to the decoded text to avoid UnicodeEncodeError
# on platforms whose default codec is not UTF-8.
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

华华手机商城

import requests

# Browser-like headers shared by both the login POST and the follow-up GET.
common_headers = {'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

# Form fields the site expects for a login submission.
# NOTE(review): plaintext account credentials and a captcha value are
# hard-coded here — they belong in a config/secret store, not in source.
login_form = {
    'username': '2960113637@qq.com',
    'password': 'zrh960906*',
    'captcha': 'GC3T',
    'remember': 1,
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login',
}

# Submit the login request.
login_res = requests.post('http://www.aa7a.cn/user.php',
                          headers=common_headers,
                          data=login_form)

# On success the server issues session cookies; capture them as a dict.
session_cookies = login_res.cookies.get_dict()

# Re-request the homepage with those cookies attached; a logged-in page
# embeds the account's email address.
home_res = requests.get('http://www.aa7a.cn/',
                        headers=common_headers,
                        cookies=session_cookies)

print("登录成功" if '2960113637@qq.com' in home_res.text else "没有登录")

梨视频

import requests
import re

# Scrape the pear-video category listing, then download each video file.
# This URL is the XHR the site fires when the category page loads more items.
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')
# Extract the relative detail-page links from the returned HTML fragment.
reg_text = r'<a href="(.*?)" class="vervideo-lilink actplay">'
obj = re.findall(reg_text, res.text)

# Compile the loop-invariant pattern once instead of on every iteration.
src_url_pat = re.compile(r'srcUrl="(.*?)"')

for url in obj:
    url = 'https://www.pearvideo.com/' + url  # build the absolute detail URL
    res1 = requests.get(url)
    obj1 = src_url_pat.findall(res1.text)
    if not obj1:
        # Page layout changed or the video was removed — skip instead of
        # crashing on obj1[0] with an IndexError.
        continue
    print(obj1[0])
    # Use the last path segment of the media URL as the local filename.
    name = obj1[0].rsplit('/', 1)[1]
    # stream=True defers the body download so iter_content actually streams;
    # without it the whole file is already in memory and the default
    # chunk size of 1 byte makes the write loop extremely slow.
    res2 = requests.get(obj1[0], stream=True)
    with open(name, 'wb') as f:
        for chunk in res2.iter_content(chunk_size=8192):
            f.write(chunk)

原文地址:https://www.cnblogs.com/zrh-960906/p/11929149.html