Day05 - 码农教程

requests的POST请求：

'''
post请求登录github

Request URL:
    https://github.com/session

Request
    Method: POST

#Referer表示上一个请求的页面
Referer:
    https://github.com/login

User-Agent:
    Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

请求体：
    只有POST请求才有
    commit: Sign in
    utf8: ✓
    authenticity_token: COh/MDoDDUVauDtPbZ2A6pjf4pEA4pV8jwRO8PjHPzbXiLJiwtCXRQ7Ik3kXWxJOOF+i5/1r9twxUqaUnXe5TA==
    login: HS1
    password: ***********
    webauthn-support: unsupported
'''

'''
# 1.访问login页面获取token信息

Request URL: 
    https://github.com/login

Request Method: 
    GET

#服务端告诉客户端需要设置的Cookies
响应头：
    Set-Cookie

请求头：
    Cookie
    User-Agent
'''

# Browser-like User-Agent sent with the GET request to the login page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

import requests
import re

url1 = 'https://github.com/login'
response = requests.get(url1, headers=headers)

# Convert the cookies returned by the login page into a plain dict.
login_cookies = response.cookies.get_dict()
# print(response.text)

# Extract the hidden authenticity_token form field from the login page HTML.
# re.findall returns a list, which is empty when the pattern does not match.
token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)
# print(token[0])


'''
# 2.往session页面发送post请求

Request URL: 
    https://github.com/session

Request Method: 
    POST

请求头：
    Cookie
    User-Agent
'''
url2 = 'https://github.com/session'

# Fail with a clear message instead of an opaque IndexError on token[0] when
# the login page did not contain the expected hidden authenticity_token field.
if not token:
    raise RuntimeError('authenticity_token not found in the GitHub login page')

# Assemble the form body of the login request.
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token[0],
    'login': '********',
    'password': '***********',
    'webauthn-support': 'unsupported'
}

# Referer names the page the request originates from (the login page).
headers1 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# Send the POST with the request headers, the form body and the cookies
# obtained from the login page.
response2 = requests.post(url2, data=form_data, headers=headers1, cookies=login_cookies)

# print(response2.status_code)
# print(response2.text)

# Save the returned page so the login result can be inspected in a browser.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)

response响应：

import requests

response = requests.get('https://www.baidu.com')

print(response.status_code)     # HTTP status code of the response
print(response.url)     # URL of the response
print(response.text)     # response body decoded as text
print(response.content)     # response body as raw bytes
print(response.headers)     # response headers (not the request headers)
print(response.history)     # earlier responses in the redirect chain, if any
print(response.cookies)     # cookies as a cookie-jar object
print(response.cookies.get_dict())     # cookies converted to a plain dict
print(response.cookies.items())     # cookies as (name, value) pairs
print(response.encoding)     # character encoding used to decode .text
print(response.elapsed)     # time between sending the request and receiving the response


# Write a binary stream to disk piece by piece.
import requests

# Send a GET request to the video URL.

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
# stream=True defers downloading the body until it is iterated.
response = requests.get(url, stream=True)

# NOTE: response.content is deliberately not accessed here -- reading it
# would load the entire body into memory and defeat stream=True.
with open('love_for_GD.mp4', 'wb') as f:
    # iter_content() defaults to 1-byte chunks; a larger chunk size avoids
    # one write call per byte.
    for content in response.iter_content(chunk_size=8192):
        f.write(content)

requests高级应用：

'''
证书验证(大部分网站都是https)
'''
import requests


# https = http + ssl
# Default behaviour: the server certificate is verified.
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)


# Improvement 1: suppress the error, but a warning is still emitted.
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)
# Certificate is not verified; the original note reports a warning and a 200 status.
print(response.status_code)

# Improvement 2: suppress the error and the warning as well.
import requests
import urllib3
urllib3.disable_warnings()  # turn the warnings off
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# Improvement 3: supply a certificate (pseudo-code; the paths are placeholders).
# Many sites use https yet can be accessed without a client certificate;
# in most cases sending one is optional (e.g. Zhihu, Baidu).
# Where it is strictly required it must be sent, e.g. for targeted users who
# may only access a particular site once they hold the certificate.
import requests
response = requests.get(
    'https://www.xiaohuar.com',
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)


'''
超时设置

# 两种超时:float or tuple
# timeout=0.1  # 单个数值同时作为连接超时和接收数据(读取)的超时时间
# timeout=(0.1,0.2)  # 0.1代表链接超时  0.2代表接收数据的超时时间
'''

import requests

# NOTE(review): a 0.0001 s timeout is presumably chosen to demonstrate a
# timeout error rather than a successful request -- confirm intent.
response = requests.get('https://www.baidu.com',
                        timeout=0.0001)
print(response.status_code)


'''
使用代理（重要指数*******）

# 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)
# 西刺代理
'''

import requests
# Map each URL scheme to the proxy that should carry its requests.
# A dict can hold only one value per key: the original listed 'http' twice,
# so the authenticated entry was silently replaced by the plain one.
# Enable exactly one of the two 'http' variants.
proxies = {
    # Proxy that needs credentials: "user:password@" precedes the host.
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)


# SOCKS proxies are also supported; install with: pip install requests[socks]
import requests
# user, pass, host and port are placeholders to be filled in.
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
respone = requests.get('https://www.12306.cn', proxies=proxies)

print(respone.status_code)


'''
# 认证设置
登录网站时，会弹出一个框，要求你输入用户名与密码（类似于alert），此时无法进入html页面，待授权通过后才能进入html页面。

Requests模块为我们提供了多种身份认证方式，包括基本身份认证等...

其原理指的是通过输入用户名与密码获取用户的凭证来识别用户，然后通过token对用户进行授权。
基本身份认证:
    HTTP Basic Auth是HTTP1.0提出的认证方式。客户端对于每一个realm，通过提供用户名和密码来进行认证；当认证失败时，服务器收到客户端请求后返回401。

'''
import requests
# Use the GitHub API as the test target.
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: no credentials supplied; the request fails with 401.
response = requests.get(url, headers=HEADERS)
print(response.status_code)  # 401
print(response.text)


# Test 2: authenticate with HTTPBasicAuth from requests.auth; on success the
# user information is returned. The credentials below are placeholders.
from requests.auth import HTTPBasicAuth
response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('uesrname', 'pwd'))
print(response.text)


# Test 3: a plain (user, password) tuple passed as auth is treated as
# HTTPBasicAuth by default; on success the user information is returned.
response = requests.get(url, headers=HEADERS, auth=('*******', '**********'))
print(response.text)


'''
上传文件
'''
import requests

# Each upload opens its file in a with-block so the handle is closed once the
# request has been sent; the original left all three handles open.

# Upload a text file.
with open('user.txt', 'rb') as f:
    files1 = {'file': f}
    response = requests.post('http://httpbin.org/post', files=files1)
print(response.status_code)  # 200
print(response.text)  # response body

# Upload an image file.
with open('小狗.jpg', 'rb') as f:
    files2 = {'jpg': f}
    response = requests.post('http://httpbin.org/post', files=files2)
print(response.status_code)  # 200
print(response.text)  # response body

# Upload a video file.
with open('love_for_GD.mp4', 'rb') as f:
    files3 = {'movie': f}
    response = requests.post('http://httpbin.org/post', files=files3)
print(response.status_code)  # 200
print(response.text)  # response body

selenium使用：

'''
selenium模块讲解
一、什么是selenium
    selenium最初是一个自动化测试工具，使用它驱动浏览器自动去执行某些自定义好的操作，
    例如在页面中执行javaScript代码、跳过登陆验证等，可以使用selenium实现爬虫

二、为什么要用selenium
    1.优点
    使用requests模块登陆需要分析大量的复杂通信流程，使用selenium可以轻松跳过登陆验证

    2.缺点
    浏览器会加载css，js，图片，视频，，，数据。爬虫效率相比requests模块要低

三、如何用
    1.下载selenium模块
    pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium

    2.下载浏览器驱动
    http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# selenium之第一次
# 用来驱动浏览器的
from selenium import webdriver

# 调用得到一个动作链对象，破解滑动验证码的时候用的 可以拖动图片
from selenium.webdriver import ActionChains

# 按照什么方式查找属性，By.ID,By.CSS_SELECTOR，By.Class
from selenium.webdriver.common.by import By

# 键盘按键操作
from selenium.webdriver.common.keys import Keys

# 和下面WebDriverWait一起用的
from selenium.webdriver.support import expected_conditions as EC

# 等待页面加载某些元素
from selenium.webdriver.support.wait import WebDriverWait

import time

# Launch Chrome through chromedriver; `chrome` is the driver object.
# Chrome() can be given the absolute path of the driver executable, or the
# driver can be placed in the Python interpreter's Scripts folder with that
# folder added to the PATH environment variable.
chrome = webdriver.Chrome()

# Visit the Baidu home page and run a search.
try:
    # Explicit wait: argument 1 is the driver, argument 2 the timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # 1. Open Baidu.
    chrome.get('http://www.baidu.com')

    # 2. Wait until the search input (id="kw") is present in the page.
    #    The locator is a tuple: (lookup strategy, value).
    input_tag = wait.until(
        EC.presence_of_element_located(
            (By.ID, 'kw')
        )
    )

    # 3. Type the search keyword.
    input_tag.send_keys('一拳超人')

    # 4. Press the Enter key to submit the search.
    input_tag.send_keys(Keys.ENTER)

    # Keep the browser open long enough to look at the result.
    time.sleep(50)

# The browser is shut down whether or not an exception occurred.
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() only closes the current window.
    chrome.quit()


'''
访问京东主页

'''
# The driver used by the preceding example has already been shut down in its
# finally-block, so this example starts a browser session of its own.
chrome = webdriver.Chrome()
try:
    # Explicit wait: argument 1 is the driver, argument 2 the timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # 1. Open the JD home page.
    chrome.get('https://www.jd.com')

    # 2. Wait until the search input (id="key") is present in the page.
    #    The locator is a tuple: (lookup strategy, value).
    input_tag = wait.until(
        EC.presence_of_element_located(
            (By.ID, 'key')
        )
    )

    # 3. Type the search keyword.
    input_tag.send_keys('唐诗三百首')

    # 4. Locate the search button; wait until it can be clicked, because mere
    #    presence in the DOM does not mean a click will be accepted.
    button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'button')))

    # 5. Click the search button.
    button.click()

    # Keep the browser open long enough to look at the result.
    time.sleep(50)

# The browser is shut down whether or not an exception occurred.
finally:
    # quit() ends the whole WebDriver session; close() only closes the
    # current window.
    chrome.quit()

'''
选择器
# 自动登陆百度
'''
from selenium import webdriver
import time

'''
===============所有方法===================
    element是查找一个标签
    elements是查找所有标签

    1、find_element_by_link_text  通过链接文本去找
    2、find_element_by_id 通过id去找
    3、find_element_by_class_name
    4、find_element_by_partial_link_text
    5、find_element_by_name
    6、find_element_by_css_selector
    7、find_element_by_tag_name
'''

# Locator strategies for find_element(); the find_element_by_* helpers used
# originally were removed in Selenium 4, while find_element(By.X, value) is
# available in both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Obtain the driver object.
driver = webdriver.Chrome()
try:

    # Log in to Baidu automatically.
    # Send a GET request to Baidu.
    driver.get('https://www.baidu.com')

    # Implicit wait: element lookups retry for up to 10 seconds.
    driver.implicitly_wait(10)

    # 1. By.LINK_TEXT: match a link by its full text.
    # Look the tag up by the text '登录'.
    # send_tag = driver.find_element(By.LINK_TEXT, '登录')
    # send_tag.click()

    # 2. By.PARTIAL_LINK_TEXT: match a link by part of its text.
    send_tag = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    send_tag.click()

    # 3. By.CLASS_NAME
    send_tag = driver.find_element(By.CLASS_NAME, 'tang-pass-footerBarULogin')
    send_tag.click()

    time.sleep(1)

    # 4. By.NAME
    username = driver.find_element(By.NAME, 'userName')
    username.send_keys('********')

    time.sleep(1)

    # 5. By.ID
    password = driver.find_element(By.ID, 'TANGRAM__PSP_10__password')
    password.send_keys('********')

    time.sleep(1)

    # 6. By.CSS_SELECTOR: look up with a CSS selector.
    login = driver.find_element(By.CSS_SELECTOR, '#TANGRAM__PSP_10__submit')
    # css = driver.find_element(By.CSS_SELECTOR, '.pass-button-submit')
    login.click()

    # 7. By.TAG_NAME: look up by tag name.
    # driver.find_element(By.TAG_NAME, 'div')

    time.sleep(10)

finally:
    # quit() ends the whole WebDriver session; close() only closes the
    # current window.
    driver.quit()

爬取快代理：

'''
爬取快代理：
    1.访问快代理页面
    2.通过re模块解析并提取所有代理
    3.通过ip测试网站对爬取的代理进行测试
    4.若test_ip函数抛出异常代表代理作废，否则代理有效
    5.利用有效的代理进行代理测试

<tr>
                    <td data-title="IP">124.205.143.212</td>
                    <td data-title="PORT">40585</td>
                    <td data-title="匿名度">高匿名</td>
                    <td data-title="类型">HTTP</td>
                    <td data-title="位置">北京市北京市  鹏博士宽带</td>
                    <td data-title="响应速度">2秒</td>
                    <td data-title="最后验证时间">2019-06-17 16:30:54</td>
                </tr>
re:
    <tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>
'''

'''
页面链接
第一页：
    https://www.kuaidaili.com/free/
第二页：
    https://www.kuaidaili.com/free/inha/2/

'''

import requests
import re
import time

# Browser-like User-Agent header sent with every request made below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    """Fetch *url* after a one-second pause and return the response object.

    The request carries the module-level ``headers`` dict.
    """
    time.sleep(1)
    return requests.get(url, headers=headers)


def parse_index(text):
    """Yield an ``"ip:port"`` string for every proxy row found in *text*.

    A row is a ``<tr>`` that contains a ``data-title="IP"`` cell followed by
    a ``data-title="PORT"`` cell; the two captured cell values are joined
    with a colon.
    """
    row_pattern = re.compile(
        '<tr>.*?<td data-title="IP">(.*?)</td>.*?<td data-title="PORT">(.*?)</td>',
        re.S,
    )
    for row in row_pattern.finditer(text):
        yield ':'.join(row.groups())

def test_ip(ip2):
    """Check whether the proxy *ip2* can reach the IP test site.

    Args:
        ip2: Proxy address as an "ip:port" string.

    Returns:
        *ip2* when the test request sent through the proxy returns HTTP 200;
        otherwise ``None`` (non-200 status, or the request raised).
    """
    print('测试ip: %s' % ip2)
    try:
        proxies = {'https': ip2}

        # Site used to test the proxy.
        ip_url1 = 'https://www.ipip.net/'

        # Access the test site through the proxy; a 200 status means the
        # proxy under test is working.
        response2 = requests.get(ip_url1, headers=headers, proxies=proxies, timeout=1)

        if response2.status_code == 200:
            # Return the argument itself. The original returned the global
            # name `ip`, which only worked when the caller happened to hold
            # the same value in a module-level variable of that name.
            return ip2

    # A dead or unreachable proxy ends up here: report it and fall through.
    except Exception as e:
        print(e)

    return None

# 使用代理爬取nba
def spider_nba(good_ip1):
    """Request the NBA China home page through the proxy *good_ip1*.

    Prints the status code of the response and then its text; nothing is
    returned.
    """
    page = requests.get(
        'https://china.nba.com/',
        headers=headers,
        proxies={'https': good_ip1},
    )

    print(page.status_code)
    print(page.text)


if __name__ == '__main__':
    # URL template of the paginated proxy listing; {} takes the page number.
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    # Walk listing pages 1 through 2904.
    for line in range(1, 2905):
        ip_url = base_url.format(line)
        response = get_index(ip_url)
        # parse_index is a generator that yields "ip:port" strings.
        ip_list = parse_index(response.text)
        for ip in ip_list:
            # Only a proxy that passes test_ip is used for the real crawl.
            good_ip = test_ip(ip)
            if good_ip:
                spider_nba(good_ip)

自动登录抽屉新热榜：

from selenium import webdriver

import time

# Locator strategies for find_element()/find_elements(); the
# find_element(s)_by_* helpers used originally were removed in Selenium 4,
# while the By-based calls are available in both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Obtain the driver object.
driver = webdriver.Chrome()

try:

    # Log in to dig.chouti.com automatically.
    # Send a GET request.
    driver.get('https://dig.chouti.com/')

    # Implicit wait: element lookups retry for up to 10 seconds.
    driver.implicitly_wait(10)

    # Find the login button that opens the login dialog and click it.
    send_tag = driver.find_element(By.ID, 'login_btn')
    send_tag.click()

    # Find the phone-number input box and fill it in.
    username = driver.find_element(By.CLASS_NAME, 'login-phone')
    username.send_keys('***********')

    time.sleep(1)

    # Find the password input box and fill it in.
    password = driver.find_element(By.CLASS_NAME, 'pwd-password-input')
    password.send_keys('**************')

    time.sleep(1)

    # Several links carry the text '登录'; the second match is the one clicked
    # to submit the login.
    login = driver.find_elements(By.LINK_TEXT, '登录')
    login[1].click()

    time.sleep(10)

finally:
    # quit() ends the whole WebDriver session; close() only closes the
    # current window.
    driver.quit()

原文地址：https://www.cnblogs.com/raotao/p/11041585.html