Using a requests Session: a Python scraper for Golden Pin Design Award data

Date: 2022-07-22
This article walks through using a requests Session to build a Python scraper for Golden Pin Design Award data, covering a working example, practical tips, the key points involved, and things to watch out for.

While using Python to scrape the Golden Pin Design Award winning entries, I found that no matter how I changed the request headers, the JSON data returned never changed, even though the JSON-loaded data did change when I opened the page manually. It then occurred to me to use a session, and the data came out straight away!

Website:

http://www.goldenpin.org.tw

The Golden Pin Design Award, run by the Taiwan Design Center, has a 35-year history in Taiwan and is its longest-running, most authoritative, and best-known professional design competition. In 2014 it opened entry to the global Chinese-speaking market for the first time (adding mainland China, Hong Kong, Macau, Singapore, and Malaysia), with thousands of companies taking part and tens of thousands of entries submitted over the years. Media have called it "the top design award of the global Chinese-speaking market" and "the Golden Horse Awards of the design world".

Target URL:

http://www.goldenpin.org.tw/金點設計獎/?y=2019

Captured request data:

The AJAX request that loads the paginated data:

The JSON response that carries the HTML page data:

The request headers are identical no matter which year is visited; only the page number (pagination) parameter differs.

Changing the headers, adding the Referer, or filling in the complete header set still did not return different data.

So I switched to a requests Session: first visit the year's award page with the session, then use the same session to fetch the HTML data from the JSON endpoint.
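
In short, the fix is a two-step pattern on a single Session object. A minimal sketch, with the AJAX payload abbreviated (the full field list appears in Step 4 below):

import requests

s = requests.Session()
# Step 1: open the year's listing page so the server gives the session the cookies it expects
s.get("http://www.goldenpin.org.tw/金點設計獎/?y=2019", timeout=10)
# Step 2: reuse the same session for the paginated AJAX endpoint that returns the JSON/HTML
r = s.post("http://www.goldenpin.org.tw/ft-admin/admin-ajax.php",
           data={"action": "presscore_template_ajax", "targetPage": 1},  # abbreviated payload
           timeout=10)
print(r.status_code)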

How to use a requests Session

While working with the Python requests library I ran into a question: how do you set cookies on a session so that they apply to every request? A major feature of the Session object in requests is that it manages cookies for you automatically: when you log in to a page, it picks up the Set-Cookie headers in the response and keeps sending those cookies on all subsequent requests.

There are two ways to add cookies:

One is to first write the cookies as a dict, then convert the dict to a CookieJar:

s = requests.Session()  # start a Session
cookie_dict = {'49BAC005-7D5B-4231-8CEA-16939BEACD67': 'cktest001',   # cookie values taken from the Chrome browser
               'JSESSIONID': 'F4FFF69B8XXXXXXC8DCB4C061C0',
               'JSESSIONIDSSO': '9D49C76FD6XXXXXF294242B44A'
               }
s.cookies = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)  # convert the dict to a CookieJar and attach it to the Session

# Note: this method replaces any cookies the Session already holds

The other is to append cookies:

s = requests.Session()  # start a Session
jar = requests.cookies.RequestsCookieJar()   # create a CookieJar object
jar.set('49BAC005-7D5B-4231-8CEA-1XXXXBEACD67', 'cktXXXX001')  # add cookie values to the CookieJar
jar.set('JSESSIONID', 'F4FFF69B8CXXXX80F0C8DCB4C061C0')
jar.set('JSESSIONIDSSO', '9D49C7XXXX448FDF5B0F294242B44A')
s.cookies.update(jar)  # merge the CookieJar into the Session's cookies
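
Either way, once the cookies are attached to the Session, every later request made through it sends them automatically. A quick way to verify this (httpbin.org is used here only because it echoes back whatever cookies it receives; the cookie value is a placeholder):

import requests

s = requests.Session()
s.cookies.set('JSESSIONID', 'F4FFF69B8XXXXXXC8DCB4C061C0')  # placeholder value
print(s.cookies.get_dict())               # cookies currently held by the session
r = s.get('http://httpbin.org/cookies')   # httpbin echoes the cookies it received
print(r.text)                             # confirms JSESSIONID was sent automatically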

Now let's implement the Golden Pin Design Award data scraper in Python.

Step 1: build the URLs to crawl:

def get_url():
    urls=[]
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory=urllib.parse.quote(category)  # percent-encode the Chinese category name for the URL path
        for year in years:
            url=f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)

    print(len(urls))
    return urls

Step 2: random request headers

def ua():
    ua=UserAgent()
    headers={"User-Agent":ua.random,}
    return headers

Step 3: visit the year's landing page with requests to obtain the session

def get_session(furl):
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))   # retry connection failures up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)
    except requests.exceptions.RequestException as e:
        print(f'Failed to fetch the link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to fetch the link, error: {e}\n')
    time.sleep(2)
    return s

Step 4: fetch the JSON data

def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',        # post ID of the listing page, taken from the captured AJAX request
        'paged': '1',
        'targetPage': i,          # page number to load
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',    # nonce copied from the captured request; it may expire over time
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']   # the JSON payload carries an HTML fragment
        h3s = re.findall(r'<h3 class="entry-title"><a  target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for href, title in h3s:          # the regex groups are (href, title)
            print((title, href))
            get_content(href)

        time.sleep(2)
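
If the regex ever stops matching (it depends on the exact attribute order and the double space after <a in the returned markup), the same links could also be extracted from the fragment with lxml, which the script already uses for the detail pages. A rough alternative sketch; extract_links is a hypothetical helper, not part of the original script:

from lxml import etree

def extract_links(html_fragment):
    # Parse the HTML fragment carried in the JSON 'html' field
    tree = etree.HTML(html_fragment)
    # Each entry link sits inside <h3 class="entry-title"><a href="..." title="...">
    return [(a.get('href'), a.get('title'))
            for a in tree.xpath('//h3[@class="entry-title"]/a')]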

Fetching the data for each award-winning project:

def get_content(url):
    # url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\\|/<>:*?\'"]', '_', h1)  # strip characters that are illegal in file names
    print(h1)  # the project title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
    down(h1, imgs, path, texts)

The download module:

def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ...')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
            print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save the detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save the detail text, error: {e}\n')
    i = 1
    for img_url in imgs:
        suffix = os.path.splitext(img_url)[1]  # file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading image {img_name} ...')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
                print(f'>>> Finished downloading {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download the image, error: {e}\n')
        time.sleep(1)
        i += 1

Finally, create a main() function to run the scraper:

def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Crawling {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):  # adjust the page range as needed; as written only page 5 is requested
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch the page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i}, - failed to fetch the page data, error: {e}\n')
        except Exception as e:
            print(f'Failed to fetch the page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch the page data, error: {e}\n')
        time.sleep(1)
    time.sleep(5)

Sample run output:

Full source code:

# -*- coding: utf-8 -*-
# Golden Pin Design Award scraper
# 2020-01-03 by WeChat: huguo00289

import requests
from fake_useragent import UserAgent
import json,re,os,time
from lxml import etree
from requests.adapters import HTTPAdapter  # for mounting retry adapters onto the session
import urllib.parse

def ua():
    ua=UserAgent()
    headers={"User-Agent":ua.random,}
    return headers

def get_session(furl):
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))   # retry connection failures up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)
    except requests.exceptions.RequestException as e:
        print(f'Failed to fetch the link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to fetch the link, error: {e}\n')
    time.sleep(2)
    return s


def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',        # post ID of the listing page, taken from the captured AJAX request
        'paged': '1',
        'targetPage': i,          # page number to load
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',    # nonce copied from the captured request; it may expire over time
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']   # the JSON payload carries an HTML fragment
        h3s = re.findall(r'<h3 class="entry-title"><a  target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for href, title in h3s:          # the regex groups are (href, title)
            print((title, href))
            get_content(href)

        time.sleep(2)


def get_content(url):
    # url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\\|/<>:*?\'"]', '_', h1)  # strip characters that are illegal in file names
    print(h1)  # the project title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')
    down(h1, imgs, path, texts)

def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ...')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
            print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save the detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save the detail text, error: {e}\n')
    i = 1
    for img_url in imgs:
        suffix = os.path.splitext(img_url)[1]  # file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading image {img_name} ...')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
                print(f'>>> Finished downloading {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download the image, error: {e}\n')
        time.sleep(1)
        i += 1


def get_url():
    urls=[]
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory=urllib.parse.quote(category)  # percent-encode the Chinese category name for the URL path
        for year in years:
            url=f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)

    print(len(urls))
    return urls


def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Crawling {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):  # adjust the page range as needed; as written only page 5 is requested
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch the page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i}, - failed to fetch the page data, error: {e}\n')
        except Exception as e:
            print(f'Failed to fetch the page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch the page data, error: {e}\n')
        time.sleep(1)
    time.sleep(5)


if __name__=="__main__":
    main()