[Python爬虫]头条图集爬取

时间:2019-09-24
本文章向大家介绍[Python爬虫]头条图集爬取,主要包括[Python爬虫]头条图集爬取使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。
import requests
from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool

def get_page(offset):
    """Fetch one page of Toutiao '街拍' (street-snap) search results.

    Args:
        offset: pagination offset passed straight to the API (multiples of 20).

    Returns:
        The parsed JSON response as a dict, or None when the request fails
        or the server answers with a non-200 status.  (The original code
        returned the *string* 'No response' on a connection error, which made
        downstream `.get()` calls blow up with AttributeError.)
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'cookie': 'tt_webid=6724223385113069069; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6724223385113069069; csrftoken=9e9d6c3be6aabc313dce0c4f1a116047; sso_uid_tt=27219b1c2d00b8a6021444d85d83dc38; toutiao_sso_user=7562e682c093b193cce298f25dd396ba; login_flag=8391d980bfc8a8908e7c6c80596a016c; __tea_sdk__ssid=undefined; _ga=GA1.2.931504366.1565662966; sid_guard=7562e682c093b193cce298f25dd396ba%7C1565663040%7C5126263%7CFri%2C+11-Oct-2019+10%3A21%3A43+GMT; uid_tt=27219b1c2d00b8a6021444d85d83dc38; sid_tt=7562e682c093b193cce298f25dd396ba; sessionid=7562e682c093b193cce298f25dd396ba; uuid="w:443dcb551552404fbfde212f1054c781"; __tasessionId=i5j7qcydf1569292028372; s_v_web_id=1e7e3b52d7bc46698bb26079c99fd83d',
        'pragma': 'no-cache',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'x-requested-with': 'XMLHttpRequest'
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        # timeout so a stalled connection cannot hang the whole worker process
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        return None
    # Non-200 status: make the implicit None explicit.
    return None

def get_image(json):
    """Yield one {'image': url, 'title': title} dict per image in a result page.

    Args:
        json: the dict returned by ``get_page`` — or None / any non-dict
            value when that request failed.  The original implementation
            called ``.get()`` unconditionally and raised AttributeError on
            the failure value; we guard for it instead.

    Yields:
        dicts with keys 'image' (image URL) and 'title' (gallery title),
        skipping items that lack a usable title or have no images.
    """
    # get_page returns None (historically even a plain string) on failure.
    if not isinstance(json, dict):
        print('Not parse')
        return
    data = json.get('data')
    if not data:
        print('Not parse')
        return
    for item in data:
        title = item.get('title')
        images = item.get('image_list')
        # Require a truthy title: a present-but-None title would later crash
        # save_image when it builds a directory name from it.
        if not title or not images:
            continue
        for image in images:
            yield {
                'image': image.get('url'),
                'title': title
            }
def save_image(item):
    """Download one image into a directory named after its gallery title.

    The file name is the MD5 of the image bytes, so re-running the crawler
    never stores the same image twice.

    Args:
        item: dict with keys 'image' (URL) and 'title' (directory name),
            as produced by ``get_image``.
    """
    # Fall back to a fixed name: os.path.exists(None)/mkdir(None) raised
    # TypeError in the original when the title was missing.
    title = item.get('title') or 'untitled'
    # makedirs(exist_ok=True) replaces the racy exists()-then-mkdir() pair.
    os.makedirs(title, exist_ok=True)
    image_url = item.get('image')
    if not image_url:
        return
    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(title, md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image')

def main(offset):
    """Crawl a single result page at *offset* and persist every image in it."""
    page = get_page(offset)
    for entry in get_image(page):
        print(entry)
        save_image(entry)
# Inclusive range of result pages to crawl; each page holds 20 items.
GROUP_START = 1
GROUP_END = 1
if __name__ == '__main__':
    # Fan the per-page crawl out over a worker pool (one offset per task).
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()

原文地址:https://www.cnblogs.com/lightmonster/p/11577909.html