selenium二或者三层链接爬取

  时间:2019-01-11
本文向大家介绍如何用selenium爬取二层或者三层链接,主要包括使用实例、应用技巧、基本知识点总结和需要注意的事项,具有一定的参考价值,需要的朋友可以参考一下。

今天的案例以猫眼电影的影院列表页为例:
爬取其中各个地区(行政区)下电影院的所有信息
url:https://maoyan.com/cinemas

import requests
from lxml import etree
from selenium import webdriver
from urllib import request,parse
import time


# Headless browser used for the pages that need clicking through filters.
# The executable path is machine-specific and must point at a local PhantomJS.
dirver = webdriver.PhantomJS(executable_path=r'D:\ysc桌面\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# Alternative driver: dirver = webdriver.Chrome()

# Proxy servers for the plain `requests` calls.
# requests looks proxies up by the lower-case URL scheme, so the keys must be
# "http"/"https"; upper-case keys are silently ignored and no proxy is used.
# NOTE(review): these are hard-coded sample addresses and may no longer be
# reachable; replace them with working proxies if requests start failing.
proxy = {
    "http": "http://113.3.152.88:8118",
    "https": "http://219.234.5.128:3128",
}

# Request headers that present the script as a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/70.0.3538.110 Safari/537.36",
}

# Entry page that lists cinemas together with the filter rows.
base_url = "https://maoyan.com/cinemas"

# Fetch the cinema listing page once with plain requests and keep a copy of
# the raw HTML on disk as maoyan.html.
# The timeout prevents an unresponsive server or proxy from hanging the script
# forever (requests waits indefinitely when no timeout is given).
response = requests.get(url=base_url, headers=headers, proxies=proxy, timeout=10)
html = response.content.decode("utf-8")
with open("maoyan.html", "w", encoding="utf-8") as fb:
    fb.write(html)
    
def _options_by_name(filter_row):
    """Return a dict for one filter row: last xpath hit -> first xpath hit.

    For every option <li> except the first one, the link's text nodes and
    its data-id attribute are selected together; the last item is used as
    the key and the first item as the value.  Later code uses the keys as
    link texts to click and the values as URL query parameters, so
    presumably the key is the display name and the value the numeric id.
    """
    option_parts = (
        option.xpath('./a/text()|./a/@data-id')
        for option in filter_row.xpath('./ul/li')[1:]
    )
    return {parts[-1]: parts[0] for parts in option_parts}


# Parse the downloaded page into an element tree.
html_tree = etree.HTML(html)
# One <li> per filter row (brand, district, special hall type).
li_tree = html_tree.xpath('//ul[@class="tags-lines"]/li')

# Brand options come from the first filter row.
brandId_dict = _options_by_name(li_tree[0])

# Special-hall options come from the last filter row.
hallType_dict = _options_by_name(li_tree[-1])

# District options come from the second filter row.
districtId_dict = _options_by_name(li_tree[1])



# print(brandId_dict)
# print(hallType_dict)
# print(districtId_dict)
#选中影院 厅 最后才选行政区

def _save_cinema_info(page_url):
    """Fetch one filtered cinema-list page and append its cinemas to the output file.

    Each ``div.cinema-info`` element is handled on its own so that the first
    text (taken from the <a>/<p> children) and the last text written on one
    line belong to the same cinema.  Prints a notice when the page lists no
    cinemas.
    """
    print(page_url)
    # Timeout so a dead proxy/server cannot stall the whole crawl.
    page = requests.get(url=page_url, headers=headers, proxies=proxy, timeout=10)
    info_tree = etree.HTML(page.content.decode("utf-8"))

    records = []
    for cinema in info_tree.xpath('//div[@class="cinema-info"]'):
        texts = cinema.xpath('./a/text()|./p/text()')
        if texts:
            records.append("影院名称: " + texts[0] + " " + texts[-1])

    if not records:
        print("暂时没有该地区的影院信息")
        return

    with open("maoyanyingyuan.txt", "a", encoding="utf-8") as fb:
        for record in records:
            print(record)
            fb.write(record + "\n")


# Walk every brand x hall-type combination; the district is chosen last by
# clicking its link in the browser.
try:
    for brandId in brandId_dict.values():
        for hallType in hallType_dict.values():
            data = {
                'brandId': brandId,
                'hallType': hallType,
            }
            new_url = base_url + "?" + parse.urlencode(data)
            dirver.get(new_url)

            # False until the first district that shows a floating sub-filter
            # has been drilled into by clicking.
            # NOTE(review): presumably that first entry is the one whose
            # sub-filter has a further "station-tags" level (three layers),
            # while the remaining districts only have two layers -- confirm
            # against the live page.
            drilled_first = False
            for district_name in districtId_dict:
                # Click the district link by its visible text, then wait for
                # the page to update.
                print(district_name)
                dirver.find_element_by_link_text(district_name).click()
                time.sleep(1)

                district_source = dirver.page_source
                if "float-filter" not in district_source:
                    # No first-level sub-filter for this district.
                    continue

                filter_tree = etree.HTML(district_source)
                # First-level sub-areas (link text and href); the first entry
                # of each list is skipped.
                oneplace = filter_tree.xpath('//div[@class="float-filter"]/ul[@class="tags"]/li/a/text()')[1:]
                one_url = filter_tree.xpath('//div[@class="float-filter"]/ul[@class="tags"]/li/a/@href')[1:]

                if drilled_first:
                    # Two-layer case: fetch each first-level link directly.
                    for oneurl in one_url:
                        _save_cinema_info(base_url + oneurl)
                    continue
                drilled_first = True

                # Three-layer case: click each first-level entry to reveal
                # its second-level links.
                for one in oneplace:
                    print(one)
                    dirver.find_element_by_link_text(one).click()
                    # Wait BEFORE inspecting the page so the check sees the
                    # content loaded by the click, not the previous page.
                    time.sleep(1)

                    area_source = dirver.page_source
                    if "station-tags" not in area_source:
                        continue

                    two_tree = etree.HTML(area_source)
                    href_url = two_tree.xpath('//div[@class="float-filter"]/ul[@class="tags station-tags"]/li/a/@href')[1:]
                    for two_url in href_url:
                        _save_cinema_info(base_url + two_url)
finally:
    # Always shut the browser down so no PhantomJS process is left running.
    dirver.quit()


上一页 下一页

原文地址:http://www.manongjc.com/article/42965.html