Crawling the National Bureau of Statistics 2019 administrative division data with Python and storing it in MSSQL

Date: 2020-05-30
This article shows how to crawl the National Bureau of Statistics 2019 administrative division (province / city / county / town / village) data with Python and store it in an MSSQL database, covering the full script, the table definition, and a few points to watch out for.

Install Python 3.7: at the time of writing, the pymssql package could not be installed on newer Python versions.
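Before starting a long crawl, it can help to confirm that pymssql is importable and can reach the SQL Server instance. A minimal smoke test is sketched below; the connection parameters are the same assumptions the script uses (host 127.0.0.1, user sa, password 123, database areadb), so adjust them to your own setup.

# Minimal smoke test (assumed connection parameters; adjust to your environment)
import sys
import pymssql

print(sys.version)  # this walkthrough assumes Python 3.7

conn = pymssql.connect(host='127.0.0.1', user='sa', password='123', database='areadb')
cursor = conn.cursor()
cursor.execute('SELECT 1')
print(cursor.fetchone())  # expected: (1,)
conn.close()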

#!/usr/bin/python
# -*- coding: UTF-8 -*-
#   Purpose: fetch province / city / county / town / village data
#   Version: v1.1
import os

import pymssql
import requests
import lxml.etree as etree
class chinese_city():
    # Initialization: XPath row selectors for each administrative level
    # (1 = province, 2 = city, 3 = county, 4 = town, 5 = village)
    def __init__(self):
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]'
        }
 
    # Download a page and return its decoded HTML text (None on request failure)
    def crawl_page(self, url):
        try:
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }
            html = requests.get(url, headers=headers, timeout=1000)
            html.encoding = 'gbk'  # the stats.gov.cn pages are GBK-encoded
            print(str(html.status_code) + ' ' + url)
            text = html.text
            return text
        except requests.exceptions.RequestException:
            print('request failed or timed out: ' + url)
            return None
    
    # Parse a child page for the given level (trid) and return a list of records
    def parse(self, trid, pid, url):
        if url.strip() == '':
            return None
        else:
            html = self.crawl_page(url)
            tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
            if trid == 3:
                nodes = tree.xpath(self.trdic.get(trid))
                if len(nodes) == 0:
                    # Some county-level pages have no countytr rows and list towns directly
                    nodes = tree.xpath(self.trdic.get(4))
                    print('page lists towns directly: ' + url)
            else:
                nodes = tree.xpath(self.trdic.get(trid))
            path = os.path.basename(url)
            base_url = url.replace(path, '')
            values = []
            for node in nodes:
                # td[1] holds the code and, usually, a link to the child page;
                # rows without a link (no further subdivisions) are skipped
                nexturl = node.xpath('./td[1]/a/@href')
                if len(nexturl) > 0:
                    code = node.xpath('./td[1]/a/text()')
                    if len(code) == 0:
                        code = node.xpath('./td[1]/text()')
                    name = node.xpath('./td[2]/a/text()')
                    if len(name) == 0:
                        name = node.xpath('./td[2]/text()')
                    value = {}
                    value['url'] = base_url + "".join(nexturl)
                    value['code'] = "".join(code)
                    value['name'] = "".join(name)
                    value['pcode'] = pid
                    value['grade'] = trid
                    value['cat'] = 0
                    values.append(value)
            return values
 
    # Parse a village/community page (level 5): td[1] = code,
    # td[2] = urban-rural classification code, td[3] = name
    def parseVillager(self, trid, pid, url):
        if url.strip() == '':
            return None
        else:
            html = self.crawl_page(url)
            tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
            nodes = tree.xpath(self.trdic.get(trid))
            values = []
            for node in nodes:
                code = node.xpath('./td[1]/text()')
                cate = node.xpath('./td[2]/text()')
                name = node.xpath('./td[3]/text()')
                value = {}
                value['code'] = "".join(code)
                value['name'] = "".join(name)
                value['pcode'] = pid
                value['grade'] = trid
                # cate is a list returned by xpath; join and convert it so it matches the int cat column
                value['cat'] = int("".join(cate)) if cate else 0
                values.append(value)
            return values
 
    # Batch-insert the crawled records into the fetch_area table
    def insert_to_db(self, records):
        conn = pymssql.connect(
            host=r'127.0.0.1',
            user=r'sa',
            password='123',
            database='areadb'
        )
        cursor = conn.cursor()
        try:
            param = []
            for p in records:
                param.append((p.get("code"), p.get("grade"), p.get("pcode"), p.get("name").strip(), p.get("cat")))
            # Column order: code, grade, pcode, name, cat (matches the fetch_area definition below)
            sql = 'INSERT INTO fetch_area VALUES(%s,%d,%s,%s,%d)'
            cursor.executemany(sql, param)
            conn.commit()
        except Exception as e:
            conn.rollback()
            print(e)
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
 
    # Top-level driver: starts from Guizhou province (52.html, parent code 520000000000)
    # and walks city -> county -> town -> village, inserting each level as it goes
    def parseChineseCity(self):
        citys = self.parse(2, '520000000000', 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html')
        if citys is not None:
            self.insert_to_db(citys)
            for city in citys:
                countys = self.parse(3, city['code'], city['url'])
                if countys is not None:
                    self.insert_to_db(countys)
                    for county in countys:
                        towns = self.parse(4, county['code'], county['url'])
                        if towns is not None:
                            self.insert_to_db(towns)
                            for town in towns:
                                villagers = self.parseVillager(5, town['code'], town['url'])
                                if villagers is not None:
                                    self.insert_to_db(villagers)

if __name__ == '__main__':
    crawler = chinese_city()
    crawler.parseChineseCity()
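The driver above starts directly from Guizhou (52.html). The trdic dictionary already contains a provincetr selector (level 1), so if you wanted every province rather than a single one, something along the following lines could enumerate them from the 2019 index page. This is a rough sketch based on an assumption about the index page layout, not part of the original script, and parse_provinces is a hypothetical helper.

# Hypothetical helper: list all provinces from the 2019 index page,
# so the city/county/town/village walk can be repeated per province.
def parse_provinces(crawler):
    base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    html = crawler.crawl_page(base + 'index.html')
    tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    provinces = []
    for a in tree.xpath('//tr[@class="provincetr"]//td/a'):
        href = "".join(a.xpath('./@href'))               # e.g. '52.html'
        name = "".join(a.xpath('./text()'))
        code = href.replace('.html', '').ljust(12, '0')  # '52' -> '520000000000'
        provinces.append({'url': base + href, 'code': code, 'name': name})
    return provinces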

Create the database table:

CREATE TABLE [dbo].[fetch_area](
    [code] [varchar](16) NULL,
    [grade] [int] NULL,
    [pcode] [varchar](16) NULL,
    [name] [nvarchar](64) NULL,
    [cat] [int] NULL
)
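After a crawl finishes, a quick sanity check is to count the rows stored at each level. A small sketch using pymssql, with the same assumed connection parameters as above:

# Hypothetical check: row counts per administrative level (grade 2..5)
import pymssql

conn = pymssql.connect(host='127.0.0.1', user='sa', password='123', database='areadb')
cursor = conn.cursor()
cursor.execute('SELECT grade, COUNT(*) FROM fetch_area GROUP BY grade ORDER BY grade')
for grade, cnt in cursor.fetchall():
    print(grade, cnt)
conn.close()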

Original article: https://www.cnblogs.com/hllive/p/12992858.html