Python crawler: parsing library HTML files and saving the data to Excel

Date: 2019-01-18

This post introduces a Python script that parses crawled library HTML files and saves the results to Excel, covering a working example, the basic points involved, and a few things to watch out for.

Environment

  • beautifulsoup4
  • lxml (the parser handed to BeautifulSoup below)
  • xlsxwriter
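All three are on PyPI and can be installed with pip (pip install beautifulsoup4 lxml xlsxwriter).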

Following up on the previous post's crawler, here is the code that parses the saved pages:

from bs4 import BeautifulSoup
import os
import copy
from xlsxwriter.workbook import Workbook

def save_into_excel(persons_list, save_name):
    # Columns: student no., name, major/class, title, author, call number, borrow date
    headings = ['学号', '姓名', '专业班级', '书名', '作者', '索引号', '借阅时间']
    data = []
    # Flatten: one spreadsheet row per borrowed book
    for person in persons_list:
        for book_info in person['books_list']:
            data.append([person['no'], person['name'], save_name,
                         book_info['title'], book_info['author'],
                         book_info['index'], book_info['time']])
    workbook = Workbook(save_name + '.xlsx')
    worksheet = workbook.add_worksheet()

    worksheet.write_row('A1', headings)
    # Row/column indices are zero-based, so data rows start right below 'A1'
    for row_num, row_data in enumerate(data):
        worksheet.write_row(row_num + 1, 0, row_data)

    workbook.close()
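# save_into_excel expects persons_list shaped like the following
# (a hypothetical sample, only to illustrate the structure that
# getInfo builds below):
#
#   persons_list = [{
#       'no': 20160001,
#       'name': '张三',
#       'books_list': [{'title': 'Python Cookbook', 'author': 'Beazley',
#                       'index': 'TP312', 'time': '2018-12-01'}],
#   }]
#   save_into_excel(persons_list, '计算机1601')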

def getInfo(major):
    persons_list = []
    # Template dicts, reused across the loop; deep copies are appended
    person = {
        'no': 0,
        'major': major,
        'name': '',
        'books_list': []
    }
    book_info = {
        'title': '',
        'author': '',
        'index': '',
        'time': ''
    }
    path = 'E:\\craw_lib'
    for file_name in os.listdir(path):
        file_path = os.path.join(path, file_name)
        # File names are expected to hold the student number between the
        # first two characters and the first '_', e.g. 'xx20160001_1.html'
        sep_pos = file_name.index('_')
        no = int(file_name[2:sep_pos])

        if no != person['no']:
            # A new user's files start here; store the finished previous one
            if person['no'] != 0:
                persons_list.append(copy.deepcopy(person))
            person['no'] = no
            person['books_list'].clear()

        # print("no : %s" % no)

        file_handle = open(file_path, 'r', encoding='utf-8')
        soup = BeautifulSoup(file_handle, 'lxml')
        file_handle.close()

        # Extract the reader's name: the third stripped string in the
        # navbar div carries it after a 4-character prefix
        name_div = soup.find('div', 'navbar_info_zh')
        strings = [str(s) for s in name_div.stripped_strings]
        name = strings[2][4:]
        person['name'] = name
        print(name)

        table = soup.find('table', id='contentTable')
        all_tr = table.find_all('tr')
        for tr in all_tr:
            # Build the book list from record rows (rows without <td> cells are skipped)
            tds = tr.find_all('td')
            if len(tds) != 0:
                borrow_flag = True
                state_tag = tds[0]
                for ss in state_tag.stripped_strings:
                    state = str(ss)
                    if state == '借书':  # '借书' marks a borrow record
                        title_tag = tds[2]
                        author_tag = tds[3]
                        index_tag = tds[4]
                        time_tag = tds[7]
                        # Convert to str: assigning the NavigableString directly
                        # made the later deepcopy exceed the recursion limit
                        book_info['title'] = str(title_tag.string)
                        book_info['author'] = str(author_tag.string)
                        book_info['index'] = str(index_tag.string)
                        book_info['time'] = str(time_tag.string)
                    else:
                        borrow_flag = False
                if borrow_flag:
                    person['books_list'].append(copy.deepcopy(book_info))
    # The loop never flushes the last user, so append it here
    persons_list.append(copy.deepcopy(person))
    save_into_excel(persons_list, major)

if __name__ == '__main__':
    major = input('please input a major:\n')
    getInfo(major)
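
For context, the table parsing above assumes record rows shaped roughly like the fragment below. This markup is a hypothetical reconstruction: the column positions and the '借书' (borrow) label come from the indices used in the code, while the header names and sample values are invented for illustration.

from bs4 import BeautifulSoup

html = """
<table id="contentTable">
  <tr><th>状态</th><th>条码</th><th>题名</th><th>作者</th>
      <th>索书号</th><th>馆藏地</th><th>应还日期</th><th>借阅时间</th></tr>
  <tr><td>借书</td><td>0001</td><td>Python Cookbook</td><td>Beazley</td>
      <td>TP312</td><td>主馆</td><td>2019-02-18</td><td>2019-01-18</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
for tr in soup.find('table', id='contentTable').find_all('tr'):
    tds = tr.find_all('td')
    if tds and str(tds[0].string) == '借书':
        print(str(tds[2].string), str(tds[3].string),
              str(tds[4].string), str(tds[7].string))

Running this prints the title, author, call number and borrow date, i.e. the four fields getInfo stores per borrow record.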

For more on saving data to .xlsx, see:

https://www.cnblogs.com/yxpblog/p/5249866.html