python爬虫爬小说,来不及解释了。

时间:2019-09-26
本文章向大家介绍python爬虫爬小说,来不及解释了。,主要包括python爬虫爬小说,来不及解释了。使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

网上没有最新章节的 txt 下载,只好自己写脚本抓下来看;代码可以根据目标网页标签结构的不同自行修改。我还没入门,写得不够智能,慢慢研究吧。

需要先安装 bs4:pip install BeautifulSoup4

# -*- coding: utf-8 -*-
# NOTE: the original "#coding utf-8" is not a valid PEP 263 encoding
# declaration (it lacks the required "coding:" / "coding=" form).
import urllib.request
from bs4 import BeautifulSoup
import time
import re
 7 def get_html(url):
 8     page = urllib.request.urlopen(url)
 9     html = page.read()
10 #    print(bytes.decode(html))
11     return html
12 
# The (deliberately disabled) snippet below shows how the hard-coded p3
# chapter list was originally produced: it collects every <a href> from
# the book's table-of-contents page.
'''
page='https://www.xuehong.cc/book/36273/'
p1 = BeautifulSoup(get_html(page).decode('utf-8'), 'html.parser')
p2=[]
for p in p1.find_all('a',):
    p2.append(p['href'])
print(p2)
'''
21 
# Hard-coded list of relative chapter URLs (one entry per chapter page),
# pre-scraped from the table of contents; joined with the site base URL
# by the download loop below.
p3=['/book/36273/31737154.html', '/book/36273/31737155.html', '/book/36273/31737156.html', '/book/36273/31737157.html', '/book/36273/31737158.html', '/book/36273/31737159.html', '/book/36273/31737160.html', '/book/36273/31737161.html', '/book/36273/31737162.html', '/book/36273/31737163.html', '/book/36273/31737164.html', '/book/36273/31737165.html', '/book/36273/31737166.html', '/book/36273/31737167.html', '/book/36273/31737168.html', '/book/36273/31737169.html', '/book/36273/31737170.html', '/book/36273/31863549.html', '/book/36273/32060318.html', '/book/36273/32060319.html', '/book/36273/32060320.html', '/book/36273/32157836.html', '/book/36273/32675620.html', '/book/36273/32693741.html', '/book/36273/32705629.html', '/book/36273/32720993.html', '/book/36273/32720995.html', '/book/36273/32751825.html', '/book/36273/32969531.html', '/book/36273/32969532.html', '/book/36273/32969533.html', '/book/36273/32969534.html', '/book/36273/32969535.html', '/book/36273/32969536.html', '/book/36273/32969537.html', '/book/36273/32969538.html', '/book/36273/32969539.html', '/book/36273/32969540.html', '/book/36273/32969541.html', '/book/36273/33178998.html', '/book/36273/33179002.html', '/book/36273/33179005.html', '/book/36273/33179008.html', '/book/36273/33415818.html', '/book/36273/33434196.html', '/book/36273/35213931.html', '/book/36273/35213932.html', '/book/36273/35213933.html', '/book/36273/35213934.html', '/book/36273/35213935.html', '/book/36273/35213936.html', '/book/36273/35213937.html', '/book/36273/35213938.html', '/book/36273/35213939.html', '/book/36273/35213940.html', '/book/36273/35213941.html', '/book/36273/35213942.html', '/book/36273/35213943.html', '/book/36273/35262823.html', '/book/36273/35318036.html', '/book/36273/35318037.html', '/book/36273/35362277.html', '/book/36273/35390213.html', '/book/36273/35397646.html', '/book/36273/35398640.html', '/book/36273/35410795.html', '/book/36273/35418366.html', '/book/36273/35454975.html', 
'/book/36273/35455295.html', '/book/36273/35456452.html', '/book/36273/35458123.html', '/book/36273/35488936.html', '/book/36273/35488937.html', '/book/36273/35495130.html', '/book/36273/35498675.html', '/book/36273/35503958.html', '/book/36273/35510595.html', '/book/36273/35510628.html', '/book/36273/35517338.html', '/book/36273/35522119.html', '/book/36273/35529846.html', '/book/36273/35536421.html', '/book/36273/35590637.html', '/book/36273/35590638.html', '/book/36273/35601859.html', '/book/36273/35657475.html', '/book/36273/35662329.html', '/book/36273/35675638.html', '/book/36273/35693345.html', '/book/36273/35693346.html', '/book/36273/35735160.html', '/book/36273/35740864.html', '/book/36273/35750550.html', '/book/36273/35754379.html', '/book/36273/35786823.html']
23 
# Download every chapter and append title + body text to F:\book.txt.
#
# Bug fixes versus the original:
#   * `i=i+1` sat at column 0, OUTSIDE the for-loop body, so `i` stayed 0
#     and p3[0] (the first chapter) was downloaded over and over; the loop
#     now iterates p3 directly and the manual index is gone.
#   * the output file was re-opened in append mode for every matched tag;
#     it is now opened once for the whole crawl.
#   * removed the unused `url` variable.
with open('F:\\book.txt', 'a', encoding='utf-8') as f:
    for num in p3:
        urlNum = 'https://www.xuehong.cc' + num
        soup = BeautifulSoup(get_html(urlNum).decode('utf-8'), 'html.parser')

        # Chapter title (the page's <h1> tag).
        for j in soup.find_all('h1'):
            print(j)
            f.write(str(j) + "\n\n")

        # Chapter body: drop indentation spaces and turn the site's
        # <br/><br/> paragraph breaks into blank lines.
        for k in soup.find_all('div', id='content'):
            k2 = str(k).replace("    ", "").replace("<br/><br/>", "\n\n")
            print(k2)
            f.write(k2 + "\n\n\n\n")

原文地址:https://www.cnblogs.com/rood/p/11592835.html