爬取帖子 - 码农教程

百度贴吧：爬取帖子的标题、发布时间和链接
 1 import threading
 2 import requests
 3 import re
 4 import os
 5 
 6 #   百度贴吧        爬取帖子的标题、发布时间和链接
 7 
# Name of the Tieba forum to search
word = '文字控吧'
# Number of listing pages to crawl
num = 5
12 
13 
14 # 获取详情页url和标题
def parse(word, pn):
    """Fetch one listing page of a Tieba forum and extract its post links.

    Args:
        word: Forum name, sent as the ``kw`` query parameter.
        pn: Offset sent as the ``pn`` query parameter.

    Returns:
        A list of ``(path, title)`` tuples, one per matching anchor tag,
        where ``path`` has the form ``/p/<digits>``. The list is empty when
        the page contains no matching anchors.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Announce before the blocking call so the message matches what is
    # actually happening.
    print('正在请求中...')
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(
        'https://tieba.baidu.com/f',
        params={'kw': word, 'pn': pn},
        timeout=10,
    )
    html = response.content.decode()
    # re.S lets a title that spans several lines still match.
    return re.findall(
        r'<a rel="noreferrer" href="(/p/\d+)" title="(.*?)" target=',
        html,
        re.S,
    )
20 
21 
22 #   发起请求
def parse_detail(article_urls):
    """Fetch each post's detail page, then print and save a summary of it.

    For every entry the detail page is downloaded and two values are pulled
    out with regexes: the first ``"userName"`` value (treated as the thread
    starter) and the timestamp that follows the "1楼" (first floor) marker.
    When both are found and non-empty, a formatted record is printed and
    appended to ``<word>.txt`` in the current working directory, where
    ``word`` is the module-level forum name.

    Args:
        article_urls: Iterable of ``(path, title)`` pairs, with ``path`` of
            the form ``/p/<digits>``.

    Raises:
        requests.RequestException: If a detail-page request fails or exceeds
            the timeout.
    """
    for article_url in article_urls:
        path, title = article_url[0], article_url[1]
        link = 'https://tieba.baidu.com' + path
        # A timeout keeps one stalled connection from blocking this worker
        # forever.
        page = requests.get(link, timeout=10).text

        author_match = re.search(r'"userName":"(.*?)"', page, re.S)
        time_match = re.search(
            r'<span class="tail-info">1楼</span><span class="tail-info">(.*?)</span>',
            page,
            re.S,
        )
        # Either pattern may be absent on a page with a different layout;
        # skip the post instead of failing on a missing match.
        if author_match is None or time_match is None:
            print('未匹配到数据，这个正则不符合这个贴吧，需要重写正则')
            continue

        author = author_match.group(1)
        create_time = time_match.group(1)
        if not (author and create_time):
            continue

        content = '楼主：{}\n标题：{}\n发布时间：{}\n链接：{}\n'.format(
            author, title, create_time, link)
        print(content)
        # Explicit UTF-8 so the write does not depend on the platform's
        # default encoding.
        with open(word + '.txt', 'a', encoding='utf-8') as f:
            f.write('{}\n'.format(content))
41 
42 
# Create the output directory if needed and switch into it, so the result
# file that parse_detail opens by relative name is written there.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('百度贴吧', exist_ok=True)
os.chdir('百度贴吧')

t_list = []
for pn in range(0, num * 50, 50):
    # Fetch the listing page first to get the detail-page paths and titles
    article_urls = parse(word, pn)
    # One worker thread per listing page requests every detail page on it
    t = threading.Thread(target=parse_detail, args=(article_urls,))
    t_list.append(t)

# Start all threads
for t in t_list:
    t.start()
# Wait for all threads to finish
for t in t_list:
    t.join()
百度贴吧
原文地址：https://www.cnblogs.com/jiyu-hlzy/p/11804167.html