Python爬虫股票评论,snowNLP简单分析股民用户情绪

时间:2022-05-04
本文章向大家介绍Python爬虫股票评论,snowNLP简单分析股民用户情绪,主要内容包括一、背景、二、数据来源、三、数据获取、四、前端数据展示、基本概念、基础应用、原理机制和需要注意的事项等,并结合实例形式分析了其使用技巧,希望通过本文能帮助到大家理解应用这部分内容。

一、背景

股民是网络用户的一大群体,他们的网络情绪在一定程度上反映了该股票的情况,也反映了股市市场的波动情况。作为一只时间充裕的研究僧,我课余时间准备写个小代码get一下股民的评论数据,分析一下用户情绪的走势。代码还会修改,因为结果不准确,哈哈!

二、数据来源

本次项目不用于商用,数据来源于东方财富网,由于物理条件,我只获取了一只股票的部分评论,没有爬取官方的帖子,都是获取的散户的评论。

三、数据获取

Python是个好工具,这次我使用了selenium和PhantomJS组合进行爬取网页数据,当然还是要分析网页的dom结构拿到自己需要的数据。

爬虫部分:

from selenium import webdriver  
import time  
import json  
import re    
# from HTMLParser import HTMLParser   
from myNLP import *  
# from lxml import html  
# import requests  
class Crawler:
    """Scrape posts and replies for one stock from guba.eastmoney.com.

    An instance targets one page of the stock's post list. ``getData`` opens
    every post linked from that page in a PhantomJS browser and scores the
    text of the post and of each reply with ``myNLP``.
    """

    # Class-level defaults kept for backward compatibility. ``__init__`` gives
    # every instance its own ``url`` and ``newurl`` so that crawlers created
    # for different pages do not share (and re-crawl) each other's links.
    url = ''
    newurl = set()
    headers = {}
    cookies = {}

    # Patterns used by filterHtmlTag, compiled once for the whole class.
    _RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    _RE_SCRIPT = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    _RE_STYLE = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    _RE_BR = re.compile(r'<br\s*?/?>')
    _RE_TAG = re.compile(r'</?\w+[^>]*>')
    _RE_COMMENT = re.compile(r'<!--[^>]*-->')
    _RE_BLANK_LINES = re.compile(r'\n+')

    def __init__(self, stocknum, page):
        """Build the list-page URL and start a PhantomJS browser.

        stocknum -- stock code used in the URL, e.g. '600000'
        page     -- page number of the post list
        """
        self.url = ('http://guba.eastmoney.com/list,' + str(stocknum)
                    + ',5_' + str(page) + '.html')
        self.newurl = set()
        # Copy the capabilities: DesiredCapabilities.PHANTOMJS is a dict shared
        # by the whole process and must not be modified in place.
        cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        cap["phantomjs.page.settings.resourceTimeout"] = 1000
        self.driver = webdriver.PhantomJS(desired_capabilities=cap)

    def close(self):
        """Shut down the PhantomJS browser owned by this crawler."""
        self.driver.quit()

    def crawAllHtml(self, url):
        """Navigate the browser to ``url`` and pause for two seconds."""
        self.driver.get(url)
        # Fixed pause after navigation; presumably gives the page time to
        # finish rendering before elements are queried.
        time.sleep(2)

    def getNewUrl(self, url):
        """Remember ``url`` so that getData visits it later."""
        self.newurl.add(url)

    def filterHtmlTag(self, htmlStr):
        """Return ``htmlStr`` with HTML markup removed.

        CDATA sections, <script> and <style> elements, tags and comments are
        dropped, <br> becomes a newline, and runs of newlines are collapsed
        into a single newline.
        """
        self.htmlStr = htmlStr
        s = self._RE_CDATA.sub('', htmlStr)
        s = self._RE_SCRIPT.sub('', s)
        s = self._RE_STYLE.sub('', s)
        s = self._RE_BR.sub('\n', s)
        # Tags go before comments: a comment that contains tags only matches
        # the comment pattern once the inner '>' characters are gone.
        s = self._RE_TAG.sub('', s)
        s = self._RE_COMMENT.sub('', s)
        return self._RE_BLANK_LINES.sub('\n', s)

    def _appendScored(self, comments, timeEls, postEls, ageEls):
        """Append one scored record to ``comments`` if all parts were found.

        Each argument after ``comments`` is the element list returned by a
        find_elements_* call; only the first element of each list is used.
        """
        if not (postEls and timeEls and ageEls):
            return
        text = self.filterHtmlTag(postEls[0].text)
        if not text:
            return
        comments.append({
            'time': timeEls[0].text,
            'content': myNLP(text).prob,
            'age': ageEls[0].text,
        })

    def getData(self):
        """Crawl the list page and every linked post.

        Returns a JSON string: a list of objects with the keys 'time' (text of
        the timestamp element), 'content' (sentiment score from myNLP) and
        'age' (text of the author's account-age element).
        """
        comments = []
        self.crawAllHtml(self.url)
        postlist = self.driver.find_elements_by_xpath('//*[@id="articlelistnew"]/div')
        for post in postlist:
            spans = post.find_elements_by_tag_name('span')
            if len(spans) < 3:
                # Rows without a third <span> carry no post link.
                continue
            links = spans[2].find_elements_by_tag_name('a')
            if links:
                href = links[0].get_attribute('href')
                if href:
                    self.getNewUrl(href)
        for url in self.newurl:
            self.crawAllHtml(url)
            # The post itself.
            self._appendScored(
                comments,
                self.driver.find_elements_by_xpath('//*[@id="zwconttb"]/div[2]'),
                self.driver.find_elements_by_xpath('//*[@id="zwconbody"]/div'),
                self.driver.find_elements_by_xpath('//*[@id="zwconttbn"]/span/span[2]'),
            )
            # The replies below the post.
            for comment in self.driver.find_elements_by_xpath('//*[@id="zwlist"]/div'):
                self._appendScored(
                    comments,
                    comment.find_elements_by_xpath('./div[3]/div[1]/div[2]'),
                    comment.find_elements_by_xpath('./div[3]/div[1]/div[3]'),
                    comment.find_elements_by_xpath('./div[3]/div[1]/div[1]/span[2]/span[2]'),
                )
        return json.dumps(comments)
存储部分:

这部分其实可以用数据库来做,但是由于只是试水,就简单用json文件来存部分数据
import io  
class File:
    """Write one text payload to the file ``<src><name>.<type>`` as UTF-8."""

    # Class-level defaults; every instance overwrites them in __init__.
    name = ''
    type = ''
    src = ''
    file = ''

    def __init__(self, name, type, src):
        """Open ``src + name + '.' + type`` for writing, truncating it.

        name -- file name without extension
        type -- file extension without the dot, e.g. 'json'
        src  -- directory prefix, including the trailing separator
        """
        import os  # local import: the file's import header only provides io

        self.name = name
        self.type = type
        self.src = src
        filename = self.src + self.name + '.' + self.type
        # Create the target directory on first use instead of failing in open.
        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        self.file = io.open(filename, 'w+', encoding='utf-8')

    def inputData(self, data):
        """Write ``data`` and close the file.

        ``data`` may be text or UTF-8 encoded bytes; bytes are decoded first
        because the file is opened in text mode.
        """
        try:
            if isinstance(data, bytes):
                data = data.decode('utf-8')
            self.file.write(data)
        finally:
            # Close even when the write fails so the handle is not leaked.
            self.file.close()

    def closeFile(self):
        """Close the file; calling this on an already closed file is harmless."""
        self.file.close()

测试用的local服务器:

这里只是为了要用浏览器浏览数据图,由于需要读取数据,js没有权限操作本地的文件,只能利用一个简单的服务器来弄了

  1. import SimpleHTTPServer
  2. import SocketServer;
  3. PORT = 8000
  4. Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
  5. httpd = SocketServer.TCPServer(("", PORT), Handler);
  6. httpd.serve_forever()

NLP部分:snowNLP这个包还是用来评价买卖东西的评论比较准确

我不是专门研究自然语言的,所以直接使用他人的算法库。snowNLP支持用自己的语料重新训练,有空我再训练一个针对股票评论的模型。

  1. #!/usr/bin/env python
  2. # -*- coding: UTF-8 -*-
  3. from snownlp import SnowNLP
  4. class myNLP:
  5. prob = 0.5
  6. def _init_(self, text):
  7. self.prob = SnowNLP(text).sentiments

主调度:

  1. # -*- coding: UTF-8 -*-
  2. '''''
  3. Created on 2017年5月17日
  4. @author: luhaiya
  5. @id: 2016110274
  6. @description:
  7. '''
  8. #http://data.eastmoney.com/stockcomment/ 所有股票的列表信息
  9. #http://guba.eastmoney.com/list,600000,5.html 某只股票股民的帖子页面
  10. #http://quote.eastmoney.com/sh600000.html?stype=stock 查询某只股票
  11. from Crawler import *
  12. from File import *
  13. import sys
  14. default_encoding = 'utf-8'
  15. if sys.getdefaultencoding() != default_encoding:
  16. reload(sys)
  17. sys.setdefaultencoding(default_encoding)
  18. def main():
  19. stocknum = str(600000)
  20. total = dict()
  21. for i in range(1,10):
  22. page = str(i)
  23. crawler = Crawler(stocknum, page)
  24. datalist = crawler.getData()
  25. comments = File(stocknum+'_page_'+page,'json','./data/')
  26. comments.inputData(datalist)
  27. data = open('./data/'+stocknum+'_page_'+page+'.json','r').read()
  28. jsonData = json.loads(data)
  29. for detail in jsonData:
  30. num = '1' if '年' not in detail['age'].encode('utf-8') else detail['age'].encode('utf-8').replace('年','')
  31. num = float(num)
  32. date = detail['time'][4:14].encode('utf-8')
  33. total[date] = total[date] if date in total.keys() else {'num':0, 'content':0}
  34. total[date]['num'] = total[date]['num'] + num if total[date]['num'] else num
  35. total[date]['content'] = total[date]['content'] + detail['content']*num if total[date]['content'] else detail['content']*num
  36. total = json.dumps(total)
  37. totalfile = File(stocknum,'json','./data/')
  38. totalfile.inputData(total)
  39. if __name__ == "__main__":
  40. main()

四、前端数据展示

使用百度的echarts。用户的情绪是使用当天所有评论的情绪值的加权平均,加权系数与用户的股龄正相关。

  1. <!DOCTYPE html>
  2. <html>
  3. <head>
  4. <meta charset="UTF-8">
  5. <title>分析图表</title>
  6. <style>
  7. body{texr-align:center;}
  8. #mainContainer{width:100%;}
  9. #fileContainer{width:100%; text-align:center;}
  10. #picContainer{width: 800px;height:600px;margin:0 auto;}
  11. </style>
  12. </head>
  13. <body>
  14. <div id = 'mainContainer'>
  15. <div id = 'fileContainer'>这里是文件夹列表</div>
  16. <div id = 'picContainer'></div>
  17. </div>
  18. <script src="http://apps.bdimg.com/libs/jquery/2.1.1/jquery.min.js"></script>
  19. <script src = "./echarts.js"></script>
  20. <script>
  21. main();
  22. function main(){
  23. var stocknum = 600000;
  24. getDate(stocknum);
  25. }
  26. function getDate(stocknum){
  27. var src = "./data/"+stocknum+".json";
  28. $.getJSON(src, function (res){
  29. var date = [];
  30. for(var key in res){
  31. key = key.replace('-','/').replace('-','/');
  32. date.push(key);
  33. }
  34. date.sort();
  35. data = [];
  36. for (var i = 0; i < date.length; i++) {
  37. dat = date[i].replace('/','-').replace('/','-');
  38. data.push(res[dat]['content']/res[dat]['num']);
  39. }
  40. drawPic(date,data);
  41. })
  42. }
  43. function drawPic(date, data){
  44. //initialize and setting options
  45. var myChart = echarts.init(document.getElementById('picContainer'));
  46. option = {
  47. tooltip: {
  48. trigger: 'axis',
  49. position: function (pt) {
  50. return [pt[0], '10%'];
  51. }
  52. },
  53. title: {
  54. left: 'center',
  55. text: '股票情绪走向图',
  56. },
  57. toolbox: {
  58. feature: {
  59. dataZoom: {
  60. yAxisIndex: 'none'
  61. },
  62. restore: {},
  63. saveAsImage: {}
  64. }
  65. },
  66. xAxis: {
  67. type: 'category',
  68. boundaryGap: false,
  69. data: date
  70. },
  71. yAxis: {
  72. type: 'value',
  73. boundaryGap: [0, '100%']
  74. },
  75. dataZoom: [{
  76. type: 'inside',
  77. start: 0,
  78. end: 10
  79. }, {
  80. start: 0,
  81. end: 10,
  82. handleIcon: 'M10.7,11.9v-1.3H9.3v1.3c-4.9,0.3-8.8,4.4-8.8,9.4c0,5,3.9,9.1,8.8,9.4v1.3h1.3v-1.3c4.9-0.3,8.8-4.4,8.8-9.4C19.5,16.3,15.6,12.2,10.7,11.9z M13.3,24.4H6.7V23h6.6V24.4z M13.3,19.6H6.7v-1.4h6.6V19.6z',
  83. handleSize: '80%',
  84. handleStyle: {
  85. color: '#fff',
  86. shadowBlur: 3,
  87. shadowColor: 'rgba(0, 0, 0, 0.6)',
  88. shadowOffsetX: 2,
  89. shadowOffsetY: 2
  90. }
  91. }],
  92. series: [
  93. {
  94. name:'stocknum',
  95. type:'line',
  96. smooth:true,
  97. symbol: 'none',
  98. sampling: 'average',
  99. itemStyle: {
  100. normal: {
  101. color: 'rgb(255, 70, 131)'
  102. }
  103. },
  104. areaStyle: {
  105. normal: {
  106. color: new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
  107. offset: 0,
  108. color: 'rgb(255, 158, 68)'
  109. }, {
  110. offset: 1,
  111. color: 'rgb(255, 70, 131)'
  112. }])
  113. }
  114. },
  115. data: data
  116. }
  117. ]
  118. };
  119. //draw pic
  120. myChart.setOption(option);
  121. }
  122. </script>
  123. </body>
  124. </html>

图1

图2

图1是我分析用户情绪画出的时间推进图,理论上小于0.5表示消极情绪,大于0.5表示积极情绪。图2是实际股价的走势。

via: http://blog.csdn.net/SeaIsGod/article/details/72859071