爬取新闻网信息

时间:2022-07-24
本文章向大家介绍爬取新闻网信息,主要内容包括其使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

爬虫案例

学习了HttpClient和Jsoup,就掌握了如何抓取数据和如何解析数据。但是HttpClient对动态数据解析支持不是很友好,所以又学习了HtmlUnit,用于解析动态数据。

主要目的是HtmlUnit和Jsoup的学习。

需求分析

爬取凤凰网、网易、搜狐、今日头条。

除了今日头条,其他页面的数据都是静态的,很好爬取。

由于技术有限,对今日头条的详情页面爬取还是有点技术上的问题,待解决。

数据库表分析

根据需求分析,我们创建的表如下

-- auto-generated definition
-- Table backing the News entity: one row per stored news article.
-- Every column except the primary key is nullable.
CREATE TABLE news
(
  -- surrogate key, assigned by the database
  id          INT AUTO_INCREMENT
    PRIMARY KEY,
  -- article title (at most 128 characters)
  title       VARCHAR(128) NULL,
  -- article URL (at most 256 characters)
  url         VARCHAR(256) NULL,
  -- image URL or path for the article (at most 256 characters)
  image       VARCHAR(256) NULL,
  -- mapped to News.createDate; presumably the time the row was stored -- confirm
  create_date DATETIME     NULL,
  -- mapped to News.newsDate; presumably the article's publication time -- confirm
  news_date   DATETIME     NULL,
  -- article body; mapped as LONGVARCHAR by the MyBatis mapper
  content     TEXT         NULL,
  -- short label of the originating site (at most 32 characters)
  source      VARCHAR(32)  NULL
);

项目示例

使用 Spring Boot 进行开发

添加依赖

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the news-crawler Spring Boot application. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Spring Boot parent POM: supplies versions for the spring-boot-* dependencies declared without one below -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.3.2.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.ray</groupId>
    <artifactId>newscrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>news-crawler</name>
    <description>新闻爬虫</description>

    <!-- Source/target level used by the Spring Boot parent's compiler configuration -->
    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- To run on Jetty, spring-boot-starter-tomcat must be excluded from spring-boot-starter-web, because Spring Boot uses Tomcat by default -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-starter-tomcat</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Jetty suits long-lived-connection applications, e.g. chat-style long connections -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jetty</artifactId>
        </dependency>

        <!-- Test support, test scope only -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- Developer tooling (spring-boot-devtools) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
        </dependency>
        <!-- Redis starter; NOTE(review): the Redis host/port properties in the sample configuration are commented out - confirm Redis is actually used -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>

        <!-- MyBatis integration and the PageHelper pagination plugin -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>com.github.pagehelper</groupId>
            <artifactId>pagehelper-spring-boot-starter</artifactId>
            <version>1.2.5</version>
        </dependency>
        <!-- Older 5.1.x MySQL driver, kept disabled; the 8.0.11 driver below is the active one -->
        <!--<dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>-->
        <!-- MySQL JDBC driver; provides com.mysql.cj.jdbc.Driver referenced by the datasource configuration -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.11</version>
        </dependency>

        <!-- Alibaba Druid (plain library, not the Spring Boot starter) -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.29</version>
        </dependency>

        <!-- Swagger 2 (springfox) library; its transitive guava dependency is excluded -->
        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger2</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>com.google.guava</groupId>
                    <artifactId>guava</artifactId>
                </exclusion>
            </exclusions>
            <version>2.9.2</version>
        </dependency>
        <!-- Swagger UI: the browsable front end for the generated API documentation -->
        <dependency>
            <groupId>io.springfox</groupId>
            <artifactId>springfox-swagger-ui</artifactId>
            <version>2.9.2</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.42.0</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot Maven plugin -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>

配置文件

# JDBC connection to the local MySQL schema "ray0804"; the URL pins the server time zone to UTC
spring.datasource.url=jdbc:mysql://localhost:3306/ray0804?useUnicode=true&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=UTC
spring.datasource.username=root
spring.datasource.password=root
# Driver class shipped with mysql-connector-java 8.x
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver

# MyBatis configuration: mapper XML files are loaded from classpath:mapper/
mybatis.mapper-locations=classpath:mapper/*.xml
# PageHelper pagination configuration
pagehelper.auto-dialect=true
pagehelper.reasonable=true
pagehelper.support-methods-arguments=true

# Redis configuration (currently disabled)
#spring.redis.host=192.168.80.222
#spring.redis.port=6379

# Entry-page URLs, one per news source (NetEase, Toutiao, Sohu, ifeng)
news.neteasy.url=http://news.163.com/rank/
news.toutiao.url=https://www.toutiao.com/ch/news_hot/
news.sohu.url=http://news.sohu.com/
news.ifeng.url=http://www.ifeng.com/

# Root log level
logging.level.root=info

代码实现

/**
 * Entity for one row of the {@code news} table.
 *
 * <p>Besides the plain column accessors it offers two derived, read-only
 * properties ({@link #getSummary()} and {@link #getLargeImage()}) that are
 * computed from {@code content} through {@code NewsUtils}.
 */
public class News {

    private Integer id;

    private String title;

    private String url;

    private String image;

    private Date createDate;

    private Date newsDate;

    private String source;

    private String content;

    /** @return the primary key, or {@code null} if none has been set */
    public Integer getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the article title */
    public String getTitle() {
        return title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article URL */
    public String getUrl() {
        return url;
    }

    /** @param url the article URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Returns the stored image, substituting a placeholder when none is set.
     *
     * @return the {@code image} field when it is not blank, otherwise
     *         {@code "/img/news.png"}
     */
    public String getImage() {
        if (StringUtils.isNotBlank(image)) {
            return image;
        }
        return "/img/news.png";
    }

    /** @param image the image URL or path */
    public void setImage(String image) {
        this.image = image;
    }

    /** @return the value mapped to the {@code create_date} column */
    public Date getCreateDate() {
        return createDate;
    }

    /** @param createDate the value mapped to the {@code create_date} column */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }

    /** @return the value mapped to the {@code news_date} column */
    public Date getNewsDate() {
        return newsDate;
    }

    /** @param newsDate the value mapped to the {@code news_date} column */
    public void setNewsDate(Date newsDate) {
        this.newsDate = newsDate;
    }

    /** @return the label of the originating site */
    public String getSource() {
        return source;
    }

    /** @param source the label of the originating site */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the article body */
    public String getContent() {
        return content;
    }

    /** @param content the article body */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Builds a short teaser from the article body.
     *
     * <p>The body is first passed through {@code NewsUtils.getTextFromContent}
     * (per the original author's note this strips all markup), then cut to at
     * most 140 characters. Shorter texts are kept whole. {@code "..."} is
     * appended in every case, truncated or not.
     *
     * @return the teaser text followed by {@code "..."}
     */
    public String getSummary() {
        String plainText = NewsUtils.getTextFromContent(content);
        // Never cut past the end of a short article.
        int cut = Math.min(plainText.length(), 140);
        return plainText.substring(0, cut) + "...";
    }

    /**
     * Picks the image to show in large layouts.
     *
     * @return the image that {@code NewsUtils.getImageFromContent} finds in
     *         the body when it is not blank; otherwise the raw {@code image}
     *         field (not the placeholder used by {@link #getImage()}), which
     *         may be {@code null}
     */
    public String getLargeImage() {
        String fromContent = NewsUtils.getImageFromContent(content);
        if (StringUtils.isNotBlank(fromContent)) {
            return fromContent;
        }
        return image;
    }

    /** Dumps the simple class name, the hash code and every field. */
    @Override
    public String toString() {
        return getClass().getSimpleName()
                + " ["
                + "Hash = " + hashCode()
                + ", id=" + id
                + ", title=" + title
                + ", url=" + url
                + ", image=" + image
                + ", createDate=" + createDate
                + ", newsDate=" + newsDate
                + ", source=" + source
                + ", content=" + content
                + "]";
    }
}
/**
 * Query-by-example holder for the {@code news} table.
 *
 * <p>An instance carries an optional ORDER BY clause, a DISTINCT flag and a
 * list of {@link Criteria} groups. The mapper's {@code Example_Where_Clause}
 * fragment joins the groups with {@code or} and the {@link Criterion}s inside
 * one group with {@code and}.
 *
 * <p>The condition text of every criterion is interpolated into the SQL
 * verbatim ({@code ${criterion.condition}}); only the values are bound as
 * parameters. Conditions must therefore never contain untrusted input.
 */
public class NewsExample {

    protected String orderByClause;

    protected boolean distinct;

    protected List<Criteria> oredCriteria;

    /** Creates an example with no criteria groups, no ordering and DISTINCT off. */
    public NewsExample() {
        oredCriteria = new ArrayList<>();
    }

    /** @param orderByClause the ORDER BY text to store, or {@code null} for none */
    public void setOrderByClause(String orderByClause) {
        this.orderByClause = orderByClause;
    }

    /** @return the stored ORDER BY text, possibly {@code null} */
    public String getOrderByClause() {
        return orderByClause;
    }

    /** @param distinct the DISTINCT flag to store */
    public void setDistinct(boolean distinct) {
        this.distinct = distinct;
    }

    /** @return the stored DISTINCT flag */
    public boolean isDistinct() {
        return distinct;
    }

    /** @return the live (not copied) list of criteria groups */
    public List<Criteria> getOredCriteria() {
        return oredCriteria;
    }

    /**
     * Appends an existing criteria group.
     *
     * @param criteria the group to add
     */
    public void or(Criteria criteria) {
        oredCriteria.add(criteria);
    }

    /**
     * Creates a new, empty criteria group and appends it.
     *
     * @return the newly added group
     */
    public Criteria or() {
        Criteria group = createCriteriaInternal();
        oredCriteria.add(group);
        return group;
    }

    /**
     * Creates a new, empty criteria group. It is registered only when no group
     * exists yet; otherwise the caller has to attach it with
     * {@link #or(Criteria)}.
     *
     * @return the new group
     */
    public Criteria createCriteria() {
        Criteria group = createCriteriaInternal();
        if (oredCriteria.isEmpty()) {
            oredCriteria.add(group);
        }
        return group;
    }

    /** Factory hook used by {@link #or()} and {@link #createCriteria()}. */
    protected Criteria createCriteriaInternal() {
        return new Criteria();
    }

    /** Removes all criteria groups and resets ordering and the DISTINCT flag. */
    public void clear() {
        distinct = false;
        orderByClause = null;
        oredCriteria.clear();
    }

    /**
     * Base class holding the criterion list and the per-column builder
     * methods. Every {@code andXxx} method appends one {@link Criterion} and
     * returns {@code this} cast to {@link Criteria} so calls can be chained.
     */
    protected abstract static class GeneratedCriteria {
        protected List<Criterion> criteria;

        protected GeneratedCriteria() {
            criteria = new ArrayList<>();
        }

        /** @return {@code true} when at least one criterion has been added */
        public boolean isValid() {
            return !criteria.isEmpty();
        }

        /** @return the live criterion list (same list as {@link #getCriteria()}) */
        public List<Criterion> getAllCriteria() {
            return criteria;
        }

        /** @return the live criterion list */
        public List<Criterion> getCriteria() {
            return criteria;
        }

        /**
         * Adds a value-less criterion such as {@code "id is null"}.
         *
         * @throws RuntimeException if {@code condition} is {@code null}
         */
        protected void addCriterion(String condition) {
            if (condition == null) {
                throw new RuntimeException("Value for condition cannot be null");
            }
            criteria.add(new Criterion(condition));
        }

        /**
         * Adds a criterion with one value (a single value or a list).
         *
         * @param property property name, used only in the error message
         * @throws RuntimeException if {@code value} is {@code null}
         */
        protected void addCriterion(String condition, Object value, String property) {
            if (value == null) {
                throw new RuntimeException("Value for " + property + " cannot be null");
            }
            criteria.add(new Criterion(condition, value));
        }

        /**
         * Adds a two-value (between) criterion.
         *
         * @param property property name, used only in the error message
         * @throws RuntimeException if either bound is {@code null}
         */
        protected void addCriterion(String condition, Object value1, Object value2, String property) {
            if (value1 == null || value2 == null) {
                throw new RuntimeException("Between values for " + property + " cannot be null");
            }
            criteria.add(new Criterion(condition, value1, value2));
        }

        // Chaining helpers: delegate to the matching addCriterion overload, then return this.

        private Criteria append(String condition) {
            addCriterion(condition);
            return (Criteria) this;
        }

        private Criteria append(String condition, Object value, String property) {
            addCriterion(condition, value, property);
            return (Criteria) this;
        }

        private Criteria append(String condition, Object value1, Object value2, String property) {
            addCriterion(condition, value1, value2, property);
            return (Criteria) this;
        }

        // ----- id -----

        public Criteria andIdIsNull() {
            return append("id is null");
        }

        public Criteria andIdIsNotNull() {
            return append("id is not null");
        }

        public Criteria andIdEqualTo(Integer value) {
            return append("id =", value, "id");
        }

        public Criteria andIdNotEqualTo(Integer value) {
            return append("id <>", value, "id");
        }

        public Criteria andIdGreaterThan(Integer value) {
            return append("id >", value, "id");
        }

        public Criteria andIdGreaterThanOrEqualTo(Integer value) {
            return append("id >=", value, "id");
        }

        public Criteria andIdLessThan(Integer value) {
            return append("id <", value, "id");
        }

        public Criteria andIdLessThanOrEqualTo(Integer value) {
            return append("id <=", value, "id");
        }

        public Criteria andIdIn(List<Integer> values) {
            return append("id in", values, "id");
        }

        public Criteria andIdNotIn(List<Integer> values) {
            return append("id not in", values, "id");
        }

        public Criteria andIdBetween(Integer value1, Integer value2) {
            return append("id between", value1, value2, "id");
        }

        public Criteria andIdNotBetween(Integer value1, Integer value2) {
            return append("id not between", value1, value2, "id");
        }

        // ----- title -----

        public Criteria andTitleIsNull() {
            return append("title is null");
        }

        public Criteria andTitleIsNotNull() {
            return append("title is not null");
        }

        public Criteria andTitleEqualTo(String value) {
            return append("title =", value, "title");
        }

        public Criteria andTitleNotEqualTo(String value) {
            return append("title <>", value, "title");
        }

        public Criteria andTitleGreaterThan(String value) {
            return append("title >", value, "title");
        }

        public Criteria andTitleGreaterThanOrEqualTo(String value) {
            return append("title >=", value, "title");
        }

        public Criteria andTitleLessThan(String value) {
            return append("title <", value, "title");
        }

        public Criteria andTitleLessThanOrEqualTo(String value) {
            return append("title <=", value, "title");
        }

        public Criteria andTitleLike(String value) {
            return append("title like", value, "title");
        }

        public Criteria andTitleNotLike(String value) {
            return append("title not like", value, "title");
        }

        public Criteria andTitleIn(List<String> values) {
            return append("title in", values, "title");
        }

        public Criteria andTitleNotIn(List<String> values) {
            return append("title not in", values, "title");
        }

        public Criteria andTitleBetween(String value1, String value2) {
            return append("title between", value1, value2, "title");
        }

        public Criteria andTitleNotBetween(String value1, String value2) {
            return append("title not between", value1, value2, "title");
        }

        // ----- url -----

        public Criteria andUrlIsNull() {
            return append("url is null");
        }

        public Criteria andUrlIsNotNull() {
            return append("url is not null");
        }

        public Criteria andUrlEqualTo(String value) {
            return append("url =", value, "url");
        }

        public Criteria andUrlNotEqualTo(String value) {
            return append("url <>", value, "url");
        }

        public Criteria andUrlGreaterThan(String value) {
            return append("url >", value, "url");
        }

        public Criteria andUrlGreaterThanOrEqualTo(String value) {
            return append("url >=", value, "url");
        }

        public Criteria andUrlLessThan(String value) {
            return append("url <", value, "url");
        }

        public Criteria andUrlLessThanOrEqualTo(String value) {
            return append("url <=", value, "url");
        }

        public Criteria andUrlLike(String value) {
            return append("url like", value, "url");
        }

        public Criteria andUrlNotLike(String value) {
            return append("url not like", value, "url");
        }

        public Criteria andUrlIn(List<String> values) {
            return append("url in", values, "url");
        }

        public Criteria andUrlNotIn(List<String> values) {
            return append("url not in", values, "url");
        }

        public Criteria andUrlBetween(String value1, String value2) {
            return append("url between", value1, value2, "url");
        }

        public Criteria andUrlNotBetween(String value1, String value2) {
            return append("url not between", value1, value2, "url");
        }

        // ----- image -----

        public Criteria andImageIsNull() {
            return append("image is null");
        }

        public Criteria andImageIsNotNull() {
            return append("image is not null");
        }

        public Criteria andImageEqualTo(String value) {
            return append("image =", value, "image");
        }

        public Criteria andImageNotEqualTo(String value) {
            return append("image <>", value, "image");
        }

        public Criteria andImageGreaterThan(String value) {
            return append("image >", value, "image");
        }

        public Criteria andImageGreaterThanOrEqualTo(String value) {
            return append("image >=", value, "image");
        }

        public Criteria andImageLessThan(String value) {
            return append("image <", value, "image");
        }

        public Criteria andImageLessThanOrEqualTo(String value) {
            return append("image <=", value, "image");
        }

        public Criteria andImageLike(String value) {
            return append("image like", value, "image");
        }

        public Criteria andImageNotLike(String value) {
            return append("image not like", value, "image");
        }

        public Criteria andImageIn(List<String> values) {
            return append("image in", values, "image");
        }

        public Criteria andImageNotIn(List<String> values) {
            return append("image not in", values, "image");
        }

        public Criteria andImageBetween(String value1, String value2) {
            return append("image between", value1, value2, "image");
        }

        public Criteria andImageNotBetween(String value1, String value2) {
            return append("image not between", value1, value2, "image");
        }

        // ----- create_date -----

        public Criteria andCreateDateIsNull() {
            return append("create_date is null");
        }

        public Criteria andCreateDateIsNotNull() {
            return append("create_date is not null");
        }

        public Criteria andCreateDateEqualTo(Date value) {
            return append("create_date =", value, "createDate");
        }

        public Criteria andCreateDateNotEqualTo(Date value) {
            return append("create_date <>", value, "createDate");
        }

        public Criteria andCreateDateGreaterThan(Date value) {
            return append("create_date >", value, "createDate");
        }

        public Criteria andCreateDateGreaterThanOrEqualTo(Date value) {
            return append("create_date >=", value, "createDate");
        }

        public Criteria andCreateDateLessThan(Date value) {
            return append("create_date <", value, "createDate");
        }

        public Criteria andCreateDateLessThanOrEqualTo(Date value) {
            return append("create_date <=", value, "createDate");
        }

        public Criteria andCreateDateIn(List<Date> values) {
            return append("create_date in", values, "createDate");
        }

        public Criteria andCreateDateNotIn(List<Date> values) {
            return append("create_date not in", values, "createDate");
        }

        public Criteria andCreateDateBetween(Date value1, Date value2) {
            return append("create_date between", value1, value2, "createDate");
        }

        public Criteria andCreateDateNotBetween(Date value1, Date value2) {
            return append("create_date not between", value1, value2, "createDate");
        }

        // ----- news_date -----

        public Criteria andNewsDateIsNull() {
            return append("news_date is null");
        }

        public Criteria andNewsDateIsNotNull() {
            return append("news_date is not null");
        }

        public Criteria andNewsDateEqualTo(Date value) {
            return append("news_date =", value, "newsDate");
        }

        public Criteria andNewsDateNotEqualTo(Date value) {
            return append("news_date <>", value, "newsDate");
        }

        public Criteria andNewsDateGreaterThan(Date value) {
            return append("news_date >", value, "newsDate");
        }

        public Criteria andNewsDateGreaterThanOrEqualTo(Date value) {
            return append("news_date >=", value, "newsDate");
        }

        public Criteria andNewsDateLessThan(Date value) {
            return append("news_date <", value, "newsDate");
        }

        public Criteria andNewsDateLessThanOrEqualTo(Date value) {
            return append("news_date <=", value, "newsDate");
        }

        public Criteria andNewsDateIn(List<Date> values) {
            return append("news_date in", values, "newsDate");
        }

        public Criteria andNewsDateNotIn(List<Date> values) {
            return append("news_date not in", values, "newsDate");
        }

        public Criteria andNewsDateBetween(Date value1, Date value2) {
            return append("news_date between", value1, value2, "newsDate");
        }

        public Criteria andNewsDateNotBetween(Date value1, Date value2) {
            return append("news_date not between", value1, value2, "newsDate");
        }

        // ----- source -----

        public Criteria andSourceIsNull() {
            return append("source is null");
        }

        public Criteria andSourceIsNotNull() {
            return append("source is not null");
        }

        public Criteria andSourceEqualTo(String value) {
            return append("source =", value, "source");
        }

        public Criteria andSourceNotEqualTo(String value) {
            return append("source <>", value, "source");
        }

        public Criteria andSourceGreaterThan(String value) {
            return append("source >", value, "source");
        }

        public Criteria andSourceGreaterThanOrEqualTo(String value) {
            return append("source >=", value, "source");
        }

        public Criteria andSourceLessThan(String value) {
            return append("source <", value, "source");
        }

        public Criteria andSourceLessThanOrEqualTo(String value) {
            return append("source <=", value, "source");
        }

        public Criteria andSourceLike(String value) {
            return append("source like", value, "source");
        }

        public Criteria andSourceNotLike(String value) {
            return append("source not like", value, "source");
        }

        public Criteria andSourceIn(List<String> values) {
            return append("source in", values, "source");
        }

        public Criteria andSourceNotIn(List<String> values) {
            return append("source not in", values, "source");
        }

        public Criteria andSourceBetween(String value1, String value2) {
            return append("source between", value1, value2, "source");
        }

        public Criteria andSourceNotBetween(String value1, String value2) {
            return append("source not between", value1, value2, "source");
        }
    }

    /** Concrete criteria group; adds nothing to {@link GeneratedCriteria}. */
    public static class Criteria extends GeneratedCriteria {

        protected Criteria() {
        }
    }

    /**
     * One condition of a criteria group: the SQL fragment, up to two values,
     * and a flag saying which of the four shapes (no value, single value,
     * between, list) it has. Exactly one shape flag is set per instance.
     */
    public static class Criterion {
        private String condition;

        private Object value;

        private Object secondValue;

        private boolean noValue;

        private boolean singleValue;

        private boolean betweenValue;

        private boolean listValue;

        private String typeHandler;

        /** @return the SQL fragment, e.g. {@code "id ="} */
        public String getCondition() {
            return condition;
        }

        /** @return the first (or only) value, or {@code null} for a value-less criterion */
        public Object getValue() {
            return value;
        }

        /** @return the upper bound of a between criterion, otherwise {@code null} */
        public Object getSecondValue() {
            return secondValue;
        }

        /** @return {@code true} for a value-less criterion */
        public boolean isNoValue() {
            return noValue;
        }

        /** @return {@code true} when the criterion carries one non-list value */
        public boolean isSingleValue() {
            return singleValue;
        }

        /** @return {@code true} when the criterion carries two bounds */
        public boolean isBetweenValue() {
            return betweenValue;
        }

        /** @return {@code true} when the value is a {@link List} */
        public boolean isListValue() {
            return listValue;
        }

        /** @return the type handler name, or {@code null} when none was given */
        public String getTypeHandler() {
            return typeHandler;
        }

        /** Value-less criterion. */
        protected Criterion(String condition) {
            this.condition = condition;
            this.noValue = true;
            this.typeHandler = null;
        }

        /** One-value criterion; a {@link List} value marks it as a list criterion. */
        protected Criterion(String condition, Object value, String typeHandler) {
            this.condition = condition;
            this.value = value;
            this.typeHandler = typeHandler;
            boolean isList = value instanceof List<?>;
            this.listValue = isList;
            this.singleValue = !isList;
        }

        /** One-value criterion without a type handler. */
        protected Criterion(String condition, Object value) {
            this(condition, value, null);
        }

        /** Between criterion. */
        protected Criterion(String condition, Object value, Object secondValue, String typeHandler) {
            this.condition = condition;
            this.value = value;
            this.secondValue = secondValue;
            this.typeHandler = typeHandler;
            this.betweenValue = true;
        }

        /** Between criterion without a type handler. */
        protected Criterion(String condition, Object value, Object secondValue) {
            this(condition, value, secondValue, null);
        }
    }
}
/**
 * MyBatis mapper interface for {@link News} rows.
 *
 * <p>The accompanying mapper XML declares the namespace
 * {@code com.ray.news.crawler.dao.NewsDao}, so its statements are expected to
 * be bound to the methods below by name.
 *
 * <p>NOTE(review): the SQL statements themselves are not visible from here.
 * The per-method notes are inferred from the method names and signatures;
 * confirm each against the mapper XML.
 */
@Mapper
public interface NewsDao {

    /** Presumably returns the number of rows matching {@code example}. */
    long countByExample(NewsExample example);

    /** Presumably deletes the rows matching {@code example}; the int is likely the affected-row count. */
    int deleteByExample(NewsExample example);

    /** Presumably deletes the row with the given primary key. */
    int deleteByPrimaryKey(Integer id);

    /** Presumably inserts {@code record} with all columns. */
    int insert(News record);

    /** Presumably inserts only the non-null properties of {@code record} - confirm. */
    int insertSelective(News record);

    /**
     * Presumably selects matching rows including the {@code content} column
     * (the mapper defines a {@code ResultMapWithBLOBs} that adds it).
     */
    List<News> selectByExampleWithBLOBs(NewsExample example);

    /**
     * Presumably selects matching rows without {@code content}
     * (the mapper's {@code BaseResultMap} does not map that column).
     */
    List<News> selectByExample(NewsExample example);

    /** Presumably loads the row with the given primary key. */
    News selectByPrimaryKey(Integer id);

    /**
     * Presumably updates only the non-null properties of {@code record} on the
     * rows matching {@code example}. The arguments are exposed to the mapper
     * under the names {@code record} and {@code example}.
     */
    int updateByExampleSelective(@Param("record") News record, @Param("example") NewsExample example);

    /** Presumably updates all columns, including {@code content}, on the rows matching {@code example}. */
    int updateByExampleWithBLOBs(@Param("record") News record, @Param("example") NewsExample example);

    /** Presumably updates the non-BLOB columns on the rows matching {@code example}. */
    int updateByExample(@Param("record") News record, @Param("example") NewsExample example);

    /** Presumably updates only the non-null properties of the row identified by {@code record}'s id. */
    int updateByPrimaryKeySelective(News record);

    /** Presumably updates all columns, including {@code content}, of the row identified by {@code record}'s id. */
    int updateByPrimaryKeyWithBLOBs(News record);

    /** Presumably updates the non-BLOB columns of the row identified by {@code record}'s id. */
    int updateByPrimaryKey(News record);
}
/**
 * Application-level operations on stored news items.
 */
public interface NewsService {

    /**
     * Counts the news items matching the example.
     *
     * @param example criteria to match
     * @return number of matching rows
     */
    Long countByExample(NewsExample example);

    /**
     * Returns one page of the news items matching the example.
     *
     * @param page     page number handed to the paging plugin
     * @param pageSize number of items per page
     * @param example  criteria and ordering to apply
     * @return the matching items of that page
     */
    List<News> searchNewsForPage(int page, int pageSize, NewsExample example);

    /**
     * Stores a news item.
     *
     * @param news item to store
     * @return number of rows inserted
     */
    int saveNews(News news);
}
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.ray.news.crawler.dao.NewsDao">
    <!-- Maps every column of the news table except content onto the News entity. -->
    <resultMap id="BaseResultMap" type="com.ray.news.crawler.entity.News">
        <id column="id" jdbcType="INTEGER" property="id" />
        <result column="title" jdbcType="VARCHAR" property="title" />
        <result column="url" jdbcType="VARCHAR" property="url" />
        <result column="image" jdbcType="VARCHAR" property="image" />
        <result column="create_date" jdbcType="TIMESTAMP" property="createDate" />
        <result column="news_date" jdbcType="TIMESTAMP" property="newsDate" />
        <result column="source" jdbcType="VARCHAR" property="source" />
    </resultMap>
    <!-- Adds the large content column on top of BaseResultMap. -->
    <resultMap extends="BaseResultMap" id="ResultMapWithBLOBs" type="com.ray.news.crawler.entity.News">
        <result column="content" jdbcType="LONGVARCHAR" property="content" />
    </resultMap>
    <!--
      WHERE clause built from the parameter's oredCriteria.
      Each valid criteria group becomes one parenthesised list of conditions joined with "and";
      the groups themselves are joined with "or". The condition text is inserted as raw SQL,
      while the compared values are bound as statement parameters. The four branches cover
      a condition without value, with one value, with a value pair (between) and with a list.
    -->
    <sql id="Example_Where_Clause">
        <where>
            <foreach collection="oredCriteria" item="criteria" separator="or">
                <if test="criteria.valid">
                    <trim prefix="(" prefixOverrides="and" suffix=")">
                        <foreach collection="criteria.criteria" item="criterion">
                            <choose>
                                <when test="criterion.noValue">
                                    and ${criterion.condition}
                                </when>
                                <when test="criterion.singleValue">
                                    and ${criterion.condition} #{criterion.value}
                                </when>
                                <when test="criterion.betweenValue">
                                    and ${criterion.condition} #{criterion.value} and #{criterion.secondValue}
                                </when>
                                <when test="criterion.listValue">
                                    and ${criterion.condition}
                                    <foreach close=")" collection="criterion.value" item="listItem" open="(" separator=",">
                                        #{listItem}
                                    </foreach>
                                </when>
                            </choose>
                        </foreach>
                    </trim>
                </if>
            </foreach>
        </where>
    </sql>
    <!--
      Same WHERE clause as Example_Where_Clause, but the criteria are read from the "example"
      entry of the parameter map, which is how the update-by-example statements receive them.
    -->
    <sql id="Update_By_Example_Where_Clause">
        <where>
            <foreach collection="example.oredCriteria" item="criteria" separator="or">
                <if test="criteria.valid">
                    <trim prefix="(" prefixOverrides="and" suffix=")">
                        <foreach collection="criteria.criteria" item="criterion">
                            <choose>
                                <when test="criterion.noValue">
                                    and ${criterion.condition}
                                </when>
                                <when test="criterion.singleValue">
                                    and ${criterion.condition} #{criterion.value}
                                </when>
                                <when test="criterion.betweenValue">
                                    and ${criterion.condition} #{criterion.value} and #{criterion.secondValue}
                                </when>
                                <when test="criterion.listValue">
                                    and ${criterion.condition}
                                    <foreach close=")" collection="criterion.value" item="listItem" open="(" separator=",">
                                        #{listItem}
                                    </foreach>
                                </when>
                            </choose>
                        </foreach>
                    </trim>
                </if>
            </foreach>
        </where>
    </sql>
    <!-- Column list shared by the selects: every column except content. -->
    <sql id="Base_Column_List">
    id, title, url, image, create_date, news_date, source
  </sql>
    <!-- The large text column, appended only by the selects that return content. -->
    <sql id="Blob_Column_List">
    content
  </sql>
    <!--
      Selects the rows matching the example, including the content column.
      "distinct" and the order-by clause are taken from the example; the order-by text is
      inserted as raw SQL, so it must never come from untrusted input.
    -->
    <select id="selectByExampleWithBLOBs" parameterType="com.ray.news.crawler.entity.NewsExample" resultMap="ResultMapWithBLOBs">
        select
        <if test="distinct">
            distinct
        </if>
        <include refid="Base_Column_List" />
        ,
        <include refid="Blob_Column_List" />
        from news
        <if test="_parameter != null">
            <include refid="Example_Where_Clause" />
        </if>
        <if test="orderByClause != null">
            order by ${orderByClause}
        </if>
    </select>
    <!--
      Selects the rows matching the example without the content column.
      The order-by text is inserted as raw SQL, so it must never come from untrusted input.
    -->
    <select id="selectByExample" parameterType="com.ray.news.crawler.entity.NewsExample" resultMap="BaseResultMap">
        select
        <if test="distinct">
            distinct
        </if>
        <include refid="Base_Column_List" />
        from news
        <if test="_parameter != null">
            <include refid="Example_Where_Clause" />
        </if>
        <if test="orderByClause != null">
            order by ${orderByClause}
        </if>
    </select>
    <!-- Loads a single row, including content, by its id. -->
    <select id="selectByPrimaryKey" parameterType="java.lang.Integer" resultMap="ResultMapWithBLOBs">
        select
        <include refid="Base_Column_List" />
        ,
        <include refid="Blob_Column_List" />
        from news
        where id = #{id,jdbcType=INTEGER}
    </select>
    <!-- Deletes the row with the given id. -->
    <delete id="deleteByPrimaryKey" parameterType="java.lang.Integer">
    delete from news
    where id = #{id,jdbcType=INTEGER}
  </delete>
    <!-- Deletes the rows matching the example; a null parameter produces a delete without WHERE. -->
    <delete id="deleteByExample" parameterType="com.ray.news.crawler.entity.NewsExample">
        delete from news
        <if test="_parameter != null">
            <include refid="Example_Where_Clause" />
        </if>
    </delete>
    <!-- Inserts a row writing all eight columns; null properties are written as NULL. -->
    <insert id="insert" parameterType="com.ray.news.crawler.entity.News">
    insert into news (id, title, url,
      image, create_date, news_date,
      source, content)
    values (#{id,jdbcType=INTEGER}, #{title,jdbcType=VARCHAR}, #{url,jdbcType=VARCHAR},
      #{image,jdbcType=VARCHAR}, #{createDate,jdbcType=TIMESTAMP}, #{newsDate,jdbcType=TIMESTAMP},
      #{source,jdbcType=VARCHAR}, #{content,jdbcType=LONGVARCHAR})
  </insert>
    <!--
      Inserts a row listing only the columns whose property is not null.
      The first trim builds the column list, the second the matching value list; both drop
      the trailing comma left by the last emitted entry.
    -->
    <insert id="insertSelective" parameterType="com.ray.news.crawler.entity.News">
        insert into news
        <trim prefix="(" suffix=")" suffixOverrides=",">
            <if test="id != null">
                id,
            </if>
            <if test="title != null">
                title,
            </if>
            <if test="url != null">
                url,
            </if>
            <if test="image != null">
                image,
            </if>
            <if test="createDate != null">
                create_date,
            </if>
            <if test="newsDate != null">
                news_date,
            </if>
            <if test="source != null">
                source,
            </if>
            <if test="content != null">
                content,
            </if>
        </trim>
        <trim prefix="values (" suffix=")" suffixOverrides=",">
            <if test="id != null">
                #{id,jdbcType=INTEGER},
            </if>
            <if test="title != null">
                #{title,jdbcType=VARCHAR},
            </if>
            <if test="url != null">
                #{url,jdbcType=VARCHAR},
            </if>
            <if test="image != null">
                #{image,jdbcType=VARCHAR},
            </if>
            <if test="createDate != null">
                #{createDate,jdbcType=TIMESTAMP},
            </if>
            <if test="newsDate != null">
                #{newsDate,jdbcType=TIMESTAMP},
            </if>
            <if test="source != null">
                #{source,jdbcType=VARCHAR},
            </if>
            <if test="content != null">
                #{content,jdbcType=LONGVARCHAR},
            </if>
        </trim>
    </insert>
    <!-- Counts the rows matching the example. -->
    <select id="countByExample" parameterType="com.ray.news.crawler.entity.NewsExample" resultType="java.lang.Long">
        select count(*) from news
        <if test="_parameter != null">
            <include refid="Example_Where_Clause" />
        </if>
    </select>
    <!--
      Updates the rows matching "example", assigning only the non-null properties of "record".
      The parameter is a map holding the two entries "record" and "example".
    -->
    <update id="updateByExampleSelective" parameterType="map">
        update news
        <set>
            <if test="record.id != null">
                id = #{record.id,jdbcType=INTEGER},
            </if>
            <if test="record.title != null">
                title = #{record.title,jdbcType=VARCHAR},
            </if>
            <if test="record.url != null">
                url = #{record.url,jdbcType=VARCHAR},
            </if>
            <if test="record.image != null">
                image = #{record.image,jdbcType=VARCHAR},
            </if>
            <if test="record.createDate != null">
                create_date = #{record.createDate,jdbcType=TIMESTAMP},
            </if>
            <if test="record.newsDate != null">
                news_date = #{record.newsDate,jdbcType=TIMESTAMP},
            </if>
            <if test="record.source != null">
                source = #{record.source,jdbcType=VARCHAR},
            </if>
            <if test="record.content != null">
                content = #{record.content,jdbcType=LONGVARCHAR},
            </if>
        </set>
        <if test="_parameter != null">
            <include refid="Update_By_Example_Where_Clause" />
        </if>
    </update>
    <!--
      Updates the rows matching "example", overwriting every column (id and content included)
      with the values of "record"; null properties are written as NULL.
    -->
    <update id="updateByExampleWithBLOBs" parameterType="map">
        update news
        set id = #{record.id,jdbcType=INTEGER},
        title = #{record.title,jdbcType=VARCHAR},
        url = #{record.url,jdbcType=VARCHAR},
        image = #{record.image,jdbcType=VARCHAR},
        create_date = #{record.createDate,jdbcType=TIMESTAMP},
        news_date = #{record.newsDate,jdbcType=TIMESTAMP},
        source = #{record.source,jdbcType=VARCHAR},
        content = #{record.content,jdbcType=LONGVARCHAR}
        <if test="_parameter != null">
            <include refid="Update_By_Example_Where_Clause" />
        </if>
    </update>
    <!--
      Updates the rows matching "example", overwriting every column except content
      with the values of "record"; null properties are written as NULL.
    -->
    <update id="updateByExample" parameterType="map">
        update news
        set id = #{record.id,jdbcType=INTEGER},
        title = #{record.title,jdbcType=VARCHAR},
        url = #{record.url,jdbcType=VARCHAR},
        image = #{record.image,jdbcType=VARCHAR},
        create_date = #{record.createDate,jdbcType=TIMESTAMP},
        news_date = #{record.newsDate,jdbcType=TIMESTAMP},
        source = #{record.source,jdbcType=VARCHAR}
        <if test="_parameter != null">
            <include refid="Update_By_Example_Where_Clause" />
        </if>
    </update>
    <!-- Updates the row with the record's id, assigning only the non-null properties. -->
    <update id="updateByPrimaryKeySelective" parameterType="com.ray.news.crawler.entity.News">
        update news
        <set>
            <if test="title != null">
                title = #{title,jdbcType=VARCHAR},
            </if>
            <if test="url != null">
                url = #{url,jdbcType=VARCHAR},
            </if>
            <if test="image != null">
                image = #{image,jdbcType=VARCHAR},
            </if>
            <if test="createDate != null">
                create_date = #{createDate,jdbcType=TIMESTAMP},
            </if>
            <if test="newsDate != null">
                news_date = #{newsDate,jdbcType=TIMESTAMP},
            </if>
            <if test="source != null">
                source = #{source,jdbcType=VARCHAR},
            </if>
            <if test="content != null">
                content = #{content,jdbcType=LONGVARCHAR},
            </if>
        </set>
        where id = #{id,jdbcType=INTEGER}
    </update>
    <!-- Updates the row with the record's id, overwriting every non-key column including content. -->
    <update id="updateByPrimaryKeyWithBLOBs" parameterType="com.ray.news.crawler.entity.News">
    update news
    set title = #{title,jdbcType=VARCHAR},
      url = #{url,jdbcType=VARCHAR},
      image = #{image,jdbcType=VARCHAR},
      create_date = #{createDate,jdbcType=TIMESTAMP},
      news_date = #{newsDate,jdbcType=TIMESTAMP},
      source = #{source,jdbcType=VARCHAR},
      content = #{content,jdbcType=LONGVARCHAR}
    where id = #{id,jdbcType=INTEGER}
  </update>
    <!-- Updates the row with the record's id, overwriting every non-key column except content. -->
    <update id="updateByPrimaryKey" parameterType="com.ray.news.crawler.entity.News">
    update news
    set title = #{title,jdbcType=VARCHAR},
      url = #{url,jdbcType=VARCHAR},
      image = #{image,jdbcType=VARCHAR},
      create_date = #{createDate,jdbcType=TIMESTAMP},
      news_date = #{newsDate,jdbcType=TIMESTAMP},
      source = #{source,jdbcType=VARCHAR}
    where id = #{id,jdbcType=INTEGER}
  </update>
</mapper>
/**
 * {@link NewsService} implementation that delegates to {@link NewsDao}.
 */
@Service
public class NewsServiceImpl implements NewsService {

    @Autowired
    private NewsDao newsDao;

    /**
     * Inserts the news item unless a row with the same URL is already stored.
     *
     * @param news item to store
     * @return number of inserted rows: 1 when the item was new, 0 when its URL already exists
     */
    @Override
    @Transactional
    public int saveNews(News news) {
        NewsExample sameUrl = new NewsExample();
        sameUrl.createCriteria().andUrlEqualTo(news.getUrl());
        // NOTE(review): count-then-insert is not atomic; two concurrent callers can both see 0
        // and insert the same URL. A unique index on news.url would be needed to rule that out.
        if (newsDao.countByExample(sameUrl) > 0) {
            return 0;
        }
        return newsDao.insert(news);
    }

    /**
     * Returns one page of news items, including their content, matching the example.
     *
     * @param page     page number handed to {@code PageHelper.startPage}
     * @param pageSize number of items per page
     * @param example  criteria and ordering to apply
     * @return the matching items, or an empty list when nothing matches; never {@code null}
     */
    @Override
    public List<News> searchNewsForPage(int page, int pageSize, NewsExample example) {
        // startPage only affects the next query, so it must directly precede the select.
        PageHelper.startPage(page, pageSize);
        List<News> news = newsDao.selectByExampleWithBLOBs(example);
        if (CollectionUtils.isEmpty(news)) {
            // Typed factory instead of the raw Collections.EMPTY_LIST constant.
            return Collections.<News>emptyList();
        }
        return news;
    }

    /**
     * Counts the news items matching the example.
     *
     * @param example criteria to match
     * @return number of matching rows
     */
    @Override
    public Long countByExample(NewsExample example) {
        return newsDao.countByExample(example);
    }

}

工具类

/**
 * Stateless helpers for cleaning crawled HTML and for mapping request path segments to the
 * source names stored with each news item.
 */
public final class NewsUtils {

    /** A complete script element including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);

    /** A complete style element including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);

    /** Any remaining HTML tag. */
    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** A run of whitespace; this already covers tab, carriage return and line feed. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /** One img tag. */
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /** A src attribute whose value is double-quoted (group 1), single-quoted (2) or bare (3). */
    private static final Pattern SRC_PATTERN =
            Pattern.compile("src\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
                    Pattern.CASE_INSENSITIVE);

    private NewsUtils() {
        // Utility class, not meant to be instantiated.
    }

    /**
     * Strips script blocks, style blocks and all tags from an HTML fragment and collapses
     * every run of whitespace into a single space.
     *
     * @param content HTML fragment, may be {@code null}
     * @return the plain text (not trimmed); an empty string for {@code null} input
     */
    public static String getTextFromContent(String content) {
        if (content == null) {
            return "";
        }
        String text = SCRIPT_PATTERN.matcher(content).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = HTML_TAG_PATTERN.matcher(text).replaceAll("");
        return WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
    }

    /**
     * Returns the {@code src} value of the first {@code <img>} tag that has one.
     *
     * @param content HTML fragment, may be {@code null}
     * @return the image address, or {@code null} when no img tag carries a src attribute
     */
    public static String getImageFromContent(String content) {
        if (content == null) {
            return null;
        }
        Matcher imgMatcher = IMG_TAG_PATTERN.matcher(content);
        while (imgMatcher.find()) {
            Matcher srcMatcher = SRC_PATTERN.matcher(imgMatcher.group());
            if (!srcMatcher.find()) {
                // This tag has no src attribute; try the next img tag.
                continue;
            }
            for (int group = 1; group <= srcMatcher.groupCount(); group++) {
                String src = srcMatcher.group(group);
                if (src != null) {
                    return src;
                }
            }
        }
        return null;
    }

    /**
     * Maps the source segment of a request path to the source name stored in the database.
     *
     * @param pathVariable path segment such as {@code "sohu"}, may be {@code null}
     * @return the stored source name, or {@code null} when the segment is unknown or {@code null}
     */
    public static String getSourceFromPathVariable(String pathVariable) {
        if (pathVariable == null) {
            // switch on a null String would throw a NullPointerException.
            return null;
        }
        switch (pathVariable) {
            case "toutiao":
                return "今日头条";
            case "neteasy":
                return "网易";
            case "sohu":
                return "搜狐";
            case "ifeng":
                return "凤凰";
            case "sina":
                return "新浪";
            default:
                return null;
        }
    }
}

数据抓取接口

/**
 * A crawler for one news site, plus a shared helper for downloading pages.
 */
public interface NewsPuller {

    /**
     * Crawls the site and stores the news items that were found.
     */
    void pullNews();

    /**
     * Downloads a page and returns it as a Jsoup document.
     *
     * @param url         address of the page
     * @param useHtmlUnit {@code false} to fetch the raw HTML with Jsoup; {@code true} to load the
     *                    page in HtmlUnit with JavaScript enabled and parse the rendered result
     * @return the parsed document
     * @throws Exception if the page cannot be fetched or parsed
     */
    default Document getHtmlFromUrl(String url, boolean useHtmlUnit) throws Exception {
        if (!useHtmlUnit) {
            return Jsoup.connect(url)
                    // Present a desktop browser user agent (an Internet Explorer 9 string).
                    .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                    .get();
        }

        // Headless browser client emulating Chrome.
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // JavaScript must run, otherwise dynamically rendered content is missing.
            webClient.getOptions().setJavaScriptEnabled(true);
            // The page is never displayed, so CSS processing is unnecessary.
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setActiveXNative(false);
            // Do not abort on script errors or on non-200 HTTP status codes.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Optional: webClient.setAjaxController(new NicelyResynchronizingAjaxController())
            // would resynchronize AJAX calls.
            webClient.getOptions().setUseInsecureSSL(true);
            webClient.getOptions().setTimeout(10 * 1000);

            HtmlPage rootPage = webClient.getPage(url);
            // Give asynchronous scripts up to 10 seconds to finish before reading the page.
            webClient.waitForBackgroundJavaScript(10 * 1000);
            // Serialize the rendered page and hand it to Jsoup for querying.
            return Jsoup.parse(rootPage.asXml());
        } finally {
            webClient.close();
        }
    }
}

凤凰网爬取

/**
 * Crawls the ifeng (Phoenix) news list page and stores each linked article.
 */
@Component("ifengNewsPuller")
public class IfengNewsPuller implements NewsPuller {

    private static final Logger logger = LoggerFactory.getLogger(IfengNewsPuller.class);

    /** Format of the publish time found in the article's og:time meta tag. */
    private static final String NEWS_DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    @Value("${news.ifeng.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the list page, collects the article links and then downloads and stores every
     * article. Failures of single articles are logged and do not stop the run.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取凤凰新闻!");

        // 1. Fetch the list page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取凤凰首页失败: {} =============", url, e);
            return;
        }

        // 2. Select the <a> tags of the news list.
        Elements newsATags = html.select("div#newsList")
                .select("ul.news_list-3wjAJJJM")
                .select("li")
                .select("a");

        // 3. Build a News stub (source, url, title, crawl time) from every link.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newsATags) {
            News n = new News();
            n.setSource("凤凰");
            n.setUrl(a.attr("href"));
            n.setTitle(a.text());
            n.setCreateDate(new Date());
            newsSet.add(n);
        }

        // 4. Download every article; the articles are processed in parallel.
        newsSet.parallelStream().forEach(this::pullNewsContent);
        logger.info("凤凰新闻抽取完成!");
    }

    /**
     * Downloads one article, fills in publish time, content and image, and stores it.
     * Articles without a recognizable content element are skipped.
     */
    private void pullNewsContent(News news) {
        logger.info("开始抽取凤凰新闻《{}》内容:{}", news.getTitle(), news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            Elements contentElement = newsHtml.select("div.text-3zQ3cZD4");
            if (contentElement.isEmpty()) {
                // Fallback layout: only a caption heading is present.
                contentElement = newsHtml.select("div.caption-3_nUnnKX h1");
            }
            if (contentElement.isEmpty()) {
                return;
            }
            // The publish time is read from a meta tag in the document head.
            String time = newsHtml.head().select("meta[name=og:time ]").attr("content");
            if (StringUtils.isNotBlank(time)) {
                // SimpleDateFormat is not thread-safe and this method runs on several threads,
                // so a fresh formatter is created per call instead of sharing one instance.
                news.setNewsDate(new SimpleDateFormat(NEWS_DATE_PATTERN).parse(time));
            }
            String image = NewsUtils.getImageFromContent(contentElement.toString());
            news.setContent(contentElement.text());
            news.setImage(image);
            newsService.saveNews(news);
            logger.info("抽取凤凰新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the log output.
            logger.error("凤凰新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}

网易新闻爬取

/**
 * Crawls the NetEase hot-news page and stores each linked article.
 */
@Component("netEasyNewsPuller")
public class NetEasyNewsPuller implements NewsPuller {

    private static final Logger logger = LoggerFactory.getLogger(NetEasyNewsPuller.class);

    @Value("${news.neteasy.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the ranking page, collects the article links and then downloads and stores every
     * article. Failures of single articles are logged and do not stop the run.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取网易热门新闻!");

        // 1. Fetch the ranking page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取网易新闻首页失败: {}=============", url, e);
            return;
        }

        // 2. Select the links of the active tab only, which keeps the amount of data down.
        Elements newA = html.select("div.tabContents.active")
                .select("table")
                .select("td")
                .select("a");

        // 3. Build a News stub (source, url, crawl time) from every link.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newA) {
            News n = new News();
            n.setSource("网易");
            n.setUrl(a.attr("href"));
            n.setCreateDate(new Date());
            newsSet.add(n);
        }

        // 4. Download every article.
        newsSet.forEach(this::pullNewsContent);
        logger.info("网易新闻拉取完成!");
    }

    /**
     * Downloads one article, fills in title, content and image, and stores it.
     * Pages without the expected container or without a heading are logged and skipped.
     */
    private void pullNewsContent(News news) {
        logger.info("开始抽取新闻内容:{}", news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            Elements newsContentAll = newsHtml.select("div#epContentLeft");
            // first() returns null when there is no <h1>; check instead of relying on an NPE.
            Element titleP = newsContentAll.select("h1").first();
            if (newsContentAll.isEmpty() || titleP == null) {
                logger.error("新闻抽取失败:{}", news.getUrl());
                return;
            }
            Elements newsContent = newsHtml.select("div#endText");
            String image = NewsUtils.getImageFromContent(newsContentAll.toString());

            news.setTitle(titleP.text());
            news.setContent(newsContent.text());
            news.setImage(image);
            newsService.saveNews(news);
            logger.info("抽取网易新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the log output.
            logger.error("新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}

搜狐新闻爬取

/**
 * Crawls the Sohu focus-news list and stores each linked article.
 */
@Component("sohuNewsPuller")
public class SohuNewsPuller implements NewsPuller {

    private static final Logger logger = LoggerFactory.getLogger(SohuNewsPuller.class);

    @Value("${news.sohu.url}")
    private String url;

    @Autowired
    private NewsService newsService;

    /**
     * Fetches the home page, collects the focus-news links and then downloads and stores every
     * article. Failures of single articles are logged and do not stop the run.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取搜狐新闻!");

        // 1. Fetch the home page.
        Document html;
        try {
            html = getHtmlFromUrl(url, false);
        } catch (Exception e) {
            logger.error("==============获取搜狐首页失败: {}=============", url, e);
            return;
        }

        // 2. Select the <a> tags of the focus-news list.
        Elements newsATags = html.select("div.focus-news")
                .select("div.list16")
                .select("li")
                .select("a");

        // 3. Build a News stub (source, url, title, crawl time) from every link.
        HashSet<News> newsSet = new HashSet<>();
        for (Element a : newsATags) {
            News n = new News();
            n.setSource("搜狐");
            n.setUrl(a.attr("href"));
            n.setTitle(a.attr("title"));
            n.setCreateDate(new Date());
            newsSet.add(n);
        }

        // 4. Download every article.
        newsSet.forEach(this::pullNewsContent);
        logger.info("搜狐新闻拉取完成!");
    }

    /**
     * Downloads one article, fills in content and image, and stores it.
     * Pages without an {@code article.article} element are logged and skipped.
     */
    private void pullNewsContent(News news) {
        logger.info("开始抽取搜狐新闻内容:{}", news.getUrl());
        try {
            Document newsHtml = getHtmlFromUrl(news.getUrl(), false);
            // first() returns null when the element is missing; check instead of relying on an NPE.
            Element article = newsHtml.select("article.article").first();
            if (article == null) {
                logger.error("新闻抽取失败:{}", news.getUrl());
                return;
            }
            // The image must be looked up in the article's HTML; its plain text holds no tags,
            // so searching the text could never find an <img>.
            String image = NewsUtils.getImageFromContent(article.toString());

            news.setContent(article.text());
            news.setImage(image);
            newsService.saveNews(news);
            logger.info("抽取搜狐新闻《{}》成功!", news.getTitle());
        } catch (Exception e) {
            // Pass the exception to the logger so the stack trace lands in the log output.
            logger.error("新闻抽取失败:{}", news.getUrl(), e);
        }
    }
}

今日头条爬取

详情页面爬取内容不完善

/**
 * Crawls the Toutiao home page with HtmlUnit (the page is rendered by JavaScript) and stores
 * the linked articles whose content could be extracted.
 */
@Component("toutiaoNewsPuller")
public class ToutiaoNewsPuller implements NewsPuller {

    private static final Logger logger = LoggerFactory.getLogger(ToutiaoNewsPuller.class);

    private static final String TOUTIAO_URL = "https://www.toutiao.com";

    @Autowired
    private NewsService newsService;

    @Value("${news.toutiao.url}")
    private String url;

    /**
     * Fetches the home page, merges the image link and the title link of each article into one
     * News item, downloads the article pages and stores every item with non-blank content.
     */
    @Override
    public void pullNews() {
        logger.info("开始拉取今日头条热门新闻!");

        // 1. Fetch the rendered home page.
        Document html;
        try {
            html = getHtmlFromUrl(url, true);
        } catch (Exception e) {
            logger.error("获取今日头条主页失败!", e);
            return;
        }

        // 2. Parse the page; items are keyed by article URL because one article is usually
        //    linked twice (image link and title link).
        Map<String, News> newsMap = new HashMap<>();
        // Selector syntax reference: https://www.open-open.com/jsoup/selector-syntax.htm
        for (Element a : html.select("a[href~=/group/.*]:not(.comment)")) {
            logger.info("<a>标签: \n{}", a);
            String href = toAbsoluteUrl(a.attr("href"));
            String paragraphText = a.select("p").text();
            String title = StringUtils.isNotBlank(paragraphText) ? paragraphText : a.text();
            String image = a.select("img").attr("src");

            News known = newsMap.get(href);
            if (known == null) {
                News n = new News();
                n.setSource("今日头条");
                n.setUrl(href);
                n.setCreateDate(new Date());
                n.setImage(image);
                n.setTitle(title);
                newsMap.put(href, n);
            } else if (a.hasClass("img-wrap")) {
                known.setImage(image);
            } else if (a.hasClass("title")) {
                known.setTitle(title);
            }
        }

        logger.info("今日头条新闻标题拉取完成!");
        logger.info("开始拉取新闻内容...");

        // 3. Download every article page and extract its text.
        for (News news : newsMap.values()) {
            logger.info("抽取今日头条新闻《{}》", news.getTitle());
            Document contentHtml;
            try {
                contentHtml = getHtmlFromUrl(news.getUrl(), true);
            } catch (Exception e) {
                logger.error("获取今日头条新闻《{}》内容失败!", news.getTitle(), e);
                continue;
            }
            // An earlier attempt read articleInfo.content out of an inline <script>; the
            // rendered <article> element is used instead. Detail extraction is still incomplete.
            String content = contentHtml.select("article").text();
            logger.info("content: {}", content);
            news.setContent(content);
        }

        // 4. Store only the items for which content was found.
        newsMap.values()
                .stream()
                .filter(news -> StringUtils.isNotBlank(news.getContent()) && !news.getContent().equals("null"))
                .forEach(newsService::saveNews);
        logger.info("今日头条新闻内容拉取完成!");
    }

    /**
     * Turns an href from the home page into an absolute URL. Site-relative paths get the
     * Toutiao origin prepended; absolute and protocol-relative links are not prefixed again.
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        return TOUTIAO_URL + href;
    }
}

控制器

/**
 * REST endpoints that trigger the crawlers and expose the stored news.
 */
@RestController
@RequestMapping("/news")
@Api(value = "新闻拉取API")
public class NewsController {

    private static final Logger logger = LoggerFactory.getLogger(NewsController.class);

    @Autowired
    @Qualifier("ifengNewsPuller")
    private NewsPuller ifengNewsPuller;
    @Autowired
    @Qualifier("netEasyNewsPuller")
    private NewsPuller neteasyNewsPuller;
    @Autowired
    @Qualifier("sohuNewsPuller")
    private NewsPuller sohuNewsPuller;
    @Autowired
    @Qualifier("toutiaoNewsPuller")
    private NewsPuller toutiaoNewsPuller;
    @Autowired
    private NewsService newsService;

    /** Runs the ifeng crawler; the request returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取凤凰新闻")
    @GetMapping("/pull/ifeng")
    public void pullIfengNews() {
        ifengNewsPuller.pullNews();
    }

    /** Runs the NetEase crawler; the request returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取网易新闻")
    @GetMapping("/pull/neteasy")
    public void pullNeteasyNews() {
        neteasyNewsPuller.pullNews();
    }

    /** Runs the Sohu crawler; the request returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取搜狐新闻")
    @GetMapping("/pull/sohu")
    public void pullSohuNews() {
        sohuNewsPuller.pullNews();
    }

    /** Runs the Toutiao crawler; the request returns once the crawl has finished. */
    @ApiOperation(value = "爬虫拉取今日头条新闻")
    @GetMapping("/pull/toutiao")
    public void pullToutiaoNews() {
        toutiaoNewsPuller.pullNews();
    }


    /**
     * Returns one page of the news of a single source, newest crawl first.
     *
     * @param page     page number
     * @param pageSize items per page
     * @param source   source path segment, e.g. {@code sohu}
     * @return the page of news; an empty list when the source is unknown
     */
    @ApiOperation(value = "获取{source}新闻")
    @GetMapping("/{source}")
    public List<News> getToutiaoNews(@RequestParam Integer page, @RequestParam Integer pageSize, @PathVariable String source) {
        String sourceName = NewsUtils.getSourceFromPathVariable(source);
        if (sourceName == null) {
            // Unknown source: nothing can match, and a null must not reach the criteria builder.
            return java.util.Collections.emptyList();
        }
        NewsExample example = new NewsExample();
        example.createCriteria().andSourceEqualTo(sourceName);
        example.setOrderByClause("create_date desc");
        return newsService.searchNewsForPage(page, pageSize, example);
    }

    /**
     * Counts the news of a single source.
     *
     * @param source source path segment, e.g. {@code sohu}
     * @return number of stored items; 0 when the source is unknown
     */
    @ApiOperation("获取{source}新闻总数")
    @GetMapping("/{source}/count")
    public Long getToutiaoCount(@PathVariable String source) {
        String sourceName = NewsUtils.getSourceFromPathVariable(source);
        if (sourceName == null) {
            // Unknown source: nothing can match, and a null must not reach the criteria builder.
            return 0L;
        }
        NewsExample example = new NewsExample();
        example.createCriteria().andSourceEqualTo(sourceName);
        return newsService.countByExample(example);
    }

    /**
     * Returns one page of the news of all sources, newest crawl first.
     *
     * @param page     page number
     * @param pageSize items per page
     * @return the page of news
     */
    @ApiOperation(value = "获取所有新闻")
    @GetMapping
    public List<News> getNews(@RequestParam Integer page, @RequestParam Integer pageSize) {
        NewsExample example = new NewsExample();
        example.createCriteria();
        example.setOrderByClause("create_date desc");
        return newsService.searchNewsForPage(page, pageSize, example);
    }

    /**
     * Counts the stored news of all sources.
     *
     * @return number of stored items
     */
    @ApiOperation("获取新闻总数")
    @GetMapping("/count")
    public Long getCount() {
        NewsExample example = new NewsExample();
        example.createCriteria();
        return newsService.countByExample(example);
    }
}

配置类

/**
 * Swagger 2 setup: documents every controller class annotated with {@code @Api}.
 */
@Configuration
@EnableSwagger2
public class SwaggerConfig {

    /**
     * Builds the Swagger docket covering all paths of the {@code @Api}-annotated classes,
     * with the default response messages switched off.
     *
     * @return the configured docket
     */
    @Bean
    public Docket createRestApi() {
        // Title, description and version shown at the top of the generated documentation.
        ApiInfo info = new ApiInfoBuilder()
                .title("新闻爬虫API文档")
                .description("使用Jsoup + HtmlUtil")
                .version("1.0")
                .build();

        return new Docket(DocumentationType.SWAGGER_2)
                .useDefaultResponseMessages(false)
                .apiInfo(info)
                .select()
                .apis(RequestHandlerSelectors.withClassAnnotation(Api.class))
                .paths(PathSelectors.any())
                .build();
    }
}

引导类

/**
 * Spring Boot entry point of the news crawler.
 */
@SpringBootApplication
public class NewscrawlerApplication {

    /**
     * Starts the application and then prints the address of the Swagger UI.
     *
     * @param args command-line arguments passed on to Spring Boot
     */
    public static void main(String[] args) {
        final String swaggerHint = "swagger2: http://localhost:8080/swagger-ui.html";
        SpringApplication.run(NewscrawlerApplication.class, args);
        System.out.println(swaggerHint);
    }
}