作者:jessehua
来源:www.jianshu.com/p/cfead4b3e34e
WebMagic 是一个开源的 java 爬虫框架。
WebMagic 框架的使用并不是本文的重点,具体如何使用请参考官网文档:http://webmagic.io/docs/。
本文对 spring boot+WebMagic+MyBatis 做了整合,使用 WebMagic 爬取数据,然后通过 MyBatis 将爬取的数据持久化到 mysql 数据库。
本文提供的源代码可以作为 java 爬虫项目的脚手架。
1. 增加 maven 依赖
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the qbasic-crawler scaffold.
  Inherits from the Spring Boot 1.5.21.RELEASE parent and adds WebMagic (crawling),
  MyBatis + Druid + MySQL Connector/J (persistence) and a few utility libraries.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hyzx</groupId>
    <artifactId>qbasic-crawler</artifactId>
    <version>1.0.0</version>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.21.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <!-- Build settings and the versions of every dependency not managed by the parent. -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Tests are neither compiled nor run during the build. -->
        <maven.test.skip>true</maven.test.skip>
        <java.version>1.8</java.version>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <!-- NOTE(review): fastjson releases before 1.2.83 have published deserialization
             vulnerabilities; confirm whether this can be upgraded. -->
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.3</webmagic.core.version>
    </properties>

    <dependencies>
        <!-- Spring Boot tooling: versions come from the parent POM. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- Persistence: JDBC driver, Druid connection pool, MyBatis. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>

        <!-- Utility libraries. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>

        <!-- Crawler framework. The transitive slf4j-log4j12 binding is excluded,
             presumably so it does not compete with the logging binding that
             Spring Boot already supplies. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <!-- Repackages the build output as an executable Spring Boot archive. -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!-- The Aliyun mirror is addressed over HTTPS: Maven 3.8.1 and later block
         plain-HTTP repositories by default, which would make every download fail. -->
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>
2. 项目配置文件 application.properties
配置 mysql 数据源、druid 数据库连接池以及 MyBatis 的 mapper 文件的位置。
# MySQL data source configuration
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://192.168.0.63:3306/gjhzjl?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
# NOTE(review): root credentials are hard-coded here; consider supplying them
# from the environment or an external configuration source instead.
spring.datasource.username=root
spring.datasource.password=root
# Druid connection pool configuration
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
# Connections are validated with this query while idle, not on borrow or return.
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000
# MyBatis configuration: mapper XML files are loaded from classpath:mapper/ and its subdirectories
mybatis.mapperLocations=classpath:mapper/**/*.xml
3. 数据库表结构
-- CMS content table: stores one row per crawled article.
-- The table uses utf8mb4 rather than utf8 (MySQL's 3-byte utf8mb3) so that crawled
-- text containing 4-byte characters such as emoji can be inserted without errors.
CREATE TABLE `cms_content` (
  `contentId` varchar(40) NOT NULL COMMENT '内容 ID',
  `title` varchar(150) NOT NULL COMMENT '题目',
  `content` longtext COMMENT '文章内容',
  `releaseDate` datetime NOT NULL COMMENT '公布日期',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='CMS 内容表';
4. 实体类
import java.util.Date;
/**
 * Persistence object holding the four columns of a {@code cms_content} row.
 *
 * <p>A plain mutable bean: every property has a getter and a setter and starts
 * out as {@code null}.
 */
public class CmsContentPO {

    /** Identifier of the content row. */
    private String contentId;

    /** Title of the article. */
    private String title;

    /** Body text of the article. */
    private String content;

    /** Release timestamp of the article. */
    private Date releaseDate;

    /** @return the content identifier, or {@code null} if not set */
    public String getContentId() {
        return this.contentId;
    }

    /** @param contentId the content identifier to store */
    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    /** @return the article title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the article title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article body, or {@code null} if not set */
    public String getContent() {
        return this.content;
    }

    /** @param content the article body to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the release date, or {@code null} if not set */
    public Date getReleaseDate() {
        return this.releaseDate;
    }

    /** @param releaseDate the release date to store (kept by reference, not copied) */
    public void setReleaseDate(Date releaseDate) {
        this.releaseDate = releaseDate;
    }
}
5.mapper 接口
/**
 * MyBatis mapper interface used to persist crawled content.
 */
public interface CrawlerMapper {

    /**
     * Inserts a single content record.
     *
     * @param record the content to store
     * @return the row count reported for the insert
     */
    int addCmsContent(CmsContentPO record);

}
6.CrawlerMapper.xml 文件
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements bound to the com.hyzx.qbasic.dao.CrawlerMapper interface. -->
<mapper namespace="com.hyzx.qbasic.dao.CrawlerMapper">
<!-- Inserts one cms_content row; each column is bound to the CmsContentPO property of the same name. -->
<insert id="addCmsContent" parameterType="com.hyzx.qbasic.model.CmsContentPO">
insert into cms_content (contentId,
title,
releaseDate,
content)
values (#{contentId,jdbcType=VARCHAR},
#{title,jdbcType=VARCHAR},
#{releaseDate,jdbcType=TIMESTAMP},
#{content,jdbcType=LONGVARCHAR})
</insert>
</mapper>
7.XXX 页面内容处理类 XXXPageProcessor
主要用于解析爬取到的 XXX html 页面。
/**
 * WebMagic page processor for XXX pages: queues answer links found on the page
 * and extracts the question title and answer text as result fields.
 */
@Component
public class XXXPageProcessor implements PageProcessor {

    /** Links matching this pattern are queued for crawling. */
    private static final String ANSWER_LINK_REGEX =
            "https://www\\.xxx\\.com/question/\\d+/answer/\\d+.*";

    /** XPath selecting the text of the question title. */
    private static final String TITLE_XPATH = "//h1[@class='QuestionHeader-title']/text()";

    /** XPath selecting the tidied text of the answer body. */
    private static final String ANSWER_XPATH = "//div[@class='QuestionAnswer-content']/tidyText()";

    /** Site settings: retry times 3, sleep time 1000. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Queues matching links, stores the "title" and "answer" fields, and marks
     * the page as skipped when no title was found.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(
                page.getHtml().links().regex(ANSWER_LINK_REGEX).all());

        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        String answer = page.getHtml().xpath(ANSWER_XPATH).toString();
        page.putField("title", title);
        page.putField("answer", answer);

        // A page without a title is a list page: skip it so that the pipeline
        // does no further processing for it.
        if (title == null) {
            page.setSkip(true);
        }
    }

    /** @return the site settings used for this crawl */
    @Override
    public Site getSite() {
        return this.site;
    }
}
8.XXX 数据处理类 XXXPipeline
主要用于将 XXX html 页面解析出的数据存储到 mysql 数据库。
@Component
public class XXXPipeline implements Pipeline {private static final Logger LOGGER = LoggerFactory.getLogger(XXXPipeline.class);
@Autowired
private CrawlerMapper crawlerMapper;
public void process(ResultItems resultItems, Task task) {String title = resultItems.get("title");
String answer = resultItems.get("answer");
CmsContentPO contentPO = new CmsContentPO();
contentPO.setContentId(UUID.randomUUID().toString());
contentPO.setTitle(title);
contentPO.setReleaseDate(new Date());
contentPO.setContent(answer);
try {boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
LOGGER.info("保留文章胜利:{}", title);
} catch (Exception ex) {LOGGER.error("保留文章失败", ex);
}
}
}
9. 爬虫任务类 XXXTask
每十分钟启动一次爬虫。
/**
 * Schedules the XXX crawler: once {@link #crawl()} is called, a new spider is
 * started immediately and then again 10 minutes after each previous launch returns.
 */
@Component
public class XXXTask {

    // Fixed: the logger was created for XXXPipeline.class, mislabelling this class's log output.
    private static final Logger LOGGER = LoggerFactory.getLogger(XXXTask.class);

    // Fixed: the field was declared as "XXXPipeline" but referenced as "xxxPipeline",
    // which does not compile.
    @Autowired
    private XXXPipeline xxxPipeline;

    @Autowired
    private XXXPageProcessor xxxPageProcessor;

    // NOTE(review): this executor is never shut down in the code shown here;
    // confirm whether a shutdown hook is needed when the application context closes.
    private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /**
     * Registers the recurring crawl job. Each run builds a spider that starts at
     * the explore page, stores results through the pipeline and uses 2 threads.
     */
    public void crawl() {
        // Scheduled job: crawl once every 10 minutes, starting without delay.
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("xxxCrawlerThread");
            try {
                Spider.create(xxxPageProcessor)
                        // Start crawling from https://www.xxx.com/explore
                        .addUrl("https://www.xxx.com/explore")
                        // Store the crawled data in the database
                        .addPipeline(xxxPipeline)
                        // Crawl with 2 threads
                        .thread(2)
                        // Start the spider asynchronously
                        .start();
            } catch (Exception ex) {
                // Caught here so that an exception does not cancel the subsequent
                // scheduled runs.
                LOGGER.error("定时抓取数据线程执行异样", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}
10.Spring boot 程序启动类
/**
 * Spring Boot entry point. Scans {@code com.hyzx.qbasic.dao} for MyBatis mappers
 * and kicks off the crawler task through the {@link CommandLineRunner} callback.
 */
@SpringBootApplication
@MapperScan(basePackages = "com.hyzx.qbasic.dao")
public class Application implements CommandLineRunner {

    @Autowired
    private XXXTask xxxTask;

    /**
     * Launches the Spring application.
     *
     * @param args command-line arguments, forwarded to Spring Boot
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws IOException {
        SpringApplication.run(Application.class, args);
    }

    /**
     * Starts the crawl schedule.
     *
     * @param strings command-line arguments (unused)
     * @throws Exception if starting the crawl fails
     */
    @Override
    public void run(String... strings) throws Exception {
        // Begin crawling data.
        xxxTask.crawl();
    }
}
最后,关注公众号 Java 技术栈,在后台回复:面试,可以获取我整理的 Java/ Spring Boot 系列面试题和答案,非常齐全。
近期热文推荐:
1.1,000+ 道 Java 面试题及答案整理 (2021 最新版)
2. 别再满屏的 if/ else 了,试试策略模式,真香!!
3. 卧槽!Java 中的 xx ≠ null 是什么新语法?
4.Spring Boot 2.5 重磅发布,黑暗模式太炸了!
5.《Java 开发手册(嵩山版)》最新发布,速速下载!
觉得不错,别忘了随手点赞 + 转发哦!