WebMagic is an open-source crawler framework. This project uses WebMagic inside a Spring Boot application to scrape data, then persists the results to a database with MyBatis.
Project source: the ArticleCrawler repository on Gitee (SpringBoot + WebMagic + MyBatis crawler and persistence demo).
Create the database:
In this example the database is named article and the table cms_content, with three columns: contentId, title, and date.
```sql
CREATE TABLE `cms_content` (
  `contentId` varchar(40) NOT NULL COMMENT 'Content ID',
  `title` varchar(150) NOT NULL COMMENT 'Title',
  `date` varchar(150) NOT NULL COMMENT 'Publish date',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS content table';
```
Create a Spring Boot project:
1. Configure the dependencies in pom.xml
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <relativePath/>
    </parent>
    <groupId>com.example</groupId>
    <artifactId>Article</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Article</name>
    <description>Article</description>

    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.5</webmagic.core.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>
```
2. Create CmsContentPO.java
The data entity, whose fields map one-to-one to the three columns of the table.
```java
package site.exciter.article.model;

public class CmsContentPO {

    private String contentId;

    private String title;

    private String date;

    public String getContentId() {
        return contentId;
    }

    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }
}
```
3. Create CrawlerMapper.java
```java
package site.exciter.article.dao;

import org.apache.ibatis.annotations.Mapper;
import site.exciter.article.model.CmsContentPO;

@Mapper
public interface CrawlerMapper {
    int addCmsContent(CmsContentPO record);
}
```
4. Create the mapping file CrawlerMapper.xml
Create a mapper folder under resources, then create CrawlerMapper.xml inside it.
```xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="site.exciter.article.dao.CrawlerMapper">

    <insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO">
        insert into cms_content (contentId, title, `date`)
        values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{date,jdbcType=VARCHAR})
    </insert>
</mapper>
```
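Before wiring the mapper into the crawler, it can be smoke-tested in isolation. Below is a minimal sketch of such a test (the class name and assertion are hypothetical, not part of the original project); it assumes the datasource settings from the next step point at a reachable MySQL instance:

```java
package site.exciter.article;

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import site.exciter.article.dao.CrawlerMapper;
import site.exciter.article.model.CmsContentPO;

import java.util.UUID;

import static org.junit.jupiter.api.Assertions.assertEquals;

// Hypothetical smoke test: verifies the MyBatis wiring by inserting one row.
@SpringBootTest
class CrawlerMapperTest {

    @Autowired
    private CrawlerMapper crawlerMapper;

    @Test
    void addCmsContentInsertsOneRow() {
        CmsContentPO record = new CmsContentPO();
        record.setContentId(UUID.randomUUID().toString());
        record.setTitle("test title");
        record.setDate("2021-10-13");

        // addCmsContent returns the number of affected rows.
        assertEquals(1, crawlerMapper.addCmsContent(record));
    }
}
```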
5. Configure application.properties
Configure the database connection, the Druid connection pool, and the MyBatis mapper location.
```properties
# mysql
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root

# druid
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000

# mybatis
mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml
```
6. Create ArticlePageProcessor.java
This class implements the HTML-parsing logic.
```java
package site.exciter.article;

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

@Component
public class ArticlePageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    @Override
    public void process(Page page) {
        String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href";
        String next_page_xpath = "//*[@id='nav_next_page']/a/@href";
        String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)";
        String title_xpath = "//h1[@class='postTitle']/a/span/text()";
        String date_xpath = "//span[@id='post-date']/text()";
        // Extract the title; skip pages (such as list pages) that have no title.
        page.putField("title", page.getHtml().xpath(title_xpath).toString());
        if (page.getResultItems().get("title") == null) {
            page.setSkip(true);
        }
        page.putField("date", page.getHtml().xpath(date_xpath).toString());
        // Queue every article detail link found on the current page.
        if (page.getHtml().xpath(detail_urls_Xpath).match()) {
            Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath);
            page.addTargetRequests(detailUrls.all());
        }
        // Follow the "next page" link, trying the XPath first and the CSS selector as a fallback.
        if (page.getHtml().xpath(next_page_xpath).match()) {
            Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath);
            page.addTargetRequests(nextPageUrl.all());
        } else if (page.getHtml().css(next_page_css).match()) {
            Selectable nextPageUrl = page.getHtml().css(next_page_css).links();
            page.addTargetRequests(nextPageUrl.all());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
}
```
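The selectors can be sanity-checked outside the crawler with WebMagic's Html class. The sketch below runs the same XPath expressions against a hand-written fragment; the markup is a simplified stand-in for the real cnblogs page, not copied from it:

```java
import us.codecraft.webmagic.selector.Html;

public class SelectorCheck {
    public static void main(String[] args) {
        // Simplified stand-in for a cnblogs post page (assumed structure).
        Html html = new Html(
                "<h1 class=\"postTitle\"><a><span>Hello WebMagic</span></a></h1>"
                        + "<span id=\"post-date\">2021-10-13</span>");
        // Same expressions as in ArticlePageProcessor.
        System.out.println(html.xpath("//h1[@class='postTitle']/a/span/text()").toString()); // Hello WebMagic
        System.out.println(html.xpath("//span[@id='post-date']/text()").toString());         // 2021-10-13
    }
}
```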
7. Create ArticlePipeline.java
This class persists the extracted data.
```java
package site.exciter.article;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import site.exciter.article.model.CmsContentPO;
import site.exciter.article.dao.CrawlerMapper;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.UUID;

@Component
public class ArticlePipeline implements Pipeline {

    private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class);

    @Autowired
    private CrawlerMapper crawlerMapper;

    @Override
    public void process(ResultItems resultItems, Task task) {
        String title = resultItems.get("title");
        String date = resultItems.get("date");

        CmsContentPO contentPO = new CmsContentPO();
        contentPO.setContentId(UUID.randomUUID().toString());
        contentPO.setTitle(title);
        contentPO.setDate(date);

        try {
            // addCmsContent returns the number of rows inserted.
            boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
            if (success) {
                LOGGER.info("Saved: {}", title);
            }
        } catch (Exception ex) {
            LOGGER.error("Save failed", ex);
        }
    }
}
```
8. Create ArticleTask.java
This class runs the crawl job.
```java
package site.exciter.article;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

@Component
public class ArticleTask {

    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleTask.class);

    @Autowired
    private ArticlePipeline articlePipeline;

    @Autowired
    private ArticlePageProcessor articlePageProcessor;

    private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    public void crawl() {
        // Scheduled job: crawl every 10 minutes.
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("ArticleCrawlerThread");
            try {
                Spider.create(articlePageProcessor)
                        .addUrl("http://www.cnblogs.com/dick159/default.html?page=2")
                        // Persist the extracted data through the pipeline
                        .addPipeline(articlePipeline)
                        // Crawl with 5 threads
                        .thread(5)
                        // Start the spider asynchronously
                        .start();
            } catch (Exception ex) {
                LOGGER.error("Scheduled crawl thread threw an exception", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}
```
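One detail worth knowing about this setup: Spider.start() launches the crawl asynchronously and returns immediately, so the 10-minute fixed delay is measured from one launch to the next, not between completed crawls, and each tick creates a fresh Spider. If the delay should apply between full crawls, a variant like the sketch below (same task body, only the final call changed) blocks until the crawl finishes:

```java
// Variant sketch: run() blocks the scheduler thread until the crawl completes,
// so scheduleWithFixedDelay waits 10 minutes after each *finished* crawl.
Spider.create(articlePageProcessor)
        .addUrl("http://www.cnblogs.com/dick159/default.html?page=2")
        .addPipeline(articlePipeline)
        .thread(5)
        .run();
```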
9. Modify the Application class
```java
package site.exciter.article;

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
// Scan the package that actually contains CrawlerMapper.
@MapperScan(basePackages = "site.exciter.article.dao")
public class ArticleApplication implements CommandLineRunner {

    @Autowired
    private ArticleTask articleTask;

    public static void main(String[] args) {
        SpringApplication.run(ArticleApplication.class, args);
    }

    @Override
    public void run(String... args) throws Exception {
        articleTask.crawl();
    }
}
```
10. Run the application; it starts crawling and writing the results to the database.
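To confirm rows are landing in the table, a quick spot check from any MySQL client (a hypothetical verification, not part of the original project):

```sql
-- Count the rows scraped so far, then inspect a few of them.
SELECT COUNT(*) FROM cms_content;
SELECT contentId, title, `date` FROM cms_content LIMIT 5;
```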
This concludes the walkthrough of implementing a crawler with Spring Boot and WebMagic and persisting the data with MyBatis.
Original article: https://juejin.cn/post/7018897037219332104