好得很程序员自学网

<tfoot draggable='sEl'></tfoot>

SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例

WebMagic 是一个开源 爬虫 框架,本项目通过在 SpringBoot 项目中使用WebMagic去抓取数据,最后使用MyBatis将 数据入库 。

本项目代码地址: ArticleCrawler: SpringBoot+WebMagic+MyBatis实现爬虫和数据入库 (gitee)

创建数据库:

本示例中库名为article,表名为cms_content,表中包含contentId、title、date三个字段。

?

1

2

3

4

5

6

-- CMS content table: one row per crawled article.
-- Fix: the original declared the date column as ` date ` (spaces inside the
-- backticks), which creates a column literally named " date " and breaks the
-- mapper's unquoted `date` reference in CrawlerMapper.xml's INSERT.
CREATE TABLE `cms_content` (
   `contentId` varchar(40) NOT NULL COMMENT '内容ID',
   `title` varchar(150) NOT NULL COMMENT '标题',
   `date` varchar(150) NOT NULL COMMENT '发布日期',
   PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';

新建SpringBoot项目:

1、配置依赖pom.xml

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

<?xml version="1.0" encoding="UTF-8"?>
<!-- Restored from the scraped article: the scraper injected "测试数据" into
     several names (maven.compiler, org.apache.commons, maven.aliyun.com) and
     inserted spaces inside XML tags, making the file unparseable. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <relativePath/>
    </parent>
    <groupId>com.example</groupId>
    <artifactId>Article</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Article</name>
    <description>Article</description>

    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>

        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.5</webmagic.core.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>

        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>

        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <!-- Exclude the log4j binding so SLF4J uses Spring Boot's logback. -->
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>

</project>

2、创建CmsContentPO.java

数据实体,和表中3个字段对应。

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

package site.exciter.article.model;

 

/**
 * Persistence model for one row of the cms_content table.
 * Mirrors the three columns: contentId (PK), title, date.
 */
public class CmsContentPO {

    // Primary key (cms_content.contentId), a caller-generated UUID string.
    private String contentId;

    // Article title (cms_content.title).
    private String title;

    // Publish date kept as raw text exactly as scraped (cms_content.date).
    private String date;

    public String getContentId() {
        return contentId;
    }

    public void setContentId(String contentId) {
        this.contentId = contentId;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }
}

3、创建CrawlerMapper.java

?

1

2

3

4

5

6

7

8

9

package site.exciter.article.dao;

 

import org.apache.ibatis.annotations.Mapper;

import site.exciter.article.model.CmsContentPO;

 

// MyBatis mapper; the SQL lives in resources/mapper/CrawlerMapper.xml,
// whose namespace must match this interface's fully-qualified name.
@Mapper
public interface CrawlerMapper {
     /**
      * Inserts one crawled article into cms_content.
      *
      * @param record entity carrying contentId, title and date
      * @return number of rows inserted (1 on success)
      */
     int addCmsContent(CmsContentPO record);
}

4、配置映射文件CrawlerMapper.xml

在resources下新建mapper文件夹,在mapper下创建CrawlerMapper.xml

?

1

2

3

4

5

6

7

8

9

10

11

12

13

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- Repaired from the scraped article: the original had spaces inside the XML
     tags ("<? xml", "< mapper ... >"), which is not well-formed XML and would
     fail to parse at startup. Namespace binds to the CrawlerMapper interface. -->
<mapper namespace="site.exciter.article.dao.CrawlerMapper">

    <!-- Inserts one article row; contentId is generated by the caller (pipeline). -->
    <insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO">
        insert into cms_content (contentId,
        title,
        date)
        values (#{contentId,jdbcType=VARCHAR},
        #{title,jdbcType=VARCHAR},
        #{date,jdbcType=VARCHAR})
    </insert>
</mapper>

5、配置application.properties

配置数据库和mybatis映射关系。

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

# mysql
# DataSource: Druid pool over MySQL (driver class matches mysql-connector-java 5.1.x).
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
# useUnicode/characterEncoding keep utf8 round-tripping; allowMultiQueries taken from the article as-is.
spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
# NOTE(review): hard-coded demo credentials — externalize before any real deployment.
spring.datasource.username=root
spring.datasource.password=root

# druid
# Pool sizing: start with 5 connections, keep at least 5 idle, cap at 10 active.
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
# Max wait for a connection from the pool, in milliseconds.
spring.datasource.druid.max-wait=60000
# Health-check query; validated only while idle (test-on-borrow/return disabled for speed).
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
# Eviction cadence and idle-lifetime bounds, all in milliseconds.
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000

# mybatis
# Mapper XML location (resources/mapper/CrawlerMapper.xml).
mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml

6、创建ArticlePageProcessor.java

解析html的逻辑。

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

package site.exciter.article;

 

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;

 

@Component

public class ArticlePageProcessor implements PageProcessor {

 

     private Site site = Site.me().setRetryTimes( 3 ).setSleepTime( 1000 );

 

     @Override

     public void process(Page page) {

         String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href" ;

         String next_page_xpath = "//*[@id='nav_next_page']/a/@href" ;

         String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)" ;

         String title_xpath = "//h1[@class='postTitle']/a/span/text()" ;

         String date_xpath = "//span[@id='post-date']/text()" ;

         page.putField( "title" , page.getHtml().xpath(title_xpath).toString());

         if (page.getResultItems().get( "title" ) == null ) {

             page.setSkip( true );

         }

         page.putField( "date" , page.getHtml().xpath(date_xpath).toString());

 

         if (page.getHtml().xpath(detail_urls_Xpath).match()) {

             Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath);

             page.addTargetRequests(detailUrls.all());

         }

 

         if (page.getHtml().xpath(next_page_xpath).match()) {

             Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath);

             page.addTargetRequests(nextPageUrl.all());

 

         } else if (page.getHtml().css(next_page_css).match()) {

             Selectable nextPageUrl = page.getHtml().css(next_page_css).links();

             page.addTargetRequests(nextPageUrl.all());

         }

     }

 

     @Override

     public Site getSite() {

         return site;

     }

}

7、创建ArticlePipeline.java

处理数据的持久化。

package  site.exciter.article;    import  org.slf4j.Logger;  import  org.slf4j.LoggerFactory;  import  org.springframework.beans.factory.annotation.Autowired;  import  org.springframework.stereotype.Component;  import  site.exciter.article.model.CmsContentPO;  import  site.exciter.article.dao.CrawlerMapper;  import  us.codecraft.webmagic.ResultItems;  import  us.codecraft.webmagic.Task;  import  us.codecraft.webmagic.pipeline.Pipeline;    import  java.util.UUID;    @Component  public   class  ArticlePipeline  implements  Pipeline {         private   static   final  Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline. class );        @Autowired       private  CrawlerMapper crawlerMapper;         public   void  process(ResultItems resultItems, Task task) {          String title = resultItems.get( "title" );          String date = resultItems.get( "date" );            CmsContentPO contentPO =  new  CmsContentPO();          contentPO.setContentId(UUID.randomUUID().toString());          contentPO.setTitle(title);          contentPO.setDate(date);             try  {               boolean  success = crawlerMapper.addCmsContent(contentPO) > 0;              LOGGER.info( "保存成功:{}" , title);          }  catch  (Exception ex) {              LOGGER.error( "保存失败" , ex);          }      }  } 

8、创建ArticleTask.java

执行抓取任务。

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

package site.exciter.article;

 

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Spider;

 

import java.util.concurrent.Executors;

import java.util.concurrent.ScheduledExecutorService;

import java.util.concurrent.TimeUnit;

 

@Component

public class ArticleTask {

     private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline. class );

 

     @Autowired

     private ArticlePipeline articlePipeline;

 

     @Autowired

     private ArticlePageProcessor articlePageProcessor;

 

     private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

 

     public void crawl() {

         // 定时任务,每10分钟爬取一次

         timer.scheduleWithFixedDelay(() -> {

             Thread.currentThread().setName( "ArticleCrawlerThread" );

 

             try {

                 Spider.create(articlePageProcessor)

                         .addUrl( "http://HdhCmsTestcnblogs测试数据/dick159/default.html?page=2" )

                         // 抓取到的数据存数据库

                         .addPipeline(articlePipeline)

                         // 开启5个线程抓取

                         .thread( 5 )

                         // 异步启动爬虫

                         .start();

             } catch (Exception ex) {

                 LOGGER.error( "定时抓取数据线程执行异常" , ex);

             }

         }, 0 , 10 , TimeUnit.MINUTES);

     }

}

9、修改Application

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

package site.exciter.article;

 

import org.mybatis.spring.annotation.MapperScan;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.boot.CommandLineRunner;

import org.springframework.boot.SpringApplication;

import org.springframework.boot.autoconfigure.SpringBootApplication;

 

@SpringBootApplication

@MapperScan (basePackages = "site.exciter.article.interface" )

public class ArticleApplication implements CommandLineRunner {

 

     @Autowired

     private ArticleTask articleTask;

 

     public static void main(String[] args) {

         SpringApplication.run(ArticleApplication. class , args);

     }

 

     @Override

     public void run(String... args) throws Exception {

         articleTask.crawl();

     }

}

10、执行application,开始抓数据并入库

到此这篇关于SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例的文章就介绍到这了,更多相关SpringBoot+WebMagic+MyBatis爬虫和数据入库内容请搜索以前的文章或继续浏览下面的相关文章希望大家以后多多支持!

原文链接:https://juejin.cn/post/7018897037219332104

查看更多关于SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例的详细内容...

  阅读:21次