第一步:
创建项目
scrapy startproject [name]
如 scrapy startproject choushibaike
第二步:
进入到项目的文件夹目录,创建爬虫(spider)
scrapy genspider baike lovehhy.net
第三步:
配置baike.py文件
# -*- coding: utf-8 -*-
import scrapy

from ..items import ChoushibaikeItem


class BaikeSpider(scrapy.Spider):
    """Scrape joke entries from lovehhy.net and follow the "next page" link."""

    name = 'baike'
    allowed_domains = ['lovehhy.net']
    start_urls = ['http://www.lovehhy.net/joke/Detail/QSBK']

    def parse(self, response):
        """Yield one ChoushibaikeItem per entry, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ChoushibaikeItem objects, followed by a ``scrapy.Request`` for the
            next listing page when the page contains a "下一页>>" link.
        """
        titles = response.xpath('//div[@class="cat_llb"]/h3/a/text()').extract()
        contents = response.xpath(
            '//div[@class="cat_llb"]/div[@id="endtext"]/text()'
        ).extract()
        # Bare text nodes of the entry container; sliced below into the
        # "time" and "click" fields.
        metas = response.xpath('//div[@class="cat_llb"]/text()').extract()

        # zip() stops at the shortest list, so only complete entries are emitted.
        for title, content, meta in zip(titles, contents, metas):
            item = ChoushibaikeItem()
            item['title'] = title
            item['content'] = content
            # NOTE(review): assumes the first 22 characters hold the timestamp
            # and the remainder (minus the last character) the click count --
            # confirm against the live page markup.
            item['time'] = meta[0:22]
            item['click'] = meta[22:-1]
            yield item

        next_url = response.xpath('//a[text()="下一页>>"]/@href').extract_first()
        # Only paginate when a next-page link exists; without this guard the
        # spider would re-request the current URL on the last page.
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
第四步:
配置items.py文件
import scrapy
class ChoushibaikeItem(scrapy.Item):
    """Container for one scraped joke entry."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    click = scrapy.Field()
第五步:
配置pipelines.py文件
import pymongo
class MongoPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item is written to a collection named after the item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection settings; the connection opens in open_spider.

        Args:
            mongo_uri: MongoDB connection URI.
            mongo_db: name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI and MONGO_DB settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item as a document and pass it on unchanged.

        Returns:
            The same item, so later pipelines can keep processing it.
        """
        name = item.__class__.__name__
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3.0 and removed in PyMongo 4.0.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
第六步:
配置settings.py文件
# -*- coding: utf-8 -*-
# Scrapy settings for choushibaike project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'choushibaike'

SPIDER_MODULES = ['choushibaike.spiders']
NEWSPIDER_MODULE = 'choushibaike.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enabled item pipelines mapped to their order value.
ITEM_PIPELINES = {
    # 'choushibaike.pipelines.ChoushibaikePipeline': 300,
    'choushibaike.pipelines.MongoPipeline': 400,
}

# MongoDB connection settings read by MongoPipeline.from_crawler().
# NOTE: the credentials/host part of the published URI was destroyed by e-mail
# obfuscation ("[email protected]"); replace the placeholders with your own values.
MONGO_URI = 'mongodb://admin:<password>@<host>:27017/'
MONGO_DB = 'choushibaike'
第七步:
运行项目
scrapy crawl baike
查看更多关于【菜鸟学Python】使用Scrapy框架爬取糗事百科的详细内容...
声明:本文来自网络,不代表【好得很程序员自学网】立场,转载请注明出处:http://www.haodehen.cn/did170497