Table of Contents

Scrapy Architecture Code

Scrapy commands

Overview of scrapy commands

scrapy command code scripts

Scrapy command scripts are stored in the scrapy/commands/ directory (a sketch of a custom command follows the file list):

bench.py
check.py
crawl.py
deploy.py
edit.py
fetch.py
genspider.py
list.py
parse.py
runspider.py
settings.py
shell.py
startproject.py
version.py
view.py
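
Each of these scripts defines a subclass of scrapy.commands.ScrapyCommand. As a hedged illustration (not part of the Scrapy source), a minimal custom command could look like the sketch below; the module path myproject/commands/hello.py and the project name are assumptions for the example:

# myproject/commands/hello.py -- hypothetical module (the package needs an __init__.py)
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):

    requires_project = True   # run only inside a Scrapy project

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the configured BOT_NAME (illustrative custom command)"

    def run(self, args, opts):
        # self.settings is populated by the scrapy command-line machinery
        print(self.settings.get('BOT_NAME'))

Such a command is registered through the COMMANDS_MODULE setting (see default_settings.py below), e.g. COMMANDS_MODULE = 'myproject.commands', after which it runs as scrapy hello.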

Scrapy Settings

Default settings

default_settings.py

"""
This module contains the default values for all settings used by Scrapy.
 
For more information about these settings you can read the settings
documentation in docs/topics/settings.rst
 
Scrapy developers, if you add a setting here remember to:
 
* add it in alphabetical order
* group similar settings without leaving blank lines
* add its documentation to the available settings documentation
  (docs/topics/settings.rst)
 
"""
 
import os
import sys
from importlib import import_module
from os.path import join, abspath, dirname
 
AJAXCRAWL_ENABLED = False
 
BOT_NAME = 'scrapybot'
 
CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0
 
COMMANDS_MODULE = ''
 
COMPRESSION_ENABLED = True
 
CONCURRENT_ITEMS = 100
 
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0
 
COOKIES_ENABLED = True
COOKIES_DEBUG = False
 
DEFAULT_ITEM_CLASS = 'scrapy.item.Item'
 
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
 
DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0
 
DNSCACHE_ENABLED = True
DNSCACHE_SIZE = 10000
DNS_TIMEOUT = 60
 
DOWNLOAD_DELAY = 0
 
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
 
DOWNLOAD_TIMEOUT = 180      # 3mins
 
DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024m
DOWNLOAD_WARNSIZE = 32*1024*1024    # 32m
 
DOWNLOADER = 'scrapy.core.downloader.Downloader'
 
DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory'
 
DOWNLOADER_MIDDLEWARES = {}
 
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
 
DOWNLOADER_STATS = True
 
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
 
try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'
 
EXTENSIONS = {}
 
EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}
 
FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}
 
HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False
 
ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
 
ITEM_PIPELINES = {}
ITEM_PIPELINES_BASE = {}
 
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
 
LOG_UNSERIALIZABLE_REQUESTS = False
 
LOGSTATS_INTERVAL = 60.0
 
MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None
 
MEMDEBUG_ENABLED = False        # enable memory debugging
MEMDEBUG_NOTIFY = []            # send memory debugging report by mail at engine shutdown
 
MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0
 
METAREFRESH_ENABLED = True
METAREFRESH_MAXDELAY = 100
 
NEWSPIDER_MODULE = ''
 
RANDOMIZE_DOWNLOAD_DELAY = True
 
REACTOR_THREADPOOL_MAXSIZE = 10
 
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2
 
REFERER_ENABLED = True
 
RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1
 
ROBOTSTXT_OBEY = False
 
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
 
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
 
SPIDER_MIDDLEWARES = {}
 
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}
 
SPIDER_MODULES = []
 
STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'
STATS_DUMP = True
 
STATSMAILER_RCPTS = []
 
TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
 
URLLENGTH_LIMIT = 2083
 
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__
 
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'
 
SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}
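
These defaults sit at the lowest settings priority; a project's settings.py overrides them at 'project' priority, and a spider's custom_settings overrides them again at 'spider' priority (applied by Spider.update_settings, shown later). A hedged sketch, where the project name, spider name and URL are assumptions:

# myproject/settings.py -- project-level overrides
BOT_NAME = 'myproject'
DOWNLOAD_DELAY = 1.0                    # default above is 0
CONCURRENT_REQUESTS_PER_DOMAIN = 4      # default above is 8

# myproject/spiders/quotes.py -- per-spider overrides via custom_settings
from scrapy import Spider

class QuotesSpider(Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']
    custom_settings = {
        'ROBOTSTXT_OBEY': True,     # overrides both the project and the default value
    }

    def parse(self, response):
        pass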

Some sections in default_settings.py and Custom Settings

scrapy engine

scrapy engine init and start

DownloadMiddleware

Instantiate DownloadMiddleware

DownloadMiddleware run Download

Request in Scrapy

Crawl and Spider

scrapy crawl command start and run Spider

Command to start a spider

scrapy crawl <spidername>

Crawling diagram: the basic steps for crawling are:

  1. Step 1: Call the class method update_settings of the Spider
  2. Step 2: Call the class method from_crawler to create the spider object
  3. Step 3: Call self.spider.start_requests(), which returns all requests to download before the spider is run:
    class Crawler(object):
    ................
        @defer.inlineCallbacks
        def crawl(self, *args, **kwargs):
            assert not self.crawling, "Crawling already taking place"
            self.crawling = True
     
            try:
                self.spider = self._create_spider(*args, **kwargs)
                self.engine = self._create_engine()
                start_requests = iter(self.spider.start_requests())
                yield self.engine.open_spider(self.spider, start_requests)
                yield defer.maybeDeferred(self.engine.start)
            except Exception:
                self.crawling = False
                raise
    ...........................
    class Spider(object_ref):
        """Base class for scrapy spiders. All spiders must inherit from this
        class.
        """
        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = cls(*args, **kwargs)
            spider._set_crawler(crawler)
            return spider
     
        def start_requests(self):
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
     
        def make_requests_from_url(self, url):
            return Request(url, dont_filter=True)
     
        def parse(self, response):
            raise NotImplementedError
     
        @classmethod
        def update_settings(cls, settings):
            settings.setdict(cls.custom_settings or {}, priority='spider')
     
        @classmethod
        def handles_request(cls, request):
            return url_is_from_spider(request.url, cls)
     
        @staticmethod
        def close(spider, reason):
            closed = getattr(spider, 'closed', None)
            if callable(closed):
                return closed(reason)        
  4. Step 4: Add the spider's start requests to the Scheduler for downloading
  5. Step 5: After a URL has been downloaded, the crawler calls the parse function
  6. Step 6: Continue downloading new requests: the crawler collects all requests yielded by parse and keeps downloading them
  7. Step 7: Process the downloaded data: the crawler collects all items yielded by parse and processes them
  8. Step 8: Every request has a callback, either parse(self, response) or _response_downloaded(self, response). After the request has been downloaded, Scrapy calls this callback to process the response (the same flow can also be driven programmatically, as in the sketch after this list). Then:
    • If the callback returns a Request → that request is downloaded next
    • If the callback returns an Item → it is passed to the pipeline for data processing
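
The same flow can be driven programmatically instead of through the scrapy crawl command. A minimal hedged sketch: CrawlerProcess wraps the Crawler shown above, schedules Crawler.crawl() and runs the Twisted reactor (the spider import path is an assumption):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# hypothetical spider; any Spider subclass with a name works here
from myproject.spiders.quotes import QuotesSpider

process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)   # creates a Crawler and schedules Crawler.crawl()
process.start()               # starts the reactor and blocks until crawling finishes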

Spider Classes

The algorithm for downloading and parsing responses in Spider

Spider class

class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """
 
    name = None
    custom_settings = None
 
    def __init__(self, name=None, **kwargs):
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError("%s must have a name" % type(self).__name__)
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []
 
    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})
 
    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level
 
        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)
 
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider
 
    def set_crawler(self, crawler):
        warnings.warn("set_crawler is deprecated, instantiate and bound the "
                      "spider to this crawler with from_crawler method "
                      "instead.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        assert not hasattr(self, 'crawler'), "Spider already bounded to a " \
                                             "crawler"
        self._set_crawler(crawler)
 
    def _set_crawler(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)
 
    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
 
    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)
 
    def parse(self, response):
        raise NotImplementedError
 
    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')
 
    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)
 
    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)
 
    def __str__(self):
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
 
    __repr__ = __str__

Description of the algorithm:

  1. Step 1: The Scrapy engine calls start_requests, which downloads all the links in start_urls:
    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)
  2. Step 2: Each response downloaded in Step 1 is processed by parse(self, response). The parse function returns the two data sets below (see the sketch after this list):
    1. Data set 1: the scraped web data, which is sent to the Pipeline for processing
    2. Data set 2: the new links, which Scrapy downloads and then feeds back to parse(self, response)
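
A hedged sketch of this algorithm in a concrete spider (the site, spider name and CSS selectors are assumptions for the example): parse yields both the scraped data for the pipeline and new requests for Scrapy to download:

from scrapy import Spider, Request

class BooksSpider(Spider):
    name = 'books'                                 # hypothetical spider name
    start_urls = ['http://books.toscrape.com/']    # example site

    def parse(self, response):
        # data set 1: scraped web data, sent on to the item pipelines
        for title in response.css('h3 a::attr(title)').extract():
            yield {'title': title}

        # data set 2: new links, downloaded by Scrapy and fed back to parse
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page), callback=self.parse)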

CrawlSpider class

CrawlSpider inherits from Spider and uses two basic algorithms (a usage sketch follows the source below):

⇒ This module implements CrawlSpider, the recommended spider for scraping typical web sites that require crawling pages.

"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
 
See documentation in docs/topics/spiders.rst
"""
 
import copy
 
from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider
 
def identity(x):
    return x
 
class Rule(object):
 
    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow
 
class CrawlSpider(Spider):
 
    rules = ()
 
    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()
 
    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
 
    def parse_start_url(self, response):
        return []
 
    def process_results(self, response, results):
        return results
 
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)
 
    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
 
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item
 
        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
 
    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)
 
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)
 
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider
 
    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
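
A hedged usage sketch (the domain, URL patterns and selectors are assumptions): the rules tell _requests_to_follow which links to extract, and the callback named in each Rule is resolved by _compile_rules and invoked from _response_downloaded:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleCrawlSpider(CrawlSpider):
    name = 'example_crawl'                 # hypothetical spider name
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # no callback, so follow defaults to True: keep crawling category pages
        Rule(LinkExtractor(allow=r'/category/')),
        # callback given, so follow defaults to False: parse item pages only
        Rule(LinkExtractor(allow=r'/item/'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
        }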

ItemPipeline for processing and storing data

refer: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

For ImagesPipeline and FilesPipeline specifically, the requests are issued and downloaded inside the pipeline's process_item function.

ItemPipeline Classes

Some public functions of a pipeline (a skeleton sketch follows the list):

@classmethod
def from_crawler(cls, crawler)
def open_spider(self, spider)
def process_item(self, item, spider)
def media_to_download(self, request, info)
def get_media_requests(self, item, info)
def media_downloaded(self, response, request, info)
def media_failed(self, failure, request, info)
def item_completed(self, results, item, info)
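
The first three hooks (from_crawler, open_spider, process_item) apply to any item pipeline, while media_to_download, get_media_requests, media_downloaded, media_failed and item_completed belong to MediaPipeline subclasses such as FilesPipeline and ImagesPipeline. A hedged skeleton of a plain item pipeline:

class SkeletonPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        # build the pipeline from crawler.settings or connect signals here
        return cls()

    def open_spider(self, spider):
        # called when the spider is opened: acquire resources here
        pass

    def close_spider(self, spider):
        # called when the spider is closed: release resources here
        pass

    def process_item(self, item, spider):
        # called for every item: return the item, or raise scrapy.exceptions.DropItem to discard it
        return item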

ItemPipelineManager

Notes on the diagram and a description of the pipeline algorithm's architecture:

ImagesPipeline

http://doc.scrapy.org/en/latest/topics/images.html
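
A hedged configuration sketch for enabling ImagesPipeline (the storage directory and item name are assumptions): the pipeline reads image URLs from the item's image_urls field and writes the download results back to its images field:

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/path/to/valid/dir'    # hypothetical storage directory

# items.py
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()   # input: list of image URLs to download
    images = scrapy.Field()       # output: filled in by the pipeline after download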

Store in FilesPipeline

With 2 public functions:

def persist_file(self, path, buf, info, meta=None, headers=None)
def stat_file(self, path, info)

Called from FilesPipeline:

def file_downloaded(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum

Write items to a JSON file

The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line serialized in JSON format:

import json
 
class JsonWriterPipeline(object):
 
    def open_spider(self, spider):
        self.file = open('items.jl', 'wb')
 
    def close_spider(self, spider):
        self.file.close()
 
    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

Note: The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.
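
For example, running scrapy crawl <spidername> -o items.jl from the command line writes every scraped item to items.jl, one JSON item per line, driven by the FEED_* settings listed in default_settings.py above.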

Write items to MongoDB

In this example we’ll write items to MongoDB using pymongo. MongoDB address and database name are specified in Scrapy settings; MongoDB collection is named after item class.

The main point of this example is to show how to use the from_crawler() method and how to clean up the resources properly:

import pymongo
 
class MongoPipeline(object):
 
    collection_name = 'scrapy_items'
 
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
 
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )
 
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
 
    def close_spider(self, spider):
        self.client.close()
 
    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
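
To activate this pipeline, it has to be registered in ITEM_PIPELINES, and the two settings read in from_crawler have to be defined; a hedged sketch (the module path and URI are assumptions):

# settings.py
ITEM_PIPELINES = {'myproject.pipelines.MongoPipeline': 300}   # hypothetical module path
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scraping'   # falls back to 'items' if omitted (see from_crawler above)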

Duplicates filter

A filter that looks for duplicate items, and drops those items that were already processed. Let’s say that our items have a unique id, but our spider returns multiples items with the same id:

from scrapy.exceptions import DropItem
 
class DuplicatesPipeline(object):
 
    def __init__(self):
        self.ids_seen = set()
 
    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item

scope of allowed_domains

allowed_domains is filtered in site-packages\scrapy\utils\url.py:

def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
 
    if host:
        return any(((host == d.lower()) or (host.endswith('.%s' % d.lower())) for d in domains))
    else:
        return False
 
def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))

and the spider calls it to check a request before downloading:

class Spider(object_ref): 
    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls) 
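
A small hedged demonstration of the matching rule in url_is_from_any_domain (a URL matches when its host equals an allowed domain or is a subdomain of it):

from scrapy.utils.url import url_is_from_any_domain

print(url_is_from_any_domain('http://www.example.com/page', ['example.com']))   # True  (subdomain)
print(url_is_from_any_domain('http://example.com/page', ['example.com']))       # True  (exact host)
print(url_is_from_any_domain('http://example.org/page', ['example.com']))       # False (other domain)
print(url_is_from_any_domain('http://notexample.com/page', ['example.com']))    # False (no '.' boundary)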

Integrate Scrapy with Other Systems

Scrapy can be integrated with the systems below: