Table of Contents
Scrapy Architecture Code
Scrapy commands
Overview of scrapy commands
- Scrapy command format
scrapy --help
Scrapy 1.0.3 - project: templatedownload

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  check         Check spider contracts
  commands
  crawl         Run a spider
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

Use "scrapy <command> -h" to see more info about a command
- For example:
scrapy startproject -h
Usage
=====
  scrapy startproject <project_name>

Create new project

Options
=======
--help, -h              show this help message and exit

Global Options
--------------
--logfile=FILE          log file. if omitted stderr will be used
--loglevel=LEVEL, -L LEVEL
                        log level (default: DEBUG)
--nolog                 disable logging completely
--profile=FILE          write python cProfile stats to FILE
--lsprof=FILE           write lsprof profiling stats to FILE
--pidfile=FILE          write process ID to FILE
--set=NAME=VALUE, -s NAME=VALUE
                        set/override setting (may be repeated)
--pdb                   enable pdb on failure
- content of script scrapy:
python -mscrapy.cmdline %*
- scrapy command diagram
scrapy command code scripts
The Scrapy command code scripts are stored in the scrapy/commands/ directory:
bench.py check.py crawl.py deploy.py edit.py fetch.py genspider.py list.py parse.py runspider.py settings.py shell.py startproject.py version.py view.py
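New commands can also be added to this list through the COMMANDS_MODULE setting; below is a minimal sketch, assuming a hypothetical project module templatedownload/commands/hello.py (the file name becomes the command name, so it would run as scrapy hello):

# Minimal custom command sketch; requires COMMANDS_MODULE = 'templatedownload.commands'
# in the project settings. The module path and command are hypothetical examples.
from scrapy.commands import ScrapyCommand

class HelloCommand(ScrapyCommand):

    requires_project = True          # only available inside a Scrapy project

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the bot name of the current project"

    def run(self, args, opts):
        # self.settings is populated by scrapy/cmdline.py before run() is called
        print(self.settings.get('BOT_NAME'))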
Scrapy Settings
Default settings
default_settings.py
""" This module contains the default values for all settings used by Scrapy. For more information about these settings you can read the settings documentation in docs/topics/settings.rst Scrapy developers, if you add a setting here remember to: * add it in alphabetical order * group similar settings without leaving blank lines * add its documentation to the available settings documentation (docs/topics/settings.rst) """ import os import sys from importlib import import_module from os.path import join, abspath, dirname AJAXCRAWL_ENABLED = False BOT_NAME = 'scrapybot' CLOSESPIDER_TIMEOUT = 0 CLOSESPIDER_PAGECOUNT = 0 CLOSESPIDER_ITEMCOUNT = 0 CLOSESPIDER_ERRORCOUNT = 0 COMMANDS_MODULE = '' COMPRESSION_ENABLED = True CONCURRENT_ITEMS = 100 CONCURRENT_REQUESTS = 16 CONCURRENT_REQUESTS_PER_DOMAIN = 8 CONCURRENT_REQUESTS_PER_IP = 0 COOKIES_ENABLED = True COOKIES_DEBUG = False DEFAULT_ITEM_CLASS = 'scrapy.item.Item' DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', } DEPTH_LIMIT = 0 DEPTH_STATS = True DEPTH_PRIORITY = 0 DNSCACHE_ENABLED = True DNSCACHE_SIZE = 10000 DNS_TIMEOUT = 60 DOWNLOAD_DELAY = 0 DOWNLOAD_HANDLERS = {} DOWNLOAD_HANDLERS_BASE = { 'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler', 'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler', 'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler', } DOWNLOAD_TIMEOUT = 180 # 3mins DOWNLOAD_MAXSIZE = 1024*1024*1024 # 1024m DOWNLOAD_WARNSIZE = 32*1024*1024 # 32m DOWNLOADER = 'scrapy.core.downloader.Downloader' DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory' DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory' DOWNLOADER_MIDDLEWARES = {} DOWNLOADER_MIDDLEWARES_BASE = { # Engine side 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100, 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300, 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350, 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400, 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550, 'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560, 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590, 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600, 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700, 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750, 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830, 'scrapy.downloadermiddlewares.stats.DownloaderStats': 850, 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900, # Downloader side } DOWNLOADER_STATS = True DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter' try: EDITOR = os.environ['EDITOR'] except KeyError: if sys.platform == 'win32': EDITOR = '%s -m idlelib.idle' else: EDITOR = 'vi' EXTENSIONS = {} EXTENSIONS_BASE = { 'scrapy.extensions.corestats.CoreStats': 0, 'scrapy.telnet.TelnetConsole': 0, 'scrapy.extensions.memusage.MemoryUsage': 0, 'scrapy.extensions.memdebug.MemoryDebugger': 0, 'scrapy.extensions.closespider.CloseSpider': 0, 'scrapy.extensions.feedexport.FeedExporter': 0, 
'scrapy.extensions.logstats.LogStats': 0, 'scrapy.extensions.spiderstate.SpiderState': 0, 'scrapy.extensions.throttle.AutoThrottle': 0, } FEED_URI = None FEED_URI_PARAMS = None # a function to extend uri arguments FEED_FORMAT = 'jsonlines' FEED_STORE_EMPTY = False FEED_EXPORT_FIELDS = None FEED_STORAGES = {} FEED_STORAGES_BASE = { '': 'scrapy.extensions.feedexport.FileFeedStorage', 'file': 'scrapy.extensions.feedexport.FileFeedStorage', 'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage', 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 'jl': 'scrapy.exporters.JsonLinesItemExporter', 'csv': 'scrapy.exporters.CsvItemExporter', 'xml': 'scrapy.exporters.XmlItemExporter', 'marshal': 'scrapy.exporters.MarshalItemExporter', 'pickle': 'scrapy.exporters.PickleItemExporter', } HTTPCACHE_ENABLED = False HTTPCACHE_DIR = 'httpcache' HTTPCACHE_IGNORE_MISSING = False HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] HTTPCACHE_DBM_MODULE = 'anydbm' HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager' ITEM_PIPELINES = {} ITEM_PIPELINES_BASE = {} LOG_ENABLED = True LOG_ENCODING = 'utf-8' LOG_FORMATTER = 'scrapy.logformatter.LogFormatter' LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s' LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S' LOG_STDOUT = False LOG_LEVEL = 'DEBUG' LOG_FILE = None LOG_UNSERIALIZABLE_REQUESTS = False LOGSTATS_INTERVAL = 60.0 MAIL_HOST = 'localhost' MAIL_PORT = 25 MAIL_FROM = 'scrapy@localhost' MAIL_PASS = None MAIL_USER = None MEMDEBUG_ENABLED = False # enable memory debugging MEMDEBUG_NOTIFY = [] # send memory debugging report by mail at engine shutdown MEMUSAGE_ENABLED = False MEMUSAGE_LIMIT_MB = 0 MEMUSAGE_NOTIFY_MAIL = [] MEMUSAGE_REPORT = False MEMUSAGE_WARNING_MB = 0 METAREFRESH_ENABLED = True METAREFRESH_MAXDELAY = 100 NEWSPIDER_MODULE = '' RANDOMIZE_DOWNLOAD_DELAY = True REACTOR_THREADPOOL_MAXSIZE = 10 REDIRECT_ENABLED = True REDIRECT_MAX_TIMES = 20 # uses Firefox default setting REDIRECT_PRIORITY_ADJUST = +2 REFERER_ENABLED = True RETRY_ENABLED = True RETRY_TIMES = 2 # initial response + 2 retries = 3 requests RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408] RETRY_PRIORITY_ADJUST = -1 ROBOTSTXT_OBEY = False SCHEDULER = 'scrapy.core.scheduler.Scheduler' SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue' SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue' SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader' SPIDER_MIDDLEWARES = {} SPIDER_MIDDLEWARES_BASE = { # Engine side 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500, 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700, 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800, 'scrapy.spidermiddlewares.depth.DepthMiddleware': 900, # Spider side } SPIDER_MODULES = [] STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector' STATS_DUMP = True STATSMAILER_RCPTS = [] TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates')) URLLENGTH_LIMIT = 2083 USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__ TELNETCONSOLE_ENABLED = 1 TELNETCONSOLE_PORT = [6023, 6073] 
TELNETCONSOLE_HOST = '127.0.0.1' SPIDER_CONTRACTS = {} SPIDER_CONTRACTS_BASE = { 'scrapy.contracts.default.UrlContract': 1, 'scrapy.contracts.default.ReturnsContract': 2, 'scrapy.contracts.default.ScrapesContract': 3, }
Some sections in default_settings.py and Custom Settings
- Bot name
BOT_NAME = 'scrapybot'
- Download handlers
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
- Download Middlewares:
DOWNLOADER_MIDDLEWARES = {}
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
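The _BASE dictionary holds the built-in defaults, while the empty DOWNLOADER_MIDDLEWARES dictionary is the hook a project overrides; a settings.py sketch (the custom middleware path is hypothetical):

# settings.py (sketch): enable a custom downloader middleware and disable a built-in one.
# The number is the middleware order; None removes the entry from the merged
# DOWNLOADER_MIDDLEWARES_BASE chain.
DOWNLOADER_MIDDLEWARES = {
    'templatedownload.middlewares.CustomProxyMiddleware': 543,   # hypothetical class
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}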
- Extensions:
EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}
- Feed storages
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
- Feed Exporters:
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}
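These exporters are selected through the feed-export settings; a settings.py sketch that writes CSV output (field names are hypothetical), roughly equivalent to running scrapy crawl <spidername> -o items.csv:

# settings.py (sketch): export scraped items as CSV via the exporters above.
FEED_FORMAT = 'csv'                    # key looked up in FEED_EXPORTERS / FEED_EXPORTERS_BASE
FEED_URI = 'file:///tmp/items.csv'     # the 'file' scheme is looked up in FEED_STORAGES_BASE
FEED_EXPORT_FIELDS = ['title', 'url']  # hypothetical item fields; also fixes the CSV column order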
- Spider Middlewares:
SPIDER_MIDDLEWARES = {}
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}
- Spider Modules
SPIDER_MODULES = []
scrapy engine
scrapy engine init and start
DownloadMiddleware
Instantiating DownloadMiddleware
How DownloadMiddleware runs a download
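A minimal sketch of a custom downloader middleware (a hypothetical class, not part of Scrapy) showing the two hooks that run around each download:

# Hypothetical downloader middleware sketch. process_request runs on the way
# from the engine to the downloader; process_response runs on the way back.
class SimpleHeaderMiddleware(object):

    def process_request(self, request, spider):
        # Tag the outgoing request; returning None lets the middleware chain continue.
        request.headers.setdefault('X-Debug-Spider', spider.name)
        return None

    def process_response(self, request, response, spider):
        spider.logger.debug("downloaded %s (%s)", response.url, response.status)
        return response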
Request in Scrapy
Crawl and Spider
The scrapy crawl command starts and runs a Spider
Command to start spider
scrapy crawl <spidername>
Crawling diagram and basic steps for crawling:
- Step 1: Call the Spider class method update_settings
- Step 2: Call the class method from_crawler to create the spider object
- Step 3: Call self.spider.start_requests(), which returns all the initial requests to download before the spider starts crawling (a sketch of driving this whole flow from a script follows this list)
class Crawler(object):
    ................

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"

        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise
    ...........................


class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)
- Step 4: Add the spider's requests to the Scheduler for downloading
- Step 5: After a URL has finished downloading, the Crawler calls the parse function
- Step 6: Continue downloading new requests: the Crawler takes every request yielded from parse and keeps downloading
- Step 7: Process the downloaded data: the Crawler takes every item yielded from parse and processes them
- Step 8: Every request has a callback, parse(self, response) or _response_downloaded(self, response); after the request is downloaded, Scrapy calls this callback to process the response. If:
- The callback returns a Request → Scrapy continues by downloading that request
- The callback returns an Item → it is passed to the pipeline for data processing
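As referenced in Step 3, here is a minimal sketch of driving this same flow from a script instead of the scrapy crawl command; the spider import path is hypothetical:

# Sketch: run a crawl programmatically. CrawlerProcess performs the steps above
# (update_settings, from_crawler, start_requests, scheduling, parse callbacks).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from templatedownload.spiders.example import ExampleSpider  # hypothetical spider module

process = CrawlerProcess(get_project_settings())
process.crawl(ExampleSpider)
process.start()  # blocks until the crawl finishes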
Spider Classes
The download and response-parsing algorithm in a Spider
Spider class
class Spider(object_ref): """Base class for scrapy spiders. All spiders must inherit from this class. """ name = None custom_settings = None def __init__(self, name=None, **kwargs): if name is not None: self.name = name elif not getattr(self, 'name', None): raise ValueError("%s must have a name" % type(self).__name__) self.__dict__.update(kwargs) if not hasattr(self, 'start_urls'): self.start_urls = [] @property def logger(self): logger = logging.getLogger(self.name) return logging.LoggerAdapter(logger, {'spider': self}) def log(self, message, level=logging.DEBUG, **kw): """Log the given message at the given log level This helper wraps a log call to the logger within the spider, but you can use it directly (e.g. Spider.logger.info('msg')) or use any other Python logger too. """ self.logger.log(level, message, **kw) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = cls(*args, **kwargs) spider._set_crawler(crawler) return spider def set_crawler(self, crawler): warnings.warn("set_crawler is deprecated, instantiate and bound the " "spider to this crawler with from_crawler method " "instead.", category=ScrapyDeprecationWarning, stacklevel=2) assert not hasattr(self, 'crawler'), "Spider already bounded to a " \ "crawler" self._set_crawler(crawler) def _set_crawler(self, crawler): self.crawler = crawler self.settings = crawler.settings crawler.signals.connect(self.close, signals.spider_closed) def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True) def parse(self, response): raise NotImplementedError @classmethod def update_settings(cls, settings): settings.setdict(cls.custom_settings or {}, priority='spider') @classmethod def handles_request(cls, request): return url_is_from_spider(request.url, cls) @staticmethod def close(spider, reason): closed = getattr(spider, 'closed', None) if callable(closed): return closed(reason) def __str__(self): return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) __repr__ = __str__
Description of the algorithm:
- Step 1: The Scrapy Engine calls start_requests, which downloads every link in start_urls:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)

def make_requests_from_url(self, url):
    return Request(url, dont_filter=True)
- Step 2: The responses downloaded in Step 1 are processed by parse(self, response). The parse function returns the data sets below (a sketch follows this list):
- Data set 1: the scraped web data, which is sent to the Pipeline for processing
- Data set 2: the new links, which Scrapy downloads and feeds back into parse(self, response)
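A sketch of a parse method that yields both data sets; the site, selectors, and field names are illustrative only:

# Sketch spider: parse() yields items (data set 1) and new Requests (data set 2).
from scrapy import Spider, Request

class BooksSpider(Spider):
    name = 'books'
    start_urls = ['http://books.toscrape.com/']   # example site used for illustration

    def parse(self, response):
        # Data set 1: scraped items, sent on to the item pipeline
        for title in response.css('h3 a::attr(title)').extract():
            yield {'title': title}
        # Data set 2: new links, downloaded by Scrapy and fed back into parse()
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page), callback=self.parse)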
CrawlSpider class
CrawlSpider inherits from Spider and uses two basic algorithms:
- A recursive algorithm that finds every URL linked from the start URL, building up the network of URLs connected to it
- A rule-based link-extraction algorithm that filters out only the URLs it wants to download
⇒ This module implements the CrawlSpider, which is the recommended spider for scraping typical websites that require crawling pages (a usage sketch follows the class source below).
""" This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages. See documentation in docs/topics/spiders.rst """ import copy from scrapy.http import Request, HtmlResponse from scrapy.utils.spider import iterate_spider_output from scrapy.spiders import Spider def identity(x): return x class Rule(object): def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity): self.link_extractor = link_extractor self.callback = callback self.cb_kwargs = cb_kwargs or {} self.process_links = process_links self.process_request = process_request if follow is None: self.follow = False if callback else True else: self.follow = follow class CrawlSpider(Spider): rules = () def __init__(self, *a, **kw): super(CrawlSpider, self).__init__(*a, **kw) self._compile_rules() def parse(self, response): return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True) def parse_start_url(self, response): return [] def process_results(self, response, results): return results def _requests_to_follow(self, response): if not isinstance(response, HtmlResponse): return seen = set() for n, rule in enumerate(self._rules): links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] if links and rule.process_links: links = rule.process_links(links) for link in links: seen.add(link) r = Request(url=link.url, callback=self._response_downloaded) r.meta.update(rule=n, link_text=link.text) yield rule.process_request(r) def _response_downloaded(self, response): rule = self._rules[response.meta['rule']] return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow) def _parse_response(self, response, callback, cb_kwargs, follow=True): if callback: cb_res = callback(response, **cb_kwargs) or () cb_res = self.process_results(response, cb_res) for requests_or_item in iterate_spider_output(cb_res): yield requests_or_item if follow and self._follow_links: for request_or_item in self._requests_to_follow(response): yield request_or_item def _compile_rules(self): def get_method(method): if callable(method): return method elif isinstance(method, basestring): return getattr(self, method, None) self._rules = [copy.copy(r) for r in self.rules] for rule in self._rules: rule.callback = get_method(rule.callback) rule.process_links = get_method(rule.process_links) rule.process_request = get_method(rule.process_request) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs) spider._follow_links = crawler.settings.getbool( 'CRAWLSPIDER_FOLLOW_LINKS', True) return spider def set_crawler(self, crawler): super(CrawlSpider, self).set_crawler(crawler) self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
ItemPipeline for processing and storing data
refer: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
For ImagesPipeline and FilesPipeline specifically, the media requests are issued and downloaded from within the pipeline's process_item processing.
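For example, the built-in ImagesPipeline is enabled purely through settings; a sketch (the store directory is hypothetical, and image_urls/images are Scrapy's default item field names):

# settings.py (sketch): turn on the built-in ImagesPipeline.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/tmp/scrapy-images'   # hypothetical directory for downloaded images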
ItemPipeline Classes
Some public methods of the pipeline classes:
@classmethod
def from_crawler(cls, crawler)

def open_spider(self, spider)
def process_item(self, item, spider)

def media_to_download(self, request, info)
def get_media_requests(self, item, info)
def media_downloaded(self, response, request, info)
def media_failed(self, failure, request, info)
def item_completed(self, results, item, info)
ItemPipelineManager
Notes for the diagram and a description of the pipeline algorithm's architecture:
- All pipeline classes act as middleware for processing the data parsed by the spiders (middleware here means that data parsed by every spider is sent, in turn, through all of these pipelines). The list of pipelines in use is stored in the program settings ITEM_PIPELINES and ITEM_PIPELINES_BASE:
ITEM_PIPELINES = {'templatedownload.pipelines.MyFilesPipeline': 300}   # the value is the pipeline order
- When the program starts, Scrapy calls the class methods of every pipeline class to construct it:
- Call the class method from_crawler of the pipeline class (cls is the pipeline class)
@classmethod
def from_crawler(cls, crawler):
- Call the class method from_settings of the pipeline class (cls is the pipeline class)
@classmethod
def from_settings(cls, settings)
⇒ With these class methods, the program can initialize basic attributes and construct the pipeline class inside from_crawler or from_settings
- Whenever a spider parses and returns any data, Scrapy calls the methods below on every pipeline class to process it, in the following order (a skeleton pipeline follows this list):
- Call the open_spider method
def open_spider(self, spider)
- Call the process_item method
def process_item(self, item, spider):
- Call the close_spider method
def close_spider(self, spider)
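As referenced above, a skeleton pipeline sketch illustrating that call order (from_crawler for construction, then open_spider, process_item per item, and close_spider):

class SkeletonPipeline(object):
    """Hypothetical pipeline used only to show the call order."""

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Construction step: crawler.settings, crawler.stats and signals are available here
        return cls(stats=crawler.stats)

    def open_spider(self, spider):
        spider.logger.info("pipeline opened for %s", spider.name)

    def process_item(self, item, spider):
        self.stats.inc_value('skeleton_pipeline/items')
        return item   # pass the item on to the next pipeline

    def close_spider(self, spider):
        spider.logger.info("pipeline closed for %s", spider.name)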
ImagesPipeline
The file store used by FilesPipeline
With two public functions:
def persist_file(self, path, buf, info, meta=None, headers=None)
def stat_file(self, path, info)
Called from FilesPipeline:
def file_downloaded(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum
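A sketch of customizing where files are stored by overriding file_path, the method that file_downloaded resolves above; the naming scheme here is intentionally naive (it ignores query strings and name collisions):

# Sketch: subclass FilesPipeline and override the storage path.
from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        # Keep the last URL path segment instead of the default SHA1-hash path
        return 'full/%s' % request.url.split('/')[-1]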
Write items to a JSON file
The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line serialized in JSON format:
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.jl', 'wb')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
Note: The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.
Write items to MongoDB
In this example we’ll write items to MongoDB using pymongo. MongoDB address and database name are specified in Scrapy settings; MongoDB collection is named after item class.
The main point of this example is to show how to use the from_crawler() method and how to clean up resources properly:
import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
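The pipeline above reads two custom settings; a project would wire it up roughly as follows (the module path is hypothetical):

# settings.py (sketch)
ITEM_PIPELINES = {'templatedownload.pipelines.MongoPipeline': 300}   # hypothetical module path
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'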
Duplicates filter
A filter that looks for duplicate items, and drops those items that were already processed. Let's say that our items have a unique id, but our spider returns multiple items with the same id:
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
scope of allowed_domains
allowed_domains is filtered in site-packages\scrapy\utils\url.py:
def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if host:
        return any(((host == d.lower()) or (host.endswith('.%s' % d.lower())) for d in domains))
    else:
        return False

def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
and the spider calls it to check each request before downloading:
class Spider(object_ref):

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)
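A quick sketch of what this check accepts: the exact host and any subdomain of an allowed domain match, other hosts do not:

# Sketch: behaviour of the allowed_domains check shown above.
from scrapy.utils.url import url_is_from_any_domain

print(url_is_from_any_domain('http://www.example.com/page', ['example.com']))  # True, subdomain
print(url_is_from_any_domain('http://example.com/page', ['example.com']))      # True, exact host
print(url_is_from_any_domain('http://example.org/page', ['example.com']))      # False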
Integrate Scrapy with Other Systems
Integration is possible with the systems below:
- Database: MySQL, MongoDB
- Cache: Redis Cache, Cm Cache → you can start multiple spider instances that share a single Redis queue; best suited for broad multi-domain crawls (a configuration sketch follows)
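As referenced above, a configuration sketch for sharing one Redis queue between spider instances, assuming the third-party scrapy-redis package; the setting names follow that project's documentation and may differ between versions:

# settings.py (sketch, requires the scrapy-redis package)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # requests are queued in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # dedup shared across instances
SCHEDULER_PERSIST = True                                    # keep the queue between runs
REDIS_URL = 'redis://localhost:6379'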