====== Scrapy Architecture Code ======

===== Scrapy commands =====

==== Overview of scrapy commands ====

  * Scrapy command format:

<code>
scrapy --help
Scrapy 1.0.3 - project: templatedownload

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  check         Check spider contracts
  commands
  crawl         Run a spider
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

Use "scrapy <command> -h" to see more info about a command
</code>

  * For example:

<code>
scrapy startproject -h
Usage
=====
  scrapy startproject <project_name>

Create new project

Options
=======
--help, -h              show this help message and exit

Global Options
--------------
--logfile=FILE          log file. if omitted stderr will be used
--loglevel=LEVEL, -L LEVEL
                        log level (default: DEBUG)
--nolog                 disable logging completely
--profile=FILE          write python cProfile stats to FILE
--lsprof=FILE           write lsprof profiling stats to FILE
--pidfile=FILE          write process ID to FILE
--set=NAME=VALUE, -s NAME=VALUE
                        set/override setting (may be repeated)
--pdb                   enable pdb on failure
</code>

  * Content of the **scrapy** script:

<code>
python -mscrapy.cmdline %*
</code>

  * scrapy command diagram:

{{:crawler:scrapy-command.png|}}

==== scrapy command code scripts ====

Scrapy command code scripts are stored in the directory **scrapy/commands/**:

<code>
bench.py     check.py      crawl.py         deploy.py   edit.py
fetch.py     genspider.py  list.py          parse.py    runspider.py
settings.py  shell.py      startproject.py  version.py  view.py
</code>
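Projects can also register their own commands alongside the built-in scripts above through the **COMMANDS_MODULE** setting (listed in the defaults below). A minimal sketch, assuming a hypothetical module **templatedownload/commands/listdomains.py** and **COMMANDS_MODULE = 'templatedownload.commands'** in the project settings:

<code python>
# templatedownload/commands/listdomains.py -- hypothetical custom command
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    """Example command: list each spider with its allowed domains."""

    requires_project = True

    def short_desc(self):
        # Shown next to the command name in "scrapy --help"
        return "List spiders and their allowed domains (example)"

    def run(self, args, opts):
        # self.crawler_process is prepared by scrapy.cmdline for project commands
        loader = self.crawler_process.spider_loader
        for name in loader.list():
            cls = loader.load(name)
            print(name, getattr(cls, 'allowed_domains', []))
</code>

With that in place, the new command would show up in the command listing and run as **scrapy listdomains**.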
===== Scrapy Settings =====

==== Default settings ====

default_settings.py:

<code python>
"""
This module contains the default values for all settings used by Scrapy.

For more information about these settings you can read the settings
documentation in docs/topics/settings.rst

Scrapy developers, if you add a setting here remember to:

* add it in alphabetical order
* group similar settings without leaving blank lines
* add its documentation to the available settings documentation
  (docs/topics/settings.rst)
"""

import os
import sys
from importlib import import_module
from os.path import join, abspath, dirname

AJAXCRAWL_ENABLED = False

BOT_NAME = 'scrapybot'

CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0

COMMANDS_MODULE = ''

COMPRESSION_ENABLED = True

CONCURRENT_ITEMS = 100

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0

COOKIES_ENABLED = True
COOKIES_DEBUG = False

DEFAULT_ITEM_CLASS = 'scrapy.item.Item'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0

DNSCACHE_ENABLED = True
DNSCACHE_SIZE = 10000
DNS_TIMEOUT = 60

DOWNLOAD_DELAY = 0

DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}

DOWNLOAD_TIMEOUT = 180      # 3mins

DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024m
DOWNLOAD_WARNSIZE = 32*1024*1024    # 32m

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory'

DOWNLOADER_MIDDLEWARES = {}

DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

DOWNLOADER_STATS = True

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'

EXTENSIONS = {}

EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}

FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}

HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False

ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'

ITEM_PIPELINES = {}
ITEM_PIPELINES_BASE = {}

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
LOG_UNSERIALIZABLE_REQUESTS = False

LOGSTATS_INTERVAL = 60.0

MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None

MEMDEBUG_ENABLED = False        # enable memory debugging
MEMDEBUG_NOTIFY = []            # send memory debugging report by mail at engine shutdown

MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0

METAREFRESH_ENABLED = True
METAREFRESH_MAXDELAY = 100

NEWSPIDER_MODULE = ''

RANDOMIZE_DOWNLOAD_DELAY = True

REACTOR_THREADPOOL_MAXSIZE = 10

REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2

REFERER_ENABLED = True

RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1

ROBOTSTXT_OBEY = False

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'

SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'

SPIDER_MIDDLEWARES = {}

SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}

SPIDER_MODULES = []

STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'
STATS_DUMP = True

STATSMAILER_RCPTS = []

TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

URLLENGTH_LIMIT = 2083

USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__

TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'

SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}
</code>

==== Some sections in default_settings.py and custom settings ====

  * Bot name:

<code python>
BOT_NAME = 'scrapybot'
</code>

  * Download handlers:

<code python>
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
</code>

  * Downloader middlewares:

<code python>
DOWNLOADER_MIDDLEWARES = {}
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
</code>

  * Extensions:

<code python>
EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}
</code>

  * Feed storages:

<code python>
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
</code>

  * Feed exporters:

<code python>
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}
</code>

  * Spider middlewares:

<code python>
SPIDER_MIDDLEWARES = {}
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}
</code>

  * Spider modules:

<code python>
SPIDER_MODULES = []
</code>
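The **_BASE** dictionaries above ship with Scrapy and stay untouched; a project overrides behaviour through the non-_BASE settings in its own settings.py, which are merged on top of these defaults at runtime. A minimal sketch of such an override, assuming a hypothetical project called **templatedownload** and a hypothetical middleware class **templatedownload.middlewares.CustomProxyMiddleware**:

<code python>
# settings.py of a hypothetical Scrapy project
BOT_NAME = 'templatedownload'

SPIDER_MODULES = ['templatedownload.spiders']
NEWSPIDER_MODULE = 'templatedownload.spiders'

# Merged with DOWNLOADER_MIDDLEWARES_BASE; the number sets the position in the
# chain, and None disables a middleware that the base dictionary enables.
DOWNLOADER_MIDDLEWARES = {
    'templatedownload.middlewares.CustomProxyMiddleware': 760,  # hypothetical class
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
}

# Slow the crawl down compared to the defaults listed above
DOWNLOAD_DELAY = 0.5
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# Feed export settings (see the FEED_* defaults above)
FEED_FORMAT = 'jsonlines'
FEED_URI = 'file:///tmp/items.jl'
</code>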
===== scrapy engine =====

==== scrapy engine init and start ====

{{:crawler:scrapyengine_init.png|}}

==== DownloadMiddleware ====

{{:crawler:scrapy_downloadmiddlewareclass.png|}}

=== Instantiate DownloadMiddleware ===

{{:crawler:scrapy_downloadmiddleware_instant.png|}}

=== DownloadMiddleware runs the download ===

{{:crawler:scrapy_downloadmiddleware_download.png|}}

==== Request in Scrapy ====

==== Crawl and Spider ====

=== scrapy crawl command: start and run a Spider ===

Command to start a spider:

<code>
scrapy crawl <spider_name>
</code>

Crawling diagram:

{{:crawler:scrapy_crawlcommand_start.png|}}

Basic steps for crawling:

  - Step 1: Call the class method **update_settings** of the Spider
  - Step 2: Call the class method **from_crawler** to create the spider object
  - Step 3: Call self.spider.**start_requests()**, which returns all requests to download, before running the spider to download them

The relevant code in Crawler and Spider:

<code python>
class Crawler(object):
    ................

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"

        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise
    ...........................


class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)
</code>

  - Step 4: Add the spider's requests to the Scheduler for downloading
  - Step 5: After a URL has finished downloading, the Crawler calls the callback function **parse**
  - Step 6: **Continue downloading new requests**: the Crawler collects all requests yielded from **parse** and continues downloading them
  - Step 7: **Process the downloaded data**: the Crawler collects all items yielded from **parse** and processes them
  - Step 8: Every request has a callback, **parse(self, response)** or **_response_downloaded(self, response)**; after the request is downloaded, Scrapy calls this callback to process the response (see the sketch below). Depending on what the callback returns:
    * If the callback yields **Request** objects, these requests are downloaded next
    * If the callback yields **Item** objects, they are sent to the pipeline for data processing
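To make these steps concrete, here is a small hand-written spider sketch; the name, domain, and CSS selectors are invented for illustration and are not taken from the Scrapy source quoted above. Its **parse** callback yields items (handled in Step 7) as well as new requests (handled in Step 6):

<code python>
import scrapy


class QuotesSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the crawl steps above
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    # Merged into the settings by update_settings() in Step 1
    custom_settings = {'DOWNLOAD_DELAY': 1.0}

    def parse(self, response):
        # Items yielded here are sent to the item pipelines (Step 7)
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
            }
        # Requests yielded here are scheduled and downloaded next (Step 6)
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
</code>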
""" name = None custom_settings = None def __init__(self, name=None, **kwargs): if name is not None: self.name = name elif not getattr(self, 'name', None): raise ValueError("%s must have a name" % type(self).__name__) self.__dict__.update(kwargs) if not hasattr(self, 'start_urls'): self.start_urls = [] @property def logger(self): logger = logging.getLogger(self.name) return logging.LoggerAdapter(logger, {'spider': self}) def log(self, message, level=logging.DEBUG, **kw): """Log the given message at the given log level This helper wraps a log call to the logger within the spider, but you can use it directly (e.g. Spider.logger.info('msg')) or use any other Python logger too. """ self.logger.log(level, message, **kw) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = cls(*args, **kwargs) spider._set_crawler(crawler) return spider def set_crawler(self, crawler): warnings.warn("set_crawler is deprecated, instantiate and bound the " "spider to this crawler with from_crawler method " "instead.", category=ScrapyDeprecationWarning, stacklevel=2) assert not hasattr(self, 'crawler'), "Spider already bounded to a " \ "crawler" self._set_crawler(crawler) def _set_crawler(self, crawler): self.crawler = crawler self.settings = crawler.settings crawler.signals.connect(self.close, signals.spider_closed) def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True) def parse(self, response): raise NotImplementedError @classmethod def update_settings(cls, settings): settings.setdict(cls.custom_settings or {}, priority='spider') @classmethod def handles_request(cls, request): return url_is_from_spider(request.url, cls) @staticmethod def close(spider, reason): closed = getattr(spider, 'closed', None) if callable(closed): return closed(reason) def __str__(self): return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) __repr__ = __str__ Mô tả thuật toán: - Step1: Scrapy Engine gọi hàm **start_requests** thực hiện việc download tất cả các link trong **start_urls**: def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True) - Step2: Response được download trong step1 sẽ được xử lý trong hàm **parse(self, response)**. Hàm parse trả về các tập dữ liệu bên dưới - Tập dữ liệu 1: Tập gồm các dữ liệu web được gởi tới Pipeline để xử lý - Tập dữ liệu 2: Tập gồm các link mới sẽ được download bởi scrapy và gởi tới hàm **parse(self, response)** === CrawlSpider class === **CrawlSpider kế thừa từ spider** và sử dụng 2 thuật toán cơ bản: * **Thuật toán đệ quy** để tìm tất cả url liên kết với url khởi tạo và tạo thành mạng lưới url liên kết với nó * **Thuật toán extract links** dựa theo rule để lọc ra những url mà nó muốn download => This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages. """ This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages. 
=== CrawlSpider class ===

**CrawlSpider inherits from Spider** and uses two basic algorithms:

  * a **recursive algorithm** that finds all URLs linked from the start URL, building up the network of URLs linked to it
  * a **link extraction algorithm** that uses rules to filter out the URLs it wants to download

=> This module implements the CrawlSpider, which is the recommended spider to use for scraping typical web sites that require crawling pages.

<code python>
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.

See documentation in docs/topics/spiders.rst
"""

import copy

from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider


def identity(x):
    return x


class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow


class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
</code>
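A usage sketch for the class above (the domain and URL patterns are invented): each **Rule** pairs a **LinkExtractor** with an optional callback, and matching links are routed through **_response_downloaded** as shown in the source.

<code python>
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleCrawlSpider(CrawlSpider):
    # Hypothetical CrawlSpider illustrating rule-based link extraction
    name = 'example_crawl'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # Follow category pages without parsing them (no callback => follow=True)
        Rule(LinkExtractor(allow=r'/category/')),
        # Parse item pages; with a callback set, follow defaults to False
        Rule(LinkExtractor(allow=r'/item/\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.css('h1::text').extract_first(),
        }
</code>

Note that a CrawlSpider should not override **parse** itself, because the base class uses **parse** to drive the rule processing.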
==== ItemPipeline for processing and storing data ====

Refer to: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

For **ImagesPipeline and FilesPipeline** in particular, the requests are **issued and downloaded inside the pipeline's process_item function**.

=== ItemPipeline Classes ===

{{:crawler:scrapy_pipeline.png|}}

With some public functions of a pipeline:

<code python>
@classmethod
def from_crawler(cls, crawler)

def open_spider(self, spider)
def process_item(self, item, spider)
def media_to_download(self, request, info)
def get_media_requests(self, item, info)
def media_downloaded(self, response, request, info)
def media_failed(self, failure, request, info)
def item_completed(self, results, item, info)
</code>

=== ItemPipelineManager ===

{{:crawler:scrapy_itempipelinemanager.png|}}

Notes on the diagram and a description of the pipeline architecture:

  * All **pipeline classes** act as middleware for processing the data parsed by the spiders (middleware here means that **the data parsed by every spider is sent, in turn, to all of these pipelines**). The list of pipelines in use is stored in the program settings **ITEM_PIPELINES** and **ITEM_PIPELINES_BASE**:

<code python>
ITEM_PIPELINES = {'templatedownload.pipelines.MyFilesPipeline': 1}
</code>

  * When the **program starts up**, Scrapy **calls the class methods of every pipeline class to construct it**:
    - Call the class method **from_crawler** of the pipeline class (cls is the pipeline class): @classmethod def from_crawler(cls, crawler)
    - Call the class method **from_settings** of the pipeline class (cls is the pipeline class): @classmethod def from_settings(cls, settings)

=> With these class methods, the program can **initialize basic attributes** and **construct the pipeline object** in from_crawler or from_settings.

  * When a **spider parses and returns any data**, Scrapy calls the methods below on every pipeline class to handle it, in the following order:
    - Call open_spider: def open_spider(self, spider)
    - Call process_item: def process_item(self, item, spider)
    - Call close_spider: def close_spider(self, spider)

=== ImagesPipeline ===

http://doc.scrapy.org/en/latest/topics/images.html

{{:crawler:scrapy_imagepipeline.png|}}

=== Store in FilesPipeline ===

{{:crawler:scrapyfilestore.png|}}

With 2 public functions:

<code python>
def persist_file(self, path, buf, info, meta=None, headers=None)
def stat_file(self, path, info)
</code>

Called from FilesPipeline:

<code python>
def file_downloaded(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum
</code>

=== Write items to a JSON file ===

The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line serialized in JSON format:

<code python>
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.jl', 'wb')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
</code>

Note: The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.

=== Write items to MongoDB ===

In this example we'll write items to MongoDB using pymongo. The MongoDB address and database name are specified in Scrapy settings; the MongoDB collection is named after the item class. The main point of this example is to show how to use the from_crawler() method and how to clean up the resources properly:

<code python>
import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
</code>
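To actually run pipelines such as the examples above, they have to be enabled in **ITEM_PIPELINES** with an order value (lower numbers run closer to the spider). A sketch, assuming the classes live in a hypothetical **templatedownload/pipelines.py**:

<code python>
# settings.py of a hypothetical project
ITEM_PIPELINES = {
    'templatedownload.pipelines.MongoPipeline': 300,       # store items in MongoDB
    'templatedownload.pipelines.JsonWriterPipeline': 800,  # then write items.jl
}

# Settings read by MongoPipeline.from_crawler() in the example above
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'
</code>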
=== Duplicates filter ===

A filter that looks for duplicate items, and drops those items that were already processed. Let's say that our items have a unique id, but our spider returns multiple items with the same id:

<code python>
from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
</code>

==== Scope of allowed_domains ====

allowed_domains is filtered in **site-packages\scrapy\utils\url.py**:

<code python>
def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if host:
        return any(((host == d.lower()) or (host.endswith('.%s' % d.lower())) for d in domains))
    else:
        return False


def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
</code>

and the spider calls it to check a request before downloading:

<code python>
class Spider(object_ref):

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)
</code>

==== Integrate Scrapy with Other Systems ====

Scrapy can be integrated with the systems below:

  * Database: MySQL, MongoDB
  * Cache: Redis Cache, Cm Cache -> you can **start multiple spider instances that share a single redis queue**, best suited for **broad multi-domain crawls** (see the settings sketch below).
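As an illustration of the Redis option, below is a hedged settings sketch based on the third-party **scrapy-redis** project; the setting names come from that project's documentation, not from Scrapy core, so verify them against the version you install:

<code python>
# settings.py additions for sharing one crawl queue between spider instances
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'               # replaces scrapy.core.scheduler.Scheduler
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'   # shared request fingerprints in Redis
SCHEDULER_PERSIST = True                                     # keep the queue between runs
REDIS_URL = 'redis://localhost:6379'

# Every process started with "scrapy crawl <spider>" then pops requests from the
# same Redis queue, which is what lets a broad multi-domain crawl scale out.
</code>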