====== Scrapy Architecture Code ======
===== Scrapy commands =====
==== Overview about scrapy commands ====
  * Scrapy command format:<code>
scrapy --help
Scrapy 1.0.3 - project: templatedownload

Usage:
  scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  check         Check spider contracts
  commands
  crawl         Run a spider
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

Use "scrapy <command> -h" to see more info about a command
</code>
  * For example:<code>
scrapy startproject -h
Usage
=====
  scrapy startproject <project_name>

Create new project

Options
=======
--help, -h              show this help message and exit

Global Options
--------------
--logfile=FILE          log file. if omitted stderr will be used
--loglevel=LEVEL, -L LEVEL
                        log level (default: DEBUG)
--nolog                 disable logging completely
--profile=FILE          write python cProfile stats to FILE
--lsprof=FILE           write lsprof profiling stats to FILE
--pidfile=FILE          write process ID to FILE
--set=NAME=VALUE, -s NAME=VALUE
                        set/override setting (may be repeated)
--pdb                   enable pdb on failure
</code>
  * Content of the scrapy script (Windows wrapper), which simply dispatches to scrapy.cmdline (a sketch of a custom command follows this list):<code>
python -mscrapy.cmdline %*
</code>
  * scrapy command diagram
(diagram: scrapy command flow)
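Because the scrapy script only dispatches to scrapy.cmdline, a project can plug extra commands in through the COMMANDS_MODULE setting shown in the defaults below. The following is a hedged sketch of such a command; the module name mycommands and the command body are invented for illustration, and it assumes Scrapy's behaviour of discovering ScrapyCommand subclasses in the module named by COMMANDS_MODULE:<code python>
# mycommands/botname.py -- hypothetical custom command, enabled with
# COMMANDS_MODULE = 'mycommands' in the project settings.
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the project's bot name"

    def run(self, args, opts):
        # self.settings is attached by scrapy.cmdline before run() is called.
        print(self.settings.get('BOT_NAME'))
</code>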
==== scrapy command code scripts ====
Scrapy command code scripts are stored in the directory **scrapy/commands**:<code>
bench.py
check.py
crawl.py
deploy.py
edit.py
fetch.py
genspider.py
list.py
parse.py
runspider.py
settings.py
shell.py
startproject.py
version.py
view.py
</code>
===== Scrapy Settings =====
==== Default settings ====
default_settings.py:
<code python>
"""
This module contains the default values for all settings used by Scrapy.

For more information about these settings you can read the settings
documentation in docs/topics/settings.rst

Scrapy developers, if you add a setting here remember to:

* add it in alphabetical order
* group similar settings without leaving blank lines
* add its documentation to the available settings documentation
  (docs/topics/settings.rst)

"""

import os
import sys
from importlib import import_module
from os.path import join, abspath, dirname

AJAXCRAWL_ENABLED = False

BOT_NAME = 'scrapybot'

CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0

COMMANDS_MODULE = ''

COMPRESSION_ENABLED = True

CONCURRENT_ITEMS = 100

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0

COOKIES_ENABLED = True
COOKIES_DEBUG = False

DEFAULT_ITEM_CLASS = 'scrapy.item.Item'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0

DNSCACHE_ENABLED = True
DNSCACHE_SIZE = 10000
DNS_TIMEOUT = 60

DOWNLOAD_DELAY = 0

DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}

DOWNLOAD_TIMEOUT = 180      # 3mins

DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024m
DOWNLOAD_WARNSIZE = 32*1024*1024    # 32m

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory'

DOWNLOADER_MIDDLEWARES = {}

DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

DOWNLOADER_STATS = True

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'

EXTENSIONS = {}

EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.extensions.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}

FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}

HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False

ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'

ITEM_PIPELINES = {}
ITEM_PIPELINES_BASE = {}

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None

LOG_UNSERIALIZABLE_REQUESTS = False

LOGSTATS_INTERVAL = 60.0

MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None

MEMDEBUG_ENABLED = False        # enable memory debugging
MEMDEBUG_NOTIFY = []            # send memory debugging report by mail at engine shutdown

MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0

METAREFRESH_ENABLED = True
METAREFRESH_MAXDELAY = 100

NEWSPIDER_MODULE = ''

RANDOMIZE_DOWNLOAD_DELAY = True

REACTOR_THREADPOOL_MAXSIZE = 10

REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2

REFERER_ENABLED = True

RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1

ROBOTSTXT_OBEY = False

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'

SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'

SPIDER_MIDDLEWARES = {}

SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}

SPIDER_MODULES = []

STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'
STATS_DUMP = True

STATSMAILER_RCPTS = []

TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

URLLENGTH_LIMIT = 2083

USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__

TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'

SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}
</code>
==== Some sections in default_settings.py and custom settings ====
These are the groups of default settings most often overridden by a project (a sketch of such overrides follows the list):
  * Bot name:<code python>
BOT_NAME = 'scrapybot'
</code>
  * Download handlers:<code python>
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
</code>
  * Downloader middlewares:<code python>
DOWNLOADER_MIDDLEWARES = {}
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
</code>
  * Extensions:<code python>
EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.extensions.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}
</code>
  * Feed storages:<code python>
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
</code>
  * Feed exporters:<code python>
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}
</code>
  * Spider middlewares:<code python>
SPIDER_MIDDLEWARES = {}
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}
</code>
  * Spider modules:<code python>
SPIDER_MODULES = []
</code>
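The defaults above are merged with project and per-spider settings at run time. As a rough sketch (the module paths under myproject are placeholders, not part of the original page), a project's settings.py typically overrides a handful of these groups:<code python>
# settings.py of a hypothetical project "myproject": override a few defaults.
BOT_NAME = 'myproject'
SPIDER_MODULES = ['myproject.spiders']
NEWSPIDER_MODULE = 'myproject.spiders'

# Merged with DOWNLOADER_MIDDLEWARES_BASE; None disables a built-in middleware.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CustomProxyMiddleware': 750,   # hypothetical class
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}

# Enable an item pipeline (lower number = runs earlier).
ITEM_PIPELINES = {
    'myproject.pipelines.JsonWriterPipeline': 300,        # hypothetical class
}
</code>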
===== scrapy engine =====
==== scrapy engine init and start ====
(diagram: scrapy engine init and start)
==== DownloadMiddleware ====
(diagram: DownloadMiddleware classes)
=== Instantiate DownloadMiddleware ===
(diagram: instantiating DownloadMiddleware)
=== DownloadMiddleware runs a Download ===
(diagram: DownloadMiddleware running a download)
==== Request in Scrapy ====
==== Crawl and Spider ====
=== scrapy crawl command start and run Spider ===
Command to start a spider:<code>
scrapy crawl <spider_name>
</code>
crawling diagram:
(diagram: crawling flow)
Basic steps for crawling:
  - Step 1: call the class method **update_settings** of the Spider
  - Step 2: call the class method **from_crawler** to create the spider object
  - Step 3: call self.spider.**start_requests()**, which returns all initial requests for downloading:<code python>
class Crawler(object):
    ................
    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            self.crawling = False
            raise
...........................
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)
</code>
  - Step 4: add the spider's requests to the scheduler for downloading
  - Step 5: after a URL has been downloaded, the crawler calls the **parse** callback
  - Step 6: **continue downloading new requests**: the crawler takes all requests yielded from **parse** and keeps downloading them
  - Step 7: **process the downloaded data**
  - Step 8: every request has a callback, by default **parse(self, response)**; depending on what the callback yields:
    * if the callback yields a **Request**, Scrapy continues by downloading that request
    * if the callback yields an **Item**, it is passed to the item pipelines for data processing
A minimal spider illustrating steps 5-8 is sketched below.
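The sketch below (spider name, site and CSS selectors are invented for illustration, not taken from the original page) yields both Items and follow-up Requests from parse, exercising the two branches of step 8:<code python>
import scrapy

class QuotesSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate steps 5-8 above.
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Yielding a dict/Item sends it to the item pipelines (step 8, Item case).
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
            }
        # Yielding a Request schedules another download (step 8, Request case).
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
</code>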
=== Spider Classes ===
(diagram: Spider class hierarchy)
=== Download and parse-response algorithm in Spider ===
Spider class:<code python>
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    name = None
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError("%s must have a name" % type(self).__name__)
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def set_crawler(self, crawler):
        warnings.warn("set_crawler is deprecated, instantiate and bound the "
                      "spider to this crawler with from_crawler method "
                      "instead.",
                      category=ScrapyDeprecationWarning, stacklevel=2)
        assert not hasattr(self, 'crawler'), "Spider already bounded to a " \
                                             "crawler"
        self._set_crawler(crawler)

    def _set_crawler(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)

    def __str__(self):
        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__
</code>
Description of the algorithm:
  - Step 1: the Scrapy engine calls **start_requests**, which creates a download request for every link in **start_urls**:<code python>
    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True)
</code>
  - Step 2: the response downloaded in step 1 is processed in **parse(self, response)**, which can return two kinds of data (a sketch of a spider that customizes **start_requests** follows this list):
    * Data set 1: scraped web data, which is sent to the item pipelines for processing
    * Data set 2: new links, which Scrapy will download and whose responses are sent back to **parse(self, response)** (or another callback)
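A spider can also build its initial requests itself instead of relying on start_urls; the sketch below follows the pattern shown in the Scrapy documentation, with URLs, form fields and callback names that are purely illustrative:<code python>
import scrapy

class LoginSpider(scrapy.Spider):
    # Hypothetical spider: builds its own initial requests instead of start_urls.
    name = 'login_example'

    def start_requests(self):
        # Step 1 equivalent: the engine iterates over whatever this method returns.
        return [scrapy.FormRequest('http://www.example.com/login',
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]

    def logged_in(self, response):
        # Step 2 equivalent: parse the response and yield follow-up requests.
        for href in response.css('a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_page)

    def parse_page(self, response):
        yield {'url': response.url,
               'title': response.css('title::text').extract_first()}
</code>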
=== CrawlSpider class ===
**CrawlSpider inherits from Spider** and relies on two basic algorithms:
  * a **recursive algorithm** that follows every URL linked from the start URLs, building up the network of URLs connected to them
  * a **link-extraction algorithm** that applies rules to filter out only the URLs it actually wants to download
=> This module implements the CrawlSpider, which is the recommended spider for scraping typical web sites that require crawling pages.
<code python>
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.

See documentation in docs/topics/spiders.rst
"""

import copy

from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider

def identity(x):
    return x

class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow

class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, basestring):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
</code>
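For comparison with the source above, here is a sketch of how a CrawlSpider is typically declared, following the pattern in the Scrapy documentation; the domain, rules and XPath selectors are illustrative only:<code python>
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleCrawlSpider(CrawlSpider):
    # Hypothetical CrawlSpider showing how rules drive the recursive crawl.
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # Follow category pages but do not parse them (no callback => follow=True).
        Rule(LinkExtractor(allow=(r'category\.php',))),
        # Parse item pages with parse_item (callback given => follow defaults to False).
        Rule(LinkExtractor(allow=(r'item\.php',)), callback='parse_item'),
    )

    def parse_item(self, response):
        yield {
            'id': response.xpath('//td[@id="item_id"]/text()').extract_first(),
            'name': response.xpath('//td[@id="item_name"]/text()').extract_first(),
        }
</code>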
==== ItemPipeline for processing and storing data ====
refer: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

The **ImagesPipeline and FilesPipeline** are special cases and are described in their own sections below.

=== ItemPipeline Classes ===
(diagram: ItemPipeline class hierarchy)

With some public functions for pipelines:<code python>
@classmethod
def from_crawler(cls, crawler)
def open_spider(self, spider)
def process_item(self, item, spider)
def media_to_download(self, request, info)
def get_media_requests(self, item, info)
def media_downloaded(self, response, request, info)
def media_failed(self, failure, request, info)
def item_completed(self, results, item, info)
</code>
=== ItemPipelineManager ===
(diagram: ItemPipelineManager)
Notes on the diagram and a description of the pipeline architecture (a combined sketch follows this list):
  * All **pipeline classes** act as middleware that processes data after it has been parsed by the spiders (middleware here means that **items parsed by every spider are sent, in order, through all of these pipelines**). The list of pipelines in use is stored in the **ITEM_PIPELINES** and **ITEM_PIPELINES_BASE** settings:<code python>
ITEM_PIPELINES = {
    # project-specific mapping of pipeline class path -> order
}
</code>
  * When **the program starts**, Scrapy **calls the class methods of every pipeline class to construct it**:
    - it calls the class method **from_crawler** of the pipeline class (cls is the pipeline class):<code python>
@classmethod
def from_crawler(cls, crawler)
</code>
    - it calls the class method **from_settings**:<code python>
@classmethod
def from_settings(cls, settings)
</code>
  * While a **spider runs and returns parsed data**, Scrapy calls the methods below on every pipeline class to process it, in the following order:
    - call **open_spider**:<code python>
def open_spider(self, spider)
</code>
    - call **process_item**:<code python>
def process_item(self, item, spider)
</code>
    - call **close_spider**:<code python>
def close_spider(self, spider)
</code>
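Putting the hooks together, a minimal pipeline might look like the sketch below; the class name, the STATS_PREFIX setting and the myproject module path are hypothetical, chosen only to show the call order described above:<code python>
# pipelines.py -- hypothetical pipeline showing the hooks in the order Scrapy calls them.
class StatsLoggingPipeline(object):

    def __init__(self, prefix):
        self.prefix = prefix
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        # Called once at startup to build the pipeline from the crawler settings.
        return cls(prefix=crawler.settings.get('STATS_PREFIX', 'items'))  # STATS_PREFIX is hypothetical

    def open_spider(self, spider):
        # Called when the spider is opened.
        self.count = 0

    def process_item(self, item, spider):
        # Called for every item yielded by the spider.
        self.count += 1
        return item

    def close_spider(self, spider):
        # Called when the spider finishes.
        spider.logger.info("%s: %d items scraped" % (self.prefix, self.count))

# settings.py -- activate it; the number controls ordering across pipelines.
# ITEM_PIPELINES = {'myproject.pipelines.StatsLoggingPipeline': 100}
</code>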
=== ImagesPipeline ===
https://doc.scrapy.org/en/latest/topics/media-pipeline.html

(diagram: ImagesPipeline classes)
=== Store in FilesPipeline ===
(diagram: FilesPipeline store classes)

With 2 public functions:<code python>
def persist_file(self, path, buf, info, meta=None, headers=None)
def stat_file(self, path, info)
</code>
Called from FilesPipeline:<code python>
def file_downloaded(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum
</code>
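To use these store classes in a project, the built-in pipelines are enabled and pointed at a storage URI. The sketch below uses the standard FILES_STORE/IMAGES_STORE settings and the conventional file_urls/files and image_urls/images item fields; the paths are illustrative:<code python>
# settings.py -- enable the built-in media pipelines and choose a store backend.
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
    'scrapy.pipelines.images.ImagesPipeline': 2,
}
FILES_STORE = '/data/files'      # local path -> FSFilesStore; 's3://bucket/path' -> S3FilesStore
IMAGES_STORE = '/data/images'

# items.py -- the default field names the media pipelines look at.
import scrapy

class MediaItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
</code>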
=== Write items to a JSON file ===
The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line, serialized in JSON format:<code python>
import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('items.jl', 'wb')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
</code>
Note: the purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.
=== Write items to MongoDB ===
In this example we'll write items to MongoDB using pymongo. The MongoDB address and database name are specified in the Scrapy settings; the MongoDB collection is named after the item class.

The main point of this example is to show how to use the from_crawler() method and how to clean up the resources properly:<code python>
import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
</code>
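To wire this pipeline in, the settings it reads (MONGO_URI, MONGO_DATABASE) have to be defined and the pipeline enabled; a minimal sketch, where the myproject module path and the concrete values are placeholders:<code python>
# settings.py -- activate the pipeline and provide its connection settings.
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,   # 'myproject' is a placeholder module path
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scraping'
</code>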
=== Duplicates filter ===
A filter that looks for duplicate items, and drops those items that were already processed. Let's say that our items have a unique id, but our spider returns multiple items with the same id:<code python>
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
</code>
==== scope of allowed_domains ====
allowed_domains is filtered in **site-packages\scrapy\utils\url.py**:<code python>
def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()

    if host:
        return any(((host == d.lower()) or (host.endswith('.%s' % d.lower()))) for d in domains)
    else:
        return False

def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
</code>
and the spider calls it to check before downloading:<code python>
class Spider(object_ref):
    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)
</code>
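In practice this scope is what limits which yielded links get crawled: the OffsiteMiddleware (enabled by default in SPIDER_MIDDLEWARES_BASE) also checks allowed_domains. A sketch, with illustrative domains:<code python>
import scrapy

class ScopedSpider(scrapy.Spider):
    # Hypothetical spider: only example.com and its subdomains pass the domain check.
    name = 'scoped'
    allowed_domains = ['example.com']          # sub.example.com matches, example.org does not
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            # Off-site requests yielded here are dropped by the OffsiteMiddleware,
            # which also checks allowed_domains.
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
</code>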
==== Integrate Scrapy with Other Systems ====
Scrapy can be integrated with the systems below:
  * Databases: MySQL, MongoDB
  * Caches: Redis cache, Cm Cache -> you can **start multiple spider instances that share a single redis queue**, which is best suited for **broad multi-domain crawls** (see the settings sketch below).
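Sharing one redis queue between spider instances is usually done with the third-party scrapy-redis package; the following is a sketch of its settings, assuming that package is installed (key names follow that project, and the redis URL is illustrative):<code python>
# settings.py -- a sketch assuming the third-party scrapy-redis package is installed.
# All spider instances started with these settings pull requests from one shared queue.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # replaces the default scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # shared request dedup in redis
SCHEDULER_PERSIST = True                                    # keep the queue between runs
REDIS_URL = 'redis://localhost:6379'
</code>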