| + | ====== Scrapy Architecture Code ====== | ||
| + | ===== Scrapy commands ===== | ||
| + | ==== Overview about scrapy commands ==== | ||
| + | * Scrapy command format< | ||
| + | scrapy --help | ||
| + | Scrapy 1.0.3 - project: templatedownload | ||
| + | Usage: | ||
| + | scrapy < | ||
| + | |||
| + | Available commands: | ||
| + | bench Run quick benchmark test | ||
| + | check Check spider contracts | ||
| + | commands | ||
| + | crawl Run a spider | ||
| + | edit Edit spider | ||
| + | fetch Fetch a URL using the Scrapy downloader | ||
| + | genspider | ||
| + | list List available spiders | ||
| + | parse Parse URL (using its spider) and print the results | ||
| + | runspider | ||
| + | settings | ||
| + | shell | ||
| + | startproject | ||
| + | version | ||
| + | view Open URL in browser, as seen by Scrapy | ||
| + | |||
| + | Use " | ||
| + | * For example:< | ||
| + | scrapy startproject -h | ||
| + | Usage | ||
| + | ===== | ||
| + | scrapy startproject < | ||
| + | |||
| + | Create new project | ||
| + | |||
| + | Options | ||
| + | ======= | ||
| + | --help, -h show this help message and exit | ||
| + | |||
| + | Global Options | ||
| + | -------------- | ||
| + | --logfile=FILE | ||
| + | --loglevel=LEVEL, | ||
| + | log level (default: DEBUG) | ||
| + | --nolog | ||
| + | --profile=FILE | ||
| + | --lsprof=FILE | ||
| + | --pidfile=FILE | ||
| + | --set=NAME=VALUE, | ||
| + | set/ | ||
| + | --pdb | ||
| + | </ | ||
| + | * content of script scrapy:< | ||
| + | python -mscrapy.cmdline %* | ||
| + | </ | ||
| + | * scrapy command diagram | ||
| + | {{: | ||
| + | ==== scrapy command code scripts ==== | ||
| + | Scrapy command code scripts are stored in directory **scrapy/ | ||
| + | bench.py | ||
| + | check.py | ||
| + | crawl.py | ||
| + | deploy.py | ||
| + | edit.py | ||
| + | fetch.py | ||
| + | genspider.py | ||
| + | list.py | ||
| + | parse.py | ||
| + | runspider.py | ||
| + | settings.py | ||
| + | shell.py | ||
| + | startproject.py | ||
| + | version.py | ||
| + | view.py | ||
| + | </ | ||
| + | ===== Scrapy Settings ===== | ||
| + | ==== Default settings ==== | ||
| + | default_settings.py | ||
| + | <code python> | ||
| + | """ | ||
| + | This module contains the default values for all settings used by Scrapy. | ||
| + | |||
| + | For more information about these settings you can read the settings | ||
| + | documentation in docs/ | ||
| + | |||
| + | Scrapy developers, if you add a setting here remember to: | ||
| + | |||
| + | * add it in alphabetical order | ||
| + | * group similar settings without leaving blank lines | ||
| + | * add its documentation to the available settings documentation | ||
| + | (docs/ | ||
| + | |||
| + | """ | ||
| + | |||
| + | import os | ||
| + | import sys | ||
| + | from importlib import import_module | ||
| + | from os.path import join, abspath, dirname | ||
| + | |||
| + | AJAXCRAWL_ENABLED = False | ||
| + | |||
| + | BOT_NAME = ' | ||
| + | |||
| + | CLOSESPIDER_TIMEOUT = 0 | ||
| + | CLOSESPIDER_PAGECOUNT = 0 | ||
| + | CLOSESPIDER_ITEMCOUNT = 0 | ||
| + | CLOSESPIDER_ERRORCOUNT = 0 | ||
| + | |||
| + | COMMANDS_MODULE = '' | ||
| + | |||
| + | COMPRESSION_ENABLED = True | ||
| + | |||
| + | CONCURRENT_ITEMS = 100 | ||
| + | |||
| + | CONCURRENT_REQUESTS = 16 | ||
| + | CONCURRENT_REQUESTS_PER_DOMAIN = 8 | ||
| + | CONCURRENT_REQUESTS_PER_IP = 0 | ||
| + | |||
| + | COOKIES_ENABLED = True | ||
| + | COOKIES_DEBUG = False | ||
| + | |||
| + | DEFAULT_ITEM_CLASS = ' | ||
| + | |||
| + | DEFAULT_REQUEST_HEADERS = { | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | DEPTH_LIMIT = 0 | ||
| + | DEPTH_STATS = True | ||
| + | DEPTH_PRIORITY = 0 | ||
| + | |||
| + | DNSCACHE_ENABLED = True | ||
| + | DNSCACHE_SIZE = 10000 | ||
| + | DNS_TIMEOUT = 60 | ||
| + | |||
| + | DOWNLOAD_DELAY = 0 | ||
| + | |||
| + | DOWNLOAD_HANDLERS = {} | ||
| + | DOWNLOAD_HANDLERS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | DOWNLOAD_TIMEOUT = 180 # 3mins | ||
| + | |||
| + | DOWNLOAD_MAXSIZE = 1024*1024*1024 | ||
| + | DOWNLOAD_WARNSIZE = 32*1024*1024 | ||
| + | |||
| + | DOWNLOADER = ' | ||
| + | |||
| + | DOWNLOADER_HTTPCLIENTFACTORY = ' | ||
| + | DOWNLOADER_CLIENTCONTEXTFACTORY = ' | ||
| + | |||
| + | DOWNLOADER_MIDDLEWARES = {} | ||
| + | |||
| + | DOWNLOADER_MIDDLEWARES_BASE = { | ||
| + | # Engine side | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | # Downloader side | ||
| + | } | ||
| + | |||
| + | DOWNLOADER_STATS = True | ||
| + | |||
| + | DUPEFILTER_CLASS = ' | ||
| + | |||
| + | try: | ||
| + | EDITOR = os.environ[' | ||
| + | except KeyError: | ||
| + | if sys.platform == ' | ||
| + | EDITOR = '%s -m idlelib.idle' | ||
| + | else: | ||
| + | EDITOR = ' | ||
| + | |||
| + | EXTENSIONS = {} | ||
| + | |||
| + | EXTENSIONS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | FEED_URI = None | ||
| + | FEED_URI_PARAMS = None # a function to extend uri arguments | ||
| + | FEED_FORMAT = ' | ||
| + | FEED_STORE_EMPTY = False | ||
| + | FEED_EXPORT_FIELDS = None | ||
| + | FEED_STORAGES = {} | ||
| + | FEED_STORAGES_BASE = { | ||
| + | '': | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | FEED_EXPORTERS = {} | ||
| + | FEED_EXPORTERS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | HTTPCACHE_ENABLED = False | ||
| + | HTTPCACHE_DIR = ' | ||
| + | HTTPCACHE_IGNORE_MISSING = False | ||
| + | HTTPCACHE_STORAGE = ' | ||
| + | HTTPCACHE_EXPIRATION_SECS = 0 | ||
| + | HTTPCACHE_IGNORE_HTTP_CODES = [] | ||
| + | HTTPCACHE_IGNORE_SCHEMES = [' | ||
| + | HTTPCACHE_DBM_MODULE = ' | ||
| + | HTTPCACHE_POLICY = ' | ||
| + | HTTPCACHE_GZIP = False | ||
| + | |||
| + | ITEM_PROCESSOR = ' | ||
| + | |||
| + | ITEM_PIPELINES = {} | ||
| + | ITEM_PIPELINES_BASE = {} | ||
| + | |||
| + | LOG_ENABLED = True | ||
| + | LOG_ENCODING = ' | ||
| + | LOG_FORMATTER = ' | ||
| + | LOG_FORMAT = ' | ||
| + | LOG_DATEFORMAT = ' | ||
| + | LOG_STDOUT = False | ||
| + | LOG_LEVEL = ' | ||
| + | LOG_FILE = None | ||
| + | |||
| + | LOG_UNSERIALIZABLE_REQUESTS = False | ||
| + | |||
| + | LOGSTATS_INTERVAL = 60.0 | ||
| + | |||
| + | MAIL_HOST = ' | ||
| + | MAIL_PORT = 25 | ||
| + | MAIL_FROM = ' | ||
| + | MAIL_PASS = None | ||
| + | MAIL_USER = None | ||
| + | |||
| + | MEMDEBUG_ENABLED = False # enable memory debugging | ||
| + | MEMDEBUG_NOTIFY = [] # send memory debugging report by mail at engine shutdown | ||
| + | |||
| + | MEMUSAGE_ENABLED = False | ||
| + | MEMUSAGE_LIMIT_MB = 0 | ||
| + | MEMUSAGE_NOTIFY_MAIL = [] | ||
| + | MEMUSAGE_REPORT = False | ||
| + | MEMUSAGE_WARNING_MB = 0 | ||
| + | |||
| + | METAREFRESH_ENABLED = True | ||
| + | METAREFRESH_MAXDELAY = 100 | ||
| + | |||
| + | NEWSPIDER_MODULE = '' | ||
| + | |||
| + | RANDOMIZE_DOWNLOAD_DELAY = True | ||
| + | |||
| + | REACTOR_THREADPOOL_MAXSIZE = 10 | ||
| + | |||
| + | REDIRECT_ENABLED = True | ||
| + | REDIRECT_MAX_TIMES = 20 # uses Firefox default setting | ||
| + | REDIRECT_PRIORITY_ADJUST = +2 | ||
| + | |||
| + | REFERER_ENABLED = True | ||
| + | |||
| + | RETRY_ENABLED = True | ||
| + | RETRY_TIMES = 2 # initial response + 2 retries = 3 requests | ||
| + | RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408] | ||
| + | RETRY_PRIORITY_ADJUST = -1 | ||
| + | |||
| + | ROBOTSTXT_OBEY = False | ||
| + | |||
| + | SCHEDULER = ' | ||
| + | SCHEDULER_DISK_QUEUE = ' | ||
| + | SCHEDULER_MEMORY_QUEUE = ' | ||
| + | |||
| + | SPIDER_LOADER_CLASS = ' | ||
| + | |||
| + | SPIDER_MIDDLEWARES = {} | ||
| + | |||
| + | SPIDER_MIDDLEWARES_BASE = { | ||
| + | # Engine side | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | # Spider side | ||
| + | } | ||
| + | |||
| + | SPIDER_MODULES = [] | ||
| + | |||
| + | STATS_CLASS = ' | ||
| + | STATS_DUMP = True | ||
| + | |||
| + | STATSMAILER_RCPTS = [] | ||
| + | |||
| + | TEMPLATES_DIR = abspath(join(dirname(__file__), | ||
| + | |||
| + | URLLENGTH_LIMIT = 2083 | ||
| + | |||
| + | USER_AGENT = ' | ||
| + | |||
| + | TELNETCONSOLE_ENABLED = 1 | ||
| + | TELNETCONSOLE_PORT = [6023, 6073] | ||
| + | TELNETCONSOLE_HOST = ' | ||
| + | |||
| + | SPIDER_CONTRACTS = {} | ||
| + | SPIDER_CONTRACTS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | |||
| + | </ | ||
| + | ==== Some sections in default_settings.py and Custom Settings === | ||
| + | * Bot name< | ||
| + | BOT_NAME = ' | ||
| + | </ | ||
| + | * Download handlers< | ||
| + | DOWNLOAD_HANDLERS = {} | ||
| + | DOWNLOAD_HANDLERS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | </ | ||
| + | * Download Middlewares:< | ||
| + | DOWNLOADER_MIDDLEWARES = {} | ||
| + | DOWNLOADER_MIDDLEWARES_BASE = { | ||
| + | # Engine side | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | # Downloader side | ||
| + | } | ||
| + | </ | ||
| + | * Extensions:< | ||
| + | EXTENSIONS = {} | ||
| + | EXTENSIONS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | </ | ||
| + | * Feed storages< | ||
| + | FEED_STORAGES = {} | ||
| + | FEED_STORAGES_BASE = { | ||
| + | '': | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | </ | ||
| + | * Feed Exporters:< | ||
| + | FEED_EXPORTERS = {} | ||
| + | FEED_EXPORTERS_BASE = { | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | } | ||
| + | </ | ||
| + | * Spider Middlewares: | ||
| + | SPIDER_MIDDLEWARES = {} | ||
| + | SPIDER_MIDDLEWARES_BASE = { | ||
| + | # Engine side | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | ' | ||
| + | # Spider side | ||
| + | } | ||
| + | </ | ||
| + | * Spider Modules< | ||
| + | SPIDER_MODULES = [] | ||
| + | </ | ||
| + | |||
| + | ===== scrapy engine ===== | ||
| + | ==== scrapy engine init and start ==== | ||
| + | {{: | ||
| + | ==== DownloadMiddleware ==== | ||
| + | {{: | ||
| + | === Instant DownloadMiddleware === | ||
| + | {{: | ||
| + | === DownloadMiddleware run Download === | ||
| + | {{: | ||
| + | ==== Request in Scrapy ==== | ||
| + | ==== Crawl and Spider ==== | ||
| + | === scrapy crawl command start and run Spider === | ||
| + | Command to start spider< | ||
| + | scrapy crawl < | ||
| + | </ | ||
| + | crawling diagram: | ||
| + | {{: | ||
| + | Basic Steps for crawling: | ||
| + | - Step1: Call Class Method **update_settings** of Spider | ||
| + | - Step2: Call Class Method **from_crawler** to create spider Object | ||
| + | - Step3: Call Method self.spider.**start_requests()** return all requests for downloading | ||
| + | class Crawler(object): | ||
| + | ................ | ||
| + | @defer.inlineCallbacks | ||
| + | def crawl(self, *args, **kwargs): | ||
| + | assert not self.crawling, | ||
| + | self.crawling = True | ||
| + | |||
| + | try: | ||
| + | self.spider = self._create_spider(*args, | ||
| + | self.engine = self._create_engine() | ||
| + | start_requests = iter(self.spider.start_requests()) | ||
| + | yield self.engine.open_spider(self.spider, | ||
| + | yield defer.maybeDeferred(self.engine.start) | ||
| + | except Exception: | ||
| + | self.crawling = False | ||
| + | raise | ||
| + | ........................... | ||
| + | class Spider(object_ref): | ||
| + | """ | ||
| + | class. | ||
| + | """ | ||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | spider = cls(*args, **kwargs) | ||
| + | spider._set_crawler(crawler) | ||
| + | return spider | ||
| + | | ||
| + | def start_requests(self): | ||
| + | for url in self.start_urls: | ||
| + | yield self.make_requests_from_url(url) | ||
| + | | ||
| + | def make_requests_from_url(self, | ||
| + | return Request(url, | ||
| + | | ||
| + | def parse(self, response): | ||
| + | raise NotImplementedError | ||
| + | | ||
| + | @classmethod | ||
| + | def update_settings(cls, | ||
| + | settings.setdict(cls.custom_settings or {}, priority=' | ||
| + | |||
| + | @classmethod | ||
| + | def handles_request(cls, | ||
| + | return url_is_from_spider(request.url, | ||
| + | |||
| + | @staticmethod | ||
| + | def close(spider, | ||
| + | closed = getattr(spider, | ||
| + | if callable(closed): | ||
| + | return closed(reason) | ||
| + | </ | ||
| + | - Step4: Add spider to Shedule for downloading | ||
| + | - Step5: After download url finished, Crawler calling function **parse** | ||
| + | - Step6: **Continue download new requests**: Crawler get all requests yielded from function **parse** and continue downloading | ||
| + | - Step7: **process data downloaded**: | ||
| + | - Step8: Mỗi request đều cố hàm callback **parse(self, | ||
| + | * Hàm callback trả về giá trị là **Request** -> sẽ tiếp tục download những request này | ||
| + | * Nếu hàm callback trả về giá trị là **Item** -> đưa vô pipeline đễ xử lý data | ||
| + | |||
| + | === Spider Classes === | ||
| + | {{: | ||
| + | === Thuật toán download và parse response trong Spider === | ||
| + | Spider class< | ||
| + | class Spider(object_ref): | ||
| + | """ | ||
| + | class. | ||
| + | """ | ||
| + | |||
| + | name = None | ||
| + | custom_settings = None | ||
| + | |||
| + | def __init__(self, | ||
| + | if name is not None: | ||
| + | self.name = name | ||
| + | elif not getattr(self, | ||
| + | raise ValueError(" | ||
| + | self.__dict__.update(kwargs) | ||
| + | if not hasattr(self, | ||
| + | self.start_urls = [] | ||
| + | |||
| + | @property | ||
| + | def logger(self): | ||
| + | logger = logging.getLogger(self.name) | ||
| + | return logging.LoggerAdapter(logger, | ||
| + | |||
| + | def log(self, message, level=logging.DEBUG, | ||
| + | """ | ||
| + | |||
| + | This helper wraps a log call to the logger within the spider, but you | ||
| + | can use it directly (e.g. Spider.logger.info(' | ||
| + | Python logger too. | ||
| + | """ | ||
| + | self.logger.log(level, | ||
| + | |||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | spider = cls(*args, **kwargs) | ||
| + | spider._set_crawler(crawler) | ||
| + | return spider | ||
| + | |||
| + | def set_crawler(self, | ||
| + | warnings.warn(" | ||
| + | " | ||
| + | " | ||
| + | category=ScrapyDeprecationWarning, | ||
| + | assert not hasattr(self, | ||
| + | " | ||
| + | self._set_crawler(crawler) | ||
| + | |||
| + | def _set_crawler(self, | ||
| + | self.crawler = crawler | ||
| + | self.settings = crawler.settings | ||
| + | crawler.signals.connect(self.close, | ||
| + | |||
| + | def start_requests(self): | ||
| + | for url in self.start_urls: | ||
| + | yield self.make_requests_from_url(url) | ||
| + | |||
| + | def make_requests_from_url(self, | ||
| + | return Request(url, | ||
| + | |||
| + | def parse(self, response): | ||
| + | raise NotImplementedError | ||
| + | |||
| + | @classmethod | ||
| + | def update_settings(cls, | ||
| + | settings.setdict(cls.custom_settings or {}, priority=' | ||
| + | |||
| + | @classmethod | ||
| + | def handles_request(cls, | ||
| + | return url_is_from_spider(request.url, | ||
| + | |||
| + | @staticmethod | ||
| + | def close(spider, | ||
| + | closed = getattr(spider, | ||
| + | if callable(closed): | ||
| + | return closed(reason) | ||
| + | |||
| + | def __str__(self): | ||
| + | return "< | ||
| + | |||
| + | __repr__ = __str__ | ||
| + | </ | ||
| + | Mô tả thuật toán: | ||
| + | - Step1: Scrapy Engine gọi hàm **start_requests** thực hiện việc download tất cả các link trong **start_urls**:< | ||
| + | def start_requests(self): | ||
| + | for url in self.start_urls: | ||
| + | yield self.make_requests_from_url(url) | ||
| + | def make_requests_from_url(self, | ||
| + | return Request(url, | ||
| + | </ | ||
| + | - Step2: Response được download trong step1 sẽ được xử lý trong hàm **parse(self, | ||
| + | - Tập dữ liệu 1: Tập gồm các dữ liệu web được gởi tới Pipeline để xử lý | ||
| + | - Tập dữ liệu 2: Tập gồm các link mới sẽ được download bởi scrapy và gởi tới hàm **parse(self, | ||
| + | === CrawlSpider class === | ||
| + | **CrawlSpider kế thừa từ spider** và sử dụng 2 thuật toán cơ bản: | ||
| + | * **Thuật toán đệ quy** để tìm tất cả url liên kết với url khởi tạo và tạo thành mạng lưới url liên kết với nó | ||
| + | * **Thuật toán extract links** dựa theo rule để lọc ra những url mà nó muốn download | ||
| + | => This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages. | ||
| + | <code python> | ||
| + | """ | ||
| + | This modules implements the CrawlSpider which is the recommended spider to use | ||
| + | for scraping typical web sites that requires crawling pages. | ||
| + | |||
| + | See documentation in docs/ | ||
| + | """ | ||
| + | |||
| + | import copy | ||
| + | |||
| + | from scrapy.http import Request, HtmlResponse | ||
| + | from scrapy.utils.spider import iterate_spider_output | ||
| + | from scrapy.spiders import Spider | ||
| + | |||
| + | def identity(x): | ||
| + | return x | ||
| + | |||
| + | class Rule(object): | ||
| + | |||
| + | def __init__(self, | ||
| + | self.link_extractor = link_extractor | ||
| + | self.callback = callback | ||
| + | self.cb_kwargs = cb_kwargs or {} | ||
| + | self.process_links = process_links | ||
| + | self.process_request = process_request | ||
| + | if follow is None: | ||
| + | self.follow = False if callback else True | ||
| + | else: | ||
| + | self.follow = follow | ||
| + | |||
| + | class CrawlSpider(Spider): | ||
| + | |||
| + | rules = () | ||
| + | |||
| + | def __init__(self, | ||
| + | super(CrawlSpider, | ||
| + | self._compile_rules() | ||
| + | |||
| + | def parse(self, response): | ||
| + | return self._parse_response(response, | ||
| + | |||
| + | def parse_start_url(self, | ||
| + | return [] | ||
| + | |||
| + | def process_results(self, | ||
| + | return results | ||
| + | |||
| + | def _requests_to_follow(self, | ||
| + | if not isinstance(response, | ||
| + | return | ||
| + | seen = set() | ||
| + | for n, rule in enumerate(self._rules): | ||
| + | links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] | ||
| + | if links and rule.process_links: | ||
| + | links = rule.process_links(links) | ||
| + | for link in links: | ||
| + | seen.add(link) | ||
| + | r = Request(url=link.url, | ||
| + | r.meta.update(rule=n, | ||
| + | yield rule.process_request(r) | ||
| + | |||
| + | def _response_downloaded(self, | ||
| + | rule = self._rules[response.meta[' | ||
| + | return self._parse_response(response, | ||
| + | |||
| + | def _parse_response(self, | ||
| + | if callback: | ||
| + | cb_res = callback(response, | ||
| + | cb_res = self.process_results(response, | ||
| + | for requests_or_item in iterate_spider_output(cb_res): | ||
| + | yield requests_or_item | ||
| + | |||
| + | if follow and self._follow_links: | ||
| + | for request_or_item in self._requests_to_follow(response): | ||
| + | yield request_or_item | ||
| + | |||
| + | def _compile_rules(self): | ||
| + | def get_method(method): | ||
| + | if callable(method): | ||
| + | return method | ||
| + | elif isinstance(method, | ||
| + | return getattr(self, | ||
| + | |||
| + | self._rules = [copy.copy(r) for r in self.rules] | ||
| + | for rule in self._rules: | ||
| + | rule.callback = get_method(rule.callback) | ||
| + | rule.process_links = get_method(rule.process_links) | ||
| + | rule.process_request = get_method(rule.process_request) | ||
| + | |||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | spider = super(CrawlSpider, | ||
| + | spider._follow_links = crawler.settings.getbool( | ||
| + | ' | ||
| + | return spider | ||
| + | |||
| + | def set_crawler(self, | ||
| + | super(CrawlSpider, | ||
| + | self._follow_links = crawler.settings.getbool(' | ||
| + | |||
| + | </ | ||
| + | ==== ItemPipeline để xử lý lưu trữ dữ liệu ==== | ||
| + | refer: https:// | ||
| + | |||
| + | Riêng **ImagesPipeline and FilesPipeline**, | ||
| + | |||
| + | === ItemPipeline Classes === | ||
| + | {{: | ||
| + | |||
| + | With some public functions for pipeline:< | ||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | def open_spider(self, | ||
| + | def process_item(self, | ||
| + | def media_to_download(self, | ||
| + | def get_media_requests(self, | ||
| + | def media_downloaded(self, | ||
| + | def media_failed(self, | ||
| + | def item_completed(self, | ||
| + | </ | ||
| + | === ItemPipelineManager === | ||
| + | {{: | ||
| + | Chú thích cho sơ đồ và mô tả kiến trúc của thuật toán pipeline: | ||
| + | * Tất cả **pipeline classes** được xem là middleware để xử lý dữ liệu sau khi parse bởi spider(middleware có nghĩa là **dữ liệu được parse bởi tất cả spider đều được gởi tuần tự đến tất cả những pipeline này**), danh sách các pipeline được sử dụng sẽ lưu trữ trong settings của chương trình **ITEM_PIPELINES** và **ITEM_PIPELINES_BASE**:< | ||
| + | ITEM_PIPELINES = {' | ||
| + | } | ||
| + | </ | ||
| + | * khi **chương trình khởi động**, scrapy sẽ **gọi các class method của tất cả pipeline class để khởi dựng nó** : | ||
| + | - Gọi class method **from_crawler** của pipeline class(cls là pipeline class)< | ||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | </ | ||
| + | - Gọi class method from_settings | ||
| + | @classmethod | ||
| + | def from_settings(cls, | ||
| + | </ | ||
| + | * Khi s**pider parse và trả về dữ liệu bất kỳ**, scrapy sẽ gọi các hàm bên dưới của tất cả pipeline class để xử lý nó. Theo thứ tự như sau: | ||
| + | - Gọi phương thức open_spider< | ||
| + | def open_spider(self, | ||
| + | </ | ||
| + | - Gọi phương thức process_item< | ||
| + | def process_item(self, | ||
| + | </ | ||
| + | - Gọi phương thức close_spider< | ||
| + | def close_spider(self, | ||
| + | </ | ||
| + | === ImagesPipeline === | ||
| + | http:// | ||
| + | |||
| + | {{: | ||
| + | === Store in FilesPipeline === | ||
| + | {{: | ||
| + | |||
| + | With 2 public functions:< | ||
| + | def persist_file(self, | ||
| + | def stat_file(self, | ||
| + | </ | ||
| + | Called from FilesPipeline:< | ||
| + | def file_downloaded(self, | ||
| + | path = self.file_path(request, | ||
| + | buf = BytesIO(response.body) | ||
| + | self.store.persist_file(path, | ||
| + | checksum = md5sum(buf) | ||
| + | return checksum | ||
| + | </ | ||
| + | === Write items to a JSON file === | ||
| + | The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line serialized in JSON format:< | ||
| + | import json | ||
| + | |||
| + | class JsonWriterPipeline(object): | ||
| + | |||
| + | def open_spider(self, | ||
| + | self.file = open(' | ||
| + | |||
| + | def close_spider(self, | ||
| + | self.file.close() | ||
| + | |||
| + | def process_item(self, | ||
| + | line = json.dumps(dict(item)) + " | ||
| + | self.file.write(line) | ||
| + | return item | ||
| + | </ | ||
| + | Note: The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports. | ||
| + | === Write items to MongoDB === | ||
| + | In this example we’ll write items to MongoDB using pymongo. MongoDB address and database name are specified in Scrapy settings; MongoDB collection is named after item class. | ||
| + | |||
| + | The main point of this example is to show how to use from_crawler() method and how to clean up the resources properly.:< | ||
| + | import pymongo | ||
| + | |||
| + | class MongoPipeline(object): | ||
| + | |||
| + | collection_name = ' | ||
| + | |||
| + | def __init__(self, | ||
| + | self.mongo_uri = mongo_uri | ||
| + | self.mongo_db = mongo_db | ||
| + | |||
| + | @classmethod | ||
| + | def from_crawler(cls, | ||
| + | return cls( | ||
| + | mongo_uri=crawler.settings.get(' | ||
| + | mongo_db=crawler.settings.get(' | ||
| + | ) | ||
| + | |||
| + | def open_spider(self, | ||
| + | self.client = pymongo.MongoClient(self.mongo_uri) | ||
| + | self.db = self.client[self.mongo_db] | ||
| + | |||
| + | def close_spider(self, | ||
| + | self.client.close() | ||
| + | |||
| + | def process_item(self, | ||
| + | self.db[self.collection_name].insert(dict(item)) | ||
| + | return item | ||
| + | </ | ||
| + | === Duplicates filter === | ||
| + | A filter that looks for duplicate items, and drops those items that were already processed. Let’s say that our items have a unique id, but our spider returns multiples items with the same id:<code python> | ||
| + | from scrapy.exceptions import DropItem | ||
| + | |||
| + | class DuplicatesPipeline(object): | ||
| + | |||
| + | def __init__(self): | ||
| + | self.ids_seen = set() | ||
| + | |||
| + | def process_item(self, | ||
| + | if item[' | ||
| + | raise DropItem(" | ||
| + | else: | ||
| + | self.ids_seen.add(item[' | ||
| + | return item | ||
| + | </ | ||
| + | ==== scope of allowed_domains ==== | ||
| + | allowed_domains was filtered in **site-packages\scrapy\utils\url.py**:< | ||
| + | def url_is_from_any_domain(url, | ||
| + | """ | ||
| + | host = parse_url(url).netloc.lower() | ||
| + | |||
| + | if host: | ||
| + | return any(((host == d.lower()) or (host.endswith(' | ||
| + | else: | ||
| + | return False | ||
| + | |||
| + | def url_is_from_spider(url, | ||
| + | """ | ||
| + | return url_is_from_any_domain(url, | ||
| + | [spider.name] + list(getattr(spider, | ||
| + | </ | ||
| + | and spider call it to check before download:< | ||
| + | class Spider(object_ref): | ||
| + | @classmethod | ||
| + | def handles_request(cls, | ||
| + | return url_is_from_spider(request.url, | ||
| + | </ | ||
| + | ==== Integrate Scrapy with Other Systems ==== | ||
| + | Integrate via below systems: | ||
| + | * Database: MySQL, MongoDB | ||
| + | * Cache: Redis Cache, Cm Cache -> You can **start multiple spider instances that share a single redis queue**. Best suitable for **broad multi-domain crawls**. | ||