scrapy --help Scrapy 1.0.3 - project: templatedownload Usage: scrapy <command> [options] [args] Available commands: bench Run quick benchmark test check Check spider contracts commands crawl Run a spider edit Edit spider fetch Fetch a URL using the Scrapy downloader genspider Generate new spider using pre-defined templates list List available spiders parse Parse URL (using its spider) and print the results runspider Run a self-contained spider (without creating a project) settings Get settings values shell Interactive scraping console startproject Create new project version Print Scrapy version view Open URL in browser, as seen by Scrapy Use "scrapy <command> -h" to see more info about a command
scrapy startproject -h Usage ===== scrapy startproject <project_name> Create new project Options ======= --help, -h show this help message and exit Global Options -------------- --logfile=FILE log file. if omitted stderr will be used --loglevel=LEVEL, -L LEVEL log level (default: DEBUG) --nolog disable logging completely --profile=FILE write python cProfile stats to FILE --lsprof=FILE write lsprof profiling stats to FILE --pidfile=FILE write process ID to FILE --set=NAME=VALUE, -s NAME=VALUE set/override setting (may be repeated) --pdb enable pdb on failure
python -mscrapy.cmdline %*
Scrapy command code scripts are stored in directory scrapy/commands/:
bench.py check.py crawl.py deploy.py edit.py fetch.py genspider.py list.py parse.py runspider.py settings.py shell.py startproject.py version.py view.py
default_settings.py
""" This module contains the default values for all settings used by Scrapy. For more information about these settings you can read the settings documentation in docs/topics/settings.rst Scrapy developers, if you add a setting here remember to: * add it in alphabetical order * group similar settings without leaving blank lines * add its documentation to the available settings documentation (docs/topics/settings.rst) """ import os import sys from importlib import import_module from os.path import join, abspath, dirname AJAXCRAWL_ENABLED = False BOT_NAME = 'scrapybot' CLOSESPIDER_TIMEOUT = 0 CLOSESPIDER_PAGECOUNT = 0 CLOSESPIDER_ITEMCOUNT = 0 CLOSESPIDER_ERRORCOUNT = 0 COMMANDS_MODULE = '' COMPRESSION_ENABLED = True CONCURRENT_ITEMS = 100 CONCURRENT_REQUESTS = 16 CONCURRENT_REQUESTS_PER_DOMAIN = 8 CONCURRENT_REQUESTS_PER_IP = 0 COOKIES_ENABLED = True COOKIES_DEBUG = False DEFAULT_ITEM_CLASS = 'scrapy.item.Item' DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', } DEPTH_LIMIT = 0 DEPTH_STATS = True DEPTH_PRIORITY = 0 DNSCACHE_ENABLED = True DNSCACHE_SIZE = 10000 DNS_TIMEOUT = 60 DOWNLOAD_DELAY = 0 DOWNLOAD_HANDLERS = {} DOWNLOAD_HANDLERS_BASE = { 'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler', 'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler', 'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler', } DOWNLOAD_TIMEOUT = 180 # 3mins DOWNLOAD_MAXSIZE = 1024*1024*1024 # 1024m DOWNLOAD_WARNSIZE = 32*1024*1024 # 32m DOWNLOADER = 'scrapy.core.downloader.Downloader' DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory' DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory' DOWNLOADER_MIDDLEWARES = {} DOWNLOADER_MIDDLEWARES_BASE = { # Engine side 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100, 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300, 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350, 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400, 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550, 'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560, 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590, 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600, 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700, 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750, 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830, 'scrapy.downloadermiddlewares.stats.DownloaderStats': 850, 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900, # Downloader side } DOWNLOADER_STATS = True DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter' try: EDITOR = os.environ['EDITOR'] except KeyError: if sys.platform == 'win32': EDITOR = '%s -m idlelib.idle' else: EDITOR = 'vi' EXTENSIONS = {} EXTENSIONS_BASE = { 'scrapy.extensions.corestats.CoreStats': 0, 'scrapy.telnet.TelnetConsole': 0, 'scrapy.extensions.memusage.MemoryUsage': 0, 'scrapy.extensions.memdebug.MemoryDebugger': 0, 'scrapy.extensions.closespider.CloseSpider': 0, 'scrapy.extensions.feedexport.FeedExporter': 0, 'scrapy.extensions.logstats.LogStats': 0, 'scrapy.extensions.spiderstate.SpiderState': 0, 'scrapy.extensions.throttle.AutoThrottle': 0, } FEED_URI = None FEED_URI_PARAMS = None # a function to extend uri arguments FEED_FORMAT = 'jsonlines' FEED_STORE_EMPTY = False FEED_EXPORT_FIELDS = None FEED_STORAGES = {} FEED_STORAGES_BASE = { '': 'scrapy.extensions.feedexport.FileFeedStorage', 'file': 'scrapy.extensions.feedexport.FileFeedStorage', 'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage', 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', } FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 'jl': 'scrapy.exporters.JsonLinesItemExporter', 'csv': 'scrapy.exporters.CsvItemExporter', 'xml': 'scrapy.exporters.XmlItemExporter', 'marshal': 'scrapy.exporters.MarshalItemExporter', 'pickle': 'scrapy.exporters.PickleItemExporter', } HTTPCACHE_ENABLED = False HTTPCACHE_DIR = 'httpcache' HTTPCACHE_IGNORE_MISSING = False HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' HTTPCACHE_EXPIRATION_SECS = 0 HTTPCACHE_IGNORE_HTTP_CODES = [] HTTPCACHE_IGNORE_SCHEMES = ['file'] HTTPCACHE_DBM_MODULE = 'anydbm' HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy' HTTPCACHE_GZIP = False ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager' ITEM_PIPELINES = {} ITEM_PIPELINES_BASE = {} LOG_ENABLED = True LOG_ENCODING = 'utf-8' LOG_FORMATTER = 'scrapy.logformatter.LogFormatter' LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s' LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S' LOG_STDOUT = False LOG_LEVEL = 'DEBUG' LOG_FILE = None LOG_UNSERIALIZABLE_REQUESTS = False LOGSTATS_INTERVAL = 60.0 MAIL_HOST = 'localhost' MAIL_PORT = 25 MAIL_FROM = 'scrapy@localhost' MAIL_PASS = None MAIL_USER = None MEMDEBUG_ENABLED = False # enable memory debugging MEMDEBUG_NOTIFY = [] # send memory debugging report by mail at engine shutdown MEMUSAGE_ENABLED = False MEMUSAGE_LIMIT_MB = 0 MEMUSAGE_NOTIFY_MAIL = [] MEMUSAGE_REPORT = False MEMUSAGE_WARNING_MB = 0 METAREFRESH_ENABLED = True METAREFRESH_MAXDELAY = 100 NEWSPIDER_MODULE = '' RANDOMIZE_DOWNLOAD_DELAY = True REACTOR_THREADPOOL_MAXSIZE = 10 REDIRECT_ENABLED = True REDIRECT_MAX_TIMES = 20 # uses Firefox default setting REDIRECT_PRIORITY_ADJUST = +2 REFERER_ENABLED = True RETRY_ENABLED = True RETRY_TIMES = 2 # initial response + 2 retries = 3 requests RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 408] RETRY_PRIORITY_ADJUST = -1 ROBOTSTXT_OBEY = False SCHEDULER = 'scrapy.core.scheduler.Scheduler' SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue' SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue' SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader' SPIDER_MIDDLEWARES = {} SPIDER_MIDDLEWARES_BASE = { # Engine side 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500, 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700, 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800, 'scrapy.spidermiddlewares.depth.DepthMiddleware': 900, # Spider side } SPIDER_MODULES = [] STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector' STATS_DUMP = True STATSMAILER_RCPTS = [] TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates')) URLLENGTH_LIMIT = 2083 USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__ TELNETCONSOLE_ENABLED = 1 TELNETCONSOLE_PORT = [6023, 6073] TELNETCONSOLE_HOST = '127.0.0.1' SPIDER_CONTRACTS = {} SPIDER_CONTRACTS_BASE = { 'scrapy.contracts.default.UrlContract': 1, 'scrapy.contracts.default.ReturnsContract': 2, 'scrapy.contracts.default.ScrapesContract': 3, }
BOT_NAME = 'scrapybot'
DOWNLOAD_HANDLERS = {} DOWNLOAD_HANDLERS_BASE = { 'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler', 'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler', 's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler', 'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler', }
DOWNLOADER_MIDDLEWARES = {} DOWNLOADER_MIDDLEWARES_BASE = { # Engine side 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100, 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300, 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350, 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400, 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550, 'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560, 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590, 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600, 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700, 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750, 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830, 'scrapy.downloadermiddlewares.stats.DownloaderStats': 850, 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900, # Downloader side }
EXTENSIONS = {} EXTENSIONS_BASE = { 'scrapy.extensions.corestats.CoreStats': 0, 'scrapy.telnet.TelnetConsole': 0, 'scrapy.extensions.memusage.MemoryUsage': 0, 'scrapy.extensions.memdebug.MemoryDebugger': 0, 'scrapy.extensions.closespider.CloseSpider': 0, 'scrapy.extensions.feedexport.FeedExporter': 0, 'scrapy.extensions.logstats.LogStats': 0, 'scrapy.extensions.spiderstate.SpiderState': 0, 'scrapy.extensions.throttle.AutoThrottle': 0, }
FEED_STORAGES = {} FEED_STORAGES_BASE = { '': 'scrapy.extensions.feedexport.FileFeedStorage', 'file': 'scrapy.extensions.feedexport.FileFeedStorage', 'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage', 's3': 'scrapy.extensions.feedexport.S3FeedStorage', 'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage', }
FEED_EXPORTERS = {} FEED_EXPORTERS_BASE = { 'json': 'scrapy.exporters.JsonItemExporter', 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 'jl': 'scrapy.exporters.JsonLinesItemExporter', 'csv': 'scrapy.exporters.CsvItemExporter', 'xml': 'scrapy.exporters.XmlItemExporter', 'marshal': 'scrapy.exporters.MarshalItemExporter', 'pickle': 'scrapy.exporters.PickleItemExporter', }
SPIDER_MIDDLEWARES = {} SPIDER_MIDDLEWARES_BASE = { # Engine side 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50, 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500, 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700, 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800, 'scrapy.spidermiddlewares.depth.DepthMiddleware': 900, # Spider side }
SPIDER_MODULES = []
Command to start spider
scrapy crawl <spidername>
crawling diagram: Basic Steps for crawling:
class Crawler(object): ................ @defer.inlineCallbacks def crawl(self, *args, **kwargs): assert not self.crawling, "Crawling already taking place" self.crawling = True try: self.spider = self._create_spider(*args, **kwargs) self.engine = self._create_engine() start_requests = iter(self.spider.start_requests()) yield self.engine.open_spider(self.spider, start_requests) yield defer.maybeDeferred(self.engine.start) except Exception: self.crawling = False raise ........................... class Spider(object_ref): """Base class for scrapy spiders. All spiders must inherit from this class. """ @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = cls(*args, **kwargs) spider._set_crawler(crawler) return spider def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True) def parse(self, response): raise NotImplementedError @classmethod def update_settings(cls, settings): settings.setdict(cls.custom_settings or {}, priority='spider') @classmethod def handles_request(cls, request): return url_is_from_spider(request.url, cls) @staticmethod def close(spider, reason): closed = getattr(spider, 'closed', None) if callable(closed): return closed(reason)
Spider class
class Spider(object_ref): """Base class for scrapy spiders. All spiders must inherit from this class. """ name = None custom_settings = None def __init__(self, name=None, **kwargs): if name is not None: self.name = name elif not getattr(self, 'name', None): raise ValueError("%s must have a name" % type(self).__name__) self.__dict__.update(kwargs) if not hasattr(self, 'start_urls'): self.start_urls = [] @property def logger(self): logger = logging.getLogger(self.name) return logging.LoggerAdapter(logger, {'spider': self}) def log(self, message, level=logging.DEBUG, **kw): """Log the given message at the given log level This helper wraps a log call to the logger within the spider, but you can use it directly (e.g. Spider.logger.info('msg')) or use any other Python logger too. """ self.logger.log(level, message, **kw) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = cls(*args, **kwargs) spider._set_crawler(crawler) return spider def set_crawler(self, crawler): warnings.warn("set_crawler is deprecated, instantiate and bound the " "spider to this crawler with from_crawler method " "instead.", category=ScrapyDeprecationWarning, stacklevel=2) assert not hasattr(self, 'crawler'), "Spider already bounded to a " \ "crawler" self._set_crawler(crawler) def _set_crawler(self, crawler): self.crawler = crawler self.settings = crawler.settings crawler.signals.connect(self.close, signals.spider_closed) def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True) def parse(self, response): raise NotImplementedError @classmethod def update_settings(cls, settings): settings.setdict(cls.custom_settings or {}, priority='spider') @classmethod def handles_request(cls, request): return url_is_from_spider(request.url, cls) @staticmethod def close(spider, reason): closed = getattr(spider, 'closed', None) if callable(closed): return closed(reason) def __str__(self): return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) __repr__ = __str__
Mô tả thuật toán:
def start_requests(self): for url in self.start_urls: yield self.make_requests_from_url(url) def make_requests_from_url(self, url): return Request(url, dont_filter=True)
CrawlSpider kế thừa từ spider và sử dụng 2 thuật toán cơ bản:
⇒ This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages.
""" This modules implements the CrawlSpider which is the recommended spider to use for scraping typical web sites that requires crawling pages. See documentation in docs/topics/spiders.rst """ import copy from scrapy.http import Request, HtmlResponse from scrapy.utils.spider import iterate_spider_output from scrapy.spiders import Spider def identity(x): return x class Rule(object): def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity): self.link_extractor = link_extractor self.callback = callback self.cb_kwargs = cb_kwargs or {} self.process_links = process_links self.process_request = process_request if follow is None: self.follow = False if callback else True else: self.follow = follow class CrawlSpider(Spider): rules = () def __init__(self, *a, **kw): super(CrawlSpider, self).__init__(*a, **kw) self._compile_rules() def parse(self, response): return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True) def parse_start_url(self, response): return [] def process_results(self, response, results): return results def _requests_to_follow(self, response): if not isinstance(response, HtmlResponse): return seen = set() for n, rule in enumerate(self._rules): links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] if links and rule.process_links: links = rule.process_links(links) for link in links: seen.add(link) r = Request(url=link.url, callback=self._response_downloaded) r.meta.update(rule=n, link_text=link.text) yield rule.process_request(r) def _response_downloaded(self, response): rule = self._rules[response.meta['rule']] return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow) def _parse_response(self, response, callback, cb_kwargs, follow=True): if callback: cb_res = callback(response, **cb_kwargs) or () cb_res = self.process_results(response, cb_res) for requests_or_item in iterate_spider_output(cb_res): yield requests_or_item if follow and self._follow_links: for request_or_item in self._requests_to_follow(response): yield request_or_item def _compile_rules(self): def get_method(method): if callable(method): return method elif isinstance(method, basestring): return getattr(self, method, None) self._rules = [copy.copy(r) for r in self.rules] for rule in self._rules: rule.callback = get_method(rule.callback) rule.process_links = get_method(rule.process_links) rule.process_request = get_method(rule.process_request) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs) spider._follow_links = crawler.settings.getbool( 'CRAWLSPIDER_FOLLOW_LINKS', True) return spider def set_crawler(self, crawler): super(CrawlSpider, self).set_crawler(crawler) self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
refer: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
Riêng ImagesPipeline and FilesPipeline, request sẽ được gọi và download trong hàm process_item của pipeline
With some public functions for pipeline:
@classmethod def from_crawler(cls, crawler) def open_spider(self, spider) def process_item(self, item, spider) def media_to_download(self, request, info) def get_media_requests(self, item, info) def media_downloaded(self, response, request, info) def media_failed(self, failure, request, info) def item_completed(self, results, item, info)
Chú thích cho sơ đồ và mô tả kiến trúc của thuật toán pipeline:
ITEM_PIPELINES = {'templatedownload.pipelines.MyFilesPipeline', }
@classmethod def from_crawler(cls, crawler):
@classmethod def from_settings(cls, settings)
⇒ Với class method, chương trình Có thể khởi tạo các thuộc tính cơ bản và khởi dựng pipeline class trong phương thức from_crawler hoặc from_settings
def open_spider(self, spider)
def process_item(self, item, spider):
def close_spider(self, spider)
With 2 public functions:
def persist_file(self, path, buf, info, meta=None, headers=None) def stat_file(self, path, info)
Called from FilesPipeline:
def file_downloaded(self, response, request, info): path = self.file_path(request, response=response, info=info) buf = BytesIO(response.body) self.store.persist_file(path, buf, info) checksum = md5sum(buf) return checksum
The following pipeline stores all scraped items (from all spiders) into a single items.jl file, containing one item per line serialized in JSON format:
import json class JsonWriterPipeline(object): def open_spider(self, spider): self.file = open('items.jl', 'wb') def close_spider(self, spider): self.file.close() def process_item(self, item, spider): line = json.dumps(dict(item)) + "\n" self.file.write(line) return item
Note: The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. If you really want to store all scraped items into a JSON file you should use the Feed exports.
In this example we’ll write items to MongoDB using pymongo. MongoDB address and database name are specified in Scrapy settings; MongoDB collection is named after item class.
The main point of this example is to show how to use from_crawler() method and how to clean up the resources properly.:
import pymongo class MongoPipeline(object): collection_name = 'scrapy_items' def __init__(self, mongo_uri, mongo_db): self.mongo_uri = mongo_uri self.mongo_db = mongo_db @classmethod def from_crawler(cls, crawler): return cls( mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DATABASE', 'items') ) def open_spider(self, spider): self.client = pymongo.MongoClient(self.mongo_uri) self.db = self.client[self.mongo_db] def close_spider(self, spider): self.client.close() def process_item(self, item, spider): self.db[self.collection_name].insert(dict(item)) return item
A filter that looks for duplicate items, and drops those items that were already processed. Let’s say that our items have a unique id, but our spider returns multiples items with the same id:
from scrapy.exceptions import DropItem class DuplicatesPipeline(object): def __init__(self): self.ids_seen = set() def process_item(self, item, spider): if item['id'] in self.ids_seen: raise DropItem("Duplicate item found: %s" % item) else: self.ids_seen.add(item['id']) return item
allowed_domains was filtered in site-packages\scrapy\utils\url.py:
def url_is_from_any_domain(url, domains): """Return True if the url belongs to any of the given domains""" host = parse_url(url).netloc.lower() if host: return any(((host == d.lower()) or (host.endswith('.%s' % d.lower())) for d in domains)) else: return False def url_is_from_spider(url, spider): """Return True if the url belongs to the given spider""" return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))
and spider call it to check before download:
class Spider(object_ref): @classmethod def handles_request(cls, request): return url_is_from_spider(request.url, cls)
Integrate via below systems: