====== Scrapy Examples ======

===== Download entire site with scrapy =====

<code python>
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        # save each crawled page under its file name
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
</code>

==== Download images and js, css files ====

  * Define the fields that store the image and file URLs in items.py:

<code python>
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
</code>

  * Extract the image and file URLs to download in the spider baby.py:

<code python>
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        # save the page body under its file name
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
</code>

  * Define the pipelines for storing the images and files in settings.py:

<code python>
# ITEM_PIPELINES must be a dict; the numbers set the pipeline order (lower runs first)
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'scrapy.contrib.pipeline.images.ImagesPipeline': 200,
    'scrapy.contrib.pipeline.files.FilesPipeline': 300,
}

# assumes "import os" and a PROJECT_DIR definition near the top of settings.py
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
</code>

==== Change the directory and filename format for storing images and files ====

  * Create custom pipelines extending FilesPipeline and ImagesPipeline in pipelines.py:

<code python>
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the image under its original URL path
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the file under its original URL path
        o = urlparse(url)
        return o.path
</code>
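The deprecation shims above are carried over from the stock Scrapy 0.x pipeline code. If the project targets a newer Scrapy release (1.x or later, where the pipelines moved from scrapy.contrib.pipeline to scrapy.pipelines), the same override can be sketched much more compactly, since file_path() then always receives a Request. This is a minimal sketch under that assumption, not part of the original example:

<code python>
# Minimal sketch of the same file_path() override for newer Scrapy (1.x+)
from urlparse import urlparse  # urllib.parse on Python 3

from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep the original URL path below IMAGES_STORE, e.g. /images/logo.png
        return urlparse(request.url).path


class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep the original URL path below FILES_STORE, e.g. /js/main.js
        return urlparse(request.url).path
</code>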
  * Edit settings.py to use the custom pipelines for storing files and images:

<code python>
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'myscrapy.pipelines.MyFilesPipeline': 200,
    'myscrapy.pipelines.MyImagesPipeline': 300,
}
</code>

==== Change the code for parsing start_urls in the spider ====

<code python>
from os import path
from urlparse import urlparse

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # also parse the pages listed in start_urls, not only the followed links
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        # derive the file name from the URL path; fall back to index.html for "/"
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
</code>

==== Modify myparse to replace http://shop.babies.vn with '' in all html files ====

<code python>
# requires "import os" and "import re" in addition to the imports above
def myparse(self, response):
    # write the rewritten pages into OUTPUT_DIR (taken from the project settings)
    output_dir = path.join(self.settings['OUTPUT_DIR'])
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)

    with open(filename, 'wb') as f:
        # turn absolute links into relative ones so the saved site works offline
        body = re.sub(r'http://shop.babies.vn/', '', response.body)
        f.write(body)
    print filename

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
</code>
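The modified myparse() reads OUTPUT_DIR from the spider's settings, and the media settings earlier reference PROJECT_DIR, but neither is defined on this page. A hypothetical settings.py fragment that would provide both (the names and directory layout are assumptions; adjust them to your project):

<code python>
# settings.py -- assumed definitions for the settings used in the examples above
import os

# root directory of the Scrapy project (assumption: settings.py lives in it)
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))

OUTPUT_DIR = os.path.join(PROJECT_DIR, 'output')           # rewritten HTML from myparse()
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')   # used by the images pipeline
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')     # used by the files pipeline
</code>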
===== Scrapy OpenSources =====

Sorted by the **Most stars** filter:

  * https://github.com/scrapinghub/portia
  * https://github.com/gnemoug/distribute_crawler -> distributed spiders
  * https://github.com/darkrho/scrapy-redis -> distributed spiders with a single redis for receiving items
  * https://github.com/geekan/scrapy-examples
  * https://github.com/holgerd77/django-dynamic-scraper -> manage scrapy spiders via the django admin
  * https://github.com/scrapinghub/scrapyjs
  * https://github.com/scrapy/scrapyd
  * https://github.com/scrapinghub/scrapylib -> collection of code
  * https://github.com/mvanveen/hncrawl
  * https://github.com/scrapinghub/scrapyrt
  * https://github.com/aivarsk/scrapy-proxies
  * https://github.com/kalessin/finance -> crawls finance data and stores it in a mysql server
  * https://github.com/istresearch/scrapy-cluster
  * https://github.com/scrapy/scrapely
  * https://github.com/octoberman/scrapy-indeed-spider
  * https://github.com/dcondrey/scrapy-spiders -> collection of spiders
  * https://github.com/arthurk/scrapy-german-news -> good spider (with simhash algorithm, sqlite)
  * https://github.com/jackliusr/scrapy-crawlers -> collection of spiders
  * https://github.com/hemslo/poky-engine -> good architecture
  * https://github.com/anderson916/google-play-crawler
  * https://github.com/arpitrai/Daily-Deal-Aggregator
  * https://github.com/duydo/scrapy-crunchbase
  * https://github.com/richardkyeung/pandora-food-scrapy
  * https://github.com/rahulrrixe/Financial-News-Crawler -> configures spiders via json and stores crawled data in MongoDB
  * https://github.com/supercoderz/hydbusroutes
  * https://github.com/shijilspark/scrapy -> scrapy deals with a django app and crontab
  * https://github.com/walbuc/Django-Scrapy -> good architecture with django:
    * sqlite3 in dev, postgres or mongodb in prod
    * uses Celery (distributed task queue)
    * uses a redis server for caching
    * scrapy exposes a REST webservice and django uses the REST API to get the data
  * https://github.com/sdiwu/PlayStoreScrapy
  * https://github.com/amferraz/9gag-scraper
  * https://github.com/junwei-wang/GoogleLoginSpider
  * https://github.com/lodow/portia-proxy -> you can **annotate a web page** to identify the data you wish to extract
  * https://github.com/voliveirajr/seleniumcrawler -> scrapy with selenium
  * https://github.com/pelick/VerticleSearchEngine
  * https://github.com/eliangcs/pystock-crawler