====== Scrapy Examples ======

===== Download entire site with scrapy =====

<code python>
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        # save each crawled page under its file name
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
</code>

==== Download images and js, css files ====

  * Define the fields that store the image and file URLs in items.py:

<code python>
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
</code>

  * Extract the image and file URLs to download in the spider baby.py:

<code python>
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        # save the page body under its file name
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
</code>

  * Define the pipelines for storing the images and files in settings.py:

<code python>
# ITEM_PIPELINES must be a dict; the numbers set the pipeline order (lower runs first)
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'scrapy.contrib.pipeline.images.ImagesPipeline': 200,
    'scrapy.contrib.pipeline.files.FilesPipeline': 300,
}

# assumes "import os" and a PROJECT_DIR definition near the top of settings.py
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
</code>

==== Change the directory and filename format for storing images and files ====

  * Create custom pipelines extending FilesPipeline and ImagesPipeline in pipelines.py:

<code python>
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the image under its original URL path
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the file under its original URL path
        o = urlparse(url)
        return o.path
</code>
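The deprecation shims above are carried over from the stock Scrapy 0.x pipeline code. If the project targets a newer Scrapy release (1.x or later, where the pipelines moved from scrapy.contrib.pipeline to scrapy.pipelines), the same override can be sketched much more compactly, since file_path() then always receives a Request. This is a minimal sketch under that assumption, not part of the original example:

<code python>
# Minimal sketch of the same file_path() override for newer Scrapy (1.x+)
from urlparse import urlparse  # urllib.parse on Python 3

from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep the original URL path below IMAGES_STORE, e.g. /images/logo.png
        return urlparse(request.url).path


class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep the original URL path below FILES_STORE, e.g. /js/main.js
        return urlparse(request.url).path
</code>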
  * Edit settings.py to use the custom pipelines for storing files and images:

<code python>
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'myscrapy.pipelines.MyFilesPipeline': 200,
    'myscrapy.pipelines.MyImagesPipeline': 300,
}
</code>

==== Change the code for parsing start_urls in the spider ====

<code python>
from os import path
from urlparse import urlparse

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # also parse the pages listed in start_urls, not only the followed links
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        # derive the file name from the URL path; fall back to index.html for "/"
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
</code>

==== Modify myparse to replace http://shop.babies.vn with '' in all html files ====

<code python>
# requires "import os" and "import re" in addition to the imports above
def myparse(self, response):
    # write the rewritten pages into OUTPUT_DIR (taken from the project settings)
    output_dir = path.join(self.settings['OUTPUT_DIR'])
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)

    with open(filename, 'wb') as f:
        # turn absolute links into relative ones so the saved site works offline
        body = re.sub(r'http://shop.babies.vn/', '', response.body)
        f.write(body)
    print filename

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
</code>
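The modified myparse() reads OUTPUT_DIR from the spider's settings, and the media settings earlier reference PROJECT_DIR, but neither is defined on this page. A hypothetical settings.py fragment that would provide both (the names and directory layout are assumptions; adjust them to your project):

<code python>
# settings.py -- assumed definitions for the settings used in the examples above
import os

# root directory of the Scrapy project (assumption: settings.py lives in it)
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))

OUTPUT_DIR = os.path.join(PROJECT_DIR, 'output')           # rewritten HTML from myparse()
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')   # used by the images pipeline
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')     # used by the files pipeline
</code>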
===== Scrapy OpenSources =====

Sorted by the **Most stars** filter:

  * https://github.com/scrapinghub/portia
  * https://github.com/gnemoug/distribute_crawler -> distributed spiders
  * https://github.com/darkrho/scrapy-redis -> distributed spiders with a single redis for receiving items
  * https://github.com/geekan/scrapy-examples
  * https://github.com/holgerd77/django-dynamic-scraper -> manage scrapy spiders via the django admin
  * https://github.com/scrapinghub/scrapyjs
  * https://github.com/scrapy/scrapyd
  * https://github.com/scrapinghub/scrapylib -> collection of code
  * https://github.com/mvanveen/hncrawl
  * https://github.com/scrapinghub/scrapyrt
  * https://github.com/aivarsk/scrapy-proxies
  * https://github.com/kalessin/finance -> crawls finance data and stores it in a mysql server
  * https://github.com/istresearch/scrapy-cluster
  * https://github.com/scrapy/scrapely
  * https://github.com/octoberman/scrapy-indeed-spider
  * https://github.com/dcondrey/scrapy-spiders -> collection of spiders
  * https://github.com/arthurk/scrapy-german-news -> good spider (with simhash algorithm, sqlite)
  * https://github.com/jackliusr/scrapy-crawlers -> collection of spiders
  * https://github.com/hemslo/poky-engine -> good architecture
  * https://github.com/anderson916/google-play-crawler
  * https://github.com/arpitrai/Daily-Deal-Aggregator
  * https://github.com/duydo/scrapy-crunchbase
  * https://github.com/richardkyeung/pandora-food-scrapy
  * https://github.com/rahulrrixe/Financial-News-Crawler -> configures spiders via json and stores crawled data in MongoDB
  * https://github.com/supercoderz/hydbusroutes
  * https://github.com/shijilspark/scrapy -> scrapy deals with a django app and crontab
  * https://github.com/walbuc/Django-Scrapy -> good architecture with django:
    * sqlite3 in dev, postgres or mongodb in prod
    * uses Celery (distributed task queue)
    * uses a redis server for caching
    * scrapy exposes a REST webservice and django uses the REST API to get the data
  * https://github.com/sdiwu/PlayStoreScrapy
  * https://github.com/amferraz/9gag-scraper
  * https://github.com/junwei-wang/GoogleLoginSpider
  * https://github.com/lodow/portia-proxy -> you can **annotate a web page** to identify the data you wish to extract
  * https://github.com/voliveirajr/seleniumcrawler -> scrapy with selenium
  * https://github.com/pelick/VerticleSearchEngine
  * https://github.com/eliangcs/pystock-crawler