Scrapy Examples
Download an entire site with Scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
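parse_template names each saved file after the last segment of the page URL. A quick sketch of that naming (the URL below is only an example, not one from the crawl):

# hypothetical product URL, only to illustrate the filename logic above
url = "http://shop.babies.vn/ao-so-sinh.html"
print url.split("/")[-1]   # -> ao-so-sinh.html (the page body is written to this file)

If a URL ends with a slash this gives an empty filename; the later sections switch to an index.html fallback for that case.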
Download images, JS and CSS files
- Define fields for storing image and file URLs in items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
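By convention the built-in ImagesPipeline and FilesPipeline read the image_urls and file_urls fields, and write their download results back into images and files fields if the item declares them. If you want to keep those results, a sketch of the extended item, assuming the default field names:

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()  # input: image URLs for ImagesPipeline
    images = Field()      # output: download results written back by ImagesPipeline
    file_urls = Field()   # input: file URLs for FilesPipeline
    files = Field()       # output: download results written back by FilesPipeline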
- Extract the image and file URLs for downloading in the spider baby.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        # save the page body under the last segment of the URL
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
- Enable the pipelines for storing images and files in settings.py
import os

# PROJECT_DIR is assumed to be defined earlier in settings.py
# (for example, the directory containing the project)
ITEM_PIPELINES = [
    'myscrapy.pipelines.MyscrapyPipeline',
    'scrapy.contrib.pipeline.images.ImagesPipeline',
    'scrapy.contrib.pipeline.files.FilesPipeline',
]

IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
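With the stock pipelines the downloads are not kept under their original names: the classic contrib ImagesPipeline stores each image under a SHA1 hash of its URL inside IMAGES_STORE. A rough sketch of that naming, assuming the classic hash-based scheme and a made-up URL, which is the behaviour the next section overrides:

import hashlib

url = "http://shop.babies.vn/images/logo.jpg"  # hypothetical image URL
print 'full/%s.jpg' % hashlib.sha1(url).hexdigest()
# -> full/<40-character hash>.jpg inside IMAGES_STORE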
Change the directory structure and filenames used for storing images and files
- Create custom pipelines that extend FilesPipeline and ImagesPipeline in pipelines.py
# imports needed by the overrides below
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the image under its original site path instead of a SHA1 hash
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the file under its original site path instead of a SHA1 hash
        o = urlparse(url)
        return o.path
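Returning o.path makes every download keep the path it had on the site, relative to IMAGES_STORE / FILES_STORE. A small sketch with a made-up URL:

from urlparse import urlparse

url = "http://shop.babies.vn/skin/frontend/default/images/logo.gif"  # hypothetical URL
print urlparse(url).path
# -> /skin/frontend/default/images/logo.gif, saved under that relative path in the store directory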
- Edit settings.py to use the custom pipelines for storing files and images:
ITEM_PIPELINES = [
    'myscrapy.pipelines.MyscrapyPipeline',
    'myscrapy.pipelines.MyFilesPipeline',
    'myscrapy.pipelines.MyImagesPipeline',
]
Change the code for parsing start_urls in the spider
from os import path
from urlparse import urlparse

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # CrawlSpider does not run the rule callbacks on start_urls,
        # so parse the start page here as well
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename

        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
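parse_start_url is the hook CrawlSpider calls for the start_urls responses, so the start page now goes through the same myparse logic as the crawled pages. The basename fallback covers URLs without a path component; a sketch with example URLs:

from os import path
from urlparse import urlparse

# example URLs only, to show the index.html fallback in myparse
for url in ["http://shop.babies.vn", "http://shop.babies.vn/ao-so-sinh.html"]:
    name = path.basename(urlparse(url).path)
    print name or 'index.html'
# -> index.html
# -> ao-so-sinh.html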
Modify myparse to replace http://shop.babies.vn with '' in all HTML files
def myparse(self, response):
    # requires "import os" and "import re" at module level;
    # OUTPUT_DIR is a custom setting expected in settings.py
    output_dir = path.join(self.settings['OUTPUT_DIR'])
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)

    with open(filename, 'wb') as f:
        # rewrite absolute links to relative ones so the saved pages work offline
        body = re.sub(r'http://shop.babies.vn/', '', response.body)
        f.write(body)
    print filename

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
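The re.sub call turns absolute links back into site-relative ones, so the saved HTML points at the locally stored images and files. A quick sketch with a made-up fragment:

import re

# hypothetical HTML fragment, only to illustrate the rewrite in myparse
body = '<img src="http://shop.babies.vn/images/logo.gif"/>'
print re.sub(r'http://shop.babies.vn/', '', body)
# -> <img src="images/logo.gif"/>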
Scrapy Open Source Projects
- https://github.com/darkrho/scrapy-redis → distributed spiders sharing a single Redis instance for receiving items
- https://github.com/gnemoug/distribute_crawler → distributed spiders
- https://github.com/holgerd77/django-dynamic-scraper → manage Scrapy spiders via the Django admin
- https://github.com/arthurk/scrapy-german-news → good spider example
- https://github.com/hemslo/poky-engine → good architecture example
- https://github.com/rahulrrixe/Financial-News-Crawler → spiders configured from JSON data
- https://github.com/shijilspark/scrapy → Scrapy deals crawler combined with a Django app