Scrapy Examples

Download an entire site with Scrapy

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
 
class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        # follow=True keeps the crawl going: a Rule with a callback does
        # not follow links from matched pages unless told to
        Rule(sle(allow=(r'\.html',)), callback='parse_template', follow=True),
    ]
 
    def parse_template(self, response):
        # save each crawled page under its own filename
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
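
Assuming the project was created with scrapy startproject myscrapy (the package name the later examples import from), the spider runs from the project directory with:

scrapy crawl baby

Each matched page is then saved in the working directory under its own filename.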

Download images, JS and CSS files

  • define the fields that store the image and file URLs in items.py (image_urls and file_urls are the default field names that ImagesPipeline and FilesPipeline read from)
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
     
    from scrapy.item import Item, Field 
     
    class WebItem(Item):
        image_urls = Field()  # URLs for ImagesPipeline to download
        file_urls = Field()   # URLs for FilesPipeline to download
        images = Field()      # filled with download results by ImagesPipeline
        files = Field()       # filled with download results by FilesPipeline
  • collect the image and file URLs for downloading in the spider baby.py
    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
    from myscrapy.items import WebItem
    from urlparse import urljoin
     
    class BabySpider(CrawlSpider):
        name = "baby"
        allowed_domains = ["babies.vn"]
        start_urls = [
            "http://shop.babies.vn/index.php"
        ]
        rules = [
            Rule(sle(allow=(r'\.html',)), callback='parse_template', follow=True),
        ]
     
        def parse_template(self, response):
            print response.url
            filename = response.url.split("/")[-1]
            print filename
            with open(filename, 'wb') as f:
                f.write(response.body)
            selector = HtmlXPathSelector(response)
            web_item = WebItem()
            # make the extracted URLs absolute: the pipelines need full
            # URLs to schedule their download requests
            web_item['image_urls'] = [urljoin(response.url, u) for u in
                                      selector.select('//img/@src').extract()]
            web_item['file_urls'] = [urljoin(response.url, u) for u in
                                     selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()]
            yield web_item
  • enable the pipelines that store the images and files in settings.py (a minimal stub of MyscrapyPipeline is sketched after this list)
    import os
     
    # PROJECT_DIR is not built in: settings.py sits inside the project
    # package, so the project root is one directory up
    PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    ITEM_PIPELINES = {
        'scrapy.contrib.pipeline.images.ImagesPipeline': 100,
        'scrapy.contrib.pipeline.files.FilesPipeline': 200,
        'myscrapy.pipelines.MyscrapyPipeline': 300,
    }
    IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
    FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
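
The MyscrapyPipeline entry refers to the project's own pipeline class. If the stub that scrapy startproject generates has not been edited, it simply passes items through; a minimal sketch:

class MyscrapyPipeline(object):
    def process_item(self, item, spider):
        # pass-through; the images/files pipelines do the actual downloading
        return item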

Change the directory layout and filenames used for storing images and files

  • Create custom pipelines extending FilesPipeline and ImagesPipeline in pipelines.py. Both pipelines store downloads under a SHA1 hash of the URL by default (e.g. full/<sha1>.jpg); overriding file_path() keeps the original URL path instead:
    from urlparse import urlparse
     
    from scrapy.contrib.pipeline.files import FilesPipeline
    from scrapy.contrib.pipeline.images import ImagesPipeline
     
    class MyImagesPipeline(ImagesPipeline):
        def file_path(self, request, response=None, info=None):
            # mirror the site's layout: store each image under the path
            # component of its URL instead of the hashed default name
            return urlparse(request.url).path
     
    class MyFilesPipeline(FilesPipeline):
        def file_path(self, request, response=None, info=None):
            # same scheme for the css/js files
            return urlparse(request.url).path
  • edit settings.py to point the image and file pipelines at the custom classes:
    ITEM_PIPELINES = {
        'myscrapy.pipelines.MyImagesPipeline': 100,
        'myscrapy.pipelines.MyFilesPipeline': 200,
        'myscrapy.pipelines.MyscrapyPipeline': 300,
    }
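
With this in place an image fetched from, say, http://shop.babies.vn/images/logo.jpg is saved as IMAGES_STORE/images/logo.jpg instead of under the default hashed name full/<sha1>.jpg, so the saved HTML pages can reference it with the same paths as the live site.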

Change the code for parsing start_urls in the spider

CrawlSpider's default parse_start_url() returns nothing, so the rules above never save the start_urls pages themselves; overriding it routes the front page through the same parsing code as every other page:

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from myscrapy.items import WebItem
from os import path
from urlparse import urlparse, urljoin
 
class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=(r'\.html',)), callback='parse_template', follow=True),
    ]
    def parse_start_url(self, response):
        yield self.myparse(response)
    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = [urljoin(response.url, u) for u in
                                  selector.select('//img/@src').extract()]
        web_item['file_urls'] = [urljoin(response.url, u) for u in
                                 selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()]
        return web_item
    def parse_template(self, response):
        yield self.myparse(response)

Modify myparse to rewrite http://shop.babies.vn/ to '' in all saved HTML files, so the local copy uses relative links

# requires two more imports at the top of the spider: import os, re
def myparse(self, response):
    # create the output directory on first use; OUTPUT_DIR comes from settings.py
    output_dir = self.settings['OUTPUT_DIR']
    if not path.exists(output_dir):
        os.makedirs(output_dir)
 
    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)
 
    with open(filename, 'wb') as f:
        # strip the absolute site prefix so the saved pages link relatively
        body = re.sub(r'http://shop\.babies\.vn/', '', response.body)
        f.write(body)
    print filename
    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = [urljoin(response.url, u) for u in
                              selector.select('//img/@src').extract()]
    web_item['file_urls'] = [urljoin(response.url, u) for u in
                             selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()]
    return web_item
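
OUTPUT_DIR is not a built-in Scrapy setting; this assumes a line like the following in settings.py:

OUTPUT_DIR = os.path.join(PROJECT_DIR, 'output')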
