Table of Contents

Scrapy Examples

Download an entire site with Scrapy

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
 
class BabySpider(CrawlSpider):
    """Crawl shop.babies.vn and save every linked ``.html`` page to disk.

    Pages whose links match ``rules`` are written to the current working
    directory, each under the last "/"-separated segment of its URL.
    """

    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    # ``allow`` takes regular expressions, not shell globs. The glob-style
    # "/*.html" was read as "any character followed by html"; match a
    # literal ".html" instead, and pass a real one-element tuple.
    rules = [
        Rule(sle(allow=(r"\.html",)), callback='parse_template'),
    ]

    def parse_template(self, response):
        """Write the raw body of ``response`` to a file named after its URL.

        The file name is the last "/"-separated segment of ``response.url``.
        """
        # A URL ending in "/" yields an empty last segment; fall back to a
        # default name instead of failing inside open('').
        filename = response.url.split("/")[-1] or "index.html"
        with open(filename, 'wb') as f:
            f.write(response.body)

Download images, JS, and CSS files

Change the directory and filename format used for storing images and files

Change the spider code so that the start_urls pages are parsed as well

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from myscrapy.items import WebItem
from os import path
from urlparse import urlparse
 
class BabySpider(CrawlSpider):
    """Save shop.babies.vn pages to disk and collect their asset URLs.

    Both the start URL and every followed ``.html`` link go through
    ``myparse``, which writes the page into the current working directory
    and produces a ``WebItem`` listing the page's image, script and
    stylesheet URLs.
    """

    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    # ``allow`` takes regular expressions, not shell globs. The glob-style
    # "/*.html" was read as "any character followed by html"; match a
    # literal ".html" instead, and pass a real one-element tuple.
    rules = [
        Rule(sle(allow=(r"\.html",)), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        """Handle the start URL response exactly like a followed page."""
        yield self.myparse(response)

    def myparse(self, response):
        """Save the page body and return a ``WebItem`` with its asset URLs.

        The file name is the basename of the URL path, or ``index.html``
        when the path has no basename (e.g. the site root).
        """
        # Single-argument print(...) emits the same text on Python 2 and
        # also parses on Python 3.
        print(response.url)
        filename = path.basename(urlparse(response.url).path) or 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print(filename)

        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select(
            '//script/@src|/html/head/link[@rel="stylesheet"]/@href'
        ).extract()
        return web_item

    def parse_template(self, response):
        """Callback for links matched by ``rules``; delegates to ``myparse``."""
        yield self.myparse(response)

Modify myparse to strip the http://shop.babies.vn/ prefix from all saved HTML files, so that links become relative

def myparse(self, response):
    """Save the page under ``OUTPUT_DIR`` with site-absolute links made relative.

    Every occurrence of ``http://shop.babies.vn/`` in the response body is
    removed before the body is written. The file name is the basename of
    the URL path, or ``index.html`` when the path has no basename.

    Returns a ``WebItem`` whose ``image_urls`` and ``file_urls`` hold the
    image, script and stylesheet URLs found in the page.
    """
    # Imported here because the snippet's imports only provide
    # ``from os import path``, which does not bind the name ``os``.
    import os

    output_dir = self.settings['OUTPUT_DIR']
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    page_name = path.basename(urlparse(response.url).path) or 'index.html'
    filename = path.join(output_dir, page_name)

    # The prefix is a fixed string, so use a literal replace: a regex
    # would treat the unescaped dots as wildcards. The b'' literals are
    # plain str on Python 2 and match the bytes body on Python 3.
    body = response.body.replace(b'http://shop.babies.vn/', b'')
    with open(filename, 'wb') as f:
        f.write(body)
    # Single-argument print(...) emits the same text on Python 2 and
    # also parses on Python 3.
    print(filename)

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select(
        '//script/@src|/html/head/link[@rel="stylesheet"]/@href'
    ).extract()
    return web_item

Scrapy Open-Source Projects

Sorted using the "Most stars" filter

extract