Scrapy Examples
Download an entire site with Scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
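parse_template names each saved file after the last segment of the page URL. A quick sketch of that naming (the URL below is only an example, not one from the crawl):

# hypothetical product URL, only to illustrate the filename logic above
url = "http://shop.babies.vn/ao-so-sinh.html"
print url.split("/")[-1]   # -> ao-so-sinh.html (the page body is written to this file)

If a URL ends with a slash this gives an empty filename; the later sections switch to an index.html fallback for that case.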
Download images, JS and CSS files
- Define fields for storing image and file URLs in items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
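By convention the built-in ImagesPipeline and FilesPipeline read the image_urls and file_urls fields, and write their download results back into images and files fields if the item declares them. If you want to keep those results, a sketch of the extended item, assuming the default field names:

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()  # input: image URLs for ImagesPipeline
    images = Field()      # output: download results written back by ImagesPipeline
    file_urls = Field()   # input: file URLs for FilesPipeline
    files = Field()       # output: download results written back by FilesPipeline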
- Extract the image and file URLs for downloading in the spider baby.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        # save the page body under the last segment of the URL
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)

        # collect image, script and stylesheet URLs for the download pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
- Enable the pipelines for storing images and files in settings.py
import os

# PROJECT_DIR is assumed to be defined earlier in settings.py
# (for example, the directory containing the project)
ITEM_PIPELINES = [
    'myscrapy.pipelines.MyscrapyPipeline',
    'scrapy.contrib.pipeline.images.ImagesPipeline',
    'scrapy.contrib.pipeline.files.FilesPipeline',
]

IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
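With the stock pipelines the downloads are not kept under their original names: the classic contrib ImagesPipeline stores each image under a SHA1 hash of its URL inside IMAGES_STORE. A rough sketch of that naming, assuming the classic hash-based scheme and a made-up URL, which is the behaviour the next section overrides:

import hashlib

url = "http://shop.babies.vn/images/logo.jpg"  # hypothetical image URL
print 'full/%s.jpg' % hashlib.sha1(url).hexdigest()
# -> full/<40-character hash>.jpg inside IMAGES_STORE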
Change the directory structure and filenames used for storing images and files
- Create custom pipelines that extend FilesPipeline and ImagesPipeline in pipelines.py
# imports needed by the overrides below
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the image under its original site path instead of a SHA1 hash
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the file under its original site path instead of a SHA1 hash
        o = urlparse(url)
        return o.path
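Returning o.path makes every download keep the path it had on the site, relative to IMAGES_STORE / FILES_STORE. A small sketch with a made-up URL:

from urlparse import urlparse

url = "http://shop.babies.vn/skin/frontend/default/images/logo.gif"  # hypothetical URL
print urlparse(url).path
# -> /skin/frontend/default/images/logo.gif, saved under that relative path in the store directory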
- Edit settings.py to use the custom pipelines for storing files and images:
ITEM_PIPELINES = [
    'myscrapy.pipelines.MyscrapyPipeline',
    'myscrapy.pipelines.MyFilesPipeline',
    'myscrapy.pipelines.MyImagesPipeline',
]
Change the code for parsing start_urls in the spider
from os import path
from urlparse import urlparse

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # CrawlSpider does not run the rule callbacks on start_urls,
        # so parse the start page here as well
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename

        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
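parse_start_url is the hook CrawlSpider calls for the start_urls responses, so the start page now goes through the same myparse logic as the crawled pages. The basename fallback covers URLs without a path component; a sketch with example URLs:

from os import path
from urlparse import urlparse

# example URLs only, to show the index.html fallback in myparse
for url in ["http://shop.babies.vn", "http://shop.babies.vn/ao-so-sinh.html"]:
    name = path.basename(urlparse(url).path)
    print name or 'index.html'
# -> index.html
# -> ao-so-sinh.html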
Modify myparse to replace http://shop.babies.vn with '' in all HTML files
def myparse(self, response):
    # requires "import os" and "import re" at module level;
    # OUTPUT_DIR is a custom setting expected in settings.py
    output_dir = path.join(self.settings['OUTPUT_DIR'])
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)

    with open(filename, 'wb') as f:
        # rewrite absolute links to relative ones so the saved pages work offline
        body = re.sub(r'http://shop.babies.vn/', '', response.body)
        f.write(body)
    print filename

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
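The re.sub call turns absolute links back into site-relative ones, so the saved HTML points at the locally stored images and files. A quick sketch with a made-up fragment:

import re

# hypothetical HTML fragment, only to illustrate the rewrite in myparse
body = '<img src="http://shop.babies.vn/images/logo.gif"/>'
print re.sub(r'http://shop.babies.vn/', '', body)
# -> <img src="images/logo.gif"/>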
Scrapy Open Source Projects
- https://github.com/darkrho/scrapy-redis → distributed spiders sharing a single Redis instance for receiving items
- https://github.com/gnemoug/distribute_crawler → distributed spiders
- https://github.com/holgerd77/django-dynamic-scraper → manage Scrapy spiders via the Django admin
- https://github.com/arthurk/scrapy-german-news → good spider example
- https://github.com/hemslo/poky-engine → good architecture example
- https://github.com/rahulrrixe/Financial-News-Crawler → spiders configured from JSON data
- https://github.com/shijilspark/scrapy → Scrapy deals crawler combined with a Django app