Scrapy Examples
Download an entire site with Scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
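To start the crawl, assuming the project scaffold is named myscrapy and the command is run from the project root, the spider can also be launched programmatically; a minimal sketch (the plain scrapy crawl baby command works just as well):

# run.py -- minimal sketch for starting the spider from a script
# assumption: the Scrapy project is named "myscrapy" and this file sits in the project root
from scrapy import cmdline

cmdline.execute("scrapy crawl baby".split())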
Download images, JS, and CSS files
- Define the fields that store the image and file URLs in items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
- Extract the image and file URLs to download in the spider baby.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem

class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
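A caveat not handled by the spider above: the extracted @src and @href values can be relative, while the images/files pipelines need absolute URLs to download them. A minimal sketch of making them absolute with urljoin (the helper name absolutize is only an illustration):

# sketch: make possibly-relative URLs absolute before putting them on the item
from urlparse import urljoin   # Python 2, as in the rest of the examples

def absolutize(base_url, urls):
    # urljoin leaves URLs that are already absolute untouched
    return [urljoin(base_url, u) for u in urls]

# inside parse_template:
#   web_item['image_urls'] = absolutize(response.url, selector.select('//img/@src').extract())
#   web_item['file_urls'] = absolutize(response.url, selector.select('//script/@src').extract())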
- Define the pipelines that store the images and files in settings.py
import os

# Assumption: PROJECT_DIR points at the project root (adjust if it is defined differently)
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))

# the numbers define the pipeline order (lower runs first)
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'scrapy.contrib.pipeline.images.ImagesPipeline': 200,
    'scrapy.contrib.pipeline.files.FilesPipeline': 300,
}

IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
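ITEM_PIPELINES above also lists myscrapy.pipelines.MyscrapyPipeline, which is not shown on this page; a minimal pass-through placeholder (an assumption about its contents) would be:

# pipelines.py -- placeholder item pipeline (assumed implementation)
class MyscrapyPipeline(object):
    def process_item(self, item, spider):
        # no extra processing here; the images/files pipelines do the downloading
        return item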
Change the directory layout and filenames used for storing images and files
- Create custom pipelines that extend FilesPipeline and ImagesPipeline in pipelines.py
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline
from urlparse import urlparse

class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the image under its original URL path instead of the default SHA1-hash name
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # store the file under its original URL path instead of the default SHA1-hash name
        o = urlparse(url)
        return o.path
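With these overrides the stored path mirrors the original URL path instead of the default SHA1-hash filename, for example (the URL is hypothetical):

from urlparse import urlparse   # Python 2, as in pipelines.py

o = urlparse('http://shop.babies.vn/templates/css/style.css')   # hypothetical URL
print o.path   # '/templates/css/style.css', stored below IMAGES_STORE / FILES_STORE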
- Edit settings.py so that ITEM_PIPELINES uses the custom pipelines for storing files and images:
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 100,
    'myscrapy.pipelines.MyFilesPipeline': 200,
    'myscrapy.pipelines.MyImagesPipeline': 300,
}
Change the spider code so that the start_urls are parsed as well
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem

from os import path
from urlparse import urlparse

class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn"
    ]
    rules = [
        Rule(sle(allow=("/*.html")), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # CrawlSpider does not pass start_urls responses to the rule callbacks,
        # so override parse_start_url to process them as well
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
Modify myparse to replace http://shop.babies.vn/ with '' in all saved HTML files (so the links become relative)
# additional imports needed at the top of the spider module:
#   import os
#   import re

def myparse(self, response):
    # write the rewritten HTML into OUTPUT_DIR (a custom setting, see settings.py)
    output_dir = self.settings['OUTPUT_DIR']
    if not path.exists(output_dir):
        os.makedirs(output_dir)
    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)
    with open(filename, 'wb') as f:
        # rewrite absolute links to the site root so the saved pages work offline
        body = re.sub(r'http://shop\.babies\.vn/', '', response.body)
        f.write(body)
    print filename
    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select('//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
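myparse now reads a custom OUTPUT_DIR setting that is not defined anywhere on this page; a minimal addition to settings.py could look like this (the directory name is an assumption). Note that self.settings assumes a Scrapy version that exposes the project settings on the spider instance.

# settings.py -- custom setting used by myparse (directory name is an assumption)
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'output')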
Scrapy Open Source Projects
Sorted using GitHub's "Most stars" filter
- https://github.com/gnemoug/distribute_crawler → distributed spiders
- https://github.com/darkrho/scrapy-redis → distributed spiders with a single Redis instance for collecting items
- https://github.com/holgerd77/django-dynamic-scraper → manage Scrapy spiders via the Django admin
- https://github.com/scrapinghub/scrapylib → collection of code
- https://github.com/kalessin/finance → crawls finance data and stores it in a MySQL server
- https://github.com/dcondrey/scrapy-spiders → collection of spiders
- https://github.com/arthurk/scrapy-german-news → good spider (simhash algorithm, SQLite)
- https://github.com/jackliusr/scrapy-crawlers → collection of spiders
- https://github.com/hemslo/poky-engine → good architecture
- https://github.com/rahulrrixe/Financial-News-Crawler → spiders configured via JSON, crawled data stored in MongoDB
- https://github.com/shijilspark/scrapy → Scrapy combined with a Django app and crontab
- https://github.com/walbuc/Django-Scrapy → good architecture with Django:
 - SQLite 3 in dev, PostgreSQL or MongoDB in prod
 - uses Celery as a distributed task queue
 - uses a Redis server for caching
 - Scrapy exposes a REST web service and Django uses the REST API to fetch the data
- https://github.com/lodow/portia-proxy → annotate a web page to identify the data you wish to extract
- https://github.com/voliveirajr/seleniumcrawler → Scrapy with Selenium