====== Scrapy Examples ======
===== Download an entire site with scrapy =====
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php",
    ]
    rules = [
        Rule(sle(allow=(r'.*\.html', )), callback='parse_template'),
    ]

    def parse_template(self, response):
        # save every crawled page under its file name (last segment of the URL)
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
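To run the spider, execute scrapy crawl baby from the project directory, or do the same thing from a small helper script (the run.py file name here is just an example):
# run.py - equivalent to running `scrapy crawl baby` in the project directory
from scrapy import cmdline

cmdline.execute("scrapy crawl baby".split())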
==== Download images, js and css files ====
* define fields for storing image and file URLs in items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()   # image URLs picked up by ImagesPipeline
    file_urls = Field()    # js/css URLs picked up by FilesPipeline
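Note that by default the pipelines write their download results back onto the item (into images and files fields); when the item is a declared Item subclass those fields must exist as well, otherwise the assignment raises a KeyError. A minimal extension of the item above:
from scrapy.item import Item, Field

class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
    # result fields filled in by ImagesPipeline / FilesPipeline after download
    images = Field()
    files = Field()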
* extract the image and file URLs to download in the spider baby.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn/index.php",
    ]
    rules = [
        Rule(sle(allow=(r'.*\.html', )), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        filename = response.url.split("/")[-1]
        print filename
        # save the raw html page
        with open(filename, 'wb') as f:
            f.write(response.body)
        # collect image, script and stylesheet URLs for the media pipelines
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select(
            '//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
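One caveat the snippet above does not handle: ImagesPipeline and FilesPipeline need absolute URLs, while //img/@src often returns relative ones. A minimal sketch of normalizing the extracted lists first (same Python 2 / old-Scrapy setup assumed; the absolutize helper is this example's own):
from urlparse import urljoin

def absolutize(response, urls):
    # resolve relative src/href values against the page URL
    return [urljoin(response.url, u) for u in urls]

# inside parse_template:
#   web_item['image_urls'] = absolutize(response, selector.select('//img/@src').extract())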
* enable the pipelines that store the images and files in settings.py
import os
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))  # assumed project dir; adjust to your layout

ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
    'scrapy.contrib.pipeline.images.ImagesPipeline': 1,
    'scrapy.contrib.pipeline.files.FilesPipeline': 2,
}
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
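For reference, the stock pipelines rename every download to the SHA1 hash of its URL under a full/ directory; the next section overrides file_path() so the original URL path is kept instead. A quick sketch of the path-preserving idea (the URL and the hash are illustrative only):
from urlparse import urlparse

url = 'http://shop.babies.vn/images/logo.png'
print urlparse(url).path   # -> '/images/logo.png', reused as the storage path below
# the default pipelines would store the same file as e.g. 'full/3f786850e3...0b.png'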
==== Change the directory layout and file names used for storing images and files ====
* Create custom pipelines extending FilesPipeline and ImagesPipeline in pipelines.py
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # keep the original URL path (e.g. /images/logo.png) instead of the default sha1 name
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # keep the original URL path (e.g. /css/style.css) instead of the default sha1 name
        o = urlparse(url)
        return o.path
* edit settings.py so the custom pipelines are used for storing files and images:
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
    'myscrapy.pipelines.MyFilesPipeline': 2,
    'myscrapy.pipelines.MyImagesPipeline': 1,
}
==== Change the spider so the start_urls responses are parsed as well ====
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from os import path
from urlparse import urlparse

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = [
        "http://shop.babies.vn",
    ]
    rules = [
        Rule(sle(allow=(r'.*\.html', )), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # CrawlSpider does not run rule callbacks on the start_urls responses,
        # so handle the landing page here as well
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select(
            '//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
==== Modify myparse to replace http://shop.babies.vn with '' in all saved html files ====
# needs "import os" and "import re" at the top of the spider module
def myparse(self, response):
    # OUTPUT_DIR is a custom setting (see below); create the directory if it does not exist yet
    output_dir = self.settings['OUTPUT_DIR']
    if not path.exists(output_dir):
        os.makedirs(output_dir)
    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)
    with open(filename, 'wb') as f:
        # strip the absolute site prefix so the saved pages link to the local copies
        body = re.sub(r'http://shop\.babies\.vn/', '', response.body)
        f.write(body)
    print filename
    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select(
        '//script/@src|/html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
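OUTPUT_DIR is not a built-in Scrapy setting, so it has to be defined somewhere. One option (the name and location are this example's own choice) is to add it to settings.py next to the other storage paths; it can also be overridden per run with scrapy crawl baby -s OUTPUT_DIR=/some/other/dir:
# settings.py -- custom setting read by myparse() above
OUTPUT_DIR = os.path.join(PROJECT_DIR, 'media')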
===== Scrapy Open Source Projects =====
Sorted on GitHub using the **Most stars** filter
* https://github.com/scrapinghub/portia
* https://github.com/gnemoug/distribute_crawler -> distributed spiders
* https://github.com/darkrho/scrapy-redis -> distributed spiders sharing a single redis instance for collecting items
* https://github.com/geekan/scrapy-examples
* https://github.com/holgerd77/django-dynamic-scraper -> Manage scrapy spiders via django admin
* https://github.com/scrapinghub/scrapyjs
* https://github.com/scrapy/scrapyd
* https://github.com/scrapinghub/scrapylib -> collection of code
* https://github.com/mvanveen/hncrawl
* https://github.com/scrapinghub/scrapyrt
* https://github.com/aivarsk/scrapy-proxies
* https://github.com/kalessin/finance -> crawls finance data and stores it in a MySQL server
* https://github.com/istresearch/scrapy-cluster
* https://github.com/scrapy/scrapely
* https://github.com/octoberman/scrapy-indeed-spider
* https://github.com/dcondrey/scrapy-spiders -> collection of spiders
* https://github.com/arthurk/scrapy-german-news -> good spider (with simhash algorithm, sqlite)
* https://github.com/jackliusr/scrapy-crawlers -> collection of spiders
* https://github.com/hemslo/poky-engine -> good architecture
* https://github.com/anderson916/google-play-crawler
* https://github.com/arpitrai/Daily-Deal-Aggregator
* https://github.com/duydo/scrapy-crunchbase
* https://github.com/richardkyeung/pandora-food-scrapy
* https://github.com/rahulrrixe/Financial-News-Crawler -> spiders configured via JSON, crawled data stored in MongoDB
* https://github.com/supercoderz/hydbusroutes
* https://github.com/shijilspark/scrapy -> scrapy deals with django app and crontab
* https://github.com/walbuc/Django-Scrapy -> good architecture with django:
* sqlite3 in dev, postgres or mongodb in prod
* uses Celery, a distributed task queue
* uses a redis server for caching
* Scrapy exposes a REST web service and Django consumes that REST API to get the data
* https://github.com/sdiwu/PlayStoreScrapy
* https://github.com/amferraz/9gag-scraper
* https://github.com/junwei-wang/GoogleLoginSpider
* https://github.com/lodow/portia-proxy -> you can **annotate a web page** to identify the data you wish to extract
* https://github.com/voliveirajr/seleniumcrawler -> scrapy with selenium
* https://github.com/pelick/VerticleSearchEngine
* https://github.com/eliangcs/pystock-crawler