from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = ["http://shop.babies.vn/index.php"]
    # Follow every link whose URL ends in .html and hand the response to
    # parse_template (note the escaped dot and the trailing comma, which
    # makes `allow` a tuple of regexes rather than a bare string).
    rules = [
        Rule(sle(allow=(r'.*\.html',)), callback='parse_template'),
    ]

    def parse_template(self, response):
        # Save the raw page body under the last path segment of the URL.
        filename = response.url.split("/")[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)
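Assuming the standard project layout (the later imports suggest the project is named myscrapy), saving this as myscrapy/spiders/baby.py lets the crawl be started from the project root:

scrapy crawl baby

Each .html page the crawler reaches is then written into the current working directory under its last URL segment.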
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    file_urls = Field()
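image_urls and file_urls are the default field names that Scrapy's ImagesPipeline and FilesPipeline read download URLs from, which is why no extra configuration is needed later. A possible extension (an assumption, not part of the original) is to also declare the result fields the pipelines fill in when they are present on the item:

from scrapy.item import Item, Field


class WebItem(Item):
    image_urls = Field()
    images = Field()      # populated by ImagesPipeline with download results
    file_urls = Field()
    files = Field()       # populated by FilesPipeline with download results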
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = ["http://shop.babies.vn/index.php"]
    rules = [
        Rule(sle(allow=(r'.*\.html',)), callback='parse_template'),
    ]

    def parse_template(self, response):
        print response.url
        filename = response.url.split("/")[-1]
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)

        # Collect the URLs of every image, script and stylesheet on the
        # page so the pipelines can download them later.
        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select(
            '//script/@src | /html/head/link[@rel="stylesheet"]/@href').extract()
        yield web_item
import os

# Enable the pipelines; the dict values set the order in which they run
# (lower numbers run first).
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
    'scrapy.contrib.pipeline.images.ImagesPipeline': 1,
    'scrapy.contrib.pipeline.files.FilesPipeline': 2,
}

# PROJECT_DIR is not a built-in Scrapy setting; it is assumed to be
# defined earlier in settings.py (see the sketch below).
IMAGES_STORE = os.path.join(PROJECT_DIR, 'media/images')
FILES_STORE = os.path.join(PROJECT_DIR, 'media/files')
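Since PROJECT_DIR is not something Scrapy provides, here is a minimal sketch of defining it at the top of settings.py (assuming the media folders should live next to the project code):

import os

# Directory containing settings.py, used as the project root for media.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))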
from urlparse import urlparse

from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline
from scrapy.contrib.pipeline.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # Keep the image under its original URL path instead of a hash name.
        o = urlparse(url)
        return o.path


class MyFilesPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # Keep the file under its original URL path instead of a hash name.
        o = urlparse(url)
        return o.path
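With this override each downloaded asset keeps its original URL path (relative to IMAGES_STORE / FILES_STORE) instead of Scrapy's default SHA1-hash file name, so the saved site keeps its directory layout. Note that urlparse drops the query string; a quick illustration (the example URL is made up):

from urlparse import urlparse

o = urlparse('http://shop.babies.vn/images/logo.png?v=2')
print o.path  # -> '/images/logo.png', the path used under *_STORE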
# Swap in the subclassed pipelines so only the custom versions run.
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,
    'myscrapy.pipelines.MyFilesPipeline': 2,
    'myscrapy.pipelines.MyImagesPipeline': 1,
}
from os import path
from urlparse import urlparse

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle

from myscrapy.items import WebItem


class BabySpider(CrawlSpider):
    name = "baby"
    allowed_domains = ["babies.vn"]
    start_urls = ["http://shop.babies.vn"]
    rules = [
        Rule(sle(allow=(r'.*\.html',)), callback='parse_template'),
    ]

    def parse_start_url(self, response):
        # CrawlSpider rules are not applied to the start URLs themselves,
        # so the front page is handled through this hook.
        yield self.myparse(response)

    def myparse(self, response):
        print response.url
        o = urlparse(response.url)
        filename = path.basename(o.path)
        if filename == '':
            filename = 'index.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        print filename

        selector = HtmlXPathSelector(response)
        web_item = WebItem()
        web_item['image_urls'] = selector.select('//img/@src').extract()
        web_item['file_urls'] = selector.select(
            '//script/@src | /html/head/link[@rel="stylesheet"]/@href').extract()
        return web_item

    def parse_template(self, response):
        yield self.myparse(response)
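Two details are worth noting in this version, both consequences of how CrawlSpider works: the rules only fire for pages reached through extracted links, so the front page itself has to go through parse_start_url; and because the start URL has no path component, basename(o.path) comes back empty, which is why the fallback to index.html is needed.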
# requires `import os`, `import re` and `from os import path` at the top
# of the spider file

def myparse(self, response):
    # Write every page into OUTPUT_DIR instead of the working directory,
    # creating the directory on first use.
    output_dir = self.settings['OUTPUT_DIR']
    if not path.exists(output_dir):
        os.makedirs(output_dir)

    o = urlparse(response.url)
    filename = path.basename(o.path)
    if filename == '':
        filename = 'index.html'
    filename = path.join(output_dir, filename)
    with open(filename, 'wb') as f:
        # Rewrite absolute links into relative ones (dots escaped so they
        # do not match arbitrary characters) before saving the page.
        body = re.sub(r'http://shop\.babies\.vn/', '', response.body)
        f.write(body)
    print filename

    selector = HtmlXPathSelector(response)
    web_item = WebItem()
    web_item['image_urls'] = selector.select('//img/@src').extract()
    web_item['file_urls'] = selector.select(
        '//script/@src | /html/head/link[@rel="stylesheet"]/@href').extract()
    return web_item
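OUTPUT_DIR is a custom setting, so it has to be supplied at run time; one way (assuming a Scrapy version where spiders expose self.settings) is the -s command-line override:

scrapy crawl baby -s OUTPUT_DIR=output

The re.sub call turns absolute references such as http://shop.babies.vn/style.css into relative ones (style.css), so the saved copy can be browsed offline alongside the assets the pipelines download.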