我的Spider运行时没有显示任何错误,但是图像未存储在文件夹中,这是我的抓取文件:
Spider.py:
import scrapy
import re
import os
import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline

from production.items import ProductionItem, ListResidentialItem


class productionSpider(scrapy.Spider):
    """Follow each listing's "show photos" link and emit items carrying
    the photo URLs in ``image_urls`` so the enabled ImagesPipeline can
    download them.
    """

    name = "production"
    allowed_domains = ["someurl.com"]
    start_urls = [
        "someurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            # BUG FIX: the original did extract()[0] unguarded, which raises
            # IndexError on any page without the photo link.
            links = sel.xpath(
                '//a[@data-tealium-id="detail_nav_showphotos"]/@href'
            ).extract()
            if not links:
                continue
            yield scrapy.Request(
                urlparse.urljoin(response.url, links[0]),
                callback=self.parseBasicListingInfo,
                meta={'item': ProductionItem()},
            )

    # BUG FIX: the original signature was (item, response) -- a spider
    # callback's first positional parameter is ``self``, so ``response``
    # was bound to the wrong argument.  The meta item was also fetched and
    # then immediately thrown away by rebinding ``item``; the callback only
    # ever returned a ListResidentialItem, so build that directly.
    def parseBasicListingInfo(self, response):
        item = ListResidentialItem()
        # Strip whitespace around each extracted URL; an empty result simply
        # yields an empty list (the old try/except IndexError guarded
        # nothing, since extract() never raises IndexError).
        item['image_urls'] = [
            url.strip()
            for url in response.xpath(
                '//a[@itemprop="contentUrl"]/@data-href'
            ).extract()
        ]
        return item
settings.py:
# settings.py for the "production" Scrapy project.
# BUG FIX: removed `from scrapy.settings.default_settings import
# ITEM_PIPELINES` (immediately rebound below, so it did nothing) and the
# needless ImagesPipeline import -- settings modules should hold plain
# assignments only.

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1

# BUG FIX: the setting's name is IMAGES_STORE (plural).  The misspelled
# IMAGE_STORE is silently ignored, so the pipeline had nowhere to write --
# this is exactly why no images appeared on disk.
IMAGES_STORE = '/images'

CONCURRENT_REQUESTS = 250
DOWNLOAD_DELAY = 2

# BUG FIX: 'scrapy.contrib.pipeline.images.ImagesPipeline' is the removed
# pre-1.0 module path; modern Scrapy ships the pipeline under
# scrapy.pipelines.images.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
items.py
# -*- coding: utf-8 -*-
import scrapy


class ProductionItem(scrapy.Item):
    # URL of the "show photos" page discovered on a listing.
    img_url = scrapy.Field()


# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    # Field pair consumed and populated by Scrapy's ImagesPipeline.
    image_urls = scrapy.Field()
    images = scrapy.Field()
我的管道文件为空,我不确定应该添加到pipeline.py文件中。
我最终能够正常下载图像的可用版本如下:
spider.py:
import scrapy
import re
import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline

from production.items import ProductionItem
from production.items import ImageItem


class productionSpider(scrapy.Spider):
    """Follow each listing's photo-gallery link, then yield one ImageItem
    per <img> found so the ImagesPipeline downloads every picture.
    """

    name = "production"
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            # BUG FIX: the original extract()[0] raised IndexError on any
            # page that lacks the gallery link; guard and skip instead.
            links = sel.xpath('//a[@idd="followclaslink"]/@href').extract()
            if not links:
                continue
            yield scrapy.Request(
                urlparse.urljoin(response.url, links[0]),
                callback=self.parseImages,
                meta={'item': ProductionItem()},
            )

    def parseImages(self, response):
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            # BUG FIX: <img> tags without a src yield None, and relative
            # src values cannot be fetched by the pipeline.  Skip empties
            # and absolutize against the page URL.
            if img_url:
                yield ImageItem(image_urls=[response.urljoin(img_url)])
settings.py(下面的代码段中同时包含了 items.py 的内容):
BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True

# Correctly spelled plural setting -- this is what the pipeline reads.
IMAGES_STORE = '/Users/home/images'
DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# Disable cookies (enabled by default)


# ---------------------------------------------------------------- items.py
# -*- coding: utf-8 -*-
import scrapy


class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()


# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()


class ImageItem(scrapy.Item):
    # Field pair the ImagesPipeline expects on each yielded item.
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    """Download every URL in item['image_urls'] and record the results.

    NOTE(review): this class only runs if ITEM_PIPELINES points at
    'production.pipelines.MyImagesPipeline'; the settings shown above
    enable the stock ImagesPipeline instead, leaving this one inactive.
    """

    def get_media_requests(self, item, info):
        # One download request per collected image URL.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # Keep only the paths of downloads that succeeded.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # BUG FIX: scrapy Items reject undeclared keys, and no item class
        # declares an 'image_paths' field -- the original assignment raised
        # KeyError on every item.  Store the paths in the declared 'images'
        # field instead.
        item['images'] = image_paths
        return item