Python scrapy 模块,spiders() 实例源码


def start_requests(self):
        """Generate the initial requests of ArchiveSpider.

        See the Scrapy documentation for ``Spider.start_requests``.

        The most important part of this function is to attach a request meta,
        'archive_meta', according to the site's 'archive_rules'. The meta is
        used to parse article URLs from the response and to generate the next
        request.

        Yields
        ------
        scrapy.Request
            One request per entry of ``self.page_templates``, built from the
            template formatted with the starting page number
            ``self.p_kw['start']`` and handled by ``self.parse``.
        """
        for page in self.page_templates:
            start_page = self.p_kw['start']
            url = page.format(p_num=start_page)
            # NOTE(review): in this copy of the source the inner ``dict(`` call
            # was cut off after the opening parenthesis, so the original
            # 'archive_meta' fields are unknown. The template and the page
            # number are recorded here as a minimal reconstruction -- restore
            # the full field list from the upstream project and confirm the
            # key names against what ``self.parse`` reads.
            meta = dict(archive_meta=dict(page=page, p_num=start_page))
            logger.debug('Page format meta info:\n%s', pprint.pformat(meta))
            yield scrapy.Request(url, callback=self.parse, meta=meta)
def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for FeedSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of feed URLs of the site.
        provider : string
            The provider of RSS feed (keyword, defaults to 'self').
        url_regex : string
            URL pattern regular expression (keyword, defaults to None).

        If you use this spider to store item into database, additional
        keywords are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.

        Other keywords are used to specify how to parse the XML:
        ``iterator`` (defaults to 'iternodes') and ``itertag`` (defaults to
        'item'); see the Scrapy XMLFeedSpider documentation. Any remaining
        positional and keyword arguments are forwarded to the parent
        constructor.
        """
        self.platform_id = kwargs.pop('platform_id', None)
        self.session = kwargs.pop('session', None)
        self.url_regex = kwargs.pop('url_regex', None)
        self.provider = kwargs.pop('provider', 'self')
        self.iterator = kwargs.pop('iterator', 'iternodes')
        # Read the 'itertag' keyword here. The previous code popped 'iterator'
        # a second time, which is already consumed by the line above, so the
        # caller's itertag was never read and the default was always used.
        self.itertag = kwargs.pop('itertag', 'item')
        self.allowed_domains = domains
        self.start_urls = urls
        super(FeedSpider, self).__init__(*args, **kwargs)
def parse(self, response):
        """Collect the movie fields of one list page into a single item.

        Every field is filled with the list returned by ``extract()`` for its
        XPath, so one yielded ``DoubanTopMoviesItem`` carries the values for
        all movies matched on the page.

        Parameters
        ----------
        response
            The downloaded page; only its ``xpath()`` method is used.

        Yields
        ------
        DoubanTopMoviesItem
            Item with 'title_ch', 'rating_num', 'rating_count', 'image_urls'
            and 'topid' populated.
        """
        item = DoubanTopMoviesItem()
        item['title_ch'] = response.xpath('//div[@class="hd"]//span[@class="title"][1]/text()').extract()

        # NOTE(review): the original explanation here was lost to mis-encoding.
        # Presumably the second title / "other" spans are not present for every
        # movie, so these extractions were disabled -- confirm before enabling.
        # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
        # item['title_en'] = [en.replace('\xa0/\xa0','').replace('  ','') for en in en_list]
        # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
        # item['title_ht'] = [ht.replace('\xa0/\xa0','').replace('  ','') for ht in ht_list]
        # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
        # item['detail'] = [detail.replace('  ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
        # NOTE(review): original comment garbled; presumably some movies have
        # no quote, which is why this field is disabled as well.
        # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()

        item['rating_num'] = response.xpath('//div[@class="star"]/span[2]/text()').extract()
        # Keep only the first run of digits of each rating-count text.
        count_list = response.xpath('//div[@class="star"]/span[4]/text()').extract()
        # re.search with a raw pattern replaces re.findall(...)[0]: a text
        # without digits now yields '' instead of raising IndexError, which
        # keeps this list the same length as count_list.
        digit_matches = (re.search(r'\d+', count) for count in count_list)
        item['rating_count'] = [found.group() if found else '' for found in digit_matches]
        item['image_urls'] = response.xpath('//div[@class="pic"]/a/img/@src').extract()
        item['topid'] = response.xpath('//div[@class="pic"]/em/text()').extract()

        yield item

        # Disabled: follow the page's rel="next" link to request the next page.
        # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        # if new_url:
        #     next_url = self.base_url+new_url
        #     yield scrapy.Request(next_url, callback=self.parse)

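######------- Alternative: start_urls with a LinkExtractor rule (original heading lost to mis-encoding) --------#####
    # from scrapy.spiders import CrawlSpider, Rule
    # from scrapy.linkextractors import LinkExtractor
    # class SpDoubanSpider(CrawlSpider):
        # ... (original comment unreadable; the rest of the class body is not shown)
    # Follow every link whose URL matches ``?start=<digits>`` and pass each page to parse_item.
    # rules = [Rule(LinkExtractor(allow=(r'\?start=\d+.*')),
    #                callback='parse_item', follow=True)
    #           ]
    # def parse_item(self, response):
    #     # build the item here (original comment unreadable)
    #     yield item
######------- End of the start_urls / LinkExtractor alternative --------#####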