我有以下代码用于scrapy框架:
# -*- coding: utf-8 -*- import scrapy from scrapy.contrib.spiders import Rule from scrapy.linkextractors import LinkExtractor from lxml import html class Scrapy1Spider(scrapy.Spider): name = "scrapy1" allowed_domains = ["sfbay.craigslist.org"] start_urls = ( 'http://sfbay.craigslist.org/search/npo', ) rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),) def parse(self, response): site = html.fromstring(response.body_as_unicode()) titles = site.xpath('//div[@class="content"]/p[@class="row"]') print len(titles), 'AAAA'
但是问题是我得到100个结果,但没有转到下一页。
你的 rule 没有被使用，因为你没有使用 CrawlSpider。
rule
CrawlSpider
因此，你必须手动为下一页创建 requests，如下所示：
requests
# -*- coding: utf-8 -*-
import scrapy
from lxml import html


class Scrapy1Spider(scrapy.Spider):
    """Craigslist spider that paginates manually.

    A plain scrapy.Spider ignores CrawlSpider-style `rules`, so the dead
    (and mis-capitalized) `Rules` attribute from the original has been
    removed; instead `parse` extracts the "next" button's href itself and
    yields a follow-up Request for it.
    """
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    def parse(self, response):
        # Parse the raw page with lxml (as in the original question code).
        site = html.fromstring(response.body_as_unicode())
        # One <p class="row"> per listing on the results page.
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        # Debug output: number of listings found on this page.
        print(len(titles), 'AAAA')

        # Follow the next-page link manually.
        next_page = response.xpath('.//a[@class="button next"]/@href').extract()
        if next_page:
            # urljoin resolves relative hrefs against the response URL,
            # instead of hard-coding the 'http://sfbay.craigslist.org' prefix.
            yield scrapy.Request(url=response.urljoin(next_page[0]))
或者，像下面这样使用 CrawlSpider：
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html


class Scrapy1Spider(CrawlSpider):
    """Craigslist spider that paginates via CrawlSpider rules.

    Because this subclasses CrawlSpider, the `rules` attribute IS
    processed: the LinkExtractor follows the "next" button on every page
    and hands each fetched page to `parse_page`. The callback is
    deliberately NOT named `parse` — CrawlSpider uses `parse` internally.
    """
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = (
        Rule(
            LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_page(self, response):
        # Parse the raw page with lxml (as in the original question code).
        site = html.fromstring(response.body_as_unicode())
        # One <p class="row"> per listing on the results page.
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        # Debug output: number of listings found on this page.
        # print() function instead of the Python-2-only print statement,
        # which is a syntax error on the Python 3 modern Scrapy requires.
        print(len(titles), 'AAAA')