Python scrapy module: Selector() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use scrapy.Selector().
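Before the project examples, here is a minimal stand-alone sketch of the two ways a Selector is typically constructed in the snippets below: from a raw HTML string via Selector(text=...) and from a Response object. The HTML snippet and URL are invented for illustration and are not taken from any of the projects listed.

# Minimal usage sketch of scrapy.Selector (the HTML and URL below are made up for illustration).
from scrapy import Selector
from scrapy.http import HtmlResponse

html = '<ul class="l2"><li>1.2.3.4</li><li>8080</li></ul>'

# 1) Build a Selector directly from an HTML string.
sel = Selector(text=html)
print(sel.xpath('//ul[@class="l2"]/li/text()').extract())   # ['1.2.3.4', '8080']
print(sel.css('ul.l2 li::text').extract_first())            # '1.2.3.4'

# 2) Build a Selector from a Response object, as most of the spiders below do.
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
sel = Selector(response)
print(sel.xpath('//li/text()').extract())                    # ['1.2.3.4', '8080']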

Project: rental    Author: meihuanyu
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tr[@class="cells"]').extract()
        for i, info in enumerate(infos):
            val = Selector(text = info)
            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[5]/text()').extract_first()
            anonymity = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: rental    Author: meihuanyu
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//ul[@class="l2"]').extract()
        for i, info in enumerate(infos):
            val = Selector(text = info)
            ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
            port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
            anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
            https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
            country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: rental    Author: meihuanyu
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[3]/div/text()').extract_first()
            anonymity = val.xpath('//td[6]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: web_crawler    Author: NearXdu
def parse(self, response):
        sel=scrapy.Selector(response)
        links_in_a_page = sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=OschinaItem()
            link=str(link_sel.re('href="(.*?)"')[0])
            if link:
                if not link.startswith('http'):
                    link=response.url+link
                yield scrapy.Request(link,callback=self.parse)
                item['link']=link
                link_text=link_sel.xpath('text()').extract()
                if link_text:
                    item['link_text']=str(link_text[0].encode('utf-8').strip())
                else:
                    item['link_text']=None

                yield item
Project: SinaWeiboSpider    Author: wen-fei
def parse_user_0(self, response):
        """ ??????-???????????????? """
        user_item = UserItem()
        selector = Selector(response)
        text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
        if text0:
            num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)  # number of tweets
            num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of follows
            num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)  # number of fans
            if num_tweets:
                user_item["ctweets"] = int(num_tweets[0])
            if num_follows:
                user_item["cfollows"] = int(num_follows[0])
            if num_fans:
                user_item["cfans"] = int(num_fans[0])
            user_item["_id"] = response.meta["user_id"]
            url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
            yield Request(url=url_information1, meta={"item": user_item}, callback=self.parse_user_1)
Project: SinaWeiboSpider    Author: wen-fei
def parse_user_1(self, response):
        """ ??????2 """
        user_item = response.meta["item"]
        selector = Selector(response)
        text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())  # ????????text()

        nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)  # nickname
        intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)  # introduction
        auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)  # verification

        gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)  # gender
        place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)  # location (province and city)
        birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)  # birthday
        sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
        marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # relationship status
        url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)  # website

        if nickname:
            user_item["nickname"] = nickname[0]
        if auth:
            user_item["auth"] = auth[0]
        if intro:
            user_item["intro"] = intro[0]
        user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        yield user_item
Project: my-scrapy    Author: azraelkuan
def get_xicidaili():
    url = "http://www.xicidaili.com/nn/%s"
    for i in range(1, 2):
        page_url = url % str(i)
        print(page_url)
        s = requests.session()
        req = s.get(page_url, headers=headers)
        selector = Selector(text=req.text)
        ip_nodes = selector.xpath("//table//tr")
        for each in ip_nodes[1:]:
            ip = each.xpath("./td[2]/text()").extract()[0]
            port = each.xpath("./td[3]/text()").extract()[0]
            http_type = each.xpath("./td[6]/text()").extract()[0]
            if http_type == "HTTP":
                proxies = {
                    "http": "%s://%s:%s" % ("http", ip, port),
                    "https": "%s://%s:%s" % ("http", ip, port),
                    }
                try:
                    r = requests.get('http://www.ip138.com/', proxies=proxies, timeout=5)
                    if r.status_code == 200:
                        print("%s:%s is valid" % (ip, port))
                except:
                    print("%s:%s is not valid" % (ip, port))
Project: autoinjection    Author: ChengWiLL
def parse(self,response):
        sel = scrapy.Selector(response)
        article_info = sel.xpath("//a")

        for info in article_info:
            item = GovcrawlItem()
            link = info.xpath('@href').extract()
            if not link:
                continue
            position = link[0].find("/")
            if position < 0 or "?" not in link[0]:
                continue
            elif "http" not in link[0]:
                url = response.url + link[0][position:]
            else:
                url = link[0]
            yield scrapy.Request(url,callback=self.parse)
            item['link'] = url
            title = info.xpath('text()').extract()
            if title:
                item['title'] = title[0]
            else:
                item['title'] = None
            #print item['link']
            yield item
Project: scrapy-training    Author: scrapinghub
def parse_page(self, response):
        next_page = response.meta.get('page') + 1
        json_data = json.loads(response.text)
        if json_data.get('type') != 'success':
            return
        articles = scrapy.Selector(text=json_data.get('html')).css('article')
        for article in articles:
            yield {
                'author': article.css('div.author-meta a ::text').extract_first(),
                'date': article.css('div.clock-meta a ::text').extract_first(),
                'title': article.css('h1.entry-title ::text').extract_first()
            }
        yield scrapy.FormRequest(
            self.scrolling_url, formdata={'action': 'infinite_scroll', 'page': str(next_page), 'order': 'DESC'},
            callback=self.parse_page, meta={'page': next_page}
        )
Project: Scrapy_CrawlMeiziTu    Author: williamzxl
def parse_item(self, response):
         item = CrawlmeizituItem()
         selector = scrapy.Selector(response)

         image_title = selector.xpath('//h2/a/text()').extract()
         image_url = selector.xpath('//h2/a/@href').extract()
         image_tags = selector.xpath('//div[@class="metaRight"]/p/text()').extract()
         if selector.xpath('//*[@id="picture"]/p/img/@src').extract():
            image_src = selector.xpath('//*[@id="picture"]/p/img/@src').extract()
         else:
            image_src = selector.xpath('//*[@id="maincontent"]/div/p/img/@src').extract()
         if selector.xpath('//*[@id="picture"]/p/img/@alt').extract():
             pic_name = selector.xpath('//*[@id="picture"]/p/img/@alt').extract()
         else:
            pic_name = selector.xpath('//*[@id="maincontent"]/div/p/img/@alt').extract()
         #//*[@id="maincontent"]/div/p/img/@alt
         item['title'] = image_title
         item['url'] = image_url
         item['tags'] = image_tags
         item['src'] = image_src
         item['alt'] = pic_name
         print(item)
         time.sleep(1)
         yield item
Project: crawllagou    Author: ScarecrowFu
def parse(self,response):
        sel = Selector(response)
        keys = sel.xpath('//*[@class="menu_main job_hopping"]/h2/text()').extract()
        i = 1
        item = defaultdict(list)
        for key in keys:
            if key.strip() != '':
                print "test"
                print key.strip()
                try:
                    print i
                    item[key.strip()].append(sel.xpath('//*[@class="menu_box"][{}]/div[2]/dl/dd/a/text()'.format(i)).extract())
                    i = i + 1
                    # item["key"].append(key)
                except Exception, e:
                    print e
            else:
                continue
        yield item
Project: directory-tests    Author: uktrade
def fas_browse_suppliers_using_every_sector_filter(
        context: Context, actor_alias: str):
    actor = context.get_actor(actor_alias)
    session = actor.session

    response = fas_ui_find_supplier.go_to(session, term="")
    context.response = response

    sector_filters_selector = "#id_sectors input::attr(value)"
    content = response.content.decode("utf-8")
    sector_filters = Selector(text=content).css(
        sector_filters_selector).extract()
    results = {}
    for sector in sector_filters:
        logging.debug(
            "%s will browse Suppliers by Industry sector filter '%s'",
            actor_alias, sector
        )
        response = fas_ui_find_supplier.go_to(session, sectors=[sector])
        results[sector] = {
            "url": response.request.url,
            "sectors": [sector],
            "response": response
        }
    context.results = results
Project: directory-tests    Author: uktrade
def fas_browse_suppliers_by_invalid_sectors(
        context: Context, actor_alias: str):
    actor = context.get_actor(actor_alias)
    session = actor.session

    response = fas_ui_find_supplier.go_to(session, term="")
    context.response = response

    sector_selector = "#id_sectors input::attr(value)"
    content = response.content.decode("utf-8")
    filters = Selector(text=content).css(sector_selector).extract()

    sectors = list(set(choice(filters)
                       for _ in range(randrange(1, len(filters)))))

    sectors.append("this_is_an_invalid_sector_filter")
    logging.debug(
        "%s will browse Suppliers by multiple Industry sector filters and will"
        " inject an invalid filter: '%s'",
        actor_alias, ", ".join(sectors)
    )
    context.response = fas_ui_find_supplier.go_to(session, sectors=sectors)
Project: directory-tests    Author: uktrade
def fas_should_see_filtered_search_results(context, actor_alias):
    results = context.results
    sector_filters_selector = "#id_sectors input"
    for industry, result in results.items():
        context.response = result["response"]
        content = result["response"].content.decode("utf-8")
        filters = Selector(text=content).css(sector_filters_selector).extract()
        for fil in filters:
            sector = Selector(text=fil).css("input::attr(value)").extract()[0]
            checked = True if Selector(text=fil).css("input::attr(checked)").extract() else False
            if sector in result["sectors"]:
                with assertion_msg(
                        "Expected search results to be filtered by '%s' sector"
                        " but this filter was not checked!"):
                    assert checked
            else:
                with assertion_msg(
                        "Expected search results to be filtered only by "
                        "following sectors '%s', but they are also filtered "
                        "by '%s'!", ", ".join(result['sectors']), sector):
                    assert not checked
        logging.debug(
            "%s was presented with '%s' industry search results correctly "
            "filtered by following sectors: '%s'", actor_alias, industry,
            ", ".join(result['sectors']))
Project: directory-tests    Author: uktrade
def fas_should_see_highlighted_search_term(context, actor_alias, search_term):
    response = context.response
    content = response.content.decode("utf-8")
    search_summaries_selector = ".ed-company-search-summary"
    summaries = Selector(text=content).css(search_summaries_selector).extract()
    tag = "em"
    keywords = [surround(keyword, tag) for keyword in search_term.split()]
    founds = []
    for summary in summaries:
        founds += [(keyword in summary) for keyword in keywords]

    with assertion_msg(
            "Expected to see at least 1 search result with highlighted search "
            "term: '%s'".format(", ".join(keywords))):
        assert any(founds)

    logging.debug(
        "{alias} found highlighted search {term}: '{keywords}' {founds} {times}"
        " in {results} search results".format(
            alias=actor_alias, term="terms" if len(keywords) > 1 else "term",
            keywords=", ".join(keywords), founds=len([f for f in founds if f]),
            times="times" if len([f for f in founds if f]) > 1 else "time",
            results=len(summaries)))
Project: scrapyweixi    Author: Felix-P-Code
def parse_url_list(self,response):
        sel = scrapy.Selector(response)
        wait_text = sel.xpath("//p[@id='loading']//text()").extract()
        if wait_text:
            # the page is still loading
            meta = response.meta
            meta['isscreen'] = 1
            # Scrapy filters duplicate URLs by default, so re-request this url with dont_filter=True
            yield scrapy.Request(response.url, meta=meta, callback=self.parse_validate,dont_filter=True)
        else:
            # normal html page: extract the article url list
            url_list = sel.xpath("//h4[@class='weui_media_title']/@hrefs").extract()
            for li in url_list:
                href = li.strip()
                url = 'http://mp.weixin.qq.com%s' % href
                #print(url)
                yield scrapy.Request(url, meta=self.meta, callback=self.parse_item)
Project: structure_spider    Author: ShichaoMa
def enrich_wrapper(func):
    """
    item_loader has to be pickled, so it cannot keep a reference to the response or a selector;
    before each enrich call a selector is rebuilt from the response and attached, then detached afterwards
    :param func:
    :return:
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        item_loader = args[1]
        response = args[2]
        selector = Selector(text=response.text)
        item_loader.selector = selector
        result = func(*args, **kwargs)
        item_loader.selector = None

        return result

    return wrapper
Project: IPProxyTool    Author: awolfly9
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tr[@class="cells"]').extract()
        for i, info in enumerate(infos):
            self.log(info)
            val = Selector(text = info)

            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[5]/text()').extract_first()
            anonymity = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: IPProxyTool    Author: awolfly9
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//ul[@class="l2"]').extract()
        for i, info in enumerate(infos):
            val = Selector(text = info)
            ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
            port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
            anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
            https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
            country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: IPProxyTool    Author: awolfly9
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[3]/div/text()').extract_first()
            anonymity = val.xpath('//td[6]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: quant    Author: yutiansut
def parse_url_list(self, response):
        sel = scrapy.Selector(response)
        print(sel)
        # first_url_list = sel.xpath('//title[1]//text()').extract()
        # print(first_url_list)

        article_xpath = ".//*[@id='news']/ul/li/div/a[1]/@href"
        article_url_list = sel.xpath(article_xpath).extract()

        for article_url in article_url_list:
            print(article_url)
            yield scrapy.Request(article_url,self.parse_article)


            #yield self.parse_article(url)

        #content = selenium_request(article_url_list)
        #print(content)
Project: WeiboWebSpider    Author: Apocally
def parse_info(self, response):
        selector = scrapy.Selector(response)
        item = WeiboWebInfoItem()
        info = selector.xpath("body/div[@class='u']/div[@class='tip2']")
        info_text = info.extract_first()
        try:
            item['ID'] = re.findall("uid=(.*?)\">", info_text)[0]
            item['TweetsNum'] = re.findall(u"微博\[(.*?)\]</span>", info_text)[0]
            item['FollowerNum'] = re.findall(u"关注\[(.*?)\]</span>", info_text)[0]
            item['FanNum'] = re.findall(u"粉丝\[(.*?)\]</span>", info_text)[0]
            tweet_url, follower_url = url_generator_for_id(item['ID'])
            item['URL'] = tweet_url
        except:
            pass
        basic_info_url = 'http://weibo.cn/%s/info' % item['ID']
        yield scrapy.Request(basic_info_url, meta={"item": item}, callback=self.parse_basic_info)
Project: findtrip    Author: fankcoder
def parse(self, response):
        sel = scrapy.Selector(response)
        dataList = sel.xpath("//div[@class='m-fly-item s-oneway']")
        items = []

        for index,each in enumerate(dataList):
            flight_each = "//div[@id='list-box']/div["+str(index+1)+"]"
            detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']"
            f_route_div = "//div[@class='m-fl-info-bd']/div"

            airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract()
            company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract()
            flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract()
            passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract()
            price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract()

            item = FindtripItem()
            item['site'] = 'Qua'
            item['company'] = company
            item['flight_time'] = flight_time
            item['airports'] = airports
            item['passtime'] = passtime
            item['price'] = price
            items.append(item)
        return items
Project: crawler    Author: Yabea
def parse(self,response):
        sel = Selector(response)
        keys = sel.xpath('//*[@class="menu_main job_hopping"]/h2/text()').extract()
        i = 1
        item = defaultdict(list)
        for key in keys:
            if key.strip() != '':
                print "test"
                print key.strip()
                try:
                    print i
                    item[key.strip()].append(sel.xpath('//*[@class="menu_box"][{}]/div[2]/dl/dd/a/text()'.format(i)).extract())
                    i = i + 1
                    # item["key"].append(key)
                except Exception, e:
                    print e
            else:
                continue
        yield item
Project: badoo_scrapy_splash_redis    Author: Supe2015
def parse_detail(self, response):
    res_dir = response.meta["RESDIR"]
    print 'res_dir:', res_dir
    rensel = scrapy.Selector(response)
    text = rensel.xpath('//script/text()').extract()
    tmp1 = re.findall(r'"url":\"(.*?)\"', str(text))
    if len(tmp1) > 0:
        uid_p_list = []
        for i in tmp1:
            uid_p_list.append(i.strip().replace('\\', ''))
        for i in uid_p_list[1:]:
            pid = i.split('/')[-3]
        print i
        r = Redis(host='192.168.5.24', port='6379')
        print r.llen(self.MCOUNTRY)
        r.lpush(self.MCOUNTRY,i)
Project: badoo_scrapy_splash_redis    Author: Supe2015
def parse_detail(self, response):
    res_dir = response.meta["RESDIR"]
    print 'res_dir:', res_dir
    rensel = scrapy.Selector(response)
    text = rensel.xpath('//script/text()').extract()
    tmp1 = re.findall(r'"url":\"(.*?)\"', str(text))
    if len(tmp1) > 0:
        uid_p_list = []
        for i in tmp1:
            uid_p_list.append(i.strip().replace('\\', ''))
        for i in uid_p_list[1:]:
            pid = i.split('/')[-3]
        print i
        r = Redis(host='192.168.5.24', port='6379')
        print r.llen(self.MCOUNTRY)
        r.lpush(self.MCOUNTRY,i)
Project: remotor    Author: jamiebull1
def parse_job(self, response):
        """Parse a joblink into a JobItem.
        """
        s = Selector(response)
        item = JobItem()
        item['url'] = response.url
        item['site'] = 'Remote.co'
        item['title'] = s.css('h1::text').extract_first()
        item['company'] = s.xpath(
            '//strong[@itemprop="name"]/text()').extract_first()
        job = s.css('.job-description')
        job.xpath('p[1]')
        item['text'] = s.xpath(
            '//div[@class="job_description"]//text()').extract()
        try:
            posted = s.xpath('//time//text()').extract_first()
            item['date_posted'] = utilities.naturaltime(
                posted.replace('Posted ', '')).isoformat()
        except Exception as e:
            self.logger.error(e)
        yield item
Project: remotor    Author: jamiebull1
def parse_job(self, response):
        """Parse a joblink into a JobItem.
        """
        s = Selector(response)
        item = JobItem()
        item['url'] = response.url
        item['site'] = 'RemoteWorking'
        item['title'] = s.css('h1::text').extract_first()
        item['text'] = s.xpath(
            '//div[@itemprop="description"]//text()').extract()

        try:
            posted = s.xpath('//li[@class="date-posted"]//text()').extract_first()
            item['date_posted'] = utilities.naturaltime(
                posted.replace('Posted ', '')).isoformat()
        except Exception as e:
            self.logger.error(e)
        yield item
Project: remotor    Author: jamiebull1
def parse_job(self, response):
        """Parse a joblink into a JobItem.
        """
        s = Selector(response)
        item = JobItem()
        item['url'] = response.url
        item['site'] = 'Jobspresso'
        item['title'] = s.xpath(
            '//h2[@class="page-title"]//text()').extract_first()
        item['text'] = s.xpath(
            '//div[@itemprop="description"]//text()').extract()
        try:
            posted = s.xpath('//date/text()').extract_first()
            item['date_posted'] = parse_time(posted).isoformat()
        except Exception as e:
            self.logger.error(e)
        yield item
Project: remotor    Author: jamiebull1
def parse(self, response):
        """Get the pagination links and hand them off.
        """
        s = Selector(response)
        pagination = s.css('.pagination')
        pagelinks = [response.url]
        pagelinks.extend(pagination.xpath(
            '//a[contains(@href, "l-remote/p-")]/@href').extract())
#        for pagelink in pagelinks:
        for pagelink in pagelinks[:1]:
            request = Request(
                urljoin(self.root, pagelink),
                callback=self.parse_jobspage,
                dont_filter=True,
                )
            yield request
Project: remotor    Author: jamiebull1
def parse_job(self, response):
        """Parse a joblink into a JobItem.
        """
        s = Selector(response)
        item = JobItem()
        item['url'] = response.url.split('?')[0]
        item['site'] = 'CareerBuilder'
        item['title'] = s.css('h1::text').extract_first()
        item['text'] = s.css('.job-facts::text').extract()
        item['text'].extend(s.css('.item').css('.tag::text').extract())
        item['text'].extend(s.css('.description::text').extract())
        try:
            posted = s.xpath(
                '//h3[@id="job-begin-date"]/text()').extract_first()
            item['date_posted'] = utilities.naturaltime(
                posted.replace('Posted ', '')).isoformat()
        except Exception as e:
            self.logger.error(e)
        yield item
Project: pydata_webscraping    Author: jmortega
def parse(self, response):
        hxs = scrapy.Selector(response)
        slots_tutorials = hxs.xpath('//td[@class="slot slot-tutorial"]')
        for slot in slots_tutorials:
            speakers_tutorials = slot.xpath('//span[@class="speaker"]/text()').extract()
            urls_tutorials = slot.xpath('//span[@class="title"]//@href').extract()
            talks_tutorials = slot.xpath('//span[@class="title"]//a/text()').extract()

        indexSpeaker=0
        for speaker in speakers_tutorials:
            yield Request(url=''.join(('http://www.pydata.org', urls_tutorials[indexSpeaker])),
                          callback=self.parse_details,
                          meta={'speaker': speaker.strip(), 'url': urls_tutorials[indexSpeaker], 
                          'talk': talks_tutorials[indexSpeaker]}
                          )       
            indexSpeaker=indexSpeaker+1
Project: rental    Author: meihuanyu
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[6]/text()').extract_first()
            anonymity = val.xpath('//td[3]/text()').extract_first()
            https = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Project: web_crawler    Author: NearXdu
def parse(self, response):
        def getdomain(url):
            proto, rest = urllib.splittype(url)
            host, rest = urllib.splithost(rest)
            return "http://"+host
        sel=scrapy.Selector(response)
        links_in_a_page=sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=XinhuaItem()
            link=str(link_sel.re('href="(.*?)"')[0])


            if link:
                if not link.startswith('http'):
                    link=response.url+link
                    #link=getdomain(response.url)+link



                yield scrapy.Request(link,callback=self.parse)

                p1=re.compile(r'.*\d{4}-\d{2}/\d{2}.*')
                if re.match(p1,link):
                    print ("Y: "+link)
                    item['link']=link
                    yield item
                else:
                    print ("F: "+link)
Project: web_crawler    Author: NearXdu
def parse(self, response):
        def getdomain(url):
            proto, rest = urllib.splittype(url)
            host, rest = urllib.splithost(rest)
            return "http://"+host

        sel=scrapy.Selector(response)
        links_in_a_page = sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=QqurlItem()
            link=str(link_sel.re('href="(.*?)"')[0])

            if link:
                if not link.startswith('http'):
                    if link.startswith('javascript'):
                        continue
                    if link.startswith('//support'):
                        continue
                    link=getdomain(response.url)+link


                if  re.match('.*comment.*',link):
                    continue


                yield scrapy.Request(link,callback=self.parse)
                if not re.match('.*comment.*',link):
                    if re.match('^http.*qq.com.*\.s?html?$',link):
                        item['link']=link
                        yield item
Project: web_crawler    Author: NearXdu
def parse(self, response):
        def getdomain(url):
            #proto,rest=urllib.splittype(url)
            #host,rest=urllib.splithost(rest)
            return "http:"

        sel =  scrapy.Selector(response)
        links_in_a_page=sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=SohuItem()
            link=str(link_sel.re('href="(.*?)"')[0])

            if link:
                if not link.startswith('http'):
                    link=getdomain(response.url)+link

                yield scrapy.Request(link,callback=self.parse)

                p1=re.compile(r'.*/a/.*')
                p2=re.compile(r'.*#comment_area$')
                p3=re.compile(r'.*news.sohu.com.*s?html?$')



                if (re.match(p3,link) or re.match(p1,link)) and (not re.match(p2,link)):
                    #print ('T: '+link)
                    item['link']=link
                    yield item
                else:
                    pass
                    #print ('F: '+link)
Project: scrapy-training    Author: scrapinghub
def alternative_parse_method(self, response):
        # An alternative would be to build a Scrapy selector from the JS string
        # and extract the data using CSS selectors
        script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
        sel = scrapy.Selector(root=js2xml.parse(script))
        for quote in sel.css('var[name="data"] > array > object'):
            yield {
                'text': quote.css('property[name="text"] > string::text').extract_first(),
                'author': quote.css('property[name="author"] property[name="name"] > string::text').extract_first(),
                'tags': quote.css('property[name="tags"] string::text').extract(),
            }

        link_next = response.css('li.next a::attr("href")').extract_first()
        if link_next:
            yield scrapy.Request(response.urljoin(link_next))
Project: scrapy-training    Author: scrapinghub
def parse(self, response):
        self.driver.get(response.url)
        sel = scrapy.Selector(text=self.driver.page_source)
        for quote in sel.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = sel.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))
Project: Scrapy_CrawlMeiziTu    Author: williamzxl
def parse(self, response):
        selector = scrapy.Selector(response)
        #item = CrawlmeizituItemPage()

        next_pages = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        next_pages_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        all_urls = []
        if '下一页' in next_pages_text:
            next_url = "http://www.meizitu.com/a/{}".format(next_pages[-2])
            with open('..//url.txt', 'a+') as fp:
                fp.write('\n')
                fp.write(next_url)
                fp.write("\n")
            request = scrapy.http.Request(next_url, callback=self.parse)
            time.sleep(2)
            yield request

        all_info = selector.xpath('//h3[@class="tit"]/a')
        # collect the link of every picture detail page
        for info in all_info:
            links = info.xpath('//h3[@class="tit"]/a/@href').extract()
        for link in links:
            request = scrapy.http.Request(link, callback=self.parse_item)
            time.sleep(1)
            yield request

        # next_link = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        # next_link_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        # if '下一页' in next_link_text:
        #     nextPage = "http://www.meizitu.com/a/{}".format(next_link[-2])
        #     item['page_url'] = nextPage
        #     yield item

Project: crawllagou    Author: ScarecrowFu
def parse_detail(self,response):
        item = CrawldetailsItem()
        sel = Selector(response)

        try:
            item["kd"] = response.meta['kd']
            item["title"] = self.get_text(sel,'//*[@id="job_detail"]/dt/h1/@title')
            item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip()
            item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0]
            item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0]
            industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0]
            item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip()
            scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0]
            item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip()
            phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0]
            item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip()
            item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0]
            item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0]
            item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0]
            item["description"] = self.get_text(sel,'//*[@id="job_detail"]/dd[2]')
            item["url"] = response.url
            item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8]
            item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')


        except Exception, e:
            print e
        yield item
Project: directory-tests    Author: uktrade
def get_case_studies_details(response: Response):
    content = response.content.decode("utf-8")
    article_selector = "#company-projects > article"
    articles = Selector(text=content).css(article_selector).extract()
    result = []
    for article in articles:
        title = Selector(text=article).css("h3::text").extract()[0]
        summary = Selector(text=article).css("p::text").extract()[0]
        href = Selector(text=article).css("a::attr(href)").extract()[0]
        slug = href.split("/")[-2]
        assert slug, "Could not extract case study slug from {}".format(article)
        logging.debug("Got case study slug: %s", slug)
        result.append((title, summary, href, slug))
    assert result, "No Case Study details extracted from {}".format(articles)
    return result
Project: directory-tests    Author: uktrade
def fas_get_company_profile_url(response: Response, name: str) -> str:
    content = response.content.decode("utf-8")
    links_to_profiles_selector = "#ed-search-list-container a"
    href_selector = "a::attr(href)"
    links_to_profiles = Selector(text=content).css(
        links_to_profiles_selector).extract()
    profile_url = None
    for link in links_to_profiles:
        if escape_html(name).lower() in escape_html(link).lower():
            profile_url = Selector(text=link).css(href_selector).extract()[0]
    with assertion_msg(
            "Couldn't find link to '%s' company profile page in the response",
            name):
        assert profile_url
    return profile_url
Project: directory-tests    Author: uktrade
def fas_follow_case_study_links_to_related_sectors(context, actor_alias):
    actor = context.get_actor(actor_alias)
    session = actor.session
    content = context.response.content.decode("utf-8")
    links_css_selector = "#company-showcase .case-study-info a"
    links_to_sectors = Selector(text=content).css(links_css_selector).extract()
    with assertion_msg("Expected to find at least 1 link to Industry sector"
                       "associated with Company Showcase Case Study"):
        assert links_css_selector
    results = {}
    fas_url = get_absolute_url("ui-supplier:landing")
    for link in links_to_sectors:
        industry = Selector(text=link).css("a::text").extract()[0]
        href = Selector(text=link).css("a::attr(href)").extract()[0]
        url = urljoin(fas_url, href)
        sectors = [value for _, value in parse_qsl(urlsplit(href).query)]
        logging.debug(
            "%s will look for Suppliers in '%s' Industry sectors '%s'",
            actor_alias, industry, ", ".join(sectors)
        )
        response = make_request(Method.GET, url=url, session=session)
        results[industry] = {
            "url": url,
            "sectors": sectors,
            "response": response
        }
    context.results = results
Project: directory-tests    Author: uktrade
def fas_should_see_unfiltered_search_results(context, actor_alias):
    response = context.response
    content = response.content.decode("utf-8")
    sector_filters_selector = "#id_sectors input"
    filters = Selector(text=content).css(sector_filters_selector).extract()
    for fil in filters:
        sector = Selector(text=fil).css("input::attr(value)").extract()[0]
        selector = "input::attr(checked)"
        checked = True if Selector(text=fil).css(selector).extract() else False
        with assertion_msg(
                "Expected search results to be unfiltered but this "
                "filter was checked: '%s'", sector):
            assert not checked
    logging.debug("%s was shown with unfiltered search results", actor_alias)
Project: web-crawler-spider-    Author: Hardysong
def parse_location(self,response):

        loc_hxs = scrapy.Selector(response)
        loc_xs = loc_hxs.xpath('//div[@id="aside"]/script[1]').extract()[0]
        coord_text = re.findall(r'lng:\w+.\w+,lat:\w+.\w+',loc_xs)[0]

        item = response.meta['item']
        item['location'] = coord_text.encode('gbk')
        return item
        #print  coord_text
Project: web-crawler-spider-    Author: Hardysong
def parse(self,response):
        reload(sys)
        sys.setdefaultencoding('utf8')

        print '__________'
        if response.status == 403:
            print 'meet 403, sleep 1200 seconds'
            import time
            time.sleep(1200)
            yield Request(response.url,callback=self.parse)
        # 404: the page does not exist, just return
        elif response.status == 404:
            print 'meet 404,return'
        else:

            hxs = scrapy.Selector(response)

            for i in range(1,31):
                item = SoufangItem()


                name_ = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/text()').extract()
                name = ''.join(name_)

                http = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/@href').extract()
                href = ''.join(http)
                #href = href + 'xiangqing/'

                item['name'] = name.encode('gbk')

                item['link'] = href.encode('gbk')

                yield Request(href,callback=self.parse_detail,meta={'item':item})

                print name, href
            print '__________'
Project: web-crawler-spider-    Author: Hardysong
def parse_detail(self,response):
        #print 'in'

        loc_hxs = scrapy.Selector(response)
        loudongzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[5]/span[2]/text()').extract()
        loudongzongshu = ''.join(loudongzongshu)

        fangwuzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[6]/span[2]/text()').extract()
        fangwuzongshu = ''.join(fangwuzongshu)

        item = response.meta['item']
        item['address'] = loudongzongshu.encode('gbk')
        item['zonghushu'] = fangwuzongshu.encode('gbk')

        return item
Project: web-crawler-spider-    Author: Hardysong
def parse_detail(self,response):

        loc_hxs = scrapy.Selector(response)
        build_num_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[2]/text()').extract()
        build_num = ''.join(build_num_)

        total_households_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[4]/text()').extract()
        total_households = ''.join(total_households_)

        plot_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[6]/text()').extract()
        plot_ratio = ''.join(plot_ratio_)

        green_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[8]/text()').extract()
        green_ratio = ''.join(green_ratio_)

        property_fee_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[10]/text()').extract()
        property_fee = ''.join(property_fee_)

        item = response.meta['item']
        item['build_num'] = build_num.encode('gbk')
        item['total_households'] = total_households.encode('gbk')
        item['plot_ratio'] = plot_ratio.encode('gbk')
        item['greening_ratio'] = green_ratio.encode('gbk')
        item['properity_fee'] = property_fee.encode('gbk')

        return item
Project: decoration-design-crawler    Author: imflyn
def test_parse_content(self):
        content = requests.get('http://xiaoguotu.to8to.com/topic/11.html')
        response = Response('http://xiaoguotu.to8to.com/topic/11.html')
        response.text = content.content.decode("utf-8")
        selector = Selector(response)
        title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
        description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
        items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
        article = []
        text = ''
        for index, item_selector in enumerate(items_selector):
            try:
                text = item_selector.xpath('span/text()').extract()[0]
            except IndexError:
                try:
                    img_url = item_selector.xpath('img/@src').extract()[0]
                    img_width = 0
                    try:
                        img_width = item_selector.xpath('img/@width').extract()[0]
                    except IndexError:
                        pass
                    img_height = 0
                    try:
                        img_height = item_selector.xpath('img/@height').extract()[0]
                    except IndexError:
                        pass
                    article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
                except IndexError:
                    continue
        design_topic_item = DesignTopicItem()
        design_topic_item['title'] = title
        design_topic_item['description'] = description
        design_topic_item['article'] = article
        design_topic_item['html_url'] = response.url
        return design_topic_item
Project: scrapyweixi    Author: Felix-P-Code
def parse(self, response):
        sel = scrapy.Selector(response)
        #print(sel.xpath('//title').extract())
        fligint_div = "//ul[@class='news-list2']/li[1]/div[@class='gzh-box2']/div[@class='img-box']/a[1]/@href"
        first_url_list = sel.xpath(fligint_div).extract()
        self.first_url = first_url_list[0]
        print(self.first_url)
        yield  scrapy.Request(self.first_url,meta=self.meta, callback=self.parse_url_list)