The following 31 code examples, extracted from open-source Python projects, demonstrate how to use scrapy.selector.Selector().
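Before diving in, note that the examples construct a Selector in two ways: from a downloaded Response (`Selector(response)`) or from a raw HTML string (`Selector(text=...)`). Queries are issued with `xpath()`, and results are unwrapped with `extract()` (all matches) or `extract_first()` (first match or None). A minimal sketch, using an invented HTML fragment purely for illustration:

    from scrapy.selector import Selector

    # Hypothetical markup, just to exercise the API.
    html = '<ul><li class="song"><a href="/song?id=42">Example</a></li></ul>'
    selector = Selector(text=html)

    # xpath() returns a SelectorList; extract_first() yields the first match.
    name = selector.xpath('//li[@class="song"]/a/text()').extract_first()
    href = selector.xpath('//li[@class="song"]/a/@href').extract_first()
    print(name, href)  # Example /song?id=42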
def parse_salaries(self, response):
    """
    The values for a person's salary live in a separate table on another
    page; this function grabs the table headers and values and assigns
    them to the item. The entity id was passed in response.meta.
    """
    item = VereadorItem()
    item['name'] = response.meta['name']
    item['id'] = response.meta['entity_id']
    item['mesano'] = response.meta['mesano']
    for salary in response.xpath('//*[@id="holerite"]').extract():
        selector = Selector(text=salary)
        table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
        item["salary_gross"] = table[0]
        item["salary_liquid"] = selector.xpath(
            '//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
    return item
def parse(self, response):
    selector = Selector(response)
    articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
    for article in articles:
        item = Jianshu2Item()
        url = article.xpath('div/h4/a/@href').extract()
        likeNum = article.xpath('div/div/span[2]/text()').extract()
        posturl = 'http://www.jianshu.com' + url[0]
        if len(likeNum) == 0:
            item['likeNum'] = 0
        else:
            item['likeNum'] = int(likeNum[0].split(' ')[-1])
        request = Request(posturl, callback=self.parse_donate)
        request.meta['item'] = item
        yield request
    next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
    if next_link:
        next_link = self.url + str(next_link)
        yield Request(next_link, callback=self.parse)
def list_parse(self, response):
    selector = Selector(text=response.body)
    titles = selector.xpath("//li//a[@class='msk']/@title")  # playlist entries
    urls = selector.xpath("//a[@class='zpgi']/@href").extract()
    start_url = "http://music.163.com"
    for tmp_url in urls:
        yield scrapy.Request(url=start_url + tmp_url, method="GET",
                             callback=self.list_parse,
                             meta={"cat": response.meta['cat']})
    i = 1
    for tmp in titles:
        # Look up each playlist id by its position in the list.
        list_id = selector.xpath("//li[" + str(i) + "]//a[@class='icon-play f-fr']/@data-res-id").extract_first()
        i = i + 1
        yield scrapy.Request(url=start_url + "/playlist?id=" + list_id,
                             method="GET",
                             callback=self.play_list_parse,
                             meta={"cat": response.meta['cat'], "id": list_id})
def parse(self, response):
    selector = Selector(response)
    ID = response.meta["ID"]
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    info = InfoItem()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)   # number of weibo posts
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)     # number of fans
        if num_tweets:
            info["num_tweets"] = int(num_tweets[0])
        if num_follows:
            info["num_follows"] = int(num_follows[0])
        if num_fans:
            info["num_fans"] = int(num_fans[0])
    url_information1 = "http://weibo.cn/%s/info" % ID
    yield Request(url=url_information1, meta={"item": info, "ID": ID},
                  dont_filter=True, callback=self.parse1)
def parse3_fans(self, response):
    """
    Collect the user IDs of this account's fans.
    """
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            ID = int(elem[0])
            if ID not in self.friends_id:  # record only IDs we have not seen yet
                self.friends_id.add(ID)
    # Follow the "next page" link (\u4e0b\u9875) if one exists.
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_},
                      callback=self.parse3_fans)
    else:
        self.fans_finish = True
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_},
                          dont_filter=True, callback=self.parse)
def parse3_follows(self, response):
    """
    Collect the user IDs of the accounts this user follows.
    """
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            ID = int(elem[0])
            if ID not in self.friends_id:  # record only IDs we have not seen yet
                self.friends_id.add(ID)
    # Follow the "next page" link (\u4e0b\u9875) if one exists.
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_},
                      callback=self.parse3_follows)
    else:
        self.follows_finish = True
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_},
                          dont_filter=True, callback=self.parse)
def parse_single_song(self, response):
    loader = response.meta['loader']
    selector = Selector(response)
    singer = selector.xpath('//title/text()').extract()
    loader.add_value('singer', singer)
    loader.add_value('_id', response.meta['song_id'])
    comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
    source_data, source_url = api_song_url(response.meta['song_id'])
    comment_id = generate_comment_index()['comment_index']
    loader.add_value('comment_id', comment_id)
    yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
                             formdata=comment_data, callback=self.parse_comments,
                             meta={'comment_id': comment_id})
    yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
                             formdata=source_data, meta={'loader': loader},
                             callback=self.get_source_url)
def parse_entities(self, response):
    """
    A table is displayed with data about each person who works at the Câmara.
    """
    mesano = response.meta['mesano']
    self.log('Getting mesano: ' + mesano)
    # Check if the table is empty
    if not response.css('table tr td:nth-child(1)').extract_first():
        return self.log('Nenhum dado disponível')
    for tr in response.xpath('//table/tr').extract():
        selector = Selector(text=tr)
        entity_id = re.search("(javascript:pesquisa\()(\d*)(\);)", tr).group(2)
        request = scrapy.FormRequest(
            url=BASE_URL + 'holerite/consulta_beneficiario.html',
            formdata={
                'hol_ben_id': entity_id,
                'hol_mesano': mesano,
                'hol_tipo': '1',
                'hol_grupo': GRUPO,
                'acao': ''
            },
            callback=self.parse_salaries
        )
        request.meta['name'] = selector.xpath("//tr/td/text()").extract_first()
        request.meta['entity_id'] = entity_id
        request.meta['mesano'] = mesano
        yield request
def parse_item(self, response):
    selector = Selector(response).xpath('//p[@align="center"]')
    for sel in selector:
        image_urls = sel.xpath('a/img/@src').extract()
        path = []
        for img in image_urls:
            path.append(urlparse.urlparse(img).path)
        item = SisyItem()
        item['image_urls'] = image_urls
        item['images'] = path
        yield item  # emit one item per matched paragraph
def parse(self, response):
    selector = Selector(response)
    books = selector.xpath('//div[@class="info"]/h2/a/@href').extract()
    for book in books:
        print(book)
        yield Request(book, callback=self.parse_item)
    nextPage = selector.xpath('//span[@class="next"]/a/@href').extract()
    if nextPage:
        print(nextPage[0])
        yield Request(self.url + nextPage[0], callback=self.parse)
def parse(self, response):
    # print response.body
    value = shenZhouCarsItem()
    item = fieldsItem()
    selector = Selector(response)
    cars = selector.xpath('//ul[@class="carInfor-xj clearfix"]')
    # Basic and specific spec tables alternate, so step through in pairs.
    for index in range(0, len(cars), 2):
        basic = cars[index]
        specific = cars[index + 1]
        item['car_brand'] = basic.xpath('li[1]/span[1]/text()').re(r'\s+(.*)')[0]
        item['car_series'] = basic.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
        item['car_issue_date'] = basic.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
        item['car_config_model'] = basic.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
        item['car_seats_num'] = specific.xpath('li[1]/span/text()').re(r'\s+(.*)')[0]
        item['car_doors'] = specific.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
        item['car_fuel_type'] = specific.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
        item['car_gearbox_type'] = specific.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
        item['car_displacement'] = specific.xpath('li[5]/span/text()').extract()[0]
        item['car_fuel_num'] = specific.xpath('li[6]/span/text()').re(r'\s+(.*)')[0]
        item['car_drive_way'] = specific.xpath('li[7]/span/text()').re(r'\s+(.*)')[0]
        item['car_engine_intake'] = specific.xpath('li[8]/span/text()').re(r'\s+(.*)')[0]
        item['car_skylight'] = specific.xpath('li[9]/span/text()').re(r'\s+(.*)')[0]
        item['car_tank_capa'] = specific.xpath('li[10]/span/text()').re(r'\s+(.*)')[0]
        item['car_voicebox'] = specific.xpath('li[11]/span/text()').re(r'^\s+(\w*)')[0]
        item['car_seats_type'] = specific.xpath('li[12]/span/text()').re(r'\s+(.*)')[0]
        item['car_reverse_radar'] = specific.xpath('li[13]/span/text()').re(r'\s+(.*)')[0]
        item['car_airbag'] = specific.xpath('li[14]/span/text()').re(r'\s+(\w*)')[0]
        item['car_dvd'] = specific.xpath('li[15]/span/text()').re(r'\s+(.*)')[0]
        item['car_gps'] = specific.xpath('li[16]/span/text()').re(r'\s+(.*)')[0]
        if item['car_airbag'] == u'6510':
            item['car_airbag'] = "0"
        value['model'] = 'RentMe.model_info'
        value['pk'] = item['car_brand'] + item['car_series'] + item['car_issue_date'] + item['car_config_model']
        value['fields'] = {
            'car_brand': item['car_brand'],
            'car_series': item['car_series'],
            'car_issue_date': item['car_issue_date'],
            'car_config_model': item['car_config_model'],
            'car_seats_num': item['car_seats_num'],
            'car_doors': item['car_doors'],
            'car_fuel_type': item['car_fuel_type'],
            'car_gearbox_type': item['car_gearbox_type'],
            'car_displacement': item['car_displacement'],
            'car_fuel_num': item['car_fuel_num'],
            'car_drive_way': item['car_drive_way'],
            'car_engine_intake': item['car_engine_intake'],
            'car_skylight': item['car_skylight'],
            'car_tank_capa': item['car_tank_capa'],
            'car_voicebox': item['car_voicebox'],
            'car_seats_type': item['car_seats_type'],
            'car_reverse_radar': item['car_reverse_radar'],
            'car_airbag': item['car_airbag'],
            'car_dvd': item['car_dvd'],
            'car_gps': item['car_gps'],
            'car_deposit': 5000,
            'car_day_price': 100,
            'car_time_out_price': 150,
            'car_over_kilo_price': 0.5,
        }
        yield value
def parse_list(self, response): selector = Selector(response) items_selector = selector.xpath('//div[@class="xmp_container"]//div[@class="item"]') for item_selector in items_selector: # http://xiaoguotu.to8to.com/c10037052.html cid = item_selector.xpath('div//a/@href').extract()[0][2:-6] title = item_selector.xpath('div//a/@title').extract()[0] # http://xiaoguotu.to8to.com/getxgtjson.php?a2=0&a12=&a11=10037052&a1=0 next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/getxgtjson.php?a2=0&a12=&a11={cid}&a1=0').format(cid=cid) yield scrapy.Request(next_url, self.parse_content, meta={'cid': cid, 'title': title})
def parse_list(self, response): selector = Selector(response) items_selector = selector.xpath('//div[@class="xgt_topic"]') for item_selector in items_selector: # /topic/7334.html href = item_selector.xpath('div//a/@href').extract()[0] href = href.strip() # http://xiaoguotu.to8to.com/topic/7334.html next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + href) if self.design_topic_service.is_duplicate_url(next_url): continue yield scrapy.Request(next_url, self.parse_content)
def parse_content(self, response):
    selector = Selector(response)
    title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
    description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
    items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
    article = []
    text = ''
    for index, item_selector in enumerate(items_selector):
        try:
            text = item_selector.xpath('span/text()').extract()[0]
        except IndexError:
            try:
                img_url = item_selector.xpath('img/@src').extract()[0]
                img_width = 0
                try:
                    img_width = item_selector.xpath('img/@width').extract()[0]
                except IndexError:
                    pass
                img_height = 0
                try:
                    img_height = item_selector.xpath('img/@height').extract()[0]
                except IndexError:
                    pass
                article.append({'content': text, 'img_url': img_url,
                                'img_width': img_width, 'img_height': img_height})
            except IndexError:
                continue
    design_topic_item = DesignTopicItem()
    design_topic_item['title'] = title
    design_topic_item['description'] = description
    design_topic_item['article'] = article
    design_topic_item['html_url'] = response.url
    return design_topic_item
def parse_list(self, response): selector = Selector(response) items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]') for item_selector in items_selector: id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '') # http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498 next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format( id=id) if self.design_strategy_service.is_duplicate_url(next_url): log.info("=================???" + next_url + "===========") continue yield scrapy.Request(next_url, self.parse_content, meta={'id': id})
def parse(self, response):
    # print(response, type(response))
    # from scrapy.http.response.html import HtmlResponse
    item = TopStockItem()
    selector = Selector(response)
    stocks = selector.xpath('//td[@class="keyword"]/a[@class="list-title"]')
    for index, stock in enumerate(stocks):
        item['name'] = stock.xpath('text()').extract()[0]
        item['num'] = index + 1
        item['source'] = "baidu"
        yield item
def parse_detail(self, response):
    url = urlparse.urlparse(response.url)
    path = url.path.split("/")
    item = PostItem()
    selector = Selector(response)
    item['postId'] = path[2]
    item['authorId'] = path[1]
    item['postDetail'] = selector.xpath('//div[@class="detail"]').extract()[0]
    yield item
def play_list_parse(self, response):
    start_url = "http://music.163.com"
    item = playListItem()
    selector = Selector(text=response.body)
    item['list_play'] = int(selector.xpath("//strong[@id='play-count']/text()").extract_first())
    item['list_collection'] = int(selector.xpath("//a[@class='u-btni u-btni-fav ']/@data-count").extract_first())
    # item['list_comment'] = int(selector.xpath("//span[@id='cnt_comment_count']/text()").extract_first())
    item['list_name'] = selector.xpath("//h2[@class='f-ff2 f-brk']/text()").extract_first()
    item['list_id'] = response.meta['id']
    item['list_tag'] = selector.xpath("//a[@class='u-tag']/i/text()").extract()
    item['list_creator'] = selector.xpath("//span[@class='name']/a/text()").extract_first()
    item['list_creator_id'] = selector.xpath("//span[@class='name']/a/@href").extract_first()
    item['type'] = response.meta['cat']
    # urls = selector.xpath("//ul[@class='f-hide']/li/a/@href").extract()
    # for url in urls:
    #     yield scrapy.Request(url=start_url + url, method="GET", callback=self.detail_parse)
    yield item

# def detail_parse(self, response):
#     selector = Selector(text=response.body)
#     id = selector.xpath("//div[@id='content-operation']/@data-rid").extract_first()
#     detail = validate.Validate(str(id))
#     info = demjson.decode(detail.get_music_json())
#     if info['total'] > 10000:
#         item = detailItem()
#         item['music_id'] = id
#         item['music_name'] = selector.xpath("//em[@class='f-ff2']/text()").extract_first()
#         item['music_album'] = selector.xpath("//p[@class='des s-fc4']/a/text()").extract_first()
#         item['music_artist'] = selector.xpath("//p[@class='des s-fc4']/span/@title").extract_first()
#         item['music_comment_num'] = int(info['total'])
#         item['music_comment'] = info['hotComments']
#         yield item
def parse(self, response):
    item = DoubanmovieItem()
    selector = Selector(response)
    movies = selector.xpath('//div[@class="info"]')
    for eachmovie in movies:
        title = eachmovie.xpath('div[@class="hd"]/a/span/text()').extract()
        fullTitle = ''
        for each in title:  # concatenate the title spans into one string
            fullTitle += each
        movieInfo = eachmovie.xpath('div[@class="bd"]/p/text()').extract()
        star = eachmovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
        quote = eachmovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        if quote:
            quote = quote[0]
        else:
            quote = ''
        item['title'] = fullTitle
        item['movieInfo'] = ';'.join(movieInfo)
        item['star'] = star
        item['quote'] = quote
        yield item
    nextlink = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextlink:
        nextlink = nextlink[0]
        print(nextlink)
        # Follow the "next page" link and parse it with this same callback.
        yield Request(self.url + nextlink, callback=self.parse)
def parse(self, response):
    item = ZhihupythonItem()
    # selector = Selector(response)
    question_Field = response.xpath('//div[@class="feed-main"]')
    for each in question_Field:
        question = each.xpath('div[@class="content"]/h2/a/text()').extract()
        print(question)
        item['Question'] = question
        yield item
def parse3(self, response):
    """
    Collect user IDs from a fan/follow list page.
    """
    selector = Selector(response)
    text2 = selector.xpath('body//table/tr/td/a/@href').extract()
    for elem in text2:
        elem = re.findall('uid=(\d+)', elem)
        if elem:
            ID = int(elem[0])
            if ID not in self.finish_ID:  # queue only IDs that have not been crawled yet
                self.scrawl_ID.append(ID)
    # Follow the "next page" link (\u4e0b\u9875) if one exists.
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:
        yield Request(url="http://weibo.cn%s" % url_next[0], callback=self.parse3)
def parse(self, response):
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    info = InfoItem()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)   # number of weibo posts
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)     # number of fans
        if num_tweets:
            info["num_tweets"] = int(num_tweets[0])
        if num_follows:
            info["num_follows"] = int(num_follows[0])
        if num_fans:
            info["num_fans"] = int(num_fans[0])
    url_information1 = "http://weibo.cn/%s/info" % self.next_ID[-1]
    yield Request(url=url_information1, meta={"item": info, "ID": self.next_ID[-1]},
                  dont_filter=True, callback=self.parse1)
    # Randomly decide whether to crawl the fan list or the follow list first.
    if random.random() > float(info["num_follows"]) / (info["num_follows"] + info["num_fans"]):
        try:
            url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
            yield Request(url=url_fans, dont_filter=True, callback=self.parse3)     # crawl fans
        except:
            url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
            yield Request(url=url_follows, dont_filter=True, callback=self.parse3)  # crawl follows
    else:
        try:
            url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
            yield Request(url=url_follows, dont_filter=True, callback=self.parse3)  # crawl follows
        except:
            url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
            yield Request(url=url_fans, dont_filter=True, callback=self.parse3)     # crawl fans
def parse4(self, response):
    """
    Compute the user's degree: follow count plus fan count.
    """
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)     # number of fans
        if num_follows and num_fans:
            self.degree_v = int(num_fans[0]) + int(num_follows[0])
        else:
            self.degree_v = False
def get_list_id(self, response):
    selector = Selector(response)
    # Collect the list-page links from the page (all but the last one).
    url_list = selector.xpath('//body//a[@class="s-fc0"]/@href')[:-1].extract()
    type_ = 0
    for url in url_list:
        type_ += 1
        yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url),
                                 method='GET',
                                 callback=self.parse_song_list,
                                 headers=self.headers,
                                 meta={'type': type_})
def parse_song_list(self, response): selector = Selector(response) song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract() song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract() for index, id_ in enumerate(song_id_list): l = ItemLoader(item=SongListItem()) l.add_value('song_name', song_name_list[index]) l.add_value('type', response.meta['type']) yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET', headers=self.headers, callback=self.parse_single_song)
def get_list_id(self, response):
    selector = Selector(response)
    # Collect the list-page links from the page.
    url_list = selector.xpath('//body//p[@class="dec"]/a/@href').extract()
    for url in url_list:
        yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url),
                                 method='GET',
                                 callback=self.parse_song_list,
                                 headers=self.headers)
def parse_song_list(self, response): selector = Selector(response) song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract() song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract() title = selector.xpath('//title/text()').extract() for index, id_ in enumerate(song_id_list): l = ItemLoader(item=PlayListItem()) l.add_value('song_name', song_name_list[index]) l.add_value('title', title) yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET', headers=self.headers, callback=self.parse_single_song)
def parse_items(self, response):
    print("------------")
    print(response.url)
    print("----------")
    from scrapy.selector import Selector
    import json
    category = response.meta['category']['category']
    sub_category = response.meta['category']['sub_category']
    response_json = json.loads(response.body)
    # The product grid arrives as an HTML fragment inside a JSON payload,
    # so build a Selector from the raw text.
    required_text = response_json["result"]["html"]
    response = Selector(text=required_text)
    all_items = response.xpath('//div[contains(@class, "grid_item")]')
    for each_item in all_items:
        name = each_item.xpath('.//div[@class="title"]/a/text()').extract_first()
        price = each_item.xpath('.//span[@class="price"]/text()').extract_first()
        image_urls = [each_item.xpath(".//img/@src").extract_first()]
        affiliate_link = each_item.xpath(".//a/@href").extract_first()
        website = "polyvore.com"
        brand = [i for i in ALL_BRANDS if i.lower() in name.lower()]
        if brand:
            brand = brand[0]
            print("brand", brand)
        else:
            print(name, brand, "exited")
            continue
        item = ProductItem(
            name=name.strip(),
            price=price.strip(),
            image_urls=image_urls,
            brand=brand.strip(),
            affiliate_link=affiliate_link,
            category=category,
            sub_category=sub_category,
            website=website
        )
        yield item
    if response_json["result"]["more_pages"] == "1":
        next_page = int(response_json["result"]["page"]) + 1
    else:
        return
    next_link = url_to_use.format(str(next_page), urllib.quote(sub_category))
    my_request = scrapy.Request(next_link, self.parse_items)
    my_request.meta['category'] = {
        "sub_category": sub_category,
        "category": category,
    }
    yield my_request
def parse1(self, response): selector = Selector(response) infoItem = response.meta["item"] ID = response.meta["ID"] text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract()) # ????????text() nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1) # ?? gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1) # ?? place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1) # ??????????? signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1) # ???? birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1) # ?? sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1) # ??? marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1) # ???? url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1) # ???? if nickname: infoItem['nickname'] = nickname[0] if gender: infoItem['gender'] = gender[0] if place: place = place[0].split(" ") infoItem["province"] = place[0] if len(place) > 1: infoItem["city"] = place[1] if signature: infoItem["signature"] = signature[0] if birthday: try: birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d") infoItem["birthday"] = birthday - datetime.timedelta(hours=8) except Exception: pass if sexorientation: if sexorientation[0] == gender[0]: infoItem["sexorientation"] = "gay" else: infoItem["sexorientation"] = "Heterosexual" if marriage: infoItem["marriage"] = marriage[0] if url: infoItem["url"] = url[0] infoItem["user_id"] = ID yield infoItem ############??######### if len(self.scrawl_ID) > 0: ID = self.scrawl_ID.popleft() self.finish_ID.add(ID) url_main = "http://weibo.cn/u/%s" % ID url_fans = "http://weibo.cn/%s/fans" % ID url_follows = "http://weibo.cn/%s/follow" % ID # ??????????? if len(self.scrawl_ID) < 4: yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # ???? yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # ????? yield Request(url=url_main, meta={"ID":ID}, dont_filter=True, callback=self.parse)
def parse1(self, response): selector = Selector(response) infoItem = response.meta["item"] ID = response.meta["ID"] text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract()) # ????????text() nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1) # ?? gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1) # ?? place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1) # ??????????? signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1) # ???? birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1) # ?? sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1) # ??? marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1) # ???? url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1) # ???? if nickname: infoItem['nickname'] = nickname[0] if gender: infoItem['gender'] = gender[0] if place: place = place[0].split(" ") infoItem["province"] = place[0] if len(place) > 1: infoItem["city"] = place[1] if signature: infoItem["signature"] = signature[0] if birthday: try: birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d") infoItem["birthday"] = birthday - datetime.timedelta(hours=8) except Exception: pass if sexorientation: if sexorientation[0] == gender[0]: infoItem["sexorientation"] = "gay" else: infoItem["sexorientation"] = "Heterosexual" if marriage: infoItem["marriage"] = marriage[0] if url: infoItem["url"] = url[0] infoItem["user_id"] = ID yield infoItem