Python bs4 module: SoupStrainer() example source code

We have extracted the following 37 code examples from open-source Python projects to illustrate how to use bs4.SoupStrainer().

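Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: build a SoupStrainer that matches only the parts of the document you care about, then pass it to BeautifulSoup via parse_only so that nothing else is parsed at all. The HTML snippet and the tag/class choices below are purely illustrative and are not taken from any of the projects.

from bs4 import BeautifulSoup, SoupStrainer

html = """
<html><body>
  <a href="https://example.com/a">First</a>
  <p>Some text we do not need.</p>
  <a href="https://example.com/b" class="external">Second</a>
</body></html>
"""

# Parse only <a> tags; everything else is skipped during parsing.
only_links = SoupStrainer('a')
soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)
print([a['href'] for a in soup.find_all('a')])
# ['https://example.com/a', 'https://example.com/b']

# A strainer can also filter on attributes, e.g. by CSS class.
external_only = SoupStrainer('a', class_='external')
soup = BeautifulSoup(html, 'html.parser', parse_only=external_only)
print([a.get_text() for a in soup.find_all('a')])
# ['Second']
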
Project: scientific-paper-summarisation    Author: EdCo95
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
Project: maoyan    Author: Eastwu5788
def __init__(self, data, encoding=None):
        """
         Initialize serializer class
         :param data: ori data
         :param encoding: encoding type of your ori data
         """
        self.data = data

        if not self.data:
            raise ValueError("You must input origin data to this class")

        # if no encoding is supplied, use UnicodeDammit (chardet) to detect it
        self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
        self.encoding = None if self.encoding == "utf-8" else self.encoding

        # initialize beautiful soup
        # only_content_div = SoupStrainer("body")
        self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
Project: taemin    Author: ningirsu
def get_title(html):
        """
            Get the title element from an HTML document

            :param str html: The html to parse

            :Example:

            >>> Link.get_title("xxxx<title>Title</title>xxxx")
            'Title'

            >>> print(Link.get_title("xxxx<>Title</title>xxxx"))
            None
        """
        bs = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('title'))

        title = bs.find("title")
        if not title:
            return None

        if not title.string:
            return None

        return title.string.strip().replace('\n', ' ')
Project: web_page_classification    Author: yuhui-lin
def get_child_urls(main_page, max_child=20):
    """retrieve urls from a given html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
Project: plugin.video.vrt.nu    Author: pietje666
def __get_menu_items(self, url, soupstrainer_parser_selector, routing_action, video_dictionary_action=None):
        response = requests.get(url)
        tiles = SoupStrainer('a', soupstrainer_parser_selector)
        soup = BeautifulSoup(response.content, "html.parser", parse_only=tiles)
        listing = []
        for tile in soup.find_all(class_="tile"):
            link_to_video = tile["href"]
            thumbnail, title = self.__get_thumbnail_and_title(tile)
            video_dictionary = None
            if video_dictionary_action is not None:
                video_dictionary = video_dictionary_action(tile)

            item = helperobjects.TitleItem(title, {'action': routing_action, 'video': link_to_video},
                                           False, thumbnail, video_dictionary)
            listing.append(item)
        return listing
Project: reahl    Author: reahl
def read(self):
        with io.open(self.filename, 'rb') as dhtml_file:
            def strain(name, attrs):
                if name == 'title':
                    return True
                if name == 'div' and dict(attrs).get('id', None) in self.ids:
                    return True
                return False
            soup = BeautifulSoup(dhtml_file, "lxml", parse_only=SoupStrainer(strain))
            parser = html_parser.HTMLParser()
            self.title = parser.unescape(soup.title.decode_contents()) if soup.title else _('Untitled')
            for an_id in self.ids:
                found_elements = soup.find_all(id=an_id)
                if found_elements:
                    [element] = found_elements
                    self.elements[an_id] = element.decode_contents()
                else:
                    self.elements[an_id] = ''
            self.original_encoding = soup.original_encoding
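The reahl example above also shows a less common SoupStrainer feature: instead of a tag name you can pass a function that is called with each tag's name and attribute dict and should return True for the elements to keep. The following standalone sketch illustrates the same idea; the HTML string and the wanted_ids set are invented for the example.

from bs4 import BeautifulSoup, SoupStrainer

html = ('<html><head><title>Demo</title></head>'
        '<body><div id="keep">wanted</div><div id="skip">unwanted</div></body></html>')

wanted_ids = {'keep'}

def strain(name, attrs):
    # Keep the <title> tag and any <div> whose id is in wanted_ids.
    if name == 'title':
        return True
    return name == 'div' and dict(attrs).get('id') in wanted_ids

soup = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain))
print(soup.title.string)                  # Demo
print(soup.find('div', id='keep').text)   # wanted
print(soup.find('div', id='skip'))        # None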
Project: DropMuse    Author: DropMuse
def get_lyrics_with_urls(urls):
    # TODO

    ret = []
    for url in urls:
        time.sleep(3)
        print(url)

        response = urlopen(url, timeout=5)
        content = response.read()
        for lyrics in bs(content, "html.parser", parse_only=SoupStrainer('p')):
            if(lyrics.has_attr('style')):
                lyrics = re.sub('</?br/?>', '\n', str(lyrics))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                lyrics = re.sub('\n', ' \n', str(lyrics))
                ret.append(lyrics)
                print(lyrics)
                print(str(get_sentiment(lyrics)))
    return ret
Project: DropMuse    Author: DropMuse
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)

    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None

    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    for l in soup:
        for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if "start of lyrics" in lyrics or "Usage" in lyrics:
                lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
                lyrics = re.sub('<.*?>', '', str(lyrics))

                return str(lyrics)
Project: Wakapedia    Author: ACMProjectsTeam3
def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml', parse_only=SoupStrainer('div'))

      ### accounts for categories with over 200 pages
    link = soup.find('a', href=True, text='next page')
    if (link != None):
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except:
            print ("Error: Unable to thread.")

      ### sends links of wikipedia articles in the category to be scraped
    pages_in_category = soup.find('div', {'id':'mw-pages'}).find('div',{'class':'mw-category'})
    for obj in pages_in_category.findAll('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print (count)
            count += 1
Project: Hockey-Scraper    Author: HarryShomer
def get_soup(game_html):
    """
    Uses Beautiful Soup to parse the html document.
    Some parsers work for some pages but not for others... I'm not sure why, so I just try them all here in order.

    :param game_html: html doc

    :return: "soupified" html and player_shifts portion of html (it's a bunch of td tags)
    """
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    soup = BeautifulSoup(game_html.text, "lxml", parse_only=strainer)
    soup = soup.select('td.+.bborder')

    if len(soup) == 0:
        soup = BeautifulSoup(game_html.text, "html.parser", parse_only=strainer)
        soup = soup.select('td.+.bborder')

        if len(soup) == 0:
            soup = BeautifulSoup(game_html.text, "html5lib")
            soup = soup.select('td.+.bborder')

    return soup
Project: misc    Author: yuhui-lin
def get_child_urls(main_page, max_child=20):
    """retrieve urls from a given html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
Project: scientific-paper-summarisation    Author: EdCo95
def collectArticles(urlstr):
    # get article PIIs
    retl = []
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                #print(link["href"])
                retl.append(link["href"])
            linkcnt += 1
    return retl
Project: pitchfx-data-scraper    Author: whazell
def get_links ( url ):
    '''
        Get all the links off of the page:
        gd2.mlb.com/components/game/mlb/year/month/day/

        And finds the links for the games that have the following 
        format:

        gid_year_mm_dd_team1mlb_team2mlb   
    '''
    f = get_page (url)
    if f==False: return False

    # Compile the regex to match links outside of the loop for 
    # performance
    links = []
    regex = re.compile("\"gid_(.*?)\"", re.IGNORECASE)

    # Find all links on page and if they are links to games then add to list
    for link in BeautifulSoup(f, "lxml", parse_only=SoupStrainer('a', href=True)):
        match = regex.findall(str(link))
        if match:
            links.extend(match)

    return links
Project: isar    Author: ilbers
def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
        """
        Return the latest version of a package inside a given directory path
        If error or no version, return ""
        """
        valid = 0
        version = ['', '', '']

        bb.debug(3, "VersionURL: %s" % (url))
        soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            bb.debug(3, "*** %s NO SOUP" % (url))
            return ""

        for line in soup.find_all('a', href=True):
            bb.debug(3, "line['href'] = '%s'" % (line['href']))
            bb.debug(3, "line = '%s'" % (str(line)))

            newver = self._parse_path(package_regex, line['href'])
            if not newver:
                newver = self._parse_path(package_regex, str(line))

            if newver:
                bb.debug(3, "Upstream version found: %s" % newver[1])
                if valid == 0:
                    version = newver
                    valid = 1
                elif self._vercmp(version, newver) < 0:
                    version = newver

        pupver = re.sub('_', '.', version[1])

        bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
                (package, pupver or "N/A", current_version[1]))

        if valid:
            return pupver

        return ""
Project: play-scraper    Author: danieliu
def _parse_multiple_apps(self, list_response):
        """Extracts app ids from a list's Response object, sends GET requests to
        each app, parses detailed info and returns all apps in a list.

        :param list_response: the Response object from a list request
        :return: a list of app dictionaries
        """
        list_strainer = SoupStrainer('span', {'class': 'preview-overlay-container'})
        soup = BeautifulSoup(list_response.content, 'lxml', parse_only=list_strainer)

        app_ids = [x.attrs['data-docid'] for x in soup.select('span.preview-overlay-container')]
        responses = multi_app_request(app_ids)

        app_strainer = SoupStrainer('div', {'class': 'main-content'})
        apps = []
        errors = []
        for i, r in enumerate(responses):
            if r is not None and r.status_code == requests.codes.ok:
                soup = BeautifulSoup(r.content, 'lxml', parse_only=app_strainer)
                apps.append(self._parse_app_details(soup))
            else:
                errors.append(app_ids[i])

        if errors:
            self._log.error("There was an error parsing the following apps: {errors}.".format(
                errors=", ".join(errors)))

        return apps
Project: play-scraper    Author: danieliu
def get_categories():
    """
    Sends a GET request to the front page (base url of the app store),
    parses and returns a list of all available categories.

    Note: May contain some promotions, e.g. "Popular Characters"
    """
    categories = {}
    strainer = SoupStrainer('a', {'class': 'child-submenu-link'})

    response = send_request('GET', s.BASE_URL)
    soup = BeautifulSoup(response.content, 'lxml', parse_only=strainer)
    category_links = soup.select('a.child-submenu-link')

    age = '?age='

    for cat in category_links:
        url = urljoin(s.BASE_URL, cat.attrs['href'])
        category_id = url.split('/')[-1]
        name = cat.string

        if age in category_id:
            category_id = 'FAMILY'
            url = url.split('?')[0]
            name = 'Family'

        if category_id not in categories:
            categories[category_id] = {
                'name': name,
                'url': url,
                'category_id': category_id}

    return categories
Project: -PunkScan    Author: swordli
def resolve_title(url):

    #grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text

        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
Project: LazyBook    Author: cfifty
def getSingle(s):

    # load in your friends dictionary
    structDir = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'Structs'))
    with open(structDir + '/friendsDict.pkl','rb') as input:
        friendsDict = pickle.load(input)

    # -------------- Now, let's compile a list of friends who are single ------------
    Single = []
    iteration = 1
    relatStrainer = SoupStrainer(text=re.compile("Single</div>"))
    relatExt = "/about?section=relationship&pnref=about"
    relatExtBeta = "&sk=about&section=relationship"
    fbook = "https://facebook.com"


    for friend in friendsDict: 
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt

        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text,"lxml",parse_only=relatStrainer)
        comment = soup.find(text=re.compile("Single</div>"))
        if (comment != None):
            # since some names have special characters, we need to strip these
            temp = friend.encode('utf-8').strip()
            Single.append(temp + "\n")
        print friend + " is single = " + str(comment != None)
        # print iteration
        iteration += 1

    # print Single

    singleStr = ''.join(Single)

    with open(structDir + "/single.txt","wb") as f: 
        f.write(singleStr)
Project: LazyBook    Author: cfifty
def getFriendsList(friends, part,s):
    ID = vanity
    if(part == 1):
        index = 0;
    elif(part == 2): 
        index = 24;
    elif(part == 3):
        index = 24+36
    else:
        index = 24+36+36

    # find and scrape their total number of friends
    temp = s.get('https://www.facebook.com/' + ID + '/friends')
    soup = BeautifulSoup(temp.text,"lxml")
    strainer = SoupStrainer('a',href=re.compile("fref=fr_tab"))

    # iterate over the entire friends list and pull out the relevant information from
    # the html docs that display 24 or 36 friends each
    while (index < (numFriends)): 
        if index == 0:
            temp = s.get('https://m.facebook.com/' + ID + '/friends')
            soup = BeautifulSoup(temp.text,"lxml",parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = 24 + 36*3
        else: 
            temp = (s.get('https://m.facebook.com/' + ID + '/friends?startindex='
                + str(index)))
            soup = BeautifulSoup(temp.text,"lxml",parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = index + 36*4
    return
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table')
Project: BachMaker    Author: anbrjohn
def scrape(webpage, extension=".mid"):
    # Get all the files of a given extension from a webpage
    http = httplib2.Http()
    status, response = http.request(webpage)
    files = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_attr('href'):
            linkname = link['href']
            if linkname[-len(extension):] == extension:
                files += [linkname]
    return files
Project: AlfredWorkflow-DYHub    Author: Jeff2Ma
def get_film_info_subhd():
    items = []
    target_url = 'http://subhd.com'
    content = urllib2.urlopen(target_url).read().decode('utf-8')
    only_hotl_tags = SoupStrainer(class_='hotl')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0
    for link in soup.find_all('a', limit=7):
        link_url = target_url + link.get('href')
        link_img = target_url + link.findChildren('img')[0].get('src')
        cover_img = 'http://img3.doubanio.com/view/movie_poster_cover/spst/public/' + link_img.split('/sub/poster/l/')[
            1]
        link_title = link.findChildren('img')[0].get('title')

        save_path = os.path.abspath("./icons/icon-s")
        imgData = urllib2.urlopen(cover_img).read()
        fileName = save_path + str(i) + '.jpg'
        output = open(fileName, 'wb+')
        output.write(imgData)
        output.close()

        json_item = dict(title=link_title, subtitle='', arg=link_url, icon='icons/icon-s' + str(i) + '.jpg')
        items.append(json_item)
        i = i + 1

    return generate_xml(items)
Project: punkspider    Author: aiwennba
def resolve_title(url):

    #grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text

        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
Project: VulScript    Author: y1ng1996
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a' ,href = re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only= links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)" ,version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
        except Exception,e:
            color_output("[-]....get version error:%s" % str(e))
            sys.exit()
Project: Wakapedia    Author: ACMProjectsTeam3
def scrape(url):
      ### opens url so it's like a file
    try:
        link = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        return ''

    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=SoupStrainer('p'))

    alltxt = ''
      ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        alltxt = alltxt + para.get_text() + ' '

    return alltxt
Project: Wakapedia    Author: ACMProjectsTeam3
def scrape(bun):
    ### opens url so it's like a file
  link = urllib.request.urlopen(bun.URL)

  soup = None
    ### flag for retrieving categories (or not)
  if bun.categories:
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml')
  else:
    p_tags = SoupStrainer('p')
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=p_tags)

    ### dictionary of paragraphs
  doc = {}
    ### add token and count to replace paragraphs in HTML
  token = 'Waka'
  count = 0

    ### all the paragraph texts in one string
  alltxt = ''
    ### iterate thru the <p> tags
  for para in soup.find_all('p'):
      ### put raw text in dictionary
    doc[token+str(count)] = para.get_text()
    alltxt = alltxt + para.get_text() + ' '
      ### replace <p> contents with a token
    para.string = token + str(count)
    count+=1

    ### get the list of categories
  cats = []
  if bun.categories:
    for cat in soup.find('div', {'id': 'catlinks'}).find('ul').findAll('li'):
      cats.append('https://en.wikipedia.org' + cat.find('a')['href'])

  for css in soup.find_all('link', rel='stylesheet'):
    css['href'] = '//en.wikipedia.org/' + css['href']

  for js in soup.find_all('script', src=re.compile('.*')):
    js['src'] = '//en.wikipedia.org/' + js['src']

    ### update stuff in Bundle
  bun.paragraphs = doc
  bun.text = alltxt
  bun.html = str(soup.encode('ascii', 'xmlcharrefreplace').decode('utf-8'))
  bun.categories = cats

  return bun
Project: hacker-scripts    Author: restran
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a' ,href = re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only= links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)" ,version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
        except Exception,e:
            color_output("[-]....get version error:%s" % str(e))
            sys.exit()
Project: quality-content-synthesizer    Author: pratulyab
def __init__(self, text_blob, *args, **kwargs):
        TextParser.text_strainer = SoupStrainer(TextParser.strain_through)
        self.soup = BeautifulSoup(text_blob, 'html.parser', parse_only=TextParser.text_strainer)
        self.text = self._extract_text()
Project: kenya-news-scrapper    Author: alfiepoleon
def get_tuko():
    tuko_url = 'https://www.tuko.co.ke'
    if check_connection(tuko_url):
        tuko = requests.get(tuko_url)
        soup = BeautifulSoup(tuko.text, 'lxml', parse_only=SoupStrainer('a'))
        tuko = []
        for link in soup.select('a.news__link', limit=6):
            news_title = '{}({})'.format(link.get_text(), link.get('href'))
            tuko_link = requests.get(link.get('href'))
            soup_link = BeautifulSoup(tuko_link.text, 'lxml', parse_only=SoupStrainer(['p', 'meta', 'img']))
            try:
                article_date = soup_link.find("meta", itemprop="datePublished")['content']
            except (TypeError, ValueError):
                print('Tuko: No article date meta')
                continue
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='article-image__picture')['src']
                except (TypeError, ValueError):
                    print('Tuko: No image found')
            news_dict = {
                'category': 'news',
                'source': 'tuko',
                'title': link.get_text(),
                'link': link.get('href'),
                'image': image,
                'content': [link_inner.get_text().strip(' ,.-') for link_inner in
                            soup_link.select('p.align-left > strong', limit=3) if not
                            link_inner.get_text().startswith('READ ALSO')],
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link.get('href')}, news_dict, upsert=True)
            tuko.append(news_dict)
        return tuko
Project: kenya-news-scrapper    Author: alfiepoleon
def get_capital():
    capital_url = 'http://www.capitalfm.co.ke/news/{}/{:02}'.format(today.year, today.month)
    if check_connection(capital_url):
        capital = requests.get(capital_url)
        soup = BeautifulSoup(capital.text, 'lxml', parse_only=SoupStrainer('div'))
        capital = []
        for article in soup.select('div.entry-information'):
            article_link = article.a
            link = article_link['href']
            title = article_link.get_text()
            capital_link = requests.get(link)
            soup_link = BeautifulSoup(capital_link.text, 'lxml', parse_only=SoupStrainer(['meta', 'img', 'div']))
            article_date = soup_link.find("meta", property="article:published_time")['content']
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='size-full')['src']
                except (TypeError, ValueError):
                    print('Capital: No image found')

            try:
                content = get_content(soup_link, 'entry-content').split('\u2013')[1].strip()
            except IndexError:
                content = get_content(soup_link, 'entry-content').strip()
            news_dict = {
                'category': 'news',
                'source': 'capital',
                'title': title,
                'link': link,
                'image': image,
                'content': content,
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link}, news_dict, upsert=True)
            capital.append(news_dict)
        return capital
Project: CMSpyder    Author: j4v
def discover_domains(subdomain_id, request_result_text):

    # retrieve subdomain object
    subdomain = Subdomain.objects.get(id=subdomain_id)

    # Create and start logger
    logger = create_logger('discover_{0}.log'.format(subdomain.id))

    logger.info('discover {0} START'.format(subdomain.id))

    # keep list or extracted subdomains to limit db queries
    extracted_subdomain = []

    for link in BeautifulSoup(request_result_text,
                              'html.parser',  # todo use lxml to speed things up
                              parseOnlyThese=SoupStrainer('a')):
        # todo this only saves 'href' attributes in 'a' elements, can be missing valid entries
        if link.has_attr('href'):
            href = link['href']
            extract_result = extract_subdomain(href)
            if extract_result not in extracted_subdomain:
                extracted_subdomain.append(extract_result)
                new_subdomain = import_subdomain(href,
                                                 discovered_by=subdomain)
                logger.info('discover found {0}'.format(new_subdomain))

    logger.info('discover {0} DONE'.format(subdomain_id))

    # release memory
    gc.collect()
Project: gushiwen-crawler    Author: shenyunhang
def run(self):
        while True:
            data = self._queue_data.get()
            self._index = data[0]
            html_contents = data[1]

            html_contents = re.sub('<br />', '\n', html_contents)
            only_main3 = SoupStrainer(class_="main3")
            soup_only_main3 = BeautifulSoup(
                html_contents, 'html.parser', parse_only=only_main3)

            # stop once too many consecutive empty pages have been seen
            if self._num_empty > 1000:
                break
            # skip pages that only contain the deleted-page placeholder text
            if soup_only_main3.get_text(strip=True) == self._delete:
                self._num_empty += 1
                continue
            else:
                self._num_empty = 0

            title_poetry = soup_only_main3.find(class_='son1').h1.string

            soup_only_main3.find(class_='son2').p.span.decompose()
            dynasty_poetry = soup_only_main3.find(class_='son2').p.string
            soup_only_main3.find(class_='son2').p.decompose()

            soup_only_main3.find(class_='son2').p.span.decompose()
            author_poetry = soup_only_main3.find(class_='son2').p.string
            soup_only_main3.find(class_='son2').p.decompose()

            soup_only_main3.find(class_='son2').p.decompose()
            soup_only_main3.find(class_='yizhu').decompose()
            content_poetry = soup_only_main3.find(
                class_='cont',id='cont').get_text()
            content_poetry = re.sub('[\n]+', '\n', content_poetry)
            content_poetry = content_poetry.strip('\n')

            path_html, path_txt = get_output_path(dynasty_poetry, self._index)
            file_html = open(path_html, 'w')
            file_html.writelines(data[1].encode('utf-8'))
            file_html.close()
            file_txt = open(path_txt, 'w')
            file_txt.writelines(title_poetry.encode('utf-8') + '\n')
            file_txt.writelines(dynasty_poetry.encode('utf-8') + '\n')
            file_txt.writelines(author_poetry.encode('utf-8') + '\n')
            file_txt.writelines(content_poetry.encode('utf-8') + '\n')
            file_txt.close()

            print '-----------------------------------------------------------'
            print 'Parser: ', self._index
            print 'Title:   ', title_poetry
            print 'Dynasty: ', dynasty_poetry
            print 'Author:  ', author_poetry
            print 'Content:\n', content_poetry

        print 'Parser finish'
Project: isar    Author: ilbers
def _check_latest_version_by_dir(self, dirver, package, package_regex,
            current_version, ud, d):
        """
            Scan every directory in order to get upstream version.
        """
        version_dir = ['', '', '']
        version = ['', '', '']

        dirver_regex = re.compile("(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
        s = dirver_regex.search(dirver)
        if s:
            version_dir[1] = s.group('ver')
        else:
            version_dir[1] = dirver

        dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
                ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
        bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))

        soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
        if not soup:
            return version[1]

        for line in soup.find_all('a', href=True):
            s = dirver_regex.search(line['href'].strip("/"))
            if s:
                sver = s.group('ver')

                # When prefix is part of the version directory it need to
                # ensure that only version directory is used so remove previous
                # directories if exists.
                #
                # Example: pfx = '/dir1/dir2/v' and version = '2.5' the expected
                # result is v2.5.
                spfx = s.group('pfx').split('/')[-1]

                version_dir_new = ['', sver, '']
                if self._vercmp(version_dir, version_dir_new) <= 0:
                    dirver_new = spfx + sver
                    path = ud.path.replace(dirver, dirver_new, True) \
                        .split(package)[0]
                    uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                    pupver = self._check_latest_version(uri,
                            package, package_regex, current_version, ud, d)
                    if pupver:
                        version[1] = pupver

                    version_dir = version_dir_new

        return version[1]
Project: cabu    Author: thylong
def extract_links(response_content, unique=False, blacklist_domains=[],
                  whitelist_domains=[], regex=None, zen_path=None,
                  blacklist_extensions=[], whitelist_extensions=[]):
    """Extract links from a response content.

    Args:
        response_content (str): The HTML page received in a Response Object.
        unique (bool): A parameter defining if the list can contain duplicates.
                       Defaults to False.
        blacklist_domains (list): List of domains to exclude from the result.
        whitelist_domains (list): List of domains to include in the result.
        regex (list): A regular expression filter on the link.
                      Defaults to None.
        zen_path (list): A selector to restrict the XPath to parse with bs4.
        blacklist_extensions (list): List of file extensions to exclude from the result.
        whitelist_extensions (list): List of file extensions to include in the result.

    Returns:
        links (list): A list of extracted and filtered links.
    """

    if any([item in blacklist_domains for item in whitelist_domains]) \
       or any([item in blacklist_extensions for item in whitelist_extensions]):
        raise LinkExtractorException('blacklist_domains and whitelist_domains '
                                     'can`t contain common value(s).')

    soup = BeautifulSoup(
        response_content, "html.parser", parse_only=SoupStrainer('a')
    )
    links = [a.text for a in soup]

    if unique:
        links = list(set(links))

    if regex:
        links = filter_links(links, regex)

    if whitelist_domains:
        for domn in whitelist_domains:
            links = filter_links(links, domn.replace('.', '\.'), include=True)

    if blacklist_domains:
        for domn in blacklist_domains:
            links = filter_links(links, domn.replace('.', '\.'), include=False)

    if whitelist_extensions:
        for ext in whitelist_extensions:
            links = filter_links(links, ext.replace('.', '\.'), include=True)

    if blacklist_extensions:
        for ext in blacklist_extensions:
            links = filter_links(links, ext.replace('.', '\.'), include=False)

    return links
Project: LazyBook    Author: cfifty
def getFriendsBirthdays(birthdays,friendsDict,s): 

# --------- Getting Birthday Info -----------
    relatStrainer = SoupStrainer(text=re.compile("Birthday"))
    relatExt = "/about"
    relatExtBeta = "&sk=about"
    fbook = "https://facebook.com"  

    #***** Note: additional string processing is needed because we are scraping from the main page
    for friend in friendsDict: 
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt

        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text,"lxml",parse_only=relatStrainer)

        subString = soup.find(text=re.compile("Birthday"))

        if (subString != None):
            # Cut off everything before Birthday
            stringIndex = subString.find('Birthday')
            subString = subString[stringIndex:]

            # Cut off the prefix to get the birthdate and everything after
            stringIndex = subString.find('<div>')
            subString = subString[(stringIndex+5):]

            # Get rid of everything after the birthday
            stringIndex = subString.find('</div>')
            subString = subString[:stringIndex]

            # Standardize the birthday date by cutting off the year if there is one
            commaIndex = subString.find(',')
            if (commaIndex != -1):
                subString = subString[:commaIndex]

            if (subString in birthdays):
                birthdays[subString].append(friend)
            else:
                birthdays[subString] = [friend]

            print friend + " has birthday " + subString
    return
Project: AlfredWorkflow-DYHub    Author: Jeff2Ma
def get_film_info_dytt():
    items = []
    target_url = 'http://www.dy2018.com/'
    content = urllib2.urlopen(target_url).read()
    content = unicode(content,'GBK').encode('utf-8')
    only_hotl_tags = SoupStrainer(class_='co_content222')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0

    key = re.compile(u'《(.+?)》')  # assumed pattern: a title wrapped in 《》 marks

    for link in soup.find_all('li', limit=8):

        if i != 0:
            link_url = target_url + link.findChildren('a')[0].get('href')
            link_time = link.findChildren('span')[0].string
            link_title = link.findChildren('a')[0].get('title')[5:]

            file_name = re.findall(u'《(.*?)[》|】]', link_title)[0]  # assumed: film name between 《》/】 marks

            # print file_name.encode("utf-8")

            douban_api = 'https://api.douban.com/v2/movie/search?q=' + file_name.encode("utf-8")
            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            headers = {'User-Agent': user_agent}
            req = urllib2.Request(douban_api, None, headers)
            api_content = urllib2.urlopen(req)
            json_content = json.load(api_content)['subjects'][0]['images']['small']
            img_url = json_content
            #print img_url

            save_path = os.path.abspath("./icons/icon")
            img_data = urllib2.urlopen(img_url).read()
            file_name = save_path + str(i) + '.jpg'
            output = open(file_name, 'wb+')
            output.write(img_data)
            output.close()

            json_item = dict(title=link_title, subtitle='Time: ' + link_time, arg=link_url, icon='icons/icon' + str(i) + '.jpg')
            items.append(json_item)
        i = i + 1

    return generate_xml(items)

# print(get_film_info_dytt())
Project: kenya-news-scrapper    Author: alfiepoleon
def get_standard():
    standard_url = 'https://www.standardmedia.co.ke/'
    if check_connection(standard_url):
        standard = requests.get(standard_url)
        soup = BeautifulSoup(standard.text, 'lxml', parse_only=SoupStrainer('div'))
        standard = []
        for link in soup.select('.col-xs-8.zero a', limit=11):
            if link.get_text():
                news_title = '{}({})'.format(link.get_text().strip(), link.get('href'))
                standard_link = requests.get(link.get('href'))
                soup_link = BeautifulSoup(standard_link.text, 'lxml', parse_only=SoupStrainer(['script', 'div']))
                try:
                    data = json.loads(soup_link.find('script', type='application/ld+json').text.replace("\\", r"\\"))
                    article_date = data['dateModified']
                    image = data['image']['url']
                    if image == 'https://www.standardmedia.co.ke':
                        image = ''
                except (ValueError, AttributeError):
                    print('Standard: invalid json detected')
                    continue
                try:
                    content = get_content(soup_link, 'main-article')
                except AttributeError:
                    try:
                        content = get_content(soup_link, 'story')
                    except AttributeError:
                        print('Standard: No content found')
                        continue

                news_dict = {
                    'category': 'news',
                    'source': 'standard',
                    'title': link.get_text().strip(),
                    'link': link.get('href'),
                    'image': image,
                    'content': content,
                    'date': article_date,
                    'date_added': datetime.datetime.utcnow()
                }
                collection.update({'link': link.get('href')}, news_dict, upsert=True)
                standard.append(news_dict)
        return standard