Python bs4 module: Comment() example source code

We extracted the following 19 code examples from open-source Python projects to illustrate how to use bs4.Comment().
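For orientation, here is a minimal standalone sketch of the core pattern the examples below share: bs4 exposes HTML comments as Comment nodes (a NavigableString subclass), so they can be matched with a string filter and removed with extract(). This sketch is illustrative and not taken from any of the listed projects.

from bs4 import BeautifulSoup, Comment

html = "<div>visible<!-- hidden note --><p>text</p></div>"
soup = BeautifulSoup(html, "html.parser")

# match every Comment node in the tree
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
print(comments)  # [' hidden note ']

# detach each comment from the tree
for comment in comments:
    comment.extract()
print(soup)  # <div>visible<p>text</p></div>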

Project: FMBlog    Author: vc12345679    | Project source | File source
def get_blog(cls, file_name):
        if cls.is_exist(file_name):
            with open(cls._real_file_name(file_name), 'r', encoding='utf-8') as f:
                txt = f.read()
            mtime = os.path.getmtime(cls._real_file_name(file_name))
            from bs4 import BeautifulSoup, Comment
            import yaml
            comment = BeautifulSoup(txt, "html.parser").find(text=lambda text: isinstance(text, Comment))
            if comment is not None:
                blog_info = yaml.safe_load(comment)  # safe_load: yaml.load without a Loader is unsafe and fails on PyYAML >= 6
                if 'use_toc' not in blog_info:
                    blog_info['use_toc'] = False
                html = markdown(txt)
                return blog_info, txt, html, mtime
            else:
                return
        else:
            return
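The snippet above stores blog metadata as a YAML block inside an HTML comment. A standalone illustration of just that parsing step (the markup here is invented):

from bs4 import BeautifulSoup, Comment
import yaml

txt = "<!--\ntitle: Hello\nuse_toc: true\n-->\n<p>body</p>"
comment = BeautifulSoup(txt, "html.parser").find(string=lambda t: isinstance(t, Comment))
info = yaml.safe_load(comment)  # the comment body is plain YAML
print(info)  # {'title': 'Hello', 'use_toc': True}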
Project: DropMuse    Author: DropMuse    | Project source | File source
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)

    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None

    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    # scan comment nodes; the lyrics block is marked by "start of lyrics" / "Usage" comments
    for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
        if "start of lyrics" in lyrics or "Usage" in lyrics:
            lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
            lyrics = re.sub('<.*?>', '', lyrics)

            return lyrics
Project: dactyl    Author: ripple    | Project source | File source
def get_overrides(soup):
    overrides = []
    comments = soup.find_all(string=lambda text:isinstance(text,Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = [o.strip() for o in new_overrides]
            logger.info("Overrides found: %s" % new_overrides)
            overrides += new_overrides
    return overrides
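OVERRIDE_COMMENT_REGEX is defined elsewhere in dactyl. For illustration only, a hypothetical pattern of the same shape (one capture group holding a comma-separated list) shows how the function behaves:

import re
from bs4 import BeautifulSoup, Comment

OVERRIDE_COMMENT_REGEX = r"\s*STYLE_OVERRIDE:\s*(.+)"  # hypothetical placeholder, not dactyl's real pattern
soup = BeautifulSoup("<!-- STYLE_OVERRIDE: compact, dark -->", "html.parser")
for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
    m = re.match(OVERRIDE_COMMENT_REGEX, comment)
    if m:
        print([o.strip() for o in m.group(1).split(",")])  # ['compact', 'dark']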
Project: python-spider    Author: naginoasukara    | Project source | File source
def codeAnalyse(html, clas, name = ""):
    soup = BeautifulSoup(html,"html.parser")
    source = soup.find('code', id = "__cnt_0_4")
    soup = BeautifulSoup(str(source),"html.parser")
    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    soup = BeautifulSoup(comments[0],"html.parser")
    source = soup.find('a', {"class":clas},string = name)
    pos = 0
    for son in source.parent.find_next_sibling().find_next_siblings():
        pos = pos+1
        print(source.string, ":", son.a.string, pos, son.a.attrs['href'])
Project: azure-search-ta    Author: yokawasa    | Project source | File source
def __get_navigable_strings(self,soup):
        if isinstance(soup, NavigableString):
            if type(soup) not in (Comment, Declaration) and soup.strip():
                yield soup
        elif soup.name not in ('script', 'style'):
            for c in soup.contents:
                for g in self.__get_navigable_strings(c):
                    yield g
Project: cc98    Author: zjuchenyuan    | Project source | File source
def text(self, target=None, ignore_pureascii_words=False):
        """
        Get all text in the HTML, skipping scripts and comments
        :param target: the BeautifulSoup object, default self.b
        :param ignore_pureascii_words: if True, only return strings that contain Chinese characters (may be useful for English-language sites)
        :return: list of str
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString,Doctype
        result = []
        for descendant in target.descendants:
            if not isinstance(descendant, NavigableString) \
                    or isinstance(descendant,Doctype) \
                    or descendant.parent.name in ["script", "style"] \
                    or isinstance(descendant, Comment) \
                    or "none" in descendant.parent.get("style","")\
                    or "font-size:0px" in descendant.parent.get("style",""):
                continue
            data = descendant.strip()
            if len(data) > 0:
                if not ignore_pureascii_words or any([ord(i)>127 for i in data]):
                    if PY2:
                        result.append(data.encode())
                    else:
                        result.append(data)
        return result
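A condensed, standalone sketch of the same visible-text filter (the function name and markup are illustrative, not the project's API):

from bs4 import BeautifulSoup, Comment
from bs4.element import NavigableString, Doctype

def visible_text(html):
    soup = BeautifulSoup(html, "html.parser")
    out = []
    for node in soup.descendants:
        # keep only real text nodes, skipping doctypes, comments, and script/style content
        if (not isinstance(node, NavigableString)
                or isinstance(node, (Doctype, Comment))
                or node.parent.name in ("script", "style")):
            continue
        data = node.strip()
        if data:
            out.append(data)
    return out

print(visible_text("<p>hi<!-- skip --></p><script>x()</script>"))  # ['hi']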
Project: Jarvis    Author: sukeesh    | Project source | File source
def get_lyric(self, singer, song):
        # Replace spaces with _
        singer = singer.replace(' ', '_')
        song = song.replace(' ', '_')
        url = 'http://lyrics.wikia.com/{0}:{1}'.format(singer, song)
        req = requests.get(url)
        s = BeautifulSoup(req.text, "lxml")
        # Get main lyrics holder
        lyrics = s.find("div", {'class': 'lyricbox'})
        if lyrics is None:
            return None
        # Remove Scripts
        [s.extract() for s in lyrics('script')]
        # Remove comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()

        # TODO: check if you need the encode/decode thing, if you do then do a try catch for it

        # get output as string and remove non unicode characters and replace <br> with newlines
        # output = str(lyrics).encode('utf-8', errors = 'replace')[22:-6:].decode('utf-8').replace('\n','').replace('<br/>','\n')
        output = str(lyrics).replace('\n', '').replace('<br/>', '\n')[22:-6:]
        try:
            return output
        except:
            return output.encode('utf-8')
Project: Spotilyrics    Author: eitchtee    | Project source | File source
def lyricswikia(artist, song):
    # original code found @
    # https://github.com/geekpradd/PyLyrics/blob/master/PyLyrics/functions.py
    song = song.split(' - ', 1)[0]
    artist = artist.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(artist, song)
    print('Trying:', url)
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is not None:
        # Remove Scripts
        [e.extract() for e in lyrics('script')]

        # Remove Comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        # Get output as a string and remove non unicode characters and replace
        # <br> with newlines
        lyrics = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode(
            "utf-8").replace('\n', '').replace('<br/>', '\n')
    try:
        return lyrics
    except:
        return lyrics.encode('utf-8')
Project: studentsdb    Author: PyDev777    | Project source | File source
def _get_commented_CDN_tags(self):
        def get_comment(s):
            return s if isinstance(s, Comment) and '//' in s and s.strip()[:4] in ['<lin', '<scr'] else ''
        comments = bs(self._get_template()).find_all(string=get_comment)
        tags = self._unitags(bs(str(comments)).select('link[href*="//"], script[src*="//"]'))
        if tags:
            for tag in tags:
                for comment in comments:
                    if tag['open'] in comment and tag['ref'] in comment:
                        tag['comment'] = comment
        return tags
Project: MarkdownLivePreview    Author: math2001    | Project source | File source
def strip_html_comments(html):
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(text=lambda text: isinstance(text, html_comment)):
        element.extract()
    return str(soup)
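html_comment here is presumably bs4's Comment imported under an alias. A quick usage check:

print(strip_html_comments("<p>hello<!-- draft note --></p>"))  # <p>hello</p>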
Project: nnnba    Author: joeyism    | Project source | File source
def findSalaries(self, soupped):
        total_salaries = []
        all_all_salaries = soupped.find("div", {"id": "all_all_salaries"})
        comments=all_all_salaries.find_all(string=lambda text:isinstance(text,Comment))
        raw_salary_rows = BeautifulSoup(comments[0], "lxml").find("tbody").find_all("tr")
        for each_raw_salary in raw_salary_rows:
            year = each_raw_salary.find("th").text.replace("-","_").encode("utf8")
            salary = self.salaryTextToFloat(each_raw_salary.find_all("td")[2].text)
            total_salaries.append((year, salary))
        return total_salaries
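The pattern above, re-parsing the body of a comment as its own document, is common on sites that ship whole tables inside HTML comments. A minimal standalone illustration (the markup is invented):

from bs4 import BeautifulSoup, Comment

html = '<div id="all_all_salaries"><!-- <table><tbody><tr><th>2016-17</th><td>$1</td></tr></tbody></table> --></div>'
div = BeautifulSoup(html, "html.parser").find("div", {"id": "all_all_salaries"})
comment = div.find(string=lambda text: isinstance(text, Comment))
inner = BeautifulSoup(comment, "html.parser")  # parse the comment body as HTML
print(inner.find("th").text)  # 2016-17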
Project: daily_notification    Author: zjuchenyuan    | Project source | File source
def text(self, target=None, ignore_pureascii_words=False):
        """
        Get all text in the HTML, skipping scripts and comments
        :param target: the BeautifulSoup object, default self.b
        :param ignore_pureascii_words: if True, only return strings that contain Chinese characters (may be useful for English-language sites)
        :return: list of str
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString,Doctype
        result = []
        for descendant in target.descendants:
            if not isinstance(descendant, NavigableString) \
                    or isinstance(descendant,Doctype) \
                    or descendant.parent.name in ["script", "style"] \
                    or isinstance(descendant, Comment) \
                    or "none" in descendant.parent.get("style","")\
                    or "font-size:0px" in descendant.parent.get("style",""):
                continue
            data = descendant.strip()
            if len(data) > 0:
                if not ignore_pureascii_words or any([ord(i)>127 for i in data]):
                    if PY2:
                        result.append(data.encode())
                    else:
                        result.append(data)
        return result
Project: tidy_page    Author: desion    | Project source | File source
def clean_tag(doc):
    for tag in doc.find_all(["style", "script","form", "textarea", "input", "iframe", "select","frame", "link"]):
        tag.extract()
    comments = doc.findAll(text=lambda text:isinstance(text, Comment))
    [comment.extract() for comment in comments]
Project: plumeria    Author: sk89q    | Project source | File source
def search_esv(message, verse):
    """
    Search for a bible passage from the English Standard Version.

    Example::

        bible Romans 12:16

    """
    r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
        "key": "IP",
        "passage": verse,
        "output-format": "crossway-xml-1.0",
        "include-simple-entities": "true",
    })

    doc = BeautifulSoup(r.text(), features="lxml")
    if not doc.passage:
        raise CommandError("Verse not found.")
    lines = []
    for verse_unit in doc.passage.content.find_all('verse-unit'):
        num = int(verse_unit.find('verse-num').text)
        woc = verse_unit.find('woc')
        if woc:
            text = woc.text
        else:
            text = "".join([str(node) for node in verse_unit.children
                            if isinstance(node, NavigableString) and not isinstance(node, Comment)])
        lines.append("**{}** {}".format(num, text.strip()))
    return "\n".join(lines)
Project: pycrawler    Author: zyq001    | Project source | File source
def unwrapUseless(soup):
    # unwrap useless inline tags
    for a in soup.select('a'):
        a.unwrap()
    for a in soup.select('b'):
        a.unwrap()
    for a in soup.select('font'):
        a.unwrap()
    for a in soup.select('span'):
        a.unwrap()
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
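Note the two different operations above: unwrap() removes a tag but keeps its children, while extract() removes a node and everything inside it. A quick illustration:

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup("<p><b>bold</b> text<!-- note --></p>", "html.parser")
soup.b.unwrap()  # drop the <b> tag, keep its text
for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
    c.extract()  # drop the comment node entirely
print(soup)  # <p>bold text</p>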
Project: Magic-Spoiler    Author: Cockatrice    | Project source | File source
def scrape_mythic_card_page(url):
    r = requests.get(url)

    soup = BS(r.text, "html.parser")

    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    card = {}

    for comment in comments:
        if comment == 'CARD NAME':
            card['name'] = comment.next_element.strip().replace('"', '')
        elif comment == 'MANA COST':
            try:
                card['manaCost'] = comment.next_element.strip().replace('"', '')
            except:
                pass
        elif comment == 'TYPE':
            card['type'] = comment.next_element.strip().replace('"', '')
        elif comment == 'CARD TEXT':
            buildText = ''
            for element in comment.next_elements:
                try:
                    if not element.strip() in ['CARD TEXT', 'FLAVOR TEXT', '']:
                        if buildText != '':
                            buildText += '\n'
                        buildText += element.strip()
                    if element.strip() == 'FLAVOR TEXT':
                        card['text'] = buildText
                        break
                except:
                    pass
        elif comment == 'Set Number':
            try:
                card['number'] = comment.next_element.strip()
            except:
                pass
        elif comment == 'P/T':
            try:
                if comment.next_element.strip().split('/')[0] != '':
                    card['power'] = comment.next_element.strip().split('/')[0]
                    card['toughness'] = comment.next_element.strip().split('/')[1]
            except:
                pass

    return card
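The scraper above relies on each field being labeled by a comment, with the value as the very next node in document order. In miniature:

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup('<td><!--CARD NAME--> "Storm Crow" </td>', "html.parser")
comment = soup.find(string=lambda t: isinstance(t, Comment))
print(comment.next_element.strip().replace('"', ''))  # Storm Crow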
Project: relational-social-media-search-engine    Author: indervirbanipal    | Project source | File source
def loadSearch(self, url, firstName='results'):
        """
        Loads the search page using the url provided and returns raw search results
        """
        print " inside loadSearch .."

        '''
        97.77.104.22:80
        174.129.204.124:80
        '''
        proxy = {
            "http":"209.222.25.83:3128",
        }
        headers = {'Accept-Encoding': 'identity'}
        html2 = requests.get(url, proxies=proxy, headers=headers)
        print "HTML 2"
        # print html2.content
        # html = html2.content
        html = self.loadPage(url)
        print "SPAGE"
        # print sPage[:200]
        spContent = BeautifulSoup(html)

        #title = spContent.find('title')
        #if title is not None:
            #if title.string is not lSrchTitle:
                #sys.exit('There is some problem with url provided, it does not correspond to Linkedin Search')
        comment = None
        comments = spContent.findAll(text=lambda text:isinstance(text, Comment))
        print "COMMENTS"
        # print comments
        # print " >> BEAUTIFULSOUP FINDALL"
        #print comments
        cLen = len(comments)
        print "Length of COmments"+cLen.__str__()
        if cLen > 0 and cLen > 11:
            comment = comments[11]  
        if comment is None:
            for cmnt in comments:   
                if firstName in cmnt:
                    comment = cmnt
        print "output COMMENTS :"
        # print comment            
        return comment
Project: pycrawler    Author: zyq001    | Project source | File source
def dealLocalFile():
    rootDir = os.getcwd()

    list_dirs = os.walk(rootDir)
    for root, dirs, files in list_dirs:
        # for d in dirs:
        #     print os.path.join(root, d)
        for f in files:
            if f.endswith('html'):
                path = os.path.join(root, f)
                soup = BeautifulSoup(open(path), 'html.parser')
                soup = soup.body

                # remove comments
                comments = soup.findAll(text=lambda text: isinstance(text, Comment))
                [comment.extract() for comment in comments]

                # unwrap span tags
                spans = soup.select("span")
                [span.unwrap() for span in spans]

                # unwrap font tags
                fonts = soup.select("font")
                [font.unwrap() for font in fonts]

                pps = soup.select("p")
                for pp in pps:
                    del pp['style']
                    # text = pp.get_text()
                    # text = text.strip()
                    # if text is '' or len(text) < 1:  # empty p tag, remove it
                    #     pp.extract()
                # #
                # imgs = soup.select("img")
                # for img in imgs:
                #     src = img['src']
                #     index = src.find('/')
                #     if index != -1:
                #         newSrc = 'imgs' + src[index:]
                #         img['src'] = newSrc
                #         # print newSrc
                ps = soup.select('p')
                title = ''
                for p in ps:
                    if p.get_text() != '' and len(p.get_text()) > 0:
                        title = p.get_text()
                        p.extract()
                        break
                fo = open(title + ".html", "w")
                fo.write(soup.prettify())

                # close the file
                fo.close()

                # print soup.prettify()
Project: feature_engineering    Author: webeng    | Project source | File source
def _parse_tags(cls, html):

        excluded_tags = ['script', 'style', 'noscript', 'html', 'head', 'meta',
                         'link', 'body', 'input', 'form', 'a']
        minimum_text_node_length = 8

        y_data = []
        text_data = []
        tag_signatures = []

        soup = BeautifulSoup(html, 'html.parser')

        for tag in soup.findAll():

            path = '.'.join(reversed([p.name for p in tag.parentGenerator() if p]))
            tag_signature = '.'.join([path, tag.name])

            if (tag.name not in excluded_tags) and ('table' not in path):

                tag_text = []
                for text in tag.contents:
                    if isinstance(text, Comment):
                        continue
                    try:
                        text = text.strip()
                        aux = BeautifulSoup(text, 'html.parser')
                        if aux.find() is None:
                            tag_text.append(text)
                    except Exception:
                        pass

                tag_text = "\n".join(tag_text)

                if tag_text and len(tag_text) > minimum_text_node_length:
                    if tag_text not in text_data:

                        # Remove line returns and tabs
                        tag_text = cls._remove_chars(tag_text)
                        if tag_text:
                            y_data.append(len(tag_text))
                            text_data.append(tag_text)
                            tag_signatures.append(path)

        x = np.array(y_data)
        return x, text_data, tag_signatures