Python bs4.element 模块,NavigableString() 实例源码

我们从Python开源项目中,提取了以下14个代码示例,用于说明如何使用bs4.element.NavigableString()

项目:spyglass    作者:Crypta-Eve    | 项目源码 | 文件源码
def parseStatus(rtext):
    """Derive a status from the plain-text children of ``rtext``.

    Only the direct ``NavigableString`` entries of ``rtext.contents`` are
    inspected, in document order. The first text that matches one of the
    rules below decides the result:

    * contains the word ``CLEAR`` or ``CLR`` and does not end with ``?``
      -> ``states.CLEAR``
    * contains the word ``STAT`` or ``STATUS`` -> ``states.REQUEST``
    * contains a ``?`` anywhere -> ``states.REQUEST``
    * is exactly one of the "blue only" phrases -> ``states.CLEAR``

    :param rtext: object exposing a ``contents`` sequence
    :return: ``states.CLEAR``, ``states.REQUEST``, or ``None`` when no text
        matches any rule
    """
    # Each phrase must be its own tuple element: adjacent string literals
    # without a comma are silently concatenated by Python.
    blueOnlyPhrases = ("BLUE", "BLUES ONLY", "ONLY BLUE", "STILL BLUE", "ALL BLUES")

    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        originalText = text.strip().upper()
        # Word matching is done on a copy with the ignorable characters
        # removed; the "?" checks use the untouched text.
        upperText = originalText
        for char in CHARS_TO_IGNORE:
            upperText = upperText.replace(char, "")
        upperWords = upperText.split()
        if ("CLEAR" in upperWords or "CLR" in upperWords) and not originalText.endswith("?"):
            return states.CLEAR
        if "STAT" in upperWords or "STATUS" in upperWords:
            return states.REQUEST
        if "?" in originalText:
            return states.REQUEST
        if originalText in blueOnlyPhrases:
            return states.CLEAR
    return None
项目:spyglass    作者:Crypta-Eve    | 项目源码 | 文件源码
def parseShips(rtext):
    """Highlight the first recognised ship name found in ``rtext``.

    The direct ``NavigableString`` children of ``rtext.contents`` are scanned
    in order against every entry of ``evegate.SHIPNAMES`` (compared against
    the upper-cased text). On the first accepted hit the text node is handed
    to ``textReplace`` together with a copy in which the ship name is wrapped
    in a coloured ``<span>``, and ``True`` is returned. When nothing is
    accepted the function falls through and returns ``None``.
    """
    spanTemplate = u"""<span style="color:#d95911;font-weight:bold"> {0}</span>"""
    allowedBefore = (" ", "X")
    allowedAfter = ("S", " ")

    candidates = [node for node in rtext.contents if isinstance(node, NavigableString)]
    for node in candidates:
        upper = node.upper()
        for shipName in evegate.SHIPNAMES:
            start = upper.find(shipName)
            if start < 0:
                continue
            end = start + len(shipName)
            # Reject names glued to a preceding character other than a
            # space or "X" ...
            if start > 0 and upper[start - 1] not in allowedBefore:
                continue
            # ... or followed by something other than "S" or a space. The
            # very last character of the text is not examined.
            if end < len(upper) - 1 and upper[end] not in allowedAfter:
                continue
            # Use the original-case spelling for the replacement.
            shipInText = node[start:end]
            highlighted = node.replace(shipInText, spanTemplate.format(shipInText))
            textReplace(node, highlighted)
            return True
项目:LiSpider    作者:jay7n    | 项目源码 | 文件源码
def _censorNaviStrCandidateWithTemplate(self, candi_str, template_str, template_var_cache):
        """Check a candidate text node against a template text node.

        Both arguments must be exactly ``element.NavigableString`` (subclasses
        are rejected). If ``self.RegPattern`` finds a placeholder in the
        template, every placeholder is replaced by ``(.+)`` and the result is
        matched, as a regular expression, against the start of the candidate;
        the first captured group is passed to ``_procTemplateVariable``
        together with the placeholder's ``group(1)`` and the cache. Without a
        placeholder the two strings must be equal.

        :param candi_str: text node taken from the candidate document
        :param template_str: text node taken from the template
        :param template_var_cache: passed through to ``_procTemplateVariable``
        :return: ``True`` when the candidate satisfies the template,
            ``False`` otherwise
        """
        if (type(candi_str) is not element.NavigableString
                or type(template_str) is not element.NavigableString):
            return False

        matchObj = self.RegPattern.search(template_str)
        if matchObj is None:
            # Plain text in the template: require an exact match.
            return candi_str == template_str

        varName = matchObj.group(1)

        # NOTE(review): the literal parts of the template are used as regex
        # source without escaping, so regex metacharacters in the template
        # text are interpreted as such -- confirm this is intended.
        subed_tmpl_str = self.RegPattern.sub('(.+)', template_str)
        reg2 = re.compile(subed_tmpl_str)
        # The value must be passed through a %s placeholder; an extra
        # positional argument without one makes logging report a formatting
        # error instead of emitting the message.
        self.logger.debug('subed tmpl reg2 = %s', reg2)

        mo2 = reg2.match(candi_str)
        if mo2 is None:
            return False

        self._procTemplateVariable(varName, mo2.group(1), template_var_cache)
        return True
项目:zpretty    作者:collective    | 项目源码 | 文件源码
def is_text(self):
        ''' Tell whether the wrapped element is plain text

        Comments, doctypes and processing instructions are
        NavigableString instances as well, so they are
        explicitly ruled out here
        '''
        if not isinstance(self.context, NavigableString):
            return False
        is_special = (
            self.is_comment() or
            self.is_doctype() or
            self.is_processing_instruction()
        )
        return not is_special
项目:cc98    作者:zjuchenyuan    | 项目源码 | 文件源码
def text(self, target=None, ignore_pureascii_words=False):
        """
        Collect the visible text fragments below a parsed HTML node.

        Doctype and comment nodes are skipped, as is any text whose parent
        is a <script> or <style> tag or whose parent's inline style contains
        "none" or "font-size:0px". Fragments are stripped and empty ones
        are dropped.

        :param target: the BeautifulSoup object to walk, default self.b
        :param ignore_pureascii_words: if True, keep only fragments that
            contain at least one character with a code point above 127
        :return: list of stripped fragments (encoded when PY2 is set)
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString,Doctype

        fragments = []
        for node in target.descendants:
            if not isinstance(node, NavigableString):
                continue
            if isinstance(node, Doctype):
                continue
            if node.parent.name in ["script", "style"]:
                continue
            if isinstance(node, Comment):
                continue
            if "none" in node.parent.get("style",""):
                continue
            if "font-size:0px" in node.parent.get("style",""):
                continue

            stripped = node.strip()
            if len(stripped) == 0:
                continue
            if ignore_pureascii_words and not any(ord(ch) > 127 for ch in stripped):
                continue
            fragments.append(stripped.encode() if PY2 else stripped)
        return fragments
项目:spyglass    作者:Crypta-Eve    | 项目源码 | 文件源码
def parseUrls(rtext):
    """Turn the first URL found in ``rtext`` into a link.

    The direct ``NavigableString`` children of ``rtext.contents`` are scanned
    in order. For the first text that contains an ``http://`` or ``https://``
    URL, the first URL found is wrapped in an ``<a>`` element, the text node
    is handed to ``textReplace`` with the rewritten text, and ``True`` is
    returned. When no URL is found the function returns ``None``.
    """
    linkTemplate = u"""<a style="color:#28a5ed;font-weight:bold" href="link/{0}">{0}</a>"""

    def findUrls(s):
        # Plain substring scanning: a URL runs from its prefix up to the
        # next space (or the end of the string).
        found = []
        for prefix in ("http://", "https://"):
            begin = s.find(prefix, 0)
            while begin >= 0:
                finish = s.find(" ", begin)
                if finish < 0:
                    finish = len(s)
                found.append(s[begin:finish])
                begin = s.find(prefix, begin + 1)
        return found

    candidates = [node for node in rtext.contents if isinstance(node, NavigableString)]
    for node in candidates:
        urls = findUrls(node)
        if not urls:
            continue
        # Only the first URL is handled per call.
        firstUrl = urls[0]
        textReplace(node, node.replace(firstUrl, linkTemplate.format(firstUrl)))
        return True
项目:OSM-HOT-ConvNet    作者:larsroemheld    | 项目源码 | 文件源码
def getSoupStringConcat(soupTag):
    '''
    Join every text node found below a Beautiful Soup tag.

    A tag's .string is only populated when the tag has a single string child;
    this helper covers tags with several string descendants by concatenating
    them in document order. Text whose parent is a <script> tag is left out.
    :param soupTag: a bs4 tag that contains one or more strings, or a
        NavigableString (whose own .string is then returned)
    :return: a string containing all collected text, concatenated.
    '''
    if isinstance(soupTag, NavigableString):
        return soupTag.string

    pieces = []
    for node in soupTag.descendants:
        # Tags also expose .string, so restrict to real text nodes.
        if node.string is None or not isinstance(node, NavigableString):
            continue
        if node.parent.name == "script":
            # keep javascript source out of the result
            continue
        pieces.append(node.string)
    return "".join(pieces)
项目:coq-rst    作者:cpitclaudel    | 项目源码 | 文件源码
def is_whitespace_string(elem):
    """Return True when elem is a NavigableString that is empty once stripped."""
    if not isinstance(elem, NavigableString):
        return False
    return elem.strip() == ""
项目:coq-rst    作者:cpitclaudel    | 项目源码 | 文件源码
def lex(source):
    """Yield (css_classes, token_string) pairs for source.

    The source is run through coqdoc and parsed; the children of the first
    element with class 'code' are walked after strip_soup has been applied
    with is_whitespace_string. Bare strings are yielded with an empty class
    list, <span> elements with a single "coqdoc-<type>" class, <br> elements
    are skipped, and any other child raises ValueError.
    """
    soup = BeautifulSoup(coqdoc(source))
    root = soup.find(class_='code')
    strip_soup(root, is_whitespace_string)
    for node in root.children:
        if isinstance(node, NavigableString):
            yield [], node
            continue
        tag_name = node.name
        if tag_name == "span":
            yield ["coqdoc-{}".format(node['type'])], node.string
            continue
        if tag_name != 'br':
            raise ValueError(node)
项目:LiSpider    作者:jay7n    | 项目源码 | 文件源码
def _parseTagRecursive(self, candi_tag, template_tag, template_var_cache):
        """Compare the children of a candidate tag with those of a template tag.

        Children are paired by position. Template children named 'lisp_pass'
        are skipped. Tag pairs are checked with
        _censorTagCandidateWithTemplate and then recursed into; text pairs
        are checked with _censorNaviStrCandidateWithTemplate; any other
        pairing counts as not valid.

        A non-valid child aborts the comparison only when template_var_cache
        is non-empty; in that case the cache is cleared and False is
        returned. Returns False as well when the candidate has fewer children
        than the template position requires, True otherwise.
        """
        for position, tmplNode in enumerate(template_tag.contents):
            if tmplNode.name == 'lisp_pass':
                # placeholder meaning "ignore whatever is here"
                continue

            if position >= len(candi_tag.contents):
                return False

            candiNode = candi_tag.contents[position]
            candiKind, tmplKind = type(candiNode), type(tmplNode)

            matched = False
            if candiKind == tmplKind == element.Tag:
                if self._censorTagCandidateWithTemplate(candiNode, tmplNode, template_var_cache):
                    matched = self._parseTagRecursive(candiNode, tmplNode, template_var_cache)
            elif candiKind == tmplKind == element.NavigableString:
                matched = self._censorNaviStrCandidateWithTemplate(
                    candiNode, tmplNode, template_var_cache)

            if matched is not False or len(template_var_cache) == 0:
                continue

            for message in (template_tag, candi_tag,
                            'censor not passed. cache will be cleared'):
                self.logger.warning(message)
            template_var_cache.clear()
            return False

        return True
项目:daily_notification    作者:zjuchenyuan    | 项目源码 | 文件源码
def text(self, target=None, ignore_pureascii_words=False):
        """
        Gather the visible text pieces found under a parsed HTML node.

        Skips doctype and comment nodes, text inside <script> or <style>
        tags, and text whose parent's inline style contains "none" or
        "font-size:0px". Each piece is stripped; empty pieces are dropped.

        :param target: the BeautifulSoup object to walk, default self.b
        :param ignore_pureascii_words: if True, only pieces containing at
            least one character with a code point above 127 are returned
        :return: list of stripped pieces (encoded when PY2 is set)
        """
        if target is None:
            target = self.b
        from bs4 import Comment
        from bs4.element import NavigableString,Doctype

        def is_hidden(node):
            # Same checks, in the same order, as the visibility rules above.
            return (not isinstance(node, NavigableString)
                    or isinstance(node, Doctype)
                    or node.parent.name in ["script", "style"]
                    or isinstance(node, Comment)
                    or "none" in node.parent.get("style","")
                    or "font-size:0px" in node.parent.get("style",""))

        pieces = []
        for node in target.descendants:
            if is_hidden(node):
                continue
            piece = node.strip()
            if len(piece) == 0:
                continue
            if ignore_pureascii_words and not any(ord(ch) > 127 for ch in piece):
                continue
            if PY2:
                piece = piece.encode()
            pieces.append(piece)
        return pieces
项目:zpretty    作者:collective    | 项目源码 | 文件源码
def text(self):
        ''' Give back the textual content of this element, if it has any

        Non-text elements yield an empty string, comments are
        returned verbatim, and anything else goes through the
        escaper so that characters become html entities
        '''
        if isinstance(self.context, NavigableString):
            if self.is_comment():
                return unicode(self.context)
            return self.escaper.substitute_html(self.context.string)
        return u''
项目:PTTChatBot_DL2017    作者:thisray    | 项目源码 | 文件源码
def parse_article(self, url):
        """Download one article page and extract its fields into a dict.

        Extraction is best-effort: if any step raises, the error and the URL
        are printed and whatever was collected so far is returned.

        :param url: address of the article page
        :return: dict that, on full success, holds the keys ``Author``,
            ``Board``, ``Title``, ``Date``, ``Content``, ``IP``,
            ``Responses`` (list of dicts with ``Content``, ``Vote``,
            ``User``), ``UpVote``, ``DownVote`` and ``NoVote``
        """
        # NOTE(review): verify=False disables TLS certificate verification.
        raw = self.session.get(url, verify=False)
        soup = BeautifulSoup(raw.text, "lxml")
        article = {}
        try:
            meta_values = soup.select(".article-meta-value")
            article["Author"] = meta_values[0].contents[0].split(" ")[0]
            article["Board"] = meta_values[1].contents[0]
            article["Title"] = meta_values[2].contents[0]
            article["Date"] = meta_values[3].contents[0]

            # Body: the first direct text child of #main-content that is not
            # a bare newline.
            content = ""
            for tag in soup.select("#main-content")[0]:
                if type(tag) is NavigableString and tag != '\n':
                    content += tag
                    break
            article["Content"] = content

            # Marker that precedes the poster's origin line on PTT pages
            # (reference mark, space, three CJK characters, colon). Written
            # with escapes so the source stays pure ASCII.
            findIPtag = u'\u203b \u767c\u4fe1\u7ad9:'

            # deal different ip type
            ip_temp = 'NA'
            try:
                ip_line = soup.find(string=re.compile(findIPtag))
                # ip_line is None when the marker is absent (TypeError);
                # a line without an address gives no match (AttributeError).
                ip_temp = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip_line).group()
            except (TypeError, AttributeError):
                try:
                    for f2_tag in soup.select('.f2'):
                        if findIPtag in f2_tag.contents[0]:
                            ip_temp = f2_tag.next_sibling.split()[-1]
                            break
                except (IndexError, TypeError, AttributeError):
                    ip_temp = 'NA'
            article["IP"] = ip_temp

            upvote = 0
            downvote = 0
            novote = 0
            response_list = []

            for response_struct in soup.select(".push"):
                if "warning-box" in response_struct['class']:
                    continue
                response_dic = {}
                # [1:] drops the first character of the comment text.
                response_dic["Content"] = response_struct.select(".push-content")[0].contents[0][1:]
                # Only the first character of the tag identifies the vote.
                response_dic["Vote"] = response_struct.select(".push-tag")[0].contents[0][0]
                response_dic["User"] = response_struct.select(".push-userid")[0].contents[0]
                response_list.append(response_dic)
                if response_dic["Vote"] == u"\u63a8":    # PTT upvote marker
                    upvote += 1
                elif response_dic["Vote"] == u"\u5653":  # PTT downvote marker
                    downvote += 1
                else:
                    novote += 1

            article["Responses"] = response_list
            article["UpVote"] = upvote
            article["DownVote"] = downvote
            article["NoVote"] = novote
        except Exception as e:
            print(e)
            print(u"error in: %s " % url)

        return article
项目:legal    作者:tompecina    | 项目源码 | 文件源码
def check_html(runner, html, key=None, app=None, check_html=True, check_classes=True):
    """Fingerprint an HTML document and compare it with the recorded hash.

    The document is flattened into a list of tag names, attribute names,
    attribute values and text nodes; the whitespace-normalised join of that
    list is hashed with MD5 and truncated to HASH_LEN characters. The calling
    test's file name and line number (plus the optional key) identify the
    check.

    :param runner: test case object providing assertIn / assertEqual
    :param html: markup to inspect
    :param key: optional suffix appended to the position identifier
    :param app: key into CLASS_ARRAY; derived from the caller's file name
        when not given
    :param check_html: when true, write or verify the hash depending on
        WRITE_CHECKFILE / CHECK_HTML
    :param check_classes: when true, assert that every CSS class is listed
        in CLASS_ARRAY[app]
    """

    caller = stack()[1]
    filepos = '{}:{:d}'.format(caller.filename.rpartition('/')[2], caller.lineno)
    app = app or filepos.partition('_')[2].partition('.')[0]
    if key:
        filepos += '-{}'.format(key)

    store = []
    soup = BeautifulSoup(html, 'html.parser')
    for desc in soup.descendants:
        if isinstance(desc, Tag):
            name = desc.name
            attrs = desc.attrs
            store.append(name)
            # Value of the element's own "name" attribute ('None' if absent).
            tag = str(attrs.get('name'))
            is_csrf_input = name == 'input' and tag == 'csrfmiddlewaretoken'
            for attr in sorted(attrs):
                if is_csrf_input and attr == 'value':
                    # the CSRF token value is left out of the fingerprint
                    continue
                store.append(attr)
                val = attrs[attr]
                if check_classes and attr == 'class':
                    for cls in val:
                        if cls:
                            runner.assertIn(cls, CLASS_ARRAY[app], msg=filepos)
                if isinstance(val, list):
                    store.extend(sorted(val))
                    continue
                if not isinstance(val, str):
                    continue
                if val.startswith(STATIC_URL) or ('date' in tag and attr == 'value'):
                    continue
                if '?' in val:
                    # Query arguments are sorted so their order is irrelevant.
                    base, _, query = val.rpartition('?')
                    store.append(base)
                    store.extend(sorted(query.split('&')))
                else:
                    store.append(val)
        elif isinstance(desc, NavigableString):
            store.append(str(desc))

    string = ' '.join(' '.join(store).split())
    hsh = md5(string.encode()).hexdigest()[:HASH_LEN]

    if not check_html:
        return
    if WRITE_CHECKFILE:
        print(filepos, hsh, file=CHECKFILE)
    elif CHECK_HTML:
        runner.assertIn(filepos, CHECK_ARRAY, msg=filepos)
        runner.assertEqual(CHECK_ARRAY[filepos][:HASH_LEN], hsh, msg=filepos)