Python bs4 module: NavigableString() code examples

The following 24 code examples, extracted from open-source Python projects, illustrate how to use bs4.NavigableString().
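Before the project samples, here is a minimal self-contained sketch (ours, not taken from any of the projects below). BeautifulSoup wraps every bare text node in a NavigableString, so iterating a tag's children yields a mix of Tag and NavigableString objects; nearly every example below distinguishes the two with isinstance():

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup('<div>hello <b>world</b>!</div>', 'html.parser')
for child in soup.div.children:
    if isinstance(child, NavigableString):
        print('text node:', repr(str(child)))  # 'hello ' and '!'
    else:
        print('tag:', child.name)              # 'b'

# NavigableString() can also be constructed directly and inserted into the tree:
soup.div.append(NavigableString(' bye'))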

Project: netwars | Author: i008
def process_post_bodies(bodies: List[Tag]) -> (str, list):
        for body in bodies:
            cites = list()
            cited = body.findAll('div', {'class': 'cite'})
            if cited:
                cites = [c['name'] for c in cited]
            collect_text = []
            for tag in body:
                # TODO: This is a suboptimal (and partially wrong) solution for parsing cites in the post body (a lot to improve here)
                if tag.name not in ('div', 'p'):
                    if hasattr(tag, 'text'):
                        collect_text.append(tag.text)
                    elif isinstance(tag, NavigableString):
                        collect_text.append(str(tag))
                    else:
                        collect_text.append('\n')
            else:
                yield ''.join(collect_text), cites
Project: txt2evernote | Author: Xunius
def checklistInENMLtoSoup(soup):
        '''
        Transforms Evernote checklist elements to github `* [ ]` task list style
        '''
        transform_tags = ['p','div']

        # soup.select cant be used with dashes: https://bugs.launchpad.net/beautifulsoup/+bug/1276211
        for todo in soup.find_all('en-todo'):
            parent = todo.parent
            transform = parent.find() == todo and parent.name in transform_tags

            checked = todo.attrs.get('checked',None) == "true"
            todo.replace_with("[x] " if checked else "[ ] ")

            # EN checklists can appear anywhere, but if one appears at the beginning
            # of a block element, transform it so it resembles github markdown syntax
            if transform:
                content = ''.join(unicode(child) for child in parent.children
                    if isinstance(child, NavigableString)
                ).strip()

                new_tag = soup.new_tag("li")
                new_tag.string = content
                parent.replace_with(new_tag)
Project: tumanov_castleoaks | Author: Roamdev
def process_tag(tag, valid_tags=()):
    if isinstance(tag, NavigableString):
        return tag

    if tag.name in valid_tags:
        for subtag in tag.contents:
            subtag.replaceWith(process_tag(subtag, valid_tags))
        return tag
    else:
        result = ""
        for subtag in tag.contents:
            result += str(process_tag(subtag, valid_tags))
        return result
Project: MSTU_scraper | Author: rrr3371
def get_students(self):
        group = self.group
        group_name = group[:group.find('(')].strip()
        group_code = group[group.find('(')+1:group.find(')')]

        students = []
        for row in self.table.children:
            if type(row) == NavigableString:
                continue

            active = True
            link = row.find(class_='fio_3').parent
            if link.has_attr('style') and link['style'] == 'color:gray;':  # gray text marks an inactive student
                active = False
            student_id = parse_qs(urlparse(link['href']).query)['sid'][0]

            name = row.find(class_='fio_3').string.strip()
            record_book_id = row.find(class_='hc3').string.strip()

            name = " ".join(name.split())
            record_book_id  = " ".join(record_book_id.split())
            students.append({'name': name, 'id': student_id, 'record_book': record_book_id, 'active': int(active)})

        return {'group': group_name, 'code': group_code, 'students': students, 'id': self.group_id}
Project: douban-movie | Author: chishui
def parse(movie):
    url = PAGE_URL % movie.id
    r = requests.get(url)
    soup = BeautifulSoup(r.text.encode('utf-8'), 'lxml')
    movie.score = soup.find('strong', 'rating_num').text
    info = soup.find('div', {'id': 'info'})
    for linebreak in info.find_all('br'):
        linebreak.extract()
    for span in info.contents:
        if isinstance(span, NavigableString): continue
        if span.contents[0]:
            if span.contents[0].string == u'导演':
                if isinstance(span.contents[1], NavigableString):
                    movie.director = span.contents[2].text
            elif span.contents[0].string == u'主演':
                if isinstance(span.contents[1], NavigableString):
                    movie.actor = span.contents[2].text
    print movie
Project: Shosetsu | Author: ccubed
def parse_character_results(soup):
    """
    Parse a page of character results.

    :param soup: The BS4 class object
    :return: Returns a list of dictionaries containing a name, gender and list of dictionaries containing a game name/id pair
             for games they appeared in.
    """
    soup = list(soup.find_all('table', class_='stripe')[0].children)[1:]
    characters = []
    for item in soup:
        temp_c = {'gender': None, 'name': None, 'games': {}}
        temp_c['gender'] = item.abbr.get('title')
        temp_c['name'] = list(item.children)[1].a.string
        temp_c['games'] = []
        for game in list(list(list(item.children)[1].children)[1].children):
            if isinstance(game, NavigableString):
                continue
            temp_c['games'].append({'name': game.string, 'id': game.get('href').split('/')[1]})
        characters.append(temp_c)
        del temp_c
    return characters
Project: zhihu-terminal | Author: duduainankai
def print_content(contents):
    for content in contents:
        name = content.name
        #if not isinstance(content, Tag):
        if isinstance(content, NavigableString):
            s = str(content)
            s = s.replace("\n","")
            print s.strip()
        else:
            if name == "img":
                '''
                img = content.find("img")
                if img:
                    print img.get("src")
                '''
                print "[??]"
            elif name == "br":
                print ""
            elif name == "noscript":
                continue
            elif name == "li":
                print "•",
            print_content(content.contents)
Project: SecTools | Author: Shad0wpf
def get_detail(self, host_soup, vul_summary):
        ''' host report -> section 2.2: vulnerability detail, return dict '''
        name_detail_lst = host_soup.find('div', id='vul_detail').table.contents
        same_vuls = []
        for i in name_detail_lst:
            if type(i) is NavigableString:
                continue
            if i.span:
                name = i.span.string
                for name_port in vul_summary:
                    if name in name_port:
                        same_vuls.append(name_port)
            elif same_vuls:
                # the same vulnerability can repeat on a different port
                lst_solu = self.get_solution(i)
                for name_port in same_vuls:
                    lst = vul_summary.get(name_port)
                    if lst and (len(lst) == 5):
                        vul_summary[name_port].extend(lst_solu)
                same_vuls = []
        return vul_summary
Project: SecTools | Author: Shad0wpf
def get_solution(self, tag):
        '''['详细描述', '解决办法', 'CVE编号'] '''
        value = []
        tr_lst = tag.table.contents
        for i in tr_lst:
            if type(i) is NavigableString:
                continue
            if i.th.string in (u'详细描述', u'解决办法'):
                val = [i.strip() for i in i.td.strings]
                val = '\n'.join(val).replace('\n*', '*')
                value.append(val)
            elif i.th.string == u'CVE编号':
                value.append(i.td.string)
        if len(value) == 2:
            value.append(None)
        return value
Project: azure-search-ta | Author: yokawasa
def __get_navigable_strings(self,soup):
        if isinstance(soup, NavigableString):
            if type(soup) not in (Comment, Declaration) and soup.strip():
                yield soup
        elif soup.name not in ('script', 'style'):
            for c in soup.contents:
                for g in self.__get_navigable_strings(c):
                    yield g
Project: loving-ai | Author: opencog
def parse_aiml_text(text):
    text = '<p>' + text + '</p>'
    soup = BeautifulSoup(text, 'lxml')
    tokens = []
    try:
        for c in soup.p.children:
            if isinstance(c, NavigableString):
                token = c.string.strip()
                if token:
                    tokens.append(token)
    except Exception as ex:
        logger.warn(ex)
        return text
    return ' '.join(tokens)
Project: weather | Author: awolfly9
def get_first_text(soup, strip = False, types = (NavigableString, CData)):
    data = None
    for s in soup._all_strings(strip, types = types):
        data = s
        break
    return data
Project: weather | Author: awolfly9
def get_texts(soup, strip = False, types = (NavigableString, CData)):
    texts = []
    for s in soup._all_strings(strip, types = types):
        texts.append(s)

    return texts
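Both weather helpers call Tag._all_strings, a private BeautifulSoup API that is not guaranteed stable across releases. A rough public-API equivalent (our sketch, not from the weather project) uses the documented .strings / .stripped_strings generators instead:

def get_first_text_public(soup, strip=False):
    # .strings / .stripped_strings yield the NavigableString descendants of a tag
    for s in (soup.stripped_strings if strip else soup.strings):
        return s
    return None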
Project: Search-Engine | Author: SoufianEly
def html_to_text(html):
    "Creates a formatted text email message from a rendered html template (page)"
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head
    body, text = soup.body, []
    if body is None:
        return ""
    else:
        for element in body.descendants:
            # We use type and not isinstance since comments, cdata, etc are subclasses that we don't want
            if type(element) == NavigableString:
                # We use the assumption that other tags can't be inside a script or style
                if element.parent.name in ('script', 'style'):
                    continue
                elif element.parent.name == 'a':
                    # replace link text with the link
                    #text += [element.parent['href']]
                    continue
                # remove any multiple and leading/trailing whitespace
                string = ' '.join(element.string.split())
                if string:
                    if element.parent.name == 'p':
                        # Add extra paragraph formatting newline
                        string = '\n' + string
                    text += [string]
        doc = '\n'.join(text) #.encode('utf-8')
        return doc
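A quick usage sketch with made-up input (the HTML below is ours): head content is ignored, link text is dropped, and each string directly under a <p> gets a leading newline:

print(html_to_text(
    '<html><head><title>skipped</title></head>'
    '<body><p>Hello <b>there</b></p><a href="#">link text</a></body></html>'
))
# -> '\nHello\nthere'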
Project: MSTU_scraper | Author: rrr3371
def parse_notes(self):
        notes = []
        # notes are div-comment blocks that are not immediately followed by a div-control block
        for tag in self.page.find_all(class_='div-comment'):
            sibling = tag.next_sibling.next_sibling if type(tag.next_sibling) is NavigableString else tag.next_sibling
            if sibling and not (sibling.has_attr('class') and 'div-control' in sibling['class']):
                note = tag.get_text()
                if note.startswith('Автор(ы):'):
                    notes.append({'name':'authors', 'value':note[10:].strip()})
                elif note.startswith('Примечание:'):
                    notes.append({'name':'comment', 'value':note[12:].strip()})
                else:
                    raise NotImplementedError('Unknown note type: {}'.format(note))
        return notes
Project: MSTU_scraper | Author: rrr3371
def parse_description(self, tag_id):
        description_span = self.page.find(id=tag_id)
        description = []
        for discipline_property in description_span.find_all(class_='div-comment'):
            property_name = discipline_property.string.strip()
            sibling = discipline_property.next_sibling.next_sibling if type(discipline_property.next_sibling) \
                       is NavigableString else discipline_property.next_sibling

            property_value = sibling.string
            property_value = property_value.strip() if property_value else ''
            if property_value:
                description.append({'name':property_name, 'value':property_value})
        return description
Project: QuestionAnswerNLP | Author: debjyoti385
def strip_tags(html, invalid_tags):
    soup = BeautifulSoup(html,"html.parser")
    coref_id_set=set()
    set2text={}
    for tag in soup.findAll(True):
        if tag.name in invalid_tags:
            s = ""

            for c in tag.contents:
                if not isinstance(c, NavigableString):
                    c = strip_tags(unicode(c), invalid_tags)
                s += unicode(c)

            tag.replaceWith(s)

    for t in soup.find_all("coref"):
        if t['set-id'] in coref_id_set :
            pronoun_regex = re.compile('|'.join(pronouns))
            # print t.get_text(),
            if len(pronouns.intersection(nltk.word_tokenize(t.get_text().lower()))) > 0:
                # print t.get_text(),
                t.replaceWith(set2text[t['set-id']])
                # print "REPLACED WITH :" , set2text[t['set-id']]
        else:
            coref_id_set.add(t['set-id'])
            set2text[t['set-id']]=t.get_text()


    # print soup
    soup = re.sub("(\\t|\\r?\\n)+", " ", str(soup))
    soup = re.sub("</s><s>","\n",soup)
    soup = re.sub('<[^>]*>', '', soup)
    return soup
Project: LFS201 | Author: s-nt-s
def get_parrafos(soup):
    prfs= soup.find_all(['li','table'])
    ps = soup.find_all('p')
    for p in ps:
        if not p.span:
            prfs.append(p)
            continue
        flag=False
        for c in p.contents:
            if ((isinstance(c, bs4.NavigableString) or isinstance(c, unicode)) and not is_vacio(c)):
                flag=True
                break
        if flag:
            prfs.append(p)
    return prfs
Project: LFS201 | Author: s-nt-s
def eqsibling(n):
    r=[]
    tag=n.name
    s=n.next_sibling
    while s:
        if (isinstance(s, bs4.NavigableString) or isinstance(s, unicode)):
            if not is_vacio(s):
                break
        elif s.name!=tag or not eqclass(s,n):
            break
        r.append(s)
        s=s.next_sibling
    return r
Project: plumeria | Author: sk89q
def search_esv(message, verse):
    """
    Search for a bible passage from the English Standard Version.

    Example::

        bible Romans 12:16

    """
    r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
        "key": "IP",
        "passage": verse,
        "output-format": "crossway-xml-1.0",
        "include-simple-entities": "true",
    })

    doc = BeautifulSoup(r.text(), features="lxml")
    if not doc.passage:
        raise CommandError("Verse not found.")
    lines = []
    for verse_unit in doc.passage.content.find_all('verse-unit'):
        num = int(verse_unit.find('verse-num').text)
        woc = verse_unit.find('woc')
        if woc:
            text = woc.text
        else:
            text = "".join([str(node) for node in verse_unit.children
                            if isinstance(node, NavigableString) and not isinstance(node, Comment)])
        lines.append("**{}** {}".format(num, text.strip()))
    return "\n".join(lines)
Project: Vasco_de_data | Author: KeynesYouDigIt
def UNHDR_scrape_description():
    # the final object will be a dictionary with indicator name as the key and description as content
    may_contain_indicators=[]
    clean_listed_indicators={}
    urls = ['http://hdr.undp.org/en/composite/HDI',
    'http://hdr.undp.org/en/composite/IHDI',
    'http://hdr.undp.org/en/composite/trends',
    'http://hdr.undp.org/en/composite/GDI',
    'http://hdr.undp.org/en/composite/GII',
    'http://hdr.undp.org/en/composite/MPI',]
    for url in urls:
        url_response_raw = rq.get(url)
        BS = BeautifulSoup(url_response_raw.text, "lxml")
        p_elements = BS.find_all('p')
        p_contents = []
        for e in p_elements:
            p_contents.append(e)
        # scan once after collecting, to avoid appending the same paragraph repeatedly
        for paragraph in p_contents:
            if not isinstance(paragraph, NavigableString):
                if 'Definitions' in paragraph.text:
                    may_contain_indicators.append(paragraph)

    for paragraf in may_contain_indicators:
        if ':' in paragraf.text:
            with_colons_added = paragraf.get_text('::')
            dub_colon_as_list = []
            for i in enumerate(with_colons_added.split('::')):
                dub_colon_as_list.append(i)
            for i,string in dub_colon_as_list:
                if ': ' in string:
                    indicator_name_full=str(unicodedata.normalize('NFKD',dub_colon_as_list[i-1][1]).encode('ascii', 'ignore')).strip('\n')
                    indicator_name_abridged=indicator_name_full[:indicator_name_full.find(':')]
                    description=str(unicodedata.normalize('NFKD',dub_colon_as_list[i][1]).encode('ascii', 'ignore')).strip('\n')
                    if i+1<len(dub_colon_as_list) and 'http' in dub_colon_as_list[i+1][1]:
                        details_link=dub_colon_as_list[i+1][1]
                    else:
                        details_link ='no further link provided for this indicator'
                    print 'adding %s %s %s' % (indicator_name_abridged, description, details_link)
                    clean_listed_indicators[indicator_name_abridged]=[description,details_link]
    return clean_listed_indicators
Project: SecTools | Author: Shad0wpf
def get_summary(self, host_soup):
        ''' host report -> section 1: host summary, return list '''
        result = []
        condition = (u'IP地址', u'主机名称')
        p = host_soup.find('tr', class_='even').parent
        for i in p.contents:
            if type(i) is NavigableString:
                continue
            elif i.th.string in condition:
                result.append(i.td.string)
        if len(result) < 2:
            result.append(None)
        return result
Project: txt2evernote | Author: Xunius
def checklistInSoupToENML(soup):
        '''
        Transforms github style checklists `* [ ]` in the BeautifulSoup tree to
        enml.
        '''

        checktodo_re = re.compile(r'\[(.)\]')

        # To be more github compatible, if all elements in a list begin with `* [ ]`,
        # transform them to native evernote `en-todo` elements
        for ul in soup.find_all('ul'):
            tasks = []; istodo = True

            for li in ul.find_all('li'):
                task = soup.new_tag('div')
                todo_tag = soup.new_tag('en-todo')

                reg = checktodo_re.match(li.get_text())
                istodo = istodo and reg
                character = reg.group(1) if reg else None

                if character == "x": todo_tag['checked']="true"

                task.append(todo_tag)
                if reg: task.append(NavigableString(li.get_text()[3:].strip()))
                tasks.append(task)

            if istodo:
                for task in tasks: ul.insert_after(task)
                ul.extract()

        # For the rest of elements just replace `[ ]` with the appropriate element
        for todo in soup.find_all(text=checktodo_re):
            str_re = re.match(r'(.*)\[(.)\](.*)',todo)
            pre = str_re.group(1)
            post = str_re.group(3)

            todo_tag = soup.new_tag('en-todo')
            if str_re.group(2) == "x": todo_tag['checked']="true"

            todo.replace_with(todo_tag)
            todo_tag.insert_before(pre)
            todo_tag.insert_after(post)
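The task.append(NavigableString(...)) call above is the standard way to attach bare text to a freshly built tag; a minimal standalone sketch (ours):

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup('', 'html.parser')
task = soup.new_tag('div')
task.append(soup.new_tag('en-todo'))
task.append(NavigableString('buy milk'))
print(task)  # <div><en-todo></en-todo>buy milk</div>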
Project: dactyl | Author: ripple
def check_all_pages(target=None):
    """Reads all pages for a target and checks them for style."""
    target = dactyl_build.get_target(target)
    pages = dactyl_build.get_pages(target)

    pp_env = dactyl_build.setup_pp_env()

    print("Style Checker - checking all pages in target %s" % target["name"])

    style_issues = []
    for page in pages:
        if "md" not in page:
            # Not a doc page, move on
            continue
        logger.info("Checking page %s" % page["name"])
        page_issues = []
        html = dactyl_build.parse_markdown(page, pages=pages, target=target)
        soup = BeautifulSoup(html, "html.parser")

        overrides = get_overrides(soup)

        content_elements = ["p","li","a","em","strong","th","td",
                            "h1","h2","h3","h4","h5","h6"]
        for el in soup.descendants:
            if (type(el) == NavigableString and
                el.parent.name in content_elements and
                str(el).strip()):
                passage = str(el).strip()
                passage_issues = check_passage(passage, overrides)
                if passage_issues:
                    page_issues += passage_issues
                #print("'%s' (%s)" % (el, el.parent.name))
        # for el in soup.find_all(content_elements):
        #     for passage in el.stripped_strings:
        #         passage_issues = check_passage(passage, overrides)
        #         if passage_issues:
        #             page_issues += passage_issues

        if page_issues:
            style_issues.append( (page["name"], page_issues) )

    return style_issues