Python bs4 模块,Tag() 实例源码

我们从Python开源项目中,提取了以下46个代码示例,用于说明如何使用bs4.Tag()

项目:BGmi    作者:RicterZ    | 项目源码 | 文件源码
def parser_day_bangumi(soup):
    """Extract bangumi entries from one day's list of ``<li>`` elements.

    :param soup: container tag holding the ``<li>`` elements
    :type soup: bs4.Tag
    :return: list of ``{'name', 'keyword', 'cover'}`` dicts
    :rtype: list[dict]
    """
    result = []
    # Iterate with a fresh name instead of shadowing the `soup` parameter,
    # which made the original confusing and error-prone.
    for item in soup.find_all('li'):
        url = item.select_one('a')
        span = item.find('span')
        if url:
            name = url['title']
            url = url['href']
            assert isinstance(url, str)
            # The bangumi id is the last path segment of the detail URL.
            bangumi_id = url.split('/')[-1]
            # (removed dead call `soup.find('li', )` whose result was discarded)
            result.append({'name': name, 'keyword': bangumi_id, 'cover': span['data-src']})
    return result
项目:netwars    作者:i008    | 项目源码 | 文件源码
def process_post_bodies(bodies: List[Tag]) -> (str, list):
        """Yield ``(text, cites)`` for each post body in *bodies*."""
        for body in bodies:
            # Collect the names of any cited posts inside this body.
            cited_divs = body.findAll('div', {'class': 'cite'})
            cites = [c['name'] for c in cited_divs] if cited_divs else []

            pieces = []
            # TODO: This is a suboptimal (and partially wrong) solution to parse
            # cites in the post body (a lot to improve here).
            for child in body:
                if child.name in ('div', 'p'):
                    continue
                if hasattr(child, 'text'):
                    pieces.append(child.text)
                elif isinstance(child, NavigableString):
                    pieces.append(str(child))
                else:
                    pieces.append('\n')
            yield ''.join(pieces), cites
项目:PyBloqs    作者:manahl    | 项目源码 | 文件源码
def append_to(parent, tag, **kwargs):
    """
    Create a new element and append it to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Tag name to create.
    :param kwargs: Tag attributes.
    :return: New element.
    """
    # Resolve the owning soup: either stored directly on the parent,
    # or found by walking up to the enclosing <html> tree.
    soup = parent.soup if hasattr(parent, "soup") else parent.find_parent("html")

    # Create Tag explicitly instead of using new_tag, otherwise attribute
    # "name" leads to a clash with the tag-name argument in bs4.
    element = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    element.soup = soup

    parent.append(element)
    return element
项目:nyx    作者:Cappycot    | 项目源码 | 文件源码
def read_component(thing):
    """Recursively render a bs4 node as Markdown-flavoured text."""
    if not isinstance(thing, Tag):
        # NavigableStrings (and anything else) pass through unchanged.
        return thing
    # Simple formatting tags map directly onto Markdown wrappers.
    wrappers = {"em": "*", "strong": "**", "u": "__"}
    mark = wrappers.get(thing.name)
    if mark is not None:
        return mark + read_component(thing.next_element) + mark
    if thing.attrs.get("style") == "text-decoration: line-through;":
        return "~~" + read_component(thing.next_element) + "~~"
    if thing.attrs.get("id") is not None and "footnoteref" in thing.attrs["id"]:
        # Footnote references are dropped entirely.
        return ""
    return read_component(thing.next_element)
项目:Weather    作者:dev4love    | 项目源码 | 文件源码
def show_weather(cityinfo):
    """Fetch and print the seven-day forecast for *cityinfo*."""
    print(u'?????? #%s,%s# ???...' % (cityinfo.get(u'parent_name_ch'), cityinfo.get(u'city_name_ch')))
    weather_content = api.getWeather(cityinfo.get(u'id'))
    soup = BeautifulSoup(weather_content, u'html.parser')
    table_tag = soup.find_all(u'table', class_=u'sevendays')[0]
    for row in table_tag.children:
        # `children` includes whitespace strings between rows; skip those.
        if not isinstance(row, Tag):
            continue
        for col in (u'date', u'temp', u'desc'):
            cell_text = row.find(u'td', class_=col).get_text()
            # Collapse all internal whitespace before printing.
            print(''.join(cell_text.split()))
        print(u'=================')
项目:Weather    作者:dev4love    | 项目源码 | 文件源码
def _showWeather(self, city):
        """Fetch the seven-day forecast for *city* and render it in the text widget."""
        self.info.insert(tk.INSERT, u'?????? #%s, %s# ???...\n\n\n' % (
            city.get(u'city_name_ch'), city.get(u'parent_name_ch')))

        soup = BeautifulSoup(self.api.getWeather(city.get(u'id')), u'html.parser')

        forecast_table = soup.find_all(u'table', class_=u'sevendays')[0]
        for row in forecast_table.children:
            # Skip whitespace/NavigableString children between table rows.
            if not isinstance(row, Tag):
                continue
            for col in (u'date', u'temp', u'desc'):
                text = row.find(u'td', class_=col).get_text()
                # Collapse internal whitespace before inserting.
                self.info.insert(tk.INSERT, ''.join(text.split()) + '\n')
            self.info.insert(tk.INSERT, u'=================' + '\n')
项目:PyZimuDog    作者:linheimx    | 项目源码 | 文件源码
def get_movie_list(kw_movie, pageIndex=0):
    """Search movies by keyword and return a Resp wrapping a PageMovie."""
    url = api_movies.format(movie=kw_movie, page_index=pageIndex)
    dom = BeautifulSoup(fetch_text(url), 'html.parser')
    try:
        # 1. parse each movie entry
        movies = [process_movie_item(div)
                  for div in dom.find_all('div', 'item prel clearfix')]

        # 2. pagination: current index and whether a next page exists
        div_page = dom.find('div', 'pagination l clearfix')
        index, haveNext = process_page_next(div_page)
        return Resp(PageMovie(movies, index, haveNext))
    except Exception as e:
        # Any parsing failure is reported back through the Resp error field.
        return Resp(errorMsg=e.__repr__())
项目:PyZimuDog    作者:linheimx    | 项目源码 | 文件源码
def process_movie_item(div_item: Tag) -> Movie:
    """Build a Movie from one search-result ``<div>``."""
    movie = Movie()

    # Left block: poster link carrying the detail URL and cover image.
    poster_div = div_item.find('div', 'litpic hidden-xs hidden-sm')
    link = poster_div.findChild()
    movie.detail_url = base_url + link['href']
    movie.avatar_url = link.findChild()['data-original']

    # Title block: the movie name sits in the bold tag under "p a".
    title_div = div_item.find('div', 'title')
    movie.name = title_div.select("p a b")[0].text
    return movie
项目:PyZimuDog    作者:linheimx    | 项目源码 | 文件源码
def get_MovieList(keyword: str) -> List[Movie]:
    """Search the site for *keyword* and return the matching movies.

    :param keyword: search term
    :return: list of parsed Movie objects (possibly empty)
    """
    r = requests.get(base_url + '/search?ad=1&q={0}'.format(keyword))

    dom = BeautifulSoup(r.text, 'html.parser')

    list_movie = []

    div_blocks = dom.find_all('div', class_='item prel clearfix')
    try:
        for div_block in div_blocks:  # type:Tag
            movie = get_Movie(div_block)
            if movie:
                list_movie.append(movie)
    # Narrowed from BaseException: swallowing KeyboardInterrupt/SystemExit
    # would make the process impossible to stop cleanly.
    except Exception:
        pass
    return list_movie
项目:PyZimuDog    作者:linheimx    | 项目源码 | 文件源码
def get_Movie(item: Tag) -> Movie:
    """Parse one search-result block into a Movie.

    :param item: the result ``<div>`` tag
    :return: a Movie; its fields stay unset if parsing fails
    """
    # Construct outside the try so `movie` is always bound for the return.
    movie = Movie()
    try:
        a = item.select_one('div.title p a')  # type:Tag
        movie.detail_url = a['href']
        movie.name = a.findChild().text
    # Narrowed from BaseException: never swallow KeyboardInterrupt/SystemExit.
    except Exception:
        pass

    return movie
项目:PyZimuDog    作者:linheimx    | 项目源码 | 文件源码
def get_ZimusByMovie(url: str) -> List[Zimu]:
    """Fetch the subtitle list for a movie detail page.

    :param url: path of the movie page, relative to ``base_url``
    :return: list of Zimu entries parsed from the table rows
    """
    r = requests.get(base_url + "/" + url)
    dom = BeautifulSoup(r.text, 'html.parser')

    list_zimu = []

    father = dom.select_one('body tbody')  # type: Tag
    trs = father.select('tr')  # type:List[Tag]
    for tr in trs:
        try:
            a = tr.select_one('td a')
            zimu = Zimu()
            zimu.detail_url = a['href']
            zimu.name = a['title']
            list_zimu.append(zimu)
        # Narrowed from BaseException so Ctrl-C still interrupts the loop.
        except Exception:
            continue
    return list_zimu
项目:django-allauth-providers-ko    作者:askdjango    | 项目源码 | 文件源码
def naver_complete_login(request, app, token):
    """Complete a Naver OAuth login by fetching and parsing the user profile XML."""
    provider = providers.registry.by_id(NaverProvider.id)
    headers = {'authorization': 'Bearer {}'.format(token.token)}
    resp = requests.get(API_URL + '/nid/getUserProfile.xml', headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'xml')
    # Flatten the <result> and <response> sections into plain dicts,
    # keeping only real element children (not whitespace strings).
    parsed = {
        section: {tag.name: tag.text
                  for tag in soup.find(section)
                  if isinstance(tag, Tag)}
        for section in ('result', 'response')
    }

    extra_data = parsed['response']
    return provider.sociallogin_from_response(request, extra_data)
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_translation_table(self, table):
        """ Overrides GeneralParser's method.
        :param table: a Tag object. Not necessarily a table; can be a div.
        :return: yields (translation, language_name, language_code)
        """
        # go through all "li" elements in the table
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue
            parts = li.get_text().split(':')
            if len(parts) < 2:
                continue

            # language name precedes the ":"
            lang_name = parts[0]

            # language code is usually in superscript
            sup = li.find(class_="trad-sup-code")
            lang_code = sup.text.strip()[1:-1] if sup else ""

            # There are two functions that remove parentheses. Not sure which one to use.
            cleaned = remove_parenthesis(parts[1])
            # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
            # lang_code and transliteration may not exist
            for trans in re.split(COMMA_OR_SEMICOLON, cleaned):
                yield (trans.split('(')[0].strip(), lang_name.strip(), lang_code)
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_translation_table(self, table):
        """
        Parse the table to get translations and the languages.
        Hopefully this function will work for most editions. Override this method if needed.
        :param table: a Tag object. Not necessarily a table; can be a div.
        :return: yields (translation, language_name, language_code)
        """
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue
            pieces = li.get_text().split(':')

            # TBD: the table is not a translation table
            #  OR the table is a translation table but there are some <li> without colon
            if len(pieces) < 2:
                continue

            # language name is before ":"
            lang_name = pieces[0].strip()

            # language code is in superscript, when present
            sup = li.find("sup")
            lang_code = remove_all_punctuation(sup.text).strip() if sup else ""

            body = remove_parenthesis(pieces[1])
            # each "trans" is: translation <sup>(lang_code)</sup> (transliteration)
            # lang_code and transliteration may not exist
            for trans in re.split(COMMA_OR_SEMICOLON, body):
                translation = re.split(r'[(??]', trans)[0].strip()
                # Throw out tuples that still contain wiki link markup.
                if "[[" in translation:
                    continue

                yield (translation, lang_name, lang_code)
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_unordered_list_polish(self, ulist):
        """Yield (translation, language_name, '') tuples from a Polish-style <ul>."""
        for li in ulist.find_all('li'):
            if not isinstance(li, Tag):
                continue
            raw = li.get_text()
            if raw == '':
                continue
            segments = raw.split(':')
            # language name precedes the colon; no code is available here
            lang_name = segments[0]
            if len(segments) > 1:
                for trans in re.split(COMMA_OR_SEMICOLON, segments[1]):
                    yield (remove_parenthesis(trans).strip(), lang_name, '')
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_translation_table_russian(self, table):
        """Parse a Russian-edition translation list.

        :param table: a Tag containing <li> entries roughly of the form
                      "LangName<sub>code</sub>: trans1, trans2"
        :return: yields (translation, language_name, language_code)
        """
        for li in table.find_all('li'):
            if not isinstance(li, Tag):
                continue
            text = li.get_text().split(':')

            # language name is before ":"
            lang_name = text[0]

            lang_code = ''
            if li.find("sub"):
                lang_code = li.find("sub").get_text()

            # Remove the trailing lang code from the lang name — but only when
            # a code was actually found. With lang_code == '' the old
            # unconditional slice lang_name[:-len('')] == lang_name[:0]
            # silently erased the whole language name.
            if lang_code:
                lang_name = lang_name[:-len(lang_code)]

            if len(text) > 1:
                t = remove_parenthesis(text[1])
            else:
                t = remove_parenthesis(text[0])

            trans_list = re.split(COMMA_OR_SEMICOLON, t)

            for trans in trans_list:
                translation = trans.split('(')[0].strip()
                if not translation == '':
                    yield (translation, lang_name, lang_code)
项目:tvlinker    作者:ozmartian    | 项目源码 | 文件源码
def bs_tag_to_string(bstag: Tag) -> str:
        """Serialize a tag's direct contents back into one HTML string."""
        pieces = [str(node) for node in bstag.contents]
        return ''.join(pieces)
项目:BGmi    作者:RicterZ    | 项目源码 | 文件源码
def search_by_keyword(self, keyword, count=None):
    """Search the site and return episode entries for *keyword*.

    Each result dict has at least the keys: download, name, title,
    episode, time.

    :param keyword: search key word
    :type keyword: str
    :param count: how many pages to fetch from the website (currently unused)
    :type count: int
    :return: list of episode search results
    :rtype: list[dict]
    """
    # NOTE: the scraped original had mismatched indentation (docstring block
    # at one level, body at another), which is a SyntaxError; re-indented.
    result = []
    r = network.get(server_root + "Home/Search", params={'searchstr': keyword}).text
    s = BeautifulSoup(r, 'lxml')
    td_list = s.find_all('tr', attrs={'class': 'js-search-results-row'})  # type:list[bs4.Tag]
    for tr in td_list:
        title = tr.find('a', class_='magnet-link-wrap').text
        time_string = tr.find_all('td')[2].string
        result.append({
            'download': tr.find('a', class_='magnet-link').attrs.get('data-clipboard-text', ''),
            'name': keyword,
            'title': title,
            'episode': self.parse_episode(title),
            # parse "YYYY/MM/DD HH:MM" into a unix timestamp
            'time': int(time.mktime(time.strptime(time_string, "%Y/%m/%d %H:%M")))
        })
    return result


项目:ao3    作者:alexwlchan    | 项目源码 | 文件源码
def author(self):
        """The author of this work."""
        # The author is kept in the byline:
        #
        #     <h3 class="byline heading">
        #       <a href="/users/[author_name]" rel="author">[author_name]</a>
        #     </h3>
        #
        byline_tag = self._soup.find('h3', attrs={'class': 'byline'})
        child_tags = [node for node in byline_tag.contents if isinstance(node, Tag)]
        assert len(child_tags) == 1
        return child_tags[0].contents[0].strip()
项目:secret    作者:jianlong108    | 项目源码 | 文件源码
def isTagClass(obj):
    # True when obj is a bs4 Tag (as opposed to e.g. a NavigableString).
    return isinstance(obj, Tag)
项目:secret    作者:jianlong108    | 项目源码 | 文件源码
def getelementlistwithlabel(tagObj, label, options={}):

    if isinstance(tagObj, Tag):
        elementlist = []
        templist = tagObj.find_all(label, attrs=options)
        elementlist.extend(templist)
        return elementlist
    else:
        print '??????,??Tag?? ????:' + tagObj
        return None
项目:secret    作者:jianlong108    | 项目源码 | 文件源码
def gettextlistwithlabel(tagObj):

    if isinstance(tagObj, Tag):

        strlist = tagObj.get_text()

        return strlist.encode('utf-8')
    else:
        print '??????,??Tag?? ????:' + tagObj
        return None
项目:online-judge-tools    作者:kmyk    | 项目源码 | 文件源码
def _parse_sample_tag(self, tag):
        """Extract (sample_text, sample_name) from a <pre> sample block.

        Expected markup:
            <h5>section</h5> ... <div class="paragraph"><h6>name</h6><pre>data</pre></div>
        Returns None implicitly when the surrounding structure does not match.
        """
        assert isinstance(tag, bs4.Tag)
        assert tag.name == 'pre'
        # nearest preceding Tag sibling of the <pre>, and of its parent <div>
        prv = utils.previous_sibling_tag(tag)
        pprv = tag.parent and utils.previous_sibling_tag(tag.parent)
        # NOTE(review): prv/pprv may be None, which would raise AttributeError
        # on .name below — confirm callers guarantee the expected structure.
        if prv.name == 'h6' and tag.parent.name == 'div' and tag.parent['class'] == ['paragraph'] and pprv.name == 'h5':
            log.debug('h6: %s', str(prv))
            log.debug('name.encode(): %s', prv.string.encode())
            s = tag.string or ''  # tag.string for the tag "<pre></pre>" returns None
            return utils.textfile(s.lstrip()), pprv.string + ' ' + prv.string
项目:online-judge-tools    作者:kmyk    | 项目源码 | 文件源码
def previous_sibling_tag(tag):
    """Walk backwards over non-Tag siblings; return the first Tag found.

    Note: the loop guard is truthiness, not `is None`, matching the original —
    a falsy NavigableString also stops the walk.
    """
    node = tag.previous_sibling
    while node and not isinstance(node, bs4.Tag):
        node = node.previous_sibling
    return node
项目:online-judge-tools    作者:kmyk    | 项目源码 | 文件源码
def next_sibling_tag(tag):
    """Walk forwards over non-Tag siblings; return the first Tag found.

    Note: the loop guard is truthiness, not `is None`, matching the original —
    a falsy NavigableString also stops the walk.
    """
    sib = tag.next_sibling
    while sib and not isinstance(sib, bs4.Tag):
        sib = sib.next_sibling
    return sib
项目:online-judge-tools    作者:kmyk    | 项目源码 | 文件源码
def __init__(self, form, url):
        """Wrap an HTML <form>, pre-filling the payload from its named inputs."""
        assert isinstance(form, bs4.Tag)
        assert form.name == 'form'
        self.form = form
        self.url = url
        self.payload = {}
        self.files = {}
        # avoid shadowing the builtin `input` with the loop variable
        for field in self.form.find_all('input'):
            log.debug('input: %s', str(field))
            # checkboxes/radios are only submitted when selected; skip them here
            if field.attrs.get('type') in ('checkbox', 'radio'):
                continue
            if 'name' in field.attrs and 'value' in field.attrs:
                self.payload[field['name']] = field['value']
项目:online-judge-tools    作者:kmyk    | 项目源码 | 文件源码
def _parse_sample_tag(self, tag):
        """Extract (sample_text, name) from an <h2> sample header and its <pre>.

        Returns None implicitly when the header is not a sample header.
        """
        assert isinstance(tag, bs4.Tag)
        assert tag.name == 'h2'
        name = tag.contents[0]
        # strip anything after a colon, e.g. "Sample input: notes" -> "Sample input"
        if ':' in name:
            name = name[:  name.find(':') ]
        if name in [ 'Sample input', 'Sample output' ]:
            # skip whitespace-only siblings between the header and the <pre>
            nxt = tag.next_sibling
            while nxt and nxt.string.strip() == '':
                nxt = nxt.next_sibling
            if nxt.name == 'pre':
                s = utils.textfile(utils.dos2unix(nxt.string.lstrip()))
            else:
                s = ''
            return s, name
项目:biweeklybudget    作者:jantman    | 项目源码 | 文件源码
def soupify(self, body):
        """Return *body* parsed as BeautifulSoup; Tags are passed through unchanged."""
        # https://www.crummy.com/software/BeautifulSoup/
        # docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
        if isinstance(body, Tag):
            return body
        return BeautifulSoup(body, "html.parser")
项目:PyTaskHelper    作者:AvSinStudio    | 项目源码 | 文件源码
def parse_tasks(tasks, year):
    """Parse task rows into a list of result records.

    :param tasks: iterable of row nodes (Tags mixed with strings)
    :param year: year label stored with each record
    :return: list of dicts with keys category/name/max/students/year
    """
    db = []
    for task in tasks:
        # skip whitespace / NavigableString rows
        if not isinstance(task, bs4.Tag):
            continue
        task = task.td
        base_name = task.strong.text.strip()
        # the tag right after the <strong> title decides the layout variant
        next_tag = task.strong.next_sibling.next_sibling.name
        if next_tag == 'span':
            # simple task: <span> holds the max score, <table> the results
            name = base_name
            maximum = task.span.text.strip()
            results = parse_results(task.table, year)
            db.append({'category': 'common', 'name': name,
                       'max': int(maximum), 'students': results,
                       'year': year})
        else:
            # grouped task: each <font> directly under a <div> is a sub-task
            for st in task.findAll('font'):
                if st.previous.name != 'div':
                    continue
                name = st.text.strip()
                category = base_name
                maximum = st.findNext('span').text.strip()
                results = parse_results(st.findNext('table'), year)
                db.append({'category': category, 'name': name,
                           'max': int(maximum), 'students': results,
                           'year': year})
    return db
项目:table2xml    作者:phiedulxp    | 项目源码 | 文件源码
def is_leaf_table(table_soup):
    """A 'leaf' table is a non-Tag node, or a table containing no nested <table>."""
    if not isinstance(table_soup, Tag):
        return True
    return len(table_soup.find_all('table')) == 0
项目:table2xml    作者:phiedulxp    | 项目源码 | 文件源码
def is_leaf_td(td_soup):
    """Return True when *td_soup* is not a Tag, or has no nested <table>."""
    if not isinstance(td_soup, Tag):
        return True
    # `is None` instead of `== None`: identity test is the correct idiom
    # (PEP 8) and avoids invoking bs4's element equality machinery.
    return td_soup.table is None
项目:table2xml    作者:phiedulxp    | 项目源码 | 文件源码
def is_single_head(tr_soup):
    """True unless the row's first <th> is immediately followed by another <th>."""
    if not isinstance(tr_soup, Tag):
        return True
    # NOTE(review): assumes the row has a <th> whose next sibling exposes
    # .name — confirm against the input tables.
    return tr_soup.th.nextSibling.name != 'th'
项目:table2xml    作者:phiedulxp    | 项目源码 | 文件源码
def extract_txt(tag):
    """Collect text (or nested-table data) from each Tag child of *tag*."""
    out = []
    for el in tag:
        if not isinstance(el, Tag):
            continue
        # leaf cells yield their stripped text; nested tables recurse
        out.append(el.get_text().strip() if is_leaf_td(el) else get_deep_table(el.table))
    return out
项目:nyx    作者:Cappycot    | 项目源码 | 文件源码
def fetch_level(element, limit=1024):
    """Render *element* and its following siblings as Markdown-ish text.

    Stops once the accumulated length approaches *limit*; returns
    "[WITHHELD]" when nothing was collected and "[DATA ERROR]" for None.
    """
    length = 0
    parts = []
    if element is None:
        return "[DATA ERROR]"
    for thing in [element] + list(element.next_siblings):
        # component = read_component(thing)
        if isinstance(thing, Tag):
            # map formatting tags onto Markdown wrappers, recursing into children
            if thing.name == "em":
                component = "*" + fetch_level(thing.next_element) + "*"
            elif thing.name == "strong":
                component = "**" + fetch_level(thing.next_element) + "**"
            elif thing.name == "u":
                component = "__" + fetch_level(thing.next_element) + "__"
            elif thing.attrs.get("style") == "text-decoration: line-through;":
                component = "~~" + fetch_level(thing.next_element) + "~~"
            elif thing.attrs.get("id") is not None and "footnoteref" in \
                    thing.attrs["id"]:
                # a footnote reference aborts this whole level
                return ""
            else:
                component = fetch_level(thing.next_element)
        else:
            component = thing
        if component:
            length += len(component)
            # stop near the limit, but let a sentence-ending component finish
            if length > limit - 3:
                if not component.endswith(".") or length > limit:
                    break
            else:
                parts.append(component)
    if len(parts) == 0:
        return "[WITHHELD]"
    return "".join(parts).strip("-:, ")
项目:LFS201    作者:s-nt-s    | 项目源码 | 文件源码
def first_starts_in(div):
    """True when div's first child text does not start with ini1 (h3 never matches)."""
    if not div.contents:
        return False
    first = div.contents[0]
    if isinstance(first, bs4.Tag):
        if first.name == "h3":
            return False
        txt = first.get_text()
    else:
        txt = first.string
    return not starts_in(txt, ini1)
项目:LFS201    作者:s-nt-s    | 项目源码 | 文件源码
def sclean(txt):
    # Normalize to text: Tags are flattened to their inner text first.
    if isinstance(txt, bs4.Tag):
        txt=txt.get_text()
    # Keep only printable characters. NOTE(review): this relies on Python 2's
    # filter() returning a str for str input; under Python 3 it would return
    # an iterator and the sub() below would fail — confirm runtime version.
    txt=filter(lambda x: x in printable, txt)
    # `sp` is presumably a module-level compiled regex; strip its matches.
    txt=sp.sub("",txt).strip()
    return txt
项目:LFS201    作者:s-nt-s    | 项目源码 | 文件源码
def get_spbr(soup):
    """Collect <p> tags starting with <span><br> not followed by a matching <span>."""
    matches = []
    for p in soup.findAll("p"):
        if len(p.contents) <= 2:
            continue
        c1, c2, c3 = p.contents[0], p.contents[1], p.contents[2]
        if not (isinstance(c1, bs4.Tag) and isinstance(c2, bs4.Tag)):
            continue
        if c1.name != "span" or c2.name != "br":
            continue
        # keep the paragraph unless a second span with the same class follows
        if not isinstance(c3, bs4.Tag) or c3.name != "span" or c1.attrs["class"] != c3.attrs["class"]:
            matches.append(p)
    return matches
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def generate_translation_tuples(self, soup):
        """
        A generator of translation tuples
        :param soup: BeautifulSoup object
        :return: tuple of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code, part_of_speech, pronounce)
        """

        # START non-edition-specific
        # this is the table of content which is present in each edition
        toc = soup.find('div', id='mw-content-text')

        page_state = {'headword': None,
                  'headword_lang': None,
                  'part_of_speech': ''}


        pronounce = ''

        headword_element = soup.find('h1', id='titleHeading')

        if headword_element is not None:

            page_state['headword'] = headword_element.text

        for element in toc.children:
            if isinstance(element, Tag):  # it could be a Tag or a NavigableString
                level = self.get_heading_level(element.name)
                # END non-edition-specific
                # Find the headword language

                if level == 2 and 'id' in element.attrs and element['id'] == 'mwAQ':

                    page_state['headword_lang'] = element.text.replace('dili','').strip()
                    pronounce = ''

                elif level == 3 and 'id' in element.attrs and element['id'] == 'mwAw':

                    page_state['part_of_speech'] = element.text

                elif element.name == 'ul':

                    # the pronunciation lives in a list item of the form "T?l?ffüz: ..."
                    for li in element.find_all('li'):
                        if not isinstance(li, Tag):
                            continue
                        if li.get_text().split(':')[0] == 'T?l?ffüz':
                            pronounce = li.get_text().split(':')[1].strip()

                elif element.name == 'p':

                    # NOTE(review): element.text is plain text with markup stripped,
                    # so this substring test looks like it can never match —
                    # confirm whether str(element) was intended here.
                    if  '<div class="NavHead" #FFFFE0">' in element.text:

                        for translation, lang, lang_code in self.parse_translation_table(\
                            element.find_next_sibling('div', class_='NavFrame')):

                            if translation == '':
                                continue
                            translation = translation.strip()
                            lang = lang.strip()
                            yield (
                                self.edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                                lang_code, page_state['part_of_speech'], pronounce)
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def generate_translation_tuples(self, soup):
        """ A generator of translation tuples
        :param soup: BeautifulSoup object
        :return: tuple of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code, part_of_speech, pronunciation)
        """

        # this is the table of contents 
        # it is present in every page in the French edition
        toc = soup.find('div', id='mw-content-text')

        # set default values for tuple elements
        page_state = {'headword': '',
                      'headword_lang': '',
                      'part_of_speech': '',
                      'pronunciation': ''}
        edition = "nl"

        if not toc:
            return  # skip it if there's no table of contents

        for element in toc.children:
            if isinstance(element, Tag):  # it could be a Tag or a NavigableString
                level = self.get_heading_level(element.name)

                if level == 2:  # it is a header tag; headword language almost always appears here
                    page_state['headword_lang'] = self.get_heading_text(element)
                elif level == 4:  # it is an h4; part of speech almost always appears here
                    page_state['part_of_speech'] = self.get_heading_text(element)
                elif element.name == "p":  # is a paragraph tag
                    bold_word = element.b
                    if bold_word:
                        page_state['headword'] = bold_word.get_text()   # the headword is usually just bolded
                elif element.name == "h5":
                    first_headline = element.find(class_="mw-headline")
                    if first_headline and first_headline.text.strip() == "Vertalingen":  # this is a translation header
                        # this is a translation table
                        while True:     # loop through all consecutive tables; they all have translations
                            # advances each iteration via `element = table`, so the
                            # sibling walk terminates at the first non-NavFrame sibling
                            table = element.find_next_sibling()
                            if table.has_attr("class") and "NavFrame" in table.get("class"):
                                for translation, lang, lang_code in self.parse_translation_table(table):
                                    yield (
                                        edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                                        lang_code, page_state['part_of_speech'], page_state['pronunciation'])
                                element = table     # move to next table
                            else:
                                break
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def generate_translation_tuples(self, soup):
        """
        A generator of translation tuples
        :param soup: BeautifulSoup object
        :return: tuple of the form (edition, headword, head_lang, translation, trans_lang, trans_lang_code, part_of_speech, pronounce)
        """

        # START non-edition-specific
        # this is the table of content which is present in each edition
        toc = soup.find('div', id='mw-content-text')

        page_state = {'headword': None,
                      'headword_lang': None,
                      'part_of_speech': ''}
        pronounce = ''
        head = soup.find('h1', id='titleHeading')
        if head is not None:
            page_state['headword'] = head.text

        for element in toc.children:
            if isinstance(element, Tag):  # it could be a Tag or a NavigableString
                level = self.get_heading_level(element.name)
                # END non-edition-specific
                # Find the headword language

                if level == 1:

                    if element.big is not None:

                        # NOTE(review): the guard tests element.big but the text is
                        # read from element.b — confirm this mismatch is intended;
                        # it can raise AttributeError when <big> exists without <b>.
                        page_state['headword_lang'] = remove_parenthesis(element.b.text).strip()

                        # Find Part of Speech: Not sure if this works. The only way i've been able to see a correlation between
                        # All pages for part of speech is by it being a h2 and the POS in a font tag. Since my sample test is so small
                        # I don't know if it's working properly

                if level == 2:
                    if element.text is not None:
                        page_state['part_of_speech'] = element.text.strip()


                # Find the translation table
                elif element.name == 'ul':

                    for translation, lang, lang_code in self.parse_translation_table(element):
                        yield (
                            self.edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                            lang_code, page_state['part_of_speech'], pronounce)
                    # (removed dead local `translation_table = False` — never read)
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_page(self, soup):
        """ Yield one page_state dict per language section on the page.

        Each yielded dict has keys: headword, headword_lang, part_of_speech
        (a list whose last entry is the current one), pronunciation, and
        translations (mapping part of speech -> list of translation tuples).
        """
        page_content = soup.find('div', id='mw-content-text')
        page_heading = None
        # walk backwards from the body to the first preceding Tag: the page title
        element = soup.find('div', class_='mw-body-content') or page_content
        while not page_heading:
            if element is None:
                return None
            element = element.previous_sibling
            if isinstance(element, Tag):
                page_heading = element.text

        page_state = {'headword': None,
                      'headword_lang': '',
                      'part_of_speech': [''],
                      'pronunciation': '',
                      'translations': defaultdict(list)}
        for element in page_content.children:
            if isinstance(element, Tag):
                pronunciation = element.find(class_="IPA")
                if pronunciation:
                    page_state['pronunciation'] = pronunciation.text.strip()

                level = self.get_heading_level(element.name)
                if level == 2:
                    # a new language section starts: flush the previous one
                    if page_state['headword']:
                        yield page_state
                    page_state['headword'] = page_heading  # default value
                    page_state['headword_lang'] = self.get_heading_text(element)
                    page_state['part_of_speech'] = ['']
                    page_state['pronunciation'] = ''
                    page_state['translations'] = defaultdict(list)
                elif level == 3:
                    page_state['part_of_speech'].append(self.get_heading_text(element))
                elif element.name == 'table' and 'class' in element.attrs and 'translations' in element['class']:
                    translation_tup_list = list(self.parse_translation_table(element))
                    if not translation_tup_list:
                        continue
                    # attach translations to the most recent part of speech
                    pos = page_state['part_of_speech'][-1]
                    page_state['translations'][pos] += translation_tup_list

        if page_state['headword']:
            yield page_state
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_page(self, soup):
        """Parse one Wiktionary page (Vietnamese edition) and yield one state
        dict per language section found on the page.

        :param soup: BeautifulSoup object of the whole page
        :return: generator of dicts with keys 'headword', 'headword_lang',
                 'part_of_speech', 'pronunciation' and 'translations'
        """
        page_content = soup.find('div', id='mw-content-text')
        page_heading = None
        # The page title lives in a preceding sibling of the body content div.
        element = soup.find('div', class_='mw-body-content') or page_content
        while not page_heading:
            if element is None:
                return None
            element = element.previous_sibling
            if isinstance(element, Tag):
                page_heading = element.text

        page_state = {'headword': None,
                      'headword_lang': '',
                      'part_of_speech': [''],
                      'pronunciation': '',
                      'translations': defaultdict(list)}
        for element in page_content.children:
            if isinstance(element, Tag):  # children may be Tag or NavigableString
                pronunciation = element.find(class_='IPA')
                if pronunciation:
                    page_state['pronunciation'] = pronunciation.text.strip()

                level = self.get_heading_level(element.name)
                if level == 2:  # a new language section starts: flush the last one
                    if page_state['headword']:
                        yield page_state
                    page_state['headword'] = page_heading  # default value
                    page_state['headword_lang'] = self.get_heading_text(element)
                    page_state['part_of_speech'] = ['']
                    page_state['pronunciation'] = ''
                    page_state['translations'] = defaultdict(list)
                elif level == 3:
                    page_state['part_of_speech'].append(self.get_heading_text(element))
                elif element.name == "h4":
                    first_headline = element.find()
                    # "Dịch" is Vietnamese for "translation".  The original
                    # literal was mojibake ("D?ch") and could never match.
                    if first_headline and first_headline.text.strip() == u"D\u1ecbch":
                        table = element.find_next_sibling(class_="columns")
                        if table is None:  # no translation table follows the header
                            continue
                        translation_tup_list = list(self.parse_translation_table(table))
                        if not translation_tup_list:
                            continue
                        pos = page_state['part_of_speech'][-1]
                        page_state['translations'][pos] += translation_tup_list

        if page_state['headword']:
            yield page_state
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def generate_translation_tuples(self, soup):
        """A generator of translation tuples (French edition).

        :param soup: BeautifulSoup object of the whole page
        :return: tuples of the form (edition, headword, head_lang, translation,
                 trans_lang, trans_lang_code, part_of_speech, pronunciation)
        """
        # Main page content; present in every page of the French edition.
        toc = soup.find('div', id='mw-content-text')

        # Default values for the tuple elements.
        page_state = {'headword': '',
                      'headword_lang': '',
                      'part_of_speech': '',
                      'pronunciation': ''}
        edition = "fr"

        if not toc:
            return  # skip the page if there is no content div

        for element in toc.children:
            if isinstance(element, Tag):  # children may be Tag or NavigableString
                level = self.get_heading_level(element.name)

                if level == 2:  # headword language almost always appears in an h2
                    page_state['headword_lang'] = self.get_heading_text(element).strip()
                elif level == 3:  # part of speech almost always appears in an h3
                    page_state['part_of_speech'] = self.get_heading_text(element).strip()
                elif element.name == "p":  # headword is usually bolded in a paragraph
                    bold_word = element.b
                    if bold_word:
                        page_state['headword'] = bold_word.get_text()
                        # Pronunciation usually follows the headword in a <span>
                        # carrying the "API" class (API = IPA in French).
                        link = element.span
                        if link:
                            if link.has_attr('class') and "API" in link.get("class"):
                                page_state['pronunciation'] = link.get_text()
                elif element.name == "h4":
                    first_headline = element.find("span")
                    if first_headline and first_headline.text.strip() == "Traductions":
                        # Loop through all consecutive translation tables.
                        while True:
                            table = element.find_next_sibling()
                            # Fix: at the end of the document find_next_sibling()
                            # returns None; the original then crashed on
                            # table.has_attr(...).
                            if table is None:
                                break
                            if table.has_attr("class") and "boite" in table.get("class"):
                                for translation, lang, lang_code in self.parse_translation_table(table):
                                    yield (
                                        edition, page_state['headword'], page_state['headword_lang'], translation, lang,
                                        lang_code, page_state['part_of_speech'], page_state['pronunciation'])
                                element = table  # move on to the next sibling table
                            else:
                                break
项目:wiktionary-translations-parser    作者:elfxiong    | 项目源码 | 文件源码
def parse_page(self, soup):
        """Parse one Wiktionary page (German edition) and yield one state dict
        per language section found on the page.

        :param soup: BeautifulSoup object of the whole page
        :return: generator of dicts with keys 'headword', 'headword_lang',
                 'part_of_speech', 'pronunciation' and 'translations'
        """
        page_content = soup.find('div', id='mw-content-text')
        page_heading = None
        # The page title lives in a preceding sibling of the body content div.
        element = soup.find('div', class_='mw-body-content') or page_content
        while not page_heading:
            if element is None:
                return None
            element = element.previous_sibling
            if isinstance(element, Tag):
                page_heading = element.text

        page_state = {'headword': None,
                      'headword_lang': '',
                      'part_of_speech': [''],
                      'pronunciation': '',
                      # Fix: must exist before the first h2 is seen; otherwise
                      # the `elif page_state['translation_region']` branch below
                      # raises KeyError on pages whose first classed child is
                      # not an h2/h3/h4.
                      'translation_region': False,
                      'translations': defaultdict(list)}
        for element in page_content.children:
            if isinstance(element, Tag):  # children may be Tag or NavigableString
                pronunciation = element.find(class_="ipa")
                if pronunciation:
                    page_state['pronunciation'] = pronunciation.text.strip()

                level = self.get_heading_level(element.name)
                if level == 2:  # a new language section starts: flush the last one
                    if page_state['headword']:
                        yield page_state
                    s = self.get_heading_text(element)
                    # heading format: "headword (language)"
                    page_state['headword'] = s.split('(')[0].strip() or page_heading
                    page_state['headword_lang'] = s[s.find("(") + 1:s.find(")")]
                    page_state['translation_region'] = False
                    page_state['part_of_speech'] = ['']
                    page_state['pronunciation'] = ''
                    page_state['translations'] = defaultdict(list)
                elif level == 3:
                    page_state['part_of_speech'].append(self.get_heading_text(element).split(',')[0].strip())
                    page_state['translation_region'] = False
                elif element.name == "h4":
                    # An h4 reading "Übersetzungen" opens the translation region.
                    if element.text.strip() == u"Übersetzungen":
                        page_state['translation_region'] = True
                        continue
                    first_headline = element.find()
                    if first_headline and first_headline.text.strip() == u"Übersetzungen":
                        page_state['translation_region'] = True
                    else:
                        page_state['translation_region'] = False
                elif 'class' not in element.attrs:
                    # Any element without a class attribute closes the region.
                    page_state['translation_region'] = False
                elif page_state['translation_region']:
                    translation_tup_list = list(self.parse_translation_table(element))
                    if not translation_tup_list:
                        continue
                    pos = page_state['part_of_speech'][-1]
                    page_state['translations'][pos] += translation_tup_list

        if page_state['headword']:
            yield page_state
项目:BGmi    作者:RicterZ    | 项目源码 | 文件源码
def fetch_episode_of_bangumi(self, bangumi_id, subtitle_list=None, max_page=MAX_PAGE):
        """Get all episodes of a bangumi by its id.

        Example of the returned structure::

            [
                {
                    "download": "magnet:?xt=urn:btih:e43b3b6b53dd9fd6af1199e112d3c7ff15cab82c",
                    "name": "...",
                    "subtitle_group": "58a9c1c9f5dc363606ab42ec",
                    "title": "...[07][GB][720P]",
                    "episode": 0,
                    "time": 1503301292
                },
            ]

        :param bangumi_id: bangumi_id
        :param subtitle_list: list of subtitle group
        :type subtitle_list: list
        :param max_page: how many pages to crawl if there is no subtitle list
        :type max_page: int
        :return: list of bangumi
        :rtype: list[dict]
        """
        result = []
        if os.environ.get('DEBUG', False):
            # Fix: log the URL that is actually requested below (the original
            # print was missing the 'Home/' path prefix).
            print(server_root + 'Home/Bangumi/{}'.format(bangumi_id))
        r = network.get(server_root + 'Home/Bangumi/{}'.format(bangumi_id)).text

        soup = BeautifulSoup(r, 'lxml')
        container = soup.find('div', class_='central-container')  # type:bs4.Tag
        episode_container_list = {}
        # Map each subtitle-group anchor (identified by its id attribute) to
        # the episode <table> that immediately follows it.
        for index, tag in enumerate(container.contents):
            if hasattr(tag, 'attrs'):
                subtitle_id = tag.attrs.get('id', False)
                if subtitle_list:
                    if subtitle_id in subtitle_list:
                        episode_container_list[tag.attrs.get('id', None)] = tag.find_next_sibling('table')
                else:
                    if subtitle_id:
                        episode_container_list[tag.attrs.get('id', None)] = tag.find_next_sibling('table')

        for subtitle_id, container in episode_container_list.items():
            for tr in container.find_all('tr')[1:]:  # skip the header row
                title = tr.find('a', class_='magnet-link-wrap').text
                time_string = tr.find_all('td')[2].string
                result.append({
                    'download': tr.find('a', class_='magnet-link').attrs.get('data-clipboard-text', ''),
                    'subtitle_group': str(subtitle_id),
                    'title': title,
                    'episode': self.parse_episode(title),
                    'time': int(time.mktime(time.strptime(time_string, "%Y/%m/%d %H:%M")))
                })

        return result
```

项目:sneakerbot    作者:rhawiz    | 项目源码 | 文件源码
def scrape_tag_contents(tags, html):
    """Walk a chain of (tag, attr) selectors and collect matching content.

    :param tags: list of (tag_name, attrs) pairs; the first pair selects the
        initial scope, intermediate pairs narrow it down, and the last pair
        describes the content to extract (tag "regex" treats its attr as a
        regular-expression pattern applied to the element text).
    :param html: an HTML string or an already-parsed bs4 Tag.
    :return: list of extracted strings/elements.
    """
    tag_list = copy.copy(tags)
    if isinstance(html, Tag):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    results = []
    content_tag, content_attr = tag_list.pop()
    if not len(tag_list):
        return list(soup.findAll(name=content_tag, attrs=content_attr))
    first_tag, first_attr = tag_list.pop(0)
    element_list = soup.findAll(name=first_tag, attrs=first_attr)

    for tag, attr in tag_list:
        temp = ResultSet([], ())
        for element in element_list:
            if isinstance(attr, dict):
                temp += element.findAll(name=tag, attrs=attr)
            # Fix: the original also tested the Python-2-only `unicode`,
            # which raises NameError on Python 3; `str` covers both uses.
            elif isinstance(attr, str):
                if element.has_attr(attr):
                    temp.append(element[attr])

        element_list = temp

    for element in element_list:
        if content_tag == "regex":
            pattern = content_attr
            text = element
            if not isinstance(text, str):
                text = element.text
            if text:
                match = re.findall(pattern, text)
                if match:
                    results.append(match[0])
        elif content_attr is None or content_attr == "":
            if content_tag is None or content_tag == "":
                text = element
            else:
                text = element.find(content_tag)
            if text:
                results.append(text.text)
        elif content_tag is None or content_tag == "":
            if element.has_attr(content_attr):
                results.append(element[content_attr])
        else:
            info_container = element.findAll(name=content_tag)
            for container in info_container:
                if isinstance(content_attr, dict):
                    results.append(container)
                # Fix: the original called has_attr on `info_container`
                # (a ResultSet, i.e. a list, which has no has_attr) instead
                # of the individual `container` element.
                elif container.has_attr(content_attr):
                    results.append(container[content_attr])
    return results