Python HTMLParser module: HTMLParser() example source code

The following 49 code examples, extracted from open-source Python projects, illustrate how to use HTMLParser.HTMLParser().
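
Before the project examples, a minimal sketch of the basic pattern (illustrative only, not taken from any project below): on Python 2 the class lives in the top-level HTMLParser module, while Python 3.4+ exposes the same entity decoding as html.unescape().

try:
    from HTMLParser import HTMLParser  # Python 2: top-level module
    unescape = HTMLParser().unescape
except ImportError:
    from html import unescape          # Python 3.4+: module-level function

print(unescape('Fish &amp; Chips &#x263A;'))  # Fish & Chips ☺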

Project: pi_romulus    Author: ArthurMoore85
def __init__(self):
        super(EmuApi, self).__init__()
        self.service = 'EmuParadise'
        self.base_url = 'https://www.emuparadise.me'
        self.referrer = None
        self._parser = HTMLParser.HTMLParser()
        self.endpoints = ENDPOINTS
        self.response = self.get_response()
        self.search_regex = '<div class="roms">' \
                            '<a .*?href="(.*?)">(.*?)</a>.*?' \
                            '<a href="\/roms\/roms\.php\?sysid=(\d+)".*?class="sysname">' \
                            '(.*?)</a>.*?<b>Size:</b> (.*?) .*?</div>'
        self.download_url = 'http://direct.emuparadise.me/roms/get-download.php?gid={download_id}' \
                            '&token={token}' \
                            '&mirror_available=true'
        self.requires_arguments = True
        self.token = '211217baa2d87c57b360b9a673a12cfd'
Project: CorpBot.py    Author: corpnewt
def getXKCDImageTitle ( html ):
    comicBlock = find_last_between( html, 'div id="comic"', "</div>")

    if not comicBlock:
        return None

    imageTitle = find_last_between( comicBlock, "alt=", ">" )
    # Drop srcset= if there
    imageTitle = imageTitle.split('srcset=')[0]
    h = HTMLParser()
    imageTitle = h.unescape(imageTitle)
    imageTitle = imageTitle.replace('"', '').strip()
    imageTitle = imageTitle.replace('/', '').strip()
    return imageTitle

# Garfield Minus Garfield Methods
Project: TACTIC-Handler    Author: listyque
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
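
To see what this workaround normalizes, here is a small self-contained sketch (Python 3; the class name and input are invented for illustration) feeding decimal and hexadecimal character references through handle_charref:

from html.parser import HTMLParser

class CharrefDemo(HTMLParser):
    def handle_charref(self, name):
        # 'name' arrives without the leading '&#': e.g. '65' or 'x41'
        base = 16 if name.startswith(('x', 'X')) else 10
        print(name, '->', chr(int(name.lstrip('xX'), base)))

# convert_charrefs=False is needed on Python 3.5+ so handle_charref
# fires at all; by default references are folded into handle_data.
CharrefDemo(convert_charrefs=False).feed('&#65; &#x41; &#X41;')
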
Project: TACTIC-Handler    Author: listyque
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: BioQueue    Author: liyao001
def get_steps(protocol_id):
    """
    Get steps of a protocol
    :param protocol_id: int, protocol id
    :return: list, list of unresolved steps
    """
    step_list = []

    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    html_parser = HTMLParser()
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # priority for self-compiled tool
        software_path = os.path.join(os.path.join(os.path.join(workspace_path, str(step.user_id)), 'bin'),
                                     str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
        })
    return step_list
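
The unescape() call matters because the stored software/parameter strings apparently arrive HTML-escaped; a reduced illustration (Python 3, with an invented command string):

from html import unescape

stored = 'bwa mem -t 4 ref.fa reads.fq &gt; out.sam'  # hypothetical stored step
print(unescape(stored))  # bwa mem -t 4 ref.fa reads.fq > out.sam
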
Project: llk    Author: Tycx2ry
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: llk    Author: Tycx2ry
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: Taigabot    Author: FrozenPigs
def twitter_url(match, bot=None):
    # Find the tweet ID from the URL
    tweet_id = match.group(1)

    # Get the tweet using the tweepy API
    api = get_api(bot)
    if not api:
        return
    try:
        tweet = api.get_status(tweet_id)
        user = tweet.user
    except tweepy.error.TweepError:
        return

    # Format and return the text of the tweet
    text = " ".join(tweet.text.split())

    if user.verified:
        prefix = u"\u2713"
    else:
        prefix = ""

    time = timesince.timesince(tweet.created_at, datetime.utcnow())
    h = HTMLParser()
    return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, h.unescape(text), time)
Project: electron-crash-reporter    Author: lipis
def insert_to(project_url, destination, find_what, indent=0):
  url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
  response = urllib2.urlopen(url)
  if response.getcode() == 200:
    with open(destination, 'r') as dest:
      dest_contents = dest.readlines()
      lines = ''.join(dest_contents)
      content = HTMLParser().unescape(response.read())
      if content.replace(' ', '') in lines.replace(' ', ''):
        print_out('IGNORED', destination)
        return

    generated = []
    for line in dest_contents:
      generated.append(line)
      if line.lower().find(find_what.lower()) >= 0:
        spaces = len(line) - len(line.lstrip())
        for l in content.split('\n'):
          if l:
            generated.append('%s%s\n' % (' ' * (spaces + indent), l))

    with open(destination, 'w') as dest:
      for line in generated:
        dest.write(line)
      print_out('INSERT', destination)
Project: weeman    Author: evait-security
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: weeman    Author: evait-security
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: searchForAll    Author: MemoryAndDream
def process(keyword,page):
    url='https://www.google.com/search?q=%s&start=%s&num=100'%(keyword,page*100)
    urlinfos=[]
    #urlinfo1={"url":"http://www.baidu.com/link?url=966OdUyxuwFJoAYx_XGYq7_FiVLcej4qEA3Q84e-lLAtLPRGGHA6tsNFNsTN9zka&wd=&eqid=a64931cc000026c3000000035994fd9e","title":"python Django?? ?????????????????..._???","info":'? W3School,???????????????? jQuery ??? jQuery ??jQuery ???? ?W3School,???????? jQuery ????????????? jQuery...'}
    page = ct.crawlerTool.getPage(url)
    #print page
    #print url
    segments = ct.crawlerTool.getXpath('//div[@class="g"]',page)
    #print segments
    for segment in segments:
        #print segment
        try:
            urlinfo={}
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]  # @href gives the link URL; /text() would give the anchor text
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            urlinfo['info'] = HTMLParser().unescape(ct.crawlerTool.extractorText(ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            #print urlinfo['url'], urlinfo['title'], urlinfo['info']
            # info holds the result snippet text
            urlinfos.append(urlinfo)
        except:
            print('error')
            traceback.print_exc()
    return {"urlinfos":urlinfos}
Project: meet-notes    Author: lipis
def insert_to(project_url, destination, find_what, indent=0):
  url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
  response = urllib2.urlopen(url)
  if response.getcode() == 200:
    with open(destination, 'r') as dest:
      dest_contents = dest.readlines()
      lines = ''.join(dest_contents)
      content = HTMLParser().unescape(response.read())
      if content.replace(' ', '') in lines.replace(' ', ''):
        print_out('IGNORED', destination)
        return

    generated = []
    for line in dest_contents:
      generated.append(line)
      if line.lower().find(find_what.lower()) >= 0:
        spaces = len(line) - len(line.lstrip())
        for l in content.split('\n'):
          if l:
            generated.append('%s%s\n' % (' ' * (spaces + indent), l))

    with open(destination, 'w') as dest:
      for line in generated:
        dest.write(line)
      print_out('INSERT', destination)
Project: chromium-build    Author: discordapp
def ParseGTestXML(xml_content):
  """Parse gtest XML result."""
  results = []

  html = HTMLParser.HTMLParser()

  # TODO(jbudorick): Unclear how this handles crashes.
  testsuites = xml.etree.ElementTree.fromstring(xml_content)
  for testsuite in testsuites:
    suite_name = testsuite.attrib['name']
    for testcase in testsuite:
      case_name = testcase.attrib['name']
      result_type = base_test_result.ResultType.PASS
      log = []
      for failure in testcase:
        result_type = base_test_result.ResultType.FAIL
        log.append(html.unescape(failure.attrib['message']))

      results.append(base_test_result.BaseTestResult(
          '%s.%s' % (suite_name, TestNameWithoutDisabledPrefix(case_name)),
          result_type,
          int(float(testcase.attrib['time']) * 1000),
          log=('\n'.join(log) if log else '')))

  return results
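
A self-contained sketch of the same pattern (Python 3; the XML snippet and names are invented). Note the failure message is HTML-escaped inside the XML, so it is decoded twice: once by the XML parser, once by unescape():

import xml.etree.ElementTree as ET
from html import unescape  # Python 3 stand-in for HTMLParser().unescape

GTEST_XML = '''<testsuites>
  <testsuite name="MathTest">
    <testcase name="Adds" time="0.002">
      <failure message="expected 2, got 3 &amp;lt;error&amp;gt;"/>
    </testcase>
    <testcase name="Subtracts" time="0.001"/>
  </testsuite>
</testsuites>'''

for testsuite in ET.fromstring(GTEST_XML):
    for testcase in testsuite:
        log = [unescape(f.attrib['message']) for f in testcase]
        millis = int(float(testcase.attrib['time']) * 1000)
        print(testsuite.attrib['name'] + '.' + testcase.attrib['name'],
              'FAIL' if log else 'PASS', millis, '; '.join(log))
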
Project: chandl    Author: gebn
def unescape_html(html_):
    """
    Replace HTML entities (e.g. `&pound;`) in a string.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """

    # http://stackoverflow.com/a/2360639

    if sys.version_info.major == 2:  # 2.7
        # noinspection PyUnresolvedReferences,PyCompatibility
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_)

    if sys.version_info.minor == 3:  # 3.3
        # noinspection PyCompatibility
        from html.parser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)

    # 3.4+
    # noinspection PyCompatibility
    import html
    return html.unescape(html_)
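
A quick usage check, assuming unescape_html() above is in scope:

print(unescape_html('He said &quot;&pound;50, give or take&quot;'))
# He said "£50, give or take"
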
Project: iitb-blog-aggregator    Author: ranveeraggarwal
def feeds(page_url):
    """Search the given URL for possible feeds, returning a list of them."""

    # If the URL is a feed, there's no need to scan it for links.
    if is_feed(page_url):
        return [page_url]

    data = fetch_url(page_url)
    parser = FeedFinder(page_url)
    try:
        parser.feed(data)
    except HTMLParser.HTMLParseError:
        pass
    found = parser.urls()

    # Return only feeds that feedparser can understand.
    return [feed for feed in found if is_feed(feed)]
Project: catchup4kodi    Author: catchup4kodi
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: catchup4kodi    Author: catchup4kodi
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: plugin.video.brplay    Author: olavopeixoto
def _provider_auth(self, url, qs, username, password, html):

        url += '?sid=0'
        # prepare auth
        r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

        # authenticate
        post_data = {
            'option': 'credential',
            'urlRedirect': url,
            'Ecom_User_ID': username,
            'Ecom_Password': password,
        }
        r1 = self.session.post(url, data=post_data, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

        r2 = self.session.get(url, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

        try:
            html_parser = HTMLParser.HTMLParser()
            redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
            argsre = dict([(match.group(1), html_parser.unescape(match.group(2))) for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])

            return self.session.post(redirurl, data=argsre, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
        except:
            raise Exception('Invalid user name or password.')
Project: respeaker_virtualenv    Author: respeaker
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: respeaker_virtualenv    Author: respeaker
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: tellmeabout.coffee    Author: billyfung
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: tellmeabout.coffee    Author: billyfung
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: xunfeng    Author: ysrc
def get_url(domain,port,timeout):
    url_list = []
    if port ==443:
        surl = 'https://' + domain
    else:
        surl = 'http://' + domain
    res = urllib2.urlopen(surl, timeout=timeout)
    html = res.read()
    root_url = res.geturl()
    m = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
    if m:
        for url in m:
            ParseResult = urlparse.urlparse(url[1])
            if ParseResult.netloc and ParseResult.scheme:
                if domain == ParseResult.hostname:
                    url_list.append(HTMLParser.HTMLParser().unescape(url[1]))
            elif not ParseResult.netloc and not ParseResult.scheme:
                url_list.append(HTMLParser.HTMLParser().unescape(urlparse.urljoin(root_url, url[1])))
    return list(set(url_list))
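
For comparison, the same src/href harvesting can be done with HTMLParser itself rather than a regex; a minimal Python 3 sketch (illustrative only, names invented):

from html.parser import HTMLParser
from urllib.parse import urljoin

class AssetCollector(HTMLParser):
    """Collect src/href values from img, link and script tags."""

    def __init__(self, base_url):
        HTMLParser.__init__(self)
        self.base_url = base_url
        self.assets = set()

    def handle_starttag(self, tag, attrs):
        if tag in ('img', 'link', 'script'):
            for name, value in attrs:
                if name in ('src', 'href') and value:
                    # attribute values come back already entity-decoded
                    self.assets.add(urljoin(self.base_url, value))

c = AssetCollector('http://example.com/')
c.feed('<img src="/a.png"><script src="//cdn.example.com/b.js"></script>')
print(sorted(c.assets))
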
Project: gn_build    Author: realcome
def ParseGTestXML(xml_content):
  """Parse gtest XML result."""
  results = []

  html = HTMLParser.HTMLParser()

  # TODO(jbudorick): Unclear how this handles crashes.
  testsuites = xml.etree.ElementTree.fromstring(xml_content)
  for testsuite in testsuites:
    suite_name = testsuite.attrib['name']
    for testcase in testsuite:
      case_name = testcase.attrib['name']
      result_type = base_test_result.ResultType.PASS
      log = []
      for failure in testcase:
        result_type = base_test_result.ResultType.FAIL
        log.append(html.unescape(failure.attrib['message']))

      results.append(base_test_result.BaseTestResult(
          '%s.%s' % (suite_name, case_name),
          result_type,
          int(float(testcase.attrib['time']) * 1000),
          log=('\n'.join(log) if log else '')))

  return results
Project: Price-Comparator    Author: Thejas-1
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: Price-Comparator    Author: Thejas-1
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: oil    Author: oilshell
def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'<script>{content}</{element}>'.format(element=element,
                                                        content=content)
            self._run_check(s, [("starttag", "script", []),
                                ("data", content),
                                ("endtag", "script")],
                            collector=Collector)
Project: python2-tracer    Author: extremecoders-re
def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'<script>{content}</{element}>'.format(element=element,
                                                        content=content)
            self._run_check(s, [("starttag", "script", []),
                                ("data", content),
                                ("endtag", "script")],
                            collector=Collector)
Project: jaychou    Author: fantasysea
def lrc2dict(lrc):
    time_stamps = re.findall(r'\[[^\]]+\]', lrc)
    html_parser = HTMLParser.HTMLParser()
    if time_stamps:
        # strip timestamps and credit/boilerplate lines from the lyric text
        lyric = lrc
        for tplus in time_stamps:
            lyric = lyric.replace(tplus, '').replace('\r', '').replace('\n', '').replace('????','').replace('???','').replace('?????','').replace('???','').replace('??','').replace('??','').replace('??','').replace('??','')
            lyric = lyric.replace('???', '').replace('??', '').replace('????', '').replace('???', '').replace('??', '').replace('???', '')
        # timestamp format examples:
        # tplus: [02:31.79]
        # t 02:31.79
        # print lyric
        print html_parser.unescape(lyric)
        return html_parser.unescape(lyric)
    else:
        return ''
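
The timestamp-stripping idea in isolation (illustrative input):

import re

lrc = '[00:12.30]Hello[00:15.07]world'
print(re.sub(r'\[[^\]]+\]', '', lrc))  # Helloworld
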
Project: ome-model    Author: ome
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, unicode):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
Project: svg-animation-tools    Author: parallax
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: svg-animation-tools    Author: parallax
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: svg-animation-tools    Author: parallax
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: svg-animation-tools    Author: parallax
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: DIS_MeituanReptile    Author: myvary
def downloader_html(self,url):
        '''
        :param url: URL of the page to fetch
        :return: the page HTML with entities unescaped, or the string '0' on failure
        '''
        try:
            print url
            BAIDU_UA = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'  # Baiduspider User-Agent
            headers = {'User-Agent': BAIDU_UA}  # send a crawler-style User-Agent with the request
            data = requests.get(url, headers=headers)  # fetch the page
            html_parser = HTMLParser.HTMLParser()
            data = html_parser.unescape(data.text)
            return data
        except:
            print 'failed to download page:', url
            return '0'
Project: pelisalacarta-ce    Author: pelisalacarta-ce
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response
Project: crestify    Author: crestify
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            self.html = self.opened_file.read()
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.html_parser = HTMLParser.HTMLParser()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Project: crestify    Author: crestify
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            self.data = self.opened_file.read()
        self.user = user_id
        self.data = ujson.loads(self.data)
        self.urls = dict()
        self.tags_dict = dict()
        self.tags_set = set()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for x in self.check_duplicates_query:
            self.check_duplicates[x.main_url] = x
        self.html_parser = HTMLParser.HTMLParser()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Project: plugin.video.streamondemand-pureita    Author: orione7
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response
Project: Alexa-Chatter    Author: ekt1701
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: Alexa-Chatter    Author: ekt1701
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: plugin.video.rtbfauvio    Author: Gaet81
def play_video(self, datas):
        url = datas.get('url')
        data = channel.get_url(url)
        regex = r"""src="(https://www.rtbf.be/auvio/embed/media[^"]+)"""
        iframe_url = re.findall(regex, data)[0]
        data = channel.get_url(iframe_url)
        regex = r"""data-media="([^"]+)"""
        media = re.findall(regex, data)[0]        
        h = HTMLParser.HTMLParser()
        media_json = h.unescape(media)
        regex = r""""high":"([^"]+)"""
        all_url = re.findall(regex, media_json)
        if len(all_url) > 0:
            video_url = all_url[0]
        else:
            regex = r"""url&quot;:&quot;([^&]+)"""
            iframe_url = re.findall(regex, data)[0]
            video_url = iframe_url.replace("\\", "")    
        channel.playUrl(video_url)
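
The h.unescape(media) step decodes the entity-escaped JSON carried in the data-media attribute; a reduced illustration (Python 3, invented payload):

from html import unescape  # Python 3 equivalent of HTMLParser().unescape

media = '{&quot;high&quot;:&quot;http://example.com/video.mp4&quot;}'
print(unescape(media))  # {"high":"http://example.com/video.mp4"}
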
Project: amazon_order_history_scraper    Author: drewctate
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: amazon_order_history_scraper    Author: drewctate
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: toollabs    Author: multichill
def getBelvedereArtistsGenerator():
    """
    Generator to return Belvedere artists

    """
    htmlparser = HTMLParser.HTMLParser()

    basesearchurl=u'http://digital.belvedere.at/people/%s'
    urlregex = u'\<h3\>\<a href\=\"\/people\/(?P<id>\d+)\/[^\"]+\"\>(?P<name>[^\<]+)\<\/a\>\<\/h3\>\<div\>(?P<description>[^\<]+)\<\/div\>'

    # Just loop over the pages
    for i in string.ascii_lowercase:
        searchurl = basesearchurl % (i,)
        print searchurl
        searchPage = requests.get(searchurl)

        matches = re.finditer(urlregex, searchPage.text)
        for match in matches:
            artist = {}
            artist[u'id'] = match.group(u'id')
            artist[u'name'] = htmlparser.unescape(match.group(u'name'))
            artist[u'description'] = htmlparser.unescape(match.group(u'description'))
            artist[u'url'] = u'http://digital.belvedere.at/people/%s/' % (match.group(u'id'),)
            yield artist
Project: kekescan    Author: xiaoxiaoleo
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)
Project: kekescan    Author: xiaoxiaoleo
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Project: mibici-tools    Author: regenhans
def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)