Python html 模块,parser() 实例源码

我们从Python开源项目中,提取了以下16个代码示例,用于说明如何使用html.parser()

项目:chandl    作者:gebn    | 项目源码 | 文件源码
def unescape_html(html_):
    """
    Replace HTML entities (e.g. `&pound;`) in a string.

    The unescape implementation is chosen by probing which API can be
    imported rather than by comparing version numbers, so interpreters that
    have an ``html`` package without ``html.unescape`` (3.0 - 3.3) use the
    parser-based fallback instead of failing.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """

    # http://stackoverflow.com/a/2360639

    try:
        # 3.4+
        # noinspection PyCompatibility
        from html import unescape
    except ImportError:
        try:
            # 3.0 - 3.3: the html package exists, but html.unescape does not
            # noinspection PyCompatibility
            from html.parser import HTMLParser
        except ImportError:
            # 2.x
            # noinspection PyUnresolvedReferences,PyCompatibility
            from HTMLParser import HTMLParser
        # noinspection PyDeprecation
        unescape = HTMLParser().unescape

    return unescape(html_)
项目:NixieBot    作者:Zedsquared    | 项目源码 | 文件源码
def processIncomingTweet(tweet): #check tweet that has come in via the filter stream, it might have commands in it
    """Queue a tweet tagged ``NixieBotShowMe`` that carries a word or a command.

    Tweets without the tag are ignored. For tagged tweets the HTML entities
    in ``tweet['text']`` are decoded and handed to ``extractWord``; if that
    yields a word, or ``hasCommand(tweet)`` is true, the tweet is put on the
    word queue with the priority returned by ``prioritise`` and appended to
    ``recentReqs``. The ``maxWordQ`` global is raised to the queue size when
    that size exceeds it.

    :param tweet: tweet mapping; ``'text'`` and ``['user']['screen_name']``
        are read here.
    """
    global maxWordQ
    if not scanTags(tweet,"NixieBotShowMe") :
        return
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # Python 3.4 and removed in 3.9.
    theWord = extractWord(html.unescape(tweet['text']))
    if theWord is None and not hasCommand(tweet) :
        return
    wordqPut(tweet,priority = prioritise(tweet))
    size = wordq.qsize()
    if size > maxWordQ :
        maxWordQ = size
    print("word request from", tweet['user']['screen_name'], "word = ", theWord, " Word queue at:", size, "maxqueue was ", maxWordQ)
    recentReqs.append(tweet) # store for sending to hard storage every now and then
    if len(recentReqs) > reqPickleFrequency :
        # only drop the in-memory backlog once pickleMe reports success
        if pickleMe(recentReqs, "Requests", dateStamp=True) :
            recentReqs[:]=[]
    #userCounter.update(tweet['user']['screen_name'])

    # DMreceipt bad idea as it still counts against rate limit
    #for ht in tweet['entities']['hashtags']:
    #    if ht['text']=="NBreceipt" and not rct:
    #        sendReceipt(tweet,theWord,tt)
    #        rct=True
项目:progrobot    作者:petr-kalinin    | 项目源码 | 文件源码
def html2tele(html):
    """Run *html* through ``_HTMLToText`` and tidy the whitespace of the result.

    After the parser has produced its text, three regex clean-ups are applied
    in order: a newline followed by whitespace and further newlines becomes
    exactly two newlines, spaces directly before ``<pre>`` are removed, and
    spaces directly after ``</pre>`` are removed.

    :param html: markup passed to the parser's ``feed``.
    :return: the cleaned-up text.
    """
    converter = _HTMLToText()
    converter.feed(html)
    converter.close()
    text = converter.get_text()

    # (pattern, replacement) pairs, applied one after another
    cleanups = (
        (r'\n(\s*\n+)', '\n\n'),
        (r' +<pre>', '<pre>'),
        (r'</pre> +', '</pre>'),
    )
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text)
    return text

#----------
项目:hangoutsbot    作者:das7pad    | 项目源码 | 文件源码
def simple_parse_to_segments(html, debug=False, **kwargs):
    """Feed *html* to a ``simpleHTMLParser`` and return what ``feed`` returns.

    The input first goes through ``fix_urls`` and is then wrapped in an
    ``<html>`` element before parsing.

    :param html: markup to parse.
    :param debug: passed as the only argument to ``simpleHTMLParser``.
    :param kwargs: accepted but not used here.
    :return: the return value of the parser's ``feed`` call.
    """
    # html.parser seems to ignore the final entityref without html closure
    wrapped = '<html>' + fix_urls(html) + '</html>'
    return simpleHTMLParser(debug).feed(wrapped)
项目:NixieBot    作者:Zedsquared    | 项目源码 | 文件源码
def on_success(self, tweet):
    """Log an incoming tweet and process it unless its id was seen recently.

    Messages without a ``'text'`` key and retweets (those carrying
    ``'retweeted_status'``) are ignored. Otherwise the decoded text and id
    are printed, and the tweet is passed to ``processIncomingTweet`` and its
    ``id_str`` pushed onto ``recentIDDeque`` if that id is not already there.

    :param tweet: tweet mapping; ``'text'`` and ``'id_str'`` are read here.
    """
    if 'text' not in tweet or 'retweeted_status' in tweet :
        return
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # Python 3.4 and removed in 3.9.
    print("<<<<<<<<<<<<<<<<<<<  Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']) + tweet['id_str'])
    if tweet['id_str'] not in recentIDDeque :
        processIncomingTweet(tweet)
        recentIDDeque.appendleft(tweet['id_str'])
    else :
        print("!!!! duplicate!  Ignored ")
    # NOTE(review): this binds a local name only, so it has no effect outside
    # this call - presumably a module/instance back-off value was meant; confirm.
    backOffTime = 60
项目:NixieBot    作者:Zedsquared    | 项目源码 | 文件源码
def processIncomingTweet(tweet): #check tweet that has come in via the filter stream, it might have commands in it
    """Route a tweet from the filter stream according to its hashtag.

    * ``NixieBotShowMe``: HTML entities in ``tweet['text']`` are decoded and
      handed to ``extractWord``; if that yields a word, or
      ``hasCommand(tweet)`` is true, the tweet is put on the word queue,
      ``botState['maxWordQ']`` is raised to the queue size when exceeded, and
      the tweet is appended to ``recentReqs``.
    * ``NixieBotRollMe``: the tweet is put on ``rollq``.
    * anything else: the tweet is passed to ``randstream.on_success``.

    :param tweet: tweet mapping; ``'text'`` and ``['user']['screen_name']``
        are read here.
    """
    if scanTags(tweet,"NixieBotShowMe") :
        # html.unescape replaces HTMLParser().unescape, which was deprecated
        # in Python 3.4 and removed in 3.9.
        theWord = extractWord(html.unescape(tweet['text']))
        if theWord is not None or hasCommand(tweet) :
            wordqPut(tweet,priority = prioritise(tweet))
            size = wordq.qsize()
            if size > botState['maxWordQ'] :
                botState['maxWordQ'] = size
            print("word request from", tweet['user']['screen_name'], "word = ", theWord, " Word queue at:", size, "maxqueue was ", botState['maxWordQ'])
            recentReqs.append(tweet) # store for sending to hard storage every now and then
            if len(recentReqs) > reqPickleFrequency :
                # only drop the in-memory backlog once pickleMe reports success
                if pickleMe(recentReqs, "Requests", dateStamp=True) :
                    recentReqs[:]=[]
            #userCounter.update(tweet['user']['screen_name'])
    elif scanTags(tweet,"NixieBotRollMe") :
        rollq.put(tweet)
        print("roll request incoming! Word queue at:", rollq.qsize())
    else :
        #must be a trump tweet so submit to random for now
        randstream.on_success(tweet)

    # DMreceipt bad idea as it still counts against rate limit
    #for ht in tweet['entities']['hashtags']:
    #    if ht['text']=="NBreceipt" and not rct:
    #        sendReceipt(tweet,theWord,tt)
    #        rct=True
项目:NixieBot    作者:Zedsquared    | 项目源码 | 文件源码
def on_success(self, tweet):
    """Log an incoming tweet and hand it to ``processIncomingTweet``.

    Messages without a ``'text'`` key and retweets (those carrying
    ``'retweeted_status'``) are ignored.

    :param tweet: tweet mapping; ``'text'`` is read here.
    """
    if 'text' not in tweet or 'retweeted_status' in tweet :
        return
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # Python 3.4 and removed in 3.9.
    print("<<<<<<<<<<<<<<<<<<<  Incoming!<<<<<<<<<<<<<<<<<< " + html.unescape(tweet['text']))
    processIncomingTweet(tweet)
    # NOTE(review): this binds a local name only, so it has no effect outside
    # this call - presumably a module/instance back-off value was meant; confirm.
    backOffTime = 60
项目:packaging    作者:blockstack    | 项目源码 | 文件源码
def test_html_import(self):
        # Smoke test: the html package and its entities/parser submodules
        # must be importable. Any ImportError fails the test.
        import html
        import html.entities
        import html.parser
        # Reaching this line means none of the imports above raised.
        self.assertTrue(True)
项目:deep-learning-nlp-rl-papers    作者:madrugado    | 项目源码 | 文件源码
def parse_args():
    """Parse the command line and make sure a ToC maker binary is available.

    Recognised options are ``--toc-maker``, ``--twitter-poster`` (default
    ``"t update"``) and ``-t/--use-twitter``. The first argument that is not
    a recognised option is taken as the path of the file to work with.

    When ``--toc-maker`` is not given, ``./gh-md-toc`` is used; if that file
    does not exist, the release archive matching ``uname -s`` is downloaded
    with ``wget`` (unless already present), unpacked with ``tar``, removed,
    and the tool is made executable for its owner.

    NOTE(review): ``cmd`` is a module-level name providing ``getoutput`` and
    ``getstatusoutput`` - presumably ``subprocess`` or ``commands``; confirm.

    :return: ``(known_args, filepath)``
    :raises EnvironmentError: if the download or the unpacking fails.
    :raises SystemExit: with status 1 when no file path was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--toc-maker", help="path to ToC making tool")
    parser.add_argument("--twitter-poster", default="t update", help="twitter poster command")
    parser.add_argument("-t", "--use-twitter", action="store_true")

    known_args, unknown_args = parser.parse_known_args()

    if not known_args.toc_maker:
        known_args.toc_maker = "./gh-md-toc"
        if not os.path.isfile(known_args.toc_maker):
            s = cmd.getoutput("uname -s").lower()
            f = "gh-md-toc.%s.amd64.tgz" % s
            URL = "https://github.com/ekalinin/github-markdown-toc.go/releases/download/0.6.0/%s" % f
            if not os.path.isfile(f):
                if cmd.getstatusoutput("wget %s" % URL)[0] != 0:
                    raise EnvironmentError("Cannot download toc maker from URL: %s" % URL)
            if cmd.getstatusoutput("tar xzf %s" % f)[0] != 0:
                raise EnvironmentError("Cannot untar toc maker from file %s" % f)
            os.remove(f)

            # Add the owner-execute bit to the existing mode. The previous
            # "&" cleared every other permission bit instead of adding one.
            current_permissions = stat.S_IMODE(os.lstat(known_args.toc_maker).st_mode)
            os.chmod(known_args.toc_maker, current_permissions | stat.S_IXUSR)

    if not unknown_args:
        print("You should specify the path for file to work with!")
        # quit() is a helper installed by the site module for interactive
        # use; raising SystemExit directly gives the same exit status.
        raise SystemExit(1)

    return known_args, unknown_args[0]
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def test_with_deleted_parent(self):
        # see #18681
        # Reloading a submodule must fail with an ImportError mentioning the
        # parent package once that package is gone from sys.modules.
        from html import parser
        # Remove the parent package entry, keeping the module object so it
        # can be put back afterwards.
        html = sys.modules.pop('html')
        def cleanup():
            # Restore sys.modules['html'] so later code sees the package again.
            sys.modules['html'] = html
        self.addCleanup(cleanup)
        with self.assertRaisesRegex(ImportError, 'html'):
            imp.reload(parser)
项目:fondamentibook    作者:xelatihy    | 项目源码 | 文件源码
def __init__(self):
        '''Create a parser for the HTMLNode class.'''
        # Initialise the base class first.
        super().__init__()
        # Start with an empty stack and no root.
        self.stack, self.root = [], None
项目:fondamentibook    作者:xelatihy    | 项目源码 | 文件源码
def parse(html):
    '''Parse the HTML text ``html`` and return
    the root of the resulting tree.'''
    tree_builder = _MyHTMLParser()
    tree_builder.feed(html)
    return tree_builder.root
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def test_with_deleted_parent(self):
        # see #18681
        # Reloading a submodule must fail with an ImportError mentioning the
        # parent package once that package is gone from sys.modules.
        from html import parser
        # Remove the parent package entry, keeping the module object so it
        # can be put back afterwards.
        html = sys.modules.pop('html')
        def cleanup():
            # Restore sys.modules['html'] so later code sees the package again.
            sys.modules['html'] = html
        self.addCleanup(cleanup)
        with self.assertRaisesRegex(ImportError, 'html'):
            imp.reload(parser)
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def test_with_deleted_parent(self):
        # see #18681
        # Reloading a submodule must fail with an ImportError mentioning the
        # parent package once that package is gone from sys.modules.
        from html import parser
        # Remove the parent package entry, keeping the module object so it
        # can be put back afterwards.
        html = sys.modules.pop('html')
        def cleanup():
            # Restore sys.modules['html'] so later code sees the package again.
            sys.modules['html'] = html
        self.addCleanup(cleanup)
        with self.assertRaisesRegex(ImportError, 'html'):
            imp.reload(parser)
项目:NixieBot    作者:Zedsquared    | 项目源码 | 文件源码
def loadUserFont(fontfile) :
    """Read a user font file and send every glyph definition out through ``com``.

    The file is the output of the online font designer at
    http://b7971.lucsmall.com/ ; lines should look like ``0x7622, // 0 - A``
    and the bit order should be reversed using the button at the top of that
    page.

    While loading, the effect settings are set to ``(0, 0)`` and the previous
    ``effx`` / ``fxspeed`` values are restored at the end. The global
    ``userProperChars`` is rebuilt from the loaded glyphs (plus a space) and
    written to ``uCharSet.txt``.

    :param fontfile: path of the font file to read.
    """
    global comLock
    global userProperChars

    def transmit(command) :
        # echo the command, then write it to com terminated by a carriage return
        print(command)
        com.write(bytes(command+"\r","utf-8"))

    glyphBits = {}
    savedFx, savedSpeed = effx, fxspeed
    setEffex(0,0)
    userProperChars = ""
    print("loading font")
    with open(fontfile) as fontLines :
        for line in fontLines :
            if line == '\n' :
                # cope with blank at end of file
                continue
            parts = line.split(",")
            print("parts = ",parts)
            letter = parts[1].split("-")[1].strip()
            bitval = int(parts[0],16)
            print(bitval,letter)
            glyphBits[letter] = bitval
    #nasty hack as hyphen entry is broken by the split("-"), ditto for comma, and tilde
    glyphBits['-'] = 0x0022
    glyphBits[','] = 0x0004
    glyphBits['~'] = 0x1310
    print(len(glyphBits)," characters loaded, now sending")
    with comLock :
        print("loadfont got comlock")
        transmit("$B7F" + "U" * tubes)
        for glyph, bitval in glyphBits.items() :
            userProperChars += glyph
            # the low 15 bits of the glyph value, most significant bit first
            transmit("$B7W" + glyph + format(bitval & 0x7FFF, "015b"))
            time.sleep(0.3)
            transmit("$B7M" + glyph * tubes)
        # special case (ok, bodge!) for space as the strip command in the font file parser above will remove it, and all fonts need a space
        transmit("$B7W 000000000000000")
        transmit("$B7M                    ")
        userProperChars += " "
        setEffex(savedFx,savedSpeed)
        # now write out character set file ( used by proper()  )
        with open("uCharSet.txt",'w' ) as charsetFile :
            charsetFile.write(userProperChars)
    print("loadfont rel comlock")
项目:packaging    作者:blockstack    | 项目源码 | 文件源码
def test_future_moves(self):
        """
        Ensure everything is available from the future.moves interface that we
        claim and expect. (Issue #104).
        """
        # The test consists of imports only: any ImportError fails it.

        # collections
        from future.moves.collections import Counter, OrderedDict   # backported to Py2.6
        from future.moves.collections import UserDict, UserList, UserString

        from future.moves import configparser
        from future.moves import copyreg

        from future.moves.itertools import filterfalse, zip_longest

        # html package and its submodules
        from future.moves import html
        import future.moves.html.entities
        import future.moves.html.parser

        # http package and its submodules
        from future.moves import http
        import future.moves.http.client
        import future.moves.http.cookies
        import future.moves.http.cookiejar
        import future.moves.http.server

        from future.moves import queue

        from future.moves import socketserver

        from future.moves.subprocess import check_output              # even on Py2.6
        from future.moves.subprocess import getoutput, getstatusoutput

        from future.moves.sys import intern

        # urllib package and its submodules
        from future.moves import urllib
        import future.moves.urllib.error
        import future.moves.urllib.parse
        import future.moves.urllib.request
        import future.moves.urllib.response
        import future.moves.urllib.robotparser

        # winreg is only checked when the interpreter itself provides _winreg
        try:
            # Is _winreg available on Py2? If so, ensure future.moves._winreg is available too:
            import _winreg
        except ImportError:
            pass
        else:
            from future.moves import winreg

        # xmlrpc package and its submodules
        from future.moves import xmlrpc
        import future.moves.xmlrpc.client
        import future.moves.xmlrpc.server

        # underscore-prefixed modules
        from future.moves import _dummy_thread
        from future.moves import _markupbase
        from future.moves import _thread