Python html.parser.HTMLParser 模块,__init__() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用html.parser.HTMLParser.__init__()

项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, data_def = None, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
        """Set up the converter state and register the caller for warnings.

        :param data_def: optional data definition; a dict is stored and
            converted, any other value results in an empty definition
        :param warnaction: warn action handed to the shared warnings object
        :param warngoal: stream passed on when the warnings object is created
        :param caller_id: id under which this caller is registered
        """
        self.tree_lock = RLock()
        with self.tree_lock:
            self.dtc = DataTreeConstants()
            self.known_urlid = (0, 4, 11, 14)
            self.known_linkid = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
            self.errorcode = dte.dtDataDefOK
            self.caller_id = caller_id
            self.cdata_def = {}
            self.ddtype = ""
            # The warnings object is kept on the DataTreeGrab module itself,
            # so it is shared by everything that goes through this module.
            dtg = sys.modules['DataTreeGrab']
            if dtg._warnings is None:
                dtg._warnings = _Warnings(warnaction, warngoal, caller_id)

            elif caller_id not in dtg._warnings._ids or warnaction is not None:
                dtg._warnings.set_warnaction(warnaction, caller_id)

            if isinstance(data_def, dict):
                self.data_def = data_def
                self.convert_data_def()

            else:
                self.data_def = {}
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, dtree, data = None, parent = None, key = None):
        """Create a node for ``data``, adding child nodes for list and dict content.

        :param dtree: tree passed on to DATAnode.__init__
        :param data: a list or dict gets one child JSONnode per element;
            any other object is stored as this node's value
        :param parent: parent node passed on to DATAnode.__init__
        :param key: key of this node, stored as ``self.key``
        """
        self.type = "value"
        self.key = key
        self.keys = []
        self.key_index = {}
        self.value = None
        DATAnode.__init__(self, dtree, parent)
        with self.node_lock:
            if isinstance(data, list):
                self.type = "list"
                # Children are keyed by their position in the list.
                for position, element in enumerate(data):
                    JSONnode(self.dtree, element, self, position)

            elif isinstance(data, dict):
                self.type = "dict"
                for child_key, child_data in data.items():
                    JSONnode(self.dtree, child_data, self, child_key)

            else:
                self.type = "value"
                self.value = data
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, data, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
        """Build a tree of JSONnode objects from ``data``.

        :param data: dataset handed to the root JSONnode and kept as ``self.data``
        :param output: forwarded unchanged to DATAtree.__init__
        :param warnaction: forwarded unchanged to DATAtree.__init__
        :param warngoal: forwarded unchanged to DATAtree.__init__
        :param caller_id: forwarded unchanged to DATAtree.__init__

        If building the tree fails, a warning is issued and ``start_node``
        is set to a NULLnode.
        """
        DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
        with self.tree_lock:
            self.tree_type ='json'
            self.extract_from_parent = True
            self.data = data
            # Read the json data into the tree
            try:
                self.root = JSONnode(self, data, key = 'ROOT')
                self.start_node = self.root

            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit are not reported as an invalid dataset.
            except Exception:
                self.warn('Unable to parse the JSON data. Invalid dataset!', dtDataWarning, 1)
                self.start_node = NULLnode()

# end JSONtree
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, data_def, data = None, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
        """Initialise the shell state, load the data definition and optionally the data.

        :param data_def: data definition passed to ``init_data_def``
        :param data: optional data; passed to ``init_data`` unless it is None
        :param warnaction: warn action for the shared warnings object
        :param warngoal: stream passed on when the warnings object is created
        :param caller_id: id under which this caller is registered
        """
        self.tree_lock = RLock()
        with self.tree_lock:
            self.dtc = DataTreeConstants()
            self.ddconv = DataDef_Convert(warnaction = warnaction , warngoal = warngoal, caller_id = caller_id)
            self.caller_id = caller_id
            self.print_tags = False
            self.print_searchtree = False
            self.show_result = False
            self.fle = sys.stdout
            # The warnings object is kept on the DataTreeGrab module itself.
            dtg = sys.modules['DataTreeGrab']
            if dtg._warnings is None:
                dtg._warnings = _Warnings(warnaction, warngoal, caller_id)

            else:
                dtg._warnings.set_warnaction(warnaction, caller_id)

            self.searchtree = None
            self.timezone = pytz.utc
            # Starts out as "invalid" until a definition and data are loaded.
            self.errorcode = dte.dtDataInvalid
            self.result = []
            self.data_def = None
            self.init_data_def(data_def)
            if data is not None:
                self.init_data(data)
项目:Peppy    作者:project-owner    | 项目源码 | 文件源码
def __init__(self, base_url, url=None):
        """ Initializer

        :param base_url: site base url
        :param url: current url; when it is falsy the base url is used instead
        """
        HTMLParser.__init__(self)
        self.base_url = base_url
        self.url = url if url else self.base_url
        self.CACHE_SIZE = 500
        # Containers start out empty.
        self.cache = {}
        self.pages = {}
        self.items = []
        # Page counters start at zero.
        self.total_pages = 0
        self.site_total_pages = 0
项目:Weeds    作者:seamile    | 项目源码 | 文件源码
def __init__(self, name, outbox, max_task):
        '''
        @name: name given to this process
        @outbox: stored as self.outbox (presumably the queue for outgoing urls -- TODO confirm)
        @max_task: stored as self.max_task (presumably the coroutine limit -- TODO confirm)
        '''
        multiprocessing.Process.__init__(self)
        self.name = name
        self.outbox = outbox
        self.max_task = max_task
        self.inbox = multiprocessing.Queue()  # queue owned by this process (presumably incoming urls)

        # Shared integer plus a process-local set; both start out empty.
        self.doing = multiprocessing.Value('i', 0)
        self._doing = set()
        self.result = set()  # presumably the collected urls -- TODO confirm
        self.loop = None
项目:veripress    作者:veripress    | 项目源码 | 文件源码
def __init__(self):
        """Initialize attributes."""
        # Use the structured version tuple instead of matching on the
        # free-form sys.version string.
        if sys.version_info[0] >= 3:
            # Python 3.x
            super().__init__(convert_charrefs=False)
        else:
            # use HTMLParser.__init__ because HTMLParser is an 'old' style class, which cannot be passed to super()
            # see http://codependentcodr.blogspot.com/2012/02/python-htmlparser-and-super.html
            HTMLParser.__init__(self)

        self._root = _HtmlHeaderNode(level=0)  # root node with no data of itself, only 'children' matters
        self._curr_node = self._root  # most recently handled header node
        self._in_header = False
        self._header_id_count = {}  # record header ids to avoid collisions
        self._html = ''  # full HTML string parsed
        self._temp_start_tag = ''  # temporary HTML start tag of this current header node
项目:search-plugins    作者:qbittorrent    | 项目源码 | 文件源码
def __init__(self, results, url):
            """Store the result sink and url, then reset all per-torrent state.

            :param results: object kept as ``self.results``
            :param url: url kept as ``self.url``
            """
            HTMLParser.__init__(self)
            self.results, self.url = results, url
            self.add_query = True
            self.current_item = {} # One torrent result
            # Meta data bookkeeping
            self.torrent_info_index = 0 # Count of the meta data encountered
            self.meta_data_grabbing = 0
            self.torrent_info_array = []
            self.meta_data_array = []
            # Numeric fields start at zero.
            self.torrent_no_files = self.torrent_date_added = self.torrent_popularity = 0
            # Text fields start out empty.
            self.mangnet_link = self.desc_link = self.torrent_name = ""
项目:grasp    作者:textgain    | 项目源码 | 文件源码
def __init__(self, model, label, data=[]):
        """ Returns a new Model calibrated on the given data,
            which is a set of (vector, label)-tuples.

            The result is a lookup table self._f that maps each
            probability p in 0.00, 0.01, ..., 1.00 to a calibrated value.
            With no data the table is built from the (0, 0) and (1, 1)
            end points only.
        """
        self._model = model
        self._label = label
        # Isotonic regression:
        y = ((model.predict(v)[label], label == x) for v, x in data)
        y = sorted(y) # monotonic
        # A zip object is always truthy on Python 3, so the fallback for
        # empty data must be applied to the materialized list.
        y = list(zip(*y)) or [(), ()]
        x = list(y[0])
        y = list(y[1])
        y = pav(y)
        # Pin both ends so that every p in [0, 1] falls inside a segment.
        x = [0] + x + [1]
        y = [0] + y + [1]
        f = {}
        i = 0
        # Linear interpolation:
        for p in range(100 + 1):
            p *= 0.01
            # x is sorted and p only grows, so i never has to move back.
            while x[i] < p:
                i += 1
            f[p] = (y[i-1] * (x[i] - p) + y[i] * (p - x[i-1])) / (x[i] - x[i-1])
        self._f = f
项目:grasp    作者:textgain    | 项目源码 | 文件源码
def __init__(self, path='WordNet-3.0'):
        """ Opens the WordNet database from the given path 
            (that contains dict/index.noun, dict/data.noun, ...)

            The data.* files are left open and kept in self._f.
            The index.* files are read completely and then closed.
        """
        self._f = {} # {'n': <open file 'dict/index.noun'>}

        for k, v in (('n', 'noun'), ('v', 'verb'), ('a', 'adj' ), ('r', 'adv' )):

            # Data file: deliberately kept open for later use.
            self._f[k] = open(cd(path, 'dict',  'data.%s' % v), 'rb')

            # Index file: the with-block closes it even if a line fails to parse.
            with open(cd(path, 'dict', 'index.%s' % v), 'r') as f:
                for s in f:
                    # Lines starting with a space are skipped.
                    if not s.startswith(' '):
                        s = s.strip()
                        s = s.split(' ')
                        # Field 2 holds a count; that many trailing fields are kept.
                        p = s[-int(s[2]):]
                        w = s[0]
                        w = w.replace('_', ' ')
                        self[w, k] = p # {('grasp', 'n'): (offset1, ...)}
项目:pipelines    作者:gis-rpd    | 项目源码 | 文件源码
def __init__(
        self,
        decode_html_entities=False,
        data_separator=' ',
    ):
        """Initialise the parser with empty table state.

        :param decode_html_entities: stored as ``self._parse_html_entities``
        :param data_separator: stored as ``self._data_separator``
        """
        HTMLParser.__init__(self)

        # Options
        self._data_separator = data_separator
        self._parse_html_entities = decode_html_entities

        # Neither inside a <td> nor a <th> at the start.
        self._in_td = self._in_th = False

        # Accumulators, from the innermost to the outermost level.
        self._current_cell = []
        self._current_row = []
        self._current_table = []
        self.tables = []
项目:asyncmultitasks    作者:willwinworld    | 项目源码 | 文件源码
def get_links(html):
    """Return the href value of every <a> start tag found in ``html``.

    The collected list is also printed between two marker lines before it
    is returned.
    """
    class URLSeeker(HTMLParser):
        """Parser that accumulates anchor targets in ``self.urls``."""

        def __init__(self):
            # Explicit call of the base-class initializer.
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            # Only anchors with a non-empty href are of interest.
            if tag != 'a':
                return
            target = dict(attrs).get('href')
            if target:
                self.urls.append(target)

    seeker = URLSeeker()
    seeker.feed(html)
    marker = '@@'*20
    print(marker)
    print(seeker.urls)
    print(marker)
    return seeker.urls
项目:portalcheck    作者:pi0    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser with empty table state."""
        HTMLParser.__init__(self)
        # Neither inside a <td> nor a <th> at the start.
        self._in_td = self._in_th = False
        # Accumulators, from the innermost to the outermost level.
        self._current_cell = []
        self._current_row = []
        self._current_table = []
        self.tables = []
项目:progrobot    作者:petr-kalinin    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser with empty buffers and cleared flags."""
        HTMLParser.__init__(self)
        # Two independent, initially empty buffers.
        self.buf, self.last_text = [], []
        self.tag_count = 0
        self.current_tag = None
        self.hide_output = False
项目:learn-python    作者:xushubo    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser with empty event lists and cleared flags."""
        HTMLParser.__init__(self)
        # One result list per event field.
        self.event_time, self.event_title, self.event_location = [], [], []
        # Flags, one per event field, all cleared at the start.
        self.in_time = self.in_title = self.in_location = False
项目:B.E.N.J.I.    作者:the-ethan-hunt    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Initialise the underlying HTMLParser with the given arguments."""
        HTMLParser.__init__(self, *args, **kwargs)

        # Empty-element tags that were seen without an explicit closing
        # tag are remembered here. When a closing tag of such a type does
        # turn up, it is matched against one of these entries.
        #
        # Order is irrelevant, so this is a plain list rather than a
        # stack: it holds the closing tags that have already been handled
        # and will be ignored if they ever appear.
        already_closed = []
        self.already_closed_empty_element = already_closed
项目:B.E.N.J.I.    作者:the-ethan-hunt    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Store the given arguments as ``self.parser_args``.

        ``strict`` and ``convert_charrefs`` are forced to False in the
        keyword arguments when the module-level CONSTRUCTOR_* flags say so.
        """
        force_non_strict = CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED
        if force_non_strict:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        stored_arguments = (args, kwargs)
        self.parser_args = stored_arguments
项目:gprime    作者:GenealogyCollective    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Initialise the parent class and attach a StyledNoteFormatter as ``self.snf``."""
        # All arguments are forwarded unchanged to the parent initializer.
        super().__init__(*args, **kwargs)
        # The formatter receives this instance as its only argument.
        self.snf = StyledNoteFormatter(self)
项目:gprime    作者:GenealogyCollective    | 项目源码 | 文件源码
def __init__(self, form):
        """Keep the form and its database, and create a WebAppBackend.

        :param form: object providing a ``database`` attribute
        """
        self.form = form
        self.database = form.database
        backend = WebAppBackend()
        # The backend builds its links through this object's build_link.
        backend.build_link = self.build_link
        self._backend = backend
项目:gprime    作者:GenealogyCollective    | 项目源码 | 文件源码
def __init__(self, form):
        """Initialise the parser with empty text, tag and stack state.

        :param form: object kept as ``self.form``
        """
        HTMLParser.__init__(self)
        self.form = form
        # Private state, all empty at the start.
        self.__stack = []
        self.__tags = {}
        self.__text = ""
项目:core-python    作者:yidao620c    | 项目源码 | 文件源码
def __init__(self):
        """Prepare the collected state and the link pattern, then initialise HTMLParser."""
        # Matches paths of the form /doc/<digits>.
        self.patt = re.compile(r'^/doc/\d+$')
        self.linkname = ''
        self.href = 0
        self.data = []
        HTMLParser.__init__(self)
项目:core-python    作者:yidao620c    | 项目源码 | 文件源码
def __init__(self):
        """Prepare the collected state and the link pattern, then initialise HTMLParser."""
        # Matches query strings of the form ?p=<digits>.
        self.patt = re.compile(r'^\?p=\d+$')
        self.href = 0
        self.data = set()
        HTMLParser.__init__(self)
项目:mooder    作者:phith0n    | 项目源码 | 文件源码
def __init__(self, allows = []):
        """Initialise the parser and choose the allowed tags.

        :param allows: allowed tags; when empty, the ``allow_tags`` value
            already available on the object is kept
        """
        HTMLParser.__init__(self)
        self.allow_tags = allows or self.allow_tags
        # Three independent, initially empty lists.
        self.result, self.start, self.data = [], [], []
项目:xiaodi    作者:shenaishiren    | 项目源码 | 文件源码
def __init__(self, allows = []):
        """Initialise the parser and choose the allowed tags.

        :param allows: allowed tags; when empty, the ``allow_tags`` value
            already available on the object is kept
        """
        HTMLParser.__init__(self)
        self.allow_tags = allows or self.allow_tags
        # Three independent, initially empty lists.
        self.result, self.start, self.data = [], [], []
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, warnaction = None, warngoal = sys.stderr, caller_id = 0):
        """Create the warnings state and register the first caller.

        :param warnaction: warn action for ``caller_id``; None means "default"
        :param warngoal: stream kept as ``self.warngoal``
        :param caller_id: id of the caller; becomes the first entry of ``self._ids``
        """
        self.warn_lock = RLock()
        self.onceregistry = {}
        self.filters = []
        # The id list is brand new, so the caller can be registered without
        # checking for an existing entry first.
        self._ids = [caller_id]
        self.warngoal = warngoal
        if warnaction is None:
            warnaction = "default"

        self.set_warnaction(warnaction, caller_id)
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, dtree, parent = None):
        """Create a node and, when a parent node is given, link it into the tree.

        :param dtree: tree object kept as ``self.dtree``
        :param parent: parent node; None marks this node as a root
        """
        self.node_lock = RLock()
        with self.node_lock:
            self.dtc = DataTreeConstants()
            self.children = []
            self.dtree = dtree
            self.parent = parent
            self.value = None
            self.child_index = 0
            self.level = 0
            self.links = {"values": {}, "nodes": {}}
            self.end_links = {"values": {}, "nodes": {}}

            self.is_root = self.parent is None
            # Walk up through the parents until a root node is reached.
            n = self
            while not n.is_root:
                n = n.parent

            self.root = n
            if isinstance(parent, DATAnode):
                self.parent.append_child(self)
                self.level = parent.level + 1
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, dtree, data = None, parent = None):
        """Create an HTML node from a tag name or a ``[tag, attributes]`` list.

        :param dtree: tree passed on to DATAnode.__init__
        :param data: either the tag name as a string, or a list whose first
            item is the tag name and whose optional second item is a
            list/tuple of (name, value) pairs
        :param parent: parent node passed on to DATAnode.__init__
        """
        self.tag = u''
        self.text = u''
        self.tail = u''
        self.attributes = {}
        self.attr_names = []
        DATAnode.__init__(self, dtree, parent)
        with self.node_lock:
            if isinstance(data, (str, unicode)):
                self.tag = data.lower().strip()

            elif isinstance(data, list):
                if len(data) > 0:
                    self.tag = data[0].lower().strip()

                if len(data) > 1 and isinstance(data[1], (list, tuple)):
                    for pair in data[1]:
                        attr_name = pair[0].lower().strip()
                        # Only string values are stripped.
                        if isinstance(pair[1], (str, unicode)):
                            self.attributes[attr_name] = pair[1].strip()

                        else:
                            self.attributes[attr_name] = pair[1]

                    # 'class' and 'id' are listed first, then the others.
                    for preferred in ('class', 'id'):
                        if preferred in self.attributes:
                            self.attr_names.append(preferred)

                    for attr_name in self.attributes:
                        if attr_name not in self.attr_names:
                            self.attr_names.append(attr_name)
项目:DataTree    作者:tvgrabbers    | 项目源码 | 文件源码
def __init__(self, data, autoclose_tags=None, print_tags = False, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
        """Parse the HTML text ``data`` into a tree of HTMLnode objects.

        :param data: the HTML text to parse
        :param autoclose_tags: list kept as ``self.autoclose_tags``; when
            omitted a new empty list is used for every instance
        :param print_tags: flag kept as ``self.print_tags``
        :param output: forwarded unchanged to DATAtree.__init__
        :param warnaction: forwarded unchanged to DATAtree.__init__
        :param warngoal: forwarded unchanged to DATAtree.__init__
        :param caller_id: forwarded unchanged to DATAtree.__init__

        If parsing fails, a warning is issued and ``start_node`` is set to
        a NULLnode.
        """
        HTMLParser.__init__(self)
        DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
        with self.tree_lock:
            self.tree_type ='html'
            self.print_tags = print_tags
            # A default list in the signature would be shared by all
            # instances, so a new one is created here instead.
            self.autoclose_tags = [] if autoclose_tags is None else autoclose_tags
            self.is_tail = False
            self.root = HTMLnode(self, 'root')
            self.current_node = self.root
            self.last_node = None
            self.text = u''
            self.open_tags = {}
            self.count_tags(data)
            # read the html page into the tree
            try:
                # Cover for incomplete reads where the essential body part is retrieved
                for ctag in ('body', 'BODY', 'html', 'HTML', 'xml', 'XML'):
                    if u'<%s>' % (ctag, ) in data and u'</%s>' % (ctag, ) not in data:
                        data = u'%s</%s>' % (data, ctag)

                self.feed(data)
                self.reset()
                self.start_node = self.root

            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit are not reported as an invalid dataset.
            except Exception:
                self.warn('Unable to parse the HTML data. Invalid dataset!', dtDataWarning, 1)
                self.start_node = NULLnode()
项目:Python_Study    作者:thsheep    | 项目源码 | 文件源码
def get_links(html):
    """Return the href value of every <a> start tag found in ``html``."""
    class URLSeeker(HTMLParser):
        """Parser that accumulates anchor targets in ``self.urls``."""

        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            # Only anchors with a non-empty href are of interest.
            if tag != 'a':
                return
            target = dict(attrs).get('href')
            if target:
                self.urls.append(target)

    seeker = URLSeeker()
    seeker.feed(html)
    return seeker.urls
项目:netscaler-ansible-modules    作者:citrix    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser in the 'IDLE' state with empty attribute lists."""
        HTMLParser.__init__(self)
        self.entity = None
        self.state = 'IDLE'
        # One separate, initially empty list per attribute category.
        categories = ('rwattrs', 'roattrs', 'addattrs', 'updateattrs')
        self.data = {category: [] for category in categories}
        self.new_current_attr()

        self.re_json_arg = re.compile(r'"([^"]+)": *<[^>]+>')
项目:pandachaika    作者:pandabuilder    | 项目源码 | 文件源码
def __init__(self) -> None:
        """Initialise the parser with an empty gallery set and a cleared stop flag."""
        HTMLParser.__init__(self)
        self.stop_at_favorites: int = 0
        self.galleries: typing.Set[str] = set()
项目:pandachaika    作者:pandabuilder    | 项目源码 | 文件源码
def __init__(self) -> None:
        """Initialise the parser with ``empty_search`` set to 0."""
        HTMLParser.__init__(self)
        self.empty_search = 0
项目:pandachaika    作者:pandabuilder    | 项目源码 | 文件源码
def __init__(self) -> None:
        """Initialise the parser (character references converted) with cleared state."""
        HTMLParser.__init__(self, convert_charrefs=True)
        # Text fields start out empty.
        self.torrent_link = ''
        self.parent_gallery: str = ''
        self.non_final_gallery: str = ''
        # Integer flags start at zero.
        self.stop_at_found: int = 0
        self.found_gallery_link: int = 0
        self.found_parent_gallery: int = 0
        self.found_non_final_gallery: int = 0
项目:pandachaika    作者:pandabuilder    | 项目源码 | 文件源码
def __init__(self) -> None:
        """Initialise the parser (character references converted) with cleared state."""
        HTMLParser.__init__(self, convert_charrefs=True)
        # Text fields start out empty.
        self.torrent = self.posted_date = ''
        # Counters and flags start at zero.
        self.seeds = self.found_seed_data = self.found_posted_data = 0
项目:pandachaika    作者:pandabuilder    | 项目源码 | 文件源码
def __init__(self) -> None:
        """Initialise the parser with an empty ``archive`` string."""
        HTMLParser.__init__(self)
        self.archive = ''
项目:hakkuframework    作者:4shadoww    | 项目源码 | 文件源码
def __init__(self, **kwargs):
        """Initialise the parser, keep ``kwargs`` and reset all table state."""
        HTMLParser.__init__(self)
        self.kwargs = kwargs
        # Collected tables and the rows currently being built.
        self.tables, self.rows, self.last_row = [], [], []
        self.max_row_width = 0
        # Parsing position: nothing active, no content seen, no header row.
        self.active = None
        self.last_content = ""
        self.is_last_row_header = False
项目:MCSManager-fsmodule    作者:Suwings    | 项目源码 | 文件源码
def __init__(self, allows = []):
        """Initialise the parser and choose the allowed tags.

        :param allows: allowed tags; when empty, the ``allow_tags`` value
            already available on the object is kept
        """
        HTMLParser.__init__(self)
        self.allow_tags = allows or self.allow_tags
        # Three independent, initially empty lists.
        self.result, self.start, self.data = [], [], []
项目:ankimaker    作者:carllacan    | 项目源码 | 文件源码
def __init__(self, lang):
        """Set up the lookup state for ``lang``, then initialise HTMLParser.

        :param lang: language of the word being looked up
        """
        self.lang = lang
        # Both flags are cleared: not yet inside the wanted language
        # section, and not yet collecting definitions.
        self.in_lang = self.getting_defs = False
        self.pos = ""  # current part of speech
        self.trans = {}  # part of speech -> translations
        HTMLParser.__init__(self)
项目:ShelbySearch    作者:Agentscreech    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Initialise the underlying HTMLParser with the given arguments."""
        HTMLParser.__init__(self, *args, **kwargs)

        # Empty-element tags that were seen without an explicit closing
        # tag are remembered here. When a closing tag of such a type does
        # turn up, it is matched against one of these entries.
        #
        # Order is irrelevant, so this is a plain list rather than a
        # stack: it holds the closing tags that have already been handled
        # and will be ignored if they ever appear.
        already_closed = []
        self.already_closed_empty_element = already_closed
项目:ShelbySearch    作者:Agentscreech    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Store the given arguments as ``self.parser_args``.

        ``strict`` and ``convert_charrefs`` are forced to False in the
        keyword arguments when the module-level CONSTRUCTOR_* flags say so.
        """
        force_non_strict = CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED
        if force_non_strict:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        stored_arguments = (args, kwargs)
        self.parser_args = stored_arguments
项目:campus-network-login    作者:xiadingZ    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser with a zero count and no id."""
        HTMLParser.__init__(self)
        self.count, self.id = 0, None
项目:Weeds    作者:seamile    | 项目源码 | 文件源码
def __init__(self, url, with_subdomain=False):
        """Initialise the parser and split ``url`` into its parts.

        :param url: url handed to ``self.parse_url``, which must return
            the (protocol, domain, path) triple
        :param with_subdomain: flag kept as ``self.with_subdomain``
        """
        HTMLParser.__init__(self)
        url_parts = self.parse_url(url)
        self.protocol, self.domain, self.path = url_parts
        self.with_subdomain = with_subdomain
        self.links = set()
项目:veripress    作者:veripress    | 项目源码 | 文件源码
def __init__(self, level):
        """Initialize attributes."""
        # header level of the element, e.g. 1 for <h1>, 2 for <h2>, etc
        self.level = level
        # 'id' is the anchor id (in-page link) used in the 'id' and 'href'
        # attribute of an 'a' tag, 'text' is the pure text content of the
        # header tag (e.g. 'Title' for '<h1>Title</h1>'), 'inner_html' is
        # the inner HTML
        self.id = self.text = self.inner_html = ''
        # tree links: the direct father node, and the elements with lower
        # levels that directly follow this element
        self.father = None
        self.children = []
项目:GUIYoutube    作者:coltking    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Initialise the underlying HTMLParser with the given arguments."""
        HTMLParser.__init__(self, *args, **kwargs)

        # Empty-element tags that were seen without an explicit closing
        # tag are remembered here. When a closing tag of such a type does
        # turn up, it is matched against one of these entries.
        #
        # Order is irrelevant, so this is a plain list rather than a
        # stack: it holds the closing tags that have already been handled
        # and will be ignored if they ever appear.
        already_closed = []
        self.already_closed_empty_element = already_closed
项目:GUIYoutube    作者:coltking    | 项目源码 | 文件源码
def __init__(self, *args, **kwargs):
        """Store the given arguments as ``self.parser_args``.

        ``strict`` and ``convert_charrefs`` are forced to False in the
        keyword arguments when the module-level CONSTRUCTOR_* flags say so.
        """
        force_non_strict = CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED
        if force_non_strict:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            kwargs['convert_charrefs'] = False
        stored_arguments = (args, kwargs)
        self.parser_args = stored_arguments
项目:zhihu-oauth    作者:7sDream    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser; the output list starts with the base HTML header."""
        HTMLParser.__init__(self)
        self._prettified = [_BASE_HTML_HEADER]
        self._in_code = False
        self._last = ''
        self._level = 0
项目:zhihu-oauth    作者:7sDream    | 项目源码 | 文件源码
def __init__(self, value=None):
        """Keep ``value`` (None by default) as ``self._value``."""
        self._value = value
项目:softuni-course-calendar-scraper    作者:Enether    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the parser with ``lectures`` set to 0."""
        HTMLParser.__init__(self)
        self.lectures = 0
项目:softuni-course-calendar-scraper    作者:Enether    | 项目源码 | 文件源码
def __init__(self):
        """Initialise the underlying HTMLParser; no further state is set up here."""
        HTMLParser.__init__(self)
项目:My-Web-Server-Framework-With-Python2.7    作者:syjsu    | 项目源码 | 文件源码
def __init__(self, **kwargs):
        """Initialise the parser, keep ``kwargs`` and reset all table state."""
        HTMLParser.__init__(self)
        self.kwargs = kwargs
        # Collected tables and the rows currently being built.
        self.tables, self.rows, self.last_row = [], [], []
        self.max_row_width = 0
        # Parsing position: nothing active, no content seen, no header row.
        self.active = None
        self.last_content = ""
        self.is_last_row_header = False