Python lxml.html module: HTMLParser() code examples

We extracted the following 15 code examples from open-source Python projects to illustrate how to use lxml.html.HTMLParser().
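Before the project code, here is a minimal sketch of the basic pattern all of the examples below share (the markup is illustrative only):

from lxml.html import fromstring, HTMLParser

# recover=True (the default) makes lxml repair broken markup instead of
# raising; an explicit encoding avoids mis-detection on byte input
parser = HTMLParser(encoding='utf-8', recover=True)
root = fromstring(b'<p>caf\xc3\xa9</p>', parser=parser)
print(root.text_content())  # café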

Project: ChemDataExtractor | Author: mcs07
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError requires five arguments, so raise ValueError instead
        raise ValueError('Failed to detect encoding, tried [%s]' % converted.tried_encodings)
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
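This excerpt relies on names imported elsewhere in ChemDataExtractor (UnicodeDammit from bs4, fromstring, HTMLParser, Element, and the project's own BLOCK_ELEMENTS). A stripped-down sketch of its recover-and-detect-encoding pattern, assuming only that bs4 and lxml are installed:

from bs4 import UnicodeDammit
from lxml.html import fromstring, HTMLParser

def parse_messy_html(raw_bytes):
    # let UnicodeDammit guess the encoding, then feed it to a recovering parser
    converted = UnicodeDammit(raw_bytes)
    parser = HTMLParser(recover=True, encoding=converted.original_encoding)
    return fromstring(raw_bytes, parser=parser)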
Project: BlogSpider | Author: hack4code
def __call__(self, doc, encoding='UTF-8'):
        if isinstance(doc,
                      (str, bytes)):
            doc = fromstring(bytes(bytearray(doc,
                                             encoding=encoding)),
                             parser=HTMLParser(encoding=encoding))
        if not isinstance(doc,
                          HtmlElement):
            return None
        for cls in self.EXTRACTORS:
            extract = cls()
            tags_ = extract(doc)
            if tags_:
                tags = []
                for idx, tag in enumerate(tags_):
                    # give up on this extractor if either of the first two
                    # candidates is suspiciously long
                    if idx < 2 and len(tag) > 16:
                        break
                    elif len(tag) < 16:
                        tags.append(tag)
                else:
                    # for/else: reached only when the loop was not broken
                    if tags:
                        logger.info('TagExtractor got tags %s',
                                    tags)
                    return tags
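Note the for/else: the else suite runs only when the loop finishes without break, so one overlong tag among the first two candidates abandons this extractor's whole list. A tiny stand-alone demonstration of that control flow:

for tag in ['python', 'lxml']:
    if len(tag) > 16:
        break
else:
    print('no break happened, so the else suite runs')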
Project: ChemDataExtractor | Author: mcs07
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
        log.debug('Parsing {} with {}'.format(fmt, parser))
        root = fromstring(text, parser=parser(recover=True, encoding=cls._get_encoding(text, encoding)), base_url=base_url)
        if base_url and hasattr(root, 'make_links_absolute'):
            root.make_links_absolute()
        return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
Project: ChemDataExtractor | Author: mcs07
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
        return cls.from_text(text, base_url=base_url, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=namespaces, encoding=encoding)
Project: ChemDataExtractor | Author: mcs07
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
        return cls.from_text(response.content, response.url, parser, translator, fmt, namespaces=namespaces, encoding=response.encoding)
Project: ChemDataExtractor | Author: mcs07
def from_html(cls, response, namespaces=None):
        return cls.from_response(response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=namespaces)
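Taken together, these four classmethods form one construction API: from_text does the actual parsing, and the others are thin wrappers that pin the parser, translator, and format. A hypothetical usage sketch, assuming the class is ChemDataExtractor's Selector and response is a requests-style response object:

sel = Selector.from_html_text('<p>Hello</p>', base_url='http://example.com')
sel = Selector.from_html(response)  # same, but from an HTTP response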
Project: dbapi | Author: acrazing
def to_xml(content, **kwargs):
        return html.fromstring(html=content, parser=html.HTMLParser(encoding='utf-8'), **kwargs)
Project: BlogSpider | Author: hack4code
def get_media_requests(self, item, info):
        doc = item['content']
        if isinstance(doc,
                      (str, bytes)):
            doc = fromstring(doc,
                             parser=HTMLParser(encoding=item['encoding']))
            item['content'] = doc

        try:
            attr = self.spiderinfo.spider.image_url_attr
        except AttributeError:
            attr = 'src'

        urls = []
        for e in doc.xpath('//img'):
            if attr in e.attrib:
                url = e.get(attr).strip(' \t\n')
                if url.startswith('/'):
                    url = urljoin(item['link'].strip(),
                                  url)
                    if url.startswith('//'):
                        url = 'http:' + url
                urls.append((url, e))

        reqs = []
        for url, e in urls:
            if not url.startswith('data'):
                try:
                    r = Request(url,
                                meta={'img': e})
                except ValueError:
                    logger.error((
                        'Error in pipeline image create Request[{}]'
                        ).format(url))
                else:
                    reqs.append(r)
        return reqs
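The nested startswith('//') check covers the case where item['link'] is itself protocol-relative, because urljoin then preserves the missing scheme. A quick illustration (URLs made up):

from urllib.parse import urljoin

print(urljoin('https://example.com/post/1', '/img/a.png'))
# https://example.com/img/a.png
print(urljoin('//example.com/post/1', '/img/a.png'))
# //example.com/img/a.png -- still scheme-less, hence the 'http:' prefix above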
Project: gooderp_org | Author: osbzr
def _pretty_arch(self, arch):
        # remove_blank_text does not seem to work with HTMLParser, and
        # pretty-printing with lxml more or less requires stripping
        # whitespace: http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output
        # so serialize to XML, parse as XML (remove whitespace) then serialize
        # as XML (pretty print)
        arch_no_whitespace = etree.fromstring(
            etree.tostring(arch, encoding='utf-8'),
            parser=etree.XMLParser(encoding='utf-8', remove_blank_text=True))
        return etree.tostring(
            arch_no_whitespace, encoding='unicode', pretty_print=True)
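As the comment says, remove_blank_text does not reliably take effect with the HTML parser, so the trick is: serialize, re-parse as XML with whitespace stripping, then pretty-print. A stand-alone sketch of that roundtrip (lxml's default XML serialization of an HTML tree is well-formed, so the re-parse is safe for trees like this):

from lxml import etree, html

def pretty_print_html(fragment):
    root = html.fromstring(fragment)
    # serialize to XML, re-parse with blank text stripped
    no_blank = etree.fromstring(
        etree.tostring(root, encoding='utf-8'),
        parser=etree.XMLParser(encoding='utf-8', remove_blank_text=True))
    return etree.tostring(no_blank, encoding='unicode', pretty_print=True)

print(pretty_print_html('<div> <p>a</p> <p>b</p> </div>'))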
Project: gooderp_org | Author: osbzr
def save(self, cr, uid, res_id, value, xpath=None, context=None):
        """ Update a view section. The view section may embed fields to write

        :param str model:
        :param int res_id:
        :param str xpath: valid xpath to the tag to replace
        """
        res_id = int(res_id)

        arch_section = html.fromstring(
            value, parser=html.HTMLParser(encoding='utf-8'))

        if xpath is None:
            # value is an embedded field on its own, not a view section
            self.save_embedded_field(cr, uid, arch_section, context=context)
            return

        for el in self.extract_embedded_fields(cr, uid, arch_section, context=context):
            self.save_embedded_field(cr, uid, el, context=context)

            # transform embedded field back to t-field
            el.getparent().replace(el, self.to_field_ref(cr, uid, el, context=context))

        arch = self.replace_arch_section(cr, uid, res_id, xpath, arch_section, context=context)
        self.write(cr, uid, res_id, {
            'arch': self._pretty_arch(arch)
        }, context=context)

        view = self.browse(cr, SUPERUSER_ID, res_id, context=context)
        if view.model_data_id:
            view.model_data_id.write({'noupdate': True})
Project: gooderp_org | Author: osbzr
def field_rountrip_result(self, field, value, expected):
        model = 'website.converter.test'
        Model = self.registry(model)
        id = Model.create(
            self.cr, self.uid, {
                field: value
            })
        [record] = Model.browse(self.cr, self.uid, [id])

        e = etree.Element('span')
        field_value = 'record.%s' % field
        e.set('t-field', field_value)

        rendered = self.registry('website.qweb').render_tag_field(
            e, {'field': field_value}, '', ir_qweb.QWebContext(self.cr, self.uid, {
                'record': record,
            }, context={'inherit_branding': True}))
        element = html.fromstring(
            rendered, parser=html.HTMLParser(encoding='utf-8'))

        converter = self.registry('website.qweb').get_converter_for(
            element.get('data-oe-type'))

        value_back = converter.from_html(
            self.cr, self.uid, model, Model._fields[field], element)

        if isinstance(expected, str):
            expected = expected.decode('utf-8')
        self.assertEqual(value_back, expected)
Project: parsel-cli | Author: rmax
def fix_parsel_parser(new_type='html_html', base_type='html'):
    """Fix a custom parser for parsel using lxml.html.HTMLParser.

    The main reason is to have handy helpers as make_links_absolute method.
    """
    selector._ctgroup[new_type] = selector._ctgroup[base_type].copy()
    selector._ctgroup[new_type]['_parser'] = lxml_html.HTMLParser
    return new_type
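A hypothetical usage sketch, assuming parsel is installed and fix_parsel_parser is defined as above (the excerpt needs `from parsel import selector` and `import lxml.html as lxml_html`). parsel's stock 'html' type uses lxml.etree.HTMLParser, whose elements lack lxml.html helpers such as make_links_absolute, which is exactly what this registration works around:

from parsel import Selector

st = fix_parsel_parser()
sel = Selector(text='<a href="/about">about</a>', type=st)
sel.root.make_links_absolute('http://example.com/')
print(sel.xpath('//a/@href').extract())  # ['http://example.com/about']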
Project: BlogSpider | Author: hack4code
def process_item(self, item, spider):
        item['title'] = self.format_title(item['title'])
        doc = item['content']
        if not isinstance(doc,
                          HtmlElement):
            if isinstance(doc,
                          (str, bytes)):
                doc = fromstring(bytes(bytearray(doc,
                                                 encoding=item['encoding'])),
                                 parser=HTMLParser(encoding=item['encoding']))
            else:
                raise ContentException((
                    'Error in content pipeline unsupported doc type[{}]'
                    ).format(doc.__class__.__name__))

        # remove element with class name for clean display
        removed_classes = getattr(spider,
                                  self.REMOVED_CLASSES_NAME,
                                  None)
        if removed_classes is not None:
            doc = self.remove_element_with_class(doc,
                                                 removed_classes)

        # remove element with xpath for clean display
        removed_xpath_nodes = getattr(spider,
                                      self.REMOVED_XPATH_NODES_NAME,
                                      None)
        if removed_xpath_nodes is not None:
            doc = self.remove_element_with_xpath(doc,
                                                 removed_xpath_nodes)
        allow_classes = getattr(spider,
                                self.ALLOW_CLASSES_NAME,
                                None)
        safe_attrs = getattr(spider,
                             self.SAFE_ATTRS_NAME,
                             None)
        doc = self.clean_html(doc,
                              allow_classes=allow_classes,
                              safe_attrs=safe_attrs)
        doc = self.make_abs_link(doc,
                                 item['link'])
        item['content'] = doc
        return item
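clean_html, remove_element_with_class, and make_abs_link are the pipeline's own helpers. A plausible stand-alone equivalent of the attribute-whitelisting step, using lxml's Cleaner (the safe_attrs set here is illustrative; in recent lxml versions the clean module ships separately as lxml_html_clean):

from lxml.html.clean import Cleaner

cleaner = Cleaner(safe_attrs_only=True,
                  safe_attrs=frozenset(['src', 'href', 'alt']))
print(cleaner.clean_html('<p class="x" onclick="evil()">text</p>'))
# roughly '<p>text</p>': event handlers and unlisted attributes are stripped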
Project: gooderp_org | Author: osbzr
def test_m2o(self):
        """ the M2O field conversion (from html) is markedly different from
        others as it directly writes into the m2o and returns nothing at all.
        """
        model = 'website.converter.test'
        field = 'many2one'

        Sub = self.registry('website.converter.test.sub')
        sub_id = Sub.create(self.cr, self.uid, {'name': "Foo"})

        Model = self.registry(model)
        id = Model.create(self.cr, self.uid, {field: sub_id})
        [record] = Model.browse(self.cr, self.uid, [id])

        e = etree.Element('span')
        field_value = 'record.%s' % field
        e.set('t-field', field_value)

        rendered = self.registry('website.qweb').render_tag_field(
            e, {'field': field_value}, '', ir_qweb.QWebContext(self.cr, self.uid, {
                'record': record,
            }, context={'inherit_branding': True}))

        element = html.fromstring(rendered, parser=html.HTMLParser(encoding='utf-8'))
        # emulate edition
        element.text = "New content"

        converter = self.registry('website.qweb').get_converter_for(
            element.get('data-oe-type'))

        value_back = converter.from_html(
            self.cr, self.uid, model, Model._fields[field], element)

        self.assertIsNone(
            value_back, "the m2o converter should return None to avoid spurious"
                        " or useless writes on the parent record")

        self.assertEqual(
            Sub.browse(self.cr, self.uid, sub_id).name,
            "New content",
            "element edition should have been written directly to the m2o record"
        )
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda | Author: SignalMedia
def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=False, encoding=self.encoding)

        try:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)

            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # the input is a url, but lxml failed to load it
                scheme = parse_url(self.io).scheme
                if scheme not in _valid_schemes:
                    # lxml can't parse it
                    msg = ('%r is not a valid url scheme, valid schemes are '
                           '%s') % (scheme, _valid_schemes)
                    raise ValueError(msg)
                else:
                    # something else happened: maybe a faulty connection
                    raise
        else:
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
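Unlike most of the examples above, this one passes recover=False, so lxml raises on malformed markup instead of silently repairing it, which lets the caller fall back to another strategy. A small sketch of the difference (the broken markup is made up):

from lxml.html import fromstring, HTMLParser
from lxml.etree import XMLSyntaxError

broken = '<html><body><p>cell</div></body></html>'
try:
    fromstring(broken, parser=HTMLParser(recover=False))
except XMLSyntaxError:
    print('strict parser rejected the markup')
root = fromstring(broken, parser=HTMLParser(recover=True))  # parses anyway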