Python lxml.etree module: LxmlError() example source code

We have extracted the following 6 code examples from open-source Python projects to illustrate how to use lxml.etree.LxmlError().
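
Before the project examples, a minimal self-contained sketch (not taken from any of the projects below) of the basic pattern: lxml.etree.LxmlError is the common base class of lxml's own exceptions, such as XMLSyntaxError and XPathEvalError, so catching it covers lxml's own parse and evaluation errors.

from lxml import etree

def try_parse(markup):
    """Return the parsed root element, or None if lxml rejects the input."""
    try:
        return etree.fromstring(markup)
    except etree.LxmlError as exc:   # base class of XMLSyntaxError, XPathEvalError, ...
        print('parse failed:', exc)
        return None

try_parse('<root><item/></root>')    # returns an _Element
try_parse('<root><item></root>')     # prints "parse failed: ..." and returns None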

Project: htmltab    Author: flother    | Project source | File source
from lxml.cssselect import SelectorError
from lxml.etree import LxmlError


def select_elements(doc, select):
    """
    Return the elements within ``doc`` that match the selector
    ``select``. The selector can be an index, a CSS selector, or an
    XPath expression.
    """
    try:
        int(select)
        elements = doc.xpath("(//table)[{}]".format(int(select)))
    except ValueError:
        # Expression wasn't a valid integer so try to use it as a CSS selector.
        try:
            elements = doc.cssselect(select)
        except SelectorError:
            # Nope, not a valid CSS expression. Last attempt is to try it as an
            # XPath expression.
            try:
                elements = doc.xpath(select)
            except LxmlError:
                # Catch the specific LXML error and raise a more generic error
                # because the problem could lie with any of the index, CSS
                # selector, or XPath expression.
                raise ValueError("'{}' not an index, CSS selector, or XPath "
                                 "expression".format(select))
    return elements
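
A brief usage sketch (the HTML string and selector values are made up; the CSS branch assumes the cssselect package is installed):

import lxml.html

doc = lxml.html.fromstring('<html><body><table><tr><td>cell</td></tr></table></body></html>')
select_elements(doc, '1')           # index: the first <table> in the document
select_elements(doc, 'table td')    # CSS selector
select_elements(doc, '//td')        # XPath expression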
Project: squid_dedup    Author: frispete    | Project source | File source
import logging

from lxml import etree

log = logging.getLogger(__name__)   # stand-in for the project's own logger


def extract(pagedata, pagefile):
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret

    for table in root.findall('.//table[@class="mirrortable"]'):
        country = None
        for e in table.iter('a', 'td', 'th'):
            if e.tag == 'th' and e.text:
                country = e.text
            elif e.tag == 'a' and e.text and e.text.startswith('http'):
                url = e.get('href')
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
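
A hypothetical call (the HTML fragment and file name are made up) showing what extract() returns for a minimal mirror table:

html = ('<table class="mirrortable">'
        '<tr><th>Germany</th></tr>'
        '<tr><td><a href="http://mirror.example.org">http://mirror.example.org</a></td></tr>'
        '</table>')
extract(html, 'mirrors.html')   # -> [('http://mirror.example.org/', 'Germany')]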
Project: mes    Author: osess    | Project source | File source
def from_xml(self, content, forbid_dtd=True, forbid_entities=True):
    """
    Given some XML data, returns a Python dictionary of the decoded data.

    By default, XML entity declarations and DTDs will raise a BadRequest
    exception, but subclasses may choose to override this if necessary.
    """
    if lxml is None:
        raise ImproperlyConfigured("Usage of the XML aspects requires lxml and defusedxml.")

    try:
        parsed = parse_xml(StringIO(content), forbid_dtd=forbid_dtd,
                           forbid_entities=forbid_entities)
    except (LxmlError, DefusedXmlException):
        raise BadRequest

    return self.from_etree(parsed.getroot())
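
Here parse_xml, DefusedXmlException, BadRequest, ImproperlyConfigured and from_etree all come from the surrounding project. A standalone sketch of the same idea, assuming only that defusedxml and lxml are installed (the function name safe_parse is made up):

from io import StringIO

from defusedxml.common import DefusedXmlException
from defusedxml.lxml import parse
from lxml.etree import LxmlError

def safe_parse(content):
    """Parse untrusted XML, rejecting malformed as well as unsafe input."""
    try:
        # forbid_dtd/forbid_entities make defusedxml reject DTDs and entity declarations
        return parse(StringIO(content), forbid_dtd=True, forbid_entities=True)
    except (LxmlError, DefusedXmlException):
        raise ValueError('invalid or unsafe XML')

safe_parse('<doc><item>1</item></doc>')   # returns an lxml ElementTree
# safe_parse('<doc>')                     # raises ValueError (XMLSyntaxError is an LxmlError)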
Project: htmltab    Author: flother    | Project source | File source
import lxml.html
from bs4 import UnicodeDammit


def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode
    using Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or
    parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html",
                                 is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
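
A short usage sketch (the file name is hypothetical; UnicodeDammit accepts raw bytes, so the file is opened in binary mode):

with open('page.html', 'rb') as f:
    doc = parse_html(f.read())
tables = select_elements(doc, '1')   # e.g. combined with the select_elements() snippet above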
Project: squid_dedup    Author: frispete    | Project source | File source
def extract(pagedata, pagefile):
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret

    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return ret

    country = None
    for e in table.iter('a', 'td'):
        if e.tag == 'td':
            cc = None
            for se in e:
                if se.tag == 'img':
                    cc = se.get('alt')
                    break
            if cc:
                country = '%s (%s)' % (e.xpath('string()').strip(), cc)
        elif e.tag == 'a':
            if e.text == 'HTTP':
                url = e.get('href')
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
Project: squid_dedup    Author: frispete    | Project source | File source
def generate(url, pagedata, pagefile, redirfile, repl):
    log.info('generate %s', redirfile)
    mtime = os.stat(pagefile).st_mtime
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return 2

    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return 3

    try:
        fd = open(redirfile, 'w')
    except Exception as e:
        log.error(e)
    else:
        fd.write('''#
# this file was automatically generated based on
# %s from %s
#
#abort   .html
#abort   .jpg
#abort   .png
#abort   .jpeg
#abort   .gif
#abort   .html
#abort   .shtml
#abort   .java
#abort   .jar
#abort   .htm

# openSUSE Headquarter
regexi ^http://download.opensuse.org/(.*)$ %s
''' % (url, email.utils.formatdate(mtime, localtime=True), repl))
        country = None
        for e in table.iter('a', 'td'):
            if e.tag == 'td':
                cc = None
                for se in e:
                    if se.tag == 'img':
                        cc = se.get('alt')
                        break
                if cc:
                    c = '# %s (%s)\n' % (e.xpath('string()').strip(), cc)
                    if c != country:
                        country = c
                        fd.write(c)
            elif e.tag == 'a':
                if e.text == 'HTTP':
                    url = e.get('href')
                    if not url.endswith('/'):
                        url += '/'
                    fd.write('regexi ^%s(.*)$ %s\n' % (url, repl))
        fd.write('\n')
        fd.close()
    return 0