Python lxml.etree 模块,LxmlError() 实例源码


项目:htmltab    作者:flother    | 项目源码 | 文件源码
def select_elements(doc, select):
    """
    Return the elements within ``doc`` that match the selector ``select``.

    The selector is tried, in order, as a table index, as a CSS selector,
    and finally as an XPath expression. The first interpretation that
    works wins.

    :param doc: parsed document object; must provide ``xpath()`` and
        ``cssselect()`` methods.
    :param select: an integer (or a string holding one), a CSS selector,
        or an XPath expression.
    :returns: whatever the successful ``xpath()`` / ``cssselect()`` call
        returned.
    :raises ValueError: if ``select`` works as none of the three.
    """
    try:
        # An integer selects the N-th <table> in document order.
        elements = doc.xpath("(//table)[{}]".format(int(select)))
    except ValueError:
        # Expression wasn't a valid integer so try to use it as a CSS selector.
        try:
            elements = doc.cssselect(select)
        except SelectorError:
            # Nope, not a valid CSS expression. Last attempt is to try it as an
            # XPath expression.
            try:
                elements = doc.xpath(select)
            except LxmlError:
                # Catch the specific LXML error and raise a more generic error
                # because the problem could lie with any of the index, CSS
                # selector, or XPath expression.
                raise ValueError("'{}' not an index, CSS selector, or XPath "
                                 "expression".format(select))
    return elements
项目:squid_dedup    作者:frispete    | 项目源码 | 文件源码
def extract(pagedata, pagefile):
    """
    Collect mirror URLs from the ``mirrortable`` tables of an HTML page.

    :param pagedata: HTML markup, handed to ``etree.HTML``.
    :param pagefile: name of the page; used only in log messages.
    :returns: list of ``(url, country)`` tuples. ``country`` is the text of
        the most recent non-empty ``<th>`` seen in the same table, or
        ``None`` if there was none. Every URL ends with ``/``. The list is
        empty when the markup can't be parsed.
    """
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret
    if root is None:
        # NOTE(review): the HTML parser may hand back no root element for
        # (near-)empty input instead of raising; treat that as malformed too.
        log.error('<%s> malformed: %s', pagefile, 'empty document')
        return ret

    for table in root.findall('.//table[@class="mirrortable"]'):
        country = None
        for e in table.iter('a', 'td', 'th'):
            if e.tag == 'th' and e.text:
                country = e.text
            # e.text is None for links without leading text (e.g. an <a>
            # wrapping only an <img>), so test it before calling startswith.
            elif e.tag == 'a' and e.text and e.text.startswith('http'):
                url = e.get('href')
                if url is None:
                    # Anchor without an href: nothing to record.
                    continue
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
项目:mes    作者:osess    | 项目源码 | 文件源码
def from_xml(self, content, forbid_dtd=True, forbid_entities=True):
    """
    Decode XML text and return ``self.from_etree()`` applied to its root.

    By default DTDs and XML entity declarations are rejected; subclasses
    may override the two ``forbid_*`` flags if they need to accept them.

    :param content: the XML document as a string.
    :param forbid_dtd: passed through to ``parse_xml``.
    :param forbid_entities: passed through to ``parse_xml``.
    :raises ImproperlyConfigured: if ``lxml`` is not available.
    :raises BadRequest: if parsing fails with an lxml or defusedxml error.
    """
    if lxml is None:
        raise ImproperlyConfigured("Usage of the XML aspects requires lxml and defusedxml.")

    try:
        parsed = parse_xml(StringIO(content), forbid_dtd=forbid_dtd,
                           forbid_entities=forbid_entities)
    except (LxmlError, DefusedXmlException):
        # Parser-level failures are reported to the caller as a bad request
        # rather than leaking library-specific exception types.
        raise BadRequest

    return self.from_etree(parsed.getroot())
项目:htmltab    作者:flother    | 项目源码 | 文件源码
def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode
    using Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or
    parsed.

    :param html_file: the HTML input handed to ``UnicodeDammit``.
    :returns: the result of ``lxml.html.fromstring`` on the decoded markup.
    :raises ValueError: if ``UnicodeDammit`` produced no markup (``None``)
        or empty markup.
    """
    # NOTE(review): this call was truncated after ``smart_quotes_to`` in the
    # copy under review; any further keyword arguments the original passed
    # are not recoverable from here -- compare against upstream.
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html")
    # NOTE(review): confirm which of None / empty string UnicodeDammit uses
    # for "no input" versus "undetectable encoding"; the two messages below
    # are kept exactly as found.
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
项目:squid_dedup    作者:frispete    | 项目源码 | 文件源码
def extract(pagedata, pagefile):
    """
    Collect HTTP mirror URLs from the first table carrying a ``summary``
    attribute.

    :param pagedata: HTML markup, handed to ``etree.HTML``.
    :param pagefile: name of the page; used only in log messages.
    :returns: list of ``(url, country)`` tuples. ``country`` is built from
        the most recent ``<td>`` that contained an ``<img>`` with a
        non-empty ``alt``, formatted as ``"<cell text> (<alt>)"``, or
        ``None`` if no such cell preceded the link. Every URL ends with
        ``/``. The list is empty when the page can't be parsed or has no
        such table.
    """
    ret = []
    try:
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return ret
    if root is None:
        # NOTE(review): the HTML parser may hand back no root element for
        # (near-)empty input instead of raising; treat that as malformed too.
        log.error('<%s> malformed: %s', pagefile, 'empty document')
        return ret

    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return ret

    country = None
    for e in table.iter('a', 'td'):
        if e.tag == 'td':
            # A cell names a country only if it holds a flag image; the
            # image's alt text supplies the code shown in parentheses.
            cc = None
            for se in e:
                if se.tag == 'img':
                    cc = se.get('alt')
            if cc:
                country = '%s (%s)' % (e.xpath('string()').strip(), cc)
        elif e.tag == 'a':
            if e.text == 'HTTP':
                url = e.get('href')
                if url is None:
                    # Anchor without an href: nothing to record.
                    continue
                if not url.endswith('/'):
                    url += '/'
                ret.append((url, country))
    return ret
项目:squid_dedup    作者:frispete    | 项目源码 | 文件源码
# generate(url, pagedata, pagefile, redirfile, repl)
#
# From what is visible: parses ``pagedata`` as HTML, locates the first table
# with a ``summary`` attribute, opens ``redirfile`` for writing, emits a
# header templated with ``url``, the formatted mtime of ``pagefile`` and
# ``repl``, and then writes one ``regexi ^<mirror-url>(.*)$ <repl>`` line for
# every ``<a>`` in that table whose text is exactly ``HTTP``.
#
# Visible return codes: 2 when the HTML can't be parsed, 3 when the summary
# table is missing, 0 at the end of the function.
#
# NOTE(review): this copy is damaged and does not compile as-is. Known gaps:
#   * the ``def`` line is fused with the tail of a logging call
#     (``'generate %s', redirfile)``);
#   * the ``try:`` lines before ``etree.HTML(...)`` and ``open(...)`` are
#     missing;
#   * the body of ``except Exception as e:`` is missing;
#   * the opening of the triple-quoted header template (and whatever call
#     wraps it) is missing, so its text currently reads as ``#`` comments;
#   * the ``regexi ^*)$ %s`` template line looks truncated.
# Restore these from the upstream project before relying on this function.
def generate(url, pagedata, pagefile, redirfile, repl):'generate %s', redirfile)
    mtime = os.stat(pagefile).st_mtime
        root = etree.HTML(pagedata)
    except etree.LxmlError as e:
        log.error('<%s> malformed: %s', pagefile, e)
        return 2

    table = root.find('.//table[@summary]')
    if table is None:
        log.error('<%s> malformed: summary table not found', pagefile)
        return 3

        fd = open(redirfile, 'w')
    except Exception as e:
# this file was automatically generated based on
# %s from %s
#abort   .html
#abort   .jpg
#abort   .png
#abort   .jpeg
#abort   .gif
#abort   .html
#abort   .shtml
#abort   .java
#abort   .jar
#abort   .htm

# openSUSE Headquarter
regexi ^*)$ %s
''' % (url, email.utils.formatdate(mtime, localtime = True), repl))
        country = None
        for e in table.iter('a', 'td'):
            if e.tag == 'td':
                cc = None
                for se in e:
                    if se.tag == 'img':
                        cc = se.get('alt')
                if cc:
                    c = '# %s (%s)\n' % (e.xpath('string()').strip(), cc)
                    if c != country:
                        country = c
            elif e.tag == 'a':
                if e.text == 'HTTP':
                    url = e.get('href')
                    if not url.endswith('/'):
                        url += '/'
                    fd.write('regexi ^%s(.*)$ %s\n' % (url, repl))
    return 0