Python lxml.html module: tostring() code examples

We extracted the following 46 code examples from open-source Python projects to illustrate how to use lxml.html.tostring().
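
Before the project samples, here is a minimal sketch of the call itself: tostring() serializes an element and its subtree back to markup, returning bytes by default and text only when asked for it.

from lxml import html

root = html.fromstring('<div><p>Hello <b>world</b></p></div>')
print(html.tostring(root))                      # bytes by default: b'<div><p>Hello <b>world</b></p></div>'
print(html.tostring(root, encoding='unicode'))  # str when encoding='unicode' is passed
print(html.tostring(root, pretty_print=True, method='html'))  # indented bytes output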

Project: base_function    Author: Rockyzsu    | project source | file source
def lxml_test():
    url = "http://www.caixunzz.com"
    req = urllib2.Request(url=url)
    resp = urllib2.urlopen(req)
    #print resp.read()
    '''
    parse_body=html.fromstring(resp.read())
    href=parse_body.xpath('//a[@class="label"]/@href')
    print href
    #not working from above
    '''

    tree = etree.HTML(resp.read())
    href = tree.xpath('//a[@class="label"]/@href')
    #print href.tag
    for i in href:
        #print html.tostring(i)
        #print type(i)
        print i

    print type(href)

#not working yet
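
The snippet above is Python 2 (urllib2, print statements). A rough Python 3 equivalent, assuming the same page structure:

from urllib.request import urlopen
from lxml import etree

def lxml_test_py3():
    body = urlopen("http://www.caixunzz.com").read()
    tree = etree.HTML(body)
    # @href matches come back as plain strings
    for href in tree.xpath('//a[@class="label"]/@href'):
        print(href)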
Project: BlogSpider    Author: hack4code    | project source | file source
def process_item(self, item, spider):
        if item is not None:
            doc = item['content']
            if not isinstance(doc,
                              (str, bytes)):
                if isinstance(doc,
                              HtmlElement):
                    item['content'] = tostring(doc,
                                               encoding='UTF-8',
                                               pretty_print=True,
                                               method='html')
                    item['encoding'] = 'UTF-8'
                else:
                    raise Exception(
                        'Error in store pipeline: unsupported doc type [{}]'
                        .format(doc.__class__.__name__))

            item_ = dict(item)
            item_['lang'] = get_article_lang(item)
            item_['spider'] = spider._id
            item_['source'] = spider.title
            item_['category'] = get_category(item_)
            if not is_exists_article(item_):
                save_article(item_)
        return item
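
A detail the pipeline above relies on: tostring(doc, encoding='UTF-8') returns UTF-8 bytes, which is why the encoding is recorded on the item; only encoding='unicode' yields str. A quick sketch of the distinction:

from lxml import html

el = html.fromstring('<p>caf\u00e9</p>')
as_bytes = html.tostring(el, encoding='UTF-8')    # b'<p>caf\xc3\xa9</p>'
as_text = html.tostring(el, encoding='unicode')   # '<p>café</p>'
assert isinstance(as_bytes, bytes) and isinstance(as_text, str)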
Project: gooderp_org    Author: osbzr    | project source | file source
def setUp(self):
        super(TestViewSaving, self).setUp()
        self.arch = h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                    h.LI(h.SPAN("+00 00 000 00 0 000", attrs(model='res.company', id=1, field='phone', type='char')))
                ))
        )
        self.view_id = self.registry('ir.ui.view').create(self.cr, self.uid, {
            'name': "Test View",
            'type': 'qweb',
            'arch': ET.tostring(self.arch, encoding='utf-8').decode('utf-8')
        })
Project: ingestors    Author: alephdata    | project source | file source
def ingest(self, file_path):
        """Ingestor implementation."""
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError):
            raise ProcessingException("XML could not be parsed.")

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc,
                                  encoding='unicode',
                                  pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
Project: WebAutomaiton    Author: AlvinXuCH    | project source | file source
def WriteHTML(self,testcaseinfo):

        self.CreateHtmlFile()

        f = open(self.reportfile,"r")

        htmlcontent = f.read()
        f.close()
        #tree = mytree.fromstring(str(htmlcontent))
        htmlcontent.encode('utf-8')  # note: a no-op -- encode() returns a new value that is discarded
        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")
        if testcaseinfo.result == "Failed":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        else:
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        f = open(self.reportfile,"w")
        # html.tostring() returns bytes; repr() plus the replaces below strip the b'...' wrapper
        newContent = repr(html.tostring(tree,method="html",with_tail=False))
        newContent = newContent.replace(r"\n","").replace(r"\t","").replace('b\'',"")
        newContent = newContent[:len(newContent)-1]
        f.write(newContent)
        f.close()
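
The repr() round-trip above strips the b'...' wrapper through string surgery; decoding the bytes does the same job directly. A sketch of an equivalent write (write_report is a hypothetical name):

from lxml import html

def write_report(tree, path):
    # tostring() returns bytes here, so decode instead of repr()-munging
    content = html.tostring(tree, method="html", with_tail=False).decode('utf-8')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)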
Project: crestify    Author: crestify    | project source | file source
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper doesn't close its <li> tags, which caused infinite
            #  recursion when the file was fed to BeautifulSoup directly.
            #  Round-tripping through lxml below closes the <li> tags first.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Project: idealoom    Author: conversence    | project source | file source
def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
                if f.tag in ('p', 'br'):
                    yield '\n'
        else:
            yield f
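
The isinstance() checks above exist because fragments_fromstring() yields bare strings (leading text) alongside HtmlElement nodes, for example:

from lxml import html

frags = html.fragments_fromstring('leading text<p>para</p>tail')
# -> ['leading text', <Element p>]; the trailing text rides on <p> as its tail
for f in frags:
    if isinstance(f, html.HtmlElement):
        print(html.tostring(f, encoding='unicode'))  # '<p>para</p>tail'
    else:
        print(f)                                     # 'leading text'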
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def totext(self, elem):
        return self.tostring(elem, encoding=unicode, method='text').strip()
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_results_page(self, root):  # {{{
        from lxml.html import tostring

        matches = []

        def title_ok(title):
            title = title.lower()
            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
            for x in bad:
                if x in title:
                    return False
            # if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
            #     # Bad entries in the catalog
            #     return False
            return True

        for a in root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]'):
            # title = a.get('title')
            # if title_ok(title):
            url = a.get('href')
            if url.startswith('/'):
                url = 'http://product.dangdang.com/%s' % (url)
            matches.append(url)

        # Keep only the top 5 matches; Amazon sorts results by relevance,
        # so lower matches are unlikely to be relevant
        return matches[:5]
    # }}}
Project: zing    Author: evernote    | project source | file source
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text

    return mark_safe(tostring(fragment, encoding=unicode))
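
encoding=unicode here relies on the Python 2 builtin; under Python 3 the same request is spelled with the string 'unicode':

from lxml.html import fromstring, tostring

fragment = fromstring('<a href="http://example.com">some long anchor text</a>')
# Python 2: tostring(fragment, encoding=unicode)  -- unicode is the py2 builtin type
print(tostring(fragment, encoding='unicode'))     # Python 3 spelling; returns str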
Project: base_function    Author: Rockyzsu    | project source | file source
def lxml_case3():


    text = '''
    <div>
        <ul>
             <li class="item-0"><a href="link1.html">first item</a></li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-inactive"><a href="link3.html">third item><span>Hello world</span></a></li>
             <li class="item-1"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a>
             <li class="de-item-0"><a href="link5.html">fifth item</a>
         </ul>
     </div>
    '''

    tree=etree.HTML(text)
    html_s=etree.tostring(tree)
    #print html_s
    #print tree.xpath('//li//span/text()')[0]
    '''
    reg_case=tree.xpath('//*[starts-with(@class,"item")]')
    for i in reg_case:
        print i.xpath('.//a/@href')
    '''
    # the EXSLT regexp functions need the 're' namespace mapping below,
    # otherwise xpath() raises XPathEvalError
    result=tree.xpath(r'//*[re:match(@class, "item-0")]',
                      namespaces={'re': 'http://exslt.org/regular-expressions'})
    print result

    for i in result:
        print i.xpath('.//a/@href')
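
lxml exposes the EXSLT regular-expression functions only when the re prefix is mapped to its namespace; a minimal standalone sketch using re:test:

from lxml import etree

tree = etree.HTML('<ul><li class="item-0"><a href="link1.html">x</a></li></ul>')
ns = {'re': 'http://exslt.org/regular-expressions'}
for li in tree.xpath('//li[re:test(@class, "^item-0$")]', namespaces=ns):
    print(etree.tostring(li))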
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_body(self):
            html = '''<body><p>test</p></body>'''
            res = b'''<html><body><p>test</p></body></html>'''
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_head_body(self):
            # HTML tag missing, parser should fix that
            html = '<head><title>test</title></head><body><p>test</p></body>'
            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_wrap_html(self):
            # <head> outside <html>, parser should fix that
            html = '<head><title>title</test></head><html><body/></html>'
            res = b'<html><head><title>title</title></head><body></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_comment_pi(self):
            html = '''<!-- comment -->
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tostring(tree, method='html'), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_doctype1(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_doctype_html5(self):
            # html 5 doctype declaration
            html = b'<!DOCTYPE html>\n<html lang="en"></html>'

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertTrue(tree.docinfo.public_id is None)
            self.assertEqual(tostring(tree), html)
Project: europarl    Author: chozelinek    | project source | file source
def get_language(self, s_intervention, p, i_lang, new_paragraphs):
        language = p.xpath('.//span[@class="italic"][text()[re:test(.,"^[\xad\s\.—–\-?,\(]*({})[\xad\s\.—–\-?,\)]*")]]'.format('|'.join(self.langs)), namespaces=self.ns)
        if len(language) > 0 and not self.explanations_of_vote.match(language[0].text):
            lang = re.match(
                r'.*({}).*'.format('|'.join(self.langs)),
                language[0].text)
            output = lang.group(1)
            for l in language:
                l.drop_tree()
        else:
            p = html.tostring(p, with_tail=True, encoding='utf-8').decode('utf-8')
            lang_in_text = re.search(
                r'\(({})\)'.format('|'.join(self.langs)),
                p)
            if lang_in_text is not None:
                output = lang_in_text.group(1)
                p = re.sub(r'\(({})\) *'.format('|'.join(self.langs)), r'', p)
            else:
                if len(new_paragraphs) == 0:
                    if 'role' in s_intervention.keys():
                        president_pattern = '|'.join(self.loc['president'])
                        if re.match(r'{}\Z'.format(president_pattern), s_intervention['role']):
                            output = 'unknown'
                        else:
                            if i_lang is None:
                                output = self.language.upper()
                            else:
                                output = i_lang
                    else:
                        if i_lang is None:
                            output = self.language.upper()
                        else:
                            output = i_lang
                else:
                    output = new_paragraphs[-1]['language']
            p = html.fromstring(p)
        return output, p
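
The with_tail=True round-trip above matters because lxml stores the text that follows an element (its tail) on the element itself, so serializing without it would silently drop text between siblings:

from lxml import html

div = html.fromstring('<div><span>inner</span> trailing text</div>')
span = div.find('span')
print(html.tostring(span, encoding='unicode'))                   # '<span>inner</span> trailing text'
print(html.tostring(span, with_tail=False, encoding='unicode'))  # '<span>inner</span>'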
Project: europarl    Author: chozelinek    | project source | file source
def serialize(self, infile, root):
        ofile_name = os.path.splitext(os.path.basename(infile))[0]
        ofile_path = os.path.join(self.outdir, ofile_name+'.xml')
        xml = etree.tostring(
            root,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True).decode('utf-8')
        with open(ofile_path, mode='w', encoding='utf-8') as ofile:
            ofile.write(xml)
        pass
Project: europarl    Author: chozelinek    | project source | file source
def serialize(self, infile, root):
        ofile_name = os.path.splitext(os.path.basename(infile))[0]
        ofile_path = os.path.join(self.outdir, ofile_name+'.xml')
        xml = etree.tostring(
            root,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True).decode('utf-8')
        with open(ofile_path, mode='w', encoding='utf-8') as ofile:
            ofile.write(xml)
        pass
Project: europarl    Author: chozelinek    | project source | file source
def get_name(self, tree):
        name = tree.xpath('//li[@class="mep_name"]')[0]
        name = self.rm_a.clean_html(name)
        name = html.tostring(name).decode('utf-8')
        name = re.sub(r'[\t\n]', r'', name)
        name = name.split('<br>')
        name = [html.fromstring(x).text_content() for x in name]
        name = ' '.join(name)
        return name
Project: html-telegraph-poster    Author: mercuree    | project source | file source
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode')
Project: gooderp_org    Author: osbzr    | project source | file source
def from_html(self, cr, uid, model, field, element, context=None):
        content = []
        if element.text: content.append(element.text)
        content.extend(html.tostring(child)
                       for child in element.iterchildren(tag=etree.Element))
        return '\n'.join(content)
Project: gooderp_org    Author: osbzr    | project source | file source
def test_save(self):
        Company = self.registry('res.company')
        View = self.registry('ir.ui.view')

        replacement = ET.tostring(h.DIV(
            h.H3("Column 2"),
            h.UL(
                h.LI("wob wob wob"),
                h.LI(h.SPAN("Acme Corporation", attrs(model='res.company', id=1, field='name', expression="bob", type='char'))),
                h.LI(h.SPAN("+12 3456789", attrs(model='res.company', id=1, field='phone', expression="edmund", type='char'))),
            )
        ), encoding='utf-8')
        View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
                  xpath='/div/div[2]')

        company = Company.browse(self.cr, self.uid, 1)
        self.assertEqual(company.name, "Acme Corporation")
        self.assertEqual(company.phone, "+12 3456789")
        self.eq(
            ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
            h.DIV(
                h.DIV(
                    h.H3("Column 1"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI("Item 2"),
                        h.LI("Item 3"))),
                h.DIV(
                    h.H3("Column 2"),
                    h.UL(
                        h.LI("wob wob wob"),
                        h.LI(h.SPAN({'t-field': "bob"})),
                        h.LI(h.SPAN({'t-field': "edmund"}))
                    ))
            )
        )
Project: gooderp_org    Author: osbzr    | project source | file source
def test_save_only_embedded(self):
        Company = self.registry('res.company')
        company_id = 1
        Company.write(self.cr, self.uid, company_id, {'name': "Foo Corporation"})

        node = html.tostring(h.SPAN(
            "Acme Corporation",
            attrs(model='res.company', id=company_id, field="name", expression='bob', type='char')))

        self.registry('ir.ui.view').save(self.cr, self.uid, res_id=company_id,value=node)

        company = Company.browse(self.cr, self.uid, company_id)
        self.assertEqual(company.name, "Acme Corporation")
Project: gooderp_org    Author: osbzr    | project source | file source
def test_field_tail(self):
        View = self.registry('ir.ui.view')
        replacement = ET.tostring(
            h.LI(h.SPAN("+12 3456789", attrs(
                        model='res.company', id=1, type='char',
                        field='phone', expression="edmund")),
                 "whop whop"
        ), encoding="utf-8")
        View.save(self.cr, self.uid, res_id = self.view_id, value=replacement,
                  xpath='/div/div[2]/ul/li[3]')

        self.eq(
            ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
            h.DIV(
                h.DIV(
                    h.H3("Column 1"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI("Item 2"),
                        h.LI("Item 3"))),
                h.DIV(
                    h.H3("Column 2"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                        h.LI(h.SPAN({'t-field': "edmund"}), "whop whop"),
                    ))
            )
        )
Project: danube-delta    Author: honzajavorek    | project source | file source
def modify_html(content, prop='_content'):
    html_string = getattr(content, prop)
    html_tree = html.fromstring(html_string)

    yield html_tree

    html_string = html.tostring(html_tree, encoding='unicode')
    html_string = re.sub(r'%7B(\w+)%7D', r'{\1}', html_string)
    html_string = re.sub(r'%7C(\w+)%7C', r'|\1|', html_string)
    setattr(content, prop, html_string)
Project: krauler    Author: occrp-attic    | project source | file source
def get_content(self, page, meta):
        if not page.is_html:
            return page.content

        check_path = self.config.data.get('check_path')
        if check_path is not None:
            if page.doc.find(check_path) is None:
                log.info("Failed XML path check: %r", page.url)
                return None

        for meta_el in ['title', 'author', 'date']:
            path = self.config.data.get('%s_path' % meta_el)
            if path is not None and page.doc.findtext(path):
                meta[meta_el] = page.doc.findtext(path)

        if 'date' in meta:
            try:
                date = meta.pop('date')
                date = parse(date)
                if 'dates' not in meta:
                    meta['dates'] = []
                meta['dates'].append(date.isoformat())
            except Exception as ex:
                log.exception(ex)

        body = page.doc
        if self.config.data.get('body_path') is not None:
            body = page.doc.find(self.config.data.get('body_path'))

        for path in self.config.data.get('remove_paths', []):
            for el in body.findall(path):
                el.drop_tree()

        return html.tostring(body)
Project: xcrawler    Author: 0xE8551CCB    | project source | file source
def parse_movie_details(self, response):
        html_root = html.fromstring(response.content,
                                    base_url=response.base_url)

        movie_info = dict()
        # the original dict key here was Chinese text mangled to '??' during
        # extraction; it holds the movie title scraped from the page header
        movie_info['??'] = self.xpath_first(html_root,
                                            '//div[@id="content"]'
                                            '/h1/span[1]/text()').strip()

        try:
            # to pure text
            soup = BeautifulSoup(html.tostring(
                self.xpath_first(html_root,
                                 '//div[@id="info"]')), 'html')
        except TypeError:
            return None
        else:
            for line in soup.get_text().splitlines():
                try:
                    left, *right = line.split(':')
                except AttributeError:
                    pass
                else:
                    key = left.strip()
                    value = ''.join(x.strip() for x in right)

                    if key and value:
                        movie_info[key] = value

            yield movie_info
Project: nom    Author: frnsys    | project source | file source
def test_convert_spans(self):
        expected = '''
            <p>
                <em><strong>
                    foobar
                    <em>
                        lala
                        <strong>
                            yum
                        </strong>
                    </em>
                    <span>
                        hey hey
                    </span>
                    <strong>
                        uh oh
                    </strong>
                    <span>
                        yes
                    </span>
                </strong></em>
            </p>
        '''

        h = fromstring(html)
        for span in h.findall('.//span'):
            html2md.convert_span(span)
        result = tostring(h).decode('utf-8')

        results = [x.replace('\n', '').replace(' ', '') for x in [result, expected]]
        print('=========')
        print(results[0])
        print('=========')
        print(results[1])
        self.assertEqual(results[0], results[1])
Project: nom    Author: frnsys    | project source | file source
def html_to_markdown(html):
    """convert html to markdown.
    this will try and convert span styling
    to the proper tags as well.

    e.g. `<span style='font-weight:bold;'>foo</span>`
    will become `<strong>foo</strong>`.
    """
    h = fromstring(html)

    clean_highlighted_code(h)
    for span in h.findall('.//span') + h.findall('.//font'):
        convert_span(span)

    html = tostring(h).decode('utf-8')

    # not ideal but works in a pinch
    html = html.replace('<mark>', '==')
    html = html.replace('</mark>', '==')

    md = to_md(html)

    # sometimes html2text returns a ton of extra whitespace.
    # clean up lines with only whitespace.
    # condense line break streaks of 3 or more.
    md = re.sub(r'\n([\s\*_]+)\n', '\n\n', md)
    md = re.sub(r'\n{3,}', '\n\n', md)

    return md
Project: nom    Author: frnsys    | project source | file source
def rewrite_links(raw_html, rewrite_func):
    """
    Take an HTML input string, rewrite links according
    to the `rewrite_func`, return the rewritten HTML string.
    """
    html = fromstring(raw_html)
    html.rewrite_links(rewrite_func)
    return tostring(html)
Project: reahl    Author: reahl    | project source | file source
def view_source(self):
        for line in html.tostring(self.lxml_html, pretty_print=True, encoding='unicode').split('\n'): 
            print(line)
Project: reahl    Author: reahl    | project source | file source
def get_html_for(self, locator):
        """Returns the HTML of the element (including its own tags) targeted by the given `locator`

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        xpath = six.text_type(locator)
        element = self.xpath(xpath)[0]
        return html.tostring(element, encoding='unicode')
Project: reahl    Author: reahl    | project source | file source
def get_inner_html_for(self, locator):
        """Returns the HTML of the children of the element targeted by the given `locator` (excluding the 
           element's own tags).

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        xpath = six.text_type(locator)
        element = self.xpath(xpath)[0]
        return ''.join(html.tostring(child, encoding='unicode') for child in element.getchildren())
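
Joining the children's serializations as above drops element.text, i.e. any text that appears before the first child tag. A variant that keeps it (inner_html is a hypothetical helper, not part of reahl) could look like:

from lxml import html

def inner_html(element):
    # element.text is the text before the first child; each child's tail
    # is included by tostring() by default (with_tail=True)
    parts = [element.text or '']
    parts.extend(html.tostring(child, encoding='unicode') for child in element)
    return ''.join(parts)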
Project: spider    Author: luanxiangming    | project source | file source
def WriteHTML(self, testcaseinfo):

        self.CreateHtmlFile()

        f = open(self.reportfile, "r")

        htmlcontent = f.read()
        f.close()
        # tree = mytree.fromstring(str(htmlcontent))
        htmlcontent.encode('utf-8')  # note: a no-op -- encode() returns a new value that is discarded
        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")
        if testcaseinfo.result == "Failed":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        elif testcaseinfo.result == "Pass":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#00FF00\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        else:
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        f = open(self.reportfile, "w")
        # html.tostring() returns bytes; repr() plus the replaces below strip the b'...' wrapper
        newContent = repr(html.tostring(tree, method="html", with_tail=False))
        newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
        newContent = newContent[:len(newContent) - 1]
        f.write(newContent)
        f.close()
Project: all2rss    Author: Sendarg    | project source | file source
def process_content(jsonBody,item_dict):
    entry = json.loads(jsonBody)
    content=Xhtml.fromstring(entry['body'])
    # get author
    # print item_dict['json_url']
    try:
        author=content.xpath('//span[@class="author"]/text()')[0].strip()
    except IndexError:
        author = ''
    try:
        bio=content.xpath('//span[@class="bio"]/text()')[0].strip()
    except IndexError:
        bio=''
    item_dict['author'] = author + bio

    coverelement = Element('img')
    coverelement.set('src', item_dict['cover'])
    content.insert(0, coverelement)

    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet zhihu items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['author']
    print item_dict['link']
    return item_dict
Project: all2rss    Author: Sendarg    | project source | file source
def process_content(html,item_dict):
    root = Xhtml.fromstring(html)
    # locate the article body element (original Chinese comment lost to encoding)
    try:
        content = root.xpath('//*[@class="article-content"]')[0]
    except IndexError:
        return ''
    #
    item_dict['cover'] = None
    imgs = root.xpath('//img[@src]')
    if imgs:
        for img in imgs:
            src=img.attrib['src'].strip()
            if (not item_dict['cover']) and  src[-3:].lower() in ['jpg','png','gif'] :
                item_dict['cover']='http:'+src
                # build a cover <img> and insert it at the top of the content
                # (original Chinese comment lost to encoding)
                coverelement = Element('img')
                coverelement.set('src', item_dict['cover'])
                content.insert(0, coverelement)
            elif src[:22]=="data:image/png;base64,":
                img.set("src","")
            else:
                pass


    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet jaq items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['desc']
    print item_dict['link']
    return item_dict
Project: maas    Author: maas    | project source | file source
def __init__(self, failure):
        traceback = html.Element("pre")
        traceback.text = failure.getTraceback()
        super(StartFailedPage, self).__init__(
            status=int(SERVICE_UNAVAILABLE), brief="MAAS failed to start",
            detail=html.tostring(traceback, encoding=str))
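
Passing encoding=str asks for a native text string, equivalent to encoding='unicode' on Python 3; a quick check:

from lxml import html

el = html.Element("pre")
el.text = "traceback goes here"
out = html.tostring(el, encoding=str)
assert out == '<pre>traceback goes here</pre>' and isinstance(out, str)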
Project: pytracking    Author: resulto    | project source | file source
def adapt_html(
        html_text, extra_metadata, click_tracking=True, open_tracking=True,
        configuration=None, **kwargs):
    """Changes an HTML string by replacing links (<a href...>) with tracking
    links and by adding a 1x1 transparent pixel just before the closing body
    tag.

    :param html_text: The HTML to change (unicode or bytestring).
    :param extra_metadata: A dict that can be json-encoded and that will
        be encoded in the tracking link.
    :param click_tracking: If links (<a href...>) must be changed.
    :param open_tracking: If a transparent pixel must be added before the
        closing body tag.
    :param configuration: An optional Configuration instance.
    :param kwargs: Optional configuration parameters. If provided with a
        Configuration instance, the kwargs parameters will override the
        Configuration parameters.
    """
    configuration = get_configuration(configuration, kwargs)

    tree = html.fromstring(html_text)

    if click_tracking:
        _replace_links(tree, extra_metadata, configuration)

    if open_tracking:
        _add_tracking_pixel(tree, extra_metadata, configuration)

    new_html_text = html.tostring(tree)

    return new_html_text.decode("utf-8")
Project: idealoom    Author: conversence    | project source | file source
def _clean_html(html_value, cleaner):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            cleaner(f)
            yield html.tostring(f, encoding="unicode")
        else:
            yield f
Project: weasyl    Author: Weasyl    | project source | file source
def markdown(target, image=False):
    fragment = _markdown_fragment(target, image)
    return html.tostring(fragment, encoding=unicode)[5:-6]  # strip the wrapping '<div>' and '</div>'
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                        e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html
        import html5lib
        # html5lib parses <noscript> content as CDATA

        desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')), \
                                      treebuilder='lxml', namespaceHTMLElements=False)[0]
        # the quoted search strings below were Chinese phrases mangled to
        # '????' during extraction; they matched section headings on the page
        matches = desc.xpath('descendant::*[contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????")]/../*[self::p or self::div or self::span]')

        if matches:
            if len(matches)>1:
                desc = matches[-1]
                for item in matches:
                    content_len = len(self.totext(item))
                    if content_len > 50 and content_len < 200:
                        desc = item
                        break

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        #
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='text', encoding=unicode).strip()
        # return desc
        # Work around an encoding bug in the source data: U+FFFD (the
        # replacement character) sometimes appears in place of an apostrophe
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_series(self, root):
        ans = (None, None)

        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
            series = series[0]
            spans = series.xpath('./span')
            if spans:
                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
                m = re.search('\s+([0-9.]+)$', raw.strip())
                if m is not None:
                    series_index = float(m.group(1))
                    s = series.xpath('./a[@id="series-page-link"]')
                    if s:
                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        # This is found on Kindle edition pages on amazon.com
        if ans == (None, None):
            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
                text = (span.text or '').strip()
                m = re.match('Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = span.xpath('./a[@href]')
                    if a:
                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        if ans == (None, None):
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding=unicode)
                raw = re.sub(r'\s+', ' ', raw)
                match = self.series_pat.search(raw)
                if match is not None:
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
        if ans[0]:
            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans
Project: zendesk-utils    Author: trailbehind    | project source | file source
def update_zendesk_article_html(self):
    '''
    rewrite the HTML of Zendesk articles so that anchor tags point at the
    new Zendesk articles instead of the old UserVoice articles
    '''
    print "**UPDATING HTML to switch anchor hrefs to zendesk"
    url = '{}/api/v2/help_center/categories/{}/articles.json'.format(self.zendesk_url, self.zendesk_destination_category_id)

    articles = []
    while url:
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to get get article list with error {}'.format(response.status_code))
        exit()
      data = response.json()
      for article in data['articles']:
        articles.append(article)
      url = data['next_page']

    print "UPDATING HTML for {} articles".format(len(articles))
    for article in articles:
      url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
        exit()
      html_doc = fromstring(article['body'])
      for anchor_tag in html_doc.cssselect('a'):
        if not anchor_tag.get('href'):
          continue
        number_from_string_regex = re.search('(\d+)', anchor_tag.get('href'))
        if not number_from_string_regex:
          continue
        uv_id = int(number_from_string_regex.group(0))
        if uv_id in self.uvid_to_zdid:
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, self.uvid_to_zdid[uv_id])
          response = requests.get(url, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to get article {} with error {}'.format(self.uvid_to_zdid[uv_id], response.status_code))
            exit()
          new_url = response.json()['article']['html_url']
          try:
            print('CHANGING {} to {}'.format(anchor_tag.get('href'), new_url))
          except:
            e = sys.exc_info()[0]
            print "lxml parsing error {}".format(e)
          anchor_tag.set('href', new_url)
          info = {
            'body': tostring(html_doc)
          }
          payload = json.dumps({'article': info})
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
          response = requests.put(url, data=payload, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
            exit()
        else:
          print "SKIPPING this href {}".format(anchor_tag.get('href'))