Python lxml.html module: tostring() code examples

We extracted the following 46 code examples from open-source Python projects to illustrate how to use lxml.html.tostring().
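
Before the project samples, here is a minimal sketch of the call itself: tostring() serializes an element and its subtree back to markup, returning bytes by default and text only when asked for it.

from lxml import html

root = html.fromstring('<div><p>Hello <b>world</b></p></div>')
print(html.tostring(root))                      # bytes by default: b'<div><p>Hello <b>world</b></p></div>'
print(html.tostring(root, encoding='unicode'))  # str when encoding='unicode' is passed
print(html.tostring(root, pretty_print=True, method='html'))  # indented bytes output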

Project: base_function    Author: Rockyzsu    | project source | file source
def lxml_test():
    url = "http://www.caixunzz.com"
    req = urllib2.Request(url=url)
    resp = urllib2.urlopen(req)
    #print resp.read()
    '''
    parse_body=html.fromstring(resp.read())
    href=parse_body.xpath('//a[@class="label"]/@href')
    print href
    #not working from above
    '''

    tree = etree.HTML(resp.read())
    href = tree.xpath('//a[@class="label"]/@href')
    #print href.tag
    for i in href:
        #print html.tostring(i)
        #print type(i)
        print i

    print type(href)

#not working yet
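
The snippet above is Python 2 (urllib2, print statements). A rough Python 3 equivalent, assuming the same page structure:

from urllib.request import urlopen
from lxml import etree

def lxml_test_py3():
    body = urlopen("http://www.caixunzz.com").read()
    tree = etree.HTML(body)
    # @href matches come back as plain strings
    for href in tree.xpath('//a[@class="label"]/@href'):
        print(href)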
Project: BlogSpider    Author: hack4code    | project source | file source
def process_item(self, item, spider):
        if item is not None:
            doc = item['content']
            if not isinstance(doc,
                              (str, bytes)):
                if isinstance(doc,
                              HtmlElement):
                    item['content'] = tostring(doc,
                                               encoding='UTF-8',
                                               pretty_print=True,
                                               method='html')
                    item['encoding'] = 'UTF-8'
                else:
                    raise Exception(
                        'Error in store pipeline: unsupported doc type [{}]'
                        .format(doc.__class__.__name__))

            item_ = dict(item)
            item_['lang'] = get_article_lang(item)
            item_['spider'] = spider._id
            item_['source'] = spider.title
            item_['category'] = get_category(item_)
            if not is_exists_article(item_):
                save_article(item_)
        return item
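
A detail the pipeline above relies on: tostring(doc, encoding='UTF-8') returns UTF-8 bytes, which is why the encoding is recorded on the item; only encoding='unicode' yields str. A quick sketch of the distinction:

from lxml import html

el = html.fromstring('<p>caf\u00e9</p>')
as_bytes = html.tostring(el, encoding='UTF-8')    # b'<p>caf\xc3\xa9</p>'
as_text = html.tostring(el, encoding='unicode')   # '<p>café</p>'
assert isinstance(as_bytes, bytes) and isinstance(as_text, str)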
Project: gooderp_org    Author: osbzr    | project source | file source
def setUp(self):
        super(TestViewSaving, self).setUp()
        self.arch = h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                    h.LI(h.SPAN("+00 00 000 00 0 000", attrs(model='res.company', id=1, field='phone', type='char')))
                ))
        )
        self.view_id = self.registry('ir.ui.view').create(self.cr, self.uid, {
            'name': "Test View",
            'type': 'qweb',
            'arch': ET.tostring(self.arch, encoding='utf-8').decode('utf-8')
        })
Project: ingestors    Author: alephdata    | project source | file source
def ingest(self, file_path):
        """Ingestor implementation."""
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError):
            raise ProcessingException("XML could not be parsed.")

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc,
                                  encoding='unicode',
                                  pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
Project: WebAutomaiton    Author: AlvinXuCH    | project source | file source
def WriteHTML(self,testcaseinfo):

        self.CreateHtmlFile()

        f = open(self.reportfile,"r")

        htmlcontent = f.read()
        f.close()
        #tree = mytree.fromstring(str(htmlcontent))
        htmlcontent.encode('utf-8')  # note: a no-op -- encode() returns a new value that is discarded
        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")
        if testcaseinfo.result == "Failed":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        else:
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        f = open(self.reportfile,"w")
        # html.tostring() returns bytes; repr() plus the replaces below strip the b'...' wrapper
        newContent = repr(html.tostring(tree,method="html",with_tail=False))
        newContent = newContent.replace(r"\n","").replace(r"\t","").replace('b\'',"")
        newContent = newContent[:len(newContent)-1]
        f.write(newContent)
        f.close()
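
The repr() round-trip above strips the b'...' wrapper through string surgery; decoding the bytes does the same job directly. A sketch of an equivalent write (write_report is a hypothetical name):

from lxml import html

def write_report(tree, path):
    # tostring() returns bytes here, so decode instead of repr()-munging
    content = html.tostring(tree, method="html", with_tail=False).decode('utf-8')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)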
Project: crestify    Author: crestify    | project source | file source
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper doesn't close its <li> tags, which caused infinite
            #  recursion when the file was fed to BeautifulSoup directly.
            #  Round-tripping through lxml below closes the <li> tags first.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Project: idealoom    Author: conversence    | project source | file source
def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
                if f.tag in ('p', 'br'):
                    yield '\n'
        else:
            yield f
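
The isinstance() checks above exist because fragments_fromstring() yields bare strings (leading text) alongside HtmlElement nodes, for example:

from lxml import html

frags = html.fragments_fromstring('leading text<p>para</p>tail')
# -> ['leading text', <Element p>]; the trailing text rides on <p> as its tail
for f in frags:
    if isinstance(f, html.HtmlElement):
        print(html.tostring(f, encoding='unicode'))  # '<p>para</p>tail'
    else:
        print(f)                                     # 'leading text'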
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def totext(self, elem):
        return self.tostring(elem, encoding=unicode, method='text').strip()
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_results_page(self, root):  # {{{
        from lxml.html import tostring

        matches = []

        def title_ok(title):
            title = title.lower()
            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
            for x in bad:
                if x in title:
                    return False
            # if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
            #     # Bad entries in the catalog
            #     return False
            return True

        for a in root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]'):
            # title = a.get('title')
            # if title_ok(title):
            url = a.get('href')
            if url.startswith('/'):
                url = 'http://product.dangdang.com/%s' % (url)
            matches.append(url)

        # Keep only the top 5 matches; Amazon sorts results by relevance,
        # so lower matches are unlikely to be relevant
        return matches[:5]
    # }}}
Project: zing    Author: evernote    | project source | file source
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text

    return mark_safe(tostring(fragment, encoding=unicode))
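
encoding=unicode here relies on the Python 2 builtin; under Python 3 the same request is spelled with the string 'unicode':

from lxml.html import fromstring, tostring

fragment = fromstring('<a href="http://example.com">some long anchor text</a>')
# Python 2: tostring(fragment, encoding=unicode)  -- unicode is the py2 builtin type
print(tostring(fragment, encoding='unicode'))     # Python 3 spelling; returns str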
Project: base_function    Author: Rockyzsu    | project source | file source
def lxml_case3():


    text = '''
    <div>
        <ul>
             <li class="item-0"><a href="link1.html">first item</a></li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-inactive"><a href="link3.html">third item><span>Hello world</span></a></li>
             <li class="item-1"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a>
             <li class="de-item-0"><a href="link5.html">fifth item</a>
         </ul>
     </div>
    '''

    tree=etree.HTML(text)
    html_s=etree.tostring(tree)
    #print html_s
    #print tree.xpath('//li//span/text()')[0]
    '''
    reg_case=tree.xpath('//*[starts-with(@class,"item")]')
    for i in reg_case:
        print i.xpath('.//a/@href')
    '''
    # the EXSLT regexp functions need the 're' namespace mapping below,
    # otherwise xpath() raises XPathEvalError
    result=tree.xpath(r'//*[re:match(@class, "item-0")]',
                      namespaces={'re': 'http://exslt.org/regular-expressions'})
    print result

    for i in result:
        print i.xpath('.//a/@href')
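
lxml exposes the EXSLT regular-expression functions only when the re prefix is mapped to its namespace; a minimal standalone sketch using re:test:

from lxml import etree

tree = etree.HTML('<ul><li class="item-0"><a href="link1.html">x</a></li></ul>')
ns = {'re': 'http://exslt.org/regular-expressions'}
for li in tree.xpath('//li[re:test(@class, "^item-0$")]', namespaces=ns):
    print(etree.tostring(li))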
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_body(self):
            html = '''<body><p>test</p></body>'''
            res = b'''<html><body><p>test</p></body></html>'''
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_head_body(self):
            # HTML tag missing, parser should fix that
            html = '<head><title>test</title></head><body><p>test</p></body>'
            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_wrap_html(self):
            # <head> outside <html>, parser should fix that
            html = '<head><title>title</test></head><html><body/></html>'
            res = b'<html><head><title>title</title></head><body></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_comment_pi(self):
            html = '''<!-- comment -->
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tostring(tree, method='html'), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_doctype1(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
            self.assertEqual(tostring(tree), res)
Project: Taigabot    Author: FrozenPigs    | project source | file source
def test_doctype_html5(self):
            # html 5 doctype declaration
            html = b'<!DOCTYPE html>\n<html lang="en"></html>'

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertTrue(tree.docinfo.public_id is None)
            self.assertEqual(tostring(tree), html)
Project: europarl    Author: chozelinek    | project source | file source
def get_language(self, s_intervention, p, i_lang, new_paragraphs):
        language = p.xpath('.//span[@class="italic"][text()[re:test(.,"^[\xad\s\.—–\-?,\(]*({})[\xad\s\.—–\-?,\)]*")]]'.format('|'.join(self.langs)), namespaces=self.ns)
        if len(language) > 0 and not self.explanations_of_vote.match(language[0].text):
            lang = re.match(
                r'.*({}).*'.format('|'.join(self.langs)),
                language[0].text)
            output = lang.group(1)
            for l in language:
                l.drop_tree()
        else:
            p = html.tostring(p, with_tail=True, encoding='utf-8').decode('utf-8')
            lang_in_text = re.search(
                r'\(({})\)'.format('|'.join(self.langs)),
                p)
            if lang_in_text is not None:
                output = lang_in_text.group(1)
                p = re.sub(r'\(({})\) *'.format('|'.join(self.langs)), r'', p)
            else:
                if len(new_paragraphs) == 0:
                    if 'role' in s_intervention.keys():
                        president_pattern = '|'.join(self.loc['president'])
                        if re.match(r'{}\Z'.format(president_pattern), s_intervention['role']):
                            output = 'unknown'
                        else:
                            if i_lang is None:
                                output = self.language.upper()
                            else:
                                output = i_lang
                    else:
                        if i_lang is None:
                            output = self.language.upper()
                        else:
                            output = i_lang
                else:
                    output = new_paragraphs[-1]['language']
            p = html.fromstring(p)
        return output, p
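
The with_tail=True round-trip above matters because lxml stores the text that follows an element (its tail) on the element itself, so serializing without it would silently drop text between siblings:

from lxml import html

div = html.fromstring('<div><span>inner</span> trailing text</div>')
span = div.find('span')
print(html.tostring(span, encoding='unicode'))                   # '<span>inner</span> trailing text'
print(html.tostring(span, with_tail=False, encoding='unicode'))  # '<span>inner</span>'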
Project: europarl    Author: chozelinek    | project source | file source
def serialize(self, infile, root):
        ofile_name = os.path.splitext(os.path.basename(infile))[0]
        ofile_path = os.path.join(self.outdir, ofile_name+'.xml')
        xml = etree.tostring(
            root,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True).decode('utf-8')
        with open(ofile_path, mode='w', encoding='utf-8') as ofile:
            ofile.write(xml)
        pass
Project: europarl    Author: chozelinek    | project source | file source
def serialize(self, infile, root):
        ofile_name = os.path.splitext(os.path.basename(infile))[0]
        ofile_path = os.path.join(self.outdir, ofile_name+'.xml')
        xml = etree.tostring(
            root,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=True).decode('utf-8')
        with open(ofile_path, mode='w', encoding='utf-8') as ofile:
            ofile.write(xml)
        pass
Project: europarl    Author: chozelinek    | project source | file source
def get_name(self, tree):
        name = tree.xpath('//li[@class="mep_name"]')[0]
        name = self.rm_a.clean_html(name)
        name = html.tostring(name).decode('utf-8')
        name = re.sub(r'[\t\n]', r'', name)
        name = name.split('<br>')
        name = [html.fromstring(x).text_content() for x in name]
        name = ' '.join(name)
        return name
Project: html-telegraph-poster    Author: mercuree    | project source | file source
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode')
Project: gooderp_org    Author: osbzr    | project source | file source
def from_html(self, cr, uid, model, field, element, context=None):
        content = []
        if element.text: content.append(element.text)
        content.extend(html.tostring(child)
                       for child in element.iterchildren(tag=etree.Element))
        return '\n'.join(content)
Project: gooderp_org    Author: osbzr    | project source | file source
def test_save(self):
        Company = self.registry('res.company')
        View = self.registry('ir.ui.view')

        replacement = ET.tostring(h.DIV(
            h.H3("Column 2"),
            h.UL(
                h.LI("wob wob wob"),
                h.LI(h.SPAN("Acme Corporation", attrs(model='res.company', id=1, field='name', expression="bob", type='char'))),
                h.LI(h.SPAN("+12 3456789", attrs(model='res.company', id=1, field='phone', expression="edmund", type='char'))),
            )
        ), encoding='utf-8')
        View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
                  xpath='/div/div[2]')

        company = Company.browse(self.cr, self.uid, 1)
        self.assertEqual(company.name, "Acme Corporation")
        self.assertEqual(company.phone, "+12 3456789")
        self.eq(
            ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
            h.DIV(
                h.DIV(
                    h.H3("Column 1"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI("Item 2"),
                        h.LI("Item 3"))),
                h.DIV(
                    h.H3("Column 2"),
                    h.UL(
                        h.LI("wob wob wob"),
                        h.LI(h.SPAN({'t-field': "bob"})),
                        h.LI(h.SPAN({'t-field': "edmund"}))
                    ))
            )
        )
Project: gooderp_org    Author: osbzr    | project source | file source
def test_save_only_embedded(self):
        Company = self.registry('res.company')
        company_id = 1
        Company.write(self.cr, self.uid, company_id, {'name': "Foo Corporation"})

        node = html.tostring(h.SPAN(
            "Acme Corporation",
            attrs(model='res.company', id=company_id, field="name", expression='bob', type='char')))

        self.registry('ir.ui.view').save(self.cr, self.uid, res_id=company_id,value=node)

        company = Company.browse(self.cr, self.uid, company_id)
        self.assertEqual(company.name, "Acme Corporation")
Project: gooderp_org    Author: osbzr    | project source | file source
def test_field_tail(self):
        View = self.registry('ir.ui.view')
        replacement = ET.tostring(
            h.LI(h.SPAN("+12 3456789", attrs(
                        model='res.company', id=1, type='char',
                        field='phone', expression="edmund")),
                 "whop whop"
        ), encoding="utf-8")
        View.save(self.cr, self.uid, res_id = self.view_id, value=replacement,
                  xpath='/div/div[2]/ul/li[3]')

        self.eq(
            ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
            h.DIV(
                h.DIV(
                    h.H3("Column 1"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI("Item 2"),
                        h.LI("Item 3"))),
                h.DIV(
                    h.H3("Column 2"),
                    h.UL(
                        h.LI("Item 1"),
                        h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                        h.LI(h.SPAN({'t-field': "edmund"}), "whop whop"),
                    ))
            )
        )
Project: danube-delta    Author: honzajavorek    | project source | file source
def modify_html(content, prop='_content'):
    html_string = getattr(content, prop)
    html_tree = html.fromstring(html_string)

    yield html_tree

    html_string = html.tostring(html_tree, encoding='unicode')
    html_string = re.sub(r'%7B(\w+)%7D', r'{\1}', html_string)
    html_string = re.sub(r'%7C(\w+)%7C', r'|\1|', html_string)
    setattr(content, prop, html_string)
Project: krauler    Author: occrp-attic    | project source | file source
def get_content(self, page, meta):
        if not page.is_html:
            return page.content

        check_path = self.config.data.get('check_path')
        if check_path is not None:
            if page.doc.find(check_path) is None:
                log.info("Failed XML path check: %r", page.url)
                return None

        for meta_el in ['title', 'author', 'date']:
            path = self.config.data.get('%s_path' % meta_el)
            if path is not None and page.doc.findtext(path):
                meta[meta_el] = page.doc.findtext(path)

        if 'date' in meta:
            try:
                date = meta.pop('date')
                date = parse(date)
                if 'dates' not in meta:
                    meta['dates'] = []
                meta['dates'].append(date.isoformat())
            except Exception as ex:
                log.exception(ex)

        body = page.doc
        if self.config.data.get('body_path') is not None:
            body = page.doc.find(self.config.data.get('body_path'))

        for path in self.config.data.get('remove_paths', []):
            for el in body.findall(path):
                el.drop_tree()

        return html.tostring(body)
Project: xcrawler    Author: 0xE8551CCB    | project source | file source
def parse_movie_details(self, response):
        html_root = html.fromstring(response.content,
                                    base_url=response.base_url)

        movie_info = dict()
        # the original dict key here was Chinese text mangled to '??' during
        # extraction; it holds the movie title scraped from the page header
        movie_info['??'] = self.xpath_first(html_root,
                                            '//div[@id="content"]'
                                            '/h1/span[1]/text()').strip()

        try:
            # to pure text
            soup = BeautifulSoup(html.tostring(
                self.xpath_first(html_root,
                                 '//div[@id="info"]')), 'html')
        except TypeError:
            return None
        else:
            for line in soup.get_text().splitlines():
                try:
                    left, *right = line.split(':')
                except AttributeError:
                    pass
                else:
                    key = left.strip()
                    value = ''.join(x.strip() for x in right)

                    if key and value:
                        movie_info[key] = value

            yield movie_info
Project: nom    Author: frnsys    | project source | file source
def test_convert_spans(self):
        expected = '''
            <p>
                <em><strong>
                    foobar
                    <em>
                        lala
                        <strong>
                            yum
                        </strong>
                    </em>
                    <span>
                        hey hey
                    </span>
                    <strong>
                        uh oh
                    </strong>
                    <span>
                        yes
                    </span>
                </strong></em>
            </p>
        '''

        h = fromstring(html)
        for span in h.findall('.//span'):
            html2md.convert_span(span)
        result = tostring(h).decode('utf-8')

        results = [x.replace('\n', '').replace(' ', '') for x in [result, expected]]
        print('=========')
        print(results[0])
        print('=========')
        print(results[1])
        self.assertEqual(results[0], results[1])
Project: nom    Author: frnsys    | project source | file source
def html_to_markdown(html):
    """convert html to markdown.
    this will try and convert span styling
    to the proper tags as well.

    e.g. `<span style='font-weight:bold;'>foo</span>`
    will become `<strong>foo</strong>`.
    """
    h = fromstring(html)

    clean_highlighted_code(h)
    for span in h.findall('.//span') + h.findall('.//font'):
        convert_span(span)

    html = tostring(h).decode('utf-8')

    # not ideal but works in a pinch
    html = html.replace('<mark>', '==')
    html = html.replace('</mark>', '==')

    md = to_md(html)

    # sometimes html2text returns a ton of extra whitespace.
    # clean up lines with only whitespace.
    # condense line break streaks of 3 or more.
    md = re.sub(r'\n([\s\*_]+)\n', '\n\n', md)
    md = re.sub(r'\n{3,}', '\n\n', md)

    return md
Project: nom    Author: frnsys    | project source | file source
def rewrite_links(raw_html, rewrite_func):
    """
    Take an HTML input string, rewrite links according
    to the `rewrite_func`, return the rewritten HTML string.
    """
    html = fromstring(raw_html)
    html.rewrite_links(rewrite_func)
    return tostring(html)
Project: reahl    Author: reahl    | project source | file source
def view_source(self):
        for line in html.tostring(self.lxml_html, pretty_print=True, encoding='unicode').split('\n'): 
            print(line)
Project: reahl    Author: reahl    | project source | file source
def get_html_for(self, locator):
        """Returns the HTML of the element (including its own tags) targeted by the given `locator`

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        xpath = six.text_type(locator)
        element = self.xpath(xpath)[0]
        return html.tostring(element, encoding='unicode')
Project: reahl    Author: reahl    | project source | file source
def get_inner_html_for(self, locator):
        """Returns the HTML of the children of the element targeted by the given `locator` (excluding the 
           element's own tags).

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        xpath = six.text_type(locator)
        element = self.xpath(xpath)[0]
        return ''.join(html.tostring(child, encoding='unicode') for child in element.getchildren())
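
Joining the children's serializations as above drops element.text, i.e. any text that appears before the first child tag. A variant that keeps it (inner_html is a hypothetical helper, not part of reahl) could look like:

from lxml import html

def inner_html(element):
    # element.text is the text before the first child; each child's tail
    # is included by tostring() by default (with_tail=True)
    parts = [element.text or '']
    parts.extend(html.tostring(child, encoding='unicode') for child in element)
    return ''.join(parts)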
Project: spider    Author: luanxiangming    | project source | file source
def WriteHTML(self, testcaseinfo):

        self.CreateHtmlFile()

        f = open(self.reportfile, "r")

        htmlcontent = f.read()
        f.close()
        # tree = mytree.fromstring(str(htmlcontent))
        htmlcontent.encode('utf-8')  # note: a no-op -- encode() returns a new value that is discarded
        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")
        if testcaseinfo.result == "Failed":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        elif testcaseinfo.result == "Pass":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#00FF00\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        else:
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
                testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
                testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        f = open(self.reportfile, "w")
        # html.tostring() returns bytes; repr() plus the replaces below strip the b'...' wrapper
        newContent = repr(html.tostring(tree, method="html", with_tail=False))
        newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
        newContent = newContent[:len(newContent) - 1]
        f.write(newContent)
        f.close()
Project: all2rss    Author: Sendarg    | project source | file source
def process_content(jsonBody,item_dict):
    entry = json.loads(jsonBody)
    content=Xhtml.fromstring(entry['body'])
    # get author
    # print item_dict['json_url']
    try:
        author=content.xpath('//span[@class="author"]/text()')[0].strip()
    except IndexError:
        author = ''
    try:
        bio=content.xpath('//span[@class="bio"]/text()')[0].strip()
    except IndexError:
        bio=''
    item_dict['author'] = author + bio

    coverelement = Element('img')
    coverelement.set('src', item_dict['cover'])
    content.insert(0, coverelement)

    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet zhihu items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['author']
    print item_dict['link']
    return item_dict
Project: all2rss    Author: Sendarg    | project source | file source
def process_content(html,item_dict):
    root = Xhtml.fromstring(html)
    # locate the article body element (original Chinese comment lost to encoding)
    try:
        content = root.xpath('//*[@class="article-content"]')[0]
    except IndexError:
        return ''
    #
    item_dict['cover'] = None
    imgs = root.xpath('//img[@src]')
    if imgs:
        for img in imgs:
            src=img.attrib['src'].strip()
            if (not item_dict['cover']) and  src[-3:].lower() in ['jpg','png','gif'] :
                item_dict['cover']='http:'+src
                # build a cover <img> and insert it at the top of the content
                # (original Chinese comment lost to encoding)
                coverelement = Element('img')
                coverelement.set('src', item_dict['cover'])
                content.insert(0, coverelement)
            elif src[:22]=="data:image/png;base64,":
                img.set("src","")
            else:
                pass


    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet jaq items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['desc']
    print item_dict['link']
    return item_dict
Project: maas    Author: maas    | project source | file source
def __init__(self, failure):
        traceback = html.Element("pre")
        traceback.text = failure.getTraceback()
        super(StartFailedPage, self).__init__(
            status=int(SERVICE_UNAVAILABLE), brief="MAAS failed to start",
            detail=html.tostring(traceback, encoding=str))
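
Passing encoding=str asks for a native text string, equivalent to encoding='unicode' on Python 3; a quick check:

from lxml import html

el = html.Element("pre")
el.text = "traceback goes here"
out = html.tostring(el, encoding=str)
assert out == '<pre>traceback goes here</pre>' and isinstance(out, str)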
Project: pytracking    Author: resulto    | project source | file source
def adapt_html(
        html_text, extra_metadata, click_tracking=True, open_tracking=True,
        configuration=None, **kwargs):
    """Changes an HTML string by replacing links (<a href...>) with tracking
    links and by adding a 1x1 transparent pixel just before the closing body
    tag.

    :param html_text: The HTML to change (unicode or bytestring).
    :param extra_metadata: A dict that can be json-encoded and that will
        be encoded in the tracking link.
    :param click_tracking: If links (<a href...>) must be changed.
    :param open_tracking: If a transparent pixel must be added before the
        closing body tag.
    :param configuration: An optional Configuration instance.
    :param kwargs: Optional configuration parameters. If provided with a
        Configuration instance, the kwargs parameters will override the
        Configuration parameters.
    """
    configuration = get_configuration(configuration, kwargs)

    tree = html.fromstring(html_text)

    if click_tracking:
        _replace_links(tree, extra_metadata, configuration)

    if open_tracking:
        _add_tracking_pixel(tree, extra_metadata, configuration)

    new_html_text = html.tostring(tree)

    return new_html_text.decode("utf-8")
Project: idealoom    Author: conversence    | project source | file source
def _clean_html(html_value, cleaner):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            cleaner(f)
            yield html.tostring(f, encoding="unicode")
        else:
            yield f
Project: weasyl    Author: Weasyl    | project source | file source
def markdown(target, image=False):
    fragment = _markdown_fragment(target, image)
    return html.tostring(fragment, encoding=unicode)[5:-6]  # strip the wrapping '<div>' and '</div>'
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                        e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html
        import html5lib
        # html5lib parses <noscript> content as CDATA

        desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')), \
                                      treebuilder='lxml', namespaceHTMLElements=False)[0]
        # the quoted search strings below were Chinese phrases mangled to
        # '????' during extraction; they matched section headings on the page
        matches = desc.xpath('descendant::*[contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????")]/../*[self::p or self::div or self::span]')

        if matches:
            if len(matches)>1:
                desc = matches[-1]
                for item in matches:
                    content_len = len(self.totext(item))
                    if content_len > 50 and content_len < 200:
                        desc = item
                        break

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        #
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='text', encoding=unicode).strip()
        # return desc
        # Work around an encoding bug in the source data: U+FFFD (the
        # replacement character) sometimes appears in place of an apostrophe
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
Project: calibre_dangdang    Author: qunxyz    | project source | file source
def parse_series(self, root):
        ans = (None, None)

        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
            series = series[0]
            spans = series.xpath('./span')
            if spans:
                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
                m = re.search('\s+([0-9.]+)$', raw.strip())
                if m is not None:
                    series_index = float(m.group(1))
                    s = series.xpath('./a[@id="series-page-link"]')
                    if s:
                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        # This is found on Kindle edition pages on amazon.com
        if ans == (None, None):
            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
                text = (span.text or '').strip()
                m = re.match('Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = span.xpath('./a[@href]')
                    if a:
                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        if ans == (None, None):
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding=unicode)
                raw = re.sub(r'\s+', ' ', raw)
                match = self.series_pat.search(raw)
                if match is not None:
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
        if ans[0]:
            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans
Project: zendesk-utils    Author: trailbehind    | project source | file source
def update_zendesk_article_html(self):
    '''
    rewrite the HTML of Zendesk articles so that anchor tags point at the
    new Zendesk articles instead of the old UserVoice articles
    '''
    print "**UPDATING HTML to switch anchor hrefs to zendesk"
    url = '{}/api/v2/help_center/categories/{}/articles.json'.format(self.zendesk_url, self.zendesk_destination_category_id)

    articles = []
    while url:
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to get get article list with error {}'.format(response.status_code))
        exit()
      data = response.json()
      for article in data['articles']:
        articles.append(article)
      url = data['next_page']

    print "UPDATING HTML for {} articles".format(len(articles))
    for article in articles:
      url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
        exit()
      html_doc = fromstring(article['body'])
      for anchor_tag in html_doc.cssselect('a'):
        if not anchor_tag.get('href'):
          continue
        number_from_string_regex = re.search('(\d+)', anchor_tag.get('href'))
        if not number_from_string_regex:
          continue
        uv_id = int(number_from_string_regex.group(0))
        if uv_id in self.uvid_to_zdid:
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, self.uvid_to_zdid[uv_id])
          response = requests.get(url, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to get article {} with error {}'.format(self.uvid_to_zdid[uv_id], response.status_code))
            exit()
          new_url = response.json()['article']['html_url']
          try:
            print('CHANGING {} to {}'.format(anchor_tag.get('href'), new_url))
          except:
            e = sys.exc_info()[0]
            print "lxml parsing error {}".format(e)
          anchor_tag.set('href', new_url)
          info = {
            'body': tostring(html_doc)
          }
          payload = json.dumps({'article': info})
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
          response = requests.put(url, data=payload, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
            exit()
        else:
          print "SKIPPING this href {}".format(anchor_tag.get('href'))