Python lxml module: html() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use lxml.html().
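
As a quick orientation before the project examples, here is a minimal sketch of the pattern most of them share: parse markup with lxml.html.fromstring(), query the resulting tree with XPath, and serialize it back with lxml.html.tostring(). The markup below is an illustrative assumption, not taken from any of the projects.

import lxml.html

# Parse an HTML fragment into an HtmlElement tree
root = lxml.html.fromstring('<div><a href="/about">About</a><a href="/contact">Contact</a></div>')

# Query the tree with XPath, as most of the examples below do
hrefs = root.xpath('//a/@href')  # ['/about', '/contact']

# Serialize the (possibly modified) tree back to HTML bytes
print(lxml.html.tostring(root).decode())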

项目: Pofapi    Author: cmpunches
def generate_html_gallery( person_suite ):
    doc, tag, text = Doc().tagtext()

    doc.asis('<!DOCTYPE html>')
    with tag('html', lang="en"):
        with tag('head'):
            doc.asis('<meta charset="utf-8">')
            doc.asis('<meta name="viewport" content="width=device-width, initial-scale=1">')
            doc.asis('<link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css">')
            with tag('script', src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"):
                pass
            with tag('script', src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"):
                pass
        with tag('body'):
            with tag('div', klass="container-fluid"):
                for person in person_suite:
                    print("Adding photos for user {0}".format(person.uid))
                    with tag('div', klass='row'):
                        for photo in person.photos:
                            with tag('div', klass="col-xs-1", style="padding-left: 5px; padding-right: 5px; padding-top: 5px; padding-bottom: 5px;"):
                                with tag('p'):
                                    with tag('a', href=person.profile_url, target="_blank"):
                                        doc.stag('img', src=photo, height="175", width="175")
    return indent(doc.getvalue())
项目: gns3-documentation-template    Author: GNS3
def __init__(self, id, title, data, export_dir, authors=[], modifiedTime=None, theme=None, editable_by_anyone=False, template='document', appliances=None, config={}):
        log.info('Process document %s %s', id, title)
        if theme is None:
            self._theme = Theme(export_dir)
        else:
            self._theme = theme

        self._template = template
        self._config = config
        self._export_dir = export_dir
        self._authors = authors
        self._modifiedTime = modifiedTime
        self._data = data
        self._title = title
        self._appliances = appliances
        self._id = id
        self._html = lxml.html.fromstring(self._data)
        text = html_to_text(self._data)
        text = re.sub('\n\n+', '\n\n', text)
        self._text = text.replace('\n', '<br/>')
        self._editable_by_anyone = editable_by_anyone
项目: ubi-virtual-assistant    Author: Alzemand
def get_file_urls(mainUrl,extension):
    uniFileUrls = []
    if not mainUrl.lower().startswith('http://') and not mainUrl.lower().startswith('https://'):
        mainUrl = 'http://%s'%mainUrl
    print('Downloading from %s...'%mainUrl)
    if extension.startswith('*'):
        extension = extension[1:]
    if not extension.startswith('.'):
        extension = '.' + extension
    req = urllib.request.Request(
        mainUrl, 
        data=None, 
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    urlContent = urllib.request.urlopen(req).read().decode('utf-8')
    html = lxml.html.fromstring(urlContent) 
    urls = html.xpath('//a/@href')
    for url in urls:
        if url.endswith(extension):
            url = urljoin(mainUrl,url)
            if url not in uniFileUrls:
                uniFileUrls.append(url)
    return uniFileUrls
项目: catchWecaht    Author: leon0204
def get_list(self, search_url):
        data = {}
        # keylist =  [0] * 5
        data['table_name'] = 'dailyKeyword'
        html = requests.get(search_url, headers=self.headers, verify=False).content
        selector = etree.HTML(html)

        # hot-news keyword links and titles
        keyurl = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/@href')
        keyword = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/text()')
        res = {}
        res['keyurl'] = keyurl
        res['keyword'] = keyword

        for x in range(0, 10):
            data['keyword'] = keyword[x]
            data['keyurl'] = keyurl[x]
            data['id'] = x + 1
            self.save(data)
        return res



项目: competitive-cli    Author: GDGVIT
def login(self, username, password):
        """
        logs the user in and returns a bool value
        stores the username in self.username.
        """
        get_response = self.uva_session.get(UvaSession.UVA_HOST)
        login_text = lxml.html.fromstring(get_response.text)
        hidden_inputs = login_text.xpath(r'//form//input[@type="hidden"]')
        # print hidden_inputs
        form = {x.attrib["name"]: x.attrib["value"] for x in hidden_inputs if x.attrib['name'] not in ["cx", "ie"]}
        form["username"] = username
        form["passwd"] = password
        form["remember"] = "yes"
        login_response = self.uva_session.post(UvaSession.UVA_HOST + "index.php?option=com_comprofiler&task=login",
                                               data=form, headers={"referer": UvaSession.UVA_HOST})

        self.logged_in = login_response.url == UvaSession.UVA_HOST
        if self.logged_in: self.username = username
        return self.logged_in
项目: python-search-engine    Author: ncouture
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
项目: python-search-engine    Author: ncouture
def parse_that(url):
    resp = requests.get(url)
    raw = resp.text
    tree = get_etree(raw)
    title = doctitle(tree)
    links = get_links(tree, url)
    keywords = get_url_keywords(url)
    meta_description = meta_name_description(tree)
    html = get_clean_html(tree)
    text_content = get_clean_html(tree, text_only=True)
    return {'rank': 0,
            'title': title,
            'url': url,
            'description': meta_description,
            'keywords': keywords,
            'raw': raw,
            'text': text_content,
            'internal_links': links['internal'],
            'external_links': links['external']}
项目: WebScraping    Author: liinnux
def open(self, url, timeout=60):
        """Wait for download to complete and return result"""
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        timer.start(timeout * 1000)
        loop.exec_() # delay here until download finished
        if timer.isActive():
            # downloaded successfully
            timer.stop()
            return self.html()
        else:
            # timed out
            print('Request timed out:', url)
项目: WebScraping    Author: liinnux
def register(first_name, last_name, email, password, captcha_fn):
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(REGISTER_URL).read()
    form = parse_form(html)
    form['first_name'] = first_name
    form['last_name'] = last_name
    form['email'] = email
    form['password'] = form['password_two'] = password
    img = extract_image(html)
    captcha = captcha_fn(img)
    form['recaptcha_response_field'] = captcha
    encoded_data = urllib.urlencode(form)
    request = urllib2.Request(REGISTER_URL, encoded_data)
    response = opener.open(request)
    success = '/user/register' not in response.geturl()
    return success
项目: table-extractor    Author: dbpedia
def remove_html_encode_errors(self, headers, error):
        """
        Use this method to remove HTML special characters (e.g. &nbsp;), encoding errors or other unicode text.

        Simply pass the header rows to the method, along with the error you want to correct as a unicode string.

        :param headers: rows list of headers
        :param error: unicode string you want to delete from header cells
        :return: nothing
        """
        # Iterates over headers
        for row in headers:
            # Iterate over header cells
            for header in row:
                # Replace 'error' with u'' in the text of this header cell
                header['th'] = header['th'].replace(error, u'')
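
A hedged usage sketch for the method above (the `extractor` instance and the sample header data are illustrative assumptions; each header cell uses the 'th' key exactly as the method expects):

# assume `extractor` is an instance of the class defining remove_html_encode_errors
headers = [[{'th': u'Country\xa0'}, {'th': u'Population\xa0'}]]
extractor.remove_html_encode_errors(headers, u'\xa0')
# each 'th' value now reads u'Country' / u'Population'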
项目: table-extractor    Author: dbpedia
def url_composer(self, query, service):
        """
        This function is used to compose a url to call some web services, such as sparql endpoints.

        :param query: is the string used in some rest calls.
        :param service: type of service you request (dbpedia sparql endpoint)
        :return url: the url composed
        """
        # use quote_plus method from urllib to encode special characters (required when calling web services)
        query = urllib.quote_plus(query)

        """
        The following if clauses are differentiated by the service requested, e.g. 'dbpedia',
            but in all cases the url is composed using a pre-formatted string along with the query
        """
        if service == 'dbpedia':
            url = self.dbpedia_sparql_url + query + self.call_format_sparql

        elif service == 'html':
            url = self.html_format + query

        else:
            url = "ERROR"
        return url
项目: munch-core    Author: crunchmail
def mk_plaintext(self):
        try:
            h = html2text.HTML2Text()
            h.ignore_images = True
            h.inline_links = False
            h.wrap_links = False
            h.unicode_snob = True  # Prevents accents from being removed
            h.skip_internal_links = True
            h.ignore_anchors = True
            h.body_width = 0
            h.use_automatic_links = True
            h.ignore_tables = True
        except html.parser.HTMLParseError as e:
            raise WrongHTML(e)

        return h.handle(self.mk_html())
项目: munch-core    Author: crunchmail
def mk_html(self):
        """Simply calls configured html template filters

        See settings.CAMPAIGNS['HTML_TEMPLATE_FILTERS']
        """
        # Doctype gets frequently removed by content filters, so we save
        # it...
        doc = lxml.etree.HTML(self.html)
        doctype = ''
        if doc is not None:
            doctype = doc.getroottree().docinfo.doctype

        # ... we process content...
        mangled_content = post_template_html_generation.process(
            self.html,
            detach_images=self.detach_images,
            organization=self.author.organization)

        # And we re-inject it
        return '{}\n{}'.format(doctype, mangled_content)
项目: munch-core    Author: crunchmail
def handle_images(html, detach_images=False, organization=None, **kwargs):
    """ Detach base64 images and others if detach_images is enabled
    """
    tree = lxml.html.fromstring(html)

    for img in tree.cssselect('img'):
        try:
            src = img.attrib['src']
        except KeyError:
            raise WrongHTML('<img> must have a "src" attribute')
        if src.startswith('data:image/'):
            # TODO: handle ValueError
            image = InlineImage(src, organization=organization)
            url = image.store()
            img.set('src', url)
        else:
            if detach_images and organization:
                image = HostedImage(src, organization=organization)
                url = image.store()
                img.set('src', url)
    return lxml.html.tostring(tree).decode()
项目: webkivy    Author: miohtama
def crawl(self, url, base_url):
        """Crawl .html page and extract all URls we think are part of application from there.

        Parallelize downloads using threads.
        """

        resp = requests.get(url)

        # See through redirects
        final_base_url = resp.url

        tree = lxml.html.fromstring(resp.content)
        elems = tree.cssselect("a")
        links = [urljoin(final_base_url, elem.attrib.get("href", "")) for elem in elems]
        links = [link for link in links if is_likely_app_part(link, base_url)]

        # Load all links in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {executor.submit(self.fetch_file, link, base_url): link for link in links}
            for future in concurrent.futures.as_completed(future_to_url):
                future.result()  # Raise exception in main thread if bad stuff happened
项目: dark-keeper    Author: itcrab
def test_create_content_good(self, html_mock):
        url = 'https://talkpython.fm.mock/episodes/all'
        responses.add(responses.GET, url,
                    body=html_mock, status=200,
                    content_type='text/html')

        request = Request(
            [1, 2],
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/53.0.2785.116 Safari/537.36 OPR/40.0.2308.81',
        )

        html = request.receive_html(url)

        content = Content()
        content.set_content(html)

        assert isinstance(content.get_content(), lxml.html.HtmlElement)
项目: nicar-2017-schedule    Author: jsvine
def get_sessions():
    """
    Fetch and parse the schedule HTML from the NICAR webpage.
    """
    html = fix_encoding(requests.get(SCHEDULE_URL).content)
    dom = lxml.html.fromstring(html)
    day_els = dom.cssselect("ul.listview.pane")
    days_zipped = zip(day_els, DATES)
    sessions_nested = [ parse_day(el, date) for el, date in days_zipped ]
    sessions = itertools.chain.from_iterable(sessions_nested)
    return list(sorted(sessions, key=itemgetter(
        "date",
        "time_start",
        "time_end",
        "title"
    )))
项目: New-vulnerable-report-scraper-    Author: shamlikt
def scrape_kb_crt(self, url):
        ''' This method is used for parsing www.kb.cert.org '''
        data = self.get_html_data(url)
        lists = data.find(id="list-of-vuls").find_all("li")  # Selecting list of vulns from https://www.kb.cert.org/vuls/
        for li in lists:
            temp_data = deepcopy(self.value)                # creating copy of self.value 
            temp_data['val_name'] = li.find("span", class_="vul-title truncate").text # parsing name using class name of span
            date = li.find("span", class_="vul-date").text  # parsing published date using class name of span
            temp_data['date'] = datetime.strptime(date, '%d %b %Y').date()
            page_link = "{}{}".format(url.strip('/vuls/'),li.a['href'])   # Creating link address
            temp_data['link'] = page_link
            new_data = self.get_html_data(page_link).find(id="vulnerability-note-content") # fetching link data and selecting a specific div using id 
            temp_data['description'] = new_data.p.text
            temp_data['solution'] = new_data.find_all("table")[2].find("tr").text # selecting solution part from the html page using 'tr' tags
            temp_data['severity'] = "Medium"
            temp_data['affected'] = "Please find description"
            self.data.append(temp_data) # appending temp_data to the class variable self.data
项目: New-vulnerable-report-scraper-    Author: shamlikt
def scrape_fortinet(self, url):
        ''' This method is used for parsing http://www.fortiguard.com/psirt '''
        data_fn = self.get_html_data(url)      # souping
        advisory_fn = data_fn.find('div', class_ ="results") # identifying the required tagset
        section_fn = advisory_fn.find_all('div', class_ ="title")
        for title_div in section_fn:
            temp_data_fn = deepcopy(self.value)
            temp_data_fn['val_name'] = title_div.text.strip()
            page_link_fn = "{}{}".format(url.strip('/psirt/'), title_div.a['href'])
            temp_data_fn['link'] = page_link_fn
            new_data_fn = self.get_html_data(page_link_fn)
            temp_data_fn['description'] = new_data_fn.find_all('div', class_="detail-item")[1].html.body.p.text.strip()

            new_table_fn = new_data_fn.find('table', class_="table table-responsive table-borderless")
            date = new_table_fn.find_all('tr')[1].find_all('td')[1].text.strip()
            temp_data_fn['date'] = datetime.strptime(date, '%b %d, %Y').date()
            temp_data_fn['severity'] = "Medium"
            temp_data_fn['affected'] = "Please find description"
            temp_data_fn['solution'] = "Information not available in website" 
            self.data.append(temp_data_fn) # appending temp data info to class variable called self.data
项目: New-vulnerable-report-scraper-    Author: shamlikt
def scrape_cisco(self, url):
         # Scraping the Ajax page (Identified the json call)
        ajax_data = get("https://tools.cisco.com/security/center/publicationService.x?criteria=exact&cves=&keyword=&last_published_date=&limit=30&offset=0&publicationTypeIDs=1,3&securityImpactRatings=&sort=-day_sir&title=").text
        json_data = json.loads(ajax_data) #convert to json (Type: List of dicts)
        for dictionary in json_data[:9]:
            temp_data_ci = deepcopy(self.value)
            temp_data_ci['val_name'] = dictionary['title']
            temp_data_ci['severity'] = dictionary['severity']
            temp_data_ci['date'] = self.convert_cisco_date(dictionary['firstPublished'])     # skip all updates and include only new advisories
            page_link_ci = dictionary['url']
            temp_data_ci['link'] = page_link_ci
            # Scraping the CSS part
            css_data = get(page_link_ci)
            css_tree = lxml.html.fromstring(css_data.text)  # build the DOM Tree
            sel = CSSSelector('meta')   # construct a CSS Selector
            results = sel(css_tree)     # Apply the selector to the DOM tree.
            match = results[38]     # take the 38th result (the description meta tag).
            temp_data_ci['description'] = match.get('content')  # get the content attribute for the 38th result.

            new_data_ci = self.get_html_data(page_link_ci)
            temp_data_ci['affected'] = new_data_ci.find('div', class_="ud-innercontent-area", id="vulnerableproducts").text.strip()
            temp_data_ci['solution'] = new_data_ci.find('div', class_="ud-innercontent-area", id="workaroundsfield").text.strip()
#            temp_data_ci['solution'] = new_data_ci.find('div', class_="ud-innercontent-area", id="fixedsoftfield",).text.strip() #alternate
            self.data.append(temp_data_ci) # appending temp data info to class variable called self.data
项目: wswp    Author: kjam
def open(self, url, timeout=60):
        """Wait for download to complete and return result"""
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        timer.start(timeout * 1000)
        loop.exec_() # delay here until download finished
        if timer.isActive():
            # downloaded successfully
            timer.stop()
            return self.html()
        else:
            # timed out
            print('Request timed out:', url)
项目: de-visualization-wizard    Author: deleidos
def childNodesWithText(self, node):
        root = node
        # create the first text node
        # if we have some text in the node
        if root.text:
            t = lxml.html.HtmlElement()
            t.text = root.text
            t.tag = 'text'
            root.text = None
            root.insert(0, t)
        # loop over children
        for c, n in enumerate(list(root)):
            idx = root.index(n)
            # don't process text nodes
            if n.tag == 'text':
                continue
            # create a text node for tail
            if n.tail:
                t = self.createElement(tag='text', text=n.tail, tail=None)
                root.insert(idx + 1, t)
        return list(root)
项目: openkamer    Author: openkamer
def get_related_document_ids(kamervraag_url):
    logger.info('get related antwoord id for url: ' + kamervraag_url)
    page = requests.get(kamervraag_url, timeout=60)
    tree = lxml.html.fromstring(page.content)
    relations_titles = tree.xpath('//div[@id="main-column"]//h2[@class="divisiekop1"]')
    overheidnl_document_ids = []
    for title_element in relations_titles:
        if title_element.text_content() == "Relaties":
            column_elements = title_element.getparent().xpath('//tr/td/p')
            next_is_antwoord_url = False
            for column_element in column_elements:
                if next_is_antwoord_url:
                    overheidnl_document_ids.append(column_element.text_content())
                    next_is_antwoord_url = False
                if column_element.text_content() == 'is beantwoord in':
                    next_is_antwoord_url = True
    return overheidnl_document_ids
项目: openkamer    Author: openkamer
def get_kamervraag_document_id_and_content(url):
    logger.info('get kamervraag document id and content for url: ' + url)
    page = requests.get(url, timeout=60)
    tree = lxml.html.fromstring(page.content)
    elements = tree.xpath('//ul/li/a[@id="technischeInfoHyperlink"]')
    if elements:
        document_id = elements[0].get('href').split('/')[-1]
    else:
        elements = tree.xpath('/html/head/meta[@name="dcterms.identifier"]')
        if not elements:
            return None, '', ''
        document_id = elements[0].get('content')
    logger.info('document id: ' + document_id)
    content_html = ''
    if tree.xpath('//div[@id="main-column"]'):
        content_html = lxml.etree.tostring(tree.xpath('//div[@id="main-column"]')[0])
    titles = tree.xpath('//h1[@class="kamervraagomschrijving_kop no-toc"]')
    title = ''
    if titles:
        title = titles[0].text_content()
        title = re.sub(r'\s{2,}', ' ', title).strip()
    return document_id, content_html, title
项目: Pofapi    Author: cmpunches
def Main():
    output_path = "lol.html"
    config_file = "config.ini"

    config = POFSession.Config( config_file )

    testSession = POFSession(config)
    testSession.login(config.username, config.password)

    galleryData = list()
    users = testSession.searchUsers(config, 100, online_only=True)
    print("Search complete.")
    for user in users:
        photos = testSession.getPhotos(user)
        galleryDataEntry = UserGalleyDataEntry(user, photos)
        galleryData.append(galleryDataEntry)

    html_doc = generate_html_gallery( galleryData )

    save_gallery_to_file( output_path, html_doc )

    open_gallery( output_path )
项目: okami    Author: ambrozic
def items(self, task, response):
        items = []
        document = lxml.html.document_fromstring(html=response.text)
        products = document.xpath("//div[@class='product']")
        for product in products:
            iid = int(product.xpath(".//@product-id")[0])
            name = product.xpath(".//h2/text()")[0]
            desc = product.xpath(".//p/text()")[0]
            category = product.xpath(".//span/text()")[0]
            price = float(product.xpath(".//em/text()")[0])
            images = product.xpath(".//div//img/@src")
            item = Product(
                iid=iid,
                url=response.url,
                name=name,
                category=category,
                desc=desc,
                price=price,
                images=images,
            )
            items.append(item)
        return items
项目: Python-Web-Scraping-Second-Edition    Author: PacktPublishing
def open(self, url, timeout=60):
        """Wait for download to complete and return result"""
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        timer.start(timeout * 1000)
        loop.exec_() # delay here until download finished
        if timer.isActive():
            # downloaded successfully
            timer.stop()
            return self.html()
        else:
            # timed out
            print('Request timed out:', url)
项目: pyscrapers    Author: veltzer
def get_my_content(r):
    """
    The response from the vk server is not standard HTML.
    This is why we must cut it up and can't use the regular
    'get_real_content' helper.
    """
    assert r.status_code == 200
    # str_content=r.content.decode(errors='ignore')
    try:
        content = r.content  # type: bytes
        str_content = content.decode(errors='ignore')
    except Exception as e:
        print(e)
        print('could not decode')
        print(r.content)
        sys.exit(1)
    str_content = str_content[str_content.find('<input'):]
    c = b'<html><body>' + str_content.encode() + b'</body></html>'
    root = lxml.html.fromstring(c)
    return root
项目: bing_query_title_match    Author: sefira
def get_full_answer(url):
    print(url)
    page = lxml.html.document_fromstring(urllib.request.urlopen(url).read().decode("gbk"))
    best = page.xpath("//pre[contains(@class, 'best-text mb-10')]")
    common = page.xpath("//meta[contains(@name, 'description')]")
    if len(best) >= 1:
        best = best[0].text_content()
    else:
        if len(common) >= 1:
            best = common[0].text_content()
        else:
            best = "no answer found"
    return best

#############################################################
### web server
项目: gns3-documentation-template    Author: GNS3
def html(self):
        return self._html
项目: gns3-documentation-template    Author: GNS3
def export(self):
        output_dir = os.path.join(self._export_dir, self._id)
        root = ".."
        os.makedirs(output_dir, exist_ok=True)
        data, files = self._process(root=root)

        with open(os.path.join(output_dir, 'index.html'), 'wb+') as f:
            f.write(data.encode('utf-8'))

        for url, file in files:
            self._download_url(url, os.path.join(output_dir, file))
项目: gns3-documentation-template    Author: GNS3
def _process(self, root='..'):
        files = []
        self._clean_html()
        self._annotate()

        for (element, attr, url, _) in self._html.iterlinks():
            if element.tag == 'a' and attr == 'href' and url.startswith('https://www.google.com/url'):
                element.set('href', process_link(url, root=root))
            elif element.tag == 'img' and attr == 'src':
                filetitle = hashlib.md5(url.encode()).hexdigest()
                filetitle += '.jpg'
                element.set('src', '../' + self._id + '/' + filetitle)  # go to top level to handle the case where the document is used as an appliance
                files.append((url, filetitle))

        self._toc = self._get_toc()
        self._add_anchors()
        self._wrap_images()
        self._replace_youtube_videos()

        # Wrap the original body
        try:
            body = self._html.xpath('//body')[0]
        except IndexError:
            body = lxml.html.Element('body')
        body.tag = 'div'

        if 'style' in body.attrib:
            del body.attrib['style']

        self._content = lxml.etree.tostring(body, pretty_print=True, method="html")
        return self._theme.render(self._template + '.html', document=self, root=root, config=self._config, appliances=self._appliances), files
项目: gns3-documentation-template    Author: GNS3
def _wrap_images(self):
        """
        Wrap images in a target blank
        """
        for img in self._html.iter('img'):
            img.attrib.pop('style', None)
            a = img
            a.attrib["href"] = img.attrib.pop("src", None)
            a.attrib["target"] = "_blank"
            a.tag = "a"
            img = lxml.html.builder.IMG()
            img.attrib["src"] = a.attrib["href"]
            a.append(img)
项目: gns3-documentation-template    Author: GNS3
def _add_anchors(self):
        """
        Add anchors link to h1, h2, h3
        """
        for element in self._html.iter('h1', 'h2', 'h3'):
            if len(element) == 0 and element.attrib.get('id') is not None:
                a = lxml.html.builder.A()
                a.attrib['href'] = "#" + element.attrib.get('id')
                a.text = element.text
                element.text = None
                element.append(a)
项目: gns3-documentation-template    Author: GNS3
def main():
    """
    Run a test
    """
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        document = DriveDocument("42", "test", "<html><body style=\"test\"><h1>Hello</h1></body></html>", editable_by_anyone=True)
        document.export(tmpdir)
        with open(os.path.join(tmpdir, "42", "index.html")) as f:
            print(f.read())
项目: catchWecaht    Author: leon0204
def get_keylist(self, search_url,i):

        html = requests.get(search_url, headers=self.headers, verify=False).content
        selector = etree.HTML(html)
        # article links in the search results
        content = selector.xpath('//div[@class="news-box"]/ul/li/div[@class="txt-box"]/h3/a/@href')
        for link in content:
            maincontent = self.get_content(link, i)
项目: catchWecaht    Author: leon0204
def removeFile(self):
        # remove previously generated files
        if os.path.exists('/home/wwwroot/laravel/public/img/daily/'):
            shutil.rmtree(r'/home/wwwroot/laravel/public/img/daily')  # pic
        if os.path.exists('/home/wwwroot/url/daily/'):
            shutil.rmtree(r'/home/wwwroot/url/daily')  # html




项目: catchWecaht    Author: leon0204
def get_list(self, search_url):

        html = requests.get(search_url, headers=self.headers, verify=False).content
        selector = etree.HTML(html)
        # article links in the search results
        content = selector.xpath('//div[@class="news-box"]/ul/li/div[@class="txt-box"]/h3/a/@href')
        for link in content:
            maincontent = self.get_content(link)



项目: FuME    Author: fupadev
def reserve(self, match):
        payload = {'match_selected': match['match_id'],
                   'match_verein_id': '',
                   'as_values_match_verein_id': '',
                   'check_match': match['match_id']}

        r = self.driver.request("POST", self.baseUrl + '&act=new', data=payload)
        doc = lxml.html.fromstring(r.content)
        path_match = "/html/body//table//tr[@id]/*//text() | " \
                     "/html/body//table//tr[@id]/*//@href"
        raw = doc.xpath(path_match)

        # 2017-06-05 -> 05.06.17
        date = datetime.datetime.strptime(match['match_date'], '%Y-%m-%d %H:%M').strftime('%d.%m.%y %H:%M')

        # ---- raw snippet -----
        # 0 06.06.17 18:30 Uhr
        # 1 Relegation
        # 2 TSV Landsberg
        # 3 - TSV Bogen
        # 4 index.php?page=fotograf_spiele&mafo_id=43704&act=del
        # 5 Bereits jemand eingetragen:
        # 6 http://www.fupa.net/fupaner/abc-def-3
        # 7 abc def
        # ...

        for i, d in enumerate(raw):
            if date in d:
                if match['home'] in raw[i + 2] and match['guest'] in raw[i + 3]:
                    url = raw[i + 4]
                    match['mafo_id'] = url.split("?")[1].split("&")[1].split("=")[1]
                    try:
                        if 'Bereits jemand eingetragen' in raw[i + 5]:
                            # already reserved
                            return match, raw[i + 7]  # Photographer
                    except IndexError:
                        pass
                    # match can be reserved
                    return match, None
项目: htmltab    Author: flother
def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode
    using Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or
    parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html",
                                 is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
项目: Taigabot    Author: FrozenPigs
def test_parse_fragments_fromstring(self):
        parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
        html = """<frameset>
            <frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
        </frameset>"""
        etree_document = lxml.html.fragments_fromstring(html, parser=parser)
        self.assertEqual(len(etree_document), 1)
        root = etree_document[0]
        self.assertEqual(root.tag, "frameset")
        frame_element = root[0]
        self.assertEqual(frame_element.tag, 'frame')
项目: Taigabot    Author: FrozenPigs
def test_parse_fromstring(self):
        parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
        html = """<html><frameset>
            <frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
        </frameset></html>"""
        etree_document = lxml.html.fromstring(html, parser=parser)
        self.assertEqual(etree_document.tag, 'html')
        self.assertEqual(len(etree_document), 1)
        frameset_element = etree_document[0]
        self.assertEqual(len(frameset_element), 1)
        frame_element = frameset_element[0]
        self.assertEqual(frame_element.tag, 'frame')
项目: Taigabot    Author: FrozenPigs
def test_allow_tags(self):
        html = """
            <html>
            <head>
            </head>
            <body>
            <p>some text</p>
            <table>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            <tr>
            <td>hello</td><td>world</td>
            </tr>
            </table>
            <img>
            </body>
            </html>
            """

        html_root = lxml.html.document_fromstring(html)
        cleaner = Cleaner(
            remove_unknown_tags = False,
            allow_tags = ['table', 'tr', 'td'])
        result = cleaner.clean_html(html_root)

        self.assertEqual(12-5+1, len(list(result.iter())))
项目: Taigabot    Author: FrozenPigs
def test_safe_attrs_included(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""

        safe_attrs=set(lxml.html.defs.safe_attrs)
        safe_attrs.add('style')

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(html, result)
项目: Taigabot    Author: FrozenPigs
def test_safe_attrs_excluded(self):
        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
        expected = """<p><span>Cyan</span></p>"""

        safe_attrs=set()

        cleaner = Cleaner(
            safe_attrs_only=True,
            safe_attrs=safe_attrs)
        result = cleaner.clean_html(html)

        self.assertEqual(expected, result)
项目: competitive-cli    Author: GDGVIT
def submit(self, probNum, path=".", language=None):
        """
        submits the problem according to the problem Number of the question.
        returns a list containing the submission details about the question.
        """
        file_path, filename = UvaSession.find_file(probNum, path)
        probFile = open(file_path)

        if language is None:
            language_number = UvaSession.find_language(filename)
        else:
            language_number = UvaSession.language_handler[language]

        if language_number is None:
            return

        payload = {
            "localid": probNum,
            "code": probFile.read(),
            "language": language_number,
            "codeupl": "",
            "problemid": "",
            "category": "",
            "submit": "Submit"
        }

        updated_headers = {
            "Referer": UvaSession.UVA_HOST + "index.php?option=com_onlinejudge&Itemid=25",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Host": "uva.onlinejudge.org",
            "Origin": UvaSession.UVA_HOST
        }

        resp = self.uva_session.post(UvaSession.SUBMIT_PATH, data=payload, headers=updated_headers)
        submission_id = resp.url[resp.url.find('ID')+3:]
        return self.check_result(submission_id, probNum)
项目: competitive-cli    Author: GDGVIT
def login(self, username="", password=""):

        # logging in without credentials
        self.username = username
        response_page = self.codechef_session.get(CodechefSession.codechef_url)
        html_page = lxml.html.fromstring(response_page.text)
        hidden_inputs = html_page.xpath(
            r'//form//input[@type="hidden"]'
        )
        payload = {i.attrib["name"]: i.attrib["value"]
                   for i in hidden_inputs}
        payload['name'] = username
        payload['pass'] = password
        payload['op'] = 'Login'
        response = self.codechef_session.post(CodechefSession.codechef_url, data=payload)

        # removing extra sessions using simple scraping and form handling
        while response.url == CodechefSession.codechef_url + '/session/limit':
            html_page = lxml.html.fromstring(response.text)
            all_inputs = html_page.xpath(r'//form//input')
            payload = {i.attrib["name"]: i.attrib["value"] for i in all_inputs[::-1]}

            response = self.codechef_session.post(CodechefSession.codechef_url + '/session/limit', data=payload)
        soup = bs(response.content, 'lxml')
        name = soup.find(text=username)

        self.logged_in = bool(name)
        if self.logged_in: self.username = username
        return self.logged_in
项目: competitive-cli    Author: GDGVIT
def submit(self, question_code, path=".", language=None):
        contest = ""
        for contests in self.info_present_contests():
            for contest_ques in CodechefSession.ques_in_contest(contests['contest_name']):
                if contest_ques == question_code:
                    contest = '/' + contests['contest_name']
                    break
        file_path = path
        # file_path, file_name = CodechefSession.find_file(question_code, path)
        lang = CodechefSession.language_handler[language]
        response = self.codechef_session.get(
            self.codechef_url + contest + '/submit/' + question_code
        )

        html_page = lxml.html.fromstring(response.text)
        hidden_inputs = html_page.xpath(r'//form//input[@type="hidden"]')
        payload = {i.attrib['name']: i.attrib['value'] for i in hidden_inputs}
        payload['language'] = lang
        payload['problem_code'] = question_code
        payload['op'] = 'Submit'

        file = {
            "files[sourcefile]": open(file_path)
        }

        response = self.codechef_session.post(CodechefSession.codechef_url + contest + '/submit/' + question_code,
                                              data=payload,
                                              files=file
                                              )

        sub_id = response.url.split('/')[-1]
        return sub_id, self.check_result(sub_id, question_code)