Python lxml.etree module: HTMLParser() example source code

The following 48 code examples, extracted from open-source Python projects, illustrate how to use lxml.etree.HTMLParser().
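
Before the project samples, here is a minimal, self-contained sketch of the typical pattern (the inline HTML is illustrative):

from io import BytesIO
from lxml import etree

# recover=True (the default) makes the parser tolerate broken HTML;
# pass an explicit encoding when the document's declaration is missing or wrong
parser = etree.HTMLParser(encoding='utf-8', recover=True)
tree = etree.parse(BytesIO(b'<html><body><p>text</p></body></html>'), parser)
print(tree.getroot().findtext('.//p'))  # -> text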

Project: national-geographic-wallpaper    Author: atareao    | project source | file source
def set_nasa_wallpaper():
    st = datetime.fromtimestamp(time.time()).strftime('%y%m%d')
    url = URL07.format(st)
    r = requests.get(url)
    if r.status_code == 200:
        try:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            images = html.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    image_url = images[0].getparent().attrib['href']
                    image_url = 'https://apod.nasa.gov/' + image_url
                    if download(image_url) is True:
                        set_background(comun.POTD)
        except Exception as e:
            print(e)
Project: base_function    Author: Rockyzsu    | project source | file source
def debug_page():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    #r.encoding='gbk'
    print r.status_code
    print type(r.content)
    print r.content
    #print chardet.detect(r)
    tree = etree.HTML(r.text, parser=etree.HTMLParser(encoding='utf-8'))
    #print etree.tostring(tree)
    return tree,r.text

# request the page with custom headers
Project: awslogin    Author: byu-oit    | project source | file source
def get_account_names(saml_assertion):
    saml_url = "https://signin.aws.amazon.com:443/saml"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    response = requests.post(saml_url, headers=headers, data={
        'SAMLResponse': saml_assertion.assertion
    })
    response.raise_for_status()
    html_response = ET.fromstring(response.text, ET.HTMLParser())
    account_names = {}
    for element in html_response.findall('.//div[@class="saml-account-name"]'):
        account_id = element.text.split(' ')[2].replace('(', '').replace(')', '')
        account_name = element.text.split(' ')[1]
        account_names[account_id] = account_name

    return account_names
Project: edxcut    Author: mitodl    | project source | file source
def list_courses(self):
        '''
        List courses available in Studio site
        '''
        self.ensure_studio_site()
        url = "%s/home/" % self.BASE
        ret = self.ses.get(url)
        parser = etree.HTMLParser()
        xml = etree.parse(StringIO(ret.content), parser).getroot()
        courses = []
        course_ids = []
        for course in xml.findall('.//li[@class="course-item"]'):
            cid = course.get("data-course-key")
            if self.verbose:
                print cid  # etree.tostring(course)
            courses.append(course)
            course_ids.append(cid)
        return {'xml': courses,
                'course_ids': course_ids,
                }
Project: edxcut    Author: mitodl    | project source | file source
def _get_block_child_info_from_content_preview(self, block_id):
        '''
        Get child info dict from content preview
        '''
        xblock = self.get_xblock(usage_key=block_id, view="container_preview")
        html = xblock['html']
        parser = etree.HTMLParser()
        xml = etree.parse(StringIO(html), parser).getroot()
        ids = []
        child_blocks = []
        for elem in xml.findall('.//li[@class="studio-xblock-wrapper is-draggable"]'):
            cid = elem.get('data-locator')
            ids.append(cid)
            child_blocks.append(self.get_xblock(usage_key=cid))
        child_info = {'children': child_blocks,
                      'child_ids': ids,
                      }
        return child_info
Project: hocr-spec-python    Author: kba    | project source | file source
def validate(self, source, parse_strict=False, filename=None):
        """
        Validate a hocr document

        Args:
            source (str): A filename or '-' to read from STDIN
            parse_strict (bool): Whether to be strict about broken HTML. Default: False
            filename (str): Filename to use in the reports. Set this if reading
                            from STDIN for nicer output

        """
        parser = etree.HTMLParser(recover=not parse_strict)
        if not filename: filename = source
        if source == '-': source = sys.stdin
        doc = etree.parse(source, parser)
        root = doc.getroot()
        report = HocrValidator.Report(filename)
        try:
            self.spec.check(report, root)
        except ValueError as e:
            sys.stderr.write("Validation errored: %s\n" % e)
        return report
Project: prestashop-sync    Author: dragoon    | project source | file source
def get_xml_data(req_string, headers, data=None):
    req = urllib2.Request(req_string, headers=headers)
    html_data = _get_html_data(req, data)
    # Clean chunked data
    html_data = clean_chunked_data(html_data)
    #log_user_action(req.get_host() ,'chunked data', html_data, {})

    try:
        data = etree.fromstring(html_data)
    except XMLSyntaxError:
        # lxml cannot handle encoding declarations :(
        data = etree.HTML(html_data, etree.HTMLParser())
        # data is None when it was not XML, like 404 page without 404 code
        if data is not None:
            data = data.getroottree()
        else:
            raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
        # TODO: check valid
        #if not data.find('.//prestashop'):
        #    raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    return data
Project: aws-adfs    Author: venth    | project source | file source
def _a_page_of_expired_login():
        return ET.fromstring(
            '''
<!DOCTYPE html>
<html>
  <head>
    <title>Amazon Web Services Sign-In</title>
    <meta name="viewport" content="width=device-width" />
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head>
    <body>
      <div id="container">
        <h1 class="background">Amazon Web Services Login</h1>
        <div id="content">
          <div id="main_error"></div>
          <form id="saml_form" name="saml_form" action="/saml" method="post">
            <input type="hidden" name="RelayState" value="" />
            <input type="hidden" name="SAMLResponse" value="" />
            <input type="hidden" name="name" value="" />
            <p style="font-size: 16px; padding-left: 20px;">Select a role:</p>
          </div>
        </body>
      </html>
            ''',
            ET.HTMLParser(),
        )
Project: FundSpider    Author: s6530085    | project source | file source
def parse_home(self, home_content):
        if home_content is None:
            return None
        home_content = home_content.encode('ISO-8859-1').decode('gbk')
        html = etree.HTML(home_content, parser=etree.HTMLParser(encoding='utf-8'))
        alinks = html.xpath('//a[@href]')

        # the original pattern used full-width parentheses around the 6-digit fund code;
        # those characters were garbled in this copy and are restored here as an assumption
        pattern_capture = re.compile(ur"（(\d{6})）(.+)")
        l = []
        for alink in alinks:
            aa = alink.text
            if aa is not None:
                match = pattern_capture.match(aa)
                if match:
                    # for now keep only the fund code; the name is left in the commented line below
                    # l.append((match.group(1), match.group(2)))
                    l.append(match.group(1))
        return l

    # parse the institutional holding ratio and accumulate it onto the passed-in info object, since the data spans more than one page
Project: FundSpider    Author: s6530085    | project source | file source
def parse_ratio(self, info, content):
        # content = content.split('"')[1]
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        tds = html.xpath('//td[@class="tor"]')
        if len(tds) > 2:
            # the first right-aligned cell holds the institutional ratio; '---' marks missing data
            # institutional plus internal holdings sum to 100% (see http://fund.eastmoney.com/f10/cyrjg_510090.html), so only the institutional share is accumulated
            insito = tds[0].text
            if insito != '---':
                info.inratio += safe_to_float(insito.split("%")[0])
            # innerto = tds[2].text
            # if innerto != '---':
            #     self.inratio += safe_to_float(innerto.split("%")[0])
            # self.inratio = safe_to_float(.split('%')[0]) + safe_to_float(tds[2].text.split('%')[0])

    # parse the fund's stock holdings together with their portfolio weights
Project: FundSpider    Author: s6530085    | project source | file source
def parse_stocks(self, info, content):
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        # the holdings tables share the same class name; only the first (current holdings) is parsed
        tbs = html.xpath('//table[@class="w782 comm tzxq"]')
        # pers = html.xpath('//table[@class="w782 comm tzxq"]')
        if len(tbs) > 0:
            # stock links sit in the left-aligned cells, percentages in the right-aligned ones
            stocktds = tbs[0].xpath('.//td[@class="tol"]/a')
            pers = tbs[0].xpath('.//td[@class="tor"]')
            # column layout depends on whether the page lists holding changes;
            # each stock row then spans five value cells instead of three
            front, interval = 2, 5
            if '???' not in content:  # marker text garbled in this copy
                front, interval = 0, 3
            for (index, stocked) in enumerate(stocktds):
                # info.stocks.append(stocked.text)
                # cells with class "tor" hold the numbers; pick the percentage cell for this row
                per = pers[index*interval+front]
                # skip rows whose percentage cell shows no data
                if per.text == '---':
                    continue
                # collected entries look like [name-3.6%, name-4.1%, ...]
                # guard against empty stock names seen on some pages
                stockname = stocked.text
                if stockname:
                    info.stocks.append(stockname + '-' + per.text)
Project: FundSpider    Author: s6530085    | project source | file source
def parse_index_list(self, index_list_content):
        # fix the response encoding before parsing
        index_list_content = index_list_content.encode('ISO-8859-1').decode('utf-8')
        parsed_content = etree.HTML(index_list_content, parser=etree.HTMLParser(encoding='utf-8'))
        trs = parsed_content.xpath('//tbody/tr')
        indexs = []
        for tr in trs:
            tds = tr.xpath('./td')
            if len(tds) == 5:
                index = IndexInfo()
                code = tds[0].text.strip()
                if len(code.split('.')) == 2:
                    index.code = code.split('.')[0]
                    index.full_code = code
                index.name = tds[1].text.strip()
                index.begin_time = tds[2].text.strip()
                index.short_name = tds[3].text.strip()
                # the cell may contain a link to the index methodology; fall back to its plain text
                weave = tds[4].xpath('./a')
                if len(weave) == 1:
                    index.weave = weave[0].attrib['href'].strip()
                else:
                    index.weave = tds[4].text.strip()
                indexs.append(index)
        return indexs
Project: wcag-zoo    Author: data61    | project source | file source
def get_tree(self, html):
        if not hasattr(self, '_tree'):
            # Pre-parse
            parser = etree.HTMLParser()
            html = etree.parse(BytesIO(html), parser).getroot()
            self._tree = Premoler(
                html,
                exclude_pseudoclasses=True,
                method="html",
                preserve_internal_links=True,
                base_path=self.kwargs.get('staticpath', '.'),
                include_star_selectors=True,
                strip_important=False,
                disable_validation=True,
                media_rules=self.kwargs.get('media_rules', [])
            ).transform()
        return self._tree
Project: freebora    Author: deeplook    | project source | file source
def get_cats_sync(full_urls=False, verbose=False):
    "Generate category URLs for free O'Reilly ebooks."

    base_url = 'http://shop.oreilly.com'
    url = base_url + '/category/ebooks.do'
    if verbose:
        print(url)
    p = etree.HTMLParser()
    tree = etree.parse(url, parser=p)
    xpath_expr = '//a[starts-with(@href, "/category/ebooks/")]/@href'
    cat_urls = tree.xpath(xpath_expr)
    cat_urls = [base_url + u for u in cat_urls if u.endswith('.do')]
    for u in cat_urls:
        if verbose:
            print(u)
        tree1 = etree.parse(u, parser=p)
        urls = tree1.xpath(xpath_expr)
        for u in urls:
            if not u.endswith('.do'):
                continue
            if full_urls:
                yield base_url + u
            else:
                pat = 'category/ebooks/(.*?).do'
                yield re.findall(pat, u)[0]
Project: kibitzr    Author: kibitzr    | project source | file source
def xpath_selector(selector, html):
    """
    Returns Xpath match for `selector` within `html`.

    :param selector: XPath string
    :param html: Unicode content
    """
    from lxml import etree
    # lxml requires argument to be bytes
    # see https://github.com/kibitzr/kibitzr/issues/47
    encoded = html.encode('utf-8')
    root = etree.fromstring(encoded, parser=etree.HTMLParser())
    elements = root.xpath(selector)
    if elements:
        return True, etree.tostring(
            next(iter(elements)),
            method='html',
            pretty_print=True,
            encoding='unicode',
        )
    else:
        logger.warning('XPath selector not found: %r', selector)
        return False, html
Project: webmon    Author: KarolBedkowski    | project source | file source
def _get_elements_by_xpath(filter_, data, expression):
    try:
        from lxml import etree
    except ImportError:
        raise common.FilterError(filter_, "module lxml not found")
    # pylint: disable=no-member
    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)
    document = etree.fromstringlist([data], html_parser)
    for elem in document.xpath(expression):
        # pylint: disable=protected-access
        if isinstance(elem, etree._Element):
            text = etree.tostring(elem)
        else:
            text = str(elem)
        if isinstance(text, str):
            yield text
        else:
            yield text.decode('utf-8')
Project: webmon    Author: KarolBedkowski    | project source | file source
def _filter(self, item: str, result: common.Result) -> ty.Iterable[str]:
        try:
            from lxml import etree
        except ImportError:
            raise common.FilterError(self, "module lxml not found")
        # pylint: disable=no-member
        html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                       strip_cdata=True)
        document = etree.fromstringlist([item], html_parser)
        for elem in document.findall(".//*[@id='" + self._conf["sel"] + "']"):
            # pylint: disable=protected-access
            if isinstance(elem, etree._Element):
                text = etree.tostring(elem)  # type: ty.Union[str, bytes]
                if text:
                    if hasattr(text, 'decode'):
                        yield text.decode('utf-8')
                    else:
                        yield str(text)
            else:
                yield str(elem)
Project: gasvaktin    Author: gasvaktin    | project source | file source
def get_individual_atlantsolia_prices():
    relation = glob.ATLANTSOLIA_LOCATION_RELATION
    url = 'http://atlantsolia.is/stodvarverd.aspx'
    res = requests.get(url, headers=utils.headers())
    html_text = res.content
    html = etree.fromstring(html_text, etree.HTMLParser())
    div_prices = html.find(('.//*[@id="content"]/div/div/div/div[2]/div/div/'
                            'table/tbody'))
    prices = {}
    for div_price in div_prices:
        key = relation[div_price[0][0].text]
        bensin95 = float(div_price[1][0].text.replace(',', '.'))
        diesel = float(div_price[2][0].text.replace(',', '.'))
        bensin95_discount = bensin95 - glob.ATLANTSOLIA_MINIMUM_DISCOUNT
        diesel_discount = diesel - glob.ATLANTSOLIA_MINIMUM_DISCOUNT
        prices[key] = {
            'bensin95': bensin95,
            'diesel': diesel,
            'bensin95_discount': int(bensin95_discount * 10) / 10.0,
            'diesel_discount': int(diesel_discount * 10) / 10.0
        }
    return prices
Project: gasvaktin    Author: gasvaktin    | project source | file source
def get_global_skeljungur_prices():
    url = 'http://www.skeljungur.is/einstaklingar/eldsneytisverd/'
    res = requests.get(url, headers=utils.headers())
    html = etree.fromstring(res.content, etree.HTMLParser())
    bensin95_text = html.find(('.//*[@id="st-container"]/div/div/div/div/'
                               'div[2]/div/div/div[1]/div[1]/div[1]/section/'
                               'div/div[2]/div[1]/div[2]/h2')).text
    diesel_text = html.find(('.//*[@id="st-container"]/div/div/div/div/div[2]/'
                             'div/div/div[1]/div[1]/div[1]/section/div/div[2]/'
                             'div[1]/div[4]/h2')).text
    bensin95 = float(bensin95_text.replace(' kr.', '').replace(',', '.'))
    diesel = float(diesel_text.replace(' kr.', '').replace(',', '.'))
    return {
        'bensin95': bensin95,
        'diesel': diesel,
        'bensin95_discount': bensin95 - glob.SKELJUNGUR_DISCOUNT,
        'diesel_discount': diesel - glob.SKELJUNGUR_DISCOUNT
    }
Project: ISS    Author: RyanJenkins    | project source | file source
def bandcamp_markup_for_url(urlstr):
    url = urlparse.urlparse(urlstr)

    parser = etree.HTMLParser(no_network=False)
    req = urllib2.urlopen(urlstr)
    tree = etree.parse(req, parser)
    embed_meta = tree.xpath('//meta[@property="og:video:secure_url"]')
    embed_url = embed_meta[0].get('content')

    markup = ('<iframe class="bandcamp-embed" '
        + 'src="%s" ' % embed_url
        + 'seamless>'
        + '<a href="%s">Embedded Bandcamp Link</a>' % urlstr
        + '</iframe>')

    return markup
Project: wechat-spider    Author: bowenpay    | project source | file source
def extract(self):
        d = self.data
        res = None
        if not d:
            return d
        elif isinstance(d, basestring):
            if d.startswith('http'):
                ## the string is a URL: download the image straight to OSS
                res = download_to_oss(d, OSS2_CONF["IMAGES_PATH"])
            else:
                ## otherwise treat it as an HTML fragment and mirror its images
                htmlparser = etree.HTMLParser()
                tree = etree.parse(StringIO(d), htmlparser)
                # collect http image URLs from both src and data-src attributes
                srcs = tree.xpath("//img[starts-with(@src,'http')]/@src")
                data_srcs = tree.xpath("//img[starts-with(@data-src,'http')]/@data-src")
                srcs = list(set(srcs + data_srcs))
                # mirror each image to OSS
                new_srcs = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"]) for item in srcs]
                # replace the original srcs with the OSS copies
                res = replace_all(d, srcs, new_srcs)
        elif isinstance(d, list):
            res = [download_to_oss(item, OSS2_CONF["IMAGES_PATH"]) for item in d]

        return res
Project: selenext    Author: Wykleph    | project source | file source
def find_element_by_xpath(self, xpath):
        """
        Find an element in the DOM by xpath.

        Args:
            xpath:

        Returns:
            WebElement
        """

        tree = etree.fromstring(self.current_response, etree.HTMLParser())
        element = tree.xpath(xpath)[0]
        self.current_response = etree.tostring(element)
        print(self.current_response)
        return WebElement(None, self.current_response, self.current_url, parent=self.id)

    # %%%%%%%%%%%%%%%%%%% Find elements %%%%%%%%%%%%%%%%%%% #
Project: selenext    Author: Wykleph    | project source | file source
def find_elements_by_xpath(self, xpath):
        """
        Find all elements in the DOM matching the given xpath.

        Args:
            xpath:

        Returns:
            list
        """

        tree = etree.fromstring(self.current_response, etree.HTMLParser())
        elements = tree.xpath(xpath)

        output_elements = []
        for element in elements:
            resp = etree.tostring(element)
            output_elements.append(WebElement(None, resp, self.current_url, parent=self.id))

        return output_elements
Project: recipebook    Author: dpapathanasiou    | project source | file source
def __init__(self, url, pageEncoding=ENCODING):
        self.url  = url
        self.html = restClient.get(self.url)
        if self.html is not None:
            self.valid  = True
            self.encode = pageEncoding
            self.parser = etree.HTMLParser(encoding=self.encode)
            self.tree   = etree.HTML(self.html, parser=self.parser)
        else:
            self.valid = False
            raise ValueError('could not fetch data from: "' + self.url + '"')
Project: national-geographic-wallpaper    Author: atareao    | project source | file source
def set_fstoppers_wallpaper():
    r = requests.get(URL05)
    url = None
    image_url = None
    if r.status_code == 200:
        try:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            print(etree.tostring(html))
            print('===========')
            for element in html.iter('img'):
                # print(element.tag, element.attrib, element.text)
                try:
                    print(element.attrib['data-original'])
                    url = 'https://fstoppers.com' +\
                        element.getparent().attrib['href']
                    break
                except Exception as e:
                    print(e)
            if url is not None:
                print(url)
                r = requests.get(url)
                if r.status_code == 200:
                    html = etree.HTML(r.content, parser)
                    print(etree.tostring(html))
                    for element in html.iter('div'):
                        try:
                            if element.attrib['class'] == 'photo':
                                image_url = element.attrib['data-xlarge']
                                break
                        except Exception as e:
                            print(e)
        except Exception as e:
            print(e)
        if image_url is not None:
            if download(image_url) is True:
                set_background(comun.POTD)
Project: ceiba-dl    Author: lantw44    | project source | file source
def web(self, path, args={}, encoding=None, allow_return_none=False):
        self.logger.debug('preparing to send web request')
        if allow_return_none:
            if path in self.web_cache and self.web_cache[path] == args:
                self.logger.debug('request to {} already sent; skipping'.format(path))
                self.logger.debug('args: {}'.format(args))
                return
        self.web_cache[path] = dict(args)
        url = urllib.parse.urljoin(self.web_url, urllib.parse.quote(path))
        if len(args) > 0:
            url += '?' + urllib.parse.urlencode(args)
        self.logger.debug('HTTP request URL: {}'.format(url))
        data = io.BytesIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.COOKIE, self.web_cookie)
        self.curl.setopt(pycurl.NOBODY, False)
        self.curl.setopt(pycurl.NOPROGRESS, True)
        self.curl.setopt(pycurl.WRITEDATA, data)
        self.curl.setopt(pycurl.HEADERFUNCTION, lambda *x: None)
        self.curl.setopt(pycurl.XFERINFOFUNCTION, lambda *x: None)
        self.curl.perform()
        status = self.curl.getinfo(pycurl.RESPONSE_CODE)
        if status != 200:
            raise ServerError(status)
        data.seek(io.SEEK_SET)
        return etree.parse(data, etree.HTMLParser(
            encoding=encoding, remove_comments=True))
Project: Taigabot    Author: FrozenPigs    | project source | file source
def parsehtml():
    """
    Test HTML parsing.

    >>> # p = HTMLTreeBuilder.TreeBuilder()
    >>> p = ElementTree.HTMLParser()
    >>> p.feed("<p><p>spam<b>egg</b></p>")
    >>> serialize(p.close())
    '<p>spam<b>egg</b></p>'
    """

# doesn't work with lxml.etree
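
For comparison, lxml's HTMLParser does accept the same feed interface, though the serialized output differs because lxml wraps fragments in html/body elements (a sketch, not part of the original test):

from lxml import etree

parser = etree.HTMLParser()
parser.feed('<p>spam<b>egg</b></p>')
root = parser.close()  # close() returns the root element of the parsed tree
print(etree.tostring(root))  # b'<html><body><p>spam<b>egg</b></p></body></html>'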
Project: edxcut    Author: mitodl    | project source | file source
def get_basic_course_info(self):
        '''
        Get basic course info (start date, end date, ...) from instructor dashboard
        '''
        url = "%s#view-course_info" % self.instructor_dashboard_url
        ret = self.ses.get(url)
        # print ret.content
        parser = etree.HTMLParser()
        xml = etree.parse(StringIO(ret.content), parser).getroot()
        bci_div = xml.find('.//div[@class="basic-wrapper"]')
        if bci_div is None:
            return None
        fields = ["course-organization", "course-number", "course-name", "course-display-name", "course-start-date",
                  "course-end-date", "course-started", "course-num-sections", "grade-cutoffs"]
        # look for elements like: <li class="field text is-not-editable" id="field-grade-cutoffs">
        data = {}
        for field in fields:
            felem = bci_div.find('.//li[@id="field-%s"]' % field)
            if felem is None:
                data[field] = None
            else:
                belem = felem.find('b')
                data[field] = belem.text
        if self.verbose:
            print json.dumps(data, indent=4)
        return data
Project: dati-ckan-docker    Author: italia    | project source | file source
def _extract_urls(self, content, base_url):
        '''
        Get the URLs out of a WAF index page
        '''
        try:
            parser = etree.HTMLParser()
            tree = etree.fromstring(content, parser=parser)
        except Exception, inst:
            msg = 'Couldn\'t parse content into a tree: %s: %s' \
                  % (inst, content)
            raise Exception(msg)
        urls = []
        for url in tree.xpath('//a/@href'):
            url = url.strip()
            if not url:
                continue
            if '?' in url:
                log.debug('Ignoring link in WAF because it has "?": %s', url)
                continue
            if '/' in url:
                log.debug('Ignoring link in WAF because it has "/": %s', url)
                continue
            if '#' in url:
                log.debug('Ignoring link in WAF because it has "#": %s', url)
                continue
            if 'mailto:' in url:
                log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
                continue
            log.debug('WAF contains file: %s', url)
            urls.append(url)
        base_url = base_url.rstrip('/').split('/')
        if 'index' in base_url[-1]:
            base_url.pop()
        base_url = '/'.join(base_url)
        base_url += '/'
        log.debug('WAF base URL: %s', base_url)
        return [base_url + i for i in urls]
Project: HtmlExtract-Python    Author: xinyi-spark    | project source | file source
def get_src_link_number(url, Html):
    '''
    Count the src links in Html by type (css, js, pic, html URLs)
    '''
    Html = utf8_transfer(Html)
    page = etree.HTML(Html, parser=etree.HTMLParser(encoding='utf-8'))
    src_link_list = get_src_links(page, Html)
    src_link_number = {}
    for src_num in src_link_list:
        src_link_number[src_num[0]] = get_link_number(url, src_num[1])
    return src_link_number
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def parse_item(self, response):
        super(NextBigWhatSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()

        try:
            title = tree.xpath(".//header[contains(@class, 'entry-header')]/h1/text()")
            details = tree.xpath('.//div[contains(@class, "herald-entry-content")]/p/text()')

            if title and details:
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]

                news_item['title'] = title[0].strip().encode('ascii','ignore')
                news_item['details'] = "\t".join([item.strip().encode('ascii','ignore') for item in details if item.strip()])

                img_urls = tree.xpath('.//div[contains(@class, "herald-post-thumbnail herald-post-thumbnail-single")]/span/img/@src')
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                meta_result = self.get_meta(tree)

                if 'description' in meta_result:
                    news_item['blurb'] = meta_result['description']

                return news_item

        except:
            pass
        return None
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def parse_item(self, response):
        super(BusinessStandardSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:

            title = tree.xpath(".//h1[contains(@class,\'headline\')]//text()")
            details = tree.xpath('.//span[contains(@class,\'p-content\')]/div//text()[not(ancestor::script)]')
            if title and details:
                news_item['source'] = self.name
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii','ignore')
                news_item['details'] = "\t".join([item.strip().encode('ascii','ignore') for item in details])

                img_urls = tree.xpath('.//img[contains(@class,\'imgCont\')]/@src')
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)


                published_date = tree.xpath('.//p[contains(@class,\'fL\')]//span//text()')
                if published_date:
                    news_item['published_date'] = datetime.strptime(published_date[3].split("\t")[0], '%B %d, %Y')

                related = tree.xpath('.//div[contains(@class,\'readmore_tagBG\')]//h2//a/text()')
                if related:
                    news_item['tags'] = [item.strip() for item in related if item.strip()]

                cover_image = tree.xpath('.//img[contains(@class,\'imgCont\')]/@src')
                if cover_image:
                    news_item['cover_image'] = cover_image
                return news_item

        except:
            pass
        return None
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def parse_item(self, response):
        super(TechCrunchSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//h1[contains(@class,\'alpha tweet-title\')]//text()")
            details = tree.xpath('.//div[contains(@class,\'article-entry text\')]//p//text()')
            if title and details:
                news_item['title'] = title[0].strip().encode('ascii','ignore')
                news_item['details'] = "\t".join([ det.strip().encode('ascii','ignore') for det in details])

                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]

                img_urls = tree.xpath('.//div[contains(@class,\'article-entry text\')]/img/@src')
                if img_urls:
                    news_item['img_urls'] = img_urls


                cover_image = tree.xpath('.//div[contains(@class,\'article-entry text\')]/img/@src')
                if cover_image:
                    news_item['cover_image'] = cover_image[0]

                author = tree.xpath('/html/body/div[4]/article/div/div[1]/div/header/div[2]/div[1]/a/text()')
                if author :
                    news_item['author'] = author


                return news_item

        except:
            pass
        return None
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def parse_item(self, response):
        super(SmallBizTrendsSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:
            title = tree.xpath(".//div[@class='post-inner']/h1/text()")
            details = tree.xpath('.//div[@class=\"entry\"]/p/text()')
            if title and details:
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['source_url'] = response.url.split('?')[0]
                news_item['title'] = title[0].strip().decode('unicode_escape').encode('ascii','ignore')
                news_item['details'] = '\t'.join([item.strip().encode('ascii','ignore').decode('unicode_escape') for item in details if item.strip()])
                # ' '.join([item.strip().encode('ascii','ignore').decode('unicode_escape') for item in details if item.strip()])

                if tree.xpath('.//span[@class=\'full-span-featured-image\']/span/img/@src'):
                    news_item['img_urls'] = tree.xpath('.//span[@class=\'full-span-featured-image\']/span/img/@src')
                elif tree.xpath('.//img[contains(@class,\'size-full\')]/@src'):
                    news_item['img_urls'] = tree.xpath('.//img[contains(@class,\'size-full\')]/@src')
                elif tree.xpath('.//img[contains(@class,\'aligncenter\')]/@src'):
                    news_item['img_urls'] = tree.xpath('.//img[contains(@class,\'aligncenter\')]/@src')

                meta_result = self.get_meta(tree)

                if 'description' in meta_result:
                    news_item['blurb'] = meta_result['description']

                published_date = tree.xpath('.//span[contains(@class,\'article-date\')]/text()')
                if published_date:
                    news_item['published_date'] = datetime.strptime(published_date[0], '%b %d, %Y')
                author = tree.xpath('.//span[contains(@itemprop,\'name\')]/a/text()')
                if author:
                    news_item['author'] = author
                return news_item

        except:
            pass
        return None
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def parse_item(self, response):
        super(DealCurrySpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()

        try:
            title = tree.xpath(".//h1/text()")
            details = tree.xpath('.//div[contains(@class, "articleSpacer")]/p//text()')
            if title and details:
                news_item['source_url'] = response.url.split('?')[0]
                news_item['source'] = self.name
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii','ignore')
                news_item['details'] = "\t".join([x.strip().encode('ascii','ignore')for x in details]).strip()
                # "\t".join([item.strip().encode('ascii','ignore') for item in details if item.strip()])

                tags = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/span[contains(@style, "color:#346f9a; float:left; text-align:left")]/a/text()')
                news_item['tags'] = tags[0].strip().encode('ascii','ignore')

                published_date = tree.xpath(".//span[contains(@style, 'color:#6b6b6b;float:left; text-align:left; margin-left:5px')]/text()")
                news_item['published_date'] = datetime.strptime(published_date[0].encode('ascii','ignore'), '%d %B %Y') 
                author = tree.xpath('.//div[contains(@style, "")]/span[contains(@style, "color:#6b6b6b; float:left; text-align:left;")]/text()')
                news_item['author'] = author[0].split('by')[1].strip().encode('ascii','ignore')

                img_urls = tree.xpath('.//div[contains(@style, "padding-bottom:10px")]/img/@src')
                if img_urls:
                    news_item['img_urls'] = get_stripped_list(img_urls)

                meta_result = self.get_meta(tree)

                if 'description' in meta_result:
                    news_item['blurb'] = meta_result['description']

                return news_item

        except:
            pass
        return None
Project: aws-adfs    Author: venth    | project source | file source
def _retrieve_roles_page(roles_page_url, context, session, ssl_verification_enabled,
                         vip_security_code):
    response = session.post(
        roles_page_url,
        verify=ssl_verification_enabled,
        allow_redirects=True,
        data={
            'AuthMethod': 'VIPAuthenticationProviderWindowsAccountName',
            'Context': context,
            'security_code': vip_security_code,
        }
    )
    logging.debug(u'''Request:
            * url: {}
            * headers: {}
        Response:
            * status: {}
            * headers: {}
            * body: {}
        '''.format(roles_page_url, response.request.headers, response.status_code, response.headers,
                   response.text))

    if response.status_code != 200:
        raise click.ClickException(
            u'Issues during redirection to aws roles page. The error response {}'.format(
                response
            )
        )

    html_response = ET.fromstring(response.text, ET.HTMLParser())
    return roles_assertion_extractor.extract(html_response)
Project: aws-adfs    Author: venth    | project source | file source
def _retrieve_roles_page(roles_page_url, context, session, ssl_verification_enabled,
                         signed_response):
    logging.debug('context: {}'.format(context))
    logging.debug('sig_response: {}'.format(signed_response))

    response = session.post(
        roles_page_url,
        verify=ssl_verification_enabled,
        headers=_headers,
        allow_redirects=True,
        data={
            'AuthMethod': 'DuoAdfsAdapter',
            'Context': context,
            'sig_response': signed_response,
        }
    )
    logging.debug(u'''Request:
            * url: {}
            * headers: {}
        Response:
            * status: {}
            * headers: {}
            * body: {}
        '''.format(roles_page_url, response.request.headers, response.status_code, response.headers,
                   response.text))

    if response.status_code != 200:
        raise click.ClickException(
            u'Issues during redirection to aws roles page. The error response {}'.format(
                response
            )
        )

    html_response = ET.fromstring(response.text, ET.HTMLParser())
    return roles_assertion_extractor.extract(html_response)
Project: aws-adfs    Author: venth    | project source | file source
def _strategy(response, config, session):

    html_response = ET.fromstring(response.text, ET.HTMLParser())

    def _plain_extractor():
        def extract():
            return roles_assertion_extractor.extract(html_response)
        return extract

    def _duo_extractor():
        def extract():
            return duo_auth.extract(html_response, config.ssl_verification, session)
        return extract

    def _symantec_vip_extractor():
        def extract():
            return symantec_vip_access.extract(html_response, config.ssl_verification, session)
        return extract

    chosen_strategy = _plain_extractor

    if _is_duo_authentication(html_response):
        chosen_strategy = _duo_extractor
    elif _is_symantec_vip_authentication(html_response):
        chosen_strategy = _symantec_vip_extractor

    return chosen_strategy()
Project: FundSpider    Author: s6530085    | project source | file source
def parse_statistic(self, info, content):
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        nums = html.xpath(u'//table[@class="fxtb"]//td')
        length = len(nums)
        if length % 4 == 3:
            nums = nums[0:length - 3]
            length -= 3
        if length > 0 and length % 4 == 0:
            for i in range(0, length/4):
                tds = nums[i*4:(i+1)*4]
                # the volatility (std) row; cells hold 1-, 2- and 3-year values
                if tds[0].text == FundInfo.STD_CHINESE_KEY:
                    stds = reversed(tds[1:4])
                    for stdnum in stds:
                        # scan from the longest horizon (3-year first) and keep the first populated value
                        if stdnum.text != '--':
                            info.std = safe_to_float(stdnum.text.split('%')[0])
                            break
                elif tds[0].text == FundInfo.SHARPERATIO_CHINESE_KEY:
                    sharpes = reversed(tds[1:4])
                    for sharpenum in sharpes:
                        if sharpenum.text != '--':
                            info.sharperatio = safe_to_float(sharpenum.text)
                            break
                elif tds[0].text == FundInfo.INFORATIO_CHINESE_KEY:
                    infos = reversed(tds[1:4])
                    for infonum in infos:
                        if infonum.text != '--':
                            info.inforatio = safe_to_float(infonum.text)
                            break

        # tracking deviation against the benchmark index
        trackbias = html.xpath(u'//div[@id="jjzsfj"]//table[@class="fxtb"]//td')
        if len(trackbias) == 3:
            info.bias = safe_to_float(trackbias[1].text.split('%')[0])

        # investment style, read from the style table
        styles = html.xpath('//table[@class="fgtb"]//td')
        if len(styles) >= 2:
            # the second cell holds the style text
            info.style = styles[1].text.strip()
Project: FundSpider    Author: s6530085    | project source | file source
def parse_annual(self, info, content):
        # annualized return: compound the available yearly yields and take the geometric mean
        # years without data are skipped rather than treated as zero returns
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        trs = html.xpath('//table/tbody/tr')
        # the table carries five rows when both yield and rank data are present
        if len(trs)==5:
            yieldtds = trs[0].xpath('./td')
            # cumulative compounded yield
            yieldvalue = 1.0
            yieldpow = 0.0
            # skip the row label and multiply up the yearly returns
            for yearyield in yieldtds[1:]:
                y = yearyield.text
                if y != '---':
                    yieldvalue *= (1 + safe_to_float(y.split('%')[0]) / 100)
                    yieldpow += 1
            # geometric mean over the populated years
            if yieldpow != 0.0:
                info.annualyield = yieldvalue ** (1.0/yieldpow) - 1

            ranktds = trs[3].xpath('./td')
            rankcount = 0
            rankvalue = 0.0
            for ranktd in ranktds[1:]:
                r = ''.join(ranktd.itertext()).strip()
                if r != '---':
                    rankvalue += safe_to_float(r.split('|')[0]) / safe_to_float(r.split('|')[1])
                    rankcount += 1
            if rankcount > 0:
                info.annualrank = rankvalue / rankcount
Project: FundSpider    Author: s6530085    | project source | file source
def parse_stock(self, content):
        stock = StockInfo()
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        ths = html.xpath('//th[@class="tips-fieldnameL"]')
        tds = html.xpath('//td[contains(@class, "tips-dataL")]')
        for (index, th) in enumerate(ths):
            key = th.text.strip()
            value = tds[index].text.strip()
            if value == '--':
                value = ''
            if key == StockInfo.FULL_NAME_CHINESE_KEY:
                stock.fullname = value
            elif key == StockInfo.USED_NAME_CHINESE_KEY:
                # historical names
                stock.used_names = value.split('->')
            elif key == StockInfo.CODE_CHINESE_KEY:
                stock.code = value
            elif key == StockInfo.SHORT_NAME_CHINESE_KEY:
                stock.shortname = value
            elif key == StockInfo.MARKET_CHINESE_KEY:
                stock.market = value
            elif key == StockInfo.INDUSTRY_CHINESE_KEY:
                stock.industry = value
            elif key == StockInfo.AREA_CHINESE_KEY:
                stock.area = value
            # listing date
            elif key == StockInfo.RELEASE_DATE_CHINESE_KEY:
                stock.releasedate = value
        return stock
Project: calculette-impots-python    Author: openfisca    | project source | file source
def main():
    global parser
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--output-html', help='Output result page HTML to file')
    parser.add_argument('--saisies', dest='saisie_variables', metavar='nom=valeur', nargs='+', help='Variables saisies')
    parser.add_argument('--year', default='2015', type=int,
                        help='Calculer les impôts de l\'année N sur les revenus de l\'année N-1')
    args = parser.parse_args()

    cgi_url = 'http://www3.finances.gouv.fr/cgi-bin/calc-{}.cgi'.format(args.year)
    headers = {'User-Agent': 'Calculette-Impots-Python'}
    saisie_variables = {} if args.saisie_variables is None else dict(iter_saisie_variables(args.saisie_variables))
    default_saisie_variables = {
        # '0DA': '1965',
        # '1AJ': '15000',
        'pre_situation_famille': 'C',
        'pre_situation_residence': 'M',
        # 'simplifie': '1',
        }
    data = merge(default_saisie_variables, saisie_variables)
    response = requests.post(cgi_url, headers=headers, data=data)
    if args.output_html is not None:
        with open(args.output_html, 'w') as output_html_file:
            output_html_file.write(re.sub(
                pattern=r'=(.)/calcul_impot/2015/',
                repl=r'=\1http://www3.finances.gouv.fr/calcul_impot/2015/',
                string=response.text,
                ))
    root_node = etree.fromstring(response.text, etree.HTMLParser())
    results = list(iter_results(root_node))
    print(json.dumps(results, ensure_ascii=False, indent=2, sort_keys=True))

    return 0
Project: xpath.py    Author: elliterate    | project source | file source
def setup_document(self):
        # Determine the path of the fixture to load.
        filename = getattr(type(self), "__fixture__")
        fixture_path = os.path.join(_FIXTURE_DIR, filename)

        parser = etree.HTMLParser(encoding="UTF-8")

        # Open the fixture file in the browser.
        self.document = etree.parse(fixture_path, parser)
Project: normal_hark_lite    Author: nanshihui    | project source | file source
def __init__(self):
        self.parser = etree.HTMLParser()
Project: gae-sports-data    Author: jnguyen-ca    | project source | file source
def _request(self):
        """Makes requests to vegasinsider odds pages to get game odds

        Returns:
            dict: values are self._scrape()
        """
        if not memcache.add(type(self).__name__, True, 3):
            time.sleep(3)
        logging.info('Scraping VegasInsider for %s' % (self.league))

        url = "http://www.vegasinsider.com/%s/odds/las-vegas/" % (self.league)
        response = urlfetch.fetch(url)

#         time.sleep(3)
#         url = "http://www.vegasinsider.com/%s/odds/offshore/" % (self.vi_league)
#         response = urlfetch.fetch(url)
#         offshore_tree = etree.fromstring(response.content, etree.HTMLParser())

        try:
            vegas_odds = self._scrape(response.content, 1)
#         offshore_odds = self._scrape(offshore_tree, 8)
        except IndexError as e:
            logging.exception(e)
            vegas_odds = {}

        return {
                'vegas' : vegas_odds, 
#                 'offshore' : offshore_odds
                }
Project: freebora    Author: deeplook    | project source | file source
def download_filelist_sync(cat, verbose=False):
    "Generate URLs for free O'Reilly ebooks in PDF format."

    url = 'http://shop.oreilly.com/category/ebooks/%s.do' % cat
    if verbose:
        print(url)
    p = etree.HTMLParser()
    t1 = etree.parse(url, parser=p)
    table_pag1 = t1.xpath('//table[@class="pagination"]')[0]
    xp = '//td[@class="default"]/select[@name="dirPage"]/option/@value'
    page_urls = set(table_pag1.xpath(xp))
    for i, page_url in enumerate(page_urls):
        # if verbose:
        #     print(page_url)
        t2 = etree.parse('http://shop.oreilly.com' + page_url, parser=p)
        xp = '//span[@class="price"][contains(., "$0.00")]/'\
             '../../../../div[@class="thumbheader"]/a/@href'
        paths = t2.xpath(xp)
        for j, path in enumerate(paths):
            url = 'http://shop.oreilly.com' + path
            html = requests.get(url).text
            url_csps = re.findall(r'path_info\:\s+(.*?\.csp)', html)
            if len(url_csps) != 1:
                continue
            url_csp = url_csps[0]
            url_csp = re.sub(r'\?.*', '', url_csp)
            url_pdf = re.sub(r'\.csp', '.pdf', url_csp)
            url_pdf = re.sub('/free/', '/free/files/', url_pdf)
            u = 'http://www.oreilly.com/%s' % url_pdf
            if verbose:
                print(u)
            yield u
Project: pyEbaySniper    Author: braph    | project source | file source
def get_as_etree(url):
    response = requests.get(url)
    parser = etree.HTMLParser()
    return etree.parse(StringIO(response.text), parser)
Project: gasvaktin    Author: gasvaktin    | project source | file source
def get_global_olis_prices():
    url = 'http://www.olis.is/solustadir/thjonustustodvar/eldsneytisverd/'
    res = requests.get(url, headers=utils.headers())
    html = etree.fromstring(res.content, etree.HTMLParser())
    bensin95_text = html.find('.//*[@id="gas-price"]/span[1]').text
    diesel_text = html.find('.//*[@id="gas-price"]/span[2]').text
    bensin_discount_text = html.find('.//*[@id="gas-price"]/span[4]').text
    diesel_discount_text = html.find('.//*[@id="gas-price"]/span[5]').text
    return {
        'bensin95': float(bensin95_text.replace(',', '.')),
        'diesel': float(diesel_text.replace(',', '.')),
        'bensin95_discount': float(bensin_discount_text.replace(',', '.')),
        'diesel_discount': float(diesel_discount_text.replace(',', '.'))
    }