Python lxml.etree module: HTML usage examples

The following code examples, extracted from open-source Python projects, illustrate how to use lxml.etree.HTML.
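For orientation, here is a minimal, self-contained sketch of the pattern most of the snippets below follow: hand a (possibly malformed) HTML string to etree.HTML and query the resulting tree with XPath.

from lxml import etree

# etree.HTML is lenient: it repairs broken markup and returns the <html> root element
root = etree.HTML("<p>first<p>second <a href='/x'>link</a>")
print(root.xpath('//a/@href'))   # ['/x']
print(root.xpath('//p/text()'))  # ['first', 'second ']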

Project: QUANTAXIS    Author: yutiansut
def QA_fetch_get_stock_block():
    url_list = ['gn', 'dy', 'thshy', 'zjhhy']  # concept / region / THS industry / CSRC industry
    data = []

    for item in url_list:
        tree = etree.HTML(requests.get(
            'http://q.10jqka.com.cn/{}/'.format(item), headers=headers).text)
        gn = tree.xpath('/html/body/div/div/div/div/div/a/text()')
        gpath = tree.xpath('/html/body/div/div/div/div/div/a/@href')
        for _i in range(len(gn)):
            for i in range(1, 15):
                _data = etree.HTML(requests.get(
                    'http://q.10jqka.com.cn/{}/detail/order/desc/page/{}/ajax/1/code/{}'.format(item, i, gpath[_i].split('/')[-2]), headers=headers).text)
                name = _data.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
                code = _data.xpath('/html/body/table/tbody/tr/td[3]/a/@href')
                for i_ in range(len(name)):
                    print(
                        'Now Crawling-{}-{}-{}-{}'.format(gn[_i], code[i_].split('/')[-1], item, 'ths'))
                    data.append([gn[_i], code[i_].split('/')[-1], item, 'ths'])

    return pd.DataFrame(data, columns=['blockname',  'code', 'type', 'source']).set_index('code', drop=False)
Project: Projects    Author: it2school
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0,3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1,4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"
Project: talonspider    Author: howie6879
def _get_html(cls, html, url, html_etree, params, **kwargs):
        if html:
            html = etree.HTML(html)
        elif url:
            if not kwargs.get('headers', None):
                kwargs['headers'] = {
                    "User-Agent": get_random_user_agent()
                }
            response = requests.get(url, params, **kwargs)
            response.raise_for_status()
            content = response.content
            charset = cchardet.detect(content)
            text = content.decode(charset['encoding'])
            html = etree.HTML(text)
        elif html_etree is not None:
            return html_etree
        else:
            raise ValueError("one of html, url or html_etree is expected")
        return html
Project: IPProxy    Author: yutian2011
def parse_page(page,pattern):
    page = etree.HTML(page.lower()) 
    #page = etree.HTML(page.lower().decode('utf-8')) 
    ips = page.xpath(pattern["ip"])
    ports = page.xpath(pattern["port"])
    ty = page.xpath(pattern["type"])
    for i in range(len(ips)):
        ret = {}
        str = "%s:%s"
        ret["ip_port"] = str%(ips[i].text,ports[i].text)
        if ty[i].text.find("https") == -1:
            ret["type"] = 0
        else:
            ret["type"] = 1
        ret["db_flag"] = False
        yield ret
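parse_page expects pattern to map 'ip', 'port' and 'type' to XPath expressions that select elements (the code reads .text on each match). A hypothetical pattern for a table-based proxy list, just to illustrate the shape:

pattern = {
    'ip':   '//table[@id="list"]//tr/td[1]',   # hypothetical selectors:
    'port': '//table[@id="list"]//tr/td[2]',   # adjust to the actual page
    'type': '//table[@id="list"]//tr/td[4]',
}
for proxy in parse_page(page_text, pattern):   # page_text: the fetched html
    print(proxy['ip_port'], 'https' if proxy['type'] else 'http')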
Project: catchWecaht    Author: leon0204
def get_list(self, search_url):
        data = {}
        # keylist =  [0] * 5
        data['table_name'] = 'dailyKeyword'
        html = requests.get(search_url, headers=self.headers, verify=False).content
        selector = etree.HTML(html)

        # hot-news keywords and their links
        keyurl = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/@href')
        keyword = selector.xpath('//div[@class="aside"]/ol[@class="hot-news"]/li/a/text()')
        res = {}
        res['keyurl'] = keyurl
        res['keyword'] = keyword

        for x in range(0,10):
            data['keyword'] = keyword[x]
            data ['keyurl'] = keyurl[x]
            data ['id'] = (x+1)
            self.save(data)
        return res
Project: dpspider    Author: doupengs
def __init__(self,data=None,response=None,url=None,logFile=None,color=True,debug=4):
        '''
        :param data: default=None <class str|unicode response.text>
        :param response: default=None <class Response>
        :param url: default=None <class str>
        :param logFile: default=None <class str>
        :param color: default=True <class bool>
        :param debug: default=4 <class int|0 NONE,1 [Error],2 [Error][WARNING],3 [Error][WARNING][INFO],4 ALL>
        '''
        self.logFile = logFile
        self.color = color
        self.debug = debug
        self.data = data
        self.response = response
        try:
            self.url = response.request.url if response and not url else url
            self._html = etree.HTML(self.data) if data else None
        except Exception as e:
            printText("[Error]parser.py Parser __init__:%s"%e,logFile=self.logFile,color=self.color,debug=self.debug)
Project: AutoHome_WOM_Spider    Author: dtc-auto
def get_type_id():
    start_url_list = [
        'http://www.autohome.com.cn/a00/',  # mini
        'http://www.autohome.com.cn/a0/',  # small
        'http://www.autohome.com.cn/a/',  # compact
        'http://www.autohome.com.cn/b/',  # mid-size
        'http://www.autohome.com.cn/c/',  # mid-to-large
        'http://www.autohome.com.cn/d/',  # full-size
        'http://www.autohome.com.cn/suv/',  # SUV
        'http://www.autohome.com.cn/mpv/',  # MPV
        'http://www.autohome.com.cn/s/',  # sports car
        'http://www.autohome.com.cn/p/',  # pickup
        'http://www.autohome.com.cn/mb/',  # minibus
    ]
    models_list = []
    for url_t in start_url_list:
        model_resp = process_request(url_t)
        model_respose = etree.HTML(model_resp)
        models = model_respose.xpath('.//a/@data-value')
        models_list = models_list + models
        models_list = list(set(models_list))
    return models_list
Project: national-geographic-wallpaper    Author: atareao
def set_nasa_wallpaper():
    st = datetime.fromtimestamp(time.time()).strftime('%y%m%d')
    url = URL07.format(st)
    r = requests.get(url)
    if r.status_code == 200:
        try:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            images = html.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    image_url = images[0].getparent().attrib['href']
                    image_url = 'https://apod.nasa.gov/' + image_url
                    if download(image_url) is True:
                        set_background(comun.POTD)
        except Exception as e:
            print(e)
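Note that recover=True is already the default for etree.HTMLParser, so the explicit parser here mainly documents intent. In isolation, the iter/getparent pattern used above works like this:

from lxml import etree

root = etree.HTML("<a href='image/apod.jpg'><img src='thumb.jpg'></a>")
for img in root.iter('img'):
    print(img.getparent().attrib['href'])   # image/apod.jpg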
Project: base_function    Author: Rockyzsu
def show_body():
    # with open('lianjia_body.txt','r') as fp:
    with open('cq_error.txt', 'r') as fp:
        content = json.loads(fp.read())['body']
    # print content
    tree = etree.HTML(content)
    nodes = tree.xpath('//li[@class="pictext"]')
    for node in nodes:
        xiaoqu_url = node.xpath('.//a[@class="flexbox post_ulog"]/@href')[0]
        name = node.xpath('.//div[@class="item_list"]/div[@class="item_main"]/text()')[0]
        desc = node.xpath('.//div[@class="item_list"]/div[@class="item_other text_cut"]/text()')[0]
        details = desc.split()
        price = node.xpath('.//div[@class="item_list"]/div[@class="item_minor"]/span/em/text()')[0]
        print xiaoqu_url
        print name
        print len(details)
        # print details
        for i in details:
            print i
            print
            # print details[0],details[1],details[2]
            # print price
Project: base_function    Author: Rockyzsu
def get_city_link():
    headers = {'Host': 'm.lianjia.com',
               'User-Agent': 'UCWEB/2.0 (Linux; U; Adr 2.3; zh-CN; MI-ONEPlus) U2/1.0.0 UCBrowser/8.6.0.199 U2/1.0.0 Mobile'}
    url = 'https://m.lianjia.com/city/'
    r = requests.get(url=url, headers=headers)
    contnet = r.text
    # print contnet
    tree = etree.HTML(contnet)
    t1 = tree.xpath('//ul[@class="item_lists"]')[1]
    city_list = []
    for city in t1:
        link = city.xpath('.//a/@href')[0]
        if link == '/sh/':
            continue
        if link == '/su/':
            continue
        if link == '/xsbn/':
            continue

        city_list.append('https://m.lianjia.com' + link)
    return city_list
Project: base_function    Author: Rockyzsu
def debug_page():
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'
    }
    url = 'http://m.qfang.com/guangzhou/rent/100001468?gardenId=1109818'
    r = requests.get(url=url, headers=headers)
    #r.encoding='gbk'
    print r.status_code
    print type(r.content)
    print r.content
    #print chardet.detect(r)
    tree = etree.HTML(r.text,parser=etree.HTMLParser(encoding='utf-8'))
    #print etree.tostring(tree)
    return tree,r.text
Project: base_function    Author: Rockyzsu
def testcase2():
    js=json.loads(open('lianjia_sh.txt').read())
    #print js
    body=js['data']
    tree = etree.HTML(body)
    nodes = tree.xpath('//li[@class="pictext"]')
    print "NODE:",len(nodes)
    print js['args']
    print '*'*20
    print type(js)
    print type(js['args'])
    #p=re.compile('"cur_city_name":"(.*?)"')
    p=re.compile('"total":(\d+)')
    s=p.findall(js['args'])[0]
    print s
    '''
    print type(s)
    print s
    print s.decode('utf-8').encode('gbk')
    print s.decode('unicode_escape')


    for k,v in js['args'].items():
        print k,"::",v
    '''
Project: base_function    Author: Rockyzsu
def lxml_case2():
    # parse an XML snippet with the lenient HTML parser
    str1='''
    <bookstore>

    <book>
      <title>Harry Potter</title>
      <author>J K. Rowling</author>
      <year>2005</year>
      <price>29.99</price>
    </book>

    </bookstore>
    '''
    tree=etree.HTML(str1)
    t1=tree.xpath('bookstore')
    print t1
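t1 prints as an empty list: etree.HTML wraps the snippet in html/body, so the relative path 'bookstore' evaluated at the <html> root matches nothing, while a descendant search finds the element:

tree = etree.HTML('<bookstore><book><title>Harry Potter</title></book></bookstore>')
print(tree.xpath('bookstore'))                      # [] - the root element is <html>
print(tree.xpath('//bookstore/book/title/text()'))  # ['Harry Potter']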
Project: base_function    Author: Rockyzsu
def getData(self):
        base_url='http://sj.qq.com/myapp/category.htm'
        parent_url='http://sj.qq.com/myapp/category.htm?orgame=1'
        s=requests.get(url=parent_url,headers=self.headers)
        print s.status_code
        #print s.text
        tree=etree.HTML(s.text)
        menu=tree.xpath('//ul[@class="menu-junior"]')[0]
        print type(menu)

        link= menu.xpath('.//li[@id]/a/@href')
        catelog=[]
        for i in link:
            print i
            p=re.compile('categoryId=(-?\d+)')
            #x=base_url+i
            x=p.findall(i)[0]
            #print x
            catelog.append(x)
        return catelog
Project: openlawClawer    Author: sml2h3
def get_list(self, cookies):
        print("Fetching list page %s\r\n" % self.page)
        page_r = requests.get(self.targetUrl + "&page=%s" % self.page, cookies=cookies)
        if page_r.status_code == 200:
            if 'window.v=' in page_r.text:
                return 10001
            tree = etree.HTML(page_r.text)
            init_list = tree.xpath('//*[@id="ht-kb"]/article/h3/a')
            list_array = []
            for item in init_list:
                item_link = item.get('href')
                item_text = item.text
                item_array = [item_text,item_link]
                list_array.append(item_array)
            return list_array
        else:
            print("Request failed, retrying in 5 seconds\r\n")
            time.sleep(5)
            return self.get_list(cookies)
Project: webspider    Author: GuozhuHe
def get_proxys(pages=4):
    """Fetch a list of proxies from xicidaili."""
    proxy_list = []
    url = 'http://www.xicidaili.com/wn/{page_no}'
    headers = generate_http_header()
    headers.update(
        {
            'Referer': 'http://www.xicidaili.com/wn/',
            'Host': 'www.xicidaili.com',
        }
    )
    for page_no in range(1, pages + 1):
        response = requests.get(url=url.format(page_no=page_no), headers=headers)
        html = etree.HTML(response.text)
        ips = html.xpath("//table[@id='ip_list']/tr/td[2]/text()")
        ports = html.xpath("//table[@id='ip_list']/tr/td[3]/text()")
        assert len(ips) == len(ports)
        for (ip, port) in zip(ips, ports):
            proxy_list.append(constants.HTTP_PROXY_FORMATTER.format(ip=ip, port=port))
    return proxy_list
Project: webspider    Author: GuozhuHe
def requests_company_detail_data(company_id):
    """Fetch and parse a company detail page."""
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.COMPANY_DETAIL_URL.format(company_id=company_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    advantage = html.xpath('//div[@id="tags_container"]//li/text()')
    size = html.xpath('//div[@id="basic_container"]//li[3]/span/text()')
    address = html.xpath('//p[@class="mlist_li_desc"]/text()')
    introduce = html.xpath('//span[@class="company_content"]//text()')

    return format_tag(advantage, address, size, introduce, company_id)
Project: webspider    Author: GuozhuHe
def requests_job_detail_data(job_id):
    """Fetch and parse a job detail page."""
    headers = generate_http_header()
    crawler_sleep()
    try:
        response = requests.get(
            url=constants.JOB_DETAIL_URL.format(job_id=job_id),
            headers=headers,
            cookies=Cookies.get_random_cookies(),
            allow_redirects=False,
            timeout=constants.TIMEOUT)
    except RequestException as e:
        logging.error(e)
        raise RequestsError(error_log=e)
    html = etree.HTML(response.text)
    department = html.xpath('//div[@class="job-name"]/div[@class="company"]/text()')
    description = html.xpath('//dd[@class="job_bt"]/div//text()')
    keywords = html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()')
    return format_tag(department, description, keywords, job_id)
Project: Music163    Author: qshine
def index(url='http://music.163.com/discover'):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 DOL/s_1511_r2x9ak474125_821',
    }
    try:
        r = requests.get(url, headers=headers, timeout=4)
        html = etree.HTML(r.content)
        play_lists = [urlparse.urljoin('http://music.163.com/', link) for link in
                      html.xpath('//*[@id="discover-module"]/div[1]/div/div/div[1]/ul//li/div/a/@href') if
                      link.startswith('/playlist')]
        for url in play_lists:
            app.send_task(
                'tasks.playlist.playlist',
                args=(url, ),
                queue='playlist_queue',
                routing_key='tasks_playlist'
            )
    except:
        print 'request failed'
Project: Music163    Author: qshine
def playlist(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 DOL/s_1511_r2x9ak474125_821',
    }
    try:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            html = etree.HTML(r.content)
            ids = [search(link).group() for link in html.xpath('//a/@href') if link.startswith('/song?id') and search(link)]
            for song_id in ids:
                url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(song_id)
                app.send_task(
                    'tasks.comment.comment',
                    args=(url, song_id),
                    queue='comment_queue',
                    routing_key='tasks_comment'
                )
                time.sleep(5)
    except:
        print 'request failed'
Project: Daily-code    Author: rui7157
def parse(self, response):
        # parse the response and yield follow-up urls
        if not response:
            return None
        et = etree.HTML(response)      
        links = et.xpath("//*[@valign='top'][1]/a/@href")
        urls=[]
        for link in links:
            # extract the numeric user id from the profile link
            print link
            uid=re.findall(r"http://weibo\.cn/u/(\w*)", link) # some profiles use a vanity url instead (e.g. http://weibo.cn/renzhenghao)
            if uid:
                uid=uid[0]
            else:
                continue   
            SinaWeiboItem["uid"]=uid   
            info_url = "http://weibo.cn/{uid}/info".format(uid=uid)
            Request(info_url, callback=self.parse_info)
            datas={"uid":SinaWeiboItem["uid"],"name":SinaWeiboItem["name"],"info":SinaWeiboItem["info"]}
            print sina_info.insert(datas)
            urls.append("http://weibo.cn/{uid}/fans".format(uid=uid)) # fans pages to crawl next
        return urls
Project: InstaBot    Author: nickpettican
def media_by_tag(browser, tag_url, media_url, tag, media_max_likes, media_min_likes):
    # returns list with the 14 'nodes' (posts) for the tag page 

    result = {'posts': False, 'tag': tag}
    try:
        explore_site = browser.get(tag_url %(tag))
        tree = etree.HTML(explore_site.text)
        data = return_sharedData(tree)
        if data:
            nodes = data['entry_data']['TagPage'][0]['tag']['media']['nodes']
            result['posts'] = [{'user_id': n['owner']['id'],
                                'username': return_username(browser, media_url, n['code']),
                                'likes': n['likes']['count'], 
                                'caption': n['caption'], 
                                'media_id': n['id'], 
                                'url_code': n['code']} 
                                for n in nodes if media_min_likes <= n['likes']['count'] <= media_max_likes if not n['comments_disabled']]
    except Exception as e:
        print '\nError in obtaining media by tag: %s' %(e)
    return result
Project: SmallReptileTraining    Author: yanbober
def parse_main_subjects(self, content):
        '''
        Parse the urls of the main subjects from the page.
        :param content: html content of the page
        :return: a list of subject urls
        '''
        try:
            html = etree.HTML(content.lower())
            subject = html.xpath('//ul[@class="img"]/li')
            subject_urls = list()
            for sub in subject:
                a_href = sub[0].get('href')
                subject_urls.append(a_href)
            return subject_urls
        except Exception as e:
            print(str(e))
            return list()
Project: HtmlExtract-Python    Author: xinyi-spark
def replace_InvalidTag(Html):
    '''
    Strip unwanted markup and noise from HTML.
    '''
    re_cdata = re.compile('//<!\[CDATA\[[^>]*//\]\]>', re.I)  # strip commented CDATA blocks
    Html = re_cdata.sub('', Html)
    re_cdata = re.compile('<!\[CDATA\[[^>]*//\]\]>', re.I)  # strip CDATA blocks
    Html = re_cdata.sub('', Html)
    re_br = re.compile('<br\s*?/?>')  # turn <br> into newlines
    Html = re_br.sub('\n', Html)
    space_line = re.compile('\s+')  # collapse all whitespace (harmless for Chinese text)
    Html = space_line.sub('', Html)
    re_comment = re.compile('<!--[^>]*-->')  # strip HTML comments
    Html = re_comment.sub('', Html)
    re_style = re.compile('<style\s*[^>]*>(.*?)</style\s*>')
    Html = re_style.sub('', Html)
    re_script = re.compile('<script\s*[^>]*>(.*?)</script>')
    Html = re_script.sub('', Html)
    re_h = re.compile('</?[^>]*>')  # strip the remaining tags
    Html = re_h.sub('', Html)
    return Html
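A quick check of what the filter produces; note the whitespace pass deletes every space, which is harmless for Chinese text but aggressive for English:

sample = '<div><script>var x=1;</script>Hello<br/>world<!-- note --></div>'
print(replace_InvalidTag(sample))   # -> Helloworld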
Project: HtmlExtract-Python    Author: xinyi-spark
def replace_CharEntity(Html):
    '''
    Replace common HTML character entities with their literal characters;
    unknown entities are dropped.
    '''
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(Html)
    while sz:
        key = sz.group('name')  # entity name without '&' and ';', e.g. 'gt' for '&gt;'
        try:
            Html = re_charEntity.sub(CHAR_ENTITIES[key], Html, 1)
            sz = re_charEntity.search(Html)
        except KeyError:
            # unknown entity: drop it
            Html = re_charEntity.sub('', Html, 1)
            sz = re_charEntity.search(Html)
    return Html
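For example, known entities are replaced and the unknown &copy; is dropped:

print(replace_CharEntity('a&lt;b&nbsp;&amp;&nbsp;c&gt;d&copy;e'))   # a<b & c>de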
Project: HtmlExtract-Python    Author: xinyi-spark
def extract_meta(html):
    '''
    Extract the Chinese-language meta content of a page.
    '''
    if chardet.detect(html)['encoding'] == 'utf-8':
        html = html.decode('utf-8')
    meta_list = []
    # collect the content attribute of every meta tag
    page = etree.HTML(html.lower())
    xpath_result = page.xpath(u"//meta/@content")
    for once_xpath_result in xpath_result:
        # keep only values that contain Chinese characters
        if zh_check(once_xpath_result) == True:
            meta_list.append(utf8_transfer(once_xpath_result).decode('utf-8'))
    if meta_list != []:
        return meta_list
    else:
        return False
Project: DistributeCrawler    Author: SmallHedgehog
def validProxy(self):
        """
         url: http://www.66ip.cn/
        """
        url = 'http://www.66ip.cn/areaindex_1/1.html'

        response = requests.get(url=url, headers=self.headers)
        htmlDoc = response.content.decode('gbk')

        htmlTree = etree.HTML(htmlDoc)
        proxy_list = htmlTree.xpath('.//table//tr')
        for proxy in proxy_list:
            proxies = ':'.join(proxy.xpath('./td/text()')[0:2])
            if self.__verifyProxy(proxies):
                if self.__isVaildProxy(proxies):
                    return {
                        "https": "https://{proxy}".format(proxy = proxies)
                    }
        return None
Project: prestashop-sync    Author: dragoon
def get_xml_data(req_string, headers, data=None):
    req = urllib2.Request(req_string, headers=headers)
    html_data = _get_html_data(req, data)
    # Clean chunked data
    html_data = clean_chunked_data(html_data)
    #log_user_action(req.get_host() ,'chunked data', html_data, {})

    try:
        data = etree.fromstring(html_data)
    except XMLSyntaxError:
        # lxml cannot handle encoding declarations :(
        data = etree.HTML(html_data, etree.HTMLParser())
        # data is None when it was not XML, like 404 page without 404 code
        if data is not None:
            data = data.getroottree()
        else:
            raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
        # TODO: check valid
        #if not data.find('.//prestashop'):
        #    raise urllib2.HTTPError(req_string, 404, "Not an XML", None, None)
    return data
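The fallback idea, try the strict XML parser first and drop down to the forgiving HTML parser only on a syntax error, is reusable on its own; a minimal sketch with illustrative names:

from lxml import etree

def parse_lenient(raw):
    try:
        return etree.fromstring(raw)    # strict XML first
    except etree.XMLSyntaxError:
        root = etree.HTML(raw)          # forgiving HTML fallback; None if hopeless
        return root.getroottree() if root is not None else None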
Project: IBM-Waston-apply    Author: littlewizardLI
def MakePoem(word):
  url_base = "http://so.gushiwen.org/search.aspx?value="
  key = word
  url = url_base+key
  res = requests.get(url)
  res.encoding = 'utf-8'
  #print(res.text)
  root = etree.HTML(res.content)
  items = root.xpath('//div[@class="sons"][2]/p[@style="margin-bottom:0px;"]')[0]
  item = items.xpath('string(.)')

  content = item.replace('\n','').replace(' ','')
  length = len(content)
  answer = content[:length-1]

  return answer
Project: FundSpider    Author: s6530085
def parse_home(self, home_content):
        if home_content is None:
            return None
        home_content = home_content.encode('ISO-8859-1').decode('gbk')
        html = etree.HTML(home_content, parser=etree.HTMLParser(encoding='utf-8'))
        alinks = html.xpath('//a[@href]')

        pattern_capture = re.compile(ur"（(\d{6})）(.+)")  # rows look like （<6-digit code>）<fund name>
        l = []
        for alink in alinks:
            aa = alink.text
            if aa != None:
                match = pattern_capture.match(aa)
                if match:
                    # originally stored (code, name) pairs; only the code is kept
                    # l.append((match.group(1), match.group(2)))
                    l.append(match.group(1))
        return l
Project: FundSpider    Author: s6530085
def parse_ratio(self, info, content):
        # content = content.split('"')[1]
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        tds = html.xpath('//td[@class="tor"]')
        if len(tds) > 2:
            # institutional holding ratio; the cell shows '---' when no data is available
            insito = tds[0].text
            if insito != '---':
                info.inratio += safe_to_float(insito.split("%")[0])
            # innerto = tds[2].text
            # if innerto != '---':
            #     self.inratio += safe_to_float(innerto.split("%")[0])
            # self.inratio = safe_to_float(.split('%')[0]) + safe_to_float(tds[2].text.split('%')[0])
Project: FundSpider    Author: s6530085
def parse_stocks(self, info, content):
        html = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        # holdings tables; the first table is the latest reporting period
        tbs = html.xpath('//table[@class="w782 comm tzxq"]')
        # pers = html.xpath('//table[@class="w782 comm tzxq"]')
        if len(tbs) > 0:
            # stock links sit in the 'tol' cells of the first table
            stocktds = tbs[0].xpath('.//td[@class="tol"]/a')
            pers = tbs[0].xpath('.//td[@class="tor"]')
            # the two page layouts place the percentage cell at different
            # offsets and strides relative to each stock row
            front, interval = 2, 5
            if not '???' in content:
                front, interval = 0, 3
            for (index, stocked) in enumerate(stocktds):
                # info.stocks.append(stocked.text)
                # the 'tor' cell holds the holding percentage
                per = pers[index*interval+front]
                # skip rows whose percentage is missing ('---')
                if per.text == '---':
                    continue
                # store entries as 'name-percent'; stock names are
                # occasionally None on the source page, so guard for it
                stockname = stocked.text
                if not stockname is None and len(stockname) > 0:
                    info.stocks.append(stockname + '-' + per.text)
Project: FundSpider    Author: s6530085
def parse_index_list(self, index_list_content):
        # decode the page content
        index_list_content = index_list_content.encode('ISO-8859-1').decode('utf-8')
        parsed_content = etree.HTML(index_list_content, parser=etree.HTMLParser(encoding='utf-8'))
        trs = parsed_content.xpath('//tbody/tr')
        indexs = []
        for tr in trs:
            tds = tr.xpath('./td')
            if len(tds) == 5:
                index = IndexInfo()
                code = tds[0].text.strip()
                if len(code.split('.')) == 2:
                    index.code = code.split('.')[0]
                    index.full_code = code
                index.name = tds[1].text.strip()
                index.begin_time = tds[2].text.strip()
                index.short_name = tds[3].text.strip()
                # the weave column holds either a link or plain text
                weave = tds[4].xpath('./a')
                if len(weave) == 1:
                    index.weave = weave[0].attrib['href'].strip()
                else:
                    index.weave = tds[4].text.strip()
                indexs.append(index)
        return indexs
Project: proxy_pool    Author: jhao104
def getHtmlTree(url, **kwargs):
    """
    Fetch the parsed html tree of a url.
    :param url:
    :param kwargs:
    :return:
    """

    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    wr = WebRequest()

    # delay 2s per request
    time.sleep(2)

    html = wr.get(url=url, header=header).content
    return etree.HTML(html)
Project: spoon    Author: Jiramew
def get_html_tree(url, headers=None, cookie=None, proxy=None):
    if headers is None:
        headers = HEADERS

    try:
        response = requests.get(url=url, headers=headers, cookies=cookie, timeout=10, proxies=proxy)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        html = response.text
        if isinstance(html, bytes):
            html = html.decode("utf-8")
        time.sleep(1)
        return etree.HTML(html)
    except Exception as e:
        log.error("{0}".format(e))
        raise e
Project: WebAutomaiton    Author: AlvinXuCH
def WriteHTML(self,testcaseinfo):

        self.CreateHtmlFile()

        f = open(self.reportfile,"r")

        htmlcontent = f.read()
        f.close()
        #tree = mytree.fromstring(str(htmlcontent))
        htmlcontent.encode('utf-8')
        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")
        if testcaseinfo.result == "Failed":
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        else:
            mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id,testcaseinfo.name,testcaseinfo.owner,testcaseinfo.result,testcaseinfo.starttime,testcaseinfo.endtime,testcaseinfo.secondsDuration,testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        f = open(self.reportfile,"w")
        #html.tostring
        newContent = repr(html.tostring(tree,method="html",with_tail=False))
        newContent = newContent.replace(r"\n","").replace(r"\t","").replace('b\'',"")
        newContent = newContent[:len(newContent)-1]
        f.write(newContent)
        f.close()
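One caveat: mytree.HTML(...) (assuming mytree is lxml.etree, as the calls suggest) returns the whole document root, so the append above grafts an <html> element inside the table. Since each generated row string is well-formed, parsing it as XML yields just the <tr> element; a sketch:

from lxml import etree

table = etree.HTML('<table><tr><td>old</td></tr></table>').find('.//table')
row = etree.fromstring('<tr><td>new</td></tr>')   # well-formed, parses as XML
table.append(row)
print(etree.tostring(table).decode())             # table now has both rows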