Python lxml.html module: document_fromstring() example source code

The following 20 code examples, extracted from open-source Python projects, illustrate how to use lxml.html.document_fromstring().
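
Before the project snippets, here is a minimal, self-contained sketch of the pattern they all share: fetch an HTML page, parse it with document_fromstring(), and query the resulting element tree. The URL, the use of requests, and the variable names are illustrative assumptions, not taken from any of the projects below.

# Minimal usage sketch (illustrative; not from any of the projects below).
import requests
from lxml import html

response = requests.get('https://example.com')     # any page that returns HTML
root = html.document_fromstring(response.text)     # always yields the <html> root element
print(root.tag)                                     # 'html'
print(root.findtext('.//title'))                    # document title, if present
for href in root.xpath('//a/@href'):                # query the tree with XPath
    print(href)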

Project: TorrentBro    Author: subins2000    | project source | file source
def files(self):
        if not self._files:
            path = '/ajax_details_filelist.php'
            url = self.url.path(path).query_param('id', self.id)

            request = urllib.request.Request(
                url, headers={'User-Agent': "Magic Browser"})
            response = urllib.request.urlopen(request).read()

            root = html.document_fromstring(response)

            rows = root.findall('.//tr')

            if len(rows) == 1 and rows[0].find('td').get('colspan') == str(2):
                self._files = {}
            else:
                for row in rows:
                    name, size = [str(v.text_content())
                                  for v in row.findall('.//td')]
                    self._files[name] = size.replace('\xa0', ' ')
        return self._files
Project: presidency    Author: jayrav13    | project source | file source
def all(self):

        url = "http://www.presidency.ucsb.edu/executive_orders.php?year=%d&Submit=DISPLAY" % self.year

        page = requests.get(url)
        tree = html.document_fromstring(page.text)

        table = tree.xpath('//form[@name="executive_orders"]')[0].getnext().xpath('tr')

        output = []

        for i in range(1, len(table)):

            data = table[i].xpath('td')

            output.append({
                "president": data[0].text_content(),
                "date": data[1].text_content(),
                "id": data[2].xpath('a')[0].attrib['href'].split('=')[1],
                "link": "http://www.presidency.ucsb.edu" + data[2].xpath('a')[0].attrib['href'][2:]
            })

        return output
Project: prestashop-sync    Author: dragoon    | project source | file source
def search_shops_on_forum(force=False):
    # Get member pages
    step = 500
    # Resume from the last recorded page unless a full re-crawl is forced.
    if force:
        last_page = page_number = 1
    else:
        last_page = page_number = Member.objects.aggregate(Max('page_number'))['page_number__max'] or 1
    page_url = 'http://www.prestashop.com/forums/members/page__sort_key__members_display_name__sort_order__asc__max_results__%d__st__%d' % (step, (last_page-1)*step)
    while page_url:
        page = document_fromstring(urllib2.urlopen(page_url).read())
        # ':first' is jQuery syntax; ':first-child' is the valid CSS equivalent here.
        for member in page.cssselect('ul.members li h3.bar a:first-child'):
            # member profile url
            Member.objects.get_or_create(link=member.get('href'), defaults={'page_number': page_number})
        # Follow the "next" pagination link, if any (cssselect returns a list).
        next_links = page.cssselect('ul.pagination.left li.next a')
        page_url = next_links[0].get('href') if next_links else None
        page_number += 1

    for member in Member.objects.filter(page_number__gte=last_page):
        member_page = document_fromstring(urllib2.urlopen(member.link).read())
        for link in member_page.cssselect('div.general_box div.signature a'):
            ShopLink.objects.get_or_create(link=link.get('href'), member=member)
Project: prestashop-sync    Author: dragoon    | project source | file source
def search_shops_on_rus_forum(force=False):
    # Resume from the last recorded page unless a full re-crawl is forced.
    if force:
        last_page = 1
    else:
        last_page = MemberRus.objects.aggregate(Max('page_number'))['page_number__max'] or 1

    for i in range(last_page, 4219):
        page_url = 'http://prestadev.ru/forum/profile.php?u='+str(i)
        page = document_fromstring(urllib2.urlopen(page_url).read())
        messages = 0
        try:
            messages = int(page.cssselect('div.wttborder td strong')[2].text.strip())
        except (IndexError, ValueError, AttributeError):
            pass
        try:
            params = {'title': page.cssselect('#profilename')[0].text.strip(),
                      'messages': messages,
                      'page_number': i,
                      'home_page': page.cssselect('div.wttborder td.row1')[4]}
        except IndexError:
            continue
        member = MemberRus.objects.get_or_create(**params)[0]
        for link in page.cssselect('div.wgborder td.row1 a'):
            ShopLinkRus.objects.get_or_create(link=link.get('href'), member=member)
Project: crestify    Author: crestify    | project source | file source
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  So Instapaper doesn't close <li> tags
            #  This was causing infinite recursion when using BS directly
            #  Hence why the stuff below is being done, so that the <li> tags get closed
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Project: pygflib    Author: waz4bb    | project source | file source
def get_apikey(self, header=None):
        """
        Retrieve and sets a new apikey.

        :param header: a custom header for retrieving the apikey.
        """

        self.header = copy.deepcopy(self.DEFAULTHEADER)

        if header is None:
            header = self.APIKEYHEADER

        response = requests.get('http://www.gutefrage.net/frage_hinzufuegen', headers=header)
        self.apikey = re.search(
            "key: '([^']+)'",
            html.document_fromstring(response.text).xpath('//script[1]')[0].text
            ).group(1)

        self.header['X-Api-Key'] = self.apikey

        return self.apikey


    #TODO: rework this function eventually
Project: TorrentBro    Author: subins2000    | project source | file source
def items(self):
        """
        Request URL and parse response. Yield a ``Torrent`` for every torrent
        on page.
        """

        request = urllib.request.Request(
            self.url, headers={'User-Agent': "Magic Browser"})
        response = urllib.request.urlopen(request).read()

        # urlopen() returns bytes; lxml parses bytes directly, so wrapping in str() is not needed.
        root = html.document_fromstring(response)
        items = [self._build_torrent(row) for row in
                 self._get_torrent_rows(root)]
        for item in items:
            yield item
Project: TorrentBro    Author: subins2000    | project source | file source
def info(self):
        if self._info is None:

            request = urllib.request.Request(
                self.url, headers={'User-Agent': "Magic Browser"})
            response = urllib.request.urlopen(request).read()

            root = html.document_fromstring(response)
            info = root.cssselect('#details .nfo pre')[0].text_content()
            self._info = info
        return self._info
Project: presidency    Author: jayrav13    | project source | file source
def scrape(self):

        # Return Wikipedia page and turn into a tree.
        base_url = 'https://en.wikipedia.org'
        response = requests.get(base_url + '/wiki/Cabinet_of_the_United_States')
        tree = html.document_fromstring(response.text)

        # Get all of the rows of the Cabinet table.
        rows = tree.xpath('//th[text()="Cabinet"]')[0].getparent().getparent().getchildren()

        obj = []

        # Iterate through all rows.
        for x in rows:

            # Retrieve all of the elements per row.
            data = x.getchildren()

            # Only look at this if we're looking at Cabinet members.
            if len(data) == 3 and data[0].tag == 'td':
                print(data[1].xpath('div/a'))
                # Clean up data with strip.
                obj.append({
                    "title": [x for x in data[0].text_content().split('\n') if x != ''][0],
                    "seal": 'https:' + data[0].xpath('a/img')[0].attrib['src'],
                    "img": 'https:' + data[1].xpath('a/img')[0].attrib['src'],
                    "name": [x for x in data[1].text_content().split('\n') if x != ''][0],
                    "details": base_url + data[1].xpath('div/a')[0].attrib['href'] if len(data[1].xpath('div/a')) > 0 else None,
                    "is_acting": (len([x for x in data[1].text_content().split('\n') if x != '']) > 1 and [x for x in data[1].text_content().split('\n') if x != ''][1] == 'Acting'),
                    "date_appointed": data[2].text_content(),
                })

        print(json.dumps(obj))
Project: presidency    Author: jayrav13    | project source | file source
def __init__(self):
        self._base_url = 'https://en.wikipedia.org'
        self._response = requests.get(self._base_url + '/wiki/Political_appointments_of_Donald_Trump')
        self._tree = html.document_fromstring(self._response.text)

        self._congress_url = '/wiki/List_of_United_States_Senators_in_the_115th_Congress_by_seniority'
        self._senators_scraper = Senators(self._congress_url)
        self._senators = self._senators_scraper.scrape()
Project: presidency    Author: jayrav13    | project source | file source
def __init__(self, url):
        self._base_url = 'https://en.wikipedia.org'
        self._response = requests.get(self._base_url + url)
        self._tree = html.document_fromstring(self._response.text)
Project: presidency    Author: jayrav13    | project source | file source
def all(self):

        url = "http://www.presidency.ucsb.edu/debates.php"

        # Retrieve all debates as tree.
        page = requests.get(url)
        tree = html.document_fromstring(page.text)

        # List of all debate and date elements.
        dates = [x for x in tree.xpath('//td[@class="docdate"]') if len(x.text_content()) > 0]
        debates = tree.xpath('//td[@class="doctext"]')

        # Throw error if lengths are off.
        if len(dates) != len(debates):
            raise Exception('Sorry - something went wrong! Please open an issue at https://github.com/jayrav13/presidency/issues and include the following timestamp: %s' % str(time.time()))

        # Curate list of all debates.
        self.data = []

        for i in range(0, len(debates)):

            self.data.append({
                "date" : dates[i].text_content(),
                "debate" : debates[i].xpath('a')[0].text_content(),
                "link" : debates[i].xpath('a')[0].attrib['href'],
                "id" : int(debates[i].xpath('a')[0].attrib['href'].split('?')[1].split('=')[1])
            })

        return self.data
Project: presidency    Author: jayrav13    | project source | file source
def retrieve(self):

        url = 'http://www.presidency.ucsb.edu/ws/index.php?pid='

        page = requests.get(url + str(self.id))
        tree = html.document_fromstring(page.text)

        self.data = {
            "text": tree.xpath('//span[@class="displaytext"]')[0].text_content()
        }

        return self.data
Project: presidency    Author: jayrav13    | project source | file source
def get(self):

        page = requests.get(self.url)
        self.tree = html.document_fromstring(page.text)

        output = {
            "text" : self.tree.xpath('//span[@class="displaytext"]')[0].text_content(),
            "date": self.tree.xpath('//span[@class="docdate"]')[0].text_content(),
            "title": self.tree.xpath('//title')[0].text_content(),
            "id": self.id,
            "url": self.url,
            "president": self.tree.xpath('//title')[0].text_content().split(':')[0]
        }

        return output
Project: dbb-ranking-parser    Author: homeworkprod    | project source | file source
def select_rank_rows(html):
    """Return the table rows that are expected to contain rank data."""
    root = document_fromstring(html)
    return root.xpath(
        'body/form/table[@class="sportView"][2]/tr[position() > 1]')
Project: CSCE482-WordcloudPlus    Author: ggaytan00    | project source | file source
def scrape(site_address):
    page = requests.get(site_address)           #returns raw html
    page = clean_html(page.content) #removes <script> tags and their contents
    document = html.document_fromstring(page)   #removes all other tags

    return document.text_content()

# takes a url as a string and returns a STRING of all of the words
# that are used on that webpage
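
The helper described by the comment above is not included in this excerpt. What follows is a minimal sketch of what it might look like, reusing the scrape() function defined above; the name get_words is an assumption, not the project's actual function name.

# Hypothetical sketch of the function described by the comment above.
def get_words(site_address):
    text = scrape(site_address)          # plain text extracted from the page
    return ' '.join(text.split())        # collapse all whitespace into single spaces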
Project: reflected_xss_scanner    Author: justloop    | project source | file source
def fill_login_form(url, body, username, password):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    userfield, passfield = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    hasSubmitBefore, submit_values = submit_value(form)
    form_values = form.form_values()
    if not hasSubmitBefore:
        form_values += submit_values
    return (form.form_values() + submit_values), form_values, form.action or form.base_url, form.method, _pick_fields(form)
Project: Python    Author: ANT-Pi    | project source | file source
def get_content(source):
    '''
    Extract the text content from the page source and write it to file.
    :param source: raw HTML of the page
    :return: None
    '''
    selector = html.document_fromstring(source)
    content = selector.xpath('//div[@class="readtext"]')[0]
    num = content.xpath('h4/text()')
    every_content = content.xpath('p/text()')
    write_file(num)
    for each in every_content:
        write_file(each)
Project: FruitLine    Author: Caratpine    | project source | file source
def select_url(url, html, fruitline_spider_variable):
    # Skip responses that are too short to contain any links.
    if len(html) < 10:
        return []
    try:
        html_element = document_fromstring(urllib2.unquote(html))
        html_element.make_links_absolute(url)
        links = [i[2] for i in html_element.iterlinks()]
    except Exception, e:
        spider_logger.error("Function: select_url, Info: %s" % str(e))
        return []
    links_unrepeat = set(links)

    final_links = []
    for i in list(links_unrepeat):
        full_url = repair_url(i, fruitline_spider_variable)
        if fruitline_spider_variable.filter_rule != "":
            pattern = re.compile(fruitline_spider_variable.filter_rule)
            if re.match(pattern, full_url):
                if full_url not in fruitline_spider_variable.crawled_url_queue:
                    d = dict()
                    d['method'] = "get"
                    d['url'] = full_url
                    final_links.append(d)
        else:
            if full_url not in fruitline_spider_variable.crawled_url_queue:
                d = dict()
                d['method'] = "get"
                d['url'] = full_url
                final_links.append(d)

    return final_links
Project: get_wx_article    Author: zywaited    | project source | file source
def get_wx_article_lists(article_html, id_index):
    # global article_flag
    # List of parsed article entries to return.
    wx_article_list = []

    html_tree = html.document_fromstring(article_html)
    html_nodes = html_tree.xpath('//ul[@class="article-ul"]//li')

    for html_node in html_nodes:
        # One parsed article entry.
        wx_article_object = {}

        html_node_children = html_node.getchildren()

        # The "wx-ft" div holds the publish time, read count and like count (located via find below).
        div_wx_ft_children = html_node_children[1].find('div[@class="wx-ft"]').getchildren()
        pub_time = div_wx_ft_children[1].text_content().strip()
        pub_time = pub_time.encode('utf-8').split('?')
        if len(pub_time) < 2:
            print_pass_a_article(id_index,'time')
        else:
            pub_time = int(time.mktime(time.strptime(pub_time[1],'%Y-%m-%d %H:%M:%S')))
            # Stop once an article is older than the last crawl time.
            if pub_time <= last_time:
                # article_flag = False
                # print 'out of the time and return'
                return wx_article_list
        wx_article_object['time'] = str(pub_time)
        readnum_and_likenum = re.split(r'\s',div_wx_ft_children[2].text_content().strip())
        length = len(readnum_and_likenum)
        if length < 2:   
            print_pass_a_article(id_index,'readnum_and_likenum')
        readnum = str(readnum_and_likenum[0]).strip()
        wx_article_object['readnum'] = str(int(readnum))
        likenum = str(readnum_and_likenum[length-1]).strip()
        wx_article_object['likenum'] = str(int(likenum))

        div_wx_ft_h4 = html_node_children[1].find('h4')
        title = div_wx_ft_h4.find('a').text_content()
        if not title:
            print_pass_a_article(id_index,'title')
        wx_article_object['title'] = title
        content = div_wx_ft_h4.getnext().text_content()
        if not content:
            print_pass_a_article(id_index,'content')
        wx_article_object['content'] = content

        # URL and img data-hash
        div_wx_img_a = html_node_children[0].find('a')
        url = div_wx_img_a.get('href')
        if not url:
            print_pass_a_article(id_index,'url')
        wx_article_object['url'] = url
        img_hash = div_wx_img_a.find('img').get('data-hash')
        if not img_hash:
            print_pass_a_article(id_index,'img-hash')
        wx_article_object['imglink'] = get_img_link(img_hash)
        wx_article_object['id'] = str(int(id_index))

        wx_article_list.append(wx_article_object)
    return wx_article_list