Python bs4 module: BeautifulSoup() code examples

The following 50 code examples, extracted from open-source Python projects, illustrate how bs4.BeautifulSoup() is used in practice.

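For orientation, here is a minimal, self-contained usage sketch (the HTML string and the parser choice are illustrative, not taken from any of the projects below):

from bs4 import BeautifulSoup

html = '<html><body><a href="https://example.com">example</a></body></html>'
soup = BeautifulSoup(html, "html.parser")  # "lxml" or "html5lib" also work if installed
for a in soup.find_all("a"):               # iterate over every <a> tag
    print(a["href"], a.get_text())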
Project: socialhome    Author: jaywink
def make_nsfw_safe(text):
    """Make NSFW safer by adding click-to-show class to images."""
    soup = BeautifulSoup(text, "lxml")
    images = soup.find_all("img")

    for image in images:
        if image.get("class"):
            image["class"] = "%s nsfw" % " ".join(image.get("class"))
        else:
            image["class"] = "nsfw"
        image.replace_with(image)

    result = str(soup)
    # We don't want html/body, which BeautifulSoup kindly wraps our new HTML in
    if result.startswith("<html><body>") and result.endswith("</body></html>"):
        result = result[len("<html><body>"):-len("</body></html>")]
    return result
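A quick sketch of the effect (not part of the original project; output attribute order may vary):

print(make_nsfw_safe('<p><img src="cat.jpg"/></p>'))
# expected: <p><img class="nsfw" src="cat.jpg"/></p>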
Project: qqmbr    Author: ischurov
def mathjax(s):
    with open("temp.log", "w") as f:
        f.write(s)

    p = Popen([app.config['mjpage'],
              '--dollars',
               '--output', "CommonHTML",
               '--fontURL',
               ("https://cdnjs.cloudflare.com/ajax/libs/"
                "mathjax/2.7.0/fonts/HTML-CSS")], stdout=PIPE, stdin=PIPE,
              stderr=PIPE)

    #filename = hashlib.sha256(s.encode('utf-8')).hexdigest()
    #with open(filename, 'w') as f:
    #    print(s, file=f)

    res = p.communicate(input=s.encode('utf-8'))
    out = res[0].decode('utf-8')
    err = res[1].decode('utf-8')

    soup = BeautifulSoup(out, 'html.parser')
    style = str(soup.style)
    body = "".join(str(s) for s in soup.body.children)

    return style, body
Project: optimum-arena    Author: ovidiugiorgi
def get_best(url):
    url = 'http://www.infoarena.ro' + url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    name = soup.find('span', {'class': 'username'}).find('a')['href'][35:]
    tests = soup.find_all('td', {'class': 'number'})
    max_ms = -1
    for test in tests:
        test = test.string
        if test.endswith('ms'):
            time = int(test.strip('ms'))
            max_ms = max(max_ms, time)
    if name not in d or max_ms < d[name][0]:
        d[name] = (max_ms, url)
    print(max_ms, name, url)
Project: HatDecrypter    Author: HatBashBR
def decrypt(hash, tipo):
    global word

    try:
        if(tipo == 0):
            url = BeautifulSoup(urllib.urlopen("https://md5.gromweb.com/?md5=" + hash), "html.parser")
        else:
            url = BeautifulSoup(urllib.urlopen("https://sha1.gromweb.com/?hash=" + hash), "html.parser")

        password = url.find("em", {"class": "long-content string"})
        password = re.sub(re.compile("<.*?>"), "", str(password)).strip()
        if str(password) == "None":
            print word+"\t\t\t\t[-] Senha nao encontrada! :-("
        else:
            print word+"\t\t\t\t[+] Senha encontrada: " + password
    except IOError:
       decryptwl(hash, tipo)
Project: AFSCbot    Author: HadManySons
def add_afsc_links(full_afsc_dict, reddit):
    """
    Add links from the /r/AirForce wiki into the given AFSC dictionary.
    :param full_afsc_dict: dict containing the enlisted and officer AFSC dicts
    :param reddit: PRAW reddit object
    """
    # gets dict of AFSC to link on /r/AirForce wiki
    wiki_page = reddit.subreddit("AirForce").wiki["index"]
    wiki_soup = BeautifulSoup(wiki_page.content_html, "html.parser")
    links = wiki_soup.find_all("a")

    # currently all wiki AFSC are enlisted
    for link in links:
        # not all links have /r/AirForce/wiki/jobs so this is more generalized
        # using only /r/AirForce/ wiki links
        if "www.reddit.com/r/AirForce/wiki/" in link["href"]:
            AFSC_code = link["href"].split("/")[-1].upper()
            base_afsc = AFSC_code[:5]  # shaves off any prefixes
            if base_afsc in full_afsc_dict["enlisted"].keys():
                full_afsc_dict["enlisted"][base_afsc]["link"] = link["href"]
Project: scibot    Author: SciCrunch
def process_POST_request(request):
    dict_ = urlparse.parse_qs(request.text)
    def htmlify(thing):
        try:
            html = dict_[thing][0]
        except KeyError as e:
            html = ''
        return '<html>' + html + '</html>'
    uri = dict_['uri'][0]
    head = htmlify('head')
    body = htmlify('body')
    try:
        text = dict_['data'][0]
    except KeyError as e:
        text = ''

    headsoup = BeautifulSoup(head, 'lxml')
    bodysoup = BeautifulSoup(body, 'lxml')

    target_uri = getUri(uri, headsoup, bodysoup)
    doi = getDoi(headsoup, bodysoup)
    return target_uri, doi, head, body, text
Project: Rosi    Author: HaoBingo
def getRosiItem():
    start = time.time()
    index = 1
    while True:
        url = "http://www.mmxyz.net/category/rosi/page/{}/".format(index)
        res = requests.get(url,timeout=10)
        if res.status_code == 404:
            print("+   Time: {:.2f} S         +".format(time.time()-start))
            print("+   Total Pages:     {}   +".format(index-1))
            print("+  Total Numbers:   {}  +".format(len(RosiItems)))
            print("+-------------------------+\r\n\r\n")
            return
        soup = BeautifulSoup(res.content, "html.parser")
        rosiList = soup.find_all("a", class_="inimg")
        for rosi in rosiList:
            RosiItems.append(rosi['href'])
        index += 1
Project: sopel-modules    Author: phixion
def hltb(bot,trigger):
    if not trigger.group(2):
        return bot.say("Enter a game name to search.")
    game = trigger.group(2)
    url = "http://howlongtobeat.com/search_main.php?page=1"
    payload = {"queryString":game,"t":"games","sorthead":"popular","sortd":"Normal Order","length_type":"main","detail":"0"}
    test = {'Content-type':'application/x-www-form-urlencoded', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36','origin':'https://howlongtobeat.com','referer':'https://howlongtobeat.com'}
    session = requests.Session()
    session.post(url, headers=test, data=payload)
    r = session.post(url, headers=test, data=payload)
    if len(r.content) < 250:
        return bot.say("No results.")
    bs = BeautifulSoup(r.content)
    first = bs.findAll("div", {"class":"search_list_details"})[0]
    name = first.a.text
    time = first.findAll('div')[3].text
    bot.say('{} - {}'.format(name, time))
Project: Crawler_and_Share    Author: f496328mm
def craw_last_index(ptt_class_name):   
    #ptt_class_name = 'Soft_Job'
    index_url = 'https://www.ptt.cc/bbs/' + ptt_class_name + '/index.html'
    res = requests.get(index_url,verify = True)
    soup3 = BeautifulSoup(res.text, "lxml")   

    x = soup3('', {'class': "btn wide"}, text=re.compile(u'上頁'))  # the "previous page" button link reveals the latest index
    last_index = x[0]['href']
    last_index = last_index.replace('/bbs/' + ptt_class_name + '/index','')
    last_index = int( last_index.replace('.html','') )+1

    return last_index
#--------------------------------------------------------------------------------- 
# Meant to be run from ubuntu via crontab -e on a schedule so the latest data is fetched automatically.
# PTT boards update constantly, so the crawler is re-run periodically and resumes from the last
# crawled index; this way only new data is fetched and duplicate DATA is avoided.
Project: OpenCouture-Dev    Author: 9-9-0
def addToCart(self):
        print '\nADD TO CART -----------------'
        session_get = self.user_session.get(self.URL_product_url, headers=self.get_headers)
        #print session_get.content
        soup = BeautifulSoup(session_get.content, 'lxml')

        results = soup.find_all('select', class_='size-select')
        #print results

        for item in results[0].select('option'):
            re_result = re.sub(self.sub_pattern, '', item.string)
            #print re_result
            matchObj = re.search(r"^%s+$" % self.user_size, re_result)
            if matchObj:
                self.post_data_addToCart['pid'] = item['value']
                self.post_data_addToCart['masterPID'] = item['value'].partition("_")[0]
                print self.post_data_addToCart
                break

        session_post = self.user_session.post(url=self.URL_cart_post_url, headers=self.post_headers, data=self.post_data_addToCart)
        print 'Add To Cart Status: ' + str(session_post.status_code)
Project: OpenCouture-Dev    Author: 9-9-0
def finalBoss(self):
        print '\nEntering Payment Info -----------------------------'
        self.get_headers['Referer'] = self.URL_checkout_url
        self.post_headers['Referer'] = self.URL_pay_url
        #print json.dumps(self.get_headers, indent=1)
        session_get = self.user_session.get(self.URL_pay_url, headers=self.get_headers)
        savePage(session_get, 'finalCheckout.html')
        soup = BeautifulSoup(session_get.content, 'lxml')
        pay_secure_key = soup.find('input', {'name':'dwfrm_payment_securekey'})
        print pay_secure_key


        #NOTE: Visa, Mastercard, etc...correspond to different types. Find how they get set
        #NOTE: Visa = 001, Mastercard = 002, AE = 003, Discover = 004
        post_data_payInfo = { 'dwfrm_payment_creditCard_type': '002',
                              'dwfrm_payment_creditCard_owner': 'Bob McFlymo',
                              'dwfrm_payment_creditCard_number': '5105105105105100',
                              'dwfrm_payment_creditCard_month': '01',
                              'dwfrm_payment_creditCard_year': '2018',
                              'dwfrm_payment_creditCard_cvn': '002',
                              'dwfrm_payment_securekey': pay_secure_key,
                              'dwfrm_payment_signcreditcardfields': 'sign'
                             }

        #savePage(session_get, 'finalCheckout.html')
Project: OpenCouture-Dev    Author: 9-9-0
def checkItemDirect(self):
        #NOTE: this function will most likely hamper performance, though in some cases it may improve it; whether to run it before checkout is left to the user
        #Basic Steps:
        #Use BS to parse for <ul class="size options"
        #Size marked as follows: <li class="8 available" data-option-title="8"
        #Therefore, match data-option-title with user_size, then check the class for available keyword
        session_get = self.user_session.get(self.URL_product)
        print 'Status of requests.get: ' + str(session_get.status_code)
        soup = BeautifulSoup(session_get.content, "lxml")
        #Check that the lxml parser works for html
        #Look to use SoupStrainer to improve parsing efficiency
        for li in soup.select('li[data-option-title]'):
            #print li['class']
            #print type(li['class'])

            if (self.user_size in li['class']) & ('available' in li['class']): 
                print 'Size ' + self.user_size + ' Available'
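The availability check above is a CSS attribute selector plus a membership test on the tag's class list; here is a self-contained sketch of the same pattern with made-up markup (the real product page differs):

from bs4 import BeautifulSoup

html = '''<ul class="size options">
  <li class="7 soldout" data-option-title="7">7</li>
  <li class="8 available" data-option-title="8">8</li>
</ul>'''
soup = BeautifulSoup(html, "html.parser")
user_size = "8"
for li in soup.select('li[data-option-title]'):
    # li['class'] is a list of class names, e.g. ['8', 'available']
    if user_size in li['class'] and 'available' in li['class']:
        print('Size ' + user_size + ' Available')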
Project: potatoygg    Author: Ripolin
def getMoreInfo(self, nzb):
        """
        Get details about a torrent.

        .. seealso:: MovieSearcher.correctRelease
        """
        data = self.getHTMLData(nzb['detail_url'])
        soup = BeautifulSoup(data, 'html.parser')
        description = soup.find(id='description')
        if description:
            nzb['description'] = description.prettify()
        line = soup.find(text='Date de publication').parent.parent
        pub = line.find_all('td')[1]
        added = datetime.strptime(pub.getText().split('(')[0].strip(),
                                  '%d/%m/%Y %H:%M')
        nzb['age'] = (datetime.now() - added).days
        self.log.debug(nzb['age'])
Project: qqmbr    Author: ischurov
def test_parse_html2(self):
        parser = QqParser(allowed_tags={'chapter', 'section',
                                        'subsection', 'subsubsection',
                                        'eq', 'eqref', 'ref',
                                        'equation', 'label', 'idx'})
        doc = r"""\chapter \label h1:label
    Hello

This is a \ref{h1:label}.
"""
        tree = parser.parse(doc)
        html = QqHTMLFormatter(tree)
        s = html.do_format()
        soup = BeautifulSoup(s, 'html.parser')

        self.assertEqual(soup.h1['id'], 'label_h1_label')
        self.assertEqual(soup.span['class'], ['section__number'])
        self.assertEqual(soup.span.string, "1")
        self.assertEqual(soup("a")[1].attrs,{'class': ['a-ref'], 'title': '', 'href': '#label_h1_label'})
        self.assertEqual(soup("a")[1].string, "1")
Project: qqmbr    Author: ischurov
def test_parse_html3(self):
        parser = QqParser(allowed_tags={'h1', 'h2', 'h3', 'h4', 'eq', 'eqref', 'ref', 'equation', 'label', 'idx'})
        doc = r"""\equation \label eq:x2y2
    x^2 + y^2 = z^2

See \ref{eq:x2y2}.
"""
        tree = parser.parse(doc)
        html = QqHTMLFormatter(tree)
        html.counters['equation'].showparents = False
        s = html.do_format()
        soup = BeautifulSoup(s, 'html.parser')
        self.assertEqual(soup.div.attrs, {'id':"label_eq_x2y2",'class':["latex_equation"]})
        self.assertEqual(soup.span['class'], ['ref'])
        self.assertEqual(soup.a['class'], ['a-ref'])
        self.assertEqual(soup.a['href'], '#mjx-eqn-1')
        self.assertEqual(soup.a.string, "(1)")
Project: qqmbr    Author: ischurov
def test_refs_with_separator(self):
        doc = r"""\chapter Hello \label sec:first

\chapter World \label sec:other

See
\ref[section][sec:first] and \ref[section][sec:other] for details.
"""
        parser = QqParser()
        formatter = QqHTMLFormatter()
        parser.allowed_tags.update(formatter.uses_tags())
        tree = parser.parse(doc)
        formatter.root = tree
        print(tree.as_list())
        html = formatter.do_format()
        soup = BeautifulSoup(html, "html.parser")
        self.assertEqual(soup("a")[2].contents[0], "section 1")
Project: qqmbr    Author: ischurov
def test_missing_label(self):
        doc = r"""\chapter Hello \label sec:first

\chapter World \label sec:other

See
\ref[section][sec:third] and \ref[zection][sec:another] for details.
"""
        parser = QqParser()
        formatter = QqHTMLFormatter()
        parser.allowed_tags.update(formatter.uses_tags())
        tree = parser.parse(doc)
        formatter.root = tree
        print(tree.as_list())
        html = formatter.do_format()
        soup = BeautifulSoup(html, "html.parser")
        self.assertEqual(soup("a")[2].contents[0], "section ???")
        self.assertEqual(soup("a")[3].contents[0], "zection ???")
Project: pixiv2pawoo    Author: TimeCompass
def getpixivfollow():
    """Get pixiv bookmark."""
    users = ['1789300']
    page = 1
    userlist = {}
    bookmark_url = u'https://www.pixiv.net/bookmark.php'
    while len(users) > 0:
        page_params = (
            ('type', 'user'),
            ('rest', 'show'),
            ('p', str(page)))
        bookmark_page = PIXIV_SESSION.get(
            bookmark_url, params=page_params, proxies=PROXY).text
        bookmark_content = BeautifulSoup(bookmark_page, 'lxml')
        print(u'Get Pixiv bookmark page {0} ...'.format(page))
        users = bookmark_content.select("div[class=usericon]")
        if len(users) == 0:
            break
        for user in users:
            user_info = user.find('a', attrs={'class': 'ui-profile-popup'})
            user_name = user_info.attrs['data-user_name']
            user_id = user_info.attrs['data-user_id']
            userlist[user_id] = user_name
        page += 1
    return userlist
Project: pixiv2pawoo    Author: TimeCompass
def pixiv2pawoo(pixivid):
    """Pixiv -> Pawoo."""
    pawoourl = u'https://pawoo.net/oauth_authentications/{0}?provider=pixiv'
    pawoolink = pawoourl.format(pixivid)
    pawoopage = PAWOO_SESSION.get(pawoolink, proxies=PROXY)
    if pawoopage.status_code == 200:
        pawooname = pawoopage.headers.get('link').split(';')[0]
        pawooname = pawooname.replace(
            '<https://pawoo.net/.well-known/webfinger?resource=acct%3A', '')
        pawooname = pawooname.replace('%40pawoo.net>', '')
        csrf_token = BeautifulSoup(pawoopage.text, 'lxml')
        csrf_token = csrf_token.select(
            "meta[name=csrf-token]")[0].attrs.get('content')
        with open('pawoolist.txt', 'a', encoding='utf-8-sig') as pawoofile:
            pawoofile.write(
                '{1},https://pawoo.net/@{0}\n'.format(pawooname, pixivid))
        followpawoo(pawooname, csrf_token)
        return 1
    else:
        return 0
Project: course-crawler    Author: Foair
def get_book(url):
    """ Download the course's PDF books. """
    # collect the PDF bookshelf pages linked from the navigation page
    print('Downloading PDF books ...')
    nav_page = CONNECTION.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        res = CONNECTION.get(BASE_URL + shelf).text
        soup = BeautifulSoup(res, 'lxml')
        save_dir = os.path.join(BASE_DIR, 'Books', str(shelf_count))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            print('------>', book.string)
            file_name = REG_FILE.sub(' ', book.string) + '.pdf'
            pdf = CONNECTION.get(BASE_URL + book['rel'][0]).content
            with open(os.path.join(save_dir, file_name), 'wb') as pdf_file:
                pdf_file.write(pdf)
Project: Verification-code-crack    Author: weixianglin
def read_captcha():
    header={
        'User-Agent':'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Host':'login.weibo.cn'
    }
    url_login = 'http://login.weibo.cn/login/'
    html = requests.get(url_login,headers=header).content  # fetch the login page
    soup = BeautifulSoup(html, 'lxml')
    code_img = str(soup.find('img'))[24:-3]  # slice out the captcha image URL
    print(code_img)
    urlretrieve(code_img, r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    show_img(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif')
    remove_line(r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha\captcha.gif',
                r'E:\????\??????\1 ???\captcha_master1\captcha_master\main_captcha/')
    pic_cut('captcha_removeline.gif', 'E:/????/??????/1 ???/captcha_master1/captcha_master/main_captcha/',
            'E:/????/??????/1 ???/captcha_master1/captcha_master/word/')
Project: lichking    Author: melonrun
def gen_item_comment(self, response):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
        for indexi, content in enumerate(response.xpath('//td[@class="t_f"]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            if soup.find('div', class_='attach_nopermission') is not None:
                soup.find('div', class_='attach_nopermission').clear()
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_unicode(soup.get_text())
            comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def gen_item_comment(self, response):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
        for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_comment(soup.get_text())
            if indexi >= len(rep_time_list):
                rep_time = self.format_rep_date(rep_time_list[-1])
            else:
                rep_time = self.format_rep_date(rep_time_list[indexi])
            comments_data.append({'content': c, 'reply_time': rep_time})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def gen_item_comment(self, response):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = re.findall(u'\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', response.body)
        for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_comment(soup.get_text())
            if indexi >= len(rep_time_list):
                rep_time = self.format_rep_date(rep_time_list[-1])
            else:
                rep_time = self.format_rep_date(rep_time_list[indexi])
            comments_data.append({'content': c, 'reply_time': rep_time})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def article_detail(aitem, response):
        for a_content in response.xpath('//script').extract():
            if a_content.find("detailArticle|post") == -1:
                continue
            a_content = a_content.split("props=")[1]
            a_content = a_content.split(",location")[0]
            a_content = json.loads(a_content).get("detailArticle|post")
            aitem.content = BeautifulSoup(a_content.get("content"), 'lxml').get_text()
            aitem.time = a_content.get('published_at')
            aitem.last_reply_time = aitem.time
            aitem.views = a_content.get('counters').get('view_count')
            aitem.replies = a_content.get('counters').get('comment')
            aitem.author = a_content.get('user').get('name')
            aitem.title = a_content.get('title')
            category_tags = json.loads(a_content.get('extraction_tags'))
            category = ''
            for category_tag in category_tags:
                category += category_tag[0] + ' '
            aitem.category = category

        return aitem
Project: lichking    Author: melonrun
def gen_item_comment(self, response):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//span[@class="time"]').extract()
        for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]/table[1]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            if soup.find('div', class_='attach_nopermission') is not None:
                soup.find('div', class_='attach_nopermission').clear()
            [s.extract() for s in soup('script')]     # remove script tag
            c = StrClean.clean_unicode(soup.get_text())
            comments_data.append({'content': c, 'reply_time': self.format_rep_date(rep_time_list[indexi])})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def gen_item_comment(self, response, is_first=False):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//span[@class="date"]/text()').extract()
        for indexi, content in enumerate(response.xpath('//div[@class="replycontent"]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_comment(soup.get_text())
            time_index = indexi
            if is_first:
                time_index += 1
            if time_index >= len(rep_time_list):
                rep_time = self.format_rep_date(rep_time_list[-1])
            else:
                rep_time = self.format_rep_date(rep_time_list[time_index])
            comments_data.append({'content': c, 'reply_time': rep_time})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def gen_item_comment(self, response):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//div[@class="authi"]//em').extract()
        for indexi, content in enumerate(response.xpath('//div[@class="t_fsz"]//table[1]').extract()):
            soup = BeautifulSoup(content, 'lxml')
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_comment(soup.get_text())
            if indexi >= len(rep_time_list):
                rep_time = self.format_rep_date(rep_time_list[-1])
            else:
                rep_time = self.format_rep_date(rep_time_list[indexi])
            comments_data.append({'content': c, 'reply_time': rep_time})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: lichking    Author: melonrun
def gen_item_comment(self, response, is_first=False):
        comment = []
        new_comment = {}
        comments_data = []
        rep_time_list = response.xpath('//div[@class="authi"]/em').extract()
        if len(rep_time_list) == 0:
            return comment
        for indexi, content in enumerate(response.xpath('//div[@class="pct"]//table[1]').extract()):
            if is_first and indexi == 0:
                continue
            soup = BeautifulSoup(content, 'lxml')
            [s.extract() for s in soup('script')]  # remove script tag
            c = StrClean.clean_comment(soup.get_text())
            time_index = indexi
            if time_index >= len(rep_time_list):
                rep_time = self.format_rep_date(rep_time_list[-1])
            else:
                rep_time = self.format_rep_date(rep_time_list[time_index])
            comments_data.append({'content': c, 'reply_time': rep_time})
        new_comment['url'] = response.url
        new_comment['comments_data'] = comments_data
        comment.append(new_comment)
        return comment
Project: encore.ai    Author: dyelax
def download_lyrics(artist, url):
  print url
  time.sleep(random() + 2)
  page = urllib2.urlopen(url).read()
  soup = BeautifulSoup(page, 'html.parser')

  # Get the song title
  song_title = soup.find('title').get_text().split(' - ')[1].lower().replace('/', ' ').replace(' ', '_')

  # Get the lyrics div
  lyrics = soup.findAll('div', {'class': ''})

  for i in lyrics:
    lyrics = i.get_text().strip()
    if len(lyrics) > 10:
      with open('artists/' + artist + '/' + song_title + '.txt', 'wb') as w:
        cleaned_lyrics = lyrics.replace('\r\n', ' *BREAK* ').replace('\n', ' *BREAK* ').replace('  ', ' ')
        w.write(cleaned_lyrics.encode('utf-8'))
Project: encore.ai    Author: dyelax
def download_songs(url):
  time.sleep(random.random() * 0.5)
  try:
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')

    # Get the artist name
    artist_name = soup.findAll('h1')[0].get_text()[:-7].lower().replace(' ', '_')

    # Store all songs for a given artist
    with open('artist_data/'+artist_name+'.txt', 'wb') as w:
      for song in soup.findAll('a', {'target': '_blank'}):
        if 'lyrics/' in song['href']:
          song_url = song['href'][1:].strip()
          w.write(song_url + '\n')
  except urllib2.HTTPError:
    print '404 not found'
Project: earthy    Author: alvations
def packages(self):
        """
        Parse XML file to locate packages.
        """
        xml = requests.get(self._xml_url).content
        soup = BeautifulSoup(xml, "html.parser")
        nltk_packages, third_party = defaultdict(dict), defaultdict(dict)
        for pack in soup.find_all('package'):
            package_attributes = pack.attrs
            name = package_attributes['id']
            # Keeps track of nltk_data packages vs third_party packages.
            if package_attributes['url'].startswith(self._nltk_data_url):
                nltk_packages[name] = package_attributes
            else:
                third_party[name] = package_attributes
        return nltk_packages, third_party
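Each <package> element's attributes come back as a plain dict via .attrs; a minimal sketch against hand-written XML (element names mirror the code above, URLs are placeholders):

from bs4 import BeautifulSoup

xml = ('<packages>'
       '<package id="punkt" url="https://example.org/nltk_data/punkt.zip"/>'
       '<package id="extra" url="https://example.org/third_party/extra.zip"/>'
       '</packages>')
soup = BeautifulSoup(xml, "html.parser")
for pack in soup.find_all('package'):
    print(pack.attrs['id'], pack.attrs['url'])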
Project: Jumper-Cogs    Author: Redjumpman
def _online_tibia(self):
        """Get total players playing"""
        url = "http://www.tibia.com/community/?subtopic=worlds"
        try:
            async with aiohttp.get(url) as response:
                soup = BeautifulSoup(await response.text(), "html.parser")
                div1 = soup.find('div', attrs={'id': 'RightArtwork'})
                div2 = div1.find('div', attrs={'id': 'PlayersOnline'})
                test = div2.get_text()
                test1 = test.replace("Players Online", "")
                new = "Players currently playing Tibia: " + test1
                # div2 = div1.find('div', attrs={'class': 'Border_2'})
                # div3 = div2.find('div', attrs={'class': 'Border_3'})
                # table = div3.find_all('table', attrs={'class': 'Table1'})
                # tr = table.find_all('tr')
                # tbody = div4.find('div', attrs={'class': 'CaptionInnerContainer'})
                await self.bot.say(str(new))
        except:
            await self.bot.say("Could not retrieve data. The webserver may be offline.")
Project: Jumper-Cogs    Author: Redjumpman
def _server_tibia(self, servername):
        """Get Server Info"""
        servername = servername.title()
        url = "https://secure.tibia.com/community/?subtopic=worlds&world=" + str(servername)
        try:
            async with aiohttp.get(url) as response:
                soup = BeautifulSoup(await response.text(), "html5lib")
                b = soup.find_all("table", attrs={'class': 'Table1'})
                new = []
                rows = b[1].tbody.div.find_all('td')
                for row in rows:
                    new.append(row.get_text())
                k = new[::2]
                l = new[1::2]
                zipped = list(zip(k, l))
                t = tabulate(zipped, headers=["Category", "Info"])
                await self.bot.say("```Python" + "\n" + str(t) + "```")
        except:
            await self.bot.say("Unable to retrieve server data. The webserver may be offline.")
Project: CourseGrab    Author: nnsun
def get_course_status(course_num):
    client = Client()
    subject = client.get_course_subject(course_num)
    if subject is None:
        return None
    semester = get_semester()
    subject_url = "http://classes.cornell.edu/browse/roster/" + semester + "/subject/" + subject
    subject_page = requests.get(subject_url)
    subject_page.raise_for_status()
    subject_bs4 = bs4.BeautifulSoup(subject_page.text, "html.parser")
    course_code_tags = subject_bs4.find_all("strong", class_="tooltip-iws")
    for tag in course_code_tags:
        course_code = int(tag.getText().strip())
        if course_num == course_code:
            section = tag.parent.parent.parent
            status = section.find_all('li', class_ = "open-status")[0].i["class"][-1]
            if "open-status-open" in status:
                return "open"
            if "open-status-closed" in status:
                return "closed"
            if "open-status-warning" in status:
                return "waitlist"
            if "open-status-archive" in status:
                return "archive"
Project: WPS-4th    Author: Fastcampus-WPS
def get_soup_from_url(url, params=None):
    '''
    Send a GET request to the given URL with the given parameters and wrap the
    response body (HTML text) in a BeautifulSoup object.
    :param url: URL string to send the GET request to
    :param params: dict of GET request parameters
    :return: BeautifulSoup object
    '''
    # send the request with requests.get and keep the response object as r
    r = requests.get(url, params=params)
    # take the text attribute of the response (the HTML document) as html_doc
    html_doc = r.text

    # build the BeautifulSoup object from the html text
    soup = BeautifulSoup(html_doc, 'lxml')
    return soup
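Typical use of the helper (URL and parameters are illustrative):

soup = get_soup_from_url('https://example.com/search', params={'q': 'beautifulsoup'})
print(soup.title.string if soup.title else 'no <title> found')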
Project: Gank-Alfred-Workflow    Author: hujiaweibujidao
def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tags are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())
Project: TorScrapper    Author: ConanKapoor
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    #Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git' }
    req = urllib.request.Request(url,None,headers)
    response = urllib.request.urlopen(req)

    #Using BeautifulSoup to parse html object response.
    page = BeautifulSoup(response.read(),'html.parser')

    #Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token +'.html'
    file = open(name,'w')
    file.write(str(page))
    file.close()

# Taking input.
Project: scientific-paper-summarisation    Author: EdCo95
def getJournalURL(jname):
# get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
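The SoupStrainer keeps only <a> elements, so the parse tree stays small; a self-contained sketch of the same idea with sample HTML and an explicit parser:

from bs4 import BeautifulSoup, SoupStrainer

html = '<p>intro</p><a href="/a.html">A Journal</a><a href="/b.html">B Journal</a>'
for link in BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a")):
    if link.has_attr("href"):
        print(link["href"], link.text)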
Project: code    Author: ActiveState
def get_url(self, query):
        site1 = urllib.urlopen('http://www.youtube.com/results?search_query=%s'%query)
        html = site1.read()
        soup = BS(html)

        links = soup.findAll('a')
        vidlinks = [link.get('href') for link in links if link.get('href') is not None]
        vlink = [ i for i in vidlinks if '/watch?v=' in i][0]

        img_link = soup.findAll('img',{'alt':'Thumbnail', 'width':'185'})[0].get('src')
        img_url =  'http:%s' %img_link

        imagethread = threading.Thread(target=lambda:urllib.urlretrieve(img_url, 'Files\image.jpg'))
        imagethread.start()

        return vlink
Project: code    Author: ActiveState
def run(self):
        ind=self.qu.get()
        url=self.url+str(ind)
        soup = bs.BeautifulSoup(''.join(ul.urlopen(url).readlines()))
        bu = up.urlsplit(self.url)
        print 'started with the ' ,str(url).split('/')[-1],
        for i in  soup.find_all(attrs = { "class" : "recipe-title"}):
            sp = up.urlsplit(i.a.get('href'))
            path = sp.path
            print path
            if re.search(pat, path):
                path = bu.scheme+'://'+bu.netloc+path
                filename = str(path).split('/')[-2]
                filename = op.join(op.abspath(op.curdir),filename+'.py') # recipe will be stored in given location
#                filename = op.join(op.abspath(op.curdir),filename+'.html')
#uncomment the above line if downloading the web page for the recipe
                print path
                self.q.put((path,filename))
        self.fetch_data()
        time.sleep(1)
        self.qu.task_done()
        self.q.join()
        print 'done with the ' ,str(url).split('/')[-1],
Project: meg-server    Author: Argonne-National-Laboratory
def get_all_key_signatures(cfg, keyid):
    """
    Get all signatures for a specific key. We exclude self signed signatures
    because this is not helpful for us.
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "vindex", "search": "0x{}".format(keyid)}, None
    )
    if status_code != 200:
        return status_code, content
    elem = BeautifulSoup(content, HTML_PARSER).span
    ids = []
    while (elem.findNext().name != "strong" and elem.findNext()):
        elem = elem.findNext()
        if "op=get" in elem["href"] and elem.text != keyid:
            ids.append(elem.text)
    return ids
Project: meg-server    Author: Argonne-National-Laboratory
def search_key(cfg, search_str):
    """
    Search for a key by a given string
    """
    content, status_code = make_sks_request(
        cfg, requests.get, "lookup", {"op": "index", "search": search_str}, None
    )
    if status_code != 200:
        return content, status_code
    bs = BeautifulSoup(content, HTML_PARSER)
    regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
    ids = []
    for pre in bs.findAll("pre"):
        match = regex.search(pre.text.strip("\r\n"))
        if match and not "KEY REVOKED" in pre.text:
            ids.append(match.groups()[0])
    return {"ids": ids}, status_code
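The regex targets the plain-text "pub" lines of the keyserver index; a sketch with a made-up line of that shape:

import re

regex = re.compile(r"^pub *\d{3,4}\w\/([\w\d]{8})")
sample = "pub  2048R/0F8D1234 2015-01-01 Alice Example <alice@example.org>"
match = regex.search(sample)
print(match.groups()[0] if match else None)   # prints 0F8D1234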
Project: NewsScrapy    Author: yinzishao
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        soup = BeautifulSoup(response.body.decode('gbk'))
        pic = soup.find('p' , class_ = 'f_center').find('img').get('src') if  soup.find('p' , class_ = 'f_center') and soup.find('p' , class_ = 'f_center').find('img') else None
        referer_web = soup.find('a',id = 'ne_article_source').text if soup.find('a',id = 'ne_article_source') else None
        referer_url = soup.find('a',id = 'ne_article_source').get('href') if soup.find('a',id = 'ne_article_source') else None
        author = soup.find('span',class_ = 'ep-editor').text if soup.find('span',class_ = 'ep-editor') else None
        if u"?" in author:
            author = author.split(u"?")[-1]
        crawl_date = NOW
        read_num = soup.find('div',class_ = 'post_comment_joincount').find('a').text if soup.find('div',class_ = 'post_comment_joincount') else 0
        comment_num = soup.find('div',class_ = 'post_comment_tiecount').find('a').text if soup.find('div',class_ = 'post_comment_tiecount') else 0
        content = soup.find('div',class_ = 'post_text').get_text(strip=True) if soup.find('div',class_ = 'post_text') else None
        item['referer_web'] = referer_web
        item['content'] = content
        item['referer_url'] = referer_url
        item['author'] = author
        item['crawl_date'] = crawl_date
        item['pic'] = pic
        item['comment_num'] = int(comment_num)
        item['read_num'] = int(read_num)
        yield item
Project: NewsScrapy    Author: yinzishao
def parse_news(self, response):
        item = response.meta.get("item", NewsItem())
        soup = BeautifulSoup(response.body.decode("utf-8").encode("utf-8"),"lxml")
        pic = soup.find("p",class_ = "detailPic").find("img").get("src") if soup.find("p",class_ = "detailPic") else None
        referer_web = soup.find("span",class_ = "ss03").text if soup.find("span",class_ = "ss03") else None
        author = soup.find("span",itemprop="author").find("span").text if soup.find("span",itemprop="author") else None
        temp = soup.find("div" ,id = "main_content")
        if temp:
            ps = temp.find_all("p") if temp.find_all("p") else None
            content = "\n\n".join([ p.text.strip() for p in ps])
        else:
            content = None
        item['pic'] = pic
        item['referer_web'] = referer_web
        item['author'] = author
        item['content'] = content
        item['crawl_date'] = NOW
        yield item
Project: NewsScrapy    Author: yinzishao
def parse_news(self,response):
        item = response.meta.get("item",None)
        # #??????????????????????
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # strip the related-posts block before extracting the text
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"????"
        yield item
Project: NewsScrapy    Author: yinzishao
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body, 'lxml')
        origin_date = soup.find("td", class_="time").text.strip()
        struct_date= datetime.datetime.strptime(origin_date,"%Y-%m-%d %H:%M")
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        content = soup.find("div", class_= "lph-article-comView").text.strip() if soup.find("div", class_= "lph-article-comView") else None
        item["news_date"]= news_date
        item["crawl_date"]= NOW
        item["content"] = content
        item["catalogue"] = u"????"
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = int(pageindex)
Project: NewsScrapy    Author: yinzishao
def parse(self,response):
        origin_url = response.url
        if "index" not in origin_url:
            soup = BeautifulSoup(response.body,"lxml")
            catalogue =  soup.find("a",class_ = "blue CurrChnlCls").get("title").strip()
            news_list = soup.find("div", class_ = "lie_main_m").find_all("li")
            for news in news_list:
                title = news.find("a").text.strip()
                news_url = "http://www.cnta.gov.cn/xxfb" + news.find("a").get("href")[2:]
                news_no = news_url.rsplit("/",1)[-1].split(".")[0]
                item = NewsItem(
                        news_url =news_url,
                        title = title,
                        news_no = news_no,
                        catalogue = catalogue,
                    )
                yield scrapy.Request(item["news_url"],callback=self.parse_news,meta={'item':item})
        else:
            topic_url = origin_url.rsplit(".",1)[0]
            self.flag.setdefault(topic_url,0)
            yield scrapy.Request(origin_url,callback=self.parse_topic)
Project: NewsScrapy    Author: yinzishao
def parse(self, response):
        origin_url = response.url
        #http://money.163.com/special/002526O5/transport_02.html
        search_result = re.search(r"_(\d)*?\.",origin_url)
        # page number
        pageindex = search_result.group(1) if search_result else 1
        soup = BeautifulSoup(response.body,"lxml")
        news_list = soup("div",class_="list_item clearfix")
        for news in news_list:
            news_date = news.find("span",class_="time").text if news.find("span",class_="time")else None
            title = news.find("h2").text if news.find("h2") else None
            news_url = news.find("h2").a.get("href",None) if news.find("h2") else None
            abstract = news.find("p").contents[0] if news.find("p") else None
            item = NewsItem(title=title,news_url=news_url,abstract=abstract,news_date=news_date)
            item = judge_news_crawl(item)   # decide whether this news item should still be crawled
            if item:
                request = scrapy.Request(news_url,callback=self.parse_news,meta={"item":item})
                yield request
            else:
                self.flag = int(pageindex)
        if not self.flag:
            next_url = self.next_url % (int(pageindex) + 1)
            yield scrapy.Request(next_url)
Project: NewsScrapy    Author: yinzishao
def parse_news(self,response):
        item = response.meta.get("item",NewsItem())
        soup = BeautifulSoup(response.body)
        referer_web = soup.find("a",id="ne_article_source").text if soup.find("a",id="ne_article_source") else None
        referer_url = soup.find("a",id="ne_article_source").get("href",None) if soup.find("a",id="ne_article_source") else None
        comment_num = soup.find("a",class_="post_cnum_tie").text if soup.find("a",id="ne_article_source") else None
        content = soup.find("div",class_="post_text").text.strip() if soup.find("div",class_="post_text") else None
        # note: author_source combines the source site and the author name and still needs to be split
        author_source = soup.find("span",class_="left").text if soup.find("span",class_="left") else None
        # TODO: extract the author name from author_source
        # import pdb;pdb.set_trace()
        # author = re.search(u"??(.*)",author_source).group(1)[1:] if author_source else None
        # item["author"]=author
        item["referer_web"]=referer_web
        item["referer_url"]=referer_url
        item["comment_num"]=comment_num
        item["content"]=content
        item["crawl_date"]=NOW
        yield item