Python nltk module: download() example source code

We have extracted the following 50 code examples from open-source Python projects to show how to use nltk.download().
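
Most of the snippets below reduce to a handful of recurring patterns. Here is a minimal sketch that collects them in one place; the package names are only examples, not a prescription from any of the projects:

import nltk

# 1. Download a single named package non-interactively (the most common pattern below).
nltk.download('punkt')

# 2. Download only when the resource is missing, to avoid a network check on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# 3. Download several packages quietly, e.g. inside a setup or provisioning script.
for package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(package, quiet=True)

# 4. With no arguments, nltk.download() opens the interactive downloader (GUI or text UI).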

Project: Python-Scripts-Repo-on-Data-Science    Author: qalhata    | Project source | File source
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')
    # we download the URL
    soup = BeautifulSoup(page)
    # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff</article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

#######################################################################

# TEST
######################################################################
Project: NUIG-suggestion    Author: MixedEmotions    | Project source | File source
def activate(self, *args, **kwargs):

        np.random.seed(1337)  # for reproducibility

        st = datetime.now()
        self._classifierModel = load_model(self.savedModelPath)       
        logger.info("{} {}".format(datetime.now() - st, "loaded _classifierModel"))

        st = datetime.now()
        self._tokenizer = self.get_tokenizer()
        logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer"))

        #st = datetime.now()
        #nltk.download()
        #self._tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')
        #logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer_nltk"))

        logger.info("SuggestionMiningDL plugin is ready to go!")
Project: R-net    Author: matthew-z    | Project source | File source
def prepare_data():
    make_dirs("data/cache")
    make_dirs("data/embedding/char")
    make_dirs("data/embedding/word")
    make_dirs("data/squad")
    make_dirs("data/trained_model")
    make_dirs("checkpoint")

    nltk.download("punkt")

    train_filename = "train-v1.1.json"
    dev_filename = "dev-v1.1.json"
    squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

    train_url = os.path.join(squad_base_url, train_filename)
    dev_url = os.path.join(squad_base_url, dev_filename)

    download_prefix = os.path.join("data", "squad")
    maybe_download(train_url, download_prefix, train_filename)
    maybe_download(dev_url, download_prefix, dev_filename)

    char_embedding_pretrain_url = "https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt"
    char_embedding_filename = "glove_char.840B.300d.txt"
    maybe_download(char_embedding_pretrain_url, "data/embedding/char", char_embedding_filename)
Project: textkit    Author: learntextvis    | Project source | File source
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1])
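
The download() function above calls a helper, check_packages_exist(), that is not shown in this excerpt. A minimal sketch of what such a helper might look like is given below; it is a hypothetical reconstruction that simply returns the (category, package) pairs whose data cannot be found locally, and the real textkit implementation may differ:

import nltk

def check_packages_exist(extensions):
    # Hypothetical helper: return the (category, package) tuples that are
    # not yet installed in any of the nltk.data.path directories.
    missing = []
    for category, package in extensions:
        try:
            nltk.data.find('{}/{}'.format(category, package))
        except LookupError:
            missing.append((category, package))
    return missing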
Project: skills-ml    Author: workforce-data-initiative    | Project source | File source
def retrieve_onet_titles(self):
        onet_titles = pd.concat(
            (pd.read_csv(self.onet_downloader.download(
                version,
                'Occupation Data.txt',
                'occupation_data.txt'
            ), sep='\t') for version in ONET_VERSIONS),
            ignore_index=True
        )
        # Assumes pandas 0.19, keeps newest duplicate Title
        onet_titles.drop_duplicates('Title', inplace=True, keep='last')
        onet_titles['Major'] = onet_titles.iloc[:, 0].apply(lambda x: x[:2])

        LOWER = True
        if LOWER:
            # all RDD strings are unicode
            onet_titles['Title'] = onet_titles['Title'].str.lower()
            onet_titles['Description'] = onet_titles['Description'].str.lower()

        # now we can do a title -> Major, Minor lookup
        onet_titles.set_index('Title', inplace=True)
        # access with onet_titles.loc[u'Sales Agents, Financial Services']
        return onet_titles
Project: deeppavlov    Author: deepmipt    | Project source | File source
def __init__(self, opt, embedding_dim):
        """Initialize the class according to given parameters."""

        self.tok2emb = {}
        self.embedding_dim = embedding_dim
        self.opt = copy.deepcopy(opt)
        self.load_items()

        nltk.download('punkt')

        if not self.opt.get('fasttext_model'):
            raise RuntimeError('No pretrained fasttext model provided')
        self.fasttext_model_file = self.opt.get('fasttext_model')
        if not os.path.isfile(self.fasttext_model_file):
            emb_path = os.environ.get('EMBEDDINGS_URL')
            if not emb_path:
                raise RuntimeError('No pretrained fasttext model provided')
            fname = os.path.basename(self.fasttext_model_file)
            try:
                print('Trying to download a pretrained fasttext model from the repository')
                url = urllib.parse.urljoin(emb_path, fname)
                urllib.request.urlretrieve(url, self.fasttext_model_file)
                print('Downloaded a fasttext model')
            except Exception as e:
                raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)

        self.fasttext_model = fasttext.load_model(self.fasttext_model_file)
Project: goose    Author: sexxis    | Project source | File source
def main():
    nltk_deps = ['punkt', 'averaged_perceptron_tagger']
    print('Checking nltk deps...')
    for dep in nltk_deps:
        nltk.download(dep)
    print('nltk deps done')
Project: tokenquery    Author: ramtinms    | Project source | File source
def __init__(self, tokenizer_type="PTBTokenizer"):

        # Sanity checks
        if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
            self.tokenizer_type = tokenizer_type
        else:
            print("Unrecognized tokenizer type: falling back to default (PTBTokenizer)")
            self.tokenizer_type = "PTBTokenizer"
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
Project: tokenquery    Author: ramtinms    | Project source | File source
def __init__(self):
        try:
            nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
        except LookupError:
            nltk.download('averaged_perceptron_tagger')
Project: UrbanSearch    Author: urbansearchTUD    | Project source | File source
def load_nltk_data():
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('snowball_data')
Project: coquery    Author: gkunter    | Project source | File source
def download_packages(self):
        import nltk

        for x in [comp for comp in self._missing if "/" in comp]:
            package = x.split("/")[1]
            self.updateLabel.emit(package)
            nltk.download(package, raise_on_error=True)
            self.progressTheBar.emit()
Project: fabric8-analytics-tagger    Author: fabric8-analytics    | Project source | File source
def prepare():
    """Prepare tagger for run.

    This should be run after installation to initialize the tagger's resources.
    """
    import nltk
    import requests
    from libarchive import extract_memory
    import os
    from shutil import move
    from f8a_tagger.utils import get_files_dir

    nltk.download("punkt")
    nltk.download("wordnet")

    maven_index_checker_url = 'https://github.com/fabric8-analytics/' \
                              'maven-index-checker/files/1275145/' \
                              'maven-index-checker-v0.1-alpha.zip'
    response = requests.get(maven_index_checker_url)
    if response.ok is not True:
        raise RemoteDependencyMissingError("Failed to download maven-index-checker with "
                                           "response code %s",
                                           response.status_code)

    # Unfortunately there is no way to know the name or path of the extracted file,
    # so assume it's maven-index-checker.jar
    jar_name = "maven-index-checker.jar"

    jar_path = get_files_dir()
    extract_memory(response.content)
    move(jar_name, os.path.join(jar_path, jar_name))
Project: presswork    Author: hangtwenty    | Project source | File source
def run(self):
        # setuptools is an oldie goldie. super() is not supported by base class (it's an "old style class")
        SetuptoolsInstallCommand.do_egg_install(self)

        import nltk
        for corpus in _required_nltk_corpora:
            nltk.download(corpus)
Project: vec4ir    Author: lgalke    | Project source | File source
def install_nltk_corpora(*packages):
        nltk_packages = list(packages)
        try:
            installed = (set(os.listdir(nltk.data.find("corpora"))) |
                         (set(os.listdir(nltk.data.find("taggers"))))) | \
                        (set(os.listdir(nltk.data.find("tokenizers"))))
        except LookupError:
            installed = set()
        if not set(nltk_packages) <= set(installed):
            nltk.download(nltk_packages)
Project: bsd    Author: cjhutto    | Project source | File source
def run(self):
        # PUT YOUR POST-INSTALL SCRIPT HERE or CALL A FUNCTION
        import nltk
        nltk.download('punkt')
        install.run(self)
Project: wordsim    Author: recski    | Project source | File source
def ensure_nltk_packages():
    for package in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package, quiet=True)
Project: bigworldgraph    Author: majdigital    | Project source | File source
def download_nltk_resource_if_missing(resource_path, resource):
    """
    Download a missing resource from the Natural Language Processing Toolkit.

    :param resource_path: Link / path for NLTK resource.
    :type resource_path: str
    :param resource: Identifier / name of the resource (will be used to download the resource if it's not found).
    :type resource: str
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource)
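
A typical pair of calls, using the standard punkt and stopwords resource paths purely for illustration, would be:

download_nltk_resource_if_missing('tokenizers/punkt', 'punkt')
download_nltk_resource_if_missing('corpora/stopwords', 'stopwords')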
Project: MachineLearningProject    Author: ymynem    | Project source | File source
def download():
    """
    Download reuters data and stopwords if not already present
    """
    nltk.download("reuters")
    nltk.download("stopwords")
Project: Easy-Latent-Dirichlet-Allocation    Author: bjherger    | Project source | File source
def __init__(self, num_topics=6, num_iterations=500, random_state=None, clean_text=True, vectorizer=None):
        """
        Init for LDA estimator
        :param num_topics: Number of topics to model (generally 3-10)
        :type num_topics: int
        :param num_iterations: Number of iterations to allow before locking in topics
        :type num_iterations: int
        :param random_state: Random seed, for consistent topics
        :type random_state: int
        :param clean_text: Whether to clean text using self.preprocess(). Recommended if you have not preprocessed
        the text already
        :type clean_text: bool
        :param vectorizer: Word vectorizer to use. The word vectorizer should convert a collection of text documents
        to a matrix of token counts
        """
        self.num_topics = num_topics
        self.num_iterations = num_iterations
        self.random_state = random_state
        self.lda_model = lda.LDA(n_topics=self.num_topics, n_iter=self.num_iterations, random_state=self.random_state)
        self.clean_text = clean_text
        self.get_topic_description_df = None
        if vectorizer is not None:
            self.vectorizer = vectorizer
        else:
            self.vectorizer = CountVectorizer()

        # Make sure nltk has required data sets
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
Project: skills-ml    Author: workforce-data-initiative    | Project source | File source
def __init__(self, onet_source=OnetSourceDownloader):
        self.onet_downloader = onet_source()
        self.onet_titles = self.retrieve_onet_titles()
        logging.info('Retrieved onet titles')
        # ... Following the ESA description:
        # https://en.wikipedia.org/wiki/Explicit_semantic_analysis
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        # optimization note: convert from CSR to CSC
        self.tf = self.tfidf_vectorizer.fit_transform(self.onet_titles['Description'].values)
        self.concept_row = self.onet_titles.index.values
        try:
            wn.synset
        except LookupError:
            nltk.download('wordnet')
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
Project: Political-Opinion-Finder    Author: philhabell    | Project source | File source
def nltkDownload(self):
        try:
            nltk.data.find("tokenizers")
        except LookupError:
            #self.dis.spinner("Downloading NLTK Data")
            print("No NLTK data found, downloading now...")
            nltk.download("all")
            #self.dis.stop()


    # The searcher finds tweets in the database that match the search term
    # handed to it. It returns the tweets, the term, and the number of times
    # it appears in the database, in a dictionary.
    # It must be handed:
    #    * a search term as a string
Project: Twitter    Author: LucasRodriguez    | Project source | File source
def run():
    nltk.download('punkt')
Project: neighborhood_mood_aws    Author: jarrellmark    | Project source | File source
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
Project: neighborhood_mood_aws    Author: jarrellmark    | Project source | File source
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
Project: redbiom    Author: biocore    | Project source | File source
def _post():
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
Project: WebNav    Author: nyu-dl    | Project source | File source
def __init__(self, wiki, vocab, n_consec):
        self.wiki = wiki
        self.vocab = vocab
        self.n_consec = n_consec # number of consecutive sections that are used to form a query
        nltk.download('punkt')
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
Project: ip6words    Author: lstn    | Project source | File source
def nltk_download_packages():
    nltk.download("words")
    nltk.download("brown")
    nltk.download("abc")
    nltk.download("inaugural")
    nltk.download("genesis")
Project: RNNVis    Author: myaooo    | Project source | File source
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or str_stream (f.read()) convert the str to a list of sentences,
        e.g.: [[word, word], [word, word, ...], ...]
    :param str_stream: a str or a str_stream
    :param eos: whether to turn '.' into an <eos> tag
    :param remove_punct: whether to remove punctuation: ':', ';', '--', ',', "'"
    :return: a list of sentences, each sentence is a list of words (str)
    """
    # lazily import nltk here because importing it is very slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS Tags
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # tag number
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
Project: empythy    Author: ClimbsRocks    | Project source | File source
def load_movie_reviews():

    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. We will first download the necessary corpus (this is a one-time download that might take a little while).')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():

        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):

            # then each review is a NLTK class where each item in that class instance is a word
            review_words = movie_reviews.words(fileid)
            review_text = ''

            for word in review_words:
                review_text += ' ' + word

            review_dictionary = {
                'text': review_text,
                'sentiment': pretty_category_name
            }

            raw_data.append(review_dictionary)

    return raw_data
Project: jenova    Author: dungba88    | Project source | File source
def download():
    """skip unverified certificate and show download dialog"""
    try:
        create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = create_unverified_https_context

    nltk.download()
Project: master-thesis    Author: AndreasMadsen    | Project source | File source
def download(self, name: str) -> None:
        if not self.exists(name):
            nltk.download(name, download_dir=self.nltk_dir)
Project: memex-dossier-open    Author: dossier    | Project source | File source
def run(self):
        import nltk
        from memex_dossier.models.tests.test_features import nltk_data_packages
        for data_name in nltk_data_packages:
            print('nltk.download(%r)' % data_name)
            nltk.download(data_name)
Project: memex-dossier-open    Author: dossier    | Project source | File source
def nltk_data():
    for data_name in nltk_data_packages:
        print('nltk.download(%r)' % data_name)
        nltk.download(data_name)
Project: blabbr    Author: bfontaine    | Project source | File source
def setup_nltk(self, **kw):
        import nltk
        from nltk.data import find

        tagger = "averaged_perceptron_tagger"

        try:
            find("taggers/%s" % tagger)
        except LookupError:
            click.echo("Downloading NLTK data (~2MB)...")
            nltk.download(tagger)
            return True

        return False
Project: smmry-alternate    Author: andersonpaac    | Project source | File source
def initstopwords(self):
        try:
            s = set(stopwords.words('english'))
        except LookupError:
            import nltk
            nltk.download()
            s = set(stopwords.words('english'))
        st = LancasterStemmer()
        for each in s:
            self.stopwords.append(st.stem(each))

    #Given a dictionary of key: frequency, value: array of words
    #build the opposite
Project: beepboop    Author: nicolehe    | Project source | File source
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
Project: beepboop    Author: nicolehe    | Project source | File source
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
Project: LinguisticAnalysis    Author: DucAnhPhi    | Project source | File source
def install():
    for d in dependencies:
        pip.main(['install', d])

    # after nltk module was installed
    import nltk
    for data in nltk_data:
        nltk.download(data)
Project: Quadflor    Author: quadflor    | Project source | File source
def install_nltk_corpora(*packages):
        nltk_packages = list(packages)
        try:
            installed = (set(os.listdir(nltk.data.find("corpora"))) |
                         (set(os.listdir(nltk.data.find("taggers"))))) | \
                        (set(os.listdir(nltk.data.find("tokenizers"))))
        except LookupError:
            installed = set()
        if not set(nltk_packages) <= set(installed):
            nltk.download(nltk_packages)
Project: teem-tag    Author: P2Pvalue    | Project source | File source
def build_dict_from_nltk(output_file, corpus=None, stopwords=None,
                         stemmer=Stemmer(), measure='IDF', verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be
                        saved
    @param corpus:      the NLTK corpus to use (defaults to nltk.corpus.reuters)
    @param stopwords:   a list of (not stemmed) stopwords (defaults to
                        nltk.corpus.reuters.words('stopwords'))
    @param stemmer:     the L{Stemmer} object to be used
    @param measure:     the measure used to compute the weights ('IDF'
                        i.e. 'inverse document frequency' or 'ICF' i.e.
                        'inverse collection frequency'; defaults to 'IDF')
    @param verbose:     whether information on the progress should be printed
                        on screen
    '''

    from build_dict import build_dict
    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')

    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []

    if verbose: print('Processing corpus...')
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    if verbose: print('Processing stopwords...')
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose: print('Building dictionary... ')
    dictionary = build_dict(corpus_list, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
Project: kind2anki    Author: prz3m    | Project source | File source
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
Project: kind2anki    Author: prz3m    | Project source | File source
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
Project: sia-cog    Author: deepakkumar1984    | Project source | File source
def download():
    nltk.download()
Project: luvina    Author: oarriaga    | Project source | File source
def download_nltk_data(package_name='all'):
    """ download necessary data from NLTK
    args:
        package_name: string containing the package name to install
    returns:
        None
    """
    if package_name == 'all':
        data = ['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger']
        for package in data:
            download(package)
    else:
        download(package_name)
Project: text-to-image    Author: paarthneekhara    | Project source | File source
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)


# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python
Project: acl2017-interactive_summarizer    Author: UKPLab    | Project source | File source
def _sentence_tokenizer(self, language):
        try:
            path = to_string("tokenizers/punkt/%s.pickle") % to_string(language)
            return nltk.data.load(path)
        except (LookupError, zipfile.BadZipfile):
            raise LookupError(
                "NLTK tokenizers are missing. Download them by following command: "
                '''python -c "import nltk; nltk.download('punkt')"'''
            )
Project: nlp_sum    Author: Zhujunnan    | Project source | File source
def english_sentence_segment(text):
    """segment text into sentence"""
    try:
        sent_detector = nltk.data.load(
            'tokenizers/punkt/english.pickle'
        )

        extra_abbrev = ["e.g", "al", "i.e"]
        sent_detector._params.abbrev_types.update(extra_abbrev)
        return sent_detector.tokenize(text)
    except LookupError as e:
        raise LookupError(
            "NLTK tokenizers are missing. Download them by following command: "
            '''python -c "import nltk; nltk.download('punkt')"'''
        )
Project: PYSHA    Author: shafaypro    | Project source | File source
def download_preferences(self):
        import nltk  # import the natural language processing module
        nltk.download()  # open the GUI-based NLTK data download kit
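
For environments without a display, a non-interactive sketch of the same step could name the resources explicitly instead; the download_dir value here is only an example:

import nltk

# Fetch the "popular" collection into an explicit directory
# instead of opening the downloader UI.
nltk.download('popular', download_dir='/tmp/nltk_data', quiet=True)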