Python nltk module: nltk.data usage examples

We extracted the following 50 code examples from open-source Python projects to illustrate how the nltk.data module is used.
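
Before the project excerpts, here is a minimal, self-contained sketch (not taken from any of the projects below) of the three nltk.data entry points the examples rely on: nltk.data.path, nltk.data.find(), and nltk.data.load(). It assumes the punkt tokenizer has already been fetched with nltk.download('punkt').

import nltk.data

# Directories that NLTK searches for downloaded resources.
print(nltk.data.path)

# Locate a resource on disk; raises LookupError if it has not been downloaded.
punkt_path = nltk.data.find('tokenizers/punkt/english.pickle')
print(punkt_path)

# Load the resource and use it.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("NLTK ships many models. They are loaded via nltk.data."))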

Project: coquery    Author: gkunter
def copy_packages(self):
        import nltk.data
        target_path = nltk.data.path[0]

        for x in [comp for comp in self._missing if "/" in comp]:
            parts = x.split("/")
            subdir = os.path.join(target_path, parts[0])
            package = parts[1]
            zip_name = "{}.zip".format(package)
            self.updateLabel.emit(package)
            src = os.path.join(_NLTK_dir, zip_name)
            dst = os.path.join(subdir, zip_name)
            if not os.path.exists(subdir):
                os.makedirs(subdir)

            if os.path.exists(src):
                shutil.copyfile(src, dst)
            else:
                raise ValueError("Package file {}.zip not found in {}".format(package, _NLTK_dir))

            with zipfile.ZipFile(dst) as zipped:
                for member in zipped.infolist():
                    zipped.extract(member, subdir)

            self.progressTheBar.emit()
Project: DNN-Sentiment    Author: awjuliani
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
Project: SCDL    Author: lngvietthang
def read_json_file(path_to_json):
    objects = []
    data = ''
    with io.open(path_to_json, 'r', encoding='utf8') as f:
        for line in f:
            if line in ['\n', '\n\r']:
                objects.append(json.loads(data))
                data = ''
            else:
                data += line
        try:
            objects.append(json.loads(data))
        except:
            return objects
    return objects

# get original sentence, compression sentence
Project: minke    Author: DistrictDataLabs
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                    'tokenizers/punkt/english.pickle'),
                 encoding='utf8', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._good_tags = tags or self.TAGS
Project: minke    Author: DistrictDataLabs
def docs(self, fileids=None, categories=None):
        """
        Returns the complete JSON document for every file in the corpus.
        Note that I attempted to use the nltk ``CorpusView`` and ``concat``
        methods here, but was not getting memory-safe iteration. Instead, a
        simple Python generator did a far better job of ensuring that file
        handles got closed and that not all data was loaded into memory at
        once. In the future, I will try to re-implement the corpus view.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with codecs.open(path, 'r', encoding=enc) as f:
                yield json.load(f)
Project: Price-Comparator    Author: Thejas-1
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: DNN-Sentiment    Author: awjuliani
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Project: DNN-Sentiment    Author: awjuliani
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)),np.ones(len(book))]).T
    sentences, labels = x_text,y
    sentences_padded = pad_sentences(sentences)



    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: neighborhood_mood_aws    Author: jarrellmark
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: hate-to-hugs    Author: sdoran35
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: FancyWord    Author: EastonLee
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: beepboop    Author: nicolehe
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: kind2anki    Author: prz3m
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: but_sentiment    Author: MixedEmotions
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Project: clickbait    Author: bhargaviparanjape
def add_full_stops_to_the_end(infile, outfile):
    # clean data of small titles and add full stops for NLTK to work
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with three or fewer words
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),



############################################
#   Convert All except first word and quotes
#   to lower case                          #
############################################
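
The helper that this banner introduces is not included in the excerpt above. As a rough, hypothetical sketch of the transformation it describes (lower-casing everything except the first word and double-quoted spans), one possible implementation is:

import re

def lowercase_except_first_and_quotes(line):
    # Hypothetical helper, not the clickbait project's actual code.
    first, _, rest = line.partition(' ')
    # Split on double-quoted spans, keeping them as separate chunks.
    chunks = re.split(r'("[^"]*")', rest)
    rest = ''.join(c if c.startswith('"') else c.lower() for c in chunks)
    return first + (' ' + rest if rest else '')
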
Project: Malicious_Website_Detection    Author: medhini
def location(url):
    fdata={'Accept':'*/*',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'en-US,en;q=0.8',
    'Connection':'keep-alive',
    'Content-Length':'29',
    'Content-type':'application/x-www-form-urlencoded',
    'Cookie':'PHPSESSID=hisbu0rrh09nssn99vckkqr740; __utma=103585558.1324897437.1443987736.1443987736.1443987736.1; __utmb=103585558.2.10.1443987736; __utmc=103585558; __utmz=103585558.1443987736.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
    'Host':'get-site-ip.com',
    'Origin':'http://get-site-ip.com',
    'Referer':'http://get-site-ip.com/',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response=requests.post('http://get-site-ip.com/_pages/_moduler/ajaxSkrivUtIpPaNamn.php',data={'dnsNakeLookUp_In':url})
    #print response.content
    soup=BeautifulSoup(response.content,"lxml")
    #print "Location : "
    for i in soup.find_all("div", { "class" :"response"}):
    #   print i.get_text()
    #   print i.get_text().split('-')[2].replace(' ','')
        return i.get_text().split('-')[2].replace(' ','')

#Finds number of special characters
Project: Malicious_Website_Detection    Author: medhini
def nofoutofplacefeatures(url):


#   pdb.set_trace()

    if url[:4]=="http":
        r = requests.get(url)
    else:
        url="http://"+url
        r  = requests.get(url)

    #r = requests.get(url)
    data = r.text
    data2=r.content

    document, errors = tidy_document(data,
      options={'numeric-entities':1})

    #print document
    #print errors
    #print "Number of Elements Out of Place : " + str(len(errors))
    return len(errors)
Project: Malicious_Website_Detection    Author: medhini
def reg_date(url):
    url=url.strip("www.")
    print url
    ur="http://www.whois.com/whois/"+url
    r = requests.get(ur)
    data = r.content.decode("utf-8")

    #print data
    try :
        soup = BeautifulSoup(data)
        #<div class="whois_result" 
        for link in soup.find_all("div",{"class":"whois_result"}):
            site = link.get_text().lower()
            print site.decode("utf-8")
            print "\n date is \n"
            print re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d",site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d",site.decode("utf-8"))[1]
    except:
        pass
Project: Malicious_Website_Detection    Author: medhini
def nofoutofplacefeatures(url):
    try:


    #   pdb.set_trace()

        if url[:4]=="http":
            r = requests.get(url)
        else:
            url="http://"+url
            r  = requests.get(url)

        #r = requests.get(url)
        data = r.text
        data2=r.content

        document, errors = tidy_document(data,
          options={'numeric-entities':1})

        #print document
        #print errors
        #print "Number of Elements Out of Place : " + str(len(errors))
        return len(errors)
    except:
        pass
Project: Malicious_Website_Detection    Author: medhini
def reg_date(url):
    url=url.strip("www.")
    #print url
    ur="http://www.whois.com/whois/"+url
    r = requests.get(ur)
    data = r.content.decode("utf-8")

    #print data
    try :
        soup = BeautifulSoup(data,"lxml")
        #<div class="whois_result" 
        for link in soup.find_all("div",{"class":"whois_result"}):
            site = link.get_text().lower()
            #print site.decode("utf-8")
            print "\n Domain registration date is " + re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d",site.decode("utf-8"))[1]

            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d",site.decode("utf-8"))[1]
    except:
        pass
Project: Malicious_Website_Detection    Author: medhini
def location(url):
    fdata={'Accept':'*/*',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'en-US,en;q=0.8',
    'Connection':'keep-alive',
    'Content-Length':'29',
    'Content-type':'application/x-www-form-urlencoded',
    'Cookie':'PHPSESSID=hisbu0rrh09nssn99vckkqr740; __utma=103585558.1324897437.1443987736.1443987736.1443987736.1; __utmb=103585558.2.10.1443987736; __utmc=103585558; __utmz=103585558.1443987736.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
    'Host':'get-site-ip.com',
    'Origin':'http://get-site-ip.com',
    'Referer':'http://get-site-ip.com/',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response=requests.post('http://get-site-ip.com/_pages/_moduler/ajaxSkrivUtIpPaNamn.php',data={'dnsNakeLookUp_In':url})
    #print response.content
    soup=BeautifulSoup(response.content,"lxml")
    #print "Location : "
    for i in soup.find_all("div", { "class" :"response"}):
    #   print i.get_text()
    #   print i.get_text().split('-')[2].replace(' ','')
        return i.get_text().split('-')[2].replace(' ','')

#Finds number of special characters
Project: Malicious_Website_Detection    Author: medhini
def nofoutofplacefeatures(url):


#   pdb.set_trace()

    if url[:4]=="http":
        r = requests.get(url)
    else:
        url="http://"+url
        r  = requests.get(url)

    #r = requests.get(url)
    data = r.text
    data2=r.content

    document, errors = tidy_document(data,
      options={'numeric-entities':1})

    #print document
    #print errors
    #print "Number of Elements Out of Place : " + str(len(errors))
    return len(errors)
Project: scientific-paper-summarisation    Author: EdCo95
def read_data(source):
    """
    Reads the sentence data from the csv file, which is of the form (sentence, is_summary_sentence).
    Args:
        source = the data file to read the data from
    Returns:
        A list of tuples where each tuple is of the form (sentence, is_summary_sentence).
    """

    sentences = []
    count = 0
    with open(source, "r") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence = row[0]
            sentence = sentence.strip("\"")
            sentence = sentence.strip("[")
            sentence = sentence.strip("]")
            sentence = sentence.replace("'", "")
            sentence = sentence.replace(" ", "")
            sentence = sentence.split(",")
            sentences.append(sentence)
            count += 1

    return sentences


# ============================================

# ================ MAIN PROGRAM ==============


# Read in all of the papers into a list of lists. Each item in the list is a sentence, in the form of a list of words.
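
The loading step that this comment describes is not part of the excerpt. A minimal, hypothetical sketch of the structure it mentions (each item a sentence represented as a list of words), assuming plain-text paper files and the punkt sentence tokenizer, might look like:

import glob
import nltk.data
from nltk.tokenize import word_tokenize

# Hypothetical loader; the project's own reading code is not shown here.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

papers = []
for path in glob.glob('papers/*.txt'):  # assumed location of the plain-text papers
    with open(path) as f:
        text = f.read()
    # Each item appended is one sentence, as a list of words.
    papers.extend(word_tokenize(s) for s in sent_tokenizer.tokenize(text))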
Project: Smelly-London    Author: Smelly-London
def tokenize_to_sentence(text):
    parser = nltk.data.load('tokenizers/punkt/english.pickle')
    # split into sentences
    sentences = parser.tokenize(text.strip())
    return [lemmatize_sentence(sentence) for sentence in sentences]
Project: Smelly-London    Author: Smelly-London
def getMeta(self, fileName):
        """Return the meta data for a given fileName e.g year, url, MOH, borough, bID.  """
        splitReport = fileName.split('.')
        bID = splitReport[2]
        year = splitReport[1]
        url = self.getUrl(bID)
        try:
            region = mapping[bID][1]
            mohRegion = mapping[bID][0]
        except:
            # TODO there is a problem with mappings e.g Acton.1915.b19783905.txt. Region cannot be found
            print(fileName)
            return (None, None, None, None, None)
        return year, region, bID, url, mohRegion
Project: steam_game_generator    Author: applepinegames
def get_app_data(app_id):
  url = 'http://store.steampowered.com/api/appdetails?appids=' + str(app_id)
  response = urllib.urlopen(url)
  try:
    data = json.loads(response.read())
    if not data[str(app_id)]['success'] or data[str(app_id)]['data']['type'] != 'game':
      return None
    return data[str(app_id)]
  except:
    return None
Project: steam_game_generator    Author: applepinegames
def get_apps():
  url = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
  response = urllib.urlopen(url)
  try:
    data = json.loads(response.read())
    apps = data['applist']['apps']
    return apps
  except:
    return None
Project: steam_game_generator    Author: applepinegames
def get_description_from_app_data(app_data):
  description = clean_string(app_data['data']['detailed_description'])
  sentences = SENTENCE_DETECTOR.tokenize(description.strip())
  if len(sentences) > 0:
    sentences = sentences[0:(min(3, len(sentences)))]
    sentences = [x for x in sentences if len(x.split(' ')) > 5 and not x.split(' ')[0].isupper() and x.find('\r') == -1]
    combined_sentence = ' '.join(sentences)
    if len(combined_sentence) == 0 or not combined_sentence[0].isalpha() or len(combined_sentence.split(' ')) < 5:
      return None
    return combined_sentence
  return None
Project: steam_game_generator    Author: applepinegames
def get_title_from_app_data(app_data):
  return clean_string(app_data['data']['name'])
Project: SCDL    Author: lngvietthang
def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load data for training and testing from json file
    :param path_to_json: path to json file
    :param test_split: fraction of the data reserved for testing
    :param vocabulary_size: maximum vocabulary size
    :return: (train split, test split, (original sentences, compressed sentences))
    '''
    X=[]
    y=[]
    len_sent_array=[]
    sample_weight=[]
    objests=read_json_file(path_to_json)
    print 'Data %d sentences'%len(objests)
    i=0
    original_sentence_array=[]
    compression_sentence_array=[]
    word2indext_dict, _ = word2index(objests, vocabulary_size)
    for object in objests:
        original_sentence, compression_sentence = get_originalSent_compressionSent(object)
        (array_sent, sample_w) = word2vec(original_sentence, word2indext_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)
        (y_l,l) = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)
        i+=1
        if i%100==0:
            sys.stdout.write('.')
        #get text array:
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)
    n_test = int(len(X) * test_split)
    train = (X[n_test:], y[n_test:], len_sent_array[n_test:], sample_weight[n_test:])
    test = (X[:n_test], y[:n_test], len_sent_array[:n_test], sample_weight[:n_test])
    return (train, test, (original_sentence_array, compression_sentence_array))
Project: minke    Author: DistrictDataLabs
def feeds(self):
        """
        Opens and returns the collection of feeds associated with the corpus.
        """
        data = self.open('feeds.json')
        return json.load(data)
Project: minke    Author: DistrictDataLabs
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }
Project: minke    Author: DistrictDataLabs
def html(self, fileids=None, categories=None):
        """
        The preprocessed pickles do not contain HTML data.
        """
        raise TypeError(
            "Preprocessed corpus does not contain HTML data."
        )
Project: EventMiner    Author: hltcoe
def prep_data(data):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_detector.tokenize(data['content'].strip())
    sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]}
    data['sents'] = sent_dict

    return data
Project: Price-Comparator    Author: Thejas-1
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
Project: Price-Comparator    Author: Thejas-1
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
Project: Price-Comparator    Author: Thejas-1
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data
Project: Price-Comparator    Author: Thejas-1
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
Project: Price-Comparator    Author: Thejas-1
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension
Project: Price-Comparator    Author: Thejas-1
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
Project: Price-Comparator    Author: Thejas-1
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept 
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if (label in concepts):
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts
Project: Price-Comparator    Author: Thejas-1
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True
Project: w2vec-similarity    Author: jayantj
def tokenize_sentences(text):
  import nltk.data
  sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
  return sent_tokenizer.tokenize(text)
Project: Humour-Detection    Author: srishti-1795
def readFileOfReviews():
    # Read each review from file
    global reviewsLst
    preview = open("data.txt", "rb")
    reviewsLst = pickle.load(preview)
Project: DNN-Sentiment    Author: awjuliani
def load_data():
    """
    Loads and preprocessed data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))