Python gensim.corpora 模块,WikiCorpus() 实例源码


项目:wiki-sim-search    作者:chrisjmccormick    | 项目源码 | 文件源码
def formatTime(seconds):
    """
    Format a number of elapsed seconds as an ``h:mm`` string.

    Leftover seconds are dropped (not rounded), and hours are not wrapped at
    24, so long runs read e.g. ``"30:05"``.

    :param seconds: elapsed time in seconds.
    :return: the elapsed time as ``"h:mm"``, minutes zero-padded to 2 digits.
    """
    # Only whole minutes matter for the output; the remainder is discarded.
    total_minutes, _ = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%d:%02d" % (hours, minutes)

# TODO - Add example code for loading each item back from disk (if needed).
#      - Maybe a commented line below the 'save' command?

# ======== main ========
# Main entry point for the script.
# This little check has to do with the multiprocessing module (which is used
# by WikiCorpus). Without it, the code will spawn infinite processes and hang!
项目:DataScience-And-MachineLearning-Handbook-For-Coders    作者:wxyyxc1992    | 项目源码 | 文件源码
def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
    """
    Dump the articles of a Wikipedia archive to a plain-text file.

    Each article yielded by ``WikiCorpus.get_texts()`` is written as one
    line, with its tokens joined by single spaces.

    Arguments:
    wiki_data_path -- path of the Wikipedia dump handed to WikiCorpus
    wiki_texts_path -- path of the UTF-8 text file to (over)write
    """
    if not wiki_data_path:
        print("??? Wiki ?????????? ??")
        # Without an input path there is nothing to convert; stop here
        # instead of letting WikiCorpus fail on an empty path.
        return

    # NOTE(review): dictionary={} is presumably passed so that WikiCorpus
    # does not scan the dump to build its own dictionary - confirm against
    # the gensim documentation.
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0

    with open(wiki_texts_path, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            # Tokens are joined as bytes and decoded once per article.
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            # Progress report every 10000 articles.
            if texts_num % 10000 == 0:
                print("??? %d ???" % texts_num)

    print("???????? OpenCC ??????")
项目:blstm-cws    作者:chantera    | 项目源码 | 文件源码
def zhwiki2chars(in_file, out_file):
    """
    Convert a Wikipedia dump into space-separated characters, one article
    per line.

    Every token of every article is split into its individual characters;
    tokens made up only of ASCII letters are left out.

    :param in_file: path of the Wikipedia dump handed to WikiCorpus.
    :param out_file: path of the UTF-8 text file to (over)write.
    """
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        # True when the whole string consists of ASCII letters only.
        return reg.match(string) is not None

    i = 0
    # The context manager guarantees the output file is flushed and closed,
    # even if iteration over the dump fails part-way through.
    with open(out_file, 'w', encoding='utf-8') as out:
        wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
        for article in wiki.get_texts():
            tokens = []
            for token in article:
                token = token.decode("utf-8").strip()
                if _isalpha(token):
                    # NOTE(review): the body of this branch was missing;
                    # skipping purely alphabetic tokens is the assumed
                    # intent - confirm against the upstream project.
                    continue
                tokens.append(" ".join(token))  # divided by character
            out.write(" ".join(tokens) + "\n")
            i += 1
            # Progress report every 10000 articles.
            if i % 10000 == 0:
                print("process %d articles" % i)
项目:word2vec-tutorial    作者:zake7749    | 项目源码 | 文件源码
def main():
    """
    Convert the Wikipedia dump named on the command line to ``wiki_texts.txt``.

    Expects exactly one command-line argument, the path of the dump. Each
    article is written as one UTF-8 line of space-separated tokens.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # Stop here: sys.argv[1] may not exist, so continuing would crash.
        return

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            # Tokens are joined as bytes and decoded once per article.
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            # Progress report every 10000 articles.
            if texts_num % 10000 == 0:
                print("??? %d ???" % texts_num)
项目:word2vec-tutorial    作者:zake7749    | 项目源码 | 文件源码
def main():
    """
    Convert the Wikipedia dump named on the command line to ``wiki_texts.txt``.

    Expects exactly one command-line argument, the path of the dump. Each
    article is written as one UTF-8 line of space-separated tokens; tokens
    are joined as ``str`` here.
    """
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        # Stop here: sys.argv[1] may not exist, so continuing would crash.
        return

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            # Progress report every 10000 articles.
            if texts_num % 10000 == 0:
                print("??? %d ???" % texts_num)
项目:Word2vec    作者:Alex-CHUN-YU    | 项目源码 | 文件源码
def set_wiki_to_txt(self, wiki_data_path = None):
    """
    Convert a Wikipedia dump to ``wiki_text.txt``, one article per line.

    :param wiki_data_path: path of the dump. When ``None`` the path is taken
        from the single command-line argument instead; if that argument is
        missing, a usage message is printed and nothing is written.
    """
    if wiki_data_path is None:
        # Fall back to the command-line argument.
        if len(sys.argv) != 2:
            print("Please Usage: python3 " + sys.argv[0] + " wiki_data_path")
            # No usable path: stop instead of building a corpus from nothing.
            return
        wiki_data_path = sys.argv[1]
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary = {})
    # wiki.xml convert to wiki.txt
    with open("wiki_text.txt", 'w', encoding = 'utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # Join the tokens as bytes, then decode once as UTF-8.
            output.write(b' '.join(text).decode('utf-8') + '\n')
            text_count += 1
            # Progress report every 10000 articles.
            if text_count % 10000 == 0:
                print("????? %d ???" % text_count)
项目:ChineseSA    作者:cwlseu    | 项目源码 | 文件源码
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    """
    Wrap a Wikipedia dump in a ``WikiCorpus``.

    :param fname: path of the Wikipedia dump.
    :param _lemmatize: forwarded to ``WikiCorpus`` as ``lemmatize``.
    :param _dictionary: forwarded to ``WikiCorpus`` as ``dictionary``.
    :param filter_namespaces: forwarded to ``WikiCorpus``; previously this
        argument was accepted but silently ignored.
    """
    self.fname = fname
    self.logger = startlog()
    # NOTE(review): the `{}` default is one shared object across calls. It is
    # kept because replacing it with None would change what WikiCorpus
    # receives - confirm nothing mutates it.
    self.corpus = WikiCorpus(
        fname,
        lemmatize=_lemmatize,
        dictionary=_dictionary,
        filter_namespaces=filter_namespaces,
    )
项目:ChineseSA    作者:cwlseu    | 项目源码 | 文件源码
def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    """
    Wrap a Wikipedia dump in a ``WikiCorpus`` and reset the training-corpus
    file name.

    :param fname: path of the Wikipedia dump.
    :param _lemmatize: forwarded to ``WikiCorpus`` as ``lemmatize``.
    :param _dictionary: forwarded to ``WikiCorpus`` as ``dictionary``.
    :param filter_namespaces: forwarded to ``WikiCorpus``; previously this
        argument was accepted but silently ignored.
    """
    self.fname = fname
    self.logger = startlog()
    # NOTE(review): the `{}` default is one shared object across calls. It is
    # kept because replacing it with None would change what WikiCorpus
    # receives - confirm nothing mutates it.
    self.corpus = WikiCorpus(
        fname,
        lemmatize=_lemmatize,
        dictionary=_dictionary,
        filter_namespaces=filter_namespaces,
    )
    # No training-corpus file is associated with the instance yet.
    self.traincorpusfname = None
项目:KEM    作者:UDICatNCHU    | 项目源码 | 文件源码
def wikiToTxt(self):
    """
    Convert ``./build/zhwiki-latest-pages-articles.xml.bz2`` to
    ``./build/wiki_texts.txt``, one article per line.

    Each article's tokens are joined with single spaces and written as
    UTF-8. Per the original author's note, this takes about 25 minutes.
    """
    # Imported here so gensim is only loaded when the conversion is run.
    from gensim.corpora import WikiCorpus

    wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})

    texts_num = 0
    with open('./build/wiki_texts.txt', 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            # Tokens are joined as bytes and decoded once per article.
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            # Progress report every 10000 articles.
            if texts_num % 10000 == 0:
                print("??? %d ???" % texts_num)
项目:Book_DeepLearning_Practice    作者:wac81    | 项目源码 | 文件源码
def get_save_wikitext(wiki_filename,text_filename):
    """
    Write every article of a Wikipedia dump to a text file, one per line.

    :param wiki_filename: path of the Wikipedia dump handed to WikiCorpus.
    :param text_filename: path of the text file to (over)write.
    """
    # The counter must start at zero; it was previously read before being
    # assigned, which raises UnboundLocalError on the first article.
    i = 0
    # The context manager closes the file even if iteration fails.
    with open(text_filename, 'w') as output:
        wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
        for text in wiki.get_texts():
            # text = delNOTNeedWords(text,"../../stopwords.txt")[1]
            output.write(" ".join(text) + "\n")
            i = i + 1
            if (i % 10000 == 0):
                # NOTE(review): the reporting call was lost; print is
                # assumed - the upstream project may use a logger instead.
                print("Saved " + str(i) + " articles")
项目:twitter_LDA_topic_modeling    作者:kenneth-orton    | 项目源码 | 文件源码
def main():
    """
    Command-line entry point with four sub-commands.

    * ``text``   - build a corpus from a directory of text files.
    * ``wiki``   - build a corpus from a compressed Wikipedia dump.
    * ``lda``    - train an LDA model from a saved corpus and dictionary.
    * ``ldavis`` - create a visualization of a saved LDA model.

    For ``text`` and ``wiki`` the corpus is serialized to
    ``<corp_loc>.mm`` and its dictionary saved to ``<corp_loc>.dict``.
    """
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    subparsers = parser.add_subparsers(dest='mode')

    text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc', help='Directory where tweet documents stored')
    text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
    wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc', help='Location of compressed Wikipedia dump')
    wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
    lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', dest='num_topics', help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', dest='num_pass', help='Number of passes through corpus when training the LDA model')
    lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location and name to save LDA model')

    lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
    lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of LDA model')

    args = parser.parse_args()

    if args.mode == 'text':
        doc_corpus = DocCorpus(args.docs_loc, args.lemma)

        doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)

        MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
        # NOTE(review): the source line was truncated to "+ '.dict')";
        # saving the dictionary next to the corpus is the reconstructed
        # intent - confirm against the upstream project.
        doc_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'wiki':
        # Only the lemmatize flag differs between the two configurations;
        # args.lemma is a store_true flag, so it is already a bool.
        wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=args.lemma, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)

        wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)

        MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
        # NOTE(review): reconstructed from a truncated line, as above.
        wiki_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'lda':
        build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)

    if args.mode == 'ldavis':
        build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)