Python networkx module: pagerank() example source code

From open-source Python projects, we extracted the following 35 code examples that illustrate how to use networkx.pagerank().
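
As a quick orientation before the project listings, here is a minimal self-contained sketch (not taken from any of the projects below): nx.pagerank() accepts a graph and an optional damping factor alpha, and returns a dict mapping each node to a score; the scores sum to 1.

import networkx as nx

g = nx.DiGraph()
g.add_edges_from([("a", "b"), ("b", "c"), ("c", "a"), ("a", "c")])
scores = nx.pagerank(g, alpha=0.85)   # alpha is the damping factor (default 0.85)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))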

Project: stock-eagle    Author: mtusman    | project source | file source
def rank(nodes, edges):
    ''' Creates the graph with the calculated nodes (sentences) and their weights '''
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)
    graph.add_weighted_edges_from(edges)
    ''' Uses Google's PageRank formula to find the most important sentences '''
    return nx.pagerank(graph)
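
A hypothetical usage sketch for rank() above (the sentence ids and edge weights are illustrative only): nodes are sentences and the weighted edges carry their pairwise similarity.

nodes = ["s0", "s1", "s2"]
edges = [("s0", "s1", 0.4), ("s1", "s2", 0.7), ("s0", "s2", 0.1)]
scores = rank(nodes, edges)            # dict: sentence id -> PageRank score
best = max(scores, key=scores.get)     # the most central sentence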
Project: Papyrus--simple-but-effective-text-summarization-tool    Author: RebeccaMerrett    | project source | file source
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    similarity_graph = normalized_matrix * normalized_matrix.T #term frequency/inverse doc frequency applied
    similarity_graph.toarray()
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph) #TextRank applied
    ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
    ten_percent = int(round(10.00/100.00 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
    return "\n\n".join(summary)

#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def extractSentences(document):
    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    # sentenceTokens = sent_detector.tokenize(text.strip())
    sentenceTokens = document.sentences()
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary
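
buildGraph() is defined elsewhere in the project and does not appear in this listing. A minimal stand-in (an assumption about its contract, not the project's implementation) that yields the weighted sentence graph expected by nx.pagerank(graph, weight='weight'):

import itertools
import networkx as nx

def buildGraph(sentences):
    g = nx.Graph()
    g.add_nodes_from(sentences)
    for s1, s2 in itertools.combinations(sentences, 2):
        overlap = len(set(s1.split()) & set(s2.split()))   # crude word-overlap similarity
        if overlap:
            g.add_edge(s1, s2, weight=overlap)
    return g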
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def extractSentences(text):  # this should be a bunch of sentences, not just one sentence 
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')  # implemented weight graph here 

    #most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def cosine_similarity_self(A):
    similarity = np.dot(A, A.T)
    square_mag = np.diag(similarity)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = similarity * inv_mag
    cosine = cosine.T * inv_mag
    return cosine
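
A usage sketch for cosine_similarity_self(): the rows of A are vectors (for example TF-IDF sentence vectors), and the result is the pairwise cosine-similarity matrix; the isinf guard keeps all-zero rows from producing NaNs.

import numpy as np

A = np.array([[1.0, 0.0, 1.0],
              [1.0, 1.0, 0.0],
              [0.0, 0.0, 0.0]])        # the zero row exercises the isinf guard
sim = cosine_similarity_self(A)
print(np.round(sim, 3))                # sim[i, j] == cosine(A[i], A[j])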

# document should be a list of sentences
# method = "word2vec", "lda", "tfidf"
# def extraction(document, method="rawText"):
#
#     # graph = build_graph(document, method)  # document is a list of sentences
#
#     calculated_page_rank = networkx.pagerank(graph, weight="weight")
#
#     # most important sentences in descending order of importance
#     sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=False)
#
#     return sentences[0:4]
Project: KDDCUP2016    Author: hugochan    | project source | file source
def __init__(self):
        self.index = Index(config.INDEX_PATH)

        # Checks if the full graph for this dataset was already ranked.
        # If not, run page rank and store the results
        pr_file_path = "%s/page_rank/%s.p" % (config.DATA, config.DATASET)
        if not os.path.exists(pr_file_path):
            g = nx.DiGraph()
            g.add_edges_from(model.get_all_edges())

            print "Running pageRank with %d nodes." % g.number_of_nodes()
            self.pr = nx.pagerank(g)

            cPickle.dump(self.pr, open(pr_file_path, "w"))

        # Else, just loads it
        else:
            self.pr = cPickle.load(open(pr_file_path, 'r'))
Project: text-analytics-with-python    Author: dipanjanS    | project source | file source
def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency'):

    vec, dt_matrix = build_feature_matrix(norm_sentences, 
                                      feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)

    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)   

    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)

    top_sentence_indices = [ranked_sentences[index][1] 
                            for index in range(num_sentences)]
    top_sentence_indices.sort()

    for index in top_sentence_indices:
        print sentences[index]
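
The excerpt above relies on sentences, norm_sentences and build_feature_matrix being defined elsewhere in the book's module; they are not part of this listing. A rough sketch of the kind of preprocessing it assumes (the helper names match the calls above, but these bodies are assumptions, not the book's code):

import re
from sklearn.feature_extraction.text import TfidfVectorizer

def build_feature_matrix(docs, feature_type='tfidf'):
    # assumed helper: returns the fitted vectorizer and the document-term matrix
    vec = TfidfVectorizer(min_df=1)
    return vec, vec.fit_transform(docs)

document = ("Text summarization picks the most central sentences. "
            "PageRank over a sentence similarity graph is one way to rank them. "
            "The highest scoring sentences then form the summary.")
sentences = [s.strip() for s in document.split('.') if s.strip()]
norm_sentences = [re.sub(r'[^a-zA-Z\s]', '', s).lower().strip() for s in sentences]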
Project: bookmark_analysis    Author: tarwn    | project source | file source
def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    # most important sentences in descending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get,
                       reverse=True)

    # return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary
Project: acl2017-interactive_summarizer    Author: UKPLab    | project source | file source
def add_sentences(self, sentences):
        """
        @type sentences: list[Sentence]
        :param sentences:
        :return:
        """
        counter = self.counter
        G = self.G
        for sentence in sentences:
            G.add_nodes_from(sentence.concepts)
            counter.update(ngrams(sentence.concepts, self.N))

        for (keys, value) in counter.items():
            for i in range(0, len(keys) - 1):
                for j in range(1, len(keys)):
                    G.add_edge(keys[i], keys[j], weight=value)
                    # counter.update((keys[i], keys[j]))

        # for (key, value) in counter.items():
        #     G.add_edge(key[0], key[1], attr={"weight": value})

        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        self.pr = nx.pagerank(G)
Project: acl2017-interactive_summarizer    Author: UKPLab    | project source | file source
def incorporate_feedback(self, flightrecorder):
        """

        :param flightrecorder:
        :return:
         @type flightrecorder: FlightRecorder
        """
        G = self.G
        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        # use the pagerank personalization feature to incorporate flightrecorder feedback

        union = flightrecorder.union()

        for rejected in union.reject:
            if(G.has_node(rejected)):
                G.remove_node(rejected)

        print("V := (N,E), |N| = %s, |E| = %s" % (len(G.nodes()), len(G.edges())))

        self.pr = nx.pagerank(G)
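
The comment above refers to nx.pagerank's personalization feature, which this excerpt does not end up calling. A standalone sketch of that feature (illustrative only, not the project's code): nodes the user accepted get a larger restart weight, which biases the ranking toward them.

import networkx as nx

g = nx.Graph()
g.add_edges_from([("battery", "life"), ("battery", "price"), ("price", "value")])
accepted = {"battery"}                                     # e.g. concepts kept by the user
personalization = {n: 10.0 if n in accepted else 1.0 for n in g.nodes()}
scores = nx.pagerank(g, personalization=personalization)   # weights are normalized internally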
Project: TextRankPlus    Author: zuoxiaolei    | project source | file source
def sort_sentences(sentences, words,model, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = get_similarity( _source[x], _source[y], model)
            graph[x, y] = similarity
            graph[y, x] = similarity
    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
Project: TextRankPlus    Author: zuoxiaolei    | project source | file source
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
Project: Naver-News-Summarizer    Author: devFallingstar    | project source | file source
def __init__(self, text):
        self.sentences = get_sentences(text)
        self.graph = build_graph(self.sentences)
        self.pagerank = networkx.pagerank(self.graph, weight='weight')
        self.reordered = sorted(self.pagerank, key=self.pagerank.get, reverse=True)
        self.nouns = []
        for sentence in self.sentences:
            self.nouns += sentence.nouns
        self.bow = collections.Counter(self.nouns)
Project: JustCopy    Author: exe1023    | project source | file source
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)        
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
Project: PhD    Author: wutaoadeny    | project source | file source
def Page_Rank(G):   
    PageRank_Centrality = nx.pagerank(G, alpha=0.85)
    #print "PageRank_Centrality:", sorted(PageRank_Centrality.iteritems(), key=lambda d:d[1], reverse = True)
    return PageRank_Centrality
Project: PhD    Author: wutaoadeny    | project source | file source
def Page_Rank(G):
    PageRank_Centrality = nx.pagerank(G, alpha=0.85)
    #print "PageRank_Centrality:", sorted(PageRank_Centrality.iteritems(), key=lambda d:d[1], reverse = True)
    return PageRank_Centrality
Project: tweetopo    Author: zthxxx    | project source | file source
def __init__(self, edges, measure='pagerank'):
        '''
        Class for analyzing a graph.
        :param edges: weighted edges; each edge must be given as a 3-tuple (u, v, weight)
        :param measure: which measure to use for analysis and filtering;
                        must be one of 'degree', 'pagerank' or 'clustering'
        '''
        self.measures = ['degree', 'pagerank', 'clustering']
        self.measure = measure
        self.ranks = {}
        self.G = nx.Graph()
        self.import_data(edges)
Project: tweetopo    Author: zthxxx    | project source | file source
def get_pageranks(self):
        pageranks = nx.pagerank(self.G)
        max_pagerank = max(pageranks.values())
        return pageranks, max_pagerank
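
A standalone sketch of one plausible use of the (pageranks, max_pagerank) pair returned above (an assumption about intent, not the project's code): dividing every score by the maximum yields relative scores in (0, 1], which is convenient for sizing or filtering nodes.

import networkx as nx

g = nx.Graph()
g.add_weighted_edges_from([("a", "b", 1.0), ("b", "c", 2.0), ("a", "c", 0.5)])
pageranks = nx.pagerank(g)
max_pagerank = max(pageranks.values())
relative = {node: score / max_pagerank for node, score in pageranks.items()}  # top node -> 1.0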
Project: senpai    Author: AdiChat    | project source | file source
def extractSentences(self, text):
        '''
        Extracts sentences from the graph using pagerank
        Arguments:
            text: input textual data
        Returns:
            summary: a bunch of sentences
        Raises:
            None
        '''
Project: senpai    Author: AdiChat    | project source | file source
def extractSentences(self, text):
        '''
        Extracts sentences from the graph using pagerank
        Arguments:
            text: input textual data
        Returns:
            summary: a bunch of sentences
        Raises:
            None
        '''
Project: AIZooService    Author: zhanglbjames    | project source | file source
def sort_sentences(sentences, words, sim_func = get_similarity, pagerank_config = {'alpha': 0.85,}):
    """???????????????

    Keyword arguments:
    sentences         --  ????????
    words             --  ?????????sentences???????????????
    sim_func          --  ????????????????????????
    pagerank_config   --  pagerank???
    """
    sorted_sentences = []
    _source = words
    sentences_num = len(_source)        
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = sim_func( _source[x], _source[y] )
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)              # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(index=index, sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences
Project: KDDCUP2016    Author: hugochan    | project source | file source
def search(self, query, exclude=[], force=False, limit=20):

        # Fetches all document that have at least one of the terms
        pubs = self.index.search(query,
                                 search_fields=["title", "abstract"],
                                 return_fields=["id"],
                                 ignore=exclude)

        # Unpack and convert to a set for fast lookup
        pubs = set([pub_id for (pub_id,) in pubs])

        # index_ids, _scores = self.index.search(query, ["title", "abstract"], limit=limit, mode="ALL")
        # docs = set(self.index.get_documents(index_ids, "id"))

        g = nx.DiGraph()
        for u, v in self.edges:
            if (u in pubs) and (v in pubs):
                g.add_edge(u, v)

            #       print "PageRank with %d nodes." % g.number_of_nodes()
        r = nx.pagerank(g, alpha=0.7)

        if len(r) == 0:
            return []

        ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
        return ids[:limit]
Project: KDDCUP2016    Author: hugochan    | project source | file source
def search(self, query, exclude=[], limit=50, force=False):

        graph = build_graph(query,
                            self.params['K'],
                            self.params['H'],
                            self.params['min_topic_lift'],
                            self.params['min_ngram_lift'],
                            exclude, force, load=True)


        # Simple method to check if node is a document node.
        is_doc = lambda node: node["type"] == "paper"

        # Builds a new unweighted graph with only the documents as nodes
        docs_graph = nx.DiGraph()

        # Removes all non doc nodes
        for u, v in graph.edges():
            u = graph.node[u]
            v = graph.node[v]
            if is_doc(u) and is_doc(v):
                docs_graph.add_edge(u["entity_id"], v["entity_id"])

        r = nx.pagerank(docs_graph, alpha=0.7)
        if len(r) == 0:
            return []

        ids, _pg = zip(*sorted(r.items(), key=lambda (k, v): v, reverse=True))
        return ids[:limit]
Project: academic    Author: xinchrome    | project source | file source
def venueNet_feature():
    # output: compute the author centrialy for each venue
    #         venue centrality dict

    CSpaper = pickle.load(open(cspath+"CSvenuePaper","rb")) # all papers in CS venues
    CSvenue_paper = pickle.load(open(cspath+"CSvenue_paper","rb")) #data type, dict, key, value: list
    Citations = pickle.load(open(cspath+"Citations","rb"))
    CSPV = pickle.load( open(cspath+"CSvenuePaper_Venue","rb")) #data type, dict, key, value: list
    nodeSet = set()
    edgeSet = set()
    for key,val in CSvenue_paper.iteritems():
        nodeSet.add(key)
        temp = defaultdict(int)
        for p in val:
            for citing in Citations[p]:
                if citing in CSpaper:
                    temp[(CSPV[citing],key)] +=1
        edges = [(key[0],key[1],val) for key,val in temp.iteritems()]
        edgeSet.update(edges)
    g = nx.DiGraph()
    g.add_nodes_from(nodeSet)
    g.add_weighted_edges_from(edgeSet)

    pr = defaultdict(int)
    for node in g.nodes():
        pr[node]=1

    #DG.add_weighted_edges_from([(1,2,0.5), (3,1,0.75)])
    #pr = nx.pagerank(g)
    #PageRank is time-consuming, so a uniform placeholder score is used above; switch back to nx.pagerank for a real run
    pickle.dump(pr,open(cspath+"venue_cen","wb"))
    print 'venueNet_feature finish'
Project: anomalous-vertices-detection    Author: Kagandi    | project source | file source
def pagerank(self):
        """Return the PageRank of the nodes in the graph.

        Returns
        -------
        pagerank : dictionary
            Dictionary of nodes with PageRank as value

        Examples
        --------
        >>> g.pagerank()
         """
        return nx.pagerank(self._graph, weight=self._weight_field)
Project: acl2017-interactive_summarizer    Author: UKPLab    | project source | file source
def __init__(self, stemmer, language, N=2, G=nx.DiGraph()):
        self.G = G
        self.stemmer = stemmer
        self.language = language
        self.N = N

        self.counter = Counter()
        self.pr = nx.pagerank(G)
Project: TextRankPlus    Author: zuoxiaolei    | project source | file source
def sort_words(vertex_source, edge_source, model, window = 2, pagerank_config = {'alpha': 0.85,}):
    """??????????????

    Keyword arguments:
    vertex_source   --  ???????????????????????????????pagerank????
    edge_source     --  ?????????????????????????????????pagerank???
    window          --  ????????window????????????
    pagerank_config --  pagerank???
    """

    # build the word <-> index mappings
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if not word in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    # build the graph edges
    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                try:
                    similarity = model.similarity(w1,w2)
                    if similarity<0:
                        similarity = 0
                    #print similarity
                except:
                    similarity = 0
                graph[index1][index2] = similarity
                graph[index2][index1] = similarity
#                graph[index1][index2] = 1.0
#                graph[index2][index1] = 1.0

    nx_graph = nx.from_numpy_matrix(graph)

    scores = nx.pagerank(nx_graph, max_iter=100,**pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)
    return sorted_words
Project: JustCopy    Author: exe1023    | project source | file source
def sort_words(vertex_source, edge_source, window = 2, pagerank_config = {'alpha': 0.85,}):
    """??????????????

    Keyword arguments:
    vertex_source   --  ???????????????????????????????pagerank????
    edge_source     --  ?????????????????????????????????pagerank???
    window          --  ????????window????????????
    pagerank_config --  pagerank???
    """
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if not word in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0

    debug('graph:\n', graph)

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)

    return sorted_words
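
combine() and AttrDict are project helpers that are not shown in this listing. A standalone illustration of the same co-occurrence-window idea (an approximation, not the project's combine()): words that appear within a small window of each other in a sentence are linked, and PageRank then ranks the words.

import networkx as nx

sentence = ["text", "rank", "builds", "a", "word", "graph"]
window = 2
g = nx.Graph()
g.add_nodes_from(sentence)
for i, w1 in enumerate(sentence):
    for w2 in sentence[i + 1:i + window]:   # neighbors within the window
        if w1 != w2:
            g.add_edge(w1, w2)
scores = nx.pagerank(g)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))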
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    #this will be used to determine adjacent words in order to construct keyphrases with two words

    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = int(len(word_set_list) / 3)
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
    #together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)

        i = i + 1
        j = j + 1

    return modifiedKeyphrases
Project: YelpDataChallenge    Author: fujunswufe    | project source | file source
def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    #this will be used to determine adjacent words in order to construct keyphrases with two words

    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in descending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = len(word_set_list) / 3
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them
    #together
    modifiedKeyphrases = set([])
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith: 
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point    
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)

        i = i + 1
        j = j + 1

    return modifiedKeyphrases
Project: AIZooService    Author: zhanglbjames    | project source | file source
def sort_words(vertex_source, edge_source, window = 2, pagerank_config = {'alpha': 0.85,}):
    """??????????????

    Keyword arguments:
    vertex_source   --  ???????????????????????????????pagerank????
    edge_source     --  ?????????????????????????????????pagerank???
    window          --  ????????window????????????
    pagerank_config --  pagerank???
    """
    sorted_words   = []
    word_index     = {}
    index_word     = {}
    _vertex_source = vertex_source
    _edge_source   = edge_source
    words_number   = 0
    for word_list in _vertex_source:
        for word in word_list:
            if not word in word_index:
                word_index[word] = words_number
                index_word[words_number] = word
                words_number += 1

    graph = np.zeros((words_number, words_number))

    for word_list in _edge_source:
        for w1, w2 in combine(word_list, window):
            if w1 in word_index and w2 in word_index:
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] = 1.0
                graph[index2][index1] = 1.0

    debug('graph:\n', graph)

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)          # this is a dict
    sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
    for index, score in sorted_scores:
        item = AttrDict(word=index_word[index], weight=score)
        sorted_words.append(item)

    return sorted_words
Project: NLP-Keyword-Extraction-Ensemble-Method    Author: Ashwin-Ravi    | project source | file source
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee, izip
    import networkx, nltk

    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return izip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.iteritems(), key=lambda x: x[1], reverse=True)[:n_keywords]}
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)

    return sorted(keyphrases.iteritems(), key=lambda x: x[1], reverse=True)
Project: Graduation-design    Author: Baichenjia    | project source | file source
def build_matrix():
    ###### build the word <-> index mappings
    word_index = {}  # map: word -> index
    index_word = {}  # map: index -> word
    weibo_data = handel_weibo_data()  # load the preprocessed, tokenized Weibo sentences
    index = 0
    for sent in weibo_data:  # for each sentence
        for word in sent:   # for each word in the sentence
            if not word in word_index.keys():
                word_index[word] = index
                index_word[index] = word
                index += 1
    words_number = index
    #print "words_number", words_number
    ####### build the word co-occurrence graph
    graph = np.zeros((words_number, words_number))  # adjacency matrix
    for word_list in weibo_data:  # for each sentence
        for i in range(len(word_list)):  # every pair of words that co-occur in the same sentence gets an edge
            for j in range(i, len(word_list)):
                w1 = word_list[i]
                w2 = word_list[j]  # the two co-occurring words
                index1 = word_index[w1]
                index2 = word_index[w2]
                graph[index1][index2] += 1   # increment the co-occurrence count
                graph[index2][index1] += 1   # keep the matrix symmetric
    ###### run networkx pagerank over the graph to score and rank the words
    nx_graph = nx.from_numpy_matrix(graph)  # convert to a networkx graph
    scores = nx.pagerank(nx_graph, alpha=0.85)  # run pagerank
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)  # sort by score, descending
    key_words = []  # collected keywords
    for index, score in sorted_scores:
        if index_word[index] == u'??' or index_word[index] == u'??' or len(index_word[index]) == 1:  # skip two specific stop words (characters lost in this listing) and single-character words
            continue
        key_words.append((index_word[index], score))
    ######## write the top 100 keywords to a result file
    fp_textrank_result = open('f://emotion/mysite/Label_extract/result_textrank.txt', 'w+')
    for i in range(100):
        fp_textrank_result.write(key_words[i][0] + ' ' + str(round(key_words[i][1], 10)))
        fp_textrank_result.write('\n')
    fp_textrank_result.close()
    """
    fp_test = open('f://emotion/mysite/Label_extract/test.txt', 'w+')
    for i in range(100):
        fp_test.write(key_words[i][0] + '?')
    fp_test.close()
    """
    print "textrank key word calculate is success..."
    return key_words
Project: breaking_cycles_in_noisy_hierarchies    Author: zhenv5    | project source | file source
def computing_hierarchy(graph_file,players_score_func_name):
    import os.path
    if players_score_func_name == "socialagony":
        from helper_funs import dir_tail_name
        dir_name,tail = dir_tail_name(graph_file)
        agony_file = os.path.join(dir_name,tail.split(".")[0] + "_socialagony.txt")
        #agony_file = graph_file[:len(graph_file)-6] + "_socialagony.txt"
        #from compute_social_agony import compute_social_agony
        #players = compute_social_agony(graph_file,agony_path = "agony/agony ")     
        if False:
        #if os.path.isfile(agony_file):
            print("load pre-computed socialagony from: %s" % agony_file)
            players = read_dict_from_file(agony_file)
        else:
            print("start computing socialagony...")
            from compute_social_agony import compute_social_agony
            players = compute_social_agony(graph_file,agony_path = "agony/agony ")
            print("write socialagony to file: %s" % agony_file)
        return players
    g = nx.read_edgelist(graph_file,create_using = nx.DiGraph(),nodetype = int)
    if players_score_func_name == "pagerank":
        #print("computing pagerank...")
        players = nx.pagerank(g, alpha = 0.85)
        return players
    elif players_score_func_name == "trueskill":
        output_file = graph_file[:len(graph_file)-6] + "_trueskill.txt"
        output_file_2 = graph_file[:len(graph_file)-6] + "_trueskill.pkl"
        #from true_skill import graphbased_trueskill
        #players = graphbased_trueskill(g)
        #from file_io import write_dict_to_file
        #write_dict_to_file(players,output_file)

        '''
        if os.path.isfile(output_file):
            print("load pre-computed trueskill from: %s" % output_file)
            players = read_dict_from_file(output_file,key_type = int, value_type = float)
        elif os.path.isfile(output_file_2):
            print("load pre-computed trueskill from: %s" % output_file_2)
            players = read_from_pickle(output_file_2)           
        '''
        if True:
            print("start computing trueskill...")
            from true_skill import graphbased_trueskill
            players = graphbased_trueskill(g)
            from file_io import write_dict_to_file
            print("write trueskill to file: %s" % output_file)
            write_dict_to_file(players,output_file)

        return players
Project: breaking_cycles_in_noisy_hierarchies    Author: zhenv5    | project source | file source
def breaking_cycles_by_hierarchy_performance(graph_file,gt_file,players_score_name):

    from measures import report_performance
    if players_score_name != "ensembling":
        players_score_dict  = computing_hierarchy(graph_file,players_score_name)
        e1,e2,e3,e4 = remove_cycle_edges_by_hierarchy(graph_file,players_score_dict,players_score_name)

        if players_score_name == "pagerank":
            report_performance(gt_file,e1,"PR")
            return

        if players_score_name == "socialagony":
            note = "SA_"
        elif players_score_name == "trueskill":
            note = "TS_"

        report_performance(gt_file,e1, note+"G")
        report_performance(gt_file,e2, note+"F")
        report_performance(gt_file,e3, note+"B")
        report_performance(gt_file,e4, note+"Voting")
    else:
        players_score_dict  = computing_hierarchy(graph_file,"socialagony")
        e1,e2,e3,e4 = remove_cycle_edges_by_hierarchy(graph_file,players_score_dict,"socialagony")
        report_performance(gt_file,e1,  "SA_G")
        write_pairs_to_file(e1,graph_file[:len(graph_file)-6] + "_removed_by_SA-G.edges")
        report_performance(gt_file,e2,  "SA_F")
        write_pairs_to_file(e2,graph_file[:len(graph_file)-6] + "_removed_by_SA-F.edges")
        report_performance(gt_file,e3,  "SA_B")
        write_pairs_to_file(e3,graph_file[:len(graph_file)-6] + "_removed_by_SA-B.edges")
        report_performance(gt_file,e4,  "SA_Voting")
        write_pairs_to_file(e4,graph_file[:len(graph_file)-6] + "_removed_by_SA-Voting.edges")

        players_score_dict  = computing_hierarchy(graph_file,"trueskill")
        e5,e6,e7,e8 = remove_cycle_edges_by_hierarchy(graph_file,players_score_dict,"trueskill")
        report_performance(gt_file,e5,  "TS_G")
        write_pairs_to_file(e5,graph_file[:len(graph_file)-6] + "_removed_by_TS-G.edges")
        report_performance(gt_file,e6,  "TS_F")
        write_pairs_to_file(e6,graph_file[:len(graph_file)-6] + "_removed_by_TS-F.edges")
        report_performance(gt_file,e7,  "TS_B")
        write_pairs_to_file(e7,graph_file[:len(graph_file)-6] + "_removed_by_TS-B.edges")
        report_performance(gt_file,e8,  "TS_Voting")
        write_pairs_to_file(e8,graph_file[:len(graph_file)-6] + "_removed_by_TS-Voting.edges")

        e9 = remove_cycle_edges_by_voting(graph_file,[set(e1),set(e2),set(e3),set(e5),set(e6),set(e7)])
        report_performance(gt_file,e9,"H_Voting")
        write_pairs_to_file(e9,graph_file[:len(graph_file)-6] + "_removed_by_H-Voting.edges")