Python sklearn.metrics module: silhouette_score() example source code

We extracted the following 39 code examples from open-source Python projects to illustrate how to use sklearn.metrics.silhouette_score().
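Before the project examples, here is a minimal, self-contained sketch of the basic call. The synthetic data and k=3 are illustrative assumptions, not taken from any project below.

# Minimal usage sketch: cluster synthetic blobs and score the labels.
# The data and k=3 here are illustrative assumptions.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
# Returns a float in [-1, 1]; higher means denser, better-separated clusters.
print(silhouette_score(X, labels))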

Project: rca-evaluation    Author: sieve-microservices    | project source | file source
def silhouette_score(series, clusters):
    distances = np.zeros((series.shape[0], series.shape[0]))
    for idx_a, metric_a in enumerate(series):
        for idx_b, metric_b in enumerate(series):
            distances[idx_a, idx_b] = _sbd(metric_a, metric_b)[0]
    labels = np.zeros(series.shape[0])
    for i, (cluster, indices) in enumerate(clusters):
        for index in indices:
            labels[index] = i

    # the silhouette score is only defined when there are at least 2 and at
    # most n_samples - 1 clusters with assignments
    if len(np.unique(labels)) == 1 or (len(np.unique(labels)) >= distances.shape[0]):
        return labels, -1
    else:
        return labels, _silhouette_score(distances, labels, metric='precomputed')
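The example above scores a precomputed pairwise distance matrix rather than a feature array. A minimal sketch of that calling convention, with toy data and labels that are assumptions:

# Sketch of the metric='precomputed' convention: pass an n x n symmetric
# distance matrix in place of the feature array.
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.RandomState(0).rand(10, 4)       # toy feature matrix (assumption)
D = pairwise_distances(X, metric='euclidean')  # 10 x 10 distance matrix
labels = np.array([0] * 5 + [1] * 5)           # toy cluster assignment
print(silhouette_score(D, labels, metric='precomputed'))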
Project: lol-category    Author: vonum    | project source | file source
def spectral(data):
  spectral = SpectralClustering(
      eigen_solver='arpack',
      affinity='rbf',
      assign_labels='discretize'
  ).fit(data)

  print('Spectral')
  print(collections.Counter(spectral.labels_))
  print(metrics.silhouette_score(data, spectral.labels_))

  reduced_data = reduce_with_pca(data, 2)
  plot_2d_data(reduced_data, spectral.labels_)
Project: sptgraph    Author: epfl-lts2    | project source | file source
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)

    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
Project: texta    Author: texta-tk    | project source | file source
def _find_optimal_clustering(self,clusterings):

        max_score = float('-inf')
        max_clustering = None

        for clustering in clusterings:
            labeled_vectors = [(node.vector,cluster_idx) for cluster_idx in range(len(clustering)) for node in _get_cluster_nodes(clustering[cluster_idx][1]) ]
            vectors,labels = [np.array(x) for x in zip(*labeled_vectors)]
            if np.in1d([1],labels)[0]:
                score = silhouette_score(vectors,labels,metric='cosine')
            else:
                continue # silhouette doesn't work with just one cluster
            if score > max_score:
                max_score = score
                max_clustering = clustering

        return list(zip(*max_clustering))[1] if max_clustering else list(zip(*clusterings[0]))[1]
Project: TPs    Author: DataMiningP7    | project source | file source
def evaluate_kmeans(X, model):
    """ Evaluate a K-Means model that has been trained on X using the
     Silhouette score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
        model: the KMeans model trained on X.
    Returns:
        A float corresponding to the Silhouette score of the model.
    """
    return silhouette_score(X, model.labels_)
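A hypothetical call for context; TfidfVectorizer stands in here for the TP2 transform_text() helper that the docstring references but that is not shown.

# Hypothetical usage of evaluate_kmeans; TfidfVectorizer is an assumed
# stand-in for transform_text() from the TP2.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first document", "second document", "another text entirely"]
X = TfidfVectorizer().fit_transform(docs)
model = KMeans(n_clusters=2, random_state=0).fit(X)
print(evaluate_kmeans(X, model))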


# Ex2
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def fit(self, X, y=None, **kwargs):
        """
        Fits the model and generates the silhouette visualization.

        TODO: decide to use this method or the score method to draw.
        NOTE: Probably this would be better in score, but the standard score
        is a little different and I'm not sure how it's used.
        """
        # Fit the wrapped estimator
        self.estimator.fit(X, y, **kwargs)

        # Get the properties of the dataset
        self.n_samples = X.shape[0]
        self.n_clusters = self.estimator.n_clusters

        # Compute the scores of the cluster
        labels = self.estimator.predict(X)
        self.silhouette_score_ = silhouette_score(X, labels)
        self.silhouette_samples_ = silhouette_samples(X, labels)

        # Draw the silhouette figure
        self.draw(labels)

        # Return the estimator
        return self
Project: VASC    Author: wang-research    | project source | file source
def clustering( points, k=2,name='kmeans'):
    '''
    points: N_samples * N_features
    k: number of clusters
    '''
    if name == 'kmeans':
        kmeans = KMeans( n_clusters=k,n_init=100 ).fit(points)
        ## print within_variance
        #cluster_distance = kmeans.transform( points )
        #within_variance = sum( np.min(cluster_distance,axis=1) ) / float( points.shape[0] )
        #print("AvgWithinSS:"+str(within_variance))
        if len( np.unique(kmeans.labels_) ) > 1: 
            si = silhouette_score( points,kmeans.labels_ )
            #print("Silhouette:"+str(si))
        else:
            si = 0
            print("Silhouette:"+str(si))
        return kmeans.labels_,si

    if name == 'spec':
        spec= SpectralClustering( n_clusters=k,affinity='cosine' ).fit( points )
        si = silhouette_score( points,spec.labels_ )
        print("Silhouette:"+str(si))
        return spec.labels_,si
Project: GRIPy    Author: giruenf    | project source | file source
def k_means(data, nc, req_info=None):
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)

    sdata = (data - means)/stds

    km = KMeans(init='k-means++', n_clusters=nc, n_init=10)
    km.fit(sdata)

    if req_info == 'all':
        req_info = ['silhouette', 'inertia', 'centers']
    elif req_info is None:
        req_info = []

    info = {}

    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, km.labels_)
    if 'inertia' in req_info:
        info['inertia'] = km.inertia_
    if 'centers' in req_info:
        info['centers'] = km.cluster_centers_*stds + means

    return km.labels_, info
Project: idealoom    Author: conversence    | project source | file source
def internal_silhouette(self, idea_id, base_labels=None):
        labels = self.labels_for_idea(idea_id, True, False, base_labels)
        self.remove_singletons(labels, idea_id)
        idea_post_ids = self.get_posts_of_idea(idea_id)
        if base_labels:
            idea_post_ids = set(idea_post_ids)
            idea_post_ids.update(list(base_labels.keys()))
            idea_post_ids = np.array(list(idea_post_ids))
            idea_post_ids.sort()
        idea_post_ids = np.array(idea_post_ids)
        idea_post_nums = self.post_ids.searchsorted(idea_post_ids)
        # slice one axis at a time, because a simultaneous fancy-index on
        # both axes would be interpreted as selecting the diagonal
        distances = self.distance_matrix
        sub_distance = distances[idea_post_nums][:, idea_post_nums]
        sub_labels = labels[idea_post_nums]
        if len(set(sub_labels)) < 2:
            return 0
        return metrics.silhouette_score(sub_distance, sub_labels, 'precomputed')
Project: ml-deti    Author: mariolpantunes    | project source | file source
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
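bench_k_means reads labels and sample_size as module-level globals; the snippet follows the pattern of scikit-learn's k-means digits example, where the surrounding setup looks roughly like this (a sketch under that assumption, not the project's exact code):

# Assumed module-level setup for bench_k_means, patterned on the
# scikit-learn digits clustering example.
from time import time
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

digits = load_digits()
data = scale(digits.data)   # standardized features
labels = digits.target      # ground-truth labels used by the scorers
sample_size = 300           # subsample used for silhouette_score

bench_k_means(KMeans(init='k-means++', n_clusters=10, n_init=10),
              name='k-means++', data=data)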
Project: lol-category    Author: vonum    | project source | file source
def db_scan(data, eps, min_samples, metric):
  dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
  print('DBSCAN')
  print(metrics.silhouette_score(data, dbscan.labels_))
  print(collections.Counter(dbscan.labels_))
  reduced_data = reduce_with_pca(data)
  plot_2d_data(reduced_data, dbscan.labels_)
Project: lol-category    Author: vonum    | project source | file source
def mean_shift(data):
  mean_shift = MeanShift(cluster_all=False, n_jobs=1).fit(data)
  print('Mean Shift')
  print(metrics.silhouette_score(data, mean_shift.labels_))
  print(collections.Counter(mean_shift.labels_))
Project: lol-category    Author: vonum    | project source | file source
def affinity_prop(data):
  af = AffinityPropagation(damping=0.5, convergence_iter=15, affinity='euclidean').fit(data)
  print('Affinity Propagation')
  print(metrics.silhouette_score(data, af.labels_))
  print(collections.Counter(af.labels_))

# mean_shift(np.array(data))
# affinity_prop(np.array(data))
Project: lol-category    Author: vonum    | project source | file source
def cluster2d(data, n_clusters):
  reduced_data = reduce_with_pca(data)

  kmeans = KMeans(n_clusters = n_clusters, random_state=0).fit(reduced_data)
  print('K-Means')
  print(collections.Counter(kmeans.labels_))
  print(metrics.silhouette_score(data, kmeans.labels_))

  plot_2d_data(reduced_data, kmeans.labels_)
Project: lol-category    Author: vonum    | project source | file source
def em(data):
  gmm = GaussianMixture(
    n_components=6,
    covariance_type="tied"
  ).fit(data)
  predicted_data = gmm.predict(data)

  print(collections.Counter(predicted_data))
  print(metrics.silhouette_score(data, predicted_data))

  reduced_data = reduce_with_pca(data, 2)
  plot_2d_data(reduced_data, predicted_data)
Project: NBAPlayerValue    Author: TWanish    | project source | file source
def kmeans(reduced_data, n_clusters):
    #----Do KMeans clustering and return relevant graphing/performance data
    kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=42)
    kmeans = kmeans.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, kmeans.labels_, metric='euclidean')

    data_dictionary = {
        "labels": kmeans.labels_,
        "centroids": kmeans.cluster_centers_,
        "silhouette_score": sil_score
    }

    return data_dictionary
Project: NBAPlayerValue    Author: TWanish    | project source | file source
def agglom(reduced_data, n_clusters):
    #----Do Agglomerative clustering and return relevant performance data
    clustering = cluster.AgglomerativeClustering(n_clusters = n_clusters)
    clustering = clustering.fit(reduced_data)
    sil_score = metrics.silhouette_score(reduced_data, clustering.labels_, metric='euclidean')

    return {
        "labels":clustering.labels_,
        "silhouette_score": sil_score
        }
Project: NBAPlayerValue    Author: TWanish    | project source | file source
def find_best_cluster(cluster_type,data,a,b):
    #----Prints silhouette scores for all # of clusters in range
    scores = []
    for i in range(a,b):

        if cluster_type.lower() == "kmeans":
            i_clusters = kmeans(data, i)
        elif cluster_type.lower() == "agglom":
            i_clusters = agglom(data, i)

        sil_score_i = i_clusters['silhouette_score']
        scores.append(sil_score_i)

    print(scores)
Project: ParseLawDocuments    Author: FanhuaandLuomu    | project source | file source
def clustering(docs, n_clusters):  # cluster docs into n_clusters groups
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=1).fit(docs)  # KMeans clustering
    labels = kmeans_model.labels_
    # hmodel = AgglomerativeClustering(n_clusters=n_clusters).fit(docs)   # hierarchical alternative
    # labels = hmodel.labels_
    score = metrics.silhouette_score(np.array(docs), labels, metric='euclidean')  # euclidean metric
    return labels,score
Project: Clustering    Author: Ram81    | project source | file source
def analyze_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print(" %9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f"
          % (name, time() - t0, estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean', sample_size=samples)))
Project: hyperstar    Author: nlpub    | project source | file source
def evaluate(k):
    km = kmeans[k]
    score = silhouette_score(train_offsets, km.labels_, metric='euclidean', random_state=RANDOM_SEED)
    print('Silhouette score for k=%d is %f.' % (k, score))
    return (k, score)
Project: TPs    Author: DataMiningP7    | project source | file source
def try_kmeans(X):
    """ Run the K-Means algorithm on X with different values of K, and return
     the one that gives the best score.

    Args:
        X: the TF-IDF matrix where each line represents a document and each
           column represents a word, typically obtained by running
           transform_text() from the TP2.
    """
    best_k = 1
    best_score = -1

    for k in range(2, 20+1):
        model = KMeans(n_clusters=k)
        model.fit(X)
        labels = model.predict(X)
        score = silhouette_score(model.transform(X), labels)

        print(k, "->", score)
        if score > best_score:
            best_k = k
            best_score = score

    print("The best K is", best_k)
    return best_k


# Ex3
Project: TPs    Author: DataMiningP7    | project source | file source
def ex2_kmeans(X, y):
    """ Applies the KMeans algorithm on X, y using K=10 and print the
    silhouette score of this model. X and y are returned by transform_text
    above.
    """
    model = KMeans(10).fit(X, y)
    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)

# Ex 3
Project: TPs    Author: DataMiningP7    | project source | file source
def ex4_agglomerative_clustering(X, y):
    """ This does the same thing as ex2_kmeans but with an agglomerative
    clustering and K=2.
    """
    # AgglomerativeClustering needs a non-sparse matrix
    X = X.toarray()

    k = 2
    model = AgglomerativeClustering(k).fit(X, y)

    print "Silhouette score: %f" % metrics.silhouette_score(X, model.labels_)


# Ex 5
Project: Solr-ES-Similarity    Author: harsham05    | project source | file source
def sk_kmeans(core): #, kval=3

    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)

    list_of_points = []
    docs = solrInstance.query_iterator(query="*:*", start=0)

    for doc in docs:
        list_of_points.append(Vector(doc['id'], doc))

    list_of_Dicts = (point.features for point in list_of_points)

    df = pd.DataFrame(list_of_Dicts)
    df = df.fillna(0)

    silhouettes = {}
    for k in range(2, 10):

        kmeans = KMeans(n_clusters=k,
                    init='k-means++',
                    max_iter=300,  # k-means convergence
                    n_init=10,  # find global minima
                    n_jobs=-2,  # parallelize
                    )

        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)


    return str(silhouettes)
Project: crime_prediction    Author: livenb    | project source | file source
def nmf_test(df):
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    scores = []
    for k in range(2, 11):
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        score = silhouette_score(X_sca, labels)
        scores.append(score)
    plt.plot(range(2, 11), scores, 'b*-')
    plt.show()
Project: artorithmia    Author: alichtner    | project source | file source
def silhouette(self):
        """
        Calculate the silhouette score for a certain clustering.

        Input:  None
        Output: silhouette score (float)
        """
        return silhouette_score(self.features, self.cluster_labels)
Project: cluster_paraphrases    Author: acocos    | project source | file source
def h_cluster(wordlist, sims, distmat, thresh=0.01):

    B_, Bs, Ms, Ts, As = hgfc(sims, thresh=thresh)

    sil_coefs = []
    for i,a in enumerate(As):
        l = labels(a)
        if len(set(l)) > 2 and len(set(l)) < len(wordlist)-1:
            sil_coefs.append(silhouette_score(distmat, labels(a), metric='precomputed'))
        else:
            sil_coefs.append(0.0)
    ld = [labeldict(a,wordlist) for a in As]
    return ld, sil_coefs
Project: email-sherlock    Author: jgondin    | project source | file source
def scores(dmat, cluster_labels):
    try:
        silhouette_avg = silhouette_score(dmat, cluster_labels, metric='precomputed', sample_size=100)
        return silhouette_avg
    except ValueError:  # e.g. fewer than two clusters in cluster_labels
        return None
Project: yelp-contest    Author: AndyFou    | project source | file source
def silhcoeff(data,labels):
    arrdata = array(data)
    print("Silhouette coefficient: ", metrics.silhouette_score(arrdata,labels,metric='euclidean'))

###################################  PHOTOS  ###########################################

# LOAD PHOTOS FROM FOLDER & SAVE IN A LIST [FILENAME,PHOTO,GRAYSCALE_PHOTO]
Project: Parallel-SGD    Author: angadgill    | project source | file source
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size)))
Project: idealoom    Author: conversence    | project source | file source
def get_all_results(self):
        discussion = self.discussion
        idea_ids = discussion.db.query(Idea.id).filter_by(
            discussion_id=discussion.id).all()
        results = {id: self.get_cluster_info(id)
                   for (id,) in idea_ids}
        results[None] = self.get_cluster_info()
        posres = {id: r for (id, r) in results.items() if r is not None}
        # for id, (silhouette_score, compare_with_ideas, clusters, post_info) in posres.iteritems():
        #     log.debug(" ".join((id, silhouette_score, repr([len(x['cluster']) for x in clusters]))))
        return posres
Project: idealoom    Author: conversence    | project source | file source
def silhouette_score(self):
        if self._silhouette_score is None:
            self._silhouette_score = metrics.silhouette_score(
                self.model_matrix,
                self.optics.as_labels(self.optics_clusters),
                metric=self.metric)
        return self._silhouette_score
Project: py4design    Author: chenkianwee    | project source | file source
def elbow_test(X, max_cluster):
    """
    This function performs the elbow test to determine the number of clusters for k-means clustering.

    Parameters
    ----------           
    X : numpy array
        2d array of floats.

    max_cluster : int
        The maximum number of clusters desired.

    Returns
    -------
    number of clusters : int
        The number of clusters for kmeans clustering
    """
    from sklearn.cluster import KMeans
    from sklearn import metrics
    inertia_list = []
    s_list = []
    for cluster_cnt in range(max_cluster-1):
        k_means = KMeans(n_clusters=cluster_cnt+2)
        k_means.fit(X)
        k_means_labels = k_means.labels_
        s_factor = metrics.silhouette_score(X, k_means_labels, metric='euclidean')
        s_list.append(s_factor)
        kmeans_inertia = k_means.inertia_
        inertia_list.append(kmeans_inertia)

    inertia_cnt = 0
    i_diff_list = []
    for inertia in inertia_list:
        #look for the difference between each difference in cluster number
        if inertia_cnt != len(inertia_list) - 1:
            i_diff = inertia - inertia_list[inertia_cnt + 1]
            i_diff_list.append(i_diff)
        inertia_cnt = inertia_cnt + 1

    #find the biggest drop in inertia and use that for the best number of clusters
    max_diff = max(i_diff_list)
    max_diff_index = i_diff_list.index(max_diff)
    #+3 because cluster counts start at 2 and the largest drop points to the next k
    best_no_cluster = max_diff_index + 3
    return best_no_cluster
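A short usage sketch for elbow_test; the synthetic blobs are an illustrative assumption (any 2-D float array works):

# Usage sketch for elbow_test; the blob data is an assumption.
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
print(elbow_test(X, max_cluster=8))   # tests k = 2..8, reports the elbow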
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def distortion_score(X, labels, metric='euclidean'):
    """
    Compute the mean distortion of all samples.

    The distortion is computed as the sum of the squared distances between
    each observation and its closest centroid. Logically, this is the metric
    that K-Means attempts to minimize as it is fitting the model.

    .. seealso:: http://kldavenport.com/the-cost-function-of-k-means/

    Parameters
    ----------
    X : array, shape = [n_samples, n_features] or [n_samples_a, n_samples_a]
        Array of pairwise distances between samples if metric == "precomputed"
        or a feature array for computing distances against the labels.

    labels : array, shape = [n_samples]
        Predicted labels for each sample

    metric : string
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by `sklearn.metrics.pairwise.pairwise_distances
        <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html#sklearn.metrics.pairwise.pairwise_distances>`_

    .. todo:: add sample_size and random_state kwds similar to silhouette_score
    """
    # Encode labels to get unique centers and groups
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    unique_labels = le.classes_

    # Sum of the distortions
    distortion = 0

    # Loop through each label (center) to compute the centroid
    for current_label in unique_labels:
        # Mask the instances that belong to the current label
        mask = labels == current_label
        instances = X[mask]

        # Compute the center of these instances
        center = instances.mean(axis=0)

        # Compute the square distances from the instances to the center
        distances = pairwise_distances(instances, [center], metric=metric)
        distances = distances ** 2

        # Add the mean square distance to the distortion
        distortion += distances.mean()

    return distortion


##########################################################################
## Elbow Method
##########################################################################
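A short sketch of driving an elbow search with distortion_score as defined above; the data and range of k are assumptions:

# Sketch: evaluate distortion_score over a range of k for an elbow plot.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=5, random_state=0)
for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(X)
    print(k, distortion_score(X, labels))   # distortion drops sharply until k=5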
Project: webdataconnector_ml    Author: DoubleEE    | project source | file source
def runClustering(cluster_df):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Convert character columns to dummy variables
    X = cluster_df[Xcols]
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(cols) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            column_name = X.columns.values.tolist()
            column_name.remove(col)
            X = X[column_name].join(dummy)
        else:
            if col in X.columns:    # If more than 20 distinct values then delete
                del X[col]

    # Standardize (Z-score normalize) all continuous variables
    from scipy.stats import zscore
    for col in X:
        if len(X[col].unique()) > 2:    # Standardize non-dummy variables
            col_zscore = 'z_' + col
            X[col_zscore] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalize data
    # Obviously missing values can be handled in many different ways
    X.fillna(0, inplace=True)

    # convert to a numpy array to use in the KMeans clustering class
    data_for_clustering_matrix = X.to_numpy()

    number_of_Clusters = []
    silhouette_value = []
    # Loop through 2 and 20 clusters and identify which has the highest silhouette score
    k = range(2, 21)
    for i in k:
        clustering_method = KMeans(n_clusters=i)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        silhouette_value.append(silhouette_average)
        number_of_Clusters.append(int(i))

    # maxind = np.argmax(silhouette_value)
    max_value = max(silhouette_value)
    indexMaxValue = silhouette_value.index(max_value)

    # FIT KMEANS CLUSTER MODEL WITH NUMBER OF CLUSTERS WITH HIGHEST SILHOUETTE SCORE
    clustering_method = KMeans(n_clusters=number_of_Clusters[indexMaxValue])
    clustering_method.fit(data_for_clustering_matrix)
    labels = clustering_method.predict(data_for_clustering_matrix)

    # SCORE THE DATAFRAME
    cluster_df['cluster'] = labels
    return cluster_df
Project: ProjectOfDataMining    Author: IljaNovo    | project source | file source
def compute_affinity_propagation(preference_, X):
    # DATA FILLING
    #text = io.Input.local_read_text_file(inputFilePath)
    #input_array = text.split('\n')
    centers = [[1, 1], [-1, -1], [1, -1]]
    n_samples = 300
    #Make Blobs used for generating of labels_true array
    if X is None:
        X, labels_true = make_blobs(n_samples = n_samples, centers=centers, cluster_std=1, random_state=0)
        print("Data is none!!!")
        print("Generating " + str(n_samples) + " samples")
    else :
        data, labels_true = make_blobs(n_samples=len(X), centers=centers, cluster_std=1, random_state=0)
    #slist = list()
    #for line in X:
    #    slist.append(line)
    #io.Output.write_array_to_txt_file("clustering\\Affinity_Propagation\\input_data1.txt", slist)
    #float_array = []
    #for line in input_array:
    #    float_line = [float(i) for i in line.split(' ')]
    #    float_array.append(float_line)
    #X = array(float_array)

    af = AffinityPropagation(preference=preference_).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
#    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    print("Fowlkes Mallows Score: %0.3f" % metrics.fowlkes_mallows_score(labels_true, labels))

    plt.close('all')
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
Project: GRIPy    Author: giruenf    | project source | file source
def expectation_maximization(data, nc, cv_type='full', req_info=None):
    gmm = GMM(n_components=nc, covariance_type=cv_type, thresh=1.0E-4, n_init=10)
    gmm.fit(data)

    labels = gmm.predict(data)

    if req_info == 'all':
        req_info = ['aic', 'bic', 'converged', 'weights', 'means', 'covars',
                    'silhouette', 'proba']
    elif req_info is None:
        req_info = []

    info = {}
    if 'aic' in req_info:
        info['aic'] = gmm.aic(data)
    if 'bic' in req_info:
        info['bic'] = gmm.bic(data)
    if 'converged' in req_info:
        info['converged'] = gmm.converged_
    if 'weights' in req_info:
        info['weights'] = gmm.weights_
    if 'means' in req_info:
        info['means'] = gmm.means_
    if 'covars' in req_info:
        if cv_type == 'full':
            info['covars'] = gmm.covars_
        elif cv_type == 'tied':
            cov = np.empty((nc, gmm.covars_.shape[0], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = gmm.covars_.copy()
            info['covars'] = cov
        else:
            cov = np.empty((nc, gmm.covars_.shape[0], gmm.covars_.shape[1]))
            for i in range(nc):
                cov[i] = np.diag(gmm.covars_[i])
            info['covars'] = cov
    if 'silhouette' in req_info:
        info['silhouette'] = metrics.silhouette_score(data, labels)
    if 'proba' in req_info:
        info['proba'] = gmm.predict_proba(data).T

    return labels, info
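This function targets the long-removed sklearn.mixture.GMM API (thresh, covars_). A rough modern equivalent with GaussianMixture might look like the sketch below; the parameter mapping (thresh to tol, covars_ to covariances_) and the toy data are assumptions, not the project's code.

# Sketch of a GaussianMixture equivalent to the deprecated GMM call above.
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
gmm = GaussianMixture(n_components=3, covariance_type='full',
                      tol=1e-4, n_init=10).fit(data)
labels = gmm.predict(data)
print(gmm.aic(data), gmm.bic(data), gmm.converged_)
print(metrics.silhouette_score(data, labels))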
Project: SUPPA    Author: comprna    | project source | file source
def calculate_cluster_scores(x, cluster_labels, output):

    with open("%s_scores.log" % output, "w+") as fh:
        # Filter out singleton "cluster" (labeled as -1)
        filtered_x, filtered_cluster_labels, singletons = ([] for _ in range(3))
        cluster_groups = defaultdict(list)
        for vec, lab in zip(x, cluster_labels):
            if not lab == -1:
                filtered_x.append(vec)
                filtered_cluster_labels.append(lab)

                cluster_groups[lab].append(vec)
            else:
                singletons.append(vec)

        ln = "Number of clustered events: %d/%d (%f%%)\n" % (len(filtered_x), len(filtered_x)+len(singletons),
                                                           (len(filtered_x)/(len(filtered_x)+len(singletons)))*100)
        print(ln.strip("\n"))
        fh.write(ln)

        for group in cluster_groups:
                n_events = len(cluster_groups[group])
                ln = "Cluster %d contains %d events\n" % (group, n_events)
                print(ln.strip("\n"))
                fh.write(ln)

        rmsstd_scores = []
        for group in cluster_groups:
            rmsstd = calculate_rmsstd(np.array(cluster_groups[group]))
            ln = "The RMSSTD score for cluster %d is %f\n" % (group, rmsstd)
            print(ln.strip("\n"))
            fh.write(ln)

            rmsstd_scores.append(rmsstd)

        try:
            silhouette_avg = silhouette_score(np.array(filtered_x), np.array(filtered_cluster_labels))
            ln = "The average silhouette score is : %f\n" % silhouette_avg
            print(ln.strip("\n"))
            fh.write(ln)
        except ValueError:
            silhouette_avg = float("nan")
            ln = "Impossible to calculate silhouette score. Only 1 cluster group identified.\n"
            print(ln.strip("\n"))
            fh.write(ln)

    return silhouette_avg, rmsstd_scores