Python scipy module: cluster() example source code

We extracted the following 32 code examples from open-source Python projects to illustrate how to use scipy.cluster().
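Before the project examples, here is a minimal standalone sketch of the two submodules that appear throughout: scipy.cluster.vq for k-means and scipy.cluster.hierarchy for hierarchical clustering. The data is synthetic and only for illustration.

import numpy as np
import scipy.cluster.vq
import scipy.cluster.hierarchy

# Toy data: two well-separated 2-D blobs.
rng = np.random.default_rng(0)
points = np.vstack([rng.normal(0, 0.5, (50, 2)),
                    rng.normal(5, 0.5, (50, 2))])

# K-means via scipy.cluster.vq: whiten features, find centroids, assign codes.
whitened = scipy.cluster.vq.whiten(points)
centroids, distortion = scipy.cluster.vq.kmeans(whitened, 2)
labels, dists = scipy.cluster.vq.vq(whitened, centroids)

# Hierarchical clustering via scipy.cluster.hierarchy.
link = scipy.cluster.hierarchy.linkage(points, method='ward')
flat = scipy.cluster.hierarchy.fcluster(link, t=2, criterion='maxclust')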

Project: massivedatans | Author: JohannesBuchner
def debugplot(self, u = None):
        print('creating plot...')
        n = len(self.region['members'][0]) // 2
        plt.figure(figsize=(6, n/2*4+1))
        m = self.region['members']
        d = self.region['maxdistance']
        for i in range(n):
            plt.subplot(int(numpy.ceil(n / 2.)), 2, 1 + i)
            j = i * 2
            k = i * 2 + 1
            plt.plot(m[:,j], m[:,k], 'x', color='b', ms=1)
            plt.gca().add_artist(plt.Circle((m[0,j], m[0,k]), d, color='g', alpha=0.3))
            if u is not None:
                plt.plot(u[j], u[k], 's', color='r')
                plt.gca().add_artist(plt.Circle((u[j], u[k]), d, color='r', alpha=0.3))
        prefix = 'friends%s-%s_' % ('1' if self.jackknife else '', self.metric)
        plt.savefig(prefix + 'cluster.pdf')
        plt.close()
        print('creating plot... done')
Project: pyxpose | Author: PetitPrince
def find_a_dominant_color(image):
    # K-mean clustering to find the k most dominant color, from:
    # http://stackoverflow.com/questions/3241929/python-find-dominant-most-common-color-in-an-image
    n_clusters = 5

    # Get image into a workable form
    im = image.copy()
    im = im.resize((150, 150))      # optional, to reduce time
    ar = np.asarray(im)    # scipy.misc.fromimage was removed from SciPy; use numpy
    im_shape = ar.shape
    ar = ar.reshape(np.prod(im_shape[:2]), im_shape[2])
    ar = ar.astype(float)

    # Compute clusters
    codes, dist = scipy.cluster.vq.kmeans(ar, n_clusters)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = np.histogram(vecs, len(codes))    # count occurrences

    # Get the indexes sorted from most frequent to least frequent
    sorted_idxs = np.argsort(counts)[::-1]

    # Get the color
    peak = codes[sorted_idxs[1]] # get second most frequent color

    return [int(i) for i in peak.tolist()] # list comprehension to quickly cast everything to int
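A hedged usage sketch for the function above; the file name 'photo.jpg' is hypothetical, and Pillow is assumed to be installed:

from PIL import Image

image = Image.open('photo.jpg').convert('RGB')   # hypothetical input file
r, g, b = find_a_dominant_color(image)
print('second most frequent color: #%02x%02x%02x' % (r, g, b))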
Project: BioNLP-2016 | Author: cambridgeltl
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
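The whiten call above rescales each feature (column) to unit variance; a quick standalone check of that behavior:

import numpy as np
import scipy.cluster.vq

features = np.array([[1.0, 100.0],
                     [2.0, 300.0],
                     [3.0, 500.0]])
whitened = scipy.cluster.vq.whiten(features)
print(whitened.std(axis=0))  # each column now has standard deviation 1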
Project: MIRA | Author: comprna
def write_fasta(file_list):
    '''
    Writes fasta files for cluster sequences from bedfile
    '''

    for infile in file_list:

        ifi = open(infile, "r")
        out_fa_file = infile + ".fa"

        # Truncate any existing output file, then write records to it
        of = open(out_fa_file, 'w')

        for line in ifi:
            f_header, f_seq = fasta_header(line)
            # One header/sequence pair per input line
            of.write("{}\n{}\n".format(f_header, f_seq))

        print("done {}".format(out_fa_file))
        of.close()
        ifi.close()
Project: scanpy | Author: theislab
def compute_group_overlap_score(ref_labels, pred_labels,
                                threshold_overlap_pred=0.5,
                                threshold_overlap_ref=0.5):
    """How well do the pred_labels explain the ref_labels?

    A predicted cluster explains a reference cluster if it is contained within the reference
    cluster with at least 50% (threshold_overlap_pred) of its points and these correspond
    to at least 50% (threshold_overlap_ref) of the reference cluster.
    """
    ref_unique, ref_counts = np.unique(ref_labels, return_counts=True)
    ref_dict = dict(zip(ref_unique, ref_counts))
    pred_unique, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_dict = dict(zip(pred_unique, pred_counts))
    summary = []
    for true in ref_unique:
        sub_pred_unique, sub_pred_counts = np.unique(pred_labels[true == ref_labels], return_counts=True)
        relative_overlaps_pred = [sub_pred_counts[i] / pred_dict[n] for i, n in enumerate(sub_pred_unique)]
        relative_overlaps_ref = [sub_pred_counts[i] / ref_dict[true] for i, n in enumerate(sub_pred_unique)]
        pred_best_index = np.argmax(relative_overlaps_pred)
        summary.append(1 if (relative_overlaps_pred[pred_best_index] >= threshold_overlap_pred and
                             relative_overlaps_ref[pred_best_index] >= threshold_overlap_ref)
                       else 0)
        # print(true, sub_pred_unique[pred_best_index], relative_overlaps_pred[pred_best_index],
        #       relative_overlaps_ref[pred_best_index], summary[-1])
    return sum(summary)/len(summary)
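A toy call, assuming the function above is in scope (labels invented for illustration): reference cluster 0 is matched exactly, while reference cluster 1 is fragmented into three small predicted clusters, none covering at least 50% of it, so the score is 0.5:

import numpy as np

ref_labels  = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])
pred_labels = np.array([0, 0, 0, 0, 1, 1, 2, 2, 3])
print(compute_group_overlap_score(ref_labels, pred_labels))  # -> 0.5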
Project: scanpy | Author: theislab
def hierarch_cluster(M):
    """Cluster matrix using hierarchical clustering.

    Parameters
    ----------
    M : np.ndarray
        Matrix, for example, distance matrix.

    Returns
    -------
    Mclus : np.ndarray
        Clustered matrix.
    indices : np.ndarray
        Indices used to cluster the matrix.
    """
    import scipy as sp
    import scipy.cluster
    link = sp.cluster.hierarchy.linkage(M)
    indices = sp.cluster.hierarchy.leaves_list(link)
    Mclus = np.array(M[:, indices])
    Mclus = Mclus[indices, :]
    if False:
        pl.matshow(Mclus)
        pl.colorbar()
    return Mclus, indices
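A quick call on a small symmetric matrix (values invented) to show the reordering:

import numpy as np

M = np.array([[0.0, 4.0, 1.0],
              [4.0, 0.0, 5.0],
              [1.0, 5.0, 0.0]])
Mclus, indices = hierarch_cluster(M)
print(indices)  # leaf order used to permute rows and columns of M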
Project: PyMaid | Author: schlegelp
def get_leafs(self, use_labels=False):
        """ Use to retrieve labels.

        Parameters
        ----------
        use_labels :    bool, optional
                        If True, self.labels will be returned. If False, will
                        use either columns (if matrix is pandas DataFrame)
                        or indices (if matrix is np.ndarray)
        """

        if isinstance(self.mat, pd.DataFrame):
            if use_labels:
                return [self.labels[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
            else:
                return [self.mat.columns.tolist()[i] for i in scipy.cluster.hierarchy.leaves_list(self.linkage)]
        else:
            return scipy.cluster.hierarchy.leaves_list(self.linkage)
Project: PyMaid | Author: schlegelp
def calc_agg_coeff(self):
        """ Returns the agglomerative coefficient, measuring the clustering
        structure of the linkage matrix. Because it grows with the number of 
        observations, this measure should not be used to compare datasets of 
        very different sizes.

        For each observation i, denote by m(i) its dissimilarity to the first 
        cluster it is merged with, divided by the dissimilarity of the merger 
        in the final step of the algorithm. The agglomerative coefficient is 
        the average of all 1 - m(i).
        """

        # Turn into pandas DataFrame for fancy indexing
        Z = pd.DataFrame(self.linkage, columns = ['obs1','obs2','dist','n_org'] )

        # Get all distances at which an original observation is merged 
        all_dist = Z[ ( Z.obs1.isin(self.leafs) ) | (Z.obs2.isin(self.leafs) ) ].dist.values

        # Divide all distances by last merger
        all_dist /= self.linkage[-1][2]

        # Calc final coefficient
        coeff = np.mean( 1 - all_dist )

        return coeff
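The same quantity can be computed from any SciPy linkage matrix without the class; a minimal standalone sketch mirroring the logic above (observations assumed to be numbered 0..n_obs-1):

import numpy as np
import scipy.cluster.hierarchy

def agg_coeff(Z, n_obs):
    # Rows of Z where at least one side is an original observation,
    # i.e. the first merger of that observation.
    Z = np.asarray(Z)
    leaves = np.arange(n_obs)
    first_merge = np.isin(Z[:, 0], leaves) | np.isin(Z[:, 1], leaves)
    # Average of 1 - m(i), with m(i) normalized by the final merge distance.
    return np.mean(1 - Z[first_merge, 2] / Z[-1, 2])

X = np.random.default_rng(1).normal(size=(10, 2))
Z = scipy.cluster.hierarchy.linkage(X, method='average')
print(agg_coeff(Z, n_obs=10))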
Project: ccCluster | Author: gsantoni
def main():
    from optparse import OptionParser
    parser = OptionParser(usage="%prog --XSCALEfile=<LP filename> --outname=<output dendrogram>")

    parser.add_option("-o","--outname", dest="outname", default='Dendrogram', help="output dendrogram file name")
    parser.add_option("-t", "--threshold", dest="threshold", default='0.4', help="Distance threshold for clustering")
    parser.add_option("-c", "--count",action="store_true", dest="count", default=False, help="Counts datasets in the biggest cluster and exit")
    (options, args) = parser.parse_args()

    thr = float(options.threshold)
    CC = Clustering('Cluster_log.txt')
    link = CC.tree()
    if options.count:
        CC.checkMultiplicity(thr)
        print(CC.thrEstimation())
    else:
        CC.checkMultiplicity(thr) 
        CC.merge('ano', thr)
Project: treecat | Author: posterior
def plot_feature_overlap(df, cmap='binary', method='cluster'):
    """Plot feature-feature presence overlap of a pandas dataframe.

    Args:
        df: A pandas dataframe.
        cmap: A matplotlib colormap.
        method: Method of clustering, one of 'cluster' or 'tree'.
    """
    V = len(df.columns)
    present = (df == df).to_numpy().astype(np.float32)  # df == df is False exactly at NaNs
    overlap = np.dot(present.T, present)
    assert overlap.shape == (V, V)

    # Sort features to make blocks contiguous.
    if method == 'tree':
        # TODO(fritzo) Fix this to not look awful.
        grid = make_complete_graph(V)
        weights = np.empty(grid.shape[1], dtype=np.float32)
        for k, v1, v2 in grid.T:
            weights[k] = overlap[v1, v2]
        edges = estimate_tree(grid, weights)
        order, order_inv = order_vertices(edges)
    elif method == 'cluster':
        distance = scipy.spatial.distance.pdist(overlap)
        clustering = scipy.cluster.hierarchy.complete(distance)
        order_inv = scipy.cluster.hierarchy.leaves_list(clustering)
    else:
        raise ValueError(method)
    overlap = overlap[order_inv, :]
    overlap = overlap[:, order_inv]
    assert overlap.shape == (V, V)

    pyplot.imshow(overlap**0.5, cmap=cmap)
    pyplot.axis('off')
Project: massivedatans | Author: JohannesBuchner
def cluster(self, u, ndim, keepRadius=False):
        """

        """
        if self.verbose: print('building region ...')
        if len(u) > 10:
            if keepRadius and self.region is not None and 'maxdistance' in self.region:
                maxdistance = self.region['maxdistance']
            else:
                if self.radial:
                    if self.jackknife:
                        #maxdistance = initial_rdistance_guess(u, k=1, metric=self.metric)
                        maxdistance = nearest_rdistance_guess(u, metric=self.metric)
                    else:
                        maxdistance = find_rdistance(u, nbootstraps=20, metric=self.metric, verbose=self.verbose)
                else:
                    maxdistance = find_maxdistance(u)
            if self.force_shrink and self.region is not None and 'maxdistance' in self.region:
                maxdistance = min(maxdistance, self.region['maxdistance'])
            if self.keep_phantom_points and len(self.phantom_points) > 0:
                # add phantoms to u now
                print('including phantom points in cluster members', self.phantom_points)
                u = numpy.vstack((u, self.phantom_points))
            ulow  = numpy.max([u.min(axis=0) - maxdistance, numpy.zeros(ndim)], axis=0)
            uhigh = numpy.min([u.max(axis=0) + maxdistance, numpy.ones(ndim)], axis=0)
        else:
            maxdistance = None
            ulow = numpy.zeros(ndim)
            uhigh = numpy.ones(ndim)
        if self.verbose: print('setting sampling region:', (ulow, uhigh), maxdistance)
        self.region = dict(members=u, maxdistance=maxdistance, ulow=ulow, uhigh=uhigh)
        self.generator = None
Project: massivedatans | Author: JohannesBuchner
def rebuild(self, u, ndim, keepRadius=False):
        if self.last_cluster_points is None or \
            len(self.last_cluster_points) != len(u) or \
            numpy.any(self.last_cluster_points != u):
            self.cluster(u=self.transform_new_points(u), ndim=ndim, keepRadius=keepRadius)
            self.last_cluster_points = u

            # reset generator
            self.generator = self.generate(ndim=ndim)
Project: massivedatans | Author: JohannesBuchner
def rebuild(self, u, ndim, keepMetric=False):
        if self.last_cluster_points is not None and \
            len(self.last_cluster_points) == len(u) and \
            numpy.all(self.last_cluster_points == u):
            # do nothing if everything stayed the same
            return

        self.cluster(u=u, ndim=ndim, keepMetric=keepMetric)
        self.last_cluster_points = u

        print('maxdistance:', self.region.maxdistance)
        self.generator = self.generate(ndim)
Project: genepred | Author: egorbarsukoff
def start_clustering(self):
        functions.log('Calculate {0} distances...'.format(int(len(self.orfs) * (len(self.orfs) + 1) / 2)))
        self.distances = self.create_distance_matrix()
        functions.log('Start clustering...')
        self.linkage_matrix = scipy.cluster.hierarchy.linkage(ssd.squareform(self.distances), method='complete')
        functions.log('Clustering done.')
Project: genepred | Author: egorbarsukoff
def clustering(self, clustering_distance):
        """
        Create clusters
        :param clustering_distance: max distance between ORFs in one cluster
        :return:
        """
        functions.log('Create clusters...')
        for ind, o in enumerate(scipy.cluster.hierarchy.fcluster(self.linkage_matrix, clustering_distance, 'distance')):
            if self.clusters.get(o) is None:
                self.clusters[o] = set()
            self.clusters[o].add(ind)
            self.orfs_clusters[ind] = o
Project: genepred | Author: egorbarsukoff
def get_args():
    parser = argparse.ArgumentParser(description='Clustering ORFs by position in the graph')
    parser.add_argument('sequences', type=str, help='Path to ORFs sequences file')
    parser.add_argument('paths', type=str, help='Path to ORFs paths file from ORFFinderInGraph.py')
    parser.add_argument('graph', type=str, help='Path to graph in GFA format')
    parser.add_argument('threshold', type=float, help='Max level of dissimilarity of ORFs in one cluster (between 0 and 1)')
    parser.add_argument('-o', '--output', type=str, default='', help='Output folder')
    parser.add_argument('-sd', '--savedistances', default=False, action='store_true',
                        help='Save pairwise distances matrix (in .npy format)')
    return parser.parse_args()
Project: Maybe-Useful-Cogs | Author: AznStevy
def _auto_color(self, url:str, ranks):
        phrases = ["Calculating colors..."] # in case I want more
        #try:
        await self.bot.say("**{}**".format(random.choice(phrases)))
        clusters = 10

        async with self.session.get(url) as r:
            image = await r.content.read()
        with open('data/leveler/temp_auto.png','wb') as f:
            f.write(image)

        im = Image.open('data/leveler/temp_auto.png').convert('RGBA')
        im = im.resize((290, 290)) # resized to reduce time
        ar = np.asarray(im)   # scipy.misc.fromimage was removed from SciPy; assumes numpy imported as np
        shape = ar.shape
        ar = ar.reshape(np.prod(shape[:2]), shape[2])

        codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), clusters)
        vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
        counts, bins = np.histogram(vecs, len(codes))    # count occurrences

        # sort counts
        freq_index = list(enumerate(counts))
        sorted_list = sorted(freq_index, key=operator.itemgetter(1), reverse=True)

        colors = []
        for rank in ranks:
            color_index = min(rank, len(codes) - 1)  # clamp to a valid index
            peak = codes[sorted_list[color_index][0]] # gets the original index
            peak = peak.astype(int)

            colors.append(''.join(format(c, '02x') for c in peak))
        return colors # returns array
        #except:
            #await self.bot.say("```Error or no scipy. Install scipy doing 'pip3 install numpy' and 'pip3 install scipy' or read here: https://github.com/AznStevy/Maybe-Useful-Cogs/blob/master/README.md```")

    # converts hex to rgb
Project: BioNLP-2016 | Author: cambridgeltl
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in zip(words, cluster_ids):
        print('%s\t%d' % (word, cid), file=out)
Project: BioNLP-2016 | Author: cambridgeltl
def main(argv=None):
    if argv is None:
        argv = sys.argv

    try:
        words, vectors, options = process_options(argv[1:])
    except Exception as e:
        if str(e):
            print('Error: %s' % str(e), file=sys.stderr)
            return 1
        else:
            raise

    dbscan = sklearn.cluster.DBSCAN(eps=options.eps, metric=options.metric)
    dbscan.fit(numpy.array(vectors))
    noisy = sum(1 for l in dbscan.labels_ if l == -1)
    unique = len(set(dbscan.labels_))
    logging.info('%d clusters, %d noisy, %d vectors' % (unique, noisy,
                                                        len(vectors)))
    if noisy >= len(vectors) / 4:
        logging.warning('%d/%d noisy (-1) labels (try higher eps?)' % \
                            (noisy, len(vectors)))
    elif unique < (len(vectors)/2)**0.5:
        logging.warning('only %d clusters (try lower eps?)' % unique)
    write_cluster_ids(words, dbscan.labels_)

    return 0
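The eps warnings above encode a simple heuristic; a direct standalone call showing how DBSCAN marks noise with the label -1 (random vectors stand in for word embeddings):

import numpy as np
from sklearn.cluster import DBSCAN

vectors = np.random.default_rng(0).normal(size=(200, 10))
labels = DBSCAN(eps=0.5, metric='euclidean').fit(vectors).labels_
n_noisy = int(np.sum(labels == -1))
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(n_noisy, n_clusters)  # with eps this small, most points end up as noise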
Project: BioNLP-2016 | Author: cambridgeltl
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 is only supported for scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Project: BioNLP-2016 | Author: cambridgeltl
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
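A hedged usage sketch of the sklearn call above (wvlib and with_sklearn are project-specific; random vectors stand in for word embeddings):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

vectors = np.random.default_rng(0).normal(size=(5000, 50))
kmeans = MiniBatchKMeans(n_clusters=20, batch_size=1000, n_init=10)
labels = kmeans.fit(vectors).labels_
print(np.bincount(labels))  # cluster sizes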
Project: PyMaid | Author: schlegelp
def __getattr__(self, key):
        if key == 'linkage':
            self.cluster()
            return self.linkage
        elif key in ['leafs', 'leaves']:
            return self.get_leafs()
        elif key == 'cophenet':
            return self.calc_cophenet()
        elif key == 'agg_coeff':
            return self.calc_agg_coeff()
        else:
            raise AttributeError(key)
Project: PyMaid | Author: schlegelp
def calc_cophenet(self):
        """ Returns Cophenetic Correlation coefficient of your clustering.
        This (very very briefly) compares (correlates) the actual pairwise 
        distances of all your samples to those implied by the hierarchical 
        clustering. The closer the value is to 1, the better the clustering 
        preserves the original distances.
        """        

        return scipy.cluster.hierarchy.cophenet(self.linkage, self.condensed_dist_mat)
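For reference, the underlying SciPy call on synthetic data; scipy.cluster.hierarchy.cophenet returns both the coefficient and the cophenetic distances when given the condensed distance matrix:

import numpy as np
import scipy.cluster.hierarchy
import scipy.spatial.distance

X = np.random.default_rng(0).normal(size=(20, 3))
condensed = scipy.spatial.distance.pdist(X)
Z = scipy.cluster.hierarchy.linkage(condensed, method='average')
c, coph_dists = scipy.cluster.hierarchy.cophenet(Z, condensed)
print(round(c, 3))  # closer to 1 = dendrogram preserves original distances better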
Project: PyMaid | Author: schlegelp
def cluster(self, method='ward'):
        """ Cluster distance matrix. This will automatically be called when
        attribute linkage is requested for the first time.

        Parameters
        ----------
        method :    str, optional
                    Clustering method (see scipy.cluster.hierarchy.linkage 
                    for reference)
        """

        # First, convert similarity matrix to distance matrix
        if self.mat_type != 'distance':
            if isinstance(self.mat, pd.DataFrame):
                self.dist_mat = (self.mat.to_numpy() - self.mat.max().max()) * -1
            else:
                self.dist_mat = (self.mat - self.mat.max()) * -1
        else:
            if isinstance(self.mat, pd.DataFrame):
                self.dist_mat = self.mat.to_numpy()
            else:
                self.dist_mat = self.mat

        # Second, convert into condensed distance matrix - otherwise clustering
        # thinks we are passing observations instead of final scores
        self.condensed_dist_mat = scipy.spatial.distance.squareform(self.dist_mat, checks=False)

        self.linkage = scipy.cluster.hierarchy.linkage(self.condensed_dist_mat, method=method)

        # Save method in case we want to look it up later
        self.method = method

        module_logger.info('Clustering done using method "{0}"'.format(method))
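The squareform step above matters because linkage interprets a 2-D array as raw observations rather than precomputed distances; a small demonstration with an invented distance matrix:

import numpy as np
import scipy.cluster.hierarchy
import scipy.spatial.distance

dist_mat = np.array([[0.0, 2.0, 9.0],
                     [2.0, 0.0, 8.0],
                     [9.0, 8.0, 0.0]])

# Condense the symmetric matrix, then cluster; passing dist_mat directly
# would silently treat each row as a 3-D point.
condensed = scipy.spatial.distance.squareform(dist_mat, checks=False)
Z = scipy.cluster.hierarchy.linkage(condensed, method='average')
print(Z[:, 2])  # merge heights derived from the given distances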
Project: PyMaid | Author: schlegelp
def plot3d(self, k=5, criterion='maxclust', **kwargs):
        """Plot neuron using :func:`pymaid.plot.plot3d`. Will only work if
        instance has neurons attached to it.

        Parameters
        ----------
        k :         {int, float}
        criterion : {'maxclust','distance'}, optional
                    If `maxclust`, `k` clusters will be formed. If `distance`, 
                    clusters will be created at threshold `k`.
        **kwargs
                will be passed to plot.plot3d() 
                see help(plot.plot3d) for a list of keywords      

        See Also
        --------
        :func:`pymaid.plot.plot3d` 
                    Function called to generate 3d plot                  
        """

        if 'neurons' not in self.__dict__:
            module_logger.error(
                'This works only with cluster results from neurons')
            return None       

        cmap = self.get_colormap(k=k, criterion=criterion)

        kwargs.update({'color': cmap})

        return plotting.plot3d(self.neurons, **kwargs)
Project: PyMaid | Author: schlegelp
def to_json(self, fname='cluster.json', k=5, criterion='maxclust'):
        """ Convert clustered neurons into json file that can be loaded into 
        CATMAID selection table.

        Parameters
        ----------
        fname :     str, optional
                    Filename to save selection to
        k :         {int, float}
        criterion : {'maxclust','distance'}, optional
                    If `maxclust`, `k` clusters will be formed. If `distance`, 
                    clusters will be created at threshold `k`.

        See Also
        --------
        :func:`pymaid.plot.plot3d` 
                    Function called to generate 3d plot                  
        """        

        cmap = self.get_colormap(k=k, criterion=criterion)

        # Convert to 0-255
        cmap = {n: [int(v * 255) for v in cmap[n]] for n in cmap}

        data = [ dict(skeleton_id=int(n),
                     color="#{:02x}{:02x}{:02x}".format( cmap[n][0],cmap[n][1],cmap[n][2] ),
                     opacity=1
                     ) for n in cmap ]

        with open(fname, 'w') as outfile:
            json.dump(data, outfile)

        module_logger.info('Selection saved as %s in %s' % (fname, os.getcwd()))    

        return
Project: PyMaid | Author: schlegelp
def get_clusters(self, k, criterion='maxclust', return_type='labels'):
        """ Wrapper for cluster.hierarchy.fcluster to get clusters.

        Parameters
        ----------
        k :             {int, float}
        criterion :     {'maxclust','distance'}, optional
                        If `maxclust`, `k` clusters will be formed. If 
                        `distance`, clusters will be created at threshold `k`.
        return_type :   {'labels','indices','columns','rows'}
                        Determines what to construct the clusters of. 'labels'
                        only works if labels are provided. 'indices' refers
                        to index in distance matrix. 'columns'/'rows' works
                        if distance matrix is pandas DataFrame                        

        Returns
        -------
        list 
                    list of clusters [ [leaf1, leaf5], [leaf2, ...], ... ]
        """

        cl = scipy.cluster.hierarchy.fcluster(self.linkage, k, criterion=criterion)

        if self.labels and return_type.lower()=='labels':
            return [[self.labels[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
        elif return_type.lower() == 'rows':
            return [[self.mat.columns.tolist()[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
        elif return_type.lower() == 'columns':
            return [[self.mat.index.tolist()[j] for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
        else:
            return [[j for j in range(len(cl)) if cl[j] == i] for i in range(min(cl), max(cl) + 1)]
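The two criteria accepted above behave differently; a standalone illustration with scipy.cluster.hierarchy.fcluster on synthetic data:

import numpy as np
import scipy.cluster.hierarchy

X = np.random.default_rng(0).normal(size=(8, 2))
Z = scipy.cluster.hierarchy.linkage(X, method='ward')

# criterion='maxclust': ask for exactly k=3 clusters.
print(scipy.cluster.hierarchy.fcluster(Z, 3, criterion='maxclust'))
# criterion='distance': cut the dendrogram at height 1.5 instead.
print(scipy.cluster.hierarchy.fcluster(Z, 1.5, criterion='distance'))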
Project: ccCluster | Author: gsantoni
def checkMultiplicity(self, thr):
        FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
        counter=collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        print('You are clustering with a threshold of %s'%(thr))
        print('The biggest cluster contains %s datasets from a total of %s'%(counter[Best], len(self.labelList)))
Project: ccCluster | Author: gsantoni
def minimalForCompleteness(self):
        print("Running estimator for minimal threshold for completeness")
        labels=self.createLabels()
        x = 0.00
        dx = 0.05
        countsList = {}
        x_list = []
        while x < 1:
            Arrays= {}
            FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
            counter=collections.Counter(FlatC)
            Best = max(counter.items(), key=operator.itemgetter(1))[0]
            toProcess=[Best]
            y=0
            for cluster, filename in zip(FlatC,labels):
                if cluster in toProcess:
                    hklFile = any_reflection_file(filename)
                    b= hklFile.as_miller_arrays()
                    for column in b:
                        if column.is_xray_intensity_array():
                            Arrays[y]=column
                            break
                    y+=1
            try:
                Arr = Arrays[0]
            except KeyError:
                # no intensity arrays found at this threshold
                countsList[x] = 0
                x += dx
                continue
            for label in range(1, y):
                try:
                    Arr = Arr.concatenate(Arrays[label])
                except Exception:
                    pass
            countsList[x] = Arr.completeness()
            x += dx
        # return minimal threshold that reaches completeness > 0.98
        L = []
        for key in countsList:
            if countsList[key]>0.98:
                L.append(key)
        L.sort()
        return L[0]
Project: massivedatans | Author: JohannesBuchner
def cluster(self, u, ndim, keepMetric=False):
        w = self.metric.transform(u)
        prev_region = self.region
        if keepMetric:
            self.region = RadFriendsRegion(members=w)
            if self.force_shrink and self.region.maxdistance > self.prev_maxdistance:
                self.region = RadFriendsRegion(members=w, maxdistance=self.prev_maxdistance)
            self.prev_maxdistance = self.region.maxdistance
            print('keeping metric, not reclustering.')
            return

        metric_updated = False
        clustermetric = self.metric
        print('computing distances for clustering...')
        # Overlay all clusters (shift by cluster mean)
        print('Metric update ...')
        cluster_mean = numpy.mean(u, axis=0)
        shifted_cluster_members = u - cluster_mean

        # Using original points and new metric, compute RadFriends bootstrapped distance and store
        if self.metriclearner == 'none':
            metric = self.metric # stay with identity matrix
            metric_updated = False
        elif self.metriclearner == 'simplescaling':
            metric = SimpleScaling()
            metric.fit(shifted_cluster_members)
            metric_updated = True
        elif self.metriclearner == 'truncatedscaling':
            metric = TruncatedScaling()
            metric.fit(shifted_cluster_members)
            metric_updated = self.metric == IdentityMetric() or not numpy.all(self.metric.scale == metric.scale)
        else:
            assert False, self.metriclearner

        self.metric = metric

        wnew = self.metric.transform(u)
        print('Region update ...')

        self.region = RadFriendsRegion(members=wnew) #, maxdistance=shifted_region.maxdistance)
        if not metric_updated and self.force_shrink and self.prev_maxdistance is not None:
            if self.region.maxdistance > self.prev_maxdistance:
                self.region = RadFriendsRegion(members=w, maxdistance=self.prev_maxdistance)
        self.prev_maxdistance = self.region.maxdistance
        print('done.')
Project: PyMaid | Author: schlegelp
def plot_matrix(self):
        """ Plot distance matrix and dendrogram using matplotlib.

        Returns
        -------
        matplotlib figure
        """

        # Compute and plot first dendrogram for all nodes.
        fig = pylab.figure(figsize=(8, 8))
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        Z1 = scipy.cluster.hierarchy.dendrogram(
            self.linkage, orientation='left', labels=self.labels)
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Compute and plot second dendrogram.
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Z2 = scipy.cluster.hierarchy.dendrogram(self.linkage, labels=self.labels)
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Plot distance matrix.
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        D = self.mat.copy()

        if isinstance(D, pd.DataFrame):
            D = D.to_numpy()

        D = D[idx1, :]
        D = D[:, idx2]
        im = axmatrix.matshow(
            D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
        axmatrix.set_xticks([])
        axmatrix.set_yticks([])

        # Plot colorbar.
        axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
        pylab.colorbar(im, cax=axcolor)

        module_logger.info(
            'Use matplotlib.pyplot.show() to render figure.')

        return fig
Project: ccCluster | Author: gsantoni
def merge(self, anomFlag, thr):
        FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
        Log = open(self.CurrentDir+'/.cc_cluster.log', 'a')
        counter=collections.Counter(FlatC)
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        Process = True
#change checkboxes to standard variables
        if Process:
            ToProcess = [Best]    
        else:
            ToProcess = set(Clusters)
            for key in ToProcess:
                if counter[key]==1:
                    ToProcess = [x for x in ToProcess if x != key]



#Processing pipeline, 
#Does all the XSCALE run
        for x in ToProcess:
            if [thr,x, anomFlag] not in  self.alreadyDone:
                os.mkdir(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s'%(float(thr),x, anomFlag))
                Xscale=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/XSCALE.INP'%(float(thr),x, anomFlag), 'a')
                Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),x,anomFlag ), 'a')
                print('OUTPUT_FILE=scaled.hkl',file=Xscale)
                print('MERGE= TRUE', file=Xscale)
                print('pointless hklout clustered.mtz << eof', file=Pointless)
                if anomFlag=='ano':
                    print('FRIEDEL\'S_LAW= FALSE', file=Xscale)
                elif anomFlag=='no_ano':
                    print('FRIEDEL\'S_LAW= TRUE', file=Xscale)
                Xscale.close()
                Pointless.close()

        for cluster, filename in zip(FlatC,self.labelList):
            if cluster in ToProcess:
                OUT = open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/XSCALE.INP'%(float(thr),cluster,anomFlag), 'a')
                Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),cluster,anomFlag), 'a')
                print ('INPUT_FILE= ../%s'%(filename), file=OUT)
                #print ('INCLUDE_RESOLUTION_RANGE=20, 1.8', file=OUT)
                print ('MINIMUM_I/SIGMA= 0', file=OUT)
                print ('XDSIN ../%s'%(filename), file= Pointless)
                OUT.close()
                Pointless.close()
        #optional run of XSCALE

        newProcesses=[]
        for x in ToProcess:
            if [thr,x, anomFlag] not in  self.alreadyDone:
                plt.savefig(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/Dendrogram.png'%(float(thr),x,anomFlag))
                P= subprocess.Popen('/opt/pxsoft/xds/vdefault/linux-x86_64/xscale_par',cwd=self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/'%(float(thr), x, anomFlag))     
                P.wait()
                print('Cluster, %s , %s , %s'%(float(thr),x, anomFlag), file=Log)             
                Pointless=open(self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),x,anomFlag), 'a')
                print('COPY \n bg\n TOLERANCE 4 \n eof', file= Pointless)
                Pointless.close()
                script = self.CurrentDir+'/cc_Cluster_%.2f_%s_%s/launch_pointless.sh'%(float(thr),x,anomFlag)
                st = os.stat(script)
                os.chmod(script, st.st_mode | 0o111)
                newProcesses.append([thr,x, anomFlag])