项目:ObjRecPoseEst    作者:paroj    | 项目源码 | 文件源码
def computeValDataDistanceMatrix(self):
        """Return the square Euclidean distance matrix of the validation descriptors.

        Descriptors are computed one minibatch at a time for all
        ``self.n_val_batches`` batches of ``self.valdataDB`` and stacked into a
        single array of ``n_val_batches * batch_size`` rows before the pairwise
        distances are evaluated.

        Returns:
            A square matrix whose entry ``[i, j]`` is the Euclidean distance
            between descriptor ``i`` and descriptor ``j``.
        """
        # TODO: use self.computeDescriptors(self.valdataDB)  ?
        perBatch = self.cfgParams.batch_size
        numBatches = self.n_val_batches
        descrDim = self.descrNet.cfgParams.outputDim[1]
        descriptors = numpy.zeros((numBatches * perBatch, descrDim))

        for batchNo in range(numBatches):
            # The data manager returns the index under which the requested
            # minibatch can be addressed by the descriptor function.
            slot = self.dataManager.makeMinibatchAvailable(self.valdataDB, batchNo)
            start = batchNo * perBatch
            descriptors[start:start + perBatch] = self.tfComputeDescr(slot)

        condensed = scipy.spatial.distance.pdist(descriptors, 'euclidean')
        return scipy.spatial.distance.squareform(condensed)
项目:ObjRecPoseEst    作者:paroj    | 项目源码 | 文件源码
def computeDistanceMatrix(self,test_set):
        """Return the square Euclidean distance matrix of the descriptors of ``test_set``.

        Args:
            test_set: data set object; ``test_set.numSamples`` gives the number
                of rows of the descriptor array and it is handed to the data
                manager to make each minibatch available.

        Returns:
            A ``numSamples x numSamples`` matrix of pairwise Euclidean
            distances between descriptors.

        Note:
            Only complete minibatches are evaluated.  If ``numSamples`` is not
            a multiple of the batch size, the trailing rows keep their initial
            all-zero descriptor.
        """
        batch_size = self.cfgParams.batch_size
        nSamp = test_set.numSamples

        descrLen = self.descrNet.cfgParams.outputDim[1]
        descr = numpy.zeros((nSamp,descrLen))

        # Floor division: a plain "/" yields a float on Python 3, which
        # range() rejects with a TypeError.
        n_test_batches = nSamp // batch_size
        for i in range(n_test_batches):
            miniBatchIdx = self.dataManager.makeMinibatchAvailable(test_set,i)
            d = self.tfComputeDescr(miniBatchIdx)
            descr[i*batch_size:(i+1)*batch_size] = d

        print("distances done")

        dst = scipy.spatial.distance.pdist(descr,'euclidean')
        dst = scipy.spatial.distance.squareform(dst)

        return dst
项目:aid    作者:cvjena    | 项目源码 | 文件源码
def fit(self, feat):
        # Build the root node of a cluster tree over `feat` and seed a queue of
        # nodes that are candidates for splitting.  `feat` is iterated as
        # (id, feature) pairs; only the feature part enters the affinity matrix.
        # NOTE(review): this excerpt appears truncated -- the two bare `if`
        # statements inside the loop have no body, so it does not parse as shown.

        # Compute affinity matrix using RBF kernel on pair-wise distances
        affinity = scipy.spatial.distance.pdist(np.array([f for id, f in feat]))
        # sigma is negative, so the exponent below is -distance / (2 * variance).
        sigma = -2 * np.var(affinity)
        affinity = np.exp(scipy.spatial.distance.squareform(affinity) / sigma)

        # Recursive clustering
        self.tree = { 'depth' : 0, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : None, 'items' : feat, 'affinity' : affinity }
        queue = []
        # Heap entries are (-item count, random number, node).  heapq pops the
        # smallest tuple first, so the node holding the most items comes out
        # first.  The random number presumably breaks ties before the node
        # dicts themselves would be compared -- TODO confirm.
        heapq.heappush(queue, (-1 * len(self.tree['items']), np.random.rand(), self.tree))
        while (self.tree['leafs'] < self.max_clusters) and (len(queue) > 0):
            # NOTE(review): the body of this size check is missing from the excerpt.
            if len(queue[0][2]['items']) <= self.min_cluster_size:
            left, right, ncut_value = self.split(heapq.heappop(queue)[2])
            # NOTE(review): the body of this threshold check is missing from the excerpt.
            if ncut_value > self.T:
            if (left is not None) and (right is not None):
                # Both halves become candidates for further splitting.
                heapq.heappush(queue, (-1 * len(left['items']), np.random.rand(), left))
                heapq.heappush(queue, (-1 * len(right['items']), np.random.rand(), right))
项目:decoding_challenge_cortana_2016_3rd    作者:kingjr    | 项目源码 | 文件源码
def get_score_funcs():
    """Collect the two-argument scoring callables from SciPy into a ``Bunch``.

    Public functions of ``scipy.spatial.distance`` whose argument list is
    exactly ``['u', 'v']`` and public functions of ``scipy.stats`` whose
    argument list is exactly ``['x', 'y']`` are wrapped with
    ``_make_xy_sfunc`` and stored under their own name.  The ``stats``
    functions are wrapped with ``ndim_output=True`` and are added last, so
    they win on a name clash.
    """
    from scipy import stats
    from scipy.spatial import distance

    score_funcs = Bunch()

    def _public_functions(module):
        # (name, function) pairs for every non-underscore function in module.
        return [(name, obj) for name, obj in vars(module).items()
                if isfunction(obj) and not name.startswith('_')]

    dist_candidates = _public_functions(distance)
    stats_candidates = _public_functions(stats)

    dist_scorers = {name: _make_xy_sfunc(func)
                    for name, func in dist_candidates
                    if _get_args(func) == ['u', 'v']}
    score_funcs.update(dist_scorers)

    stats_scorers = {name: _make_xy_sfunc(func, ndim_output=True)
                     for name, func in stats_candidates
                     if _get_args(func) == ['x', 'y']}
    score_funcs.update(stats_scorers)

    return score_funcs
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def thrEstimation(self):
        """Scan distance thresholds and return the one paired with the largest step in ``countsList``.

        Thresholds run from 0.0 upwards in steps of 0.05 while below 1, and
        ``self.Tree`` is cut into flat clusters at each one.  After the scan,
        the first entry of ``x_list`` whose matching difference of consecutive
        ``countsList`` values equals the maximum difference is returned.

        NOTE(review): this excerpt looks truncated.  ``counter`` is never
        assigned, and nothing is appended to ``countsList`` or ``x_list``, so
        as shown the call fails on the first iteration.  Confirm against the
        full source before relying on it.
        """
        x = 0.00
        dx = 0.05
        countsList = []
        x_list = []
        while x < 1:

            # Flat cluster assignment at cut height x.
            FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
            # Key of `counter` with the largest value; presumably the most
            # populated cluster -- `counter` is not defined in this excerpt.
            Best = max(counter.iteritems(), key=operator.itemgetter(1))[0]
            x+= dx
        # Differences between consecutive entries of countsList.
        dy = np.diff(countsList)

        for a, b in zip (x_list, dy):
            if b == max(dy):
                return a
项目:spyking-circus    作者:spyking-circus    | 项目源码 | 文件源码
def distancematrix(data, ydata=None):
    """Return Euclidean distances as a ``float32`` array.

    Args:
        data: 2-D array-like of observations, one per row.
        ydata: optional second 2-D array-like of observations.

    Returns:
        When ``ydata`` is None, the condensed pairwise distance vector of
        ``data`` as produced by ``pdist``.  Otherwise the rectangular matrix
        of distances between every row of ``data`` and every row of ``ydata``
        as produced by ``cdist``.
    """
    if ydata is None:
        distances = scipy.spatial.distance.pdist(data, 'euclidean')
    else:
        # Without this branch the cdist call ran with ydata=None and the
        # two-argument case left `distances` unbound.
        distances = scipy.spatial.distance.cdist(data, ydata, 'euclidean')

    return distances.astype(numpy.float32)
项目:pairwise_distance    作者:oliviaguest    | 项目源码 | 文件源码
def batch_pdist(data_slice):
    """Return the weighted sum of great-circle distances for one batch.

    Args:
        data_slice: iterable of ``(X, Y, weights)`` triples.  ``X`` and ``Y``
            are paired point by point and ``weights`` holds one weight per
            pair.

    Returns:
        The sum over all triples of ``weights * distance_in_km``; ``0`` for
        an empty batch.
    """
    partial_sum = 0
    for X, Y, weights in data_slice:
        # Build the distance vector in one go; growing it with np.append
        # copied the whole array for every single pair.
        dist = np.array([great_circle(x, y).km for x, y in zip(X, Y)],
                        dtype=float)
        partial_sum += np.sum(weights * dist)
    return partial_sum
    # return 10
项目:pairwise_distance    作者:oliviaguest    | 项目源码 | 文件源码
def mean_pairwise_distance(X, weights=None, n_jobs=None, axis=0):
    """Function that returns the sum and mean of the pairwise distances of an 2D
    array X.

    Required arguments:
    X       --  2D array of points.

    Optional arguments:
    weights -- 1D array of counts or weights per point in X (default: 1s).
    n_jobs  -- Number of cores to use for calculation (default: all).
    axis    -- The axis of X corresponding to data elements (default: 0).

    Returns:
    (queue_sum, mean) -- the weighted sum of all pairwise distances and that
    sum divided by W * (W - 1) / 2, where W is the sum of the weights.
    """
    N = X.shape[axis]
    if weights is None:
        weights = np.ones((N,))
    if n_jobs is None:
        n_jobs = min(mp.cpu_count(), N)
    # Get the pairs and their weights to calculate the distances without
    # needing the whole of X, split it into roughly equal sub-arrays per cpu:
    # NOTE(review): array_split has to pack the ragged (X, Y, weights) tuples
    # into one array; recent NumPy releases may reject ragged input -- confirm
    # against the NumPy version in use.
    pairs_split = np.array_split([(X[i:], X[:N - i], weights[i:] * weights[:N - i])
                                  for i in range(1, N)],
                                 n_jobs, axis=axis)

    # Create a pool for each cpu to send the batch_dist function to each split.
    # Then, close the pool and wait for jobs to complete before continuing:
    pool = mp.Pool(processes=n_jobs)
    try:
        # chunksize is clamped to 1 so that n_jobs > N cannot request
        # zero-sized chunks.
        queue_sum = sum(pool.map(batch_pdist, pairs_split,
                                 chunksize=max(1, N // n_jobs)))
    finally:
        # Always release the worker processes, even when a job raised.
        pool.close()
        pool.join()
    N = weights.sum()
    # Compute the number of combinations, add to the number of unique pairs
    # and use that as the denominator to calculate the mean pairwise distance:
    mean = queue_sum / (N * (N - 1.0) / 2.0)
    # If you do not want to include distance from an item to itself use:
    # mean = queue_sum / (((N - 1)**2 + (N + 1)) / 2.0)

    return queue_sum, mean
项目:pairwise_distance    作者:oliviaguest    | 项目源码 | 文件源码
def distance(a, b):
    """Return the sum of the row-wise Euclidean distances between ``a`` and ``b``.

    The difference ``a - b`` is squared element-wise, summed along axis 1,
    square-rooted, and the resulting per-row distances are added up.
    """
    delta = a - b
    row_norms = np.sqrt(np.sum(delta ** 2, axis=1))
    return np.sum(row_norms)

# Parallel:
项目:ObjRecPoseEst    作者:paroj    | 项目源码 | 文件源码
def checkFiltersDist(descrNet):
    """Return the pairwise cosine distances between the first-layer filters.

    Args:
        descrNet: network object; ``descrNet.layer0.W.get_value()`` must
            return the weight array, whose first axis is treated as the
            filter index.

    Returns:
        A square matrix whose entry ``[i, j]`` is the cosine distance between
        the flattened weights of filter ``i`` and filter ``j``.
    """
    wvals = descrNet.layer0.W.get_value()
    # One row per filter: collapse all remaining axes into a single one.
    wvals = wvals.reshape((wvals.shape[0], -1))
    dst = scipy.spatial.distance.pdist(wvals,'cosine')
    dst = scipy.spatial.distance.squareform(dst)
    # The visible code computed the matrix and then discarded it; hand it
    # back so the result is usable.
    return dst
项目:aid    作者:cvjena    | 项目源码 | 文件源码
def split(self, node):
        """Split ``node`` into two child nodes with a two-way spectral clustering.

        Args:
            node: tree node dict with at least the keys ``'items'``,
                ``'affinity'``, ``'depth'``, ``'parent'``, ``'children'``,
                ``'height'``, ``'size'`` and ``'leafs'``.

        Returns:
            ``(left, right, ncut)`` where ``left`` and ``right`` are the new
            child node dicts and ``ncut`` is ``self.ncut_value`` of the
            partition.  ``(None, None, 0)`` is returned, and ``node`` is left
            untouched, when the clustering is interrupted from the keyboard
            or when one side of the partition is empty.
        """
        # Perform normalized cut
        try:
            ind = SpectralClustering(2, affinity = 'precomputed', assign_labels = 'discretize').fit_predict(node['affinity'])
        except KeyboardInterrupt:
            return None, None, 0

        # Create left and right node
        mask1, mask2 = (ind == 0), (ind == 1)
        if not (np.any(mask1) and np.any(mask2)):
            return None, None, 0

        def make_child(label, mask):
            # Fresh leaf below `node` holding the items assigned to `label`
            # and the matching sub-block of the affinity matrix.
            return {
                'depth' : node['depth'] + 1,
                'height' : 0,
                'size' : 0,
                'leafs' : 1,
                'children' : [],
                'parent' : node,
                'items' : [f for i, f in enumerate(node['items']) if ind[i] == label],
                'affinity' : node['affinity'][np.ix_(mask, mask)],
            }

        left = make_child(0, mask1)
        right = make_child(1, mask2)

        # Force the node with the lower minimum distance to the query to be the left node
        if ind[0] == 1: # items are already sorted when passed to fit(), so we just need to look at the first item instead of re-computing all distances
            left, right = right, left

        # Modify parent
        node['children'] = [left, right]

        # Modify parent chain: every ancestor, including `node` itself, gains
        # one level of height, two nodes and one leaf.
        parent = node
        while parent is not None:
            parent['height'] += 1
            parent['size'] += 2
            parent['leafs'] += 1
            parent = parent['parent']

        return left, right, self.ncut_value(node['affinity'], ind)
项目:learn-to-select-data    作者:sebastianruder    | 项目源码 | 文件源码
def cosine_similarity(repr1, repr2):
    """Return the cosine similarity of two vectors.

    ``0`` is returned when either argument is ``None`` or when the
    similarity evaluates to NaN.  Both vectors must be free of NaN and
    infinite entries; this is enforced with assertions.
    """
    if repr1 is None or repr2 is None:
        return 0

    for vec in (repr2, repr1):
        assert not np.isnan(vec).any() and not np.isinf(vec).any()

    score = 1 - scipy.spatial.distance.cosine(repr1, repr2)
    # the similarity is nan if no term in the document is in the vocabulary
    return 0 if np.isnan(score) else score
项目:learn-to-select-data    作者:sebastianruder    | 项目源码 | 文件源码
def euclidean_distance(repr1, repr2):
    """Return the Euclidean (L2) distance between two sequences.

    The sequences are paired element by element; surplus elements of the
    longer one are ignored.
    """
    squared_gaps = [np.power(a - b, 2) for a, b in zip(repr1, repr2)]
    total = np.sum(squared_gaps)
    return np.sqrt(total)
项目:learn-to-select-data    作者:sebastianruder    | 项目源码 | 文件源码
def variational_distance(repr1, repr2):
    """Return the L1 (Manhattan) distance between two sequences.

    The sequences are paired element by element; surplus elements of the
    longer one are ignored.
    """
    abs_gaps = [np.abs(a - b) for a, b in zip(repr1, repr2)]
    return np.sum(abs_gaps)
项目:learn-to-select-data    作者:sebastianruder    | 项目源码 | 文件源码
def bhattacharyya_distance(repr1, repr2):
    """Return the Bhattacharyya distance between two sequences.

    The distance is the negative logarithm of the summed element-wise
    ``sqrt(p * q)``.  An infinite result is mapped to ``0``; a NaN result
    trips an assertion.
    """
    overlap = np.sum([np.sqrt(a * b) for a, b in zip(repr1, repr2)])
    dist = -np.log(overlap)
    assert not np.isnan(dist), 'Error: Similarity is nan.'
    # the similarity is -inf if no term in the review is in the vocabulary
    return 0 if np.isinf(dist) else dist
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def createLabels(self):
        # Reset self.labelList, scan the text file self.ccFile and return the list.
        # NOTE(review): this excerpt appears truncated -- both `if` statements
        # below have no body and nothing is ever added to self.labelList, so it
        # does not parse as shown.  Confirm against the full source.
        self.labelList= []
        with open(self.ccFile) as f:   
            # Both loops share the same file iterator, so the second loop
            # continues from wherever the first one stops.
            for line in f:
                # Presumably stops at the 'Labels' header line -- body missing.
                if line.strip() == 'Labels':
            for line in f:
                # Presumably stops at the next section header -- body missing.
                if line.strip() == 'Correlation coefficients':
                # Whitespace-separated fields of the current line.
                goodLine = line.split()
        return self.labelList

#changed, now the distance is defined directly by ccCalc
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def checkMultiplicity(self, thr):
        """Print the size of the largest flat cluster obtained at threshold ``thr``.

        Args:
            thr: distance threshold at which ``self.Tree`` is cut into flat
                clusters.

        Two lines are printed: the threshold used, and the number of members
        of the most populated cluster next to ``len(self.labelList)``.
        Nothing is returned.
        """
        # Local import so the block does not depend on a module-level import.
        import collections

        FlatC = hierarchy.fcluster(self.Tree, thr, criterion='distance')
        # Number of members per cluster id; `counter` was referenced but never
        # assigned in the original block.
        counter = collections.Counter(FlatC)
        # items() works on both Python 2 and 3, unlike iteritems().
        Best = max(counter.items(), key=operator.itemgetter(1))[0]
        print('You are clustering with a threshold of %s'%(thr))
        print('The biggest cluster contains %s datasets from a total of %s'%(counter[Best], len(self.labelList)))
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def completenessEstimation(self):
        # Intended to scan distance thresholds over self.Tree, judging by the
        # loop body; returns None.
        # NOTE(review): this excerpt looks truncated.  As shown, x starts at
        # 0.0 so the `x > 1` guard is false on entry and the loop body never
        # runs; `dx` is never used and `counter` is never assigned.  Confirm
        # the intended condition and the missing statements against the full
        # source.
        x = 0.00
        dx = 0.05
        while x > 1:
            # Flat cluster assignment at cut height x.
            FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
            # Key of `counter` with the largest value.
            Best = max(counter.iteritems(), key=operator.itemgetter(1))[0]
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def minimalForCompleteness(self):
        # Scans distance thresholds from 0.0 in steps of 0.05 while below 1,
        # cutting self.Tree into flat clusters at each one, and finally returns
        # the first element of L.
        # NOTE(review): this excerpt is heavily truncated and does not parse as
        # shown.  `counter`, `labels`, `toProcess` and `y` are never assigned,
        # two `if` statements have no body, and nothing is stored in
        # `countsList`, `Arrays` or `L`.  Confirm against the full source.
        print("Running estimator for minimal threshold for completeness")
        x = 0.00
        dx = 0.05
        countsList = {}
        x_list = []
        while x < 1:
            Arrays= {}
            FlatC = hierarchy.fcluster(self.Tree, x, criterion='distance')
            # Key of `counter` with the largest value.
            Best = max(counter.iteritems(), key=operator.itemgetter(1))[0]
            # Pair every cluster id with an entry of `labels`, which is used as
            # a file name below.
            for cluster, filename in zip(FlatC,labels):
                if cluster in toProcess:
                    hklFile = any_reflection_file(filename)
                    b= hklFile.as_miller_arrays()
                    for column in b:
                        # NOTE(review): body of this check is missing from the excerpt.
                        if column.is_xray_intensity_array():
                Arr = Arrays[0]
            # Append the arrays stored under keys 1 .. y-1 to the first one.
            for label in range(1, y):
                    Arr = Arr.concatenate(Arrays[label])
            x+= dx
       # return minimal for max
        L = []
        for key in countsList:
            # NOTE(review): body of this check is missing from the excerpt.
            if countsList[key]>0.98:
        return L[0]
项目:ccCluster    作者:gsantoni    | 项目源码 | 文件源码
def createDendrogram(self):
        """Plot the dendrogram of ``self.Tree``, coloured at ``self.threshold``.

        Links below ``self.threshold`` are coloured per cluster via the
        ``color_threshold`` argument of ``hierarchy.dendrogram``.  Nothing is
        returned.
        """
        # Use the linkage stored on the instance, as the sibling methods do;
        # a bare `Tree` is not defined in this scope.
        hierarchy.dendrogram(self.Tree, color_threshold=self.threshold)
        #self.textOutput.append('Plotted Dendrogram. Colored at a %s threshold for distance'%(threshold))
项目:ObjRecPoseEst    作者:paroj    | 项目源码 | 文件源码
def mineHardNegativeTrainingPairsWithinMiniBatches(self):
        # For every training sample, find the sample of the same minibatch with
        # which it forms the highest-cost pair.  Returns (idx, labels): idx has
        # one row [sample index, partner index] per sample, and labels holds
        # whether the two samples carry the same entry of self.traindataDB.y.
        # NOTE(review): this excerpt appears truncated -- the theano.function(
        # call below is never closed and an `else:` seems to be missing before
        # the second `givens = ...` assignment, so it does not parse as shown.

        dnParams = self.descrNet.cfgParams
        batch_size = self.cfgParams.batch_size
        pairIdx = self.tvPairIdx
        #pairLabels = self.tvPairLabels
        y = self.tvY
        margin = self.pair_neg_margin

        # Descriptor difference between the two members of every pair.
        diff = self.descrNet.output[pairIdx[:,0]] - self.descrNet.output[pairIdx[:,1]]
        dst = T.sum(diff**2,axis=1) / dnParams.outputDim[1]  # divide by number of outputs, such that the max distance is 1

        pairLabels = T.eq(y[pairIdx[:,0]],y[pairIdx[:,1]])  #  same class / different class ?
        # Same-class pairs cost their distance; different-class pairs cost the
        # squared amount by which sqrt(dst) falls short of the margin.
        pair_cost = pairLabels*dst + (1-pairLabels)*T.sqr(T.maximum(0,margin - T.sqrt(dst)))                

        # indices for all pairs of vectors in the minibatch
        pidx1,pidx2 = numpy.triu_indices(batch_size, 1) #numpy.mask_indices(batch_size, numpy.triu, 1)
        pidx1 = pidx1.reshape((len(pidx1),1))
        pidx2 = pidx2.reshape((len(pidx2),1))
        # Two int32 columns: first and second member of every i < j pair.
        comb_pairIdx = numpy.concatenate((pidx1,pidx2),axis=1).astype(numpy.int32)

        dm = self.dataManager

        # Substitute the current minibatch slice of the data for the symbolic inputs.
        # NOTE(review): as shown, the second assignment overwrites the first;
        # presumably it belongs to a missing `else:` branch.
        if isinstance(self.tvX,list):            
            givens = { tv: data[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size] for (tv,data) in zip(self.tvX,dm.tvsData_x) }
            givens = { self.tvX : dm.tvsData_x[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size] }
        # NOTE(review): keyed by self.y although the cost above reads self.tvY -- confirm.
        givens[self.y] = dm.tvsData_y[self.tvIndex * batch_size:(self.tvIndex + 1) * batch_size]
        givens[pairIdx] = comb_pairIdx 

        # NOTE(review): the remaining arguments of this call are missing from the excerpt.
        tf = theano.function(inputs=[self.tvIndex],

        # for every sample get the index of the other sample with which together it forms the most expensive (highest cost) pair 
        nSamp = self.n_train_batches*batch_size
        idx = numpy.zeros(nSamp,dtype=numpy.int32)
        labels = numpy.zeros(nSamp,dtype=numpy.int32)  
        for i in range(self.n_train_batches):  
#             if self.macroBatchSize > 0:
#                 self.setMacroBatchData(self.traindataDB,numpy.floor(i / self.macroBatchSize).astype(
#                 miniBatchIdx = numpy.mod(i,self.macroBatchSize)
#             else:
#                 miniBatchIdx = i
            miniBatchIdx = self.dataManager.makeMinibatchAvailable(self.traindataDB,i)            
            c = tf(miniBatchIdx)
            # Expand the first output (presumably the per-pair costs in
            # condensed order -- TODO confirm) into a square matrix.
            c = scipy.spatial.distance.squareform(c[0])
            # find the max for each
            offset = i*batch_size
            # Row index of the largest entry in every column, shifted from
            # minibatch-local to global sample numbering.
            maxIdx = numpy.argmax(c,axis=0) + offset 
            idx[i*batch_size:(i+1)*batch_size] = maxIdx 
            labels[i*batch_size:(i+1)*batch_size] = self.traindataDB.y[maxIdx] == self.traindataDB.y[i*batch_size:(i+1)*batch_size]


        # Prepend each sample's own index so every row reads [sample, partner].
        idx = numpy.concatenate((numpy.arange(nSamp,dtype=numpy.int32).reshape(nSamp,1),idx.reshape(nSamp,1)),axis=1)

        return idx,labels