Python sklearn.neighbors module: NearestNeighbors() code examples

The following 49 code examples, extracted from open-source Python projects, illustrate how to use sklearn.neighbors.NearestNeighbors().
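
For orientation, here is a minimal self-contained sketch of the basic fit/query pattern (synthetic data; all names and values are illustrative, not taken from the projects below):

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(100, 3)                     # 100 samples, 3 features
nbrs = NearestNeighbors(n_neighbors=5).fit(X)  # build the index
distances, indices = nbrs.kneighbors(X)        # query each sample's 5 nearest neighbours
# distances and indices both have shape (100, 5); when the training data itself
# is queried, column 0 is each point at distance 0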

Project: SecuML    Author: ANSSI-FR    | Project source | File source
def computeNeighboursScores(self):
        all_instances = self.iteration.datasets.instances
        # Connectivity matrix
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('model', NearestNeighbors(n_neighbors=self.num_neighbours, n_jobs=-1))])
        pipeline.fit(all_instances.getFeatures())
        # Labels
        labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
        # Compute neighbour scores
        scores = []
        all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance=False)
        for i, label in enumerate(labels):
            if label != 0:
                continue
            neighbours = all_neighbours[i]
            score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
            scores.append(score)
        return np.array(scores)
Project: kharita    Author: vipyoung    | Project source | File source
def getpossibleedges(datapointwts,seeds):
#    datapointwts = densify(datapointwts);
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1],theta) for xx in ll]
        cd = ll[dd.index(min(dd))];
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for ii, xx in enumerate(datapointwts):
        if ii>1:
            if datapointwts[ii-1][-1] < datapointwts[ii][-1] and datapointwts[ii-1][-1] > datapointwts[ii][-1] - 11:
                cd1 = p2cluster[ii-1]; cd2 = p2cluster[ii]
                # count an edge only when consecutive points fall in different clusters;
                # this check must stay inside the time-window test so cd1/cd2 are always current
                if not cd1 == cd2:
                    gedges1[(cd1, cd2)] = gedges1.get((cd1, cd2), 0) + 1
    return gedges1
Project: kharita    Author: vipyoung    | Project source | File source
def point2cluster(datapointwts,seeds,theta):
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []
    X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in datapointwts]
    S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in seeds]
    Xrot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in datapointwts]
    Srot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in seeds]
    for cd in range(len(seeds)):
        cluster[cd] = []
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    nbrsrot = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(Srot)
    distancesrot, indicesrot = nbrsrot.kneighbors(Xrot)
    for ii, ll in enumerate(indices):
        #        print(distances[ii],distancesrot[ii],ll,indices[ii],indicesrot[ii])
        cd = indicesrot[ii][0]
        if distances[ii][0] < distancesrot[ii][0]:
            cd = indices[ii][0];
            #        print(cd,distances[ii],distancesrot[ii],ll,indices[ii],indicesrot[ii])
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    return(cluster,p2cluster)
Project: kharita    Author: vipyoung    | Project source | File source
def splitclustersparallel(datapointwts,seeds):
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []; roadwidth = []
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []; roadwidth.append(0);
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1],theta) for xx in ll]
        cd = ll[dd.index(min(dd))];
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        mang = seeds[cl][-1];
        scl = seeds[cl]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            roadwidth[cl] = 1+5*np.std([geodist(scl,xx)*np.sin(anglebetweentwopoints(scl,xx)-scl[-1])  for xx in cluster[cl]])
            print(cl,scl,[(anglebetweentwopoints(scl,xx),scl[-1])  for xx in cluster[cl]])
Project: lsanomaly    Author: lsanomaly    | Project source | File source
def median_kneighbour_distance(X, k=5):
    """
    Calculate the median kneighbor distance.

    Find the distance between a set of random datapoints and
    their kth nearest neighbours. This is a heuristic for setting the
    kernel length scale.
    """
    N_all = X.shape[0]
    k = min(k, N_all)
    N_subset = min(N_all, 2000)
    sample_idx_train = np.random.permutation(N_all)[:N_subset]
    nn = neighbors.NearestNeighbors(n_neighbors=k)
    nn.fit(X[sample_idx_train, :])
    d, idx = nn.kneighbors(X[sample_idx_train, :])
    return np.median(d[:, -1])
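
As a usage note, a median distance like the one returned above is commonly plugged into an RBF kernel bandwidth; a hedged sketch (variable names are illustrative, not part of lsanomaly's API):

sigma = median_kneighbour_distance(X, k=5)  # heuristic kernel length scale
gamma = 1.0 / (2.0 * sigma ** 2)            # RBF kernel: exp(-gamma * ||x - y||**2)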
Project: job-salary-prediction    Author: soton-data-mining    | Project source | File source
def cosine_knn(corpus_vector, queries_vector, k=10):
    """

    :param corpus_vector: vectorized document text
    :param queries_vector: vectorized query text
    :param k: number of neighbours
    :return: (distances, indices) of knn
    """
    # based on
    # http://scikit-learn.org/stable/modules/neighbors.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    # since we want to use cosine similarity to account for document length
    # we have to use bruteforce search
    # parallelize to number of cores with n_jobs -1
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    nbrs.fit(corpus_vector)
    distances, indices = nbrs.kneighbors(queries_vector)
    return distances, indices
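
A possible invocation, assuming corpus and queries are vectorized with the same fitted vectorizer (corpus_texts and query_texts are hypothetical lists of strings, not from the project):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
corpus_vector = vectorizer.fit_transform(corpus_texts)  # fit vocabulary on the corpus only
queries_vector = vectorizer.transform(query_texts)      # reuse that vocabulary for queries
distances, indices = cosine_knn(corpus_vector, queries_vector, k=10)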
Project: basis    Author: vaitech    | Project source | File source
def index(self, metric='cosine'):
        """ Build a nearest neighbor retrieval index to perform similarity 
        lookups and analogies

        Arguments:
            metric: string, or sklearn compatible callable

        Returns:
            self

        Raises:
            TokenContainerException if no pretrained vectors have been loaded
        """

        if self.W is not None:
            alg = 'brute' if (metric == 'cosine') else 'auto'
            from sklearn.neighbors import NearestNeighbors
            self._nn = NearestNeighbors(metric=metric, algorithm=alg)
            self._nn.fit(self.W)
        else:
            raise TokenContainerException(
                'cannot build similarity on vectorless structure'
            )
        return self
Project: kaggle-yelp-restaurant-photo-classification    Author: u1234x1234    | Project source | File source
def extract_lab_histogram(mode, clusters):

    nn = neighbors.NearestNeighbors(n_neighbors=1)
    nn.fit(clusters)
    out_filename = mode + '_color'    
    try:
        os.remove(out_filename)
    except OSError:
        pass
    out = open(out_filename, 'ab')
    cnt = 0    
    with open(mode + '_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            points = image.reshape((-1, 3))
            cn = nn.kneighbors(points)
            hist = np.histogram(cn[1], bins=50, range=(1, 50))[0]
            hist.tofile(out)            
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print(cnt)
Project: ikdb    Author: krishauser    | Project source | File source
def buildNNDataStructure(self):
        """Builds a nearest neighbor data structure.  User doesn't need to
        call this unless the self.problems attribute was changed manually."""
        if len(self.problemFeatures)==0 or len(self.featureNames)==0:
            return
        try:
            from sklearn.neighbors import NearestNeighbors,BallTree
            from scipy.spatial import KDTree
            with self.lock:
                try:
                    farray = self.problemFeatures.array
                except AttributeError:
                    farray = np.array(self.problemFeatures.items)
                if self.metricTransform is not None:
                    farray = np.dot(farray,self.metricTransform)
                #self.nn = NearestNeighbors(n_neighbors=1,algorithm="auto").fit(farray)
                self.nn = BallTree(farray)
                #self.nn = KDTree(farray)
                self.nnBuildSize = len(self.problemFeatures)
        except ImportError:
            print "IKDatabase: Warning, scikit-learn is not installed, queries will be much slower"
            with self.lock:
                self.nn = None
                self.nnBuildSize = 0
        return
Project: DroidWatcher    Author: suemi994    | Project source | File source
def __init__(self, x, ys):
        import numpy as np
        from sklearn.neighbors import NearestNeighbors
        #print x, ys

        CI = np.array( [x.checksum.get_signature_entropy(), x.checksum.get_entropy()] )
        #print CI, x.get_info()
        #print

        for i in ys:
            CI = np.vstack( (CI, [i.checksum.get_signature_entropy(), i.checksum.get_entropy()]) )

        #idx = 0
        #for i in np.array(CI)[1:]:
        #    print idx+1, i, ys[idx].get_info()
        #    idx += 1

        self.neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
        self.neigh.fit(np.array(CI))
        #print self.neigh.kneighbors( CI[0], len(CI) )

        self.CI = CI
        self.ys = ys
Project: DroidWatcher    Author: suemi994    | Project source | File source
def __init__(self, x, ys):
        import numpy as np
        from sklearn.neighbors import NearestNeighbors
        #print x, ys

        CI = np.array( [x.checksum.get_signature_entropy(), x.checksum.get_entropy()] )
        #print CI, x.get_info()
        #print

        for i in ys:
            CI = np.vstack( (CI, [i.checksum.get_signature_entropy(), i.checksum.get_entropy()]) )

        #idx = 0
        #for i in np.array(CI)[1:]:
        #    print idx+1, i, ys[idx].get_info()
        #    idx += 1

        self.neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
        self.neigh.fit(np.array(CI))
        #print self.neigh.kneighbors( CI[0], len(CI) )

        self.CI = CI
        self.ys = ys
Project: uhcsdb    Author: bdecost    | Project source | File source
def build_search_tree(datadir, featurename='vgg16_block5_conv3-vlad-64.h5'):

    ndim = 64
    features_file = os.path.join(datadir, featurename)
    print(features_file)

    global keys, features
    keys, features = load_features(features_file)

    print('reducing features')
    pca = PCA(n_components=ndim)
    features = pca.fit_transform(features)
    print('ready')

    print('building search tree')
    nn = NearestNeighbors()

    global nneighs
    nneighs = nn.fit(features)
    print('ready')
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_unsupervised_kneighbors(n_samples=20, n_features=5,
                                 n_query_pts=2, n_neighbors=5):
    # Test unsupervised neighbors methods
    X = rng.rand(n_samples, n_features)

    test = rng.rand(n_query_pts, n_features)

    for p in P:
        results_nodist = []
        results = []

        for algorithm in ALGORITHMS:
            neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
                                               algorithm=algorithm,
                                               p=p)
            neigh.fit(X)

            results_nodist.append(neigh.kneighbors(test,
                                                   return_distance=False))
            results.append(neigh.kneighbors(test, return_distance=True))

        for i in range(len(results) - 1):
            assert_array_almost_equal(results_nodist[i], results[i][1])
            assert_array_almost_equal(results[i][0], results[i + 1][0])
            assert_array_almost_equal(results[i][1], results[i + 1][1])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_unsupervised_inputs():
    # test the types of valid input into NearestNeighbors
    X = rng.random_sample((10, 3))

    nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1)
    nbrs_fid.fit(X)

    dist1, ind1 = nbrs_fid.kneighbors(X)

    nbrs = neighbors.NearestNeighbors(n_neighbors=1)

    for input in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
        nbrs.fit(input)
        dist2, ind2 = nbrs.kneighbors(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_radius_neighbors_boundary_handling():
    """Test whether points lying on boundary are handled consistently

    Also ensures that even with only one query point, an object array
    is returned rather than a 2d array.
    """

    X = np.array([[1.5], [3.0], [3.01]])
    radius = 3.0

    for algorithm in ALGORITHMS:
        nbrs = neighbors.NearestNeighbors(radius=radius,
                                          algorithm=algorithm).fit(X)
        results = nbrs.radius_neighbors([[0.0]], return_distance=False)
        assert_equal(results.shape, (1,))
        assert_equal(results.dtype, object)
        assert_array_equal(results[0], [0, 1])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_callable_metric():

    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    X = np.random.RandomState(42).rand(20, 2)
    nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                       metric=custom_metric)
    nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                       metric=custom_metric)

    nbrs1.fit(X)
    nbrs2.fit(X)

    dist1, ind1 = nbrs1.kneighbors(X)
    dist2, ind2 = nbrs2.kneighbors(X)

    assert_array_almost_equal(dist1, dist2)
Project: website-fingerprinting    Author: AxelGoetz    | Project source | File source
def __init__(self, is_multiclass=True, K_CLOSEST_NEIGHBORS=2):
        # Constants
        self.K_RECO = 5.0 # Num of neighbors for weight learning
        self.K_CLOSEST_NEIGHBORS = K_CLOSEST_NEIGHBORS

        self.weights = None

        self.kNN_finder = NearestNeighbors(
            n_neighbors=K_CLOSEST_NEIGHBORS,
            metric=self._calculate_dist,
            metric_params=None, # Dict otherwise
            n_jobs=-1
        )
Project: geomdn    Author: afshinrahimi    | Project source | File source
def assignClasses(self):
        clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
        train_locs = self.df_train[['lat', 'lon']].values
        clusterer.fit(train_locs)
        clusters = clusterer.get_clusters()
        cluster_points = dd(list)
        for i, cluster in enumerate(clusters):
            cluster_points[cluster].append(train_locs[i])
        logging.info('#labels: %d' %len(cluster_points))
        self.cluster_median = OrderedDict()
        for cluster in sorted(cluster_points):
            points = cluster_points[cluster]
            median_lat = np.median([p[0] for p in points])
            median_lon = np.median([p[1] for p in points]) 
            self.cluster_median[cluster] = (median_lat, median_lon)
        dev_locs = self.df_dev[['lat', 'lon']].values
        test_locs = self.df_test[['lat', 'lon']].values
        nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1, metric=haversine, n_jobs=4)
        nnbr.fit(np.array(list(self.cluster_median.values())))
        self.dev_classes = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
        self.test_classes = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]

        self.train_classes = clusters
        if self.one_hot_labels:
            num_labels = np.max(self.train_classes) + 1
            y_train = np.zeros((len(self.train_classes), num_labels), dtype=np.float32)
            y_train[np.arange(len(self.train_classes)), self.train_classes] = 1
            y_dev = np.zeros((len(self.dev_classes), num_labels), dtype=np.float32)
            y_dev[np.arange(len(self.dev_classes)), self.dev_classes] = 1
            y_test = np.zeros((len(self.test_classes), num_labels), dtype=np.float32)
            y_test[np.arange(len(self.test_classes)), self.test_classes] = 1
            self.train_classes = y_train
            self.dev_classes = y_dev
            self.test_classes = y_test
Project: singlecell-dash    Author: czbiohub    | Project source | File source
def network_layout(matrix, k=30):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine').fit(matrix)
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))

    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])

    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)

    return coords, communities_labelprop
Project: singlecell-dash    Author: czbiohub    | Project source | File source
def network_layout(matrix, k=30):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute',
                            metric='cosine').fit(matrix)
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))

    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])

    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)

    return coords, communities_labelprop
Project: Deep-Learning-Plugin    Author: flowjo-lakes    | Project source | File source
def __init__(self,
                 MMDLayer,
                 MMDTargetTrain,
                 MMDTargetValidation_split=0.1,
                 MMDTargetSampleSize=1000,
                 n_neighbors = 25,
                 scales = None,
                 weights = None):
        if scales is None:
            print("setting scales using KNN")
            med = np.zeros(20)
            for ii in range(1,20):
                sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize),:]
                nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
                distances,dummy = nbrs.kneighbors(sample)
                #nearest neighbor is the point so we need to exclude it
                med[ii]=np.median(distances[:,1:n_neighbors])
            med = np.median(med)  
            scales = [med/2, med, med*2] # CyTOF    
            print(scales)
        scales = K.variable(value=np.asarray(scales))
        if weights is None:
            print("setting all scale weights to 1")
            weights = K.eval(K.shape(scales)[0])
        weights = K.variable(value=np.asarray(weights))
        self.MMDLayer =  MMDLayer
        MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain, test_size=MMDTargetValidation_split, random_state=42)
        self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
        self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
        self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
        self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
        self.MMDTargetSampleSize = MMDTargetSampleSize
        self.kernel = self.RaphyKernel
        self.scales = scales
        self.weights = weights


    #calculate the raphy kernel applied to all entries in a pairwise distance matrix
Project: BioIR    Author: nlpaueb    | Project source | File source
def get_chunk_nns(self, X, q_centroids, question_details, chunk):
        nbrs = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=1000).fit(X)
        dist, nns = nbrs.kneighbors(q_centroids, return_distance=True)
        q_array = []
        for q_point in range(nns.shape[0]):
            doc_nns = []
            for n_point in range(nns.shape[1]):
                doc_nns.append(self.idmap[chunk[0] + nns[q_point, n_point]])
            q = Question(question_details[q_point][0], question_details[q_point][1], doc_nns, list(dist[q_point, :]))
            q_array.append(q)
        return q_array

    # Dataset indices are split into N chunks. The nearest top-(N*k) neighbors are extracted
    # from each chunk, and the final top-k neighbors are then extracted from those.
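
A hedged sketch of that final merge step, assuming each chunk yields its own distance and global-id arrays (the function and variable names are illustrative, not the project's):

import numpy as np

def merge_chunk_neighbors(chunk_dists, chunk_ids, k):
    # chunk_dists: list of (n_queries, k_chunk) distance arrays, one per chunk
    # chunk_ids:   list of (n_queries, k_chunk) arrays of global document ids
    dists = np.hstack(chunk_dists)            # (n_queries, total candidates)
    ids = np.hstack(chunk_ids)
    order = np.argsort(dists, axis=1)[:, :k]  # k smallest distances per query
    return (np.take_along_axis(dists, order, axis=1),
            np.take_along_axis(ids, order, axis=1))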
Project: kharita    Author: vipyoung    | Project source | File source
def getseeds(datapoint,radius,theta):
    chosen = []; seeds = [];
#    random.shuffle(datapoint)
    periodsampl = 500000
    for p in datapoint:
        chosen.append(p);
    for j,p in enumerate(chosen):
        ok = -1;
        if j<periodsampl:
            for q in seeds:
                if taxidist(p,q,theta)<radius:
                    ok = 1
                    break;
            if ok <1:
                seeds.append(p)
        else:
            if j%periodsampl == 0:# and (is_power2(int(j/1000))):
#                print(j,time.time()-start)
                S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2]+45)) for xx in seeds];
                nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
                X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2]+45)) for xx in chosen[j:min(len(chosen),j+periodsampl)]];
                distances, indices = nbrs.kneighbors(X)
            if distances[j%periodsampl][0] >radius:
                seeds.append(p)
    print('seeds: ', len(seeds))
    return (seeds)
Project: vec4ir    Author: lgalke    | Project source | File source
def __init__(self,
                 analyzer=None, matching=None,
                 name=None,
                 verbose=0,
                 n_epochs=10,
                 alpha=0.25,
                 min_alpha=0.05,
                 n_jobs=4,
                 **kwargs):
        # self.model = model
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.verbose = verbose
        self.name = "paragraph-vectors" if name is None else name

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

        self.analyzer = analyzer
        self.model = Doc2Vec(alpha=alpha,
                             min_alpha=alpha,
                             size=500,
                             window=8,
                             min_count=1,
                             sample=1e-5,
                             workers=n_jobs,
                             negative=20,
                             dm=0, dbow_words=1,  # words only with dm!=0?
                             dm_mean=0,  # unused when in concat mode
                             dm_concat=1,
                             dm_tag_count=1
                             )
        self.n_epochs = n_epochs
        self._neighbors = NearestNeighbors(**kwargs)
Project: vec4ir    Author: lgalke    | Project source | File source
def query(self, query, k=None):
        model, matching = self.model, self._matching
        nn, analyze = self._neighbors, self.analyzer
        verbose = self.verbose
        if k is None:
            k = len(self._centroids)
        if matching:
            matched = matching.predict(query)
            print("Matched:", matched)
            tags = self._y[matched]
            dvs = np.asarray([model.docvecs[tag] for tag in tags])
            n_ret = min(k, len(matched))
            if n_ret == 0:
                return []
            nn.fit(dvs)
        else:
            tags = self._y
            n_ret = k
            # NearestNeighbors are already fit

        if verbose > 0:
            print(len(tags), "documents matched.")
        q = analyze(query)
        qv = model.infer_vector(q).reshape(1, -1)
        ind = nn.kneighbors(qv, n_neighbors=n_ret, return_distance=False)[0]
        y = tags[ind]
        return y
Project: vec4ir    Author: lgalke    | Project source | File source
def query(self, query, k=None, matched_indices=None):
        # matching step
        matching_ind = self._matching(query)
        # print(matching_ind, file=sys.stderr)
        Xm, matched_doc_ids = self._X[matching_ind], self._y[matching_ind]
        # matching_docs, matching_doc_ids = self._matching(query)
        # calculate elements to retrieve
        n_ret = len(matching_ind)
        if n_ret == 0:
            return []
        if self.verbose > 0:
            print("Found {} matches:".format(n_ret))
        # n_ret = min(n_ret, k) if k > 0 else n_ret
        # model dependent transformation
        xq = self._cv.transform([query])
        q = self.tfidf.transform(xq)
        # Xm = self.vectorizer.transform(matching_docs)
        # model dependent nearest neighbor search or scoring or whatever
        nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(Xm)
        # abuse kneighbors in this case
        # As q only contains one element, we only need its results.
        if k is not None and k < n_ret:
            n_ret = k

        ind = nn.kneighbors(q,  # q contains a single element
                            n_neighbors=n_ret,  # limit to k neighbors
                            return_distance=False)[0]  # so we only need 1 res
        # dont forget to convert the indices to document ids of matching
        labels = matched_doc_ids[ind]
        return labels
Project: vec4ir    Author: lgalke    | Project source | File source
def __init__(self,
                 embedding,
                 analyzer,
                 name="WCD",
                 n_jobs=1,
                 normalize=True,
                 verbose=0,
                 oov=None,
                 matching=True,
                 **kwargs):
        self.name = name
        self._embedding = embedding
        self._normalize = normalize
        self._oov = oov
        self.verbose = verbose
        self.n_jobs = n_jobs
        self._neighbors = NearestNeighbors(**kwargs)

        self._analyzer = analyzer

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))
Project: vec4ir    Author: lgalke    | Project source | File source
def __init__(self, embedding, analyzer='word', matching=None, name="FWCD",
                 n_jobs=1, use_idf=True):
        """TODO: to be defined1. """
        self.name = name
        self.matching = Matching(**dict(matching)) if matching else None
        self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer, norm='l2',
                                       use_idf=use_idf)
        self.nn = NearestNeighbors(n_jobs=n_jobs, metric='cosine',
                                   algorithm='brute')
Project: FreeDiscovery    Author: FreeDiscovery    | Project source | File source
def test_nearest_centroid_ranker():
    # in the case where there is a single point by centroid,
    # nearest centroid should reduce to nearest neighbor
    from sklearn.neighbors import NearestNeighbors
    np.random.seed(0)

    n_samples = 100
    n_features = 120
    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)
    index = np.arange(n_samples, dtype='int')
    y = np.arange(n_samples, dtype='int')
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train = X[index_train]
    X_test = X[index_test]


    nn = NearestNeighbors(n_neighbors=1, algorithm='brute')
    nn.fit(X_train)
    dist_ref, idx_ref = nn.kneighbors(X_test)

    nc = NearestCentroidRanker()
    nc.fit(X_train, y_train)
    dist_pred = nc.decision_function(X_test)
    y_pred = nc.predict(X_test)

    # ensures that we have the same number of unique output points
    # (even if absolute labels are not preserved)
    assert np.unique(idx_ref[:,0]).shape ==  np.unique(y_pred).shape

    assert_allclose(dist_pred, dist_ref[:,0])
Project: FreeDiscovery    Author: FreeDiscovery    | Project source | File source
def fit(self, X, y):
        """Fit the model using X as training data
        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree}
            Training data, shape [n_samples, n_features],

        """
        X = check_array(X, accept_sparse='csr')
        y = np.asarray(y, dtype='int')
        y_unique = np.unique(y)

        index = np.arange(len(y), dtype='int')

        if len(y_unique) == 0:
            raise ValueError('The training set must have at least '
                             'one document category!')

        # define nearest neighbors search objects for each category
        self._mod = [NearestNeighbors(n_neighbors=1,
                                      leaf_size=self.leaf_size,
                                      algorithm=self.algorithm,
                                      n_jobs=self.n_jobs,
                                      # euclidean metric by default
                                      metric='cosine',
                                      ) for el in range(len(y_unique))]

        index_mapping = []
        for imod, y_val in enumerate(y_unique):
            mask = (y == y_val)
            index_mapping.append(index[mask])
            self._mod[imod].fit(X[mask])

        self.index_mapping = index_mapping
Project: tensorsne    Author: gokceneraslan    | Project source | File source
def __knn_sklearn(X, k, n_jobs=-1, verbose=False, **kwargs):

    nn = NearestNeighbors(n_neighbors=k+1, n_jobs=n_jobs,
                          algorithm='ball_tree', **kwargs)
    nn.fit(X)

    if verbose:
        print('Indexing done.')
    dist, ind = nn.kneighbors(X, k+1, return_distance=True)

    if verbose:
        print('Query done.')

    return dist[:,1:].astype(X.dtype), ind[:,1:]
Project: dyfunconn    Author: makism    | Project source | File source
def encode(self, data, metric = 'euclidean'):
        """ Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            One of the following valid options as defined for function http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like, shape(n_samples, n_features)
            ``data``, as represented by the prototypes in codebook.
        ts_symbols : list, shape(n_samples, 1)
            A discrete symbolic time series
        """
        nbrs = NearestNeighbors(n_neighbors = 1, algorithm = 'auto', metric = metric).fit(self.protos)
        _, self.__symbols = nbrs.kneighbors(data)
        self.__encoding = self.protos[self.__symbols]

        return (self.__encoding, self.__symbols)
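
A hedged usage sketch, assuming cb is a fitted instance of this codebook class (the name cb and the data shape are illustrative):

data = np.random.rand(200, 4)  # 200 samples, 4 features
encoded, symbols = cb.encode(data, metric='euclidean')
# encoded: each sample replaced by its nearest prototype
# symbols: the prototype index per sample, i.e. a discrete symbolic series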
Project: dyfunconn    Author: makism    | Project source | File source
def encode(self, data, metric = 'euclidean'):
        """ Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            One of the following valid options as defined for function `http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html`.

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like, shape(n_samples, n_features)
            ``data``, as represented by the prototypes in codebook.
        ts_symbols : list, shape(n_samples, 1)
            A discrete symbolic time series
        """
        nbrs = NearestNeighbors(n_neighbors = 1, algorithm = 'auto', metric = metric).fit(self.protos)
        _, self.__symbols = nbrs.kneighbors(data)
        self.__encoding = self.protos[self.__symbols]

        return (self.__encoding, self.__symbols)
Project: dyfunconn    Author: makism    | Project source | File source
def fit(self, data):
        """ Learn data, and construct a vector codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        Returns
        -------
        self : object
            The instance itself
        """
        [n_samples, _] = data.shape
        self.protos = data[self.rng.choice(n_samples, self.n_protos), ]

        # avg_p = np.mean(data, 0)
        #dist_from_avg_p = np.sum(pairwise_distances(avg_p, data))
        #ndistortion = []

        for iteration in range(self.iterations):
            sample = data[self.rng.choice(n_samples, 1), ]

            t = iteration / float(self.iterations)
            lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
            epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

            D = pairwise_distances(sample, self.protos, metric='euclidean', n_jobs=self.n_jobs)
            I = np.argsort(np.argsort(D))

            H = np.exp(-I / epsilon).ravel()

            diff = sample - self.protos
            for proto_id in range(self.n_protos):
                self.protos[proto_id, :] += lrate * H[proto_id] * diff[proto_id, :]
                #nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(protos)
                #distances, _ = nbrs.kneighbors(data)
        #ndistortion.append( np.sum(distances) / dist_from_avg_p )

        return self
Project: dyfunconn    Author: makism    | Project source | File source
def encode(self, data, metric='euclidean'):
        """ Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

        Parameters
        ----------
        data : real array-like, shape(n_samples, n_features)
            Data matrix, each row represents a sample.

        metric : string
            One of the following valid options as defined for function http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

            Valid options include:

             - euclidean
             - cityblock
             - l1
             - cosine

        Returns
        -------
        encoded_data : real array-like, shape(n_samples, n_features)
            ``data``, as represented by the prototypes in codebook.
        ts_symbols : list, shape(n_samples, 1)
            A discrete symbolic time series
        """
        # Perform a proposed data mining procedure as described in [Laskaris2004].
        mds = MDS(n_components=1, random_state=self.rng)
        protos_1d = mds.fit_transform(self.protos).ravel()
        sorted_protos_1d = np.argsort(protos_1d)

        sprotos = self.protos[sorted_protos_1d]

        nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(sprotos)
        _, self.__symbols = nbrs.kneighbors(data)
        self.__encoding = sprotos[self.__symbols]

        return (self.__encoding, self.__symbols)
Project: kdd2017    Author: JinpengLI    | Project source | File source
def __init__(self,n_neighbors=5,loss='L2'):
        if loss in ['L1','L2','SMAPE']:
            loss = {'L1':L1,'L2':L2,'SMAPE':SMAPE}[loss]
        self.loss = loss
        self.n_neighbors = n_neighbors
        self.model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', n_jobs=-1)
        self.solver = lambda x:solver(x,loss)
Project: esper    Author: scanner-research    | Project source | File source
def compute_distances(cls, inst_id):
        global feat_nn
        global feat_ids

        it = cls.objects.annotate(height=F('face__bbox_y2') - F('face__bbox_y1')).filter(
            height__gte=0.1).order_by('id')
        if feat_nn is None:
            _print('Loading features...')
            feats = list(it[::5])
            feat_ids = np.array([f.id for f in feats])
            feat_vectors = [f.load_features() for f in feats]
            X = np.vstack(feat_vectors)
            _print('Constructing KNN tree...')
            feat_nn = NearestNeighbors().fit(X)
            _print('Done!')

        # Erase distances from previous computation
        prev = list(cls.objects.filter(distto__isnull=False))
        for feat in prev:
            feat.distto = None
        cls.objects.bulk_update(prev)

        dists, indices = feat_nn.kneighbors([cls.objects.get(face=inst_id).load_features()], 1000)

        for dist, feat_id in zip(dists[0], feat_ids[indices[0]]):
            feat = cls.objects.get(id=feat_id)
            feat.distto = dist
            feat.save()
Project: esper    Author: scanner-research    | Project source | File source
def identity_detect(videos, exemplar, features):
    log.debug('Loading features')
    ids, vectors = zip(*[((i, j), f.load_features())
                         for i, vid_features in enumerate(features)
                         for j, f in enumerate(vid_features)])

    log.debug('Building k-nn tree')
    feat_nn = NearestNeighbors().fit(np.vstack(vectors))

    log.debug('Doing look-up')
    exemplar_vector = FaceFeatures.objects.get(
        face=exemplar, labeler__name='facenet').load_features()
    dists, id_indices = feat_nn.kneighbors([exemplar_vector], min(10000, len(vectors)))

    face_map = defaultdict(list)
    for (dist, k) in zip(dists[0], id_indices[0]):
        (i, j) = ids[k]
        if dist > FEATURE_DISTANCE_THRESHOLD:
            break

        face_map[videos[i].id].append(features[i][j])

    return [face_map[video.id] for video in videos]


# Remove faces with negative coords and small height
Project: BatchEffectRemoval    Author: ushaham    | Project source | File source
def __init__(self,
                 MMDLayer,
                 MMDTargetTrain,
                 MMDTargetValidation_split=0.1,
                 MMDTargetSampleSize=1000,
                 n_neighbors = 25,
                 scales = None,
                 weights = None):
        if scales is None:
            print("setting scales using KNN")
            med = np.zeros(20)
            for ii in range(1,20):
                sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize),:]
                nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
                distances,dummy = nbrs.kneighbors(sample)
                #nearest neighbor is the point so we need to exclude it
                med[ii]=np.median(distances[:,1:n_neighbors])
            med = np.median(med)  
            scales = [med/2, med, med*2] # CyTOF    
            print(scales)
        scales = K.variable(value=np.asarray(scales))
        if weights is None:
            print("setting all scale weights to 1")
            weights = K.eval(K.shape(scales)[0])
        weights = K.variable(value=np.asarray(weights))
        self.MMDLayer =  MMDLayer
        MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain, test_size=MMDTargetValidation_split, random_state=42)
        self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
        self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
        self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
        self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
        self.MMDTargetSampleSize = MMDTargetSampleSize
        self.kernel = self.RaphyKernel
        self.scales = scales
        self.weights = weights


    #calculate the raphy kernel applied to all entries in a pairwise distance matrix
Project: intelligentCampus    Author: Jackal007    | Project source | File source
def __init__(self):
        SingleClassifier.SingleClassifier.__init__(self)
        # weak classifier
        algorithms = ['brute', 'ball_tree', 'kd_tree']
        self.clf =  NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
Project: kenchi    Author: Y-oHr-N    | Project source | File source
def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        self : detector
            Return self.
        """

        X                 = check_array(X)

        self._knn         = NearestNeighbors(
            metric        = self.metric,
            metric_params = self.metric_params,
            n_jobs        = self.n_jobs,
            n_neighbors   = self.n_neighbors,
            p             = self.p
        ).fit(X)

        self.y_score_     = self.anomaly_score()
        self.threshold_   = np.percentile(
            self.y_score_, 100.0 * (1.0 - self.fpr)
        )

        return self
Project: kenchi    Author: Y-oHr-N    | Project source | File source
def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        self : detector
            Return self.
        """

        X                 = check_array(X)

        self._knn         = NearestNeighbors(
            metric        = self.metric,
            metric_params = self.metric_params,
            n_jobs        = self.n_jobs,
            n_neighbors   = self.n_neighbors,
            p             = self.p
        ).fit(X)

        self.y_score_     = self.anomaly_score()
        self.threshold_   = np.percentile(
            self.y_score_, 100.0 * (1.0 - self.fpr)
        )

        return self
Project: soinn    Author: fukatani    | Project source | File source
def calc_mahalanobis(x, y, n_neighbors):
    from sklearn.neighbors import NearestNeighbors

    # rowvar=False treats rows as samples, giving the (n_features, n_features)
    # covariance matrix that the mahalanobis metric expects
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': np.cov(x, rowvar=False)})
    return nn.fit(x).kneighbors(y)
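
An illustrative call on synthetic data (shapes chosen arbitrarily; not from the project):

x = np.random.rand(50, 3)  # 50 reference points in 3 dimensions
y = np.random.rand(5, 3)   # 5 query points
dist, ind = calc_mahalanobis(x, y, n_neighbors=3)
# dist and ind have shape (5, 3): distances to, and indices of, the
# 3 nearest reference points for each query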
Project: rec-sys-experiments    Author: rnowling    | Project source | File source
def train_and_score(metric, training, testing, ks):
    print "Training and scoring"
    scores = []
    knn = NearestNeighbors(metric=metric, algorithm="brute")
    knn.fit(training)
    for k in ks:
        print "Evaluating for", k, "neighbors"
        neighbor_indices = knn.kneighbors(testing,
                                          n_neighbors=k,
                                          return_distance=False)

        all_predicted_scores = []
        all_labels = []
        for user_id in range(testing.shape[0]):
            user_row = testing[user_id, :]

            _, interaction_indices = user_row.nonzero()
            interacted = set(interaction_indices)
            non_interacted = set(range(testing.shape[1])) - interacted

            n_samples = min(len(non_interacted), len(interacted))
            sampled_interacted = random.sample(list(interacted), n_samples)
            sampled_non_interacted = random.sample(list(non_interacted), n_samples)

            indices = list(sampled_interacted)
            indices.extend(sampled_non_interacted)
            labels = [1] * n_samples
            labels.extend([0] * n_samples)

            neighbors = training[neighbor_indices[user_id, :], :]
            predicted_scores = neighbors.mean(axis=0)
            for idx in indices:
                all_predicted_scores.append(predicted_scores[0, idx])
            all_labels.extend(labels)

        print(len(all_labels), len(all_predicted_scores))

        auc = roc_auc_score(all_labels, all_predicted_scores)

        print "k", k, "AUC", auc
Project: Machine_Learning_Playground    Author: yao23    | Project source | File source
def __init__(self):
        self.knnModel = NearestNeighbors(n_neighbors=15)
        self.log = logging.getLogger(__name__)
Project: Machine_Learning_Playground    Author: yao23    | Project source | File source
def train(self, userFeatureTable, ratingsMat):
        userFeatureTable.loc[:, "age"] = userFeatureTable.loc[:, "age"] / 10.
        # ad hoc fix, make sure feature's range is similar
        self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)

        # ratingMat is the rating matrix
        self.ratingsMat = ratingsMat
        self.userFeatureTable = userFeatureTable
        self.userIds = self.userFeatureTable.index  # the actual order seen by the knnmodel
Project: hybrid-rs-trainner    Author: SeniorSA    | Project source | File source
def find_knn(self, target_matrix, target_features):
        neighbors = NearestNeighbors(n_neighbors=self.__args.n_neighbors, algorithm=self.__args.alg).fit(
            target_matrix.values)
        distances, indexes = neighbors.kneighbors(target_features)
        return distances, indexes
Project: hybrid-rs-trainner    Author: SeniorSA    | Project source | File source
def fit(atributos):
    neighbor = NearestNeighbors(metric='euclidean')
    neighbor.fit(atributos)
    return neighbor
Project: cervantes    Author: textclf    | Project source | File source
def index(self, metric='cosine'):
        alg = 'brute' if (metric == 'cosine') else 'auto'
        if not SKLEARN:
            raise WordVectorBoxException("Needs sklearn to work")
        self._nn = NearestNeighbors(metric=metric, algorithm=alg)
        self._nn.fit(self.W)
        return self