def make_complete_graph(num_vertices):
    """Constructs the edge grid of a complete graph.

    Edges (v1, v2) with v1 < v2 are numbered by the pairing function
    k = v1 + v2 * (v2 - 1) // 2, i.e. all edges ending at vertex 1 come
    first, then those ending at vertex 2, and so on.

    Args:
        num_vertices: Number of vertices V.

    Returns:
        grid: a 3 x K int32 array of (edge, vertex, vertex) triples, one
            column per edge, where K = V * (V - 1) // 2 is the number of
            edges. Only the grid is returned; V is the caller's argument
            and K is grid.shape[1].
    """
    V = num_vertices
    K = V * (V - 1) // 2
    grid = np.zeros([3, K], np.int32)
    k = 0
    # Iterating v2 in the outer loop makes the running counter k agree with
    # the pairing function documented above.
    for v2 in range(V):
        for v1 in range(v2):
            grid[:, k] = [k, v1, v2]
            k += 1
    return grid
def make_tree(edges):
    """Constructs a tree graph from a set of (vertex,vertex) pairs.

    Each pair is normalized so that the smaller vertex comes first, and the
    normalized pairs are sorted before being numbered. The resulting edge
    numbering therefore does not depend on the iteration order of `edges`,
    which matters when a set is passed.

    Args:
        edges: A list or set of unordered (vertex, vertex) pairs. Every
            element must be a tuple.

    Returns:
        grid: a 3 x E int32 array of (edge, vertex, vertex) triples, one
            column per edge, where E = len(edges). Only the grid is
            returned; E is grid.shape[1].

    Raises:
        AssertionError: If any element of `edges` is not a tuple.
    """
    assert all(isinstance(edge, tuple) for edge in edges)
    # Canonical form: (low, high) within each pair, pairs in sorted order.
    edges = sorted(tuple(sorted(edge)) for edge in edges)
    E = len(edges)
    grid = np.zeros([3, E], np.int32)
    for e, (v1, v2) in enumerate(edges):
        grid[:, e] = [e, v1, v2]
    return grid
def __init__(self, num_vertices):
        """Initialize the structure for `num_vertices` vertices.

        The edge count is fixed at `num_vertices - 1`, and the initial edge
        list handed to `set_edges` is the chain
        (0, 1), (1, 2), ..., (num_vertices - 2, num_vertices - 1).

        Args:
            num_vertices: Number of vertices.
        """
        logger.debug('TreeStructure with %d vertices', num_vertices)
        self._num_vertices = num_vertices
        self._num_edges = num_vertices - 1
        # Start from a simple chain linking each vertex to its successor.
        # NOTE(review): set_edges is defined outside this view; it is called
        # before the attributes below are assigned -- keep this order unless
        # set_edges is confirmed not to touch them.
        self.set_edges([(v, v + 1) for v in range(num_vertices - 1)])
        self._complete_grid = None  # Lazily constructed.
        # Vertex ids 0..num_vertices-1 as an int32 array.
        self._vertices = np.arange(num_vertices, dtype=np.int32)
def numba_csgraph(csr, node_props=None):
    """Wrap a CSR sparse matrix in a ``CSGraph``.

    Args:
        csr: Matrix object exposing ``indptr``, ``indices``, ``data`` and
            ``shape`` (presumably a ``scipy.sparse.csr_matrix`` -- confirm
            against callers).
        node_props: Optional array of per-node properties, passed through
            unchanged. When None, an all-ones float array with one entry per
            matrix row is substituted.

    Returns:
        A ``CSGraph`` built from the matrix's index pointer, column indices,
        values, shape (as an int32 array) and the node properties.
    """
    if node_props is None:
        # broadcast_to yields a read-only view in which every element aliases
        # the same scalar 1.0, so no per-node storage is allocated. The view
        # is flagged writeable afterwards -- presumably because CSGraph needs
        # a writeable array (TODO confirm). Writing to one element of this
        # default array would change all of them.
        node_props = np.broadcast_to(1., csr.shape[0])
        node_props.flags.writeable = True
    # NOTE(review): the original call read `csr.indices,,` with the third
    # positional argument missing; `csr.data` (the remaining CSR component)
    # is restored here -- confirm against CSGraph's constructor.
    return CSGraph(csr.indptr, csr.indices, csr.data,
                   np.array(csr.shape, dtype=np.int32), node_props)
def cudatest_hist():
    """Run the GPU histogram kernel on random data and print the elapsed time.

    Draws `n` random values in [0, BIN_COUNT) as float32, copies them and a
    zeroed int32 histogram to the device on a CUDA stream, launches
    `gpu_histogram[bpg, tpb, stream]`, and copies both arrays back. The time
    spent in the whole transfer/launch loop is printed. Nothing is returned.

    Relies on module-level names defined outside this function: `n`,
    `BIN_COUNT`, `bpg`, `tpb`, `gpu_histogram`, `timer`, `cuda`, `itemfreq`.
    """
    # src1 = np.arange(n, dtype=np.float32)
    src1 = np.random.randint(BIN_COUNT,size=n).astype(np.float32)
    histogram = np.zeros(BIN_COUNT, dtype=np.int32)

    # The stream lets the transfers and the kernel launch below be queued
    # asynchronously. The original line had no right-hand side (a syntax
    # error); cuda.stream() is what the later stream.auto_synchronize() and
    # stream= arguments require.
    stream = cuda.stream()  # use stream to trigger async memory transfer
    ts = timer()

    # Control the number of iterations.
    count = 1
    for _ in range(count):
        # auto_synchronize waits for all work queued on the stream when the
        # `with` block exits.
        with stream.auto_synchronize():
            # ts = timer()
            d_src1 = cuda.to_device(src1, stream=stream)
            d_hist = cuda.to_device(histogram, stream=stream)
            # gpu_1d_stencil[bpg, tpb, stream](d_src1)
            gpu_histogram[bpg, tpb, stream](d_src1,d_hist)
            d_src1.copy_to_host(src1, stream=stream)
            d_hist.copy_to_host(histogram, stream=stream)

    te = timer()
    print('pinned ',count," : ", te - ts)
    # Take a reference histogram of the data on the host.
    # It may contain a few more counts than the GPU result because of the
    # padding added to the original data in the kernel code.
    # NOTE(review): the result below is never used or returned, and
    # scipy.stats.itemfreq has been removed from recent SciPy releases
    # (np.unique(..., return_counts=True) is the replacement) -- confirm
    # whether this reference computation is still wanted.
    hist = src1.astype(np.int64)
    x = itemfreq(hist.ravel())
    hist = x#[:, 1]/sum(x[:, 1])

# cudatest_stencil()
def thresholding(arry, hist):
    """CUDA kernel body: threshold each interior tile pixel and histogram it.

    Every thread copies the element of `arry` at its `cuda.grid(2)` position
    into a 32 x 32 shared tile. Threads on the outermost ring of the tile stop
    there. Each remaining thread picks a target of 0 when its pixel is above
    150 and 255 otherwise, atomically adds (target - pixel) to `arry[x, y]`,
    and atomically increments `hist[(target - pixel) % BIN_COUNT]`.

    Args:
        arry: 2-D array indexed by the thread's global grid position; updated
            in place.
        hist: 1-D array of bin counters; updated in place.

    NOTE(review): there is no bounds check on the grid position and the tile
    is fixed at 32 x 32 -- assumes the launch configuration matches; confirm
    against the caller.
    """
    # Per-block scratch tile; each thread owns exactly one cell of it.
    tile = cuda.shared.array(shape=(32,32), dtype=int32)

    gx, gy = cuda.grid(2)
    row = cuda.threadIdx.x
    col = cuda.threadIdx.y

    tile[row, col] = arry[gx, gy]

    last_row = tile.shape[0] - 1
    last_col = tile.shape[1] - 1

    # Only threads strictly inside the tile border do any processing.
    inside_rows = row > 0 and (last_row - row) > 0
    inside_cols = col > 0 and (last_col - col) > 0
    if not (inside_rows and inside_cols):
        return

    pixel = tile[row, col]
    if pixel > 150:
        target = 0
    else:
        target = 255

    # Difference that, once added to the stored pixel, yields the target.
    delta = target - pixel

    # Round-trip through the int32 tile before updating global memory.
    tile[row, col] = delta
    stored = tile[row, col]
    cuda.atomic.add(arry, (gx, gy), stored)

    # Atomic form of hist[delta % BIN_COUNT] += 1.
    bin_index = delta % BIN_COUNT
    cuda.atomic.add(hist, bin_index, 1)
def unsharp_masking(arry, hist):
    """CUDA kernel body: apply a 3 x 3 neighbourhood filter and histogram it.

    Every thread copies the element of `arry` at its `cuda.grid(2)` position
    into a 32 x 32 shared tile. After a block-wide barrier, each thread that
    is strictly inside the tile border combines its eight neighbours with
    weights -1 (corners) and -2 (edges), floor-divides the sum by 16,
    subtracts the centre pixel, atomically adds the result to `arry[x, y]`,
    and atomically increments `hist[result % BIN_COUNT]`. Threads on the
    outermost ring of the tile only load their pixel.

    Args:
        arry: 2-D array indexed by the thread's global grid position; updated
            in place.
        hist: 1-D array of bin counters; updated in place.

    NOTE(review): there is no bounds check on the grid position and the tile
    is fixed at 32 x 32 -- assumes the launch configuration matches; confirm
    against the caller.
    """
    # Per-block scratch tile holding this block's pixels.
    A = cuda.shared.array(shape=(32,32), dtype=int32)
    # H = cuda.shared.array(BIN_COUNT, dtype=int32)

    x,y = cuda.grid(2)

    ty = cuda.threadIdx.x
    tx = cuda.threadIdx.y

    A[ty,tx] = arry[x,y]

    # Every thread reads cells written by its neighbours below, so the whole
    # block must finish loading first. The barrier sits outside the
    # conditional so that all threads of the block reach it.
    cuda.syncthreads()

    threadCountX = A.shape[0] - 1
    threadCountY = A.shape[1] - 1

    # Only threads strictly inside the tile border have all eight neighbours.
    if (ty > 0 and  (threadCountX-ty) > 0 ) and (tx > 0 and (threadCountY-tx) > 0):
        center = A[ty, tx]

        # Weighted neighbourhood sum: corners weigh -1, edges weigh -2.
        # Each of the eight neighbours appears exactly once.
        code = 0
        code += A[ty-1, tx-1] * -1
        code += A[ty, tx-1] * -2
        code += A[ty+1, tx-1] * -1
        code += A[ty+1, tx] * -2
        code += A[ty+1, tx+1] * -1
        code += A[ty, tx+1] * -2
        code += A[ty-1, tx+1] * -1
        code += A[ty-1, tx] * -2

        # Floor division keeps `code` an integer so it can index `hist`.
        code = code // 16

        code = code - center

        # The tile is not written back: neighbouring threads may still be
        # reading this cell, and a barrier cannot be placed inside this
        # divergent branch. The int32 cast reproduces the value the int32
        # tile would have held.
        val = int32(code)
        cuda.atomic.add(arry, (x,y), val)

        # Atomic form of hist[code % BIN_COUNT] += 1.
        ind = code % BIN_COUNT
        cuda.atomic.add(hist, ind, 1)