Python chainer.cuda module: Device() usage examples

The following 29 code examples, extracted from open-source Python projects, illustrate how to use chainer.cuda.Device().
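Before the individual examples, here is a minimal sketch of the basic API (an illustration only, assuming Chainer is built with CUDA/CuPy and at least one GPU is available). cuda.Device wraps cupy.cuda.Device: it can be used as a context manager to make a GPU current for a block of code, selected for the whole process with .use(), and queried or synchronized afterwards.

import numpy
from chainer import cuda

with cuda.Device(0):                      # allocations inside the block go to GPU 0
    x = cuda.cupy.zeros((3, 3), dtype=numpy.float32)

dev = cuda.Device(0)
dev.use()                                 # make GPU 0 the current device for this process
print(dev.id)                             # -> 0
dev.synchronize()                         # wait for all kernels queued on GPU 0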

Project: trainer    Author: nutszebra
def setup_workers(self):
        # run only once
        if self._initialized:
            return
        self._initialized = True

        self.model.cleargrads()
        for i in six.moves.range(1, len(self.gpus)):
            pipe, worker_end = multiprocessing.Pipe()
            worker = _Worker(i, worker_end, self.model, self.gpus, self.da, int(float(self.batch) / len(self.gpus) / self.train_batch_divide), self)
            worker.start()
            self._workers.append(worker)
            self._pipes.append(pipe)

        with cuda.Device(self.gpus[0]):
            self.model.to_gpu(self.gpus[0])
            if len(self.gpus) > 1:
                communication_id = nccl.get_unique_id()
                self._send_message(("set comm_id", communication_id))
                self.communication = nccl.NcclCommunicator(len(self.gpus),
                                                           communication_id,
                                                           0)
Project: trainer    Author: nutszebra
def setup_workers(self):
        # run only once
        if self._initialized:
            return
        self._initialized = True

        self.model.zerograds()
        for i in six.moves.range(1, len(self.gpus)):
            pipe, worker_end = multiprocessing.Pipe()
            worker = _Worker(i, worker_end, self.model, self.gpus, self.da, int(self.batch / len(self.gpus) / self.train_batch_divide), self)
            worker.start()
            self._workers.append(worker)
            self._pipes.append(pipe)

        with cuda.Device(self.gpus[0]):
            self.model.to_gpu(self.gpus[0])
            if len(self.gpus) > 1:
                communication_id = nccl.get_unique_id()
                self._send_message(("set comm_id", communication_id))
                self.communication = nccl.NcclCommunicator(len(self.gpus),
                                                           communication_id,
                                                           0)
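Both setup_workers variants above follow the same pattern: the master process enters a cuda.Device context for the first GPU so that model.to_gpu() and the NCCL setup happen on that device. A simplified, hedged sketch of just that step (model and gpus are placeholders, not names from the original project):

from chainer import cuda

def place_model_on_first_gpu(model, gpus):
    # Enter the device context so that to_gpu() and any arrays it creates
    # are allocated on gpus[0], regardless of which device is currently active.
    with cuda.Device(gpus[0]):
        model.to_gpu(gpus[0])
    return model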
Project: chainer-deconv    Author: germanRos
def _inv_gpu(b):
    # We do a batched LU decomposition on the GPU to compute the inverse
    # Change the shape of the array to a size-1 minibatch if necessary.
    # Also copy the matrix, as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.empty((n, n_matrices), dtype=numpy.int32)
    # Output array
    c = cuda.cupy.empty_like(a)
    # These arrays hold information on the execution success
    # or if the matrix was singular
    info = cuda.cupy.empty(n_matrices, dtype=numpy.int32)
    ap = matmul._mat_ptrs(a)
    cp = matmul._mat_ptrs(c)
    _, lda = matmul._get_ld(a)
    _, ldc = matmul._get_ld(c)
    handle = cuda.Device().cublas_handle
    cuda.cublas.sgetrfBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, info.data.ptr, n_matrices)
    cuda.cublas.sgetriBatched(
        handle, n, ap.data.ptr, lda, p.data.ptr, cp.data.ptr, ldc,
        info.data.ptr, n_matrices)
    return c, info
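The call cuda.Device().cublas_handle above is worth noting: constructed with no argument, Device refers to the currently selected GPU, and cublas_handle returns the per-device cuBLAS handle that the raw cuda.cublas.* wrappers expect as their first argument. A minimal hedged sketch:

from chainer import cuda

with cuda.Device(0):
    handle = cuda.Device().cublas_handle   # cuBLAS handle bound to GPU 0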
Project: nn_mask    Author: ZitengWang
def forward_gpu(self, inputs):
        x = inputs[0]
        W = inputs[1]
        # Prepare BLAS call
        handle = cuda.Device().cublas_handle
        k, m = W.shape
        n, l = x.shape[0] * x.shape[1], x.shape[2]
        lda = max(1, x.shape[-1])
        ldb = max(1, W.strides[0] // W.dtype.itemsize)
        ldc = max(1, m)
        Wx = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                        dtype=numpy.float32)
        sgemm(handle, False, False, m, n, k, 1, W.data.ptr, ldb,
              x.data.ptr, lda, 0, Wx.data.ptr, ldc)
        if len(inputs) > 2:
            b = inputs[2]
            Wx += b
        return Wx,
Project: nn-gev    Author: fgnt
def forward_gpu(self, inputs):
        x = inputs[0]
        W = inputs[1]
        # Prepare BLAS call
        handle = cuda.Device().cublas_handle
        k, m = W.shape
        n, l = x.shape[0] * x.shape[1], x.shape[2]
        lda = max(1, x.shape[-1])
        ldb = max(1, W.strides[0] // W.dtype.itemsize)
        ldc = max(1, m)
        Wx = cupy.empty((x.shape[0], x.shape[1], W.shape[1]),
                        dtype=numpy.float32)
        sgemm(handle, False, False, m, n, k, 1, W.data.ptr, ldb,
              x.data.ptr, lda, 0, Wx.data.ptr, ldc)
        if len(inputs) > 2:
            b = inputs[2]
            Wx += b
        return Wx,
Project: chainer-faster-rcnn    Author: mitmul
def forward(self, inputs):
        xp = cuda.get_array_module(*inputs)
        x0, x1 = inputs
        self.diff = self.inside_weights * (x0 - x1)
        abs_diff = xp.abs(self.diff)
        flag = abs_diff < 1.0 / self.sigma2
        y = (flag * 0.5 * xp.square(self.diff) * self.sigma2 +
             (~flag) * (abs_diff - 0.5 / self.sigma2))
        if xp == cuda.cupy:
            with cuda.Device(cuda.get_device(y)):
                num = xp.prod(xp.asarray(y.shape))
        else:
            num = xp.prod(y.shape)
        return xp.array(y.sum() / num).astype(numpy.float32),
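The forward pass above wraps its reduction in with cuda.Device(cuda.get_device(y)) so that the summation runs on the device that actually holds y. A hedged sketch of the same idea in isolation (mean_on_own_device is a hypothetical helper):

from chainer import cuda

def mean_on_own_device(y):
    xp = cuda.get_array_module(y)
    if xp is cuda.cupy:
        # get_device(y) returns the Device holding y; enter it before reducing
        with cuda.Device(cuda.get_device(y)):
            return float(y.mean())
    return float(y.mean())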
Project: chainer-faster-rcnn    Author: mitmul
def bbox_transform_inv(boxes, deltas, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _bbox_transform_inv(boxes, deltas)
    else:
        return _bbox_transform_inv(boxes, deltas)
Project: chainer-faster-rcnn    Author: mitmul
def clip_boxes(boxes, im_shape, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _clip_boxes(boxes, im_shape)
    else:
        return _clip_boxes(boxes, im_shape)
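bbox_transform_inv and clip_boxes share a convention that appears throughout these projects: gpu < 0 means run on the CPU as-is, while gpu >= 0 selects a device context first. A generic, hedged version of that wrapper (on_device is a hypothetical name):

from chainer import cuda

def on_device(fn, gpu, *args):
    # Run fn(*args) inside a device context when a non-negative GPU id is given,
    # otherwise call it directly on the CPU.
    if gpu >= 0:
        with cuda.Device(gpu):
            return fn(*args)
    return fn(*args)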
Project: chainer-deconv    Author: germanRos
def _batch_matmul_gpu(a, b, out, transa=False, transb=False, transout=False):
    a = _as_batch_mat(cuda.cupy.ascontiguousarray(a))
    b = _as_batch_mat(cuda.cupy.ascontiguousarray(b))
    trans_axis = (0, 2, 1)
    if transout:
        out = out.transpose(trans_axis)
    needtrans, _ = _get_ld(out)
    if needtrans == 1:
        # (A B)^T = B^T A^T
        a, b = b, a
        transa, transb = not transb, not transa
        out = out.transpose(trans_axis)
    if transa:
        a = a.transpose(trans_axis)
    if transb:
        b = b.transpose(trans_axis)

    transa, lda = _get_ld(a)
    transb, ldb = _get_ld(b)
    transout, ldout = _get_ld(out)
    la, n, ka = a.shape
    lb, kb, m = b.shape

    assert ka == kb
    assert transout == 0 or ldout == 1
    assert out.shape == (la, n, m)

    ap = _mat_ptrs(a)
    bp = _mat_ptrs(b)
    outp = _mat_ptrs(out)
    cuda.cublas.sgemmBatched(
        cuda.Device().cublas_handle,
        transa,
        transb,
        n, m, ka, 1.0,
        ap.data.ptr, lda,
        bp.data.ptr, ldb,
        0.0, outp.data.ptr, ldout, la)
Project: chainer-deconv    Author: germanRos
def _det_gpu(b):
    # We do a batched LU decomposition on the GPU and compute the
    # determinant as the product of the diagonal entries.
    # Change the shape of the array to a size-1 minibatch if necessary.
    # Also copy the matrix, as the elements will be modified in-place.
    a = matmul._as_batch_mat(b).copy()
    n = a.shape[1]
    n_matrices = len(a)
    # Pivot array
    p = cuda.cupy.zeros((n_matrices, n), dtype='int32')
    # Output array
    # These arrays hold information on the execution success
    # or if the matrix was singular.
    info = cuda.cupy.zeros(n_matrices, dtype=numpy.intp)
    ap = matmul._mat_ptrs(a)
    _, lda = matmul._get_ld(a)
    cuda.cublas.sgetrfBatched(cuda.Device().cublas_handle, n, ap.data.ptr, lda,
                              p.data.ptr, info.data.ptr, n_matrices)
    det = cuda.cupy.prod(a.diagonal(axis1=1, axis2=2), axis=1)
    # The determinant is equal to the product of the diagonal entries
    # of `a` where the sign of `a` is flipped depending on whether
    # the pivot array is equal to its index.
    rng = cuda.cupy.arange(1, n + 1, dtype='int32')
    parity = cuda.cupy.sum(p != rng, axis=1) % 2
    sign = 1. - 2. * parity.astype('float32')
    return det * sign, info
Project: chainer-deconv    Author: germanRos
def test_linear_model_multi_gpu(self):
        with cuda.Device(0):
            self.assertGreater(
                cuda.to_cpu(self.model.accuracy_gpu(1).data), 0.9)
Project: chainer-deconv    Author: germanRos
def test_model_setup_multi_gpu(self):
        with cuda.Device(0):
            model = self.model.model
            optimizer = self.model.optimizer
            model.to_gpu(1)
            optimizer.setup(model)
        for name, param in optimizer.target.namedparams():
            for v in six.itervalues(optimizer._states[name]):
                self.assertEqual(int(param.data.device), int(v.device))
Project: chainer-deconv    Author: germanRos
def check_accumulate_grads_from_gpu(self, src_id):
        with cuda.Device(src_id):
            self.optimizer.accumulate_grads([cuda.cupy.arange(3)])
        grad = self.target.param.grad
        self.assertTrue((cuda.to_cpu(grad) == np.arange(3) * 2).all())
Project: chainer-deconv    Author: germanRos
def test_accumulate_grads_gpu_to_cpu(self):
        self.setup_cpu()
        self.check_accumulate_grads_from_gpu(cuda.Device().id)
Project: chainer-deconv    Author: germanRos
def test_accumulate_grads_gpu_to_gpu(self):
        device_id = cuda.Device().id
        self.setup_gpu(device_id)
        self.check_accumulate_grads_from_gpu(device_id)
Project: chainer-deconv    Author: germanRos
def test_copy_parameters_from_cpu_to_gpu(self):
        self.check_copy_parameters_from(-1, cuda.Device().id)
Project: chainer-deconv    Author: germanRos
def test_copy_parameters_from_gpu_to_cpu(self):
        self.check_copy_parameters_from(cuda.Device().id, -1)
Project: chainer-deconv    Author: germanRos
def test_forward_gpu(self):
        device_id = cuda.Device().id
        self.check_forward(device_id, device_id)
Project: chainer-deconv    Author: germanRos
def test_check_backward_gpu(self):
        device_id = cuda.Device().id
        self.check_backward(device_id, device_id)
Project: chainer-deconv    Author: germanRos
def test_forward_cpu_to_gpu(self):
        device_id = cuda.Device().id
        self.check_forward(-1, device_id)
Project: chainer-deconv    Author: germanRos
def test_backward_cpu_to_gpu(self):
        device_id = cuda.Device().id
        self.check_backward(-1, device_id)
Project: chainer-deconv    Author: germanRos
def test_forward_gpu_to_cpu(self):
        device_id = cuda.Device().id
        self.check_forward(device_id, -1)
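The test cases above all rely on cuda.Device().id: with no argument, Device refers to the currently selected GPU, and .id yields its integer index (these tests use -1 as the conventional CPU marker). A minimal hedged sketch:

import numpy as np
from chainer import cuda

device_id = cuda.Device().id                  # id of the currently selected GPU
x_gpu = cuda.to_gpu(np.arange(3), device_id)  # copy a NumPy array onto that GPU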
Project: nn_mask    Author: ZitengWang
def backward_gpu(self, inputs, gy):
        x = inputs[0]
        W = inputs[1]
        # Backprop weight
        gW = cuda.cupy.empty_like(W)
        handle = cuda.Device().cublas_handle
        k, n = gy[0].shape[0] * gy[0].shape[1], W.shape[0]
        m = W.shape[1]
        lda = max(1, x.shape[-1])
        ldb = max(1, gy[0].shape[-1])
        ldc = max(1, m)
        sgemm(handle, False, True, m, n, k, 1, gy[0].data.ptr, ldb,
              x.data.ptr, lda, 1, gW.data.ptr, ldc)
        # Backprop input
        m, k = W.shape
        n, l = x.shape[0] * x.shape[1], gy[0].shape[2]
        lda = max(1, gy[0].shape[-1])
        ldb = max(1, W.shape[1])
        ldc = max(1, m)
        gx = cuda.cupy.empty_like(x)
        sgemm(handle, True, False, m, n, k, 1, W.data.ptr, ldb,
              gy[0].data.ptr, lda, 0, gx.data.ptr, ldc)
        # Backprop bias
        if len(inputs) > 2:
            gy_2d = _as_mat(gy[0])
            gb = gy_2d.sum(0)
            return gx, gW, gb
        else:
            return gx, gW
Project: nn-gev    Author: fgnt
def backward_gpu(self, inputs, gy):
        x = inputs[0]
        W = inputs[1]
        # Backprop weight
        gW = cuda.cupy.empty_like(W)
        handle = cuda.Device().cublas_handle
        k, n = gy[0].shape[0] * gy[0].shape[1], W.shape[0]
        m = W.shape[1]
        lda = max(1, x.shape[-1])
        ldb = max(1, gy[0].shape[-1])
        ldc = max(1, m)
        sgemm(handle, False, True, m, n, k, 1, gy[0].data.ptr, ldb,
              x.data.ptr, lda, 1, gW.data.ptr, ldc)
        # Backprop input
        m, k = W.shape
        n, l = x.shape[0] * x.shape[1], gy[0].shape[2]
        lda = max(1, gy[0].shape[-1])
        ldb = max(1, W.shape[1])
        ldc = max(1, m)
        gx = cuda.cupy.empty_like(x)
        sgemm(handle, True, False, m, n, k, 1, W.data.ptr, ldb,
              gy[0].data.ptr, lda, 0, gx.data.ptr, ldc)
        # Backprop bias
        if len(inputs) > 2:
            gy_2d = _as_mat(gy[0])
            gb = gy_2d.sum(0)
            return gx, gW, gb
        else:
            return gx, gW
Project: deel    Author: uei
def bbox_transform_inv(boxes, deltas, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _bbox_transform_inv(boxes, deltas)
    else:
        return _bbox_transform_inv(boxes, deltas)
Project: deel    Author: uei
def clip_boxes(boxes, im_shape, gpu=-1):
    if gpu >= 0:
        with cuda.Device(gpu):
            return _clip_boxes(boxes, im_shape)
    else:
        return _clip_boxes(boxes, im_shape)
Project: trainer    Author: nutszebra
def run(self):
        dev = cuda.Device(self.device)
        dev.use()
        # build communication via nccl
        self.setup()
        gp = None
        p = multiprocessing.Pool(self.parallel_train)
        args_da = [self.da() for _ in six.moves.range(self.batch)]
        while True:
            job, data = self.pipe.recv()
            if job == 'finalize':
                dev.synchronize()
                break
            if job == 'update':
                # for reducing memory
                self.model.cleargrads()
                indices = list(self.sampling.yield_random_batch_from_category(1, self.picture_number_at_each_categories, self.batch, shuffle=True))[0]
                x = self.train_x[indices]
                t = self.train_y[indices]
                args = list(zip(x, t, args_da))
                processed = p.starmap(process_train, args)
                tmp_x, tmp_t = list(zip(*processed))
                train = True
                x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.device)
                t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.device)
                y = self.model(x, train=train)
                loss = self.model.calc_loss(y, t) / self.number_of_devices / self.train_batch_divide
                loss.backward()

                del x
                del t
                del y
                del loss

                # send gradients of self.model
                gg = gather_grads(self.model)
                null_stream = cuda.Stream.null
                self.communication.reduce(gg.data.ptr,
                                          gg.data.ptr,
                                          gg.size,
                                          nccl.NCCL_FLOAT,
                                          nccl.NCCL_SUM,
                                          0,
                                          null_stream.ptr)
                del gg
                self.model.cleargrads()
                # send parameters of self.model
                gp = gather_params(self.model)
                self.communication.bcast(gp.data.ptr,
                                         gp.size,
                                         nccl.NCCL_FLOAT,
                                         0,
                                         null_stream.ptr)
                scatter_params(self.model, gp)
                gp = None
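The worker's run() method shows the standard per-process pattern for multi-GPU training: construct a Device for this worker's GPU, call .use() so all subsequent CuPy allocations and kernels target it, and synchronize() it before shutting down. A stripped-down hedged sketch (worker_loop and the job handling are placeholders):

from chainer import cuda

def worker_loop(device_id, pipe):
    dev = cuda.Device(device_id)
    dev.use()                      # pin this process to its GPU
    while True:
        job, data = pipe.recv()
        if job == 'finalize':
            dev.synchronize()      # let all queued kernels finish before exit
            break
        # ... perform the per-iteration work on this device ...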
Project: trainer    Author: nutszebra
def update_core(self, x, t, p, args_da):
        self._send_message(('update', None))
        with cuda.Device(self.gpus[0]):
            self.model.cleargrads()
            args = list(zip(x, t, args_da))
            processed = p.starmap(process_train, args)
            tmp_x, tmp_t = list(zip(*processed))
            data_length = len(tmp_x)
            train = True
            x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.gpus[0])
            t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.gpus[0])
            y = self.model(x, train=train)
            loss = self.model.calc_loss(y, t) / len(self.gpus)
            loss.backward()
            loss.to_cpu()
            loss = float(loss.data) * data_length

            del x
            del t
            del y

            # NCCL: reduce grads
            null_stream = cuda.Stream.null
            if self.communication is not None:
                # send grads
                gg = gather_grads(self.model)
                self.communication.reduce(gg.data.ptr,
                                          gg.data.ptr,
                                          gg.size,
                                          nccl.NCCL_FLOAT,
                                          nccl.NCCL_SUM,
                                          0,
                                          null_stream.ptr)
                # copy the reduced grads, gg, back into self.model
                scatter_grads(self.model, gg)
                del gg
            self.optimizer.update()
            if self.communication is not None:
                gp = gather_params(self.model)
                self.communication.bcast(gp.data.ptr,
                                         gp.size,
                                         nccl.NCCL_FLOAT,
                                         0,
                                         null_stream.ptr)
        return loss
Project: trainer    Author: nutszebra
def run(self):
        dev = cuda.Device(self.device)
        dev.use()
        # build communication via nccl
        self.setup()
        gp = None
        da_args = [self.da() for _ in six.moves.range(self.batch)]
        p = multiprocessing.Pool(self.parallel)
        batch_of_batch = int(float(self.batch) / self.train_batch_divide)
        while True:
            job, data = self.pipe.recv()
            if job == 'finalize':
                dev.synchronize()
                break
            if job == 'update':
                # for reducing memory
                self.model.zerograds()
                indices = list(self.sampling.yield_random_batch_samples(1, self.batch, len(self.train_x), sort=False))[0]
                for ii in six.moves.range(0, len(indices), batch_of_batch):
                    x = self.train_x[indices[ii:ii + batch_of_batch]]
                    t = self.train_y[indices[ii:ii + batch_of_batch]]
                    args = list(six.moves.zip(x, t, da_args))
                    processed = p.starmap(process_train, args)
                    tmp_x, tmp_t = list(zip(*processed))
                    train = True
                    x = self.model.prepare_input(tmp_x, dtype=np.float32, volatile=not train, gpu=self.device)
                    t = self.model.prepare_input(tmp_t, dtype=np.int32, volatile=not train, gpu=self.device)
                    y = self.model(x, train=train)
                    loss = self.model.calc_loss(y, t) / self.number_of_devices / self.train_batch_divide
                    loss.backward()
                    del x
                    del t
                    del y
                    del loss

                # send gradients of self.model
                gg = gather_grads(self.model)
                null_stream = cuda.Stream.null
                self.communication.reduce(gg.data.ptr,
                                          gg.data.ptr,
                                          gg.size,
                                          nccl.NCCL_FLOAT,
                                          nccl.NCCL_SUM,
                                          0,
                                          null_stream.ptr)
                del gg
                self.model.zerograds()
                # send parameters of self.model
                gp = gather_params(self.model)
                self.communication.bcast(gp.data.ptr,
                                         gp.size,
                                         nccl.NCCL_FLOAT,
                                         0,
                                         null_stream.ptr)
                scatter_params(self.model, gp)
                gp = None