Python torch module: cuda() usage examples

We extracted the following 50 code examples from open-source Python projects to illustrate how the torch.cuda module and the .cuda() method of tensors and modules are used.
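As a quick orientation, here is a minimal sketch (assuming the 0.x-era APIs these snippets use) of the two idioms that recur throughout this listing: dispatching tensor constructors through torch.cuda, and moving tensors onto the GPU only when one is available.

    import torch

    use_cuda = torch.cuda.is_available()
    tt = torch.cuda if use_cuda else torch   # constructor dispatch, as in the Beam classes below

    scores = tt.FloatTensor(5).zero_()       # torch.cuda.FloatTensor on GPU, torch.FloatTensor on CPU

    x = torch.randn(2, 3)
    if use_cuda:
        x = x.cuda()                         # copy onto the current CUDA device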

Project: torch_light    Author: ne7ermore
def __init__(self, model_source="model", cuda=False):
        self.torch = torch.cuda if cuda else torch
        self.cuda = cuda
        if self.cuda:
            model_source = torch.load(model_source)
        else:
            model_source = torch.load(model_source, map_location=lambda storage, loc: storage)

        self.src_dict = model_source["src_dict"]
        self.trains_score = model_source["trains_score"]
        self.args = args = model_source["settings"]

        model = BiLSTM_Cut(args)
        model.load_state_dict(model_source['model'])

        if self.cuda:
            model = model.cuda()
            model.prob_projection = nn.Softmax().cuda()
        else:
            model = model.cpu()
            model.prob_projection = nn.Softmax().cpu()

        self.model = model.eval()
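The map_location=lambda storage, loc: storage argument above is what lets a GPU-trained checkpoint load on a CUDA-less machine: every storage stays on the CPU instead of being restored to its original device. A minimal sketch of the same pattern (the file name is hypothetical; the keys mirror the snippet above):

    import torch

    model_source = torch.load("model.pt",
                              map_location=lambda storage, loc: storage)  # force all storages onto the CPU
    state_dict = model_source["model"]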
Project: ladder    Author: abhiskk
def evaluate_performance(ladder, valid_loader, e, agg_cost_scaled, agg_supervised_cost_scaled,
                         agg_unsupervised_cost_scaled, args):
    correct = 0.
    total = 0.
    for batch_idx, (data, target) in enumerate(valid_loader):
        if args.cuda:
            data = data.cuda()
        data, target = Variable(data), Variable(target)
        output = ladder.forward_encoders_clean(data)
        # TODO: Do away with the below hack for GPU tensors.
        if args.cuda:
            output = output.cpu()
            target = target.cpu()
        output = output.data.numpy()
        preds = np.argmax(output, axis=1)
        target = target.data.numpy()
        correct += np.sum(target == preds)
        total += target.shape[0]

    print("Epoch:", e + 1, "\t",
          "Total Cost:", "{:.4f}".format(agg_cost_scaled), "\t",
          "Supervised Cost:", "{:.4f}".format(agg_supervised_cost_scaled), "\t",
          "Unsupervised Cost:", "{:.4f}".format(agg_unsupervised_cost_scaled), "\t",
          "Validation Accuracy:", correct / total)
Project: pytorch-dist    Author: apaszke
def _generate_typedefs():
    typedefs = []
    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
        for lib in ['TH', 'THCuda']:
            for kind in ['Tensor', 'Storage']:
                python_name = t + kind
                if t == 'Float' and lib == 'THCuda':
                    th_name = 'THCuda' + kind
                else:
                    th_name = lib + t + kind
                th_struct = 'struct ' + th_name

                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
                module = torch if lib == 'TH' else torch.cuda
                python_class = getattr(module, python_name)
                _cffi_to_torch[th_struct] = python_class
                _torch_to_cffi[python_class] = th_struct
    return '\n'.join(typedefs) + '\n'
Project: pytorch-dist    Author: apaszke
def _setup_wrapper(with_cuda):
    here = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(here, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]

    wrapper_source = '#include <TH/TH.h>\n'
    if with_cuda:
        import torch.cuda
        wrapper_source += '#include <THC/THC.h>\n'
        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
        include_dirs.extend(cuda_include_dirs)
    return wrapper_source, include_dirs
Project: pytorch-dist    Author: apaszke
def test_gpu(self):
        compile_extension(
                name='gpulib',
                header=test_dir + '/ffi/src/cuda/cudalib.h',
                sources=[
                    test_dir + '/ffi/src/cuda/cudalib.c',
                ],
                with_cuda=True,
                verbose=False,
        )
        import gpulib
        tensor = torch.ones(2, 2).float()

        gpulib.good_func(tensor, 2, 1.5)
        self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5)

        ctensor = tensor.cuda().fill_(1)
        gpulib.cuda_func(ctensor, 2, 1.5)
        self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)

        self.assertRaises(TypeError,
                lambda: gpulib.cuda_func(tensor, 2, 1.5))
        self.assertRaises(TypeError,
                lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
Project: pytorch-dist    Author: apaszke
def test_serialization(self):
        x = torch.randn(5, 5).cuda()
        y = torch.IntTensor(2, 5).fill_(0).cuda()
        q = [x, y, x, y.storage()]
        with tempfile.NamedTemporaryFile() as f:
            torch.save(q, f)
            f.seek(0)
            q_copy = torch.load(f)
        self.assertEqual(q_copy, q, 0)
        q_copy[0].fill_(5)
        self.assertEqual(q_copy[0], q_copy[2], 0)
        self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
        self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
        self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
        self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
        q_copy[1].fill_(10)
        self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
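Two properties of torch.save/torch.load that this test relies on: CUDA tensors are restored onto the GPU, and duplicated entries still share one storage after loading. A minimal sketch, assuming a CUDA device is present:

    import tempfile
    import torch

    x = torch.randn(3, 3).cuda()
    with tempfile.NamedTemporaryFile() as f:
        torch.save([x, x], f)
        f.seek(0)
        a, b = torch.load(f)
    a.fill_(7)                     # b changes too: both entries share one storage
    assert bool((b == 7).all())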
Project: pytorch-dist    Author: apaszke
def _test_gather(self, dim):
        if torch.cuda.device_count() < 2:
            raise unittest.SkipTest("only one GPU detected")
        x = torch.randn(2, 5).cuda(0)
        y = torch.randn(2, 5).cuda(1)
        result = comm.gather((x, y), dim)

        expected_size = list(x.size())
        expected_size[dim] += y.size(dim)
        expected_size = torch.Size(expected_size)
        self.assertEqual(result.get_device(), 0)
        self.assertEqual(result.size(), expected_size)

        index = [slice(None, None), slice(None, None)]
        index[dim] = slice(0, x.size(dim))
        self.assertEqual(result[tuple(index)], x)
        index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim))
        self.assertEqual(result[tuple(index)], y)
Project: pytorch-dist    Author: apaszke
def test_cuda(self, test_case):
        if not TEST_CUDA or not self.should_test_cuda:
            raise unittest.SkipTest('Excluded from CUDA tests')
        try:
            cpu_input = self._get_input()
            type_map = {
                torch.DoubleTensor: torch.cuda.FloatTensor,
            }
            gpu_input = to_gpu(cpu_input, type_map=type_map)

            cpu_target = self.target
            gpu_target = to_gpu(self.target, type_map=type_map)

            cpu_module = self.constructor(*self.constructor_args)
            gpu_module = self.constructor(*self.constructor_args).float().cuda()

            cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target)
            gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target)
            test_case.assertEqual(cpu_output, gpu_output, 2e-4)

            cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target)
            gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target)
            test_case.assertEqual(cpu_gradInput, gpu_gradInput, 2e-4)
        except NotImplementedError:
            pass
Project: pytorch-dist    Author: apaszke
def test_print(self):
        for t in torch._tensor_classes:
            if t.is_cuda and not torch.cuda.is_available():
                continue
            obj = t(100, 100).fill_(1)
            obj.__repr__()
            str(obj)
        for t in torch._storage_classes:
            if t.is_cuda and not torch.cuda.is_available():
                continue
            obj = t(100).fill_(1)
            obj.__repr__()
            str(obj)

        x = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
        x.__repr__()
        str(x)
Project: pytorch-dist    Author: apaszke
def test_reduce_scatter(self):
        in_size = 32 * nGPUs
        out_size = 32

        inputs = [torch.FloatTensor(in_size).uniform_() for i in range(nGPUs)]
        expected = torch.FloatTensor(in_size).zero_()
        for t in inputs:
            expected.add_(t)
        expected = expected.view(nGPUs, 32)

        inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
        outputs = [torch.cuda.FloatTensor(out_size, device=i)
                   for i in range(nGPUs)]
        nccl.reduce_scatter(inputs, outputs)

        for i in range(nGPUs):
            self.assertEqual(outputs[i], expected[i])
Project: SentEval    Author: facebookresearch
def trainepoch(self, X, y, nepoches=1):
        self.model.train()
        for _ in range(self.nepoch, self.nepoch + nepoches):
            permutation = np.random.permutation(len(X))
            all_costs = []
            for i in range(0, len(X), self.batch_size):
                # forward
                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
                Xbatch = Variable(X.index_select(0, idx))
                ybatch = Variable(y.index_select(0, idx))
                output = self.model(Xbatch)
                # loss
                loss = self.loss_fn(output, ybatch)
                all_costs.append(loss.data[0])
                # backward
                self.optimizer.zero_grad()
                loss.backward()
                # Update parameters
                self.optimizer.step()
        self.nepoch += nepoches
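The inner loop above follows the canonical PyTorch update sequence: zero_grad, forward, backward, step. The same skeleton in modern PyTorch (no Variable wrapper needed), with a hypothetical toy model:

    import torch
    import torch.nn as nn

    model = nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    xb, yb = torch.randn(8, 10), torch.randint(0, 2, (8,))

    optimizer.zero_grad()                               # clear gradients from the last step
    loss = nn.functional.cross_entropy(model(xb), yb)
    loss.backward()                                     # accumulate fresh gradients
    optimizer.step()                                    # apply the update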
Project: torch_light    Author: ne7ermore
def __init__(self, model_source, cuda=False, beam_size=3):
        self.torch = torch.cuda if cuda else torch
        self.cuda = cuda
        self.beam_size = beam_size

        if self.cuda:
            model_source = torch.load(model_source)
        else:
            model_source = torch.load(model_source, map_location=lambda storage, loc: storage)
        self.src_dict = model_source["src_dict"]
        self.tgt_dict = model_source["tgt_dict"]
        self.src_idx2word = {v: k for k, v in model_source["tgt_dict"].items()}
        self.args = args = model_source["settings"]
        model = Transformer(args)
        model.load_state_dict(model_source['model'])

        if self.cuda: model = model.cuda()
        else: model = model.cpu()
        self.model = model.eval()
Project: torch_light    Author: ne7ermore
def sent2tenosr(self, sentence):
        max_len = self.args.max_word_len - 2
        sentence = normalizeString(sentence)
        words = [w for w in sentence.strip().split()]

        if len(words) > max_len:
            words = words[:max_len]

        words = [WORD[BOS]] + words + [WORD[EOS]]
        idx = [self.src_dict[w] if w in self.src_dict else UNK for w in words]

        idx_data = torch.LongTensor(idx)
        idx_position = torch.LongTensor([pos_i+1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)])
        idx_data_tensor = Variable(idx_data.unsqueeze(0), volatile=True)
        idx_position_tensor = Variable(idx_position.unsqueeze(0), volatile=True)

        if self.cuda:
            idx_data_tensor = idx_data_tensor.cuda()
            idx_position_tensor = idx_position_tensor.cuda()

        return idx_data_tensor, idx_position_tensor
Project: torch_light    Author: ne7ermore
def __init__(self, model_source, cuda=False, beam_size=3):
        self.torch = torch.cuda if cuda else torch
        self.cuda = cuda
        self.jb = Jieba("./segmenter_dicts", useSynonym=True, HMM=False)
        self.swf = StopwordFilter("./segmenter_dicts/stopwords.txt")

        model_source = torch.load(model_source)
        self.src_dict = model_source["src_dict"]
        self.tgt_dict = model_source["tgt_dict"]
        self.src_idx2ind = {v: k for k, v in model_source["tgt_dict"].items()}
        self.args = args = model_source["settings"]
        model = CNN_Ranking(args)
        model.load_state_dict(model_source['model'])

        if self.cuda:
            model = model.cuda()
        else:
            model = model.cpu()
        self.model = model.eval()
Project: torch_light    Author: ne7ermore
def __init__(self, model_source, cuda=False):
        self.torch = torch.cuda if cuda else torch
        self.cuda = cuda
        if self.cuda:
            model_source = torch.load(model_source)
        else:
            model_source = torch.load(model_source, map_location=lambda storage, loc: storage)

        self.src_dict = model_source["src_dict"]
        self.trains_score = model_source["trains_score"]
        self.args = args = model_source["settings"]

        model = BiLSTM_CRF_Size(args)
        model.load_state_dict(model_source['model'])

        if self.cuda:
            model = model.cuda()
            model.prob_projection = nn.Softmax().cuda()
        else:
            model = model.cpu()
            model.prob_projection = nn.Softmax().cpu()

        self.model = model.eval()
Project: attention-is-all-you-need-pytorch    Author: jadore801120
def __init__(self, size, cuda=False):

        self.size = size
        self.done = False

        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()
        self.all_scores = []

        # The backpointers at each time-step.
        self.prev_ks = []

        # The outputs at each time-step.
        self.next_ys = [self.tt.LongTensor(size).fill_(Constants.PAD)]
        self.next_ys[0][0] = Constants.BOS
Project: fairseq-py    Author: facebookresearch
def __init__(self, args, model, criterion, device_ids=None,
                 multiprocessing_method='spawn'):
        if device_ids is None:
            device_ids = tuple(range(torch.cuda.device_count()))
        super().__init__(device_ids, multiprocessing_method)

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')
        model = model.share_memory()
        nccl_uid = nccl.get_unique_id()
        self.criterion = criterion

        Future.gen_list([
            self.call_async(rank, '_async_init', args=args, model=model,
                            criterion=criterion, nccl_uid=nccl_uid)
            for rank in range(self.num_replicas)
        ])

        self._grads_initialized = False
Project: fairseq-py    Author: facebookresearch
def _async_init(self, rank, device_id, args, model, criterion, nccl_uid):
        """Initialize child processes."""
        self.args = args

        # set CUDA device
        torch.cuda.set_device(device_id)

        # initialize NCCL
        nccl.initialize(self.num_replicas, nccl_uid, device_id)

        # copy model and criterion to current device
        self.model = model.cuda()
        self.criterion = criterion.cuda()

        # initialize optimizer and LR scheduler
        self.args.lr = list(map(float, self.args.lr.split(',')))
        self.optimizer = self._build_optimizer()
        self.lr_scheduler = self._build_lr_scheduler()

        self.loss = None
        self._max_bsz_seen = 0
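torch.cuda.set_device(device_id) changes the default CUDA device, so the bare .cuda() calls that follow land on that GPU rather than GPU 0. A minimal sketch, assuming at least two GPUs:

    import torch

    torch.cuda.set_device(1)
    t = torch.randn(2, 2).cuda()   # allocated on GPU 1, not GPU 0
    assert t.get_device() == 1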
Project: fairseq-py    Author: facebookresearch
def _async_forward(self, rank, device_id, eval=False):
        if eval:
            self.model.eval()
        else:
            self.model.train()
            self.optimizer.zero_grad()

        sample_size, logging_output, oom = 0, {}, False
        if self._sample is not None:
            try:
                # calculate loss and sample size
                self.loss, sample_size, logging_output = self.criterion(self.model, self._sample)
            except RuntimeError as e:
                if not eval and 'out of memory' in str(e):
                    print('| WARNING: ran out of memory on GPU #{}, skipping batch'.format(device_id))
                    oom = True
                    self.loss = None
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    raise e

        return sample_size, logging_output, oom
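The recovery pattern above — catch the RuntimeError, drop references to the failed batch, then release cached blocks — generalizes beyond this trainer. A minimal sketch with a hypothetical step function (the hasattr guard matters only on PyTorch builds older than 0.3, which introduced empty_cache):

    import torch

    def safe_step(step, batch):
        try:
            return step(batch)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise
            print('| WARNING: CUDA OOM, skipping batch')
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()   # return cached blocks to the driver
            return None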
Project: NeuralMT    Author: hlt-mt
def __init__(self, size, cuda=False):

        self.size = size
        self.done = False

        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()
        self.allScores = []

        # The backpointers at each time-step.
        self.prevKs = []

        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size).fill_(onmt.Constants.PAD)]
        self.nextYs[0][0] = onmt.Constants.BOS

        # The attentions (matrix) for each time.
        self.attn = []
Project: NeuralMT    Author: hlt-mt
def buildData(self, srcBatch, goldBatch):
        # This needs to be the same as preprocess.py.
        if self._type == "text":
            srcData = [self.src_dict.convertToIdx(b,
                                                  onmt.Constants.UNK_WORD)
                       for b in srcBatch]
        elif self._type == "img":
            srcData = [transforms.ToTensor()(
                Image.open(self.opt.src_img_dir + "/" + b[0]))
                       for b in srcBatch]

        tgtData = None
        if goldBatch:
            tgtData = [self.tgt_dict.convertToIdx(b,
                       onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD,
                       onmt.Constants.EOS_WORD) for b in goldBatch]

        return onmt.Dataset(srcData, tgtData, self.opt.batch_size,
                            self.opt.cuda, volatile=True,
                            data_type=self._type)
Project: e2e-model-learning    Author: locuslab
def __init__(self, X, Y, hidden_layer_sizes):
        super(Net, self).__init__()

        # Initialize linear layer with least squares solution
        X_ = np.hstack([X, np.ones((X.shape[0],1))])
        Theta = np.linalg.solve(X_.T.dot(X_), X_.T.dot(Y))

        self.lin = nn.Linear(X.shape[1], Y.shape[1])
        W,b = self.lin.parameters()
        W.data = torch.Tensor(Theta[:-1,:].T)
        b.data = torch.Tensor(Theta[-1,:])

        # Set up non-linear network of 
        # Linear -> BatchNorm -> ReLU -> Dropout layers
        layer_sizes = [X.shape[1]] + hidden_layer_sizes
        layers = reduce(operator.add, 
            [[nn.Linear(a,b), nn.BatchNorm1d(b), nn.ReLU(), nn.Dropout(p=0.2)] 
                for a,b in zip(layer_sizes[0:-1], layer_sizes[1:])])
        layers += [nn.Linear(layer_sizes[-1], Y.shape[1])]
        self.net = nn.Sequential(*layers)
        self.sig = Parameter(torch.ones(1, Y.shape[1]).cuda())
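The closed-form initialization above solves the normal equations (XᵀX)Θ = XᵀY, so the linear layer starts at the least-squares fit. An equivalent, better-conditioned way to get the same Θ (a sketch with random placeholder data):

    import numpy as np

    X = np.random.randn(100, 3)
    Y = np.random.randn(100, 2)
    X_ = np.hstack([X, np.ones((X.shape[0], 1))])    # append a bias column
    Theta, *_ = np.linalg.lstsq(X_, Y, rcond=None)   # same solution as solving the normal equations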
Project: e2e-model-learning    Author: locuslab
def __init__(self, params, eps=1e-2):
        super(SolveNewsvendor, self).__init__()
        k = len(params['d'])
        self.Q = Variable(torch.diag(torch.Tensor(
            [params['c_quad']] + [params['b_quad']]*k + [params['h_quad']]*k)) \
                .cuda())
        self.p = Variable(torch.Tensor(
            [params['c_lin']] + [params['b_lin']]*k + [params['h_lin']]*k) \
                .cuda())
        self.G = Variable(torch.cat([
            torch.cat([-torch.ones(k,1), -torch.eye(k), torch.zeros(k,k)], 1),
            torch.cat([torch.ones(k,1), torch.zeros(k,k), -torch.eye(k)], 1),
            -torch.eye(1 + 2*k)], 0).cuda())
        self.h = Variable(torch.Tensor(
            np.concatenate([-params['d'], params['d'], np.zeros(1+ 2*k)])).cuda())
        self.one = Variable(torch.Tensor([1])).cuda()
        self.eps_eye = eps * Variable(torch.eye(1 + 2*k).cuda()).unsqueeze(0)
Project: e2e-model-learning    Author: locuslab
def forward(self, y):
        nBatch, k = y.size()

        Q_scale = torch.cat([torch.diag(torch.cat(
            [self.one, y[i], y[i]])).unsqueeze(0) for i in range(nBatch)], 0)
        Q = self.Q.unsqueeze(0).expand_as(Q_scale).mul(Q_scale)
        p_scale = torch.cat([Variable(torch.ones(nBatch,1).cuda()), y, y], 1)
        p = self.p.unsqueeze(0).expand_as(p_scale).mul(p_scale)
        G = self.G.unsqueeze(0).expand(nBatch, self.G.size(0), self.G.size(1))
        h = self.h.unsqueeze(0).expand(nBatch, self.h.size(0))
        e = Variable(torch.Tensor().cuda()).double()

        out = QPFunction(verbose=False)\
            (Q.double(), p.double(), G.double(), h.double(), e, e).float()

        return out[:,:1]
Project: alpha-dimt-icmlws    Author: sotetsuk
def __init__(self, size, cuda=False):

        self.size = size
        self.done = False

        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()

        # The backpointers at each time-step.
        self.prevKs = []

        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size).fill_(onmt.Constants.PAD)]
        self.nextYs[0][0] = onmt.Constants.BOS

        # The attentions (matrix) for each time.
        self.attn = []

    # Get the outputs for the current timestep.
Project: pytorch    Author: tylergenter
def _generate_typedefs():
    typedefs = []
    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
        for lib in ['TH', 'THCuda']:
            for kind in ['Tensor', 'Storage']:
                python_name = t + kind
                if t == 'Float' and lib == 'THCuda':
                    th_name = 'THCuda' + kind
                else:
                    th_name = lib + t + kind
                th_struct = 'struct ' + th_name

                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
                module = torch if lib == 'TH' else torch.cuda
                python_class = getattr(module, python_name)
                _cffi_to_torch[th_struct] = python_class
                _torch_to_cffi[python_class] = th_struct
    return '\n'.join(typedefs) + '\n'
Project: pytorch    Author: tylergenter
def _setup_wrapper(with_cuda):
    here = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(here, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]

    wrapper_source = '#include <TH/TH.h>\n'
    if with_cuda:
        import torch.cuda
        wrapper_source += '#include <THC/THC.h>\n'
        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
        include_dirs.extend(cuda_include_dirs)
    return wrapper_source, include_dirs
Project: pytorch    Author: tylergenter
def test_cuda_small_tensors(self):
        # Check multiple small tensors which will likely use the same
        # underlying cached allocation
        ctx = mp.get_context('spawn')
        tensors = []
        for i in range(5):
            tensors += [torch.arange(i * 5, (i + 1) * 5).cuda()]

        inq = ctx.Queue()
        outq = ctx.Queue()
        inq.put(tensors)
        p = ctx.Process(target=sum_tensors, args=(inq, outq))
        p.start()

        results = []
        for i in range(5):
            results.append(outq.get())
        p.join()

        for i, tensor in enumerate(tensors):
            v, device, tensor_size, storage_size = results[i]
            self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
            self.assertEqual(device, 0)
            self.assertEqual(tensor_size, 5)
            self.assertEqual(storage_size, 5)
Project: pytorch    Author: tylergenter
def test_event(self):
        ctx = mp.get_context('spawn')
        queue = ctx.Queue()
        ready = ctx.Event()
        done = ctx.Event()
        p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
        p.start()

        ready.wait()
        with torch.cuda.stream(torch.cuda.Stream()):
            tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
            # Use a sleep kernel to test events. Without the event, the
            # multiply happens before the add.
            event = torch.cuda.Event(interprocess=True)
            torch.cuda._sleep(20000000)  # about 30 ms
            tensor.add_(1)
            event.record()
            queue.put((event, tensor))
            done.wait()  # must wait until subprocess records event
            event.synchronize()
            self.assertEqual(list(tensor), [4, 4, 4, 4])
        p.join()
Project: pytorch    Author: tylergenter
def test_gpu(self):
        compile_extension(
            name='gpulib',
            header=test_dir + '/ffi/src/cuda/cudalib.h',
            sources=[
                test_dir + '/ffi/src/cuda/cudalib.c',
            ],
            with_cuda=True,
            verbose=False,
        )
        import gpulib
        tensor = torch.ones(2, 2).float()

        gpulib.good_func(tensor, 2, 1.5)
        self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5)

        ctensor = tensor.cuda().fill_(1)
        gpulib.cuda_func(ctensor, 2, 1.5)
        self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)

        self.assertRaises(TypeError,
                          lambda: gpulib.cuda_func(tensor, 2, 1.5))
        self.assertRaises(TypeError,
                          lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
Project: pytorch    Author: tylergenter
def test_copy_device(self):
        x = torch.randn(5, 5).cuda()
        with torch.cuda.device(1):
            y = x.cuda()
            self.assertEqual(y.get_device(), 1)
            self.assertIs(y.cuda(), y)
            z = y.cuda(0)
            self.assertEqual(z.get_device(), 0)
            self.assertIs(z.cuda(0), z)

        x = torch.randn(5, 5)
        with torch.cuda.device(1):
            y = x.cuda()
            self.assertEqual(y.get_device(), 1)
            self.assertIs(y.cuda(), y)
            z = y.cuda(0)
            self.assertEqual(z.get_device(), 0)
            self.assertIs(z.cuda(0), z)
Project: pytorch    Author: tylergenter
def test_broadcast_coalesced(self):
        numel = 5
        num_bytes = numel * 8
        tensors = [
            torch.randn(numel).long().cuda(),
            torch.randn(numel).cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
            torch.randn(numel).cuda(),
        ]

        b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors]
        for (_, bt), t in zip(b_tensors, tensors):
            self.assertEqual(bt.get_device(), 1)
            self.assertEqual(bt, t)
            self.assertIsInstance(bt, type(t))

        bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=num_bytes * 5 // 2)
        bc_tensors_t = list(zip(*bc_tensors))
        self.assertEqual(b_tensors, bc_tensors_t)
        for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t):
            self.assertEqual(bt.get_device(), bct.get_device())
            self.assertIsInstance(bct, type(bt))
Project: pytorch    Author: tylergenter
def test_streams(self):
        default_stream = torch.cuda.current_stream()
        user_stream = torch.cuda.Stream()
        self.assertEqual(torch.cuda.current_stream(), default_stream)
        self.assertNotEqual(default_stream, user_stream)
        self.assertEqual(default_stream.cuda_stream, 0)
        self.assertNotEqual(user_stream.cuda_stream, 0)
        with torch.cuda.stream(user_stream):
            self.assertEqual(torch.cuda.current_stream(), user_stream)
        self.assertTrue(user_stream.query())
        # copy a 10 MB tensor from CPU to GPU, which should take some time
        tensor1 = torch.ByteTensor(10000000).pin_memory()
        tensor2 = tensor1.cuda(async=True)
        self.assertFalse(default_stream.query())
        default_stream.synchronize()
        self.assertTrue(default_stream.query())
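A side note on the async=True keyword used here: async became a reserved word in Python 3.7, and PyTorch renamed the argument to non_blocking. The modern spelling of the same pinned-memory copy:

    import torch

    tensor1 = torch.empty(10 * 1000 * 1000, dtype=torch.uint8).pin_memory()
    tensor2 = tensor1.cuda(non_blocking=True)   # overlaps the copy with work on the current stream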
Project: pytorch    Author: tylergenter
def test_caching_pinned_memory(self):
        cycles_per_ms = get_cycles_per_ms()

        # check that allocations are re-used after deletion
        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        del t
        t = torch.FloatTensor([1]).pin_memory()
        self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')

        # check that the allocation is not re-used if it's in-use by a copy
        gpu_tensor = torch.cuda.FloatTensor([0])
        torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
        gpu_tensor.copy_(t, async=True)
        del t
        t = torch.FloatTensor([1]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
        self.assertEqual(list(gpu_tensor), [1])
Project: pytorch    Author: tylergenter
def test_caching_pinned_memory_multi_gpu(self):
        # checks that the events preventing pinned memory from being re-used
        # too early are recorded on the correct GPU
        cycles_per_ms = get_cycles_per_ms()

        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
        gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

        with torch.cuda.device(1):
            torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
            gpu_tensor1.copy_(t, async=True)

        del t
        t = torch.FloatTensor([2]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

        with torch.cuda.device(0):
            gpu_tensor0.copy_(t, async=True)

        self.assertEqual(gpu_tensor1[0], 1)
        self.assertEqual(gpu_tensor0[0], 2)
Project: pytorch    Author: tylergenter
def test_serialization_map_location(self):
        DATA_URL = 'https://download.pytorch.org/test_data/gpu_tensors.pt'
        data_dir = os.path.join(os.path.dirname(__file__), 'data')
        test_file_path = os.path.join(data_dir, 'gpu_tensors.pt')
        succ = download_file(DATA_URL, test_file_path)
        if not succ:
            warnings.warn(
                "Couldn't download the test file for map_location! "
                "Tests will be incomplete!", RuntimeWarning)
            return

        def map_location(storage, loc):
            return storage

        tensor = torch.load(test_file_path, map_location=map_location)
        self.assertEqual(type(tensor), torch.FloatTensor)
        self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))

        tensor = torch.load(test_file_path, map_location={'cuda:0': 'cpu'})
        self.assertEqual(type(tensor), torch.FloatTensor)
        self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))
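Later PyTorch versions also accept a plain string or torch.device for map_location, which covers the common remap-everything-to-CPU case more tersely:

    import torch

    tensor = torch.load('gpu_tensors.pt', map_location='cpu')  # same effect as {'cuda:0': 'cpu'} here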
Project: pytorch    Author: tylergenter
def test_print(self):
        for t in torch._tensor_classes:
            if t in torch.sparse._sparse_tensor_classes:
                continue
            if t.is_cuda and not torch.cuda.is_available():
                continue
            obj = t(100, 100).fill_(1)
            obj.__repr__()
            str(obj)
        for t in torch._storage_classes:
            if t.is_cuda and not torch.cuda.is_available():
                continue
            obj = t(100).fill_(1)
            obj.__repr__()
            str(obj)

        x = torch.Tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1])
        x.__repr__()
        str(x)
Project: pytorch    Author: tylergenter
def test_reduce_scatter(self):
        in_size = 32 * nGPUs
        out_size = 32

        inputs = [torch.FloatTensor(in_size).uniform_() for i in range(nGPUs)]
        expected = torch.FloatTensor(in_size).zero_()
        for t in inputs:
            expected.add_(t)
        expected = expected.view(nGPUs, 32)

        inputs = [inputs[i].cuda(i) for i in range(nGPUs)]
        outputs = [torch.cuda.FloatTensor(out_size, device=i)
                   for i in range(nGPUs)]
        nccl.reduce_scatter(inputs, outputs)

        for i in range(nGPUs):
            self.assertEqual(outputs[i], expected[i])
Project: Seq2Seq-PyTorch    Author: MaximumEntropy
def __init__(self, size, vocab, cuda=False):
        """Initialize params."""
        self.size = size
        self.done = False
        self.pad = vocab['<pad>']
        self.bos = vocab['<s>']
        self.eos = vocab['</s>']
        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()

        # The backpointers at each time-step.
        self.prevKs = []

        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size).fill_(self.pad)]
        self.nextYs[0][0] = self.bos

        # The attentions (matrix) for each time.
        self.attn = []

    # Get the outputs for the current timestep.
Project: pytorch-coriander    Author: hughperkins
def _generate_typedefs():
    typedefs = []
    for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']:
        for lib in ['TH', 'THCuda']:
            for kind in ['Tensor', 'Storage']:
                python_name = t + kind
                if t == 'Float' and lib == 'THCuda':
                    th_name = 'THCuda' + kind
                else:
                    th_name = lib + t + kind
                th_struct = 'struct ' + th_name

                typedefs += ['typedef {} {};'.format(th_struct, th_name)]
                module = torch if lib == 'TH' else torch.cuda
                python_class = getattr(module, python_name)
                _cffi_to_torch[th_struct] = python_class
                _torch_to_cffi[python_class] = th_struct
    return '\n'.join(typedefs) + '\n'
Project: pytorch-coriander    Author: hughperkins
def _setup_wrapper(with_cuda):
    here = os.path.abspath(os.path.dirname(__file__))
    lib_dir = os.path.join(here, '..', '..', 'lib')
    include_dirs = [
        os.path.join(lib_dir, 'include'),
        os.path.join(lib_dir, 'include', 'TH'),
    ]

    wrapper_source = '#include <TH/TH.h>\n'
    if with_cuda:
        import torch.cuda
        wrapper_source += '#include <THC/THC.h>\n'
        cuda_include_dirs = glob.glob('/usr/local/cuda/include')
        cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include')
        include_dirs.append(os.path.join(lib_dir, 'include', 'THC'))
        include_dirs.extend(cuda_include_dirs)
    return wrapper_source, include_dirs
Project: pytorch-coriander    Author: hughperkins
def test_cuda_small_tensors(self):
        # Check multiple small tensors which will likely use the same
        # underlying cached allocation
        ctx = mp.get_context('spawn')
        tensors = []
        for i in range(5):
            tensors += [torch.arange(i * 5, (i + 1) * 5).cuda()]

        inq = ctx.Queue()
        outq = ctx.Queue()
        inq.put(tensors)
        p = ctx.Process(target=sum_tensors, args=(inq, outq))
        p.start()

        results = []
        for i in range(5):
            results.append(outq.get())
        p.join()

        for i, tensor in enumerate(tensors):
            v, device, tensor_size, storage_size = results[i]
            self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
            self.assertEqual(device, 0)
            self.assertEqual(tensor_size, 5)
            self.assertEqual(storage_size, 5)
Project: pytorch-coriander    Author: hughperkins
def test_event(self):
        ctx = mp.get_context('spawn')
        queue = ctx.Queue()
        ready = ctx.Event()
        done = ctx.Event()
        p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
        p.start()

        ready.wait()
        with torch.cuda.stream(torch.cuda.Stream()):
            tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
            # Use a sleep kernel to test events. Without the event, the
            # multiply happens before the add.
            event = torch.cuda.Event(interprocess=True)
            torch.cuda._sleep(20000000)  # about 30 ms
            tensor.add_(1)
            event.record()
            queue.put((event, tensor))
            done.wait()  # must wait until subprocess records event
            event.synchronize()
            self.assertEqual(list(tensor), [4, 4, 4, 4])
        p.join()
Project: pytorch-coriander    Author: hughperkins
def test_gpu(self):
        compile_extension(
            name='gpulib',
            header=test_dir + '/ffi/src/cuda/cudalib.h',
            sources=[
                test_dir + '/ffi/src/cuda/cudalib.c',
            ],
            with_cuda=True,
            verbose=False,
        )
        import gpulib
        tensor = torch.ones(2, 2).float()

        gpulib.good_func(tensor, 2, 1.5)
        self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5)

        ctensor = tensor.cuda().fill_(1)
        gpulib.cuda_func(ctensor, 2, 1.5)
        self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5)

        self.assertRaises(TypeError,
                          lambda: gpulib.cuda_func(tensor, 2, 1.5))
        self.assertRaises(TypeError,
                          lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5))
Project: pytorch-coriander    Author: hughperkins
def test_copy_device(self):
        x = torch.randn(5, 5).cuda()
        with torch.cuda.device(1):
            y = x.cuda()
            self.assertEqual(y.get_device(), 1)
            self.assertIs(y.cuda(), y)
            z = y.cuda(0)
            self.assertEqual(z.get_device(), 0)
            self.assertIs(z.cuda(0), z)

        x = torch.randn(5, 5)
        with torch.cuda.device(1):
            y = x.cuda()
            self.assertEqual(y.get_device(), 1)
            self.assertIs(y.cuda(), y)
            z = y.cuda(0)
            self.assertEqual(z.get_device(), 0)
            self.assertIs(z.cuda(0), z)
Project: pytorch-coriander    Author: hughperkins
def test_broadcast_coalesced(self):
        numel = 5
        num_bytes = numel * 8
        tensors = [
            torch.randn(numel).long().cuda(),
            torch.randn(numel).cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
            torch.randn(numel).cuda(),
        ]

        b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors]
        for (_, bt), t in zip(b_tensors, tensors):
            self.assertEqual(bt.get_device(), 1)
            self.assertEqual(bt, t)
            self.assertIsInstance(bt, type(t))

        bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=num_bytes * 5 // 2)
        bc_tensors_t = list(zip(*bc_tensors))
        self.assertEqual(b_tensors, bc_tensors_t)
        for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t):
            self.assertEqual(bt.get_device(), bct.get_device())
            self.assertIsInstance(bct, type(bt))
Project: pytorch-coriander    Author: hughperkins
def test_streams(self):
        default_stream = torch.cuda.current_stream()
        user_stream = torch.cuda.Stream()
        self.assertEqual(torch.cuda.current_stream(), default_stream)
        self.assertNotEqual(default_stream, user_stream)
        self.assertEqual(default_stream.cuda_stream, 0)
        self.assertNotEqual(user_stream.cuda_stream, 0)
        with torch.cuda.stream(user_stream):
            self.assertEqual(torch.cuda.current_stream(), user_stream)
        self.assertTrue(user_stream.query())
        # copy a 10 MB tensor from CPU to GPU, which should take some time
        tensor1 = torch.ByteTensor(10000000).pin_memory()
        tensor2 = tensor1.cuda(async=True)
        self.assertFalse(default_stream.query())
        default_stream.synchronize()
        self.assertTrue(default_stream.query())
Project: pytorch-coriander    Author: hughperkins
def test_caching_pinned_memory(self):
        cycles_per_ms = get_cycles_per_ms()

        # check that allocations are re-used after deletion
        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        del t
        t = torch.FloatTensor([1]).pin_memory()
        self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')

        # check that the allocation is not re-used if it's in-use by a copy
        gpu_tensor = torch.cuda.FloatTensor([0])
        torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
        gpu_tensor.copy_(t, async=True)
        del t
        t = torch.FloatTensor([1]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
        self.assertEqual(list(gpu_tensor), [1])
Project: pytorch-coriander    Author: hughperkins
def test_caching_pinned_memory_multi_gpu(self):
        # checks that the events preventing pinned memory from being re-used
        # too early are recorded on the correct GPU
        cycles_per_ms = get_cycles_per_ms()

        t = torch.FloatTensor([1]).pin_memory()
        ptr = t.data_ptr()
        gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
        gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

        with torch.cuda.device(1):
            torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
            gpu_tensor1.copy_(t, async=True)

        del t
        t = torch.FloatTensor([2]).pin_memory()
        self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

        with torch.cuda.device(0):
            gpu_tensor0.copy_(t, async=True)

        self.assertEqual(gpu_tensor1[0], 1)
        self.assertEqual(gpu_tensor0[0], 2)
Project: pytorch-coriander    Author: hughperkins
def test_serialization_map_location(self):
        DATA_URL = 'https://download.pytorch.org/test_data/gpu_tensors.pt'
        data_dir = os.path.join(os.path.dirname(__file__), 'data')
        test_file_path = os.path.join(data_dir, 'gpu_tensors.pt')
        succ = download_file(DATA_URL, test_file_path)
        if not succ:
            warnings.warn(
                "Couldn't download the test file for map_location! "
                "Tests will be incomplete!", RuntimeWarning)
            return

        def map_location(storage, loc):
            return storage

        tensor = torch.load(test_file_path, map_location=map_location)
        self.assertEqual(type(tensor), torch.FloatTensor)
        self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))

        tensor = torch.load(test_file_path, map_location={'cuda:0': 'cpu'})
        self.assertEqual(type(tensor), torch.FloatTensor)
        self.assertEqual(tensor, torch.FloatTensor([[1.0, 2.0], [3.0, 4.0]]))