我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用theano.tensor.cast()。
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    """Build Adam update rules for `params` minimizing `cost`.

    Returns a list of (shared_variable, new_value) pairs suitable for
    theano.function(updates=...).  `mom1`/`mom2` are the first/second
    moment decay rates; `t` is a shared time-step counter starting at 1.
    """
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for param, grad in zip(params, grads):
        # per-parameter first and second moment accumulators, zero-initialized
        first_moment = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        second_moment = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        m_new = mom1 * first_moment + (1. - mom1) * grad
        v_new = mom2 * second_moment + (1. - mom2) * T.square(grad)
        # bias-corrected moment estimates
        m_hat = m_new / (1. - mom1 ** t)
        v_hat = v_new / (1. - mom2 ** t)
        step = m_hat / T.sqrt(v_hat + 1e-8)
        updates.append((first_moment, m_new))
        updates.append((second_moment, v_new))
        updates.append((param, param - lr * step))
    updates.append((t, t + 1))
    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    """Batch-normalize `input`.

    Deterministic mode uses the stored running statistics; training mode
    uses batch statistics and queues exponential-moving-average updates
    in ``self.bn_updates``.
    """
    if deterministic:
        # normalize with the running averages accumulated during training
        mean_bc = self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)
        stdv_bc = T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
        norm_features = (input - mean_bc) / stdv_bc
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # EMA updates of the statistics; the n/(n-1) factor gives the
        # unbiased variance estimate
        new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
        new_v = 0.9 * self.avg_batch_var + T.cast(
            (0.1 * input.shape[0]) / (input.shape[0] - 1), th.config.floatX) * batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    activation = norm_features
    if hasattr(self, 'g'):
        # optional learned per-feature scale
        activation = activation * self.g.dimshuffle(*self.dimshuffle_args)
    if hasattr(self, 'b'):
        # optional learned per-feature shift
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
def maxout2(x):
    """Maxout over pairs of adjacent units along the last axis of `x`.

    The last axis (assumed to have even length — TODO confirm with callers)
    is reshaped into (size // 2, 2) and the max is taken over each pair.
    Supports 1-D, 2-D and 3-D inputs; other ranks are returned unchanged.
    """
    shape = x.shape
    # Floor division: plain `/` is true division and would yield a float
    # scalar before the int64 cast.
    half = T.cast(shape[x.ndim - 1] // 2, 'int64')
    two = T.cast(2, 'int64')
    if x.ndim == 1:
        x = x.reshape([half, two])
        x = x.max(1)
    elif x.ndim == 2:
        x = x.reshape([shape[0], half, two])
        x = x.max(2)
    elif x.ndim == 3:
        x = x.reshape([shape[0], shape[1], half, two])
        x = x.max(3)
    return x
def dice_loss(y_pred, y_true, void_class, class_for_dice=1):
    """Dice loss -- works for only binary classes.

    y_pred is a softmax output, y_true is one hot.  Pixels whose true
    label is in `void_class` are excluded from the computation.
    """
    smooth = 1
    true_flat = T.cast(T.flatten(y_true[:, class_for_dice, :, :]), 'int32')
    pred_flat = T.flatten(y_pred[:, class_for_dice, :, :])
    # drop void-class pixels from both prediction and target
    for void_label in void_class:
        keep = T.neq(true_flat, void_label).nonzero()
        pred_flat = pred_flat[keep]
        true_flat = true_flat[keep]
    intersection = T.sum(true_flat * pred_flat)
    return -(2. * intersection + smooth) / (T.sum(true_flat) + T.sum(pred_flat) + smooth)
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Sum of per-property objectives, each weighted by its enable mask.

    Unweighted per-property means are also recorded in the module-level
    monitoring dicts (d_objectives / d_objectives_deterministic).
    """
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    total = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # record the unweighted mean for monitoring
        if deterministic:
            d_objectives_deterministic[obj_name] = T.mean(v_obj)
        else:
            d_objectives[obj_name] = T.mean(v_obj)
        # only samples with this target enabled contribute to training
        total += T.mean(enable_targets[:, obj_idx] * v_obj)
        unit_ptr += n_classes
    return total
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Sum of per-property objective means.

    For monitoring, each property's objective is also averaged over only
    the enabled targets (mask-weighted mean) and stored in the
    module-level dicts; the returned sum uses the plain mean.
    """
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    total = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # mean over enabled targets only; small constant avoids 0-division
        mask = enable_targets[:, obj_idx]
        obj_scalar = T.sum(mask * v_obj) / (0.00001 + T.sum(mask))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        total += T.mean(v_obj)
        unit_ptr += n_classes
    return total
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Sum of per-property objectives, each averaged over enabled targets.

    Each property's objective is averaged over only the samples whose
    enable mask is set, and that masked mean is both recorded for
    monitoring and accumulated into the returned total.
    """
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    total = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # mean over enabled targets only; small constant avoids 0-division
        mask = enable_targets[:, obj_idx]
        obj_scalar = T.sum(mask * v_obj) / (0.00001 + T.sum(mask))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        total += obj_scalar
        unit_ptr += n_classes
    return total
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Sum of per-property objectives (no enable-mask weighting here).

    Each property's objective is recorded in the monitoring dicts and
    added to the returned total.
    """
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        # Build the objective graph once and reuse it, instead of
        # constructing the identical expression three times.
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        if deterministic:
            d_objectives_deterministic[obj_name] = v_obj
        else:
            d_objectives[obj_name] = v_obj
        sum_of_objectives += v_obj
        unit_ptr += n_classes
    return sum_of_objectives
def shared_dataset(self, data_xy, train=False, borrow=True):
    """Load the data to the shared variables of Theano.

    Copies the data once into shared memory (on the GPU when available)
    and returns (shared_x, shared_y), with shared_y cast to int32 so it
    can be used as an index.
    """
    data_x, data_y = data_xy
    if train:
        dim_output = 10  # case of MNIST
        data_y = np.int32(self.labels(data_y, dim_output))
    shared_x = theano.shared(
        np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(
        np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def shared_dataset_xy(self, data_xy, nlabels=10, train=False, task="cls", borrow=True):
    """Load one (x, y) split into Theano shared variables.

    For classification training data the labels are first converted via
    self.labels(...).  Returns (shared_x, shared_y) with shared_y cast to
    int32 for use as indices.
    """
    data_x, data_y = data_xy
    if train and task == 'cls':
        data_y = np.int32(self.labels(data_y, nlabels))
    shared_x = theano.shared(
        np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(
        np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def shared_dataset_x(data_x, borrow=True):
    """Wrap `data_x` in a Theano shared variable (floatX).

    Keeping the dataset in a shared variable lets Theano copy it to GPU
    memory once, instead of transferring every minibatch (which would be
    slow).
    """
    shared_x = theano.shared(
        numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    return shared_x
def shared_dataset(data_x, data_y, borrow=True):
    """Load one dataset split into Theano shared variables.

    Shared variables let Theano copy the data to the GPU once instead of
    per minibatch.  Labels are stored as floatX (GPU requirement) but
    returned as an int32 view, since they are used as indices.
    """
    shared_x = theano.shared(
        numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(
        numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    """One CTC forward-pass step in log space over the active prefix.

    Returns (active_next, log_p_next): the new active length and the
    updated log-probabilities, computed from the previous step's values
    plus self-, next- and skip-transitions.
    """
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    # active region grows by one, or two past the furthest skip, capped
    # at the label-sequence length
    active_next = T.cast(
        T.minimum(
            T.maximum(
                active + 1,
                T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1,
            ),
            log_p_curr.shape[0],
        ),
        'int32',
    )
    # shift by the max before exponentiating, for numerical stability
    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev,
    )
    return active_next, log_p_next
def normalize_updates(old_mean, old_std, new_mean, new_std, old_W, old_b):
    """Compute the updates for normalizing the last (linear) layer of a
    neural network.

    Transforms W and b so that
    (W_old * h + b_old) * std_old + mean_old ==
    (W_new * h + b_new) * std_new + mean_new.
    """
    eps = 1e-6  # guard against division by a zero std
    new_W = old_W * old_std[0] / (new_std[0] + eps)
    new_b = (old_b * old_std[0] + old_mean[0] - new_mean[0]) / (new_std[0] + eps)
    return OrderedDict([
        (old_W, TT.cast(new_W, old_W.dtype)),
        (old_b, TT.cast(new_b, old_b.dtype)),
        (old_mean, new_mean),
        (old_std, new_std),
    ])
def get_recon_loss(self, idxs, sent_output):
    """Mean squared reconstruction loss over non-padding tokens.

    `idxs` are token ids; `sent_output` has shape
    (len_sent, len_doc_batch, n_d).
    """
    len_sent, len_doc_batch, n_d = sent_output.shape
    # flatten to (len(sent)*len(doc)*batch,)
    input_flat = idxs.ravel()
    true_recon = self.embedding_layer.recon_forward(input_flat)
    dropped = apply_dropout(sent_output, self.dropout)
    pred_recon = self.recon_layer.forward(
        dropped.reshape((len_sent * len_doc_batch, n_d)))
    # mask out padding positions so they contribute nothing to the loss
    mask = T.cast(T.neq(input_flat, self.padding_id), theano.config.floatX)
    per_token = T.sum((true_recon - pred_recon) ** 2, axis=1) * mask
    return T.sum(per_token) / T.sum(mask)
def create_updates(self, input):
    """Queue running mean/variance updates for batch normalization.

    mode 0 normalizes over axis 0 (dense layer); any other mode
    normalizes over axes (0, 2, 3) (convolutional layer).  Repeated calls
    chain off the previously queued values in ``self.updates``.
    """
    if self.mode == 0:
        now_mean = T.mean(input, axis=0)
        now_var = T.var(input, axis=0)
        batch = T.cast(input.shape[0], theano.config.floatX)
    else:
        now_mean = T.mean(input, axis=(0, 2, 3))
        now_var = T.var(input, axis=(0, 2, 3))
        batch = T.cast(input.shape[0] * input.shape[2] * input.shape[3],
                       theano.config.floatX)
    # start from the shared values on the first call, otherwise from the
    # values already queued
    if self.updates is None:
        prev_mean, prev_var = self.mean, self.var
    else:
        prev_mean, prev_var = self.updates[0][1], self.updates[1][1]
    # (batch+1)/batch applies the unbiased-variance correction
    new_mean = self.momentum * prev_mean + (1.0 - self.momentum) * now_mean
    new_var = self.momentum * prev_var + \
        (1.0 - self.momentum) * ((batch + 1.0) / batch * now_var)
    self.updates = [(self.mean, new_mean), (self.var, new_var)]
def __init__(self, rng, input, dropout_rate=0.5):
    """Dropout layer.

    input: output of last layer.  With dropout_rate > 0, units are
    randomly zeroed; otherwise the input passes through unchanged.
    """
    self.input = input
    self.dropout_rate = dropout_rate
    srng = T.shared_randomstreams.RandomStreams(rng.randint(999999))
    if self.dropout_rate > 0:
        # p = 1 - dropout_rate because 1's mark units to KEEP
        mask = srng.binomial(n=1, p=1 - self.dropout_rate, size=self.input.shape)
        # cast matters: int * float32 = float64, which pulls things off the GPU
        self.output = self.input * T.cast(mask, theano.config.floatX)
    else:
        self.output = input
def load_data(dataset):
    """Load a pickled (train, valid, test) dataset into Theano shared
    variables so it can be copied to the GPU once.

    Returns [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)].
    """
    # `with` guarantees the file is closed even if unpickling fails
    with gzip.open(dataset, 'rb') as f:
        train_set, valid_set, test_set = cPickle.load(f)

    def shared_dataset(data_xy, borrow=True):
        data_x, data_y = data_xy
        # stored as floatX so the data can live on the GPU
        shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                                 borrow=borrow)
        # labels are used as indices, so hand back an int32 view
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)
    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
def dual_copy_rounding(W, integer_bits=0, fractional_bits=1):
    """Rounding as described in "Robustness of spiking Deep Belief Networks
    to noise and reduced bit precision of neuro-inspired hardware
    platforms" by Stromatidis et al.
    See http://dx.doi.org/10.3389/fnins.2015.00222

    :param W: Weights
    :param integer_bits: number of bits to represent the integer part
    :param fractional_bits: number of bits to represent the fractional part
    :return: quantized weights
    """
    scale = T.cast(2. ** fractional_bits, theano.config.floatX)  # float!
    max_val = T.cast((2. ** (fractional_bits + integer_bits)) - 1,
                     theano.config.floatX)
    scaled = GradPreserveRoundTensor(W * scale)   # rounding
    clipped = T.clip(scaled, -max_val, max_val)   # saturation arithmetic
    return clipped / scale
def __init__(self, input, n_in, n_out, prob_drop=0.5, verbose=False):
    """Dropout layer with a global on/off switch.

    `flag_on` == 1.0 applies the random mask (training); 0.0 switches to
    the expectation `prob_keep * input` (inference).
    """
    self.verbose = verbose
    self.prob_drop = prob_drop
    self.prob_keep = 1.0 - prob_drop
    self.flag_on = theano.shared(np.cast[theano.config.floatX](1.0))
    self.flag_off = 1.0 - self.flag_on
    seed_this = DropoutLayer.seed_common.randint(0, 2**31-1)
    mask_rng = theano.tensor.shared_randomstreams.RandomStreams(seed_this)
    self.mask = mask_rng.binomial(n=1, p=self.prob_keep, size=input.shape)
    # cast keeps the product in floatX (int * float32 would be float64)
    self.output = \
        self.flag_on * T.cast(self.mask, theano.config.floatX) * input + \
        self.flag_off * self.prob_keep * input
    DropoutLayer.layers.append(self)
    if self.verbose:
        # print() call form is valid on both Python 2 and 3
        print('dropout layer with P_drop: ' + str(self.prob_drop))
def load_data_shared(filename="mnist.pkl.gz"):
    """Load the MNIST pickle into Theano shared variables.

    Returns [train, validation, test], each a (shared_x, shared_y) pair
    with shared_y cast to int32 for indexing.
    """
    # `with` guarantees the file is closed even if unpickling fails
    with gzip.open(filename, 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, encoding="latin1")

    def shared(data):
        """Place the data into shared variables.  This allows Theano to
        copy the data to the GPU, if one is available.
        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")

    return [shared(training_data), shared(validation_data), shared(test_data)]
def to_measure(self, q):
    """Represent the surface as a measure: one weighted dirac per triangle.

    Returns (x, mu): triangle barycenters and weights (half the norm of
    the edge cross product, i.e. the triangle area).
    """
    # the three vertices of every triangle
    a = q[self.connectivity[:, 0]]
    b = q[self.connectivity[:, 1]]
    c = q[self.connectivity[:, 2]]
    x = .33333333 * (a + b + c)  # barycenter of each triangle
    # cross product (b-a) x (c-a), expressed through a constant 9x3 matrix
    ab = (b - a).dimshuffle(0, 1, 'x')
    ac = (c - a).dimshuffle(0, 'x', 1)
    t = (ab * ac).reshape((self.connectivity.shape[0], 9))
    cp = t.dot(np.array([
        [0., 0., 0., 0., 0., 1., 0., -1., 0.],
        [0., 0., -1., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., -1., 0., 0., 0., 0., 0.],
    ]).T)
    mu = .5 * T.sqrt((cp ** 2).sum(1))  # weight per triangle
    mu = T.cast(mu, dtype=config.floatX)
    return (x, mu)
def to_varifold(self, q):
    """Represent the surface as a varifold: per-triangle position, weight
    and unit normal direction.

    Returns (x, mu, u): barycenters, weights (half the cross-product
    norm) and normalized normals.
    """
    # the three vertices of every triangle
    a = q[self.connectivity[:, 0]]
    b = q[self.connectivity[:, 1]]
    c = q[self.connectivity[:, 2]]
    x = .33333333 * (a + b + c)  # barycenter of each triangle
    # cross product (b-a) x (c-a), expressed through a constant 9x3 matrix
    ab = (b - a).dimshuffle(0, 1, 'x')
    ac = (c - a).dimshuffle(0, 'x', 1)
    t = (ab * ac).reshape((self.connectivity.shape[0], 9))
    cp = t.dot(np.array([
        [0., 0., 0., 0., 0., 1., 0., -1., 0.],
        [0., 0., -1., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., -1., 0., 0., 0., 0., 0.],
    ]).T)
    mu = T.sqrt((cp ** 2).sum(1))          # cross-product norm
    u = (cp / mu.dimshuffle(0, 'x'))       # normal direction
    mu = T.cast(.5 * mu, dtype=config.floatX)
    u = T.cast(u, dtype=config.floatX)
    return (x, mu, u)
def load_data(dataset):
    """Load a pickled (train, valid, test) dataset — optionally gzipped —
    into Theano shared variables.

    Returns [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)].
    """
    # pick the opener from the extension; `with` guarantees the file is
    # closed even if unpickling fails
    opener = gzip.open if dataset.split('.')[-1] == 'gz' else open
    with opener(dataset, 'r') as f:
        train_set, valid_set, test_set = pkl.load(f)

    def shared_dataset(data_xy, borrow=True):
        data_x, data_y = data_xy
        # stored as floatX so the data can live on the GPU
        shared_x = theano.shared(
            np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
        shared_y = theano.shared(
            np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
        # labels are used as indices, so hand back an int32 view
        return shared_x, T.cast(shared_y, 'int32')

    train_set_x, train_set_y = shared_dataset(train_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    test_set_x, test_set_y = shared_dataset(test_set)
    return [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
def shared_dataset(data_xy, borrow=True):
    """Load one (x, y) split into Theano shared variables.

    Shared variables let Theano copy the data to GPU memory once instead
    of per minibatch (which would be slow).  Labels must be stored as
    floatX on the GPU but are used as indices during computation, so the
    returned shared_y is an int32 cast of the underlying float storage.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(
        numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(
        numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    """Adamax updates (infinity-norm variant of Adam).

    Returns a list of (shared_variable, new_value) pairs for
    theano.function(updates=...).  When mom1 == 0 the momentum term is
    skipped and the raw gradient is used.
    """
    updates = []
    grads = T.grad(cost, params)
    for param, grad in zip(params, grads):
        # exponentially-weighted infinity norm and first moment, zero-init
        inf_norm = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        momentum = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        if mom1 > 0:
            m_t = mom1 * momentum + (1. - mom1) * grad
            updates.append((momentum, m_t))
        else:
            m_t = grad
        u_t = T.maximum(mom2 * inf_norm, abs(grad))
        step = m_t / (u_t + 1e-6)
        updates.append((inf_norm, u_t))
        updates.append((param, param - lr * step))
    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    """Batch-normalize `input`.

    Deterministic mode uses the stored running statistics; training mode
    uses batch statistics and queues EMA updates in ``self.bn_updates``.
    """
    if deterministic:
        # normalize with the running averages accumulated during training
        mean_bc = self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)
        stdv_bc = T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
        norm_features = (input - mean_bc) / stdv_bc
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # EMA updates of the statistics; n/(n-1) gives the unbiased variance
        new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
        new_v = 0.9 * self.avg_batch_var + T.cast(
            (0.1 * input.shape[0]) / (input.shape[0] - 1.), th.config.floatX) * batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    activation = norm_features
    if hasattr(self, 'g'):
        # optional learned per-feature scale
        activation = activation * self.g.dimshuffle(*self.dimshuffle_args)
    if hasattr(self, 'b'):
        # optional learned per-feature shift
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
def forward(self, x, mask, hc):
    """One masked LSTM step (forward direction).

    `hc` packs [cell; hidden] along the last axis; `mask` zeroes out
    positions past the sequence end.  Returns the new packed state.
    """
    n_out = self.n_out_t
    if hc.ndim > 1:
        c_tm1, h_tm1 = hc[:, :n_out], hc[:, n_out:]
    else:
        c_tm1, h_tm1 = hc[:n_out], hc[n_out:]
    in_t = self.in_gate.forward(x, h_tm1)
    forget_t = self.forget_gate.forward(x, h_tm1)
    out_t = self.out_gate.forward(x, h_tm1)
    c_t = forget_t * c_tm1 + in_t * self.input_layer.forward(x, h_tm1)
    # apply the sequence mask and pin the dtype to float32
    c_t = T.cast(c_t * mask.dimshuffle(0, 'x'), 'float32')
    h_t = out_t * T.tanh(c_t)
    h_t = T.cast(h_t * mask.dimshuffle(0, 'x'), 'float32')
    if hc.ndim > 1:
        return T.concatenate([c_t, h_t], axis=1)
    return T.concatenate([c_t, h_t])
def backward(self, x, mask, hc):
    """One masked LSTM step (backward direction, using the *_b gates).

    `hc` packs [cell; hidden] along the last axis; `mask` zeroes out
    positions past the sequence end.  Returns the new packed state.
    """
    n_out = self.n_out_t
    if hc.ndim > 1:
        c_tm1, h_tm1 = hc[:, :n_out], hc[:, n_out:]
    else:
        c_tm1, h_tm1 = hc[:n_out], hc[n_out:]
    in_t = self.in_gate_b.forward(x, h_tm1)
    forget_t = self.forget_gate_b.forward(x, h_tm1)
    out_t = self.out_gate_b.forward(x, h_tm1)
    c_t = forget_t * c_tm1 + in_t * self.input_layer_b.forward(x, h_tm1)
    # apply the sequence mask and pin the dtype to float32
    c_t = T.cast(c_t * mask.dimshuffle(0, 'x'), 'float32')
    h_t = out_t * T.tanh(c_t)
    h_t = T.cast(h_t * mask.dimshuffle(0, 'x'), 'float32')
    if hc.ndim > 1:
        return T.concatenate([c_t, h_t], axis=1)
    return T.concatenate([c_t, h_t])
def get_parent_state(self, children_states, node_type, use_dropout: bool, iteration_number) -> tuple:
    """Combine child states into a parent state plus an autoencoder-style
    regularization objective.

    Returns (parent_state, weighted_objective) where the objective weight
    ramps in over iterations via `constrain_intro_rate`.
    """
    layer_input = T.flatten(children_states)
    nn_out = self.__compute_layer_output(layer_input, node_type, use_dropout,
                                         iteration_number)
    # encode (children + parent output), then decode back
    encoder_input = T.flatten(T.concatenate((children_states, nn_out))) * self.__ae_noise
    encoding = T.tanh(T.dot(encoder_input, self.__encoder_weights[node_type]))
    decoded = T.tanh(T.dot(encoding, self.__decoder_weights))
    # rescale the reconstruction to the input's L2 norm
    decoded /= decoded.norm(2) / layer_input.norm(2)
    output_reconstruction = self.__compute_layer_output(decoded, node_type,
                                                        use_dropout, iteration_number)
    # similarity of reconstructed parent output and reconstructed children
    reconstruction_cos = T.dot(nn_out[0], output_reconstruction[0])
    children_reconstruction_cos = T.dot(decoded, layer_input)
    additional_objective = reconstruction_cos + children_reconstruction_cos
    # ramp the constraint in as training progresses
    constrain_usage_pct = T.cast(
        1. - T.pow(self.__hyperparameters['constrain_intro_rate'], iteration_number),
        theano.config.floatX)
    return nn_out[0], constrain_usage_pct * additional_objective
def conv2d_grad(topgrad, output_shape, filters, border_mode, strides):
    """Gradient of conv2d w.r.t. its inputs (a transposed convolution).

    Maps BorderMode.same to Theano's 'half' padding, which produces
    outputs of the same spatial dimensions as the input (odd filter
    sizes only).
    """
    if border_mode == BorderMode.same:
        # 'half' kernel width padding results in outputs of the same
        # dimensions as input
        border_mode = BorderMode.half
        assert filters.shape[2] % 2 == 1 and filters.shape[3] % 2 == 1, \
            "haven't handled even filter shapes for border mode 'half'"
    op = T.nnet.abstract_conv.AbstractConv2d_gradInputs(
        imshp=output_shape,
        kshp=filters.shape,
        subsample=strides,
        border_mode=border_mode,
        filter_flip=True)
    topgrad = T.cast(topgrad, dtype=theano.config.floatX)
    return op(kern=filters, topgrad=topgrad, shape=output_shape[2:])
def adam_conditional_updates(params, cost, mincost, lr=0.001, mom1=0.9, mom2=0.999):
    """Adam updates that are frozen once cost drops below `mincost`.

    Identical to plain Adam except every state update is wrapped in an
    ifelse: if cost < mincost, don't do the update.
    """
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    skip = cost < mincost  # when True, every state keeps its old value
    for param, grad in zip(params, grads):
        first_moment = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        second_moment = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        m_new = mom1 * first_moment + (1. - mom1) * grad
        v_new = mom2 * second_moment + (1. - mom2) * T.square(grad)
        # bias-corrected moment estimates
        m_hat = m_new / (1. - mom1 ** t)
        v_hat = v_new / (1. - mom2 ** t)
        p_new = param - lr * m_hat / T.sqrt(v_hat + 1e-8)
        updates.append((first_moment, ifelse(skip, first_moment, m_new)))
        updates.append((second_moment, ifelse(skip, second_moment, v_new)))
        updates.append((param, ifelse(skip, param, p_new)))
    updates.append((t, ifelse(skip, t, t + 1)))
    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    """Batch-normalize `input`; the nonlinearity is optional here.

    Deterministic mode uses the stored running statistics; training mode
    uses batch statistics and queues EMA updates in ``self.bn_updates``.
    """
    if deterministic:
        # normalize with the running averages accumulated during training
        mean_bc = self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)
        stdv_bc = T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
        norm_features = (input - mean_bc) / stdv_bc
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
        # EMA updates of the statistics; n/(n-1) gives the unbiased variance
        new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
        new_v = 0.9 * self.avg_batch_var + T.cast(
            (0.1 * input.shape[0]) / (input.shape[0] - 1), th.config.floatX) * batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
    activation = norm_features
    if hasattr(self, 'g'):
        # optional learned per-feature scale
        activation = activation * self.g.dimshuffle(*self.dimshuffle_args)
    if hasattr(self, 'b'):
        # optional learned per-feature shift
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    if self.nonlinearity is not None:
        return self.nonlinearity(activation)
    return activation
def shared_dataset(data_xy, borrow=True):
    """Load one (x, y) split into Theano shared variables.

    Shared variables let Theano copy the dataset to GPU memory once
    instead of per minibatch.  shared_y is stored as floatX (GPU
    requirement) and returned as an int32 view for use as indices.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(
        np.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(
        np.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')
def _buildModel(self):
    """Build the train/evaluate theano functions for the batch-normalized
    logistic regression model.

    `self.train` is only compiled when params['validate_only'] is False;
    `self.evaluate` is always compiled (validation-mode BN statistics).
    """
    self.updates_ack = True
    X = T.matrix('X', dtype=config.floatX)
    Y = T.matrix('Y', dtype=config.floatX)
    X.tag.test_value, Y.tag.test_value = self._fakeData()
    # training-mode forward pass (batch statistics)
    output_params_t = T.nnet.sigmoid(self._BNlayer(
        self.tWeights['W_lr'], self.tWeights['b_lr'], X,
        validation=False, onlyLinear=True))
    nll_t = T.nnet.binary_crossentropy(output_params_t, Y).sum()
    # validation-mode forward pass (running statistics)
    output_params_e = T.nnet.sigmoid(self._BNlayer(
        self.tWeights['W_lr'], self.tWeights['b_lr'], X,
        validation=True, onlyLinear=True))
    nll_e = T.nnet.binary_crossentropy(output_params_e, Y).sum()
    if not self.params['validate_only']:
        model_params = self._getModelParams()
        # print() call form is valid on both Python 2 and 3
        print(len(self.updates), ' extraneous updates')
        optimizer_up, norm_list = self._setupOptimizer(
            nll_t, model_params, lr=self.params['lr'],
            divide_grad=T.cast(X.shape[0], config.floatX))
        optimizer_up += self.updates
        self.train = theano.function(
            [X, Y],
            [nll_t, self.tWeights['_lr_BN_running_mean'],
             self.tWeights['_lr_BN_running_var']],
            updates=optimizer_up)
    self.evaluate = theano.function([X, Y], nll_e)
def softmax_and_sample(logits):
    """Sample one index per row from the softmax over the last axis.

    The logits are flattened to 2-D, sampled via a multinomial draw,
    reshaped back, and the sampled one-hot positions are returned as
    indices via argmax.
    """
    original_shape = logits.shape
    flat_logits = logits.reshape((-1, logits.shape[logits.ndim - 1]))
    one_hot = T.cast(
        srng.multinomial(pvals=T.nnet.softmax(flat_logits)),
        theano.config.floatX
    ).reshape(original_shape)
    return T.argmax(one_hot, axis=one_hot.ndim - 1)
def centered_softplus(x):
    """Softplus shifted down by log(2), so centered_softplus(0) == 0."""
    offset = np.cast[th.config.floatX](np.log(2.))
    return T.nnet.softplus(x) - offset