We extracted the following 47 code examples from open-source Python projects to illustrate how to use chainer.functions.broadcast_to().
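Before the project snippets, a minimal sketch of the call itself may help (the shapes and values here are illustrative only and are not taken from any project below): chainer.functions.broadcast_to(x, shape) repeats the input along new or size-1 axes so that its shape becomes shape, and in the backward pass the gradient is summed over the broadcast axes back to x's original shape.

import numpy as np
import chainer.functions as F

x = np.arange(3, dtype=np.float32)   # shape (3,)
y = F.broadcast_to(x, (4, 3))        # Variable of shape (4, 3); every row repeats x
print(y.shape)                       # (4, 3)
print(y.data[0])                     # [0. 1. 2.]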
def _context(self, p, fb_mat, fbe_mat):
    batch_size, source_length, _ = fb_mat.data.shape
    # {pe,e}_mat: shape = [batch * srclen, atten]
    pe_mat = F.reshape(
        F.broadcast_to(
            F.expand_dims(self.p_e(p), 1),
            [batch_size, source_length, self.atten_size]),
        [batch_size * source_length, self.atten_size])
    e_mat = F.tanh(fbe_mat + pe_mat)
    # a_mat: shape = [batch, srclen]
    a_mat = F.softmax(F.reshape(self.e_a(e_mat), [batch_size, source_length]))
    # q: shape = [batch, 2 * hidden]
    q = F.reshape(
        F.batch_matmul(a_mat, fb_mat, transa=True),
        [batch_size, 2 * self.hidden_size])
    return q
def attend(self, query, key, value, mask, minfs=None):
    """
    Input shapes:
        q=(b, units, dec_l), k=(b, units, enc_l),
        v=(b, units, dec_l, enc_l), m=(b, dec_l, enc_l)
    """
    # Calculate Attention Scores with Mask for Zero-padded Areas
    pre_a = F.batch_matmul(query, key, transa=True)  # (b, dec_l, enc_l)
    minfs = self.xp.full(pre_a.shape, -np.inf, pre_a.dtype) \
        if minfs is None else minfs
    pre_a = F.where(mask, pre_a, minfs)
    a = F.softmax(pre_a, axis=2)
    # if values in axis=2 are all -inf, they become nan. thus do re-mask.
    a = F.where(self.xp.isnan(a.data),
                self.xp.zeros(a.shape, dtype=a.dtype), a)
    reshaped_a = a[:, None]  # (b, 1, dec_xl, enc_l)

    # Calculate Weighted Sum
    pre_c = F.broadcast_to(reshaped_a, value.shape) * value
    c = F.sum(pre_c, axis=3, keepdims=True)  # (b, units, dec_xl, 1)
    return c
def __call__(self, x, context):
    x = F.broadcast_to(x[:, None], (context.shape[0], context.shape[1]))
    x = F.reshape(x, (context.shape[0] * context.shape[1],))
    if args.subword == 'rnn':
        context = context.reshape((context.shape[0] * context.shape[1]))
        e = self.rnn.charRNN(context)
    if args.subword == 'none':
        e = self.embed(context)
        e = F.reshape(e, (e.shape[0] * e.shape[1], e.shape[2]))
    loss = self.loss_func(e, x)
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, y, a, ht, y_lex):
    y_dict = F.squeeze(F.batch_matmul(y_lex, a, transa=True), axis=2)
    return (y + F.log(y_dict + self.alpha))

#class LinearInterpolationLexicon(chainer.Chain):
#    def __init__(self, hidden_size):
#        super(LinearInterpolationLexicon, self).__init__(
#            perceptron = chainer.links.Linear(hidden_size, 1)
#        )
#
#    def __call__(self, y, a, ht, y_lex):
#        y      = F.softmax(y)
#        y_dict = F.squeeze(F.batch_matmul(y_lex, a, transa=True), axis=2)
#        gamma  = F.broadcast_to(F.sigmoid(self.perceptron(ht)), y_dict.data.shape)
#        return (gamma * y_dict + (1 - gamma) * y)
#
def __call__(self, x1, x2):
    xp = self.xp
    out_size = self.out_size
    batch_size, len1, dim1 = x1.shape
    if not self.nobias[0]:
        x1 = F.concat((x1, xp.ones((batch_size, len1, 1),
                                   dtype=xp.float32)), axis=2)
        dim1 += 1
    len2, dim2 = x2.shape[1:]
    if not self.nobias[1]:
        x2 = F.concat((x2, xp.ones((batch_size, len2, 1),
                                   dtype=xp.float32)), axis=2)
        dim2 += 1
    x1_reshaped = F.reshape(x1, (batch_size * len1, dim1))
    W_reshaped = F.reshape(F.transpose(self.W, (0, 2, 1)),
                           (dim1, out_size * dim2))
    affine = F.reshape(F.matmul(x1_reshaped, W_reshaped),
                       (batch_size, len1 * out_size, dim2))
    biaffine = F.transpose(
        F.reshape(batch_matmul(affine, x2, transb=True),
                  (batch_size, len1, out_size, len2)),
        (0, 1, 3, 2))
    if not self.nobias[2]:
        biaffine += F.broadcast_to(self.b, biaffine.shape)
    return biaffine
def term_bias(self, bs, train=True):
    """ Compute overall bias and broadcast to shape of batchsize """
    shape = (bs, 1,)
    # Bias is drawn from a Gaussian with given mu and log variance
    bs_mu = F.broadcast_to(self.bias_mu.b, shape)
    bs_lv = F.broadcast_to(self.bias_lv.b, shape)
    bias = F.flatten(F.gaussian(bs_mu, bs_lv))
    # Add a very negative log variance so we're sampling
    # from a very narrow distribution about the mean.
    # Useful for validation dataset when we want to only guess
    # the mean.
    if not train:
        bs_lv += self.lv_floor
    # Compute prior on the bias, so compute the KL div
    # from the KL(N(mu_bias, var_bias) | N(0, 1))
    kld = F.gaussian_kl_divergence(self.bias_mu.b, self.bias_lv.b)
    return bias, kld
def term_feat(self, iloc, jloc, ival, jval, bs, nf, train=True):
    # Change all of the shapes to form interaction vectors
    shape = (bs, nf * 2, self.n_dim)
    feat_mu_vec = F.broadcast_to(self.feat_mu_vec.b, shape)
    feat_lv_vec = F.broadcast_to(self.feat_lv_vec.b, shape)
    if not train:
        feat_lv_vec += self.lv_floor
    # Construct the interaction mean and variance
    # iloc is (bs, nf), feat(iloc) is (bs, nf, ndim) and
    # dot(feat, feat) is (bs, nf)
    ivec = F.gaussian(feat_mu_vec + self.feat_delta_mu(iloc),
                      feat_lv_vec + self.feat_delta_lv(iloc))
    jvec = F.gaussian(feat_mu_vec + self.feat_delta_mu(jloc),
                      feat_lv_vec + self.feat_delta_lv(jloc))
    # feat is (bs, )
    feat = dot(F.sum(ivec * jvec, axis=2), ival * jval)
    # Compute the KLD for the group mean vector and variance vector
    kld1 = F.gaussian_kl_divergence(self.feat_mu_vec.b, self.feat_lv_vec.b)
    # Compute the KLD for vector deviations from the group mean and var
    kld2 = F.gaussian_kl_divergence(self.feat_delta_mu.W, self.feat_delta_lv.W)
    return feat, kld1 + kld2
def __call__(self, x):
    minibatch_size = x.shape[0]
    activation = F.reshape(self.t(x), (-1, self.n_kernels, self.kernel_dim))
    activation_ex = F.expand_dims(activation, 3)
    activation_ex_t = F.expand_dims(F.transpose(activation, (1, 2, 0)), 0)
    activation_ex, activation_ex_t = F.broadcast(activation_ex, activation_ex_t)
    diff = activation_ex - activation_ex_t
    xp = chainer.cuda.get_array_module(x.data)
    eps = F.expand_dims(xp.eye(minibatch_size, dtype=xp.float32), 1)
    eps = F.broadcast_to(eps, (minibatch_size, self.n_kernels, minibatch_size))
    sum_diff = F.sum(abs(diff), axis=2)
    sum_diff = F.broadcast_to(sum_diff, eps.shape)
    abs_diff = sum_diff + eps
    minibatch_features = F.sum(F.exp(-abs_diff), 2)
    return F.concat((x, minibatch_features), axis=1)
def __call__(self, x):
    xp = chainer.cuda.get_array_module(x.data)
    batchsize = x.shape[0]
    if self.train_weights == False and self.initial_T is not None:
        self.T.W.data = self.initial_T

    M = F.reshape(self.T(x), (-1, self.num_kernels, self.ndim_kernel))
    M = F.expand_dims(M, 3)
    M_T = F.transpose(M, (3, 1, 2, 0))
    M, M_T = F.broadcast(M, M_T)

    norm = F.sum(abs(M - M_T), axis=2)
    eraser = F.broadcast_to(
        xp.eye(batchsize, dtype=x.dtype).reshape((batchsize, 1, batchsize)),
        norm.shape)
    c_b = F.exp(-(norm + 1e6 * eraser))
    o_b = F.sum(c_b, axis=2)

    if self.train_weights == False:
        self.initial_T = self.T.W.data

    return F.concat((x, o_b), axis=1)
def __call__(self, v, h, label):
    v_t = self.vertical_conv_t(v)
    v_s = self.vertical_conv_s(v)
    to_vertical_t = self.v_to_h_conv_t(v_t)
    to_vertical_s = self.v_to_h_conv_s(v_s)

    # v_gate = self.vertical_gate_conv(v)
    # label bias is added to both vertical and horizontal conv
    # here we take only shape as it should be the same
    label = F.broadcast_to(
        F.expand_dims(F.expand_dims(self.label(label), -1), -1), v_t.shape)
    v_t, v_s = v_t + label, v_s + label
    v = F.tanh(v_t) * F.sigmoid(v_s)

    h_t = self.horizontal_conv_t(h)
    h_s = self.horizontal_conv_s(h)
    h_t, h_s = h_t + to_vertical_t + label, h_s + to_vertical_s + label
    h = self.horizontal_output(F.tanh(h_t) * F.sigmoid(h_s))

    return v, h
def ordinal_loss(y, mask):
    xp = cuda.get_array_module(y.data)
    volatile = y.volatile
    b, c, n = y.data.shape
    max_y = F.broadcast_to(F.max(y, axis=1, keepdims=True), y.data.shape)
    y = y - max_y
    sum_y = F.broadcast_to(F.expand_dims(F.sum(y, axis=1), 1), y.data.shape)
    down_tri = np.tri(c, dtype=np.float32)
    up_tri = down_tri.T
    w1 = Variable(xp.asarray(down_tri.reshape(c, c, 1, 1)), volatile=volatile)
    w2 = Variable(xp.asarray(up_tri.reshape(c, c, 1, 1)), volatile=volatile)
    h = F.exp(F.expand_dims(y, -1))
    h1 = F.convolution_2d(h, w1)
    h1 = F.convolution_2d(F.log(h1), w1)
    h2 = F.convolution_2d(h, w2)
    h2 = F.convolution_2d(F.log(h2), w2)
    h = F.reshape(h1 + h2, (b, c, n))
    return F.sum((h - sum_y - y) * mask) / b
def __forward(self, batch_x, batch_t, weight, train=True):
    xp = self.xp
    x = Variable(xp.asarray(batch_x), volatile=not train)
    t = Variable(xp.asarray(batch_t), volatile=not train)
    y = self.net(x, train=train)
    b, c, n = y.data.shape
    mask = Variable(
        xp.asarray(np.broadcast_to(weight.reshape(-1, 1, 1), (b, c, n)) *
                   loss_mask(batch_t, self.net.rating_num)),
        volatile=not train)
    if self.ordinal_weight == 0:
        loss = F.sum(-F.log_softmax(y) * mask) / b
    elif self.ordinal_weight == 1:
        loss = ordinal_loss(y, mask)
    else:
        loss = (1 - self.ordinal_weight) * F.sum(-F.log_softmax(y) * mask) / b \
            + self.ordinal_weight * ordinal_loss(y, mask)
    acc = self.__accuracy(y, t)
    return loss, acc
def propup(self, vis):
    """ This function propagates the visible units activation upwards to the hidden units
    Eq.(7)
    :param vis: Variable Matrix(batch_size, in_channels, image_height, image_width)
        - given v_sample
    :return: Variable Matrix(batch_size, out_channels, image_height_out, image_width_out)
        - probability for each hidden units to be h_i=1
    """
    # conv.W: Matrix(out_channels, in_channels, filter height=ksize, filter width=ksize)
    # conv.b: Vec (out_channels, )
    if self.real == 0:
        pre_sigmoid_activation = self.conv(vis)
    else:
        pre_sigmoid_activation = self.conv(vis / self.std_ch)
    # F.matmul(vis, self.conv.W, transb=True) + F.broadcast_to(self.conv.b, (vis.data.shape[0], self.n_hidden))
    return F.sigmoid(pre_sigmoid_activation)
def propdown(self, hid):
    """ This function propagates the hidden units activation downwards to the visible units
    :param hid: Variable Matrix(batch_size, out_channels, image_height_out, image_width_out)
        - given h_sample
    :return: Variable Matrix(batch_size, in_channels, image_height, image_width)
        - probability for each visible units to be v_j = 1
    """
    batch_size = hid.data.shape[0]
    if self.real == 0:
        W_flipped = F.swapaxes(CF.flip(self.conv.W, axes=(2, 3)), axis1=0, axis2=1)
        pre_sigmoid_activation = F.convolution_2d(
            hid, W_flipped, self.conv.a, pad=self.ksize-1)
        # F.matmul(hid, self.l.W) + F.broadcast_to(self.l.a, (batch_size, self.n_visible))
        v_mean = F.sigmoid(pre_sigmoid_activation)
        #print('W info ', self.conv.W.data.shape, 'W_flipped info ', W_flipped.data.shape)
        #print('W info ', self.conv.W.data[3, 0, 2, 3], 'W_flipped info ', W_flipped.data[0, 3, 8, 7])
        #print('W info ', self.conv.W.data[3, 0, 8, 7], 'W_flipped info ', W_flipped.data[0, 3, 2, 3])
        #print('W info ', self.conv.W.data[19, 0, 4, 0], 'W_flipped info ', W_flipped.data[0, 19, 6, 10])
        #print('pre_sigmoidactivation', F.sum(pre_sigmoid_activation).data)
        #print('v_mean', v_mean.data.shape)
        #print('v_mean sum', F.sum(v_mean).data)
        #print('hid', hid.data.shape)
    else:
        # TODO: check
        W_flipped = F.swapaxes(CF.flip(self.conv.W, axes=(2, 3)), axis1=0, axis2=1)
        v_mean = F.convolution_2d(hid, W_flipped, self.conv.a, pad=self.ksize-1)
    return v_mean
def reconstruct(self, v):
    """
    :param v: Variable Matrix(batch_size, in_channels, image_height, image_width)
    :return: reconstructed_v, Variable Matrix(batch_size, in_channels, image_height, image_width)
    """
    batch_size = v.data.shape[0]
    xp = cuda.get_array_module(v.data)
    if self.real == 0:
        h = F.sigmoid(self.conv(v))
    else:
        std_ch = xp.reshape(self.std, (1, self.in_channels, 1, 1))
        h = F.sigmoid(self.conv(v / std_ch))
    # F.sigmoid(F.matmul(v, self.l.W, transb=True) + F.broadcast_to(self.l.b, (batch_size, self.n_hidden)))
    W_flipped = F.swapaxes(CF.flip(self.conv.W, axes=(2, 3)), axis1=0, axis2=1)
    reconstructed_v = F.sigmoid(
        F.convolution_2d(h, W_flipped, self.conv.a, pad=self.ksize-1))
    # = F.sigmoid(F.matmul(h, self.l.W) + F.broadcast_to(self.l.a, (batch_size, self.n_visible)))
    return reconstructed_v
def clip_actions(actions, min_action, max_action):
    min_actions = F.broadcast_to(min_action, actions.shape)
    max_actions = F.broadcast_to(max_action, actions.shape)
    return F.maximum(F.minimum(actions, max_actions), min_actions)
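A hypothetical call to the clip_actions helper above; the batch of actions and the per-dimension bounds are made up for illustration:

import numpy as np

actions = np.array([[-2.0, 0.3], [1.5, -0.1]], dtype=np.float32)  # (batch, action_dim)
min_action = np.array([-1.0, -1.0], dtype=np.float32)
max_action = np.array([1.0, 1.0], dtype=np.float32)
clipped = clip_actions(actions, min_action, max_action)
# the bounds are broadcast over the batch axis, so
# clipped.data == [[-1.0, 0.3], [1.0, -0.1]]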
def compute_mean_and_var(self, x):
    h = x
    for layer in self.hidden_layers:
        h = self.nonlinearity(layer(h))
    mean = self.mean_layer(h)
    if self.bound_mean:
        mean = bound_by_tanh(mean, self.min_action, self.max_action)
    var = F.broadcast_to(F.softplus(self.var_layer(h)),
                         mean.shape) + self.min_var
    return mean, var
def __call__(self, x):
    mean = self.hidden_layers(x)
    var = F.broadcast_to(
        F.softplus(self.var_param), mean.shape)
    return distribution.GaussianDistribution(mean, var)
def instance_norm(self, x, gamma=None, beta=None):
    mean = F.mean(x, axis=-1)
    mean = F.mean(mean, axis=-1)
    mean = F.broadcast_to(mean[Ellipsis, None, None], x.shape)
    var = F.squared_difference(x, mean)
    std = F.sqrt(var + 1e-5)
    x_hat = (x - mean) / std
    if gamma is not None:
        gamma = F.broadcast_to(gamma[None, Ellipsis, None, None], x.shape)
        beta = F.broadcast_to(beta[None, Ellipsis, None, None], x.shape)
        return gamma * x_hat + beta
    else:
        return x_hat
def prepare_decoding(self, state, lengths, train=True):
    state = super().prepare_decoding(state, lengths, train=train)
    x = state['x']
    h = state['h']
    c = F.broadcast_to(self.encoder.c0, (self.batchsize, self.dim_hid))
    lengths = lengths.astype(np.float32)
    lengths = lengths.reshape((self.batchsize, 1))
    c = c * lengths
    return {'x': x, 'c': c, 'h': h}
def _attend(self, p):
    p = self.xh(p)
    p = F.expand_dims(p, 1)
    p = F.broadcast_to(p, self.shape2)
    h = F.tanh(self.h + p)
    shape3 = (self.batchsize * self.src_len, self.dim_hid)
    h_reshaped = F.reshape(h, shape3)
    weight_reshaped = self.hw(h_reshaped)
    weight = F.reshape(weight_reshaped, (self.batchsize, self.src_len, 1))
    weight = F.where(self.mask, weight, self.minf)
    attention = F.softmax(weight)
    return attention
def __call__(self, x):
    return functions.broadcast_to(x, self.shape)
def setUp(self):
    self.x1 = numpy.random.uniform(
        .5, 1, (batch_size, m, k)).astype(numpy.float32)
    self.x2 = numpy.random.uniform(
        .5, 1, (1, k, n)).astype(numpy.float32)
    self.gy = numpy.random.uniform(
        -1, 1, (batch_size, m, n)).astype(numpy.float32)
    self.op = lambda x, y: F.batch_matmul(
        x, F.broadcast_to(y, (batch_size, k, n)))
    self.forward_answer = numpy.array([
        numpy.dot(self.x1[i], self.x2[0])
        for i in six.moves.range(batch_size)])
def setUp(self):
    self.x1 = numpy.random.uniform(
        .5, 1, (batch_size, m, k)).astype(numpy.float32)
    self.x2 = numpy.random.uniform(
        .5, 1, (k, n)).astype(numpy.float32)
    self.gy = numpy.random.uniform(
        -1, 1, (batch_size, m, n)).astype(numpy.float32)
    self.op = lambda x, y: F.batch_matmul(
        x, F.broadcast_to(F.expand_dims(y, 0), (batch_size, k, n)))
    self.forward_answer = numpy.array([
        numpy.dot(self.x1[i], self.x2)
        for i in six.moves.range(batch_size)])
def check_forward(self, data):
    x = chainer.Variable(data)
    bx = functions.broadcast_to(x, self.out_shape)
    self.assertEqual(bx.data.shape, self.out_shape)
def test_type_check(self):
    x = chainer.Variable(self.data)
    with self.assertRaises(type_check.InvalidType):
        functions.broadcast_to(x, self.out_shape)
def squared_distance_matrix(X):
    n = X.shape[0]
    XX = F.sum(X ** 2.0, axis=1)
    distances = -2.0 * F.linear(X, X)
    distances = distances + F.broadcast_to(XX, (n, n))
    distances = distances + F.broadcast_to(F.expand_dims(XX, 1), (n, n))
    return distances
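The two broadcast_to calls above add the squared norms once along the rows and once along the columns, so entry (i, j) of the result equals -2*x_i.x_j + ||x_j||^2 + ||x_i||^2 = ||x_i - x_j||^2. A quick sanity check of that identity with made-up data (the array sizes are arbitrary):

import numpy as np

X = np.random.rand(5, 3).astype(np.float32)
D = squared_distance_matrix(X)  # chainer Variable of shape (5, 5)
ref = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2)
assert np.allclose(D.data, ref, atol=1e-5)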
def angular_mc_loss(f, f_p, alpha=45, in_degree=True):
    '''
    Args:
        f (chainer.Variable or xp.ndarray): Anchor vectors.
            Each vector in f must be l2 normalized.
        f_p (chainer.Variable or xp.ndarray): Positive vectors.
            Each vector in f must be l2 normalized.
    '''
    xp = cuda.get_array_module(f)

    if in_degree:
        alpha = np.deg2rad(alpha)
    sq_tan_alpha = np.tan(alpha) ** 2
    n_pairs = len(f)

    # first and second term of f_{a,p,n}
    term1 = 4 * sq_tan_alpha * matmul(f + f_p, transpose(f_p))
    term2 = 2 * (1 + sq_tan_alpha) * F.sum(f * f_p, axis=1, keepdims=True)
    # term2 = 2 * (1 + sq_tan_alpha) * F.batch_matmul(f, f_p, transa=True).reshape(n_pairs, 1)

    f_apn = term1 - F.broadcast_to(term2, (n_pairs, n_pairs))
    # multiply zero to diagonal components of f_apn
    mask = xp.ones_like(f_apn.data) - xp.eye(n_pairs, dtype=f.dtype)
    f_apn = f_apn * mask

    return F.average(F.logsumexp(f_apn, axis=1))
def __call__(self, x, context):
    e = self.embed(context)
    shape = e.shape
    x = F.broadcast_to(x[:, None], (shape[0], shape[1]))
    e = F.reshape(e, (shape[0] * shape[1], shape[2]))
    x = F.reshape(x, (shape[0] * shape[1],))
    loss = self.loss_func(e, x)
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, x, context):
    x = F.broadcast_to(x[:, None], (context.shape[0], context.shape[1]))
    x = F.reshape(x, (context.shape[0] * context.shape[1],))
    context = context.reshape((context.shape[0] * context.shape[1]))
    e = self.rnn.charRNN(context)
    loss = self.loss_func(e, x)
    reporter.report({'loss': loss}, self)
    return loss
def __call__(self, x):
    """Normalize input and scale it.

    Args:
        x (chainer.Variable): A variable holding 4-dimensional array.
            Its :obj:`dtype` is :obj:`numpy.float32`.

    Returns:
        chainer.Variable:
            The shape and :obj:`dtype` are same as those of input.
    """
    x = F.normalize(x, eps=self.eps, axis=1)
    scale = F.broadcast_to(self.scale[:, np.newaxis, np.newaxis], x.shape)
    return x * scale
def __call__(self, S, h):
    batch_size, src_len, hidden_size = S.data.shape
    h = F.broadcast_to(F.expand_dims(h, axis=2),
                       (batch_size, hidden_size, src_len))
    h = F.swapaxes(h, 1, 2)
    S = F.reshape(F.concat((S, h), axis=2),
                  (batch_size * src_len, 2 * hidden_size))
    a = F.softmax(F.reshape(self.second_layer(F.tanh(self.first_layer(S))),
                            (batch_size, src_len)))
    return a
def term_slop(self, loc, val, bs, nf, train=True):
    """ Compute the slope for each active feature. """
    shape = (bs, nf)

    # Reshape all of our constants
    pr_mu = F.broadcast_to(self.slop_mu.b, shape)
    pr_lv = F.broadcast_to(self.slop_lv.b, shape)
    # This is either zero or a very negative number
    # indicating to sample N(mean, logvar) or just draw
    # the mean precisely
    if not train:
        pr_lv += self.lv_floor

    # The feature slopes are grouped together so that they
    # all share a common mean. Then individual features slop_delta_lv
    # are shrunk towards zero, which effectively sets features to fall
    # back on the group mean.
    sl_mu = F.reshape(self.slop_delta_mu(loc), shape) + pr_mu
    sl_lv = F.reshape(self.slop_delta_lv(loc), shape) + pr_lv
    coef = F.gaussian(sl_mu, sl_lv)
    slop = F.sum(coef * val, axis=1)

    # Calculate divergence between group mean and N(0, 1)
    kld1 = F.gaussian_kl_divergence(self.slop_mu.b, self.slop_lv.b)
    # Calculate divergence of individual delta means and delta vars
    args = (self.slop_delta_mu.W, self.slop_delta_lv.W)
    kld2 = F.gaussian_kl_divergence(*args)
    return slop, kld1 + kld2
def kl_div(mu1, lv1, lv2):
    # KL Divergence between given normal and prior at N(0, sigma_2)
    # Prior assumes mean at zero
    # ln(s2) - ln(s1) + (s1**2 + (u1 - u2)**2) / (2 * s2**2) - 0.5
    if len(lv1.shape) == 2:
        lv1 = F.expand_dims(lv1, 0)
        mu1 = F.expand_dims(mu1, 0)
    lv2 = F.broadcast_to(lv2, lv1.shape)
    v12 = F.exp(lv1)**2.0
    v22 = F.exp(lv2)**2.0
    return lv2 - lv1 + .5 * v12 / v22 + .5 * mu1**2. / v22 - .5
def term_feat(self, iloc, jloc, ival, jval, bs, nf, train=True):
    # Change all of the shapes to form interaction vectors
    shape = (bs, nf * 2, self.n_dim)
    feat_mu_vec = F.broadcast_to(self.feat_mu_vec.b, shape)
    feat_lv_vec = F.broadcast_to(self.feat_lv_vec.b, shape)
    if not train:
        feat_lv_vec += self.lv_floor

    # Construct the interaction mean and variance
    # iloc is (bs, nf), feat(iloc) is (bs, nf, ndim) and
    # dot(feat, feat) is (bs, nf)
    ivec = F.gaussian(feat_mu_vec + self.feat_delta_mu(iloc),
                      feat_lv_vec + self.feat_delta_lv(iloc))
    jvec = F.gaussian(feat_mu_vec + self.feat_delta_mu(jloc),
                      feat_lv_vec + self.feat_delta_lv(jloc))
    # feat is (bs, )
    feat = dot(F.sum(ivec * jvec, axis=2), ival * jval)

    # Compute the KLD for the group mean vector and variance vector
    # KL(N(group mu, group lv) || N(0, hyper_lv))
    # hyper_lv ~ gamma(1, 1)
    kldg = F.sum(kl_div(self.feat_mu_vec.b, self.feat_lv_vec.b,
                        self.hyper_feat_lv_vec.b))
    # Compute deviations from hyperprior
    # KL(N(delta_i, delta_i lv) || N(0, hyper_delta_lv))
    # hyper_delta_lv ~ gamma(1, 1)
    kldi = F.sum(kl_div(self.feat_delta_mu.W, self.feat_delta_lv.W,
                        self.hyper_feat_delta_lv.b))
    # Hyperprior penalty for log(var) ~ Gamma(alpha=1, beta=1)
    # Gamma(log(var) | alpha=1, beta=1) = -log(var)
    # The loss function will attempt to make log(var) as negative as
    # possible which will in turn make the variance as small as possible
    # The sum just casts a 1D vector to a scalar
    hyperg = -F.sum(self.hyper_feat_lv_vec.b)
    hyperi = -F.sum(self.hyper_feat_delta_lv.b)
    return feat, kldg, kldi, hyperg, hyperi
def _make_dis_input(self, input_img, label_map):
    b = F.broadcast_to(input_img[:, 0, :, :], shape=label_map.shape)
    g = F.broadcast_to(input_img[:, 1, :, :], shape=label_map.shape)
    r = F.broadcast_to(input_img[:, 2, :, :], shape=label_map.shape)
    product_b = label_map * b
    product_g = label_map * g
    product_r = label_map * r
    dis_input = F.concat([product_b, product_g, product_r], axis=1)
    return dis_input
def free_energy(self, v):
    """
    :param Variable (batch_size, in_channels, image_height, image_width) - input data (training data)
    :return: scalar
    """
    batch_size = v.data.shape[0]
    in_channels = self.in_channels
    real = self.real
    if real == 0:
        '''
        visible layer is 0, 1 (bit)
        vbias_term = 1 * SUM(a(i) * v(i))
        '''
        v_sum = F.sum(v, axis=(2, 3))  # sum over image_height & image_width
        # Originally, it should return sum for each batch.
        # but it returns scalar, which is sum over batches, since sum is used at the end anyway.
        vbias_term = F.sum(F.matmul(v_sum, self.conv.a))
        wx_b = self.conv(v)
    else:
        '''
        visible layer takes real value
        vbias_term = 0.5 * SUM((v(i)-a(i)) * (v(i) - a(i)))
        '''
        # TODO: check
        # m = Variable(xp.ones((batch_size, 1), dtype=xp.float32))
        n = F.reshape(self.conv.a, (1, in_channels, 1, 1))
        xp = cuda.get_array_module(n.data)
        std_ch = xp.reshape(self.std, (1, in_channels, 1, 1))

        # v_ = v - F.matmul(m, n)
        v_ = (v - F.broadcast_to(n, v.data.shape)) / std_ch
        vbias_term = F.sum(0.5 * v_ * v_)
        wx_b = self.conv(v / std_ch)

    hidden_term = F.sum(F.log(1 + F.exp(wx_b)))
    # print('vbias = ', vbias_term.data, ', hidden = ', hidden_term.data, 'F.exp(wx_b) = ', F.exp(wx_b).data)
    return - vbias_term - hidden_term
def maximum_entropy_mellowmax(values, omega=1., beta_min=-10, beta_max=10):
    """Maximum entropy mellowmax policy function.

    This function provides a categorical distribution whose expectation
    matches the one of mellowmax function while maximizing its entropy.

    See: http://arxiv.org/abs/1612.05628

    Args:
        values (Variable or ndarray):
            Input values. Mellowmax is taken along the second axis.
        omega (float): Parameter of mellowmax.
        beta_min (float): Minimum value of beta, used in Brent's algorithm.
        beta_max (float): Maximum value of beta, used in Brent's algorithm.

    Returns:
        outputs (Variable)
    """
    xp = chainer.cuda.get_array_module(values)
    mm = mellowmax(values, axis=1)

    # Advantage: Q - mellowmax(Q)
    batch_adv = values - F.broadcast_to(F.expand_dims(mm, 1), values.shape)

    # Move data to CPU because we use Brent's algorithm in scipy
    batch_adv = chainer.cuda.to_cpu(batch_adv.data)
    batch_beta = np.empty(mm.shape, dtype=np.float32)

    # Beta is computed as the root of this function
    def f(y, adv):
        return np.sum(np.exp(y * adv) * adv)

    for idx in np.ndindex(mm.shape):
        idx_full = idx[:1] + (slice(None),) + idx[1:]
        adv = batch_adv[idx_full]
        try:
            beta = scipy.optimize.brentq(
                f, a=beta_min, b=beta_max, args=(adv,))
        except ValueError:
            beta = 0
        batch_beta[idx] = beta

    return F.softmax(xp.expand_dims(xp.asarray(batch_beta), 1) * values)
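Assuming the mellowmax helper referenced above is importable in the same module, a hypothetical call might look like this; the Q-values are made up for illustration:

import numpy as np

q_values = np.array([[1.0, 2.0, 3.0],
                     [0.0, 0.0, 5.0]], dtype=np.float32)  # (batch, n_actions)
pi = maximum_entropy_mellowmax(q_values)
# pi is a categorical distribution over actions, one row per state
print(pi.data.sum(axis=1))  # each row sums to 1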
def __init__(self, n_input_channels, action_size, var,
             n_hidden_layers=0, n_hidden_channels=None,
             min_action=None, max_action=None, bound_mean=False,
             nonlinearity=F.relu, mean_wscale=1):
    self.n_input_channels = n_input_channels
    self.action_size = action_size
    self.n_hidden_layers = n_hidden_layers
    self.n_hidden_channels = n_hidden_channels
    self.min_action = min_action
    self.max_action = max_action
    self.bound_mean = bound_mean
    self.nonlinearity = nonlinearity
    if np.isscalar(var):
        self.var = np.full(action_size, var, dtype=np.float32)
    else:
        self.var = var
    layers = []
    if n_hidden_layers > 0:
        # Input to hidden
        layers.append(L.Linear(n_input_channels, n_hidden_channels))
        layers.append(self.nonlinearity)
        for _ in range(n_hidden_layers - 1):
            # Hidden to hidden
            layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
            layers.append(self.nonlinearity)
        # The last layer is used to compute the mean
        layers.append(
            L.Linear(n_hidden_channels, action_size,
                     initialW=LeCunNormal(mean_wscale)))
    else:
        # There's only one layer for computing the mean
        layers.append(
            L.Linear(n_input_channels, action_size,
                     initialW=LeCunNormal(mean_wscale)))
    if self.bound_mean:
        layers.append(lambda x: bound_by_tanh(
            x, self.min_action, self.max_action))

    def get_var_array(shape):
        self.var = self.xp.asarray(self.var)
        return self.xp.broadcast_to(self.var, shape)

    layers.append(lambda x: distribution.GaussianDistribution(
        x, get_var_array(x.shape)))
    super().__init__(*layers)
def __call__(self, X, ht_enc, H_enc, skip_mask=None):
    pad = self._kernel_size - 1
    WX = self.W(X)
    if pad > 0:
        WX = WX[:, :, :-pad]
    Vh = self.V(ht_enc)
    Vh, WX = functions.broadcast(functions.expand_dims(Vh, axis=2), WX)

    # f-pooling
    Z, F, O = functions.split_axis(WX + Vh, 3, axis=1)
    Z = functions.tanh(Z)
    F = self.zoneout(F)
    O = functions.sigmoid(O)
    T = Z.shape[2]

    # compute ungated hidden states
    self.contexts = []
    for t in xrange(T):
        z = Z[..., t]
        f = F[..., t]
        if t == 0:
            ct = (1 - f) * z
            self.contexts.append(ct)
        else:
            ct = f * self.contexts[-1] + (1 - f) * z
            self.contexts.append(ct)

    if skip_mask is not None:
        assert skip_mask.shape[1] == H_enc.shape[2]
        softmax_bias = (skip_mask == 0) * -1e6

    # compute attention weights (eq.8)
    H_enc = functions.swapaxes(H_enc, 1, 2)
    for t in xrange(T):
        ct = self.contexts[t]
        bias = 0 if skip_mask is None else softmax_bias[..., None]  # to skip PAD
        mask = 1 if skip_mask is None else skip_mask[..., None]  # to skip PAD
        alpha = functions.batch_matmul(H_enc, ct) + bias
        alpha = functions.softmax(alpha) * mask
        alpha = functions.broadcast_to(alpha, H_enc.shape)  # copy
        kt = functions.sum(alpha * H_enc, axis=1)
        ot = O[..., t]
        self.ht = ot * self.o(functions.concat((kt, ct), axis=1))

        if t == 0:
            self.H = functions.expand_dims(self.ht, 2)
        else:
            self.H = functions.concat(
                (self.H, functions.expand_dims(self.ht, 2)), axis=2)

    return self.H
def forward_one_step(self, X, ht_enc, H_enc, skip_mask):
    pad = self._kernel_size - 1
    WX = self.W(X)[:, :, -pad-1, None]
    Vh = self.V(ht_enc)
    Vh, WX = functions.broadcast(functions.expand_dims(Vh, axis=2), WX)

    # f-pooling
    Z, F, O = functions.split_axis(WX + Vh, 3, axis=1)
    Z = functions.tanh(Z)
    F = self.zoneout(F)
    O = functions.sigmoid(O)
    T = Z.shape[2]

    # compute ungated hidden states
    for t in xrange(T):
        z = Z[..., t]
        f = F[..., t]
        if self.contexts is None:
            ct = (1 - f) * z
            self.contexts = [ct]
        else:
            ct = f * self.contexts[-1] + (1 - f) * z
            self.contexts.append(ct)

    if skip_mask is not None:
        assert skip_mask.shape[1] == H_enc.shape[2]
        softmax_bias = (skip_mask == 0) * -1e6

    # compute attention weights (eq.8)
    H_enc = functions.swapaxes(H_enc, 1, 2)
    for t in xrange(T):
        ct = self.contexts[t - T]
        bias = 0 if skip_mask is None else softmax_bias[..., None]  # to skip PAD
        mask = 1 if skip_mask is None else skip_mask[..., None]  # to skip PAD
        alpha = functions.batch_matmul(H_enc, ct) + bias
        alpha = functions.softmax(alpha) * mask
        alpha = functions.broadcast_to(alpha, H_enc.shape)  # copy
        kt = functions.sum(alpha * H_enc, axis=1)
        ot = O[..., t]
        self.ht = ot * self.o(functions.concat((kt, ct), axis=1))

        if self.H is None:
            self.H = functions.expand_dims(self.ht, 2)
        else:
            self.H = functions.concat(
                (self.H, functions.expand_dims(self.ht, 2)), axis=2)

    return self.H
def calcAttention(self, h1, hList, aList, encLen, cMBSize, args):
    # if attention is disabled, return h1 as is
    if self.attn_mode == 0:
        return h1
    # 1, transform the decoder state used for attention
    target1 = self.model.attnIn_L1(h1)  # linear projection
    # (cMBSize, self.hDim) => (cMBSize, 1, self.hDim)
    target2 = chaFunc.expand_dims(target1, axis=1)
    # (cMBSize, 1, self.hDim) => (cMBSize, encLen, self.hDim)
    target3 = chaFunc.broadcast_to(target2, (cMBSize, encLen, self.hDim))
    # target3 = chaFunc.broadcast_to(chaFunc.reshape(
    #     target1, (cMBSize, 1, self.hDim)), (cMBSize, encLen, self.hDim))
    # 2, compute the attention scores
    if self.attn_mode == 1:  # bilinear
        # for bilinear attention, hList1 == hList2 holds
        # shape: (cMBSize, encLen)
        aval = chaFunc.sum(target3 * aList, axis=2)
    elif self.attn_mode == 2:  # MLP
        # attnSum maps each position to a scalar score
        t1 = chaFunc.reshape(target3, (cMBSize * encLen, self.hDim))
        # (cMBSize*encLen, self.hDim) => (cMBSize*encLen, 1)
        t2 = self.model.attnSum(chaFunc.tanh(t1 + aList))
        # shape: (cMBSize, encLen)
        aval = chaFunc.reshape(t2, (cMBSize, encLen))
        # aval = chaFunc.reshape(self.model.attnSum(
        #     chaFunc.tanh(t1 + aList)), (cMBSize, encLen))
    else:
        assert 0, "ERROR"
    # 3, take the softmax
    cAttn1 = chaFunc.softmax(aval)  # (cMBSize, encLen)
    # 4, compute the context vector weighted by the attention
    # (cMBSize, encLen) => (cMBSize, 1, encLen)
    cAttn2 = chaFunc.expand_dims(cAttn1, axis=1)
    # (1, encLen) x (encLen, hDim) batched matmul over cMBSize
    # => (cMBSize, 1, hDim)
    cAttn3 = chaFunc.batch_matmul(cAttn2, hList)
    # cAttn3 = chaFunc.batch_matmul(chaFunc.reshape(
    #     cAttn1, (cMBSize, 1, encLen)), hList)
    # drop the singleton axis=1
    context = chaFunc.reshape(cAttn3, (cMBSize, self.hDim))
    # 4, alternative way to compute the context vector
    # using broadcasting instead of batch_matmul
    # (cMBSize, srcLen) => (cMBSize, srcLen, 1)
    # cAttn2 = chaFunc.reshape(cAttn1, (cMBSize, encLen, 1))
    # (cMBSize, srcLen, 1) => (cMBSize, srcLen, hDim)
    # cAttn3 = chaFunc.broadcast_to(cAttn2, (cMBSize, encLen, self.hDim))
    # weighted sum over axis=1: (cMBSize, encLen, hDim) => (cMBSize, hDim)
    # context = chaFunc.sum(aList * cAttn3, axis=1)
    # 6, combine the context vector with the decoder state
    c1 = chaFunc.concat((h1, context))
    c2 = self.model.attnOut_L2(c1)
    finalH = chaFunc.tanh(c2)
    # finalH = chaFunc.tanh(self.model.attnOut_L2(
    #     chaFunc.concat((h1, context))))
    return finalH  # context