Python theano.tensor module: tensordot() example source code

The following 38 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.tensordot().
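Before the project listings, here is a minimal standalone sketch (assumed toy shapes, not taken from any project below) of the most common pattern on this page: contracting axis 2 of a (batch, length, dim) tensor against axis 0 of a (dim, num_label) matrix.

import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')                          # (batch, length, dim)
W = T.matrix('W')                           # (dim, num_label)
out = T.tensordot(x, W, axes=[[2], [0]])    # (batch, length, num_label)

f = theano.function([x, W], out)
xv = np.random.rand(4, 5, 3).astype(theano.config.floatX)
Wv = np.random.rand(3, 7).astype(theano.config.floatX)
print(f(xv, Wv).shape)                      # expected: (4, 5, 7)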

Project: sciDT    Author: edvisees    | Project source | File source
def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3,0)))
    if self.context == 'word':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
      def step(a_t, h_tm1, W_in, W, sc):
        h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2,0)) + T.tensordot(h_tm1, W, axes=(2,0)))
        s_t = T.tensordot(h_t, sc, axes=(2,0))
        return h_t, s_t
      [_, scores], _ = theano.scan(step, sequences=[proj_input.dimshuffle(2,0,1,3)], outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None], non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
      att_scores = scores.dimshuffle(1,2,0)
    elif self.context == 'para':
      att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))
    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
      sample_att_inp, _ = theano.scan(fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i), sequences=[T.nnet.softmax(sample_att), sample_input])
      return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att, sequences=[input, att_scores])
    return att_input
Project: opt-mmd    Author: dougalsutherland    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
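For orientation, a hypothetical shape sketch (self.W is assumed here to be a 3-D minibatch-discrimination tensor of shape (num_inputs, num_kernels, dim_per_kernel); the names are illustrative, not taken from the project): the tensordot above maps a (batch, num_inputs) input to a (batch, num_kernels, dim_per_kernel) activation.

import numpy as np
import theano
import theano.tensor as T

inp = T.matrix('inp')                        # (batch, num_inputs)
W = T.tensor3('W')                           # (num_inputs, num_kernels, dim_per_kernel)
act = T.tensordot(inp, W, [[1], [0]])        # (batch, num_kernels, dim_per_kernel)

f = theano.function([inp, W], act)
iv = np.random.rand(8, 16).astype(theano.config.floatX)
Wv = np.random.rand(16, 32, 5).astype(theano.config.floatX)
print(f(iv, Wv).shape)                       # expected: (8, 32, 5)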
Project: SNLI    Author: qiaojingy    | Project source | File source
def get_output_for(self, input, **kwargs):
        # cf * bc01... = fb01...
        out_r = T.tensordot(self.W, input, axes=[[0], [1]])
        # input dims to broadcast over
        remaining_dims = range(2, input.ndim)
        # bf01...
        out = out_r.dimshuffle(1, 0, *remaining_dims)

        if self.b is None:
            activation = out
        else:
            if self.untie_biases:
                # no broadcast
                remaining_dims_biases = range(1, input.ndim - 1)
            else:
                remaining_dims_biases = ['x'] * (input.ndim - 2)  # broadcast
            b_shuffled = self.b.dimshuffle('x', 0, *remaining_dims_biases)
            activation = out + b_shuffled

        return self.nonlinearity(activation)
Project: DeepMonster    Author: olimastro    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Project: WGAN_mnist    Author: rajeswar18    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Project: triple-gan    Author: zhenxuan00    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Project: saliency-salgan-2017    Author: imatge-upc    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Project: convnet-for-geometric-matching    Author: hjweide    | Project source | File source
def create_corr_func():
    import numpy as np
    Xa, Xb = T.tensor4('Xa'), T.tensor4('Xb')

    def correlation(A, B):
        Ap, Bp = A.reshape((-1, 15 * 15)), B.reshape((-1, 15 * 15))
        C = T.tensordot(Ap.T, Bp, axes=1).reshape((-1, 15, 15))
        return C

    result, updates = theano.scan(fn=correlation,
                                  outputs_info=None,
                                  sequences=[Xa, Xb],
                                  non_sequences=None)
    corr_func = theano.function(
        inputs=[Xa, Xb],
        outputs=result,
    )

    X = np.random.random((32, 128, 15, 15)).astype(np.float32)
    Y = np.random.random(X.shape).astype(np.float32)

    output = corr_func(X, Y)
    print(output.shape)
Project: mctest-model    Author: Maluuba    | Project source | File source
def call(self, x, mask=None):
        ax = 1 if self.is_q else 2

        def _step(v1, v2):
            cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                       (v2) / T.sqrt(T.sum(T.sqr(v2), axis=ax, keepdims=True) + 1e-6),
                                       [[2], [ax]])
            return cosine_score

        l_s = x[0]  # n_b x n_s x n_w_s x D
        l_a = x[1]  # n_b x 4 x n_w_qa x D
        # w_qa = self.layers[2].get_output(train)  # n_b x 4 x n_w_qa x 1
        # w_qa = T.addbroadcast(w_qa, len(self.layers[2].output_shape) - 1)

        # get cosine similarity for ALL word pairs
        output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)
        if not self.is_q:
            output = output.dimshuffle(0, 1, 3, 2, 4)  # n_b x n_s x 4 x n_w_s x n_w_qa
        return output
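As an aside, a minimal sketch (assumed 2-D operands, not code from the project) of the cosine-similarity idiom used in _step above: normalize each operand along its feature axis, then contract that axis with tensordot to get all pairwise cosine scores.

import numpy as np
import theano
import theano.tensor as T

a = T.matrix('a')   # (n_a, D)
b = T.matrix('b')   # (n_b, D)
an = a / T.sqrt(T.sum(T.sqr(a), axis=1, keepdims=True) + 1e-6)
bn = b / T.sqrt(T.sum(T.sqr(b), axis=1, keepdims=True) + 1e-6)
cos = T.tensordot(an, bn, [[1], [1]])        # (n_a, n_b) pairwise cosine scores

f = theano.function([a, b], cos)
av = np.random.rand(3, 8).astype(theano.config.floatX)
bv = np.random.rand(5, 8).astype(theano.config.floatX)
print(f(av, bv).shape)                       # expected: (3, 5)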
Project: deligan    Author: val-iisc    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer
Project: deligan    Author: val-iisc    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer
Project: CopyNet    Author: MultiPath    | Project source | File source
def __call__(self, X, w_temp, m_temp):
        # input dimensions
        # X:      (nb_samples, input_dim)
        # w_temp: (nb_samples, memory_dim)
        # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory

        key   = dot(X, self.W_key, self.b_key)  # (nb_samples, memory_width)
        lock  = dot(m_temp, self.W_lock)        # (nb_samples, memory_dim, memory_width)
        shift = self.softmax(
            dot(X, self.W_shift, self.b_shift))  # (nb_samples, shift_width)

        beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
        gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.  # (nb_samples,)
        gamma = gamma[:, None]  # (nb_samples, x)
        g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]  # (nb_samples, x)

        signal = [key, shift, beta, gamma, g]

        energy = T.sum(key[:, None, :] * lock, axis=2)
        # energy = T.tensordot(key[:, None, :] + lock, self.v, [2, 0])
        w_c    = self.softmax(beta * energy)
        # w_c = self.softmax(
        #     beta * cosine_sim2d(key, m_temp))  # (nb_samples, memory_dim) //content-based addressing
        w_g = g * w_c + (1 - g) * w_temp  # (nb_samples, memory_dim) //history interpolation
        w_s = shift_convolve2d(w_g, shift, self.shift_conv)  # (nb_samples, memory_dim) //convolutional shift
        w_p = w_s ** gamma  # (nb_samples, memory_dim) //sharpening
        w_t = w_p / T.sum(w_p, axis=1)[:, None]  # (nb_samples, memory_dim)
        return w_t
Project: Neural-Photo-Editor    Author: ajbrock    | Project source | File source
def get_output_for(self, input, init=False, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        abs_dif = (T.sum(abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0)),axis=2)
                    + 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1))

        if init:
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif /= mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            mf = T.mean(f,axis=0)
            f -= mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f += self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)  

# Convenience function to define an inception-style block
Project: NeuroNLP    Author: XuezheMax    | Project source | File source
def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        :param inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute out by tensor dot ([batch, length, input] * [input, num_label, num_label])
        # the shape of out should be [batch, length, num_label, num_label]
        out = T.tensordot(input, self.W, axes=[[2], [0]])

        if self.b is not None:
            b_shuffled = self.b.dimshuffle('x', 'x', 0, 1)
            out = out + b_shuffled

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
        return out
Project: seq2seq-keyphrase    Author: memray    | Project source | File source
def __call__(self, X, w_temp, m_temp):
        # input dimensions
        # X:      (nb_samples, input_dim)
        # w_temp: (nb_samples, memory_dim)
        # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory

        key   = dot(X, self.W_key, self.b_key)  # (nb_samples, memory_width)
        lock  = dot(m_temp, self.W_lock)        # (nb_samples, memory_dim, memory_width)
        shift = self.softmax(
            dot(X, self.W_shift, self.b_shift))  # (nb_samples, shift_width)

        beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
        gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.  # (nb_samples,)
        gamma = gamma[:, None]  # (nb_samples, x)
        g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]  # (nb_samples, x)

        signal = [key, shift, beta, gamma, g]

        energy = T.sum(key[:, None, :] * lock, axis=2)
        # energy = T.tensordot(key[:, None, :] + lock, self.v, [2, 0])
        w_c    = self.softmax(beta * energy)
        # w_c = self.softmax(
        #     beta * cosine_sim2d(key, m_temp))  # (nb_samples, memory_dim) //content-based addressing
        w_g = g * w_c + (1 - g) * w_temp  # (nb_samples, memory_dim) //history interpolation
        w_s = shift_convolve2d(w_g, shift, self.shift_conv)  # (nb_samples, memory_dim) //convolutional shift
        w_p = w_s ** gamma  # (nb_samples, memory_dim) //sharpening
        w_t = w_p / T.sum(w_p, axis=1)[:, None]  # (nb_samples, memory_dim)
        return w_t
Project: DiscourseSenser    Author: WladimirSidorenko    | Project source | File source
def __init__(self, a_n_x, a_n_y):
        """Class constructor.

        Args:
        a_n_x (int):
          number of underlying classifiers
        a_n_y (int):
          number of classes to predict


        """
        self.n_x = a_n_x
        self.n_y = a_n_y
        # define the network
        # input matrix
        self.x = TT.dmatrix(name="x")
        # mapping from input to output vector
        self.X2Y = self._init_X2Y()
        self.y_bias = theano.shared(value=HE_UNIFORM((1, self.n_y)),
                                    name="y_bias")
        # prediction vector
        self.y_pred = TT.nnet.softmax(
            TT.tensordot(self.x, self.X2Y, ((1, 0), (2, 1))) + self.y_bias)
        # predicted label
        self.y_lbl = TT.argmax(self.y_pred, axis=1)[0]
        self._predict = theano.function([self.x],
                                        [self.y_lbl, self.y_pred],
                                        name="predict")
        # define trainable parameters
        self._params = [self.X2Y, self.y_bias]
Project: DeepFold    Author: largelymfs    | Project source | File source
def get_output_for(self, input, **kwargs):
        res = [(input ** (- i * 2 - 2)).dimshuffle(0, 'x', 1, 2) for i in range(self.projection_level)]
        res = T.concatenate(res, axis=1)
        # return T.tensordot(res, self.W, [[1], [0]]).dimshuffle(0, 'x', 1, 2)
        return res
Project: LasagneNLP    Author: XuezheMax    | Project source | File source
def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        :param inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute out by tensor dot ([batch, length, input] * [input, num_label, num_label])
        # the shape of out should be [batch, length, num_label, num_label]
        out = T.tensordot(input, self.W, axes=[[2], [0]])

        if self.b is not None:
            b_shuffled = self.b.dimshuffle('x', 'x', 0, 1)
            out = out + b_shuffled

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
        return out
Project: NADE    Author: MarcCote    | Project source | File source
def sym_mask_logdensity_estimator_intermediate(self, x, mask):
        non_linearity_name = self.parameters["nonlinearity"].get_name()
        assert(non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
        x = x.T  # BxD
        mask = mask.T  # BxD
        output_mask = constantX(1) - mask  # BxD
        D = constantX(self.n_visible)
        d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
        masked_input = x * mask  # BxD
        h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
        for l in range(self.n_layers - 1):
            h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
        z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
        z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
        z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)
        temp = T.exp(z_alpha)  # + 1e-6
        # temp += T.shape_padright(temp.sum(2)/1e-3)
        Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
        Mu = z_mu  # BxDxC
        Sigma = T.exp(z_sigma)  # + 1e-6 #BxDxC

        # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Mu = Mu * T.shape_padright(output_mask)
        # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi)) #BxDxC

        Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))  # BxDxC
        logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
        return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
Project: NADE    Author: MarcCote    | Project source | File source
def sym_masked_neg_loglikelihood_gradient(self, x, mask):
        """ x is a matrix of column datapoints (DxB) D = n_visible, Bfloat = batch size """
        logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h = self.sym_mask_logdensity_estimator_intermediate(x, mask)

#        nnz = output_mask.sum(0)
#        sparsity_multiplier = T.shape_padright(T.shape_padleft((B+1e-6)/(nnz+1e-6)))

#        wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0)) #BxDxC
#        lp_current = log_sum_exp(wPhi, axis = 2) * output_mask #BxD
#        lp_current_sum = (lp_current.sum(1) * D / (D-d)).sum() #1

        loglikelihood = logdensity.mean(dtype=floatX)
        loss = -loglikelihood

        dp_dz_alpha = T.grad(loss, z_alpha)  # BxDxC
        gb_alpha = dp_dz_alpha.sum(0)  # DxC
        gV_alpha = T.tensordot(h.T, dp_dz_alpha, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        dp_dz_mu = T.grad(loss, z_mu)  # BxDxC
        dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
        gb_mu = dp_dz_mu.sum(0)  # DxC
        gV_mu = T.tensordot(h.T, dp_dz_mu, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        dp_dz_sigma = T.grad(loss, z_sigma)  # BxDxC
        gb_sigma = dp_dz_sigma.sum(0)  # DxC
        gV_sigma = T.tensordot(h.T, dp_dz_sigma, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

        if self.n_layers > 1:
            gWs, gbs, gW1, gWflags, gb1 = T.grad(loss, [self.Ws, self.bs, self.W1, self.Wflags, self.b1])
            gradients = {"V_alpha":gV_alpha, "b_alpha":gb_alpha, "V_mu":gV_mu, "b_mu":gb_mu, "V_sigma":gV_sigma, "b_sigma":gb_sigma, "Ws":gWs, "bs":gbs, "W1":gW1, "b1":gb1, "Wflags":gWflags}
        else:
            gW1, gWflags, gb1 = T.grad(loss, [self.W1, self.Wflags, self.b1])
            gradients = {"V_alpha":gV_alpha, "b_alpha":gb_alpha, "V_mu":gV_mu, "b_mu":gb_mu, "V_sigma":gV_sigma, "b_sigma":gb_sigma, "W1":gW1, "b1":gb1, "Wflags":gWflags}
        # Gradients
        return (loss, gradients)
Project: neural_wfst    Author: se4u    | Project source | File source
def construct(self, input_tv):
        '''
        Params
        ------
        input_tv : a 3D tensor of shape (n_sentences, n_tokens, vecdim)
        '''
        # N is the linear transformation matrix.
        N = self._declare_mat('N', self.in_dim, self.out_dim)
        N.clip_gradient = self.prm('clip_gradient')
        N.l2_project = self.prm('l2_project')
        N.l2_projection_axis = 0
        if self.prm('do_dropout'):
            N.dropout_retention_freq = self.prm('dropout_retention_freq')
            # Create a dropout mask.
            dropout_mask = dropout_mask_creator(
                self.in_dim, N.dropout_retention_freq)
            # Apply dropout mask to input variable.
            # Note that dropout_mask is a vector and input_tv is a
            # matrix. We are broadcasting this multiplication.
            # Essentially we are dropping entire columns from input_tv.
            dropout_input_tv = (input_tv * dropout_mask)
            dropout_input_tv.name = self.kn('dropout_input_tv')
            # During train time the output is the matrix multiplication of
            # dropped out variables with the matrix.
            transformed_tv = T.tensordot(
                dropout_input_tv, N, axes=[dropout_input_tv.ndim - 1, 0])
        else:
            transformed_tv = T.dot(input_tv, N)

        if self.prm('add_bias'):
            b = self._declare_mat('b', self.out_dim, is_regularizable=False)
            b.l2_project = self.prm('l2_project')
            b.l2_projection_axis = 0
            self.output_tv = transformed_tv + b
            return (N, b)
        else:
            self.output_tv = transformed_tv
            return (N,)
Project: neural_wfst    Author: se4u    | Project source | File source
def construct(self, input_tv):
        sod2c = self.prm('shape_of_dim_to_collapse')
        v = self._declare_mat('v', sod2c)
        self.output_tv = T.tensordot(input_tv, v, axes=(input_tv.ndim - 1, 0))
        return (v,)
Project: neural_wfst    Author: se4u    | Project source | File source
def construct(self, input_tv):
        '''
        Params
        ------
        input_tv : The input is a 3D tensor representing a batch of sentences with
          embedded tokens.

        Returns
        -------
        The input_tv is a matrix that has the tokens as its 0th dimension and
        usually LSTM features as the first dimension.

        NOTE: We don't need to project the input of this class_chip inside. We can
        just add a Linear class_chip before ConcatenativeMixture.
        '''
        Y_prev = self._declare_mat('A', self.out_dim + 1, self.in_dim)
        Y_next = self._declare_mat('B', self.out_dim,  self.in_dim)
        Y_prev.clip_gradient = self.prm('clip_gradient')
        Y_next.clip_gradient = self.prm('clip_gradient')
        prev_next_cross = (Y_prev.dimshuffle(0, 'x', 1)
                           + Y_next.dimshuffle('x', 0, 1))
        Y_nl = self.prm('activation_fn')(prev_next_cross)
        # NOTE: The last dimension corresponds to the hidden layer
        # nodes.
        v = self._declare_mat('v', self.in_dim)
        PairWise_Factor = T.tensordot(Y_nl, v, axes=(Y_nl.ndim - 1, 0))
        self.output_tv = (PairWise_Factor.dimshuffle('x', 'x', 0, 1)
                          + input_tv.dimshuffle(0, 1, 'x', 2))
        return (Y_prev, Y_next, v)
Project: CopyNet    Author: MingyuanXie    | Project source | File source
def __call__(self, X, w_temp, m_temp):
        # input dimensions
        # X:      (nb_samples, input_dim)
        # w_temp: (nb_samples, memory_dim)
        # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory

        key   = dot(X, self.W_key, self.b_key)  # (nb_samples, memory_width)
        lock  = dot(m_temp, self.W_lock)        # (nb_samples, memory_dim, memory_width)
        shift = self.softmax(
            dot(X, self.W_shift, self.b_shift))  # (nb_samples, shift_width)

        beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
        gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.  # (nb_samples,)
        gamma = gamma[:, None]  # (nb_samples, x)
        g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]  # (nb_samples, x)

        signal = [key, shift, beta, gamma, g]

        energy = T.sum(key[:, None, :] * lock, axis=2)
        # energy = T.tensordot(key[:, None, :] + lock, self.v, [2, 0])
        w_c    = self.softmax(beta * energy)
        # w_c = self.softmax(
        #     beta * cosine_sim2d(key, m_temp))  # (nb_samples, memory_dim) //content-based addressing
        w_g = g * w_c + (1 - g) * w_temp  # (nb_samples, memory_dim) //history interpolation
        w_s = shift_convolve2d(w_g, shift, self.shift_conv)  # (nb_samples, memory_dim) //convolutional shift
        w_p = w_s ** gamma  # (nb_samples, memory_dim) //sharpening
        w_t = w_p / T.sum(w_p, axis=1)[:, None]  # (nb_samples, memory_dim)
        return w_t
Project: convnet-for-geometric-matching    Author: hjweide    | Project source | File source
def correlation(self, A, B):
        Af = A.reshape((A.shape[0], A.shape[1] * A.shape[2]))
        Bf = B.reshape((B.shape[0], B.shape[1] * B.shape[2]))
        C = T.tensordot(Af.T, Bf, axes=1)

        return C.reshape((-1, A.shape[1], A.shape[2]))
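Note that with axes=1, tensordot contracts the last axis of its first argument against the first axis of its second, so the call above is just an ordinary matrix product. A quick standalone check (assumed 128-channel, 15x15 feature maps):

import numpy as np
import theano
import theano.tensor as T

Af_T = T.matrix('Af_T')   # (H*W, channels)
Bf = T.matrix('Bf')       # (channels, H*W)
f = theano.function([Af_T, Bf],
                    [T.tensordot(Af_T, Bf, axes=1), T.dot(Af_T, Bf)])

a = np.random.rand(225, 128).astype(theano.config.floatX)
b = np.random.rand(128, 225).astype(theano.config.floatX)
td, d = f(a, b)
print(np.allclose(td, d))  # True: axes=1 reduces to a plain matrix product here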
Project: Theano-Deep-learning    Author: GeekLiB    | Project source | File source
def grad(self, inputs, cost_grad):
        """
        In defining the gradient, the Finite Fourier Transform is viewed as
        a complex-differentiable function of a complex variable
        """
        a = inputs[0]
        n = inputs[1]
        axis = inputs[2]
        grad = cost_grad[0]
        if not isinstance(axis, tensor.TensorConstant):
            raise NotImplementedError('%s: gradient is currently implemented'
                                      ' only for axis being a Theano constant'
                                      % self.__class__.__name__)
        axis = int(axis.data)
        # notice that the number of actual elements in wrto is independent of
        # possible padding or truncation:
        elem = tensor.arange(0, tensor.shape(a)[axis], 1)
        # accounts for padding:
        freq = tensor.arange(0, n, 1)
        outer = tensor.outer(freq, elem)
        pow_outer = tensor.exp(((-2 * math.pi * 1j) * outer) / (1. * n))
        res = tensor.tensordot(grad, pow_outer, (axis, 0))

        # This would be simpler but not implemented by theano:
        # res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
        # tensor.set_subtensor(res[...,n::], 0, False, False), res)

        # Instead we resort to that to account for truncation:
        flip_shape = list(numpy.arange(0, a.ndim)[::-1])
        res = res.dimshuffle(flip_shape)
        res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
                            tensor.set_subtensor(res[n::, ], 0, False, False),
                            res)
        res = res.dimshuffle(flip_shape)

        # insures that gradient shape conforms to input shape:
        out_shape = list(numpy.arange(0, axis)) + [a.ndim - 1] +\
            list(numpy.arange(axis, a.ndim - 1))
        res = res.dimshuffle(*out_shape)
        return [res, None, None]
Project: kaggle-quora-solution-8th    Author: qqgeogor    | Project source | File source
def __init__(self,x,y,metrics='eucdian'):
        if metrics=='eucdian':
            x=x.dimshuffle(1,0,2)
            y = y.dimshuffle(1,2,0)
            activation=T.batched_dot(x,y)
            #activation=T.tensordot(x,y,axes=-1)

        self.activation=activation.dimshuffle(0,'x',1,2)
Project: statestream    Author: VolkerFischer    | Project source | File source
def tensordot(x, y, axes):
    return T.tensordot(x, y, axes=axes)
Project: recursive_WSABIE    Author: ktsaurabh    | Project source | File source
def get_output(self, train=False):
        [X_w, X_t] = self.get_input(train)
        t_w = self.W_t[X_w[:,:, 0]] # doc_l, n_tags*n_samples, n_dim
        w_w = self.W_w[X_w[:,:, 1]]
        dot_tw = T.sum(w_w * t_w, axis=2)
        inter_1 = T.tensordot(w_w, self.S, axes = [[2],[2]])
        inter_2 = T.tensordot(t_w, self.P, axes = [[2],[2]]) # doc_l, n_tags*n_samples, 2,5
        inter = T.sum(inter_1 * inter_2, axis = 3)
        sim_tw = T.tensordot(inter + T.shape_padleft(self.B, 2), self.U, axes=[[2],[0]]) 
        sim_tw = T.reshape(sim_tw, (X_w.shape[0], X_w.shape[1]))
        dot_sum_w = T.sum(dot_tw * T.nnet.sigmoid(sim_tw), axis = 0)/(X_w.shape[0])
        dot_w = theano.tensor.reshape(dot_sum_w, (X_w.shape[1], 1))
        return self.activation(dot_w)
        '''
        t_t = self.W_t[X_t[:,:, 0]] # doc_l, n_tags*n_samples, n_dim
        w_t = self.W_t[X_t[:,:, 1]]
        dot_tt = T.sum(w_t * t_t, axis=2)
        #dot_sum = T.sum(dot_tw, axis = 0)#/(X.shape[0])
        #dot_sum_t = T.sum(dot_tt , axis = 0)#/(X_t.shape[0])
        inter_t_1 = T.tensordot(t_t, self.P, axes = [[2],[2]])
        inter_t_2 = T.tensordot(w_t, self.P, axes = [[2],[2]]) # doc_l, n_tags*n_samples, 2,5
        inter_t = T.sum(inter_t_1 * inter_t_2, axis = 3)
        sim_tt = T.tensordot(inter_t, self.U_t, axes=[[2],[0]]) 
        sim_tt = T.reshape(sim_tt, (X_t.shape[0], X_t.shape[1]))

        dot_sum_t = T.sum(dot_tt * sim_tt, axis = 0)/(X_t.shape[0])
        dot_twc_t = dot_sum_t#*dot_sum#_p
        dot_t = theano.tensor.reshape(dot_twc_t, (X_t.shape[1], 1))

        return 0.5 * self.activation(dot_w) + 0.5 * self.activation(dot_t)
        '''
Project: deep-hashtagprediction    Author: jderiu    | Project source | File source
def output_func(self, input):
        self.f = T.tensordot(input.dimshuffle(0,'x',1),self.W.dimshuffle('x',0,1),axes=[[1,2],[0,2]]) # cosine sim
        self.y_pred = T.argmax(self.f,axis=0)
        return self.y_pred
Project: crayimage    Author: yandexdataschool    | Project source | File source
def __call__(self, expected, observed, weights):
    obs = T.tensordot(observed, self.W_observed, axes=(2, 0))
    exp = T.dot(expected, self.W_expected)
    weights = weights[:, None] * self.W_weights[None, :]

    return self.nonlinearity(obs + exp + weights)
Project: crayimage    Author: yandexdataschool    | Project source | File source
def __call__(self, expected, observed, weights):
    obs = T.tensordot(observed, self.W_observed, axes=(2, 0))
    exp = T.dot(expected, self.W_expected)
    weights = weights[:, None] * self.W_weights[None, :]

    return self.nonlinearity(obs + exp + weights)
Project: mctest-model    Author: Maluuba    | Project source | File source
def call(self, x, mask=None):
        def _step(v1, v2):
            cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                       (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                       [[2], [2]])
            return cosine_score

        l_s = x[0]  # n_b x n_s x n_w_s x D
        l_a = x[1]  # n_b x 4 x n_w_qa x D
        # get cosine similarity for ALL word pairs
        output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_s x n_w_s x 4 x n_w_qa
        # return T.max(T.max(output, axis=4), axis=2)
        output = output.dimshuffle(2, 1, 0, 3, 4)  # n_w_s x n_s x n_b x 4 x n_w_qa

        def slide_max(i, X):
            size = self.window_size
            M = X[i:i + size]
            W = self.w_gaussian
            return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

        output, _ = theano.scan(slide_max,
                                sequences=[
                                    T.arange(0, stop=(output.shape[0] - self.window_size + 1), step=3, dtype='int32')],
                                non_sequences=output)
        if self.use_qa_idf:
            average = weighted_average(output.dimshuffle(2, 1, 0, 3, 4), x[2], axis=4)
        else:
            average = masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4)
        return T.max(average, axis=2) * self.alpha
        # return T.max(masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4), axis=2) * self.alpha
Project: mctest-model    Author: Maluuba    | Project source | File source
def call(self, x, mask=None):
        def _step(v1, v2):
            cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=1, keepdims=True) + 1e-6),
                                       (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                       [[1], [2]])
            return cosine_score

        l_s = x[0]  # n_b x n_w_st x D
        l_a = x[1]  # n_b x 4 x n_w_qa x D
        # get cosine similarity for ALL word pairs
        output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_w_st x 4 x n_w_qa
        output = output.dimshuffle(1, 0, 2, 3)

        def slide_max(i, X):
            size = self.window_size
            M = X[i:i + size]
            W = self.w_gaussian
            return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

        output, _ = theano.scan(slide_max,
                                sequences=[
                                    T.arange(0, stop=(output.shape[0] - self.window_size + 1), step=5, dtype='int32')],
                                non_sequences=output)
        if self.use_qa_idf:
            average = weighted_average(output.dimshuffle(1, 0, 2, 3), x[2], axis=3)
        else:
            average = masked_mean(output.dimshuffle(1, 0, 2, 3), axis=3)
        return T.max(average, axis=1) * self.alpha
Project: mctest-model    Author: Maluuba    | Project source | File source
def call(self, x, mask=None):
        def _step(v1, v2):
            cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                       (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                       [[2], [2]])
            return cosine_score

        l_s = x[0]  # n_b x n_s x n_w_s x D
        l_a = x[1]  # n_b x 4 x n_w_qa x D
        # get cosine similarity for ALL word pairs
        output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_s x n_w_s x 4 x n_w_qa
        # return T.max(T.max(output, axis=4), axis=2)
        output = output.dimshuffle(2, 1, 0, 3, 4)  # n_w_s x n_s x n_b x 4 x n_w_qa

        def slide_max(i, X):
            size = self.window_size
            M = X[i:i + size]
            W = self.w_gaussian
            return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

        output, _ = theano.scan(slide_max,
                                sequences=[
                                    T.arange(0, stop=(output.shape[0] - self.window_size + 1), step=3, dtype='int32')],
                                non_sequences=output)
        if self.use_qa_idf:
            average = weighted_average(output.dimshuffle(2, 1, 0, 3, 4), x[2], axis=4)
        else:
            average = masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4)
        return T.max(average, axis=2) * self.alpha
        # return T.max(masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4), axis=2) * self.alpha
Project: NeuroNLP    Author: XuezheMax    | Project source | File source
def get_output_for(self, inputs, **kwargs):
        """

        :param inputs: inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute head part by tensor dot ([batch, length, dim] * [dim, num_label])
        # the shape of s_h should be [batch, length, num_label]
        s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])

        if self.b is not None:
            b_shuffled = self.b.dimshuffle('x', 'x', 0)
            s_h = s_h + b_shuffled

        # compute child part by tensor dot ([batch, length, dim] * [dim, num_label])
        # the shape of s_c should be [batch, length, num_label]
        s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])

        # compute out
        input_shape = input.shape
        # output shape = [batch, length, length, num_label]
        out = T.cast(T.alloc(0.0, input_shape[0], input_shape[1], input_shape[1], self.num_labels), 'floatX')
        out = out + s_h.dimshuffle(0, 1, 'x', 2)
        out = out + s_c.dimshuffle(0, 'x', 1, 2)

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
            mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
            out = out * mask_shuffled
        return out
Project: NeuroNLP    Author: XuezheMax    | Project source | File source
def get_output_for(self, inputs, **kwargs):
        """

        :param inputs: inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute the bi-affine part
        # first via tensor dot ([batch, length, dim] * [dim, dim, num_label])
        # output shape = [batch, length, dim, num_label]
        out = T.tensordot(input, self.U, axes=[[2], [0]])
        # second via tensor dot ([batch, length, dim, num_label] * [batch, dim, length])
        # output shape = [batch, length, length, num_label]
        out = T.batched_tensordot(out, input.dimshuffle(0, 2, 1), axes=([2], [1]))
        out = out.dimshuffle(0, 1, 3, 2)

        # compute head bias part by tensor dot ([batch, length, dim] * [dim, num_label])
        # the shape of s_h should be [batch, length, num_label]
        if self.W_h is not None:
            s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])
            out = out + s_h.dimshuffle(0, 1, 'x', 2)

        # compute child part by tensor dot ([batch, length, dim] * [dim, num_label])
        # the shape of s_c should be [batch, length, num_label]
        if self.W_c is not None:
            s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])
            out = out + s_c.dimshuffle(0, 'x', 1, 2)

        # add bias part.
        if self.b is not None:
            out = out + self.b.dimshuffle('x', 'x', 'x', 0)

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
            mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
            out = out * mask_shuffled
        return out
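As a standalone shape check of the bi-affine scoring above (assumed toy dimensions, not code from the project), the tensordot followed by batched_tensordot takes a [batch, length, dim] input to a [batch, length, length, num_label] score tensor.

import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')                          # (batch, length, dim)
U = T.tensor3('U')                          # (dim, dim, num_label)
h = T.tensordot(x, U, axes=[[2], [0]])      # (batch, length, dim, num_label)
s = T.batched_tensordot(h, x.dimshuffle(0, 2, 1), axes=([2], [1]))
s = s.dimshuffle(0, 1, 3, 2)                # (batch, length, length, num_label)

f = theano.function([x, U], s)
xv = np.random.rand(2, 6, 4).astype(theano.config.floatX)
Uv = np.random.rand(4, 4, 3).astype(theano.config.floatX)
print(f(xv, Uv).shape)                      # expected: (2, 6, 6, 3)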
Project: LasagneNLP    Author: XuezheMax    | Project source | File source
def get_output_for(self, inputs, **kwargs):
        """

        :param inputs: inputs: list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.
        :return: theano.TensorType
            Symbolic output variable.
        """
        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # compute head part by tensor dot ([batch, length, input] * [input, num_label])
        # the shape of s_h should be [batch, length, num_label]
        s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])

        if self.b is not None:
            b_shuffled = self.b.dimshuffle('x', 'x', 0)
            s_h = s_h + b_shuffled

        # compute child part by tensor dot ([batch, length, input] * [input, num_label])
        # the shape of s_c should be [batch, length, num_label]
        s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])

        # compute out
        input_shape = input.shape
        out = T.cast(T.alloc(0.0, input_shape[0], input_shape[1], input_shape[1], self.num_labels), 'floatX')
        out = out + s_h.dimshuffle(0, 1, 'x', 2)
        out = out + s_c.dimshuffle(0, 'x', 1, 2)

        if mask is not None:
            mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
            out = out * mask_shuffled
            mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
            out = out * mask_shuffled
        return out