Python theano.tensor 模块,sqr() 实例源码


def dot_2d(k, M, b=None, g=None):
    """Weighted dot product between each key and every memory row.

    k: (nb_samples, memory_width)
    M: (nb_samples, memory_dim, memory_width)
    b: optional 2-D weights, broadcast as b[:, None, :]
    g: optional 1-D weights, broadcast as g[None, None, :]

    Returns the element-wise products summed over axis 2,
    i.e. (nb_samples, memory_dim).
    """
    # (nb_samples, 1, memory_width) * M -> (nb_samples, memory_dim, memory_width)
    weighted = k[:, None, :] * M

    if b is not None:
        weighted = weighted * b[:, None, :]
    if g is not None:
        weighted = weighted * g[None, None, :]

    return T.sum(weighted, axis=2)
def op_cosine_c(
    s_xr_, s_xi_, s_yr_, s_yi_, axis_=-1, keepdims_=True, eps_=1e-7):
    """
    Cosine between two complex vectors given as real/imaginary parts.

    The numerator is sum(xr*yr + xi*yi); the denominator is
    sqrt(|x|^2 * |y|^2 + eps_).

    Args:
        s_xr_: real part of x
        s_xi_: imag part of x
        s_yr_: real part of y
        s_yi_: imag part of y
        axis_: axis passed to every T.sum reduction
        keepdims_: passed through to T.sum
        eps_: small number added under the square root to prevent
            divide by zero

    Returns:
        The symbolic cosine, reduced along `axis_`.
    """
    s_nrm = s_xr_*s_yr_ + s_xi_*s_yi_
    s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)
def op_ortho_loss(s_x_, axes_=(-2, -1), ndim_=None):
    """
    Orthogonal matrix loss, used to regularize a parameter towards unitary.

    Computes the element-wise squared difference between X^T X and the
    identity matrix.

    Args:
        s_x_: (batch of) matrices
        axes_: tuple of two integers, specify which axes form the matrix,
            defaults to last two axes
        ndim_: specify args to be (ndim_ x ndim_) matrices; when None the
            size is read symbolically from the shape of `s_x_`

    Returns:
        The (unreduced) squared error tensor.
    """
    if ndim_ is None:
        ndim = T.shape(s_x_)[axes_[0]]
    else:
        ndim = ndim_

    # The transpose pattern permutes the tensor's axes, so it must have one
    # entry per tensor dimension (s_x_.ndim), not per matrix row.
    tpat = list(range(s_x_.ndim))
    bpat = ['x'] * s_x_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1
    s_y = T.dot(s_x_.transpose(*tpat), s_x_)
    return T.sqr(s_y - T.eye(ndim).dimshuffle(*bpat))
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    """Add ESGD update rules for `params` to `updates` (modified in place).

    Args:
        updates: mapping that receives the new update expressions
        params: parameters to update
        gparams: gradient expressions, one per parameter
        gsums: momentum accumulators, one per parameter
        xsums: running averages `D` of the squared T.Lop output, one per
            parameter
        lr: learning rate
        eps: constant added under the square root
        gamma: decay of the running average `D`
        momentum: shared variable; momentum is used only if its current
            value is > 0

    Raises:
        Exception: if a parameter is a subtensor op (not implemented).
    """
    has_momentum = momentum.get_value() > 0.0
    samples = [default_mrng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX) for p in params]
    HVs = T.Lop(gparams, params, samples)

    # step counter, used to bias-correct the running average D
    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")

        D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
        if has_momentum:
            m_t = m*momentum + g
            updates[m] = m_t
        else:
            m_t = g
        g_t = m_t / (T.sqrt(D_t/omg_t + eps))
        updates[D] = D_t
        updates[p] = p - lr*g_t
    updates[i] = i_t
def dot_2d(k, M, b=None, g=None):
    """Weighted dot product between each key and every memory row.

    k: (nb_samples, memory_width)
    M: (nb_samples, memory_dim, memory_width)
    b: optional 2-D weights, broadcast as b[:, None, :]
    g: optional 1-D weights, broadcast as g[None, None, :]

    Returns the element-wise products summed over axis 2,
    i.e. (nb_samples, memory_dim).
    """
    # (nb_samples, 1, memory_width) * M -> (nb_samples, memory_dim, memory_width)
    weighted = k[:, None, :] * M

    if b is not None:
        weighted = weighted * b[:, None, :]
    if g is not None:
        weighted = weighted * g[None, None, :]

    return T.sum(weighted, axis=2)
def Adam(cost, params, learning_rate=0.0002, b1=0.1, b2=0.001, e=1e-8):
    """Build Adam-style update rules for `params` minimising `cost`.

    Note the convention: `b1` and `b2` are the weights of the *new*
    gradient and squared gradient; the previous averages are weighted by
    (1 - b1) and (1 - b2).

    Returns an OrderedDict mapping each shared variable (moments,
    parameters and the step counter) to its new value.
    """
    step = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    next_step = step + 1.
    correction1 = 1. - (1. - b1)**next_step
    correction2 = 1. - (1. - b2)**next_step
    step_size = learning_rate * (T.sqrt(correction2) / correction1)

    updates = OrderedDict()
    for param, grad in zip(params, T.grad(cost, params)):
        first_moment = theano.shared(param.get_value() * 0.)
        second_moment = theano.shared(param.get_value() * 0.)

        new_first = (b1 * grad) + ((1. - b1) * first_moment)
        new_second = (b2 * T.sqr(grad)) + ((1. - b2) * second_moment)
        direction = new_first / (T.sqrt(new_second) + e)

        updates[first_moment] = new_first
        updates[second_moment] = new_second
        updates[param] = param - (step_size * direction)

    updates[step] = next_step
    return updates
def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    """Build Adam-style updates with element-wise gradient clipping.

    Gradients are clipped to [-grad_clip, grad_clip]; `grad_clip` is read
    from the enclosing (module) scope. `b1` / `b2` weight the *new*
    gradient and squared gradient, the previous averages get
    (1 - b1) / (1 - b2).

    NOTE(review): with this convention the defaults 0.9 / 0.999 put almost
    all weight on the newest gradient -- confirm this is intended.

    Returns a list of (shared_variable, new_value) pairs.
    """
    # the step counter starts at 1, so the first correction uses t = 2
    step = theano.shared(np.dtype(theano.config.floatX).type(1))
    next_step = step + 1.
    correction1 = 1. - (1. - b1)**next_step
    correction2 = 1. - (1. - b2)**next_step
    step_size = lr * (T.sqrt(correction2) / correction1)

    updates = []
    for param, raw_grad in zip(params, T.grad(cost, params)):
        grad = T.clip(raw_grad, -grad_clip, grad_clip)
        first_moment = theano.shared(param.get_value() * 0.)
        second_moment = theano.shared(param.get_value() * 0.)

        new_first = (b1 * grad) + ((1. - b1) * first_moment)
        new_second = (b2 * T.sqr(grad)) + ((1. - b2) * second_moment)
        direction = new_first / (T.sqrt(new_second) + e)

        updates += [
            (first_moment, new_first),
            (second_moment, new_second),
            (param, param - (step_size * direction)),
        ]
    updates.append((step, next_step))
    return updates
def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, **kwargs):
    """Adam Gradient Descent

    Scale learning rates by Adaptive moment estimation.

    Args:
        cost: scalar expression to minimise
        params: shared variables to update
        learning_rate: base step size
        beta1: decay rate of the first-moment average
        beta2: decay rate of the second-moment average
        epsilon: constant added to the denominator
        **kwargs: accepted and ignored

    Returns:
        OrderedDict mapping each shared variable (moments, parameters and
        the step counter) to its new value.

    .. [1] Kingma & Ba, "Adam: A Method for Stochastic Optimization".
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    t = shared_variable(to_float_X(0.))
    t_t = 1. + t
    # bias corrections of both moments folded into the step size
    l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)
    for param, gparam in zip(params, gparams):
        m = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
        v = shared_variable(np.zeros(param.get_value(borrow=True).shape), broadcastable=param.broadcastable)
        m_t = beta1 * m + (1. - beta1) * gparam
        v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
        updates[m] = m_t
        updates[v] = v_t
        updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)
    updates[t] = t_t
    return updates
def __call__(self, c01b):
        """
        Divide each activation by a power of the summed squares of
        neighbouring channels.

        c01b: 4-D tensor whose shape unpacks as (channels, rows, cols, batch).

        Returns c01b / (k + alpha * window_sum(c01b ** 2)) ** beta, where the
        window spans `self.n` channels of the zero-padded squared input.
        """
        half = self.n // 2
        ch, r, c, b = c01b.shape

        # zero-pad the squared input with `half` channels on each side
        padded = T.alloc(0., ch + 2*half, r, c, b)
        sq = T.set_subtensor(padded[half:half+ch,:,:,:], T.sqr(c01b))

        scale = self.k
        for i in range(self.n):
            # rebind instead of `+=` so self.k can never be modified in place
            scale = scale + self.alpha * sq[i:i+ch,:,:,:]

        return c01b / scale ** self.beta
def __call__(self, c01b):
        """
        Divide each activation by a power of the summed squares of
        neighbouring channels.

        c01b: 4-D tensor whose shape unpacks as (channels, rows, cols, batch).

        Returns c01b / (k + alpha * window_sum(c01b ** 2)) ** beta, where the
        window spans `self.n` channels of the zero-padded squared input.
        """
        half = self.n // 2
        ch, r, c, b = c01b.shape

        # zero-pad the squared input with `half` channels on each side
        padded = T.alloc(0., ch + 2*half, r, c, b)
        sq = T.set_subtensor(padded[half:half+ch,:,:,:], T.sqr(c01b))

        scale = self.k
        for i in range(self.n):
            # rebind instead of `+=` so self.k can never be modified in place
            scale = scale + self.alpha * sq[i:i+ch,:,:,:]

        return c01b / scale ** self.beta
def get_cost(self, X, Y, X_sizes):
        # NOTE(review): the three lines below are the text of this method's
        # docstring; its triple-quote delimiters are missing from this copy.
        Calculates cost for each values in mini batch, also
        regularizes all the input parameters and then returns
        final cost function as theano variable

        # NOTE(review): this theano.scan(...) call is truncated in this copy:
        # it has no `fn=` argument and no closing parenthesis. The per-sample
        # cost function cannot be recovered from what is visible here.
        cost_fn, _ = theano.scan(
            sequences=[X, Y, X_sizes]

        # Mean of the scan outputs over the sequences.
        cost_fn = cost_fn.mean()
        # L2 penalty: reg_lambda * sum(w ** 2) / 2 for each listed parameter.
        cost_fn += self.reg_lambda * T.sqr(self.W_c_r).sum() / 2.
        cost_fn += self.reg_lambda * T.sqr(self.W_c_l).sum() / 2.
        cost_fn += self.reg_lambda * T.sqr(self.W_conv).sum() / 2.
        cost_fn += self.reg_lambda * T.sqr(self.W_output).sum() / 2.
        cost_fn += self.reg_lambda * T.sqr(self.b_output).sum() / 2.

        # Regularizing word embedding
        cost_fn += self.reg_lambda * T.sqr(self.vector_dict).sum() / 2

        return cost_fn
def define_loss(self):
        """Define `pred_func`, `loss` and `regul_func` on the instance.

        The score of each (row, col, tube) triple is the sum of four
        trilinear terms over the two embedding halves (e1/e2, r1/r2).
        `pred_func` is the sigmoid of the score, `loss` the mean softplus of
        -ys * score, and `regul_func` the sum of the mean squares of the six
        embedding slices involved.
        """
        row_e1 = self.e1[self.rows,:]
        row_e2 = self.e2[self.rows,:]
        tube_e1 = self.e1[self.tubes,:]
        tube_e2 = self.e2[self.tubes,:]
        col_r1 = self.r1[self.cols,:]
        col_r2 = self.r2[self.cols,:]

        score = TT.sum(row_e1 * col_r1 * tube_e1, 1) \
              + TT.sum(row_e2 * col_r1 * tube_e2, 1) \
              + TT.sum(row_e1 * col_r2 * tube_e2, 1) \
              - TT.sum(row_e2 * col_r2 * tube_e1, 1)

        self.pred_func = TT.nnet.sigmoid(score)
        self.loss = TT.nnet.softplus(- self.ys * score).mean()

        self.regul_func = TT.sqr(row_e1).mean() \
                        + TT.sqr(row_e2).mean() \
                        + TT.sqr(tube_e1).mean() \
                        + TT.sqr(tube_e2).mean() \
                        + TT.sqr(col_r1).mean() \
                        + TT.sqr(col_r2).mean()
def fit(self, weights, o_error, tpo ):
        """Build update pairs combining a momentum step with an RMS-scaled step.

        For every weight the new value is
        w + new_velocity - learn_rate * g / sqrt(new_cache + 0.1**8),
        so the gradient contributes both through the velocity and through
        the RMS-scaled term.

        `tpo` supplies "momentum_rate", "learn_rate" and "decay_rate".
        Returns a list of (shared_variable, new_value) pairs.
        """
        updates = []
        gradients = T.grad(o_error ,weights)
        for cache, velocity, weight, grad in zip(self.t_cache, self.t_velocity, weights, gradients):
            # velocity <- momentum_rate * velocity - learn_rate * grad
            momentum_term = T.mul(tpo["momentum_rate"], velocity)
            gradient_term = T.mul(tpo["learn_rate"], grad)
            new_velocity = T.sub(momentum_term, gradient_term)

            # cache <- decay_rate * cache + (1 - decay_rate) * grad ** 2
            decayed = T.mul(tpo["decay_rate"], cache)
            fresh = T.mul(T.sub(1, tpo["decay_rate"]), T.sqr(grad))
            new_cache = T.add(decayed, fresh)

            rms = T.sqrt(T.add(new_cache, 0.1**8))
            scaled_step = T.true_div(T.mul(grad, tpo["learn_rate"]), rms)
            new_weight = T.sub(T.add(weight, new_velocity), scaled_step)

            updates.extend([
                (weight, new_weight),
                (velocity, new_velocity),
                (cache, new_cache),
            ])

        return updates

######                 Nesterov momentum
def fit(self, weights, o_error, tpo):
        """Build RMS-scaled gradient update pairs for `weights`.

        cache <- decay_rate * cache + (1 - decay_rate) * g ** 2
        w     <- w - learn_rate * g / sqrt(cache + 0.1**8)

        `tpo` supplies "decay_rate" and "learn_rate".
        Returns a list of (shared_variable, new_value) pairs.
        """
        gradients = theano.grad(o_error, weights)

        updates = []
        for cache, weight, grad in zip(self.t_cache, weights, gradients):
            averaged = tpo["decay_rate"] * cache + ( 1- tpo["decay_rate"]) * T.sqr(grad)
            stepped = weight - (grad * tpo["learn_rate"]) / T.sqrt(averaged + 0.1**8)
            updates += [(weight, stepped), (cache, averaged)]

        return updates

def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
        deterministic, binary, L):
    """Build the VAE loss and prediction expressions.

    The layer outputs are requested in the order
    [z_mu, z_ls] + x_mu (L) + x_ls (L) + x samples (L) + [x].
    In the binary case the x samples are read from index 2 onwards.
    NOTE(review): this assumes `l_x_mu_list` and `l_x_ls_list` are empty
    when `binary` is true -- confirm against the caller.

    Args:
        input_var: network input, also the reconstruction target
        l_z_mu, l_z_ls: layers giving the latent mean and log-sigma
        l_x_mu_list, l_x_ls_list: per-sample output mean / log-sigma layers
        l_x_list: per-sample output layers
        l_x: final output layer
        deterministic: forwarded to `get_output`; also selects the prediction
        binary: use binary cross-entropy instead of `log_likelihood`
        L: number of samples

    Returns:
        (loss, prediction), with loss = -(logpxz + kl_div).
    """
    layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list
            + l_x_list + [l_x], deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x = layer_outputs[-1]
    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
    if binary:
        x_list = layer_outputs[2:2+L]
        logpxz = sum(nn.objectives.binary_crossentropy(x_sample, input_var).sum()
                for x_sample in x_list) * (-1./L)
        prediction = x_list[0] if deterministic else x
    else:
        x_mu = layer_outputs[2:2+L]
        x_ls = layer_outputs[2+L:2+2*L]
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
            for mu, ls in zip(x_mu, x_ls))/L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0)/L
    loss = -1 * (logpxz + kl_div)
    return loss, prediction
def sym_logdensity(self, x):
        """Return the symbolic log-density of the datapoints in `x`.

        x is a matrix of column datapoints (VxB) V = n_visible, B = batch size.

        The density is accumulated one dimension at a time with theano.scan;
        each step adds the log of a mixture of Gaussians for that dimension.

        NOTE(review): the T.dot(...) calls were missing from this copy and
        have been restored; the `x_prev` and `x.T` operands are
        reconstructions -- confirm against the upstream source.

        Returns:
            (log-density after the last dimension, scan updates).
        """
        def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev):
            # update the running pre-activation with the previous dimension
            a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
            h = self.nonlinearity(a * activations_factor)  # BxH

            Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
            Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
            Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
            p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) + T.log(Alpha))
            return (p, a, x)
        # First element is different (it is predicted from the bias only)
        a0 = T.zeros_like(T.dot(x.T, self.W))  # BxH
        p0 = T.zeros_like(x[0])
        x0 = T.ones_like(x[0])
        ([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
                                                sequences=[x, self.W, self.V_alpha, self.b_alpha, self.V_mu, self.b_mu, self.V_sigma, self.b_sigma, self.activation_rescaling],
                                                outputs_info=[p0, a0, x0])
        return (ps[-1], updates)
def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    """Build Adam update rules from precomputed gradients.

    Args:
        params: shared variables to update
        grads: gradient expressions, one per parameter
        lr: learning rate
        b1: decay rate of the first-moment average
        b2: decay rate of the second-moment average
        e: constant added to the denominator

    Returns:
        OrderedDict mapping each shared variable (moments, parameters and
        the step counter) to its new value.
    """
    updates = OrderedDict()
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        # Bias-correct the *updated* first moment; correcting the stale `v`
        # with the step-t factor makes the very first update zero.
        v_hat = v_t / (1 - b1 ** i_t)

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
def gradients_to_updates(self, params, grads):
        """Build AdaDelta-style update rules for `params`.

        For every parameter two accumulators are created: a running average
        of squared gradients and a running average of squared update steps,
        both decayed by `self.rho`.

        NOTE(review): the tails of the two theano.shared(...) calls and the
        factor multiplying `ud` were missing from this copy. They are
        reconstructed here as the parameter name and `self.lr` -- confirm
        against the upstream source.

        Returns:
            OrderedDict mapping each shared variable to its new value.
        """
        updates = OrderedDict()
        for pp, gg in zip(params, grads):
            value = pp.get_value(borrow=True)
            # self.accu / self.delta_accu are rebound on every iteration, so
            # after the loop they refer to the last parameter's accumulators.
            self.accu = theano.shared(np.zeros(value.shape, dtype=theano.config.floatX), 'adadelta_accu_' + str(pp.name))
            self.delta_accu = theano.shared(np.zeros(value.shape, dtype=theano.config.floatX), 'adadelta_delta_accu_' + str(pp.name))
            self.accu.tags = ['optimizer_param']
            self.delta_accu.tags = ['optimizer_param']
            accu_new = self.rho * self.accu + (1 - self.rho) * T.sqr(gg)
            updates[self.accu] = accu_new
            ud = gg * (T.sqrt(self.delta_accu) + 1e-7) / (T.sqrt(accu_new) + 1e-7)
            updates[pp] = pp - self.lr * ud
            delta_accu_new = self.rho * self.delta_accu + (1 - self.rho) * T.sqr(ud)
            updates[self.delta_accu] = delta_accu_new
        return updates
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    """Add ESGD update rules for `params` to `updates` (modified in place).

    Args:
        updates: mapping that receives the new update expressions
        params: parameters to update
        gparams: gradient expressions, one per parameter
        gsums: momentum accumulators, one per parameter
        xsums: running averages `D` of the squared T.Lop output, one per
            parameter
        lr: learning rate
        eps: constant added under the square root
        gamma: decay of the running average `D`
        momentum: shared variable; momentum is used only if its current
            value is > 0

    Raises:
        Exception: if a parameter is a subtensor op (not implemented).
    """
    has_momentum = momentum.get_value() > 0.0
    samples = [default_mrng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX) for p in params]
    HVs = T.Lop(gparams, params, samples)

    # step counter, used to bias-correct the running average D
    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")

        D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
        if has_momentum:
            m_t = m*momentum + g
            updates[m] = m_t
        else:
            m_t = g
        g_t = m_t / (T.sqrt(D_t/omg_t + eps))
        updates[D] = D_t
        updates[p] = p - lr*g_t
    updates[i] = i_t
def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
        deterministic, binary, L):
    """Build the VAE loss and prediction expressions.

    The layer outputs are requested in the order
    [z_mu, z_ls] + x_mu (L) + x_ls (L) + x samples (L) + [x].
    In the binary case the x samples are read from index 2 onwards.
    NOTE(review): this assumes `l_x_mu_list` and `l_x_ls_list` are empty
    when `binary` is true -- confirm against the caller.

    Args:
        input_var: network input, also the reconstruction target
        l_z_mu, l_z_ls: layers giving the latent mean and log-sigma
        l_x_mu_list, l_x_ls_list: per-sample output mean / log-sigma layers
        l_x_list: per-sample output layers
        l_x: final output layer
        deterministic: forwarded to `get_output`; also selects the prediction
        binary: use binary cross-entropy instead of `log_likelihood`
        L: number of samples

    Returns:
        (loss, prediction), with loss = -(logpxz + kl_div).
    """
    layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list
            + l_x_list + [l_x], deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x = layer_outputs[-1]
    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
    if binary:
        x_list = layer_outputs[2:2+L]
        logpxz = sum(nn.objectives.binary_crossentropy(x_sample, input_var).sum()
                for x_sample in x_list) * (-1./L)
        prediction = x_list[0] if deterministic else x
    else:
        x_mu = layer_outputs[2:2+L]
        x_ls = layer_outputs[2+L:2+2*L]
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
            for mu, ls in zip(x_mu, x_ls))/L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0)/L
    loss = -1 * (logpxz + kl_div)
    return loss, prediction
def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
        """
        RMSProp of Tieleman et al.

        Appends the new update pairs to `self.updates`.

        :param learning_rate: learning rate
        :param decay: decay rate of gradient history
        :param epsilon: lower bound applied to the RMS of the gradient
            before dividing by it
        :return: `self.updates`, a list of (shared_variable, new_value) pairs
        """
        for param_i, grad_i in zip(self.params, self.grads):
            # Accumulate gradient
            msg = theano.shared(numpy.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
            new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))

            # Compute update
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, epsilon)
            delta_x_t = -learning_rate * grad_i / rms_grad_t

            # Apply update
            self.updates.append((param_i, param_i + delta_x_t))
            self.updates.append((msg, new_mean_squared_grad))

        return self.updates
项目:diagnose-heart    作者:woshialex    | 项目源码 | 文件源码
def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
        deterministic, binary, L):
    """Build the VAE loss and prediction expressions.

    The layer outputs are requested in the order
    [z_mu, z_ls] + x_mu (L) + x_ls (L) + x samples (L) + [x].
    In the binary case the x samples are read from index 2 onwards.
    NOTE(review): this assumes `l_x_mu_list` and `l_x_ls_list` are empty
    when `binary` is true -- confirm against the caller.

    Args:
        input_var: network input, also the reconstruction target
        l_z_mu, l_z_ls: layers giving the latent mean and log-sigma
        l_x_mu_list, l_x_ls_list: per-sample output mean / log-sigma layers
        l_x_list: per-sample output layers
        l_x: final output layer
        deterministic: forwarded to `get_output`; also selects the prediction
        binary: use binary cross-entropy instead of `log_likelihood`
        L: number of samples

    Returns:
        (loss, prediction), with loss = -(logpxz + kl_div).
    """
    layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list
            + l_x_list + [l_x], deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x = layer_outputs[-1]
    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
    if binary:
        x_list = layer_outputs[2:2+L]
        logpxz = sum(nn.objectives.binary_crossentropy(x_sample, input_var).sum()
                for x_sample in x_list) * (-1./L)
        prediction = x_list[0] if deterministic else x
    else:
        x_mu = layer_outputs[2:2+L]
        x_ls = layer_outputs[2+L:2+2*L]
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
            for mu, ls in zip(x_mu, x_ls))/L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0)/L
    loss = -1 * (logpxz + kl_div)
    return loss, prediction
项目:diagnose-heart    作者:woshialex    | 项目源码 | 文件源码
def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
        deterministic, binary, L):
    """Build the VAE loss and prediction expressions.

    The layer outputs are requested in the order
    [z_mu, z_ls] + x_mu (L) + x_ls (L) + x samples (L) + [x].
    In the binary case the x samples are read from index 2 onwards.
    NOTE(review): this assumes `l_x_mu_list` and `l_x_ls_list` are empty
    when `binary` is true -- confirm against the caller.

    Args:
        input_var: network input, also the reconstruction target
        l_z_mu, l_z_ls: layers giving the latent mean and log-sigma
        l_x_mu_list, l_x_ls_list: per-sample output mean / log-sigma layers
        l_x_list: per-sample output layers
        l_x: final output layer
        deterministic: forwarded to `get_output`; also selects the prediction
        binary: use binary cross-entropy instead of `log_likelihood`
        L: number of samples

    Returns:
        (loss, prediction), with loss = -(logpxz + kl_div).
    """
    layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list
            + l_x_list + [l_x], deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x = layer_outputs[-1]
    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
    if binary:
        x_list = layer_outputs[2:2+L]
        logpxz = sum(nn.objectives.binary_crossentropy(x_sample, input_var).sum()
                for x_sample in x_list) * (-1./L)
        prediction = x_list[0] if deterministic else x
    else:
        x_mu = layer_outputs[2:2+L]
        x_ls = layer_outputs[2+L:2+2*L]
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
            for mu, ls in zip(x_mu, x_ls))/L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0)/L
    loss = -1 * (logpxz + kl_div)
    return loss, prediction
def gradient_descent(self, loss):
        """Momentum GD with gradient clipping.

        Gradients are rescaled so that their global L2 norm is at most 5.0.
        If that norm is NaN or infinite, 0.1 * param is used as the gradient.

        NOTE(review): `self.momentum_velocity_` is reset to plain zeros on
        every call and is not a shared variable, so the velocity read inside
        the loop is always 0. -- confirm whether persistent momentum was
        intended.

        Returns an OrderedDict mapping each parameter to its new value.
        """
        grads = T.grad(loss, self.params)
        self.momentum_velocity_ = [0.] * len(grads)

        squared_sums = [T.sqr(g).sum() for g in grads]
        grad_norm = T.sqrt(sum(squared_sums))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_den = T.maximum(5.0, grad_norm)

        updates = OrderedDict()
        for idx, (param, raw_grad) in enumerate(zip(self.params, grads)):
            safe_grad = T.switch(not_finite, 0.1 * param,
                                 raw_grad * (5.0 / scaling_den))
            previous = self.momentum_velocity_[idx]
            step = self.momentum * previous - self.learning_rate * safe_grad
            self.momentum_velocity_[idx] = step
            updates[param] = param + step
        return updates
def Adam(self, params, cost, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        """Build Adam-style update pairs for `params` minimising `cost`.

        `b1` and `b2` are the weights of the *new* gradient and squared
        gradient; the previous averages are weighted by (1 - b1), (1 - b2).

        Returns a list of (shared_variable, new_value) pairs.
        """
        step = theano.shared(as_floatX(0.))
        next_step = step + 1.
        correction1 = 1. - (1. - b1)**next_step
        correction2 = 1. - (1. - b2)**next_step
        step_size = lr * (T.sqrt(correction2) / correction1)

        updates = []
        for param, grad in zip(params, T.grad(cost, params)):
            first_moment = theano.shared(param.get_value() * 0.)
            second_moment = theano.shared(param.get_value() * 0.)

            new_first = (b1 * grad) + ((1. - b1) * first_moment)
            new_second = (b2 * T.sqr(grad)) + ((1. - b2) * second_moment)
            direction = new_first / (T.sqrt(new_second) + e)

            updates += [
                (first_moment, new_first),
                (second_moment, new_second),
                (param, param - (step_size * direction)),
            ]
        updates.append((step, next_step))
        return updates
def Adam(grads, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    """Build Adam-style update pairs from a {parameter: gradient} mapping.

    `b1` and `b2` are the weights of the *new* gradient and squared
    gradient; the previous averages are weighted by (1 - b1), (1 - b2).
    The moment accumulators are named after their parameter.

    Returns:
        (updates, varlist): `updates` is a list of
        (shared_variable, new_value) pairs.
        NOTE(review): nothing visible here ever appends to `varlist`, so it
        is returned empty -- confirm against the upstream source.
    """
    updates = []
    varlist = []
    i = sharedX(0.)
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in grads.items():
        m = sharedX(p.get_value() * 0., p.name + '_adam_optimizer_m')
        v = sharedX(p.get_value() * 0., p.name + '_adam_optimizer_v')
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, varlist
def Adagrad(grads, lr):
    """Build Adagrad update rules from a {parameter: gradient} mapping.

    Each parameter gets an accumulator of summed squared gradients; the step
    is -lr * g / sqrt(1e-5 + accumulated sum).

    Returns:
        OrderedDict mapping each shared variable to its new value.
    """
    updates = OrderedDict()
    for param, grad in grads.items():
        # sum_square_grad := \sum g^2
        sum_square_grad = sharedX(param.get_value() * 0.)
        if param.name is not None:
            sum_square_grad.name = 'sum_square_grad_' + param.name

        # Accumulate gradient
        new_sum_squared_grad = sum_square_grad + T.sqr(grad)

        # Compute update
        delta_x_t = (- lr / T.sqrt(numpy.float32(1e-5) + new_sum_squared_grad)) * grad

        # Apply update
        updates[sum_square_grad] = new_sum_squared_grad
        updates[param] = param + delta_x_t
    return updates
def dot_2d(k, M, b=None, g=None):
    """Weighted dot product between each key and every memory row.

    k: (nb_samples, memory_width)
    M: (nb_samples, memory_dim, memory_width)
    b: optional 2-D weights, broadcast as b[:, None, :]
    g: optional 1-D weights, broadcast as g[None, None, :]

    Returns the element-wise products summed over axis 2,
    i.e. (nb_samples, memory_dim).
    """
    # (nb_samples, 1, memory_width) * M -> (nb_samples, memory_dim, memory_width)
    weighted = k[:, None, :] * M

    if b is not None:
        weighted = weighted * b[:, None, :]
    if g is not None:
        weighted = weighted * g[None, None, :]

    return T.sum(weighted, axis=2)
def get_adam_updates(f, params, lr=10., b1=0.9, b2=0.999, e=1e-8, dec=5e-3, norm_grads=False):
    """Generate Adam updates for `params` with a decaying learning rate.

    The base learning rate is divided by (1 + t * dec) at step t. When
    `norm_grads` is true each gradient is first divided by its L1 norm
    (plus 1e-8).

    Returns the update pairs in the order: first moments, second moments,
    parameters, step counter.
    """
    def _shared_zeros(param):
        return theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True)

    t = theano.shared(0)
    ms = [_shared_zeros(param) for param in params]
    vs = [_shared_zeros(param) for param in params]

    gs = T.grad(f, params)
    if norm_grads:
        gs = [g / (T.sum(T.abs_(g)) + 1e-8) for g in gs]

    t_next = t + 1
    new_ms = [b1 * m + (1. - b1) * g for m, g in zip(ms, gs)]
    new_vs = [b2 * v + (1. - b2) * T.sqr(g) for v, g in zip(vs, gs)]

    t_float = T.cast(t_next, floatX)
    decayed_lr = lr / (1. + t_float * dec)
    lr_hat = decayed_lr * T.sqrt(1. - T.pow(b2, t_float)) / (1. - T.pow(b1, t_float))

    new_params = [param - lr_hat * new_m / (T.sqrt(new_v) + e)
                  for new_m, new_v, param in zip(new_ms, new_vs, params)]

    m_us = list(zip(ms, new_ms))
    v_us = list(zip(vs, new_vs))
    param_us = list(zip(params, new_params))
    return m_us + v_us + param_us + [(t, t_next)]
def Adam(grads, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    """Build Adam-style update pairs from a {parameter: gradient} mapping.

    `b1` and `b2` are the weights of the *new* gradient and squared
    gradient; the previous averages are weighted by (1 - b1), (1 - b2).
    The moment accumulators are named after their parameter.

    Returns:
        (updates, varlist): `updates` is a list of
        (shared_variable, new_value) pairs.
        NOTE(review): nothing visible here ever appends to `varlist`, so it
        is returned empty -- confirm against the upstream source.
    """
    updates = []
    varlist = []
    i = sharedX(0.)
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in grads.items():
        m = sharedX(p.get_value() * 0., p.name + '_adam_optimizer_m')
        v = sharedX(p.get_value() * 0., p.name + '_adam_optimizer_v')
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, varlist
def Adagrad(grads, lr):
    """Build Adagrad update rules from a {parameter: gradient} mapping.

    Each parameter gets an accumulator of summed squared gradients; the step
    is -lr * g / sqrt(1e-5 + accumulated sum).

    Returns:
        OrderedDict mapping each shared variable to its new value.
    """
    updates = OrderedDict()
    for param, grad in grads.items():
        # sum_square_grad := \sum g^2
        sum_square_grad = sharedX(param.get_value() * 0.)
        if param.name is not None:
            sum_square_grad.name = 'sum_square_grad_' + param.name

        # Accumulate gradient
        new_sum_squared_grad = sum_square_grad + T.sqr(grad)

        # Compute update
        delta_x_t = (- lr / T.sqrt(numpy.float32(1e-5) + new_sum_squared_grad)) * grad

        # Apply update
        updates[sum_square_grad] = new_sum_squared_grad
        updates[param] = param + delta_x_t
    return updates
def op_l2norm(s_x_, eps_=1e-6):
    """L2 norm over all elements of `s_x_`, with `eps_` added under the sqrt."""
    s_sqr_sum = T.sum(T.sqr(s_x_))
    return T.sqrt(eps_ + s_sqr_sum)
项目:fxnn    作者:khaotik    | 项目源码 | 文件源码
def op_cosine(s_u_, s_v_, flatten_=True, eps_=1e-6):
    """Cosine similarity between `s_u_` and `s_v_`.

    Args:
        s_u_, s_v_: input tensors
        flatten_: if True, both inputs are flattened and a single cosine is
            returned; otherwise the cosine is taken along the last axis
        eps_: added under the square root to prevent divide by zero

    Returns:
        The symbolic cosine.
    """
    if flatten_:
        s_u = s_u_.flatten()
        s_v = s_v_.flatten()
        return T.dot(s_u, s_v) / T.sqrt(eps_+T.sum(T.sqr(s_u))*T.sum(T.sqr(s_v)))

    s_u = s_u_
    s_v = s_v_
    # this branch computed the value without returning it
    return T.sum(s_u*s_v, axis=-1)/T.sqrt(eps_+T.sum(T.sqr(s_u), axis=-1)*T.sum(T.sqr(s_v), axis=-1))
项目:deep_srl    作者:luheng    | 项目源码 | 文件源码
def gradient_clipping(gradients, max_norm=5.0):
  """Rescale `gradients` so that their global L2 norm is at most `max_norm`.

  Gradients whose global norm is already below `max_norm` are multiplied
  by 1.0. Returns a new list of scaled gradients.
  """
  squared_sums = [tensor.sqr(g).sum() for g in gradients]
  global_grad_norm = tensor.sqrt(sum(squared_sums))
  within_limit = global_grad_norm < max_norm
  multiplier = tensor.switch(within_limit, 1.0, max_norm / global_grad_norm)
  return [g * multiplier for g in gradients]
def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
        """
        RMSProp of Tieleman et al.

        :param learning_rate: learning rate
        :param decay: decay rate of gradient history
        :param epsilon: lower bound applied to the RMS of the gradient
            before dividing by it
        :return: list of (shared_variable, new_value) update pairs
        """
        updates = []

        for param_i, grad_i in zip(self.params, self.grads):
            # Accumulate gradient
            msg = theano.shared(numpy.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
            new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))

            # Compute update
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, epsilon)
            delta_x_t = -learning_rate * grad_i / rms_grad_t

            # Apply update
            updates.append((param_i, param_i + delta_x_t))
            updates.append((msg, new_mean_squared_grad))

        return updates
def mean_squared_error(y_true, y_pred):
    """Mean of the squared difference, taken over the last axis."""
    residual = y_pred - y_true
    return T.sqr(residual).mean(axis=-1)
项目:CopyNet    作者:MultiPath    | 项目源码 | 文件源码
def mean_squared_logarithmic_error(y_true, y_pred):
    """Mean squared difference of log(1 + y), taken over the last axis.

    Both inputs are clipped from below at `epsilon` (read from the
    enclosing scope) before the logarithm.
    """
    log_pred = T.log(T.clip(y_pred, epsilon, np.inf) + 1.)
    log_true = T.log(T.clip(y_true, epsilon, np.inf) + 1.)
    return T.sqr(log_pred - log_true).mean(axis=-1)
项目:CopyNet    作者:MultiPath    | 项目源码 | 文件源码
def squared_hinge(y_true, y_pred):
    """Mean of max(1 - y_true * y_pred, 0) squared, over the last axis."""
    margin = T.maximum(1. - y_true * y_pred, 0.)
    return T.sqr(margin).mean(axis=-1)
项目:CopyNet    作者:MultiPath    | 项目源码 | 文件源码
def cosine_sim2d(k, M):
    """Cosine similarity between each key and every memory row.

    k: (nb_samples, memory_width)
    M: (nb_samples, memory_dim, memory_width)

    1e-5 is added to each norm. Returns (nb_samples, memory_dim).
    """
    key_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5      # (nb_samples,)
    mem_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5      # (nb_samples, memory_dim)

    # broadcast the key over the memory_dim axis
    dots = T.sum(k[:, None, :] * M, axis=2)           # (nb_samples, memory_dim)
    norms = key_norm[:, None] * mem_norm              # (nb_samples, memory_dim)
    return dots / norms
def op_sqr_c(s_xr_, s_xi_):
    """
    Elementwise complex square.

    Returns the pair (real part, imag part) of (s_xr_ + i*s_xi_) ** 2.
    """
    return T.sqr(s_xr_) - T.sqr(s_xi_), 2*s_xr_*s_xi_
项目:dnc-theano    作者:khaotik    | 项目源码 | 文件源码
def op_norm2(s_x_, axis_=-1, use_mean_=False, keepdims_=True):
    """
    Square of L2 norm.

    Args:
        s_x_: input (batch of) vector
        axis_: int or tuple of int
        use_mean_: if True, reduce the squares with T.mean instead of T.sum
        keepdims_: passed through to the reduction

    Returns:
        The reduced sum (or mean) of squares.
    """
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
项目:dnc-theano    作者:khaotik    | 项目源码 | 文件源码
def op_norm2_c(s_xr_, s_xi_, axis_=-1, use_mean_=False, keepdims_=True):
    """
    Complex squared L2 norm.

    Reduces sqr(real) + sqr(imag) along `axis_` with T.sum, or with T.mean
    when `use_mean_` is True; `keepdims_` is passed through.
    """
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
项目:dnc-theano    作者:khaotik    | 项目源码 | 文件源码
def op_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    """
    Cosine between two vectors.

    Computes sum(x * y) / sqrt(sum(x^2) * sum(y^2) + eps_), with every
    reduction taken along `axis_` and `keepdims_` passed through.
    """
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return (T.sum(s_prod, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_))
项目:dnc-theano    作者:khaotik    | 项目源码 | 文件源码
def op_sqr_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    """
    Squared cosine.

    For some occasions the sqrt is not needed: this computes
    sum(x * y)^2 / (sum(x^2) * sum(y^2) + eps_), with every reduction taken
    along `axis_` and `keepdims_` passed through.
    """
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return (T.sqr(T.sum(s_prod, axis=axis_, keepdims=keepdims_)) / (s_nx * s_ny + eps_))
项目:dnc-theano    作者:khaotik    | 项目源码 | 文件源码
def op_unitary_loss(s_re_, s_im_, axes_=None, size_=None):
    """
    Unitary matrix loss of real/imag part,
    used to regularize parameter to unitary.

    With U = re + i*im, the real part of U^H U is re^T re + im^T im and the
    imaginary part is re^T im - (re^T im)^T; the loss is the mean squared
    deviation of that product from the identity.

    Args:
        s_re_: real part, square matrix
        s_im_: imag part, square matrix
        axes_: tuple of two integers, specify which axes form the matrix,
            defaults to last two axes
        size_: specify args to be (size_ x size_) matrices; when None the
            size is read symbolically from the shape of `s_re_`

    Returns:
        Scalar mean squared error.
    """
    if axes_ is None:
        axes_ = (-2, -1)

    if size_ is None:
        size = T.shape(s_re_)[axes_[0]]
    else:
        size = size_

    assert s_re_.ndim == s_im_.ndim

    tpat = list(range(s_re_.ndim))
    bpat = ['x'] * s_re_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1
    s_y_re_ = T.dot(s_re_.transpose(*tpat), s_re_) + T.dot(s_im_.transpose(*tpat), s_im_)
    s_tmp = T.dot(s_re_.transpose(*tpat), s_im_)
    s_y_im_ = s_tmp - s_tmp.transpose(*tpat)
    return T.mean(T.sqr(s_y_re_ - T.eye(size).dimshuffle(*bpat)) + T.sqr(s_y_im_))
项目:structured-output-ae    作者:sbelharbi    | 项目源码 | 文件源码
def get_updates(self, learning_rate, params, grads, lr_scalers):
        """Compute the parameters' updates from accumulated squared gradients.

        param_t := param_(t-1) + delta_x_t, where
        delta_x_t = -(learning_rate * lr_sc / sqrt(sum of squared grads)) * grad.

        NOTE(review): the original summary called this AdaDelta, but the
        visible code keeps an undecayed running sum of squared gradients.

        Returns a list of (shared_variable, new_value) pairs.
        """
        # NOTE(review): the list comprehension below is truncated in this copy:
        # the call that wraps `param.get_value() * 0.` (it takes `borrow=True`)
        # is missing. Restore it before use.
        if self._first_time:
            self.sum_square_grad = [
                    param.get_value() * 0.,
                    borrow=True) for param in params]
            self._first_time = False

        updates = []
        for (param, grad, sum_square_grad, lr_sc) in zip(
                params, grads, self.sum_square_grad, lr_scalers):
            # Accumulate the squared gradient (plain sum, no decay)
            new_sum_square_grad = sum_square_grad + T.sqr(grad)

            # The update: delta_x_t
            lr_scaled = learning_rate * lr_sc
            epsilon = lr_scaled
            sqrt_sum_grad_t = T.sqrt(new_sum_square_grad)
            delta_x_t = - (epsilon / sqrt_sum_grad_t) * grad

            # update the params
            new_param = param + delta_x_t
            # Send for the update
            updates.append((sum_square_grad, new_sum_square_grad))
            # NOTE(review): `and in [...]` has lost its left operand, the
            # norm_constraint(...) call has lost its remaining arguments, and
            # the `else:` before the fallback assignment is missing.
            if self.max_colm_norm and in ["W", "w"]:
                new_param_final = norm_constraint(tensor_var=new_param,
                new_param_final = new_param
            updates.append((param, new_param_final))

        return updates
项目:structured-output-ae    作者:sbelharbi    | 项目源码 | 文件源码
def get_updates(self, learning_rate, params, grads, lr_scalers):
        """Compute the parameters' updates.

        Keeps a running average of squared gradients decayed by
        `self.decay`; each step is
        -(learning_rate * lr_sc) * grad / max(sqrt(average), self.epsilon).

        Returns a list of (shared_variable, new_value) pairs.
        """
        # NOTE(review): the list comprehension below is truncated in this copy:
        # the call that wraps `param.get_value() * 0.` (it takes `borrow=True`)
        # is missing. Restore it before use.
        if self._first_time:
            self.mean_square_grads = [
                    param.get_value() * 0.,
                    borrow=True) for param in params]
            self._first_time = False
        updates = []
        for (param, grad, mean_square_grad, lr_sc) in zip(
                params, grads, self.mean_square_grads, lr_scalers):
            new_mean_square_grad = (
                self.decay * mean_square_grad + (1-self.decay) * T.sqr(grad))
            # the update
            rms_grad_t = T.sqrt(new_mean_square_grad)
            # floor the RMS at self.epsilon before dividing by it
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            lr_scaled = learning_rate * lr_sc
            delta_x_t = - lr_scaled * grad / rms_grad_t

            new_param = param + delta_x_t
            # updates
            # NOTE(review): `and in [...]` has lost its left operand, the
            # norm_constraint(...) call has lost its remaining arguments, and
            # the `else:` before the fallback assignment is missing.
            if self.max_colm_norm and in ["W", "w"]:
                new_param_final = norm_constraint(tensor_var=new_param,
                new_param_final = new_param
            updates.append((param, new_param_final))
            updates.append((mean_square_grad, new_mean_square_grad))

        return updates
项目:structured-output-ae    作者:sbelharbi    | 项目源码 | 文件源码
def localResponseNormalizationCrossChannel(incoming, alpha=1e-4,
                                           k=2, beta=0.75, n=5):
    """
    Implement the local response normalization cross the channels described
    in <ImageNet Classification with Deep Convolutional Neural Networks>,
    A.Krizhevsky et al. sec.3.3.

    incoming: The feature maps (output of the convolution layer); its shape
        unpacks as (batch, channels, rows, cols).
    alpha: float scalar
    k: float scalar
    beta: float scalar
    n: integer: number of adjacent channels. Must be odd.

    Returns incoming / (k + alpha * window_sum(incoming ** 2)) ** beta.

    Raises NotImplementedError if `n` is even.
    """
    if n % 2 == 0:
        raise NotImplementedError("Works only with odd n")

    input_shape = incoming.shape
    half_n = n // 2
    input_sqr = T.sqr(incoming)
    b, ch, r, c = input_shape
    # zero-pad the squared input with half_n channels on each side
    extra_channels = T.alloc(0., b, ch + 2*half_n, r, c)
    input_sqr = T.set_subtensor(extra_channels[:, half_n:half_n+ch, :, :],
                                input_sqr)
    scale = k
    for i in range(n):
        scale += alpha * input_sqr[:, i:i+ch, :, :]
    scale = scale ** beta

    return incoming / scale
项目:structured-output-ae    作者:sbelharbi    | 项目源码 | 文件源码
def contractive_penality(self, h, linear_hid, contraction_level=0.0,
                         batch_size=-1):
        """Compute a contractive penalty on the hidden representation.

        The penalty is contraction_level * sum(jacob) / batch_size, where
        jacob = dot(grad(h.sum(), linear_hid) ** 2, (column sums of W) ** 2).

        NOTE(review): the end of the signature and the head of the `jacob`
        expression were missing from this copy. `batch_size=-1` and
        `T.dot(T.sqr(grad), ...)` are reconstructions -- confirm against the
        upstream source.

        Raises:
            Exception: if `batch_size` is -1 or 0.
        """
        if batch_size == -1 or batch_size == 0:
            raise Exception("invalid batch size.")

        grad = T.grad(h.sum(), linear_hid)
        jacob = T.dot(T.sqr(grad), T.sqr(self.hidden.W.sum(axis=0)))
        frob_norm_jacob = T.sum(jacob) / batch_size
        contract_pen = contraction_level * frob_norm_jacob
        return contract_pen
项目:structured-output-ae    作者:sbelharbi    | 项目源码 | 文件源码
def get_net_cost(model, cost_type, eye=True):
    """Get the train cost of the network.

    Args:
        model: object exposing `output_dropout`, `trg`, `l1` and `l2`
        cost_type: CostType.MeanSquared or CostType.CrossEntropy
        eye: if True, the mean-squared cost of each sample is divided by
            `d_eyes`, computed from target columns 37 and 46

    Returns:
        Per-sample cost (mean over axis 1), plus `model.l1` / `model.l2`
        when they are non-zero.

    Raises:
        ValueError: for an unknown `cost_type`.

    NOTE(review): the cross-entropy call was missing from this copy and is
    reconstructed as T.nnet.binary_crossentropy -- confirm against the
    upstream source.
    """
    if cost_type == CostType.MeanSquared:
        cost = T.mean(
            T.sqr(model.output_dropout - model.trg), axis=1)
        if eye:
            # NOTE(review): both terms are identical (columns 37 and 46) --
            # confirm whether a second coordinate pair was intended.
            d_eyes = (
                (model.trg[:, 37] - model.trg[:, 46])**2 +
                (model.trg[:, 37] - model.trg[:, 46])**2).T
            cost = cost / d_eyes
    elif cost_type == CostType.CrossEntropy:
        cost = T.mean(
            T.nnet.binary_crossentropy(
                model.output_dropout, model.trg), axis=1)
    else:
        raise ValueError("cost type unknow.")

    if model.l1 != 0.:
        cost += model.l1
    if model.l2 != 0.:
        cost += model.l2
    return cost