Python numpy module: isnan() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use numpy.isnan().
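Before the project examples, here is a minimal illustrative sketch (the array values are made up for demonstration) of what numpy.isnan() returns and the two follow-up idioms that recur throughout the examples below: masking NaN entries out and replacing them in place.

import numpy as np

a = np.array([1.0, np.nan, 3.0, np.nan])

# Element-wise test: returns a boolean array of the same shape.
mask = np.isnan(a)      # array([False,  True, False,  True])

# Idiom 1: keep only the non-NaN entries.
clean = a[~mask]        # array([1., 3.])

# Idiom 2: replace NaN entries in place.
a[mask] = 0.0           # a is now array([1., 0., 3., 0.])

# Scalar usage, e.g. guarding a loss value.
if np.isnan(float('nan')):
    print('NaN detected')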

Project: AutoML5    Author: djajetic    | Project source | File source
def replace_missing(X):
    # Sparse (CSR) matrices are returned unchanged; dense arrays have every
    # NaN replaced by the sentinel value -999.0.
    try:
        if X.getformat() == 'csr':
            return X
    except AttributeError:  # dense ndarray: no getformat() method
        X[np.isnan(X)] = -999.0  # djajetic 05.09.2015
        return X  # djajetic 05.09.2015

        # Unreachable alternative kept from the original source: widen X with a
        # per-feature missingness indicator instead of using a sentinel value.
        p = len(X)
        nn = len(X[0]) * 2
        XX = np.zeros([p, nn])
        for i in range(len(X)):
            line = X[i]
            line1 = [0 if np.isnan(x) else x for x in line]
            line2 = [1 if np.isnan(x) else 0 for x in line]  # indicator of missingness
            XX[i] = line1 + line2
        return XX
    return X  # sparse but not CSR: left unchanged
Project: pylspm    Author: lseman    | Project source | File source
def rhoA(self):
        # rhoA
        rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

        for i in range(self.lenlatent):
            weights = pd.DataFrame(self.outer_weights[self.latent[i]])
            weights = weights[(weights.T != 0).any()]
            result = pd.DataFrame.dot(weights.T, weights)
            result_ = pd.DataFrame.dot(weights, weights.T)

            S = self.data_[self.Variables['measurement'][
                self.Variables['latent'] == self.latent[i]]]
            S = pd.DataFrame.dot(S.T, S) / S.shape[0]
            numerador = (
                np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
            denominador = (
                (np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
            rhoA_ = ((result)**2) * (numerador / denominador)
            if(np.isnan(rhoA_.values)):
                rhoA[self.latent[i]] = 1
            else:
                rhoA[self.latent[i]] = rhoA_.values

        return rhoA.T
Project: pylspm    Author: lseman    | Project source | File source
def get(self, X):
        X = np.array(X)
        X_nan = np.isnan(X)
        imputed = self.meanImput(X.copy())

        if len(self.estimators_) > 1:
            for i, estimator_ in enumerate(self.estimators_):
                X_s = np.delete(imputed, i, 1)
                y_nan = X_nan[:, i]

                X_unk = X_s[y_nan]

                result_ = []
                if len(X_unk) > 0:
                    for unk in X_unk:
                        result_.append(estimator_.predict(unk))
                    X[y_nan, i] = result_

        return X
Project: treecat    Author: posterior    | Project source | File source
def treegauss_remove_row(
        data_row,
        tree_grid,
        latent_row,
        vert_ss,
        edge_ss,
        feat_ss, ):
    # Update sufficient statistics.
    for v in range(latent_row.shape[0]):
        z = latent_row[v, :]
        vert_ss[v, :, :] -= np.outer(z, z)
    for e in range(tree_grid.shape[1]):
        z1 = latent_row[tree_grid[1, e], :]
        z2 = latent_row[tree_grid[2, e], :]
        edge_ss[e, :, :] -= np.outer(z1, z2)
    for v, x in enumerate(data_row):
        if np.isnan(x):
            continue
        z = latent_row[v, :]
        feat_ss[v] -= 1
        feat_ss[v, 1] -= x
        feat_ss[v, 2:] -= x * z  # TODO Use central covariance.
Project: seq2seq    Author: google    | Project source | File source
def test_train(self):
    model, fetches_ = self._test_pipeline(tf.contrib.learn.ModeKeys.TRAIN)
    predictions_, loss_, _ = fetches_

    target_len = self.sequence_length + 10 + 2
    max_decode_length = model.params["target.max_seq_len"]
    expected_decode_len = np.minimum(target_len, max_decode_length)

    np.testing.assert_array_equal(predictions_["logits"].shape, [
        self.batch_size, expected_decode_len - 1,
        model.target_vocab_info.total_size
    ])
    np.testing.assert_array_equal(predictions_["losses"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    np.testing.assert_array_equal(predictions_["predicted_ids"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    self.assertFalse(np.isnan(loss_))
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def information_ratio(algorithm_returns, benchmark_returns):
    """
    http://en.wikipedia.org/wiki/Information_ratio

    Args:
        algorithm_returns (np.array-like):
            All returns during algorithm lifetime.
        benchmark_returns (np.array-like):
            All benchmark returns during algo lifetime.

    Returns:
        float. Information ratio.
    """
    relative_returns = algorithm_returns - benchmark_returns

    relative_deviation = relative_returns.std(ddof=1)

    if zp_math.tolerant_equals(relative_deviation, 0) or \
       np.isnan(relative_deviation):
        return 0.0

    return np.mean(relative_returns) / relative_deviation
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def raw_data_gen(self):
        for dt, series in self.data.iterrows():
            for sid, price in series.iteritems():
                # Skip SIDs that can not be forward filled
                if np.isnan(price) and \
                   sid not in self.started_sids:
                    continue
                self.started_sids.add(sid)

                event = {
                    'dt': dt,
                    'sid': sid,
                    'price': price,
                    # Just chose something large
                    # if no volume available.
                    'volume': 1e9,
                }
                yield event
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def test_nan_filter_dataframe(self):
        dates = pd.date_range('1/1/2000', periods=2, freq='B', tz='UTC')
        df = pd.DataFrame(np.random.randn(2, 2),
                          index=dates,
                          columns=[4, 5])
        # should be filtered
        df.loc[dates[0], 4] = np.nan
        # should not be filtered, should have been ffilled
        df.loc[dates[1], 5] = np.nan
        source = DataFrameSource(df)
        event = next(source)
        self.assertEqual(5, event.sid)
        event = next(source)
        self.assertEqual(4, event.sid)
        event = next(source)
        self.assertEqual(5, event.sid)
        self.assertFalse(np.isnan(event.price))
Project: table-compositor    Author: InvestmentSystems    | Project source | File source
def df_type_to_str(i):
    '''
    Convert into simple datatypes from pandas/numpy types
    '''
    if isinstance(i, np.bool_):
        return bool(i)
    if isinstance(i, np.int_):
        return int(i)
    if isinstance(i, np.float):
        if np.isnan(i):
            return 'NaN'
        elif np.isinf(i):
            return str(i)
        return float(i)
    if isinstance(i, np.uint):
        return int(i)
    if type(i) == bytes:
        return i.decode('UTF-8')
    if isinstance(i, (tuple, list)):
        return str(i)
    if i is pd.NaT:  # not identified as a float null
        return 'NaN'
    return str(i)
Project: hip-mdp-public    Author: dtak    | Project source | File source
def calc_reward(self, action=0, state=None, **kw ):
        """Calculate the reward for the specified transition."""
        eps1, eps2 = self.eps_values_for_actions[action]
        if state is None:
            state = self.observe()
        if self.logspace:
            T1, T2, T1s, T2s, V, E = 10**state
        else:
            T1, T2, T1s, T2s, V, E = state
        # the reward function penalizes treatment because of side-effects
        reward = -0.1*V - 2e4*eps1**2 - 2e3*eps2**2 + 1e3*E
        # Constrain reward to be within specified range
        if np.isnan(reward):
            reward = -self.reward_bound
        elif reward > self.reward_bound:
            reward = self.reward_bound
        elif reward < -self.reward_bound:
            reward = -self.reward_bound
        return reward
Project: lung-cancer-detector    Author: YichenGong    | Project source | File source
def to_rgb(img):
    """
    Converts the given array into a RGB image. If the number of channels is not
    3 the array is tiled such that it has 3 channels. Finally, the values are
    rescaled to [0,255) 

    :param img: the array to convert [nx, ny, channels]

    :returns img: the rgb image [nx, ny, 3]
    """
    img = np.atleast_3d(img)
    channels = img.shape[2]
    if channels < 3:
        img = np.tile(img, 3)

    img[np.isnan(img)] = 0
    img -= np.amin(img)
    img /= np.amax(img)
    img *= 255
    return img
Project: QUANTAXIS    Author: yutiansut    | Project source | File source
def SMA(Series, N, M=1):

    ret = []
    i = 1
    length = len(Series)
    # Skip any leading NaN values at the start of the series
    while i < length:
        if np.isnan(Series[i]):
            i += 1
        else:
            break
    preY = Series[i]  # Y'
    ret.append(preY)
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
Project: NeoAnalysis    Author: neoanalysis    | Project source | File source
def map(self, data):
        data = data[self.fieldName]
        colors = np.empty((len(data), 4))
        default = np.array(fn.colorTuple(self['Default'])) / 255.
        colors[:] = default

        for v in self.param('Values'):
            mask = data == v.maskValue
            c = np.array(fn.colorTuple(v.value())) / 255.
            colors[mask] = c
        #scaled = np.clip((data-self['Min']) / (self['Max']-self['Min']), 0, 1)
        #cmap = self.value()
        #colors = cmap.map(scaled, mode='float')

        #mask = np.isnan(data) | np.isinf(data)
        #nanColor = self['NaN']
        #nanColor = (nanColor.red()/255., nanColor.green()/255., nanColor.blue()/255., nanColor.alpha()/255.)
        #colors[mask] = nanColor

        return colors
Project: NeoAnalysis    Author: neoanalysis    | Project source | File source
def map(self, data):
        data = data[self.fieldName]
        colors = np.empty((len(data), 4))
        default = np.array(fn.colorTuple(self['Default'])) / 255.
        colors[:] = default

        for v in self.param('Values'):
            mask = data == v.maskValue
            c = np.array(fn.colorTuple(v.value())) / 255.
            colors[mask] = c
        #scaled = np.clip((data-self['Min']) / (self['Max']-self['Min']), 0, 1)
        #cmap = self.value()
        #colors = cmap.map(scaled, mode='float')

        #mask = np.isnan(data) | np.isinf(data)
        #nanColor = self['NaN']
        #nanColor = (nanColor.red()/255., nanColor.green()/255., nanColor.blue()/255., nanColor.alpha()/255.)
        #colors[mask] = nanColor

        return colors
Project: risk-slim    Author: ustunb    | Project source | File source
def round_solution_pool(pool, constraints):

    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)

    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]

        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)

    rounded_pool.distinct().sort()
    return rounded_pool
Project: TADPOLE    Author: noxtoby    | Project source | File source
def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr, ssNameTag,
                                 ignoreMissingCols = False):
  nrRows, nrCols = tadpoleDF.shape
  colListOtherSS = list(ssDF.columns.values)
  colListTadpoleDF = list(tadpoleDF.columns.values)

  tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]] = \
    tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]].apply(pd.to_numeric, errors='coerce')


  tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]
  for r in range(nrRows):

    valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and \
                 np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
    if valsNan:
      continue

    valsNotEq = tadpoleDF['Hippocampus'][r] != (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
    if valsNotEq:
      print('entries dont match\n ', tadpoleDF[['RID','VISCODE', 'Hippocampus', 'ST29SV%s' % ssNameTag,\
        'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])

  # Conclusion: the reason why entries above don't match is because UCSFFSX has duplicate entries for the same subject and viscode.
Project: SWEETer-Cat    Author: DanielAndreasen    | Project source | File source
def test_hz():
    """Test the hz function."""
    df, _ = readSC()
    for (teff, logg, mass) in df.loc[:, ['teff', 'logg', 'mass']].values:
        lum = (teff / 5777)**4 * (mass / ((10**logg) / (10**4.44)))**2
        assert isinstance(hz(teff, lum, model=2), float)
        assert isinstance(hz(teff, lum, model=4), float)

    teff = 5777
    lum = 1
    invalids = [{teff: lum}, [teff, lum], (teff, lum), "..."]
    for model in range(1, 6):
        assert isinstance(hz(teff, lum, model), float)
    results = [0.75, 0.98, 0.99, 1.71, 1.77]
    for model, result in enumerate(results, start=1):
        assert round(hz(teff, lum, model), 2) == result
        for invalid in invalids:
            assert np.isnan(hz(invalid, lum, model))
            assert np.isnan(hz(teff, invalid, model))
    assert hz(teff, lum, 2) < hz(teff, lum, 4)  # hz1 < hz2
Project: PersonalizedMultitaskLearning    Author: mitmedialab    | Project source | File source
def generateWekaFile(X,Y,features,path,name):
    f = open(path + name + '.arff', 'w')
    f.write("@relation '" + name + "'\n\n")

    for feat in features:
        f.write("@attribute " + feat + " numeric\n")
    f.write("@attribute cluster {True,False}\n\n")

    f.write("@data\n\n")
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if np.isnan(X[i,j]):
                f.write("?,")
            else:
                f.write(str(X[i,j]) + ",")
        if Y[i] == 1.0 or Y[i] == True:
            f.write("True\n")
        else:
            f.write("False\n")

    f.close()
Project: attend_infer_repeat    Author: akosiorek    | Project source | File source
def test_posterior_zeros(self):
        p = np.asarray([.5, 0., 0.]).reshape((1, 3))

        posterior = self.eval(self.posterior, p)
        print 'posterior', posterior
        posterior_grad = self.eval(self.posterior_grad, p)
        print 'posterior grad', posterior_grad

        kl = self.eval(self.posterior_kl, p)
        print kl
        self.assertGreater(kl.sum(), 0)
        self.assertFalse(np.isnan(kl).any())
        self.assertTrue(np.isfinite(kl).all())

        grad = self.eval(self.posterior_kl_grad, p)
        print grad
        self.assertFalse(np.isnan(grad).any())
        self.assertTrue(np.isfinite(grad).all())
Project: seqhawkes    Author: mlukasik    | Project source | File source
def update_summary(
    var_up,
    var,
    start,
    end,
    ):
    diff = np.abs(var_up - var)
    reldiff = diff / var

    # filter out nan's

    try:
        reldiff = reldiff[~np.isnan(reldiff)]
    except:
        pass
    return (np.mean(diff), np.std(diff), np.mean(reldiff),
            np.std(reldiff), (end - start).microseconds)
Project: dc_stat_think    Author: justinbois    | Project source | File source
def test_bootstrap_replicate_1d(data, seed):
    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.mean)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.mean)
    assert (np.isnan(x) and np.isnan(x_correct)) \
                or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.median)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.median)
    assert (np.isnan(x) and np.isnan(x_correct)) \
                or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.std)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.std)
    assert (np.isnan(x) and np.isnan(x_correct)) \
                or np.isclose(x, x_correct, atol=atol, equal_nan=True)
Project: pyrsss    Author: butala    | Project source | File source
def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices= index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x= nan_helper(y)
        >>> y[nans]= NP.interp(x(nans), x(~nans), y[~nans])
    """
    # Source: http://stackoverflow.com/questions/6518811/interpolate-nan-values-in-a-numpy-array
    return NP.isnan(y), lambda z: z.nonzero()[0]
Project: DomainDependencyMemeJsai2017    Author: GINK03    | Project source | File source
def step4():
  key_vec = pickle.loads(open("key_vec.pkl", "rb").read()) 
  vecs = []
  for ev, vec in enumerate(key_vec.values()):
    x = np.array(vec)
    if np.isnan(x).any():
      # print(vec)
      continue
    vecs.append(x)
  vecs   = np.array(vecs)
  kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                       tol=0.0001,precompute_distances='auto', verbose=0,
                       random_state=None, copy_x=True, n_jobs=1)
  print("now fitting...")
  kmeans.fit(vecs)

  open("kmeans.model", "wb").write( pickle.dumps(kmeans) )
  for p in kmeans.predict(vecs):
    print(p)
Project: DomainDependencyMemeJsai2017    Author: GINK03    | Project source | File source
def _step5(arr):
  kmeans = pickle.loads(open("kmeans.model", "rb").read())
  key, lines, tipe = arr
  print(key)
  open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe,key=key), "w").write("\n".join(lines))
  res  = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
  w    = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe,key=key), "w")
  for line in res.split("\n"):
    try:
      vec = list(map(float, line.split()[-100:]))
    except:
      print(line)
      print(res)
      continue
    x = np.array(vec)
    if np.isnan(x).any():
      continue
    cluster = kmeans.predict([vec])
    txt = line.split()[:-100]
    obj = {"txt": txt, "cluster": cluster.tolist()} 
    data = json.dumps(obj, ensure_ascii=False)
    w.write( data + "\n" )
Project: lm    Author: rafaljozefowicz    | Project source | File source
def test_lm(self):
        hps = get_test_hparams()

        with tf.variable_scope("model"):
            model = LM(hps)

        with self.test_session() as sess:
            tf.initialize_all_variables().run()
            tf.initialize_local_variables().run()

            loss = 1e5
            for i in range(50):
                x, y, w = simple_data_generator(hps.batch_size, hps.num_steps)
                loss, _ = sess.run([model.loss, model.train_op], {model.x: x, model.y: y, model.w: w})
                print("%d: %.3f %.3f" % (i, loss, np.exp(loss)))
                if np.isnan(loss):
                    print("NaN detected")
                    break

            self.assertLess(loss, 1.0)
Project: histwords    Author: williamleif    | Project source | File source
def get_series_median_peryear(word_time_series, i_year_words, one_minus=False, start_year=1900, end_year=2000, year_inc=10, exclude_partial_missing=False):
    """
    Return the array of per-year medians for the values of the words specified per year in i_year_words for the specified years
    """
    medians = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year] 
            if word in r_word_time_series and not np.isnan(r_word_time_series[word][year]) and not r_word_time_series[word][year] == 0])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        medians.append(np.median(word_array))
    return np.array(medians)
Project: histwords    Author: williamleif    | Project source | File source
def get_series_mean_std_peryear(word_time_series, i_year_words, one_minus=False, start_year=1900, end_year=2000, year_inc=1, exclude_partial_missing=False):
    """
    Return the mean and standard-deviation arrays for the values of the words specified per year in i_year_words for the specified years
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year] 
            if word in r_word_time_series and not np.isnan(r_word_time_series[word][year]) and not np.isinf(r_word_time_series[word][year])])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std())
    return np.array(means), np.array(stderrs)
Project: histwords    Author: williamleif    | Project source | File source
def get_series_mean_stderr_peryear(word_time_series, i_year_words, one_minus=False, start_year=1900, end_year=2000, year_inc=1,  exclude_partial_missing=False):
    """
    Return the mean and stderr arrays for the values of the words specified per year in i_year_words for specified years 
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            time_series = {year:val for year, val in time_series.iteritems() if year >= start_year and year <= end_year}
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year] 
            if word in r_word_time_series and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
Project: histwords    Author: williamleif    | Project source | File source
def get_yearly_set_dev(series, i_year_words, one_minus=False, start_year=1900, end_year=2000, method='diff'):
    """
    Gets the mean relative deviation of the words in words vs. the full series.
    """
    base_mat = _make_series_mat(series, series.keys(), one_minus=one_minus, start_year=start_year, end_year=end_year)
    means = []
    stderrs = []
    r_word_time_series = series
    for year in xrange(start_year, end_year + 1):
        word_array = np.array([r_word_time_series[word][year] for word in i_year_words[year] 
            if word in r_word_time_series and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        if method == 'diff':
            word_array = word_array - base_mat.mean(0)[year-start_year]
        elif method == 'ratio':
            word_array = word_array / base_mat.mean(0)[year-start_year]
        else:
            raise RuntimeError("Unknown deviation method. Use diff or ratio.")
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
Project: kmeans-service    Author: MAYHEM-Lab    | Project source | File source
def log_likelihood(self, data):
        nks = np.bincount(self.labels_, minlength=self.n_clusters)  # number of points in each cluster
        n, d = data.shape
        log_likelihood = 0
        covar_matrices = self.covariances(self.labels_, cluster_centers=self.cluster_centers_, data=data)
        covar_matrix_det_v = np.linalg.det(covar_matrices)
        self._inv_covar_matrices = self._matrix_inverses(covar_matrices)
        for k, nk in enumerate(nks):
            if self.verbose == 1:
                print('log_likelihood: covar_matrix_det = {}'.format(covar_matrix_det_v[k]))
            term_1 = nk * (np.log(float(nk)/n) - 0.5 * d * np.log(2*np.pi) - 0.5 * np.log(abs(covar_matrix_det_v[k])))
            cdist_result = cdist(data[self.labels_ == k], np.array([self.cluster_centers_[k]]), metric='mahalanobis', VI=self._inv_covar_matrices[k])
            cdist_no_nan = cdist_result[~np.isnan(cdist_result)]  #  to deal with nans returned by cdist
            term_2 = -0.5 * (np.sum(cdist_no_nan))
            k_sum = term_1 + term_2
            log_likelihood += k_sum
        if np.isnan(log_likelihood) or log_likelihood == float('inf'):
            raise Exception('ll is nan or inf')
        return log_likelihood
Project: empyrical    Author: quantopian    | Project source | File source
def test_alpha(self, returns, benchmark, expected):
        observed = self.empyrical.alpha(returns, benchmark)
        assert_almost_equal(
            observed,
            expected,
            DECIMAL_PLACES)

        if len(returns) == len(benchmark):
            # Compare to scipy linregress
            returns_arr = returns.values
            benchmark_arr = benchmark.values
            mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
            slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
                                                         returns_arr[mask])

            assert_almost_equal(
                observed,
                intercept * 252,
                DECIMAL_PLACES
            )

    # Alpha/beta translation tests.
Project: empyrical    Author: quantopian    | Project source | File source
def test_beta(self, returns, benchmark, expected):
        observed = self.empyrical.beta(returns, benchmark)
        assert_almost_equal(
            observed,
            expected,
            DECIMAL_PLACES)

        if len(returns) == len(benchmark):
            # Compare to scipy linregress
            returns_arr = returns.values
            benchmark_arr = benchmark.values
            mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
            slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
                                                         returns_arr[mask])

            assert_almost_equal(
                observed,
                slope
            )
Project: plotnine    Author: has2k1    | Project source | File source
def strategy(data, params):
        """
        Stack overlapping intervals.

        Assumes that each set has the same horizontal position
        """
        vjust = params['vjust']

        y = data['y'].copy()
        y[np.isnan(y)] = 0
        heights = np.append(0, y.cumsum())

        if params['fill']:
            heights = heights / np.abs(heights[-1])

        data['ymin'] = np.min([heights[:-1], heights[1:]], axis=0)
        data['ymax'] = np.max([heights[:-1], heights[1:]], axis=0)
        # less intuitive than (ymin + vjust(ymax-ymin)), but
        # this way avoids subtracting numbers of potentially
        # similar precision
        data['y'] = ((1-vjust)*data['ymin'] + vjust*data['ymax'])
        return data
Project: OpenAPS    Author: medicinexlab    | Project source | File source
def _find_index(bg_df, start_date, end_date, make_col_bool):
    if (make_col_bool): bg_df['date'] = bg_df['created_at'].apply(lambda x: x.date()) #create column with just the date if make_col_bool is True

    #Find the first date with the start date (first entry) and the last date with the end date (last entry)
    #Since the older dates have higher indices, we use max() for start and min() for the end dates
    start_index = bg_df[bg_df['date'] == start_date.date()].index.max()
    end_index = bg_df[bg_df['date'] == end_date.date()].index.min()

    #Raises exception if invalid dates (which are labeled as NaN)
    if np.isnan(start_index): raise Exception("Invalid start date: " + str(start_date.date()))
    if np.isnan(end_index): raise Exception("Invalid end date: " + str(end_date.date()))

    return bg_df, start_index, end_index


#Function to get the bg data
Project: LinearCorex    Author: gregversteeg    | Project source | File source
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
    cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
    m, nv = mis.shape
    for j in range(m):
        inds = np.argsort(- mis[j, :])[:topk]
        if len(inds) >= 2:
            plt.clf()
            order = np.argsort(cont[:,j])
            subdata = data[:, inds][order].T
            subdata -= np.nanmean(subdata, axis=1, keepdims=True)
            subdata /= np.nanstd(subdata, axis=1, keepdims=True)
            columns = [column_label[i] for i in inds]
            sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns, xticklabels=False, mask=np.isnan(subdata))
            filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            plt.title("Latent factor {}".format(j))
            plt.yticks(rotation=0)
            plt.savefig(filename, bbox_inches='tight')
            plt.close('all')
            #plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
            #          outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
Project: feagen    Author: ianlini    | Project source | File source
def write_data(self, result_dict):
        for key, result in six.iteritems(result_dict):
            if ss.isspmatrix(result):
                if np.isnan(result.data).any():
                    raise ValueError("data {} have nan".format(key))
            elif np.isnan(result).any():
                raise ValueError("data {} have nan".format(key))
            with SimpleTimer("Writing generated data {} to hdf5 file"
                             .format(key),
                             end_in_new_line=False):
                if key in self.h5f:
                    # self.h5f[key][...] = result
                    raise NotImplementedError("Overwriting not supported.")
                else:
                    if (isinstance(result, ss.csc_matrix)
                            or isinstance(result, ss.csr_matrix)):
                        # sparse matrix
                        h5sparse.Group(self.h5f).create_dataset(key,
                                                                data=result)
                    else:
                        self.h5f.create_dataset(key, data=result)
        self.h5f.flush()
Project: MachineLearningPracticePrograms    Author: Subarno    | Project source | File source
def repeat_until_convergence(labelled_data, labelled_clusters, unlabelled_centroids):
    #find best fitting centroids to the labelled_data
    previous_max_difference = 0
    while True:
        unlabelled_old_centroids = unlabelled_centroids
        unlabelled_centroids = move_centroids(labelled_clusters)
        labelled_clusters = form_clusters(labelled_data, unlabelled_centroids)

        differences = list(map(lambda a, b: np.linalg.norm(a-b),unlabelled_old_centroids,unlabelled_centroids))
        max_difference = max(differences)
        if np.isnan(max_difference-previous_max_difference):
            difference_change = np.nan
        else:
            difference_change = abs((max_difference-previous_max_difference)/np.mean([previous_max_difference,max_difference])) * 100

        previous_max_difference = max_difference
        # difference change is nan once the list of differences is all zeroes.
        if np.isnan(difference_change):
            break
    return labelled_clusters, unlabelled_centroids
Project: AutoML5    Author: djajetic    | Project source | File source
def loadData (self, filename, verbose=True, replace_missing=True):
        ''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse'''
        if verbose:  print("========= Reading " + filename)
        start = time.time()
        if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")):
            with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "r") as pickle_file:
                vprint (verbose, "Loading pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
                return pickle.load(pickle_file)
        if 'format' not in self.info.keys():
            self.getFormatData(filename)
        if 'feat_num' not in self.info.keys():
            self.getNbrFeatures(filename)

        data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse}

        data = data_func[self.info['format']](filename, self.info['feat_num'])

        # IMPORTANT: when we replace missing values we double the number of variables

        if self.info['format']=='dense' and replace_missing and np.any(map(np.isnan,data)):
            vprint (verbose, "Replace missing values by 0 (slow, sorry)")
            data = data_converter.replace_missing(data)
        if self.use_pickle:
            with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
                vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"))
                p = pickle.Pickler(pickle_file) 
                p.fast = True 
                p.dump(data)
        end = time.time()
        if verbose:  print( "[+] Success in %5.2f sec" % (end - start))
        return data
Project: AutoML5    Author: djajetic    | Project source | File source
def sanitize_array(array):
    ''' Replace NaN and Inf (there should not be any!)'''
    a=np.ravel(array)
    maxi = np.nanmax((filter(lambda x: x != float('inf'), a))) # Max except NaN and Inf
    mini = np.nanmin((filter(lambda x: x != float('-inf'), a))) # Mini except NaN and Inf
    array[array==float('inf')]=maxi
    array[array==float('-inf')]=mini
    mid = (maxi + mini)/2
    array[np.isnan(array)]=mid
    return array
Project: pylspm    Author: lseman    | Project source | File source
def htmt(self):

        htmt_ = pd.DataFrame(pd.DataFrame.corr(self.data_),
                             index=self.manifests, columns=self.manifests)

        mean = []
        allBlocks = []
        for i in range(self.lenlatent):
            block_ = self.Variables['measurement'][
                self.Variables['latent'] == self.latent[i]]
            allBlocks.append(list(block_.values))
            block = htmt_.ix[block_, block_]
            mean_ = (block - np.diag(np.diag(block))).values
            mean_[mean_ == 0] = np.nan
            mean.append(np.nanmean(mean_))

        comb = [[k, j] for k in range(self.lenlatent)
                for j in range(self.lenlatent)]

        comb_ = [(np.sqrt(mean[comb[i][1]] * mean[comb[i][0]]))
                 for i in range(self.lenlatent ** 2)]

        comb__ = []
        for i in range(self.lenlatent ** 2):
            block = (htmt_.ix[allBlocks[comb[i][1]],
                              allBlocks[comb[i][0]]]).values
#            block[block == 1] = np.nan
            comb__.append(np.nanmean(block))

        htmt__ = np.divide(comb__, comb_)
        where_are_NaNs = np.isnan(htmt__)
        htmt__[where_are_NaNs] = 0

        htmt = pd.DataFrame(np.tril(htmt__.reshape(
            (self.lenlatent, self.lenlatent)), k=-1), index=self.latent, columns=self.latent)

        return htmt
Project: YellowFin_Pytorch    Author: JianGoForIt    | Project source | File source
def get_cubic_root(self):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # We substitute x, which is sqrt(mu), with x = y + 1.
    # It gives y^3 + py = q
    # where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use the Vieta's substution to compute the root.
    # There is only one real solution y (which is in [0, 1] ).
    # http://mathworld.wolfram.com/VietasSubstitution.html
    # eps in the numerator is to prevent momentum = 1 in case of zero gradient
    if np.isnan(self._dist_to_opt) or np.isnan(self._h_min) or np.isnan(self._grad_var) \
      or np.isinf(self._dist_to_opt) or np.isinf(self._h_min) or np.isinf(self._grad_var):
      logging.warning("Input to cubic solver has invalid nan/inf value!")
      raise Exception("Input to cubic solver has invalid nan/inf value!")

    p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
    w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0/3.0)
    y = w - p / 3.0 / (w + eps)
    x = y + 1

    if self._verbose:
      logging.debug("p %f, denominator %f", p, self._grad_var + eps)
      logging.debug("w3 %f ", w3)
      logging.debug("y %f, denominator %f", y, w + eps)

    if np.isnan(x) or np.isinf(x):
      logging.warning("Output from cubic is invalid nan/inf value!")
      raise Exception("Output from cubic is invalid nan/inf value!")

    return x
Project: treecat    Author: posterior    | Project source | File source
def treegauss_add_row(
        data_row,
        tree_grid,
        program,
        latent_row,
        vert_ss,
        edge_ss,
        feat_ss, ):
    # Sample latent state using dynamic programming.
    TODO('https://github.com/posterior/treecat/issues/26')

    # Update sufficient statistics.
    for v in range(latent_row.shape[0]):
        z = latent_row[v, :]
        vert_ss[v, :, :] += np.outer(z, z)
    for e in range(tree_grid.shape[1]):
        z1 = latent_row[tree_grid[1, e], :]
        z2 = latent_row[tree_grid[2, e], :]
        edge_ss[e, :, :] += np.outer(z1, z2)
    for v, x in enumerate(data_row):
        if np.isnan(x):
            continue
        z = latent_row[v, :]
        feat_ss[v] += 1
        feat_ss[v, 1] += x
        feat_ss[v, 2:] += x * z  # TODO Use central covariance.
Project: MKLMM    Author: omerwe    | Project source | File source
def imputeSNPs(X):
    snpsMean = np.nanmean(X, axis=0)
    isNan = np.isnan(X)
    for i,m in enumerate(snpsMean): X[isNan[:,i], i] = m

    return X
Project: distributional_perspective_on_RL    Author: Kiwoo    | Project source | File source
def __call__(self, *args, **kwargs):
        assert len(args) <= len(self.inputs), "Too many arguments provided"
        feed_dict = {}
        # Update the args
        for inpt, value in zip(self.inputs, args):
            self._feed_input(feed_dict, inpt, value)
        # Update the kwargs
        kwargs_passed_inpt_names = set()
        for inpt in self.inputs[len(args):]:
            inpt_name = inpt.name.split(':')[0]
            inpt_name = inpt_name.split('/')[-1]
            assert inpt_name not in kwargs_passed_inpt_names, \
                "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
            if inpt_name in kwargs:
                kwargs_passed_inpt_names.add(inpt_name)
                self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
            else:
                assert inpt in self.givens, "Missing argument " + inpt_name
        assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
        # Update feed dict with givens.
        for inpt in self.givens:
            feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
        results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
        if self.check_nan:
            if any(np.isnan(r).any() for r in results):
                raise RuntimeError("Nan detected")
        return results
Project: seq2seq    Author: google    | Project source | File source
def test_gradients(self):
    inputs = tf.random_normal(
        [self.batch_size, self.sequence_length, self.input_depth])
    seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length
    labels = np.random.randint(0, self.vocab_size,
                               [self.batch_size, self.sequence_length])

    helper = decode_helper.TrainingHelper(
        inputs=inputs, sequence_length=seq_length)
    decoder_fn = self.create_decoder(
        helper=helper, mode=tf.contrib.learn.ModeKeys.TRAIN)
    initial_state = decoder_fn.cell.zero_state(
        self.batch_size, dtype=tf.float32)
    decoder_output, _ = decoder_fn(initial_state, helper)

    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=decoder_output.logits, labels=labels)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    grads_and_vars = optimizer.compute_gradients(tf.reduce_mean(losses))

    #pylint: disable=E1101
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      grads_and_vars_ = sess.run(grads_and_vars)

    for grad, _ in grads_and_vars_:
      self.assertFalse(np.isnan(grad).any())

    return grads_and_vars_
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def frame_to_series(self, field, frame, columns=None):
        """
        Convert a frame with a DatetimeIndex and sid columns into a series with
        a sid index, using the aggregator defined by the given field.
        """
        if isinstance(frame, pd.DataFrame):
            columns = frame.columns
            frame = frame.values

        if not len(frame):
            return pd.Series(
                data=(0 if field == 'volume' else np.nan),
                index=columns,
            ).values

        if field in ['price', 'close']:
            # shortcircuit for full last row
            vals = frame[-1]
            if np.all(~np.isnan(vals)):
                return vals
            return ffill(frame)[-1]
        elif field == 'open':
            return bfill(frame)[0]
        elif field == 'volume':
            return np.nansum(frame, axis=0)
        elif field == 'high':
            return np.nanmax(frame, axis=0)
        elif field == 'low':
            return np.nanmin(frame, axis=0)
        else:
            raise ValueError("Unknown field {}".format(field))
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def update_last_known_values(self):
        """
        Store the non-NaN values from our oldest frame in each frequency.
        """
        ffillable = self.ffillable_fields
        if not len(ffillable):
            return

        for frequency in self.unique_frequencies:
            digest_panel = self.digest_panels.get(frequency, None)
            if digest_panel:
                oldest_known_values = digest_panel.oldest_frame(raw=True)
            else:
                oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

            oldest_vals = oldest_known_values
            oldest_columns = self.fields
            for field in ffillable:
                f_idx = oldest_columns.get_loc(field)
                field_vals = oldest_vals[f_idx]
                # isnan would be fast, possible to use?
                non_nan_sids = np.where(pd.notnull(field_vals))
                key = (frequency.freq_str, field)
                key_loc = self.last_known_prior_values.index.get_loc(key)
                self.last_known_prior_values.values[
                    key_loc, non_nan_sids
                ] = field_vals[non_nan_sids]
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def check_entry(key, value):
    if key != 'period_label':
        return np.isnan(value) or np.isinf(value)
    else:
        return False


############################
# Risk Metric Calculations #
############################
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def _compute_asset_lifetimes(self):
        """
        Compute and cache a recarry of asset lifetimes.
        """
        equities_cols = self.equities.c
        buf = np.array(
            tuple(
                sa.select((
                    equities_cols.sid,
                    equities_cols.start_date,
                    equities_cols.end_date,
                )).execute(),
            ), dtype='<f8',  # use doubles so we get NaNs
        )
        lifetimes = np.recarray(
            buf=buf,
            shape=(len(buf),),
            dtype=[
                ('sid', '<f8'),
                ('start', '<f8'),
                ('end', '<f8')
            ],
        )
        start = lifetimes.start
        end = lifetimes.end
        start[np.isnan(start)] = 0  # convert missing starts to 0
        end[np.isnan(end)] = np.iinfo(int).max  # convert missing end to INTMAX
        # Cast the results back down to int.
        return lifetimes.astype([
            ('sid', '<i8'),
            ('start', '<i8'),
            ('end', '<i8'),
        ])
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def _compute(self, arrays, dates, assets, mask):
        data = arrays[0]
        bins = self.params['bins']
        to_bin = where(mask, data, nan)
        result = quantiles(to_bin, bins)
        # Write self.missing_value into nan locations, whether they were
        # generated by our input mask or not.
        result[isnan(result)] = self.missing_value
        return result.astype(int64_dtype)