Python scipy.stats 模块,skew() 实例源码

我们从Python开源项目中,提取了以下42个代码示例,用于说明如何使用scipy.stats.skew()

项目:HousePricePredictionKaggle    作者:Nuwantha    | 项目源码 | 文件源码
def data_preprocess(train,test):
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
项目:HousePricePredictionKaggle    作者:Nuwantha    | 项目源码 | 文件源码
def data_preprocess(train,test):
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
项目:HousePricePredictionKaggle    作者:Nuwantha    | 项目源码 | 文件源码
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train, X_test, y
项目:tbp-next-basket    作者:GiulioRossetti    | 项目源码 | 文件源码
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }

    return agg_measures
项目:TX-Means    作者:riccotti    | 项目源码 | 文件源码
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values)
    }

    return agg_measures
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_skew(self):
        tm._skip_if_no_scipy()

        from scipy.stats import skew
        alt = lambda x: skew(x, bias=False)
        self._check_stat_op('skew', alt)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                self.assertTrue(np.isnan(s.skew()))
                self.assertTrue(np.isnan(df.skew()).all())
            else:
                self.assertEqual(0, s.skew())
                self.assertTrue((df.skew() == 0).all())
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest("no scipy.stats.skew")

        def this_skew(x):
            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)

        self._check_stat_op('skew', this_skew)

    # def test_mad(self):
    #     f = lambda x: np.abs(x - x.mean()).mean()
    #     self._check_stat_op('mad', f)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_sem(self):
        def alt(x):
            if len(x) < 2:
                return np.nan
            return np.std(x, ddof=1) / np.sqrt(len(x))

        self._check_stat_op('sem', alt)

    # def test_skew(self):
    #     from scipy.stats import skew

    #     def alt(x):
    #         if len(x) < 3:
    #             return np.nan
    #         return skew(x, bias=False)

    #     self._check_stat_op('skew', alt)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_sem(self):
        def alt(x):
            if len(x) < 2:
                return np.nan
            return np.std(x, ddof=1) / np.sqrt(len(x))
        self._check_stat_op('sem', alt)

    # def test_skew(self):
    #     from scipy.stats import skew

    #     def alt(x):
    #         if len(x) < 3:
    #             return np.nan
    #         return skew(x, bias=False)

    #     self._check_stat_op('skew', alt)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_returned_dtype(self):

        dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64]
        if hasattr(np, 'float128'):
            dtypes.append(np.float128)

        for dtype in dtypes:
            s = Series(range(10), dtype=dtype)
            group_a = ['mean', 'std', 'var', 'skew', 'kurt']
            group_b = ['min', 'max']
            for method in group_a + group_b:
                result = getattr(s, method)()
                if is_integer_dtype(dtype) and method in group_a:
                    self.assertTrue(
                        result.dtype == np.float64,
                        "return dtype expected from %s is np.float64, "
                        "got %s instead" % (method, result.dtype))
                else:
                    self.assertTrue(
                        result.dtype == dtype,
                        "return dtype expected from %s is %s, "
                        "got %s instead" % (method, dtype, result.dtype))
项目:Kaggle_Buddy    作者:NickYi1990    | 项目源码 | 文件源码
def ka_display_skewnewss(data):
    '''show skewness information

        Parameters
        ----------
        data: pandas dataframe

        Return
        ------
        df: pandas dataframe
    '''
    numeric_cols = data.columns[data.dtypes != 'object'].tolist()
    skew_value = []

    for i in numeric_cols:
        skew_value += [skew(data[i])]
    df = pd.concat(
        [pd.Series(numeric_cols), pd.Series(data.dtypes[data.dtypes != 'object'].apply(lambda x: str(x)).values)
            , pd.Series(skew_value)], axis=1)
    df.columns = ['var_name', 'col_type', 'skew_value']

    return df
项目:toho_mir_ml    作者:kodack64    | 项目源码 | 文件源码
def mfccPostProcess(directory,fileCount):
    for count in range(fileCount):
        print("{0}/{1}".format(count+1,fileCount))
        for mfccext in mfccList:
            mfcc = np.loadtxt(directory+str(count)+mfccext+".csv",delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)
            result = np.zeros((mfcc.shape[1],14))

            result[:,0] = np.mean(mfcc, axis=0)
            result[:,1] = np.var(mfcc, axis=0, dtype=np.float64)
            result[:,2] = stats.skew(mfcc, axis=0)
            result[:,3] = stats.kurtosis(mfcc, axis=0, fisher=False)
            result[:,4] = np.median(mfcc, axis=0)
            result[:,5] = np.min(mfcc, axis=0)
            result[:,6] = np.max(mfcc, axis=0)
            result[:,7] = np.mean(dmfcc, axis=0)
            result[:,8] = np.var(dmfcc, axis=0, dtype=np.float64)
            result[:,9] = stats.skew(dmfcc, axis=0)
            result[:,10] = stats.kurtosis(dmfcc, axis=0, fisher=False)
            result[:,11] = np.median(dmfcc, axis=0)
            result[:,12] = np.min(dmfcc, axis=0)
            result[:,13] = np.max(dmfcc, axis=0)
            result[np.where(np.isnan(result))] = 0
            np.savetxt(directory+str(count)+mfccext+"_stat.txt",result.flatten("F"),delimiter=",")
项目:scikit-discovery    作者:MITHaystack    | 项目源码 | 文件源码
def process(self, obj_data):
        '''
        Apply Skew analysis with results added to the data wrapper

        @param obj_data: Data wrapper
        '''

        column_names = obj_data.getDefaultColumns()

        results = defaultdict(dict)
        # for label, frame in tqdm(obj_data.getIterator()):
        for label, frame in obj_data.getIterator():
            for column in column_names:
                # dropping missing data in order to remove top and bottom 2%
                data = frame[column].dropna()
                # Remove top and bottom 2%
                rem_num = round(len(data)*0.02)
                res = skew(data.sort_values(ascending=True)[rem_num:-rem_num])
                if isinstance(res, np.ma.masked_array):
                    res = np.float(res.data)
                results[label][column] = res
                obj_data.addResult(self.str_description, results)
项目:PortfolioTimeSeriesAnalysis    作者:MizioAnd    | 项目源码 | 文件源码
def skew_correction(df, numerical_features):
        # Skew correction
        skewed_feats = df[numerical_features].apply(lambda x: skew(x.dropna()))  # compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        df.loc[:, tuple(skewed_feats)] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
项目:physt    作者:janpipek    | 项目源码 | 文件源码
def ideal_bin_count(data, method="default"):
    """A theoretically ideal bin count.

    Parameters
    ----------
    data: array_like or None
        Data to work on. Most methods don't use this.
    method: str
        Name of the method to apply, available values:
          - default (~sturges)
          - sqrt
          - sturges
          - doane
          - rice
        See https://en.wikipedia.org/wiki/Histogram for the description

    Returns
    -------
    int
        Number of bins, always >= 1
    """
    n = data.size
    if n < 1:
        return 1
    if method == "default":
        if n <= 32:
            return 7
        else:
            return ideal_bin_count(data, "sturges")
    elif method == "sqrt":
        return int(np.ceil(np.sqrt(n)))
    elif method == "sturges":
        return int(np.ceil(np.log2(n)) + 1)
    elif method == "doane":
        if n < 3:
            return 1
        from scipy.stats import skew
        sigma = np.sqrt(6 * (n-2) / (n + 1) * (n + 3))
        return int(np.ceil(1 + np.log2(n) + np.log2(1 + np.abs(skew(data)) / sigma)))
    elif method == "rice":
        return int(np.ceil(2 * np.power(n, 1 / 3)))
项目:HousePrices    作者:MizioAnd    | 项目源码 | 文件源码
def skew_correction(df, numerical_features):
        # Skew correction
        skewed_feats = df[numerical_features].apply(lambda x: skew(x.dropna()))  # compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        df.loc[:, tuple(skewed_feats)] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
        # df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest("no scipy.stats.skew")

        def this_skew(x):
            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)
        self._check_stat_op('skew', this_skew)

    # def test_mad(self):
    #     f = lambda x: np.abs(x - x.mean()).mean()
    #     self._check_stat_op('mad', f)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_skew(self):
        tm._skip_if_no_scipy()
        from scipy.stats import skew

        def alt(x):
            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)

        self._check_stat_op('skew', alt)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_stats_mixed_type(self):
        # don't blow up
        self.mixed_frame.std(1)
        self.mixed_frame.var(1)
        self.mixed_frame.mean(1)
        self.mixed_frame.skew(1)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_how_compat(self):
        # in prior versions, we would allow how to be used in the resample
        # now that its deprecated, we need to handle this in the actual
        # aggregation functions
        s = pd.Series(
            np.random.randn(20),
            index=pd.date_range('1/1/2000', periods=20, freq='12H'))

        for how in ['min', 'max', 'median']:
            for op in ['mean', 'sum', 'std', 'var', 'kurt', 'skew']:
                for t in ['rolling', 'expanding']:

                    with tm.assert_produces_warning(FutureWarning,
                                                    check_stacklevel=False):

                        dfunc = getattr(pd, "{0}_{1}".format(t, op))
                        if dfunc is None:
                            continue

                        if t == 'rolling':
                            kwargs = {'window': 5}
                        else:
                            kwargs = {}
                        result = dfunc(s, freq='D', how=how, **kwargs)

                        expected = getattr(
                            getattr(s, t)(freq='D', **kwargs), op)(how=how)
                        assert_series_equal(result, expected)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_rolling_skew(self):
        try:
            from scipy.stats import skew
        except ImportError:
            raise nose.SkipTest('no scipy')
        self._check_moment_func(mom.rolling_skew,
                                lambda x: skew(x, bias=False), name='skew')
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_nanskew(self):
        tm.skip_if_no_package('scipy.stats')
        tm._skip_if_scipy_0_17()
        from scipy.stats import skew
        func = partial(self._skew_kurt_wrap, func=skew)
        self.check_funs(nanops.nanskew, func, allow_complex=False,
                        allow_str=False, allow_date=False, allow_tdelta=False)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def setUp(self):
        # Test data + skewness value (computed with scipy.stats.skew)
        self.samples = np.sin(np.linspace(0, 1, 200))
        self.actual_skew = -0.1875895205961754
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_constant_series(self):
        # xref GH 11974
        for val in [3075.2, 3075.3, 3075.5]:
            data = val * np.ones(300)
            skew = nanops.nanskew(data)
            self.assertEqual(skew, 0.0)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_ground_truth(self):
        skew = nanops.nanskew(self.samples)
        self.assertAlmostEqual(skew, self.actual_skew)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_nans(self):
        samples = np.hstack([self.samples, np.nan])
        skew = nanops.nanskew(samples, skipna=False)
        self.assertTrue(np.isnan(skew))
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_nans_skipna(self):
        samples = np.hstack([self.samples, np.nan])
        skew = nanops.nanskew(samples, skipna=True)
        tm.assert_almost_equal(skew, self.actual_skew)
项目:stegasawus    作者:rokkuran    | 项目源码 | 文件源码
def statistical_metrics(x):
    """
    Calculates statistical metrics on input array (mean, std, skew, kurtosis).
    """

    metrics = {
        'mean': np.mean,
        'stdev': np.std,
        'skew': stats.skew,
        'kurtosis': stats.kurtosis
    }
    return {k: fn(x.flatten()) for k, fn in metrics.items()}
项目:bmlingam    作者:taku-y    | 项目源码 | 文件源码
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    rng  = np.random.RandomState(0)

    xs = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(xs), 0, atol=5e-2)
    assert_allclose(np.std(xs), 1, atol=5e-2)
    assert_allclose(skew(xs)[0], 0, atol=5e-2)
    assert_allclose(kurtosis(xs)[0], 3, atol=5e-2)

    xs = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(xs), 1, atol=5e-2)
项目:Quora-Kaggle    作者:PPshrimpGo    | 项目源码 | 文件源码
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)
    print('nones')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features.fillna(0.0)
    return df_features
项目:Kaggle_Allstate    作者:sadz2201    | 项目源码 | 文件源码
def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def features(self, q1, q2):
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stopwords]
        q2 = [w for w in q2 if w not in stopwords]

        wmd = min(self.model.wmdistance(q1, q2), 10)

        q1vec = self.sent2vec(q1)
        q2vec = self.sent2vec(q2)

        if q1vec is not None and q2vec is not None:
            cos = cosine(q1vec, q2vec)
            city = cityblock(q1vec, q2vec)
            jacc = jaccard(q1vec, q2vec)
            canb = canberra(q1vec, q2vec)
            eucl = euclidean(q1vec, q2vec)
            mink = minkowski(q1vec, q2vec, 3)
            bray = braycurtis(q1vec, q2vec)

            q1_skew = skew(q1vec)
            q2_skew = skew(q2vec)
            q1_kurt = kurtosis(q1vec)
            q2_kurt = kurtosis(q2vec)

        else:
            cos = -1
            city = -1
            jacc = -1
            canb = -1
            eucl = -1
            mink = -1
            bray = -1

            q1_skew = 0
            q2_skew = 0
            q1_kurt = 0
            q2_kurt = 0

        return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
项目:kaggle-quora-question-pairs    作者:stys    | 项目源码 | 文件源码
def features(self, q1, q2):
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stopwords]
        q2 = [w for w in q2 if w not in stopwords]

        wmd = min(self.model.wmdistance(q1, q2), 10)
        wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)

        q1vec = self.sent2vec(q1)
        q2vec = self.sent2vec(q2)

        if q1vec is not None and q2vec is not None:
            cos = cosine(q1vec, q2vec)
            city = cityblock(q1vec, q2vec)
            jacc = jaccard(q1vec, q2vec)
            canb = canberra(q1vec, q2vec)
            eucl = euclidean(q1vec, q2vec)
            mink = minkowski(q1vec, q2vec, 3)
            bray = braycurtis(q1vec, q2vec)

            q1_skew = skew(q1vec)
            q2_skew = skew(q2vec)
            q1_kurt = kurtosis(q1vec)
            q2_kurt = kurtosis(q2vec)

        else:
            cos = -1
            city = -1
            jacc = -1
            canb = -1
            eucl = -1
            mink = -1
            bray = -1

            q1_skew = 0
            q2_skew = 0
            q1_kurt = 0
            q2_kurt = 0

        return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
项目:astrobase    作者:waqasbhatti    | 项目源码 | 文件源码
def lightcurve_moments(ftimes, fmags, ferrs):
    '''This calculates the weighted mean, stdev, median, MAD, percentiles, skew,
    kurtosis, fraction of LC beyond 1-stdev, and IQR.

    '''

    ndet = len(fmags)

    if ndet > 9:

        # now calculate the various things we need
        series_median = npmedian(fmags)
        series_wmean = (
            npsum(fmags*(1.0/(ferrs*ferrs)))/npsum(1.0/(ferrs*ferrs))
        )
        series_mad = npmedian(npabs(fmags - series_median))
        series_stdev = 1.483*series_mad
        series_skew = spskew(fmags)
        series_kurtosis = spkurtosis(fmags)

        # get the beyond1std fraction
        series_above1std = len(fmags[fmags > (series_median + series_stdev)])
        series_below1std = len(fmags[fmags < (series_median - series_stdev)])

        # this is the fraction beyond 1 stdev
        series_beyond1std = (series_above1std + series_below1std)/float(ndet)

        # get the magnitude percentiles
        series_mag_percentiles = nppercentile(
            fmags,
            [5.0,10,17.5,25,32.5,40,60,67.5,75,82.5,90,95]
        )

        return {
            'median':series_median,
            'wmean':series_wmean,
            'mad':series_mad,
            'stdev':series_stdev,
            'skew':series_skew,
            'kurtosis':series_kurtosis,
            'beyond1std':series_beyond1std,
            'mag_percentiles':series_mag_percentiles,
            'mag_iqr': series_mag_percentiles[8] - series_mag_percentiles[3],
        }


    else:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None
项目:AlphaPy    作者:ScottFreeLLC    | 项目源码 | 文件源码
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.

    """

    logger.info("Creating SciPy Features")

    # Generate scipy features

    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features

    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features


#
# Function create_clusters
#
项目:SCaIP    作者:simonsfoundation    | 项目源码 | 文件源码
def create_images_for_labeling(pars):
    import scipy.stats as st
    import os 
    import numpy as np
    import calblitz as cb
    from  glob import glob

    try:
        f_name=pars
        cdir=os.path.dirname(f_name)

        print 'loading'
        m=cb.load(f_name)

        print 'corr image'
        img=m.local_correlations(eight_neighbours=True)
        im=cb.movie(img,fr=1)
        im.save(os.path.join(cdir,'correlation_image.tif'))

        print 'std image'
        img=np.std(m,0)
        im=cb.movie(np.array(img),fr=1)
        im.save(os.path.join(cdir,'std_projection.tif'))


        m1=m.resize(1,1,1./m.fr)   


        print 'median image'
        img=np.median(m1,0)
        im=cb.movie(np.array(img),fr=1)
        im.save(os.path.join(cdir,'median_projection.tif'))     

        print 'save BL'
        m1=m1-img
        m1.save(os.path.join(cdir,'MOV_BL.tif'))
        m1=m1.bilateral_blur_2D()
        m1.save(os.path.join(cdir,'MOV_BL_BIL.tif'))
        m=np.array(m1)

        print 'max image'
        img=np.max(m,0)
        im=cb.movie(np.array(img),fr=1)
        im.save(os.path.join(cdir,'max_projection.tif'))           

        print 'skew image'
        img=st.skew(m,0)
        im=cb.movie(img,fr=1)
        im.save(os.path.join(cdir,'skew_projection.tif'))
        del m
        del m1
    except Exception, e:

        return e

    return f_name
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_rolling_functions_window_non_shrinkage(self):
        # GH 7764
        s = Series(range(4))
        s_expected = Series(np.nan, index=s.index)
        df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
        df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
        df_expected_panel = Panel(items=df.index, major_axis=df.columns,
                                  minor_axis=df.columns)

        functions = [lambda x: (x.rolling(window=10, min_periods=5)
                                .cov(x, pairwise=False)),
                     lambda x: (x.rolling(window=10, min_periods=5)
                                .corr(x, pairwise=False)),
                     lambda x: x.rolling(window=10, min_periods=5).max(),
                     lambda x: x.rolling(window=10, min_periods=5).min(),
                     lambda x: x.rolling(window=10, min_periods=5).sum(),
                     lambda x: x.rolling(window=10, min_periods=5).mean(),
                     lambda x: x.rolling(window=10, min_periods=5).std(),
                     lambda x: x.rolling(window=10, min_periods=5).var(),
                     lambda x: x.rolling(window=10, min_periods=5).skew(),
                     lambda x: x.rolling(window=10, min_periods=5).kurt(),
                     lambda x: x.rolling(
                         window=10, min_periods=5).quantile(quantile=0.5),
                     lambda x: x.rolling(window=10, min_periods=5).median(),
                     lambda x: x.rolling(window=10, min_periods=5).apply(sum),
                     lambda x: x.rolling(win_type='boxcar',
                                         window=10, min_periods=5).mean()]
        for f in functions:
            try:
                s_result = f(s)
                assert_series_equal(s_result, s_expected)

                df_result = f(df)
                assert_frame_equal(df_result, df_expected)
            except (ImportError):

                # scipy needed for rolling_window
                continue

        functions = [lambda x: (x.rolling(window=10, min_periods=5)
                                .cov(x, pairwise=True)),
                     lambda x: (x.rolling(window=10, min_periods=5)
                                .corr(x, pairwise=True))]
        for f in functions:
            df_result_panel = f(df)
            assert_panel_equal(df_result_panel, df_expected_panel)
项目:House-Pricing    作者:playing-kaggle    | 项目源码 | 文件源码
def pre_process(df):
    # LotFrontage's N/A is assigned zero, will it cause problem?
    df.fillna(value={'MasVnrType': 'None', 'MasVnrArea': 0,'Electrical': 'SBrkr', 'FireplaceQu': 'NoFP', 'GarageType': 'Noga',
                           'GarageFinish': 'Noga', 'GarageQual': 'Noga', 'Fence': 'NoFence', 
                           'BsmtFinSF1':0,'BsmtFinSF2':0,'BsmtUnfSF':0,'TotalBsmtSF':0,'BsmtFullBath':0,'BsmtHalfBath':0,
                           'LotFrontage': 0},
                    inplace=True)

    df.loc[:, 'YrSold'] = 2016 - df.loc[:, 'YrSold']

    df.loc[df.loc[:, 'PoolArea'] != 0, 'PoolArea'] = 1

    df.loc[:, 'Porch'] = np.sum(df.loc[:, ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']], axis=1)
    df.drop(['EnclosedPorch', '3SsnPorch', 'ScreenPorch'], axis=1, inplace=True)

    df.replace({'BsmtFullBath': {3: 2}, 'LotShape': {'IR3': 'IR2'}}, inplace=True)


    # fill missing values in bsmt
    df = fill_bsmt_missing(df)

    def fill_na(df, col_name, value = None):
        if value == None:
            value = df[col_name].mean()
        df.loc[df[col_name].isnull(),col_name] = value

    fill_na(df, 'Fence','WD')
    fill_na(df, 'GarageArea')
    fill_na(df, 'GarageCars')
    fill_na(df, 'SaleType', df['SaleType'].mode().values[0])
    fill_na(df, 'KitchenQual', df['KitchenQual'].mode().values[0])
    fill_na(df, 'Functional', df['Functional'].mode().values[0])
    fill_na(df, 'Exterior1st', df['Exterior1st'].mode().values[0])
    fill_na(df, 'Exterior2nd', df['Exterior2nd'].mode().values[0])
    fill_na(df, 'MSZoning', 'RL')


    bool_cols = np.array([df[col_name].isnull() for col_name in df.columns])
    print('rows containing na:',np.sum(bool_cols.any(axis=0)))
    print('rows all na:',np.sum(bool_cols.all(axis=0)))


    # log1pskewed_feats
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    df[skewed_feats] = np.log1p(df[skewed_feats])


    return df

#%%
#log transform the target: ignore for test data
#

#train_data = pre_process(train_df.copy())
#test_data = pre_process(test_df.copy())
项目:SCFGP    作者:MaxInGaussian    | 项目源码 | 文件源码
def fit(self, X):
        self.data["cols"] = list(set(range(X.shape[1])).difference(
            np.where(np.all(X == X[0,:], axis = 0))[0]))
        tX = X[:, self.data["cols"]]
        if(self.algo == "min-max"):
            self.data['min'] = np.min(tX, axis=0)
            self.data['max'] = np.max(tX, axis=0)
        elif(self.algo == "normal"):
            self.data['mu'] = np.mean(tX, axis=0)
            self.data['std'] = np.std(tX, axis=0)
        elif(self.algo == "inv-normal"):
            self.data['mu'] = np.mean(tX, axis=0)
            self.data['std'] = np.std(tX, axis=0)
        elif(self.algo == "auto-normal"):
            self.data['min'] = np.min(tX, axis=0)
            self.data['max'] = np.max(tX, axis=0)
            tX = (tX-self.data["min"])/(self.data["max"]-self.data["min"])
            boxcox = lambda x, lm: (np.sign(x)*np.abs(x)**lm-1)/lm
            self.data['boxcox'] = np.zeros(tX.shape[1])
            for d in range(tX.shape[1]):
                Xd = tX[:, d]
                if(np.unique(tX[:, d]).shape[0] < 10):
                    self.data['boxcox'][d] = 1
                    continue
                skewness = lambda x: skew(x, bias=False)**2
                t_lm = lambda lm: np.log(np.exp(lm[0])+1)
                boxcox_Xd = lambda lm: boxcox(Xd, t_lm(lm))
                obj = lambda lm: skewness(boxcox_Xd(lm))
                bounds = [(-5, 5)]
                lm = minimize(obj, [0.], method='SLSQP', bounds=bounds,
                    options={'ftol': 1e-8, 'maxiter':100, 'disp':False})['x']
                self.data['boxcox'][d] = t_lm(lm)
            lm = self.data['boxcox'][None, :]
            tX = boxcox(tX, lm)
            self.data['mu'] = np.mean(tX, axis=0)
            self.data['std'] = np.std(tX, axis=0)
        elif(self.algo == "auto-inv-normal"):
            self.data['min'] = np.min(tX, axis=0)
            self.data['max'] = np.max(tX, axis=0)
            tX = (tX-self.data["min"])/(self.data["max"]-self.data["min"])
            boxcox = lambda x, lm: (np.sign(x)*np.abs(x)**lm-1)/lm
            self.data['boxcox'] = np.zeros(tX.shape[1])
            for d in range(tX.shape[1]):
                Xd = tX[:, d]
                if(np.unique(tX[:, d]).shape[0] < 10):
                    self.data['boxcox'][d] = 1
                    continue
                skewness = lambda x: skew(x, bias=False)**2
                t_lm = lambda lm: np.log(np.exp(lm[0])+1)
                boxcox_Xd = lambda lm: boxcox(Xd, t_lm(lm))
                obj = lambda lm: skewness(boxcox_Xd(lm))
                bounds = [(-5, 5)]
                lm = minimize(obj, [0.], method='SLSQP', bounds=bounds,
                    options={'ftol': 1e-8, 'maxiter':100, 'disp':False})['x']
                self.data['boxcox'][d] = t_lm(lm)
            lm = self.data['boxcox'][None, :]
            tX = boxcox(tX, lm)
            self.data['mu'] = np.mean(tX, axis=0)
            self.data['std'] = np.std(tX, axis=0)
项目:Quora-Kaggle    作者:PPshrimpGo    | 项目源码 | 文件源码
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
   #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)

    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))

    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    #df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print now.strftime('%Y-%m-%d %H:%M:%S') 

    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)    

    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)

    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)

    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x:kurtosis(x))


    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x:skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x:skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x:kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S') 
    df_features.fillna(0.0)
    return df_features
项目:decoding_challenge_cortana_2016_3rd    作者:kingjr    | 项目源码 | 文件源码
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion,
                      add_nodes):
    """Aux Function"""
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes += [_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion)]

    if eog_ch not in [None, []]:
        if not isinstance(eog_ch, list):
            eog_ch = [eog_ch]
        for idx, ch in enumerate(eog_ch):
            nodes += [_ica_node('EOG %02d' % idx, ch, eog_score_func,
                      eog_criterion)]

    if skew_criterion is not None:
        nodes += [_ica_node('skewness', None, stats.skew, skew_criterion)]

    if kurt_criterion is not None:
        nodes += [_ica_node('kurtosis', None, stats.kurtosis, kurt_criterion)]

    if var_criterion is not None:
        nodes += [_ica_node('variance', None, np.var, var_criterion)]

    if add_nodes is not None:
        nodes.extend(add_nodes)

    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if isinstance(node.criterion, float):
            found = list(np.where(np.abs(scores) > node.criterion)[0])
        else:
            found = list(np.atleast_1d(abs(scores).argsort()[node.criterion]))

        case = (len(found), 's' if len(found) > 1 else '', node.name)
        logger.info('    found %s artifact%s by %s' % case)
        ica.exclude += found

    logger.info('Artifact indices found:\n    ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info('    Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))

    logger.info('Ready.')
项目:Default-Credit-Card-Prediction    作者:AlexPnt    | 项目源码 | 文件源码
def get_feature_stats(self):

        # #get input feature
        feature_input=self.feature_input.currentText()
        try:
            if feature_input[0]=='X':
                try:
                    feature_index=int("".join(feature_input[1:]))
                    feature_index-=1

                except:
                    QtWidgets.QMessageBox.information(self, "Wrong Format","Please enter a feature name in the format: X%d.")
                    return
            elif "".join(feature_input[0]+feature_input[1])=='LD' or "".join(feature_input[0]+feature_input[1])=='PC':
                try:
                    feature_index=int("".join(feature_input[2:]))
                    feature_index-=1

                except:
                    QtWidgets.QMessageBox.information(self, "Wrong Format","Please enter a feature name in the format: X||LD||PC%d.")
                    return
            else:
                QtWidgets.QMessageBox.information(self, "Wrong Format","Feature names must be in the format: X%d.")
                return
        except:
            QtWidgets.QMessageBox.information(self, "Data Not Found","Please load a dataset first.")
            return


        try:
            max_value=self.X[:,feature_index].max()
            min_value=self.X[:,feature_index].min()
            mean_value=self.X[:,feature_index].mean()
            std_value=self.X[:,feature_index].std()
            var_value=self.X[:,feature_index].var()
            skewness=stats.skew(self.X[:,feature_index])
            kurtosis=stats.kurtosis(self.X[:,feature_index],fisher=True)
            chi2,chi_p_val=chi2_feature_test(self.X,self.y,int(feature_index))
            H_kw,kw_p_val=kw_feature_test(self.X,self.y,int(feature_index))
            info_gain=information_gain(self.X,self.y,int(feature_index))
            gain_rt=gain_ratio(self.X,self.y,int(feature_index))

        except:
            QtWidgets.QMessageBox.information(self, "Wrong Index","Feature Index Out Of Bounds.")
            return

        feature_stats="""Statistics:\n\nMinimum Value: """+str(min_value)\
                      +"""\n\nMaximum Value: """+str(max_value)\
                      +"""\n\nMean: """+str(mean_value)\
                      +"""\n\nStandard Deviation: """+str(std_value)\
                      +"""\n\nVariance: """+str(var_value)\
                      +"""\n\nSkewness: """+str(skewness)\
                      +"""\n\nKurtosis: """+str(kurtosis)\
                      +"""\n\nChi Squared Test: """+str(chi2[0])\
                      +"""\n\nKruskal-Wallis Test:  """+str(H_kw)\
                      +"""\n\nInformation Gain: """+str(info_gain)\
                      +"""\n\nGain Ratio: """+str(gain_rt)

        self.feature_stats.setText(feature_stats)