Python pandas module: unique() example source code

We extracted the following 45 code examples from open-source Python projects to illustrate how to use pandas.unique().
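
Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic behaviour of pandas.unique(): it returns the distinct values in first-appearance order, without sorting.

import numpy as np
import pandas as pd

values = pd.Series(['b', 'a', 'b', 'c', np.nan, 'a'])
print(pd.unique(values))   # distinct values 'b', 'a', 'c' and NaN, in first-appearance order
print(values.nunique())    # 3, because nunique() drops NaN by default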

Project: coquery    Author: gkunter    | Project source | File source
def get_levels(self, name):
        """
        Return an array containing all distinct values in the column 'name'.

        The values are returned in the order in which they first appear.

        Parameters
        ----------
        name : string
            The column name for which the unique values are requested

        Returns
        -------
        levels : ndarray
            An array of the distinct values contained in the specified
            data column.
        """
        return pd.unique(self._table[name].values.ravel())
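
Note that pandas.unique() preserves first-appearance order rather than sorting. If alphabetically ordered levels are actually wanted, a minimal sketch (a hypothetical helper, not part of coquery) simply wraps the call in sorted():

import pandas as pd

def get_sorted_levels(table, name):
    # distinct values of the column 'name', sorted alphabetically
    return sorted(pd.unique(table[name].values.ravel()))
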
Project: deep-action-proposals    Author: escorciav    | Project source | File source
def wrapper_nms(proposal_df, overlap=0.65):
    """Apply non-max-suppresion to a video batch.
    """
    vds_unique = pd.unique(proposal_df['video-name'])
    new_proposal_df = []
    for i, v in enumerate(vds_unique):
        idx = proposal_df['video-name'] == v
        p = proposal_df.loc[idx, ['video-name', 'f-init', 'f-end',
                                  'score', 'video-frames']]
        n_frames = np.int(p['video-frames'].mean())
        loc = np.stack((p['f-init'], p['f-end']), axis=-1)
        loc, score = nms_detections(loc, np.array(p['score']), overlap)
        n_proposals = score.shape[0]
        n_frames = np.repeat(p['video-frames'].mean(), n_proposals).astype(int)
        this_df = pd.DataFrame({'video-name': np.repeat(v, n_proposals),
                                'f-init': loc[:, 0], 'f-end': loc[:, 1],
                                'score': score,
                                'video-frames': n_frames})
        new_proposal_df.append(this_df)
    return pd.concat(new_proposal_df, axis=0)
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def get_detected_objects(df, tol=1.0, debug=False):
    """
    Takes a summary dataframe with RV information. Finds the median rv for each star,
      and removes objects that are more than 'tol' km/s from the median value
    :param df: A summary dataframe, such as created by get_ccf_summary or find_best_pars
    :param tol: The tolerance, in km/s, to accept an observation as detected
    :return: a dataframe containing only detected companions
    """
    secondary_names = pd.unique(df.Secondary)
    secondary_to_rv = defaultdict(float)
    for secondary in secondary_names:
        rv = df.loc[df.Secondary == secondary]['rv'].median()
        secondary_to_rv[secondary] = rv

    if debug:
        for secondary in sorted(secondary_to_rv.keys()):
            print ('RV for {}: {:.2f} km/s'.format(secondary, secondary_to_rv[secondary]))

    keys = df.Secondary.values
    good = df.loc[abs(df.rv.values - np.array(itemgetter(*keys)(secondary_to_rv))) < tol]
    return good
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def list_stars(self, print2screen=False):
        """
        List all of the stars in all of the CCF interfaces

        Parameters:
        ===========
        - print2screen:     bool
                            Should we print the stars and dates to screen?

        Returns:
        =========
        - star_list:        list
                            A list of every star in the file, sorted by name.
        """
        stars = []
        for inst in self._interfaces.keys():
            if print2screen:
                print('Stars observed with {}: \n============================\n\n'.format(inst))
            stars.extend(self._interfaces[inst].list_stars(print2screen=print2screen))

        return list(pd.unique(stars))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np.array(['2015-01-03T00:00:00.000000000+0000',
                             '2015-01-01T00:00:00.000000000+0000'],
                            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        s = pd.Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)
Project: KaggleExeter    Author: detomo    | Project source | File source
def rename_brands(phone_models):
    """ recast all phone brands and models as string identifiers brand_i and model_j """
    brands_table = {}
    i = 0
    for brand in pd.unique(phone_models['phone_brand']):
        brands_table[brand] = 'brand_%s' %i
        i += 1

    models_table = {}
    i = 0
    for model in pd.unique(phone_models['device_model']):
        models_table[model] = 'model_%s' %i
        i += 1

    converted = []
    for item in zip(phone_models['phone_brand'],phone_models['device_model']):
        converted.append((brands_table[item[0]],models_table[item[1]]))
    phone_models['phone_brand'] = [x[0] for x in converted]
    phone_models['device_model'] = [x[1] for x in converted]
    return phone_models
Project: KaggleExeter    Author: detomo    | Project source | File source
def rename_brands(phone_models):
    """ recast all phone brands and models as string identifiers brand_i and model_j """
    brands_table = {}
    i = 0
    for brand in pd.unique(phone_models['phone_brand']):
        brands_table[brand] = 'brand_%s' %i
        i += 1

    models_table = {}
    i = 0
    for model in pd.unique(phone_models['device_model']):
        models_table[model] = 'model_%s' %i
        i += 1

    converted = []
    for item in zip(phone_models['phone_brand'],phone_models['device_model']):
        converted.append((brands_table[item[0]],models_table[item[1]]))
    phone_models['phone_brand'] = [x[0] for x in converted]
    phone_models['device_model'] = [x[1] for x in converted]
    return phone_models
Project: calvin    Author: ucd-cws    | Project source | File source
def __init__(self, linksfile, ic=None):
    df = pd.read_csv(linksfile)
    df['link'] = df.i.map(str) + '_' + df.j.map(str) + '_' + df.k.map(str)
    df.set_index('link', inplace=True)

    self.df = df

    # self.T = len(self.df)
    SR_stats = pd.read_csv('calvin/data/SR_stats.csv', index_col=0).to_dict()
    self.min_storage = SR_stats['min']
    self.max_storage = SR_stats['max']

    if ic:
      self.apply_ic(ic)

    # a few network fixes to make things work
    self.add_ag_region_sinks()
    self.fix_hydropower_lbs()

    self.nodes = pd.unique(df[['i','j']].values.ravel()).tolist()
    self.links = list(zip(df.i,df.j,df.k))
    self.networkcheck() # make sure things aren't broken
Project: skp_edu_docker    Author: TensorMSA    | Project source | File source
def make_unique_value_each_column (self, df, node_id):
        """ Count the distinct values of each categorical (non-numeric) column in the
            dataframe and return the result as JSON
        Args:
          params:
            * df : dataframe
            * node_id: nnid
        Returns:
            json
        Raises:
        """
        try:
            data_conf = dict()
            column_cate_unique = dict()
            numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
            for i, v in df.dtypes.iteritems():
                if (str(v) not in numerics):  # maybe need float
                    column_cate_unique[i] = df[i].unique().size
            data_conf['unique_cell_feature'] = column_cate_unique
            data_conf_json_str = json.dumps(data_conf)
            data_conf_json = json.loads(data_conf_json_str)
            return data_conf_json
        except Exception as e:
            logging.error("make_unique_value_each_column error : {0}, {1}".format(i,v))
            raise e
Project: extract    Author: dblalock    | Project source | File source
def makeTable(df, rowsCol, colsCol, dataCol):
    # df.set_index(rowsCol)

    uniqRowVals = pd.unique(df[rowsCol])
    uniqColVals = pd.unique(df[colsCol])

    # "rows col = ", df[rowsCol]
    # print "uniq row vals", uniqRowVals
    # print "uniq col vals", uniqColVals
    # print df[[rowsCol, colsCol, dataCol]]

    out = pd.DataFrame(index=uniqRowVals, columns=uniqColVals)
    for rowVal in uniqRowVals:
        for colVal in uniqColVals:
            rowsMatch = df[rowsCol] == rowVal
            colsMatch = df[colsCol] == colVal
            thisIdx = np.where(rowsMatch * colsMatch)[0][0]
            out.ix[rowVal][colVal] = df[dataCol][thisIdx]

    return out
Project: StackedDAE    Author: glrs    | Project source | File source
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string) 
    try:
        label_col = int(label_col)

        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass

    import pandas as pd
    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    #num_classes = unique_classes.shape[0]

    # Map the unique n classes with a number from 0 to n  
    label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))})

    # Replace the given column's values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist())

    # Return the mapped labels as numpy list and the label map (unique classes and number can be obtained from map)
    return np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)), np.asarray(label_map) #, unique_classes, num_classes
Project: StackedDAE    Author: glrs    | Project source | File source
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string) 
    try:
        label_col = int(label_col)

        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass

    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())

    # Map the unique n classes with a number from 0 to n
    label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))})

    # Replace the given column values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist())
#     print("label_matrix", label_matrix)
#     print("mapped_labels", mapped_labels)

    # Return the mapped labels as ndarray and the label map (unique classes and number can be obtained from map)
    # np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],))
    # Return the mapped labels as DataFrame and the label map (unique classes and number can be obtained from map)
    return mapped_labels[[label_col]], np.asarray(label_map) #, unique_classes, num_classes
Project: triage    Author: dssg    | Project source | File source
def create_subset(src, dest, n=250):
    "Given a csv file `src`, create a subset `dest` with `n` unique entities"
    df = pd.read_csv(src)
    lics = pd.unique(df["License #"])
    sublics = lics[random.sample(range(0,len(lics)), n)]
    subset = df[df["License #"].isin(sublics)]
    # Make the column names a little more readable
    subset.columns = map(clean_column_name, subset.columns)
    subset.to_csv(dest, index=False)
Project: johnson-county-ddj-public    Author: dssg    | Project source | File source
def convert_categorical(df):
    onecol = df.columns[1]
    onecol_name = df.columns.values.tolist()[1]
    df[onecol] = df[onecol].str.lower()
    categories = pd.unique(df[onecol])


    categories = [x for x in categories if x is not None]

    try:
        categories.remove(' ')
    except:
        pass

    categories = [str(x) for x in categories]

    categories = list(set([str.lower(x).strip() for x in categories]))

    #replaces spaces in middle of word w underscores
    categories = list(set([x.replace(" ", '_') for x in categories]))

    featnames = []
    for i in range(len(categories)):
        if type(categories[i]) is str:
            newfeatstr = onecol_name+'_is_' + categories[i] 
            featnames.append(newfeatstr)
            df[newfeatstr] = (df[onecol] == categories[i])

    onecol_null = onecol_name + "_is_null"
    df[onecol_null] = pd.isnull(df[onecol])
    df[onecol_null] = df[onecol_null].astype(float)
    df = df.drop(onecol, axis=1)
    df[featnames] = df[featnames].astype(float)
    df = df.groupby(config_db['id_column'], sort = False, as_index=False)[featnames].max()
    return df, featnames
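
The hand-rolled dummy columns above can also be produced with pandas' built-in one-hot encoder. A rough sketch follows (a hypothetical helper; the resulting column names differ slightly from the featnames built above):

import pandas as pd

def one_hot_sketch(df, col):
    # one dummy column per distinct lower-cased value, plus a null indicator
    cleaned = df[col].str.lower().str.strip().str.replace(' ', '_')
    dummies = pd.get_dummies(cleaned, prefix=col + '_is', dtype=float)
    dummies[col + '_is_null'] = df[col].isnull().astype(float)
    return dummies
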
Project: coquery    Author: gkunter    | Project source | File source
def _validate_layout(func):
        def func_wrapper(self):
            if self._col_wrap:
                if self._col_wrap > 16:
                    raise VisualizationInvalidLayout
                else:
                    return func(self)
            if self._col_factor and len(pd.unique(self._table[self._col_factor].values.ravel())) > 16:
                raise VisualizationInvalidLayout
            if self._row_factor and len(pd.unique(self._table[self._row_factor].values.ravel())) > 16:
                raise VisualizationInvalidLayout
            return func(self)
        return func_wrapper
Project: fstd2nc    Author: neishm    | Project source | File source
def vectorize (f):
  from functools import wraps
  try:
    from pandas import Series, unique
    @wraps(f)
    def vectorized_f (x):
      # If we're given a scalar value, then simply return it.
      if not hasattr(x,'__len__'):
        return f(x)
      # Get unique values
      inputs = unique(x)
      outputs = map(f,inputs)
      table = dict(zip(inputs,outputs))
      result = Series(x).map(table)
      return result.values
  except ImportError:
    def cached_f(x, cache={}):
      if x not in cache:
        cache[x] = f(x)
      return cache[x]
    @wraps(f)
    def vectorized_f (x):
      # If we're given a scalar value, then simply return it.
      if not hasattr(x,'__len__'):
        return cached_f(x)
      return map(cached_f,x)
  return vectorized_f


# The type of data returned by the Buffer iterator.
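
A small usage sketch for the vectorize decorator above (the decorated function and inputs are made up): each distinct value is computed only once, and the results are broadcast back to the full array.

import numpy as np

@vectorize
def slow_square(n):
    # stand-in for an expensive per-value computation
    return n * n

print(slow_square(3))                       # scalar input is passed straight through
print(slow_square(np.array([2, 3, 2, 5])))  # each distinct value computed once; one result per input
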
Project: ImgAnnotaPyQt4    Author: ZhengRui    | Project source | File source
def saveLabel(self):
        if not len(self.labelFile):
            self.labelFile = QtGui.QFileDialog.getSaveFileName(self, 'Save Label File', os.path.expanduser('~'), 'Txt (*.txt)')

        if len(self.labelFile):
            self.updateLabelsBuf()
            if self.labelsBuf is not None:
                if self.labels is None:
                    self.labels = self.labelsBuf

                self.labels = self.labels[~self.labels.image.isin(pd.unique(self.labelsBuf.image.ravel()))]
                self.labelsBuf = self.labelsBuf[self.labelsBuf.cateid.notnull()]
                self.labels = self.labels.append(self.labelsBuf, ignore_index=True)
                self.labels.to_csv(self.labelFile, index=False)
                self.labelsBuf = self.labelsBuf[self.labelsBuf.image == os.path.basename(self.imgsList[self.ith])]
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def add_actual_temperature(df, method='excel', filename='SecondaryStar_Temperatures.xls'):
    """
    Add the actual temperature to a given summary dataframe
    :param df: The dataframe to which we will add the actual secondary star temperature
    :keyword method: How to get the actual temperature. Options are:
                   - 'spt': Use main-sequence relationships to go from spectral type --> temperature
                   - 'excel': Use tabulated data, available in the file 'SecondaryStar_Temperatures.xls'
    :keyword filename: The filename of the excel spreadsheet containing the literature temperatures.
                       Needs to have the right format! Ignored if method='spt'
    :return: copy of the original dataframe, with an extra column for the secondary star temperature
    """
    # First, get a list of the secondary stars in the data
    secondary_names = pd.unique(df.Secondary)
    secondary_to_temperature = defaultdict(float)
    secondary_to_error = defaultdict(float)

    if method.lower() == 'spt':
        MS = SpectralTypeRelations.MainSequence()
        for secondary in secondary_names:
            star_data = StarData.GetData(secondary)
            spt = star_data.spectype[0] + re.search('[0-9]\.*[0-9]*', star_data.spectype).group()
            T_sec = MS.Interpolate(MS.Temperature, spt)
            secondary_to_temperature[secondary] = T_sec

    elif method.lower() == 'excel':
        table = pd.read_excel(filename, 0)
        for secondary in secondary_names:
            T_sec = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())]['Literature_Temp'].item()
            T_error = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())][
                'Literature_error'].item()
            secondary_to_temperature[secondary] = T_sec
            secondary_to_error[secondary] = T_error

    df['Tactual'] = df['Secondary'].map(lambda s: secondary_to_temperature[s])
    df['Tact_err'] = df['Secondary'].map(lambda s: secondary_to_error[s])
    return df
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def fit_sigma(df, i):
    """
    Find the largest allowable standard deviation, given the possible values Tactual can take.
    """
    Tmeasured, Tactual, _, _ = get_values(df)
    Tm = Tmeasured[i]

    # Get the possible values, and bin those with this measured value
    possible_values = sorted(pd.unique(df.Tactual))
    edges = [(possible_values[i] + possible_values[i+1])/2 for i in range(len(possible_values)-1)]
    bins = [0] + edges + [9e9]
    good = df.loc[df.Temperature == Tm]
    values, _= np.histogram(good.Tactual.values, bins=bins)

    mean = np.mean(good.Tactual.values)
    std = np.std(good.Tactual.values, ddof=1)
    if std > 0:
        return std

    sigma_test = np.arange(500, 10, -10) #Just test a bunch of values
    idx = np.searchsorted(bins, mean)
    idx = np.argmin(abs(np.array(bins) - mean))
    x1 = bins[idx-2] if idx > 2 else -1
    x2 = bins[idx-1]
    x3 = bins[idx]
    x4 = bins[idx+1] if idx < len(bins)-2 else np.inf
    N = len(good)
    probs = [get_probability(x1, x2, x3, x4, N, mean, s) for s in sigma_test]
    for s, p in zip(sigma_test, probs):
        if p > 0.5:
            return s

    # If we get here, just return a guess value
    return 200.0

    #raise ValueError('No probability > 0!')
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def read_hdf5(hdf5_file):
    """
    Reads the hdf5 file into a dataframe. Assumes a very specific format!

    Parameters:
    ===========
    - hdf5_file:   string
                   The full path to the hdf5 file.

    Returns
    ========
    A pandas DataFrame containing summary information
    """
    logging.info('Reading HDF5 file {}'.format(hdf5_file))
    hdf5_int = HDF5_Interface(hdf5_file)
    df = hdf5_int.to_df()


    # Get the contrast. Split by group and then merge to limit the amount of calculation needed
    logging.info('Estimating the V-band contrast ratio for each trial')
    test_vsini = df.vsini.unique()[0]
    temp = df.loc[(df.rv == 0) & (df.vsini == test_vsini)].drop_duplicates(subset=['star', 'temperature'])
    temp['contrast'] = temp.apply(lambda r: get_contrast(r, band='V'), axis=1)

    logging.info('Estimating the luminosity ratio for each trial')
    temp['lum_ratio'] = temp.apply(get_luminosity_ratio, axis=1)

    logging.info('Re-merging dataframe')
    df = pd.merge(df, temp[['star', 'temperature', 'contrast', 'lum_ratio']], on=['star', 'temperature'], how='left')
    df['logL'] = np.log10(df.lum_ratio)

    return df
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def parse_input(inp, sort_output=True, ensure_unique=True):
    """
    Parse the user input to get a list of integers.

    Parameters:
    ===========
    - inp:           string
                     Can be in the form 'a-b', 'a,b,c', 'a-b,c-d', etc.
                     '-' means an inclusive list of every number between a and b
                     ',' means the numbers a and b

    - sort_output:   boolean
                     Sort the output integers?

    - ensure_unique: boolean
                     Make sure the final list has no repeats?
    :return: A list of integers
    """
    sublists = inp.split(',')
    final_list = []
    for l in sublists:
        if '-' in l:
            first, last = l.split('-')
            for i in range(int(first), int(last) + 1):
                final_list.append(i)
        else:
            final_list.append(int(l))

    if ensure_unique:
        final_list = pd.unique(final_list)
    if sort_output:
        final_list = sorted(final_list)
    return final_list
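
A brief usage sketch for parse_input (the input strings are made up): ranges are expanded inclusively and repeats are removed via pd.unique.

print(parse_input('1-3,3,7'))                   # the integers 1, 2, 3 and 7, each appearing once
print(parse_input('4,2,2', sort_output=False))  # 4 and 2, kept in first-appearance order
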
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def get_ccf(self, params, df=None):
        """
        Get the ccf with the given parameters.

        Parameters:
        ===========
        - params:    dictionary:
                     All the parameters necessary to define a single ccf. This should be
                     a python dictionary with the keys:
                         - 'starname': The name of the star. Try self.list_stars() for the options.
                         - 'date': The UT date of the observations. Try self.list_dates() for the options.
                         - 'T': temperature of the model
                         - 'logg': the log(g) of the model
                         - 'vsini': the vsini by which the model was broadened before correlation
                         - '[Fe/H]': the metallicity of the model
                         - 'addmode': The way the order CCFs were added to make a total one. Can be:
                             - 'simple'
                             - 'ml'
                             - 'weighted'
                             - 'dc'


        - df:        a pandas DataFrame such as outputted by _compile_data

        Returns:
        ========
        -ccf:        pandas DataFrame
                     Holds columns of velocity and CCF power
        """
        if df is None:
            try:
                df = self._compile_data(params['starname'], params['date'])
            except KeyError:
                raise KeyError('Must give get_ccf params with starname and date keywords, if df is not given!')

        Tvals = df['T'].unique()
        T = Tvals[np.argmin(abs(Tvals - params['T']))]
        good = df.loc[(df['T'] == T) & (df.logg == params['logg']) & (df.vsini == params['vsini']) \
                      & (df['[Fe/H]'] == params['[Fe/H]']) & (df.addmode == params['addmode'])]

        return pd.DataFrame(data={'velocity': self.velocities, 'CCF': good['ccf'].item()})
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_object_refcount_bug(self):
        lst = ['A', 'B', 'C', 'D', 'E']
        for i in range(1000):
            len(algos.unique(lst))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_on_index_object(self):

        mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
            np.arange(5), 5)])
        expected = mindex.values
        expected.sort()

        mindex = mindex.repeat(2)

        result = pd.unique(mindex)
        result.sort()

        tm.assert_almost_equal(result, expected)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_unique_label_indices():
    from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right)
Project: plydata    Author: has2k1    | Project source | File source
def __init__(self, data=None, groups=None, **kwargs):
        super().__init__(data=data, **kwargs)
        if groups is not None:
            self.plydata_groups = list(pd.unique(groups))
Project: plydata    Author: has2k1    | Project source | File source
def _n_distinct(arr):
    """
    Number of unique values in array
    """
    return len(pd.unique(arr))
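
A one-line usage sketch (the input list is made up):

print(_n_distinct(['a', 'b', 'a', 'b', 'c']))   # 3
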
Project: serenata-toolbox    Author: datasciencebr    | Project source | File source
def test_clean_2017_reimbursements(self):
        copy(os.path.join(self.fixtures_path, 'reimbursements-2017.xz'), self.path)
        file_path = os.path.join(self.path, 'reimbursements.xz')

        self.subject.clean()

        assert(os.path.exists(file_path))

        dataset = pd.read_csv(file_path, compression='xz')
        all_subquotas = [subquota[1] for subquota in self.subject.subquotas]

        present_subquotas = pd.unique(dataset['subquota_description'])
        for subquota in present_subquotas:
            with self.subTest():
                assert(subquota in all_subquotas)
Project: KaggleExeter    Author: detomo    | Project source | File source
def app_activity_features():
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    train.drop(['gender','age','group'],axis=1,inplace=True)
    data = train.append(test)

    """ Merge with brand_model table"""
    device_table = pd.read_csv("phone_brand_device_model.csv")
    data = pd.merge(data,device_table,how='left',on='device_id')
    data = data.drop_duplicates()  #drop duplicates  #note: there is still one device associated with 2 brands/models
    del device_table
    print("data build")
    """
    Create a dataframe indicating, for each device id, which apps are present and how active they are
        - merge events and app_events on event_id
        - group by device_id and app_id, and take the mean of activity
    """
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(list(data['device_id']))]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id')
    apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean()
    del events
    print("events build")
    """Reshape the dataframe so that each app is a new feature"""
    reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id'])))
    reshaped[list(pd.unique(apps['app_id']))]=0

    for app in list(pd.unique(apps['app_id'])):
        sliced = apps[apps['app_id']==app]
        reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values
    del apps
    return reshaped



########################################################################################################################################
########################################################################################################################################
Project: KaggleExeter    Author: detomo    | Project source | File source
def app_activity_features():
    train = pd.read_csv("gender_age_train.csv")
    test = pd.read_csv("gender_age_test.csv")
    train.drop(['gender','age','group'],axis=1,inplace=True)
    data = train.append(test)

    """ Merge with brand_model table"""
    device_table = pd.read_csv("phone_brand_device_model.csv")
    data = pd.merge(data,device_table,how='left',on='device_id')
    data = data.drop_duplicates()  #drop duplicates  #note: there is still one device associated with 2 brands/models
    del device_table
    print("data build")
    """
    Create a dataframe indicating, for each device id, which apps are present and how active they are
        - merge events and app_events on event_id
        - group by device_id and app_id, and take the mean of activity
    """
    events = pd.read_csv("events.csv")
    events = events[events['device_id'].isin(list(data['device_id']))]
    apps = pd.read_csv("app_events.csv")
    apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id')
    apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean()
    del events
    print("events build")
    """Reshape the dataframe so that each app is a new feature"""
    reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id'])))
    reshaped[list(pd.unique(apps['app_id']))]=0

    for app in list(pd.unique(apps['app_id'])):
        sliced = apps[apps['app_id']==app]
        reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values
    del apps
    return reshaped
Project: sample-cnn    Author: tae-jun    | Project source | File source
def _process_dataset(anno, sample_rate, n_samples, n_threads):
  """Processes, and saves MagnaTagATune dataset using multi-processes.

  Args:
    anno: Annotation DataFrame contains tags, mp3_path, split, and shard.
    sample_rate: Sampling rate of the audios. If the sampling rate is different 
      with an audio's original sampling rate, then it re-samples the audio.
    n_samples: Number of samples one segment contains.
    n_threads: Number of threads to process the dataset.
  """
  args_queue = Queue()
  split_and_shard_sets = pd.unique([tuple(x) for x in anno[['split', 'shard']].values])

  for split, shard in split_and_shard_sets:
    assigned_anno = anno[(anno['split'] == split) & (anno['shard'] == shard)]
    n_shards = anno[anno['split'] == split]['shard'].nunique()

    args = (assigned_anno, sample_rate, n_samples, split, shard, n_shards)
    args_queue.put(args)

  if FLAGS.n_threads > 1:
    threads = []
    for _ in range(FLAGS.n_threads):
      thread = Thread(target=_process_audio_files, args=[args_queue])
      thread.start()
      threads.append(thread)

    for thread in threads:
      thread.join()
  else:
    _process_audio_files(args_queue)
Project: calvin    Author: ucd-cws    | Project source | File source
def aggregate_regions(fp):

  # aggregate regions and supply portfolios
  # easier to do this with pandas by just reading the CSVs again
  sc = pd.read_csv(fp + '/shortage_cost.csv', index_col=0, parse_dates=True)
  sv = pd.read_csv(fp + '/shortage_volume.csv', index_col=0, parse_dates=True)
  flow = pd.read_csv(fp + '/flow.csv', index_col=0, parse_dates=True)
  demand_nodes = pd.read_csv('calvin/data/demand_nodes.csv', index_col = 0)
  portfolio = pd.read_csv('calvin/data/portfolio.csv', index_col = 0)

  for R in demand_nodes.region.unique():
    for t in demand_nodes.type.unique():
      ix = demand_nodes.index[(demand_nodes.region == R) & 
                              (demand_nodes.type == t)]
      sc['%s_%s' % (R,t)] = sc[ix].sum(axis=1)
      sv['%s_%s' % (R,t)] = sv[ix].sum(axis=1)

  for P in portfolio.region.unique():
    for k in portfolio.supplytype.unique():
      for t in portfolio.type.unique():
        ix = portfolio.index[(portfolio.region == P) & 
                             (portfolio.type ==t) & 
                             (portfolio.supplytype == k)]
        flow['%s_%s_%s' % (P,k,t)] = flow[ix].sum(axis=1)

  sc.to_csv(fp + '/shortage_cost.csv')
  sv.to_csv(fp + '/shortage_volume.csv')
  flow.to_csv(fp + '/flow.csv')
Project: calvin    Author: ucd-cws    | Project source | File source
def remove_debug_links(self):
    df = self.df
    ix = df.index[df.index.str.contains('DBUG')]
    df.drop(ix, inplace=True, axis=0)
    self.nodes = pd.unique(df[['i','j']].values.ravel()).tolist()
    self.links = list(zip(df.i,df.j,df.k))
    return df
Project: finch    Author: chrisranderson    | Project source | File source
def nominal_to_numeric(array):
  mapper = {name: i for i, name in enumerate(pd.unique(array))}
  return np.array([mapper[name] for name in array])
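
A quick usage sketch for nominal_to_numeric (the label list is made up): integer codes are assigned in first-appearance order, which is also what pandas.factorize() does.

import pandas as pd

labels = ['cat', 'dog', 'cat', 'bird']
print(nominal_to_numeric(labels))      # codes 0, 1, 0, 2, assigned in first-appearance order
codes, uniques = pd.factorize(labels)  # built-in equivalent: the same codes plus the array of unique labels
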
Project: echonet    Author: karoldvl    | Project source | File source
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.bands = 60
        self.segment_length = 101

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']]
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()
        self._populate(self.validation_data)
        self._populate(self.test_data)
Project: skp_edu_docker    Author: TensorMSA    | Project source | File source
def dataconf_eval_time_check(self, _wf_data_conf_node, _node_name):
        """
        Check whether the given data conf node is an evaluation-data node.
        :param data_dfconf_list (nn00001_1_dataconf_node)
        :return: True if the node name contains 'evaldata'
        """
        _value = False
        if ('evaldata' in _node_name):
             _value = True
        return _value
Project: skp_edu_docker    Author: TensorMSA    | Project source | File source
def set_dataconf_for_labels(self, df, label):
        """
        Extract the distinct values of the label column read from the csv
        :param wf_data_config, df, nnid, ver, node:
        :param conf_data:
        """
        #TODO : set_default_dataconf_from_csv
        label_values = pd.unique(df[label].values.ravel().astype('str')).tolist()
        return label_values
Project: stream2segment    Author: rizac    | Project source | File source
def test_get_events(self, mock_query):
        urlread_sideeffect = ["""1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""]


        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert only the first two events were successfully saved
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        # AND data to save has length 2:
        assert len(data) == 2

        # now download again, with an url error:        
        urlread_sideeffect = [413, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""", URLError('blabla23___')]

        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert we got the same result as above:
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        assert len(data) == 2
        # and since the first response is 413, the request is split in two and the
        # second response is our URLError (we could test it better, anyway):
        assert "blabla23___" in self.log_msg()
Project: seniority_list    Author: rubydatasystems    | Project source | File source
def sort_eg_attributes(df, attributes=['doh', 'ldate'],
                       reverse_list=[0, 0],
                       add_columns=False):
    '''Sort master list attribute columns by employee group in preparation
    for list construction.  The overall master list structure and order is
    unaffected, only the selected attribute columns are sorted (normally
    date-related columns such as doh or ldate)

    inputs
        df
            The master data dataframe (does not need to be sorted)
        attributes
            columns to sort by eg (inplace)
        reverse_list
            If an attribute is to be sorted in reverse order (descending),
            use a '1' in the list position corresponding to the position of
            the attribute within the attributes input
        add_columns
            If True, an additional column for each sorted attribute will be
            added to the resultant dataframe, with the suffix '_sort' added
            to it.
    '''
    date_cols = []
    for col in df:
        if (df[col]).dtype == 'datetime64[ns]':
            date_cols.append(col)
    try:
        df.sort_values(['eg', 'eg_number'], inplace=True)
    except LookupError:
        df.sort_values(['eg', 'eg_order'], inplace=True)

    egs = df.eg.values
    i = 0
    for measure in attributes:
        data = df[measure].values
        measure_col = np.empty_like(data)
        for eg in pd.unique(df.eg):
            measure_slice = data[egs == eg]
            measure_slice_index = np.where(egs == eg)[0]
            measure_slice_sorted = np.sort(measure_slice, axis=0)

            if reverse_list[i]:
                measure_slice_invert = measure_slice_sorted[::-1]
                measure_slice_sorted = measure_slice_invert
            np.put(measure_col, measure_slice_index, measure_slice_sorted)

        if add_columns:
            col_name = measure + '_sort'
        else:
            col_name = measure

        df[col_name] = measure_col

        if measure in date_cols:
            df[col_name] = pd.to_datetime(df[col_name].dt.date)
        i += 1

    return df
Project: plydata    Author: has2k1    | Project source | File source
def unique(lst):
    """
    Return unique elements

    :class:`pandas.unique` and :class:`numpy.unique` cast
    mixed type lists to the same type. They are faster, but
    some times we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items

    Returns
    -------
    out : list
        Unique items in the order that they appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype=object)
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string!, and numpy does not
    even maintain the order.
    """
    seen = set()

    def make_seen(x):
        seen.add(x)
        return x

    return [make_seen(x) for x in lst if x not in seen]
Project: serenata-toolbox    Author: datasciencebr    | Project source | File source
def test_fetch_translate_clean_integration(self):
        self.subject.fetch()
        files = ["Ano-{}.csv".format(n) for n in [2017]]
        files.append('datasets-format.html')

        for name in files:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.translate()
        for name in ["reimbursements-{}.xz".format(n) for n in self.years]:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.clean()
        file_path = os.path.join(self.path, 'reimbursements.xz')
        assert(os.path.exists(file_path))

        # test for subquota translation
        dataset = pd.read_csv(file_path, compression='xz')
        all_subquotas = ['Maintenance of office supporting parliamentary activity',
                     'Locomotion, meal and lodging',
                     'Fuels and lubricants',
                     'Consultancy, research and technical work',
                     'Publicity of parliamentary activity',
                     'Purchase of office supplies',
                     'Software purchase or renting; Postal services; Subscriptions',
                     'Security service provided by specialized company',
                     'Flight tickets',
                     'Telecommunication',
                     'Postal services',
                     'Publication subscriptions',
                     'Congressperson meal',
                     'Lodging, except for congressperson from Distrito Federal',
                     'Automotive vehicle renting or watercraft charter',
                     'Aircraft renting or charter of aircraft',
                     'Automotive vehicle renting or charter',
                     'Watercraft renting or charter',
                     'Taxi, toll and parking',
                     'Terrestrial, maritime and fluvial tickets',
                     'Participation in course, talk or similar event',
                     'Flight ticket issue']

        present_subquotas = pd.unique(dataset['subquota_description'])
        for subquota in present_subquotas:
            assert(subquota in all_subquotas)
Project: echonet    Author: karoldvl    | Project source | File source
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False,
                 downsample=True):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.DOWNSAMPLE = downsample
        self.SEGMENT_LENGTH = 300
        self.BANDS = 180
        self.WITH_DELTA = False
        self.FMAX = 16000
        self.FFT = 2205
        self.HOP = 441

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']]
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()

        if self.DOWNSAMPLE:
            self.SEGMENT_LENGTH //= 2
            self.BANDS //= 3

        self._populate(self.validation_data)
        self._populate(self.test_data)
Project: skp_edu_docker    Author: TensorMSA    | Project source | File source
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
        """
        Detect the type of each csv column and record it in data_conf.
        For categorical columns, collect the distinct cell values into cell_feature_unique (used by Keras).

        :param wf_data_config, df, nnid, ver, node:
        :param conf_data:
        """
        try:
            #TODO : set_default_dataconf_from_csv
            data_conf = dict()
            data_conf_unique_v = dict()
            data_conf_col_unique_v = dict()
            data_conf_col_type = dict()
            numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
            # For WDNN, reuse the unique feature values already stored in an existing data_dfconf, if any
            _cell_feature_unique = list()
            if len(data_dfconf_list) > 0:
                _wf_data_conf = wf_data_conf(data_dfconf_list)
                _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf,
                                                                      'cell_feature_unique') else list()  # fall back to an empty list when no previous values are stored
            for i, v in df.dtypes.iteritems():
                # label
                column_dtypes = dict()
                column_unique_value = dict()
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()  # fill nulls with '' before collecting the unique values
                column_dtypes['column_type'] = col_type
                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (i in _cell_feature_unique) else list()
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list    # merge previously stored unique values with the new ones
                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value
            data_conf['cell_feature'] = data_conf_col_type
            data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
            data_conf_json_str = json.dumps(data_conf)  # serialize to JSON
            data_conf_json = json.loads(data_conf_json_str)
            data_conf_unique_json_str = json.dumps(data_conf_unique_v)
            data_conf_unique_json = json.loads(data_conf_unique_json_str)
            return data_conf_json, data_conf_unique_json
        except Exception as e:
            logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))