Python collections module: Counter() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use collections.Counter().

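Before the project samples, here is a minimal self-contained sketch of the core Counter API the examples below rely on (construction, zero-default lookup, most_common(), update(), and counter arithmetic); the toy data is made up for illustration:

from collections import Counter

counter = Counter("red berry red cherry berry red".split())
print(counter)                   # Counter({'red': 3, 'berry': 2, 'cherry': 1})
print(counter.most_common(2))    # [('red', 3), ('berry', 2)]
print(counter["missing"])        # 0 -- missing keys default to zero, no KeyError

counter.update(["fig", "fig"])   # add further observations in place
print(counter["fig"])            # 2

print(counter + Counter(red=1))  # counters also support +, -, & and | as multisets
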
Project: redberry    Author: michaelcho
def keywords(self, num=5):

        words_only = self.strip_tags(self.content, strip_punctuation=True)
        words = words_only.split()

        counter = collections.Counter(words)
        common = counter.most_common()

        keywords = []

        INSIGNIFICANT_WORDS = ('should', 'which', 'therefore')

        for word in common:
            lower_word = word[0].lower()
            if len(lower_word) > 4 and lower_word not in INSIGNIFICANT_WORDS:
                keywords.append(lower_word)

            if len(keywords) >= num:
                break

        return ", ".join(keywords)
Project: STA141C    Author: clarkfitzg
def overlap_score(q1, q2):
    """
    q1, q2 are preprocessed sentences (strings)

    >>> overlap_score("a b", "a")
    0.6666666666666666

    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())
    c1c2 = c1 + c2

    both = set(c1.keys())
    both = both.intersection(c2.keys())

    bothscore = float(sum(c1c2[x] for x in both))
    mplusn = float(sum(c1c2.values()))

    score = bothscore / mplusn

    return score
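
The c1 + c2 line above uses Counter addition, which merges two bags of words by summing per-word counts; a minimal sketch of the arithmetic behind the doctest result, using the same toy inputs:

from collections import Counter

c1 = Counter("a b".split())    # Counter({'a': 1, 'b': 1})
c2 = Counter("a".split())      # Counter({'a': 1})

merged = c1 + c2               # per-key sums: Counter({'a': 2, 'b': 1})
shared = set(c1) & set(c2)     # words present in both questions: {'a'}

print(sum(merged[w] for w in shared) / sum(merged.values()))  # 0.6666666666666666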
Project: STA141C    Author: clarkfitzg
def overlap_score(q1, q2):
    """
    >>> overlap_score("fun", "real fun")
    0.6666666666666666
    >>> overlap_score("  ", "   ")
    0
    """

    q1count = Counter(q1.split())
    q2count = Counter(q2.split())

    both = set(q1count.keys())
    both = both.intersection(q2count.keys())
    combined = q1count + q2count

    mplusn = float(sum(combined.values()))
    overlap = float(sum(combined[x] for x in both))

    try:
        return overlap / mplusn
    except ZeroDivisionError:
        return 0
Project: Modeling_Preparation    Author: Yangruipis
def vote(df, columns_name, value):
        label_data = df.loc[df[columns_name] == value, 'label'].values
        return Counter(label_data).most_common()[0][0]
Project: NeoVintageous    Author: NeoVintageous
def update_xpos(self, force=False):
        if self.must_update_xpos or force:
            try:
                # TODO: we should check the current mode instead. ============
                sel = self.view.sel()[0]
                pos = sel.b
                if not sel.empty():
                    if sel.a < sel.b:
                        pos -= 1
                # ============================================================
                r = sublime.Region(self.view.line(pos).a, pos)
                counter = Counter(self.view.substr(r))
                tab_size = self.view.settings().get('tab_size')
                xpos = (self.view.rowcol(pos)[1] +
                        ((counter['\t'] * tab_size) - counter['\t']))
            except Exception as e:
                nvim.console_message(e)
                _logger.exception('error setting xpos; default to 0')
                self.xpos = 0
                return
            else:
                self.xpos = xpos
Project: IgDiscover    Author: NBISweden
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warning('Table read from %r is empty after filtering out sequences with database diff < %s.', path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                #shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
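
One detail worth noting above: counter.update(set(table.consensus)) counts each sequence at most once per table, so the resulting frequency is the number of tables a sequence appears in, not its total number of occurrences. A minimal sketch with made-up sequence lists standing in for the tables:

from collections import Counter

tables = [["AAA", "AAA", "CCC"], ["AAA"], ["CCC", "GGG"]]

counter = Counter()
for sequences in tables:
    counter.update(set(sequences))   # dedupe first: presence per table

print(counter)   # Counter({'AAA': 2, 'CCC': 2, 'GGG': 1})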
Project: IgDiscover    Author: NBISweden
def main(args):
    if args.minimum_frequency is None:
        # args.table is a list of file names
        minimum_frequency = max((len(args.table) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.table:
        table = read_table(path)
        table = table.loc[:,['V_gene', 'V_SHM', 'V_nt', 'name']]
        tables.append(table)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.V_nt))

    # Find most frequent occurrences and print result
    print('Frequency', 'Gene', '%SHM', 'Sequence', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.V_nt == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row['V_gene']
                shm = row['V_SHM']
        print(frequency, gene, shm, sequence, *names, sep='\t')
Project: xpandas    Author: alan-turing-institute
def __init__(self, dictionary=None, **kwargs):
        '''
        :param dictionary: custom dictionary to count against. If None, the dictionary is calculated from the dataset
        '''
        self.dictionary = dictionary

        accepted_types = [
            pd.Series, list, np.ndarray, tuple
        ]

        def bag_of_words_transform_function(corpus):
            counter = Counter(corpus)
            for el in self.dictionary:
                if counter.get(el) is None:
                    counter[el] = 0
            return counter

        super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
                                                    columns=None,
                                                    transform_function=bag_of_words_transform_function)
Project: zipline-chinese    Author: zhanghan1990
def assertDifferentObjects(self, *objs):
        id_counts = Counter(map(id, objs))
        ((most_common_id, count),) = id_counts.most_common(1)
        if count > 1:
            dupe = [o for o in objs if id(o) == most_common_id][0]
            self.fail("%s appeared %d times in %s" % (dupe, count, objs))
Project: trf    Author: aistairc
def calc_n_types(self) -> int:
        """Calculate the number of types of input text
        Returns:
            int: the number of types of input text
        """
        surfaces = []
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
        word_type_counter = Counter(surfaces)
        return len(word_type_counter)
Project: trf    Author: aistairc
def calc_rs_modality(self) -> Dict[str, float]:

        modality_counter = Counter()
        for i, s in enumerate(self.sentences):
            chunks = []
            for bnst in self.knp.parse(s).bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent,
                              description=bnst.fstring)
                chunks.append(chunk)

            s = "".join([chunk.description for chunk in chunks])
            ms = set(re.findall("<モダリティ-(.+?)>", s))
            modality_counter += Counter(ms)

        n = len(self.sentences)

        return dict([(k, float(c) / n)
                     for k, c in modality_counter.items()])
Project: PlasoScaffolder    Author: ClaudiaSaxer
def GetDuplicateColumnNames(
      self, columns: sql_query_column_model.SQLColumnModel) -> [str]:
    """Find out if the query has duplicate column names and if a alias is
        needed.

    Args:
      columns (sql_query_column_model.SQLColumnModel): all columns parsed
          from the cursor
    Returns:
      [str]: a list of all the duplicate column names, if its empty it means it
          is a distinct list of columns
    """
    single_column_name_list = [column.sql_column for column in columns]
    duplicate_list = [column for column, count in
                      collections.Counter(single_column_name_list).items() if
                      count > 1]
    return sorted(duplicate_list)
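
Extracting items with count > 1 from a Counter, as above, is the standard compact way to list duplicates in a collection; a minimal standalone sketch with made-up column names:

import collections

column_names = ["id", "name", "id", "timestamp", "name"]
duplicates = [column for column, count in
              collections.Counter(column_names).items() if count > 1]
print(sorted(duplicates))   # ['id', 'name']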
Project: tf_rnnlm    Author: Ubiqus
def _build_vocab(self, filename):
    counts = Counter()
    with tf.gfile.GFile(filename, "r") as f:
      #for line in f:
      #  words = line.replace("\n"," ").split()
      #  counts += Counter(words)
      while True:
        chunk = f.read(int(500000000/2))
        if not chunk: 
          break
        counts += Counter(chunk.replace("\n", " ").split())

    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    self.word_to_id = {e[0]: (i+3) for (i, e) in enumerate(sorted_pairs)}
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD
Project: dl4mt-multi    Author: nyu-dl
def print_params(self, cgs):
        """
        cgs : dict mapping computational graph names to computational graphs
        """
        for name, cg in cgs.iteritems():
            shapes = [param.get_value().shape for param in cg.parameters]
            logger.info(
                "Parameter shapes for computation graph[{}]".format(name))
            for shape, count in Counter(shapes).most_common():
                logger.info('    {:15}: {}'.format(shape, count))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(shapes)))

            logger.info(
                "Parameter names for computation graph[{}]: ".format(name))
            for item in cg.parameters:
                logger.info(
                    "    {:15}: {}".format(item.get_value().shape, item.name))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(cg.parameters)))
Project: manubot    Author: greenelab
def get_manuscript_stats(text, citation_df):
    """
    Compute manuscript statistics.
    """
    stats = collections.OrderedDict()

    # Number of distinct references by type
    ref_counts = (
        citation_df
        .standard_citation
        .drop_duplicates()
        .map(lambda x: x.split(':')[0])
        .pipe(collections.Counter)
    )
    ref_counts['total'] = sum(ref_counts.values())
    stats['reference_counts'] = ref_counts
    stats['word_count'] = len(text.split())
    logging.info(f"Generated manscript stats:\n{json.dumps(stats, indent=2)}")
    return stats
Project: otRebuilder    Author: Pal3love
def subset_glyphs(self, s):
    table = self.table.Baseline
    if table.Format in (1, 3):
        baselines = {glyph: table.BaselineValues.get(glyph, table.DefaultBaseline)
                     for glyph in s.glyphs}
        if len(baselines) > 0:
            mostCommon, _cnt = Counter(baselines.values()).most_common(1)[0]
            table.DefaultBaseline = mostCommon
            baselines = {glyph: b for glyph, b in baselines.items()
                         if b != mostCommon}
        if len(baselines) > 0:
            table.BaselineValues = baselines
        else:
            table.Format = {1: 0, 3: 2}[table.Format]
            del table.BaselineValues
    return True
Project: otRebuilder    Author: Pal3love
def subset_glyphs(self, s):
    prop = self.table.GlyphProperties
    if prop.Format == 0:
        return prop.DefaultProperties != 0
    elif prop.Format == 1:
        prop.Properties = {g: prop.Properties.get(g, prop.DefaultProperties)
                           for g in s.glyphs}
        mostCommon, _cnt = Counter(prop.Properties.values()).most_common(1)[0]
        prop.DefaultProperties = mostCommon
        prop.Properties = {g: prop for g, prop in prop.Properties.items()
                           if prop != mostCommon}
        if len(prop.Properties) == 0:
            del prop.Properties
            prop.Format = 0
            return prop.DefaultProperties != 0
        return True
    else:
        assert False, "unknown 'prop' format %s" % prop.Format
Project: Deep-Learning-with-Keras    Author: PacktPublishing
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word
Project: mbin    Author: fanglab
def kmer_freq ( ref_str, k ):
    """
    Walk through sequence and return k-mer counts plus
    a pseudocount of 1.
    """
    ref_str = ref_str.upper()
    kmers = []
    for seq in product("ATGC",repeat=k):
        kmers.append( "".join(seq) )

    kmer_counts = Counter()
    for j in range( len(ref_str)-(k-1) ):
        motif    = ref_str[j:j+k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
Project: mbin    Author: fanglab
def kmer_freq ( mode, ref_str, strand, opts ):
    ref_str = ref_str.upper()
    if strand==1:
        ref_str = ref_str[::-1]
    k = opts.comp_kmer
    kmers = []
    for seq in product("ATGC",repeat=k):
        kmers.append( "".join(seq) )

    kmer_counts = Counter()
    for j in range( len(ref_str)-(k-1) ):
        motif    = ref_str[j:j+k]
        kmer_counts[motif] += 1

    # Combine forward and reverse complement motifs into one count
    combined_kmer = Counter()
    for kmer in kmers:
        kmer_rc = motif_tools.rev_comp_motif(kmer)
        if not combined_kmer.get(kmer_rc):
            combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

    return combined_kmer
Project: keras-utilities    Author: cbaziotis
def get_class_weights2(y, smooth_factor=0):
    """
    Returns the normalized weights for each class based on the frequencies of the samples
    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    counter = Counter(y)

    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p

    majority = max(counter.values())

    return {cls: float(majority / count) for cls, count in counter.items()}
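
As a quick usage check of the function above (a sketch assuming Python 3, where / is true division): the majority class receives weight 1.0 and rarer classes are weighted up in proportion.

print(get_class_weights2([0, 0, 0, 1]))   # {0: 1.0, 1: 3.0}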
Project: DeepPath    Author: xwhan
def path_clean(path):
    rel_ents = path.split(' -> ')
    relations = []
    entities = []
    for idx, item in enumerate(rel_ents):
        if idx%2 == 0:
            relations.append(item)
        else:
            entities.append(item)
    entity_stats = Counter(entities).items()
    duplicate_ents = [item for item in entity_stats if item[1]!=1]
    duplicate_ents.sort(key = lambda x:x[1], reverse=True)
    for item in duplicate_ents:
        ent = item[0]
        ent_idx = [i for i, x in enumerate(rel_ents) if x == ent]
        if len(ent_idx)!=0:
            min_idx = min(ent_idx)
            max_idx = max(ent_idx)
            if min_idx!=max_idx:
                rel_ents = rel_ents[:min_idx] + rel_ents[max_idx:]
    return ' -> '.join(rel_ents)
Project: dactyl    Author: ripple
def main(cli_args):
    if len(config["targets"]) == 0:
        exit("No target found; maybe you need to specify a Dactyl config file?")

    issues = check_all_pages(target=cli_args.target)
    if issues:
        num_issues = sum(len(p[1]) for p in issues)
        print("Found %d issues:" % num_issues)
        for pagename,issuelist in issues:
            print("Page: %s" % pagename)
            c = collections.Counter(issuelist)
            for i, count_i in c.items():
                if i[0]=="Unplain Phrase":
                    print("   Discouraged phrase: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_phrases"][i[1].lower()] ))
                elif i[0]=="Unplain Word":
                    print("   Discouraged word: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_words"][i[1].lower()] ))
                else:
                    print("   %s: %s (%d instances)" % (i[0], i[1], count_i))
        exit(1)
    else:
        print("Style check passed with flying colors!")
        exit(0)
Project: evaluation_tools    Author: JSALT-Rosetta
def get_nb_caption_per_img(n, selected_captions): 
    """
    Get image id from audio caption file names that were selected by their speakers
    Choose images that have at least n captions per image
    ----------
    n : int, 
        desired number of caption per image
    selected_captions : list of string, 
        list of caption file names selected by their speakers
    """

    counter_nb_caption=Counter()

    for cap in selected_captions: 
        #get image id 
        ImgID = cap.split('_')[0]
        # add a count 
        counter_nb_caption[ImgID]+=1

    #choose img_id that have a count of n
    d=dict((k, v) for k, v in counter_nb_caption.items() if v == n)

    ImgID_selected=d.keys()

    return(ImgID_selected)
Project: deeppavlov    Author: deepmipt
def _f1_score(pred, answers):
    """Compute the F1 score."""

    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores)
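
The Counter(g_tokens) & Counter(a_tokens) intersection above is what makes this token-level F1 robust to repeated words: & keeps, for each token, the minimum of the two counts. A minimal sketch with toy token lists:

from collections import Counter

pred_tokens = "the cat sat".split()
gold_tokens = "the cat the mat".split()

common = Counter(pred_tokens) & Counter(gold_tokens)   # min count per token
num_same = sum(common.values())                        # 2 ('the' and 'cat')

precision = num_same / len(pred_tokens)                # 2/3
recall = num_same / len(gold_tokens)                   # 2/4
print(2 * precision * recall / (precision + recall))   # ~0.571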
Project: dsb3    Author: EliasVansteenkiste
def test2():
    patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
    print len(patient_data_paths)
    pixel_spacings_xy = []
    n_slices = []

    for k, p in enumerate(patient_data_paths):
        pid = utils_lung.extract_pid_dir(p)
        sid2data, sid2metadata = utils_lung.get_patient_data(p)
        mtd = sid2metadata.itervalues().next()

        assert mtd['PixelSpacing'][0] == mtd['PixelSpacing'][1]
        pixel_spacings_xy.append(mtd['PixelSpacing'][0])
        n_slices.append(len(sid2metadata))
        print pid, pixel_spacings_xy[-1], n_slices[-1]

    print 'nslices', np.max(n_slices), np.min(n_slices), np.mean(n_slices)
    counts = collections.Counter(pixel_spacings_xy)
    new_list = sorted(pixel_spacings_xy, key=counts.get, reverse=True)
    print 'spacing', new_list
Project: KATE    Author: hugochan
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
    X_train = unitmatrix(X_train) # normalize
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(dict)
    label_counter = Counter(Y_test.tolist())

    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]
        for fr in fractions:
            ntop = int(fr * len(X_train))
            pr = float(len([i for i in retrieval_idx[:ntop] if Y_train[i] == Y_test[idx]])) / ntop
            try:
                precisions[fr][Y_test[idx]] += pr
            except:
                precisions[fr][Y_test[idx]] = pr
    new_pr = {}
    for fr, val in precisions.iteritems():
        avg_pr = 0.
        for label, pr in val.iteritems():
            avg_pr += pr / label_counter[label]
        new_pr[fr] = avg_pr / len(label_counter)

    return sorted(new_pr.items(), key=lambda d:d[0])
Project: EventStoryLine    Author: tommasoc80
def cross_sentence(event_lemma_dict):
    """
    function to create all possible pairs between event mentions in a file
    :param event_lemma_dict: dictionary of event lemmas in file
    :return: counter dictionary of event pairs in a file
    """

    full_event_file = []
    pairs_circumstantial_corpus = Counter()

    for k, v in event_lemma_dict.items():
        full_event_file.append(k)

    event_pairs_full = list(product(full_event_file, repeat=2))

    for i in event_pairs_full:
        pairs_circumstantial_corpus.update([i])

    return pairs_circumstantial_corpus
Project: sentrycli    Author: operasoftware
def print_grouping(attributes, grouping, top):
    """
    Print computed groups.

    :param attributes: list of grouped attributes
    :type: list(str)
    :param grouping: counter for each combination of attributes' values
    :type: Counter
    :type top: int
    """
    total = sum(grouping.values())

    table = Table(attributes + ['count', '%'])
    table.add_rows(total, grouping.most_common(top))

    print '\n' + table.by_count()
    print 'Total:', total
Project: gmlan_gw    Author: tmkdev
def __init__(self):
        self.handlers = {
            0x001: self._power,
            0x186: self._text,
            0x185: self._textparam,
            0x061: self._exttemp,
            0x005: self._tpms,
            #0x18e: self._textparam,
            0x026: self._fuel,
            0x053: self._gpsdate,
            0x055: self._gps,
        }

        self.counter = Counter() 
        self.locations = []
        self.fuel = [0,0]
Project: Eskapade    Author: KaveIO
def fill_histogram(self, idf, columns):
        """Fill input histogram with column(s) of input dataframe

        :param idf: input data frame used for filling histogram
        :param list columns: histogram column(s)
        """

        name = ':'.join(columns)
        if name not in self._counts:
            # create an (empty) value counts dict
            self._counts[name] = Counter()
        # value_counts() is faster than groupby().size(), but only works for series (1d).
        # else use groupby() for multi-dimensions
        g = idf.groupby(by=columns).size() if len(columns) > 1 else idf[columns[0]].value_counts()
        counts = Counter(g.to_dict())
        # remove specific keys from histogram before merging, if so requested
        counts = self.drop_requested_keys(name, counts)
        self._counts[name].update(counts)
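
The merge step above works because a pandas value-count result can be turned into a Counter and accumulated with update(); a minimal sketch of the same pattern, assuming two chunks of a hypothetical column:

from collections import Counter
import pandas as pd

running = Counter()
for chunk in (pd.Series(["a", "b", "a"]), pd.Series(["b", "c"])):
    running.update(Counter(chunk.value_counts().to_dict()))

print(running)   # Counter({'a': 2, 'b': 2, 'c': 1})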
Project: Eskapade    Author: KaveIO
def test_bin_edges(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        # uniform
        bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        self.assertListEqual(h.get_uniform_bin_edges(), bin_edges)

        # truncated uniform bin edges
        truncated_bin_edges = [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
        self.assertListEqual(h.truncated_bin_edges([5.5,12.5]), truncated_bin_edges)

        h_bin_edges = h.bin_edges()
        self.assertIsInstance(h_bin_edges, np.ndarray)
        self.assertListEqual(h_bin_edges.tolist(), bin_edges)
Project: Eskapade    Author: KaveIO
def test_bin_centers(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_centers = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5, 18.5]
        h_bin_centers = h.bin_centers()
        self.assertIsInstance(h_bin_centers, np.ndarray)
        self.assertListEqual(h_bin_centers.tolist(), bin_centers)
Project: Eskapade    Author: KaveIO
def test_bin_entries(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_entries = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        h_bin_entries = h.bin_entries()
        self.assertIsInstance(h_bin_entries, np.ndarray)
        self.assertListEqual(h_bin_entries.tolist(), bin_entries)
Project: Eskapade    Author: KaveIO
def test_bin_labels(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
        h_bin_labels = h.bin_labels()
        self.assertIsInstance(h_bin_labels, np.ndarray)
        self.assertListEqual(h_bin_labels.tolist(), bin_labels)
Project: identifiera-sarkasm    Author: risnejunior
def build_vocabulary( words, max_size ):
    vocab_instances = 0
    unique_counts = Counter(words)
    d = dict(unique_counts.most_common(cfg.vocabulary_size-2) )
    vocabulary = OrderedDict( sorted(d.items(), key=lambda t: t[1],  reverse=True) )

    # start at 2 to leave room for padding & unknown
    pb = Progress_bar(len(d) - 1) 
    for i, (key, value) in enumerate(vocabulary.items(), start=2):      
        vocab_instances += value
        vocabulary[key] = i
        pb.tick()

    vocabulary[cfg.padding_char] = 0
    vocabulary[cfg.placeholder_char] = 1
    # reverse the vocabulary (for reverse lookup)
    rev_vocabulary = {v: k for k, v in vocabulary.items()}  
    vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)

    return vocab
Project: histwords    Author: williamleif
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)

    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=lambda (x, y): y, reverse=True)
    contexts = sorted(contexts.items(), key=lambda (x, y): y, reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts)
Project: MetaphoricChange    Author: Garrafao
def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
    """
    Builds file with all lemma + POS pairs above certain frequency threshold. 
    :param dtatcfdir: path to directory with dta tcf files
    :param freq_file: path to frequency file
    :param MIN_FREQ: frequency threshold
    :param join_sign: sign to join lemma + first char of POS
    """

    # build frequency file from lemmas
    outputpath = freq_file
    print 'Building frequency file to ' + outputpath + "..."
    lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))
    frequent_lemmas = filter(lambda x: lemma_count[x] >= MIN_FREQ, lemma_count)
    with open(outputpath, 'w') as f_out:
        for lemma in frequent_lemmas:
            print >> f_out, lemma.encode('utf-8')
Project: mordecai    Author: openeventdata
def _feature_most_common(self, results):
        """
        Find the most common country name in ES/Geonames results

    Parameters
        ----------
        results: dict
            output of `query_geonames`

        Returns
        -------
        most_common: str
            ISO code of most common country, or empty string if none
        """
        try:
            country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
            most_common = country_count.most_common()[0][0]
            return most_common
        except IndexError:
            return ""
        except TypeError:
            return ""
Project: atma    Author: AtmaHou
def MP(candidate, references, n):
    """
    calculate modified precision
    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values())
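
The clipping step above is BLEU's modified precision: each candidate n-gram's count is capped by the largest count observed in any reference, so pathological repetition is not rewarded. A worked unigram example (the classic "the the the" case), with plain token lists standing in for the external ngrams helper:

from collections import Counter

candidate = "the the the the".split()
reference = "the cat sat on the mat".split()

counts = Counter(candidate)              # Counter({'the': 4})
ref_counts = Counter(reference)          # 'the' appears twice in the reference
clipped = {g: min(c, ref_counts[g]) for g, c in counts.items()}

print(sum(clipped.values()) / sum(counts.values()))   # 2/4 = 0.5 instead of 4/4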
Project: STA141C    Author: clarkfitzg
def overlap_score(q1, q2):
    """
    >>> overlap_score("a b c", "a b")
    0.8

    >>> overlap_score("   ", " ")
    0
    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())

    numerator = 0
    for word in c1:
        if word in c2:
            numerator += c1[word]
    for word in c2:
        if word in c1:
            numerator += c2[word]

    m = sum(c1.values())
    n = sum(c2.values())

    try:
        score = numerator / (m + n)
    except ZeroDivisionError:
        score = 0
    return score
Project: ThreatPrep    Author: ThreatResponse
def get_category_stats(self):
        """Get a count of CheckState results for each category of checks.
        Ignore collection counts to avoid duplications"""
        flat_results = self.get_flattened_results()
        categories = list(set([x.category for x in flat_results]))
        metrics = {}
        for category in categories:
            metrics[category] = collections.Counter([
                x.status for x in filter(
                    lambda y: len(y.subchecks) == 0 and y.category==category,
                    flat_results
                )
            ])
        return metrics
Project: monasca-transform    Author: openstack
def check_list_field_for_row(
            self, row=None, field_name=None, expected_list=None):
        found_list = getattr(row, field_name)
        self.assertEqual(Counter(expected_list), Counter(found_list))
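
Comparing two Counters, as in the assertion above, checks that both lists contain the same elements with the same multiplicities while ignoring order; a minimal standalone sketch:

from collections import Counter

assert Counter([1, 2, 2, 3]) == Counter([3, 2, 1, 2])   # same multiset, different order
assert Counter([1, 2]) != Counter([1, 2, 2])            # multiplicity is respected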
Project: python-driver    Author: bblfsh
def convert_uasts(self, file_uast_generator):
        for file_uast in file_uast_generator:
            print("-" * 20 + " " + str(file_uast.filepath))
            id_cnt = Counter()
            self.collect_id_cnt(file_uast.response.uast, id_cnt)
            print(id_cnt)
Project: companycase    Author: duedil-ltd
def fetch_all_transitions(self, language, ngram_length):
        """ Generate a dict of counts for transitions for all n-grams in the language word list """
        wordlist = os.path.join(os.path.dirname(__file__), "wordlists/{0}.txt".format(language))
        if not os.path.exists(wordlist):
            raise SystemError("Language '{0}' does not exist".format(language))

        all_grams = []
        with codecs.open(wordlist, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip('\n').lower().split()
                ngrams = reduce(lambda x, y: x + y, map(lambda word: self.find_ngrams(word, ngram_length), words))
                all_grams += ngrams
        return dict(Counter(all_grams))
Project: variational-text-tensorflow    Author: carpedm20
def _build_vocab(self, file_path, vocab_path):
    counter = Counter(self._read_text(file_path).split())

    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    self.vocab = dict(zip(words, range(len(words))))

    save_pkl(vocab_path, self.vocab)
Project: treecat    Author: posterior
def log_profiling_stats():
    logger.info('-----------------------------------------------------------')
    logger.info('Series:')
    for name, series in sorted(SERIES.items()):
        logger.info('  {}: {}'.format(name, ' '.join(map(str, series))))

    logger.info('-----------------------------------------------------------')
    logger.info('Histograms:')
    for name, histogram in sorted(HISTOGRAMS.items()):
        logger.info('{: >10s} {}'.format('Count', name))
        for value, count in sorted(histogram.items()):
            logger.info('{: >10d} {}'.format(count, value))

    logger.info('-----------------------------------------------------------')
    logger.info('Counters:')
    logger.info('{: >10s} {}'.format('Count', 'Counter'))
    for name, count in sorted(COUNTERS.items()):
        logger.info('{: >10d} {}'.format(count, name))

    logger.info('-----------------------------------------------------------')
    logger.info('Timers:')
    times = [(t.elapsed, t.count, f) for (f, t) in TIMERS.items()]
    times.sort(reverse=True, key=lambda x: x[0])
    logger.info('{: >10} {: >10} {}'.format('Seconds', 'Calls', 'Function'))
    for time, count, name in times:
        logger.info('{: >10.3f} {: >10} {}'.format(time, count, name))
Project: IgDiscover    Author: NBISweden
def _guess_cdr3_start(group):
        """
        Return a guess for the CDR3 start within sequences in the given group
        """
        return Counter(group.V_CDR3_start).most_common()[0][0]
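
most_common()[0][0] is the usual Counter idiom for the mode of a sequence, used above to pick the consensus CDR3 start position; a minimal sketch:

from collections import Counter

starts = [96, 99, 96, 96, 102]
print(Counter(starts).most_common()[0][0])   # 96, the most frequent value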
Project: trf    Author: aistairc
def calc_rs_pos(self) -> Dict[str, float]:
        """Calculate the ratio of each pos of words in input text
        Returns:
            float: the ratio of each pos of words in input text
        """
        pos = []
        # TODO: It may take a long time when the number of sentences is large
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
        pos_counter = Counter(pos)
        total = sum(pos_counter.values())
        return {name: float(num) / total for name, num in pos_counter.items()}
Project: cellranger    Author: 10XGenomics
def __init__(self, **kwargs):
        Metric.__init__(self, **kwargs)
        self.d = collections.Counter()