Python editdistance module: eval() code examples

The following 44 code examples, extracted from open-source Python projects, illustrate how to use editdistance.eval().
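
editdistance is a fast, C++-backed implementation of the Levenshtein (edit) distance. Before the project examples, a minimal usage sketch (the inputs here are illustrative):

import editdistance

# Three single-character edits turn 'kitten' into 'sitting'.
assert editdistance.eval('kitten', 'sitting') == 3

# Any pair of sequences of hashable items works, e.g. token lists:
assert editdistance.eval(['who', 'is', 'there'], ['who', 'was', 'there']) == 1

eval() returns the raw edit count as an integer; any normalization (dividing by sequence length) is left to the caller, as the examples below show.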

Project: keras    Author: GeekLiB
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
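
This show_edit_distance pattern recurs almost verbatim in several Keras forks below. decode_batch and text_img_gen are project internals, but the metric itself is self-contained; a standalone sketch, assuming a list of (predicted, truth) string pairs with non-empty truths:

import editdistance

def mean_edit_distances(pairs):
    # pairs: list of (predicted, truth) strings
    total_ed = 0.0
    total_norm_ed = 0.0
    for predicted, truth in pairs:
        ed = editdistance.eval(predicted, truth)
        total_ed += ed
        total_norm_ed += ed / len(truth)
    return total_ed / len(pairs), total_norm_ed / len(pairs)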
Project: pCVR    Author: xjtushilei
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: markov-sentence-correction    Author: anassinator
def total_distance(observed_sentence, corrected_sentence):
    """Calculates the total distance between the two given sentences.

    Args:
        observed_sentence: Observed sentence.
        corrected_sentence: Corrected sentence.

    Returns:
        Total Levenshtein distance between the two sentences.
    """
    total_distance = 0

    observed_words = list(observed_sentence)
    corrected_words = list(corrected_sentence)

    for i in range(len(observed_words)):
        comparable_words = observed_words[i], corrected_words[i]
        total_distance += editdistance.eval(*comparable_words)

    return total_distance
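
The loop pairs words by position, so both sentences must yield the same number of words (a shorter corrected_sentence raises an IndexError). A hypothetical call on two pre-tokenized sentences:

>>> total_distance(['this', 'iz', 'a', 'tost'], ['this', 'is', 'a', 'test'])
2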
Project: thaanaOCR    Author: Sofwath
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: keras-customized    Author: ambrite
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: keras-mxnet-benchmarks    Author: sandeep-krishnamurthy
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: chat-roulette-python    Author: ph4r05
def similarities(self):
        """
        Compute the Levenshtein distance matrix between files (uses the C++-backed pip package editdistance).
        Later: consider https://docs.python.org/2/library/difflib.html
        :return:
        """

        ucos = sorted(self.filedb.keys())
        sims = {}

        for idx, uco in enumerate(ucos):
            logger.info('Comparing %s...' % uco)
            sims[uco] = {}

            for idx2, uco2 in enumerate(ucos[idx+1:]):
                dist = editdistance.eval(self.file_data[uco], self.file_data[uco2])
                sims[uco][uco2] = dist
                logger.info(' %6d vs %6d : %4d  %s  %s' % (uco, uco2, dist, self.filedb[uco], self.filedb[uco2]))
Project: event-cui-transfer    Author: mit-ddig
def best_match(word, corrected_med_list, corrected_english_list):
    min_dist_med = len(word)
    best_med_word = ''
    min_dist_eng = len(word)
    best_eng_word = ''
    for word_t in corrected_med_list:
        dist = editdistance.eval(word, word_t)
        if dist < min_dist_med:
            min_dist_med = dist
            best_med_word = word_t

    for word_t in corrected_english_list:
        dist = editdistance.eval(word, word_t)
        if dist < min_dist_eng:
            min_dist_eng = dist
            best_eng_word = word_t
    if min_dist_med <= min_dist_eng:
        return best_med_word
    else:
        return best_eng_word
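
A hypothetical call: the typo 'asprin' is one edit from a word in the medical list and two edits from the closest English word, so the medical match wins (ties also go to the medical list):

>>> best_match('asprin', ['aspirin', 'ibuprofen'], ['spring', 'aspire'])
'aspirin'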
Project: keras    Author: NVIDIA
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: ws-backend-community    Author: lavalamp-
def compare_strings_by_edit_distance(first=None, second=None):
        """
        Get the edit distance between the two strings passed to this method.
        :param first: The first string to compare.
        :param second: The second string to compare.
        :return: A number representing the edit distance between the two strings passed
        as arguments to this method.
        """
        return editdistance.eval(first, second)

    # Class Methods

    # Public Methods

    # Protected Methods

    # Private Methods

    # Properties

    # Representation and Comparison
Project: keras-101    Author: burness
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
Project: rebuild_obfuscator    Author: irobert-tluo
def simscore(a1, b1):
        max_len = max([len(a1), len(b1)])
        if max_len == 0:
            return 0
        dist = editdistance.eval(a1, b1)
        if dist > max_len:
            print(dist)
        return 1.0 - (float(dist)/float(max_len))
Project: rebuild_obfuscator    Author: irobert-tluo
def similarity(a1, b1):
  max_len = max([len(a1), len(b1)])
  if max_len == 0:
      return 0
  dist = editdistance.eval(a1, b1)
  return 1.0 - (float(dist)/float(max_len))
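
Both helpers map the raw edit count onto a similarity score in [0, 1]: identical inputs score 1.0 and completely different inputs approach 0.0. A worked example:

>>> round(similarity('kitten', 'sitting'), 4)  # distance 3 over max length 7
0.5714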
Project: speechless    Author: JuliusKunze
def letter_error_count(self) -> float:
        return editdistance.eval(self.expected, self.predicted)
Project: speechless    Author: JuliusKunze
def word_error_count(self) -> float:
        return editdistance.eval(self.expected_words, self.predicted.split())
Project: DeepLearning-OCR    Author: xingjian-f
def edit_dis(a, b):
    return editdistance.eval(a, b)
Project: pe    Author: anguelos
def getEditDistanceMat(gtTranscriptions,sampleTranscriptions):
    outputShape=[len(gtTranscriptions),len(sampleTranscriptions)]
    distMat=np.empty(outputShape)
    maxSizeMat=np.empty(outputShape)
    for gtNum in range(len(gtTranscriptions)):
        for sampleNum in range(len(sampleTranscriptions)):
            distMat[gtNum,sampleNum]=editdistance.eval(gtTranscriptions[gtNum],sampleTranscriptions[sampleNum])
            maxSizeMat[gtNum,sampleNum]=max(len(gtTranscriptions[gtNum]),len(sampleTranscriptions[sampleNum]))
    return distMat/maxSizeMat,distMat
Project: json-merger    Author: inveniosoftware-contrib
def _normalized_edit_dist(s1, s2):
    return float(editdistance.eval(s1, s2)) / max(len(s1), len(s2), 1)
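
The extra 1 inside max() guards against division by zero when both strings are empty:

>>> _normalized_edit_dist('', '')
0.0
>>> _normalized_edit_dist('ab', 'abcd')
0.5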
Project: Library-Identification    Author: Riscure
def compare_cc_list_levenshtein(sample, ref):
    """
    Compares the cyclomatic complexity values of all functions in `sample`
    with those of all functions in `ref`, by taking the Levenshtein distance
    between these lists. This detects added/removed functions and functions
    that have changed in complexity between a sample and a reference.
    """
    if hasattr(ref, 'cclist') and ref.cclist is not None:
        ratio = 1 - (editdistance.eval(sample.cclist, ref.cclist)
                    / float(max(len(sample.cclist), len(ref.cclist))))
    else:
        ratio = 0.0

    return (ratio * 100, ref.name, ref.version)
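
Note that editdistance.eval accepts numeric lists as well as strings, so a sequence of per-function complexity values can be compared directly:

>>> editdistance.eval([3, 1, 7, 2], [3, 1, 6, 2, 5])  # one value changed, one appended
2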
Project: pandora    Author: mikekestemont
def annotate(self, tokens):
        X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
        X_context = self.pretrainer.transform(tokens=tokens)

        # get predictions:
        new_in = {}
        if self.include_token:
            new_in['focus_in'] = X_focus
        if self.include_context:
            new_in['context_in'] = X_context
        preds = self.model.predict(new_in)

        if isinstance(preds, np.ndarray):
            preds = [preds]

        annotation_dict = {'tokens': tokens}
        if self.include_lemma:
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
            annotation_dict['lemmas'] = pred_lemmas
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                            key=lambda x: editdistance.eval(x, pred_lemmas[i]))
                annotation_dict['postcorrect_lemmas'] = pred_lemmas

        if self.include_pos:
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
            annotation_dict['pos'] = pred_pos

        if self.include_morph:
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
            annotation_dict['morph'] = pred_morph

        return annotation_dict
Project: WebMan    Author: flipflop97
def searchPackages(name):
    results = loadJson('https://www.archlinux.org/packages/search/json/?q=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['pkgname']))[:100]
    packages = [parsePackage(package, name) for package in results if package['arch'] in (arch, 'any')]

    results = loadJson('https://aur.archlinux.org/rpc/?v=5&type=search&arg=%s' % name)['results']
    results = sorted(results, key=lambda x: levdist(name, x['Name']))[:100]
    packages += [parsePackage(package, name) for package in results]

    packages = sorted(packages, key=lambda x: levdist(name, x[0]))[:100]
    return packages
Project: atropos    Author: jdidion
def set_trimming(self, u, t, use_edit_distance=True):
        untrimmed = u.query_sequence.upper()
        untrimmed_len = len(untrimmed)
        trimmed = t.query_sequence.upper()
        trimmed_len = len(trimmed)

        trimmed_front = 0 if use_edit_distance else -1
        if use_edit_distance and (untrimmed_len > trimmed_len):
            for i in range(untrimmed_len - trimmed_len + 1):
                if untrimmed[i:(i+trimmed_len)] == trimmed:
                    trimmed_front = i
                    break
            else:
                # Since Skewer performs automatic error correction, the trimmed and
                # untrimmed reads may not match, so in that case we find the closest
                # match by Levenshtein distance.
                dist = None
                for i in range(untrimmed_len - trimmed_len + 1):
                    d = editdistance.eval(untrimmed[i:(i+trimmed_len)], trimmed)
                    if dist is None:
                        dist = d
                    elif d < dist:
                        trimmed_front = i
                        dist = d

        self.trimmed_front = trimmed_front
        self.trimmed_back = untrimmed_len - (trimmed_len + trimmed_front)
Project: sequtils    Author: atgtag
def edit(seq1, seq2):
    """
    Wrapper around editdistance.eval for fast Levenshtein
    distance computation.

    Args:
        seq1 (str): Reference sequence
        seq2 (str): Sequence to compare

    Examples:
        >>> edit('banana', 'bahama')
        2
    """
    return int(ed.eval(seq1, seq2))
Project: kaggle    Author: rbauld
def edit_distance(train_in, test_in, qcolumns = ['question1', 'question2'], append=''):

    train = train_in.copy().loc[:,qcolumns]
    test = test_in.copy().loc[:,qcolumns]

    import editdistance

    def my_fun(row, qcolumns):
        return editdistance.eval(row[qcolumns[0]], row[qcolumns[1]])

    key = 'edit_dist'+append
    train[key] = train.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)
    test[key]  = test.apply(lambda x: my_fun(x, qcolumns=qcolumns), axis=1)

    return (train, test)
Project: social-vuln-scanner    Author: Betawolf
def bestNameDiff(profileone, profiletwo):
    """ Applies Levenshtein distance between best names of two profiles."""
    n1 = profileone.bestname()
    n2 = profiletwo.bestname()
    if (not n1) or (not n2):
      return 0
    l1 = profileone.name_length
    l2 = profiletwo.name_length
    diff = editdistance.eval(n1,n2)
    return 1-(diff/(l1 if l1 > l2 else l2))
Project: social-vuln-scanner    Author: Betawolf
def string_sim(n1, n2):
    """ Applies Levenshtein distance between strings."""
    if (not n1) or (not n2):
      return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1,n2)
    return 1-(diff/(l1 if l1 > l2 else l2))
Project: Mandalorion    Author: christopher-vollmers
def collect_file_paths(path, gene_file):
    genes_of_interest = []
    for line in open(gene_file):
        genes_of_interest.append(line.strip())

    isoform_list = []
    gene_read_counter = {}
    isoform_read_counter = {}
    for gene in genes_of_interest:
        gene_read_counter[gene] = 0
        for file1 in sorted(os.listdir(path + '/parsed_reads')):
            if gene in file1:

                file2 = file1 + '_sub'
                out_sub = open(path + '/parsed_reads/' + file2, 'w')
                counter = 0
                isoform_reads = read_fasta(path + '/parsed_reads/' + file1)
                isoform_read_list = list(isoform_reads.keys())
                print(gene_read_counter, gene_read_counter[gene], len(isoform_reads.keys()))
                gene_read_counter[gene] += len(isoform_reads.keys())
                isoform_read_counter[path + '/parsed_reads/' + file2] = len(isoform_reads.keys())
                read1 = isoform_read_list[0]
                out_sub.write('>' + read1 + '\n' + isoform_reads[read1] + '\n')
                for read2 in isoform_read_list[1::]:
                    if counter < subsample:
                        out_sub.write('>' + read2 + '\n')
                        dist_1 = editdistance.eval(isoform_reads[read1], isoform_reads[read2])**2 / float(len(isoform_reads[read1]) * len(isoform_reads[read2]))
                        dist_2 = editdistance.eval(isoform_reads[read1], reverse_complement(isoform_reads[read2]))**2 / float(len(isoform_reads[read1]) * len(isoform_reads[read2]))
                        if dist_1 < dist_2:
                            out_sub.write(isoform_reads[read2] + '\n')
                        else:
                            out_sub.write(reverse_complement(isoform_reads[read2]) + '\n')
                    counter += 1

                isoform_list.append((path + '/parsed_reads/' + file2, gene))

    return isoform_list, gene_read_counter, isoform_read_counter
Project: wub    Author: nanoporetech
def test_simulate_sequencing_errors(self):
        """Test function simulating sequencing errors."""
        error_rate = 0.1
        error_weights = {'substitution': 1.0 / 6,
                         'insertion': 1.0 / 6,
                         'deletion': 4.0 / 6}
        sequence = sim_seq.simulate_sequence(5000)
        mutated_record = sim_seq.simulate_sequencing_errors(
            sequence, error_rate, error_weights)
        distance = editdistance.eval(sequence, mutated_record.seq)
        expected_errors = len(sequence) * error_rate
        errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate))
        # Should pass 0.9973 proportion of cases:
        self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors +
                        errors_sd * 3, msg="expected: {} realised:{}".format(expected_errors, distance))
Project: OCkRE    Author: rossumai
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        wrong = 0
        right = 0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc], word_batch['labeltype_input'][0:num_proc])
            for j in range(0, num_proc):
                ocr_result = deaccent(re.sub(r"[+/]", "", re.sub(r"\s", "", decoded_res[j])))
                gold_label = re.sub(r"[+/]", "", re.sub(r"\s", "", word_batch['source_str'][j]))
                if gold_label == ocr_result:
                    right += 1
                else:
                    wrong += 1
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        absacc = float(right) / (float(right) + float(wrong))
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        outline = ' Out of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f\n Absolute accuracy over labels is %0.2f\n' % (
            num, mean_ed, mean_norm_ed, absacc)
        print(outline)

        return mean_norm_ed, absacc
Project: rctw17    Author: bgshih
def text_distance(str1, str2):
  str1 = normalize_txt(str1)
  str2 = normalize_txt(str2)
  return editdistance.eval(str1, str2)
Project: speechT    Author: timediv
def track_decoding(self, decoded_str, expected_str):
    self.letter_edit_distance = editdistance.eval(expected_str, decoded_str)
    self.letter_error_rate = self.letter_edit_distance / len(expected_str)
    self.word_edit_distance = editdistance.eval(expected_str.split(), decoded_str.split())
    self.word_error_rate = self.word_edit_distance / len(expected_str.split())
    self.sum_letter_edit_distance += self.letter_edit_distance
    self.sum_letter_error_rate += self.letter_error_rate
    self.sum_word_edit_distance += self.word_edit_distance
    self.sum_word_error_rate += self.word_error_rate
    self.decodings_counter += 1
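
The two eval() calls above show the idiom worth remembering: passing strings compares character by character, while passing .split() word lists compares token by token. For example:

import editdistance

expected = 'the cat sat'
decoded = 'the bad sat'
editdistance.eval(expected, decoded)                  # 2 (character edits)
editdistance.eval(expected.split(), decoded.split())  # 1 (word edit)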
Project: speechT    Author: timediv
def run_step(self, model: SpeechModel, sess: tf.Session, stats: EvalStatistics,
               save: bool, verbose=True, feed_dict: Dict=None):
    global_step = model.global_step.eval()

    # Validate on data set and write summary
    if save:
      avg_loss, decoded, label, summary = model.step(sess, update=False, decode=True, return_label=True,
                                                     summary=True, feed_dict=feed_dict)
      model.summary_writer.add_summary(summary, global_step)
    else:
      avg_loss, decoded, label = model.step(sess, update=False, decode=True,
                                            return_label=True, feed_dict=feed_dict)

    if verbose:
      perplexity = np.exp(float(avg_loss)) if avg_loss < 300 else float("inf")
      print("validation average loss {:.2f} perplexity {:.2f}".format(avg_loss, perplexity))

    # Print decode
    decoded_ids_paths = [Evaluation.extract_decoded_ids(path) for path in decoded]
    for label_ids in Evaluation.extract_decoded_ids(label):
      expected_str = speecht.vocabulary.ids_to_sentence(label_ids)
      if verbose:
        print('expected: {}'.format(expected_str))
      for decoded_path in decoded_ids_paths:
        decoded_ids = next(decoded_path)
        decoded_str = speecht.vocabulary.ids_to_sentence(decoded_ids)
        stats.track_decoding(decoded_str, expected_str)
        if verbose:
          print('decoded: {}'.format(decoded_str))
          print('LED: {} LER: {:.2f} WED: {} WER: {:.2f}'.format(stats.letter_edit_distance,
                                                                 stats.letter_error_rate,
                                                                 stats.word_edit_distance,
                                                                 stats.word_error_rate))
Project: inflation_calc    Author: EricSchles
def closest(self, date=datetime.date.today(), country=None,
                limit=datetime.timedelta(days=366)):
        """
        Get the closest CPI value for a specified date. The date defaults to
        today. A limit can be provided to exclude all values for dates further
        away than defined by the limit. This defaults to 366 days.
        """

        # Try to get the country; fall back to fuzzy matching on a typo
        if country in self.data:
            possible_countries = [country]
        else:
            possible_countries = [elem for elem in self.data.keys() if editdistance.eval(country, elem) < 3]
            if len(possible_countries) == 0:
                return "No country found, typo unlikely for " + country

        # Find the closest date
        country_cpi = {}
        for country in possible_countries:
            min_year_diff = 1000
            min_year = 0
            for year in self.data[country]:
                if min_year_diff > abs(date.year - int(year)):
                    min_year_diff = abs(date.year - int(year))
                    min_year = year
            country_cpi[country] = self.data[country][min_year]
        if len(country_cpi) == 1:
            return country_cpi[list(country_cpi)[0]]
        else:
            return country_cpi
Project: speech    Author: awni
def compute_cer(results):
    """
    Arguments:
        results (list): list of ground truth and
            predicted sequence pairs.

    Returns the CER for the full set.
    """
    dist = sum(editdistance.eval(label, pred)
                for label, pred in results)
    total = sum(len(label) for label, _ in results)
    return dist / total
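
A hypothetical call with two (ground truth, prediction) pairs:

>>> compute_cer([('hello', 'hallo'), ('world', 'word')])  # (1 + 1) / (5 + 5)
0.2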
Project: tensorflow-quorakaggle    Author: ram1988
def __evaluateLevensteinDistance(self, question1, question2):
        leven_dis = levendis.eval(question1.lower(), question2.lower())
        return leven_dis
Project: panphon    Author: dmort27
def fast_levenshtein_distance(self, source, target):
        """Wrapper for the distance function in the Levenshtein module

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            int: minimum number of Levenshtein edits required to get from
                 `source` to `target`
        """
        return int(editdistance.eval(source, target))
Project: panphon    Author: dmort27
def fast_levenshtein_distance_div_maxlen(self, source, target):
        """Levenshtein distance divided by maxlen

        Args:
            source (unicode): source word
            target (unicode): target word

        Returns:
            float: minimum number of Levenshtein edits required to get from
                 `source` to `target` divided by the length of the longest
                 of these arguments
        """
        maxlen = max(len(source), len(target))
        return int(editdistance.eval(source, target)) / maxlen
Project: agrigento    Author: ucsb-seclab
def calc_score(value, values):
    distance = 1000000000
    for v in values:
        if len(value) == len(v):
            d = bit_edit_distance(value, v)
        else:
            d = editdistance.eval(value, v) * 8
        distance = min(distance, d)

    return distance
Project: attention_ocr    Author: lightcaster
def batched_wer(ref, hyp):
    ''' Computes mean WER 

    ref: list of references
    hyp: list of corresponding hypotheses

    '''
    assert len(ref) == len(hyp)

    wer = 0.
    for r,f in zip(ref, hyp):
        rate = editdistance.eval(r, f) / len(r)
        wer += rate

    return wer/len(ref)
Project: dnnQuery    Author: richardxiong
def strSimilarity(word1, word2):
    ''' Measure the similarity based on Edit Distance
    ### Measure how similar word1 is with respect to word2
    '''
    diff = ed.eval(word1.lower(), word2.lower())   #search
    # lcs = LCS(word1, word2)   #search
    length = max(len(word1), len(word2))
    if diff >= length:
        similarity = 0.0
    else:
        similarity = 1.0 * (length-diff) / length
    return similarity
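
A hypothetical call (the comparison is case-insensitive):

>>> round(strSimilarity('Color', 'colour'), 4)  # 1 edit over max length 6
0.8333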
Project: pe    Author: anguelos
def getFSNSMetrics(gtIdTransDict,methodIdTransDict):
    """Provides metrics for the FSNS dataset.
    FM, precision, recall and correctSequences are an implementation of the metrics described in
    "End-to-End Interpretation of the French Street Name Signs Dataset"
    [https://link.springer.com/chapter/10.1007%2F978-3-319-46604-0_30]
    Params:
        gtIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.
        methodIdTransDict : sample_id to data dictionary. A simple file name to file contents might do.

    returns:
        A tuple with floats between 0 and 1 with all worth reporting measurements.
        FM, Precision, Recall, global correct word transcriptions, if someone returned
        "rue" as the transcription of every image, assuming half the images have it, he
        would get a precision of 50%, a recall of ~5% and an FM of ~9.1%.
        He would get a correctSequences score of 0%, and a similarity of e%.
    """
    def compareTexts(sampleTxt,gtTxt):
        relevant=gtTxt.lower().split()
        retrieved=sampleTxt.lower().split()
        correct=(set(relevant).intersection(set(retrieved)))
        similarity=1.0/(1+editdistance.eval(gtTxt.lower(),sampleTxt.lower()))
        res=(len(correct),len(relevant),len(retrieved),relevant==retrieved,similarity)
        return res
    mDict={k:'' for k in gtIdTransDict.keys()}
    mDict.update(methodIdTransDict)
    methodIdTransDict=mDict
    methodKeys=sorted(methodIdTransDict.keys())
    gtKeys=sorted(gtIdTransDict.keys())
    if len(methodKeys) != len(set(methodKeys)) or len(gtKeys) != len(set(gtKeys)) or len(set(methodKeys)-set(gtKeys)) > 0:  # gt and method disagree on samples
        sys.stderr.write("GT and submission disagree on the sample ids\n")
        sys.exit(1)
    corectRelevantRetrievedSimilarity=np.zeros([len(gtKeys),5],dtype='float32')
    for k in range(len(gtKeys)):
        sId=gtKeys[k]
        corectRelevantRetrievedSimilarity[k,:]=compareTexts(methodIdTransDict[sId],gtIdTransDict[sId])
    precision=(corectRelevantRetrievedSimilarity[:,0].sum()/(corectRelevantRetrievedSimilarity[:,1].sum()))
    recall=(corectRelevantRetrievedSimilarity[:,0].sum()/(corectRelevantRetrievedSimilarity[:,2].sum()))
    FM=(2*precision*recall)/(precision+recall)
    correctSequences=corectRelevantRetrievedSimilarity[:,3].mean()
    similarity=corectRelevantRetrievedSimilarity[:,4].mean()
    combinedSoftMetric = (1-FM)*FM + FM*similarity  # The better FM is, the less it matters in the overall score
    return combinedSoftMetric,FM,precision,recall,similarity,correctSequences,corectRelevantRetrievedSimilarity
Project: markov-sentence-correction    Author: anassinator
def _correct(observed_sentence, bigrams, distribution, max_error_rate):
    """Corrects a given sentence.

    Note: The lower the max_error_rate, the faster the algorithm, but the
          likelier it will fail.

    Args:
        observed_sentence: Observed sentence.
        bigrams: First-order Markov chain of likely word sequences.
        distribution: Error probability distribution function.
        max_error_rate: Maximum number of errors in a word to consider.

    Returns:
        Ordered list of tuples of (corrected sentence, its probability).
        Most likely interpretations come first.
    """
    trellis = [{Sentence.START: (1.0, None)}]

    observed_words = list(observed_sentence)
    number_of_words = len(observed_words)

    for k in range(1, number_of_words):
        observed_word = observed_words[k]
        max_errors = int(len(observed_word) * max_error_rate) + 1

        current_states = {}
        previous_states = trellis[k - 1]
        trellis.append(current_states)

        for previous_word in previous_states:
            previous_prob = previous_states[previous_word][0]

            future_states = bigrams.yield_future_states((previous_word,))
            for possible_word, conditional_prob in future_states:
                # Conditional probability: P(X_k | X_k-1) * previous
                # probability.
                total_prob = conditional_prob * previous_prob

                # Emission probability: P(E_k | X_k).
                distance = editdistance.eval(observed_word, possible_word)
                total_prob *= distribution(distance)

                # Ignore states that have too many mistakes.
                if distance > max_errors:
                    continue

                # Only keep link of max probability.
                if possible_word in current_states:
                    if current_states[possible_word][0] >= total_prob:
                        continue

                current_states[possible_word] = (total_prob, previous_word)

    # Find most likely ending.
    interpretations = list(_backtrack_path(trellis, x) for x in trellis[-1])
    interpretations.sort(key=lambda x: x[1], reverse=True)

    return interpretations
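
The distribution argument supplies the emission probability P(E_k | X_k) as a function of edit distance. The source does not fix its form; a hypothetical geometric decay could look like this:

def geometric_distribution(p=0.9):
    # P(observed word | intended word) decays geometrically with the
    # edit distance between them; p is the probability of no error.
    return lambda distance: p * ((1 - p) ** distance)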
Project: pandora    Author: mikekestemont
def test(self, multilabel_threshold=0.5):
        if not self.include_test:
            raise ValueError('Please do not call .test() if no test data is available.')

        score_dict = {}

        # get test predictions:
        test_in = {}
        if self.include_token:
            test_in['focus_in'] = self.test_X_focus
        if self.include_context:
            test_in['context_in'] = self.test_contexts

        test_preds = self.model.predict(test_in,
                                batch_size=self.batch_size)

        if isinstance(test_preds, np.ndarray):
            test_preds = [test_preds]

        if self.include_lemma:
            print('::: Test scores (lemmas) :::')

            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=test_preds[self.lemma_out_idx])
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                        key=lambda x: editdistance.eval(x, pred_lemmas[i]))
            score_dict['test_lemma'] = evaluation.single_label_accuracies(gold=self.test_lemmas,
                                                 silver=pred_lemmas,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)

        if self.include_pos:
            print('::: Test scores (pos) :::')
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=test_preds[self.pos_out_idx])
            score_dict['test_pos'] = evaluation.single_label_accuracies(gold=self.test_pos,
                                                 silver=pred_pos,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)

        if self.include_morph:     
            print('::: Test scores (morph) :::')
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=test_preds[self.morph_out_idx],
                                                                   threshold=multilabel_threshold)
            if self.include_morph == 'label':
                score_dict['test_morph'] = evaluation.single_label_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)                
            elif self.include_morph == 'multilabel':
                score_dict['test_morph'] = evaluation.multilabel_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
        return score_dict
Project: handelsregister    Author: Amsterdam
def fix_ambiguous(ambiguous_sbi):
    """
    For each ambiguous sbi code find to most likely candidate

     0       vs.id,
     1       vs.naam,
     2       codes.hr_code,
     3       codes.alt_code,
     4       codes.title,
     5       codes.alt_title,
     6       codes.sub_cat,
     7       codes.alt_sub_cat,
     8       codes.mks_title

    """
    original_count = 0
    suggestion_count = 0

    for row in ambiguous_sbi:

        normalcode = row[2]
        zerocode = row[3]

        desc1 = row[4]
        desc2 = row[5]
        original = row[8]

        distance_desc1 = editdistance.eval(desc1, original)
        distance_desc2 = editdistance.eval(desc2, original)

        if distance_desc1 > distance_desc2:
            # the alternative match with 0 is better
            suggestion_count += 1
            ves = hrmodels.Vestiging.objects.get(id=row[0])
            invalid_activiteit = ves.activiteiten.get(sbi_code=normalcode)
            # fix the code
            invalid_activiteit.sbi_code = zerocode
            # save the corrected sbi code
            invalid_activiteit.save()
            # now save updated code
        else:
            # do nothing default is fine
            original_count += 1

        log.debug(f'{normalcode}, {zerocode}, {desc1[:18]}, {desc2[:18]}, {original[:18]}, {distance_desc1}, {distance_desc2}')  # noqa

    log.debug("%s-%s = Original-Suggestion", original_count, suggestion_count)