Python Levenshtein module, ratio() example source code

We extracted the following 43 code examples from open-source Python projects to illustrate how to use Levenshtein.ratio().
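For orientation, here is a minimal, self-contained sketch of the call all of these snippets share. Levenshtein.ratio() returns a similarity score in [0.0, 1.0], computed as 1 - (edit distance with substitution cost 2) / (combined length of both strings):

import Levenshtein

print(Levenshtein.ratio("hello", "hello"))  # 1.0 (identical strings)
print(Levenshtein.ratio("abc", "abd"))      # 0.666... = 1 - 2/6 (one substitution, cost 2)
print(Levenshtein.ratio("spam", "spm"))     # 0.857... = 1 - 1/7 (one deletion, cost 1)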

Project: dupandas    Author: shivam5992    | project source | file source
def match_elements(self, text1, text2):
        """
        utility function to match two strings; makes use of the
        match config initialized in __init__

        returns the confidence score of the flexible match
        """

        conf = 0
        if self.m_config['exact']:
            if text1 == text2:
                conf += 1

        if self.m_config['levenshtein']:
            conf += ratio(text1, text2)

        if self.m_config['soundex']:
            if soundex(text1) == soundex(text2):
                conf += 1

        if self.m_config['nysiis']:
            if fuzzy.nysiis(text1) == fuzzy.nysiis(text2):
                conf += 1

        return conf
Project: entity-linker    Author: seucs    | project source | file source
def getVec(kb, id1, id2):
    if kb == 'bh':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('hudong', id2)
    if kb == 'bw':
        title1, context1, category1 = getMsgbyId('baidu', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)
    if kb == 'hw':
        title1, context1, category1 = getMsgbyId('hudong', id1)
        title2, context2, category2 = getMsgbyId('wiki', id2)

    title_r = Levenshtein.ratio(title1, title2)
    context_r = cosine(context1, context2)
    category_r = sameCategory(category1, category2)

    return (title_r, context_r, category_r, 0.0)
Project: lorelei-speech-evaluation    Author: usc-sail    | project source | file source
def frame_similarity(frame1,frame2):
    similarity = 1
    if 'Type' in frame1:
        if frame1['Type'] != frame2['Type']:
            similarity = 0.0
    if similarity == 1:
        if 'PlaceMention' in frame1:
            # if PlaceMention is normalized use simple string comparison
            if not Levenshtein_arg:
                if frame1['PlaceMention']  != frame2['PlaceMention']:
                    similarity = 0.0
            else:
                # PlaceMention is not normalized, so use the Levenshtein ratio
                similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
    #print("similarity: ", similarity)
    return similarity
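For intuition, a hypothetical pair of frames (field values made up) under the fuzzy branch, i.e. with Levenshtein_arg set:

frame_a = {'Type': 'flood', 'PlaceMention': 'New Orleans'}
frame_b = {'Type': 'flood', 'PlaceMention': 'Orleans'}
# Types match, so the score falls through to the ratio of the mentions:
# Levenshtein.ratio('New Orleans', 'Orleans') == 14/18 ~= 0.78
print(frame_similarity(frame_a, frame_b))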


# evaluate at the document level -----------------------------------------------
Project: intake    Author: codeforamerica    | project source | file source
def get_message_change_ratio(status_update):
    """Expects a status update instance, returns a number representing
    how much a message has been edited (1.0 completely changed, 0.0 unchanged)
    based on Levenshtein ratio.
    If a status update has no associated notification, returns None
    https://github.com/ztane/python-Levenshtein
    """
    if hasattr(status_update, 'notification'):
        author_profile = status_update.author.profile
        intro_text = get_notification_intro(author_profile) + '\n\n'
        return 1.0 - Levenshtein.ratio(
            *[message.replace(intro_text, '')
              for message in (
                status_update.notification.base_message,
                status_update.notification.sent_message)])
    else:
        return None
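As a quick illustration of the 1.0 - ratio convention (the strings below are made up):

import Levenshtein

base = "Your application was submitted."
sent = "Your application was submitted yesterday."
# 0.0 would mean the sent message equals the base message;
# values near 1.0 would mean it was almost completely rewritten.
print(round(1.0 - Levenshtein.ratio(base, sent), 2))  # ~0.14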
Project: ELBaselines    Author: cltl    | project source | file source
def getCandidatesForLemma(lemma, min_size, max_size):
    hits=[]
    for match in ["phrase", "conjunct"]:
        url="http://lotus.lodlaundromat.org/retrieve?size=" + str(max_size) + "&match=" + match + "&rank=psf&noblank=true&" + urllib.parse.urlencode({"string": lemma, "predicate": "label", "subject": "\"http://dbpedia.org/resource\""})
        r = requests.get(url=url)
        content = r.json()

        these_hits=content["hits"]
        hits=hits + these_hits
        if content["numhits"]>=min_size or len(lemma.split(' '))==1:
            break

    subjects={}
    for hit in hits:
        lev_sim=Levenshtein.ratio(hit["string"].lower(), lemma.lower())
        if "Disambiguation" not in hit["subject"].lower() and "Category" not in hit["subject"]:
            if hit["subject"] not in subjects:
                #subjects[hit["subject"]]=hit["length"]*len(lemma.split())
                subjects[hit["subject"]]={"ss": lev_sim, "count": 1}
            else:
                subjects[hit["subject"]]["ss"]=max(subjects[hit["subject"]]["ss"], lev_sim)
                subjects[hit["subject"]]["count"]+=1
    return subjects
Project: iqra-api    Author: Crescent-Labs    | project source | file source
def mostCommon(spoken, lst, threshold):
    highestCountItem = max(lst, key=lst.count)
    highestCount = lst.count(highestCountItem)
    contenders = []
    for item in lst:
        if (lst.count(item) == highestCount) and (item not in contenders):
            contenders.append(item)
    if len(contenders) > 1:
        print "\nContending"
        bestMatch = [None, 0]
        for ayah in contenders:
            score = ratio(spoken, ayah)
            print(ayah)
            print(score)
            if score > threshold and score > bestMatch[1]:
                bestMatch = [ayah, score]
        return bestMatch[0]
    elif ratio(spoken, highestCountItem) > threshold:
        return highestCountItem
    else:
        return None
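A hypothetical call (candidate strings made up): with two candidates tied at two occurrences each, the tie is broken by ratio() against the spoken text:

candidates = ["bismillah", "bismillah", "basmalah", "basmalah"]
# ratio("bismilah", "bismillah") ~= 0.94 beats ratio("bismilah", "basmalah") = 0.75,
# and both clear the threshold, so "bismillah" wins the tie
print(mostCommon("bismilah", candidates, 0.6))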


# Takes in a query and list of matches
# Returns the match with the highest similarity to the query
Project: Hanhan_NLP    Author: hanhanwu    | project source | file source
def print_matched_groups(extracted_combo_lst):
    dst_dct = {}

    # iterate over a snapshot: items are removed from the list below, and
    # mutating a list while iterating over it would skip elements
    for itm in list(extracted_combo_lst):
        if itm not in extracted_combo_lst: continue
        dst_dct.setdefault(itm, [])
        if len(extracted_combo_lst) == 1: break

        match_dct = {}
        for other in extracted_combo_lst:
            if other == itm: continue
            dst = Levenshtein.ratio(itm, other)
            match_dct[other] = dst

        sorted_match_lst = sorted(match_dct.items(), key = operator.itemgetter(1), reverse = True)
        top_n = 2
        dst_dct[itm] = [e[0] for e in sorted_match_lst[0:top_n]]
        extracted_combo_lst.remove(itm)
        for e in dst_dct[itm]:
            extracted_combo_lst.remove(e)

    for k, v in dst_dct.items():
        print(k, v)
        print()
Project: phat    Author: danielfranca    | project source | file source
def should_run(self):
        data = self.item_options.get('compare_url')

        if data:
            if isinstance(data, Dict):
                self.fuzzy = data.get('fuzzy', 1.0)
                self.url2 = data.get('url')
                if not self.url2:
                    logger.debug('compare_url must contain a url')
                    return False
            else:
                logger.debug('compare_url must be a nested dictionary containing url and fuzzy properties')
                return False

            return True

        return False
Project: health-mosconi    Author: GNUHealth-Mosconi    | project source | file source
def ratio(self):
        if not self._ratio:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio
Project: health-mosconi    Author: GNUHealth-Mosconi    | project source | file source
def quick_ratio(self):
        # This is usually quick enough :o)
        if not self._ratio:
            self._ratio = ratio(self._str1, self._str2)
        return self._ratio
Project: entity-linker    Author: seucs    | project source | file source
def getCandidates(self, mention, threshold=0.7):
        res = []

        # match against title entries
        for id, title, link_count in self.db_titles:
            m_score = Levenshtein.ratio(title, mention)
            if m_score > threshold:
                self.cur.execute("select abstract from abstract where id = %s"%id)
                context = self.cur.fetchall()
                if context != ():
                    context = json.loads(context[0][0])

                RE = []
                self.cur.execute("select to_id from link where from_id = %s"%id)
                linkto_ids = self.cur.fetchall()
                if linkto_ids != ():
                    for to_id in linkto_ids:
                        RE.append(to_id[0])

                res.append(Entity(title, id, m_score, context, link_count, RE))

        # match against disambiguation entries
        for id, title, dis_context, link_count in self.db_disambiguations:
            m_score = Levenshtein.ratio(title, mention)
            if m_score > threshold:
                title += '[%s]'%dis_context
                self.cur.execute("select abstract from abstract where id = %s"%id)
                context = self.cur.fetchall()
                if context != ():
                    context = json.loads(context[0][0])
                    context.append(dis_context)

                RE = []
                self.cur.execute("select to_id from link where from_id = %s"%id)
                linkto_ids = self.cur.fetchall()
                if linkto_ids != ():
                    for to_id in linkto_ids:
                        RE.append(to_id[0])

                res.append(Entity(title, id, m_score, context, link_count, RE))
        return res
Project: entity-linker    Author: seucs    | project source | file source
def getCandidates(self, mention, threshold=0.7):
        res = []

        # match against title entries
        for id, title, link_count in self.db_titles:
            m_score = Levenshtein.ratio(title, mention)
            if m_score > threshold:
                self.cur.execute("select abstract from abstract where id = %s"%id)
                context = self.cur.fetchall()
                if context != ():
                    context = json.loads(context[0][0])

                RE = []
                self.cur.execute("select to_id from link where from_id = %s"%id)
                linkto_ids = self.cur.fetchall()
                if linkto_ids != ():
                    for to_id in linkto_ids:
                        RE.append(to_id[0])

                res.append(Entity(title, id, m_score, context, link_count, RE))

        # match against disambiguation entries
        for id, title, dis_context, link_count in self.db_disambiguations:
            m_score = Levenshtein.ratio(title, mention)
            if m_score > threshold:
                title += '[%s]'%dis_context
                self.cur.execute("select abstract from abstract where id = %s"%id)
                context = self.cur.fetchall()
                if context != ():
                    context = json.loads(context[0][0])
                    context.append(dis_context)

                RE = []
                self.cur.execute("select to_id from link where from_id = %s"%id)
                linkto_ids = self.cur.fetchall()
                if linkto_ids != ():
                    for to_id in linkto_ids:
                        RE.append(to_id[0])

                res.append(Entity(title, id, m_score, context, link_count, RE))
        return res
Project: lorelei-speech-evaluation    Author: usc-sail    | project source | file source
def frame_similarity(frame1,frame2):
    similarity = 1
    if 'Type' in frame1:
        if frame1['Type'] != frame2['Type']:
            similarity = 0
    if similarity == 1:
        if 'PlaceMention' in frame1:
            similarity = Levenshtein.ratio(frame1['PlaceMention'], frame2['PlaceMention'])
    return similarity


# evaluate at the document level -----------------------------------------------
Project: intake    Author: codeforamerica    | project source | file source
def get_name_similarity_ratio(a, b):
    names = (get_full_lowercase_name(sub) for sub in (a, b))
    return Levenshtein.ratio(*names)
Project: relocaliser    Author: very-scary-scenario    | project source | file source
def interesting_party(*a, **k):
    while True:
        while True:
            phrase = get_name()
            if len(phrase) < 100:
                break

        steps = party(phrase, *a, **k)
        result = steps[-1][-1]

        if ratio(phrase.lower(), result.lower()) < 0.7:
            return steps
Project: relocaliser    Author: very-scary-scenario    | project source | file source
def play(self, guess):
        return ratio(normalise(guess), normalise(self.original))
Project: ELBaselines    Author: cltl    | project source | file source
def moreLocalCandidates(m, previous, candidates):
    for pm, pl in previous.items():
        if is_abbrev(m, pm):
            for prevLink in previous[pm]:
                prevLinkDB=utils.makeDbpedia(prevLink)
                candidates.append(tuple([prevLinkDB, {"ss": 1.0, "count": 0.0}]))
        elif isEnoughSubset(m, pm):
            for prevLink in previous[pm]:
                prevLinkDB=utils.makeDbpedia(prevLink)
                candidates.append(tuple([prevLinkDB, {"ss": Levenshtein.ratio(m.lower(), pm.lower()), "count": 0.0}]))
    return candidates
Project: iqra-api    Author: Crescent-Labs    | project source | file source
def bestLevMatch(spoken, lst):
    print " "
    bestMatch = [None, 0.65]
    for ayah in lst:
        score = ratio(spoken, ayah)
        print(ayah)
        print(score)
        if score > bestMatch[1]:
            bestMatch = [ayah, score]
    return bestMatch[0]


# Takes in an ayah object from alfanous
# Returns a cleaned-up ayah object
Project: iqra-api    Author: Crescent-Labs    | project source | file source
def checkForWordInQuran(value):
    wordMatch = dbGet(models.QuranWord, value)
    if wordMatch:
        return wordMatch.text
    else:
        # The original word is not in the Quran so we try alfanous' suggestions
        wordSuggestionList = []
        wordSuggestions = alfanous.do({
            "action": "suggest", "query": value
        })["suggest"]
        for word in wordSuggestions:
            for suggestion in wordSuggestions[word]:
                # look up the suggested word, not the original value
                wordMatch = dbGet(models.QuranWord, suggestion)
                if wordMatch:
                    wordSuggestionList.append(wordMatch.text)
        if len(wordSuggestionList) > 1:
            topRatioValue = 0
            topSuggestion = ""
            while len(wordSuggestionList) > 0:
                suggestion = wordSuggestionList.pop(0)
                suggestionRatio = ratio(value, suggestion)
                if suggestionRatio > topRatioValue:
                    topRatioValue = suggestionRatio
                    topSuggestion = suggestion
            return topSuggestion
        elif len(wordSuggestionList) == 1:
            return wordSuggestionList[0]
        else:
            return None


# Takes in a query and checks if any part of it is in the Quran
# Return the part in the Quran if one is found, otherwise it returns None
Project: lang2program    Author: kelvinguu    | project source | file source
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.
    If the ratio exceeds the threshold, return it; otherwise, return 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.
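The docstring's formula can be checked directly. "kitten" -> "sitting" needs two substitutions (cost 2 each) and one insertion (cost 1) over a combined length of 13:

import Levenshtein

assert abs(Levenshtein.ratio("kitten", "sitting") - (1 - 5.0 / 13.0)) < 1e-9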


################################
# NERValueGenerator
Project: hipfrog    Author: wardweistra    | project source | file source
def getLevenshteinDistance(item, keyword):
    item = item.lower().replace(' ', '').replace('-', '').replace('_', '')
    keyword = keyword.lower().replace(' ', '').replace('-', '').replace('_', '')
    return Levenshtein.ratio(item, keyword)
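Because spaces, hyphens, underscores and case are stripped first, variants of the same keyword compare as identical (hypothetical inputs):

assert getLevenshteinDistance("Hip Frog", "hip-frog") == 1.0   # both normalize to "hipfrog"
assert getLevenshteinDistance("hip_frog", "HIPFROG") == 1.0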
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def generateStemmingDict(inputPath = 'stemmer.txt', outputPath = 'stemmingDict'):
    inputEncoding = 'utf8'
    outputEncoding = 'utf8'

    distance = Levenshtein.ratio

    fi = open(inputPath, 'r', encoding=inputEncoding)
    fo = open(outputPath, 'w', encoding=outputEncoding)

    stemmingDict = {}

    for line in fi:
        if line.strip() == '':
            continue
        tmpList = line.strip().split(' => ')
        for word in tmpList[0].split(', '):
            if word not in stemmingDict:
                stemmingDict[word] = set()
            stemmingDict[word].add(tmpList[1])


    for key in stemmingDict:
        stemmingDict[key] = list(stemmingDict[key])
        for i in range(len(stemmingDict[key])):
            stemmingDict[key][i] = [stemmingDict[key][i],distance(stemmingDict[key][i],key)]


    json.dump(stemmingDict,fo)

    fi.close()
    fo.close()


    fotxt = open(outputPath+'.txt', 'w', encoding=outputEncoding)

    for key in stemmingDict:
        fotxt.write(key + ' ' + str(stemmingDict[key]) + '\n')

    fotxt.close()
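The expected input format can be read off the parsing code; a hypothetical stemmer.txt line and how it is split:

line = "running, runs => run"              # hypothetical stemmer.txt line
surface, stem = line.strip().split(' => ')
print(surface.split(', '), '=>', stem)     # ['running', 'runs'] => run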
Project: Quora-Kaggle    Author: PPshrimpGo    | project source | file source
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)
    print('nones')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new frame; keep it
    return df_features
Project: Quora-Kaggle    Author: PPshrimpGo    | project source | file source
def get_features(df_features):
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_w2v'] = df_features.apply(lambda x: w2v_cos_sim(x['question1'], x['question2']), axis=1)
    return df_features
Project: whois    Author: wavenator    | project source | file source
def get_registrar(
        cls,
        subject,
    ):
        cls.check_and_update_registrars()

        edited_subject = re.sub(
            pattern=r'[^\d\w]',  # raw string avoids invalid escape sequences
            repl='',
            string=subject,
        )
        edited_subject = edited_subject.lower()

        for registrar in cls.registrars:
            if edited_subject in registrar['edited'].lower():
                return registrar['original']

        most_close_registrar = ''
        most_close_registrar_distance_ratio = 0
        for registrar in cls.registrars:
            registrar_distance_ratio = Levenshtein.ratio(
                edited_subject,
                registrar['edited'],
            )
            if registrar_distance_ratio > most_close_registrar_distance_ratio:
                most_close_registrar = registrar['original']
                most_close_registrar_distance_ratio = registrar_distance_ratio

        return most_close_registrar
Project: opminreplicability    Author: epochx    | project source | file source
def adjective_fuzzy_matching(token, adjectives, match):
    """
    Given a token and a list of terms to match, returns True if
    the stem of the token matches any of the items in the list.
    Input:
        token: Token object to match
        adjectives: list of items to match the Token
        match: minimum ratio (0-100) for matching
    """
    for adjective in adjectives:
        # ratio() is in [0, 1]; scale to 0-100 to match the threshold's range
        if Levenshtein.ratio(str(token.stem), str(adjective)) * 100 >= match:
            return True
    return False
Project: opminreplicability    Author: epochx    | project source | file source
def _transactions_fuzzy_matching(transactions, match):
    """
    Runs fuzzy matching on the transactions, by applying a complete linkage
    hierarchical clustering algorithm to the set of different itemsets in the
    transactions. For clustering, the similarity ratio (Levenshtein.ratio,
    scaled to 0-100) is used to derive the distance measure
    Input:
        transactions: list of tuples representing items on each transaction
        match: minimum similarity ratio (0 to 100) for clustering
    Output:
        transactions: new version of the transactions, where each item has been
                      replaced by the first item on its corresponding cluster
        word_clusters: dictionary that maps the cluster for each item
        in the transactions
    """
    words = set([])
    for transaction in transactions:
        words |= set(transaction)
    words = sorted(words)
    # ratio() is in [0, 1]; scale to 0-100 so distances match the cutoff below
    l = [((a, b), 100 - Levenshtein.ratio(str(a), str(b)) * 100)
         for a, b in combinations(words, 2)]
    d = [value for pair, value in l]
    r = linkage(d, 'complete')
    clusters_index = fcluster(r, 100-match, "distance")
    clusters = {}
    for obs_i, cluster_i in enumerate(clusters_index):
        if cluster_i in clusters:
            clusters[cluster_i].append(words[obs_i])
        else:
            clusters[cluster_i] = [words[obs_i]]

    word_clusters = {word: clusters[clusters_index[i]]
                     for i, word in enumerate(words)}
    new_transactions = []
    for transaction in transactions:
        new_transaction = tuple(set(([word_clusters[word][0]
                                      for word in transaction])))
        new_transactions.append(new_transaction)
    return new_transactions, word_clusters
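A hypothetical call (item names made up), assuming scipy's linkage/fcluster and Levenshtein are imported as the function requires:

transactions = [("battery", "screen"), ("batery", "screeen"), ("price",)]
new_transactions, word_clusters = _transactions_fuzzy_matching(transactions, match=80)
# "battery"/"batery" and "screen"/"screeen" each land in one cluster, so the
# first two transactions now share canonical item names; "price" stays alone.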
Project: opminreplicability    Author: epochx    | project source | file source
def __init__(self, match=90, key=lambda x: x.string.lower()):
        """
        Fuzzy matching between the given token and term objects. For comparison
        applies the function given in the "key" parameter to the Token/tuple
        of Tokens. Parameter match defines the minimum similarity ratio for
        a match when comparing.

        Input:
            match : minimum similarity for fuzzy matching (%)
            key   : function to apply to the token,
                    default=lambda x: x.string.lower()
        """
        self.match = match
        self.key = key
Project: opminreplicability    Author: epochx    | project source | file source
def __call__(self, token_tuple, terms):
        """
        Input:
            token_tuple : Token or tuple of Token objects
            terms       : term or iterable of terms to match

        Output:
            Returns None if no match is found.
            Returns the first matched in case many of them show the same
            similarity ratio.
        """
        if not hasattr(terms, '__iter__'):
            terms = [terms]
        if not isinstance(token_tuple, tuple):
            token_tuple = (token_tuple,)
        try:
            token_tuple = tuple(self.key(token) for token in token_tuple)
        except Exception:  # as e
            token_tuple = tuple(str(token) for token in token_tuple)

        best_term = None
        best_ratio = 0

        for term in terms:
            ratio = max([Levenshtein.ratio(unicode(" ".join(token_tuple)),
                                           unicode(" ".join(term_i)))*100
                         for term_i in term])
            if ratio >= self.match and ratio > best_ratio:
                best_term = term
                best_ratio = ratio

        return best_term


# ------- UTIL FUNCTIONS ------------------------------------------------------
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
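As the linked Stack Overflow thread notes, this normalized distance is not simply 1 - Levenshtein.ratio(): distance() charges 1 per substitution and this helper divides by the longer length, while ratio() charges 2 per substitution and divides by the combined length. A quick check:

import Levenshtein

print(_edit_dist("abcd", "bc"))              # 2 edits / max length 4 = 0.5
print(1 - Levenshtein.ratio("abcd", "bc"))   # 1 - 4/6 ~= 0.33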
Project: kaggle-quora-solution-8th    Author: qqgeogor    | project source | file source
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
Project: phat    Author: danielfranca    | project source | file source
def similar_link_visited(link_url, links, fuzzy):
        for link in links:
            if ratio(link_url, link) >= fuzzy:
                # Link already accessed, return
                return True
        return False
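A hypothetical check (URLs made up): a trailing slash keeps the ratio above a 0.95 cutoff, so the link counts as already visited:

links = ["https://example.com/a", "https://example.com/b"]
print(similar_link_visited("https://example.com/a/", links, fuzzy=0.95))  # True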
Project: phat    Author: danielfranca    | project source | file source
def check(self):
        headers = self.item_options.get('headers', {})
        cookies = self.item_options.get('cookies', {})
        username = self.global_options.get('username')
        password = self.global_options.get('password')

        r2 = requests.get(self.url2, headers=headers,
                          auth=HTTPBasicAuth(username, password), allow_redirects=True, cookies=cookies)

        logger.info("Comparing urls...")
        if self.fuzzy == 1.0:
            self.ok(self.response.text == r2.text,
                    'Urls don\'t have equal content: {tested} and {reference}'.format(tested=self.url,
                                                                                      reference=self.url2))
        else:
            actual_ratio = ratio(self.response.text, r2.text)
            self.ok(actual_ratio > self.fuzzy,
                    """
                    Urls don\'t have sufficiently similar content: {tested} and {reference} (expected {expected}, got {actual})
                    """
                    .format(
                        tested=self.url,
                        reference=self.url2,
                        expected=self.fuzzy,
                        actual=actual_ratio))

        return self.is_ok()
Project: dac    Author: jlonij    | project source | file source
def set_levenshtein(self):
        '''
        Mean and max Levenshtein ratio for all labels.
        '''
        if not [f for f in self.features if f.startswith('match_str_lsr')]:
            return

        ne = self.cluster.entities[0].norm

        # Pref label
        l = self.document.get('pref_label')
        self.match_str_lsr_pref = Levenshtein.ratio(ne, l)

        # Wikidata alt labels
        if self.document.get('wd_alt_label'):
            wd_labels = self.document.get('wd_alt_label')
            ratios = [Levenshtein.ratio(ne, l) for l in wd_labels]
            self.match_str_lsr_wd_max = max(ratios) - 0.5
            self.match_str_lsr_wd_mean = (sum(ratios) /
                float(len(wd_labels))) - 0.375
        else:
            wd_labels = []

        # Any other alt labels
        if self.document.get('alt_label'):
            labels = self.document.get('alt_label')
            labels = [l for l in labels if l not in wd_labels]
            if labels:
                ratios = [Levenshtein.ratio(ne, l) for l in labels]
                self.match_str_lsr_alt_max = max(ratios) - 0.5
                self.match_str_lsr_alt_mean = (sum(ratios) /
                        float(len(labels))) - 0.375
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def appendWordNetStemmingDict(inputPath='stemmingDict.old', outputPath='stemmingDict',outputEncoding='utf8'):

    oldDict = json.load(open(inputPath,'r',encoding='utf8'))
    distance = Levenshtein.ratio
    fi = open('wordnet.map','r',encoding='utf8')
    fo = open(outputPath,'w',encoding='utf8')

    for m in list(oldDict):
        tmp = set()
        for l in list(oldDict[m]):
            tmp.add(l[0])
        oldDict[m] = set(tmp)

    for line in fi:
        m = line.strip().split(' ')
        if len(m) == 0:
            continue
        if m[0] not in oldDict:
            oldDict[m[0]]=set()
        oldDict[m[0]].add(m[1])


    for m in list(oldDict):
        oldDict[m] = list(oldDict[m])
        for i in range(len(oldDict[m])):
            if type(oldDict[m][i]) != str or type(m) != str:
                print(oldDict[m])
                input()
                continue
            oldDict[m][i] = [oldDict[m][i],distance(oldDict[m][i],m)]

    json.dump(oldDict,fo)




    fotxt = open(outputPath+'.txt', 'w', encoding=outputEncoding)

    for key in oldDict:
        fotxt.write(key + ' ' + str(oldDict[key]) + '\n')

    fotxt.close()               

##
##print('Dumping stemming mpping to json format......')
##generateStemmingDict()
##appendWordNetStemmingDict()
##print('Done!')
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def calScoreSub(self, countCharDict):

        distance = Levenshtein.ratio
        q = self.qRaw
        scoreSub = 0

        sub = ''

        if type(self.sub) == str:

            sub = self.sub
            subSplit = sub.split(' ')
            if sub in q:   
                for w in subSplit:
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)
                    else:
                        scoreSub += 1
            else:
                subSet = set(subSplit)
                qSet = set(q.split(' '))
                for w in (subSet & qSet):
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)
                    else:
                        scoreSub += 1
                if len(subSet) != 0:
                    scoreSub = scoreSub/len(subSet)


        if type(self.sub) == list:
            for s in self.sub[0]:
                sub += s + ' '
            sub = sub.strip()


        if type(self.sub) == list:
            if len(self.sub[0]) == len(self.sub[1]):
                lenSub = len(self.sub[0])
                for i in range(lenSub):
                    w = self.sub[0][i]
                    wC = self.sub[1][i]
                    if w in countCharDict:
                        scoreSub += 1/(countCharDict[w] + 1)*distance(w,wC)
                    else:
                        scoreSub += 1*distance(w,wC)
                scoreSub = scoreSub / lenSub

            else:
                subIntersaction = set(self.sub[0]) & set(self.sub[1])
                scoreSub = len(subIntersaction) / len(set(self.sub[0]) | set(self.sub[1]))



        self.scoreSub = scoreSub

        return scoreSub
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def calScorePreLast(self, countCharDict,qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        lastPreIndex = pre.rfind('.')
        if lastPreIndex != -1:
            preLowerSet = set(re.split(r' ',pre[lastPreIndex+1:]))
        else:
            preLowerSet = set(re.split(r' ',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreLast = scorePre


        return scorePre
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def calScorePreAll(self, countCharDict, qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        preLowerSet = set(re.split(r' |\.',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreAll = scorePre


        return scorePre
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def calScorePreLast(self, countCharDict,qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        lastPreIndex = pre.rfind('.')
        if lastPreIndex != -1:
            preLowerSet = set(re.split(r' ',pre[lastPreIndex+1:]))
        else:
            preLowerSet = set(re.split(r' ',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreLast = scorePre


        return scorePre
Project: nlpcc2016    Author: huangxiangzhou    | project source | file source
def calScorePreAll(self, countCharDict, qWithoutSubSet,stemmingDict):

        distance = Levenshtein.ratio
        pre = self.pre
        scorePre = 0

        preLowerSet = set(re.split(r' |\.',pre))

        preLower = list(preLowerSet)
        preLowerSet = set()

        for i in range(len(preLower)):
            if preLower[i] in stemmingDict:
                preLower[i] = stemmingDict[preLower[i]][0][0]
            preLowerSet.add(preLower[i])


        maxIntersection = qWithoutSubSet & preLowerSet



        preFactor = 0
        for char in maxIntersection:
            if char in countCharDict:
                preFactor += 1/(countCharDict[char] + 1)
            else:
                preFactor += 1


        if len(maxIntersection) == 0:
            for w1 in qWithoutSubSet:
                for w2 in preLowerSet:
                    if w1 == '' or w2 == '' or w1[0] != w2[0]:
                        continue
                    div = 1
                    if w1 in countCharDict:
                        div = countCharDict[w1] + 1
                    dWord = distance(w1,w2) / div
                    if preFactor < dWord:
                        preFactor = dWord



        if len(pre) != 0:
            scorePre = preFactor / len(qWithoutSubSet | preLowerSet)
        else:
            scorePre = 0



        self.scorePreAll = scorePre


        return scorePre
Project: Quora-Kaggle    Author: PPshrimpGo    | project source | file source
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
   #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)

    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))

    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))

    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)    

    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)

    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'],3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)

    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x:skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x:kurtosis(x))


    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x:skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x:skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x:kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new frame; keep it
    return df_features
Project: opminreplicability    Author: epochx    | project source | file source
def __init__(self,
                 stopwords=NLTKStopwords(),
                 min_support=MIN_SUPPORT,
                 max_words=MAX_WORDS,
                 min_psupport=MIN_PSUPPORT,
                 min_compact_support=MIN_COMPACT_SUPPORT,
                 max_compact_distance=MAX_COMPACT_DISTANCE,
                 adj_key=StemKey(),
                 adj_win_size=ADJ_NEARBY_DISTANCE,
                 match=85,
                 compactness=True,
                 redundancy=True,
                 infrequent=True):
        """
        Model to extract aspects using the algorithm by Hu et al. (2004)

            stopwords             : iterable of strings to use as stopwords
            min_support           : int, minimum support of an item set
                                    (positive: percentage, negative: absolute
                                    number of transactions)
            min_compact_support   : int minimum number of compact sentences
                                    of an aspect
            max_words             : int, maximum number of word on each aspect,
            max_compact_distance  : int, maximum distance between consecutive
                                    words in an aspect
            adj_win_size          : int, maximum distance to look for
                                    adjectives near an aspect on a sentence
            min_psupport          : int, minimum pure support of an aspect
            adj_key               : lambda function to extract adjectives
            match                 : int, minimum similarity ratio (0-100] for
                                    matching (use <100 for fuzzy), default=85
            compactness           : boolean, True to run "compactness pruning"
            redundancy            : boolean, True to run "redundancy pruning"
            infrequent            : boolean, True to also extract infrequent
                                    aspects
        """
        self.params = {"stopwords": stopwords,
                       "min_support": min_support,
                       "max_words": max_words,
                       "min_psupport": min_psupport,
                       "min_compact_support": min_compact_support,
                       "max_compact_distance": max_compact_distance,
                       "adj_key": adj_key,
                       "adj_win_size": adj_win_size,
                       "match": match,
                       "compactness": compactness,
                       "redundancy": redundancy,
                       "infrequent": infrequent}
Project: B2FIND-Training    Author: EUDAT-Training    | project source | file source
def map_discipl(self,invalue,disctab):
        """
        Convert disciplines along B2FIND disciplinary list

        Copyright (C) 2014 Heinrich Widmann
        Licensed under AGPLv3.
        """

        retval=list()
        if type(invalue) is not list :
            inlist=re.split(r'[;&\s]\s*',invalue)
            inlist.append(invalue)
        else:
            seplist=[re.split(r"[;&]",i) for i in invalue]
            swlist=[re.findall(r"[\w']+",i) for i in invalue]
            inlist=swlist+seplist
            inlist=[item for sublist in inlist for item in sublist]
        for indisc in inlist :
           ##indisc=indisc.encode('ascii','ignore').capitalize()
           indisc=indisc.encode('utf8').replace('\n',' ').replace('\r',' ').strip().title()
           maxr=0.0
           maxdisc=''
           for line in disctab :
             try:
               disc=line[2].strip()
               r=lvs.ratio(indisc,disc)
             except Exception as e:
                 logging.error('[ERROR] %s in map_discipl : %s can not compared to %s !' % (e,indisc,disc))
                 continue
             if r > maxr  :
                 maxdisc=disc
                 maxr=r
                 ##HEW-T                   print('--- %s \n|%s|%s| %f | %f' % (line,indisc,disc,r,maxr)
           if maxr == 1 and indisc == maxdisc :
               logging.debug('  | Perfect match of %s : nothing to do' % indisc)
               retval.append(indisc.strip())
           elif maxr > 0.90 :
               logging.debug('   | Similarity ratio %f is > 0.90 : replace value >>%s<< with best match --> %s' % (maxr,indisc,maxdisc))
               ##return maxdisc
               retval.append(maxdisc.strip())  # append the best match, as the log message states
           else:
               logging.debug('   | Similarity ratio %f is < 0.90 compare value >>%s<< and discipline >>%s<<' % (maxr,indisc,maxdisc))
               continue

        if len(retval) > 0:
            retval=list(OrderedDict.fromkeys(retval)) ## this eliminates real duplicates
            return ';'.join(retval)
        else:
            return 'Not stated'