Python enchant 模块,Dict() 实例源码

我们从Python开源项目中,提取了以下37个代码示例,用于说明如何使用enchant.Dict()

项目:Python-Plex-Controller    作者:MoeFwacky    | 项目源码 | 文件源码
def spellchecker(title):
    """Return *title* with misspelled words replaced by enchant's first suggestion.

    The title is split on single spaces and every token is checked against
    the ``en_US`` enchant dictionary.  Tokens the dictionary accepts are kept
    as they are.  Any other token is replaced by the first suggestion, or
    kept unchanged when enchant has no suggestion to offer.

    Every token of the result is followed by one space, so the returned
    string ends with a trailing space.
    """
    try:
        d = enchant.Dict("en_US")
    except ImportError:
        print("Enchant Library Not Found. Spell Checking Failed.")
        return title
    pieces = []
    for word in title.split(" "):
        # Empty tokens (from repeated, leading or trailing spaces) are passed
        # through untouched instead of being handed to the dictionary.
        if word and not d.check(word):
            suggestions = d.suggest(word)
            # Keep the original token when there is nothing to replace it with.
            if suggestions:
                word = suggestions[0]
        pieces.append(word + " ")
    return "".join(pieces)
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def misses_to_frame(parsed_lexemes: Iterable,
                    terms: Dict[str, str]=None) -> pd.DataFrame:
    """Tabulate the misses collected from *parsed_lexemes*.

    One row is produced per key returned by ``collect_misses``:

    * ``miss``    -- the lower-cased key (also used as the frame index),
    * ``term``    -- the entry of *terms* for that key, or the key itself,
    * ``lexemes`` -- the values stored under the key, joined by spaces.

    A falsy *terms* argument is treated as an empty mapping.
    """
    lookup = terms if terms else {}
    collected = collect_misses(parsed_lexemes)
    rows = []
    for raw_key, lexemes in collected.items():
        key = raw_key.lower()
        rows.append(OrderedDict([
            ('miss', key),
            ('term', lookup.get(key, key)),
            ('lexemes', ' '.join(lexemes)),
        ]))
    return pd.DataFrame.from_records(
        rows, index='miss', columns=['miss', 'term', 'lexemes'])
项目:SWCheckIn    作者:gsugar87    | 项目源码 | 文件源码
def getConfNum(msgText):
    """Extract the six-character confirmation number from a message text.

    When the text contains ``confNum=``, the six characters following its
    first occurrence are returned.  Otherwise everything after the first 200
    characters is scanned for stand-alone runs of six upper-case letters or
    digits, and the first run that the ``en_US`` enchant dictionary does not
    accept as a word is returned.

    Returns:
        The confirmation number as ``str``, or ``None`` when no candidate
        qualifies.
    """
    marker = 'confNum='
    if marker in msgText:
        start = msgText.find(marker) + len(marker)
        return str(msgText[start:start + 6])

    pattern = re.compile(r'(?<![A-Za-z0-9])[A-Z0-9]{6}(?![A-Za-z0-9])')
    candidates = pattern.findall(msgText[200:])
    if not candidates:
        # Nothing to check, so the dictionary is never loaded.
        return None

    d = enchant.Dict("en_US")
    for candidate in candidates:
        # A real word (e.g. "FLIGHT") is not a confirmation number.
        if not d.check(candidate):
            return str(candidate)
    return None
项目:Spider    作者:poluo    | 项目源码 | 文件源码
def process_vcode(self, response):
        """Solve the captcha shown on *response* and post the answer back.

        The captcha image URL is read from the page and handed to
        ``recognize_url``.  The form is submitted only when the recognised
        text is a word in the ``en_US`` enchant dictionary; otherwise
        ``'wrong vcode'`` is printed.
        """
        vcode_url = response.css('#content > div > div.article > form > img::attr(src)').extract_first()
        vcode = recognize_url(vcode_url)

        import enchant
        import requests
        d = enchant.Dict("en_US")
        # Validate the recognised text itself; an empty or missing result is
        # rejected without consulting the dictionary.
        valid = bool(vcode) and d.check(vcode)
        if valid:
            id_index = response.url.find('id=')
            try:
                original_url = response.css(
                    '#content > div > div.article > form > input[type="hidden"]:nth-child(8)::attr(value)').extract_first()
            except Exception:
                original_url = 'https://movie.douban.com/search/%E6%B0%B8%E4%BD%9C%E5%8D%9A%E7%BE%8E'
            vcode_id = response.url[id_index + 3:]
            # An explicit '{}' placeholder is required: formatting an empty
            # template drops the value and submits empty fields.
            frmdata = {"captcha-solution": "{}".format(vcode), "captcha-id": "{}".format(vcode_id),
                       "original-url": "{}".format(original_url)}
            requests.post(url=response.url, data=frmdata, headers=response.headers)
        else:
            print('wrong vcode')
项目:negation-detection    作者:gkotsis    | 项目源码 | 文件源码
def breakWithOutWhiteSpace(sentence):
    """Split *sentence* at periods that are directly followed by a word.

    Every ``.word`` occurrence is a candidate break.  It is accepted when the
    word is the single letter ``i`` or ``a`` (any case), or when a longer
    word is accepted by the enchant dictionary.  The sentence is then cut
    after each accepted period.

    Returns:
        A list of sentence pieces; ``[sentence]`` when no break is accepted.
    """
    import re
    period_word = r"\.\w+"
    sentences = []
    tmp = re.findall(period_word, sentence, re.X)
    places = [0]
    if len(tmp)>0:
        import enchant
        # NOTE(review): "en_UK" is not a standard locale tag (British English
        # is usually "en_GB") -- confirm this resolves on the target system.
        d = enchant.Dict("en_UK")
        for item in tmp:
            word = item[1:]
            if len(word) < 2:
                is_break = word.lower() in ['i', 'a']
            else:
                is_break = d.check(word)
            if is_break:
                # Escape the item so its leading '.' matches a literal period
                # instead of acting as a regex wildcard.
                places.extend(
                    m.start() for m in re.finditer(re.escape(item), sentence))

    places = sorted(set(places))
    places.append(len(sentence)-1)
    if len(places)==2:
        return [sentence]

    for i in range(len(places) - 1):
        start = places[i]
        if start > 0:
            # Skip the period itself; it belongs to the previous piece.
            start += 1
        end = places[i+1] + 1
        sentences.append(sentence[start:end])

    return sentences
项目:bubblesub    作者:rr-    | 项目源码 | 文件源码
async def run(self):
        """Open the spell-check dialog for the configured language.

        Shows an error and returns early when spell checking is disabled in
        the configuration or when enchant cannot find a dictionary for the
        configured language.

        Declared ``async`` because the body awaits ``self.api.gui.exec``.
        """
        spell_check_lang = self.api.opt.general['spell_check']
        if not spell_check_lang:
            bubblesub.ui.util.error('Spell check was disabled in config.')
            return

        try:
            dictionary = enchant.Dict(spell_check_lang)
        except enchant.errors.DictNotFoundError:
            bubblesub.ui.util.error(
                f'Spell check language {spell_check_lang} was not found.')
            return

        async def run(api, main_window):
            # Callback handed to the GUI; it builds the dialog with the
            # dictionary resolved above.
            SpellCheckDialog(api, main_window, dictionary)

        await self.api.gui.exec(run)
项目:bubblesub    作者:rr-    | 项目源码 | 文件源码
def __init__(self, api, *args):
        """Set up the spell-check dictionary and the underline format.

        ``self._dictionary`` becomes an ``enchant.Dict`` for the language in
        ``api.opt.general['spell_check']``.  It is ``None`` when that option
        is falsy or when enchant reports the dictionary as not found; a
        warning is logged in the latter case.
        """
        super().__init__(*args)

        spell_check_lang = api.opt.general['spell_check']
        self._dictionary = None
        if spell_check_lang:
            try:
                self._dictionary = enchant.Dict(spell_check_lang)
            except enchant.errors.DictNotFoundError:
                api.log.warn(f'dictionary {spell_check_lang} not installed.')

        # Character format with a red spell-check underline.
        underline = QtGui.QTextCharFormat()
        underline.setUnderlineColor(QtCore.Qt.red)
        underline.setUnderlineStyle(QtGui.QTextCharFormat.SpellCheckUnderline)
        underline.setFontUnderline(True)
        self._fmt = underline
项目:gibbersense    作者:smxlabs    | 项目源码 | 文件源码
def english_test(string):
    """Count the whitespace-separated words of *string* found in the en_US dictionary."""
    checker = enchant.Dict("en_US")
    return sum(1 for token in string.split() if checker.check(token))
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def collect_misses(parsed_lexemes: Iterable) -> Dict:
    """Group the lexemes of all ``'miss'`` segments by lower-cased segment text.

    Walks lexeme -> sublexeme -> segment -> ``segment[1]`` and, for every
    entry whose ``seg_type`` is ``'miss'``, adds its ``lexeme`` to the sorted
    set stored under ``segment.lower()``.

    Returns:
        A ``SortedDict`` mapping each lower-cased miss to a ``SortedSet`` of
        lexemes.
    """
    found = SortedDict()
    entries = (
        entry
        for lexeme in parsed_lexemes
        for sublexeme in lexeme
        for segment in sublexeme
        for entry in segment[1]
    )
    for entry in entries:
        if entry.seg_type != 'miss':
            continue
        bucket = found.setdefault(entry.segment.lower(), default=SortedSet())
        bucket.add(entry.lexeme)
    return found
项目:saapy    作者:ashapochka    | 项目源码 | 文件源码
def split_file_path(s: str) -> Dict[str, Optional[Any]]:
    """Break a path into its directories, file name and extension.

    The path is split with ``split_slash``.  The last part is divided at its
    final ``'.'`` into ``name`` and ``ext``; without a dot, ``name`` is the
    whole last part and ``ext`` is ``None``.

    Returns:
        ``{'dirs': [...], 'name': ..., 'ext': ...}``
    """
    parts = split_slash(s)
    filename = parts[-1]
    stem, dot, suffix = filename.rpartition('.')
    if not dot:
        # No dot at all: the whole file name is the stem.
        stem, suffix = filename, None
    return {'dirs': parts[:-1], 'name': stem, 'ext': suffix}
项目:typot    作者:chakki-works    | 项目源码 | 文件源码
def __init__(self, lang="en_US"):
        """Create the checker backed by the enchant dictionary for *lang*."""
        dictionary = enchant.Dict(lang)
        self.checker = dictionary
项目:smt-for-gec    作者:cnap    | 项目源码 | 文件源码
def __init__(self):
        """Create the tag map, lemmatizer, ``'en'`` dictionary and inflect engine."""
        # Upper-case single-letter keys mapped to lower-case one-letter codes.
        self.WN_TAGS = dict(J='a', N='n', R='r', V='v')
        self.wnl = WordNetLemmatizer()
        self.dictionary = enchant.Dict('en')
        self.inflengine = inflect.engine()
项目:smt-for-gec    作者:cnap    | 项目源码 | 文件源码
def __init__(self):
        """Create the tag map, lemmatizer, ``'en'`` dictionary and an empty lookup table."""
        # Upper-case single-letter keys mapped to lower-case one-letter codes.
        self.WN_TAGS = dict(J='a', N='n', R='r', V='v')
        self.wnl = WordNetLemmatizer()
        self.dictionary = enchant.Dict('en')
        self.lookup_table = dict()
项目:bpy_lambda    作者:bcongdon    | 项目源码 | 文件源码
def __init__(self, settings, lang="en_US"):
        """Set up the dictionary for *lang* and the cache of accepted words.

        The cache starts as the set of ``self.uimsgs``.  When
        ``settings.SPELL_CACHE`` names an existing file, the pickled
        collection stored there is merged into the cache.
        """
        self.settings = settings
        self.dict_spelling = enchant.Dict(lang)
        self.cache = set(self.uimsgs)

        cache_path = self.settings.SPELL_CACHE
        if not (cache_path and os.path.exists(cache_path)):
            return
        # NOTE: pickle executes code on load -- the cache file must be trusted.
        with open(cache_path, 'rb') as stream:
            self.cache |= set(pickle.load(stream))
项目:Taigabot    作者:FrozenPigs    | 项目源码 | 文件源码
def test_bug2785373(self):
        """Testcases for bug #2785373."""
        sentence = "So, one dey when I wes 17, I left."
        # Same sentence twice: once as written, once passed through raw_unicode().
        for prepare in (lambda s: s, lambda s: raw_unicode(s)):
            checker = SpellChecker(enchant.Dict("en"),"")
            checker.set_text(prepare(sentence))
            # Exhaust the checker; nothing is asserted beyond iteration finishing.
            for _err in checker:
                pass
项目:Taigabot    作者:FrozenPigs    | 项目源码 | 文件源码
def spell(inp):
    """spell <word/sentence> -- Check spelling of a word or sentence."""

    # Uses the module-level ``locale`` tag for every dictionary lookup.
    if not enchant.dict_exists(locale):
        return "Could not find dictionary: {}".format(locale)

    if len(inp.split(" ")) <= 1:
        # Single word: report validity together with up to ten suggestions.
        word_dict = enchant.Dict(locale)
        known = word_dict.check(inp)
        proposals = ', '.join(word_dict.suggest(inp)[:10])
        if known:
            template = '"{}" appears to be \x02valid\x02! (suggestions: {})'
        else:
            template = '"{}" appears to be \x02invalid\x02! (suggestions: {})'
        return template.format(inp, proposals)

    # Sentence: replace each misspelled word in place with its top three
    # suggestions, wrapped in \x02 markers.
    checker = SpellChecker(locale)
    checker.set_text(inp)

    shift = 0  # how far earlier replacements have moved later word positions
    for err in checker:
        begin = err.wordpos + shift
        replacement = "\x02{}\x02".format('/'.join(err.suggest()[:3]))
        shift += len(replacement) - len(err.word)
        inp = inp[:begin] + replacement + inp[begin + len(err.word):]
    return inp
项目:resume-optimizer    作者:mhbuehler    | 项目源码 | 文件源码
def extract_acronyms(textblob):
    """Return ``(word, count)`` pairs for likely acronyms, most frequent first.

    A word qualifies when it is longer than one character, its first two
    characters are upper-case, and the ``en_US`` enchant dictionary does not
    accept it as a regular English word.  Each qualifying word is listed once
    with the count reported by ``textblob.words.count(word)``.

    The result is in descending order of count; words with equal counts keep
    their order of first appearance.
    """
    d = enchant.Dict("en_US")
    words = textblob.words
    counts = []
    seen = set()  # words already recorded in ``counts``
    for word in words:
        if len(word) < 2 or word in seen:
            continue
        if word[0].isupper() and word[1].isupper() and not d.check(word):
            seen.add(word)
            counts.append((word, words.count(word)))

    # A stable sort keeps first-appearance order among equal counts.
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return counts
项目:Natural-Language-Processing-Python-and-NLTK    作者:PacktPublishing    | 项目源码 | 文件源码
def __init__(self, dict_name='en', max_dist=2):
        """Store the enchant dictionary named *dict_name* and the *max_dist* limit."""
        self.max_dist = max_dist
        self.spell_dict = enchant.Dict(dict_name)
项目:fenrir    作者:chrys87    | 项目源码 | 文件源码
def updateSpellLanguage(self):
        """Rebuild the spell checker for the configured language.

        When the module-level ``initialized`` flag is false, a message is
        presented through the output manager and nothing else happens.
        Otherwise the ``general/spellCheckLanguage`` setting is read once and
        used both to create the enchant dictionary and to set
        ``self.language``.
        """
        if not initialized:
           self.env['runtime']['outputManager'].presentText('pyenchant is not installed', interrupt=True)
           return
        language = self.env['runtime']['settingsManager'].getSetting('general', 'spellCheckLanguage')
        self.spellChecker = enchant.Dict(language)
        self.language = language
项目:fenrir    作者:chrys87    | 项目源码 | 文件源码
def updateSpellLanguage(self):
        """Rebuild the spell checker from the ``general/spellCheckLanguage`` setting
        and remember that language in ``self.language``."""
        settings = self.env['runtime']['settingsManager']
        self.spellChecker = enchant.Dict(
            settings.getSetting('general', 'spellCheckLanguage'))
        self.language = settings.getSetting('general', 'spellCheckLanguage')
项目:fenrir    作者:chrys87    | 项目源码 | 文件源码
def updateSpellLanguage(self):
        """Create ``self.spellChecker`` for the configured spell-check language
        and store that language in ``self.language``."""
        manager = self.env['runtime']['settingsManager']
        section, option = 'general', 'spellCheckLanguage'
        self.spellChecker = enchant.Dict(manager.getSetting(section, option))
        self.language = manager.getSetting(section, option)
项目:fenrir    作者:chrys87    | 项目源码 | 文件源码
def updateSpellLanguage(self):
        """Rebuild the spell checker for the configured language.

        When the module-level ``initialized`` flag is false, a translated
        message is presented through the output manager instead.
        """
        if initialized:
            manager = self.env['runtime']['settingsManager']
            self.spellChecker = enchant.Dict(
                manager.getSetting('general', 'spellCheckLanguage'))
            self.language = manager.getSetting('general', 'spellCheckLanguage')
        else:
            self.env['runtime']['outputManager'].presentText(
                _('pyenchant is not installed'), interrupt=True)
项目:GLaDOS2    作者:TheComet    | 项目源码 | 文件源码
def __init__(self, server_instance, full_name):
        """Initialise the base class, the quotes directory and the dictionaries.

        ``self.quotes_path`` is ``<local_data_dir>/quotes`` and is created
        when it does not exist.  ``self.dictionaries`` holds the ``en_US``
        and ``en_GB`` enchant dictionaries, in that order.
        """
        super(Quotes, self).__init__(server_instance, full_name)

        quotes_dir = os.path.join(self.local_data_dir, 'quotes')
        self.quotes_path = quotes_dir
        if not os.path.exists(quotes_dir):
            os.makedirs(quotes_dir)

        self.dictionaries = [enchant.Dict(tag) for tag in ('en_US', 'en_GB')]
项目:GLaDOS2    作者:TheComet    | 项目源码 | 文件源码
async def spellcheck(self, message, word):
        """
        Says whether the given word is spelled correctly, and gives suggestions if
        it's not.
        """
        # Declared ``async`` because every branch awaits a coroutine.
        if word == '':
            await self.provide_help('spell', message)
            return

        # Only the first space-separated word is checked.
        word = word.split(' ', 1)[0]
        dictionary = enchant.Dict("en_US")
        dictionary_uk = enchant.Dict("en_GB")

        # I don't want to make anyone angry, so I check both American and British English.
        british = dictionary_uk.check(word)
        american = dictionary.check(word)
        if british and american:
            reply = word + " is spelled correctly"
        elif british:
            reply = word + " is spelled correctly (British)"
        elif american:
            reply = word + " is spelled correctly (American)"
        else:
            # Merge both suggestion lists; the set removes duplicates.
            suggestions = sorted(
                set(dictionary.suggest(word)) | set(dictionary_uk.suggest(word)))
            reply = (word + " is not spelled correctly. Maybe you want one of these spellings:"
                     + "".join(" '" + suggested_word + "'," for suggested_word in suggestions))
        await self.client.send_message(message.channel, reply)
项目:csss-minion    作者:henrymzhao    | 项目源码 | 文件源码
def setup(bot):
    """Register the ``Spellcheck`` cog on *bot* with an ``en_CA`` dictionary."""
    # Expected to fail right here when no en_CA dictionary is installed.
    canadian = enchant.Dict("en_CA")
    cog = Spellcheck(bot, canadian)
    bot.add_cog(cog)
项目:Chat-Bot    作者:FredLoh    | 项目源码 | 文件源码
def even_or_odd(self, message=None, match=None, to=None):
        """Play one round of even-or-odd and reply to the sender.

        The parity of the length of the ``evenOrOdd`` match group is compared
        with the parity of a random number from 1 to 10.  The reply shows the
        number and says whether the parities agree (win) or differ (lose).
        """
        guess_parity = len(match.group("evenOrOdd")) % 2
        num = random.randint(1, 10)
        if guess_parity == num % 2:
            template = "[%d]\nYou win."
        else:
            template = "[%d]\nYou lose!"
        return TextMessageProtocolEntity(template % num, to=message.getFrom())

    # def beban_spell_checker(self, message=None, match=None, to=None):
    #     print(message.getBody())
    #     correctionList = ""
    #     text = message.getBody()
    #     d = enchant.DictWithPWL("es_MX","wordList.txt")
    #     d_en = enchant.Dict("en_US")

    #     wordList = text.split()
    #     for word in wordList:
    #       if(word.isalnum() == True):
    #         print(word)
    #         if(d.check(word) == False):
    #             # if(d_en.check(word) == False):
    #           solutions = d.suggest(word)
    #           print(solutions)
    #           sol = str(solutions[0])
    #           if(sol.isalnum() == False):
    #             correctionList += sol + "* "
    #     if (correctionList != ""):
    #         print(correctionList)
    #         return TextMessageProtocolEntity(correctionList, to=message.getFrom())
项目:ibus-typing-booster    作者:mike-fabian    | 项目源码 | 文件源码
def load_dictionary(self):
        '''Load the hunspell word list for self.name and set up a checker.

        Stores the dictionary path, encoding and word list returned by
        itb_util.get_hunspell_dictionary_wordlist().  When the word list is
        not empty it also builds (word, accent-free word) pairs for the
        languages listed below, raises self.max_word_len to the length of
        the longest word if needed, and instantiates an enchant.Dict() or,
        when enchant is unavailable, a hunspell.HunSpell() object.

        '''
        if DEBUG_LEVEL > 0:
            sys.stderr.write("load_dictionary() ...\n")
        (self.dic_path,
         self.encoding,
         self.words) = itb_util.get_hunspell_dictionary_wordlist(self.name)
        if not self.words:
            return
        # List of languages where accent insensitive matching makes sense:
        accent_languages = (
            'af', 'ast', 'az', 'be', 'bg', 'br', 'bs', 'ca', 'cs', 'csb',
            'cv', 'cy', 'da', 'de', 'dsb', 'el', 'en', 'es', 'eu', 'fo',
            'fr', 'fur', 'fy', 'ga', 'gd', 'gl', 'grc', 'gv', 'haw', 'hr',
            'hsb', 'ht', 'hu', 'ia', 'is', 'it', 'kk', 'ku', 'ky', 'lb',
            'ln', 'lv', 'mg', 'mi', 'mk', 'mn', 'mos', 'mt', 'nb', 'nds',
            'nl', 'nn', 'nr', 'nso', 'ny', 'oc', 'pl', 'plt', 'pt', 'qu',
            'quh', 'ru', 'sc', 'se', 'sh', 'shs', 'sk', 'sl', 'smj', 'sq',
            'sr', 'ss', 'st', 'sv', 'tet', 'tk', 'tn', 'ts', 'uk', 'uz',
            've', 'vi', 'wa', 'xh',
        )
        # The language is the part of the name before the first underscore.
        language = self.name.split('_')[0]
        if language in accent_languages:
            self.word_pairs = [
                (word, itb_util.remove_accents(word))
                for word in self.words
            ]
        longest = max(len(word) for word in self.words)
        if longest > self.max_word_len:
            self.max_word_len = longest
        if DEBUG_LEVEL > 1:
            sys.stderr.write(
                'load_dictionary() max_word_len = %s\n'
                % self.max_word_len)
        if IMPORT_ENCHANT_SUCCESSFUL:
            self.enchant_dict = enchant.Dict(self.name)
            return
        if IMPORT_HUNSPELL_SUCCESSFUL and self.dic_path:
            # The affix file sits next to the .dic file.
            aff_path = self.dic_path.replace('.dic', '.aff')
            self.pyhunspell_object = hunspell.HunSpell(self.dic_path, aff_path)
项目:python-translate    作者:caspartse    | 项目源码 | 文件源码
def suggest(self):
        """Return enchant's spelling suggestions for ``self.word``.

        Returns ``None`` without consulting enchant when the word contains
        any character other than ASCII letters, digits, apostrophe, hyphen,
        period or whitespace.

        The ``en_US`` dictionary is combined with the bundled word list when
        that can be loaded; otherwise the plain ``en_US`` dictionary is used.
        """
        if re.sub(r'[a-zA-Z\d\'\-\.\s]', '', self.word):
            return None
        import enchant
        try:
            d = enchant.DictWithPWL(
                'en_US', path + '/data/spell-checker/american-english-large')
        except Exception:
            # Best-effort fallback; KeyboardInterrupt and SystemExit still
            # propagate.
            d = enchant.Dict('en_US')
        return d.suggest(self.word)
项目:PracticeInSandbox    作者:PlayPython    | 项目源码 | 文件源码
def is_word(self, word):
        """Return the result of checking *word* against the ``en_US`` dictionary."""
        return enchant.Dict("en_US").check(word)
项目:Chromium_DepotTools    作者:p07r0457    | 项目源码 | 文件源码
def open(self):
        """Prepare the spelling dictionary, ignore list and punctuation regex.

        Leaves ``self.initialized`` false and returns early when enchant is
        unavailable or no spelling dictionary is configured.  With a private
        dictionary file configured, the dictionary is combined with that file
        and the file is also opened for appending.
        """
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        configured = self.config.spelling_ignore_words.split(",")
        self.ignore_list = [entry.strip() for entry in configured] + ["param", "pylint"]

        private_path = self.config.spelling_private_dict_file
        if private_path:
            self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
            self.private_dict_file = open(private_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = ''.join(sign for sign in string.punctuation if sign not in "'_")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
项目:node-gn    作者:Shouqun    | 项目源码 | 文件源码
def open(self):
        """Prepare the spelling dictionary, ignore list and punctuation regex.

        Leaves ``self.initialized`` false and returns early when enchant is
        unavailable or no spelling dictionary is configured.  With a private
        dictionary file configured, the dictionary is combined with that file
        and the file is also opened for appending.
        """
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        configured = self.config.spelling_ignore_words.split(",")
        self.ignore_list = [entry.strip() for entry in configured] + ["param", "pylint"]

        private_path = self.config.spelling_private_dict_file
        if private_path:
            self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
            self.private_dict_file = open(private_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = ''.join(sign for sign in string.punctuation if sign not in "'_")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
项目:depot_tools    作者:webrtc-uwp    | 项目源码 | 文件源码
def open(self):
        """Prepare the spelling dictionary, ignore list and punctuation regex.

        Leaves ``self.initialized`` false and returns early when enchant is
        unavailable or no spelling dictionary is configured.  With a private
        dictionary file configured, the dictionary is combined with that file
        and the file is also opened for appending.
        """
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        configured = self.config.spelling_ignore_words.split(",")
        self.ignore_list = [entry.strip() for entry in configured] + ["param", "pylint"]

        private_path = self.config.spelling_private_dict_file
        if private_path:
            self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
            self.private_dict_file = open(private_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = ''.join(sign for sign in string.punctuation if sign not in "'_")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
项目:Notes2ppt    作者:gsengupta2810    | 项目源码 | 文件源码
def __init__(self, dict_name='en', max_dist=2):
    """Store the enchant dictionary named *dict_name* and the *max_dist* limit."""
    self.max_dist = max_dist
    self.spell_dict = enchant.Dict(dict_name)
项目:wuye.vim    作者:zhaoyingnan911    | 项目源码 | 文件源码
def open(self):
        """Prepare the spelling dictionary, ignore list and punctuation regex.

        Leaves ``self.initialized`` false and returns early when enchant is
        unavailable or no spelling dictionary is configured.  With a private
        dictionary file configured, the dictionary is combined with that file
        and the file is also opened for appending.
        """
        self.initialized = False
        self.private_dict_file = None

        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        configured = self.config.spelling_ignore_words.split(",")
        self.ignore_list = [entry.strip() for entry in configured] + ["param", "pylint"]

        private_path = self.config.spelling_private_dict_file
        if private_path:
            self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
            self.private_dict_file = open(private_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = ''.join(sign for sign in string.punctuation if sign not in "'_")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
项目:Taigabot    作者:FrozenPigs    | 项目源码 | 文件源码
def __init__(self,lang=None,text=None,tokenize=None,chunkers=None,filters=None):
        """Constructor for the SpellChecker class.

        The first argument selects the dictionary.  A string is taken as a
        language tag and a new enchant Dict is created for it.  Any other
        object is used as the dictionary itself, and its 'tag' attribute
        supplies the language.  When <lang> is None, or the dictionary has
        no 'tag' attribute, the default language is used instead.

        Optional keyword arguments are:

            * text:  text to be checked, set at creation time
            * tokenize:  a custom tokenization function to use
            * chunkers:  a list of chunkers to apply during tokenization
            * filters:  a list of filters to apply during tokenization

        When <tokenize> is not given, a tokenizer is looked up for the
        language, falling back to the default tokenizer if none matches.

        Raises DefaultLanguageNotFoundError when no language can be
        determined.
        """
        if lang is None:
            lang = get_default_language()
        if isinstance(lang,basestring):
            checker_dict = enchant.Dict(lang)
        else:
            # A dictionary object was passed in; take the language from it.
            checker_dict = lang
            try:
                lang = checker_dict.tag
            except AttributeError:
                lang = get_default_language()
        if lang is None:
            raise DefaultLanguageNotFoundError
        self.lang = lang
        self.dict = checker_dict

        if tokenize is None:
            try:
                tokenize = get_tokenizer(lang,chunkers,filters)
            except TokenizerNotFoundError:
                # Fall back to default tokenization if no match for 'lang'
                tokenize = get_tokenizer(None,chunkers,filters)
        self._tokenize = tokenize

        # No word is selected until iteration starts.
        self.word = None
        self.wordpos = None
        self._ignore_words = {}
        self._replace_words = {}
        # Default to the empty string as the text to be checked
        self._text = array.array('u')
        self._use_tostring = False
        self._tokens = iter([])

        if text is not None:
            self.set_text(text)
项目:SWCheckIn    作者:gsugar87    | 项目源码 | 文件源码
def getInfoFromEmail(emailData):
    """Parse a confirmation email into a list of check-in records.

    The message texts come from ``getEmailText(emailData[0][1])``.  For a
    message text, the confirmation number, passenger name(s) and check-in
    time/date/city are extracted and combined into one dict per
    (check-in date, passenger) pair, with the keys ``confNum``,
    ``firstName``, ``lastName``, ``datetime`` and ``city``.

    Returns:
        The list of record dicts, or an empty list when assembling the
        records raised an exception.  ``None`` is returned implicitly when
        there are no message texts.

    NOTE(review): the ``return`` sits inside the ``for`` loop, so only the
    first message text is ever processed -- confirm this is intended.
    """
    msgTextList = getEmailText(emailData[0][1])
    for msgText in msgTextList:
        confNum = getConfNum(msgText)
        # see if there are multiple itineraries
        msgTextSplit = msgText.split()
        if confNum in msgTextSplit:
            confNumIndex = msgTextSplit.index(confNum)
        else:
            # the number may be wrapped in asterisks
            confNumIndex = msgTextSplit.index('*'+confNum+'*')
        firstName = msgTextSplit[confNumIndex+1]
        lastName = msgTextSplit[confNumIndex+2]
        if 'Passenger(s)' in firstName:
            # See if there is a / in the name
            if '/' in lastName:
                firstName = lastName[lastName.index('/')+1:]
                lastName = lastName[0:lastName.index('/')]
            else:
                print("PROBLEM PARSING THE FIRST AND LAST NAMES!")
        elif msgTextSplit[confNumIndex+4] == 'Date':
            lastName = msgTextSplit[confNumIndex+3]
            print("Make sure user used a middle initial")

        # see if there are < formatting issues
        if firstName == '>':
            firstName = msgTextSplit[confNumIndex+2]
            lastName = msgTextSplit[confNumIndex+4]
        if lastName == '>':
            print("AAAH")
            print(msgTextSplit[confNumIndex+3])
            lastName = msgTextSplit[confNumIndex+3]
        # A second six-character token that is not an English word is treated
        # as a second confirmation number with its own passenger name.
        possible2ndConf = msgTextSplit[confNumIndex+3][1:-1]
        if len(possible2ndConf) == 6 and not enchant.Dict("en_US").check(possible2ndConf):
            confNum = [confNum,str(possible2ndConf)]
            firstName = [firstName, str(msgTextSplit[confNumIndex+4])]
            lastName = [lastName, str(msgTextSplit[confNumIndex+5])]
        else:
            confNum = [confNum]
            firstName = [firstName]
            lastName = [lastName]
        # get the time you need to check in
        checkInTime = getCheckInTime(msgText)
        checkInDate = getCheckInDate(msgText)
        checkInCity = getCheckInCity(msgText)

        try:
            infoList = []
            # ``range`` works on Python 2 and 3; ``xrange`` is a NameError on
            # Python 3, which the handler below would silently swallow.
            for j in range(len(checkInDate)):
                for i in range(len(firstName)):
                    info = {'confNum':confNum[i],
                            'firstName':firstName[i],
                            'lastName':lastName[i],
                            'datetime':parser.parse(checkInDate[j] + ' ' + \
                                                    checkInTime[j]),
                            'city':checkInCity[j]}
                    infoList.append(info)
        except Exception:
            # Best effort: any parsing problem yields an empty result, while
            # KeyboardInterrupt and SystemExit still propagate.
            infoList = []
        print('info from email:')
        print(infoList)
        return infoList
项目:event-cui-transfer    作者:mit-ddig    | 项目源码 | 文件源码
def consolidate_carevue(carevue):
    """Consolidate items from CV.

    The item labels are cleaned and spell-corrected word by word, and items
    whose corrected labels coincide are grouped together.

    A vocabulary word is left alone when the custom word list accepts it (as
    is or upper-cased) or the ``en_US`` dictionary accepts it.  Otherwise it
    is replaced by ``best_match`` over the word list's suggestions, or kept
    when there are none.

    Returns:
        A 4-tuple ``(cv_item_text, cv_items_spellcheck,
        cv_items_spellcheck2, dict_consolidate)``:

        * ``cv_item_text`` -- the cleaned labels,
        * ``cv_items_spellcheck`` -- the corrected labels, index cast to str,
        * ``cv_items_spellcheck2`` -- one representative row per group,
          ordered by item id string,
        * ``dict_consolidate`` -- maps each item id (str) to a list holding
          the string-wise smallest item id that shares its corrected label.
    """
    cv_item_text = clean_text(carevue['label'])
    cv_vectorizer = CountVectorizer(analyzer = "word")
    # Only the learned vocabulary is needed; the count matrix is discarded.
    cv_vectorizer.fit_transform(cv_item_text)
    cv_vocab = cv_vectorizer.get_feature_names()

    # Compare each element of the vocabulary with the "dictionary".
    d = enchant.request_pwl_dict(
        main_dir + "/metavision_ids_icds_vocab_new.txt")
    d_english = enchant.Dict("en_US")
    corrected = {}
    for word in cv_vocab:
        word = word.lower()
        known = d.check(word) or d.check(word.upper()) \
            or d_english.check(word)
        suggestions = [] if known else d.suggest(word)
        if suggestions:
            corrected[word] = best_match(word, suggestions, [])
        else:
            corrected[word] = word

    # apply map to correct spellings
    cv_item_corrected = \
        cv_item_text.str.split().apply(translate_words, args=(corrected,))
    cv_items_spellcheck = cv_item_corrected.str.join(' ')
    cv_items_df = pd.DataFrame({'itemid': cv_items_spellcheck.index.values,
                                'label': cv_items_spellcheck.values})
    grouped = cv_items_df[['itemid', 'label']].groupby('label')

    dict_consolidate = {
        itemid: [] for itemid in cv_items_df.itemid.astype(str)}
    for key in grouped.groups:
        item_ids = grouped.get_group(key).itemid.astype(str)
        # take the minimum itemid corresponding to this description.
        min_val = min(item_ids)
        for val in item_ids:
            dict_consolidate[val].append(min_val)

    map_to_unique = {min(ids) for ids in dict_consolidate.values()}
    cv_items_spellcheck.index = cv_items_spellcheck.index.astype(str)
    # filter cv_items_spellcheck so that there are no redundant items;
    # .loc needs a list-like indexer (a set is rejected by current pandas),
    # and sorting makes the row order deterministic.
    cv_items_spellcheck2 = cv_items_spellcheck.loc[sorted(map_to_unique)]
    return cv_item_text, cv_items_spellcheck, \
        cv_items_spellcheck2, dict_consolidate