The following 15 code examples, extracted from open-source Python projects, illustrate how to use enchant.DictWithPWL().
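Before the examples, here is a minimal sketch of the API they all build on, assuming pyenchant is installed; the PWL path "mywords.txt" is a hypothetical plain-text file with one word per line:

import enchant

# Combine the standard en_US dictionary with a personal word list (PWL).
# The PWL is a plain text file, one word per line.
d = enchant.DictWithPWL("en_US", "mywords.txt")

print(d.check("hello"))      # True: found in the en_US dictionary
print(d.check("pyenchant"))  # True only once "pyenchant" is in the PWL
print(d.suggest("helo"))     # suggestions drawn from both word sources

d.add("pyenchant")           # appends the word to the PWL file as well
print(d.check("pyenchant"))  # now True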
def __init__(self, lang, suggest, word_list_filename, filters=[]):
    self.dictionary = enchant.DictWithPWL(lang, word_list_filename)
    self.tokenizer = get_tokenizer(lang, filters)
    self.original_tokenizer = self.tokenizer
    self.suggest = suggest
def test_pwl(self):
    """Test checker loop with PWL."""
    from enchant import DictWithPWL
    d = DictWithPWL("en_US", None, None)
    txt = "I am sme text to be cheked with personal list of cheked words"
    chkr = SpellChecker(d, txt)
    for n, err in enumerate(chkr):
        if n == 0:
            self.assertEqual(err.word, "sme")
        if n == 1:
            self.assertEqual(err.word, "cheked")
            chkr.add()
    self.assertEqual(n, 1)
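The two None arguments in the test above fill the pwl and pel slots: besides a personal word list, DictWithPWL also accepts a personal exclude list whose words are always flagged as misspelled. A small sketch, with illustrative file names:

from enchant import DictWithPWL

# pwl: extra words to accept; pel: words to always reject.
d = DictWithPWL("en_US", "mywords.txt", "excluded.txt")

d.remove("hello")        # records "hello" on the exclude list
print(d.check("hello"))  # False, even though en_US knows the word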
def even_or_odd(self, message=None, match=None, to=None):
    is_odd = len(match.group("evenOrOdd")) % 2
    num = random.randint(1, 10)
    if (is_odd and num % 2) or (not is_odd and not num % 2):
        return TextMessageProtocolEntity("[%d]\nYou win." % num, to=message.getFrom())
    else:
        return TextMessageProtocolEntity("[%d]\nYou lose!" % num, to=message.getFrom())

# def beban_spell_checker(self, message=None, match=None, to=None):
#     print(message.getBody())
#     correctionList = ""
#     text = message.getBody()
#     d = enchant.DictWithPWL("es_MX", "wordList.txt")
#     d_en = enchant.Dict("en_US")
#     wordList = text.split()
#     for word in wordList:
#         if(word.isalnum() == True):
#             print(word)
#             if(d.check(word) == False):
#                 # if(d_en.check(word) == False):
#                 solutions = d.suggest(word)
#                 print(solutions)
#                 sol = str(solutions[0])
#                 if(sol.isalnum() == False):
#                     correctionList += sol + "* "
#     if (correctionList != ""):
#         print(correctionList)
#         return TextMessageProtocolEntity(correctionList, to=message.getFrom())
def suggest(self):
    if re.sub(r'[a-zA-Z\d\'\-\.\s]', '', self.word):
        return None
    import enchant
    try:
        d = enchant.DictWithPWL(
            'en_US', path + '/data/spell-checker/american-english-large')
    except:
        d = enchant.Dict('en_US')
    suggestion = d.suggest(self.word)
    return suggestion
def __init__(self):
    self.stemmer = LancasterStemmer()
    # Convert a collection of text documents to a matrix of token counts.
    # Remove accents during the preprocessing step.
    self.vectorizer = CountVectorizer(strip_accents='ascii')
    self.tokenizer = self.vectorizer.build_tokenizer()
    self.preprocessor = self.vectorizer.build_preprocessor()
    self.spellchecker = enchant.DictWithPWL(
        "en_US", pwl=path_config.PERSONAL_WORD_DICTIONARY_FILE)
def open(self):
    self.initialized = False
    self.private_dict_file = None

    if enchant is None:
        return
    dict_name = self.config.spelling_dict
    if not dict_name:
        return

    self.ignore_list = [w.strip()
                        for w in self.config.spelling_ignore_words.split(",")]
    # "param" appears in docstring in param description and
    # "pylint" appears in comments in pylint pragmas.
    self.ignore_list.extend(["param", "pylint"])

    if self.config.spelling_private_dict_file:
        self.spelling_dict = enchant.DictWithPWL(
            dict_name, self.config.spelling_private_dict_file)
        self.private_dict_file = open(
            self.config.spelling_private_dict_file, "a")
    else:
        self.spelling_dict = enchant.Dict(dict_name)

    if self.config.spelling_store_unknown_words:
        self.unknown_words = set()

    # Prepare regex for stripping punctuation signs from text.
    # ' and _ are treated in a special way.
    puncts = string.punctuation.replace("'", "").replace("_", "")
    self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
    self.initialized = True
def get_new_dictionary(dictionary_lang="en_GB"):
    personal_words_list_path = os.path.join(CONFIG_PATH, 'personal-words-list.txt')
    return enchant.DictWithPWL(dictionary_lang, personal_words_list_path)
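Whether "en_GB" (or any other tag) is actually available depends on the installed enchant backends, so a caller might guard the helper above; a sketch using enchant.dict_exists():

import enchant

# Fall back to en_US when the preferred British dictionary is absent.
lang = "en_GB" if enchant.dict_exists("en_GB") else "en_US"
d = get_new_dictionary(lang)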
def spellcheck_hints(args, packages):
    spelldict = DictWithPWL('en-US')
    chkr = SpellChecker(spelldict, filters=[DescFilter])
    misspellings = {}

    # add technical words not in spell-checking dictionary
    wordlist = []
    with open('words.txt') as f:
        for w in f:
            # strip any trailing comment
            w = re.sub(r'#.*$', '', w)
            # strip any whitespace
            w = w.strip()
            spelldict.add(w)
            wordlist.append(w.lower())

            # XXX: for the moment, to reduce the set of errors, ignore the fact
            # that words.txt gives a canonical capitalization, and accept any
            # capitalization
            spelldict.add(w.lower())
            spelldict.add(w.capitalize())

    # add all package names as valid words
    for p in packages:
        for w in re.split('[_-]', p):
            # remove punctuation characters
            w = re.sub(r'[+]', '', w)
            # strip off any trailing numbers
            w = re.sub(r'[\d.]*$', '', w)

            # both with and without any lib prefix
            for w1 in [w, re.sub(r'^lib', '', w)]:
                # add the package name unless it exists in the list above, which
                # will give a canonical capitalization
                if w1.lower() not in wordlist:
                    spelldict.add(w1.lower())
                    spelldict.add(w1)
                    spelldict.add(w1.capitalize())

    # for each package
    for p in sorted(packages.keys()):
        # debuginfo packages have uninteresting, auto-generated text which
        # contains the package name
        if p.endswith('-debuginfo'):
            continue

        # spell-check the spell-checkable keys
        for k in ['sdesc', 'ldesc', 'message']:
            if k in packages[p].hints:
                chkr.set_text(packages[p].hints[k])
                # XXX: this is doing all the work to generate suggestions, which
                # we then ignore, so could be written much more efficiently
                for err in chkr:
                    # logging.error("package '%s', hint '%s': Is '%s' a word?" % (p, k, err.word))
                    misspellings.setdefault(err.word, 0)
                    misspellings[err.word] += 1

    # summarize
    for c in sorted(misspellings, key=misspellings.get, reverse=True):
        print('%16s: %4d' % (c, misspellings[c]))
def tesseract_ocr_helper(base_image, config="Default"):
    """
    A wrapper for using tesseract to do OCR
    """
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print("No OCR tool found")
        sys.exit(1)
    # The tools are returned in the recommended order of usage
    tool = tools[0]
    print("Will use tool '%s'" % (tool.get_name()))

    langs = tool.get_available_languages()
    print("Available languages: %s" % ", ".join(langs))
    lang = langs[0]
    print("Will use lang '%s'" % (lang))

    custom_builder = pyocr.builders.TextBuilder()
    if config != "Default":
        custom_builder.tesseract_configs = [config]

    txt = tool.image_to_string(
        base_image, lang=lang, builder=custom_builder
    )

    # Spell correct
    dict_path = os.path.join(os.path.dirname(__file__), "dict/urban_dict.txt")
    d = enchant.DictWithPWL("en_US", dict_path)
    txtA = txt.replace('\n', ' \n ')
    A = txtA.split(" ")
    B = []
    for x in A:
        if (x != '\n' and len(x) != 0 and
                d.check(x) is False and len(d.suggest(x)) != 0):
            B.append(d.suggest(x)[0])
        else:
            B.append(x)
    return " ".join(B)
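The manual split/join loop above can also be written with pyenchant's SpellChecker, which tracks word positions and rewrites the text in place; a sketch assuming the same PWL-backed dictionary (dict_path is illustrative):

import enchant
from enchant.checker import SpellChecker

def autocorrect(txt, dict_path):
    # Iterate over misspelled words and replace each with its top
    # suggestion; get_text() returns the corrected string.
    d = enchant.DictWithPWL("en_US", dict_path)
    chkr = SpellChecker(d)
    chkr.set_text(txt)
    for err in chkr:
        suggestions = err.suggest()
        if suggestions:
            err.replace(suggestions[0])
    return chkr.get_text()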
def _init_spell_checker(self):
    """Initialize spell checker dictionary."""
    default_dict = "en_US"
    spell_dict = None
    jargonfile = self.params.get('jargonfile')
    if not jargonfile:
        jargonfile = os.environ.get('JARGONFILE')
    if jargonfile is not None:
        try:
            jargonfile = str(jargonfile)
            spell_dict = DictWithPWL(default_dict, jargonfile)
        except:
            self.error(
                "Could not initialize dictionary using %s file" % jargonfile)
    if not spell_dict:
        try:
            spell_dict = DictWithPWL(default_dict)
        except:
            self.error(
                "Could not initialize spell checker with dictionary %s"
                % default_dict)

    # Check whether there is a jargon file in the module repo.
    url = ("https://src.fedoraproject.org/cgit/modules/%s.git/plain/jargon.txt"
           % self.mmd.name)
    resp = requests.get(url)
    if resp.status_code >= 200 and resp.status_code < 300:
        for w in resp.text.split("\n"):
            if w != '':
                spell_dict.add_to_session(w)

    # Add words from the module name as jargon.
    for w in self.mmd.name.split('-'):
        spell_dict.add_to_session(w)

    try:
        chkr = SpellChecker(spell_dict)
    except:
        self.error("Could not initialize spell checker")
    return chkr
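The add_to_session() calls above are deliberate: they accept the jargon words only for the lifetime of the dictionary object, whereas DictWithPWL.add() would also append them to the PWL file. A short sketch of the difference (the file name is illustrative):

import enchant

d = enchant.DictWithPWL("en_US", "jargon.txt")

d.add_to_session("mmd")  # temporary: accepted now, not written to disk
d.add("modulemd")        # persistent: appended to jargon.txt

print(d.check("mmd"))    # True for this dictionary object only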