The following code examples, extracted from open-source Python projects, illustrate how to use enchant.tokenize().
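Before the examples, a minimal sketch of the core API: enchant.tokenize.get_tokenizer() returns a tokenizer class whose instances yield (word, offset) tuples. The sample text below is illustrative.

from enchant.tokenize import get_tokenizer

tknzr = get_tokenizer("en_US")
for word, pos in tknzr("this is some simple text"):
    print(word, pos)
# this 0, is 5, some 8, simple 13, text 20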
def set_offset(self, off, whence=0):
    """Set the offset of the tokenization routine.

    For more details on the purpose of the tokenization offset,
    see the documentation of the 'enchant.tokenize' module.

    The optional argument whence indicates the method by
    which to change the offset:
        * 0 (the default) treats <off> as an increment
        * 1 treats <off> as a distance from the start
        * 2 treats <off> as a distance from the end
    """
    if whence == 0:
        self._tokens.set_offset(self._tokens.offset + off)
    elif whence == 1:
        assert off > 0
        self._tokens.set_offset(off)
    elif whence == 2:
        assert off > 0
        self._tokens.set_offset(len(self._text) - 1 - off)
    else:
        raise ValueError("Invalid value for whence: %s" % (whence,))
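As a hedged usage sketch (the language tag and text are illustrative), the three whence modes map onto calls like the following:

from enchant.checker import SpellChecker

chkr = SpellChecker("en_US", "some text to check")
chkr.set_offset(4)             # whence=0: advance the offset by 4
chkr.set_offset(4, whence=1)   # absolute: 4 characters from the start
chkr.set_offset(4, whence=2)   # absolute: 4 characters from the end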
def buildtestsuite(recurse=True):
    from enchant.checker.tests import TestChecker
    from enchant.tokenize.tests import TestTokenization, TestFilters
    from enchant.tokenize.tests import TestTokenizeEN
    suite = unittest.TestSuite()
    if recurse:
        suite.addTest(unittest.makeSuite(TestInstallEnv))
        suite.addTest(unittest.makeSuite(TestPy2exe))
    suite.addTest(unittest.makeSuite(TestBroker))
    suite.addTest(unittest.makeSuite(TestDict))
    suite.addTest(unittest.makeSuite(TestPWL))
    suite.addTest(unittest.makeSuite(TestUtils))
    suite.addTest(unittest.makeSuite(TestDocStrings))
    suite.addTest(unittest.makeSuite(TestChecker))
    suite.addTest(unittest.makeSuite(TestTokenization))
    suite.addTest(unittest.makeSuite(TestTokenizeEN))
    suite.addTest(unittest.makeSuite(TestFilters))
    return suite
def __init__(self, words):
    tokenize.__init__(self, '')
    self._words = words
def test_filters(self):
    """Test SpellChecker with the 'filters' argument."""
    text = """I contain WikiWords that ShouldBe skipped by the filters"""
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.WikiWordFilter])
    for err in chkr:
        # There are no errors once the WikiWords are skipped
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
def test_chunkers(self):
    """Test SpellChecker with the 'chunkers' argument."""
    text = """I contain <html a=xjvf>tags</html> that should be skipped"""
    chkr = SpellChecker("en_US", text=text,
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        # There are no errors when the <html> tag is skipped
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
def test_chunkers_and_filters(self):
    """Test SpellChecker with the 'chunkers' and 'filters' arguments."""
    text = """I contain <html a=xjvf>tags</html> that should be skipped
    along with a <a href='http://example.com/">link to http://example.com/</a>
    that should also be skipped"""
    # There are no errors when things are correctly skipped
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.URLFilter],
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        self.fail("Extraneous spelling errors were found")
    self.assertEqual(chkr.get_text(), text)
    # The "html" is an error when not using HTMLChunker
    chkr = SpellChecker("en_US", text=text,
                        filters=[enchant.tokenize.URLFilter])
    for err in chkr:
        self.assertEqual(err.word, "html")
        break
    self.assertEqual(chkr.get_text(), text)
    # The "http" from the URL is an error when not using URLFilter
    chkr = SpellChecker("en_US", text=text,
                        chunkers=[enchant.tokenize.HTMLChunker])
    for err in chkr:
        self.assertEqual(err.word, "http")
        break
    self.assertEqual(chkr.get_text(), text)
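The same chunker/filter combination can also be applied at the tokenizer level rather than through SpellChecker; a minimal sketch, with the sample text and expected result as assumptions:

from enchant.tokenize import get_tokenizer, HTMLChunker, URLFilter

tknzr = get_tokenizer("en_US", chunkers=[HTMLChunker], filters=[URLFilter])
words = [w for (w, pos) in tknzr("see <b>http://example.com/</b> now")]
# both the markup and the URL should be skipped: words == ['see', 'now']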
def _check_docstrings(self, obj, errors):
    import enchant
    if hasattr(obj, "__doc__"):
        skip_errors = [w for w in getattr(obj, "_DOC_ERRORS", [])]
        chkr = enchant.checker.SpellChecker("en_AU", obj.__doc__,
                                            filters=[enchant.tokenize.URLFilter])
        for err in chkr:
            if len(err.word) == 1:
                continue
            if err.word.lower() in self.WORDS:
                continue
            if skip_errors and skip_errors[0] == err.word:
                skip_errors.pop(0)
                continue
            errors.append((obj, err.word, err.wordpos))
            msg = "\nDOCSTRING SPELLING ERROR: %s %s %d %s\n" \
                  % (obj, err.word, err.wordpos, chkr.suggest())
            printf([msg], file=sys.stderr)
    # Find and yield all child objects that should be checked
    for name in dir(obj):
        if name.startswith("__"):
            continue
        child = getattr(obj, name)
        if hasattr(child, "__file__"):
            if not hasattr(globals(), "__file__"):
                continue
            if not child.__file__.startswith(os.path.dirname(__file__)):
                continue
        else:
            cmod = getattr(child, "__module__", None)
            if not cmod:
                cclass = getattr(child, "__class__", None)
                cmod = getattr(cclass, "__module__", None)
            if cmod and not cmod.startswith("enchant"):
                continue
        yield child
def __init__(self):
    tokenize.__init__(self, "")
def __init__(self, text):
    tokenize.__init__(self, text)
    self._done = False
def _try_tokenizer(modName):
    """Look for a tokenizer in the named module.

    Returns the function if found, None otherwise.
    """
    modBase = "enchant.tokenize."
    funcName = "tokenize"
    modName = modBase + modName
    try:
        mod = __import__(modName, globals(), {}, funcName)
        return getattr(mod, funcName)
    except ImportError:
        return None
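For the English tag, this lookup resolves to the tokenize class in the enchant.tokenize.en module; a brief sketch of using it directly (the sample text is illustrative):

from enchant.tokenize import en

print(list(en.tokenize("Hello there!")))
# [('Hello', 0), ('there', 6)]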
def __init__(self, lang=None, text=None, tokenize=None, chunkers=None, filters=None):
    """Constructor for the SpellChecker class.

    SpellChecker objects can be created in two ways, depending on
    the nature of the first argument.  If it is a string, it
    specifies a language tag from which a dictionary is created.
    Otherwise, it must be an enchant Dict object to be used.

    Optional keyword arguments are:

        * text:  to set the text to be checked at creation time
        * tokenize:  a custom tokenization function to use
        * chunkers:  a list of chunkers to apply during tokenization
        * filters:  a list of filters to apply during tokenization

    If <tokenize> is not given and the first argument is a Dict,
    its 'tag' attribute must be a language tag so that a tokenization
    function can be created automatically.  If this attribute is missing
    the user's default language will be used.
    """
    if lang is None:
        lang = get_default_language()
    if isinstance(lang, basestring):
        dict = enchant.Dict(lang)
    else:
        dict = lang
        try:
            lang = dict.tag
        except AttributeError:
            lang = get_default_language()
    if lang is None:
        raise DefaultLanguageNotFoundError
    self.lang = lang
    self.dict = dict
    if tokenize is None:
        try:
            tokenize = get_tokenizer(lang, chunkers, filters)
        except TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tokenize = get_tokenizer(None, chunkers, filters)
    self._tokenize = tokenize
    self.word = None
    self.wordpos = None
    self._ignore_words = {}
    self._replace_words = {}
    # Default to the empty string as the text to be checked
    self._text = array.array('u')
    self._use_tostring = False
    self._tokens = iter([])
    if text is not None:
        self.set_text(text)
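Both construction modes from the docstring, as a short sketch (the text is illustrative):

import enchant
from enchant.checker import SpellChecker

chkr = SpellChecker("en_US", "this is sme text with a fw errors")
chkr_from_dict = SpellChecker(enchant.Dict("en_US"),
                              "this is sme text with a fw errors")
for err in chkr:
    print("ERROR:", err.word)   # -> sme, fw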
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is
    the number of words correctly (or almost correctly) spelled, minus
    the number of misspelled words. Words that are "almost" correct
    remain neutral (they are not included in the score).

    Returns:
        A tuple: (fixed text, score)
    """
    if os.name == "nt":
        assert not "check_spelling() not available on Windows"
        return
    with _ENCHANT_LOCK:
        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if len(suggestions) <= 0:
                # this word is useless; it may even indicate a bad
                # orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            # _MAX_LEVENSHTEIN_DISTANCE is the maximum distance allowed
            # from the first suggestion from python-enchant
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # this word looks like it's in bad shape
                continue
            logger.debug("Spell checking: Replacing: %s -> %s"
                         % (word, main_suggestion))
            # replace the word with its suggestion
            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))
            # fixed words may be a good hint for orientation
            score += 5
        return (txt, score)
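A hypothetical call, assuming the module-level names this function depends on (_ENCHANT_LOCK, _MIN_WORD_LEN, _MAX_LEVENSHTEIN_DISTANCE, Levenshtein, logger) are defined as in the originating project:

fixed_txt, score = check_spelling("en_US", "Thiss is a smal test")
print(score)      # higher when most words are correct or close to correct
print(fixed_txt)  # close misspellings replaced by the first suggestion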