Python enchant module: tokenize() example source code

The following 14 code examples, extracted from open-source Python projects, illustrate how to use enchant.tokenize().
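Before the extracted snippets, here is a minimal sketch of the typical entry point, enchant.tokenize.get_tokenizer(); the language tag and sample text are illustrative assumptions, not taken from any of the projects below:

from enchant.tokenize import get_tokenizer

# Build an English tokenizer and iterate over (word, offset) tuples.
tknzr = get_tokenizer("en_US")
for word, pos in tknzr("This is a short sentence."):
    print(word, pos)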

Project: Taigabot    Author: FrozenPigs
def set_offset(self,off,whence=0):
        """Set the offset of the tokenization routine.

        For more details on the purpose of the tokenization offset,
        see the documentation of the 'enchant.tokenize' module.
        The optional argument whence indicates the method by
        which to change the offset:
            * 0 (the default) treats <off> as an increment
            * 1 treats <off> as a distance from the start
            * 2 treats <off> as a distance from the end
        """
        if whence == 0:
            self._tokens.set_offset(self._tokens.offset + off)
        elif whence == 1:
            assert(off > 0)
            self._tokens.set_offset(off)
        elif whence == 2:
            assert(off > 0)
            self._tokens.set_offset(len(self._text) - 1 - off)
        else:
            raise ValueError("Invalid value for whence: %s"%(whence,))
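A hypothetical call sequence illustrating the three whence modes, assuming chkr is a SpellChecker whose text has already been set; the offsets are illustrative:

# Illustrative offsets; `chkr` is assumed to be a SpellChecker with text set.
chkr.set_offset(10)             # whence=0: advance the current offset by 10
chkr.set_offset(25, whence=1)   # whence=1: 25 characters from the start
chkr.set_offset(5, whence=2)    # whence=2: 5 characters from the end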
Project: Taigabot    Author: FrozenPigs
def buildtestsuite(recurse=True):
    from enchant.checker.tests import TestChecker
    from enchant.tokenize.tests import TestTokenization, TestFilters
    from enchant.tokenize.tests import TestTokenizeEN
    suite = unittest.TestSuite()
    if recurse:
        suite.addTest(unittest.makeSuite(TestInstallEnv))
        suite.addTest(unittest.makeSuite(TestPy2exe))
    suite.addTest(unittest.makeSuite(TestBroker))
    suite.addTest(unittest.makeSuite(TestDict))
    suite.addTest(unittest.makeSuite(TestPWL))
    suite.addTest(unittest.makeSuite(TestUtils))
    suite.addTest(unittest.makeSuite(TestDocStrings))
    suite.addTest(unittest.makeSuite(TestChecker))
    suite.addTest(unittest.makeSuite(TestTokenization))
    suite.addTest(unittest.makeSuite(TestTokenizeEN))
    suite.addTest(unittest.makeSuite(TestFilters))
    return suite
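As a usage note (not part of the original snippet), the assembled suite can be run with the standard unittest runner, assuming the pyenchant test modules referenced above are importable:

import unittest

# Run the suite assembled by buildtestsuite().
unittest.TextTestRunner(verbosity=2).run(buildtestsuite())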
Project: palladio    Author: slipguru
def __init__(self, words):
        tokenize.__init__(self, '')
        self._words = words
Project: Taigabot    Author: FrozenPigs
def test_filters(self):
        """Test SpellChecker with the 'filters' argument."""
        text = """I contain WikiWords that ShouldBe skipped by the filters"""
        chkr = SpellChecker("en_US",text=text,
                            filters=[enchant.tokenize.WikiWordFilter])
        for err in chkr:
            # There are no errors once the WikiWords are skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(),text)
Project: Taigabot    Author: FrozenPigs
def test_chunkers(self):
        """Test SpellChecker with the 'chunkers' argument."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped"""
        chkr = SpellChecker("en_US",text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            # There are no errors when the <html> tag is skipped
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(),text)
Project: Taigabot    Author: FrozenPigs
def test_chunkers_and_filters(self):
        """Test SpellChecker with the 'chunkers' and 'filters' arguments."""
        text = """I contain <html a=xjvf>tags</html> that should be skipped
                  along with a <a href='http://example.com/">link to
                  http://example.com/</a> that should also be skipped"""
        # There are no errors when things are correctly skipped
        chkr = SpellChecker("en_US",text=text,
                            filters=[enchant.tokenize.URLFilter],
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.fail("Extraneous spelling errors were found")
        self.assertEqual(chkr.get_text(),text)
        # The "html" is an error when not using HTMLChunker
        chkr = SpellChecker("en_US",text=text,
                            filters=[enchant.tokenize.URLFilter])
        for err in chkr:
            self.assertEqual(err.word,"html")
            break
        self.assertEqual(chkr.get_text(),text)
        # The "http" from the URL is an error when not using URLFilter
        chkr = SpellChecker("en_US",text=text,
                            chunkers=[enchant.tokenize.HTMLChunker])
        for err in chkr:
            self.assertEqual(err.word,"http")
            break
        self.assertEqual(chkr.get_text(),text)
Project: Taigabot    Author: FrozenPigs
def _check_docstrings(self,obj,errors):
        import enchant
        if hasattr(obj,"__doc__"):
            skip_errors = [w for w in getattr(obj,"_DOC_ERRORS",[])]
            chkr = enchant.checker.SpellChecker("en_AU",obj.__doc__,filters=[enchant.tokenize.URLFilter])
            for err in chkr:
                if len(err.word) == 1:
                    continue
                if err.word.lower() in self.WORDS:
                    continue
                if skip_errors and skip_errors[0] == err.word:
                    skip_errors.pop(0)
                    continue
                errors.append((obj,err.word,err.wordpos))
                msg = "\nDOCSTRING SPELLING ERROR: %s %s %d %s\n" % (obj,err.word,err.wordpos,chkr.suggest())
                printf([msg],file=sys.stderr)
        #  Find and yield all child objects that should be checked
        for name in dir(obj):
            if name.startswith("__"):
                continue
            child = getattr(obj,name)
            if hasattr(child,"__file__"):
                if not hasattr(globals(),"__file__"):
                    continue
                if not child.__file__.startswith(os.path.dirname(__file__)):
                    continue
            else:
                cmod = getattr(child,"__module__",None)
                if not cmod:
                    cclass = getattr(child,"__class__",None)
                    cmod = getattr(cclass,"__module__",None)
                if cmod and not cmod.startswith("enchant"):
                    continue
            yield child
Project: Taigabot    Author: FrozenPigs
def __init__(self):
        tokenize.__init__(self,"")
Project: Taigabot    Author: FrozenPigs
def __init__(self,text):
        tokenize.__init__(self,text)
        self._done = False
Project: Taigabot    Author: FrozenPigs
def _try_tokenizer(modName):
    """Look for a tokenizer in the named module.

    Returns the function if found, None otherwise.
    """
    modBase = "enchant.tokenize."
    funcName = "tokenize"
    modName = modBase + modName
    try:
        mod = __import__(modName,globals(),{},funcName)
        return getattr(mod,funcName)
    except ImportError:
        return None
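For illustration, a hedged sketch of how this private helper behaves once defined; the language codes are assumptions:

# Hypothetical calls against the helper above (illustrative only):
print(_try_tokenizer("en"))     # the English tokenizer shipped with pyenchant
print(_try_tokenizer("xx_YY"))  # None -- no such tokenizer module exists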
Project: dalila    Author: slipguru
def __init__(self, words):
        tokenize.__init__(self, '')
        self._words = words
Project: icing    Author: slipguru
def __init__(self, words):
        tokenize.__init__(self, '')
        self._words = words
Project: Taigabot    Author: FrozenPigs
def __init__(self,lang=None,text=None,tokenize=None,chunkers=None,filters=None):
        """Constructor for the SpellChecker class.

        SpellChecker objects can be created in two ways, depending on
        the nature of the first argument.  If it is a string, it
        specifies a language tag from which a dictionary is created.
        Otherwise, it must be an enchant Dict object to be used.

        Optional keyword arguments are:

            * text:  to set the text to be checked at creation time
            * tokenize:  a custom tokenization function to use
            * chunkers:  a list of chunkers to apply during tokenization
            * filters:  a list of filters to apply during tokenization

        If <tokenize> is not given and the first argument is a Dict,
        its 'tag' attribute must be a language tag so that a tokenization
        function can be created automatically.  If this attribute is missing
        the user's default language will be used.
        """
        if lang is None:
            lang = get_default_language()
        if isinstance(lang,basestring):
            dict = enchant.Dict(lang)
        else:
            dict = lang
            try:
                lang = dict.tag
            except AttributeError:
                lang = get_default_language()
        if lang is None:
            raise DefaultLanguageNotFoundError
        self.lang = lang
        self.dict = dict
        if tokenize is None:
            try:
                tokenize = get_tokenizer(lang,chunkers,filters)
            except TokenizerNotFoundError:
                # Fall back to default tokenization if no match for 'lang'
                tokenize = get_tokenizer(None,chunkers,filters)
        self._tokenize = tokenize

        self.word = None
        self.wordpos = None
        self._ignore_words = {}
        self._replace_words = {}
        # Default to the empty string as the text to be checked
        self._text = array.array('u')
        self._use_tostring = False
        self._tokens = iter([])

        if text is not None:
            self.set_text(text)
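A minimal sketch of the two construction paths described in the docstring above; the language tag and sample texts are illustrative assumptions:

import enchant
from enchant.checker import SpellChecker

# 1) From a language tag: the dictionary and tokenizer are created automatically.
chkr = SpellChecker("en_US", text="This sentnce has an error")
for err in chkr:
    print(err.word, err.wordpos, chkr.suggest())

# 2) From an existing Dict object: its 'tag' attribute selects the tokenizer.
d = enchant.Dict("en_US")
chkr = SpellChecker(d, text="Another sentnce to check")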
Project: paperwork-backend    Author: openpaperwork
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number
    of misspelled words. Words that are "almost" correct remain neutral (i.e. they
    are not included in the score).

    Returns:
        A tuple : (fixed text, score)
    """
    if os.name == "nt":
        assert(not "check_spelling() not available on Windows")
        return
    with _ENCHANT_LOCK:
        # Maximum distance from the first suggestion from python-enchant

        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if (len(suggestions) <= 0):
                # this word is useless. It may even indicate a bad orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if (lv_dist > _MAX_LEVENSHTEIN_DISTANCE):
                # hm, this word looks like it's in a bad shape
                continue

            logger.debug("Spell checking: Replacing: %s -> %s"
                         % (word, main_suggestion))

            # let's replace the word by its suggestion

            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
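A hypothetical call to the helper above, assuming its module-level constants (_ENCHANT_LOCK, _MIN_WORD_LEN, _MAX_LEVENSHTEIN_DISTANCE) and the Levenshtein dependency are available; the language and text are illustrative:

# Illustrative usage on a non-Windows system.
fixed_txt, score = check_spelling("en_US", "Thiss is a sentense with some erors")
print(score)      # higher scores indicate mostly well-spelled (well-oriented) text
print(fixed_txt)  # text with close misspellings replaced by the top suggestion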