Java class org.apache.lucene.analysis.tokenattributes.OffsetAttribute example source code

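All of the project snippets below share the same consumption pattern: obtain the OffsetAttribute from a TokenStream (via addAttribute or getAttribute), call reset(), read startOffset()/endOffset() inside the incrementToken() loop, then call end() and close(). The following is a minimal, self-contained sketch of that pattern; it is not taken from any of the projects below, the field name and sample text are placeholders, and on Lucene 4.x StandardAnalyzer additionally requires a Version argument.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetAttributeDemo {
    public static void main(String[] args) throws IOException {
        // Placeholder analyzer and input; any Analyzer exposes offsets the same way.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "OffsetAttribute reports character offsets")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                              // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                // startOffset()/endOffset() are character positions in the original input
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end();                                // records the offset state at the end of the stream
        }
    }
}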
Project: elasticsearch-analysis-openkoreantext    File: TokenStreamAssertions.java
public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken()) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());

        if(expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }

        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);

        if(expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }

        if(expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }

        index++;
    }
    tokenStream.end();
}
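A hypothetical call of this helper (the analyzer, input, and expected values below are placeholders, not taken from the openkoreantext project); pass null for any array that should not be checked:

    TokenStream tokenStream = analyzer.tokenStream("field", "placeholder text");
    assertTokenStream(tokenStream,
            new String[]{"placeholder", "text"}, // expected terms
            null,                                // skip type assertions
            new int[]{0, 12},                    // expected start offsets
            new int[]{11, 16});                  // expected end offsets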
Project: improved-journey    File: TestAnsj.java
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");


    //System.out.println(parse1);
    String text11="ZW321282050000000325";

    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        //  System.out.print( offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-" );
        //System.out.print( positionIncrementAtt.getPositionIncrement() +"/");
    }
    tokenizer.close();
}
Project: elasticsearch-analysis-voikko    File: VoikkoTokenFilterTests.java
private List<TokenData> parse(String text) {
    NamedAnalyzer analyzer = getAnalysisService().indexAnalyzers.get("test");

    try (TokenStream ts = analyzer.tokenStream("test", new StringReader(text))) {
        List<TokenData> result = new ArrayList<>();
        CharTermAttribute charTerm = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String original = text.substring(offset.startOffset(), offset.endOffset());
            result.add(token(original, charTerm.toString(), position.getPositionIncrement()));
        }
        ts.end();

        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Project: elasticsearch_my    File: PlainHighlighter.java
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents)
        throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
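For intuition, a hypothetical trace of this method (plain whitespace tokenization assumed; not taken from the Elasticsearch sources): with contents "the quick brown fox" the tokens end at offsets 3, 9, 15 and 19, so:

    // noMatchSize = 9  -> returns 9   ("quick" ends exactly on the boundary)
    // noMatchSize = 8  -> returns 3   (end of the last token that fits fully before the boundary)
    // noMatchSize = 25 -> returns 19  (stream exhausted, so everything up to the last token end is used)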
Project: lams    File: PrefixAwareTokenFilter.java
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;

  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
Project: lucene-bo    File: TibetanAnalyzerTest.java
static private void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}
Project: Elasticsearch    File: PlainHighlighter.java
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
Project: elasticsearch-analysis-ltp    File: LTPTokenizer.java
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public LTPTokenizer(Set<String> filter)
        throws IOException, JSONException, UnirestException {
    super();
    logger.info("LTPTokenizer Initialize......");
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    LTPSeg = new LTPWordSegmenter(input);
    // Add filter words set
    this.filter = filter;
}
Project: elasticsearch-analysis-lc-pinyin    File: PinyinAnalysisTest.java
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}
Project: elasticsearch-analysis-lc-pinyin    File: PinyinFilterTest.java
public void testFullPinyinFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
Project: elasticsearch-analysis-lc-pinyin    File: PinyinFilterTest.java
public void testFirstLetterFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);

    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
Project: fastcatsearch3    File: Token.java
@Override
public void copyTo(AttributeImpl target) {
  if (target instanceof Token) {
    final Token to = (Token) target;
    to.reinit(this);
    // reinit shares the payload, so clone it:
    if (payload !=null) {
      to.payload = payload.clone();
    }
  } else {
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
    ((FlagsAttribute) target).setFlags(flags);
    ((TypeAttribute) target).setType(type);
  }
}
Project: fastcatsearch3    File: BasicHighlightAndSummary.java
public WrappedTokenStream(TokenStream tokenStream, String pText) {
    this.pText = pText;
    this.tokenStream = tokenStream;
    if(tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }
    if(tokenStream.hasAttribute(OffsetAttribute.class)) {
        offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    }
    if(tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        charsRefTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }

    if(tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }

    additionalTermAttributeLocal.init(this);
}
Project: fastcatsearch3    File: CSVAnalyzerTest.java
@Test
public void testBulk() throws IOException {
    String str = "";
    str = "SK,  하이닉스";
    //str = "하이닉스";

    StringReader input = new StringReader(str);
    CSVAnalyzer analyzer = new CSVAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", input);
    tokenStream.reset();
    logger.debug("tokenStream:{}", tokenStream);
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    for(int inx=0;tokenStream.incrementToken();inx++) {
        String term = charTermAttribute.toString();
        logger.debug("[{}] \"{}\" {}~{}", inx, term, offsetAttribute.startOffset(), offsetAttribute.endOffset());
    }
    analyzer.close();
}
Project: lucene-token-filter-pinyin    File: PinyinTransformTokenFilter.java
/**
 * @param input            the input token stream
 * @param type             whether to output pinyin abbreviations, full pinyin, or both; one of {@link #TYPE_ABBREVIATION}, {@link #TYPE_PINYIN}, {@link #TYPE_BOTH}
 * @param minTermLength    minimum length of a Chinese term to be converted
 * @param maxPolyphoneFreq maximum number of polyphone (multi-reading character) expansions
 * @param isOutChinese     whether to also output the original Chinese tokens
 */
public PinyinTransformTokenFilter(TokenStream input, int type,
                                  int minTermLength, int maxPolyphoneFreq, boolean isOutChinese) {
    super(input);
    this._minTermLength = minTermLength;
    this.maxPolyphoneFreq = maxPolyphoneFreq;
    if (this._minTermLength < 1) {
        this._minTermLength = 1;
    }
    if (this.maxPolyphoneFreq < 1) {
        this.maxPolyphoneFreq = Integer.MAX_VALUE;
    }
    this.isOutChinese = isOutChinese;
    this.outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    this.outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    this.type = type;
    addAttribute(OffsetAttribute.class); // offset attribute
}
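A hypothetical wiring of this filter (the upstream tokenizer, reader, and argument values are assumptions; only the constructor signature above and the TYPE_* constants referenced in its Javadoc come from the class itself, and on older Lucene versions the tokenizer takes a Reader in its constructor instead of setReader):

    Tokenizer source = new WhitespaceTokenizer();          // any upstream Tokenizer works
    source.setReader(new StringReader("中文分词"));
    TokenStream pinyin = new PinyinTransformTokenFilter(source,
            PinyinTransformTokenFilter.TYPE_PINYIN,        // emit full pinyin
            2,                                             // minimum Chinese term length
            3,                                             // cap on polyphone expansions
            true);                                         // also emit the original Chinese tokens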
Project: hmftools    File: TreatmentCurator.java
@NotNull
private static List<SearchToken> generateSearchTokens(@NotNull final String searchTerm) throws IOException {
    final Set<SearchToken> searchTokens = Sets.newHashSet();
    final TokenStream tokenStream = getSpellCheckedShingleStream(searchTerm);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        final String searchToken = tokenStream.getAttribute(CharTermAttribute.class).toString();
        final OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        searchTokens.add(ImmutableSearchToken.of(searchToken, offsetAttribute.startOffset(), offsetAttribute.endOffset()));
    }
    tokenStream.end();
    tokenStream.close();
    return searchTokens.stream()
            .sorted(Comparator.comparing(SearchToken::length).reversed().thenComparing(SearchToken::startOffset))
            .collect(Collectors.toList());
}
Project: jasperreports    File: LuceneUtil.java
protected void displayTokens(String text, String elementId) throws IOException {
    if (log.isDebugEnabled()) {
        Analyzer analyzer = getConfiguredAnalyzer();
        StringBuilder sb = new StringBuilder();
        sb.append(elementId).append(": ").append(text).append(": ");

        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
        }

        log.debug(sb);
    }
}
Project: langforia    File: LuceneTokenizer.java
@Override
public void apply(Document doc) {
    try {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.text()));
        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            Token tok = new Token(doc).setRange(offsetAttribute.startOffset(), offsetAttribute.endOffset());
            tok.putProperty(TokenProperties.STEM, charTermAttribute.toString());
        }
        stream.close();
    } catch (IOException ex) {
        throw new LangforiaRuntimeException(ex);
    }
}
Project: elasticsearch-analysis-opennlp    File: OpenNLPTokenFilter.java
private String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
        char[] buffer = textAtt.buffer();
        String word = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
        wordList.add(word);
        AttributeSource attrs = input.cloneAttributes();
        tokenAttrs.add(attrs);
    }
    String[] words = new String[wordList.size()];
    for (int i = 0; i < words.length; i++) {
        words[i] = wordList.get(i);
    }
    return words;
}
Project: Alix    File: Demo.java
public static MyToken[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException
{
  TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute positionIncrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
  TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
  OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);

  ArrayList<MyToken> tokenList = new ArrayList<MyToken>();
  stream.reset(); // the stream must be reset before the first incrementToken()
  while (stream.incrementToken()) {
    tokenList.add(new MyToken(term.toString(), positionIncrementAttr.getPositionIncrement(), typeAttr.type(),
        offsetAttr.startOffset(), offsetAttr.endOffset()));
  }
  stream.end();
  stream.close();

  return tokenList.toArray(new MyToken[0]);
}
Project: hanlp-lucene-plugin    File: HanLPAnalyzerTest.java
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Project: hanlp-lucene-plugin    File: HanLPAnalyzerTest.java
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Project: hanlp-lucene-plugin    File: HanLPIndexAnalyzerTest.java
public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Project: elasticsearch-analysis-url    File: URLTokenFilter.java
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
Project: elasticsearch-twitter-korean    File: AnalyzerTestUtil.java
protected List<TestToken> collectExtractedNouns(TokenStream stream) throws IOException {
    CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    List<TestToken> extractedTokens = Lists.newArrayList();

    while(stream.incrementToken()) {
        TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());

        System.out.println("termAtt.term() : " + charTermAtt.toString());
        System.out.println("startoffSetAtt : " + offSetAtt.startOffset());
        System.out.println("endoffSetAtt : " + offSetAtt.endOffset());
        System.out.println("typeAttr : " + typeAttr.toString());

        extractedTokens.add(t);
    }

    return extractedTokens;
}
Project: elasticsearch-twitter-korean    File: TwitterKoreanTokenizerTest.java
@Test
public void testIncrementToken() throws IOException {
    CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class);

    int expected_token_count = 6;
    int observed_token_count = 0;
    while(tokenizer.incrementToken()) {
        observed_token_count++;
        TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());

        System.out.println("termAtt.term() : " + charTermAtt.toString());
        System.out.println("startOffset : " + offSetAtt.startOffset());
        System.out.println("endOffset : " + offSetAtt.endOffset());

        Assert.assertTrue(tokenizedToken.contains(t));
    }

    Assert.assertEquals(expected_token_count, observed_token_count);

}
Project: search    File: PrefixAwareTokenFilter.java
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;

  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
Project: search    File: EdgeNGramTokenFilterTest.java
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  TokenStream tk = new KeywordTokenizer(new StringReader(s));
  tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
  final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
  tk.reset();
  for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
    assertTrue(tk.incrementToken());
    assertEquals(0, offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
    final int end = Character.offsetByCodePoints(s, 0, i);
    assertEquals(s.substring(0, end), termAtt.toString());
  }
  assertFalse(tk.incrementToken());
}
Project: search    File: NGramTokenFilterTest.java
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  TokenStream tk = new KeywordTokenizer(new StringReader(s));
  tk = new NGramTokenFilter(tk, minGram, maxGram);
  final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
  tk.reset();
  for (int start = 0; start < codePointCount; ++start) {
    for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int startIndex = Character.offsetByCodePoints(s, 0, start);
      final int endIndex = Character.offsetByCodePoints(s, 0, end);
      assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
    }
  }
  assertFalse(tk.incrementToken());
}
Project: search    File: TestChineseTokenizer.java
public void testOtherLetterOffset() throws IOException
{
    String s = "a天b";
    ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

    int correctStartOffset = 0;
    int correctEndOffset = 1;
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      assertEquals(correctStartOffset, offsetAtt.startOffset());
      assertEquals(correctEndOffset, offsetAtt.endOffset());
      correctStartOffset++;
      correctEndOffset++;
    }
    tokenizer.end();
    tokenizer.close();
}
Project: search    File: TestRemoveDuplicatesTokenFilter.java
public void testDups(final String expected, final Token... tokens)
  throws Exception {

  final Iterator<Token> toks = Arrays.asList(tokens).iterator();
  final TokenStream ts = new RemoveDuplicatesTokenFilter(
    (new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
        @Override
        public boolean incrementToken() {
          if (toks.hasNext()) {
            clearAttributes();
            Token tok = toks.next();
            termAtt.setEmpty().append(tok);
            offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
            posIncAtt.setPositionIncrement(tok.getPositionIncrement());
            return true;
          } else {
            return false;
          }
        }
      }));

  assertTokenStreamContents(ts, expected.split("\\s"));   
}
Project: search    File: TestDuelingAnalyzers.java
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);

  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  };
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}
Project: search    File: TestSnowball.java
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

  filter.incrementToken();

  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
Project: search    File: SpellingQueryConverter.java
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
Project: search    File: FieldType.java
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer ts = new Tokenizer(reader) {
    final char[] cbuf = new char[maxChars];
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    @Override
    public boolean incrementToken() throws IOException {
      clearAttributes();
      int n = input.read(cbuf,0,maxChars);
      if (n<=0) return false;
      String s = toInternal(new String(cbuf,0,n));
      termAtt.setEmpty().append(s);
      offsetAtt.setOffset(correctOffset(0),correctOffset(n));
      return true;
    }
  };

  return new TokenStreamComponents(ts);
}
Project: solr-multilingual-analyzer    File: MultiLangTokenizer.java
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;

    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    while (tokenStream.incrementToken()) {
        if (null == charTermAttribute || null == offsetAttribute) {
            return;
        }
        Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                offsetAttribute.startOffset(), offsetAttribute.endOffset());
        if (null != typeAttribute) {
            token.setType(typeAttribute.type());
        }
        pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
        if (!tokenPosMap.containsKey(pos)) {
            tokenPosMap.put(pos, new LinkedList<Token>());
        }
        tokenPosMap.get(pos).add(token);
    }
    tokenStream.close();
}
Project: edits    File: LuceneTokenizer.java
@Override
public List<Annotation> annotate(String text) throws Exception {
    text = SimpleTokenizer.format(text);
    Analyzer analyser = new EnglishAnalyzer(Version.LUCENE_47, CharArraySet.EMPTY_SET);
    TokenFilter filter = new EnglishMinimalStemFilter(analyser.tokenStream("text", new StringReader(text)));
    List<Annotation> out = Lists.newArrayList();
    filter.reset(); // the stream must be reset before the first incrementToken()
    while (filter.incrementToken()) {
        CharTermAttribute az = filter.getAttribute(CharTermAttribute.class);
        OffsetAttribute o = filter.getAttribute(OffsetAttribute.class);
        String token = text.substring(o.startOffset(), o.endOffset());
        String lemma = az.toString();
        Annotation t = new Annotation();
        t.setForm(token);
        t.setLemma(lemma);
        out.add(t);
    }
    if (out.size() == 0) {
        log.debug("Input string is empty");
    }
    filter.end();
    filter.close();
    analyser.close();
    return out;
}
Project: lucene-addons    File: OffsetLengthStartComparator.java
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {

  int lenA = offsetA.endOffset() - offsetA.startOffset();
  int lenB = offsetB.endOffset() - offsetB.startOffset();
  if (lenA < lenB) {
    return 1;
  } else if (lenA > lenB) {
    return -1;
    // by here, the length is the same
  } else if (offsetA.startOffset() < offsetB.startOffset()) {
    return -1;
  } else if (offsetA.startOffset() > offsetB.startOffset()) {
    return 1;
  }
  return 0;
}
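A minimal sketch of the resulting order (the OffsetAttributeImpl instances here are illustrative stand-ins for captured token attributes; longer spans sort first, ties broken by the earlier start offset):

    OffsetAttribute a = new OffsetAttributeImpl();
    a.setOffset(0, 4);   // length 4
    OffsetAttribute b = new OffsetAttributeImpl();
    b.setOffset(2, 10);  // length 8
    OffsetAttribute c = new OffsetAttributeImpl();
    c.setOffset(1, 9);   // length 8, earlier start than b
    List<OffsetAttribute> spans = Arrays.asList(a, b, c);
    spans.sort(new OffsetLengthStartComparator());
    // sorted order: c [1,9), b [2,10), a [0,4)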
Project: community-edition-old    File: PathTokenFilterTest.java
public void testTokenizerReuse() throws IOException
{
    // We should be able to use the same Tokenizer twice.
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

    // First use
    tokenise(ts, new String[]{"uri1", "one"});
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());

    // Second use
    final String path2 = "/{uri1}one/uri2:two/";
    StringReader reader2 = new StringReader(path2);
    ts.setReader(reader2);
    tokenise(ts, new String[]{"uri1", "one", "uri2", "two"});
    assertEquals(path2.length(), offsetAtt.startOffset());
    assertEquals(path2.length(), offsetAtt.endOffset());
}