Example source code for the Java class org.apache.lucene.analysis.Token

Project: elasticsearch_my    File: TokenCountFieldMapperTests.java
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    Token t1 = new Token();      // Don't count tokens without an increment
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    t2.setPositionIncrement(1);  // Count normal tokens with one increment
    Token t3 = new Token();
    t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
    int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
    Token[] tokens = new Token[] {t1, t2, t3};
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {
            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                return new TokenStreamComponents(new MockTokenizer(), tokenStream);
            }
        };
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
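The assertion above expects 0 + 1 + 2 + 4 = 7 positions. For reference, such a count can be obtained by summing PositionIncrementAttribute values and adding the final increment that CannedTokenStream exposes after end(). The helper below is only a minimal sketch of that contract, not the actual TokenCountFieldMapper.countPositions implementation; the class and method names are made up.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class PositionCountSketch {
    // Sums every token's position increment, plus the final increment reported after end().
    static int countPositions(TokenStream stream) throws IOException {
        PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
        int count = 0;
        stream.reset();
        while (stream.incrementToken()) {
            count += posInc.getPositionIncrement();
        }
        stream.end();                           // exposes the stream's final position increment
        count += posInc.getPositionIncrement(); // the "finalTokenIncrement" from the test above
        stream.close();
        return count;
    }
}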
Project: fangorn    File: TreebankSentenceTokenizer.java
@Override
public Token next(Token reusableToken) throws IOException {
    Token token = reusableToken;
    if (tokenizer() != null) {
        Token t = tokenizer().next(token);
        if (t != null) {
            return t;
        }
    }
    char[] sent = new char[] {};
    do {
        read = input.read(ioBuffer);
        if (read > 0)
            sent = add(sent, ioBuffer, read);
    } while (read != -1);
    if (sent.length == 0) {
        return null;
    }
    if (tokenizer() == null) {
        tokenizer = new Tknzr(sent);
    } else {
        tokenizer().reset(sent);
    }
    return tokenizer().next(token);
}
Project: fangorn    File: NodeTreebankSentenceTokenizer.java
@Override
public Token next(Token reusableToken) throws IOException {
    Token token = reusableToken;
    if (elementTokenizer() != null) {
        Token t = elementTokenizer().next(token);
        if (t != null) {
            return t;
        }
    }
    char[] sent = new char[] {};
    do {
        read = input.read(ioBuffer);
        if (read > 0) sent = add(sent, ioBuffer, read);
    } while (read != -1);
    if (sent.length == 0) {
        return null;
    }
    if (elementTokenizer() == null) {
        elementTokenizer = new JsonSentenceParser(compressPayload);
    } 
    elementTokenizer().parse(String.valueOf(sent));
    return elementTokenizer().next(token);
}
Project: fangorn    File: FastStringParser.java
public Token next(Token token) {
    if (currentPos == 0) return null;
    if (tokenPos <= currentPos) {
        token.setTermBuffer(sentence, textPositions[2 * tokenPos],
                textPositions[2 * tokenPos + 1]
                        - textPositions[2 * tokenPos]);
        Payload p = new Payload();
        byte[] b = new byte[4];
        b[0] = (byte) ((payloads[tokenPos] >>> 16) & 255);
        b[1] = (byte) ((payloads[tokenPos] >>> 24) & 255);
        b[2] = (byte) ((payloads[tokenPos] >>> 8) & 255);
        b[3] = (byte) (payloads[tokenPos] & 255);
        p.setData(b);
        token.setPayload(p);
        tokenPos++;
        return token;
    }
    return null;
}
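The four payload bytes above pack a single int in a project-specific order: bits 16-23, then 24-31, then 8-15, then 0-7. The hypothetical round-trip helper below only makes that layout explicit; it is not part of FastStringParser.

final class PayloadBytesSketch {
    // Packs an int using the same byte order as FastStringParser.next() above.
    static byte[] pack(int v) {
        return new byte[] {
            (byte) ((v >>> 16) & 255),
            (byte) ((v >>> 24) & 255),
            (byte) ((v >>> 8) & 255),
            (byte) (v & 255)
        };
    }

    // Inverse of pack(): reassembles the int from the four payload bytes.
    static int unpack(byte[] b) {
        return ((b[1] & 255) << 24) | ((b[0] & 255) << 16)
             | ((b[2] & 255) << 8) | (b[3] & 255);
    }
}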
Project: mmseg4j    File: AnalyzerTest.java
private void printlnToken(String txt, Analyzer analyzer) throws IOException {
    System.out.println("---------"+txt.length()+"\n"+txt);
    TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
    /*// Lucene 2.9 and earlier
    for(Token t= new Token(); (t=ts.next(t)) !=null;) {
        System.out.println(t);
    }*/
    /*while(ts.incrementToken()) {
        TermAttribute termAtt = (TermAttribute)ts.getAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute)ts.getAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute)ts.getAttribute(TypeAttribute.class);

        System.out.println("("+termAtt.term()+","+offsetAtt.startOffset()+","+offsetAtt.endOffset()+",type="+typeAtt.type()+")");
    }*/
    for(Token t= new Token(); (t=TokenUtils.nextToken(ts, t)) !=null;) {
        System.out.println(t);
    }
}
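The commented-out blocks show the pre-2.9 Token-returning loop and the attribute-based loop. On recent Lucene versions the attribute-based variant also needs reset(), end() and close(); the standalone sketch below (illustrative names, assuming any stock Analyzer) spells that out.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

final class PrintTokensSketch {
    // Prints (term, startOffset, endOffset, type) for every token the analyzer produces.
    static void printTokens(String txt, Analyzer analyzer) throws IOException {
        try (TokenStream ts = analyzer.tokenStream("text", new StringReader(txt))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println("(" + termAtt + "," + offsetAtt.startOffset() + ","
                        + offsetAtt.endOffset() + ",type=" + typeAtt.type() + ")");
            }
            ts.end();
        }
    }
}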
Project: document-management-system    File: IndexHelper.java
@SuppressWarnings("unused")
SetDictionary(String words, Analyzer analyzer) throws IOException {
    wordSet = new HashSet<String>();
    if (words != null) {
        TokenStream tokenStream = analyzer.tokenStream(NodeDocument.TEXT_FIELD, new StringReader(words));
        Token reusableToken = new Token();
        Token nextToken = null;

        //while ((nextToken = tokenStream.next(reusableToken)) != null) {
        //String term = nextToken.term();
        //if (term != null) {
        //wordSet.add(term);
        //}
        //}
    }
}
Project: THUTag    File: WordChineseTokenizer.java
@Override
public Token next() throws IOException {
    if (segbuf == null) {
        while (segbuf == null || segbuf.length == 0) {
            String line = bufreader.readLine();
            if (line == null) {
                return null;
            }
            segbuf = segmentor.segment(line);
        }
        currentSeg = 0;
    }

    Token t = new Token(segbuf[currentSeg], currentPos, currentPos + segbuf[currentSeg].length());
    currentPos += segbuf[currentSeg].length();
    currentSeg++;
    if (currentSeg >= segbuf.length)
        segbuf = null;

    return t;
}
Project: THUTag    File: WordChineseTokenizerTest.java
public void testGetToken() throws IOException {
    String content = "我们的生活\n很美好";
    String[] str = { "我们", "们的", "的生", "生活", "很美", "美好" };
    StringReader reader = new StringReader(content);
    WordSegment ws = new BigramWordSegment();
    WordChineseTokenizer tokenizer = new WordChineseTokenizer(ws, reader);
    LinkedList<Token> results = new LinkedList<Token>();
    Token t;
    while ((t = tokenizer.next()) != null) {
        results.add(t);
    }
    Assert.assertEquals(str.length, results.size());
    for (int i = 0; i < results.size(); i++) {
        Assert.assertEquals(str[i], results.get(i).termText());
    }
}
Project: SolrPlugins    File: DiceMultipleCaseSuggester.java
private List<LookupResult> getLookupResults(SpellingOptions options, Token currentToken) throws IOException {
    CharsRef scratch = new CharsRef();
    scratch.chars = currentToken.buffer();
    scratch.offset = 0;
    scratch.length = currentToken.length();
    boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) &&
            !(lookup instanceof WFSTCompletionLookup) &&
            !(lookup instanceof AnalyzingSuggester);

    List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
    if (suggestions == null || suggestions.size() == 0) {
        return null;
    }

    return suggestions;
}
Project: search    File: TestSlowSynonymFilter.java
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length)
    return false;
  else {
    clearAttributes();
    Token token = tokens[index++];
    termAtt.setEmpty().append(token);
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }
}
Project: search    File: DummyCustomParamSpellChecker.java
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {

  SpellingResult result = new SpellingResult();
  //just spit back out the results

  // sort the keys to make ordering predictable
  Iterator<String> iterator = options.customParams.getParameterNamesIterator();
  List<String> lst = new ArrayList<>();
  while (iterator.hasNext()) {
    lst.add(iterator.next());
  }
  Collections.sort(lst);

  int i = 0;
  for (String name : lst) {
    String value = options.customParams.get(name);
    result.add(new Token(name, i, i+1),  Collections.singletonList(value));
    i += 2;
  }    
  return result;
}
Project: search    File: TestTrimFilter.java
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length)
    return false;
  else {
    clearAttributes();
    Token token = tokens[index++];
    termAtt.setEmpty().append(token);
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }
}
Project: search    File: ShingleFilterTest.java
protected void shingleFilterTestCommon(ShingleFilter filter,
                                       Token[] tokensToCompare,
                                       int[] positionIncrements,
                                       String[] types)
  throws IOException {
  String text[] = new String[tokensToCompare.length];
  int startOffsets[] = new int[tokensToCompare.length];
  int endOffsets[] = new int[tokensToCompare.length];

  for (int i = 0; i < tokensToCompare.length; i++) {
    text[i] = new String(tokensToCompare[i].buffer(),0, tokensToCompare[i].length());
    startOffsets[i] = tokensToCompare[i].startOffset();
    endOffsets[i] = tokensToCompare[i].endOffset();
  }

  assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
Project: search    File: SpellingQueryConverterTest.java
@Test
public void testUnicode() {
  SpellingQueryConverter converter = new SpellingQueryConverter();
  converter.init(new NamedList());
  converter.setAnalyzer(new WhitespaceAnalyzer());

  // chinese text value
  Collection<Token> tokens = converter.convert("text_field:我购买了道具和服装。");
  assertTrue("tokens is null and it shouldn't be", tokens != null);
  assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());

  tokens = converter.convert("text_购field:我购买了道具和服装。");
  assertTrue("tokens is null and it shouldn't be", tokens != null);
  assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());

  tokens = converter.convert("text_field:我购xyz买了道具和服装。");
  assertTrue("tokens is null and it shouldn't be", tokens != null);
  assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
}
Project: search    File: TestIndexWriterExceptions.java
public void testLegalbutVeryLargePositions() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, 3);
  t1.setPositionIncrement(Integer.MAX_VALUE-500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef(new byte[] { 0x1 } ));
  }
  TokenStream overflowingTokenStream = new CannedTokenStream(
      new Token[] { t1 }
  );
  Field field = new TextField("foo", overflowingTokenStream);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
Project: search    File: TestPostingsOffsets.java
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE-500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE-500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(
      new Token[] { t1, t2 }
  );
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
Project: search    File: SpellingQueryConverter.java
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
Project: search    File: TestField.java
public void testTextFieldString() throws Exception {
  Field fields[] = new Field[] {
      new TextField("foo", "bar", Field.Store.NO),
      new TextField("foo", "bar", Field.Store.YES)
  };

  for (Field field : fields) {
    field.setBoost(5f);
    trySetByteValue(field);
    trySetBytesValue(field);
    trySetBytesRefValue(field);
    trySetDoubleValue(field);
    trySetIntValue(field);
    trySetFloatValue(field);
    trySetLongValue(field);
    trySetReaderValue(field);
    trySetShortValue(field);
    field.setStringValue("baz");
    field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

    assertEquals("baz", field.stringValue());
    assertEquals(5f, field.boost(), 0f);
  }
}
Project: search    File: TestField.java
public void testTextFieldReader() throws Exception {
  Field field = new TextField("foo", new StringReader("bar"));

  field.setBoost(5f);
  trySetByteValue(field);
  trySetBytesValue(field);
  trySetBytesRefValue(field);
  trySetDoubleValue(field);
  trySetIntValue(field);
  trySetFloatValue(field);
  trySetLongValue(field);
  field.setReaderValue(new StringReader("foobar"));
  trySetShortValue(field);
  trySetStringValue(field);
  field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

  assertNotNull(field.readerValue());
  assertEquals(5f, field.boost(), 0f);
}
Project: search    File: SpellingQueryConverterTest.java
@Test
public void testMultipleClauses() {
  SpellingQueryConverter converter = new SpellingQueryConverter();
  converter.init(new NamedList());
  converter.setAnalyzer(new WhitespaceAnalyzer());

  // two field:value pairs should give two tokens
  Collection<Token> tokens = converter.convert("买text_field:我购买了道具和服装。 field2:bar");
  assertTrue("tokens is null and it shouldn't be", tokens != null);
  assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());

  // a field:value pair and a search term should give two tokens
  tokens = converter.convert("text_field:我购买了道具和服装。 bar");
  assertTrue("tokens is null and it shouldn't be", tokens != null);
  assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
}
Project: elasticsearch_my    File: FlattenGraphTokenFilterFactoryTests.java
public void testBasic() throws IOException {

        Index index = new Index("test", "_na_");
        String name = "ngr";
        Settings indexSettings = newAnalysisSettingsBuilder().build();
        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
        Settings settings = newAnalysisSettingsBuilder().build();

        // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
        TokenStream in = new CannedTokenStream(0, 12, new Token[] {
                    token("wtf", 1, 5, 0, 3),
                    token("what", 0, 1, 0, 3),
                    token("wow", 0, 3, 0, 3),
                    token("the", 1, 1, 0, 3),
                    token("fudge", 1, 3, 0, 3),
                    token("that's", 1, 1, 0, 3),
                    token("funny", 1, 1, 0, 3),
                    token("happened", 1, 1, 4, 12)
                });

        TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);

        // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
        assertTokenStreamContents(tokens,
                new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
                new int[] {0, 0, 0, 0, 0, 0, 0, 4},
                new int[] {3, 3, 3, 3, 3, 3, 3, 12},
                new int[] {1, 0, 0, 1, 0, 1, 0, 1},
                new int[] {3, 1, 1, 1, 1, 1, 1, 1},
                12);
    }
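The test above uses a token(term, posInc, posLength, startOffset, endOffset) helper that is not shown in this snippet. A hypothetical reconstruction using only stock Token setters might look like this:

import org.apache.lucene.analysis.Token;

final class TokenHelperSketch {
    // Builds a Token with explicit position increment, position length and offsets,
    // matching the argument order used in the test above.
    static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
        Token t = new Token(term, startOffset, endOffset);
        t.setPositionIncrement(posInc);
        t.setPositionLength(posLength);
        return t;
    }
}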
Project: lams    File: SlowSynonymMap.java
/**
 * @param singleMatch  List<String>, the sequence of strings to match
 * @param replacement  List<Token> the list of tokens to use on a match
 * @param includeOrig  sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
 * @param mergeExisting merge the replacement tokens with any other mappings that exist
 */
public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
  SlowSynonymMap currMap = this;
  for (String str : singleMatch) {
    if (currMap.submap==null) {
      // for now hardcode at 4.0, as its what the old code did.
      // would be nice to fix, but shouldn't store a version in each submap!!!
      currMap.submap = new CharArrayMap<>(Version.LUCENE_CURRENT, 1, ignoreCase());
    }

    SlowSynonymMap map = currMap.submap.get(str);
    if (map==null) {
      map = new SlowSynonymMap();
      map.flags |= flags & IGNORE_CASE;
      currMap.submap.put(str, map);
    }

    currMap = map;
  }

  if (currMap.synonyms != null && !mergeExisting) {
    throw new IllegalArgumentException("SynonymFilter: there is already a mapping for " + singleMatch);
  }
  List<Token> superset = currMap.synonyms==null ? replacement :
        mergeTokens(Arrays.asList(currMap.synonyms), replacement);
  currMap.synonyms = superset.toArray(new Token[superset.size()]);
  if (includeOrig) currMap.flags |= INCLUDE_ORIG;
}
Project: lams    File: SlowSynonymMap.java
/** Produces a List<Token> from a List<String> */
public static List<Token> makeTokens(List<String> strings) {
  List<Token> ret = new ArrayList<>(strings.size());
  for (String str : strings) {
    //Token newTok = new Token(str,0,0,"SYNONYM");
    Token newTok = new Token(str, 0,0,"SYNONYM");
    ret.add(newTok);
  }
  return ret;
}
Project: lams    File: PrefixAwareTokenFilter.java
private void setCurrentToken(Token token) {
  if (token == null) return;
  clearAttributes();
  termAtt.copyBuffer(token.buffer(), 0, token.length());
  posIncrAtt.setPositionIncrement(token.getPositionIncrement());
  flagsAtt.setFlags(token.getFlags());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
  typeAtt.setType(token.type());
  payloadAtt.setPayload(token.getPayload());
}
Project: lams    File: PrefixAwareTokenFilter.java
private Token getNextPrefixInputToken(Token token) throws IOException {
  if (!prefix.incrementToken()) return null;
  token.copyBuffer(p_termAtt.buffer(), 0, p_termAtt.length());
  token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
  token.setFlags(p_flagsAtt.getFlags());
  token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
  token.setType(p_typeAtt.type());
  token.setPayload(p_payloadAtt.getPayload());
  return token;
}
Project: lams    File: PrefixAwareTokenFilter.java
private Token getNextSuffixInputToken(Token token) throws IOException {
  if (!suffix.incrementToken()) return null;
  token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
  token.setPositionIncrement(posIncrAtt.getPositionIncrement());
  token.setFlags(flagsAtt.getFlags());
  token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
  token.setType(typeAtt.type());
  token.setPayload(payloadAtt.getPayload());
  return token;
}
Project: lams    File: PrefixAndSuffixAwareTokenFilter.java
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
  super(suffix);
  prefix = new PrefixAwareTokenFilter(prefix, input) {
    @Override
    public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
      return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
    }
  };
  this.suffix = new PrefixAwareTokenFilter(prefix, suffix) {
    @Override
    public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
      return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken);
    }
  };
}
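A hedged usage sketch of the wrapper above: the resulting stream emits the prefix tokens, then the wrapped input, then the suffix tokens, with offsets adjusted through the update callbacks. The marker tokens and the surround() helper are purely illustrative, and the sketch assumes it sits in the same package as the two filter classes shown in this listing.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

final class PrefixSuffixUsageSketch {
    // Surrounds the given stream with a single marker token on each side.
    static TokenStream surround(TokenStream input) {
        TokenStream prefix = new SingleTokenTokenStream(new Token("_start_", 0, 0));
        TokenStream suffix = new SingleTokenTokenStream(new Token("_end_", 0, 0));
        return new PrefixAndSuffixAwareTokenFilter(prefix, input, suffix);
    }
}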
Project: lams    File: SingleTokenTokenStream.java
public SingleTokenTokenStream(Token token) {
  super(Token.TOKEN_ATTRIBUTE_FACTORY);

  assert token != null;
  this.singleToken = token.clone();

  tokenAtt = (AttributeImpl) addAttribute(CharTermAttribute.class);
  assert (tokenAtt instanceof Token);
}
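For context, the incrementToken() that pairs with this constructor is roughly the following sketch: it replays the cloned token once and then reports exhaustion. The exhausted field is assumed here, and the body may not match the shipped class line for line.

private boolean exhausted; // assumed field; a reset() method would set it back to false

@Override
public final boolean incrementToken() {
  if (exhausted) {
    return false;
  }
  clearAttributes();
  singleToken.copyTo(tokenAtt); // copies term, offsets, type, flags and payload in one call
  exhausted = true;
  return true;
}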
Project: fangorn    File: String2NodesParser.java
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    nodesPosition++;
    if (nodesPosition < nodes.size()) {
        reusableToken.clear();
        Node node = nodes.get(nodesPosition);
        reusableToken.setTermBuffer(node.name);
        reusableToken.setPayload(node.getPayload());
        return reusableToken;
    }
    return null;
}
Project: fangorn    File: TreebankSentenceTokenizer.java
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();
    if (spacePending) {
        return setReusableTokenFromLocal(reusableToken,
                processSpace(lastWordBeginPos));
    }
    int i = lastReadPosition;
    boolean closeFound = false;
    while (i < buffer.length) {
        char c = buffer[i];
        if ('(' == c) {
            if (encounteredOpen) {
                openPositions.add(cOpen.size() - 1);
            }
            cOpen.add(i);
            encounteredOpen = true;
        } else if (Character.isWhitespace(c)) {
            cSpace.add(i);
            lastWordBeginPos = cOpen.get(cOpen.size() - 1);
            spacePending = true;
            encounteredOpen = false;
        } else if (')' == c) {
            cClose.add(i);
            closeFound = true;
            encounteredOpen = false;
            break;
        }
        i++;
    }
    lastReadPosition = i;
    if (closeFound) {
        lastReadPosition++;
        return setReusableTokenFromLocal(reusableToken, processClose());
    }
    return null;
}
Project: fangorn    File: FastStringPerThreadTokenizer.java
@Override
public Token next(Token reusableToken) throws IOException {
    Token t = actualParser.next(reusableToken);
    if (t != null) return t;
    int readSoFar = 0;
    int read;
    do {
        read = input.read(ioBuffer);
        if (read > 0) {
            while (readSoFar + read > sent.length) {
                char[] oldSent = sent;
                sent = new char[sent.length + 512];
                System.arraycopy(oldSent, 0, sent, 0, readSoFar);
            }
            System.arraycopy(ioBuffer, 0, sent, readSoFar, read);
            readSoFar += read;
        }
    } while (read != -1);
    if (readSoFar == 0) {
        return null;
    }
    try {
        actualParser.reset(new String(sent, 0, readSoFar));
    } catch (ParseException e) {
        return null;
    }
    return actualParser.next(reusableToken);
}
Project: fangorn    File: JsonSentenceParserTest.java
public void testLong() {
    String jsonString = "{\"n\":\"S\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", 
\"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    assertNotNull(token);
    assertEquals("S", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(2, token.getPayload().byteAt(1));
    assertEquals(1, token.getPayload().byteAt(2));
    assertEquals(64, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(2, token.getPayload().byteAt(2));
    assertEquals(4, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NNP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(3, token.getPayload().byteAt(2));
    assertEquals(1, token.getPayload().byteAt(3));
}
Project: fangorn    File: JsonSentenceParserTest.java
public void testSentenceContainingEscapedDoubleQuotes() {
    String jsonString = "{\"n\":\"S\\\"\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", 
\"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    assertNotNull(token);
    assertEquals("S\"", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));
}
Project: fangorn    File: String2NodesParserTest.java
private void assertPayload(Token token, int right, int left, int depth,
        int parent) {
    Payload payload = token.getPayload();
    assertEquals(right, payload.byteAt(0));
    assertEquals(left, payload.byteAt(1));
    assertEquals(depth, payload.byteAt(2));
    assertEquals(parent, payload.byteAt(3));
}
Project: mmseg4j    File: CutLetterDigitFilter.java
public CutLetterDigitFilter(TokenStream input) {
    super(input);

    reusableToken = new Token();
    termAtt = (CharTermAttribute)addAttribute(CharTermAttribute.class);
    offsetAtt = (OffsetAttribute)addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute)addAttribute(TypeAttribute.class);
}
Project: mmseg4j    File: CutLetterDigitFilter.java
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    Token token = new Token(oriToken.buffer(), termBufferOffset, termBufferLength,
            oriToken.startOffset()+termBufferOffset, oriToken.startOffset()+termBufferOffset+termBufferLength);

    if(type == Character.DECIMAL_DIGIT_NUMBER) {
        token.setType(Word.TYPE_DIGIT);
    } else {
        token.setType(Word.TYPE_LETTER);
    }

    tokenQueue.offer(token);
}
Project: mmseg4j    File: CutLetterDigitFilter.java
public final boolean incrementToken() throws IOException {
    clearAttributes();
    Token token = nextToken(reusableToken);
    if(token != null) {
        termAtt.copyBuffer(token.buffer(), 0, token.length());
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        typeAtt.setType(token.type());
        return true;
    } else {
        end();
        return false;
    }
}
Project: mmseg4j    File: AnalyzerTest.java
public static List<String> toWords(String txt, Analyzer analyzer) {
    List<String> words = new ArrayList<String>();
    TokenStream ts;
    try {
        ts = analyzer.tokenStream("text", new StringReader(txt));
        for(Token t= new Token(); (t=TokenUtils.nextToken(ts, t)) !=null;) {
            words.add(t.toString());
        }
    } catch (IOException e) {}

    return words;
}
Project: bisis-v4    File: LatCyrFilter.java
public Token next() throws IOException {
  Token nextToken = input.next();
  if (nextToken == null)
    return null;
  String term = nextToken.termText();
  term =  LatCyrUtils.toLatinUnaccented(term);
  return new Token(term, 0, term.length());
}
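The snippet above uses the pre-2.9 next() API and discards the original offsets. A minimal attribute-based sketch of the same transliteration step for Lucene 4+ is shown below; LatCyrUtils is the project's own helper, the class name here is illustrative, and unlike the original this version keeps the token's offsets intact.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class LatCyrAttributeFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    LatCyrAttributeFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        // Replace the term text with its unaccented Latin transliteration.
        String term = LatCyrUtils.toLatinUnaccented(termAtt.toString());
        termAtt.setEmpty().append(term);
        return true;
    }
}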
Project: fastcatsearch3    File: TokenGroup.java
void addToken(float score) {
  if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
    int termStartOffset = offsetAtt.startOffset();
    int termEndOffset = offsetAtt.endOffset();
    if (numTokens == 0) {
      startOffset = matchStartOffset = termStartOffset;
      endOffset = matchEndOffset = termEndOffset;
      tot += score;
    } else {
      startOffset = Math.min(startOffset, termStartOffset);
      endOffset = Math.max(endOffset, termEndOffset);
      if (score > 0) {
        if (tot == 0) {
          matchStartOffset = offsetAtt.startOffset();
          matchEndOffset = offsetAtt.endOffset();
        } else {
          matchStartOffset = Math.min(matchStartOffset, termStartOffset);
          matchEndOffset = Math.max(matchEndOffset, termEndOffset);
        }
        tot += score;
      }
    }
    Token token = new Token(termStartOffset, termEndOffset);
    token.setEmpty().append(termAtt);
    tokens[numTokens] = token;
    scores[numTokens] = score;
    numTokens++;
  }
}