Example source code for the Java class org.apache.lucene.analysis.core.LowerCaseTokenizer
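Before the per-project examples, here is a minimal, self-contained usage sketch that is not taken from any of the projects below. It assumes a Lucene release (roughly 5.x to 7.x) in which LowerCaseTokenizer still exists, has a no-argument constructor, and receives its input via setReader(). The tokenizer splits on non-letter characters and lower-cases each term.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LowerCaseTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // Splits at non-letter characters and lower-cases every token.
        Tokenizer tokenizer = new LowerCaseTokenizer();
        tokenizer.setReader(new StringReader("Fischen Trinken FOO"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);

        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());   // fischen, trinken, foo
        }
        tokenizer.end();
        tokenizer.close();
    }
}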

Project: spimedb    File: Crawl.java
public static void url(String id, URL u, String url_in, SpimeDB db, float pri) {

    DObject p = db.get(id);
    Long whenCached = p != null ? p.get("url_cached") : null;
    try {
        if (whenCached == null || whenCached < u.openConnection().getLastModified()) {
            String urlString = u.toString();
            Set<String> keywords = parseKeywords(new LowerCaseTokenizer(), urlString);

            MutableNObject n = new MutableNObject(id)
                    .withTags(keywords.toArray(new String[keywords.size()]))
                    .put("url_in", url_in)
                    .put("url", urlString);

            //logger.info("crawl {}", n);

            db.addAsync(pri, n);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
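The parseKeywords helper is not included in the snippet above. The following is a hypothetical sketch of what such a helper could look like (name, signature, and behaviour are assumptions, and it presumes the usual java.util and Lucene imports): it runs the string through the supplied tokenizer and collects the resulting lower-cased terms.

static Set<String> parseKeywords(Tokenizer tokenizer, String text) throws IOException {
    // Hypothetical helper, shown only for illustration; not from the spimedb source.
    Set<String> keywords = new HashSet<>();
    tokenizer.setReader(new StringReader(text));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        keywords.add(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
    return keywords;
}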
Project: search    File: TestGermanAnalyzer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("fischen");
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseTokenizer(new StringReader(
          "Fischen Trinken")), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
Project: search    File: TestBrazilianStemmer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("Brasília");
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseTokenizer(new StringReader(
          "Brasília Brasilia")), set));
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
Project: search    File: TestCharTokenizers.java
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // create random input
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\ud801\udc1cabc");
    if((i % 10) == 0)
      builder.append(" ");
  }
  // internal buffer size is 1024 make sure we have a surrogate pair right at the border
  builder.insert(1023, "\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
Project: search    File: TestCharTokenizers.java
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1+i; j++) {
      builder.append("a");
    }
    builder.append("\ud801\udc1cabc");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
  }
}
Project: search    File: TestCharTokenizers.java
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: search    File: TestCharTokenizers.java
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: NYBC    File: TestGermanAnalyzer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("fischen");
  GermanStemFilter filter = new GermanStemFilter(
      new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader( 
          "Fischen Trinken")), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
Project: NYBC    File: TestBrazilianStemmer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("Brasília");
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new KeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
          "Brasília Brasilia")), set));
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
Project: NYBC    File: TestCharTokenizers.java
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // create random input
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\ud801\udc1cabc");
    if((i % 10) == 0)
      builder.append(" ");
  }
  // internal buffer size is 1024 make sure we have a surrogate pair right at the border
  builder.insert(1023, "\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
Project: NYBC    File: TestCharTokenizers.java
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1+i; j++) {
      builder.append("a");
    }
    builder.append("\ud801\udc1cabc");
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
  }
}
Project: NYBC    File: TestCharTokenizers.java
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: NYBC    File: TestCharTokenizers.java
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: Maskana-Gestor-de-Conocimiento    File: TestGermanAnalyzer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("fischen");
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader( 
          "Fischen Trinken")), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
Project: Maskana-Gestor-de-Conocimiento    File: TestBrazilianStemmer.java
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
  set.add("Brasília");
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
          "Brasília Brasilia")), set));
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // create random input
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\ud801\udc1cabc");
    if((i % 10) == 0)
      builder.append(" ");
  }
  // internal buffer size is 1024 make sure we have a surrogate pair right at the border
  builder.insert(1023, "\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1+i; j++) {
      builder.append("a");
    }
    builder.append("\ud801\udc1cabc");
    Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
  }
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("\ud801\udc1c");
  Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Project: anycook-api    File: NGramAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new LowerCaseTokenizer();
    TokenStream filter = new NGramTokenFilter(tokenizer, 1, 5);

    return new TokenStreamComponents(tokenizer, filter);
}
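A hedged usage sketch for an analyzer built this way, assuming the surrounding class is named NGramAnalyzer, extends org.apache.lucene.analysis.Analyzer, and the usual Lucene imports are present: every letter run is lower-cased and then expanded into 1- to 5-character n-grams.

// Hypothetical consumer, shown only for illustration.
static void printGrams(String text) throws IOException {
    try (Analyzer analyzer = new NGramAnalyzer();
         TokenStream ts = analyzer.tokenStream("name", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());   // 1- to 5-character n-grams of each lower-cased word
        }
        ts.end();
    }
}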
Project: elasticsearch_my    File: LowerCaseTokenizerFactory.java
@Override
public Tokenizer create() {
    return new LowerCaseTokenizer();
}
Project: Elasticsearch    File: LowerCaseTokenizerFactory.java
@Override
public Tokenizer create() {
    return new LowerCaseTokenizer();
}
Project: mgraph-summarization    File: TextAnalyser.java
public static List<String> getNgrams(String text, int N) throws IOException {

    List<String> tokens = new ArrayList<String>();


    Reader reader = new StringReader(text);
    // Tokenizer
    //StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_46, reader);

    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader);

    // Filters
    LowerCaseFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer); 
    KStemFilter kStemFilter = new KStemFilter(lowerCaseFilter);

    CharArraySet stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    StopFilter stopFilter = new StopFilter(Version.LUCENE_46, kStemFilter, stopwords);

    TokenStream ts;
    if(N > 1) {

        PositionFilter positionFilter = new PositionFilter(stopFilter);

        //@SuppressWarnings("resource")
        //ShingleFilter shingleFilter = new ShingleFilter(positionFilter, N, N);
        //shingleFilter.setOutputUnigrams(false);

        @SuppressWarnings("resource")
        ShingleFilter shingleFilter = new ShingleFilter(positionFilter, 2, N);
        shingleFilter.setOutputUnigrams(true);

        ts = shingleFilter;
    }
    else {
        ts = stopFilter;
    }

    CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
        String token = charTermAtt.toString();
        if (token.length() > 1)
            tokens.add(token);
    }
    ts.end();
    ts.close();

    return tokens;
}
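A hypothetical call to the helper above (placed in the same TextAnalyser class; the input text and printed output are illustrative only): with N = 2 it returns KStem-stemmed, stop-word-filtered unigrams plus shingles of up to two terms.

public static void main(String[] args) throws IOException {
    // Illustrative call, not from the mgraph-summarization source.
    List<String> grams = getNgrams("the quick brown fox jumps over the lazy dog", 2);
    System.out.println(grams);   // e.g. [quick, quick brown, brown, brown fox, fox, ...]
}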
Project: semanticvectors    File: PorterAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String s) {
  Tokenizer source = new LowerCaseTokenizer();
  return new TokenStreamComponents(source, new PorterStemFilter(source));
}
Project: NYBC    File: LowerCaseTokenizerFactory.java
@Override
public LowerCaseTokenizer create(Reader input) {
  return new LowerCaseTokenizer(luceneMatchVersion, input);
}
Project: AGDISTIS    File: LiteralAnalyzer.java
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
    return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));

}
Project: semanticvectors-googlecode    File: PorterAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new LowerCaseTokenizer(LUCENE_VERSION, reader);
  return new TokenStreamComponents(source, new PorterStemFilter(source));
}
Project: t4f-data    File: SimpleAnalyzer.java
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseTokenizer(reader);
}
Project: t4f-data    File: PositionalPorterStopAnalyzer.java
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    StopFilter stopFilter = new StopFilter(true, new LowerCaseTokenizer(reader), stopWords);
    stopFilter.setEnablePositionIncrements(true);
    return new PorterStemFilter(stopFilter);
}