Java class org.apache.lucene.analysis.tokenattributes.TermAttribute code examples

Project: hadoop-distributed-crawler    File: URLAnalyzer.java
/**
 * Tokenizes a piece of text and records each token and its position in urlInfo.
 * @param text the text to tokenize
 */
private void segment(String text) {
    IKAnalyzer analyzer = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    TokenStream tokenStream = analyzer.tokenStream("*", reader);
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);

    try {
        while (tokenStream.incrementToken()) {
            location++;
            String term = termAtt.term();
            urlInfo.putURLLocation(term, location);
        }
    }
    catch(IOException exp) {
        exp.printStackTrace();
    }
}
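
Note: TermAttribute was deprecated in Lucene 3.1 and removed in 4.0 in favor of CharTermAttribute. A minimal sketch of the same loop against the newer attribute (reusing analyzer, text, location, and urlInfo from above):

TokenStream tokenStream = analyzer.tokenStream("*", new StringReader(text));
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();                       // required before incrementToken() in newer APIs
while (tokenStream.incrementToken()) {
    location++;
    urlInfo.putURLLocation(termAtt.toString(), location);  // toString() replaces term()
}
tokenStream.end();
tokenStream.close();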
Project: olat    File: SearchInputController.java
protected Set<String> getHighlightWords(final String searchString) {
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        for (boolean next = stream.incrementToken(); next; next = stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebugEnabled()) {
                log.debug(term);
            }
        }
    } catch (final IOException e) {
        log.error("", e);
    }
    return null;
}
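
As excerpted, the method only logs each term and always returns null. A hedged sketch of collecting the analyzed terms instead, using the same analyzer setup as above:

final Set<String> words = new HashSet<String>();
final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
while (stream.incrementToken()) {
    words.add(termAtt.term());
}
return words;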
Project: MFIBlocking    File: WordProcessor.java
public List<String> removeStopwordsAndSpecialChars(String value){
    List<String> retVal = new ArrayList<String>();
    value = value.replaceAll(replaceExpr, "");

    StringReader sr = new StringReader(value);
    // The field name argument is unused by most analyzers; the text itself is passed here.
    TokenStream ts = analyzer.tokenStream(value, sr);
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            retVal.add(termAtt.term());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return retVal;
}
Project: align-api-project    File: CommonWords.java
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while ( ts.incrementToken() ) {
            s.add( termAtt.term() );
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
    // Equivalent loop against the pre-2.9 Token API:
    Token token;
    try {
        while ((token = ts.next()) != null) {
            s.add(token.termText());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    */
    map.put(e, s);
}
Project: t4f-data    File: SynonymAnalyzerTest.java
public void testJumps() throws Exception {
  TokenStream stream =
    synonymAnalyzer.tokenStream("contents",                   // #A
                                new StringReader("jumps"));   // #A
  TermAttribute term = stream.addAttribute(TermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

  int i = 0;
  String[] expected = new String[]{"jumps",              // #B
                                   "hops",               // #B
                                   "leaps"};             // #B
  while(stream.incrementToken()) {
    assertEquals(expected[i], term.term());

    int expectedPos;      // #C
    if (i == 0) {         // #C
      expectedPos = 1;    // #C
    } else {              // #C
      expectedPos = 0;    // #C
    }                     // #C
    assertEquals(expectedPos,                      // #C
                 posIncr.getPositionIncrement());  // #C
    i++;
  }
  assertEquals(3, i);
}
Project: airsonic    File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
Project: subsonic    File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
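
Since neither snippet applies a LowerCaseFilter, token case is preserved; ASCIIFoldingFilter only folds accented characters to their ASCII equivalents. A standalone sketch of the expected behavior, assuming Version.LUCENE_30 stands in for LUCENE_VERSION:

ASCIIFoldingFilter filter = new ASCIIFoldingFilter(
        new StandardTokenizer(Version.LUCENE_30, new StringReader("Déjà Vu")));
TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
StringBuilder result = new StringBuilder();
while (filter.incrementToken()) {
    result.append(termAttribute.term()).append("* ");
}
System.out.println(result);   // expected: "Deja* Vu* "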
Project: lesk-wsd-dsm    File: RevisedLesk.java
/**
 * Builds a bag of words from the given text, counting term frequencies.
 *
 * @param text the text to analyze
 * @return a map from each term to its frequency in the text
 * @throws IOException if tokenization fails
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    TermAttribute token = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}
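
For example, with StandardAnalyzer's default English stop set and stemming disabled, buildBag("the cat sat on the cat mat") should return {cat=2.0, sat=1.0, mat=1.0}: "the" and "on" are removed as stopwords and tokens are lower-cased before counting.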
Project: align-api-project    File: JWNLDistances.java
/**
    * Takes a gloss-like string (text) and returns it tokenized, with:
    * - stopword removal
    * - lower-casing
    * - Porter stemming
    */
   protected Set<String> tokenizeGloss( String s ) throws IOException {
       Set<String> result = new HashSet<String>();
       // I am afraid that I am reimplementing the StandardAnalyzer...
       TokenStream ts = new PorterStemFilter(
                            new StopFilter( true,
                                            new LowerCaseTokenizer(
                                                new StringReader( s ) ), stopWords, true ));
       TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
       while ( ts.incrementToken() ) {
           result.add( termAtt.term() );
       }
       return result;
   }
Project: align-api-project    File: VectorSpaceMeasure.java
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while ( tokenS.incrementToken() ) {
            words.add( termAtt.term() );
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Project: t4f-data    File: SynonymFilter.java
public SynonymFilter(TokenStream in, SynonymEngine engine) {
  super(in);
  synonymStack = new Stack<String>();                     //#1 
  this.engine = engine;

  this.termAtt = addAttribute(TermAttribute.class);
  this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
}
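
A hedged sketch of the matching incrementToken(), following the synonym-stack pattern this constructor sets up. Here current is an assumed AttributeSource.State field, and getSynonyms(...) is assumed from the SynonymEngine interface passed in:

public boolean incrementToken() throws IOException {
    if (synonymStack.size() > 0) {                 // emit buffered synonyms first
        String syn = synonymStack.pop();
        restoreState(current);                     // same offsets as the original token
        termAtt.setTermBuffer(syn);
        posIncrAtt.setPositionIncrement(0);        // synonym stacks onto the same position
        return true;
    }
    if (!input.incrementToken()) {                 // no more source tokens
        return false;
    }
    String[] synonyms = engine.getSynonyms(termAtt.term());
    if (synonyms != null) {
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        current = captureState();                  // remember state for the queued synonyms
    }
    return true;
}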
Project: t4f-data    File: ChineseDemo.java
private static void analyze(String string, Analyzer analyzer)
       throws IOException {
  StringBuffer buffer = new StringBuffer();

  TokenStream stream = analyzer.tokenStream("contents",
                                            new StringReader(string));
  TermAttribute term = stream.addAttribute(TermAttribute.class);

  while(stream.incrementToken()) {   //C
    buffer.append("[");
    buffer.append(term.term());
    buffer.append("] ");
  }

  String output = buffer.toString();

  Frame f = new Frame();
  f.setTitle(analyzer.getClass().getSimpleName() + " : " + string);
  f.setResizable(true);

  Font font = new Font(null, Font.PLAIN, 36);
  int width = getWidth(f.getFontMetrics(font), output);

  f.setSize((width < 250) ? 250 : width + 50, 75);

  // NOTE: if Label doesn't render the Chinese characters
  // properly, try using javax.swing.JLabel instead
  Label label = new Label(output);   //D
  label.setSize(width, 75);
  label.setAlignment(Label.CENTER);
  label.setFont(font);
  f.add(label);

  f.setVisible(true);
}
Project: SynonymAnalyzer    File: MySynonymFilter.java
protected MySynonymFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(TermAttribute.class);

    synonymMap.put("lucene", "information retrieval");
    synonymMap.put("c#", "csharp");
}
Project: SynonymAnalyzer    File: LuceneTest.java
public static void testStandardAnalyzer() throws Exception {

    System.out.println("Standard Analyzer");

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream ts = analyzer.tokenStream("Field", new StringReader("The quick brown fox jumps over lazy dog"));

    ts.reset();
    while (ts.incrementToken()) {
        //System.out.println("token: " +  ts.toString());
        System.out.println("token: " +  ts.getAttribute(TermAttribute.class).term());
    }
    ts.close();
}
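
With Lucene 3.0's default English stop set, the printed tokens should be the lower-cased quick, brown, fox, jumps, over, lazy, and dog: "The" is removed as a stopword, while "over" is not in the default stop set.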
Project: SynonymAnalyzer    File: LuceneTest.java
public static void testSynonymAnalyzer() throws Exception {

    Analyzer analyzer = new SynonymAnalyzer();
    TokenStream ts = analyzer.tokenStream("Address", new StringReader("Expertise in C# and Lucene"));

    ts.reset();
    while (ts.incrementToken()) {
        //System.out.println("token: " +  ts.toString());
        System.out.println("token: " +  ts.getAttribute(TermAttribute.class).term());
    }
    ts.close();
}
Project: t4f-data    File: BulletinPayloadsFilter.java
BulletinPayloadsFilter(TokenStream in, float warningBoost) {
  super(in);
  payloadAttr = addAttribute(PayloadAttribute.class);
  termAtt = addAttribute(TermAttribute.class);
  boostPayload = new Payload(PayloadHelper.encodeFloat(warningBoost));
}
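
A hedged sketch of a matching incrementToken(), consistent with this constructor: attach the boost payload to marked tokens, guarded here by an assumed isBulletin flag field on the filter:

public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    if (isBulletin && termAtt.term().equals("bulletin")) {
        payloadAttr.setPayload(boostPayload);   // attach the warning boost
    } else {
        payloadAttr.setPayload(null);           // no payload for ordinary tokens
    }
    return true;
}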
Project: t4f-data    File: PositionalStopFilter.java
public PositionalStopFilter(TokenStream in, CharArraySet stopWords) {
  super(in);
  this.stopWords = stopWords;
  posIncrAttr = addAttribute(PositionIncrementAttribute.class);
  termAttr = addAttribute(TermAttribute.class);
}
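
A hedged sketch of the corresponding incrementToken(): the position increments of skipped stop words are folded into the next kept token, preserving gaps for phrase queries:

public boolean incrementToken() throws IOException {
    int increment = 0;
    while (input.incrementToken()) {
        if (!stopWords.contains(termAttr.termBuffer(), 0, termAttr.termLength())) {
            posIncrAttr.setPositionIncrement(posIncrAttr.getPositionIncrement() + increment);
            return true;                          // keep this token, with the accumulated gap
        }
        increment += posIncrAttr.getPositionIncrement();  // remember the skipped position
    }
    return false;
}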