Java class org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute: example source code
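The examples below are collected from open-source projects. PositionLengthAttribute (added in Lucene 3.6.0) records how many positions a token spans in the token graph: getPositionLength() defaults to 1 and only exceeds 1 for tokens, such as multi-word synonyms, that stretch across several positions.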

Project: lucenelab    File: SynonymFilterExample.java
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
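The addSynonym helper is not part of this listing. A minimal sketch, assuming Lucene's SynonymMap.Builder API with org.apache.lucene.util.CharsRef and CharsRefBuilder in scope (the method name and signature simply mirror the calls above):

private static void addSynonym(String phrase, String synonym, SynonymMap.Builder builder) {
    // join() concatenates the words with SynonymMap.WORD_SEPARATOR so that
    // multi-word inputs like "dark sea green" match as a single rule.
    CharsRefBuilder scratch = new CharsRefBuilder();
    builder.add(SynonymMap.Builder.join(phrase.split(" "), scratch),
            new CharsRef(synonym), true); // true: keep the original tokens
}

With the originals kept, the injected "color" tokens show up alongside the input words, and their position length spans the whole phrase they match.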
Project: elasticsearch_my    File: TransportAnalyzeAction.java
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                    lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));

            }
            stream.end();
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();

            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
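Note how lastPosition and lastOffset are threaded across the values in request.text(): after each value the method adds the final position increment reported after end(), plus the analyzer's positionIncrementGap and offsetGap, mirroring how Lucene separates the values of a multi-valued field.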
Project: elasticsearch_my    File: TransportAnalyzeAction.java
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PositionLengthAttribute posLen = stream.addAttribute(PositionLengthAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), extractExtendedAttributes(stream, includeAttributes)));

        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);

    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
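This instance variant performs the same loop as simpleAnalyze above, but additionally captures any extra token attributes requested by the caller via extractExtendedAttributes (a helper not shown in this listing) and closes the stream defensively with IOUtils.closeWhileHandlingException.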
Project: lams    File: Lucene43NGramTokenFilter.java
/**
 * Creates Lucene43NGramTokenFilter with given min and max n-grams.
 * @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public Lucene43NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
  super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
  this.charUtils = CharacterUtils.getJava4Instance();
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.minGram = minGram;
  this.maxGram = maxGram;

  posIncAtt = new PositionIncrementAttribute() {
    @Override
    public void setPositionIncrement(int positionIncrement) {}
    @Override
    public int getPositionIncrement() {
      return 0;
    }
  };
  posLenAtt = new PositionLengthAttribute() {
    @Override
    public void setPositionLength(int positionLength) {}
    @Override
    public int getPositionLength() {
      return 0;
    }
  };
}
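The anonymous attribute subclasses are deliberate no-ops: this back-compatibility filter reproduces the old (Lucene 4.3) n-gram semantics, which did not track position increments or lengths, so setter calls are swallowed and both getters report 0. A hypothetical use, wrapping an existing TokenStream named input:

// Sketch: emit 2- to 3-grams with the pre-4.4 position semantics.
TokenStream grams = new Lucene43NGramTokenFilter(input, 2, 3);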
Project: mecab-ko-lucene-analyzer    File: MeCabKoTokenizer.java
private void setAttributes() {
  charTermAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  posLenAtt = addAttribute(PositionLengthAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  posAtt = addAttribute(PartOfSpeechAttribute.class);
  semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}
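PartOfSpeechAttribute and SemanticClassAttribute are custom attributes defined by the mecab-ko project itself; addAttribute resolves them through the same AttributeSource mechanism as the built-in Lucene attributes.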
Project: solr-analyzers    File: AbstractTokenFilterTest.java
protected Matcher<TokenStream> positionLength(final int expectedLength) {
   return new TypeSafeMatcher<TokenStream>() {
      @Override
      public void describeTo(Description description) {
         description.appendText("positionLength=").appendValue(expectedLength);
      }

      @Override
      protected boolean matchesSafely(TokenStream stream) {
         PositionLengthAttribute attr = stream.addAttribute(PositionLengthAttribute.class);
         return attr.getPositionLength() == expectedLength;
      }
   };
}
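A usage sketch, assuming a JUnit test with Hamcrest's assertThat in scope and a stream that has already been reset:

// Hypothetical assertion: the current token should span two positions.
assertTrue(stream.incrementToken());
assertThat(stream, positionLength(2));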
Project: search    File: TokenStreamToDot.java
/** If inputText is non-null, and the TokenStream has
 *  offsets, we include the surface form in each arc's
 *  label. */
public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
  this.in = in;
  this.out = out;
  this.inputText = inputText;
  termAtt = in.addAttribute(CharTermAttribute.class);
  posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  if (in.hasAttribute(OffsetAttribute.class)) {
    offsetAtt = in.addAttribute(OffsetAttribute.class);
  } else {
    offsetAtt = null;
  }
}
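A sketch of typical use, assuming the toDot() method of Lucene's test-framework version of this class, which drives the stream and writes the DOT graph:

// Hypothetical usage: render a token stream as a Graphviz DOT graph on stdout.
PrintWriter pw = new PrintWriter(System.out);
new TokenStreamToDot(text, tokenStream, pw).toDot();
pw.flush();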
Project: cc-analysis    File: AnalysisTestHelper.java
public static void printResultOfTokenStream(PrintStream out, TokenStream ts) throws IOException {
    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAttr = ts.getAttribute(TypeAttribute.class);
    OffsetAttribute offAttr = ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAttr = ts.getAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAttr = ts.getAttribute(PositionLengthAttribute.class);
    ts.reset();
    Table<String, String, String> contentTable = Tables.newCustomTable(new LinkedHashMap<String, Map<String, String>>(),
            new Supplier<Map<String, String>>() {
                @Override
                public Map<String, String> get() {
                    return Maps.newLinkedHashMap();
                }
            });
    int lineNo = 1;
    int pos = 0;
    while (ts.incrementToken()) {
        String lineId = lineNo + ".";
        contentTable.put(lineId, "term", termAttr.toString());
        contentTable.put(lineId, "type", typeAttr.type());
        contentTable.put(lineId, "startOffset", offAttr.startOffset() + "");
        contentTable.put(lineId, "endOffset", offAttr.endOffset() + "");
        contentTable.put(lineId, "posInc", posIncAttr.getPositionIncrement() + "");
        contentTable.put(lineId, "posLen", posLenAttr.getPositionLength() + "");
        pos += posIncAttr.getPositionIncrement();
        contentTable.put(lineId, "pos", pos + "");

        lineNo++;
    }
    printTable(out, contentTable);
}
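Note that the helper calls ts.reset() itself but fetches attributes with getAttribute, which throws if an attribute is missing, so the supplied stream must already expose all five attributes. A hypothetical call (StandardAnalyzer chosen for illustration; its default attribute factory backs all five interfaces with one packed implementation):

try (TokenStream ts = new StandardAnalyzer().tokenStream("field", "some text")) {
    AnalysisTestHelper.printResultOfTokenStream(System.out, ts);
}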
Project: search    File: NGramTokenizerTest.java
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // convert the string to code points
  final int[] codePoints = toCodePoints(s);
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  final TokenStream grams = new NGramTokenizer(Version.LATEST, new StringReader(s), minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  for (int start = 0; start < codePoints.length; ++start) {
    nextGram:
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
        // not on an edge
        continue nextGram;
      }
      for (int j = start; j < end; ++j) {
        if (!isTokenChar(nonTokenChars, codePoints[j])) {
          continue nextGram;
        }
      }
      assertTrue(grams.incrementToken());
      assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}
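Since a tokenizer emits exactly one token per position, the test expects posInc == 1 and posLen == 1 for every gram; values other than 1 arise only from graph-producing filters such as SynonymFilter.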
Project: information-retrieval-adventure    File: EntradaSalida.java
/**
 * Now the graph is more interesting! For each token (arc), the PositionIncrementAttribute tells
 * us how many positions (nodes) ahead this arc starts from, while the new (as of 3.6.0)
 * PositionLengthAttribute tells us how many positions (nodes) ahead the arc arrives.
 */
private static String getGraph(String input) throws IOException {
  final Tokenizer inputStream = new WhitespaceTokenizer();
  inputStream.setReader(new StringReader(input));
  //        final TokenStream inputStream = new LowerCaseFilter(in);

  TokenStream tokenStream = new SynonymGraphFilter(inputStream, builder.build(), false);
  PositionIncrementAttribute posIncAtt =
      tokenStream.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLenAtt = tokenStream.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  tokenStream.reset();
  int srcNode = -1;
  int destNode;

  StringBuilder b = new StringBuilder();
  b.append("digraph Automaton {\n");
  b.append("  initial [shape=plaintext,label=\"\"]\n");
  b.append("  initial -> 0\n");

  while (tokenStream.incrementToken()) {
    int posInc = posIncAtt.getPositionIncrement();
    if (posInc != 0) {
      srcNode += posInc;
      b.append("  ");
      b.append(srcNode);
      b.append(" [shape=circle,label=\"" + srcNode + "\"]\n");
    }
    destNode = srcNode + posLenAtt.getPositionLength();
    b.append("  ");
    b.append(srcNode);
    b.append(" -> ");
    b.append(destNode);
    b.append(" [label=\"");
    b.append(termAtt);
    b.append("\"");
    b.append("]\n");
  }
  tokenStream.end();
  tokenStream.close();

  b.append('}');
  return b.toString();
}
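getGraph references a SynonymMap.Builder field named builder that lies outside this snippet. A minimal sketch of such a field, with invented synonym data for illustration:

// Hypothetical field backing getGraph(): maps the phrase "fast food"
// onto the single token "junk" while keeping the original terms.
private static final SynonymMap.Builder builder = new SynonymMap.Builder(true);
static {
    CharsRefBuilder scratch = new CharsRefBuilder();
    builder.add(SynonymMap.Builder.join(new String[] {"fast", "food"}, scratch),
            new CharsRef("junk"), true);
}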