Java 类org.apache.lucene.search.FuzzyTermsEnum 实例源码

项目:DoSeR-Disambiguation    文件:LearnToRankFuzzyQuery.java   
/**
 * Chooses the terms enumerator used to expand this query against {@code terms}.
 *
 * @param terms the terms to enumerate
 * @param atts  attribute source handed to the fuzzy enumerator
 * @return a {@link SingleTermsEnum} over the exact term bytes when
 *         {@code maxEdits} is 0 or {@code prefixLength} is at least the length
 *         of the term text; otherwise a {@link FuzzyTermsEnum} built from this
 *         query's term, {@code maxEdits}, {@code prefixLength} and
 *         {@code transpositions}
 * @throws IOException declared by this method; may be raised while creating the enumerator
 */
@Override
protected TermsEnum getTermsEnum(final Terms terms,
        final AttributeSource atts) throws IOException {
    // With no edits allowed, or a required prefix that covers the whole
    // term, only the exact term can match.
    final boolean exactMatchOnly =
            (maxEdits == 0) || (prefixLength >= term.text().length());

    if (!exactMatchOnly) {
        return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits,
                prefixLength, transpositions);
    }
    return new SingleTermsEnum(terms.iterator(null), term.bytes());
}
项目:DoSeR    文件:LearnToRankFuzzyQuery.java   
/**
 * Returns the enumerator that walks the terms matching this query.
 *
 * <p>A {@link FuzzyTermsEnum} is used only when at least one edit is allowed
 * and the required prefix is shorter than the term text. In every other case
 * the match can only be exact, so a {@link SingleTermsEnum} over the term's
 * bytes is returned instead.
 *
 * @param terms the terms to enumerate
 * @param atts  attribute source handed to the fuzzy enumerator
 * @return the enumerator described above
 * @throws IOException declared by this method; may be raised while creating the enumerator
 */
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts)
        throws IOException {
    if (maxEdits != 0 && prefixLength < term.text().length()) {
        // Genuinely fuzzy: edits are allowed beyond the fixed prefix.
        return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits,
                prefixLength, transpositions);
    }
    // Can only match if it's exact.
    return new SingleTermsEnum(terms.iterator(null), term.bytes());
}
项目:DoSeR    文件:LearnToRankFuzzyQuery.java   
/**
 * Builds the terms enumerator for this query.
 *
 * @param terms the terms to enumerate
 * @param atts  attribute source passed through to {@link FuzzyTermsEnum}
 * @return a {@link SingleTermsEnum} for the exact term when no fuzzy matching
 *         is possible ({@code maxEdits == 0}, or {@code prefixLength} not
 *         smaller than the term text length); a {@link FuzzyTermsEnum}
 *         otherwise
 * @throws IOException declared by this method; may be raised while creating the enumerator
 */
@Override
protected TermsEnum getTermsEnum(final Terms terms,
        final AttributeSource atts) throws IOException {
    final boolean noEditsAllowed = (maxEdits == 0);

    // Short-circuit keeps the term text from being read when no edits are allowed.
    if (noEditsAllowed || (prefixLength >= term.text().length())) {
        // Only an exact match is possible here.
        return new SingleTermsEnum(terms.iterator(null), term.bytes());
    }

    return new FuzzyTermsEnum(
            terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
项目:search    文件:DirectSpellChecker.java   
/**
 * Collects candidate spelling corrections for {@code term} by enumerating
 * fuzzy matches from the term's field in {@code ir}.
 *
 * <p>At most {@code numSug} candidates are retained in a priority queue; once
 * the queue is full, its head's boost is published through the
 * {@link MaxNonCompetitiveBoostAttribute} shared with the enumerator.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to retain
 * @param ir the index reader the candidate terms are read from
 * @param docfreq document-frequency cut-off; candidates whose document
 *        frequency is less than or equal to this value are skipped
 * @param editDistance the maximum edit distance passed to the fuzzy enumerator
 * @param accuracy the minimum score a candidate needs in order to be retained
 * @param spare scratch buffer used to decode candidate bytes into a string
 *        when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of retained {@code ScoreTerm}s, or an empty list
 *         when the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRefBuilder spare) throws IOException {
  final AttributeSource sharedAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute competitiveFloor =
      sharedAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum fuzzyEnum =
      new FuzzyTermsEnum(fieldTerms, sharedAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> retained = new PriorityQueue<>();
  final BytesRef originalBytes = new BytesRef(term.text());
  final BoostAttribute boostAttribute =
      fuzzyEnum.attributes().addAttribute(BoostAttribute.class);

  // Reusable entry: refilled per accepted candidate, recycled from the queue on overflow.
  ScoreTerm slot = new ScoreTerm();

  for (BytesRef candidate = fuzzyEnum.next(); candidate != null; candidate = fuzzyEnum.next()) {
    final float boost = boostAttribute.getBoost();

    // Queue already full and this hit is no better than its head: not competitive.
    if (retained.size() >= numSug && boost <= retained.peek().boost) {
      continue;
    }

    // The query term itself is not a correction.
    if (originalBytes.bytesEquals(candidate)) {
      continue;
    }

    final int candidateDocFreq = fuzzyEnum.docFreq();
    if (candidateDocFreq <= docfreq) {
      continue;
    }

    String candidateText = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      spare.copyUTF8Bytes(candidate);
      candidateText = spare.toString();
      score = distance.getDistance(term.text(), candidateText);
    } else {
      // String creation is deferred (text stays null); undo the enum's
      // scale factor to recover a real scaled Levenshtein score.
      score = boost / fuzzyEnum.getScaleFactor() + fuzzyEnum.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Fill the entry completely before offering it; the queue orders on its fields.
    slot.term = BytesRef.deepCopyOf(candidate);
    slot.boost = boost;
    slot.docfreq = candidateDocFreq;
    slot.termAsString = candidateText;
    slot.score = score;
    retained.offer(slot);

    // On overflow drop the head and reuse it as the next scratch entry.
    if (retained.size() > numSug) {
      slot = retained.poll();
    } else {
      slot = new ScoreTerm();
    }

    final float floor = (retained.size() >= numSug) ? retained.peek().boost : Float.NEGATIVE_INFINITY;
    competitiveFloor.setMaxNonCompetitiveBoost(floor);
  }

  return retained;
}
项目:NYBC    文件:DirectSpellChecker.java   
/**
 * Collects candidate spelling corrections for {@code term} by enumerating
 * fuzzy matches from the term's field in {@code ir}.
 *
 * <p>At most {@code numSug} candidates are retained in a priority queue; once
 * the queue is full, its head's boost is published through the
 * {@link MaxNonCompetitiveBoostAttribute} shared with the enumerator.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to retain
 * @param ir the index reader the candidate terms are read from
 * @param docfreq document-frequency cut-off; candidates whose document
 *        frequency is less than or equal to this value are skipped
 * @param editDistance the maximum edit distance passed to the fuzzy enumerator
 * @param accuracy the minimum score a candidate needs in order to be retained
 * @param spare scratch chars used to decode candidate bytes into a string
 *        when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of retained {@code ScoreTerm}s, or an empty list
 *         when the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRef spare) throws IOException {
  final AttributeSource sharedAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute competitiveFloor =
      sharedAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum fuzzyEnum =
      new FuzzyTermsEnum(fieldTerms, sharedAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> retained = new PriorityQueue<ScoreTerm>();
  final BytesRef originalBytes = new BytesRef(term.text());
  final BoostAttribute boostAttribute =
      fuzzyEnum.attributes().addAttribute(BoostAttribute.class);

  // Reusable entry: refilled per accepted candidate, recycled from the queue on overflow.
  ScoreTerm slot = new ScoreTerm();

  for (BytesRef candidate = fuzzyEnum.next(); candidate != null; candidate = fuzzyEnum.next()) {
    final float boost = boostAttribute.getBoost();

    // Queue already full and this hit is no better than its head: not competitive.
    if (retained.size() >= numSug && boost <= retained.peek().boost) {
      continue;
    }

    // The query term itself is not a correction.
    if (originalBytes.bytesEquals(candidate)) {
      continue;
    }

    final int candidateDocFreq = fuzzyEnum.docFreq();
    if (candidateDocFreq <= docfreq) {
      continue;
    }

    String candidateText = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      UnicodeUtil.UTF8toUTF16(candidate, spare);
      candidateText = spare.toString();
      score = distance.getDistance(term.text(), candidateText);
    } else {
      // String creation is deferred (text stays null); undo the enum's
      // scale factor to recover a real scaled Levenshtein score.
      score = boost / fuzzyEnum.getScaleFactor() + fuzzyEnum.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Fill the entry completely before offering it; the queue orders on its fields.
    slot.term = BytesRef.deepCopyOf(candidate);
    slot.boost = boost;
    slot.docfreq = candidateDocFreq;
    slot.termAsString = candidateText;
    slot.score = score;
    retained.offer(slot);

    // On overflow drop the head and reuse it as the next scratch entry.
    if (retained.size() > numSug) {
      slot = retained.poll();
    } else {
      slot = new ScoreTerm();
    }

    final float floor = (retained.size() >= numSug) ? retained.peek().boost : Float.NEGATIVE_INFINITY;
    competitiveFloor.setMaxNonCompetitiveBoost(floor);
  }

  return retained;
}
项目:read-open-source-code    文件:DirectSpellChecker.java   
/**
 * Gathers spelling-correction candidates for {@code term} from the fuzzy
 * matches found in the term's field of {@code ir}.
 *
 * <p>No more than {@code numSug} candidates are kept. They live in a priority
 * queue, and whenever that queue is full the boost of its head is reported to
 * the enumerator via {@link MaxNonCompetitiveBoostAttribute}.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to keep
 * @param ir the index reader supplying the candidate terms
 * @param docfreq document-frequency cut-off; a candidate is skipped unless its
 *        document frequency is strictly greater than this value
 * @param editDistance the maximum edit distance given to the fuzzy enumerator
 * @param accuracy the minimum score a candidate must reach to be kept
 * @param spare scratch chars for turning candidate bytes into a string; only
 *        used when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of kept {@code ScoreTerm}s, or an empty list if
 *         the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRef spare) throws IOException {
  final AttributeSource enumAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute boostFloor =
      enumAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum matches =
      new FuzzyTermsEnum(fieldTerms, enumAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> kept = new PriorityQueue<ScoreTerm>();
  final BytesRef queryBytes = new BytesRef(term.text());
  final BoostAttribute currentBoost =
      matches.attributes().addAttribute(BoostAttribute.class);

  // Scratch entry, either freshly allocated or recycled from the queue.
  ScoreTerm entry = new ScoreTerm();

  for (BytesRef candidate = matches.next(); candidate != null; candidate = matches.next()) {
    final float boost = currentBoost.getBoost();

    // Ignore hits that cannot displace anything in a full queue.
    if (kept.size() >= numSug && boost <= kept.peek().boost) {
      continue;
    }

    // Ignore the exact query term.
    if (queryBytes.bytesEquals(candidate)) {
      continue;
    }

    // Ignore terms that are not frequent enough.
    final int frequency = matches.docFreq();
    if (frequency <= docfreq) {
      continue;
    }

    String decoded = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      UnicodeUtil.UTF8toUTF16(candidate, spare);
      decoded = spare.toString();
      score = distance.getDistance(term.text(), decoded);
    } else {
      // No string is built here (decoded stays null). The enum's scale
      // factor is undone to get a real scaled Levenshtein score.
      score = boost / matches.getScaleFactor() + matches.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Populate every field before the entry goes into the queue.
    entry.term = BytesRef.deepCopyOf(candidate);
    entry.boost = boost;
    entry.docfreq = frequency;
    entry.termAsString = decoded;
    entry.score = score;
    kept.offer(entry);

    // Too many entries: evict the head and recycle it; otherwise allocate anew.
    if (kept.size() > numSug) {
      entry = kept.poll();
    } else {
      entry = new ScoreTerm();
    }

    final float floor = (kept.size() >= numSug) ? kept.peek().boost : Float.NEGATIVE_INFINITY;
    boostFloor.setMaxNonCompetitiveBoost(floor);
  }

  return kept;
}
项目:read-open-source-code    文件:DirectSpellChecker.java   
/**
 * Gathers spelling-correction candidates for {@code term} from the fuzzy
 * matches found in the term's field of {@code ir}.
 *
 * <p>No more than {@code numSug} candidates are kept. They live in a priority
 * queue, and whenever that queue is full the boost of its head is reported to
 * the enumerator via {@link MaxNonCompetitiveBoostAttribute}.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to keep
 * @param ir the index reader supplying the candidate terms
 * @param docfreq document-frequency cut-off; a candidate is skipped unless its
 *        document frequency is strictly greater than this value
 * @param editDistance the maximum edit distance given to the fuzzy enumerator
 * @param accuracy the minimum score a candidate must reach to be kept
 * @param spare scratch chars for turning candidate bytes into a string; only
 *        used when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of kept {@code ScoreTerm}s, or an empty list if
 *         the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRef spare) throws IOException {
  final AttributeSource enumAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute boostFloor =
      enumAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum matches =
      new FuzzyTermsEnum(fieldTerms, enumAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> kept = new PriorityQueue<ScoreTerm>();
  final BytesRef queryBytes = new BytesRef(term.text());
  final BoostAttribute currentBoost =
      matches.attributes().addAttribute(BoostAttribute.class);

  // Scratch entry, either freshly allocated or recycled from the queue.
  ScoreTerm entry = new ScoreTerm();

  for (BytesRef candidate = matches.next(); candidate != null; candidate = matches.next()) {
    final float boost = currentBoost.getBoost();

    // Ignore hits that cannot displace anything in a full queue.
    if (kept.size() >= numSug && boost <= kept.peek().boost) {
      continue;
    }

    // Ignore the exact query term.
    if (queryBytes.bytesEquals(candidate)) {
      continue;
    }

    // Ignore terms that are not frequent enough.
    final int frequency = matches.docFreq();
    if (frequency <= docfreq) {
      continue;
    }

    String decoded = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      UnicodeUtil.UTF8toUTF16(candidate, spare);
      decoded = spare.toString();
      score = distance.getDistance(term.text(), decoded);
    } else {
      // No string is built here (decoded stays null). The enum's scale
      // factor is undone to get a real scaled Levenshtein score.
      score = boost / matches.getScaleFactor() + matches.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Populate every field before the entry goes into the queue.
    entry.term = BytesRef.deepCopyOf(candidate);
    entry.boost = boost;
    entry.docfreq = frequency;
    entry.termAsString = decoded;
    entry.score = score;
    kept.offer(entry);

    // Too many entries: evict the head and recycle it; otherwise allocate anew.
    if (kept.size() > numSug) {
      entry = kept.poll();
    } else {
      entry = new ScoreTerm();
    }

    final float floor = (kept.size() >= numSug) ? kept.peek().boost : Float.NEGATIVE_INFINITY;
    boostFloor.setMaxNonCompetitiveBoost(floor);
  }

  return kept;
}
项目:read-open-source-code    文件:DirectSpellChecker.java   
/**
 * Gathers spelling-correction candidates for {@code term} from the fuzzy
 * matches found in the term's field of {@code ir}.
 *
 * <p>No more than {@code numSug} candidates are kept. They live in a priority
 * queue, and whenever that queue is full the boost of its head is reported to
 * the enumerator via {@link MaxNonCompetitiveBoostAttribute}.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to keep
 * @param ir the index reader supplying the candidate terms
 * @param docfreq document-frequency cut-off; a candidate is skipped unless its
 *        document frequency is strictly greater than this value
 * @param editDistance the maximum edit distance given to the fuzzy enumerator
 * @param accuracy the minimum score a candidate must reach to be kept
 * @param spare scratch buffer for turning candidate bytes into a string; only
 *        used when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of kept {@code ScoreTerm}s, or an empty list if
 *         the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRefBuilder spare) throws IOException {
  final AttributeSource enumAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute boostFloor =
      enumAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum matches =
      new FuzzyTermsEnum(fieldTerms, enumAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> kept = new PriorityQueue<>();
  final BytesRef queryBytes = new BytesRef(term.text());
  final BoostAttribute currentBoost =
      matches.attributes().addAttribute(BoostAttribute.class);

  // Scratch entry, either freshly allocated or recycled from the queue.
  ScoreTerm entry = new ScoreTerm();

  for (BytesRef candidate = matches.next(); candidate != null; candidate = matches.next()) {
    final float boost = currentBoost.getBoost();

    // Ignore hits that cannot displace anything in a full queue.
    if (kept.size() >= numSug && boost <= kept.peek().boost) {
      continue;
    }

    // Ignore the exact query term.
    if (queryBytes.bytesEquals(candidate)) {
      continue;
    }

    // Ignore terms that are not frequent enough.
    final int frequency = matches.docFreq();
    if (frequency <= docfreq) {
      continue;
    }

    String decoded = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      spare.copyUTF8Bytes(candidate);
      decoded = spare.toString();
      score = distance.getDistance(term.text(), decoded);
    } else {
      // No string is built here (decoded stays null). The enum's scale
      // factor is undone to get a real scaled Levenshtein score.
      score = boost / matches.getScaleFactor() + matches.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Populate every field before the entry goes into the queue.
    entry.term = BytesRef.deepCopyOf(candidate);
    entry.boost = boost;
    entry.docfreq = frequency;
    entry.termAsString = decoded;
    entry.score = score;
    kept.offer(entry);

    // Too many entries: evict the head and recycle it; otherwise allocate anew.
    if (kept.size() > numSug) {
      entry = kept.poll();
    } else {
      entry = new ScoreTerm();
    }

    final float floor = (kept.size() >= numSug) ? kept.peek().boost : Float.NEGATIVE_INFINITY;
    boostFloor.setMaxNonCompetitiveBoost(floor);
  }

  return kept;
}
项目:Maskana-Gestor-de-Conocimiento    文件:DirectSpellChecker.java   
/**
 * Collects candidate spelling corrections for {@code term} by enumerating
 * fuzzy matches from the term's field in {@code ir}.
 *
 * <p>At most {@code numSug} candidates are retained in a priority queue; once
 * the queue is full, its head's boost is published through the
 * {@link MaxNonCompetitiveBoostAttribute} shared with the enumerator.
 *
 * @param term the term to suggest spelling corrections for
 * @param numSug the maximum number of candidates to retain
 * @param ir the index reader the candidate terms are read from
 * @param docfreq document-frequency cut-off; candidates whose document
 *        frequency is less than or equal to this value are skipped
 * @param editDistance the maximum edit distance passed to the fuzzy enumerator
 * @param accuracy the minimum score a candidate needs in order to be retained
 * @param spare scratch chars used to decode candidate bytes into a string
 *        when {@code distance} is not {@code INTERNAL_LEVENSHTEIN}
 * @return the priority queue of retained {@code ScoreTerm}s, or an empty list
 *         when the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRef spare) throws IOException {
  final AttributeSource sharedAttributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute competitiveFloor =
      sharedAttributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int requiredPrefix = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum fuzzyEnum =
      new FuzzyTermsEnum(fieldTerms, sharedAttributes, term, editDistance, requiredPrefix, true);
  final PriorityQueue<ScoreTerm> retained = new PriorityQueue<ScoreTerm>();
  final BytesRef originalBytes = new BytesRef(term.text());
  final BoostAttribute boostAttribute =
      fuzzyEnum.attributes().addAttribute(BoostAttribute.class);

  // Reusable entry: refilled per accepted candidate, recycled from the queue on overflow.
  ScoreTerm slot = new ScoreTerm();

  for (BytesRef candidate = fuzzyEnum.next(); candidate != null; candidate = fuzzyEnum.next()) {
    final float boost = boostAttribute.getBoost();

    // Queue already full and this hit is no better than its head: not competitive.
    if (retained.size() >= numSug && boost <= retained.peek().boost) {
      continue;
    }

    // The query term itself is not a correction.
    if (originalBytes.bytesEquals(candidate)) {
      continue;
    }

    final int candidateDocFreq = fuzzyEnum.docFreq();
    if (candidateDocFreq <= docfreq) {
      continue;
    }

    String candidateText = null;
    final float score;
    if (distance != INTERNAL_LEVENSHTEIN) {
      UnicodeUtil.UTF8toUTF16(candidate, spare);
      candidateText = spare.toString();
      score = distance.getDistance(term.text(), candidateText);
    } else {
      // String creation is deferred (text stays null); undo the enum's
      // scale factor to recover a real scaled Levenshtein score.
      score = boost / fuzzyEnum.getScaleFactor() + fuzzyEnum.getMinSimilarity();
    }

    if (score < accuracy) {
      continue;
    }

    // Fill the entry completely before offering it; the queue orders on its fields.
    slot.term = BytesRef.deepCopyOf(candidate);
    slot.boost = boost;
    slot.docfreq = candidateDocFreq;
    slot.termAsString = candidateText;
    slot.score = score;
    retained.offer(slot);

    // On overflow drop the head and reuse it as the next scratch entry.
    if (retained.size() > numSug) {
      slot = retained.poll();
    } else {
      slot = new ScoreTerm();
    }

    final float floor = (retained.size() >= numSug) ? retained.peek().boost : Float.NEGATIVE_INFINITY;
    competitiveFloor.setMaxNonCompetitiveBoost(floor);
  }

  return retained;
}