Java 类org.apache.lucene.util.UnicodeUtil 实例源码

项目:elasticsearch_my    文件:FieldSortIT.java   
/**
 * Indexes a random number of empty documents, sorts a match-all search on the
 * {@code _uid} meta field in a random direction, and verifies that consecutive
 * hits are strictly ordered by their uid.
 */
public void testSortMetaField() throws Exception {
    createIndex("test");
    ensureGreen();

    final int numDocs = randomIntBetween(10, 20);
    final IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int doc = 0; doc < builders.length; doc++) {
        builders[doc] = client().prepareIndex("test", "type", Integer.toString(doc)).setSource();
    }
    indexRandom(true, builders);

    final SortOrder order = randomFrom(SortOrder.values());
    final SearchResponse response = client().prepareSearch()
            .setQuery(matchAllQuery())
            .setSize(randomIntBetween(1, numDocs + 5))
            .addSort("_uid", order)
            .execute().actionGet();
    assertNoFailures(response);

    // Start from a sentinel that every real uid compares beyond in the chosen direction.
    final boolean ascending = order == SortOrder.ASC;
    BytesRef previous = ascending ? new BytesRef() : UnicodeUtil.BIG_TERM;
    for (SearchHit hit : response.getHits().getHits()) {
        final BytesRef uid = new BytesRef(Uid.createUid(hit.getType(), hit.getId()));
        if (ascending) {
            assertThat(previous, lessThan(uid));
        } else {
            assertThat(previous, greaterThan(uid));
        }
        previous = uid;
    }
}
项目:lams    文件:FuzzyTermsEnum.java   
/**
 * Makes sure the cached list contains a compiled Levenshtein automaton for every
 * edit distance from the current list size up to {@code maxDistance}. Nothing is
 * added when the list is already large enough or when {@code maxDistance} exceeds
 * {@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}.
 *
 * @param maxDistance highest edit distance an automaton is wanted for
 * @return the (possibly extended) cached list of automata
 */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }

  // The fuzzy part is everything after the constant prefix.
  final String suffix =
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
  final LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, transpositions);
  final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);

  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    cached.add(new CompiledAutomaton(levenshtein.toAutomaton(distance, prefix), true, false));
  }
  return cached;
}
项目:lams    文件:DaciukMihovAutomatonBuilder.java   
/**
 * Builds a minimal, deterministic automaton from a collection of UTF-8 encoded
 * {@link BytesRef} strings. The input must already be in binary sort order.
 *
 * @param input binary-sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder dmBuilder = new DaciukMihovAutomatonBuilder();

  // One scratch buffer, grown on demand, is reused for every term.
  char[] scratch = new char[0];
  final CharsRef utf16 = new CharsRef();
  for (BytesRef term : input) {
    scratch = ArrayUtil.grow(scratch, term.length);
    final int charCount = UnicodeUtil.UTF8toUTF16(term, scratch);
    utf16.chars = scratch;
    utf16.length = charCount;
    dmBuilder.add(utf16);
  }

  final Automaton.Builder result = new Automaton.Builder();
  convert(result, dmBuilder.complete(), new IdentityHashMap<State,Integer>());
  return result.finish();
}
项目:Elasticsearch    文件:LowerFunction.java   
/**
 * Lower-cases the first argument.
 *
 * @param args function arguments; only {@code args[0]} is read
 * @return the lower-cased value re-encoded as UTF-8, or {@code null} when the argument value is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }

    // Decode UTF-8 -> UTF-16 so the case conversion can work on chars.
    final BytesRef utf8In = BytesRefs.toBytesRef(value);
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toLowerCase(utf16, 0, charCount);

    // Re-encode into a buffer sized for the worst case per char.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}
项目:Elasticsearch    文件:UpperFunction.java   
/**
 * Upper-cases the first argument.
 *
 * @param args function arguments; only {@code args[0]} is read
 * @return the upper-cased value re-encoded as UTF-8, or {@code null} when the argument value is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object value = args[0].value();
    if (value == null) {
        return null;
    }

    // Decode UTF-8 -> UTF-16 so the case conversion can work on chars.
    final BytesRef utf8In = BytesRefs.toBytesRef(value);
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toUpperCase(utf16, 0, charCount);

    // Re-encode into a buffer sized for the worst case per char.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}
项目:solrplugins    文件:DocBasedFacetResponseBuilder.java   
/**
 * Walks backwards from {@code previousKey} until {@code acceptDoc} accepts a
 * (term index, doc id) pair.
 *
 * @param previousKey the key to step back from; its {@code index} and {@code docId} seed the scan
 * @return a new key for the accepted pair (also stored in {@code termDocIndexKey}), or
 *         {@code null} once {@code decrementTermIndex} returns a negative index
 */
@Override
public TermDocIndexKey decrementKey(TermDocIndexKey previousKey) {
  int termIndex = previousKey.index;
  BytesRef docId = previousKey.docId;
  do {
    // Step back through the doc ids of the current term until one is accepted.
    while ((docId = decrementDocId(termIndex, docId)) != null) {
      int docIndex = acceptDoc(termIndex, docId);
      if (docIndex >= 0) {
        localDocIndex = docIndex;
        return termDocIndexKey = new TermDocIndexKey(termIndex, docId);
      }
    }
    // Doc ids for this term are exhausted; restart from BIG_TERM for the next term index
    // (presumably so the scan begins at that term's last doc -- confirm against decrementDocId).
    docId = UnicodeUtil.BIG_TERM;
  } while ((termIndex = decrementTermIndex(termIndex)) >= 0);
  // Nothing acceptable was found: clear the cached position.
  localDocIndex = -1;
  return termDocIndexKey = null;
}
项目:solrplugins    文件:DocBasedFacetResponseBuilder.java   
/**
 * Determines the initial key for iteration in the requested direction.
 *
 * @param ascending direction to move in when the initial candidate is not accepted
 * @return the accepted initial key, the result of stepping from the candidate in the
 *         requested direction, or {@code null} when no initial term index is available
 */
@Override
public TermDocIndexKey targetKeyInit(boolean ascending) throws IOException {
  final int termIndex = getTargetKeyIndexInit(ascending);
  if (termIndex < 0) {
    return null;
  }

  // Choose the starting doc depending on where the raw target index lies
  // relative to the resolved term index.
  final int rawTargetIdx = getTargetKeyIndex();
  final BytesRef startDoc;
  if (rawTargetIdx < termIndex) {
    startDoc = null;
  } else if (rawTargetIdx > termIndex) {
    startDoc = UnicodeUtil.BIG_TERM;
  } else {
    startDoc = targetDoc;
  }

  final TermDocIndexKey candidate = new TermDocIndexKey(termIndex, startDoc);
  final int docIndex = acceptDoc(termIndex, startDoc);
  if (docIndex < 0) {
    return ascending ? incrementKey(candidate) : decrementKey(candidate);
  }
  localDocIndex = docIndex;
  termDocIndexKey = candidate;
  return candidate;
}
项目:search    文件:TestJapaneseTokenizer.java   
/**
 * Randomized check that every token produced from random unicode text is a
 * valid UTF-16 string, i.e. supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(10000);
  for (int iter = 0; iter < iterations; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
    }
    final String text = TestUtil.randomUnicodeString(random(), 100);
    final TokenStream stream = analyzer.tokenStream("foo", text);
    try {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }
  }
}
项目:search    文件:TestExtendedMode.java   
/**
 * Randomized check that every token produced from random unicode text is a
 * valid UTF-16 string, i.e. supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(1000);
  for (int iter = 0; iter < iterations; iter++) {
    final String text = TestUtil.randomUnicodeString(random(), 100);
    final TokenStream stream = analyzer.tokenStream("foo", text);
    try {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }
  }
}
项目:search    文件:TestMappingCharFilter.java   
/** Builds the shared {@code normMap} used by the tests from a fixed table of mappings. */
@Override
public void setUp() throws Exception {
  super.setUp();

  // U+1D122 (MUSICAL SYMBOL F CLEF) lies outside the BMP, so in UTF-16 it is a surrogate pair.
  final String fClef = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);

  // {match, replacement} pairs, registered in this order.
  final String[][] mappings = {
    {"aa", "a"},
    {"bbb", "b"},
    {"cccc", "cc"},
    {"h", "i"},
    {"j", "jj"},
    {"k", "kkk"},
    {"ll", "llll"},
    {"empty", ""},
    {fClef, "fclef"},
    {"\uff01", "full-width-exclamation"},
  };

  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  for (String[] mapping : mappings) {
    mapBuilder.add(mapping[0], mapping[1]);
  }
  normMap = mapBuilder.build();
}
项目:search    文件:FuzzyTermsEnum.java   
/**
 * Makes sure the cached list contains a compiled Levenshtein automaton for every
 * edit distance from the current list size up to {@code maxDistance}. Nothing is
 * added when the list is already large enough or when {@code maxDistance} exceeds
 * {@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}.
 *
 * @param maxDistance highest edit distance an automaton is wanted for
 * @return the (possibly extended) cached list of automata
 */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }

  // The fuzzy part is everything after the constant prefix.
  final String suffix =
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
  final LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, transpositions);
  final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);

  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    cached.add(new CompiledAutomaton(levenshtein.toAutomaton(distance, prefix), true, false));
  }
  return cached;
}
项目:search    文件:DaciukMihovAutomatonBuilder.java   
/**
 * Builds a minimal, deterministic automaton from a collection of UTF-8 encoded
 * {@link BytesRef} strings. The input must already be in binary sort order.
 *
 * @param input binary-sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder dmBuilder = new DaciukMihovAutomatonBuilder();

  // One scratch buffer, grown on demand, is reused for every term.
  char[] scratch = new char[0];
  final CharsRef utf16 = new CharsRef();
  for (BytesRef term : input) {
    scratch = ArrayUtil.grow(scratch, term.length);
    final int charCount = UnicodeUtil.UTF8toUTF16(term, scratch);
    utf16.chars = scratch;
    utf16.length = charCount;
    dmBuilder.add(utf16);
  }

  final Automaton.Builder result = new Automaton.Builder();
  convert(result, dmBuilder.complete(), new IdentityHashMap<State,Integer>());
  return result.finish();
}
项目:NYBC    文件:JaspellLookup.java   
/**
 * Rebuilds the Jaspell trie from the given term/weight iterator. Empty terms are skipped.
 *
 * @param tfit source of terms and their weights
 * @throws IOException if reading from the iterator fails
 */
@Override
public void build(TermFreqIterator tfit) throws IOException {
  // A non-null comparator means the input is sorted; wrap it so the trie is fed
  // unsorted input (note the wrapper may itself cost another pass over the data).
  final TermFreqIterator source =
      tfit.getComparator() == null ? tfit : new UnsortedTermFreqIteratorWrapper(tfit);

  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);

  final CharsRef utf16 = new CharsRef();
  for (BytesRef term = source.next(); term != null; term = source.next()) {
    final long weight = source.weight();
    if (term.length == 0) {
      continue;
    }
    utf16.grow(term.length);
    UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
    trie.put(utf16.toString(), Long.valueOf(weight));
  }
}
项目:NYBC    文件:TSTLookup.java   
/**
 * Rebuilds the ternary search tree from the given term/weight iterator.
 * All terms are buffered first and then inserted as a balanced tree.
 *
 * @param tfit source of terms and their weights
 * @throws IOException if reading from the iterator fails
 */
@Override
public void build(TermFreqIterator tfit) throws IOException {
  root = new TernaryTreeNode();

  // The balanced insert needs input in UTF-16 sort order; re-sort unless the
  // iterator already uses exactly that comparator.
  if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
    tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
  }

  final ArrayList<String> tokens = new ArrayList<String>();
  final ArrayList<Number> weights = new ArrayList<Number>();
  final CharsRef utf16 = new CharsRef();
  for (BytesRef term = tfit.next(); term != null; term = tfit.next()) {
    utf16.grow(term.length);
    UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
    tokens.add(utf16.toString());
    weights.add(Long.valueOf(tfit.weight()));
  }
  autocomplete.balancedTree(tokens.toArray(), weights.toArray(), 0, tokens.size() - 1, root);
}
项目:NYBC    文件:TestJapaneseTokenizer.java   
/**
 * Randomized check that every token produced from random unicode text is a
 * valid UTF-16 string, i.e. supplementary characters are never split.
 * <p>
 * Each stream is driven through the full consumer workflow
 * (reset, incrementToken, end, close) so nothing is left open between iterations.
 */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(10000);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = _TestUtil.randomUnicodeString(random(), 100);
    TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    } finally {
      // Always release the stream, even when an assertion fails mid-iteration.
      ts.close();
    }
  }
}
项目:read-open-source-code    文件:DaciukMihovAutomatonBuilder.java   
/**
 * Builds a minimal, deterministic automaton from a collection of UTF-8 encoded
 * {@link BytesRef} strings. The input must already be in binary sort order.
 *
 * @param input binary-sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder dmBuilder = new DaciukMihovAutomatonBuilder();

  // One scratch buffer, grown on demand, is reused for every term.
  char[] scratch = new char[0];
  final CharsRef utf16 = new CharsRef();
  for (BytesRef term : input) {
    scratch = ArrayUtil.grow(scratch, term.length);
    final int charCount = UnicodeUtil.UTF8toUTF16(term, scratch);
    utf16.chars = scratch;
    utf16.length = charCount;
    dmBuilder.add(utf16);
  }

  final Automaton.Builder result = new Automaton.Builder();
  convert(result, dmBuilder.complete(), new IdentityHashMap<State,Integer>());
  return result.finish();
}
项目:read-open-source-code    文件:FuzzyTermsEnum.java   
/**
 * Makes sure the cached list contains a compiled Levenshtein automaton for every
 * edit distance from the current list size up to {@code maxDistance}. Nothing is
 * added when the list is already large enough or when {@code maxDistance} exceeds
 * {@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}.
 *
 * @param maxDistance highest edit distance an automaton is wanted for
 * @return the (possibly extended) cached list of automata
 */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }

  // The fuzzy part is everything after the constant prefix.
  final String suffix =
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
  final LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, transpositions);
  final boolean hasPrefix = realPrefixLength > 0;

  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    Automaton automaton = levenshtein.toAutomaton(distance);
    if (hasPrefix) {
      // Prepend the constant prefix; a fresh prefix automaton is built for each distance.
      final String prefixText = UnicodeUtil.newString(termText, 0, realPrefixLength);
      automaton = BasicOperations.concatenate(BasicAutomata.makeString(prefixText), automaton);
    }
    cached.add(new CompiledAutomaton(automaton, true, false));
  }
  return cached;
}
项目:NYBC    文件:TestMappingCharFilter.java   
/** Builds the shared {@code normMap} used by the tests from a fixed table of mappings. */
@Override
public void setUp() throws Exception {
  super.setUp();

  // U+1D122 (MUSICAL SYMBOL F CLEF) lies outside the BMP, so in UTF-16 it is a surrogate pair.
  final String fClef = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);

  // {match, replacement} pairs, registered in this order.
  final String[][] mappings = {
    {"aa", "a"},
    {"bbb", "b"},
    {"cccc", "cc"},
    {"h", "i"},
    {"j", "jj"},
    {"k", "kkk"},
    {"ll", "llll"},
    {"empty", ""},
    {fClef, "fclef"},
    {"\uff01", "full-width-exclamation"},
  };

  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  for (String[] mapping : mappings) {
    mapBuilder.add(mapping[0], mapping[1]);
  }
  normMap = mapBuilder.build();
}
项目:read-open-source-code    文件:StemmerOverrideFilter.java   
/**
 * Compiles every entry added so far into an FST-backed {@link StemmerOverrideMap}
 * for use with the {@link StemmerOverrideFilter}.
 *
 * @return the map built from the collected inputs and outputs
 * @throws IOException if building the FST fails
 */
public StemmerOverrideMap build() throws IOException {
  final org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder =
      new org.apache.lucene.util.fst.Builder<BytesRef>(
          FST.INPUT_TYPE.BYTE4, ByteSequenceOutputs.getSingleton());

  // Inputs are fed to the FST builder in the order produced by the hash's sort.
  final int[] sortedIds = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  final IntsRef codePoints = new IntsRef();
  final int entryCount = hash.size();
  for (int pos = 0; pos < entryCount; pos++) {
    final int id = sortedIds[pos];
    final BytesRef input = hash.get(id, spare);
    UnicodeUtil.UTF8toUTF32(input, codePoints);
    fstBuilder.add(codePoints, new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(fstBuilder.finish(), ignoreCase);
}
项目:NYBC    文件:PerSegmentSingleValuedFaceting.java   
/**
 * Offers one term and its count to the facet result, honouring
 * {@code mincount}, {@code offset} and {@code limit}.
 *
 * @param term  the term bytes (UTF-8)
 * @param count number of occurrences for the term
 * @return {@code true} once the limit is used up, {@code false} otherwise
 */
@Override
public boolean collect(BytesRef term, int count) {
  if (count < mincount) {
    return false;
  }
  if (offset > 0) {
    // Still skipping the leading entries.
    offset--;
    return false;
  }
  if (limit <= 0) {
    return true;
  }

  UnicodeUtil.UTF8toUTF16(term, spare);
  res.add(spare.toString(), count);
  limit--;
  return limit <= 0;
}
项目:NYBC    文件:ValueSourceParser.java   
/**
 * Parses a (field, value) argument pair and resolves the indexed form of the value.
 * Text fields are run through the field type's query analysis; every other
 * field type converts the value with {@code readableToIndexed}.
 *
 * @param fp parser positioned at the field argument
 * @return the populated term info
 * @throws SyntaxError if an argument cannot be parsed
 */
private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError {
  final TInfo info = new TInfo();

  info.indexedField = info.field = fp.parseArg();
  info.val = fp.parseArg();
  info.indexedBytes = new BytesRef();

  FieldType fieldType = fp.getReq().getSchema().getFieldTypeNoEx(info.field);
  if (fieldType == null) {
    // Unknown field: fall back to a plain string field type.
    fieldType = new StrField();
  }

  if (!(fieldType instanceof TextField)) {
    fieldType.readableToIndexed(info.val, info.indexedBytes);
    return info;
  }

  // Text field: analyse the value; use the analysed term only when the
  // resulting query is a single TermQuery.
  String indexedVal = info.val;
  final Query query =
      fieldType.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(info.field), info.val);
  if (query instanceof TermQuery) {
    final Term term = ((TermQuery) query).getTerm();
    info.indexedField = term.field();
    indexedVal = term.text();
  }
  UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), info.indexedBytes);
  return info;
}
项目:search-core    文件:PerSegmentSingleValuedFaceting.java   
/**
 * Offers one term and its count to the facet result, honouring
 * {@code mincount}, {@code offset} and {@code limit}.
 *
 * @param term  the term bytes (UTF-8)
 * @param count number of occurrences for the term
 * @return {@code true} once the limit is used up, {@code false} otherwise
 */
@Override
public boolean collect(BytesRef term, int count) {
  if (count < mincount) {
    return false;
  }
  if (offset > 0) {
    // Still skipping the leading entries.
    offset--;
    return false;
  }
  if (limit <= 0) {
    return true;
  }

  UnicodeUtil.UTF8toUTF16(term, spare);
  res.add(spare.toString(), count);
  limit--;
  return limit <= 0;
}
项目:read-open-source-code    文件:MoreLikeThis.java   
/**
 * Accumulates the total term frequency of every non-noise term in {@code vector}
 * into {@code termFreqMap}.
 *
 * @param termFreqMap map from term text to its running frequency counter
 * @param vector terms (with frequencies) of one document field
 * @throws IOException if the terms cannot be enumerated
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
  final TermsEnum terms = vector.iterator(null);
  final CharsRef utf16 = new CharsRef();
  for (BytesRef bytes = terms.next(); bytes != null; bytes = terms.next()) {
    UnicodeUtil.UTF8toUTF16(bytes, utf16);
    final String term = utf16.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) terms.totalTermFreq();

    final Int existing = termFreqMap.get(term);
    if (existing != null) {
      existing.x += freq;
      continue;
    }
    // First sighting of this term: start its counter at the observed frequency.
    final Int created = new Int();
    created.x = freq;
    termFreqMap.put(term, created);
  }
}
项目:read-open-source-code    文件:MoreLikeThis.java   
/**
 * Accumulates the total term frequency of every non-noise term in {@code vector}
 * into {@code termFreqMap}.
 *
 * @param termFreqMap map from term text to its running frequency counter
 * @param vector terms (with frequencies) of one document field
 * @throws IOException if the terms cannot be enumerated
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
  final TermsEnum terms = vector.iterator(null);
  final CharsRef utf16 = new CharsRef();
  for (BytesRef bytes = terms.next(); bytes != null; bytes = terms.next()) {
    UnicodeUtil.UTF8toUTF16(bytes, utf16);
    final String term = utf16.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) terms.totalTermFreq();

    final Int existing = termFreqMap.get(term);
    if (existing != null) {
      existing.x += freq;
      continue;
    }
    // First sighting of this term: start its counter at the observed frequency.
    final Int created = new Int();
    created.x = freq;
    termFreqMap.put(term, created);
  }
}
项目:read-open-source-code    文件:FuzzyTermsEnum.java   
/**
 * Makes sure the cached list contains a compiled Levenshtein automaton for every
 * edit distance from the current list size up to {@code maxDistance}. Nothing is
 * added when the list is already large enough or when {@code maxDistance} exceeds
 * {@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}.
 *
 * @param maxDistance highest edit distance an automaton is wanted for
 * @return the (possibly extended) cached list of automata
 */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  final List<CompiledAutomaton> cached = dfaAtt.automata();
  if (cached.size() > maxDistance
      || maxDistance > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    return cached;
  }

  // The fuzzy part is everything after the constant prefix.
  final String suffix =
      UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength);
  final LevenshteinAutomata levenshtein = new LevenshteinAutomata(suffix, transpositions);
  final boolean hasPrefix = realPrefixLength > 0;

  for (int distance = cached.size(); distance <= maxDistance; distance++) {
    Automaton automaton = levenshtein.toAutomaton(distance);
    if (hasPrefix) {
      // Prepend the constant prefix; a fresh prefix automaton is built for each distance.
      final String prefixText = UnicodeUtil.newString(termText, 0, realPrefixLength);
      automaton = BasicOperations.concatenate(BasicAutomata.makeString(prefixText), automaton);
    }
    cached.add(new CompiledAutomaton(automaton, true, false));
  }
  return cached;
}
项目:read-open-source-code    文件:StemmerOverrideFilter.java   
/**
 * Registers an input string together with its stemmer override output.
 * When {@code ignoreCase} is set the input is lower-cased code point by code
 * point before it is stored.
 *
 * @param input the input char sequence
 * @param output the stemmer override output char sequence
 * @return <code>true</code> if the input was newly added, <code>false</code> if it
 *         had already been added to this builder
 */
public boolean add(CharSequence input, CharSequence output) {
  final int length = input.length();
  if (!ignoreCase) {
    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
  } else {
    // Lower-case into the scratch buffer, writing at the same positions as read.
    charsSpare.grow(length);
    final char[] lowered = charsSpare.chars;
    int pos = 0;
    while (pos < length) {
      final int lowerCodePoint = Character.toLowerCase(Character.codePointAt(input, pos));
      pos += Character.toChars(lowerCodePoint, lowered, pos);
    }
    UnicodeUtil.UTF16toUTF8(lowered, 0, length, spare);
  }

  if (hash.add(spare) < 0) {
    return false;
  }
  outputValues.add(output);
  return true;
}
项目:read-open-source-code    文件:StemmerOverrideFilter.java   
/**
 * Registers an input string together with its stemmer override output.
 * When {@code ignoreCase} is set the input is lower-cased code point by code
 * point before it is stored.
 *
 * @param input the input char sequence
 * @param output the stemmer override output char sequence
 * @return <code>true</code> if the input was newly added, <code>false</code> if it
 *         had already been added to this builder
 */
public boolean add(CharSequence input, CharSequence output) {
  final int length = input.length();
  if (!ignoreCase) {
    UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
  } else {
    // Lower-case into the scratch buffer, writing at the same positions as read.
    charsSpare.grow(length);
    final char[] lowered = charsSpare.chars;
    int pos = 0;
    while (pos < length) {
      final int lowerCodePoint = Character.toLowerCase(Character.codePointAt(input, pos));
      pos += Character.toChars(lowerCodePoint, lowered, pos);
    }
    UnicodeUtil.UTF16toUTF8(lowered, 0, length, spare);
  }

  if (hash.add(spare) < 0) {
    return false;
  }
  outputValues.add(output);
  return true;
}
项目:read-open-source-code    文件:PerSegmentSingleValuedFaceting.java   
/**
 * Offers one term and its count to the facet result, honouring
 * {@code mincount}, {@code offset} and {@code limit}.
 *
 * @param term  the term bytes (UTF-8)
 * @param count number of occurrences for the term
 * @return {@code true} once the limit is used up, {@code false} otherwise
 */
@Override
public boolean collect(BytesRef term, int count) {
  if (count < mincount) {
    return false;
  }
  if (offset > 0) {
    // Still skipping the leading entries.
    offset--;
    return false;
  }
  if (limit <= 0) {
    return true;
  }

  UnicodeUtil.UTF8toUTF16(term, spare);
  res.add(spare.toString(), count);
  limit--;
  return limit <= 0;
}
项目:lams    文件:FuzzyTermsEnum.java   
/**
 * Finds the smallest edit distance whose matcher accepts the term and converts
 * it into a boost.
 *
 * @param term candidate term
 * @return {@code YES} for an exact match or when the similarity exceeds
 *         {@code minSimilarity}, otherwise {@code NO}
 */
@Override
protected AcceptStatus accept(BytesRef term) {
  // The widest matcher is known to accept (we wrap an intersect() or automaton
  // terms enum), so only narrow the distance down from there.
  int ed = matchers.length - 1;
  while (ed > 0 && matches(term, ed - 1)) {
    ed--;
  }

  if (ed == 0) {
    // exact match
    boostAtt.setBoost(1.0F);
    return AcceptStatus.YES;
  }

  // Similarity is measured relative to the shorter of the two terms, in code points.
  final int shorter = Math.min(UnicodeUtil.codePointCount(term), termLength);
  final float similarity = 1.0f - ((float) ed / (float) shorter);
  if (similarity > minSimilarity) {
    boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
    return AcceptStatus.YES;
  }
  return AcceptStatus.NO;
}
项目:lams    文件:CompressionTools.java   
/**
 * Decompresses a byte range previously produced by compressString and decodes
 * the resulting UTF-8 bytes into a String.
 *
 * @param value  buffer holding the compressed data
 * @param offset start of the compressed data within {@code value}
 * @param length number of compressed bytes
 * @return the decoded string
 * @throws DataFormatException if the compressed data is malformed
 */
public static String decompressString(byte[] value, int offset, int length) throws DataFormatException {
  final byte[] utf8 = decompress(value, offset, length);
  // One char per input byte is allocated for the decoded text.
  final char[] utf16 = new char[utf8.length];
  final int charCount = UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.length, utf16);
  return new String(utf16, 0, charCount);
}
项目:solrplugins    文件:DocBasedFacetResponseBuilder.java   
/**
 * Renders a {@link BytesRef} for display: {@code "null"} for a null reference,
 * a fixed marker for {@code UnicodeUtil.BIG_TERM}, otherwise the UTF-8 decoded text.
 */
static String brToString(BytesRef br) {
  if (br == null) {
    return "null";
  }
  if (UnicodeUtil.BIG_TERM.bytesEquals(br)) {
    return "[UnicodeUtil.BIG_TERM]";
  }
  return br.utf8ToString();
}
项目:moar    文件:ByteCharSeq.java   
/**
 * Decodes the single byte stored at {@code index} and returns the resulting
 * UTF-16 unit as an int.
 * <p>
 * NOTE(review): exactly one byte is handed to the decoder, so a byte that is part
 * of a multi-byte UTF-8 sequence is not decoded together with its neighbours --
 * this looks correct only for single-byte (ASCII) content; confirm intended use.
 *
 * @param index position in the underlying byte contents
 * @return the first decoded char, masked to 16 bits
 */
@Override
public int codePoint(int index) {
    //FIXME: is this the correct behaviour?
    // Copy the byte into the one-element scratch array used as decoder input.
    this.tmpByte[0] = this.contents.bytes[index];
    UnicodeUtil.UTF8toUTF16( this.tmpByte, 0, 1, this.tmpChar );
    return this.tmpChar[0] & 0xFFFF;
}
项目:search    文件:SlowFuzzyTermsEnum.java   
/**
 * Creates an enumeration over the terms that share the first
 * <code>realPrefixLength</code> code points with the query term; the remaining
 * code points are kept for the fuzzy comparison.
 * <p>
 * The enumeration is told to start seeking at the prefix.
 *
 * @throws IOException If there is a low-level I/O error.
 */
public LinearFuzzyTermsEnum() throws IOException {
  super(terms.iterator(null));

  // Keep only the part of the query term that follows the constant prefix.
  final int suffixLength = termLength - realPrefixLength;
  final int[] suffix = new int[suffixLength];
  System.arraycopy(termText, realPrefixLength, suffix, 0, suffixLength);
  this.text = suffix;

  prefixBytesRef = new BytesRef(UnicodeUtil.newString(termText, 0, realPrefixLength));

  // Two work rows, one slot longer than the suffix.
  this.d = new int[suffixLength + 1];
  this.p = new int[suffixLength + 1];

  setInitialSeekTerm(prefixBytesRef);
}
项目:search    文件:AutomatonTestUtil.java   
/**
 * Returns a random regular expression drawn from the full unicode range.
 * Candidates are regenerated until one is both valid UTF-16 and accepted by
 * the {@code RegExp} parser.
 */
public static String randomRegexp(Random r) {
  for (;;) {
    final String candidate = randomRegexpString(r);
    // The generator can emit undefined unicode sequences; skip those.
    if (UnicodeUtil.validUTF16String(candidate)) {
      try {
        new RegExp(candidate, RegExp.NONE);
        return candidate;
      } catch (Exception ignored) {
        // Not parseable as a regexp: draw another candidate.
      }
    }
  }
}
项目:search    文件:FSTTester.java   
/**
 * Formats a term for messages. Terms that are not valid unicode are printed raw;
 * otherwise the decoded text (UTF-8 when {@code inputMode} is 0, UTF-32 for any
 * other mode) is followed by a space and the raw term.
 */
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  }
  final String decoded = inputMode == 0
      ? toBytesRef(term).utf8ToString()                               // utf8
      : UnicodeUtil.newString(term.ints, term.offset, term.length);  // utf32
  return decoded + " " + term;
}
项目:search    文件:FuzzyTermsEnum.java   
/**
 * Finds the smallest edit distance whose matcher accepts the term and converts
 * it into a boost.
 *
 * @param term candidate term
 * @return {@code YES} for an exact match or when the similarity exceeds
 *         {@code minSimilarity}, otherwise {@code NO}
 */
@Override
protected AcceptStatus accept(BytesRef term) {
  // The widest matcher is known to accept (we wrap an intersect() or automaton
  // terms enum), so only narrow the distance down from there.
  int ed = matchers.length - 1;
  while (ed > 0 && matches(term, ed - 1)) {
    ed--;
  }

  if (ed == 0) {
    // exact match
    boostAtt.setBoost(1.0F);
    return AcceptStatus.YES;
  }

  // Similarity is measured relative to the shorter of the two terms, in code points.
  final int shorter = Math.min(UnicodeUtil.codePointCount(term), termLength);
  final float similarity = 1.0f - ((float) ed / (float) shorter);
  if (similarity > minSimilarity) {
    boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
    return AcceptStatus.YES;
  }
  return AcceptStatus.NO;
}
项目:search    文件:CompressionTools.java   
/**
 * Decompresses a byte range previously produced by compressString and decodes
 * the resulting UTF-8 bytes into a String.
 *
 * @param value  buffer holding the compressed data
 * @param offset start of the compressed data within {@code value}
 * @param length number of compressed bytes
 * @return the decoded string
 * @throws DataFormatException if the compressed data is malformed
 */
public static String decompressString(byte[] value, int offset, int length) throws DataFormatException {
  final byte[] utf8 = decompress(value, offset, length);
  // One char per input byte is allocated for the decoded text.
  final char[] utf16 = new char[utf8.length];
  final int charCount = UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.length, utf16);
  return new String(utf16, 0, charCount);
}
项目:search    文件:TestIndexWriterUnicode.java   
/**
 * Round-trips every valid Unicode code point (surrogates excluded) through the
 * UTF-16 to UTF-8 encoder and the UTF-8 to UTF-16 decoder and compares the
 * results with the JDK's own conversions.
 * <p>
 * The loop bound is inclusive so that the last code point, U+10FFFF, is covered too.
 */
public void testAllUnicodeChars() throws Throwable {

  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch <= Character.MAX_CODE_POINT; ch++) {

    if (ch == 0xd800) {
      // Skip the surrogate range U+D800..U+DFFF: those are not valid code points
      ch = 0xe000;
    }

    // Encode the code point as one or two UTF-16 units.
    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch-0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch-0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    // UTF-8 produced by the encoder must decode to the same string with the JDK decoder.
    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    // ... and with the decoder under test.
    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    // The encoded bytes must match the JDK encoder byte for byte.
    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++) {
      assertEquals(utf8.bytes[j], b[j]);
    }
  }
}
项目:search    文件:TestUTF32ToUTF8.java   
/**
 * Checks that the character-based and the byte-based run automata built from
 * {@code automaton} agree on a mix of random strings and strings the automaton
 * is known to accept.
 */
private void assertAutomaton(Automaton automaton) throws Exception {
  final CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
  final ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings accepted =
      new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  final int rounds = atLeast(1000);
  for (int round = 0; round < rounds; round++) {
    final String sample;
    if (random().nextBoolean()) {
      // likely not accepted
      sample = TestUtil.randomUnicodeString(random());
    } else {
      // will be accepted
      final int[] codepoints = accepted.getRandomAcceptedString(random());
      try {
        sample = UnicodeUtil.newString(codepoints, 0, codepoints.length);
      } catch (Exception e) {
        // Dump the offending code points before propagating the failure.
        System.out.println(codepoints.length + " codepoints:");
        for (int codepoint : codepoints) {
          System.out.println("  " + Integer.toHexString(codepoint));
        }
        throw e;
      }
    }
    final byte[] utf8 = sample.getBytes(StandardCharsets.UTF_8);
    assertEquals(charRunner.run(sample), byteRunner.run(utf8, 0, utf8.length));
  }
}
项目:search    文件:PHPSerializedResponseWriter.java   
/**
 * Writes a string in PHP serialized form: {@code s:<byte length>:"<value>";}.
 * The value is written unescaped, and the reported length is its size in
 * UTF-8 bytes rather than in chars.
 *
 * @param name          unused here
 * @param val           the string to write
 * @param needsEscaping unused here; serialized PHP strings are never escaped
 * @throws IOException if the underlying writer fails
 */
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
  // Encode into the reusable buffer only to learn the UTF-8 byte length.
  final int charCount = val.length();
  utf8 = ArrayUtil.grow(utf8, charCount * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
  final int byteCount = UnicodeUtil.UTF16toUTF8(val, 0, charCount, utf8);

  writer.write("s:");
  writer.write(Integer.toString(byteCount));
  writer.write(":\"");
  writer.write(val);
  writer.write("\";");
}