Java 类org.apache.lucene.analysis.compound.hyphenation.Hyphenation 实例源码

项目:lams    文件:Lucene43HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:lams    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:search    文件:Lucene43HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:search    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:NYBC    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:read-open-source-code    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:read-open-source-code    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:read-open-source-code    文件:Lucene43HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:read-open-source-code    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}
项目:Maskana-Gestor-de-Conocimiento    文件:HyphenationCompoundWordTokenFilter.java   
/**
 * Splits the current term at its hyphenation points and adds every accepted
 * sub-word to {@code tokens}.
 *
 * <p>Every pair of hyphenation points (earlier, later) delimits a candidate part.
 * A candidate is accepted when its length is within
 * [{@code minSubwordSize}, {@code maxSubwordSize}] and either no dictionary is
 * configured, the dictionary contains the part, or the dictionary contains the
 * part without its last character. When {@code onlyLongestMatch} is set, only the
 * longest accepted candidate per start point is added.
 */
@Override
protected void decompose() {
  final Hyphenation hyphenation =
      hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
  if (hyphenation == null) {
    // The hyphenator produced no result: nothing to decompose.
    return;
  }

  final int[] points = hyphenation.getHyphenationPoints();

  for (int from = 0; from < points.length; from++) {
    final int start = points[from];
    CompoundToken best = null;

    for (int to = from + 1; to < points.length; to++) {
      final int partLength = points[to] - start;

      // Too long: stop extending from this start point.
      if (partLength > this.maxSubwordSize) {
        break;
      }

      // Too short: skip. As the original authors noted, the subtraction above can
      // yield negative lengths, and this check (with minSubwordSize >= 0) is what
      // filters them out.
      if (partLength < this.minSubwordSize) {
        continue;
      }

      // Resolve the length that is actually accepted for this candidate.
      final int matchLength;
      if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
        matchLength = partLength;
      } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
        // Retry one character shorter to cope with genitive 's and other
        // binding characters.
        matchLength = partLength - 1;
      } else {
        continue;
      }

      if (!this.onlyLongestMatch) {
        tokens.add(new CompoundToken(start, matchLength));
      } else if (best == null || best.txt.length() < matchLength) {
        // Keep only a strictly longer match than the one remembered so far.
        best = new CompoundToken(start, matchLength);
      }
    }

    if (this.onlyLongestMatch && best != null) {
      tokens.add(best);
    }
  }
}