Example source code for the Java class weka.core.tokenizers.NGramTokenizer

Project: ICDClassifier    File: CipeClassifier.java
private StringToWordVector parseTokenizer(StringToWordVector filter) {
    switch (Constants.CONFIG.getTokenizer()) {
    case ALPHABETIC:
        // Avoid. Does not support diacritics (ã, á, é, etc.)
        filter.setTokenizer(new AlphabeticTokenizer());
        break;
    case WORD:
        NGramTokenizer tokenizer = new NGramTokenizer();
        // The minimum n-gram size keeps Weka's default of 1; only the maximum is configured.
        tokenizer.setNGramMaxSize(Constants.CONFIG.getNGrams());
        filter.setTokenizer(tokenizer);
        break;
    case OPENNLP:
        // TODO use WordTokenizer with a specific delimiter set via options
        // and printed by OpenNLP.
        break;
    case COGROO:
        // TODO use WordTokenizer with a specific delimiter set via options
        // and printed by CoGrOO.
        break;
    default:
        throw new IllegalArgumentException(Constants.CONFIG.getTokenizer() + " is not implemented.");
    }
    return filter;
}
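A minimal sketch of how a filter configured by this method is typically applied, assuming it is called from the same class and that `data` is an already-loaded Instances object with a string attribute (Filter.useFilter is the standard Weka entry point):

StringToWordVector filter = parseTokenizer(new StringToWordVector());
// Learn the dictionary from the input data, then vectorize it.
filter.setInputFormat(data);
Instances vectorized = Filter.useFilter(data, filter);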
Project: mathosphere    File: WekaLearner.java
private StringToWordVector getStringToWordVectorFilter(Instances instances) throws Exception {
  StringToWordVector stringToWordVector = new StringToWordVector();
  stringToWordVector.setAttributeIndices(indicesToRangeList(new int[]{
    instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_TWO_PRECEDING_AND_FOLLOWING_TOKENS_AROUND_THE_DESC_CANDIDATE).index(),
    instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_THREE_PRECEDING_AND_FOLLOWING_TOKENS_AROUND_THE_PAIRED_MATH_EXPR).index(),
    instances.attribute(SURFACE_TEXT_OF_THE_FIRST_VERB_THAT_APPEARS_BETWEEN_THE_DESC_CANDIDATE_AND_THE_TARGET_MATH_EXPR).index(),
    instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_DEPENDENCY_WITH_LENGTH_3_FROM_IDENTIFIER).index(),
    instances.attribute(SURFACE_TEXT_AND_POS_TAG_OF_DEPENDENCY_WITH_LENGTH_3_FROM_DEFINIEN).index()}));
  stringToWordVector.setWordsToKeep(1000);
  NGramTokenizer nGramTokenizer = new NGramTokenizer();
  nGramTokenizer.setNGramMaxSize(3);
  nGramTokenizer.setNGramMinSize(1);
  nGramTokenizer.setDelimiters(nGramTokenizer.getDelimiters().replaceAll(":", "")); // drop ':' from the default delimiters so colon-joined tokens survive
  stringToWordVector.setTokenizer(nGramTokenizer);
  stringToWordVector.setInputFormat(instances);
  return stringToWordVector;
}
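For reference, the configured tokenizer can also be exercised on its own; a minimal sketch using the standard weka.core.tokenizers.Tokenizer enumeration methods (the sample sentence is only an illustration):

NGramTokenizer ngt = new NGramTokenizer();
ngt.setNGramMinSize(1);
ngt.setNGramMaxSize(3);
ngt.tokenize("the quick brown fox");
while (ngt.hasMoreElements()) {
    // Prints every 1-, 2- and 3-gram of the input, one per line.
    System.out.println(ngt.nextElement());
}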
Project: movie-rating-prediction    File: SVMPredictorImpl.java
/**
 * Creates a {@link StringToWordVector} filter with a 3-gram {@link Tokenizer}
 * and stop word handling.
 *
 * @param instances the model which is to be filtered
 * @return the filter
 * @throws Exception if filter creation fails
 */
private StringToWordVector createFilter(Instances instances) throws Exception {
    NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMaxSize(3);

    WordsFromFile stopwordsHandler = new WordsFromFile();
    stopwordsHandler.setStopwords(FileUtils.loadFile(resourceLoader,
            dataConfig.getBaseDataDirectory() + dataConfig.getStopWordsDirectory()));

    StringToWordVector stwv = new StringToWordVector();
    stwv.setTokenizer(tokenizer);
    stwv.setTFTransform(true);
    stwv.setIDFTransform(true);
    stwv.setStopwordsHandler(stopwordsHandler);
    stwv.setLowerCaseTokens(true);
    stwv.setInputFormat(instances);
    return stwv;
}
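This filter is typically wired to the classifier through Weka's FilteredClassifier, so the same vectorization is applied at training and prediction time. A minimal sketch, assuming `trainingData` is the Instances passed in above and with SMO standing in for whichever SVM implementation the project actually uses:

FilteredClassifier fc = new FilteredClassifier();
fc.setFilter(createFilter(trainingData));               // the TF-IDF filter built above
fc.setClassifier(new weka.classifiers.functions.SMO());
fc.buildClassifier(trainingData);                       // filter and SVM are trained together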
Project: french-sentiment-classification    File: Tokenisation.java
public static StringToWordVector WordNgrams(Properties prop) throws Exception{
    final StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setOutputWordCounts(false);
    filter.setTFTransform(false);
    filter.setIDFTransform(false);
    //if (prop.getProperty("Preprocessings.removeStopWords").equalsIgnoreCase("yes")) filter.setStopwords(new File("ressources//MotsVides.txt"));
    filter.setWordsToKeep(10000);
    filter.setMinTermFreq(1);
    NGramTokenizer tok = new NGramTokenizer();
    tok.setDelimiters(" \n  .,;'\"()?!-/<>‘’“”…«»•&{[|`^]}$*%");
    tok.setNGramMinSize(Integer.parseInt(prop.getProperty("Ngrams.min")));
    tok.setNGramMaxSize(Integer.parseInt(prop.getProperty("Ngrams.max")));
    filter.setTokenizer(tok);

    return filter;
}
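Called with a Properties object that supplies the n-gram bounds, the returned filter can be applied directly. A minimal sketch, assuming `corpus` is a loaded Instances object (the property keys match those read above):

Properties prop = new Properties();
prop.setProperty("Ngrams.min", "1");
prop.setProperty("Ngrams.max", "2");
StringToWordVector filter = WordNgrams(prop);
// Learn the n-gram vocabulary, then produce the bag-of-n-grams dataset.
filter.setInputFormat(corpus);
Instances bagOfNgrams = Filter.useFilter(corpus, filter);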
Project: wekaDeeplearning4j    File: NGramTokenizerFactory.java
@Override
public Tokenizer create(String toTokenize) {

  this.wekaTokenizer = new NGramTokenizer();
  this.wekaTokenizer.setNGramMinSize(this.nMin);
  this.wekaTokenizer.setNGramMaxSize(this.nMax);
  this.wekaTokenizer.setDelimiters(this.delimiters);

  WekaTokenizer t = new WekaTokenizer(toTokenize, wekaTokenizer);
  t.setTokenPreProcessor(tokenPreProcess);
  return t;
}
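On the Deeplearning4j side, the factory's output is consumed token by token through the DL4J Tokenizer interface. A minimal sketch, assuming `factory` is a fully configured NGramTokenizerFactory and the sample text is only an illustration:

Tokenizer t = factory.create("some input text");
while (t.hasMoreTokens()) {
    // Each token has already passed through the configured pre-processor.
    System.out.println(t.nextToken());
}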
Project: cia    File: WordCounterImpl.java
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {

    final String html = documentContentData.getContent();

    final Attribute input = new Attribute(HTML, (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);

    final Instances htmlInst = new Instances(HTML, inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);


    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {
            // Treat any word shorter than five characters as a stopword.
            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(TOKEN_DELIMITERS);

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String,Integer> result = new HashMap<>();

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for: {}, exception: {}", documentContentData.getId(), e);
    }


    return result;
}
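A short sketch of how the returned map might be consumed, for example to list the counted words in descending order (assumes a `wordCounter` instance and a `documentContentData` value obtained elsewhere):

Map<String, Integer> counts = wordCounter.calculateWordCount(documentContentData, 100);
counts.entrySet().stream()
      .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
      .forEach(e -> System.out.println(e.getKey() + " : " + e.getValue()));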