Java 类weka.core.stopwords.StopwordsHandler 实例源码

项目:AffectiveTweets    文件:TweetToFeatureVector.java   
/**
 * Gets the stopwords handler used to filter tokens before feature
 * extraction (per the option description, null means no stopwords
 * are used).
 *
 * @return the stopwords handler
 */
@OptionMetadata(displayName = "stopwordsHandler",
        description = "The stopwords handler to use (Null means no stopwords are used).",
        commandLineParamName = "stopwords-handler",
        commandLineParamSynopsis = "-stopwords-handler <string>", displayOrder = 5)
public StopwordsHandler getStopwordsHandler() {
    return m_stopwordsHandler;
}
项目:AffectiveTweets    文件:Utils.java   
/**
 * Tokenizes a String into a list of (optionally normalized and stemmed) tokens.
 *
 * @param content the content to tokenize
 * @param toLowerCase true to lowercase the content before tokenizing
 * @param standarizeUrlsUsers true to replace URLs and user mentions with generic placeholders
 * @param reduceRepeatedLetters true to collapse runs of a repeated letter to two occurrences
 * @param tokenizer the tokenizer used to split the content
 * @param stemmer the stemmer applied to each surviving token
 * @param stop the stopwords handler; null means no stopwords are filtered
 * @return a list of processed tokens
 */
static public List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers, boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

    if (toLowerCase)
        content = content.toLowerCase();

    // Collapse three or more consecutive occurrences of a letter to exactly
    // two (e.g. "coooool" -> "cool"). NOTE(review): only [a-z] is matched, so
    // uppercase runs survive when toLowerCase is false — confirm intended.
    if (reduceRepeatedLetters)
        content = content.replaceAll("([a-z])\\1+", "$1$1");

    List<String> tokens = new ArrayList<String>();

    tokenizer.tokenize(content);
    while (tokenizer.hasMoreElements()) {
        String token = tokenizer.nextElement();

        // A null handler means "no stopwords are used" (consistent with the
        // stopwordsHandler option description); previously a null handler
        // caused a NullPointerException here.
        if (stop != null && stop.isStopword(token)) {
            continue;
        }

        if (standarizeUrlsUsers) {
            // Replace URLs with a generic URL placeholder
            if (token.matches("http.*|ww\\..*|www\\..*")) {
                token = "http://www.url.com";
            }
            // Replace user mentions with a generic user placeholder
            else if (token.matches("@.*")) {
                token = "@user";
            }
        }

        tokens.add(stemmer.stem(token));
    }

    return tokens;
}
项目:AffectiveTweets    文件:TweetToFeatureVector.java   
/**
 * Sets the stopwords handler to use (null means no stopwords are used).
 *
 * @param stopwordsHandler the stopwords handler
 */
public void setStopwordsHandler(StopwordsHandler stopwordsHandler) {
    this.m_stopwordsHandler = stopwordsHandler;
}
项目:repo.kmeanspp.silhouette_score    文件:NaiveBayesMultinomialText.java   
/**
 * Gets the stopwords handler used when tokenizing document text.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:repo.kmeanspp.silhouette_score    文件:SGDText.java   
/**
 * Gets the stopwords handler used when tokenizing document text.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:repo.kmeanspp.silhouette_score    文件:StringToWordVector.java   
/**
 * Gets the stopwords handler applied while converting strings to
 * word vectors.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:umple    文件:NaiveBayesMultinomialText.java   
/**
 * Gets the stopwords handler used when tokenizing document text.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:umple    文件:SGDText.java   
/**
 * Gets the stopwords handler used when tokenizing document text.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:umple    文件:StringToWordVector.java   
/**
 * Gets the stopwords handler applied while converting strings to
 * word vectors.
 *
 * @return the stopwords handler currently in use
 */
public StopwordsHandler getStopwordsHandler() {
  return m_StopwordsHandler;
}
项目:cia    文件:WordCounterImpl.java   
/**
 * Computes a word -> count map for a document's content by running the text
 * through a Weka {@link StringToWordVector} filter with a unigram tokenizer.
 * Words shorter than five characters are discarded as stopwords; tokens are
 * lowercased and word counts (not binary presence) are emitted.
 */
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {

    final String htmlContent = documentContentData.getContent();

    // Build a single-instance dataset with one string attribute holding the html.
    final ArrayList<Attribute> attributes = new ArrayList<>();
    attributes.add(new Attribute(HTML, (ArrayList<String>) null));

    final Instances dataset = new Instances(HTML, attributes, 1);
    dataset.add(new DenseInstance(1));
    dataset.instance(0).setValue(0, htmlContent);

    // Treat every word shorter than five characters as a stopword.
    final StopwordsHandler shortWordFilter = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    // Unigram tokenizer (min and max n-gram size of 1).
    final NGramTokenizer unigramTokenizer = new NGramTokenizer();
    unigramTokenizer.setNGramMinSize(1);
    unigramTokenizer.setNGramMaxSize(1);
    unigramTokenizer.setDelimiters(TOKEN_DELIMITERS);

    final StringToWordVector wordVectorFilter = new StringToWordVector();
    wordVectorFilter.setTokenizer(unigramTokenizer);
    wordVectorFilter.setStopwordsHandler(shortWordFilter);
    wordVectorFilter.setLowerCaseTokens(true);
    wordVectorFilter.setOutputWordCounts(true);
    wordVectorFilter.setWordsToKeep(maxResult);

    final Map<String, Integer> wordCounts = new HashMap<>();

    try {
        wordVectorFilter.setInputFormat(dataset);
        final Instances filtered = Filter.useFilter(dataset, wordVectorFilter);
        final Instance lastInstance = filtered.lastInstance();

        // Each attribute of the filtered instance is a word; its value is the count.
        for (int i = 0; i < lastInstance.numAttributes(); i++) {
            wordCounts.put(lastInstance.attribute(i).name(), Integer.valueOf(lastInstance.toString(i)));
        }
    } catch (final Exception e) {
        // Best effort: on any filtering failure, log and return what we have.
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return wordCounts;
}