Java class weka.core.stemmers.SnowballStemmer example source code
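Weka's SnowballStemmer wraps the Snowball stemming algorithms behind Weka's Stemmer interface. Before the project examples, a minimal usage sketch (assuming the Snowball classes are on the classpath; per Weka's documentation the default algorithm is "porter", and setStemmer(String) selects another one):

import weka.core.stemmers.SnowballStemmer;

public class SnowballDemo {
    public static void main(String[] args) {
        SnowballStemmer stemmer = new SnowballStemmer(); // defaults to the "porter" algorithm
        // stem() operates on a single word, not a whole sentence
        System.out.println(stemmer.stem("running"));     // expected: "run"
    }
}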

Project: TableDisentangler    File: FreqExtractor.java
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.LinkedList;
import java.util.Scanner;

import weka.core.stemmers.SnowballStemmer;

public LinkedList<String> makeExtractList(String filePath)
{
    LinkedList<String> extractList = new LinkedList<String>();
    File file = new File(filePath);
    // Create the stemmer once instead of once per line
    SnowballStemmer porter = new SnowballStemmer();
    // try-with-resources closes the Scanner even if an exception is thrown
    try (Scanner sc = new Scanner(new FileInputStream(file))) {
        while (sc.hasNextLine()) {
            String content = sc.nextLine();
            // Note: stem() is designed for single words; as in the original,
            // each whole line is passed to it as one token here
            content = porter.stem(content);
            System.out.println(content);
            extractList.add(content);
        }
    } catch (FileNotFoundException fnf) {
        fnf.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("\nProgram terminated safely...");
    }
    return extractList;
}
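Since Weka's stem() expects a single word, a variant that splits each line on whitespace before stemming may be closer to what a frequency extractor needs. A minimal sketch (this helper and its whitespace split are assumptions, not part of the original project):

import weka.core.stemmers.SnowballStemmer;

// Hypothetical helper: stem each whitespace-separated word of a line,
// rather than passing the whole line to stem() as above
public static String stemLine(SnowballStemmer stemmer, String line) {
    StringBuilder sb = new StringBuilder();
    for (String word : line.split("\\s+")) {
        sb.append(stemmer.stem(word)).append(' ');
    }
    return sb.toString().trim();
}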
Project: wekaDeeplearning4j    File: RnnSequenceClassifierTest.java
@Test
public void testConfigRotation() throws Exception {
  Map<String, String> failedConfigs = new HashMap<>();

  tii = new RnnTextEmbeddingInstanceIterator();
  tii.setWordVectorLocation(modelSlim);
  data = DatasetLoader.loadAnger();

  // Reduce dataset size
  RemovePercentage rp = new RemovePercentage();
  rp.setPercentage(98);
  rp.setInputFormat(data);
  data = Filter.useFilter(data, rp);

  RnnOutputLayer out = new RnnOutputLayer();
  out.setLossFn(new LossMSE());
  out.setActivationFunction(new ActivationIdentity());

  final Dl4jWordsFromFile wff = new Dl4jWordsFromFile();
  wff.setStopwords(new File("src/test/resources/stopwords/english.txt"));
  // Iterate stopwords
  for (Dl4jAbstractStopwords sw :
      new Dl4jAbstractStopwords[] {new Dl4jRainbow(), new Dl4jNull(), wff}) {
    tii.setStopwords(sw);

    final StemmingPreprocessor spp = new StemmingPreprocessor();
    spp.setStemmer(new SnowballStemmer());
    // Iterate TokenPreProcess
    for (TokenPreProcess tpp :
        new TokenPreProcess[] {
          new CommonPreprocessor(), new EndingPreProcessor(), new LowCasePreProcessor(), spp
        }) {
      tii.setTokenPreProcess(tpp);

      // Iterate tokenizer factories
      for (TokenizerFactory tf :
          new TokenizerFactory[] {
            new DefaultTokenizerFactory(),
            new CharacterNGramTokenizerFactory(),
            new TweetNLPTokenizerFactory(),
          }) {
        tii.setTokenizerFactory(tf);

        // Create clean classifier
        clf = new RnnSequenceClassifier();
        clf.setNumEpochs(1);
        clf.setLayers(out);
        clf.setInstanceIterator(tii);
        clf.settBPTTforwardLength(3);
        clf.settBPTTbackwardLength(3);

        String conf =
            "\n - TokenPreProcess: "
                + tpp.getClass().getSimpleName()
                + "\n - TokenizerFactory: "
                + tf.getClass().getSimpleName()
                + "\n - StopWords: "
                + sw.getClass().getSimpleName();
        log.info(conf);
        try {
          clf.buildClassifier(data);
        } catch (Exception e) {
          failedConfigs.put(conf, e.toString());
        }
      }
    }
  }

  // Check if anything failed
  if (!failedConfigs.isEmpty()) {

    final String err =
        failedConfigs
            .entrySet()
            .stream()
            .map(e -> "Config failed: " + e.getKey() + "\nException: " + e.getValue())
            .collect(Collectors.joining("\n"));

    Assert.fail("Some of the configs failed:\n" + err);
  }
}
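The StemmingPreprocessor in the test above plugs a Weka stemmer into deeplearning4j's token pipeline. A standalone sketch of the same wiring, assuming StemmingPreprocessor implements dl4j's TokenPreProcess interface (whose preProcess(String) takes one token), as the test's usage suggests; the import path is an assumption and may differ between wekaDeeplearning4j versions:

import weka.core.stemmers.SnowballStemmer;
// Import path is an assumption; adjust to the wekaDeeplearning4j version in use
import weka.dl4j.text.tokenization.tokenizer.preprocessor.StemmingPreprocessor;

public class StemPreprocessDemo {
    public static void main(String[] args) {
        StemmingPreprocessor spp = new StemmingPreprocessor();
        spp.setStemmer(new SnowballStemmer());          // same wiring as in the test
        System.out.println(spp.preProcess("running"));  // preProcess(String) from TokenPreProcess (assumption)
    }
}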