/**
 * Prints a short tour of four {@code CharSetUtils} calls ({@code count},
 * {@code delete}, {@code keep}, {@code squeeze}) to standard output.
 *
 * <p>Each result is preceded by the label line the demo prints for it. The
 * method makes no assertions; it only writes to {@code System.out}.
 */
@Test
public void charSetUtilsDemo() {
    final String pangram = "The quick brown fox jumps over the lazy dog.";
    final String vowels = "aeiou";

    System.out.println("**CharSetUtilsDemo**");

    // Label: "count how many of the given characters the string contains".
    System.out.println("计算字符串中包含某字符数.");
    System.out.println(CharSetUtils.count(pangram, vowels));

    // Label: "delete the given characters from the string".
    System.out.println("删除字符串中某字符.");
    System.out.println(CharSetUtils.delete(pangram, vowels));

    // Label: "keep the given characters of the string".
    System.out.println("保留字符串中某字符.");
    System.out.println(CharSetUtils.keep(pangram, vowels));

    // Label: "merge repeated characters".
    System.out.println("合并重复的字符.");
    System.out.println(CharSetUtils.squeeze("a bbbbbb c dd", "b d"));
}
/**
 * Reduces {@code text} to a lower-case token made only of the characters
 * {@code a}-{@code z} and {@code _}.
 *
 * <p>The text is lower-cased first and then filtered through
 * {@code CharSetUtils.keep} with the literal set
 * {@code "abcdefghijklmnopqrstuvwxyz_"}, so the result may be empty when no
 * character of the input survives the filter.
 *
 * @param text the raw text to normalise; must not be {@code null}
 *             ({@code toLowerCase} is invoked on it directly)
 * @return the filtered, lower-cased text
 */
private static String clean(String text) {
    // Locale.ROOT keeps the case mapping independent of the JVM's default
    // locale. With the no-arg overload a Turkish/Azeri default locale maps
    // 'I' to dotless 'ı', which the a-z filter below would then discard.
    String lowered = text.toLowerCase(java.util.Locale.ROOT);
    return CharSetUtils.keep(lowered, "abcdefghijklmnopqrstuvwxyz_");
}
/** * Find simple statements of type in regular text, such as "Diabetes is a * common disease" * * Subclasses are very similarly stated, such as "A hummingbird is a kind * of bird." But we don't distinguish between these yet. We should though. * * @return Pairs of nouns and their types. */ public static List<Pair<String, String>> extract(Phrase p) { List<Pair<String, String>> names_and_types = new ArrayList<>(); for (SemanticGraph graph: p.getGraphs()){ StringBuilder theory = new StringBuilder(); // Load data into a model // Add all the edges for (SemanticGraphEdge edge : graph.edgeIterable()) { // I like the specific prepositions better // so change them to match GrammaticalRelation rel = edge.getRelation(); String relation_name = rel.getShortName(); if ( (rel.getShortName().equals("prep") || rel.getShortName().equals("conj")) && rel.getSpecific() != null && !rel.getSpecific().isEmpty()) { relation_name = rel.getShortName() + "_" + CharSetUtils.keep(rel.getSpecific().toLowerCase(), "abcdefghijklmnopqrstuvwxyz"); } theory.append(relation_name); theory.append('('); theory.append(wordID(edge.getGovernor())); theory.append(','); theory.append(wordID(edge.getDependent())); theory.append(").\n"); } // Index the words for (IndexedWord word : graph.vertexSet()) { theory.append("tag("); theory.append(wordID(word)); theory.append(','); String tag = clean(word.tag()); theory.append(tag.isEmpty() ? 
"misc" : tag); theory.append(").\n"); } Prolog engine = new Prolog(); try { engine.setTheory(new Theory( Files.toString(new File("src/main/parse.pl"), Charset.forName("UTF-8")))); log.debug(theory); engine.addTheory(new Theory(theory.toString())); SolveInfo info = engine.solve("type_c(X, Y)."); // Get the resulting matches while (info.isSuccess()) { IndexedWord subj_idx = idWord(graph, info.getTerm("X").toString()); IndexedWord obj_idx = idWord(graph, info.getTerm("Y").toString()); if (subj_idx.tag().startsWith("NN") && obj_idx.tag().startsWith("NN")) { String noun = Trees.concatNoun(graph, subj_idx); String type = obj_idx.originalText(); //concatNoun(graph, obj_idx); log.info("Discovered " + noun + " is a(n) " + type); names_and_types.add(new Pair<>(noun,type)); } if (engine.hasOpenAlternatives()) { info = engine.solveNext(); } else { break; } } } catch (IOException | InvalidTheoryException | MalformedGoalException | NoSolutionException | NoMoreSolutionException | UnknownVarException e) { System.out.println(theory); e.printStackTrace(); } } return names_and_types; }