Java 类org.jsoup.nodes.Entities 实例源码

项目:common    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // custom settings: single line, named entity
    assertEquals("<div><p>&bernou;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // switching the same settings object to base escape mode is expected to yield a numeric reference
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:greenpepper    文件:GreenPepperRepositoryTest.java   
private void assertSpecification( String expectedSpec, Document actualDoc)
  {
      assertNotNull( actualDoc );

      // Render the document under test into an in-memory string.
      StringWriter rendered = new StringWriter();
      actualDoc.print( new PrintWriter( rendered ) );

      // Expected side: parse the specification and serialise its body with
      // XHTML escaping and no pretty-printing.
      org.jsoup.nodes.Document expectedDoc = Jsoup.parse(expectedSpec);
      expectedDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
      String expectedHtml = expectedDoc.body().outerHtml();

      // Actual side: same serialisation settings, after removing the elements
      // matched by "style:first-of-type" from the body.
      org.jsoup.nodes.Document resultDoc = Jsoup.parse(rendered.toString());
      resultDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
      Element resultBody = resultDoc.body();
      resultBody.select("style:first-of-type").remove();

      assertEquals( expectedHtml, resultBody.outerHtml() );
  }
项目:greenpepper    文件:AtlassianRepositoryTest.java   
private void assertSpecification( Document doc )
{
    assertNotNull( doc );

    // Render the document under test into an in-memory string.
    StringWriter rendered = new StringWriter();
    doc.print( new PrintWriter( rendered ) );

    // Expected side: the reference specification, serialised with XHTML
    // escaping and no pretty-printing.
    org.jsoup.nodes.Document expectedDoc = Jsoup.parse(specification());
    expectedDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    String expectedHtml = expectedDoc.body().outerHtml();

    // Actual side: same serialisation settings, after removing the elements
    // matched by "style:first-of-type" from the body.
    org.jsoup.nodes.Document resultDoc = Jsoup.parse(rendered.toString());
    resultDoc.outputSettings().escapeMode(Entities.EscapeMode.xhtml).prettyPrint(false);
    Element resultBody = resultDoc.body();
    resultBody.select("style:first-of-type").remove();

    Assert.assertEquals( expectedHtml, resultBody.outerHtml() );
}
项目:CN1ML-NetbeansModule    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // custom settings: single line, named entity
    assertEquals("<div><p>&bernou;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // switching the same settings object to base escape mode is expected to yield a numeric reference
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:astor    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // custom settings: single line, named entity
    assertEquals("<div><p>&bernou;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // switching the same settings object to base escape mode is expected to yield a numeric reference
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:astor    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // the shorter entity name is expected when several names alias the same character
    assertEquals("<div><p>&Bscr;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // switching the same settings object to base escape mode is expected to yield a numeric reference
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:astor    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended)
        .charset("ascii");

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // the shorter entity name is expected when several names alias the same character
    assertEquals("<div><p>&Bscr;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // switching the same settings object to base escape mode is expected to yield a numeric reference
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:lyrics    文件:KaraokeTexty.java   
/**
 * Downloads the page at {@code getURL()} and extracts the inner HTML of the first
 * element matching {@code p.text}, passed through {@code Library.replacing}.
 *
 * @return the processed markup, or a fixed error message when the result is empty
 *         (nothing matched or the download failed with an {@code IOException})
 */
@Override
public String parsing() {
    String output="";

    try {
        Document doc=Jsoup.connect(super.getURL()).get();
        Elements lyr=doc.select("p.text");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Elements.first() yields null for an empty selection; check for that explicitly
        // rather than provoking and catching a NullPointerException.
        if(lyr.isEmpty()) {
            System.err.println("No element matches selector p.text");
        } else {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }

    if(output.isEmpty()) { output+="Error: There are no lyrics for this artist and song in the database."; }
    return output;
}
项目:lyrics    文件:LyricWiki.java   
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the {@code .lyricbox}
 * selection after removing helper markup, scripts and comments.
 *
 * @return the processed markup, a fixed notice for instrumental songs, or a fixed
 *         error message when the result is empty
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();

        // strip everything that must not end up in the extracted markup
        page.select(".rtMatcher").remove();
        page.select(".lyricsBreak").remove();
        page.select("script").remove();
        Library.removeComments(page);

        Elements box=page.select(".lyricbox");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        lyrics=box.html();
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    // a <span ... title="Instrumental"> in the result marks a song without lyrics
    boolean instrumental=lyrics.contains("<span") && lyrics.contains("title=\"Instrumental\"");
    if(instrumental) {
        lyrics="This is an instrumental song with no lyrics.";
    }
    if(lyrics.isEmpty()) { lyrics+="Error: There are no lyrics for this artist and song in the database."; }
    return lyrics;
}
项目:jsoup-learning    文件:CleanerTest.java   
@Test public void supplyOutputSettings() {
    // Caller-supplied output settings must be honoured by Jsoup.clean instead of the defaults.
    final String baseUri = "http://foo.com/";
    final String html = "<div><p>&bernou;</p></div>";

    Document.OutputSettings os = new Document.OutputSettings()
        .prettyPrint(false)
        .escapeMode(Entities.EscapeMode.extended);

    String customOut = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    String defaultOut = Jsoup.clean(html, baseUri, Whitelist.relaxed());
    assertNotSame(defaultOut, customOut);

    // custom settings: single line, named entity
    assertEquals("<div><p>&bernou;</p></div>", customOut);
    // default settings: pretty-printed, literal character
    assertEquals("<div>\n <p>ℬ</p>\n</div>", defaultOut);

    // with an ASCII charset and base escape mode a numeric reference is expected
    os.charset("ASCII").escapeMode(Entities.EscapeMode.base);
    String customOut2 = Jsoup.clean(html, baseUri, Whitelist.relaxed(), os);
    assertEquals("<div><p>&#x212c;</p></div>", customOut2);
}
项目:lyrics    文件:KaraokeTexty.java   
/**
 * Downloads the page at {@code getURL()} and extracts the inner HTML of the first
 * element matching {@code p.text}, passed through {@code Library.replacing}.
 *
 * @return the processed markup, or a fixed error message when the result is empty
 *         (nothing matched or the download failed with an {@code IOException})
 */
@Override
public String parsing() {
    String output="";

    try {
        Document doc=Jsoup.connect(super.getURL()).get();
        Elements lyr=doc.select("p.text");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Elements.first() yields null for an empty selection; check for that explicitly
        // rather than provoking and catching a NullPointerException.
        if(lyr.isEmpty()) {
            System.err.println("No element matches selector p.text");
        } else {
            output=lyr.first().html();
        }
        output=Library.replacing(output);
    } catch(IOException ioe) { System.err.println(ioe); }

    if(output.isEmpty()) { output+="Error: There are no lyrics for this artist and song in the database."; }
    return output;
}
项目:lyrics    文件:LyricWiki.java   
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the {@code .lyricbox}
 * selection after removing helper markup, scripts and comments.
 *
 * @return the processed markup, a fixed notice for instrumental songs, or a fixed
 *         error message when the result is empty
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();

        // strip everything that must not end up in the extracted markup
        page.select(".rtMatcher").remove();
        page.select(".lyricsBreak").remove();
        page.select("script").remove();
        Library.removeComments(page);

        Elements box=page.select(".lyricbox");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        lyrics=box.html();
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    // a <span ... title="Instrumental"> in the result marks a song without lyrics
    boolean instrumental=lyrics.contains("<span") && lyrics.contains("title=\"Instrumental\"");
    if(instrumental) {
        lyrics="This is an instrumental song with no lyrics.";
    }
    if(lyrics.isEmpty()) { lyrics+="Error: There are no lyrics for this artist and song in the database."; }
    return lyrics;
}
项目:metka    文件:DDIReadSectionBase.java   
/**
 * Returns the content of an XML object as a string.
 * <p>
 * The object's XML text is parsed with jsoup. When the parsed body holds exactly one
 * text node, the plain text value read through an {@code XmlCursor} is returned.
 * Otherwise every child node is traversed with a {@code DDIReadNodeVisitor} and
 * serialised (HTML syntax, xhtml escape mode) into the result.
 *
 * @param att the XML object to read; may be {@code null}
 * @param <T> concrete XML object type
 * @return the extracted text or markup; an empty string for {@code null} input or a
 *         {@code null} text value
 */
protected <T extends XmlObject> String getText(T att) {
    if(att == null) return "";

    Document doc = Jsoup.parse(att.xmlText());
    doc.outputSettings().syntax(Document.OutputSettings.Syntax.html);
    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    List<Node> childNodes = doc.body().childNodes();

    if ( childNodes.size() == 1 && "#text".equals(childNodes.get(0).nodeName())) {
        // only text, parse as plaintext.
        XmlCursor cursor = att.newCursor();
        try {
            String value = cursor.getTextValue();
            return value == null ? "" : value;
        } finally {
            // release the cursor even when reading the value fails
            cursor.dispose();
        }
    } else {
        StringBuilder sb = new StringBuilder();
        for (Node child : childNodes) {
            child.traverse(new DDIReadNodeVisitor());
            child.html(sb);
        }
        return sb.toString();
    }
}
项目:docx4j-template    文件:XHTMLImporterUtils.java   
/**
 * Imports the markup of a jsoup document into a WordprocessingML package.
 *
 * @param wmlPackage the target package
 * @param doc        the jsoup document; its output settings are switched to XML syntax
 *                   with the xhtml escape mode
 * @param fragment   when {@code true} only the inner HTML of the body is imported,
 *                   otherwise the whole document
 * @param altChunk   when {@code true} the markup is attached to the main document part as
 *                   an XHTML altChunk and converted via {@code convertAltChunks()};
 *                   otherwise it is converted with {@code XHTMLImporterImpl} and appended
 *                   to the main document part
 * @return the package returned by {@code convertAltChunks()} in altChunk mode, otherwise
 *         {@code wmlPackage} itself
 */
public static WordprocessingMLPackage handle(WordprocessingMLPackage wmlPackage, Document doc,boolean fragment,boolean altChunk) throws IOException, Docx4JException {
    // Serialise as XHTML. Configured once here; both branches below rely on it.
    doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);

    if(altChunk){
        MainDocumentPart document = wmlPackage.getMainDocumentPart();
        // charset used to encode the XHTML chunk, read from the docx4j properties
        String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
        document.addAltChunk(AltChunkType.Xhtml, (fragment ? doc.body().html() : doc.html()) .getBytes(Charset.forName(charsetName)));
        // hand back the package produced by converting the altChunk
        return document.convertAltChunks();
    }

    // Convert the XHTML into docx4j content objects and append them to the main part.
    XHTMLImporterImpl xhtmlImporter = new XHTMLImporterImpl(wmlPackage);
    List<Object> list = xhtmlImporter.convert((fragment ? doc.body().html() : doc.html()), doc.baseUri());
    wmlPackage.getMainDocumentPart().getContent().addAll(list);
    return wmlPackage;
}
项目:docx4j-template    文件:HtmlConverter.java   
/**
 * Fetches a page as a {@link org.jsoup.nodes.Document} prepared for XHTML output.
 * <p>
 * All {@code script} elements are removed, the {@code onclick} and {@code href}
 * attributes are dropped from every {@code a} element, and the {@code href} of every
 * {@code link} element is replaced by its absolute URL. The document's output settings
 * are set to XML syntax with the xhtml escape mode.
 *
 * @param url address of the page to download
 * @return the processed document
 * @throws Exception if fetching or processing the page fails
 */
protected Document url2xhtml(String url) throws Exception {
    Document doc = Jsoup.connect(url).get();

    if (logger.isDebugEnabled()) {
        logger.debug("baseUri: {}", doc.baseUri());
    }

    // drop every script element
    doc.getElementsByTag("script").remove();

    // anchors lose their onclick and href attributes
    for (Element anchor : doc.getElementsByTag("a")) {
        anchor.removeAttr("onclick").removeAttr("href");
    }

    // rewrite link hrefs to absolute addresses
    for (Element link : doc.getElementsByTag("link")) {
        String absolute = link.absUrl("href");

        if (logger.isDebugEnabled()) {
            logger.debug("href: {} -> {}", link.attr("href"), absolute);
        }

        link.attr("href", absolute);
    }

    // serialise as XHTML
    doc.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);

    if (logger.isDebugEnabled()) {
        // dump the resulting markup line by line, numbered from 1
        int lineNo = 0;
        for (String line : doc.html().split("\n")) {
            logger.debug("line {}:\t{}", ++lineNo, line);
        }
    }
    return doc;
}
项目:docx4j-template    文件:HtmlToDOCDemo.java   
/**
 * Converts HTML content into the content list of the package's main document part by
 * attaching it as an XHTML altChunk and converting it.
 * <p>
 * The template body is printed to standard output, the converted package is saved to
 * {@code d://temp.docx}, and the original body is put back in a {@code finally} block.
 *
 * @param wordMLPackage the package whose main document part is used
 * @param content       the HTML to convert
 * @return the content list obtained from the main document part after conversion
 */
private static List<Object> convertToWmlObject(
        WordprocessingMLPackage wordMLPackage, String content)
        throws Docx4JException, JAXBException {
    MainDocumentPart mainPart = wordMLPackage.getMainDocumentPart();
    // charset used to encode the XHTML chunk, read from the docx4j properties
    String charsetName = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_CONVERT_OUT_WMLTEMPLATE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );

    List<Object> wmlObjList = null;
    // dump the template body for inspection
    System.out.println(XmlUtils.marshaltoString(mainPart.getContents().getBody()));
    Body templateBody = mainPart.getContents().getBody();
    try {
        // work on a copy of the body and start from empty content
        mainPart.getContents().setBody(XmlUtils.deepCopy(templateBody));
        mainPart.getContent().clear();

        Document doc = Jsoup.parse(content);
        doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);

        byte[] xhtmlBytes = doc.html().getBytes(Charset.forName(charsetName));
        mainPart.addAltChunk(AltChunkType.Xhtml, xhtmlBytes);

        WordprocessingMLPackage converted = mainPart.convertAltChunks();
        converted.save(new File("d://temp.docx"));
        wmlObjList = mainPart.getContent();
    } finally {
        // always restore the original template body
        mainPart.getContents().setBody(templateBody);
    }
    return wmlObjList;
}
项目:fastcrawler    文件:HtmlData.java   
/**
 * Clears the entity maps of jsoup's base, extended and xhtml escape modes when
 * DISABLE_HTML_ENTITY_ESCAPE is set and INITED is not yet set, then sets INITED.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = {
        Entities.EscapeMode.base,
        Entities.EscapeMode.extended,
        Entities.EscapeMode.xhtml
    };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
项目:Android-App    文件:NewsArticle.java   
/**
 * Formats the body of a news article for display.
 * <p>
 * Jsoup is used to cut the content cell out of the page (dropping the web article
 * header); regular expressions then remove the "-End-" and "-Read-More-" blocks and
 * rewrite point-based font sizes to a pixel size scaled by the system font scale.
 *
 * @param html Unformatted HTML String, usually straight from the parser or Volley's cache
 * @return Formatted String, ready to be placed within NewsDetailActivity's WebView, or other
 */
public static String formatContent(String html) {

    Document parsed = Jsoup.parse(html);
    parsed.outputSettings()
            .charset("ASCII")
            .escapeMode(Entities.EscapeMode.extended)
            .prettyPrint(false);

    // Content lives in the second cell of the second row of the last table.
    String content = parsed.getElementsByTag("table").last()
            .getElementsByTag("tr").get(1)
            .getElementsByTag("td").get(1)
            .html();

    // Drop the -End- and -Read-More- blocks created by fccms.psdr3.org
    content = content
            .replaceFirst("<div.+-End-.+<\\/div>", "")
            .replaceFirst("<div.+-Read-More-.+<\\/div>", "");

    // Text size override: 15 is the base value, scaled by the system font scale.
    int scaledSize = (int) (15 * Resources.getSystem().getConfiguration().fontScale);
    content = content.replaceAll("font-size:\\d+pt;", "font-size:" + scaledSize + "px;");

    // A trailing line break pads the content at the bottom of the WebView.
    return content + "<br>";
}
项目:zongtui-webcrawler    文件:Html.java   
/**
 * Turns off jsoup's HTML entity escaping by clearing the entity maps of the base and
 * extended escape modes. This is a hack intended only for jsoup 1.7.2. It does nothing
 * unless DISABLE_HTML_ENTITY_ESCAPE is set and INITED is still unset; INITED is set afterwards.
 */
private void disableJsoupHtmlEntityEscape() {
    if (!DISABLE_HTML_ENTITY_ESCAPE || INITED) {
        return;
    }
    Entities.EscapeMode[] modes = {
        Entities.EscapeMode.base,
        Entities.EscapeMode.extended
    };
    for (Entities.EscapeMode mode : modes) {
        mode.getMap().clear();
    }
    INITED = true;
}
项目:herd    文件:HerdStringUtils.java   
/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist
 * <p>
 * Each whitelist entry is reduced to its alphabetic characters before use, so an entry may be
 * given either as a bare name or with surrounding markup characters. Retained tags may also keep
 * their {@code class} attribute.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags)
{

    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);

    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags)
    {
        // Get the actual tag name from the whitelist tag
        // this is vulnerable in general to complex tags but will suffice for our simple needs.
        // The property escape must be spelled \p{IsAlphabetic}; without the 'p' the expression is a
        // plain character class that only keeps the literal characters '{', 'I', 's', 'A', 'l', ...
        // and therefore mangles most tag names.
        whitelistTag = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");

        // Add all specified tags to the whitelist and allow their class attribute
        whitelist.addTags(whitelistTag).addAttributes(whitelistTag, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}
项目:astor    文件:HtmlParserTest.java   
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
    // Base entities are expected to match without a trailing ';', extended ones only with it.
    Document doc = Jsoup.parse("&amp &quot &reg &icy &hopf &icy; &hopf;");

    // The output settings are changed only to make the expected string easier to read.
    Document.OutputSettings settings = doc.outputSettings();
    settings.escapeMode(Entities.EscapeMode.extended);
    settings.charset("ascii");

    String actual = doc.body().html();
    assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", actual);
}
项目:astor    文件:HtmlParserTest.java   
@Test public void relaxedBaseEntityMatchAndStrictExtendedMatch() {
    // Base entities are expected to match without a trailing ';', extended ones only with it.
    Document doc = Jsoup.parse("&amp &quot &reg &icy &hopf &icy; &hopf;");

    // The output settings are changed only to make the expected string easier to read.
    Document.OutputSettings settings = doc.outputSettings();
    settings.escapeMode(Entities.EscapeMode.extended);
    settings.charset("ascii");

    String actual = doc.body().html();
    assertEquals("&amp; \" &reg; &amp;icy &amp;hopf &icy; &hopf;", actual);
}
项目:lyrics    文件:KaraokeTexty.java   
/**
 * Searches karaoketexty.cz for the song and stores the URL of the first result whose
 * text equals (ignoring case) or contains "artist - song"; when no result qualifies,
 * the search URL built from that query is stored instead.
 */
@Override
public void makeURL() {
    final String site="http://www.karaoketexty.cz";
    String query=super.getArtist()+" - "+super.getSong();
    String searchURL="";

    try {
        // the search request itself is built from the song title only
        searchURL=new URI("http","www.karaoketexty.cz","/search","q="+super.getSong(),null)
                .toASCIIString()
                .replace("&","%26");
    } catch(URISyntaxException use) { System.err.println(use); }

    try {
        Document doc=Jsoup.connect(searchURL).get();
        Elements links=doc.select("#search > ul.title > li > a");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        for(Element link:links) {
            String resultText=Library.replacing(link.text());
            // an exact (case-insensitive) hit and a containing hit are handled identically
            if(resultText.equalsIgnoreCase(query) || resultText.contains(query)) {
                super.setURL(site+link.attr("href"));
                return;
            }
        }
        super.setURL(site+"/search?q="+query);
    } catch(IOException ioe) { System.err.println(ioe); }
}
项目:lyrics    文件:MetroLyrics.java   
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the
 * {@code #lyrics-body-text} selection with verse paragraphs turned into line breaks.
 *
 * @return the processed markup, or a fixed error message when the result is empty
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        Elements body=page.select("#lyrics-body-text");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        // verse paragraphs: opening tag removed, closing tag becomes two line breaks
        lyrics=body.html()
                .replace("<p class=\"verse\">","")
                .replace("</p>","<br/><br/>");
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    if(lyrics.isEmpty()) { lyrics+="Error: There are no lyrics for this artist and song in the database."; }
    return lyrics;
}
项目:lyrics    文件:Lastfm.java   
/**
 * Obtains information.
 *
 * @param method Method which is needed to call.
 * @param info Information which is needed to obtain; used as the selector on the response document.
 * @return Inner HTML of the first element matching {@code info}, processed by
 *         {@code Library.replacing}; an empty string when nothing matches or the
 *         request fails with an {@code IOException}.
 */
public String obtainInformation(String method,String info) {
    String url=createAPIrequestURL(method);
    String output="";
    try {
        Document doc=Jsoup.connect(url).get();
        Elements lyr=doc.select(info);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Elements.first() is null for an empty selection; guard against it so a
        // missing element yields an empty result instead of a NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
            output=Library.replacing(output);
        }
    } catch(IOException ioe) { System.err.println(ioe); }

    return output;
}
项目:jcabi-http    文件:JsoupResponse.java   
/**
 * Returns the parent's body parsed by jsoup and serialised with XML syntax and the
 * xhtml escape mode.
 */
@Override
public String body() {
    final Document html = Jsoup.parse(super.body());
    html.outputSettings()
        .syntax(Document.OutputSettings.Syntax.xml)
        .escapeMode(Entities.EscapeMode.xhtml);
    return html.html();
}
项目:yarg    文件:HtmlImportProcessorImpl.java   
/**
 * Parses the source with jsoup, runs {@code processHtmlDocument} on it and serialises
 * the result with XML syntax, the xhtml escape mode and no pretty-printing.
 */
@Override
public String processHtml(String source) {
    org.jsoup.nodes.Document document = Jsoup.parse(source);
    processHtmlDocument(document);

    org.jsoup.nodes.Document.OutputSettings settings = document.outputSettings();
    settings.syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
    settings.prettyPrint(false);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    return document.html();
}
项目:Tanaguru    文件:Rgaa3Extractor.java   
/**
 * Rewrites the {@code .test-detail} element of every HTML testcase found below
 * RGAA3_TESTCASE_PATH.
 * <p>
 * Each entry of the source directory is named after a rule; theme, criterion and test
 * number are taken from fixed positions of that name and combined into the key used to
 * look the rule up in RGAA3. For every testcase file whose name contains "html", the
 * detail element becomes an emptied {@code div} (with {@code lang="fr"} when no lang
 * attribute is present) holding the rule's raw HTML, occurrences of the zero-padded
 * dotted key are replaced by the rule's dotted form, and the file is written back.
 * Documents without a detail element are printed to standard output.
 *
 * @throws IOException if the source directory cannot be listed or a file cannot be read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleEntries = srcDir.listFiles();
    if (ruleEntries == null) {
        // listFiles() returns null when the path is not a directory or cannot be read
        throw new IOException("Cannot list testcase directory: " + RGAA3_TESTCASE_PATH);
    }
    for (File file : ruleEntries) {
        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // lookup key without leading zeros, e.g. "1-2-3"
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        // zero-padded dotted form that may appear in the testcase markup
        String wrongKey = theme+"."+crit+"."+test;
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // entry is not a directory (or is unreadable): nothing to process
            continue;
        }
        for (File testcase : testcases) {
            if (testcase.isFile() && testcase.getName().contains("html")) {
                Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
                Element detail = doc.select(".test-detail").first();
                if (detail == null) {
                    System.out.println(doc.outerHtml());
                } else {
                    detail.tagName("div");
                    detail.text("");
                    for (Element el : detail.children()) {
                        el.remove();
                    }
                    if (!detail.hasAttr("lang")) {
                        detail.attr("lang", "fr");
                    }
                    detail.append("\n"+RGAA3.get(testKey).ruleRawHtml+"\n");
                    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
                    doc.outputSettings().outline(false);
                    doc.outputSettings().indentAmount(4);
                    String outputHtml = doc.outerHtml();
                    if (outputHtml.contains(wrongKey)) {
                        // literal replacement: the key contains '.', which replaceAll would
                        // interpret as a regex wildcard, and the replacement text must not
                        // be treated as a regex replacement template either
                        outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
                    }
                    FileUtils.writeStringToFile(testcase, outputHtml);
                }
            }
        }
    }
}
项目:Tanaguru    文件:HTMLJsoupCleanerImpl.java   
/**
 * Cleans {@code dirtyHTML}: bad namespace definitions are removed, the markup is parsed
 * with jsoup, comments and malformed attributes are removed, and the serialised result
 * (xhtml escape mode, outline mode, indent of 2) is stored in {@code result}.
 */
@Override
public void run() {
    dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);
    Document doc = Jsoup.parse(dirtyHTML);
    doc.outputSettings()
            .escapeMode(Entities.EscapeMode.xhtml)
            .outline(true)
            .indentAmount(2);
    removeComments(doc);
    removeMalformedAttributes(doc);
    result = doc.outerHtml();
}
项目:lyrics    文件:KaraokeTexty.java   
/**
 * Searches karaoketexty.cz for the song and stores the URL of the first result whose
 * text equals (ignoring case) or contains "artist - song"; when no result qualifies,
 * the search URL built from that query is stored instead.
 */
@Override
public void makeURL() {
    final String site="http://www.karaoketexty.cz";
    String query=super.getArtist()+" - "+super.getSong();
    String searchURL="";

    try {
        // the search request itself is built from the song title only
        searchURL=new URI("http","www.karaoketexty.cz","/search","q="+super.getSong(),null)
                .toASCIIString()
                .replace("&","%26");
    } catch(URISyntaxException use) { System.err.println(use); }

    try {
        Document doc=Jsoup.connect(searchURL).get();
        Elements links=doc.select("#search > ul.title > li > a");
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        for(Element link:links) {
            String resultText=Library.replacing(link.text());
            // an exact (case-insensitive) hit and a containing hit are handled identically
            if(resultText.equalsIgnoreCase(query) || resultText.contains(query)) {
                super.setURL(site+link.attr("href"));
                return;
            }
        }
        super.setURL(site+"/search?q="+query);
    } catch(IOException ioe) { System.err.println(ioe); }
}
项目:lyrics    文件:MetroLyrics.java   
/**
 * Downloads the page at {@code getURL()} and returns the HTML of the
 * {@code #lyrics-body-text} selection with verse paragraphs turned into line breaks.
 *
 * @return the processed markup, or a fixed error message when the result is empty
 */
@Override
public String parsing() {
    String lyrics="";
    try {
        Document page=Jsoup.connect(super.getURL()).get();
        Elements body=page.select("#lyrics-body-text");
        page.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

        // verse paragraphs: opening tag removed, closing tag becomes two line breaks
        lyrics=body.html()
                .replace("<p class=\"verse\">","")
                .replace("</p>","<br/><br/>");
        lyrics=Library.replacing(lyrics);
    } catch(IOException ioe) { System.err.println(ioe); }

    if(lyrics.isEmpty()) { lyrics+="Error: There are no lyrics for this artist and song in the database."; }
    return lyrics;
}
项目:lyrics    文件:Lastfm.java   
/**
 * Obtains information.
 *
 * @param method Method which is needed to call.
 * @param info Information which is needed to obtain; used as the selector on the response document.
 * @return Inner HTML of the first element matching {@code info}, processed by
 *         {@code Library.replacing}; an empty string when nothing matches or the
 *         request fails with an {@code IOException}.
 */
public String obtainInformation(String method,String info) {
    String url=createAPIrequestURL(method);
    String output="";
    try {
        Document doc=Jsoup.connect(url).get();
        Elements lyr=doc.select(info);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        // Elements.first() is null for an empty selection; guard against it so a
        // missing element yields an empty result instead of a NullPointerException.
        if(!lyr.isEmpty()) {
            output=lyr.first().html();
            output=Library.replacing(output);
        }
    } catch(IOException ioe) { System.err.println(ioe); }

    return output;
}
项目:calendula    文件:LeafletHtmlPostProcessor.java   
/**
 * Cleans leaflet HTML for display: {@code nav} and {@code div#pdfurl} elements are
 * removed, the markup is filtered through a relaxed whitelist (relative links kept),
 * and childless elements from a fixed tag set that carry no text besides whitespace
 * or non-breaking spaces are dropped.
 *
 * @param html the HTML fragment to process
 * @return the cleaned document's HTML
 */
@Override
public String process(String html) {
    Document doc = Jsoup.parseBodyFragment(html);
    doc.select("nav").remove();
    doc.select("div#pdfurl").remove();

    // whitelist applied when cleaning
    Whitelist wl = Whitelist.relaxed()
            .addTags("div", "span", "p", "h1", "h2", "h3", "ul", "ol", "li", "a", "img")
            .preserveRelativeLinks(true)
            .addAttributes("img", "src")
            .addAttributes("a", "href");

    Document cleaned = new Cleaner(wl).clean(doc);
    cleaned.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    // tags that may be removed when they turn out to be empty
    Set<String> removable = new HashSet<>(Arrays.asList("div", "span", "strong", "p", "h1", "h2", "h3", "ul", "ol", "li", "a"));
    // drop paragraphs whose own text matches the pattern below
    cleaned.select("p:matchesOwn((?is) )").remove();

    for (Element el : cleaned.getAllElements()) {
        if (!el.children().isEmpty()) {
            continue;
        }
        // blank = no text at all, or nothing left once non-breaking spaces are stripped and the rest trimmed
        boolean blank = !el.hasText() || el.text().replaceAll("\u00a0", "").trim().isEmpty();
        if (blank && removable.contains(el.tagName())) {
            el.remove();
        }
    }
    return cleaned.html();
}
项目:common    文件:Tokeniser.java   
/**
 * Tries to read a numeric ({@code #...}) or named character reference from the reader.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit or one
 *        of {@code = - _} is not decoded
 * @return the decoded character(s), or {@code null} when no reference was consumed; results that
 *         fit in one {@code char} are written into and returned via {@code charRefHolder}
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // nothing here that could start a character reference
    if (reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final char[] holder = charRefHolder;
    reader.mark(); // lets a failed match be undone with rewindToMark()

    if (reader.matchConsume("#")) { // numbered
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (digits.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException unparsable) {
            codePoint = -1; // reported as out of range just below
        }

        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            holder[0] = replacementChar;
            return holder;
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            return Character.toChars(codePoint);
        }
        holder[0] = (char) codePoint;
        return holder;
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean known = Entities.isBaseNamedEntity(name)
            || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }
    holder[0] = Entities.getCharacterByName(name);
    return holder;
}
项目:gestock    文件:Tokeniser.java   
/**
 * Reads a character reference (numeric {@code #...} form or named form) from the reader, if one
 * is present.
 *
 * @param additionalAllowedCharacter when non-null and equal to the reader's current character,
 *        the method returns {@code null} without consuming anything
 * @param inAttribute when true, a named reference that is immediately followed by a letter, a
 *        digit, {@code =}, {@code -} or {@code _} is left undecoded
 * @return {@code null} if no reference was consumed; otherwise the decoded character(s), using
 *         {@code charRefHolder} as the return array for single-{@code char} results
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    final boolean cannotStartReference = reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAnySorted(notCharRefCharsSorted);
    if (cannotStartReference) {
        return null;
    }

    final char[] result = charRefHolder;
    reader.mark(); // restore point for the rewindToMark() calls below

    if (reader.matchConsume("#")) { // numbered
        final boolean hexMode = reader.matchConsumeIgnoreCase("X");
        final int radix = hexMode ? 16 : 10;
        final String numerals = hexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (numerals.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int value = -1; // stays -1 when the numerals cannot be parsed as an int
        try {
            value = Integer.parseInt(numerals, radix);
        } catch (NumberFormatException ignored) {
            // skip: handled as an out-of-range value
        }

        if (value == -1 || (value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            result[0] = replacementChar;
            return result;
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        if (value < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            result[0] = (char) value;
            return result;
        }
        return Character.toChars(value);
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String entityName = reader.consumeLetterThenDigitSequence();
    final boolean terminated = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean recognised = Entities.isBaseNamedEntity(entityName)
            || (Entities.isNamedEntity(entityName) && terminated);

    if (!recognised) {
        reader.rewindToMark();
        if (terminated) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", entityName));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }
    result[0] = Entities.getCharacterByName(entityName);
    return result;
}
项目:mygrades-app    文件:Scraper.java   
/**
 * Makes request to the given url with given request data and method.
 * Follows redirects (including HTML redirects): the {@code location} header, a
 * {@code <meta http-equiv="Refresh">} tag and the {@code refresh} pseudo header are each
 * followed with a fresh GET request.
 * The parsed response is stored in {@code document} with script elements removed.
 *
 * @param requestData Map of key value pairs for request
 * @param method Connection.Method - HTTP method
 * @param url url as string
 * @throws IOException if there is an error connecting to the url
 * @throws URISyntaxException if the {@code location} header cannot be converted to a URI
 */
private void makeJsoupRequest(Map<String, String> requestData, Connection.Method method, String url) throws IOException, URISyntaxException {
    Connection.Response response = Jsoup.connect(url)
            .data(requestData)
            .cookies(cookies)
            .referrer(previousUrl) // some websites block without referrer
            .userAgent(Config.BROWSER_USER_AGENT) // set explicit user agent
            .method(method)
            .timeout(Config.SCRAPER_TIMEOUT)
            .followRedirects(false)
            .execute();

    // get cookies from response and add to all cookies
    addNewCookies(response.cookies());

    // get content from response
    document = response.parse();
    document.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    document.select("script").remove();
    document.select("td:contains(aktuellen ECTS-Grades)").remove(); // remove invalid html (see error #71)

    // check for location redirect
    String location = response.header("location");
    if (location != null) {
        baseUri = new URL(location).toURI();
        makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, location);
    }

    // check for meta refresh tag (the attribute selector needs its closing bracket)
    Element meta = document.select("meta[http-equiv=Refresh]").first();
    if (meta != null) {
        String content = meta.attr("content");
        // attr() yields an empty string for a missing attribute; an empty refresh target
        // would resolve to the base URI and request the same page again
        if (content != null && !content.isEmpty()) {
            meta.attr("refresh-url", content.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2"));
            makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, meta.absUrl("refresh-url"));
        }
    }

    // check for refresh pseudo header
    String refreshHeader = response.header("refresh");
    if (refreshHeader != null) {
        String relativeUrl = refreshHeader.replaceAll("(?i)^(\\d+;.*URL=)(.+)$", "$2");
        String redirectUrl = StringUtil.resolve(document.baseUri(), relativeUrl);
        makeJsoupRequest(new HashMap<String, String>(), Connection.Method.GET, redirectUrl);
    }
}
项目:CN1ML-NetbeansModule    文件:Tokeniser.java   
/**
 * Tries to read a numeric ({@code #...}) or named character reference from the reader.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit or one
 *        of {@code = - _} is not decoded
 * @return a newly created array holding the decoded character(s), or {@code null} when no
 *         reference was consumed
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // nothing here that could start a character reference
    if (reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) {
        return null;
    }

    reader.mark(); // lets a failed match be undone with rewindToMark()

    if (reader.matchConsume("#")) { // numbered
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (digits.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException unparsable) {
            codePoint = -1; // reported as out of range just below
        }

        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            return new char[]{replacementChar};
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        return Character.toChars(codePoint);
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean known = Entities.isBaseNamedEntity(name)
            || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }
    return new char[]{Entities.getCharacterByName(name)};
}
项目:astor    文件:Tokeniser.java   
/**
 * Consumes a numeric ({@code #...}) or named character reference from the reader when one is
 * available.
 *
 * @param additionalAllowedCharacter a character that, when non-null and equal to the reader's
 *        current character, makes this method return {@code null} immediately
 * @param inAttribute when true, a named reference followed directly by a letter, a digit,
 *        {@code =}, {@code -} or {@code _} is rejected and the reader is rewound
 * @return the decoded character(s), or {@code null} when nothing was consumed; a result of one
 *         {@code char} is stored in {@code charRefHolder}, which is then the returned array
 */
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final char[] out = charRefHolder;
    reader.mark(); // position to return to if the reference turns out to be invalid

    if (reader.matchConsume("#")) { // numbered
        final boolean isHex = reader.matchConsumeIgnoreCase("X");
        final String number = isHex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (number.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int parsed;
        try {
            parsed = Integer.parseInt(number, isHex ? 16 : 10);
        } catch (NumberFormatException notAnInt) {
            parsed = -1; // sentinel: treated as outside the valid range
        }

        final boolean outOfRange = parsed == -1
                || (parsed >= 0xD800 && parsed <= 0xDFFF)
                || parsed > 0x10FFFF;
        if (outOfRange) {
            characterReferenceError("character outside of valid range");
            out[0] = replacementChar;
            return out;
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        if (parsed >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            return Character.toChars(parsed);
        }
        out[0] = (char) parsed;
        return out;
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String reference = reader.consumeLetterThenDigitSequence();
    final boolean hasSemicolon = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean matched = Entities.isBaseNamedEntity(reference)
            || (Entities.isNamedEntity(reference) && hasSemicolon);

    if (!matched) {
        reader.rewindToMark();
        if (hasSemicolon) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", reference));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }
    out[0] = Entities.getCharacterByName(reference);
    return out;
}
项目:astor    文件:Tokeniser.java   
/**
 * Tries to read a numeric ({@code #...}) or named character reference from the reader and
 * returns it as code points.
 *
 * @param additionalAllowedCharacter if non-null and equal to the reader's current character,
 *        nothing is consumed and {@code null} is returned
 * @param inAttribute when true, a named reference directly followed by a letter, a digit or one
 *        of {@code = - _} is not decoded
 * @return {@code null} when no reference was consumed; otherwise {@code codepointHolder} holding
 *         a single code point, or {@code multipointHolder} when the name maps to two code points
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    // nothing here that could start a character reference
    if (reader.isEmpty()
            || (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
            || reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] single = codepointHolder;
    reader.mark(); // lets a failed match be undone with rewindToMark()

    if (reader.matchConsume("#")) { // numbered
        final boolean hex = reader.matchConsumeIgnoreCase("X");
        final String digits = hex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (digits.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            codePoint = -1; // reported as out of range just below
        }

        final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (codePoint == -1 || surrogate || codePoint > 0x10FFFF) {
            characterReferenceError("character outside of valid range");
            single[0] = replacementChar;
        } else {
            // todo: implement number replacement table
            // todo: check for extra illegal unicode points as parse errors
            single[0] = codePoint;
        }
        return single;
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String name = reader.consumeLetterThenDigitSequence();
    final boolean semicolonFollows = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean known = Entities.isBaseNamedEntity(name)
            || (Entities.isNamedEntity(name) && semicolonFollows);

    if (!known) {
        reader.rewindToMark();
        if (semicolonFollows) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", name));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }

    final int count = Entities.codepointsForName(name, multipointHolder);
    if (count == 1) {
        single[0] = multipointHolder[0];
        return single;
    }
    if (count != 2) {
        Validate.fail("Unexpected characters returned for " + name);
    }
    return multipointHolder;
}
项目:astor    文件:Tokeniser.java   
/**
 * Consumes a numeric ({@code #...}) or named character reference from the reader, when one is
 * available, and reports it as code points.
 *
 * @param additionalAllowedCharacter a character that, when non-null and equal to the reader's
 *        current character, makes this method return {@code null} immediately
 * @param inAttribute when true, a named reference followed directly by a letter, a digit,
 *        {@code =}, {@code -} or {@code _} is rejected and the reader is rewound
 * @return {@code null} when nothing was consumed; otherwise {@code codepointHolder} with one
 *         code point, or {@code multipointHolder} when the name resolves to two code points
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    final int[] out = codepointHolder;
    reader.mark(); // position to return to if the reference turns out to be invalid

    if (reader.matchConsume("#")) { // numbered
        final boolean isHex = reader.matchConsumeIgnoreCase("X");
        final String number = isHex ? reader.consumeHexSequence() : reader.consumeDigitSequence();
        if (number.length() == 0) { // didn't match anything
            characterReferenceError("numeric reference with no numerals");
            reader.rewindToMark();
            return null;
        }
        if (!reader.matchConsume(";")) {
            characterReferenceError("missing semicolon"); // missing semi
        }

        int parsed;
        try {
            parsed = Integer.parseInt(number, isHex ? 16 : 10);
        } catch (NumberFormatException ignored) {
            parsed = -1; // sentinel: treated as outside the valid range
        }

        final boolean outOfRange = parsed == -1
                || (parsed >= 0xD800 && parsed <= 0xDFFF)
                || parsed > 0x10FFFF;
        if (outOfRange) {
            characterReferenceError("character outside of valid range");
            out[0] = replacementChar;
            return out;
        }
        // todo: implement number replacement table
        // todo: check for extra illegal unicode points as parse errors
        out[0] = parsed;
        return out;
    }

    // named
    // get as many letters as possible, and look for matching entities.
    final String reference = reader.consumeLetterThenDigitSequence();
    final boolean hasSemicolon = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean matched = Entities.isBaseNamedEntity(reference)
            || (Entities.isNamedEntity(reference) && hasSemicolon);

    if (!matched) {
        reader.rewindToMark();
        if (hasSemicolon) { // named with semicolon
            characterReferenceError(String.format("invalid named referenece '%s'", reference));
        }
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon"); // missing semi
    }

    final int resolved = Entities.codepointsForName(reference, multipointHolder);
    switch (resolved) {
        case 1:
            out[0] = multipointHolder[0];
            return out;
        case 2:
            return multipointHolder;
        default:
            Validate.fail("Unexpected characters returned for " + reference);
            return multipointHolder;
    }
}