Java 类org.jsoup.safety.Cleaner 实例源码

项目:yadaframework    文件:YadaWebUtil.java   
/**
 * Sanitizes HTML content, keeping only a fixed set of tags plus any extra tags supplied by the caller.
 * <p>
 * Tags kept by default: b, em, i, strong, u, br, cite, p, img, li, ul, ol, sup, sub, s.
 * The {@code style} attribute is kept on {@code p}; {@code src}, {@code style} and {@code class}
 * are kept on {@code img}. When {@code "a"} is one of the extra tags, its {@code href} and
 * {@code target} attributes are kept as well.
 *
 * @param content html content to sanitize
 * @param extraTags any other tags that you may want to keep, e.g. "a"
 * @return the inner HTML of the cleaned document body
 */
public String cleanContent(String content, String ... extraTags) {
    // Start from the simple-text whitelist (b, em, i, strong, u) and widen it from there.
    Whitelist permitted = Whitelist.simpleText();
    permitted.addTags("br", "cite", "em", "i", "p", "strong", "img", "li", "ul", "ol", "sup", "sub", "s");
    permitted.addTags(extraTags);
    // "style" on paragraphs is needed for left and right alignment.
    permitted.addAttributes("p", "style");
    permitted.addAttributes("img", "src", "style", "class");
    boolean linksRequested = Arrays.asList(extraTags).contains("a");
    if (linksRequested) {
        permitted.addAttributes("a", "href", "target");
    }

    Document parsed = Jsoup.parseBodyFragment(content, "");
    Document sanitized = new Cleaner(permitted).clean(parsed);
    // With the xhtml escape mode utf-8 characters are not escaped.
    sanitized.outputSettings().escapeMode(EscapeMode.xhtml);
    return sanitized.body().html();
}
项目:bennu-renderers    文件:JsoupSafeHtmlConverter.java   
/**
 * Sanitizes an HTML string against this converter's whitelist.
 *
 * @param type  requested target type; not consulted by this implementation
 * @param value the HTML text, which is cast to {@code String}
 * @return the cleaned body HTML, or {@code null} when the input is null or empty
 */
@Override
public Object convert(Class type, Object value) {
    final String rawHtml = (String) value;
    if (Strings.isNullOrEmpty(rawHtml)) {
        return null;
    }

    final Document parsed = Jsoup.parseBodyFragment(rawHtml);
    final Document sanitized = new Cleaner(whitelist).clean(parsed);
    cleanInvalidIframes(sanitized);
    // Serialize the result using the ASCII charset.
    sanitized.outputSettings().charset("ASCII");
    return sanitized.body().html();
}
项目:q-mail    文件:HtmlSanitizer.java   
/**
 * Sets up the body cleaner and the head cleaner.
 * <p>
 * The body cleaner starts from the relaxed whitelist and additionally allows the {@code font} tag,
 * layout attributes on {@code table}, {@code class}/{@code style}/{@code id} on every tag, and the
 * {@code http}, {@code https}, {@code cid} and {@code data} protocols for {@code img src}.
 */
HtmlSanitizer() {
    Whitelist allowed = Whitelist.relaxed();
    allowed.addTags("font");
    allowed.addAttributes("table", "align", "bgcolor", "border", "cellpadding", "cellspacing", "width");
    allowed.addAttributes(":all", "class", "style", "id");
    allowed.addProtocols("img", "src", "http", "https", "cid", "data");

    cleaner = new Cleaner(allowed);
    headCleaner = new HeadCleaner();
}
项目:gitplex-mit    文件:DefaultMarkdownManager.java   
/**
 * Sanitizes rendered HTML against the configured whitelist and then runs every registered
 * transformer over the cleaned document.
 *
 * @param html the HTML to post-process
 * @return the inner HTML of the cleaned and transformed body
 */
@Override
public String postProcess(String html) {
    // A fake base URI is supplied on purpose: without one, all relative urls would be stripped out.
    Document parsed = Jsoup.parseBodyFragment(html, "http://localhost/sanitize");
    Document sanitized = new Cleaner(whiteList).clean(parsed);

    for (HtmlTransformer each : htmlTransformers) {
        each.transform(sanitized);
    }
    return sanitized.body().html();
}
项目:interview-preparation    文件:App.java   
/**
 * Demo entry point: loads index.html, prints it, cleans it against an extended relaxed
 * whitelist and prints the cleaned document.
 *
 * @param args command line arguments; not used
 */
public static void main(String[] args) {
    // Read the HTML from file and bail out when the loader handed back null.
    final Document dirtyDoc = loadHtmlFromFile("index.html", "utf-8");
    if (dirtyDoc == null) {
        LogUtils.d(CLS_NAME, "main", "document is null");
        return;
    }

    // Show the markup as loaded.
    System.out.println("===BEFORE===");
    System.out.println(dirtyDoc.html());

    // Relaxed whitelist, widened with meta/title/script/iframe and a few of their attributes.
    final Whitelist allowList = Whitelist.relaxed();
    allowList.addTags("meta", "title", "script", "iframe");
    allowList.addAttributes("meta", "charset");
    allowList.addAttributes("iframe", "src");
    allowList.addProtocols("iframe", "src", "http", "https");

    // Run the cleaner over the loaded document.
    final Document cleanedDoc = new Cleaner(allowList).clean(dirtyDoc);

    // Show the markup after cleaning.
    System.out.println("===AFTER===");
    System.out.println(cleanedDoc.html());
}
项目:site    文件:RegistrationService.java   
/**
 * Converts some html text to plain text: all tags but <code>br</code> are stripped,
 * every <code>br</code> is replaced by a CRLF line break, and named entities like
 * <code>&amp;quot;</code> are unescaped.
 * <p>
 * The line breaks are substituted <em>before</em> the entities are unescaped. Doing it the
 * other way round would turn text that merely spells out a tag (for example
 * <code>&amp;lt;br&amp;gt;</code>) into a real-looking <code>br</code> and wrongly replace it
 * with a line break.
 *
 * @param htmlText the html text to convert
 * @return the plain text version of {@code htmlText}
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");
    final Cleaner cleaner = new Cleaner(whitelist);
    final Document cleanedDocument = cleaner.clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);
    // At this point the only literal "<br>" sequences in the markup are genuine br elements.
    final String withLineBreaks = cleanedDocument.body().html().trim().replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withLineBreaks, true);
}
项目:herd    文件:HerdStringUtils.java   
/**
 * Strips HTML tags from a given input String, allows some tags to be retained via a whitelist.
 * <p>
 * Each whitelist entry is reduced to its alphabetic characters to obtain the tag name (so an entry such as
 * {@code "<hlt>"} yields {@code "hlt"}). For every retained tag the {@code class} attribute is retained as well.
 *
 * @param fragment the specified String
 * @param whitelistTags the specified whitelist tags
 *
 * @return cleaned String with allowed tags
 */
public static String stripHtml(String fragment, String... whitelistTags)
{

    // Parse out html tags except those from a given list of whitelist tags
    Document dirty = Jsoup.parseBodyFragment(fragment);

    Whitelist whitelist = new Whitelist();

    for (String whitelistTag : whitelistTags)
    {
        // Get the actual tag name from the whitelist tag by dropping every non-alphabetic character.
        // This is vulnerable in general to complex tags but will suffice for our simple needs.
        // The property must be written \p{IsAlphabetic}: without the 'p' the class is just a literal set of
        // characters and letters such as 'd' or 'v' would be stripped from the tag name.
        // NOTE(review): digits are stripped too, so a tag like "h1" cannot be whitelisted - confirm this is acceptable.
        String tagName = StringUtils.removePattern(whitelistTag, "[^\\p{IsAlphabetic}]");

        // Add the tag to the whitelist and keep its "class" attribute
        whitelist.addTags(tagName).addAttributes(tagName, "class");
    }

    Cleaner cleaner = new Cleaner(whitelist);
    Document clean = cleaner.clean(dirty);
    // Set character encoding to UTF-8 and make sure no line-breaks are added
    clean.outputSettings().escapeMode(Entities.EscapeMode.base).charset(StandardCharsets.UTF_8).prettyPrint(false);

    // return 'cleaned' html body
    return clean.body().html();
}
项目:site    文件:RegistrationService.java   
/**
 * Converts some html text to plain text: all tags but <code>br</code> are stripped,
 * every <code>br</code> is replaced by a CRLF line break, and named entities like
 * <code>&amp;quot;</code> are unescaped.
 * <p>
 * The line breaks are substituted <em>before</em> the entities are unescaped. Doing it the
 * other way round would turn text that merely spells out a tag (for example
 * <code>&amp;lt;br&amp;gt;</code>) into a real-looking <code>br</code> and wrongly replace it
 * with a line break.
 *
 * @param htmlText the html text to convert
 * @return the plain text version of {@code htmlText}
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");
    final Cleaner cleaner = new Cleaner(whitelist);
    final Document cleanedDocument = cleaner.clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);
    // At this point the only literal "<br>" sequences in the markup are genuine br elements.
    final String withLineBreaks = cleanedDocument.body().html().trim().replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withLineBreaks, true);
}
项目:bamboo    文件:Markdown.java   
/**
 * Renders Markdown to HTML, cleans the HTML against {@code HTML_WHITELIST} and rewrites
 * fragment links in the cleaned document.
 *
 * @param markdown the Markdown source
 * @param baseUri  base URI used when parsing the generated HTML and when rewriting fragment links
 * @return the inner HTML of the cleaned body, or {@code null} when {@code markdown} is null or empty
 */
public static String render(String markdown, String baseUri) {
    boolean nothingToRender = markdown == null || markdown.isEmpty();
    if (nothingToRender) {
        return null;
    }

    String rawHtml = new PegDownProcessor(PEGDOWN_OPTIONS).markdownToHtml(markdown);
    Document parsed = Jsoup.parseBodyFragment(rawHtml, baseUri.toString());
    Document sanitized = new Cleaner(HTML_WHITELIST).clean(parsed);
    rewriteFragmentLinks(sanitized, baseUri);
    return sanitized.body().html();
}
项目:uraptor    文件:HtmlCleaner.java   
/**
 * Cleans an HTML string against the configured whitelist and returns the result as a string.
 * 
 * @param html Input HTML string.
 * @return The cleaned document serialized as HTML, using the xhtml escape mode.
 */
public String clean(String html)
{
    // Parse the string, then run the parsed document through the cleaner
    final Document parsed = Jsoup.parse(html);
    final Document sanitized = new Cleaner(wl).clean(parsed);

    // Switch the output to the xhtml escape mode before serializing
    sanitized.outputSettings().escapeMode(EscapeMode.xhtml);
    return sanitized.html();
}
项目:uraptor    文件:HtmlCleaner.java   
/**
 * Cleans an HTML string against the configured whitelist and returns the cleaned document.
 * 
 * @param html Input HTML string.
 * @param baseUri Base URI of the document.
 * @return The cleaned document, with its output escape mode set to xhtml.
 */
public Document clean(String html, String baseUri)
{
    // Parse the string with the given base URI, then run the parsed document through the cleaner
    final Document parsed = Jsoup.parse(html, baseUri);
    final Document sanitized = new Cleaner(wl).clean(parsed);

    // Switch the output to the xhtml escape mode; the document itself is handed back to the caller
    sanitized.outputSettings().escapeMode(EscapeMode.xhtml);
    return sanitized;
}
项目:MimeUI    文件:HTMLContentViewer.java   
/**
 * Reads the part as HTML, cleans it against {@code HTMLWhiteList}, applies the HTML transformer
 * and prints the inner HTML of the resulting body to the writer.
 *
 * @param part    the part whose content is rendered
 * @param htmlOut the writer that receives the cleaned HTML
 * @throws MimeUIException when reading the part fails with an {@code IOException}
 */
@Override
public void render (final IncorporatedPart part, final PrintWriter htmlOut)
        throws MimeUIException
{
    InputStream partStream = null;

    try
    {
        partStream = part.getInputStream();

        // todo Move all of this logic to a separate class with a String sanitize(String) method.
        // Normalizing and sanitizing the HTML prevents cross site scripting attacks and other issues.
        final Cleaner sanitizer = new Cleaner(new HTMLWhiteList());
        final HTMLTransformer transformer
                = new HTMLTransformer(part, this.untrustedContentUriResolver, this.contentLocationProvider);
        final String rawHtml = IOUtils.toString(partStream, part.getCharacterEncoding());
        final String baseUri = part.getContextLocation().toString();
        final Document sanitized = sanitizer.clean(Jsoup.parse(rawHtml, baseUri));

        transformer.transform(sanitized);

        // Only the inner HTML is printed: this drops the body element, which often contains a style/class attribute.
        htmlOut.println(sanitized.body().html());
    }
    catch (final IOException e)
    {
        throw new MimeUIException("Unable to read a textual part.", e);
    }
    finally
    {
        // The stream is closed in every case; failures while closing are ignored.
        IOUtils.closeQuietly(partStream);
    }
}
项目:calendula    文件:LeafletHtmlPostProcessor.java   
/**
 * Post-processes leaflet HTML for display: drops {@code nav} and {@code div#pdfurl} elements,
 * cleans the rest against an extended relaxed whitelist and removes empty elements.
 *
 * @param html the HTML to process
 * @return the HTML of the cleaned document
 */
@Override
public String process(String html) {
    // Parse the input and discard the elements that must never be shown
    Document parsed = Jsoup.parseBodyFragment(html);
    parsed.select("nav").remove();
    parsed.select("div#pdfurl").remove();

    // Build the whitelist used for cleaning; relative links are preserved
    Whitelist allowed = Whitelist.relaxed();
    allowed.addTags("div", "span", "p", "h1", "h2", "h3", "ul", "ol", "li", "a", "img");
    allowed.preserveRelativeLinks(true);
    allowed.addAttributes("img", "src");
    allowed.addAttributes("a", "href");

    // Clean the document and switch its output to the xhtml escape mode
    Document cleaned = new Cleaner(allowed).clean(parsed);
    cleaned.outputSettings().escapeMode(Entities.EscapeMode.xhtml);

    // Tags that are removed when they turn out to be empty
    Set<String> removable = new HashSet<>(Arrays.asList("div", "span", "strong", "p", "h1", "h2", "h3", "ul", "ol", "li", "a"));
    cleaned.select("p:matchesOwn((?is) )").remove();
    for (Element candidate : cleaned.getAllElements()) {
        if (!candidate.children().isEmpty()) {
            continue;
        }
        // An element counts as empty when it has no text, or only non-breaking spaces and whitespace
        boolean blank = !candidate.hasText() || candidate.text().replaceAll("\u00a0", "").trim().isEmpty();
        if (blank && removable.contains(candidate.tagName())) {
            candidate.remove();
        }
    }
    // Hand back the html for display
    return cleaned.html();
}
项目:GoogleIndexRetriever    文件:GoogleSearch.java   
/**
 * Make the query to google and return the data.
 * <p>
 * When the response status is 503 the referer and captcha id fields are updated, the captcha image
 * is requested and a {@link CaptchaException} is thrown. Otherwise the response body is parsed and
 * cleaned against the basic whitelist (with {@code class} additionally allowed on {@code span}).
 *
 * @param query
 *            textfield for google
 * @return the cleaned webpage in Document format, or {@code null} if an {@code IOException}
 *         occurred while performing the request
 * @throws CaptchaException if the response status code is 503
 * @throws EmptyQueryException if the query field of this object is null or empty
 * @throws UnsupportedEncodingException declared by the URL encoding of the request parameters
 */
private Document getData(String query) throws CaptchaException, EmptyQueryException, UnsupportedEncodingException {
    // The null test has to come first: calling isEmpty() on a null reference would throw
    // a NullPointerException before the null test was ever evaluated.
    // NOTE(review): this validates the field this.query, not the parameter - confirm that is intended.
    if (this.query == null || this.query.isEmpty()) {
        throw new EmptyQueryException();
    }

    Document doc = null;

    String request = "https://www.google.com/search?q=" + URLEncoder.encode( stripXSS(query), "UTF-8");
    if(!tokenCookie.isEmpty()){
        request = request + "&google_abuse=" + URLEncoder.encode(tokenCookie, "UTF-8");
    }

    try {
        Connection conn = Jsoup
                .connect(request)
                .method(Method.GET)
                .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/48.0")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Cookie", tokenCookie)
                .header("Connection", "keep-alive")
                .ignoreHttpErrors(true)
                .timeout(5000);

        if(!referer.isEmpty()){
            conn.header("Referer", referer);
        }

        Connection.Response response = conn.execute();

        if (response.statusCode() == 503) {
            // Remember where the captcha was served from and fetch the captcha image
            referer = response.url().toString();
            idCaptcha = getIDCaptcha(response.parse());

            getCaptcha("https://ipv4.google.com/sorry/image?id=" + idCaptcha + "&hl=es&" + referer.substring(referer.indexOf('?')+1));

            throw new CaptchaException();

        }

        doc = Jsoup.parse(response.body());

        // Clean the response; basic() is a static factory, so it is called on the class
        // rather than on a throwaway instance.
        Whitelist wl = Whitelist.basic();
        wl.addAttributes("span", "class");
        Cleaner clean = new Cleaner(wl);
        doc = clean.clean(doc);
    } catch (IOException e) {
        // Best effort: the failure is reported and whatever doc holds at this point is returned.
        //System.out.println(e.getMessage());
        e.printStackTrace();
    }

    return doc;
}
项目:common    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:gestock    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:CN1ML-NetbeansModule    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:astor    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:astor    文件:Jsoup.java   
/**
 Checks whether the input body HTML contains only tags and attributes allowed by the Whitelist. Useful for form
 validation.
 <p>Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 <p>The HTML is assumed to be a body fragment (i.e. it will be used in an existing HTML document body.)
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // The cleaner built for this whitelist performs the body-fragment validation itself.
    final Cleaner validator = new Cleaner(whitelist);
    return validator.isValidBodyHtml(bodyHtml);
}
项目:astor    文件:Jsoup.java   
/**
 Checks whether the input body HTML contains only tags and attributes allowed by the Whitelist. Useful for form
 validation.
 <p>Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 <p>The HTML is assumed to be a body fragment (i.e. it will be used in an existing HTML document body.)
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // The cleaner built for this whitelist performs the body-fragment validation itself.
    final Cleaner validator = new Cleaner(whitelist);
    return validator.isValidBodyHtml(bodyHtml);
}
项目:BoL-API-Parser    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:JabRefAutocomplete    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:AngelList-Mobile    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:jsoup-learning    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}
项目:idylfin    文件:Jsoup.java   
/**
 Checks whether the input HTML contains only tags and attributes allowed by the Whitelist. Useful for form validation.
 Passing this check does not replace cleaning: the input HTML should still be run through the cleaner to set up
 enforced attributes, and to tidy the output.
 @param bodyHtml HTML to test
 @param whitelist whitelist to test against
 @return true if no tags or attributes were removed; false otherwise
 @see #clean(String, org.jsoup.safety.Whitelist) 
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parse as a body fragment with an empty base URI, then let the cleaner judge the parsed document.
    final Document parsedFragment = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsedFragment);
}