@Test public void testEscapeXmlAllCharacters() { // http://www.w3.org/TR/xml/#charsets says: // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, // excluding the surrogate blocks, FFFE, and FFFF. */ final CharSequenceTranslator escapeXml = StringEscapeUtils.ESCAPE_XML .with(NumericEntityEscaper.below(9), NumericEntityEscaper.between(0xB, 0xC), NumericEntityEscaper.between(0xE, 0x19), NumericEntityEscaper.between(0xD800, 0xDFFF), NumericEntityEscaper.between(0xFFFE, 0xFFFF), NumericEntityEscaper.above(0x110000)); assertEquals("�", escapeXml.translate("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008")); assertEquals("\t", escapeXml.translate("\t")); // 0x9 assertEquals("\n", escapeXml.translate("\n")); // 0xA assertEquals("", escapeXml.translate("\u000B\u000C")); assertEquals("\r", escapeXml.translate("\r")); // 0xD assertEquals("Hello World! Ain't this great?", escapeXml.translate("Hello World! Ain't this great?")); assertEquals("", escapeXml.translate("\u000E\u000F\u0018\u0019")); }
/**
 * Escapes a string for use in Java source.
 * <p>
 * Double quotes and backslashes are prefixed with a backslash, then the replacements supplied by
 * {@code EntityArrays.JAVA_CTRL_CHARS_ESCAPE()} are applied. When {@code unicode} is set, the
 * translator returned by {@code UnicodeEscaper.outsideOf(32, 127)} is appended to the chain.
 * </p>
 *
 * @param str the text to escape
 * @param unicode whether to additionally apply the {@code outsideOf(32, 127)} Unicode escaper
 * @return the result of running the assembled translator over {@code str}
 */
private String escapeJava(String str, boolean unicode) {
    final String[][] quoteAndBackslash = {
        { "\"", "\\\"" },
        { "\\", "\\\\" }
    };
    final CharSequenceTranslator quoteEscaper = new LookupTranslator(quoteAndBackslash);
    final CharSequenceTranslator controlEscaper =
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE());
    final CharSequenceTranslator basicChain =
            quoteEscaper.with(new CharSequenceTranslator[] { controlEscaper });

    final CharSequenceTranslator selected = unicode
            ? basicChain.with(new CharSequenceTranslator[] { UnicodeEscaper.outsideOf(32, 127) })
            : basicChain;
    return selected.translate(str);
}
/** * @param out write to receieve the escaped string * @param str String to escape values in, may be null * @param escapeSingleQuote escapes single quotes if <code>true</code> * @param escapeForwardSlash TODO * @throws IOException if an IOException occurs */ private static void escapeJavaStyleString(Writer out, String str, boolean escapeSingleQuote, boolean escapeForwardSlash) throws IOException { if (out == null) { throw new IllegalArgumentException("The Writer must not be null"); } if (str == null) { return; } int sz; sz = str.length(); for (int i = 0; i < sz; i++) { char ch = str.charAt(i); // "[^\t\n\r\u0020-\u007E\u0085\u00A0-\uD7FF\uE000-\uFFFD]" // handle unicode if (ch > 0xFFFD) { out.write("\\u" + CharSequenceTranslator.hex(ch)); } else if (ch > 0xD7FF && ch < 0xE000) { out.write("\\u" + CharSequenceTranslator.hex(ch)); } else if (ch > 0x7E && ch != 0x85 && ch < 0xA0) { out.write("\\u00" + CharSequenceTranslator.hex(ch)); } else if (ch < 32) { switch (ch) { case '\t' : out.write('\\'); out.write('t'); break; case '\n' : out.write('\\'); out.write('n'); break; case '\r' : out.write('\\'); out.write('r'); break; default : if (ch > 0xf) { out.write("\\u00" + CharSequenceTranslator.hex(ch)); } else { out.write("\\u000" + CharSequenceTranslator.hex(ch)); } break; } } else { switch (ch) { case '\'' : if (escapeSingleQuote) { out.write('\\'); } out.write('\''); break; case '"' : out.write('\\'); out.write('"'); break; case '\\' : out.write('\\'); out.write('\\'); break; case '/' : if (escapeForwardSlash) { out.write('\\'); } out.write('/'); break; default : out.write(ch); break; } } } }
/**
 * Returns the translator currently held in the {@code argumentEscaper} field.
 *
 * @return the value of {@code argumentEscaper}, returned as-is
 */
CharSequenceTranslator getArgumentEscaper() {
    return this.argumentEscaper;
}
/**
 * Replaces the translator held by this object.
 *
 * @param translator the translator to store; it is assigned without any validation
 */
public void setTranslator(final CharSequenceTranslator translator) {
    this.translator = translator;
}
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate code points from the
 * BMP. Because of this, some people think that supplementary characters need to be represented using two escapes, but this is incorrect
 * - you must use the single, code point value for that character. For example, use {@code &#x233B4;} rather than
 * {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    final CharSequenceTranslator escapeXml =
            StringEscapeUtils.ESCAPE_XML.with(NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 is code point 0x233B4 (decimal 144308) and must yield one entity.
    assertEquals("Supplementary character must be represented using a single escape", "&#144308;",
            escapeXml.translate("\uD84C\uDFB4"));
}
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate code points from the
 * BMP. Because of this, some people think that supplementary characters need to be represented using two escapes, but this is incorrect
 * - you must use the single, code point value for that character. For example, use {@code &#x233B4;} rather than
 * {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    CharSequenceTranslator escapeXml =
            StringEscapeUtils.ESCAPE_XML.with(NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 is code point 0x233B4 (decimal 144308) and must yield one entity.
    assertEquals("Supplementary character must be represented using a single escape", "&#144308;",
            escapeXml.translate("\uD84C\uDFB4"));
}
/** * Unescapes special entity char sequences like < to its UTF-8 representation. * All ISO-8859-1, HTML4 and Basic entities will be translated. * * @param text the text that will be unescaped * @return the unescaped version of the string text */ public static String unescapeEntities(String text) { CharSequenceTranslator iso = new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()); CharSequenceTranslator basic = new LookupTranslator(EntityArrays.BASIC_UNESCAPE()); //CharSequenceTranslator html4 = new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()); return StringEscapeUtils.unescapeHtml4(iso.translate(basic.translate(text))); }
/**
 * Tests Supplementary characters.
 * <p>
 * From http://www.w3.org/International/questions/qa-escapes
 * </p>
 * <blockquote>
 * Supplementary characters are those Unicode characters that have code points higher than the characters in
 * the Basic Multilingual Plane (BMP). In UTF-16 a supplementary character is encoded using two 16-bit surrogate code points from the
 * BMP. Because of this, some people think that supplementary characters need to be represented using two escapes, but this is incorrect
 * - you must use the single, code point value for that character. For example, use {@code &#x233B4;} rather than
 * {@code &#xD84C;&#xDFB4;}.
 * </blockquote>
 * @see <a href="http://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>
 * @see <a href="https://issues.apache.org/jira/browse/LANG-728">LANG-728</a>
 */
@Test
public void testEscapeXmlSupplementaryCharacters() {
    final CharSequenceTranslator escapeXml =
            StringEscapeUtils.ESCAPE_XML.with(NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE));

    // The surrogate pair D84C/DFB4 is code point 0x233B4 (decimal 144308) and must yield one entity.
    assertEquals("Supplementary character must be represented using a single escape", "&#144308;",
            escapeXml.translate("\uD84C\uDFB4"));

    // Characters below 0x7f are outside the escaper's range and stay untouched.
    assertEquals("Supplementary characters mixed with basic characters should be encoded correctly",
            "a b c &#144308;", escapeXml.translate("a b c \uD84C\uDFB4"));
}