From dbab9b899544bbd5d9ece1bb3d0cc706c61708e3 Mon Sep 17 00:00:00 2001 From: Florian Schmaus Date: Tue, 18 Mar 2014 18:40:08 +0100 Subject: [PATCH] Unify StringUtils.escapeForXML() Rework StringUtils.escapeForXML() so that it can be used also for StringUtils.xmlAttribEncodeBinary(). escapeForXML() now uses a switch/case statement, which should leave the (JIT) compiler more room for optimizations. Removing the "do not escape unicode character references", because this behavior, introduced with 8264ebdfb5d083ec6c907d1ce55012b011f99476, is incorrect. --- .../jivesoftware/smack/util/StringUtils.java | 133 +++++++----------- 1 file changed, 49 insertions(+), 84 deletions(-) diff --git a/core/src/main/java/org/jivesoftware/smack/util/StringUtils.java b/core/src/main/java/org/jivesoftware/smack/util/StringUtils.java index 6dfe0884d..cea2a674f 100644 --- a/core/src/main/java/org/jivesoftware/smack/util/StringUtils.java +++ b/core/src/main/java/org/jivesoftware/smack/util/StringUtils.java @@ -30,11 +30,11 @@ import java.util.logging.Logger; public class StringUtils { private static final Logger LOGGER = Logger.getLogger(StringUtils.class.getName()); - private static final char[] QUOTE_ENCODE = "&quot;".toCharArray(); - private static final char[] APOS_ENCODE = "&apos;".toCharArray(); - private static final char[] AMP_ENCODE = "&amp;".toCharArray(); - private static final char[] LT_ENCODE = "&lt;".toCharArray(); - private static final char[] GT_ENCODE = "&gt;".toCharArray(); + public static final String QUOTE_ENCODE = "&quot;"; + public static final String APOS_ENCODE = "&apos;"; + public static final String AMP_ENCODE = "&amp;"; + public static final String LT_ENCODE = "&lt;"; + public static final String GT_ENCODE = "&gt;"; /** * Returns the name portion of a XMPP address. For example, for the @@ -283,34 +283,6 @@ public class StringUtils { return buf.toString(); } - /** - * Encodes a string for use in an XML attribute by escaping characters with - * a special meaning. 
In particular, white spaces are encoded as character - * references, such that they are not replaced by ' ' on parsing. - */ - private static String xmlAttribEncodeBinary(String value) { - StringBuilder s = new StringBuilder(); - char buf[] = value.toCharArray(); - for (char c : buf) { - switch (c) { - case '<': s.append("&lt;"); break; - case '>': s.append("&gt;"); break; - case '&': s.append("&amp;"); break; - case '"': s.append("&quot;"); break; - case '\'': s.append("&apos;"); break; - default: - if (c <= 0x1f || (0x7f <= c && c <= 0x9f)) { // includes \t, \n, \r - s.append("&#x"); - s.append(String.format("%X", (int)c)); - s.append(';'); - } else { - s.append(c); - } - } - } - return s.toString(); - } - /** * Returns a string representing a XML attribute. The value parameter is escaped as necessary. In particular, * white spaces are encoded as character references, such that they are not replaced by ' ' on parsing. @@ -318,7 +290,7 @@ public class StringUtils { * @param value value of the XML attribute */ public static String xmlAttrib(String name, String value) { - return name + "=\"" + xmlAttribEncodeBinary(value) + "\""; + return name + "=\"" + escapeForXML(value, true) + "\""; } @@ -326,69 +298,62 @@ public class StringUtils { * Escapes all necessary characters in the String so that it can be used * in an XML doc. * - * Warning: This method does not escape unicode character references - * (i.e. references of the from &#235;) - * * @param string the string to escape. * @return the string with appropriate characters escaped. 
*/ public static String escapeForXML(String string) { + return escapeForXML(string, false); + } + + public static String escapeForXML(final String string, final boolean escapeWhitespace) { if (string == null) { return null; } + final char[] input = string.toCharArray(); + final int len = input.length; + final StringBuilder out = new StringBuilder((int)(len*1.3)); + CharSequence toAppend; char ch; - int i=0; - int last=0; - char[] input = string.toCharArray(); - int len = input.length; - StringBuilder out = new StringBuilder((int)(len*1.3)); - for (; i < len; i++) { + int last = 0; + int i = 0; + while (i < len) { + toAppend = null; ch = input[i]; - if (ch > '>') { + switch(ch) { + case '<': + toAppend = LT_ENCODE; + break; + case '>': + toAppend = GT_ENCODE; + break; + case '&': + toAppend = AMP_ENCODE; + break; + case '"': + toAppend = QUOTE_ENCODE; + break; + case '\'': + toAppend = APOS_ENCODE; + break; + default: + // includes \t, \n, \r + if (escapeWhitespace && (ch <= 0x1f || (0x7f <= ch && ch <= 0x9f))) { + StringBuilder sb = new StringBuilder(); + sb.append("&#x"); + sb.append(String.format("%X", (int) ch)); + sb.append(';'); + toAppend = sb; + } + break; } - else if (ch == '<') { + if (toAppend != null) { if (i > last) { out.append(input, last, i - last); } - last = i + 1; - out.append(LT_ENCODE); - } - else if (ch == '>') { - if (i > last) { - out.append(input, last, i - last); - } - last = i + 1; - out.append(GT_ENCODE); - } - - else if (ch == '&') { - if (i > last) { - out.append(input, last, i - last); - } - // Do nothing if the string is of the form &#235; (unicode value) - if (!(len > i + 5 - && input[i + 1] == '#' - && Character.isDigit(input[i + 2]) - && Character.isDigit(input[i + 3]) - && Character.isDigit(input[i + 4]) - && input[i + 5] == ';')) { - last = i + 1; - out.append(AMP_ENCODE); - } - } - else if (ch == '"') { - if (i > last) { - out.append(input, last, i - last); - } - last = i + 1; - out.append(QUOTE_ENCODE); - } - else if (ch == '\'') { 
- if (i > last) { - out.append(input, last, i - last); - } - last = i + 1; - out.append(APOS_ENCODE); + out.append(toAppend); + last = ++i; + } else { + i++; } } if (last == 0) {