From 13e137ee8de5f2e122fde03b5a7b3b505d3fc6e1 Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 09:11:32 -0500 Subject: [PATCH 1/6] added cp1252 escapes In EntityArrays, added escape map for CP-1252 encoding and unescape mapping. Added to StringEscapeUtils methods to employ them, preserving existing ESCAPE_HTML4 functionality. --- .../commons/text/StringEscapeUtils.java | 26 +++++++++++ .../commons/text/translate/EntityArrays.java | 45 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 9720ddd723..4d4cfa6463 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -219,6 +219,20 @@ public class StringEscapeUtils { new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) ); + /** + * Translator object for escaping HTML version 4.0 using CP-1252 encoding. + * + * While {@link #escapeHtml4(String)} is the expected method of use, this + * object allows the HTML escaping functionality to be used + * as the foundation for a custom translator. + */ + public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.CP1252_ESCAPE), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) + ); + /** * Translator object for escaping individual Comma Separated Values. * @@ -335,6 +349,18 @@ public class StringEscapeUtils { new NumericEntityUnescaper() ); + /** + * Translator object for unescaping escaped HTML 4.0, using the CP-1252 character + * with extension over ISO 8899-1 for code points 128 to 159. 
+ */ + public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 = + new AggregateTranslator( + new LookupTranslator(EntityArrays.BASIC_UNESCAPE), + new LookupTranslator(EntityArrays.CP1252_UNESCAPE), + new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), + new NumericEntityUnescaper() + ) + /** * Translator object for unescaping escaped XML. * diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java index 0c36c6f1de..e939918ca8 100644 --- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java +++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java @@ -426,6 +426,51 @@ public class EntityArrays { JAVA_CTRL_CHARS_UNESCAPE = Collections.unmodifiableMap(invert(JAVA_CTRL_CHARS_ESCAPE)); } + /** + * A Map<CharSequence, CharSequence> to escape the CP-1252 encoding. This map is a superset of + * ISO-8859-1 encoding, with an + * extension for characters with code points 128 to 159. 
+ */ + public static final Map CP1252_ESCAPE; + static { + final Map initialMap = new HashMap<>(ISO8859_1_ESCAPE); + initialMap.put("\u20AC", "€"); // euro sign + initialMap.put("\u201A", "‚"); // german single quotes left + initialMap.put("\u0192", "ƒ"); // florin sign + initialMap.put("\u201E", "„"); // hungarian first level quotes left + initialMap.put("\u2026", "…"); // horizontal ellipsis + initialMap.put("\u2020", "†"); // dagger + initialMap.put("\u2021", "‡"); // double dagger + initialMap.put("\u02C6", "ˆ"); // modifier letter circumflex accent + initialMap.put("\u2030", "‰"); // per mille + initialMap.put("\u0160", "Š"); // LATIN CAPITAL LETTER S WITH CARON + initialMap.put("\u2039", "‹"); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK + initialMap.put("\u0152", "Œ"); // LATIN CAPITAL LIGATURE OE + initialMap.put("\u017D", "Ž"); // LATIN CAPITAL LETTER Z WITH CARON + initialMap.put("\u2018", "‘"); // LEFT SINGLE QUOTATION MARK + initialMap.put("\u2019", "’"); // RIGHT SINGLE QUOTATION MARK + initialMap.put("\u201C", "“"); // LEFT DOUBLE QUOTATION MARK + initialMap.put("\u201D", "”"); // RIGHT DOUBLE QUOTATION MARK + initialMap.put("\u2022", "•"); // BULLET + initialMap.put("\u2013", "–"); // EN DASH + initialMap.put("\u2014", "—"); // EM DASH + initialMap.put("\u02DC", "˜"); // SMALL TILDE + initialMap.put("\u2122", "™"); // TRADE MARK SIGN + initialMap.put("\u0161", "š"); // LATIN SMALL LETTER S WITH CARON + initialMap.put("\u203A", "›"); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + initialMap.put("\u0153", "œ"); // LATIN SMALL LIGATURE OE + initialMap.put("\u0178", "Ÿ"); // LATIN CAPITAL LETTER Y WITH DIAERESIS + CP1252_ESCAPE = Collections.unmodifiableMap(initialMap); + } + + /** + * Reverse of {@link #CP1252_ESCAPE} for unescaping purposes. + */ + public static final Map CP1252_UNESCAPE; + static { + CP1252_UNESCAPE = Collections.unmodifiableMap(invert(CP1252_ESCAPE)); + } + /** * Used to invert an escape Map into an unescape Map. 
* @param map Map<String, String> to be inverted From 335a4e5b3f3a702058a9f426335a8bbd81509440 Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 09:15:58 -0500 Subject: [PATCH 2/6] Update StringEscapeUtils.java missed a semicolon --- src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 4d4cfa6463..9fbfffeff2 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -359,7 +359,7 @@ public class StringEscapeUtils { new LookupTranslator(EntityArrays.CP1252_UNESCAPE), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), new NumericEntityUnescaper() - ) + ); /** * Translator object for unescaping escaped XML. From 072b9140b324246efcc94a52a46ece412374c978 Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 10:35:38 -0500 Subject: [PATCH 3/6] separated cp1252 set from iso8859 set added smart quotes test separation needed to avoid entitiyarrays test error --- src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 ++ .../java/org/apache/commons/text/translate/EntityArrays.java | 5 +++-- .../java/org/apache/commons/text/StringEscapeUtilsTest.java | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 9fbfffeff2..1d6b758528 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -229,6 +229,7 @@ public class StringEscapeUtils { public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), new 
LookupTranslator(EntityArrays.CP1252_ESCAPE), new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) ); @@ -356,6 +357,7 @@ public class StringEscapeUtils { public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_UNESCAPE), + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), new LookupTranslator(EntityArrays.CP1252_UNESCAPE), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), new NumericEntityUnescaper() diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java index e939918ca8..0c64734513 100644 --- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java +++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java @@ -429,11 +429,12 @@ public class EntityArrays { /** * A Map<CharSequence, CharSequence> to escape the CP-1252 encoding. This map is a superset of * ISO-8859-1 encoding, with an - * extension for characters with code points 128 to 159. + * extension for characters with code points 128 to 159. This must be used with {@link #ISO8859_1_ESCAPE} + * to get all CP-1252 code points. 
*/ public static final Map CP1252_ESCAPE; static { - final Map initialMap = new HashMap<>(ISO8859_1_ESCAPE); + final Map initialMap = new HashMap<>(); initialMap.put("\u20AC", "€"); // euro sign initialMap.put("\u201A", "‚"); // german single quotes left initialMap.put("\u0192", "ƒ"); // florin sign diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java index d0c6ef5288..3175a44088 100644 --- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java +++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java @@ -55,6 +55,7 @@ public class StringEscapeUtilsTest { {"null", null, null}, {"ampersand", "bread & butter", "bread & butter"}, {"quotes", ""bread" & butter", "\"bread\" & butter"}, + {"smart quotes", "“bread and circuses”", "“bread and circuses”"}, {"final character only", "greater than >", "greater than >"}, {"first character only", "< less than", "< less than"}, {"apostrophe", "Huntington's chorea", "Huntington's chorea"}, From f39dc3fead21f7e8132e333566805b07e6ec51ea Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 10:43:44 -0500 Subject: [PATCH 4/6] update string escape utils html3 and 4 default operation update default operation; see https://issues.apache.org/jira/browse/TEXT-192 as to why made default --- .../commons/text/StringEscapeUtils.java | 29 ++----------------- .../commons/text/StringEscapeUtilsTest.java | 1 + 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 1d6b758528..44af86aef5 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -203,6 +203,7 @@ public class StringEscapeUtils { new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE), new 
LookupTranslator(EntityArrays.ISO8859_1_ESCAPE) + new LookupTranslator(EntityArrays.CP1252_ESCAPE) ); /** @@ -213,20 +214,6 @@ public class StringEscapeUtils { * as the foundation for a custom translator. */ public static final CharSequenceTranslator ESCAPE_HTML4 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), - new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE) - ); - - /** - * Translator object for escaping HTML version 4.0 using CP-1252 encoding. - * - * While {@link #escapeHtml4(String)} is the expected method of use, this - * object allows the HTML escaping functionality to be used - * as the foundation for a custom translator. - */ - public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE), new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), @@ -332,6 +319,7 @@ public class StringEscapeUtils { new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_UNESCAPE), new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), + new LookupTranslator(EntityArrays.CP1252_UNESCAPE), new NumericEntityUnescaper() ); @@ -346,23 +334,10 @@ public class StringEscapeUtils { new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_UNESCAPE), new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), - new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), - new NumericEntityUnescaper() - ); - - /** - * Translator object for unescaping escaped HTML 4.0, using the CP-1252 character - * with extension over ISO 8899-1 for code points 128 to 159. 
- */ - public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 = - new AggregateTranslator( - new LookupTranslator(EntityArrays.BASIC_UNESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), new LookupTranslator(EntityArrays.CP1252_UNESCAPE), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), new NumericEntityUnescaper() ); - /** * Translator object for unescaping escaped XML. * diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java index 3175a44088..57c928d6be 100644 --- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java +++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java @@ -49,6 +49,7 @@ public class StringEscapeUtilsTest { private static final String FOO = "foo"; private static final String[][] HTML_ESCAPES = { + // message, expected, original {"no escaping", "plain text", "plain text"}, {"no escaping", "plain text", "plain text"}, {"empty string", "", ""}, From be2cfb493c3b1f96fe6c52cda4ffd513108eb0f4 Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 10:45:30 -0500 Subject: [PATCH 5/6] Update StringEscapeUtils.java missed a comma ugh --- src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java index 44af86aef5..7b9aefc5a3 100644 --- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java +++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java @@ -202,7 +202,7 @@ public class StringEscapeUtils { public static final CharSequenceTranslator ESCAPE_HTML3 = new AggregateTranslator( new LookupTranslator(EntityArrays.BASIC_ESCAPE), - new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE) + new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE), new LookupTranslator(EntityArrays.CP1252_ESCAPE) ); From 
5fb8fc75f6e6ef790817ee31e7c6a82794c4cdb8 Mon Sep 17 00:00:00 2001 From: ifly6 Date: Sat, 12 Dec 2020 10:53:34 -0500 Subject: [PATCH 6/6] Update StringEscapeUtilsTest.java seems the source is in iso 8859-1 --- .../commons/text/StringEscapeUtilsTest.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java index 57c928d6be..865a3dcb1c 100644 --- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java +++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java @@ -16,15 +16,7 @@ */ package org.apache.commons.text; -import static org.apache.commons.text.StringEscapeUtils.escapeXSI; -import static org.apache.commons.text.StringEscapeUtils.unescapeXSI; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.io.StringWriter; @@ -35,7 +27,15 @@ import java.nio.file.Files; import java.nio.file.Paths; -import org.junit.jupiter.api.Test; +import static org.apache.commons.text.StringEscapeUtils.escapeXSI; +import static org.apache.commons.text.StringEscapeUtils.unescapeXSI; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static 
org.junit.jupiter.api.Assertions.fail; /** * Unit tests for {@link StringEscapeUtils}. @@ -56,7 +56,7 @@ public class StringEscapeUtilsTest { {"null", null, null}, {"ampersand", "bread & butter", "bread & butter"}, {"quotes", ""bread" & butter", "\"bread\" & butter"}, - {"smart quotes", "“bread and circuses”", "“bread and circuses”"}, + {"smart quotes", "“bread and circuses”", "\u201Cbread and circuses\u201d"}, {"final character only", "greater than >", "greater than >"}, {"first character only", "< less than", "< less than"}, {"apostrophe", "Huntington's chorea", "Huntington's chorea"},