From 13e137ee8de5f2e122fde03b5a7b3b505d3fc6e1 Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 09:11:32 -0500
Subject: [PATCH 1/6] added cp1252 escapes
In EntityArrays, added an escape map for the CP-1252 encoding and the corresponding unescape map. Added translators to StringEscapeUtils that employ them, preserving existing ESCAPE_HTML4 functionality.
---
.../commons/text/StringEscapeUtils.java | 26 +++++++++++
.../commons/text/translate/EntityArrays.java | 45 +++++++++++++++++++
2 files changed, 71 insertions(+)
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 9720ddd723..4d4cfa6463 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -219,6 +219,20 @@ public class StringEscapeUtils {
new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
);
+ /**
+ * Translator object for escaping HTML version 4.0 using CP-1252 encoding.
+ *
+ * While {@link #escapeHtml4(String)} is the expected method of use, this
+ * object allows the HTML escaping functionality to be used
+ * as the foundation for a custom translator.
+ */
+ public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_ESCAPE),
+ new LookupTranslator(EntityArrays.CP1252_ESCAPE),
+ new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
+ );
+
/**
* Translator object for escaping individual Comma Separated Values.
*
@@ -335,6 +349,18 @@ public class StringEscapeUtils {
new NumericEntityUnescaper()
);
+ /**
+ * Translator object for unescaping escaped HTML 4.0, using the CP-1252 character
+ * with extension over ISO 8899-1 for code points 128 to 159.
+ */
+ public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 =
+ new AggregateTranslator(
+ new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
+ new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
+ new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
+ new NumericEntityUnescaper()
+ )
+
/**
* Translator object for unescaping escaped XML.
*
diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
index 0c36c6f1de..e939918ca8 100644
--- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java
+++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
@@ -426,6 +426,51 @@ public class EntityArrays {
JAVA_CTRL_CHARS_UNESCAPE = Collections.unmodifiableMap(invert(JAVA_CTRL_CHARS_ESCAPE));
}
+ /**
+ * A Map<CharSequence, CharSequence> to escape the CP-1252 encoding. This map is a superset of
+ * ISO-8859-1 encoding, with an
+ * extension for characters with code points 128 to 159.
+ */
+ public static final Map CP1252_ESCAPE;
+ static {
+ final Map initialMap = new HashMap<>(ISO8859_1_ESCAPE);
+ initialMap.put("\u20AC", "€"); // euro sign
+ initialMap.put("\u201A", "‚"); // german single quotes left
+ initialMap.put("\u0192", "ƒ"); // florin sign
+ initialMap.put("\u201E", "„"); // hungarian first level quotes left
+ initialMap.put("\u2026", "…"); // horizontal ellipsis
+ initialMap.put("\u2020", "†"); // dagger
+ initialMap.put("\u2021", "‡"); // double dagger
+ initialMap.put("\u02C6", "ˆ"); // modifier letter circumflex accent
+ initialMap.put("\u2030", "‰"); // per mille
+ initialMap.put("\u0160", "Š"); // LATIN CAPITAL LETTER S WITH CARON
+ initialMap.put("\u2039", "‹"); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ initialMap.put("\u0152", "Œ"); // LATIN CAPITAL LIGATURE OE
+ initialMap.put("\u017D", "Ž"); // LATIN CAPITAL LETTER Z WITH CARON
+ initialMap.put("\u2018", "‘"); // LEFT SINGLE QUOTATION MARK
+ initialMap.put("\u2019", "’"); // RIGHT SINGLE QUOTATION MARK
+ initialMap.put("\u201C", "“"); // LEFT DOUBLE QUOTATION MARK
+ initialMap.put("\u201D", "”"); // RIGHT DOUBLE QUOTATION MARK
+ initialMap.put("\u2022", "•"); // BULLET
+ initialMap.put("\u2013", "–"); // EN DASH
+ initialMap.put("\u2014", "—"); // EM DASH
+ initialMap.put("\u02DC", "˜"); // SMALL TILDE
+ initialMap.put("\u2122", "™"); // TRADE MARK SIGN
+ initialMap.put("\u0161", "š"); // LATIN SMALL LETTER S WITH CARON
+ initialMap.put("\u203A", "›"); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ initialMap.put("\u0153", "œ"); // LATIN SMALL LIGATURE OE
+ initialMap.put("\u0178", "Ÿ"); // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ CP1252_ESCAPE = Collections.unmodifiableMap(initialMap);
+ }
+
+ /**
+ * Reverse of {@link #CP1252_ESCAPE} for unescaping purposes.
+ */
+ public static final Map CP1252_UNESCAPE;
+ static {
+ CP1252_UNESCAPE = Collections.unmodifiableMap(invert(CP1252_ESCAPE));
+ }
+
/**
* Used to invert an escape Map into an unescape Map.
* @param map Map<String, String> to be inverted
From 335a4e5b3f3a702058a9f426335a8bbd81509440 Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 09:15:58 -0500
Subject: [PATCH 2/6] Update StringEscapeUtils.java
missed a semicolon
---
src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 4d4cfa6463..9fbfffeff2 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -359,7 +359,7 @@ public class StringEscapeUtils {
new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
new NumericEntityUnescaper()
- )
+ );
/**
* Translator object for unescaping escaped XML.
From 072b9140b324246efcc94a52a46ece412374c978 Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 10:35:38 -0500
Subject: [PATCH 3/6] separated cp1252 set from iso8859 set
added smart quotes test
separation needed to avoid an EntityArrays test error
---
src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 ++
.../java/org/apache/commons/text/translate/EntityArrays.java | 5 +++--
.../java/org/apache/commons/text/StringEscapeUtilsTest.java | 1 +
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 9fbfffeff2..1d6b758528 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -229,6 +229,7 @@ public class StringEscapeUtils {
public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
+ new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_ESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
);
@@ -356,6 +357,7 @@ public class StringEscapeUtils {
public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
+ new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
new NumericEntityUnescaper()
diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
index e939918ca8..0c64734513 100644
--- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java
+++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
@@ -429,11 +429,12 @@ public class EntityArrays {
/**
* A Map<CharSequence, CharSequence> to escape the CP-1252 encoding. This map is a superset of
* ISO-8859-1 encoding, with an
- * extension for characters with code points 128 to 159.
+ * extension for characters with code points 128 to 159. This must be used with {@link #ISO8859_1_ESCAPE}
+ * to get all CP-1252 code points.
*/
public static final Map CP1252_ESCAPE;
static {
- final Map initialMap = new HashMap<>(ISO8859_1_ESCAPE);
+ final Map initialMap = new HashMap<>();
initialMap.put("\u20AC", "€"); // euro sign
initialMap.put("\u201A", "‚"); // german single quotes left
initialMap.put("\u0192", "ƒ"); // florin sign
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
index d0c6ef5288..3175a44088 100644
--- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -55,6 +55,7 @@ public class StringEscapeUtilsTest {
{"null", null, null},
{"ampersand", "bread & butter", "bread & butter"},
{"quotes", ""bread" & butter", "\"bread\" & butter"},
+ {"smart quotes", "“bread and circuses”", "“bread and circuses”"},
{"final character only", "greater than >", "greater than >"},
{"first character only", "< less than", "< less than"},
{"apostrophe", "Huntington's chorea", "Huntington's chorea"},
From f39dc3fead21f7e8132e333566805b07e6ec51ea Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 10:43:44 -0500
Subject: [PATCH 4/6] update string escape utils html3 and 4 default operation
Make CP-1252 handling the default operation of the HTML 3 and HTML 4 translators; see https://issues.apache.org/jira/browse/TEXT-192 for why it was made the default.
---
.../commons/text/StringEscapeUtils.java | 29 ++-----------------
.../commons/text/StringEscapeUtilsTest.java | 1 +
2 files changed, 3 insertions(+), 27 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 1d6b758528..44af86aef5 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -203,6 +203,7 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE)
+ new LookupTranslator(EntityArrays.CP1252_ESCAPE)
);
/**
@@ -213,20 +214,6 @@ public class StringEscapeUtils {
* as the foundation for a custom translator.
*/
public static final CharSequenceTranslator ESCAPE_HTML4 =
- new AggregateTranslator(
- new LookupTranslator(EntityArrays.BASIC_ESCAPE),
- new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
- new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
- );
-
- /**
- * Translator object for escaping HTML version 4.0 using CP-1252 encoding.
- *
- * While {@link #escapeHtml4(String)} is the expected method of use, this
- * object allows the HTML escaping functionality to be used
- * as the foundation for a custom translator.
- */
- public static final CharSequenceTranslator ESCAPE_HTML4_CP1252 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
@@ -332,6 +319,7 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
+ new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new NumericEntityUnescaper()
);
@@ -346,23 +334,10 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
- new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
- new NumericEntityUnescaper()
- );
-
- /**
- * Translator object for unescaping escaped HTML 4.0, using the CP-1252 character
- * with extension over ISO 8899-1 for code points 128 to 159.
- */
- public static final CharSequenceTranslator UNESCAPE_HTML4_CP1252 =
- new AggregateTranslator(
- new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
- new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
new NumericEntityUnescaper()
);
-
/**
* Translator object for unescaping escaped XML.
*
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
index 3175a44088..57c928d6be 100644
--- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -49,6 +49,7 @@ public class StringEscapeUtilsTest {
private static final String FOO = "foo";
private static final String[][] HTML_ESCAPES = {
+ // message, expected, original
{"no escaping", "plain text", "plain text"},
{"no escaping", "plain text", "plain text"},
{"empty string", "", ""},
From be2cfb493c3b1f96fe6c52cda4ffd513108eb0f4 Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 10:45:30 -0500
Subject: [PATCH 5/6] Update StringEscapeUtils.java
missed a comma ugh
---
src/main/java/org/apache/commons/text/StringEscapeUtils.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 44af86aef5..7b9aefc5a3 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -202,7 +202,7 @@ public class StringEscapeUtils {
public static final CharSequenceTranslator ESCAPE_HTML3 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
- new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE)
+ new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_ESCAPE)
);
From 5fb8fc75f6e6ef790817ee31e7c6a82794c4cdb8 Mon Sep 17 00:00:00 2001
From: ifly6
Date: Sat, 12 Dec 2020 10:53:34 -0500
Subject: [PATCH 6/6] Update StringEscapeUtilsTest.java
It seems the source file is encoded in ISO 8859-1, so the test now uses Unicode escapes for the smart quotes.
---
.../commons/text/StringEscapeUtilsTest.java | 22 +++++++++----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
index 57c928d6be..865a3dcb1c 100644
--- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -16,15 +16,7 @@
*/
package org.apache.commons.text;
-import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
-import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.io.StringWriter;
@@ -35,7 +27,15 @@
import java.nio.file.Files;
import java.nio.file.Paths;
-import org.junit.jupiter.api.Test;
+import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
+import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
/**
* Unit tests for {@link StringEscapeUtils}.
@@ -56,7 +56,7 @@ public class StringEscapeUtilsTest {
{"null", null, null},
{"ampersand", "bread & butter", "bread & butter"},
{"quotes", ""bread" & butter", "\"bread\" & butter"},
- {"smart quotes", "“bread and circuses”", "“bread and circuses”"},
+ {"smart quotes", "“bread and circuses”", "\u201Cbread and circuses\u201d"},
{"final character only", "greater than >", "greater than >"},
{"first character only", "< less than", "< less than"},
{"apostrophe", "Huntington's chorea", "Huntington's chorea"},