diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index 9720ddd723..7b9aefc5a3 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -202,7 +202,8 @@ public class StringEscapeUtils {
     public static final CharSequenceTranslator ESCAPE_HTML3 =
             new AggregateTranslator(
                     new LookupTranslator(EntityArrays.BASIC_ESCAPE),
-                    new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE)
+                    new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
+                    new LookupTranslator(EntityArrays.CP1252_ESCAPE)
             );
 
     /**
@@ -216,6 +217,7 @@ public class StringEscapeUtils {
             new AggregateTranslator(
                     new LookupTranslator(EntityArrays.BASIC_ESCAPE),
                     new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
+                    new LookupTranslator(EntityArrays.CP1252_ESCAPE),
                     new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
             );
 
@@ -317,6 +319,7 @@ public class StringEscapeUtils {
             new AggregateTranslator(
                     new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
                     new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
+                    new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
                     new NumericEntityUnescaper()
             );
 
@@ -331,6 +334,7 @@ public class StringEscapeUtils {
             new AggregateTranslator(
                     new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
                     new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
+                    new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
                     new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
                     new NumericEntityUnescaper()
             );
diff --git a/src/main/java/org/apache/commons/text/translate/EntityArrays.java b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
index 0c36c6f1de..0c64734513 100644
--- a/src/main/java/org/apache/commons/text/translate/EntityArrays.java
+++ b/src/main/java/org/apache/commons/text/translate/EntityArrays.java
@@ -426,6 +426,53 @@ public class EntityArrays {
         JAVA_CTRL_CHARS_UNESCAPE = Collections.unmodifiableMap(invert(JAVA_CTRL_CHARS_ESCAPE));
     }
 
+    /**
+     * A Map&lt;CharSequence, CharSequence&gt; to escape the characters that the CP-1252 encoding
+     * places at code points 128 to 159. CP-1252 extends ISO-8859-1 in exactly that range, so this
+     * map only covers the extension and must be used together with {@link #ISO8859_1_ESCAPE}
+     * to get all CP-1252 code points.
+     */
+    public static final Map<CharSequence, CharSequence> CP1252_ESCAPE;
+    static {
+        final Map<CharSequence, CharSequence> initialMap = new HashMap<>();
+        initialMap.put("\u20AC", "€"); // euro sign
+        initialMap.put("\u201A", "‚"); // german single quotes left
+        initialMap.put("\u0192", "ƒ"); // florin sign
+        initialMap.put("\u201E", "„"); // hungarian first level quotes left
+        initialMap.put("\u2026", "…"); // horizontal ellipsis
+        initialMap.put("\u2020", "†"); // dagger
+        initialMap.put("\u2021", "‡"); // double dagger
+        initialMap.put("\u02C6", "ˆ"); // modifier letter circumflex accent
+        initialMap.put("\u2030", "‰"); // per mille
+        initialMap.put("\u0160", "Š"); // LATIN CAPITAL LETTER S WITH CARON
+        initialMap.put("\u2039", "‹"); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+        initialMap.put("\u0152", "Œ"); // LATIN CAPITAL LIGATURE OE
+        initialMap.put("\u017D", "Ž"); // LATIN CAPITAL LETTER Z WITH CARON
+        initialMap.put("\u2018", "‘"); // LEFT SINGLE QUOTATION MARK
+        initialMap.put("\u2019", "’"); // RIGHT SINGLE QUOTATION MARK
+        initialMap.put("\u201C", "“"); // LEFT DOUBLE QUOTATION MARK
+        initialMap.put("\u201D", "”"); // RIGHT DOUBLE QUOTATION MARK
+        initialMap.put("\u2022", "•"); // BULLET
+        initialMap.put("\u2013", "–"); // EN DASH
+        initialMap.put("\u2014", "—"); // EM DASH
+        initialMap.put("\u02DC", "˜"); // SMALL TILDE
+        initialMap.put("\u2122", "™"); // TRADE MARK SIGN
+        initialMap.put("\u0161", "š"); // LATIN SMALL LETTER S WITH CARON
+        initialMap.put("\u203A", "›"); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+        initialMap.put("\u0153", "œ"); // LATIN SMALL LIGATURE OE
+        initialMap.put("\u017E", "ž"); // LATIN SMALL LETTER Z WITH CARON
+        initialMap.put("\u0178", "Ÿ"); // LATIN CAPITAL LETTER Y WITH DIAERESIS
+        CP1252_ESCAPE = Collections.unmodifiableMap(initialMap);
+    }
+
+    /**
+     * Reverse of {@link #CP1252_ESCAPE} for unescaping purposes.
+     */
+    public static final Map<CharSequence, CharSequence> CP1252_UNESCAPE;
+    static {
+        CP1252_UNESCAPE = Collections.unmodifiableMap(invert(CP1252_ESCAPE));
+    }
+
     /**
      * Used to invert an escape Map into an unescape Map.
      * @param map Map&lt;String, String&gt; to be inverted
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
index d0c6ef5288..865a3dcb1c 100644
--- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -16,15 +16,7 @@
  */
 package org.apache.commons.text;
 
-import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
-import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
+import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 import java.io.StringWriter;
@@ -35,7 +27,15 @@
 import java.nio.file.Files;
 import java.nio.file.Paths;
 
-import org.junit.jupiter.api.Test;
+import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
+import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 
 /**
  * Unit tests for {@link StringEscapeUtils}.
@@ -49,12 +49,14 @@ public class StringEscapeUtilsTest {
     private static final String FOO = "foo";
 
     private static final String[][] HTML_ESCAPES = {
+        // message, expected, original
         {"no escaping", "plain text", "plain text"},
         {"no escaping", "plain text", "plain text"},
         {"empty string", "", ""},
         {"null", null, null},
         {"ampersand", "bread &amp; butter", "bread & butter"},
         {"quotes", "&quot;bread&quot; &amp; butter", "\"bread\" & butter"},
+        {"smart quotes", "“bread and circuses”", "\u201Cbread and circuses\u201D"},
         {"final character only", "greater than &gt;", "greater than >"},
         {"first character only", "&lt; less than", "< less than"},
         {"apostrophe", "Huntington's chorea", "Huntington's chorea"},