Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/main/java/org/apache/commons/text/StringEscapeUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ public class StringEscapeUtils {
public static final CharSequenceTranslator ESCAPE_HTML3 =
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE)
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_ESCAPE)
);

/**
Expand All @@ -216,6 +217,7 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_ESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE),
new LookupTranslator(EntityArrays.CP1252_ESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE)
);

Expand Down Expand Up @@ -317,6 +319,7 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new NumericEntityUnescaper()
);

Expand All @@ -331,10 +334,10 @@ public class StringEscapeUtils {
new AggregateTranslator(
new LookupTranslator(EntityArrays.BASIC_UNESCAPE),
new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE),
new LookupTranslator(EntityArrays.CP1252_UNESCAPE),
new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE),
new NumericEntityUnescaper()
);

/**
* Translator object for unescaping escaped XML.
*
Expand Down
46 changes: 46 additions & 0 deletions src/main/java/org/apache/commons/text/translate/EntityArrays.java
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,52 @@ public class EntityArrays {
JAVA_CTRL_CHARS_UNESCAPE = Collections.unmodifiableMap(invert(JAVA_CTRL_CHARS_ESCAPE));
}

/**
 * A {@code Map<CharSequence, CharSequence>} to escape the characters that the CP-1252 (Windows-1252)
 * encoding adds to
 * <a href="https://secure.wikimedia.org/wikipedia/en/wiki/ISO/IEC_8859-1">ISO-8859-1</a>, i.e. the
 * characters CP-1252 assigns to byte values 128 to 159. This map holds only that extension; it must be
 * used together with {@link #ISO8859_1_ESCAPE} to cover all CP-1252 code points.
 *
 * <p>Keys are the Unicode characters and values are their escaped forms. Most values are named
 * entities; the two "Z with caron" letters are written as numeric character references.</p>
 */
public static final Map<CharSequence, CharSequence> CP1252_ESCAPE;
static {
    final Map<CharSequence, CharSequence> initialMap = new HashMap<>();
    initialMap.put("\u20AC", "&euro;");   // EURO SIGN
    initialMap.put("\u201A", "&sbquo;");  // SINGLE LOW-9 QUOTATION MARK
    initialMap.put("\u0192", "&fnof;");   // LATIN SMALL LETTER F WITH HOOK (florin sign)
    initialMap.put("\u201E", "&bdquo;");  // DOUBLE LOW-9 QUOTATION MARK
    initialMap.put("\u2026", "&hellip;"); // HORIZONTAL ELLIPSIS
    initialMap.put("\u2020", "&dagger;"); // DAGGER
    initialMap.put("\u2021", "&Dagger;"); // DOUBLE DAGGER
    initialMap.put("\u02C6", "&circ;");   // MODIFIER LETTER CIRCUMFLEX ACCENT
    initialMap.put("\u2030", "&permil;"); // PER MILLE SIGN
    initialMap.put("\u0160", "&Scaron;"); // LATIN CAPITAL LETTER S WITH CARON
    initialMap.put("\u2039", "&lsaquo;"); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    initialMap.put("\u0152", "&OElig;");  // LATIN CAPITAL LIGATURE OE
    initialMap.put("\u017D", "&#x17d;");  // LATIN CAPITAL LETTER Z WITH CARON
    initialMap.put("\u2018", "&lsquo;");  // LEFT SINGLE QUOTATION MARK
    initialMap.put("\u2019", "&rsquo;");  // RIGHT SINGLE QUOTATION MARK
    initialMap.put("\u201C", "&ldquo;");  // LEFT DOUBLE QUOTATION MARK
    initialMap.put("\u201D", "&rdquo;");  // RIGHT DOUBLE QUOTATION MARK
    initialMap.put("\u2022", "&bull;");   // BULLET
    initialMap.put("\u2013", "&ndash;");  // EN DASH
    initialMap.put("\u2014", "&mdash;");  // EM DASH
    initialMap.put("\u02DC", "&tilde;");  // SMALL TILDE
    initialMap.put("\u2122", "&trade;");  // TRADE MARK SIGN
    initialMap.put("\u0161", "&scaron;"); // LATIN SMALL LETTER S WITH CARON
    initialMap.put("\u203A", "&rsaquo;"); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    initialMap.put("\u0153", "&oelig;");  // LATIN SMALL LIGATURE OE
    initialMap.put("\u017E", "&#x17e;");  // LATIN SMALL LETTER Z WITH CARON
    initialMap.put("\u0178", "&Yuml;");   // LATIN CAPITAL LETTER Y WITH DIAERESIS
    CP1252_ESCAPE = Collections.unmodifiableMap(initialMap);
}

/**
* Reverse of {@link #CP1252_ESCAPE} for unescaping purposes.
*/
public static final Map<CharSequence, CharSequence> CP1252_UNESCAPE;
static {
CP1252_UNESCAPE = Collections.unmodifiableMap(invert(CP1252_ESCAPE));
}

/**
* Used to invert an escape Map into an unescape Map.
* @param map Map&lt;String, String&gt; to be inverted
Expand Down
22 changes: 12 additions & 10 deletions src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,7 @@
*/
package org.apache.commons.text;

import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.io.StringWriter;
Expand All @@ -35,7 +27,15 @@
import java.nio.file.Files;
import java.nio.file.Paths;

import org.junit.jupiter.api.Test;
import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

/**
* Unit tests for {@link StringEscapeUtils}.
Expand All @@ -49,12 +49,14 @@ public class StringEscapeUtilsTest {
private static final String FOO = "foo";

private static final String[][] HTML_ESCAPES = {
// message, expected, original
{"no escaping", "plain text", "plain text"},
{"no escaping", "plain text", "plain text"},
{"empty string", "", ""},
{"null", null, null},
{"ampersand", "bread &amp; butter", "bread & butter"},
{"quotes", "&quot;bread&quot; &amp; butter", "\"bread\" & butter"},
{"smart quotes", "&ldquo;bread and circuses&rdquo;", "\u201Cbread and circuses\u201d"},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see a new map with dozens of new entries but you are only testing a single value? Am I reading this right or are all the other map entries somehow also tested?

Copy link
Author

@ifly6 ifly6 Dec 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested a single value because the existing tests seem to cover only a few values; adding an exhaustive set of tests would mostly just reproduce the existing map.

That said, thinking about how the library handles numeric escapes, I don't think it solves the initial problem that led me towards creating this pull request – translating something like &#147; to “ or &#137; to ‰ – so I've closed it.

{"final character only", "greater than &gt;", "greater than >"},
{"first character only", "&lt; less than", "< less than"},
{"apostrophe", "Huntington's chorea", "Huntington's chorea"},
Expand Down