From b3a56957e53aac8702f8c0cbfbca557353105a80 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith"
Date: Tue, 1 Nov 2022 03:22:57 +0900
Subject: [PATCH] Make preprocessing of input stream handle supplementary characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes https://github.com/w3c/css-validator/issues/383

When performing preprocessing of the input stream as specified in
https://drafts.csswg.org/css-syntax/#input-preprocessing, this change
makes our implementation handle non-BMP supplementary characters as
expected — by only replacing surrogates with U+FFFD if they are lone
(unpaired) surrogates, but not replacing surrogates that are part of
surrogate pairs (a high surrogate followed by a low surrogate).

Otherwise, without this change, a parse error will occur when our
implementation encounters supplementary characters in the input stream.
---
 org/w3c/css/util/UnescapeFilterReader.java | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/org/w3c/css/util/UnescapeFilterReader.java b/org/w3c/css/util/UnescapeFilterReader.java
index 179d03bd1..cd417c64d 100644
--- a/org/w3c/css/util/UnescapeFilterReader.java
+++ b/org/w3c/css/util/UnescapeFilterReader.java
@@ -3,6 +3,7 @@
 import java.io.FilterReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.lang.Character;
 
 public class UnescapeFilterReader extends FilterReader {
 
@@ -32,7 +33,14 @@ public int read()
             return 0xfffd; // U+FFFD REPLACEMENT CHARACTER
         }
         if (c >= 0xd800 && c <= 0xdfff) { // surrogate
-            return 0xfffd;
+            if (!Character.isHighSurrogate((char) c)) {
+                return 0xfffd;
+            }
+            mark(1);
+            if (!Character.isLowSurrogate((char) in.read())) {
+                return 0xfffd;
+            }
+            reset();
         }
 
         // now specific case of CSS unicode escape for ascii values [A-Za-z0-9].
@@ -109,7 +117,15 @@ public int read(char[] cbuf, int off, int len) throws IOException {
             } else if (chars[i] == 0) {
                 chars[j++] = 0xfffd;
             } else if (chars[i] >= 0xd800 && chars[i] <= 0xdfff) {
-                chars[j++] = 0xfffd;
+                if (i + 1 >= l) {
+                    chars[j++] = 0xfffd;
+                } else if (!Character.isHighSurrogate((char) chars[i])) {
+                    chars[j++] = 0xfffd;
+                } else if (!Character.isLowSurrogate((char) chars[i + 1])) {
+                    chars[j++] = 0xfffd;
+                }
+                i++;
+                j++;
             }
             // escaping