From b3a56957e53aac8702f8c0cbfbca557353105a80 Mon Sep 17 00:00:00 2001
From: "Michael[tm] Smith" <mike@w3.org>
Date: Tue, 1 Nov 2022 03:22:57 +0900
Subject: [PATCH] Make preprocessing of input stream handle supplementary
 characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes https://github.com/w3c/css-validator/issues/383

When performing preprocessing of the input stream as specified in
https://drafts.csswg.org/css-syntax/#input-preprocessing, this change
makes our implementation handle non-BMP supplementary characters as
expected — by only replacing surrogates with U+FFFD if they are lone
(unpaired) surrogates, but not replacing surrogates that are part of
surrogate pairs (a high surrogate followed by a low surrogate).

Without this change, a parse error occurs whenever our implementation
encounters a supplementary character in the input stream.
---
 org/w3c/css/util/UnescapeFilterReader.java | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/org/w3c/css/util/UnescapeFilterReader.java b/org/w3c/css/util/UnescapeFilterReader.java
index 179d03bd1..cd417c64d 100644
--- a/org/w3c/css/util/UnescapeFilterReader.java
+++ b/org/w3c/css/util/UnescapeFilterReader.java
@@ -3,6 +3,7 @@
 import java.io.FilterReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.lang.Character;
 
 public class UnescapeFilterReader extends FilterReader {
 
@@ -32,7 +33,14 @@ public int read()
             return 0xfffd; // U+FFFD REPLACEMENT CHARACTER
         }
         if (c >= 0xd800 && c <= 0xdfff) { // surrogate
-            return 0xfffd;
+            if (!Character.isHighSurrogate((char) c)) {
+                return 0xfffd;
+            }
+            mark(1);
+            if (!Character.isLowSurrogate((char) in.read())) {
+                return 0xfffd;
+            }
+            reset();
         }
 
         // now specific case of CSS unicode escape for ascii values [A-Za-z0-9].
@@ -109,7 +117,15 @@ public int read(char[] cbuf, int off, int len) throws IOException {
             } else if (chars[i] == 0) {
                 chars[j++] = 0xfffd;
             } else if (chars[i] >= 0xd800 && chars[i] <= 0xdfff) {
-                chars[j++] = 0xfffd;
+                if (i + 1 >= l) {
+                    chars[j++] = 0xfffd;
+                } else if (!Character.isHighSurrogate((char) chars[i])) {
+                    chars[j++] = 0xfffd;
+                } else if (!Character.isLowSurrogate((char) chars[i + 1])) {
+                    chars[j++] = 0xfffd;
+                }
+                i++;
+                j++;
             }
             // escaping