Rework unesc for a 60+% performance boost to all of postcss.

samccone · samccone · commit e1c76ff05d84 · 2021-04-08T23:06:21.000-07:00
In profiling postcss I found that a significant amount of time was being
spent in unesc, this was due to the expenive regex checks that were
being performed on the fly for every selector in the codebase.

By migrating the implementation to a constant runtime implementation I
am seeing in my local application testing a 60% speedup which is saving
me multiple seconds off of my overall build time!

This implementation passes all of the existing test cases and aims to
mirror the prior implementation's implementation details :)
diff --git a/src/__tests__/classes.js b/src/__tests__/classes.js
@@ -264,3 +264,9 @@ test('class selector with escaping (36)', '.not-pseudo\\:\\:focus', (t, tree) =>
     t.deepEqual(tree.nodes[0].nodes[0].type, 'class');
     t.deepEqual(tree.nodes[0].nodes[0].raws.value, 'not-pseudo\\:\\:focus');
 });
+
+test('class selector with escaping with more chars (37)', '.\\1D306k', (t, tree) => {
+    t.deepEqual(tree.nodes[0].nodes[0].value, '𝌆k');
+    t.deepEqual(tree.nodes[0].nodes[0].type, 'class');
+    t.deepEqual(tree.nodes[0].nodes[0].raws.value, '\\1D306k');
+});
diff --git a/src/util/unesc.js b/src/util/unesc.js
@@ -1,19 +1,96 @@
-const whitespace = '[\\x20\\t\\r\\n\\f]';
-const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?|(' + whitespace + ')|.)', 'ig');
+// Many thanks for this post which made this migration much easier.
+// https://mathiasbynens.be/notes/css-escapes
+
+const CSS_SPECIAL_CHARS = new Set([
+    "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ";", "<", "=", ">", "?", 
+    "@", "[", "]", "^", "`", "{", "|", "}", "~", "_", ":",
+]);
+
+const CSS_SPECIAL_CHARS_HEX_LONG = {}; 
+const CSS_SPECIAL_CHARS_HEX_SHORT = {};
+
+for (const char of Array.from(CSS_SPECIAL_CHARS)) {
+    CSS_SPECIAL_CHARS_HEX_LONG[`0000${char.charCodeAt(0).toString(16)}`] = char;
+    CSS_SPECIAL_CHARS_HEX_SHORT[`${char.charCodeAt(0).toString(16)}`] = char;
+}
+
+function matchesOctal (s) {
+    const short = s[0] + s[1];
+    if (CSS_SPECIAL_CHARS_HEX_SHORT[short]) {
+        return {
+            char: CSS_SPECIAL_CHARS_HEX_SHORT[short],
+            length: 3,
+        };
+    } 
+    const long = s[0] + s[1] + s[2] + s[3] + s[4] + s[5];
+    if (CSS_SPECIAL_CHARS_HEX_LONG[long]) {
+        return {
+            char: CSS_SPECIAL_CHARS_HEX_LONG[long],
+            length: 6,
+        };
+    }
+
+    return undefined;
+}
 
 export default function unesc (str) {
-    return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => {
-        const high = '0x' + escaped - 0x10000;
-
-        // NaN means non-codepoint
-        // Workaround erroneous numeric interpretation of +"0x"
-        // eslint-disable-next-line no-self-compare
-        return high !== high || escapedWhitespace
-            ? escaped
-            : high < 0
-                ? // BMP codepoint
-                String.fromCharCode(high + 0x10000)
-                : // Supplemental Plane codepoint (surrogate pair)
-                String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00);
-    });
+    let ret = "";
+
+    for (let i = 0; i < str.length; i++) {
+        if (str[i] === "\\" && !isNaN(Number(str[i + 1]))) {
+            // Handle the \3 leading digit escape case.
+            if (str[i + 1] === "3" && Number(str[i + 2]) <= 9) {
+                ret += str[i + 2];
+                i += 3;
+                continue;
+            // Special case the ":" handling .. thus the "A" check.
+            } else if (str[i +2] === "A") {
+                ret += ":";
+                i += 3;
+                continue;
+            } else {
+                const match = matchesOctal(str.slice(i+1, i + 7));
+                if (match) {
+                    ret += match.char;
+                    i += match.length;
+                    continue;
+                }
+            } 
+        }
+
+        if ((str[i] === "\\")) {
+            // if // is at the end of the string retain it
+            // https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb
+            if (str.length === i + 1) {
+                ret += str[i];
+            }
+
+            // Retain a pair of \\ if double escaped `\\\\`
+            // https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e
+            if (str[i +1] === "\\") {
+                ret += "\\";
+                i++;
+                continue;
+            }
+
+            // Need to check if we have an emoji
+            //   \1D306
+            //   i_____ 
+            // Do a fast bounds check before doing the more expensive emoji check
+            if ((str.length) > i+5) {
+                // https://github.com/postcss/postcss-selector-parser/pull/184
+                const codePoint = parseInt(`0x${str.slice(i+1, i+6)}`, 16);
+                if (codePoint > 0x10000) {
+                    ret += String.fromCodePoint(codePoint);
+                    i+= 5;
+                }
+            }
+
+            continue;
+        }
+      
+        ret += str[i];
+    }
+
+    return ret;
 }