Skip to content

Commit 7832921

Browse files
committed
css-tokenizer: fix input preprocessing
1 parent d9024f5 commit 7832921

22 files changed

+232
-108
lines changed

package-lock.json

+4-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/css-tokenizer/CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changes to CSS Tokenizer
22

3+
### Unreleased (patch)
4+
5+
- Fix input preprocessing
6+
37
### 3.0.2
48

59
_October 10, 2024_

packages/css-tokenizer/dist/index.cjs

+1-1
Large diffs are not rendered by default.

packages/css-tokenizer/dist/index.mjs

+1-1
Large diffs are not rendered by default.

packages/css-tokenizer/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
"dist"
4949
],
5050
"devDependencies": {
51-
"@rmenke/css-tokenizer-tests": "^1.1.6",
51+
"@rmenke/css-tokenizer-tests": "^1.2.0",
5252
"postcss": "^8.4.45",
5353
"postcss-parser-tests": "^8.8.0"
5454
},

packages/css-tokenizer/src/checks/three-code-points-would-start-ident-sequence.ts

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@ export function checkIfThreeCodePointsWouldStartAnIdentSequence(ctx: Context, re
1414
}
1515

1616
// If the second code point is an ident-start code point return true
17-
if (isIdentStartCodePoint(reader.source.codePointAt(reader.cursor + 1))) {
17+
if (isIdentStartCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1)) {
1818
return true;
1919
}
2020

2121
// If the second and third code points are a valid escape return true
22-
if (reader.source.codePointAt(reader.cursor + 1) === REVERSE_SOLIDUS && !isNewLine(reader.source.codePointAt(reader.cursor + 2))) {
22+
if (reader.source.codePointAt(reader.cursor + 1) === REVERSE_SOLIDUS && !isNewLine(reader.source.codePointAt(reader.cursor + 2) ?? -1)) {
2323
return true;
2424
}
2525

@@ -28,7 +28,7 @@ export function checkIfThreeCodePointsWouldStartAnIdentSequence(ctx: Context, re
2828

2929
// ident-start code point
3030
// Return true.
31-
if (isIdentStartCodePoint(reader.source.codePointAt(reader.cursor))) {
31+
if (isIdentStartCodePoint(reader.source.codePointAt(reader.cursor) ?? -1)) {
3232
return true;
3333
}
3434

packages/css-tokenizer/src/checks/three-code-points-would-start-number.ts

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ import type { CodePointReader } from '../interfaces/code-point-reader';
66
export function checkIfThreeCodePointsWouldStartANumber(reader: CodePointReader): boolean {
77
if (reader.source.codePointAt(reader.cursor) === PLUS_SIGN || reader.source.codePointAt(reader.cursor) === HYPHEN_MINUS) { // U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-)
88
// If the second code point is a digit, return true.
9-
if (isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1))) {
9+
if (isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1)) {
1010
return true;
1111
}
1212

1313
// Otherwise, if the second code point is a U+002E FULL STOP (.)
1414
if (reader.source.codePointAt(reader.cursor + 1) === FULL_STOP) {
1515
// and the third code point is a digit, return true.
16-
return isDigitCodePoint(reader.source.codePointAt(reader.cursor + 2));
16+
return isDigitCodePoint(reader.source.codePointAt(reader.cursor + 2) ?? -1);
1717
}
1818

1919
// Otherwise, return false.
@@ -22,8 +22,8 @@ export function checkIfThreeCodePointsWouldStartANumber(reader: CodePointReader)
2222
} else if (reader.source.codePointAt(reader.cursor) === FULL_STOP) { // U+002E FULL STOP (.)
2323
// If the second code point is a digit, return true.
2424
// Otherwise, return false.
25-
return isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1));
25+
return isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1);
2626
}
2727

28-
return isDigitCodePoint(reader.source.codePointAt(reader.cursor)); // digit
28+
return isDigitCodePoint(reader.source.codePointAt(reader.cursor) ?? -1); // digit
2929
}

packages/css-tokenizer/src/checks/three-code-points-would-start-unicode-range.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export function checkIfThreeCodePointsWouldStartAUnicodeRange(reader: CodePointR
1515
// The third code point is either U+003F QUESTION MARK (?) or a hex digit
1616
(
1717
reader.source.codePointAt(reader.cursor + 2) === QUESTION_MARK ||
18-
isHexDigitCodePoint(reader.source.codePointAt(reader.cursor + 2))
18+
isHexDigitCodePoint(reader.source.codePointAt(reader.cursor + 2) ?? -1)
1919
)
2020
) {
2121
// then return true.

packages/css-tokenizer/src/checks/two-code-points-are-valid-escape.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ export function checkIfTwoCodePointsAreAValidEscape(reader: CodePointReader): bo
88
// If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
99
reader.source.codePointAt(reader.cursor) === REVERSE_SOLIDUS &&
1010
// Otherwise, if the second code point is a newline, return false.
11-
!isNewLine(reader.source.codePointAt(reader.cursor + 1))
11+
!isNewLine(reader.source.codePointAt(reader.cursor + 1) ?? -1)
1212
);
1313
}

packages/css-tokenizer/src/code-points/ranges.ts

+25-22
Original file line numberDiff line numberDiff line change
@@ -2,46 +2,46 @@ import { BACKSPACE, DELETE, INFORMATION_SEPARATOR_ONE, LINE_TABULATION, LOW_LINE
22
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#tokenizer-definitions
33

44
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#digit
5-
export function isDigitCodePoint(search: number | undefined): search is number {
6-
return (typeof search !== "undefined") && search >= 0x0030 && search <= 0x0039;
5+
export function isDigitCodePoint(search: number): boolean {
6+
return search >= 0x0030 && search <= 0x0039;
77
}
88

99
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#uppercase-letter
10-
function isUppercaseLetterCodePoint(search: number | undefined): search is number {
11-
return (typeof search !== "undefined") && search >= 0x0041 && search <= 0x005a;
10+
function isUppercaseLetterCodePoint(search: number): boolean {
11+
return search >= 0x0041 && search <= 0x005a;
1212
}
1313

1414
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#lowercase-letter
15-
function isLowercaseLetterCodePoint(search: number | undefined): search is number {
16-
return (typeof search !== "undefined") && search >= 0x0061 && search <= 0x007a;
15+
function isLowercaseLetterCodePoint(search: number): boolean {
16+
return search >= 0x0061 && search <= 0x007a;
1717
}
1818

1919
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#hex-digit
20-
export function isHexDigitCodePoint(search: number | undefined): search is number {
21-
return (typeof search !== "undefined") && (
20+
export function isHexDigitCodePoint(search: number): boolean {
21+
return (
2222
(search >= 0x0030 && search <= 0x0039) || // 0 .. 9
2323
(search >= 0x0061 && search <= 0x0066) || // a .. f
2424
(search >= 0x0041 && search <= 0x0046) // A .. F
2525
);
2626
}
2727

2828
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#letter
29-
function isLetterCodePoint(search: number | undefined): search is number {
29+
function isLetterCodePoint(search: number): boolean {
3030
return isLowercaseLetterCodePoint(search) || isUppercaseLetterCodePoint(search);
3131
}
3232

3333
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#ident-start-code-point
34-
export function isIdentStartCodePoint(search: number | undefined): search is number {
34+
export function isIdentStartCodePoint(search: number): boolean {
3535
return isLetterCodePoint(search) || isNonASCII_IdentCodePoint(search) || search === LOW_LINE;
3636
}
3737

3838
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#ident-code-point
39-
export function isIdentCodePoint(search: number | undefined): search is number {
39+
export function isIdentCodePoint(search: number): boolean {
4040
return isIdentStartCodePoint(search) || isDigitCodePoint(search) || search === HYPHEN_MINUS;
4141
}
4242

4343
// https://drafts.csswg.org/css-syntax/#non-ascii-ident-code-point
44-
function isNonASCII_IdentCodePoint(search: number | undefined): search is number {
44+
function isNonASCII_IdentCodePoint(search: number): boolean {
4545
if (
4646
search === 0x00B7 ||
4747
search === 0x200C ||
@@ -53,10 +53,6 @@ function isNonASCII_IdentCodePoint(search: number | undefined): search is number
5353
return true;
5454
}
5555

56-
if (typeof search === "undefined") {
57-
return false;
58-
}
59-
6056
if (
6157
(0x00C0 <= search && search <= 0x00D6) ||
6258
(0x00D8 <= search && search <= 0x00F6) ||
@@ -71,12 +67,19 @@ function isNonASCII_IdentCodePoint(search: number | undefined): search is number
7167
return true;
7268
}
7369

70+
// Input preprocessing
71+
if (search === 0x000) {
72+
return true;
73+
} else if (isSurrogate(search)) {
74+
return true;
75+
}
76+
7477
return search >= 0x10000;
7578
}
7679

7780
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#non-printable-code-point
78-
export function isNonPrintableCodePoint(search: number | undefined): search is number {
79-
return (typeof search !== "undefined") &&(
81+
export function isNonPrintableCodePoint(search: number): boolean {
82+
return (
8083
(search === LINE_TABULATION) ||
8184
(search === DELETE) ||
8285
(NULL <= search && search <= BACKSPACE) ||
@@ -85,16 +88,16 @@ export function isNonPrintableCodePoint(search: number | undefined): search is n
8588
}
8689

8790
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#whitespace
88-
export function isNewLine(search: number | undefined): search is number {
91+
export function isNewLine(search: number): boolean {
8992
return search === LINE_FEED || search === CARRIAGE_RETURN || search === FORM_FEED;
9093
}
9194

9295
// https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/#whitespace
93-
export function isWhitespace(search: number | undefined): search is number {
96+
export function isWhitespace(search: number): boolean {
9497
return search === SPACE || search === LINE_FEED || search === CHARACTER_TABULATION || search === CARRIAGE_RETURN || search === FORM_FEED;
9598
}
9699

97100
// https://infra.spec.whatwg.org/#surrogate
98-
export function isSurrogate(search: number | undefined): search is number {
99-
return (typeof search !== "undefined") && search >= 0xd800 && search <= 0xdfff;
101+
export function isSurrogate(search: number): boolean {
102+
return search >= 0xd800 && search <= 0xdfff;
100103
}

packages/css-tokenizer/src/consume/escaped-code-point.ts

+14-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { MAXIMUM_ALLOWED_CODEPOINT, REPLACEMENT_CHARACTER } from '../code-points/code-points';
1+
import { CARRIAGE_RETURN, LINE_FEED, MAXIMUM_ALLOWED_CODEPOINT, REPLACEMENT_CHARACTER } from '../code-points/code-points';
22
import { isHexDigitCodePoint, isSurrogate, isWhitespace } from '../code-points/ranges';
33
import type { CodePointReader } from '../interfaces/code-point-reader';
44
import type { Context } from '../interfaces/context';
@@ -30,15 +30,19 @@ export function consumeEscapedCodePoint(ctx: Context, reader: CodePointReader):
3030
reader.advanceCodePoint();
3131
}
3232

33-
if (isWhitespace(reader.source.codePointAt(reader.cursor))) {
33+
if (isWhitespace(reader.source.codePointAt(reader.cursor) ?? -1)) {
34+
if (
35+
reader.source.codePointAt(reader.cursor) === CARRIAGE_RETURN &&
36+
reader.source.codePointAt(reader.cursor + 1) === LINE_FEED
37+
) {
38+
reader.advanceCodePoint();
39+
}
40+
3441
reader.advanceCodePoint();
3542
}
3643

3744
const codePointLiteral = parseInt(String.fromCodePoint(...hexSequence), 16);
38-
if (codePointLiteral === 0) {
39-
return REPLACEMENT_CHARACTER;
40-
}
41-
if (isSurrogate(codePointLiteral)) {
45+
if (codePointLiteral === 0 || isSurrogate(codePointLiteral)) {
4246
return REPLACEMENT_CHARACTER;
4347
}
4448
if (codePointLiteral > MAXIMUM_ALLOWED_CODEPOINT) {
@@ -48,5 +52,9 @@ export function consumeEscapedCodePoint(ctx: Context, reader: CodePointReader):
4852
return codePointLiteral;
4953
}
5054

55+
if (codePoint === 0 || isSurrogate(codePoint)) {
56+
return REPLACEMENT_CHARACTER;
57+
}
58+
5159
return codePoint;
5260
}

packages/css-tokenizer/src/consume/ident-like-token.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ export function consumeIdentLikeToken(ctx: Context, reader: CodePointReader): To
2929

3030
let read = 0;
3131
while (true) {
32-
const firstIsWhitespace = isWhitespace(reader.source.codePointAt(reader.cursor));
33-
const secondIsWhitespace = isWhitespace(reader.source.codePointAt(reader.cursor + 1));
32+
const firstIsWhitespace = isWhitespace(reader.source.codePointAt(reader.cursor) ?? -1);
33+
const secondIsWhitespace = isWhitespace(reader.source.codePointAt(reader.cursor + 1) ?? -1);
3434
if (firstIsWhitespace && secondIsWhitespace) {
3535
read = read + 1;
3636
reader.advanceCodePoint(1);

packages/css-tokenizer/src/consume/ident-sequence.ts

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { checkIfTwoCodePointsAreAValidEscape } from '../checks/two-code-points-are-valid-escape';
2-
import { isIdentCodePoint } from '../code-points/ranges';
2+
import { NULL, REPLACEMENT_CHARACTER } from '../code-points/code-points';
3+
import { isIdentCodePoint, isSurrogate } from '../code-points/ranges';
34
import type { CodePointReader } from '../interfaces/code-point-reader';
45
import type { Context } from '../interfaces/context';
56
import { consumeEscapedCodePoint } from './escaped-code-point';
@@ -9,7 +10,13 @@ export function consumeIdentSequence(ctx: Context, reader: CodePointReader): Arr
910
const result: Array<number> = [];
1011

1112
while (true) {
12-
const codePoint = reader.source.codePointAt(reader.cursor);
13+
const codePoint = reader.source.codePointAt(reader.cursor) ?? -1;
14+
if (codePoint === NULL || isSurrogate(codePoint)) {
15+
result.push(REPLACEMENT_CHARACTER);
16+
reader.advanceCodePoint(1 + +(codePoint > 0xffff));
17+
continue;
18+
}
19+
1320
if (isIdentCodePoint(codePoint)) {
1421
result.push(codePoint);
1522
reader.advanceCodePoint(1 + +(codePoint > 0xffff));

packages/css-tokenizer/src/consume/number.ts

+6-6
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,20 @@ export function consumeNumber(ctx: Context, reader: CodePointReader): NumberType
1515
}
1616

1717
// 3. While the next input code point is a digit, consume it and append it to repr.
18-
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor))) {
18+
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor) ?? -1)) {
1919
reader.advanceCodePoint();
2020
}
2121

2222
// 4. If the next 2 input code points are U+002E FULL STOP (.) followed by a digit, then:
23-
if (reader.source.codePointAt(reader.cursor) === FULL_STOP && isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1))) {
23+
if (reader.source.codePointAt(reader.cursor) === FULL_STOP && isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1)) {
2424
// 4.1. Consume them.
2525
reader.advanceCodePoint(2);
2626

2727
// 4.3. Set type to "number".
2828
type = NumberType.Number;
2929

3030
// 4.4. While the next input code point is a digit, consume it and append it to repr.
31-
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor))) {
31+
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor) ?? -1)) {
3232
reader.advanceCodePoint();
3333
}
3434
}
@@ -37,12 +37,12 @@ export function consumeNumber(ctx: Context, reader: CodePointReader): NumberType
3737
// optionally followed by U+002D HYPHEN-MINUS (-) or U+002B PLUS SIGN (+),
3838
// followed by a digit, then:
3939
if (reader.source.codePointAt(reader.cursor) === LATIN_SMALL_LETTER_E || reader.source.codePointAt(reader.cursor) === LATIN_CAPITAL_LETTER_E) {
40-
if (isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1))) {
40+
if (isDigitCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1)) {
4141
// 5.1. Consume them.
4242
reader.advanceCodePoint(2);
4343
} else if (
4444
(reader.source.codePointAt(reader.cursor + 1) === HYPHEN_MINUS || reader.source.codePointAt(reader.cursor + 1) === PLUS_SIGN) &&
45-
isDigitCodePoint(reader.source.codePointAt(reader.cursor + 2))
45+
isDigitCodePoint(reader.source.codePointAt(reader.cursor + 2) ?? -1)
4646
) {
4747
// 5.1. Consume them.
4848
reader.advanceCodePoint(3);
@@ -54,7 +54,7 @@ export function consumeNumber(ctx: Context, reader: CodePointReader): NumberType
5454
type = NumberType.Number;
5555

5656
// 5.4. While the next input code point is a digit, consume it and append it to repr.
57-
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor))) {
57+
while (isDigitCodePoint(reader.source.codePointAt(reader.cursor) ?? -1)) {
5858
reader.advanceCodePoint();
5959
}
6060
}

packages/css-tokenizer/src/consume/string-token.ts

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { CARRIAGE_RETURN, LINE_FEED, REVERSE_SOLIDUS } from '../code-points/code-points';
2-
import { isNewLine } from '../code-points/ranges';
1+
import { CARRIAGE_RETURN, LINE_FEED, NULL, REPLACEMENT_CHARACTER, REVERSE_SOLIDUS } from '../code-points/code-points';
2+
import { isNewLine, isSurrogate } from '../code-points/ranges';
33
import type { CodePointReader } from '../interfaces/code-point-reader';
44
import type { Context } from '../interfaces/context';
55
import { ParseErrorWithToken, ParseErrorMessage } from '../interfaces/error';
@@ -69,8 +69,7 @@ export function consumeStringToken(ctx: Context, reader: CodePointReader): Token
6969
if (typeof reader.source.codePointAt(reader.cursor) === "undefined") {
7070
continue;
7171
}
72-
73-
if (isNewLine(reader.source.codePointAt(reader.cursor))) {
72+
if (isNewLine(reader.source.codePointAt(reader.cursor) ?? -1)) {
7473
if (
7574
reader.source.codePointAt(reader.cursor) === CARRIAGE_RETURN &&
7675
reader.source.codePointAt(reader.cursor + 1) === LINE_FEED
@@ -86,6 +85,11 @@ export function consumeStringToken(ctx: Context, reader: CodePointReader): Token
8685
continue;
8786
}
8887

88+
if (next === NULL || isSurrogate(next)) {
89+
result = result + String.fromCodePoint(REPLACEMENT_CHARACTER);
90+
continue;
91+
}
92+
8993
result = result + String.fromCodePoint(next);
9094
}
9195
}

packages/css-tokenizer/src/consume/unicode-range-token.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ export function consumeUnicodeRangeToken(ctx: Context, reader: CodePointReader):
4949
// 5. If the next 2 input code points are U+002D HYPHEN-MINUS (-) followed by a hex digit
5050
if (
5151
reader.source.codePointAt(reader.cursor) === HYPHEN_MINUS &&
52-
isHexDigitCodePoint(reader.source.codePointAt(reader.cursor + 1))
52+
isHexDigitCodePoint(reader.source.codePointAt(reader.cursor + 1) ?? -1)
5353
) {
5454
// 5.1. Consume the next input code point.
5555
reader.advanceCodePoint();

0 commit comments

Comments
 (0)