Skip to content

Commit f721368

Browse files
committed
Adjust line start position to ensure column is computed in UTF-16 characters
1 parent 94bdfdc commit f721368

File tree

2 files changed

+78
-1
lines changed

2 files changed

+78
-1
lines changed

src/tests.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,3 +1049,53 @@ fn roundtrip_percentage_token() {
10491049
}
10501050
}
10511051
}
1052+
1053+
#[test]
1054+
fn utf16_columns() {
1055+
// This particular test serves two purposes. First, it checks
1056+
// that the column number computations are correct. Second, it
1057+
// checks that tokenizer code paths correctly differentiate
1058+
// between the different UTF-8 encoding bytes. In particular
1059+
// different leader bytes and continuation bytes are treated
1060+
// differently, so we make sure to include all lengths in the
1061+
// tests, using the string "QΡ✈🆒". Also, remember that because
1062+
// the column is in units of UTF-16, the 4-byte sequence results
1063+
// in two columns.
1064+
let tests = vec![
1065+
("", 0),
1066+
("ascii", 5),
1067+
("/*QΡ✈🆒*/", 9),
1068+
("'QΡ✈🆒*'", 8),
1069+
("\"\\\"'QΡ✈🆒*'", 11),
1070+
("\\Q\\Ρ\\\\🆒", 9),
1071+
("QΡ✈🆒", 5),
1072+
("QΡ✈🆒\\Q\\Ρ\\\\🆒", 14),
1073+
("newline\r\nQΡ✈🆒", 5),
1074+
("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 19),
1075+
("url(QΡ✈🆒)", 10),
1076+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 15),
1077+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 14),
1078+
("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 16),
1079+
("QΡ✈🆒()", 7),
1080+
// Test that under/over-flow of current_line_start_position is
1081+
// handled properly; see the special case in consume_4byte_intro.
1082+
("🆒", 2),
1083+
];
1084+
1085+
for test in tests {
1086+
let mut input = ParserInput::new(test.0);
1087+
let mut parser = Parser::new(&mut input);
1088+
1089+
// Read all tokens.
1090+
loop {
1091+
match parser.next() {
1092+
Err(BasicParseError::EndOfInput) => { break; }
1093+
Err(_) => { assert!(false); }
1094+
Ok(_) => {}
1095+
};
1096+
}
1097+
1098+
// Check the resulting column.
1099+
assert_eq!(parser.current_source_location().column, test.1);
1100+
}
1101+
}

src/tokenizer.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ pub struct Tokenizer<'a> {
206206
input: &'a str,
207207
/// Counted in bytes, not code points. From 0.
208208
position: usize,
209+
/// The position at the start of the current line; but adjusted to
210+
/// ensure that computing the column will give the result in units
211+
/// of UTF-16 characters.
209212
current_line_start_position: usize,
210213
current_line_number: u32,
211214
var_functions: SeenStatus,
@@ -370,6 +373,9 @@ impl<'a> Tokenizer<'a> {
370373
#[inline]
371374
fn consume_4byte_intro(&mut self) {
372375
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
376+
// This takes two UTF-16 characters to represent, so we
377+
// actually have an undercount.
378+
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
373379
self.position += 1;
374380
}
375381

@@ -378,6 +384,10 @@ impl<'a> Tokenizer<'a> {
378384
#[inline]
379385
fn consume_continuation_byte(&mut self) {
380386
debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
387+
// Continuation bytes contribute to column overcount. Note
388+
// that due to the special case for the 4-byte sequence intro,
389+
// we must use wrapping add here.
390+
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
381391
self.position += 1;
382392
}
383393

@@ -386,6 +396,16 @@ impl<'a> Tokenizer<'a> {
386396
fn consume_known_byte(&mut self, byte: u8) {
387397
debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
388398
self.position += 1;
399+
// Continuation bytes contribute to column overcount.
400+
if byte & 0xF0 == 0xF0 {
401+
// This takes two UTF-16 characters to represent, so we
402+
// actually have an undercount.
403+
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
404+
} else if byte & 0xC0 == 0x80 {
405+
// Note that due to the special case for the 4-byte
406+
// sequence intro, we must use wrapping add here.
407+
self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
408+
}
389409
}
390410

391411
#[inline]
@@ -416,7 +436,11 @@ impl<'a> Tokenizer<'a> {
416436
#[inline]
417437
fn consume_char(&mut self) -> char {
418438
let c = self.next_char();
419-
self.position += c.len_utf8();
439+
let len_utf8 = c.len_utf8();
440+
self.position += len_utf8;
441+
// Note that due to the special case for the 4-byte sequence
442+
// intro, we must use wrapping add here.
443+
self.current_line_start_position = self.current_line_start_position.wrapping_add(len_utf8 - c.len_utf16());
420444
c
421445
}
422446

@@ -498,6 +522,7 @@ pub struct SourceLocation {
498522
pub line: u32,
499523

500524
/// The column number within a line, starting at 0 for the first character of the line.
525+
/// Column numbers are in units of UTF-16 characters.
501526
pub column: u32,
502527
}
503528

@@ -1126,6 +1151,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11261151

11271152
if newlines > 0 {
11281153
tokenizer.current_line_number += newlines;
1154+
// No need for wrapping_add here, because there's no possible
1155+
// way to wrap.
11291156
tokenizer.current_line_start_position = start_position + last_newline + 1;
11301157
}
11311158

0 commit comments

Comments (0)