Skip to content

Commit 4a42697

Browse files
committed
Adjust line start position to ensure column is computed in UTF-16 characters
1 parent 32d69b7 commit 4a42697

File tree

2 files changed

+67
-1
lines changed

2 files changed

+67
-1
lines changed

src/tests.rs

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,3 +1049,50 @@ fn roundtrip_percentage_token() {
10491049
}
10501050
}
10511051
}
1052+
1053+
#[test]
fn utf16_columns() {
    // This test serves two purposes. First, it checks that the column
    // reported at end of input is computed correctly. Second, it checks
    // that the tokenizer code paths tell the different kinds of UTF-8
    // bytes apart: leader bytes and continuation bytes are handled
    // differently, so every input is built around "QΡ✈🆒", which
    // contains one character of each UTF-8 length (1, 2, 3 and 4 bytes).
    // Columns are counted in UTF-16 code units, so the 4-byte character
    // accounts for two columns.
    //
    // Each entry is (input, expected column once all tokens are read).
    let cases: &[(&str, u32)] = &[
        ("", 0),
        ("ascii", 5),
        ("/*QΡ✈🆒*/", 9),
        ("'QΡ✈🆒*'", 8),
        ("\"\\\"'QΡ✈🆒*'", 11),
        ("\\Q\\Ρ\\\\🆒", 9),
        ("QΡ✈🆒", 5),
        ("QΡ✈🆒\\Q\\Ρ\\\\🆒", 14),
        ("newline\r\nQΡ✈🆒", 5),
        ("url(QΡ✈🆒\\Q\\Ρ\\\\🆒)", 19),
        ("url(QΡ✈🆒)", 10),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒)", 15),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒", 14),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\\\🆒 x", 16),
        ("QΡ✈🆒()", 7),
    ];

    for &(css, expected_column) in cases {
        let mut input = ParserInput::new(css);
        let mut parser = Parser::new(&mut input);

        // Drain every token; only a clean end of input is acceptable.
        loop {
            match parser.next() {
                Ok(_) => {}
                Err(BasicParseError::EndOfInput) => break,
                // Name the offending input instead of a bare `assert!(false)`.
                Err(_) => panic!("unexpected tokenizer error for input {:?}", css),
            }
        }

        // Check the resulting column, reporting which input failed.
        assert_eq!(
            parser.current_source_location().column,
            expected_column,
            "wrong final column for input {:?}",
            css
        );
    }
}

src/tokenizer.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ pub struct Tokenizer<'a> {
206206
input: &'a str,
207207
/// Counted in bytes, not code points. From 0.
208208
position: usize,
209+
/// The position at the start of the current line; but adjusted to
210+
/// ensure that computing the column will give the result in units
211+
/// of UTF-16 characters.
209212
current_line_start_position: usize,
210213
current_line_number: u32,
211214
var_functions: SeenStatus,
@@ -370,6 +373,9 @@ impl<'a> Tokenizer<'a> {
370373
#[inline]
371374
fn consume_4byte_intro(&mut self) {
372375
debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
376+
// This takes two UTF-16 characters to represent, so we
377+
// actually have an undercount.
378+
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
373379
self.position += 1;
374380
}
375381

@@ -378,6 +384,8 @@ impl<'a> Tokenizer<'a> {
378384
#[inline]
379385
fn consume_continuation_byte(&mut self) {
380386
debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
387+
// Continuation bytes contribute to column overcount.
388+
self.current_line_start_position += 1;
381389
self.position += 1;
382390
}
383391

@@ -386,6 +394,14 @@ impl<'a> Tokenizer<'a> {
386394
fn consume_known_byte(&mut self, byte: u8) {
387395
debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
388396
self.position += 1;
397+
// Continuation bytes contribute to column overcount.
398+
if byte & 0xF0 == 0xF0 {
399+
// This takes two UTF-16 characters to represent, so we
400+
// actually have an undercount.
401+
self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
402+
} else if byte & 0xC0 == 0x80 {
403+
self.current_line_start_position += 1;
404+
}
389405
}
390406

391407
#[inline]
@@ -416,7 +432,9 @@ impl<'a> Tokenizer<'a> {
416432
#[inline]
417433
fn consume_char(&mut self) -> char {
418434
let c = self.next_char();
419-
self.position += c.len_utf8();
435+
let len_utf8 = c.len_utf8();
436+
self.position += len_utf8;
437+
self.current_line_start_position += len_utf8 - c.len_utf16();
420438
c
421439
}
422440

@@ -498,6 +516,7 @@ pub struct SourceLocation {
498516
pub line: u32,
499517

500518
/// The column number within a line, starting at 0 for the first character of the line.
519+
/// Column numbers are in units of UTF-16 characters.
501520
pub column: u32,
502521
}
503522

0 commit comments

Comments
 (0)