@@ -206,6 +206,9 @@ pub struct Tokenizer<'a> {
206
206
input : & ' a str ,
207
207
/// Counted in bytes, not code points. From 0.
208
208
position : usize ,
209
+ /// The position at the start of the current line; but adjusted to
210
+ /// ensure that computing the column will give the result in units
211
+ /// of UTF-16 characters.
209
212
current_line_start_position : usize ,
210
213
current_line_number : u32 ,
211
214
var_functions : SeenStatus ,
@@ -370,6 +373,9 @@ impl<'a> Tokenizer<'a> {
370
373
#[ inline]
371
374
fn consume_4byte_intro ( & mut self ) {
372
375
debug_assert ! ( self . next_byte_unchecked( ) & 0xF0 == 0xF0 ) ;
376
+ // This takes two UTF-16 characters to represent, so we
377
+ // actually have an undercount.
378
+ self . current_line_start_position = self . current_line_start_position . wrapping_sub ( 1 ) ;
373
379
self . position += 1 ;
374
380
}
375
381
@@ -378,6 +384,10 @@ impl<'a> Tokenizer<'a> {
378
384
#[ inline]
379
385
fn consume_continuation_byte ( & mut self ) {
380
386
debug_assert ! ( self . next_byte_unchecked( ) & 0xC0 == 0x80 ) ;
387
+ // Continuation bytes contribute to column overcount. Note
388
+ // that due to the special case for the 4-byte sequence intro,
389
+ // we must use wrapping add here.
390
+ self . current_line_start_position = self . current_line_start_position . wrapping_add ( 1 ) ;
381
391
self . position += 1 ;
382
392
}
383
393
@@ -386,6 +396,16 @@ impl<'a> Tokenizer<'a> {
386
396
fn consume_known_byte ( & mut self , byte : u8 ) {
387
397
debug_assert ! ( byte != b'\r' && byte != b'\n' && byte != b'\x0C' ) ;
388
398
self . position += 1 ;
399
+ // Continuation bytes contribute to column overcount.
400
+ if byte & 0xF0 == 0xF0 {
401
+ // This takes two UTF-16 characters to represent, so we
402
+ // actually have an undercount.
403
+ self . current_line_start_position = self . current_line_start_position . wrapping_sub ( 1 ) ;
404
+ } else if byte & 0xC0 == 0x80 {
405
+ // Note that due to the special case for the 4-byte
406
+ // sequence intro, we must use wrapping add here.
407
+ self . current_line_start_position = self . current_line_start_position . wrapping_add ( 1 ) ;
408
+ }
389
409
}
390
410
391
411
#[ inline]
@@ -416,7 +436,11 @@ impl<'a> Tokenizer<'a> {
416
436
#[ inline]
417
437
fn consume_char ( & mut self ) -> char {
418
438
let c = self . next_char ( ) ;
419
- self . position += c. len_utf8 ( ) ;
439
+ let len_utf8 = c. len_utf8 ( ) ;
440
+ self . position += len_utf8;
441
+ // Note that due to the special case for the 4-byte sequence
442
+ // intro, we must use wrapping add here.
443
+ self . current_line_start_position = self . current_line_start_position . wrapping_add ( len_utf8 - c. len_utf16 ( ) ) ;
420
444
c
421
445
}
422
446
@@ -498,6 +522,7 @@ pub struct SourceLocation {
498
522
pub line : u32 ,
499
523
500
524
/// The column number within a line, starting at 0 for the first character of the line.
525
+ /// Column numbers are in units of UTF-16 characters.
501
526
pub column : u32 ,
502
527
}
503
528
@@ -1126,6 +1151,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1126
1151
1127
1152
if newlines > 0 {
1128
1153
tokenizer. current_line_number += newlines;
1154
+ // No need for wrapping_add here, because there's no possible
1155
+ // way to wrap.
1129
1156
tokenizer. current_line_start_position = start_position + last_newline + 1 ;
1130
1157
}
1131
1158
0 commit comments