@@ -206,6 +206,9 @@ pub struct Tokenizer<'a> {
206
206
input : & ' a str ,
207
207
/// Counted in bytes, not code points. From 0.
208
208
position : usize ,
209
+ /// The position at the start of the current line; but adjusted to
210
+ /// ensure that computing the column will give the result in units
211
+ /// of UTF-16 characters.
209
212
current_line_start_position : usize ,
210
213
current_line_number : u32 ,
211
214
var_functions : SeenStatus ,
@@ -370,6 +373,9 @@ impl<'a> Tokenizer<'a> {
370
373
#[ inline]
371
374
fn consume_4byte_intro ( & mut self ) {
372
375
debug_assert ! ( self . next_byte_unchecked( ) & 0xF0 == 0xF0 ) ;
376
+ // This takes two UTF-16 characters to represent, so we
377
+ // actually have an undercount.
378
+ self . current_line_start_position = self . current_line_start_position . wrapping_sub ( 1 ) ;
373
379
self . position += 1 ;
374
380
}
375
381
@@ -378,6 +384,8 @@ impl<'a> Tokenizer<'a> {
378
384
#[ inline]
379
385
fn consume_continuation_byte ( & mut self ) {
380
386
debug_assert ! ( self . next_byte_unchecked( ) & 0xC0 == 0x80 ) ;
387
+ // Continuation bytes contribute to column overcount.
388
+ self . current_line_start_position += 1 ;
381
389
self . position += 1 ;
382
390
}
383
391
@@ -386,6 +394,14 @@ impl<'a> Tokenizer<'a> {
386
394
fn consume_known_byte ( & mut self , byte : u8 ) {
387
395
debug_assert ! ( byte != b'\r' && byte != b'\n' && byte != b'\x0C' ) ;
388
396
self . position += 1 ;
397
+ // Continuation bytes contribute to column overcount.
398
+ if byte & 0xF0 == 0xF0 {
399
+ // This takes two UTF-16 characters to represent, so we
400
+ // actually have an undercount.
401
+ self . current_line_start_position = self . current_line_start_position . wrapping_sub ( 1 ) ;
402
+ } else if byte & 0xC0 == 0x80 {
403
+ self . current_line_start_position += 1 ;
404
+ }
389
405
}
390
406
391
407
#[ inline]
@@ -416,7 +432,9 @@ impl<'a> Tokenizer<'a> {
416
432
#[ inline]
417
433
fn consume_char ( & mut self ) -> char {
418
434
let c = self . next_char ( ) ;
419
- self . position += c. len_utf8 ( ) ;
435
+ let len_utf8 = c. len_utf8 ( ) ;
436
+ self . position += len_utf8;
437
+ self . current_line_start_position += len_utf8 - c. len_utf16 ( ) ;
420
438
c
421
439
}
422
440
@@ -498,6 +516,7 @@ pub struct SourceLocation {
498
516
pub line : u32 ,
499
517
500
518
/// The column number within a line, starting at 0 for the first character of the line.
519
+ /// Column numbers are in units of UTF-16 characters.
501
520
pub column : u32 ,
502
521
}
503
522
0 commit comments