@@ -379,6 +379,14 @@ impl<'a> Tokenizer<'a> {
         self.input[self.position..].chars().next().unwrap()
     }

+    fn seen_newline(&mut self, is_cr: bool) {
+        if is_cr && self.next_byte() == Some(/* LF */ b'\n') {
+            return
+        }
+        self.current_line_start_position = self.position;
+        self.current_line_number += 1;
+    }
+
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
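
A minimal standalone model (hypothetical, not part of the patch) of the protocol `seen_newline` expects from the byte-at-a-time loops below: after consuming a `\r` the caller passes `is_cr = true`, and the early return makes a `\r\n` pair count as one newline, because the following `\n` triggers its own `seen_newline(false)` call.

// Hypothetical sketch; `LineCounter` and `feed` are illustrative names only.
struct LineCounter {
    line_number: u64,
}

impl LineCounter {
    fn feed(&mut self, bytes: &[u8]) {
        for i in 0..bytes.len() {
            match bytes[i] {
                b'\n' | b'\x0C' => self.seen_newline(bytes.get(i + 1), false),
                b'\r' => self.seen_newline(bytes.get(i + 1), true),
                _ => {}
            }
        }
    }

    // Same shape as Tokenizer::seen_newline, with the lookahead passed in
    // explicitly instead of read via self.next_byte().
    fn seen_newline(&mut self, next: Option<&u8>, is_cr: bool) {
        if is_cr && next == Some(&b'\n') {
            return
        }
        self.line_number += 1;
    }
}

Feeding b"a\r\nb", b"a\nb", or b"a\rb" each increments `line_number` exactly once.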
@@ -420,16 +428,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     }
     let b = tokenizer.next_byte_unchecked();
     let token = match_byte! { b,
-        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
-            let start_position = tokenizer.position();
-            tokenizer.advance(1);
-            while !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
-                    _ => break,
-                }
-            }
-            WhiteSpace(tokenizer.slice_from(start_position))
+        b' ' | b'\t' => {
+            consume_whitespace(tokenizer, false, false)
+        },
+        b'\n' | b'\x0C' => {
+            consume_whitespace(tokenizer, true, false)
+        },
+        b'\r' => {
+            consume_whitespace(tokenizer, true, true)
         },
         b'"' => { consume_string(tokenizer, false) },
         b'#' => {
@@ -501,21 +507,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         }
         b'/' => {
             if tokenizer.starts_with(b"/*") {
-                tokenizer.advance(2); // consume "/*"
-                let start_position = tokenizer.position();
-                let content;
-                match tokenizer.input[tokenizer.position..].find("*/") {
-                    Some(offset) => {
-                        tokenizer.advance(offset);
-                        content = tokenizer.slice_from(start_position);
-                        tokenizer.advance(2);
-                    }
-                    None => {
-                        tokenizer.position = tokenizer.input.len();
-                        content = tokenizer.slice_from(start_position);
-                    }
-                }
-                Comment(content)
+                Comment(consume_comment(tokenizer))
             } else {
                 tokenizer.advance(1);
                 Delim('/')
@@ -573,6 +565,64 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 }


+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool, is_cr: bool) -> Token<'a> {
+    let start_position = tokenizer.position();
+    tokenizer.advance(1);
+    if newline {
+        tokenizer.seen_newline(is_cr)
+    }
+    while !tokenizer.is_eof() {
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
+            b' ' | b'\t' => {
+                tokenizer.advance(1);
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                break
+            }
+        }
+    }
+    WhiteSpace(tokenizer.slice_from(start_position))
+}
+
+
+fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
+    tokenizer.advance(2); // consume "/*"
+    let start_position = tokenizer.position();
+    while !tokenizer.is_eof() {
+        match_byte! { tokenizer.next_byte_unchecked(),
+            b'*' => {
+                let end_position = tokenizer.position();
+                tokenizer.advance(1);
+                if tokenizer.next_byte() == Some(b'/') {
+                    tokenizer.advance(1);
+                    return tokenizer.slice(start_position..end_position)
+                }
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                tokenizer.advance(1);
+            }
+        }
+    }
+    tokenizer.slice_from(start_position)
+}
+
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
@@ -649,12 +699,19 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
-                        b'\n' | b'\x0C' => tokenizer.advance(1),
+                        b'\n' | b'\x0C' => {
+                            tokenizer.advance(1);
+                            tokenizer.seen_newline(false);
+                        }
                         b'\r' => {
                             tokenizer.advance(1);
                             if tokenizer.next_byte() == Some(b'\n') {
                                 tokenizer.advance(1);
                             }
+                            // `is_cr = true` is useful to skip \r when the next iteration
+                            // of a loop will call `seen_newline` again for the following \n.
+                            // In this case we’re consuming both in this iteration, so passing `false`.
+                            tokenizer.seen_newline(false);
                         }
                         // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
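
A sketch of why this arm passes `is_cr = false` (hypothetical helper, not from the patch): the escaped-newline case consumes the whole `\r\n` pair itself, so no later iteration will see the `\n`, and deferring with `is_cr = true` would drop the newline entirely.

// Illustrative only; `skip_escaped_newline` does not exist in cssparser.
fn skip_escaped_newline(bytes: &[u8], pos: &mut usize, line_number: &mut u64) {
    match bytes[*pos] {
        b'\r' => {
            *pos += 1;
            if bytes.get(*pos) == Some(&b'\n') {
                *pos += 1; // both bytes of \r\n are consumed here...
            }
            *line_number += 1; // ...so count once, like seen_newline(false)
        }
        b'\n' | b'\x0C' => {
            *pos += 1;
            *line_number += 1;
        }
        _ => {}
    }
}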
@@ -921,24 +978,57 @@ unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {

 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     // This is only called after "url(", so the current position is a code point boundary.
-    for (offset, c) in tokenizer.input[tokenizer.position..].bytes().enumerate() {
-        match_byte! { c,
-            b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
+    let start_position = tokenizer.position;
+    let from_start = &tokenizer.input[tokenizer.position..];
+    let mut newlines = 0;
+    let mut last_newline = 0;
+    let mut found_printable_char = false;
+    let mut iter = from_start.bytes().enumerate();
+    loop {
+        let (offset, b) = match iter.next() {
+            Some(item) => item,
+            None => {
+                tokenizer.position = tokenizer.input.len();
+                break
+            }
+        };
+        match_byte! { b,
+            b' ' | b'\t' => {},
+            b'\n' | b'\x0C' => {
+                newlines += 1;
+                last_newline = offset;
+            }
+            b'\r' => {
+                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
+                    newlines += 1;
+                    last_newline = offset;
+                }
+            }
             b'"' | b'\'' => { return Err(()) },  // Do not advance
             b')' => {
                 tokenizer.advance(offset + 1);
-                return Ok(UnquotedUrl("".into()));
+                break
             }
             _ => {
                 tokenizer.advance(offset);
-                // This function only consumed ASCII (whitespace) bytes,
-                // so the current position is a code point boundary.
-                return Ok(consume_unquoted_url_internal(tokenizer))
+                found_printable_char = true;
+                break
             }
         }
     }
-    tokenizer.position = tokenizer.input.len();
-    return Ok(UnquotedUrl("".into()));
+
+    if newlines > 0 {
+        tokenizer.current_line_number += newlines;
+        tokenizer.current_line_start_position = start_position + last_newline + 1;
+    }
+
+    if found_printable_char {
+        // This function only consumed ASCII (whitespace) bytes,
+        // so the current position is a code point boundary.
+        return Ok(consume_unquoted_url_internal(tokenizer))
+    } else {
+        return Ok(UnquotedUrl("".into()))
+    }

     fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // This function is only called with start_pos at a code point boundary.
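
Unlike the byte-at-a-time loops elsewhere in the patch, this scan can pass several newlines before it breaks, so the bookkeeping is batched: newlines are counted during the scan (a `\r\n` pair counts once, at the `\n`) and the offset of the last one is kept so the line fields can be fixed up in one step afterwards. A self-contained sketch of that counting rule (hypothetical helper, not part of the patch):

// Returns (newline count, offset of the last newline), mirroring the scan above.
fn count_newlines(from_start: &str) -> (u32, usize) {
    let bytes = from_start.as_bytes();
    let mut newlines = 0;
    let mut last_newline = 0;
    for (offset, &b) in bytes.iter().enumerate() {
        match b {
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            // A \r directly followed by \n defers to that \n, so \r\n counts once.
            b'\r' if bytes.get(offset + 1) != Some(&b'\n') => {
                newlines += 1;
                last_newline = offset;
            }
            _ => {}
        }
    }
    (newlines, last_newline)
}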
@@ -951,7 +1041,6 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
             match_byte! { tokenizer.next_byte_unchecked(),
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     let value = tokenizer.slice_from(start_pos);
-                    tokenizer.advance(1);
                     return consume_url_end(tokenizer, start_pos, value.into())
                 }
                 b')' => {
@@ -983,6 +1072,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     // string_bytes is well-formed UTF-8, see other comments.
                     let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
+                    tokenizer.position -= 1;
                     return consume_url_end(tokenizer, start_pos, string)
                 }
                 b')' => {
@@ -1020,8 +1110,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                            -> Token<'a> {
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
-                b')' => { break },
+                b')' => {
+                    break
+                }
+                b' ' | b'\t' => {}
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {
                     return consume_bad_url(tokenizer, start_pos);
                 }
@@ -1034,12 +1132,20 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b')' => { break },
+                b')' => {
+                    break
+                }
                 b'\\' => {
                     if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                         tokenizer.advance(1); // Skip an escaped ')' or '\'
                     }
                 }
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {},
             }
         }
@@ -1080,15 +1186,22 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
         b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                match_byte! { tokenizer.next_byte_unchecked(),
+                    b' ' | b'\t' => {
+                        tokenizer.advance(1)
+                    }
+                    b'\n' | b'\x0C' => {
+                        tokenizer.advance(1);
+                        tokenizer.seen_newline(false)
+                    }
                     b'\r' => {
                         tokenizer.advance(1);
                         if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
+                        tokenizer.seen_newline(false)
                     }
-                    _ => ()
+                    _ => {}
                 }
             }
             static REPLACEMENT_CHAR: char = '\u{FFFD}';