@@ -379,6 +379,14 @@ impl<'a> Tokenizer<'a> {
379
379
self . input [ self . position ..] . chars ( ) . next ( ) . unwrap ( )
380
380
}
381
381
382
+ fn seen_newline ( & mut self , is_cr : bool ) {
383
+ if is_cr && self . next_byte ( ) == Some ( /* LF */ b'\n' ) {
384
+ return
385
+ }
386
+ self . current_line_start_position = self . position ;
387
+ self . current_line_number += 1 ;
388
+ }
389
+
382
390
#[ inline]
383
391
fn has_newline_at ( & self , offset : usize ) -> bool {
384
392
self . position + offset < self . input . len ( ) &&
@@ -420,16 +428,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
420
428
}
421
429
let b = tokenizer. next_byte_unchecked ( ) ;
422
430
let token = match_byte ! { b,
423
- b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
424
- let start_position = tokenizer. position( ) ;
425
- tokenizer. advance( 1 ) ;
426
- while !tokenizer. is_eof( ) {
427
- match tokenizer. next_byte_unchecked( ) {
428
- b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer. advance( 1 ) ,
429
- _ => break ,
430
- }
431
- }
432
- WhiteSpace ( tokenizer. slice_from( start_position) )
431
+ b' ' | b'\t' => {
432
+ consume_whitespace( tokenizer, false , false )
433
+ } ,
434
+ b'\n' | b'\x0C' => {
435
+ consume_whitespace( tokenizer, true , false )
436
+ } ,
437
+ b'\r' => {
438
+ consume_whitespace( tokenizer, true , true )
433
439
} ,
434
440
b'"' => { consume_string( tokenizer, false ) } ,
435
441
b'#' => {
@@ -501,21 +507,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
501
507
}
502
508
b'/' => {
503
509
if tokenizer. starts_with( b"/*" ) {
504
- tokenizer. advance( 2 ) ; // consume "/*"
505
- let start_position = tokenizer. position( ) ;
506
- let content;
507
- match tokenizer. input[ tokenizer. position..] . find( "*/" ) {
508
- Some ( offset) => {
509
- tokenizer. advance( offset) ;
510
- content = tokenizer. slice_from( start_position) ;
511
- tokenizer. advance( 2 ) ;
512
- }
513
- None => {
514
- tokenizer. position = tokenizer. input. len( ) ;
515
- content = tokenizer. slice_from( start_position) ;
516
- }
517
- }
518
- Comment ( content)
510
+ Comment ( consume_comment( tokenizer) )
519
511
} else {
520
512
tokenizer. advance( 1 ) ;
521
513
Delim ( '/' )
@@ -573,6 +565,64 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
573
565
}
574
566
575
567
568
+ fn consume_whitespace < ' a > ( tokenizer : & mut Tokenizer < ' a > , newline : bool , is_cr : bool ) -> Token < ' a > {
569
+ let start_position = tokenizer. position ( ) ;
570
+ tokenizer. advance ( 1 ) ;
571
+ if newline {
572
+ tokenizer. seen_newline ( is_cr)
573
+ }
574
+ while !tokenizer. is_eof ( ) {
575
+ let b = tokenizer. next_byte_unchecked ( ) ;
576
+ match_byte ! { b,
577
+ b' ' | b'\t' => {
578
+ tokenizer. advance( 1 ) ;
579
+ }
580
+ b'\n' | b'\x0C' => {
581
+ tokenizer. advance( 1 ) ;
582
+ tokenizer. seen_newline( false ) ;
583
+ }
584
+ b'\r' => {
585
+ tokenizer. advance( 1 ) ;
586
+ tokenizer. seen_newline( true ) ;
587
+ }
588
+ _ => {
589
+ break
590
+ }
591
+ }
592
+ }
593
+ WhiteSpace ( tokenizer. slice_from ( start_position) )
594
+ }
595
+
596
+
597
+ fn consume_comment < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> & ' a str {
598
+ tokenizer. advance ( 2 ) ; // consume "/*"
599
+ let start_position = tokenizer. position ( ) ;
600
+ while !tokenizer. is_eof ( ) {
601
+ match_byte ! { tokenizer. next_byte_unchecked( ) ,
602
+ b'*' => {
603
+ let end_position = tokenizer. position( ) ;
604
+ tokenizer. advance( 1 ) ;
605
+ if tokenizer. next_byte( ) == Some ( b'/' ) {
606
+ tokenizer. advance( 1 ) ;
607
+ return tokenizer. slice( start_position..end_position)
608
+ }
609
+ }
610
+ b'\n' | b'\x0C' => {
611
+ tokenizer. advance( 1 ) ;
612
+ tokenizer. seen_newline( false ) ;
613
+ }
614
+ b'\r' => {
615
+ tokenizer. advance( 1 ) ;
616
+ tokenizer. seen_newline( true ) ;
617
+ }
618
+ _ => {
619
+ tokenizer. advance( 1 ) ;
620
+ }
621
+ }
622
+ }
623
+ tokenizer. slice_from ( start_position)
624
+ }
625
+
576
626
fn consume_string < ' a > ( tokenizer : & mut Tokenizer < ' a > , single_quote : bool ) -> Token < ' a > {
577
627
match consume_quoted_string ( tokenizer, single_quote) {
578
628
Ok ( value) => QuotedString ( value) ,
@@ -649,12 +699,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
649
699
if !tokenizer. is_eof( ) {
650
700
match tokenizer. next_byte_unchecked( ) {
651
701
// Escaped newline
652
- b'\n' | b'\x0C' => tokenizer. advance( 1 ) ,
702
+ b'\n' | b'\x0C' => {
703
+ tokenizer. advance( 1 ) ;
704
+ tokenizer. seen_newline( false ) ;
705
+ }
653
706
b'\r' => {
654
707
tokenizer. advance( 1 ) ;
655
708
if tokenizer. next_byte( ) == Some ( b'\n' ) {
656
709
tokenizer. advance( 1 ) ;
657
710
}
711
+ tokenizer. seen_newline( false ) ;
658
712
}
659
713
// This pushes one well-formed code point
660
714
_ => consume_escape_and_write( tokenizer, & mut string_bytes)
@@ -921,24 +975,56 @@ unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
921
975
922
976
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
923
977
// This is only called after "url(", so the current position is a code point boundary.
924
- for ( offset, c) in tokenizer. input [ tokenizer. position ..] . bytes ( ) . enumerate ( ) {
925
- match_byte ! { c,
926
- b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
978
+ let start_position = tokenizer. position ;
979
+ let from_start = & tokenizer. input [ tokenizer. position ..] ;
980
+ let mut newlines = 0 ;
981
+ let mut last_newline = 0 ;
982
+ let mut found_printable_char = false ;
983
+ let mut iter = from_start. bytes ( ) . enumerate ( ) ;
984
+ loop {
985
+ let ( offset, b) = if let Some ( item) = iter. next ( ) {
986
+ item
987
+ } else {
988
+ tokenizer. position = tokenizer. input . len ( ) ;
989
+ break
990
+ } ;
991
+ match_byte ! { b,
992
+ b' ' | b'\t' => { } ,
993
+ b'\n' | b'\x0C' => {
994
+ newlines += 1 ;
995
+ last_newline = offset;
996
+ }
997
+ b'\r' => {
998
+ if from_start. as_bytes( ) . get( offset + 1 ) != Some ( & b'\n' ) {
999
+ newlines += 1 ;
1000
+ last_newline = offset;
1001
+ }
1002
+ }
927
1003
b'"' | b'\'' => { return Err ( ( ) ) } , // Do not advance
928
1004
b')' => {
929
1005
tokenizer. advance( offset + 1 ) ;
930
- return Ok ( UnquotedUrl ( "" . into ( ) ) ) ;
1006
+ break
931
1007
}
932
1008
_ => {
933
1009
tokenizer. advance( offset) ;
934
- // This function only consumed ASCII (whitespace) bytes,
935
- // so the current position is a code point boundary.
936
- return Ok ( consume_unquoted_url_internal( tokenizer) )
1010
+ found_printable_char = true ;
1011
+ break
937
1012
}
938
1013
}
939
1014
}
940
- tokenizer. position = tokenizer. input . len ( ) ;
941
- return Ok ( UnquotedUrl ( "" . into ( ) ) ) ;
1015
+
1016
+ if newlines > 0 {
1017
+ tokenizer. current_line_number += newlines;
1018
+ tokenizer. current_line_start_position = start_position + last_newline + 1 ;
1019
+ }
1020
+
1021
+ if found_printable_char {
1022
+ // This function only consumed ASCII (whitespace) bytes,
1023
+ // so the current position is a code point boundary.
1024
+ return Ok ( consume_unquoted_url_internal ( tokenizer) )
1025
+ } else {
1026
+ return Ok ( UnquotedUrl ( "" . into ( ) ) )
1027
+ }
942
1028
943
1029
fn consume_unquoted_url_internal < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
944
1030
// This function is only called with start_pos at a code point boundary.
@@ -951,7 +1037,6 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
951
1037
match_byte ! { tokenizer. next_byte_unchecked( ) ,
952
1038
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
953
1039
let value = tokenizer. slice_from( start_pos) ;
954
- tokenizer. advance( 1 ) ;
955
1040
return consume_url_end( tokenizer, start_pos, value. into( ) )
956
1041
}
957
1042
b')' => {
@@ -974,7 +1059,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
974
1059
break
975
1060
}
976
1061
_ => {
977
- tokenizer. consume_byte ( ) ;
1062
+ tokenizer. advance ( 1 ) ;
978
1063
}
979
1064
}
980
1065
}
@@ -983,6 +1068,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
983
1068
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
984
1069
// string_bytes is well-formed UTF-8, see other comments.
985
1070
let string = unsafe { from_utf8_release_unchecked( string_bytes) } . into( ) ;
1071
+ tokenizer. position -= 1 ;
986
1072
return consume_url_end( tokenizer, start_pos, string)
987
1073
}
988
1074
b')' => {
@@ -1020,8 +1106,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1020
1106
-> Token < ' a > {
1021
1107
while !tokenizer. is_eof ( ) {
1022
1108
match_byte ! { tokenizer. consume_byte( ) ,
1023
- b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
1024
- b')' => { break } ,
1109
+ b')' => {
1110
+ break
1111
+ }
1112
+ b' ' | b'\t' => { }
1113
+ b'\n' | b'\x0C' => {
1114
+ tokenizer. seen_newline( false ) ;
1115
+ }
1116
+ b'\r' => {
1117
+ tokenizer. seen_newline( true ) ;
1118
+ }
1025
1119
_ => {
1026
1120
return consume_bad_url( tokenizer, start_pos) ;
1027
1121
}
@@ -1034,12 +1128,20 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1034
1128
// Consume up to the closing )
1035
1129
while !tokenizer. is_eof ( ) {
1036
1130
match_byte ! { tokenizer. consume_byte( ) ,
1037
- b')' => { break } ,
1131
+ b')' => {
1132
+ break
1133
+ }
1038
1134
b'\\' => {
1039
1135
if matches!( tokenizer. next_byte( ) , Some ( b')' ) | Some ( b'\\' ) ) {
1040
1136
tokenizer. advance( 1 ) ; // Skip an escaped ')' or '\'
1041
1137
}
1042
1138
}
1139
+ b'\n' | b'\x0C' => {
1140
+ tokenizer. seen_newline( false ) ;
1141
+ }
1142
+ b'\r' => {
1143
+ tokenizer. seen_newline( true ) ;
1144
+ }
1043
1145
_ => { } ,
1044
1146
}
1045
1147
}
@@ -1080,15 +1182,22 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
1080
1182
b'0' ...b'9' | b'A' ...b'F' | b'a' ...b'f' => {
1081
1183
let ( c, _) = consume_hex_digits( tokenizer) ;
1082
1184
if !tokenizer. is_eof( ) {
1083
- match tokenizer. next_byte_unchecked( ) {
1084
- b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer. advance( 1 ) ,
1185
+ match_byte! { tokenizer. next_byte_unchecked( ) ,
1186
+ b' ' | b'\t' => {
1187
+ tokenizer. advance( 1 )
1188
+ }
1189
+ b'\n' | b'\x0C' => {
1190
+ tokenizer. advance( 1 ) ;
1191
+ tokenizer. seen_newline( false )
1192
+ }
1085
1193
b'\r' => {
1086
1194
tokenizer. advance( 1 ) ;
1087
1195
if !tokenizer. is_eof( ) && tokenizer. next_byte_unchecked( ) == b'\n' {
1088
1196
tokenizer. advance( 1 ) ;
1089
1197
}
1198
+ tokenizer. seen_newline( false )
1090
1199
}
1091
- _ => ( )
1200
+ _ => { }
1092
1201
}
1093
1202
}
1094
1203
static REPLACEMENT_CHAR : char = '\u{FFFD}' ;
0 commit comments