@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
338
338
#[ inline]
339
339
fn has_at_least ( & self , n : usize ) -> bool { self . position + n < self . input . len ( ) }
340
340
341
+ // Advance over N bytes in the input. This function can advance
342
+ // over ASCII bytes (excluding newlines), or UTF-8 sequence
343
+ // leaders (excluding leaders for 4-byte sequences).
341
344
#[ inline]
342
- pub fn advance ( & mut self , n : usize ) { self . position += n }
345
+ pub fn advance ( & mut self , n : usize ) {
346
+ if cfg ! ( debug_assertions) {
347
+ // Each byte must either be an ASCII byte or a sequence
348
+ // leader, but not a 4-byte leader; also newlines are
349
+ // rejected.
350
+ for i in 0 ..n {
351
+ let b = self . byte_at ( i) ;
352
+ debug_assert ! ( b. is_ascii( ) || ( b & 0xF0 != 0xF0 && b & 0xC0 != 0x80 ) ) ;
353
+ debug_assert ! ( b != b'\r' && b != b'\n' && b != b'\x0C' ) ;
354
+ }
355
+ }
356
+ self . position += n
357
+ }
343
358
344
359
// Assumes non-EOF
345
360
#[ inline]
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
350
365
self . input . as_bytes ( ) [ self . position + offset]
351
366
}
352
367
368
+ // Advance over a single byte; the byte must be a UTF-8 sequence
369
+ // leader for a 4-byte sequence.
370
+ #[ inline]
371
+ fn consume_4byte_intro ( & mut self ) {
372
+ debug_assert ! ( self . next_byte_unchecked( ) & 0xF0 == 0xF0 ) ;
373
+ self . position += 1 ;
374
+ }
375
+
376
+ // Advance over a single byte; the byte must be a UTF-8
377
+ // continuation byte.
353
378
#[ inline]
354
- fn consume_byte ( & mut self ) -> u8 {
379
+ fn consume_continuation_byte ( & mut self ) {
380
+ debug_assert ! ( self . next_byte_unchecked( ) & 0xC0 == 0x80 ) ;
381
+ self . position += 1 ;
382
+ }
383
+
384
+ // Advance over any kind of byte, excluding newlines.
385
+ #[ inline( never) ]
386
+ fn consume_known_byte ( & mut self , byte : u8 ) {
387
+ debug_assert ! ( byte != b'\r' && byte != b'\n' && byte != b'\x0C' ) ;
355
388
self . position += 1 ;
356
- self . input . as_bytes ( ) [ self . position - 1 ]
357
389
}
358
390
359
391
#[ inline]
@@ -667,6 +699,8 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
667
699
b'\n' | b'\x0C' | b'\r' => {
668
700
tokenizer. consume_newline( ) ;
669
701
}
702
+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
703
+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
670
704
_ => {
671
705
tokenizer. advance( 1 ) ;
672
706
}
@@ -703,13 +737,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
703
737
tokenizer. advance( 1 ) ;
704
738
return Ok ( value. into( ) )
705
739
}
740
+ tokenizer. advance( 1 ) ;
706
741
}
707
742
b'\'' => {
708
743
if single_quote {
709
744
let value = tokenizer. slice_from( start_pos) ;
710
745
tokenizer. advance( 1 ) ;
711
746
return Ok ( value. into( ) )
712
747
}
748
+ tokenizer. advance( 1 ) ;
713
749
}
714
750
b'\\' | b'\0' => {
715
751
// * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +759,37 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
723
759
b'\n' | b'\r' | b'\x0C' => {
724
760
return Err ( tokenizer. slice_from( start_pos) . into( ) )
725
761
} ,
726
- _ => { }
762
+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
763
+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
764
+ _ => { tokenizer. advance( 1 ) ; }
727
765
}
728
- tokenizer. consume_byte ( ) ;
729
766
}
730
767
731
768
while !tokenizer. is_eof ( ) {
732
- if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
733
- return Err (
734
- // string_bytes is well-formed UTF-8, see other comments.
735
- unsafe {
736
- from_utf8_release_unchecked ( string_bytes)
737
- } . into ( )
738
- ) ;
739
- }
740
- let b = tokenizer. consume_byte ( ) ;
769
+ let b = tokenizer. next_byte_unchecked ( ) ;
741
770
match_byte ! { b,
771
+ b'\n' | b'\r' | b'\x0C' => {
772
+ return Err (
773
+ // string_bytes is well-formed UTF-8, see other comments.
774
+ unsafe {
775
+ from_utf8_release_unchecked( string_bytes)
776
+ } . into( )
777
+ ) ;
778
+ }
742
779
b'"' => {
780
+ tokenizer. advance( 1 ) ;
743
781
if !single_quote {
744
782
break ;
745
783
}
746
784
}
747
785
b'\'' => {
786
+ tokenizer. advance( 1 ) ;
748
787
if single_quote {
749
788
break ;
750
789
}
751
790
}
752
791
b'\\' => {
792
+ tokenizer. advance( 1 ) ;
753
793
if !tokenizer. is_eof( ) {
754
794
match tokenizer. next_byte_unchecked( ) {
755
795
// Escaped newline
@@ -764,10 +804,13 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
764
804
continue ;
765
805
}
766
806
b'\0' => {
807
+ tokenizer. advance( 1 ) ;
767
808
string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
768
809
continue ;
769
810
}
770
- _ => { } ,
811
+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
812
+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
813
+ _ => { tokenizer. advance( 1 ) ; } ,
771
814
}
772
815
773
816
// If this byte is part of a multi-byte code point,
@@ -835,11 +878,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
835
878
value_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
836
879
break
837
880
}
881
+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
882
+ b'\xC0' ...b'\xEF' => { tokenizer. advance( 1 ) ; }
883
+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
838
884
b => {
839
- if b. is_ascii( ) {
840
- return tokenizer. slice_from( start_pos) . into( ) ;
841
- }
842
- tokenizer. advance( 1 ) ;
885
+ return tokenizer. slice_from( start_pos) . into( ) ;
843
886
}
844
887
}
845
888
}
@@ -861,15 +904,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
861
904
tokenizer. advance( 1 ) ;
862
905
value_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
863
906
} ,
864
- _ => {
865
- if b. is_ascii( ) {
866
- break ;
867
- }
868
- tokenizer. advance( 1 ) ;
907
+ b'\x80' ...b'\xBF' => {
908
+ // This byte *is* part of a multi-byte code point,
909
+ // we’ll end up copying the whole code point before this loop does something else.
910
+ tokenizer. consume_continuation_byte( ) ;
911
+ value_bytes. push( b)
912
+ }
913
+ b'\xC0' ...b'\xEF' => {
869
914
// This byte *is* part of a multi-byte code point,
870
915
// we’ll end up copying the whole code point before this loop does something else.
916
+ tokenizer. advance( 1 ) ;
917
+ value_bytes. push( b)
918
+ }
919
+ b'\xF0' ...b'\xFF' => {
920
+ tokenizer. consume_4byte_intro( ) ;
871
921
value_bytes. push( b)
872
922
}
923
+ _ => {
924
+ // ascii
925
+ break ;
926
+ }
873
927
}
874
928
}
875
929
// value_bytes is well-formed UTF-8, see other comments.
@@ -1048,11 +1102,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1048
1102
}
1049
1103
b'"' | b'\'' => { return Err ( ( ) ) } , // Do not advance
1050
1104
b')' => {
1051
- tokenizer. advance( offset + 1 ) ;
1105
+ // Don't use advance, because we may be skipping
1106
+ // newlines here, and we want to avoid the assert.
1107
+ tokenizer. position += offset + 1 ;
1052
1108
break
1053
1109
}
1054
1110
_ => {
1055
- tokenizer. advance( offset) ;
1111
+ // Don't use advance, because we may be skipping
1112
+ // newlines here, and we want to avoid the assert.
1113
+ tokenizer. position += offset;
1056
1114
found_printable_char = true ;
1057
1115
break
1058
1116
}
@@ -1104,27 +1162,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1104
1162
string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
1105
1163
break
1106
1164
}
1165
+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
1166
+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
1167
+ // ASCII or other leading byte.
1107
1168
_ => {
1108
1169
tokenizer. advance( 1 ) ;
1109
1170
}
1110
1171
}
1111
1172
}
1112
1173
while !tokenizer. is_eof ( ) {
1113
- match_byte ! { tokenizer. consume_byte( ) ,
1174
+ let b = tokenizer. next_byte_unchecked ( ) ;
1175
+ match_byte ! { b,
1114
1176
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
1115
1177
// string_bytes is well-formed UTF-8, see other comments.
1116
1178
let string = unsafe { from_utf8_release_unchecked( string_bytes) } . into( ) ;
1117
- tokenizer. position -= 1 ;
1118
1179
return consume_url_end( tokenizer, start_pos, string)
1119
1180
}
1120
1181
b')' => {
1182
+ tokenizer. advance( 1 ) ;
1121
1183
break ;
1122
1184
}
1123
1185
b'\x01' ...b'\x08' | b'\x0B' | b'\x0E' ...b'\x1F' | b'\x7F' // non-printable
1124
1186
| b'"' | b'\'' | b'(' => {
1187
+ tokenizer. advance( 1 ) ;
1125
1188
return consume_bad_url( tokenizer, start_pos) ;
1126
1189
}
1127
1190
b'\\' => {
1191
+ tokenizer. advance( 1 ) ;
1128
1192
if tokenizer. has_newline_at( 0 ) {
1129
1193
return consume_bad_url( tokenizer, start_pos)
1130
1194
}
@@ -1133,11 +1197,27 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1133
1197
consume_escape_and_write( tokenizer, & mut string_bytes)
1134
1198
} ,
1135
1199
b'\0' => {
1200
+ tokenizer. advance( 1 ) ;
1136
1201
string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
1137
1202
}
1203
+ b'\x80' ...b'\xBF' => {
1204
+ // We’ll end up copying the whole code point
1205
+ // before this loop does something else.
1206
+ tokenizer. consume_continuation_byte( ) ;
1207
+ string_bytes. push( b) ;
1208
+ }
1209
+ b'\xF0' ...b'\xFF' => {
1210
+ // We’ll end up copying the whole code point
1211
+ // before this loop does something else.
1212
+ tokenizer. consume_4byte_intro( ) ;
1213
+ string_bytes. push( b) ;
1214
+ }
1138
1215
// If this byte is part of a multi-byte code point,
1139
1216
// we’ll end up copying the whole code point before this loop does something else.
1140
- b => { string_bytes. push( b) }
1217
+ b => {
1218
+ tokenizer. advance( 1 ) ;
1219
+ string_bytes. push( b)
1220
+ }
1141
1221
}
1142
1222
}
1143
1223
UnquotedUrl (
@@ -1160,8 +1240,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1160
1240
b'\n' | b'\x0C' | b'\r' => {
1161
1241
tokenizer. consume_newline( ) ;
1162
1242
}
1163
- _ => {
1164
- tokenizer. advance ( 1 ) ;
1243
+ b => {
1244
+ tokenizer. consume_known_byte ( b ) ;
1165
1245
return consume_bad_url( tokenizer, start_pos) ;
1166
1246
}
1167
1247
}
@@ -1186,8 +1266,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
1186
1266
b'\n' | b'\x0C' | b'\r' => {
1187
1267
tokenizer. consume_newline( ) ;
1188
1268
}
1189
- _ => {
1190
- tokenizer. advance ( 1 ) ;
1269
+ b => {
1270
+ tokenizer. consume_known_byte ( b ) ;
1191
1271
}
1192
1272
}
1193
1273
}
0 commit comments