@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn has_at_least(&self, n: usize) -> bool { self.position + n < self.input.len() }
 
+    // Advance over N bytes in the input. This function can advance
+    // over ASCII bytes (excluding newlines), or UTF-8 sequence
+    // leaders (excluding leaders for 4-byte sequences).
     #[inline]
-    pub fn advance(&mut self, n: usize) { self.position += n }
+    pub fn advance(&mut self, n: usize) {
+        if cfg!(debug_assertions) {
+            // Each byte must either be an ASCII byte or a sequence
+            // leader, but not a 4-byte leader; also newlines are
+            // rejected.
+            for i in 0..n {
+                let b = self.byte_at(i);
+                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+            }
+        }
+        self.position += n
+    }
 
     // Assumes non-EOF
     #[inline]
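For reference, the two `debug_assert!`s encode UTF-8's byte classes: ASCII is `0xxxxxxx`, a continuation byte is `10xxxxxx` (`b & 0xC0 == 0x80`), and a 4-byte sequence leader is `11110xxx` (`b & 0xF0 == 0xF0`). A standalone sketch of that classification, using a hypothetical `classify` helper that is not part of the patch:

```rust
// Hypothetical helper mirroring the bit tests in the new debug assertions.
fn classify(b: u8) -> &'static str {
    if b.is_ascii() {
        "ASCII byte" // 0xxxxxxx: advance() accepts it, unless it is a newline
    } else if b & 0xC0 == 0x80 {
        "continuation byte" // 10xxxxxx: rejected by advance()
    } else if b & 0xF0 == 0xF0 {
        "4-byte sequence leader" // 11110xxx: rejected by advance()
    } else {
        "2- or 3-byte sequence leader" // 110xxxxx or 1110xxxx: accepted
    }
}

fn main() {
    // "é" encodes as [0xC3, 0xA9]; "😀" starts with 0xF0.
    for &b in &[b'a', 0xC3, 0xA9, 0xF0] {
        println!("{:#04x}: {}", b, classify(b));
    }
}
```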
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
         self.input.as_bytes()[self.position + offset]
     }
 
+    // Advance over a single byte; the byte must be a UTF-8 sequence
+    // leader for a 4-byte sequence.
+    #[inline]
+    fn consume_4byte_intro(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+        self.position += 1;
+    }
+
+    // Advance over a single byte; the byte must be a UTF-8
+    // continuation byte.
     #[inline]
-    fn consume_byte(&mut self) -> u8 {
+    fn consume_continuation_byte(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+        self.position += 1;
+    }
+
+    // Advance over any kind of byte, excluding newlines.
+    #[inline(never)]
+    fn consume_known_byte(&mut self, byte: u8) {
+        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
         self.position += 1;
-        self.input.as_bytes()[self.position - 1]
     }
 
     #[inline]
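Taken together, `advance`, `consume_continuation_byte`, `consume_4byte_intro`, and the dedicated newline path cover every possible input byte: the ranges matched throughout the rest of this patch partition `u8`. A quick exhaustive check of that claim (standalone sketch, not part of the patch):

```rust
// Check that the byte classes used in the match arms below
// (b'\x80'...b'\xBF', b'\xC0'...b'\xEF', b'\xF0'...b'\xFF', ASCII)
// cover each byte value exactly once.
fn main() {
    for b in 0u8..=255 {
        let ascii = b.is_ascii();                // 0x00..=0x7F
        let cont = (0x80..=0xBF).contains(&b);   // continuation bytes
        let lead23 = (0xC0..=0xEF).contains(&b); // 2- and 3-byte leaders
        let lead4 = (0xF0..=0xFF).contains(&b);  // 4-byte leaders
        assert_eq!(ascii as u8 + cont as u8 + lead23 as u8 + lead4 as u8, 1);
    }
    println!("the four byte classes partition 0..=255");
}
```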
@@ -667,7 +699,10 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
+                // ASCII or other leading byte.
                 tokenizer.advance(1);
             }
         }
@@ -703,13 +738,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\'' => {
                 if single_quote {
                     let value = tokenizer.slice_from(start_pos);
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\\' | b'\0' => {
                 // * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +760,40 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
             b'\n' | b'\r' | b'\x0C' => {
                 return Err(tokenizer.slice_from(start_pos).into())
             },
-            _ => {}
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            }
         }
-        tokenizer.consume_byte();
     }
 
     while !tokenizer.is_eof() {
-        if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(
-                // string_bytes is well-formed UTF-8, see other comments.
-                unsafe {
-                    from_utf8_release_unchecked(string_bytes)
-                }.into()
-            );
-        }
-        let b = tokenizer.consume_byte();
+        let b = tokenizer.next_byte_unchecked();
         match_byte! { b,
+            b'\n' | b'\r' | b'\x0C' => {
+                return Err(
+                    // string_bytes is well-formed UTF-8, see other comments.
+                    unsafe {
+                        from_utf8_release_unchecked(string_bytes)
+                    }.into()
+                );
+            }
             b'"' => {
+                tokenizer.advance(1);
                 if !single_quote {
                     break;
                 }
             }
             b'\'' => {
+                tokenizer.advance(1);
                 if single_quote {
                     break;
                 }
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -764,10 +808,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 continue;
             }
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
                 continue;
             }
-            _ => {},
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            },
         }
 
         // If this byte is part of a multi-byte code point,
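These two hunks flip the loop from "consume a byte, then decide" to "peek, then advance inside the arm that knows the byte's class", which is what lets each arm pick the right consume helper. A reduced sketch of that shape, under the assumption that only the terminator matters (hypothetical `skip_to_quote`; the real loop also handles escapes and builds `string_bytes`):

```rust
// Peek-then-dispatch: the byte is classified first, and the arm that
// matched is responsible for advancing past it.
fn skip_to_quote(input: &str, mut position: usize) -> Option<usize> {
    let bytes = input.as_bytes();
    while position < bytes.len() {
        match bytes[position] {
            b'"' => {
                position += 1; // advance(1) before breaking out
                return Some(position);
            }
            b'\n' | b'\r' | b'\x0C' => return None, // unescaped newline: error
            0x80..=0xBF => position += 1, // consume_continuation_byte()
            0xF0..=0xFF => position += 1, // consume_4byte_intro()
            _ => position += 1,           // advance(1): ASCII or 2-/3-byte leader
        }
    }
    None
}

fn main() {
    assert_eq!(skip_to_quote(r#"héllo" rest"#, 0), Some(7)); // é is 2 bytes
    assert_eq!(skip_to_quote("no closing quote", 0), None);
}
```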
@@ -835,11 +885,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xC0'...b'\xEF' => { tokenizer.advance(1); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             b => {
-                if b.is_ascii() {
-                    return tokenizer.slice_from(start_pos).into();
-                }
-                tokenizer.advance(1);
+                return tokenizer.slice_from(start_pos).into();
             }
         }
     }
@@ -861,15 +911,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 tokenizer.advance(1);
                 value_bytes.extend("\u{FFFD}".as_bytes());
             },
-            _ => {
-                if b.is_ascii() {
-                    break;
-                }
-                tokenizer.advance(1);
+            b'\x80'...b'\xBF' => {
+                // This byte *is* part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                value_bytes.push(b)
+            }
+            b'\xC0'...b'\xEF' => {
                 // This byte *is* part of a multi-byte code point,
                 // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.advance(1);
                 value_bytes.push(b)
             }
+            b'\xF0'...b'\xFF' => {
+                tokenizer.consume_4byte_intro();
+                value_bytes.push(b)
+            }
+            _ => {
+                // ASCII
+                break;
+            }
         }
     }
     // string_bytes is well-formed UTF-8, see other comments.
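The repeated comment is the key invariant: pushing one byte per iteration momentarily leaves the buffer ending mid code point, but each multi-byte arm loops straight back, so every code point is copied whole before the loop can break, and the final unchecked conversion stays sound. A standalone illustration of that invariant, with std's checked conversion standing in for `from_utf8_release_unchecked`:

```rust
// Copying a &str byte by byte yields a buffer that is valid UTF-8
// once every code point has been copied in full.
fn main() {
    let input = "a😀b";
    let mut value_bytes = Vec::new();
    for &b in input.as_bytes() {
        value_bytes.push(b); // one byte per iteration, as in consume_name
    }
    assert_eq!(std::str::from_utf8(&value_bytes).unwrap(), input);
}
```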
@@ -1048,11 +1109,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 }
                 b'"' | b'\'' => { return Err(()) },  // Do not advance
                 b')' => {
-                    tokenizer.advance(offset + 1);
+                    // Don't use advance, because we may be skipping
+                    // newlines here, and we want to avoid the assert.
+                    tokenizer.position += offset + 1;
                     break
                 }
                 _ => {
-                    tokenizer.advance(offset);
+                    // Don't use advance, because we may be skipping
+                    // newlines here, and we want to avoid the assert.
+                    tokenizer.position += offset;
                     found_printable_char = true;
                     break
                 }
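The raw `position +=` here bypasses `advance` on purpose: the skipped span was already scanned, but may contain newlines, which the per-byte debug assertions in `advance` reject by design. A reduced model of the condition involved (a hypothetical helper mirroring the assertions added in the first hunk, not part of the patch):

```rust
// Per-byte conditions from the new debug assertions in `advance`.
fn would_pass_advance_assert(bytes: &[u8]) -> bool {
    bytes.iter().all(|&b| {
        let utf8_ok = b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80);
        let not_newline = b != b'\r' && b != b'\n' && b != b'\x0C';
        utf8_ok && not_newline
    })
}

fn main() {
    assert!(would_pass_advance_assert(b"url(  foo"));
    // Whitespace before a URL may include newlines, so `advance`
    // would trip its assertion here; hence the direct `position +=`.
    assert!(!would_pass_advance_assert(b"url(\n  foo"));
}
```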
@@ -1104,27 +1169,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
+                // ASCII or other leading byte.
                 tokenizer.advance(1);
             }
         }
     }
     while !tokenizer.is_eof() {
-        match_byte! { tokenizer.consume_byte(),
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                 // string_bytes is well-formed UTF-8, see other comments.
                 let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
-                tokenizer.position -= 1;
                 return consume_url_end(tokenizer, start_pos, string)
             }
             b')' => {
+                tokenizer.advance(1);
                 break;
             }
             b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F'  // non-printable
                 | b'"' | b'\'' | b'(' => {
+                tokenizer.advance(1);
                 return consume_bad_url(tokenizer, start_pos);
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if tokenizer.has_newline_at(0) {
                     return consume_bad_url(tokenizer, start_pos)
                 }
@@ -1133,11 +1204,28 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 consume_escape_and_write(tokenizer, &mut string_bytes)
             },
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
             }
+            b'\x80'...b'\xBF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                string_bytes.push(b);
+            }
+            b'\xF0'...b'\xFF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_4byte_intro();
+                string_bytes.push(b);
+            }
             // If this byte is part of a multi-byte code point,
             // we’ll end up copying the whole code point before this loop does something else.
-            b => { string_bytes.push(b) }
+            b => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+                string_bytes.push(b)
+            }
         }
     }
     UnquotedUrl(
@@ -1160,8 +1248,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
                 return consume_bad_url(tokenizer, start_pos);
             }
         }
@@ -1186,8 +1274,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
             }
         }
     }
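These last two hunks can use `consume_known_byte` because the newline cases were already routed to `consume_newline` by the preceding arm, so the byte reaching the fallback can never violate that helper's `debug_assert`. A standalone check of that reasoning (sketch, not part of the patch):

```rust
// A byte is routed to consume_newline exactly when it would fail the
// debug_assert inside consume_known_byte, so the fallback arm is safe.
fn main() {
    for b in 0u8..=255 {
        let handled_as_newline = matches!(b, b'\n' | b'\x0C' | b'\r');
        let known_byte_assert = b != b'\r' && b != b'\n' && b != b'\x0C';
        assert_eq!(handled_as_newline, !known_byte_assert);
    }
    println!("fallback arm never sees a newline");
}
```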