@@ -576,6 +576,7 @@ fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Toke
576
576
fn consume_quoted_string < ' a > ( tokenizer : & mut Tokenizer < ' a > , single_quote : bool )
577
577
-> Result < Cow < ' a , str > , ( ) > {
578
578
tokenizer. advance ( 1 ) ; // Skip the initial quote
579
+ // start_pos is at code point boundary, after " or '
579
580
let start_pos = tokenizer. position ( ) ;
580
581
let mut string_bytes;
581
582
loop {
@@ -598,6 +599,11 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
598
599
}
599
600
}
600
601
b'\\' | b'\0' => {
602
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
603
+ // * start_pos is at a code point boundary
604
+ // * so is the current position (which is before '\\' or '\0')
605
+ //
606
+ // So `string_bytes` is well-formed UTF-8.
601
607
string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
602
608
break
603
609
}
@@ -611,8 +617,8 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
611
617
if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
612
618
return Err ( ( ) ) ;
613
619
}
614
- let c = tokenizer. consume_byte ( ) ;
615
- match_byte ! { c ,
620
+ let b = tokenizer. consume_byte ( ) ;
621
+ match_byte ! { b ,
616
622
b'"' => {
617
623
if !single_quote {
618
624
break ;
@@ -634,26 +640,29 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
634
640
tokenizer. advance( 1 ) ;
635
641
}
636
642
}
643
+ // This pushes one well-formed code point
637
644
_ => consume_escape_and_write( tokenizer, & mut string_bytes)
638
645
}
639
646
}
640
647
// else: escaped EOF, do nothing.
641
648
continue ;
642
649
}
643
650
b'\0' => {
644
- // string.push('\u{FFFD}'),
645
- string_bytes. push( 0xef ) ;
646
- string_bytes. push( 0xbf ) ;
647
- string_bytes. push( 0xbd ) ;
651
+ string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
648
652
continue ;
649
653
}
650
654
_ => { } ,
651
655
}
652
656
653
- string_bytes. push ( c) ;
657
+ // If this byte is part of a multi-byte code point,
658
+ // we’ll end up copying the whole code point before this loop does something else.
659
+ string_bytes. push ( b) ;
654
660
}
655
661
656
- Ok ( Owned ( to_utf8 ( string_bytes) ) )
662
+ Ok ( Owned (
663
+ // string_bytes is well-formed UTF-8, see other comments.
664
+ unsafe { from_utf8_release_unchecked ( string_bytes) }
665
+ ) )
657
666
}
658
667
659
668
@@ -695,6 +704,7 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
695
704
}
696
705
697
706
fn consume_name < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Cow < ' a , str > {
707
+ // start_pos is the end of the previous token, therefore at a code point boundary
698
708
let start_pos = tokenizer. position ( ) ;
699
709
let mut value_bytes;
700
710
loop {
@@ -704,11 +714,16 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
704
714
match_byte ! { tokenizer. next_byte_unchecked( ) ,
705
715
b'a' ...b'z' | b'A' ...b'Z' | b'0' ...b'9' | b'_' | b'-' => { tokenizer. advance( 1 ) } ,
706
716
b'\\' | b'\0' => {
717
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
718
+ // * start_pos is at a code point boundary
719
+ // * so is the current position (which is before '\\' or '\0')
720
+ //
721
+ // So `value_bytes` is well-formed UTF-8.
707
722
value_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
708
723
break
709
724
}
710
- c => {
711
- if c . is_ascii( ) {
725
+ b => {
726
+ if b . is_ascii( ) {
712
727
return Borrowed ( tokenizer. slice_from( start_pos) ) ;
713
728
}
714
729
tokenizer. advance( 1 ) ;
@@ -717,34 +732,37 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
717
732
}
718
733
719
734
while !tokenizer. is_eof ( ) {
720
- let c = tokenizer. next_byte_unchecked ( ) ;
721
- match_byte ! { c ,
735
+ let b = tokenizer. next_byte_unchecked ( ) ;
736
+ match_byte ! { b ,
722
737
b'a' ...b'z' | b'A' ...b'Z' | b'0' ...b'9' | b'_' | b'-' => {
723
738
tokenizer. advance( 1 ) ;
724
- value_bytes. push( c )
739
+ value_bytes. push( b ) // ASCII
725
740
}
726
741
b'\\' => {
727
742
if tokenizer. has_newline_at( 1 ) { break }
728
743
tokenizer. advance( 1 ) ;
744
+ // This pushes one well-formed code point
729
745
consume_escape_and_write( tokenizer, & mut value_bytes)
730
746
}
731
747
b'\0' => {
732
748
tokenizer. advance( 1 ) ;
733
- // value.push('\u{FFFD}')
734
- value_bytes. push( 0xef ) ;
735
- value_bytes. push( 0xbf ) ;
736
- value_bytes. push( 0xbd ) ;
749
+ value_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
737
750
} ,
738
751
_ => {
739
- if c . is_ascii( ) {
752
+ if b . is_ascii( ) {
740
753
break ;
741
754
}
742
755
tokenizer. advance( 1 ) ;
743
- value_bytes. push( c)
756
+ // This byte *is* part of a multi-byte code point,
757
+ // we’ll end up copying the whole code point before this loop does something else.
758
+ value_bytes. push( b)
744
759
}
745
760
}
746
761
}
747
- Owned ( to_utf8 ( value_bytes) )
762
+ Owned (
763
+ // value_bytes is well-formed UTF-8, see other comments.
764
+ unsafe { from_utf8_release_unchecked ( value_bytes) }
765
+ )
748
766
}
749
767
750
768
@@ -867,17 +885,16 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
867
885
868
886
869
887
#[ inline]
870
- fn to_utf8 ( string_bytes : Vec < u8 > ) -> String {
888
+ unsafe fn from_utf8_release_unchecked ( string_bytes : Vec < u8 > ) -> String {
871
889
if cfg ! ( debug_assertions) {
872
890
String :: from_utf8 ( string_bytes) . unwrap ( )
873
891
} else {
874
- unsafe {
875
- String :: from_utf8_unchecked ( string_bytes)
876
- }
892
+ String :: from_utf8_unchecked ( string_bytes)
877
893
}
878
894
}
879
895
880
896
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
897
+ // This is only called after "url(", so the current position is a code point boundary.
881
898
for ( offset, c) in tokenizer. input [ tokenizer. position ..] . bytes ( ) . enumerate ( ) {
882
899
match_byte ! { c,
883
900
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
@@ -888,6 +905,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
888
905
}
889
906
_ => {
890
907
tokenizer. advance( offset) ;
908
+ // This function only consumed ASCII (whitespace) bytes,
909
+ // so the current position is a code point boundary.
891
910
return Ok ( consume_unquoted_url_internal( tokenizer) )
892
911
}
893
912
}
@@ -896,6 +915,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
896
915
return Ok ( UnquotedUrl ( Borrowed ( "" ) ) ) ;
897
916
898
917
fn consume_unquoted_url_internal < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
918
+ // This function is only called with start_pos at a code point boundary.
899
919
let start_pos = tokenizer. position ( ) ;
900
920
let mut string_bytes: Vec < u8 > ;
901
921
loop {
@@ -919,6 +939,11 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
919
939
return consume_bad_url( tokenizer)
920
940
} ,
921
941
b'\\' | b'\0' => {
942
+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
943
+ // * start_pos is at a code point boundary
944
+ // * so is the current position (which is before '\\' or '\0')
945
+ //
946
+ // So `string_bytes` is well-formed UTF-8.
922
947
string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
923
948
break
924
949
}
@@ -930,7 +955,10 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
930
955
while !tokenizer. is_eof ( ) {
931
956
match_byte ! { tokenizer. consume_byte( ) ,
932
957
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
933
- return consume_url_end( tokenizer, Owned ( to_utf8( string_bytes) ) ) ;
958
+ return consume_url_end( tokenizer, Owned (
959
+ // string_bytes is well-formed UTF-8, see other comments.
960
+ unsafe { from_utf8_release_unchecked( string_bytes) }
961
+ ) )
934
962
}
935
963
b')' => {
936
964
break ;
@@ -944,18 +972,21 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
944
972
return consume_bad_url( tokenizer)
945
973
}
946
974
975
+ // This pushes one well-formed code point to string_bytes
947
976
consume_escape_and_write( tokenizer, & mut string_bytes)
948
977
} ,
949
978
b'\0' => {
950
- // string.push('\u{FFFD}');
951
- string_bytes. push( 0xef ) ;
952
- string_bytes. push( 0xbf ) ;
953
- string_bytes. push( 0xbd ) ;
979
+ string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
954
980
}
955
- c => { string_bytes. push( c) }
981
+ // If this byte is part of a multi-byte code point,
982
+ // we’ll end up copying the whole code point before this loop does something else.
983
+ b => { string_bytes. push( b) }
956
984
}
957
985
}
958
- UnquotedUrl ( Owned ( to_utf8 ( string_bytes) ) )
986
+ UnquotedUrl ( Owned (
987
+ // string_bytes is well-formed UTF-8, see other comments.
988
+ unsafe { from_utf8_release_unchecked ( string_bytes) }
989
+ ) )
959
990
}
960
991
961
992
fn consume_url_end < ' a > ( tokenizer : & mut Tokenizer < ' a > , string : Cow < ' a , str > ) -> Token < ' a > {
@@ -1039,11 +1070,8 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
1039
1070
1040
1071
// Same constraints as consume_escape except it writes into `bytes` the result
1041
1072
// instead of returning it.
1042
- //
1043
- // TODO: This could be made more efficient with char::encode_utf8, I guess.
1044
1073
fn consume_escape_and_write ( tokenizer : & mut Tokenizer , bytes : & mut Vec < u8 > ) {
1045
- use std:: io:: Write ;
1046
- write ! ( bytes, "{}" , consume_escape( tokenizer) ) . unwrap ( ) ;
1074
+ bytes. extend ( consume_escape ( tokenizer) . encode_utf8 ( & mut [ 0 ; 4 ] ) . as_bytes ( ) )
1047
1075
}
1048
1076
1049
1077
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
0 commit comments