@@ -576,6 +576,7 @@ fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Toke
576576fn consume_quoted_string < ' a > ( tokenizer : & mut Tokenizer < ' a > , single_quote : bool )
577577 -> Result < Cow < ' a , str > , ( ) > {
578578 tokenizer. advance ( 1 ) ; // Skip the initial quote
579+ // start_pos is at code point boundary, after " or '
579580 let start_pos = tokenizer. position ( ) ;
580581 let mut string_bytes;
581582 loop {
@@ -598,6 +599,11 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
598599 }
599600 }
600601 b'\\' | b'\0' => {
602+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
603+ // * start_pos is at a code point boundary
 604+                    // * so is the current position (which is before '\\' or '\0')
605+ //
606+ // So `string_bytes` is well-formed UTF-8.
601607 string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
602608 break
603609 }
@@ -611,8 +617,8 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
611617 if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
612618 return Err ( ( ) ) ;
613619 }
614- let c = tokenizer. consume_byte ( ) ;
615- match_byte ! { c ,
620+ let b = tokenizer. consume_byte ( ) ;
621+ match_byte ! { b ,
616622 b'"' => {
617623 if !single_quote {
618624 break ;
@@ -634,26 +640,29 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
634640 tokenizer. advance( 1 ) ;
635641 }
636642 }
643+ // This pushes one well-formed code point
637644 _ => consume_escape_and_write( tokenizer, & mut string_bytes)
638645 }
639646 }
640647 // else: escaped EOF, do nothing.
641648 continue ;
642649 }
643650 b'\0' => {
644- // string.push('\u{FFFD}'),
645- string_bytes. push( 0xef ) ;
646- string_bytes. push( 0xbf ) ;
647- string_bytes. push( 0xbd ) ;
 651+                string_bytes. extend( "\u{FFFD}" . as_bytes( ) ) ;
648652 continue ;
649653 }
650654 _ => { } ,
651655 }
652656
653- string_bytes. push ( c) ;
657+ // If this byte is part of a multi-byte code point,
658+ // we’ll end up copying the whole code point before this loop does something else.
659+ string_bytes. push ( b) ;
654660 }
655661
656- Ok ( Owned ( to_utf8 ( string_bytes) ) )
662+ Ok ( Owned (
663+ // string_bytes is well-formed UTF-8, see other comments.
664+ unsafe { from_utf8_release_unchecked ( string_bytes) }
665+ ) )
657666}
658667
659668
@@ -695,6 +704,7 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
695704}
696705
697706fn consume_name < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Cow < ' a , str > {
707+ // start_pos is the end of the previous token, therefore at a code point boundary
698708 let start_pos = tokenizer. position ( ) ;
699709 let mut value_bytes;
700710 loop {
@@ -704,11 +714,16 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
704714 match_byte ! { tokenizer. next_byte_unchecked( ) ,
705715 b'a' ...b'z' | b'A' ...b'Z' | b'0' ...b'9' | b'_' | b'-' => { tokenizer. advance( 1 ) } ,
706716 b'\\' | b'\0' => {
717+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
718+ // * start_pos is at a code point boundary
 719+                // * so is the current position (which is before '\\' or '\0')
720+ //
721+ // So `value_bytes` is well-formed UTF-8.
707722 value_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
708723 break
709724 }
710- c => {
711- if c . is_ascii( ) {
725+ b => {
726+ if b . is_ascii( ) {
712727 return Borrowed ( tokenizer. slice_from( start_pos) ) ;
713728 }
714729 tokenizer. advance( 1 ) ;
@@ -717,34 +732,37 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
717732 }
718733
719734 while !tokenizer. is_eof ( ) {
720- let c = tokenizer. next_byte_unchecked ( ) ;
721- match_byte ! { c ,
735+ let b = tokenizer. next_byte_unchecked ( ) ;
736+ match_byte ! { b ,
722737 b'a' ...b'z' | b'A' ...b'Z' | b'0' ...b'9' | b'_' | b'-' => {
723738 tokenizer. advance( 1 ) ;
724- value_bytes. push( c )
739+ value_bytes. push( b ) // ASCII
725740 }
726741 b'\\' => {
727742 if tokenizer. has_newline_at( 1 ) { break }
728743 tokenizer. advance( 1 ) ;
744+ // This pushes one well-formed code point
729745 consume_escape_and_write( tokenizer, & mut value_bytes)
730746 }
731747 b'\0' => {
732748 tokenizer. advance( 1 ) ;
733- // value.push('\u{FFFD}')
734- value_bytes. push( 0xef ) ;
735- value_bytes. push( 0xbf ) ;
736- value_bytes. push( 0xbd ) ;
 749+                value_bytes. extend( "\u{FFFD}" . as_bytes( ) ) ;
737750 } ,
738751 _ => {
739- if c . is_ascii( ) {
752+ if b . is_ascii( ) {
740753 break ;
741754 }
742755 tokenizer. advance( 1 ) ;
743- value_bytes. push( c)
756+ // This byte *is* part of a multi-byte code point,
757+ // we’ll end up copying the whole code point before this loop does something else.
758+ value_bytes. push( b)
744759 }
745760 }
746761 }
747- Owned ( to_utf8 ( value_bytes) )
762+ Owned (
 763+        // value_bytes is well-formed UTF-8, see other comments.
764+ unsafe { from_utf8_release_unchecked ( value_bytes) }
765+ )
748766}
749767
750768
@@ -867,17 +885,16 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
867885
868886
869887#[ inline]
870- fn to_utf8 ( string_bytes : Vec < u8 > ) -> String {
888+ unsafe fn from_utf8_release_unchecked ( string_bytes : Vec < u8 > ) -> String {
871889 if cfg ! ( debug_assertions) {
872890 String :: from_utf8 ( string_bytes) . unwrap ( )
873891 } else {
874- unsafe {
875- String :: from_utf8_unchecked ( string_bytes)
876- }
892+ String :: from_utf8_unchecked ( string_bytes)
877893 }
878894}
879895
880896fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
897+ // This is only called after "url(", so the current position is a code point boundary.
881898 for ( offset, c) in tokenizer. input [ tokenizer. position ..] . bytes ( ) . enumerate ( ) {
882899 match_byte ! { c,
883900 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
@@ -888,6 +905,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
888905 }
889906 _ => {
890907 tokenizer. advance( offset) ;
908+ // This function only consumed ASCII (whitespace) bytes,
909+ // so the current position is a code point boundary.
891910 return Ok ( consume_unquoted_url_internal( tokenizer) )
892911 }
893912 }
@@ -896,6 +915,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
896915 return Ok ( UnquotedUrl ( Borrowed ( "" ) ) ) ;
897916
898917 fn consume_unquoted_url_internal < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
918+ // This function is only called with start_pos at a code point boundary.
899919 let start_pos = tokenizer. position ( ) ;
900920 let mut string_bytes: Vec < u8 > ;
901921 loop {
@@ -919,6 +939,11 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
919939 return consume_bad_url( tokenizer)
920940 } ,
921941 b'\\' | b'\0' => {
942+ // * The tokenizer’s input is UTF-8 since it’s `&str`.
943+ // * start_pos is at a code point boundary
 944+                    // * so is the current position (which is before '\\' or '\0')
945+ //
946+ // So `string_bytes` is well-formed UTF-8.
922947 string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
923948 break
924949 }
@@ -930,7 +955,10 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
930955 while !tokenizer. is_eof ( ) {
931956 match_byte ! { tokenizer. consume_byte( ) ,
932957 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
933- return consume_url_end( tokenizer, Owned ( to_utf8( string_bytes) ) ) ;
958+ return consume_url_end( tokenizer, Owned (
959+ // string_bytes is well-formed UTF-8, see other comments.
960+ unsafe { from_utf8_release_unchecked( string_bytes) }
961+ ) )
934962 }
935963 b')' => {
936964 break ;
@@ -944,18 +972,21 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
944972 return consume_bad_url( tokenizer)
945973 }
946974
975+ // This pushes one well-formed code point to string_bytes
947976 consume_escape_and_write( tokenizer, & mut string_bytes)
948977 } ,
949978 b'\0' => {
950- // string.push('\u{FFFD}');
951- string_bytes. push( 0xef ) ;
952- string_bytes. push( 0xbf ) ;
953- string_bytes. push( 0xbd ) ;
 979+                    string_bytes. extend( "\u{FFFD}" . as_bytes( ) ) ;
954980 }
955- c => { string_bytes. push( c) }
981+ // If this byte is part of a multi-byte code point,
982+ // we’ll end up copying the whole code point before this loop does something else.
983+ b => { string_bytes. push( b) }
956984 }
957985 }
958- UnquotedUrl ( Owned ( to_utf8 ( string_bytes) ) )
986+ UnquotedUrl ( Owned (
987+ // string_bytes is well-formed UTF-8, see other comments.
988+ unsafe { from_utf8_release_unchecked ( string_bytes) }
989+ ) )
959990 }
960991
961992 fn consume_url_end < ' a > ( tokenizer : & mut Tokenizer < ' a > , string : Cow < ' a , str > ) -> Token < ' a > {
@@ -1039,11 +1070,8 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
10391070
10401071// Same constraints as consume_escape except it writes into `bytes` the result
10411072// instead of returning it.
1042- //
1043- // TODO: This could be made more efficient with char::encode_utf8, I guess.
10441073fn consume_escape_and_write ( tokenizer : & mut Tokenizer , bytes : & mut Vec < u8 > ) {
1045- use std:: io:: Write ;
1046- write ! ( bytes, "{}" , consume_escape( tokenizer) ) . unwrap ( ) ;
1074+ bytes. extend ( consume_escape ( tokenizer) . encode_utf8 ( & mut [ 0 ; 4 ] ) . as_bytes ( ) )
10471075}
10481076
10491077// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
0 commit comments