@@ -575,7 +575,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
575575 -> Result < Cow < ' a , str > , ( ) > {
576576 tokenizer. advance ( 1 ) ; // Skip the initial quote
577577 let start_pos = tokenizer. position ( ) ;
578- let mut string ;
578+ let mut string_bytes ;
579579 loop {
580580 if tokenizer. is_eof ( ) {
581581 return Ok ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -592,7 +592,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
592592 return Ok ( Borrowed ( value) )
593593 }
594594 b'\\' | b'\0' => {
595- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
595+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
596596 break
597597 }
598598 b'\n' | b'\r' | b'\x0C' => return Err ( ( ) ) ,
@@ -606,10 +606,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
606606 if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
607607 return Err ( ( ) ) ;
608608 }
609- match tokenizer. consume_char ( ) {
610- '"' if !single_quote => break ,
611- '\'' if single_quote => break ,
612- '\\' => {
609+ match tokenizer. consume_byte ( ) {
610+ b '"' if !single_quote => break ,
611+ b '\'' if single_quote => break ,
612+ b '\\' => {
613613 if !tokenizer. is_eof ( ) {
614614 match tokenizer. next_byte_unchecked ( ) {
615615 // Escaped newline
@@ -620,16 +620,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
620620 tokenizer. advance ( 1 ) ;
621621 }
622622 }
623- _ => string . push ( consume_escape ( tokenizer) )
623+ _ => consume_escape_and_write ( tokenizer, & mut string_bytes )
624624 }
625625 }
626626 // else: escaped EOF, do nothing.
627627 }
628- '\0' => string. push ( '\u{FFFD}' ) ,
629- c => string. push ( c) ,
628+ b'\0' => {
629+ // string.push('\u{FFFD}'),
630+ string_bytes. push ( 0xef ) ;
631+ string_bytes. push ( 0xbf ) ;
632+ string_bytes. push ( 0xbd ) ;
633+ }
634+ c => string_bytes. push ( c) ,
630635 }
631636 }
632- Ok ( Owned ( string) )
637+
638+ Ok ( Owned ( to_utf8 ( string_bytes) ) )
633639}
634640
635641
@@ -650,7 +656,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
650656
651657fn consume_ident_like < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
652658 let value = consume_name ( tokenizer) ;
653- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '(' {
659+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '(' {
654660 tokenizer. advance ( 1 ) ;
655661 if value. eq_ignore_ascii_case ( "url" ) {
656662 consume_unquoted_url ( tokenizer) . unwrap_or ( Function ( value) )
@@ -668,42 +674,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
668674
669675fn consume_name < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Cow < ' a , str > {
670676 let start_pos = tokenizer. position ( ) ;
671- let mut value ;
677+ let mut value_bytes ;
672678 loop {
673679 if tokenizer. is_eof ( ) {
674680 return Borrowed ( tokenizer. slice_from ( start_pos) )
675681 }
676- match tokenizer. next_char ( ) {
677- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => tokenizer. advance ( 1 ) ,
678- '\\' | '\0' => {
679- value = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
682+ match tokenizer. next_byte_unchecked ( ) {
683+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => tokenizer. advance ( 1 ) ,
684+ b '\\' | b '\0' => {
685+ value_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
680686 break
681687 }
682688 c if c. is_ascii ( ) => return Borrowed ( tokenizer. slice_from ( start_pos) ) ,
683689 _ => {
684- tokenizer. consume_char ( ) ;
690+ tokenizer. advance ( 1 ) ;
685691 }
686692 }
687693 }
688694
689695 while !tokenizer. is_eof ( ) {
690- let c = tokenizer. next_char ( ) ;
691- value . push ( match c {
692- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => {
696+ let c = tokenizer. next_byte_unchecked ( ) ;
697+ match c {
698+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => {
693699 tokenizer. advance ( 1 ) ;
694- c
700+ value_bytes . push ( c )
695701 }
696- '\\' => {
702+ b '\\' => {
697703 if tokenizer. has_newline_at ( 1 ) { break }
698704 tokenizer. advance ( 1 ) ;
699- consume_escape ( tokenizer)
705+ consume_escape_and_write ( tokenizer, & mut value_bytes )
700706 }
701- '\0' => { tokenizer. advance ( 1 ) ; '\u{FFFD}' } ,
707+ b'\0' => {
708+ tokenizer. advance ( 1 ) ;
709+ // value.push('\u{FFFD}')
710+ value_bytes. push ( 0xef ) ;
711+ value_bytes. push ( 0xbf ) ;
712+ value_bytes. push ( 0xbd ) ;
713+ } ,
702714 c if c. is_ascii ( ) => break ,
703- _ => tokenizer. consume_char ( ) ,
704- } )
715+ other => {
716+ tokenizer. advance ( 1 ) ;
717+ value_bytes. push ( other)
718+ }
719+ }
705720 }
706- Owned ( value )
721+ Owned ( to_utf8 ( value_bytes ) )
707722}
708723
709724
@@ -825,7 +840,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
825840}
826841
827842
/// Convert a byte buffer that is known to contain well-formed UTF-8
/// into a `String`.
///
/// Debug builds validate the bytes and panic on a tokenizer bug;
/// release builds skip the check for speed.
#[inline]
fn to_utf8(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        // SAFETY: every caller appends only complete, valid UTF-8
        // sequences: slices copied out of the input `&str`, escape
        // results written via their UTF-8 encoding, or the three-byte
        // encoding of U+FFFD.
        unsafe {
            String::from_utf8_unchecked(string_bytes)
        }
    }
}
853+
828854fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
855+
829856 for ( offset, c) in tokenizer. input [ tokenizer. position ..] . as_bytes ( ) . iter ( ) . cloned ( ) . enumerate ( ) {
830857 match c {
831858 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
@@ -845,7 +872,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
845872
846873 fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
847874 let start_pos = tokenizer. position ( ) ;
848- let mut string ;
875+ let mut string_bytes : Vec < u8 > ;
849876 loop {
850877 if tokenizer. is_eof ( ) {
851878 return UnquotedUrl ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -867,7 +894,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
867894 return consume_bad_url ( tokenizer)
868895 } ,
869896 b'\\' | b'\0' => {
870- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
897+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
871898 break
872899 }
873900 _ => {
@@ -876,32 +903,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
876903 }
877904 }
878905 while !tokenizer. is_eof ( ) {
879- let next_char = match tokenizer. consume_char ( ) {
880- ' ' | '\t' | '\n' | '\r' | '\x0C' => {
881- return consume_url_end ( tokenizer, Owned ( string ) )
906+ match tokenizer. consume_byte ( ) {
907+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => {
908+ return consume_url_end ( tokenizer, Owned ( to_utf8 ( string_bytes ) ) ) ;
882909 }
883- ')' => break ,
884- '\x01' ...'\x08' | '\x0B' | '\x0E' ...'\x1F' | '\x7F' // non-printable
885- | '"' | '\'' | '(' => return consume_bad_url ( tokenizer) ,
886- '\\' => {
910+ b ')' => break ,
911+ b '\x01' ...b '\x08' | b '\x0B' | b '\x0E' ...b '\x1F' | b '\x7F' // non-printable
912+ | b '"' | b '\'' | b '(' => return consume_bad_url ( tokenizer) ,
913+ b '\\' => {
887914 if tokenizer. has_newline_at ( 0 ) {
888915 return consume_bad_url ( tokenizer)
889916 }
890- consume_escape ( tokenizer)
917+
918+ consume_escape_and_write ( tokenizer, & mut string_bytes)
891919 } ,
892- '\0' => '\u{FFFD}' ,
893- c => c
894- } ;
895- string. push ( next_char)
920+ b'\0' => {
921+ // string.push('\u{FFFD}');
922+ string_bytes. push ( 0xef ) ;
923+ string_bytes. push ( 0xbf ) ;
924+ string_bytes. push ( 0xbd ) ;
925+ }
926+ c => string_bytes. push ( c)
927+ }
896928 }
897- UnquotedUrl ( Owned ( string ) )
929+ UnquotedUrl ( Owned ( to_utf8 ( string_bytes ) ) )
898930 }
899931
900932 fn consume_url_end < ' a > ( tokenizer : & mut Tokenizer < ' a > , string : Cow < ' a , str > ) -> Token < ' a > {
901933 while !tokenizer. is_eof ( ) {
902- match tokenizer. consume_char ( ) {
903- ' ' | '\t' | '\n' | '\r' | '\x0C' => ( ) ,
904- ')' => break ,
934+ match tokenizer. consume_byte ( ) {
935+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => ( ) ,
936+ b ')' => break ,
905937 _ => return consume_bad_url ( tokenizer)
906938 }
907939 }
@@ -911,9 +943,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
911943 fn consume_bad_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
912944 // Consume up to the closing )
913945 while !tokenizer. is_eof ( ) {
914- match tokenizer. consume_char ( ) {
915- ')' => break ,
916- '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
946+ match tokenizer. consume_byte ( ) {
947+ b ')' => break ,
948+ b '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
917949 _ => ( )
918950 }
919951 }
@@ -972,20 +1004,29 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
9721004}
9731005
9741006
1007+ // Same constraints as consume_escape except it writes into `bytes` the result
1008+ // instead of returning it.
1009+ //
1010+ // TODO: This could be made more efficient with char::encode_utf8, I guess.
1011+ fn consume_escape_and_write ( tokenizer : & mut Tokenizer , bytes : & mut Vec < u8 > ) {
1012+ use std:: io:: Write ;
1013+ write ! ( bytes, "{}" , consume_escape( tokenizer) ) . unwrap ( ) ;
1014+ }
1015+
9751016// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
9761017// and that the next input character has already been verified
9771018// to not be a newline.
9781019fn consume_escape ( tokenizer : & mut Tokenizer ) -> char {
9791020 if tokenizer. is_eof ( ) { return '\u{FFFD}' } // Escaped EOF
980- match tokenizer. next_char ( ) {
981- '0' ...'9' | 'A' ...'F' | 'a' ...'f' => {
1021+ match tokenizer. next_byte_unchecked ( ) {
1022+ b '0' ...b '9' | b 'A' ...b 'F' | b 'a' ...b 'f' => {
9821023 let ( c, _) = consume_hex_digits ( tokenizer) ;
9831024 if !tokenizer. is_eof ( ) {
984- match tokenizer. next_char ( ) {
985- ' ' | '\t' | '\n' | '\x0C' => tokenizer. advance ( 1 ) ,
986- '\r' => {
1025+ match tokenizer. next_byte_unchecked ( ) {
1026+ b ' ' | b '\t' | b '\n' | b '\x0C' => tokenizer. advance ( 1 ) ,
1027+ b '\r' => {
9871028 tokenizer. advance ( 1 ) ;
988- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '\n' {
1029+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '\n' {
9891030 tokenizer. advance ( 1 ) ;
9901031 }
9911032 }
@@ -1000,7 +1041,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
10001041 REPLACEMENT_CHAR
10011042 }
10021043 } ,
1003- '\0' => {
1044+ b '\0' => {
10041045 tokenizer. advance ( 1 ) ;
10051046 '\u{FFFD}'
10061047 }
0 commit comments