@@ -11,6 +11,7 @@ use std::ascii::AsciiExt;
11
11
use std:: borrow:: { Cow , ToOwned } ;
12
12
use std:: borrow:: Cow :: { Owned , Borrowed } ;
13
13
use std:: i32;
14
+ use std:: mem;
14
15
15
16
use self :: Token :: * ;
16
17
@@ -575,7 +576,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
575
576
-> Result < Cow < ' a , str > , ( ) > {
576
577
tokenizer. advance ( 1 ) ; // Skip the initial quote
577
578
let start_pos = tokenizer. position ( ) ;
578
- let mut string ;
579
+ let mut string_bytes ;
579
580
loop {
580
581
if tokenizer. is_eof ( ) {
581
582
return Ok ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -592,7 +593,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
592
593
return Ok ( Borrowed ( value) )
593
594
}
594
595
b'\\' | b'\0' => {
595
- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
596
+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
596
597
break
597
598
}
598
599
b'\n' | b'\r' | b'\x0C' => return Err ( ( ) ) ,
@@ -606,10 +607,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
606
607
if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
607
608
return Err ( ( ) ) ;
608
609
}
609
- match tokenizer. consume_char ( ) {
610
- '"' if !single_quote => break ,
611
- '\'' if single_quote => break ,
612
- '\\' => {
610
+ match tokenizer. consume_byte ( ) {
611
+ b '"' if !single_quote => break ,
612
+ b '\'' if single_quote => break ,
613
+ b '\\' => {
613
614
if !tokenizer. is_eof ( ) {
614
615
match tokenizer. next_byte_unchecked ( ) {
615
616
// Escaped newline
@@ -620,16 +621,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
620
621
tokenizer. advance ( 1 ) ;
621
622
}
622
623
}
623
- _ => string . push ( consume_escape ( tokenizer) )
624
+ _ => consume_escape_and_write ( tokenizer, & mut string_bytes )
624
625
}
625
626
}
626
627
// else: escaped EOF, do nothing.
627
628
}
628
- '\0' => string. push ( '\u{FFFD}' ) ,
629
- c => string. push ( c) ,
629
+ b'\0' => {
630
+ // string.push('\u{FFFD}'),
631
+ string_bytes. push ( 0xef ) ;
632
+ string_bytes. push ( 0xbf ) ;
633
+ string_bytes. push ( 0xbd ) ;
634
+ }
635
+ c => string_bytes. push ( c) ,
630
636
}
631
637
}
632
- Ok ( Owned ( string) )
638
+
639
+ Ok ( Owned ( to_utf8 ( string_bytes) ) )
633
640
}
634
641
635
642
@@ -650,7 +657,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
650
657
651
658
fn consume_ident_like < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
652
659
let value = consume_name ( tokenizer) ;
653
- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '(' {
660
+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '(' {
654
661
tokenizer. advance ( 1 ) ;
655
662
if value. eq_ignore_ascii_case ( "url" ) {
656
663
consume_unquoted_url ( tokenizer) . unwrap_or ( Function ( value) )
@@ -668,42 +675,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
668
675
669
676
fn consume_name < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Cow < ' a , str > {
670
677
let start_pos = tokenizer. position ( ) ;
671
- let mut value ;
678
+ let mut value_bytes ;
672
679
loop {
673
680
if tokenizer. is_eof ( ) {
674
681
return Borrowed ( tokenizer. slice_from ( start_pos) )
675
682
}
676
- match tokenizer. next_char ( ) {
677
- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => tokenizer. advance ( 1 ) ,
678
- '\\' | '\0' => {
679
- value = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
683
+ match tokenizer. next_byte_unchecked ( ) {
684
+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => tokenizer. advance ( 1 ) ,
685
+ b '\\' | b '\0' => {
686
+ value_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
680
687
break
681
688
}
682
689
c if c. is_ascii ( ) => return Borrowed ( tokenizer. slice_from ( start_pos) ) ,
683
690
_ => {
684
- tokenizer. consume_char ( ) ;
691
+ tokenizer. advance ( 1 ) ;
685
692
}
686
693
}
687
694
}
688
695
689
696
while !tokenizer. is_eof ( ) {
690
- let c = tokenizer. next_char ( ) ;
691
- value . push ( match c {
692
- 'a' ...'z' | 'A' ...'Z' | '0' ...'9' | '_' | '-' => {
697
+ let c = tokenizer. next_byte_unchecked ( ) ;
698
+ match c {
699
+ b 'a' ...b 'z' | b 'A' ...b 'Z' | b '0' ...b '9' | b '_' | b '-' => {
693
700
tokenizer. advance ( 1 ) ;
694
- c
701
+ value_bytes . push ( c )
695
702
}
696
- '\\' => {
703
+ b '\\' => {
697
704
if tokenizer. has_newline_at ( 1 ) { break }
698
705
tokenizer. advance ( 1 ) ;
699
- consume_escape ( tokenizer)
706
+ consume_escape_and_write ( tokenizer, & mut value_bytes )
700
707
}
701
- '\0' => { tokenizer. advance ( 1 ) ; '\u{FFFD}' } ,
708
+ b'\0' => {
709
+ tokenizer. advance ( 1 ) ;
710
+ // value.push('\u{FFFD}')
711
+ value_bytes. push ( 0xef ) ;
712
+ value_bytes. push ( 0xbf ) ;
713
+ value_bytes. push ( 0xbd ) ;
714
+ } ,
702
715
c if c. is_ascii ( ) => break ,
703
- _ => tokenizer. consume_char ( ) ,
704
- } )
716
+ other => {
717
+ tokenizer. advance ( 1 ) ;
718
+ value_bytes. push ( other)
719
+ }
720
+ }
705
721
}
706
- Owned ( value )
722
+ Owned ( to_utf8 ( value_bytes ) )
707
723
}
708
724
709
725
@@ -825,7 +841,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
825
841
}
826
842
827
843
844
// Converts a byte buffer assembled by the tokenizer into a `String`.
//
// Callers must only pass bytes that form valid UTF-8. With debug assertions
// enabled the buffer is validated and an invalid buffer panics; without them
// the validation is skipped entirely.
#[inline]
fn to_utf8(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes)
            .expect("tokenizer byte buffer must be valid UTF-8")
    } else {
        // SAFETY: relies on the caller contract above -- the buffer holds only
        // valid UTF-8. Debug builds check this same invariant explicitly.
        unsafe {
            String::from_utf8_unchecked(string_bytes)
        }
    }
}
854
+
828
855
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Result < Token < ' a > , ( ) > {
856
+
829
857
for ( offset, c) in tokenizer. input [ tokenizer. position ..] . as_bytes ( ) . iter ( ) . cloned ( ) . enumerate ( ) {
830
858
match c {
831
859
b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => { } ,
@@ -845,7 +873,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
845
873
846
874
fn consume_unquoted_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
847
875
let start_pos = tokenizer. position ( ) ;
848
- let mut string ;
876
+ let mut string_bytes : Vec < u8 > ;
849
877
loop {
850
878
if tokenizer. is_eof ( ) {
851
879
return UnquotedUrl ( Borrowed ( tokenizer. slice_from ( start_pos) ) )
@@ -867,7 +895,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
867
895
return consume_bad_url ( tokenizer)
868
896
} ,
869
897
b'\\' | b'\0' => {
870
- string = tokenizer. slice_from ( start_pos) . to_owned ( ) ;
898
+ string_bytes = tokenizer. slice_from ( start_pos) . as_bytes ( ) . to_owned ( ) ;
871
899
break
872
900
}
873
901
_ => {
@@ -876,32 +904,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
876
904
}
877
905
}
878
906
while !tokenizer. is_eof ( ) {
879
- let next_char = match tokenizer. consume_char ( ) {
880
- ' ' | '\t' | '\n' | '\r' | '\x0C' => {
881
- return consume_url_end ( tokenizer, Owned ( string ) )
907
+ match tokenizer. consume_byte ( ) {
908
+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => {
909
+ return consume_url_end ( tokenizer, Owned ( to_utf8 ( string_bytes ) ) ) ;
882
910
}
883
- ')' => break ,
884
- '\x01' ...'\x08' | '\x0B' | '\x0E' ...'\x1F' | '\x7F' // non-printable
885
- | '"' | '\'' | '(' => return consume_bad_url ( tokenizer) ,
886
- '\\' => {
911
+ b ')' => break ,
912
+ b '\x01' ...b '\x08' | b '\x0B' | b '\x0E' ...b '\x1F' | b '\x7F' // non-printable
913
+ | b '"' | b '\'' | b '(' => return consume_bad_url ( tokenizer) ,
914
+ b '\\' => {
887
915
if tokenizer. has_newline_at ( 0 ) {
888
916
return consume_bad_url ( tokenizer)
889
917
}
890
- consume_escape ( tokenizer)
918
+
919
+ consume_escape_and_write ( tokenizer, & mut string_bytes)
891
920
} ,
892
- '\0' => '\u{FFFD}' ,
893
- c => c
894
- } ;
895
- string. push ( next_char)
921
+ b'\0' => {
922
+ // string.push('\u{FFFD}');
923
+ string_bytes. push ( 0xef ) ;
924
+ string_bytes. push ( 0xbf ) ;
925
+ string_bytes. push ( 0xbd ) ;
926
+ }
927
+ c => string_bytes. push ( c)
928
+ }
896
929
}
897
- UnquotedUrl ( Owned ( string ) )
930
+ UnquotedUrl ( Owned ( to_utf8 ( string_bytes ) ) )
898
931
}
899
932
900
933
fn consume_url_end < ' a > ( tokenizer : & mut Tokenizer < ' a > , string : Cow < ' a , str > ) -> Token < ' a > {
901
934
while !tokenizer. is_eof ( ) {
902
- match tokenizer. consume_char ( ) {
903
- ' ' | '\t' | '\n' | '\r' | '\x0C' => ( ) ,
904
- ')' => break ,
935
+ match tokenizer. consume_byte ( ) {
936
+ b ' ' | b '\t' | b '\n' | b '\r' | b '\x0C' => ( ) ,
937
+ b ')' => break ,
905
938
_ => return consume_bad_url ( tokenizer)
906
939
}
907
940
}
@@ -911,9 +944,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
911
944
fn consume_bad_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
912
945
// Consume up to the closing )
913
946
while !tokenizer. is_eof ( ) {
914
- match tokenizer. consume_char ( ) {
915
- ')' => break ,
916
- '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
947
+ match tokenizer. consume_byte ( ) {
948
+ b ')' => break ,
949
+ b '\\' => tokenizer. advance ( 1 ) , // Skip an escaped ')' or '\'
917
950
_ => ( )
918
951
}
919
952
}
@@ -972,20 +1005,31 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
972
1005
}
973
1006
974
1007
1008
+ // Same constraints as consume_escape except it writes into `bytes` the result
1009
+ // instead of returning it.
1010
+ //
1011
+ // TODO: This could be made more efficient with char::encode_utf8, I guess.
1012
+ fn consume_escape_and_write ( tokenizer : & mut Tokenizer , bytes : & mut Vec < u8 > ) {
1013
+ let orig_bytes = mem:: replace ( bytes, vec ! [ ] ) ;
1014
+ let mut s = to_utf8 ( orig_bytes) ;
1015
+ s. push ( consume_escape ( tokenizer) ) ;
1016
+ mem:: replace ( bytes, s. into_bytes ( ) ) ;
1017
+ }
1018
+
975
1019
// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
976
1020
// and that the next input character has already been verified
977
1021
// to not be a newline.
978
1022
fn consume_escape ( tokenizer : & mut Tokenizer ) -> char {
979
1023
if tokenizer. is_eof ( ) { return '\u{FFFD}' } // Escaped EOF
980
- match tokenizer. next_char ( ) {
981
- '0' ...'9' | 'A' ...'F' | 'a' ...'f' => {
1024
+ match tokenizer. next_byte_unchecked ( ) {
1025
+ b '0' ...b '9' | b 'A' ...b 'F' | b 'a' ...b 'f' => {
982
1026
let ( c, _) = consume_hex_digits ( tokenizer) ;
983
1027
if !tokenizer. is_eof ( ) {
984
- match tokenizer. next_char ( ) {
985
- ' ' | '\t' | '\n' | '\x0C' => tokenizer. advance ( 1 ) ,
986
- '\r' => {
1028
+ match tokenizer. next_byte_unchecked ( ) {
1029
+ b ' ' | b '\t' | b '\n' | b '\x0C' => tokenizer. advance ( 1 ) ,
1030
+ b '\r' => {
987
1031
tokenizer. advance ( 1 ) ;
988
- if !tokenizer. is_eof ( ) && tokenizer. next_char ( ) == '\n' {
1032
+ if !tokenizer. is_eof ( ) && tokenizer. next_byte_unchecked ( ) == b '\n' {
989
1033
tokenizer. advance ( 1 ) ;
990
1034
}
991
1035
}
@@ -1000,7 +1044,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
1000
1044
REPLACEMENT_CHAR
1001
1045
}
1002
1046
} ,
1003
- '\0' => {
1047
+ b '\0' => {
1004
1048
tokenizer. advance ( 1 ) ;
1005
1049
'\u{FFFD}'
1006
1050
}
0 commit comments