
Commit db2a1a4

tokenizer: Do the same thing as the last commits in a bunch more places.

If this is still too slow, the other option would be to use a lookup table to scan the properties of the given byte in all these hot loops.
1 parent fd86875 commit db2a1a4
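
For reference, a minimal sketch of that lookup-table idea (not part of this commit; the names BYTE_PROPS, NAME_CHAR and is_name_byte are hypothetical, and the const-block initializer assumes a much newer Rust than this commit targeted): precompute one property byte per possible input byte, so each hot loop does a single indexed load and flag test instead of a multi-arm match.

    // Hypothetical property flags, one bit per byte class the tokenizer tests.
    const NAME_CHAR: u8 = 1 << 0;   // a-z, A-Z, 0-9, '_', '-'
    const WHITESPACE: u8 = 1 << 1;  // space, \t, \n, \r, \x0C

    // One property byte per possible input byte, built at compile time.
    static BYTE_PROPS: [u8; 256] = {
        let mut table = [0u8; 256];
        let mut i = 0;
        while i < 256 {
            let b = i as u8;
            if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z')
                || (b >= b'0' && b <= b'9') || b == b'_' || b == b'-'
            {
                table[i] |= NAME_CHAR;
            }
            if b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' || b == b'\x0C' {
                table[i] |= WHITESPACE;
            }
            i += 1;
        }
        table
    };

    #[inline]
    fn is_name_byte(b: u8) -> bool {
        (BYTE_PROPS[b as usize] & NAME_CHAR) != 0
    }

With something like this, the first loop in consume_name below could do a single is_name_byte test per byte instead of re-matching the whole range pattern.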

1 file changed: +99 -55 lines

src/tokenizer.rs

Lines changed: 99 additions & 55 deletions
@@ -11,6 +11,7 @@ use std::ascii::AsciiExt;
 use std::borrow::{Cow, ToOwned};
 use std::borrow::Cow::{Owned, Borrowed};
 use std::i32;
+use std::mem;
 
 use self::Token::*;
 
@@ -575,7 +576,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                              -> Result<Cow<'a, str>, ()> {
     tokenizer.advance(1); // Skip the initial quote
     let start_pos = tokenizer.position();
-    let mut string;
+    let mut string_bytes;
     loop {
         if tokenizer.is_eof() {
             return Ok(Borrowed(tokenizer.slice_from(start_pos)))
@@ -592,7 +593,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 return Ok(Borrowed(value))
             }
             b'\\' | b'\0' => {
-                string = tokenizer.slice_from(start_pos).to_owned();
+                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             b'\n' | b'\r' | b'\x0C' => return Err(()),
@@ -606,10 +607,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
             return Err(());
         }
-        match tokenizer.consume_char() {
-            '"' if !single_quote => break,
-            '\'' if single_quote => break,
-            '\\' => {
+        match tokenizer.consume_byte() {
+            b'"' if !single_quote => break,
+            b'\'' if single_quote => break,
+            b'\\' => {
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -620,16 +621,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                                 tokenizer.advance(1);
                             }
                         }
-                        _ => string.push(consume_escape(tokenizer))
+                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                     }
                 }
                 // else: escaped EOF, do nothing.
             }
-            '\0' => string.push('\u{FFFD}'),
-            c => string.push(c),
+            b'\0' => {
+                // string.push('\u{FFFD}'),
+                string_bytes.push(0xef);
+                string_bytes.push(0xbf);
+                string_bytes.push(0xbd);
+            }
+            c => string_bytes.push(c),
         }
     }
-    Ok(Owned(string))
+
+    Ok(Owned(to_utf8(string_bytes)))
 }
 
 
@@ -650,7 +657,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
 
 fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
     let value = consume_name(tokenizer);
-    if !tokenizer.is_eof() && tokenizer.next_char() == '(' {
+    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
         tokenizer.advance(1);
         if value.eq_ignore_ascii_case("url") {
             consume_unquoted_url(tokenizer).unwrap_or(Function(value))
@@ -668,42 +675,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 
 fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
     let start_pos = tokenizer.position();
-    let mut value;
+    let mut value_bytes;
     loop {
         if tokenizer.is_eof() {
             return Borrowed(tokenizer.slice_from(start_pos))
         }
-        match tokenizer.next_char() {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => tokenizer.advance(1),
-            '\\' | '\0' => {
-                value = tokenizer.slice_from(start_pos).to_owned();
+        match tokenizer.next_byte_unchecked() {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => tokenizer.advance(1),
+            b'\\' | b'\0' => {
+                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             c if c.is_ascii() => return Borrowed(tokenizer.slice_from(start_pos)),
             _ => {
-                tokenizer.consume_char();
+                tokenizer.advance(1);
             }
         }
     }
 
     while !tokenizer.is_eof() {
-        let c = tokenizer.next_char();
-        value.push(match c {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => {
+        let c = tokenizer.next_byte_unchecked();
+        match c {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => {
                 tokenizer.advance(1);
-                c
+                value_bytes.push(c)
             }
-            '\\' => {
+            b'\\' => {
                 if tokenizer.has_newline_at(1) { break }
                 tokenizer.advance(1);
-                consume_escape(tokenizer)
+                consume_escape_and_write(tokenizer, &mut value_bytes)
             }
-            '\0' => { tokenizer.advance(1); '\u{FFFD}' },
+            b'\0' => {
                tokenizer.advance(1);
+                // value.push('\u{FFFD}')
+                value_bytes.push(0xef);
+                value_bytes.push(0xbf);
+                value_bytes.push(0xbd);
+            },
             c if c.is_ascii() => break,
-            _ => tokenizer.consume_char(),
-        })
+            other => {
+                tokenizer.advance(1);
+                value_bytes.push(other)
+            }
+        }
     }
-    Owned(value)
+    Owned(to_utf8(value_bytes))
 }
 
 
@@ -825,7 +841,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 }
 
 
+#[inline]
+fn to_utf8(string_bytes: Vec<u8>) -> String {
+    if cfg!(debug_assertions) {
+        String::from_utf8(string_bytes).unwrap()
+    } else {
+        unsafe {
+            String::from_utf8_unchecked(string_bytes)
+        }
+    }
+}
+
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+
     for (offset, c) in tokenizer.input[tokenizer.position..].as_bytes().iter().cloned().enumerate() {
         match c {
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
@@ -845,7 +873,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
 
     fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         let start_pos = tokenizer.position();
-        let mut string;
+        let mut string_bytes: Vec<u8>;
         loop {
             if tokenizer.is_eof() {
                 return UnquotedUrl(Borrowed(tokenizer.slice_from(start_pos)))
@@ -867,7 +895,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     return consume_bad_url(tokenizer)
                 },
                 b'\\' | b'\0' => {
-                    string = tokenizer.slice_from(start_pos).to_owned();
+                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
                 _ => {
@@ -876,32 +904,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         }
         while !tokenizer.is_eof() {
-            let next_char = match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => {
-                    return consume_url_end(tokenizer, Owned(string))
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
+                    return consume_url_end(tokenizer, Owned(to_utf8(string_bytes)));
                 }
-                ')' => break,
-                '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F' // non-printable
-                | '"' | '\'' | '(' => return consume_bad_url(tokenizer),
-                '\\' => {
+                b')' => break,
+                b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F' // non-printable
+                | b'"' | b'\'' | b'(' => return consume_bad_url(tokenizer),
+                b'\\' => {
                     if tokenizer.has_newline_at(0) {
                         return consume_bad_url(tokenizer)
                     }
-                    consume_escape(tokenizer)
+
+                    consume_escape_and_write(tokenizer, &mut string_bytes)
                 },
-                '\0' => '\u{FFFD}',
-                c => c
-            };
-            string.push(next_char)
+                b'\0' => {
+                    // string.push('\u{FFFD}');
+                    string_bytes.push(0xef);
+                    string_bytes.push(0xbf);
+                    string_bytes.push(0xbd);
+                }
+                c => string_bytes.push(c)
+            }
         }
-        UnquotedUrl(Owned(string))
+        UnquotedUrl(Owned(to_utf8(string_bytes)))
     }
 
     fn consume_url_end<'a>(tokenizer: &mut Tokenizer<'a>, string: Cow<'a, str>) -> Token<'a> {
         while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => (),
-                ')' => break,
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => (),
+                b')' => break,
                 _ => return consume_bad_url(tokenizer)
             }
         }
@@ -911,9 +944,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
     fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // Consume up to the closing )
         while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ')' => break,
-                '\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
+            match tokenizer.consume_byte() {
+                b')' => break,
+                b'\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
                 _ => ()
             }
         }
@@ -972,20 +1005,31 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
 }
 
 
+// Same constraints as consume_escape except it writes into `bytes` the result
+// instead of returning it.
+//
+// TODO: This could be made more efficient with char::encode_utf8, I guess.
+fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
+    let orig_bytes = mem::replace(bytes, vec![]);
+    let mut s = to_utf8(orig_bytes);
+    s.push(consume_escape(tokenizer));
+    mem::replace(bytes, s.into_bytes());
+}
+
 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
 // and that the next input character has already been verified
 // to not be a newline.
 fn consume_escape(tokenizer: &mut Tokenizer) -> char {
     if tokenizer.is_eof() { return '\u{FFFD}' } // Escaped EOF
-    match tokenizer.next_char() {
-        '0'...'9' | 'A'...'F' | 'a'...'f' => {
+    match tokenizer.next_byte_unchecked() {
+        b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\x0C' => tokenizer.advance(1),
-                    '\r' => {
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                    b'\r' => {
                         tokenizer.advance(1);
-                        if !tokenizer.is_eof() && tokenizer.next_char() == '\n' {
+                        if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
                     }
@@ -1000,7 +1044,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
                 REPLACEMENT_CHAR
             }
         },
-        '\0' => {
+        b'\0' => {
             tokenizer.advance(1);
             '\u{FFFD}'
         }
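
A note on the repeated 0xef/0xbf/0xbd pushes and on the TODO above consume_escape_and_write: those three bytes are exactly the UTF-8 encoding of U+FFFD, and char::encode_utf8 (the method the TODO names) can produce them directly, skipping the String round-trip. A self-contained check, not part of the commit:

    fn main() {
        // U+FFFD encodes to EF BF BD in UTF-8, matching the bytes pushed by
        // hand in the diff above.
        let mut buf = [0u8; 4];
        let encoded = '\u{FFFD}'.encode_utf8(&mut buf).as_bytes();
        assert_eq!(encoded, [0xEF, 0xBF, 0xBD]);

        // What the TODO suggests: append the encoded bytes directly instead
        // of converting the accumulated buffer to a String and back.
        let mut bytes: Vec<u8> = Vec::new();
        bytes.extend_from_slice(encoded);
        assert_eq!(String::from_utf8(bytes).unwrap(), "\u{FFFD}");
    }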
