
Commit db2a1a4

tokenizer: Do the same thing as the last commits in a bunch more places.

If this is still too slow, the other option would be to use a lookup table to scan the properties of the given byte in all these hot loops.
1 parent fd86875 commit db2a1a4
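
For reference, a minimal sketch of that lookup-table idea (not part of this commit; the names BYTE_PROPS, NAME_CHAR and is_name_byte are hypothetical, and the const-block initializer assumes a much newer Rust than this commit targeted): precompute one property byte per possible input byte, so each hot loop does a single indexed load and flag test instead of a multi-arm match.

    // Hypothetical property flags, one bit per byte class the tokenizer tests.
    const NAME_CHAR: u8 = 1 << 0;   // a-z, A-Z, 0-9, '_', '-'
    const WHITESPACE: u8 = 1 << 1;  // space, \t, \n, \r, \x0C

    // One property byte per possible input byte, built at compile time.
    static BYTE_PROPS: [u8; 256] = {
        let mut table = [0u8; 256];
        let mut i = 0;
        while i < 256 {
            let b = i as u8;
            if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z')
                || (b >= b'0' && b <= b'9') || b == b'_' || b == b'-'
            {
                table[i] |= NAME_CHAR;
            }
            if b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' || b == b'\x0C' {
                table[i] |= WHITESPACE;
            }
            i += 1;
        }
        table
    };

    #[inline]
    fn is_name_byte(b: u8) -> bool {
        (BYTE_PROPS[b as usize] & NAME_CHAR) != 0
    }

With something like this, the first loop in consume_name below could do a single is_name_byte test per byte instead of re-matching the whole range pattern.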

1 file changed: +99 -55 lines

src/tokenizer.rs

Lines changed: 99 additions & 55 deletions
@@ -11,6 +11,7 @@ use std::ascii::AsciiExt;
 use std::borrow::{Cow, ToOwned};
 use std::borrow::Cow::{Owned, Borrowed};
 use std::i32;
+use std::mem;
 
 use self::Token::*;
 
@@ -575,7 +576,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                              -> Result<Cow<'a, str>, ()> {
     tokenizer.advance(1); // Skip the initial quote
     let start_pos = tokenizer.position();
-    let mut string;
+    let mut string_bytes;
     loop {
         if tokenizer.is_eof() {
             return Ok(Borrowed(tokenizer.slice_from(start_pos)))
@@ -592,7 +593,7 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 return Ok(Borrowed(value))
             }
             b'\\' | b'\0' => {
-                string = tokenizer.slice_from(start_pos).to_owned();
+                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             b'\n' | b'\r' | b'\x0C' => return Err(()),
@@ -606,10 +607,10 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
             return Err(());
         }
-        match tokenizer.consume_char() {
-            '"' if !single_quote => break,
-            '\'' if single_quote => break,
-            '\\' => {
+        match tokenizer.consume_byte() {
+            b'"' if !single_quote => break,
+            b'\'' if single_quote => break,
+            b'\\' => {
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -620,16 +621,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                                 tokenizer.advance(1);
                             }
                         }
-                        _ => string.push(consume_escape(tokenizer))
+                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                     }
                 }
                 // else: escaped EOF, do nothing.
             }
-            '\0' => string.push('\u{FFFD}'),
-            c => string.push(c),
+            b'\0' => {
+                // string.push('\u{FFFD}'),
+                string_bytes.push(0xef);
+                string_bytes.push(0xbf);
+                string_bytes.push(0xbd);
+            }
+            c => string_bytes.push(c),
         }
     }
-    Ok(Owned(string))
+
+    Ok(Owned(to_utf8(string_bytes)))
 }
 
 
@@ -650,7 +657,7 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
 
 fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
     let value = consume_name(tokenizer);
-    if !tokenizer.is_eof() && tokenizer.next_char() == '(' {
+    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
         tokenizer.advance(1);
         if value.eq_ignore_ascii_case("url") {
             consume_unquoted_url(tokenizer).unwrap_or(Function(value))
@@ -668,42 +675,51 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 
 fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
     let start_pos = tokenizer.position();
-    let mut value;
+    let mut value_bytes;
     loop {
         if tokenizer.is_eof() {
             return Borrowed(tokenizer.slice_from(start_pos))
         }
-        match tokenizer.next_char() {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => tokenizer.advance(1),
-            '\\' | '\0' => {
-                value = tokenizer.slice_from(start_pos).to_owned();
+        match tokenizer.next_byte_unchecked() {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => tokenizer.advance(1),
+            b'\\' | b'\0' => {
+                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
             c if c.is_ascii() => return Borrowed(tokenizer.slice_from(start_pos)),
             _ => {
-                tokenizer.consume_char();
+                tokenizer.advance(1);
             }
         }
     }
 
     while !tokenizer.is_eof() {
-        let c = tokenizer.next_char();
-        value.push(match c {
-            'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => {
+        let c = tokenizer.next_byte_unchecked();
+        match c {
+            b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => {
                 tokenizer.advance(1);
-                c
+                value_bytes.push(c)
             }
-            '\\' => {
+            b'\\' => {
                 if tokenizer.has_newline_at(1) { break }
                 tokenizer.advance(1);
-                consume_escape(tokenizer)
+                consume_escape_and_write(tokenizer, &mut value_bytes)
             }
-            '\0' => { tokenizer.advance(1); '\u{FFFD}' },
+            b'\0' => {
                tokenizer.advance(1);
+                // value.push('\u{FFFD}')
+                value_bytes.push(0xef);
+                value_bytes.push(0xbf);
+                value_bytes.push(0xbd);
+            },
             c if c.is_ascii() => break,
-            _ => tokenizer.consume_char(),
-        })
+            other => {
+                tokenizer.advance(1);
+                value_bytes.push(other)
+            }
+        }
     }
-    Owned(value)
+    Owned(to_utf8(value_bytes))
 }
 
 
@@ -825,7 +841,19 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 }
 
 
+#[inline]
+fn to_utf8(string_bytes: Vec<u8>) -> String {
+    if cfg!(debug_assertions) {
+        String::from_utf8(string_bytes).unwrap()
+    } else {
+        unsafe {
+            String::from_utf8_unchecked(string_bytes)
+        }
+    }
+}
+
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+
     for (offset, c) in tokenizer.input[tokenizer.position..].as_bytes().iter().cloned().enumerate() {
         match c {
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
@@ -845,7 +873,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
 
     fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         let start_pos = tokenizer.position();
-        let mut string;
+        let mut string_bytes: Vec<u8>;
         loop {
             if tokenizer.is_eof() {
                 return UnquotedUrl(Borrowed(tokenizer.slice_from(start_pos)))
@@ -867,7 +895,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     return consume_bad_url(tokenizer)
                 },
                 b'\\' | b'\0' => {
-                    string = tokenizer.slice_from(start_pos).to_owned();
+                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
                 _ => {
@@ -876,32 +904,37 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
         }
         while !tokenizer.is_eof() {
-            let next_char = match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => {
-                    return consume_url_end(tokenizer, Owned(string))
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
+                    return consume_url_end(tokenizer, Owned(to_utf8(string_bytes)));
                 }
-                ')' => break,
-                '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F' // non-printable
-                | '"' | '\'' | '(' => return consume_bad_url(tokenizer),
-                '\\' => {
+                b')' => break,
+                b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F' // non-printable
+                | b'"' | b'\'' | b'(' => return consume_bad_url(tokenizer),
+                b'\\' => {
                     if tokenizer.has_newline_at(0) {
                         return consume_bad_url(tokenizer)
                     }
-                    consume_escape(tokenizer)
+
+                    consume_escape_and_write(tokenizer, &mut string_bytes)
                 },
-                '\0' => '\u{FFFD}',
-                c => c
-            };
-            string.push(next_char)
+                b'\0' => {
+                    // string.push('\u{FFFD}');
+                    string_bytes.push(0xef);
+                    string_bytes.push(0xbf);
+                    string_bytes.push(0xbd);
+                }
+                c => string_bytes.push(c)
+            }
         }
-        UnquotedUrl(Owned(string))
+        UnquotedUrl(Owned(to_utf8(string_bytes)))
     }
 
     fn consume_url_end<'a>(tokenizer: &mut Tokenizer<'a>, string: Cow<'a, str>) -> Token<'a> {
         while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ' ' | '\t' | '\n' | '\r' | '\x0C' => (),
-                ')' => break,
+            match tokenizer.consume_byte() {
+                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => (),
+                b')' => break,
                 _ => return consume_bad_url(tokenizer)
             }
         }
@@ -911,9 +944,9 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
     fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // Consume up to the closing )
         while !tokenizer.is_eof() {
-            match tokenizer.consume_char() {
-                ')' => break,
-                '\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
+            match tokenizer.consume_byte() {
+                b')' => break,
+                b'\\' => tokenizer.advance(1), // Skip an escaped ')' or '\'
                 _ => ()
             }
         }
@@ -972,20 +1005,31 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
 }
 
 
+// Same constraints as consume_escape except it writes into `bytes` the result
+// instead of returning it.
+//
+// TODO: This could be made more efficient with char::encode_utf8, I guess.
+fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
+    let orig_bytes = mem::replace(bytes, vec![]);
+    let mut s = to_utf8(orig_bytes);
+    s.push(consume_escape(tokenizer));
+    mem::replace(bytes, s.into_bytes());
+}
+
 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
 // and that the next input character has already been verified
 // to not be a newline.
 fn consume_escape(tokenizer: &mut Tokenizer) -> char {
     if tokenizer.is_eof() { return '\u{FFFD}' } // Escaped EOF
-    match tokenizer.next_char() {
-        '0'...'9' | 'A'...'F' | 'a'...'f' => {
+    match tokenizer.next_byte_unchecked() {
+        b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\x0C' => tokenizer.advance(1),
-                    '\r' => {
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                    b'\r' => {
                         tokenizer.advance(1);
-                        if !tokenizer.is_eof() && tokenizer.next_char() == '\n' {
+                        if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
                     }
@@ -1000,7 +1044,7 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
                 REPLACEMENT_CHAR
             }
         },
-        '\0' => {
+        b'\0' => {
             tokenizer.advance(1);
             '\u{FFFD}'
         }
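
A note on the repeated 0xef/0xbf/0xbd pushes and on the TODO above consume_escape_and_write: those three bytes are exactly the UTF-8 encoding of U+FFFD, and char::encode_utf8 (the method the TODO names) can produce them directly, skipping the String round-trip. A self-contained check, not part of the commit:

    fn main() {
        // U+FFFD encodes to EF BF BD in UTF-8, matching the bytes pushed by
        // hand in the diff above.
        let mut buf = [0u8; 4];
        let encoded = '\u{FFFD}'.encode_utf8(&mut buf).as_bytes();
        assert_eq!(encoded, [0xEF, 0xBF, 0xBD]);

        // What the TODO suggests: append the encoded bytes directly instead
        // of converting the accumulated buffer to a String and back.
        let mut bytes: Vec<u8> = Vec::new();
        bytes.extend_from_slice(encoded);
        assert_eq!(String::from_utf8(bytes).unwrap(), "\u{FFFD}");
    }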
