
Commit 32d69b7

Introduce specialized advancing functions and use them universally
1 parent: c7c9356

2 files changed: 114 additions, 33 deletions


src/parser.rs

Lines changed: 1 addition & 0 deletions
@@ -811,6 +811,7 @@ pub fn parse_until_after<'i: 't, 't, F, T, E>(parser: &mut Parser<'i, 't>,
     let next_byte = (parser.input.tokenizer).next_byte();
     if next_byte.is_some() && !parser.stop_before.contains(Delimiters::from_byte(next_byte)) {
         debug_assert!(delimiters.contains(Delimiters::from_byte(next_byte)));
+        // We know this byte is ASCII.
         (parser.input.tokenizer).advance(1);
         if next_byte == Some(b'{') {
             consume_until_end_of_block(BlockType::CurlyBracket, &mut parser.input.tokenizer);
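
Note: the new comment is what keeps this call site compatible with the stricter `advance` introduced in `tokenizer.rs` below: any byte matched by `Delimiters::from_byte` is a single ASCII delimiter, so a one-byte advance can neither split a UTF-8 sequence nor skip newline bookkeeping. A minimal standalone check (the delimiter set listed is illustrative, not the crate's exact `Delimiters` definition):

    // Illustrative delimiter bytes of the kind Delimiters::from_byte matches;
    // the exact set is an assumption, but such delimiters are all ASCII.
    fn main() {
        let delimiter_bytes = [b'{', b'}', b';', b'!', b','];
        // ASCII and not a newline: exactly what the new debug assertions
        // in Tokenizer::advance require.
        assert!(delimiter_bytes.iter().all(|b| b.is_ascii()));
        assert!(delimiter_bytes.iter().all(|&b| b != b'\n' && b != b'\r' && b != b'\x0C'));
    }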

src/tokenizer.rs

Lines changed: 113 additions & 33 deletions
@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn has_at_least(&self, n: usize) -> bool { self.position + n < self.input.len() }

+    // Advance over N bytes in the input. This function can advance
+    // over ASCII bytes (excluding newlines), or UTF-8 sequence
+    // leaders (excluding leaders for 4-byte sequences).
     #[inline]
-    pub fn advance(&mut self, n: usize) { self.position += n }
+    pub fn advance(&mut self, n: usize) {
+        if cfg!(debug_assertions) {
+            // Each byte must either be an ASCII byte or a sequence
+            // leader, but not a 4-byte leader; also newlines are
+            // rejected.
+            for i in 0..n {
+                let b = self.byte_at(i);
+                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+            }
+        }
+        self.position += n
+    }

     // Assumes non-EOF
     #[inline]
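
Note: the assertions encode the standard UTF-8 byte classes. ASCII bytes are 0x00–0x7F; continuation bytes match 10xxxxxx (`b & 0xC0 == 0x80`); leaders of 4-byte sequences match 11110xxx, tested here with the looser mask `b & 0xF0 == 0xF0` (which also sweeps in the invalid 0xF8–0xFF range). A standalone sketch of the same predicate (illustration only, not the crate's API):

    // Mirrors the debug assertions in Tokenizer::advance above.
    fn advance_would_accept(b: u8) -> bool {
        let not_4byte_leader = b & 0xF0 != 0xF0; // rejects 11110xxx leaders
        let not_continuation = b & 0xC0 != 0x80; // rejects 10xxxxxx bytes
        let not_newline = b != b'\r' && b != b'\n' && b != b'\x0C';
        (b.is_ascii() || (not_4byte_leader && not_continuation)) && not_newline
    }

    fn main() {
        assert!(advance_would_accept(b'a'));   // plain ASCII
        assert!(advance_would_accept(0xC3));   // 2-byte leader (first byte of 'é')
        assert!(advance_would_accept(0xE2));   // 3-byte leader (first byte of '€')
        assert!(!advance_would_accept(0x9F));  // continuation byte
        assert!(!advance_would_accept(0xF0));  // 4-byte leader (first byte of '😀')
        assert!(!advance_would_accept(b'\n')); // newlines must use consume_newline
    }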
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
         self.input.as_bytes()[self.position + offset]
     }

+    // Advance over a single byte; the byte must be a UTF-8 sequence
+    // leader for a 4-byte sequence.
+    #[inline]
+    fn consume_4byte_intro(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+        self.position += 1;
+    }
+
+    // Advance over a single byte; the byte must be a UTF-8
+    // continuation byte.
     #[inline]
-    fn consume_byte(&mut self) -> u8 {
+    fn consume_continuation_byte(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+        self.position += 1;
+    }
+
+    // Advance over any kind of byte, excluding newlines.
+    #[inline(never)]
+    fn consume_known_byte(&mut self, byte: u8) {
+        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
         self.position += 1;
-        self.input.as_bytes()[self.position - 1]
     }

     #[inline]
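
Note: the three new functions have identical bodies today (`self.position += 1`); the split only records, at every call site, which byte class is being stepped over. One hedged guess at why that classification is worth having (an assumption, not something this commit states): the byte class alone determines how a column counter in UTF-16 code units would move, since continuation bytes contribute nothing and a 4-byte sequence becomes a surrogate pair:

    // Assumed illustration: a UTF-16 column count driven purely by the byte
    // classes that consume_continuation_byte / consume_4byte_intro / advance
    // distinguish. Not taken from this commit.
    fn utf16_units(s: &str) -> usize {
        let mut units = 0;
        for &b in s.as_bytes() {
            match b {
                0x80..=0xBF => {}          // continuation byte: 0 units
                0xF0..=0xFF => units += 2, // 4-byte leader: surrogate pair
                _ => units += 1,           // ASCII or 2-/3-byte leader: 1 unit
            }
        }
        units
    }

    fn main() {
        assert_eq!(utf16_units("héllo"), "héllo".encode_utf16().count()); // 5
        assert_eq!(utf16_units("a😀b"), "a😀b".encode_utf16().count());   // 4
    }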
@@ -667,6 +699,8 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
                 tokenizer.advance(1);
             }
@@ -703,13 +737,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\'' => {
                 if single_quote {
                     let value = tokenizer.slice_from(start_pos);
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\\' | b'\0' => {
                 // * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +759,37 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
             b'\n' | b'\r' | b'\x0C' => {
                 return Err(tokenizer.slice_from(start_pos).into())
             },
-            _ => {}
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => { tokenizer.advance(1); }
         }
-        tokenizer.consume_byte();
     }

     while !tokenizer.is_eof() {
-        if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(
-                // string_bytes is well-formed UTF-8, see other comments.
-                unsafe {
-                    from_utf8_release_unchecked(string_bytes)
-                }.into()
-            );
-        }
-        let b = tokenizer.consume_byte();
+        let b = tokenizer.next_byte_unchecked();
         match_byte! { b,
+            b'\n' | b'\r' | b'\x0C' => {
+                return Err(
+                    // string_bytes is well-formed UTF-8, see other comments.
+                    unsafe {
+                        from_utf8_release_unchecked(string_bytes)
+                    }.into()
+                );
+            }
             b'"' => {
+                tokenizer.advance(1);
                 if !single_quote {
                     break;
                 }
             }
             b'\'' => {
+                tokenizer.advance(1);
                 if single_quote {
                     break;
                 }
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -764,10 +804,13 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 continue;
             }
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
                 continue;
             }
-            _ => {},
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => { tokenizer.advance(1); },
         }

         // If this byte is part of a multi-byte code point,
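
Note the loop restructuring in consume_quoted_string: the old code consumed a byte up front (`let b = tokenizer.consume_byte();`) and had to check for terminators before consuming, while the new code peeks with `next_byte_unchecked()` and lets each arm advance itself with the advancer for its byte class. The same reshaping removes a rewind (`tokenizer.position -= 1;`) in `consume_unquoted_url` below. A stand-in sketch of the two shapes (not the crate's code):

    // Before: consume up front, then rewind when the byte belongs to the caller.
    fn scan_consume_first(bytes: &[u8]) -> usize {
        let mut pos = 0;
        while pos < bytes.len() {
            let b = bytes[pos];
            pos += 1;                        // consumed before classifying
            if b == b' ' { return pos - 1; } // must back up for the caller
        }
        pos
    }

    // After: peek, then each arm advances by exactly what it handled.
    fn scan_peek_first(bytes: &[u8]) -> usize {
        let mut pos = 0;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' => return pos, // terminator left unconsumed; no rewind
                _ => pos += 1,      // in the real code: the per-class advancer
            }
        }
        pos
    }

    fn main() {
        assert_eq!(scan_consume_first(b"url rest"), scan_peek_first(b"url rest")); // both 3
    }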
@@ -835,11 +878,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xC0'...b'\xEF' => { tokenizer.advance(1); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             b => {
-                if b.is_ascii() {
-                    return tokenizer.slice_from(start_pos).into();
-                }
-                tokenizer.advance(1);
+                return tokenizer.slice_from(start_pos).into();
             }
         }
     }
@@ -861,15 +904,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 tokenizer.advance(1);
                 value_bytes.extend("\u{FFFD}".as_bytes());
             },
-            _ => {
-                if b.is_ascii() {
-                    break;
-                }
-                tokenizer.advance(1);
+            b'\x80'...b'\xBF' => {
+                // This byte *is* part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                value_bytes.push(b)
+            }
+            b'\xC0'...b'\xEF' => {
                 // This byte *is* part of a multi-byte code point,
                 // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.advance(1);
+                value_bytes.push(b)
+            }
+            b'\xF0'...b'\xFF' => {
+                tokenizer.consume_4byte_intro();
                 value_bytes.push(b)
             }
+            _ => {
+                // ascii
+                break;
+            }
         }
     }
     // string_bytes is well-formed UTF-8, see other comments.
@@ -1048,11 +1102,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
             b'"' | b'\'' => { return Err(()) }, // Do not advance
             b')' => {
-                tokenizer.advance(offset + 1);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset + 1;
                 break
             }
             _ => {
-                tokenizer.advance(offset);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset;
                 found_printable_char = true;
                 break
             }
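
Note: these two sites bump `tokenizer.position` directly because the span being skipped was measured by the enclosing scan and may contain newlines, which `advance` now rejects in debug builds. A standalone restatement of the check that would fire (illustration, not crate code):

    // The per-byte newline check advance() performs, applied to a skipped span.
    fn span_passes_advance_assert(bytes: &[u8]) -> bool {
        bytes.iter().all(|&b| b != b'\r' && b != b'\n' && b != b'\x0C')
    }

    fn main() {
        // In input like "url(\n  foo)", the whitespace span before `foo`
        // contains a newline, so advance(offset) would hit the debug assert;
        // writing to position directly sidesteps it.
        assert!(!span_passes_advance_assert(b"\n  "));
        assert!(span_passes_advance_assert(b"   "));
    }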
@@ -1104,27 +1162,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            // ASCII or other leading byte.
             _ => {
                 tokenizer.advance(1);
             }
         }
     }
     while !tokenizer.is_eof() {
-        match_byte! { tokenizer.consume_byte(),
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                 // string_bytes is well-formed UTF-8, see other comments.
                 let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
-                tokenizer.position -= 1;
                 return consume_url_end(tokenizer, start_pos, string)
             }
             b')' => {
+                tokenizer.advance(1);
                 break;
             }
             b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F' // non-printable
             | b'"' | b'\'' | b'(' => {
+                tokenizer.advance(1);
                 return consume_bad_url(tokenizer, start_pos);
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if tokenizer.has_newline_at(0) {
                     return consume_bad_url(tokenizer, start_pos)
                 }
@@ -1133,11 +1197,27 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 consume_escape_and_write(tokenizer, &mut string_bytes)
             },
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
             }
+            b'\x80'...b'\xBF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                string_bytes.push(b);
+            }
+            b'\xF0'...b'\xFF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_4byte_intro();
+                string_bytes.push(b);
+            }
             // If this byte is part of a multi-byte code point,
             // we’ll end up copying the whole code point before this loop does something else.
-            b => { string_bytes.push(b) }
+            b => {
+                tokenizer.advance(1);
+                string_bytes.push(b)
+            }
         }
     }
     UnquotedUrl(
@@ -1160,8 +1240,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
                 return consume_bad_url(tokenizer, start_pos);
             }
         }
@@ -1186,8 +1266,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
             }
         }
     }
