Commit 94bdfdc

Introduce specialized advancing functions and use them universally
1 parent 7e67b8f commit 94bdfdc

2 files changed: +122 −33 lines
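
The change in a nutshell: every byte the tokenizer steps over falls into one of a few classes, and each class now gets its own advancing function so that debug builds can assert the caller's assumption about the byte under the cursor. A standalone sketch of the classification (illustrative only, not code from this commit):

fn byte_class(b: u8) -> &'static str {
    match b {
        // Newlines must go through consume_newline() so line accounting stays right.
        b'\r' | b'\n' | b'\x0C' => "newline: consume_newline()",
        // 0b10xx_xxxx: a UTF-8 continuation byte.
        0x80..=0xBF => "continuation byte: consume_continuation_byte()",
        // 0b1111_xxxx: leader of a 4-byte UTF-8 sequence.
        0xF0..=0xFF => "4-byte sequence leader: consume_4byte_intro()",
        // Everything else: ASCII or a 2-/3-byte sequence leader.
        _ => "ASCII or 2-/3-byte leader: advance(1)",
    }
}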

src/parser.rs (+1 line)
@@ -811,6 +811,7 @@ pub fn parse_until_after<'i: 't, 't, F, T, E>(parser: &mut Parser<'i, 't>,
     let next_byte = (parser.input.tokenizer).next_byte();
     if next_byte.is_some() && !parser.stop_before.contains(Delimiters::from_byte(next_byte)) {
         debug_assert!(delimiters.contains(Delimiters::from_byte(next_byte)));
+        // We know this byte is ASCII.
         (parser.input.tokenizer).advance(1);
         if next_byte == Some(b'{') {
             consume_until_end_of_block(BlockType::CurlyBracket, &mut parser.input.tokenizer);

src/tokenizer.rs (+121 −33 lines)
@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn has_at_least(&self, n: usize) -> bool { self.position + n < self.input.len() }

+    // Advance over N bytes in the input. This function can advance
+    // over ASCII bytes (excluding newlines), or UTF-8 sequence
+    // leaders (excluding leaders for 4-byte sequences).
     #[inline]
-    pub fn advance(&mut self, n: usize) { self.position += n }
+    pub fn advance(&mut self, n: usize) {
+        if cfg!(debug_assertions) {
+            // Each byte must either be an ASCII byte or a sequence
+            // leader, but not a 4-byte leader; also newlines are
+            // rejected.
+            for i in 0..n {
+                let b = self.byte_at(i);
+                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+            }
+        }
+        self.position += n
+    }

     // Assumes non-EOF
     #[inline]
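
Worth noting: `cfg!(debug_assertions)` expands to a plain `bool` constant, so in release builds the validation loop above is dead code and `advance` still compiles down to the bare `self.position += n` it replaces. A minimal standalone mirror of the pattern (the `Cursor` type here is illustrative, not this crate's API):

struct Cursor<'a> { bytes: &'a [u8], position: usize }

impl<'a> Cursor<'a> {
    fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            for i in 0..n {
                let b = self.bytes[self.position + i];
                // Allow ASCII and 2-/3-byte UTF-8 leaders only; continuation
                // bytes, 4-byte leaders, and newlines have dedicated helpers.
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(!matches!(b, b'\r' | b'\n' | b'\x0C'));
            }
        }
        self.position += n;
    }
}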
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
         self.input.as_bytes()[self.position + offset]
     }

+    // Advance over a single byte; the byte must be a UTF-8 sequence
+    // leader for a 4-byte sequence.
+    #[inline]
+    fn consume_4byte_intro(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+        self.position += 1;
+    }
+
+    // Advance over a single byte; the byte must be a UTF-8
+    // continuation byte.
     #[inline]
-    fn consume_byte(&mut self) -> u8 {
+    fn consume_continuation_byte(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+        self.position += 1;
+    }
+
+    // Advance over any kind of byte, excluding newlines.
+    #[inline(never)]
+    fn consume_known_byte(&mut self, byte: u8) {
+        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
         self.position += 1;
-        self.input.as_bytes()[self.position - 1]
     }

     #[inline]
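
The assertions in these helpers encode the standard UTF-8 bit patterns: continuation bytes are `0b10xx_xxxx` (`b & 0xC0 == 0x80`) and 4-byte sequence leaders satisfy `b & 0xF0 == 0xF0`. A hypothetical sanity test over a plain byte slice (not this crate's private API) showing which helper each byte class maps to:

#[test]
fn utf8_byte_classes() {
    let b = "aé€😀".as_bytes(); // 1-, 2-, 3-, and 4-byte code points
    assert!(b[0].is_ascii());      // 'a': advance(1)
    assert_eq!(b[1] & 0xC0, 0xC0); // 'é' leader (0xC3): advance(1)
    assert_eq!(b[2] & 0xC0, 0x80); // its continuation: consume_continuation_byte()
    assert_eq!(b[3] & 0xF0, 0xE0); // '€' leader (0xE2): advance(1)
    assert_eq!(b[6] & 0xF0, 0xF0); // '😀' leader (0xF0): consume_4byte_intro()
}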
@@ -667,7 +699,10 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
+                // ASCII or other leading byte.
                 tokenizer.advance(1);
             }
         }
@@ -703,13 +738,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\'' => {
                 if single_quote {
                     let value = tokenizer.slice_from(start_pos);
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\\' | b'\0' => {
                 // * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +760,40 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
             b'\n' | b'\r' | b'\x0C' => {
                 return Err(tokenizer.slice_from(start_pos).into())
             },
-            _ => {}
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            }
         }
-        tokenizer.consume_byte();
     }

     while !tokenizer.is_eof() {
-        if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(
-                // string_bytes is well-formed UTF-8, see other comments.
-                unsafe {
-                    from_utf8_release_unchecked(string_bytes)
-                }.into()
-            );
-        }
-        let b = tokenizer.consume_byte();
+        let b = tokenizer.next_byte_unchecked();
         match_byte! { b,
+            b'\n' | b'\r' | b'\x0C' => {
+                return Err(
+                    // string_bytes is well-formed UTF-8, see other comments.
+                    unsafe {
+                        from_utf8_release_unchecked(string_bytes)
+                    }.into()
+                );
+            }
             b'"' => {
+                tokenizer.advance(1);
                 if !single_quote {
                     break;
                 }
             }
             b'\'' => {
+                tokenizer.advance(1);
                 if single_quote {
                     break;
                 }
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -764,10 +808,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 continue;
             }
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
                 continue;
             }
-            _ => {},
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            },
         }

         // If this byte is part of a multi-byte code point,
@@ -835,11 +885,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xC0'...b'\xEF' => { tokenizer.advance(1); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             b => {
-                if b.is_ascii() {
-                    return tokenizer.slice_from(start_pos).into();
-                }
-                tokenizer.advance(1);
+                return tokenizer.slice_from(start_pos).into();
             }
         }
     }
@@ -861,15 +911,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 tokenizer.advance(1);
                 value_bytes.extend("\u{FFFD}".as_bytes());
             },
-            _ => {
-                if b.is_ascii() {
-                    break;
-                }
-                tokenizer.advance(1);
+            b'\x80'...b'\xBF' => {
+                // This byte *is* part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                value_bytes.push(b)
+            }
+            b'\xC0'...b'\xEF' => {
                 // This byte *is* part of a multi-byte code point,
                 // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.advance(1);
                 value_bytes.push(b)
             }
+            b'\xF0'...b'\xFF' => {
+                tokenizer.consume_4byte_intro();
+                value_bytes.push(b)
+            }
+            _ => {
+                // ASCII
+                break;
+            }
         }
     }
     // string_bytes is well-formed UTF-8, see other comments.
@@ -1048,11 +1109,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
             b'"' | b'\'' => { return Err(()) },  // Do not advance
             b')' => {
-                tokenizer.advance(offset + 1);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset + 1;
                 break
             }
             _ => {
-                tokenizer.advance(offset);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset;
                 found_printable_char = true;
                 break
             }
@@ -1104,27 +1169,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
+                // ASCII or other leading byte.
                 tokenizer.advance(1);
             }
         }
     }
     while !tokenizer.is_eof() {
-        match_byte! { tokenizer.consume_byte(),
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                 // string_bytes is well-formed UTF-8, see other comments.
                 let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
-                tokenizer.position -= 1;
                 return consume_url_end(tokenizer, start_pos, string)
             }
             b')' => {
+                tokenizer.advance(1);
                 break;
             }
             b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F'  // non-printable
                 | b'"' | b'\'' | b'(' => {
+                tokenizer.advance(1);
                 return consume_bad_url(tokenizer, start_pos);
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if tokenizer.has_newline_at(0) {
                     return consume_bad_url(tokenizer, start_pos)
                 }
@@ -1133,11 +1204,28 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 consume_escape_and_write(tokenizer, &mut string_bytes)
             },
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
             }
+            b'\x80'...b'\xBF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                string_bytes.push(b);
+            }
+            b'\xF0'...b'\xFF' => {
+                // We’ll end up copying the whole code point
+                // before this loop does something else.
+                tokenizer.consume_4byte_intro();
+                string_bytes.push(b);
+            }
             // If this byte is part of a multi-byte code point,
             // we’ll end up copying the whole code point before this loop does something else.
-            b => { string_bytes.push(b) }
+            b => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+                string_bytes.push(b)
+            }
         }
     }
     UnquotedUrl(
@@ -1160,8 +1248,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
                 return consume_bad_url(tokenizer, start_pos);
             }
         }
@@ -1186,8 +1274,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
-            _ => {
-                tokenizer.advance(1);
+            b => {
+                tokenizer.consume_known_byte(b);
             }
         }
     }
