
Commit 7e67b8f

Introduce consume_newline and use it universally
1 parent ada0c6a commit 7e67b8f

File tree

1 file changed: +41 −76 lines changed

src/tokenizer.rs

Lines changed: 41 additions & 76 deletions
@@ -361,9 +361,15 @@ impl<'a> Tokenizer<'a> {
         self.input[self.position..].chars().next().unwrap()
     }

-    fn seen_newline(&mut self, is_cr: bool) {
-        if is_cr && self.next_byte() == Some(/* LF */ b'\n') {
-            return
+    // Given that a newline has been seen, advance over the newline
+    // and update the state.
+    #[inline]
+    fn consume_newline(&mut self) {
+        let byte = self.next_byte_unchecked();
+        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
+        self.position += 1;
+        if byte == b'\r' && self.next_byte() == Some(b'\n') {
+            self.position += 1;
         }
         self.current_line_start_position = self.position;
         self.current_line_number += 1;
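
The new helper centralizes what the removed seen_newline only half-handled: it asserts the cursor sits on a newline byte (\r, \n, or form feed), steps past it, folds a \r\n pair into a single advance, and then resets the line-start position and bumps the line counter once. A rough standalone sketch of that byte logic (hypothetical names, not code from this commit):

fn consume_newline_model(input: &[u8], mut pos: usize, line: u32) -> (usize, u32) {
    let byte = input[pos];
    debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
    pos += 1;
    // A \r\n pair is a single line break, so also swallow the trailing \n.
    if byte == b'\r' && input.get(pos) == Some(&b'\n') {
        pos += 1;
    }
    (pos, line + 1)
}

fn main() {
    assert_eq!(consume_newline_model(b"\r\nx", 0, 1), (2, 2)); // CRLF: two bytes, one new line
    assert_eq!(consume_newline_model(b"\nx", 0, 1), (1, 2));   // bare LF
    assert_eq!(consume_newline_model(b"\x0Cx", 0, 1), (1, 2)); // form feed is a newline in CSS
}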
@@ -393,13 +399,8 @@ impl<'a> Tokenizer<'a> {
                 b' ' | b'\t' => {
                     self.advance(1)
                 },
-                b'\n' | b'\x0C' => {
-                    self.advance(1);
-                    self.seen_newline(false);
-                },
-                b'\r' => {
-                    self.advance(1);
-                    self.seen_newline(true);
+                b'\n' | b'\x0C' | b'\r' => {
+                    self.consume_newline();
                 },
                 b'/' => {
                     if self.starts_with(b"/*") {
@@ -421,13 +422,8 @@ impl<'a> Tokenizer<'a> {
                 b' ' | b'\t' => {
                     self.advance(1)
                 },
-                b'\n' | b'\x0C' => {
-                    self.advance(1);
-                    self.seen_newline(false);
-                },
-                b'\r' => {
-                    self.advance(1);
-                    self.seen_newline(true);
+                b'\n' | b'\x0C' | b'\r' => {
+                    self.consume_newline();
                 },
                 b'/' => {
                     if self.starts_with(b"/*") {
@@ -481,13 +477,10 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     let b = tokenizer.next_byte_unchecked();
     let token = match_byte! { b,
         b' ' | b'\t' => {
-            consume_whitespace(tokenizer, false, false)
+            consume_whitespace(tokenizer, false)
         },
-        b'\n' | b'\x0C' => {
-            consume_whitespace(tokenizer, true, false)
-        },
-        b'\r' => {
-            consume_whitespace(tokenizer, true, true)
+        b'\n' | b'\x0C' | b'\r' => {
+            consume_whitespace(tokenizer, true)
         },
         b'"' => { consume_string(tokenizer, false) },
         b'#' => {
@@ -617,25 +610,21 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 }


-fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool, is_cr: bool) -> Token<'a> {
+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
     let start_position = tokenizer.position();
-    tokenizer.advance(1);
     if newline {
-        tokenizer.seen_newline(is_cr)
+        tokenizer.consume_newline();
+    } else {
+        tokenizer.advance(1);
     }
     while !tokenizer.is_eof() {
         let b = tokenizer.next_byte_unchecked();
         match_byte! { b,
             b' ' | b'\t' => {
                 tokenizer.advance(1);
             }
-            b'\n' | b'\x0C' => {
-                tokenizer.advance(1);
-                tokenizer.seen_newline(false);
-            }
-            b'\r' => {
-                tokenizer.advance(1);
-                tokenizer.seen_newline(true);
+            b'\n' | b'\x0C' | b'\r' => {
+                tokenizer.consume_newline();
             }
             _ => {
                 break
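
With consume_newline reading the byte at the current position itself, consume_whitespace no longer needs the is_cr flag or the unconditional advance(1): the first byte is consumed either by consume_newline (one or two bytes) or by advance(1) (exactly one). A hedged standalone model of the resulting loop, with hypothetical names:

fn newline_len(input: &[u8], pos: usize) -> usize {
    // Mirrors consume_newline's advance: \r\n is one newline, two bytes.
    if input[pos] == b'\r' && input.get(pos + 1) == Some(&b'\n') { pos + 2 } else { pos + 1 }
}

fn consume_whitespace_model(input: &[u8], mut pos: usize, newline: bool) -> usize {
    // First byte: the caller guarantees it is whitespace; a newline takes
    // one or two bytes, anything else exactly one.
    pos = if newline { newline_len(input, pos) } else { pos + 1 };
    while pos < input.len() {
        match input[pos] {
            b' ' | b'\t' => pos += 1,
            b'\n' | b'\x0C' | b'\r' => pos = newline_len(input, pos),
            _ => break,
        }
    }
    pos
}

fn main() {
    // " \r\n\t a": a five-byte whitespace run; the CRLF is folded into one step.
    assert_eq!(consume_whitespace_model(b" \r\n\t a", 0, false), 5);
    // Starting on the newline itself (newline == true) works the same way.
    assert_eq!(consume_whitespace_model(b"\r\n x", 0, true), 3);
}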
@@ -675,13 +664,8 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
                     return contents
                 }
             }
-            b'\n' | b'\x0C' => {
-                tokenizer.advance(1);
-                tokenizer.seen_newline(false);
-            }
-            b'\r' => {
-                tokenizer.advance(1);
-                tokenizer.seen_newline(true);
+            b'\n' | b'\x0C' | b'\r' => {
+                tokenizer.consume_newline();
             }
             _ => {
                 tokenizer.advance(1);
@@ -769,19 +753,8 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
-                        b'\n' | b'\x0C' => {
-                            tokenizer.advance(1);
-                            tokenizer.seen_newline(false);
-                        }
-                        b'\r' => {
-                            tokenizer.advance(1);
-                            if tokenizer.next_byte() == Some(b'\n') {
-                                tokenizer.advance(1);
-                            }
-                            // `is_cr = true` is useful to skip \r when the next iteration
-                            // of a loop will call `seen_newline` again for the following \n.
-                            // In this case we’re consuming both in this iteration, so passing `false`.
-                            tokenizer.seen_newline(false);
+                        b'\n' | b'\x0C' | b'\r' => {
+                            tokenizer.consume_newline();
                         }
                         // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
@@ -1178,18 +1151,17 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                           string: CowRcStr<'a>)
                           -> Token<'a> {
     while !tokenizer.is_eof() {
-        match_byte! { tokenizer.consume_byte(),
+        match_byte! { tokenizer.next_byte_unchecked(),
             b')' => {
+                tokenizer.advance(1);
                 break
             }
-            b' ' | b'\t' => {}
-            b'\n' | b'\x0C' => {
-                tokenizer.seen_newline(false);
-            }
-            b'\r' => {
-                tokenizer.seen_newline(true);
+            b' ' | b'\t' => { tokenizer.advance(1); }
+            b'\n' | b'\x0C' | b'\r' => {
+                tokenizer.consume_newline();
             }
             _ => {
+                tokenizer.advance(1);
                 return consume_bad_url(tokenizer, start_pos);
             }
         }
@@ -1200,22 +1172,23 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
 fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
     // Consume up to the closing )
     while !tokenizer.is_eof() {
-        match_byte! { tokenizer.consume_byte(),
+        match_byte! { tokenizer.next_byte_unchecked(),
             b')' => {
+                tokenizer.advance(1);
                 break
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                     tokenizer.advance(1); // Skip an escaped ')' or '\'
                 }
             }
-            b'\n' | b'\x0C' => {
-                tokenizer.seen_newline(false);
+            b'\n' | b'\x0C' | b'\r' => {
+                tokenizer.consume_newline();
             }
-            b'\r' => {
-                tokenizer.seen_newline(true);
+            _ => {
+                tokenizer.advance(1);
             }
-            _ => {},
         }
     }
     BadUrl(tokenizer.slice_from(start_pos).into())
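
Both URL loops switch from consume_byte() to next_byte_unchecked() plus explicit advance(1) calls: consume_newline expects the newline byte to still sit at self.position, so the match has to peek rather than consume, and every other arm now advances for itself. A minimal sketch of that peek-then-advance shape (hypothetical names; only the escape handling mirrors the diff):

fn scan_to_close_paren(input: &[u8]) -> usize {
    let mut pos = 0;
    while pos < input.len() {
        // Peek first; only the arm that matches decides how far to advance.
        match input[pos] {
            b')' => { pos += 1; break }              // consume the ')' and stop
            b'\\' => {
                pos += 1;                            // consume the backslash
                if matches!(input.get(pos).copied(), Some(b')') | Some(b'\\')) {
                    pos += 1;                        // skip an escaped ')' or '\'
                }
            }
            _ => pos += 1,                           // every other arm advances one byte
        }
    }
    pos
}

fn main() {
    // The escaped ')' does not terminate the scan; the following real ')' does.
    assert_eq!(scan_to_close_paren(b"a\\))x"), 4);
}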
@@ -1259,16 +1232,8 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
             b' ' | b'\t' => {
                 tokenizer.advance(1)
             }
-            b'\n' | b'\x0C' => {
-                tokenizer.advance(1);
-                tokenizer.seen_newline(false)
-            }
-            b'\r' => {
-                tokenizer.advance(1);
-                if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
-                    tokenizer.advance(1);
-                }
-                tokenizer.seen_newline(false)
+            b'\n' | b'\x0C' | b'\r' => {
+                tokenizer.consume_newline();
             }
             _ => {}
         }
