
Commit 72bc6ff

Count line numbers during tokenization
1 parent e1ff8c1 commit 72bc6ff

File tree

1 file changed: +156 −43 lines


src/tokenizer.rs

Lines changed: 156 additions & 43 deletions
@@ -379,6 +379,14 @@ impl<'a> Tokenizer<'a> {
         self.input[self.position..].chars().next().unwrap()
     }

+    fn seen_newline(&mut self, is_cr: bool) {
+        if is_cr && self.next_byte() == Some(/* LF */ b'\n') {
+            return
+        }
+        self.current_line_start_position = self.position;
+        self.current_line_number += 1;
+    }
+
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
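
Note: `seen_newline` is the heart of this commit. Below is a minimal sketch of the bookkeeping it enables, not this crate's API — the `LineTracker` struct and `location` helper are hypothetical, and only the two counter field names mirror the diff:

    struct LineTracker {
        position: usize,                    // current byte offset into the input
        current_line_number: u64,           // newlines seen before `position`
        current_line_start_position: usize, // byte offset just past the last newline
    }

    impl LineTracker {
        // 0-based (line, column-in-bytes) for the current position.
        fn location(&self) -> (u64, usize) {
            (
                self.current_line_number,
                self.position - self.current_line_start_position,
            )
        }
    }

    fn main() {
        // State after consuming "a\nbc": one newline seen, current line starts at byte 2.
        let t = LineTracker {
            position: 4,
            current_line_number: 1,
            current_line_start_position: 2,
        };
        assert_eq!(t.location(), (1, 2));
    }
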
@@ -420,16 +428,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     }
     let b = tokenizer.next_byte_unchecked();
     let token = match_byte! { b,
-        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
-            let start_position = tokenizer.position();
-            tokenizer.advance(1);
-            while !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
-                    _ => break,
-                }
-            }
-            WhiteSpace(tokenizer.slice_from(start_position))
+        b' ' | b'\t' => {
+            consume_whitespace(tokenizer, false, false)
+        },
+        b'\n' | b'\x0C' => {
+            consume_whitespace(tokenizer, true, false)
+        },
+        b'\r' => {
+            consume_whitespace(tokenizer, true, true)
         },
         b'"' => { consume_string(tokenizer, false) },
         b'#' => {
@@ -501,21 +507,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         }
         b'/' => {
             if tokenizer.starts_with(b"/*") {
-                tokenizer.advance(2); // consume "/*"
-                let start_position = tokenizer.position();
-                let content;
-                match tokenizer.input[tokenizer.position..].find("*/") {
-                    Some(offset) => {
-                        tokenizer.advance(offset);
-                        content = tokenizer.slice_from(start_position);
-                        tokenizer.advance(2);
-                    }
-                    None => {
-                        tokenizer.position = tokenizer.input.len();
-                        content = tokenizer.slice_from(start_position);
-                    }
-                }
-                Comment(content)
+                Comment(consume_comment(tokenizer))
             } else {
                 tokenizer.advance(1);
                 Delim('/')
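
(Why the rewrite: the old code jumped to the closing */ with a single `find("*/")` and never looked at the bytes in between, so newlines inside comments went uncounted. The new `consume_comment`, added below, walks the comment byte by byte and calls `seen_newline` as it goes.)
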
@@ -573,6 +565,64 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 }


+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool, is_cr: bool) -> Token<'a> {
+    let start_position = tokenizer.position();
+    tokenizer.advance(1);
+    if newline {
+        tokenizer.seen_newline(is_cr)
+    }
+    while !tokenizer.is_eof() {
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
+            b' ' | b'\t' => {
+                tokenizer.advance(1);
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                break
+            }
+        }
+    }
+    WhiteSpace(tokenizer.slice_from(start_position))
+}
+
+
+fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
+    tokenizer.advance(2); // consume "/*"
+    let start_position = tokenizer.position();
+    while !tokenizer.is_eof() {
+        match_byte! { tokenizer.next_byte_unchecked(),
+            b'*' => {
+                let end_position = tokenizer.position();
+                tokenizer.advance(1);
+                if tokenizer.next_byte() == Some(b'/') {
+                    tokenizer.advance(1);
+                    return tokenizer.slice(start_position..end_position)
+                }
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                tokenizer.advance(1);
+            }
+        }
+    }
+    tokenizer.slice_from(start_position)
+}
+
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
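
Note: every call site above pairs `advance(1)` with `seen_newline`, and the `is_cr` flag makes \r\n count as a single newline. A standalone sketch of that counting rule (nothing below is from this crate):

    fn count_newlines(input: &str) -> u64 {
        let bytes = input.as_bytes();
        let mut newlines = 0;
        for (i, &b) in bytes.iter().enumerate() {
            match b {
                b'\n' | b'\x0C' => newlines += 1,
                // A CR directly followed by LF is skipped here, exactly like
                // `seen_newline(true)`; the pair is counted once, at the LF.
                b'\r' if bytes.get(i + 1) != Some(&b'\n') => newlines += 1,
                _ => {}
            }
        }
        newlines
    }

    fn main() {
        assert_eq!(count_newlines("a\r\nb\rc\nd"), 3); // CRLF, lone CR, LF
    }
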
@@ -649,12 +699,19 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
-                        b'\n' | b'\x0C' => tokenizer.advance(1),
+                        b'\n' | b'\x0C' => {
+                            tokenizer.advance(1);
+                            tokenizer.seen_newline(false);
+                        }
                         b'\r' => {
                             tokenizer.advance(1);
                             if tokenizer.next_byte() == Some(b'\n') {
                                 tokenizer.advance(1);
                             }
+                            // `is_cr = true` is useful to skip \r when the next iteration
+                            // of a loop will call `seen_newline` again for the following \n.
+                            // Here we consume both bytes in this iteration, so we pass `false`.
+                            tokenizer.seen_newline(false);
                         }
                         // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
@@ -921,24 +978,57 @@ unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {

 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     // This is only called after "url(", so the current position is a code point boundary.
-    for (offset, c) in tokenizer.input[tokenizer.position..].bytes().enumerate() {
-        match_byte! { c,
-            b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
+    let start_position = tokenizer.position;
+    let from_start = &tokenizer.input[tokenizer.position..];
+    let mut newlines = 0;
+    let mut last_newline = 0;
+    let mut found_printable_char = false;
+    let mut iter = from_start.bytes().enumerate();
+    loop {
+        let (offset, b) = match iter.next() {
+            Some(item) => item,
+            None => {
+                tokenizer.position = tokenizer.input.len();
+                break
+            }
+        };
+        match_byte! { b,
+            b' ' | b'\t' => {},
+            b'\n' | b'\x0C' => {
+                newlines += 1;
+                last_newline = offset;
+            }
+            b'\r' => {
+                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
+                    newlines += 1;
+                    last_newline = offset;
+                }
+            }
             b'"' | b'\'' => { return Err(()) }, // Do not advance
             b')' => {
                 tokenizer.advance(offset + 1);
-                return Ok(UnquotedUrl("".into()));
+                break
             }
             _ => {
                 tokenizer.advance(offset);
-                // This function only consumed ASCII (whitespace) bytes,
-                // so the current position is a code point boundary.
-                return Ok(consume_unquoted_url_internal(tokenizer))
+                found_printable_char = true;
+                break
             }
         }
     }
-    tokenizer.position = tokenizer.input.len();
-    return Ok(UnquotedUrl("".into()));
+
+    if newlines > 0 {
+        tokenizer.current_line_number += newlines;
+        tokenizer.current_line_start_position = start_position + last_newline + 1;
+    }
+
+    if found_printable_char {
+        // This function only consumed ASCII (whitespace) bytes,
+        // so the current position is a code point boundary.
+        return Ok(consume_unquoted_url_internal(tokenizer))
+    } else {
+        return Ok(UnquotedUrl("".into()))
+    }

     fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // This function is only called with start_pos at a code point boundary.
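
Note: unlike the other call sites, this scan does not call `seen_newline` per character; it batches the counters and applies them once after the loop. A standalone sketch of that idea (the function below is illustrative, not this crate's code):

    fn scan_leading_whitespace(from_start: &str) -> (u64, usize) {
        let bytes = from_start.as_bytes();
        let (mut newlines, mut last_newline) = (0u64, 0usize);
        for (offset, &b) in bytes.iter().enumerate() {
            match b {
                b' ' | b'\t' => {}
                b'\n' | b'\x0C' => {
                    newlines += 1;
                    last_newline = offset;
                }
                // Count a CR only when no LF follows, so CRLF counts once (at the LF).
                b'\r' if bytes.get(offset + 1) != Some(&b'\n') => {
                    newlines += 1;
                    last_newline = offset;
                }
                b'\r' => {}
                _ => break, // first printable character ends the scan
            }
        }
        (newlines, last_newline)
    }

    fn main() {
        // Two newlines; the last begins at offset 3, so the current line
        // would start at `start_position + 3 + 1`.
        assert_eq!(scan_leading_whitespace("\r\n \n  x"), (2, 3));
    }
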
@@ -951,7 +1041,6 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             match_byte! { tokenizer.next_byte_unchecked(),
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     let value = tokenizer.slice_from(start_pos);
-                    tokenizer.advance(1);
                     return consume_url_end(tokenizer, start_pos, value.into())
                 }
                 b')' => {
@@ -974,7 +1063,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     break
                 }
                 _ => {
-                    tokenizer.consume_byte();
+                    tokenizer.advance(1);
                 }
             }
         }
@@ -983,6 +1072,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     // string_bytes is well-formed UTF-8, see other comments.
                     let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
+                    tokenizer.position -= 1;
                     return consume_url_end(tokenizer, start_pos, string)
                 }
                 b')' => {
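
Note on the `tokenizer.position -= 1` just added, inferred from this diff: `consume_byte` has already consumed the whitespace byte that ends the URL, but `consume_url_end` now needs to see that byte itself so that its own `seen_newline` calls count each newline exactly once. Rewinding one byte is safe here because the byte is ASCII whitespace, so the position stays on a code point boundary; the earlier hunk that drops `tokenizer.advance(1)` before the other `consume_url_end` call serves the same purpose.
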
@@ -1020,8 +1110,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
         -> Token<'a> {
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
-                b')' => { break },
+                b')' => {
+                    break
+                }
+                b' ' | b'\t' => {}
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {
                     return consume_bad_url(tokenizer, start_pos);
                 }
@@ -1034,12 +1132,20 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b')' => { break },
+                b')' => {
+                    break
+                }
                 b'\\' => {
                     if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                         tokenizer.advance(1); // Skip an escaped ')' or '\'
                     }
                 }
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {},
             }
         }
@@ -1080,15 +1186,22 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
         b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                match_byte! { tokenizer.next_byte_unchecked(),
+                    b' ' | b'\t' => {
+                        tokenizer.advance(1)
+                    }
+                    b'\n' | b'\x0C' => {
+                        tokenizer.advance(1);
+                        tokenizer.seen_newline(false)
+                    }
                     b'\r' => {
                         tokenizer.advance(1);
                         if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
+                        tokenizer.seen_newline(false)
                     }
-                    _ => ()
+                    _ => {}
                 }
             }
             static REPLACEMENT_CHAR: char = '\u{FFFD}';
