Commit 9f1d746

Count line numbers during tokenization
1 parent e1ff8c1 commit 9f1d746

File tree

1 file changed (+152, −43 lines)

src/tokenizer.rs

Lines changed: 152 additions & 43 deletions

@@ -379,6 +379,14 @@ impl<'a> Tokenizer<'a> {
         self.input[self.position..].chars().next().unwrap()
     }
 
+    fn seen_newline(&mut self, is_cr: bool) {
+        if is_cr && self.next_byte() == Some(/* LF */ b'\n') {
+            return
+        }
+        self.current_line_start_position = self.position;
+        self.current_line_number += 1;
+    }
+
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
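
The new `seen_newline` helper is called after the newline byte has already been consumed: `self.position` then points at the first byte of the next line, so it can be stored directly as `current_line_start_position`. The early return makes a CRLF pair count once, by deferring the bump to the LF half. A minimal standalone sketch of the same bookkeeping (the `LineCounter` type and its driver loop are hypothetical, not part of the crate):

    struct LineCounter {
        position: usize,                    // byte offset just past the last consumed byte
        current_line_number: u64,
        current_line_start_position: usize, // offset of the current line's first byte
    }

    impl LineCounter {
        fn seen_newline(&mut self, input: &[u8], is_cr: bool) {
            // After a CR, if an LF follows, do nothing: the bump happens
            // when that LF is itself consumed and reported, so "\r\n"
            // counts as a single newline.
            if is_cr && input.get(self.position) == Some(&b'\n') {
                return;
            }
            self.current_line_start_position = self.position;
            self.current_line_number += 1;
        }
    }

    fn main() {
        let input = b"a\r\nb\nc";
        let mut t = LineCounter { position: 0, current_line_number: 0, current_line_start_position: 0 };
        while t.position < input.len() {
            let byte = input[t.position];
            t.position += 1; // advance first, as the tokenizer does
            match byte {
                b'\r' => t.seen_newline(input, true),
                b'\n' => t.seen_newline(input, false),
                _ => {}
            }
        }
        assert_eq!(t.current_line_number, 2); // "\r\n" and "\n" count once each
    }
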
@@ -420,16 +428,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     }
     let b = tokenizer.next_byte_unchecked();
     let token = match_byte! { b,
-        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
-            let start_position = tokenizer.position();
-            tokenizer.advance(1);
-            while !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
-                    _ => break,
-                }
-            }
-            WhiteSpace(tokenizer.slice_from(start_position))
+        b' ' | b'\t' => {
+            consume_whitespace(tokenizer, false, false)
+        },
+        b'\n' | b'\x0C' => {
+            consume_whitespace(tokenizer, true, false)
+        },
+        b'\r' => {
+            consume_whitespace(tokenizer, true, true)
         },
         b'"' => { consume_string(tokenizer, false) },
         b'#' => {
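
In `next_token`, the single whitespace arm becomes three so that the newline bookkeeping can be decided from the first byte alone and handed to the new `consume_whitespace` as a pair of flags. A hypothetical standalone mirror of that dispatch (not in the crate):

    // The five CSS whitespace bytes map onto the (newline, is_cr) flags
    // that next_token now passes to consume_whitespace.
    fn whitespace_flags(b: u8) -> Option<(bool, bool)> {
        match b {
            b' ' | b'\t' => Some((false, false)),   // no line bump needed
            b'\n' | b'\x0C' => Some((true, false)), // newline, not a CR
            b'\r' => Some((true, true)),            // newline, possibly CRLF
            _ => None,                              // not whitespace
        }
    }
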
@@ -501,21 +507,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         }
         b'/' => {
             if tokenizer.starts_with(b"/*") {
-                tokenizer.advance(2); // consume "/*"
-                let start_position = tokenizer.position();
-                let content;
-                match tokenizer.input[tokenizer.position..].find("*/") {
-                    Some(offset) => {
-                        tokenizer.advance(offset);
-                        content = tokenizer.slice_from(start_position);
-                        tokenizer.advance(2);
-                    }
-                    None => {
-                        tokenizer.position = tokenizer.input.len();
-                        content = tokenizer.slice_from(start_position);
-                    }
-                }
-                Comment(content)
+                Comment(consume_comment(tokenizer))
             } else {
                 tokenizer.advance(1);
                 Delim('/')
@@ -573,6 +565,64 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 }
 
 
+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool, is_cr: bool) -> Token<'a> {
+    let start_position = tokenizer.position();
+    tokenizer.advance(1);
+    if newline {
+        tokenizer.seen_newline(is_cr)
+    }
+    while !tokenizer.is_eof() {
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
+            b' ' | b'\t' => {
+                tokenizer.advance(1);
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                break
+            }
+        }
+    }
+    WhiteSpace(tokenizer.slice_from(start_position))
+}
+
+
+fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
+    tokenizer.advance(2); // consume "/*"
+    let start_position = tokenizer.position();
+    while !tokenizer.is_eof() {
+        match_byte! { tokenizer.next_byte_unchecked(),
+            b'*' => {
+                let end_position = tokenizer.position();
+                tokenizer.advance(1);
+                if tokenizer.next_byte() == Some(b'/') {
+                    tokenizer.advance(1);
+                    return tokenizer.slice(start_position..end_position)
+                }
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                tokenizer.advance(1);
+            }
+        }
+    }
+    tokenizer.slice_from(start_position)
+}
+
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
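
The old comment path located `*/` with `str::find`, which would jump over any newlines in the comment body without counting them; `consume_comment` therefore scans byte by byte and reports each newline through `seen_newline`. A standalone sketch of a scan under the same constraint (`scan_comment` is a hypothetical helper, not the crate's API):

    // Scan a "/* ... */" comment, returning its body and the number of
    // newlines it spans; a CRLF pair counts once, matching seen_newline.
    fn scan_comment(input: &str) -> (&str, u32) {
        debug_assert!(input.starts_with("/*"));
        let bytes = input.as_bytes();
        let mut i = 2;
        let mut newlines = 0;
        while i < bytes.len() {
            match bytes[i] {
                b'*' if bytes.get(i + 1) == Some(&b'/') => return (&input[2..i], newlines),
                b'\r' if bytes.get(i + 1) != Some(&b'\n') => newlines += 1,
                b'\n' | b'\x0C' => newlines += 1,
                _ => {}
            }
            i += 1;
        }
        (&input[2..], newlines) // unterminated comment runs to end of input
    }

    fn main() {
        assert_eq!(scan_comment("/* a\r\nb */"), (" a\r\nb ", 1));
    }
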
@@ -649,12 +699,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
-                        b'\n' | b'\x0C' => tokenizer.advance(1),
+                        b'\n' | b'\x0C' => {
+                            tokenizer.advance(1);
+                            tokenizer.seen_newline(false);
+                        }
                         b'\r' => {
                             tokenizer.advance(1);
                             if tokenizer.next_byte() == Some(b'\n') {
                                 tokenizer.advance(1);
                             }
+                            tokenizer.seen_newline(false);
                         }
                         // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
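
Note the asymmetry in the escaped-newline arms of `consume_quoted_string`: the `\r` arm consumes an optional following `\n` itself and then reports with `is_cr == false`. By that point `position` is already past the whole pair, so the lookahead in `seen_newline` must not fire again. Checked against the standalone `LineCounter` sketch above (hypothetical, as before):

    fn main() {
        // A backslash-escaped CRLF inside a quoted string: '\', CR, LF, 'x'.
        let input = b"\\\r\nx";
        let mut t = LineCounter { position: 1, current_line_number: 0, current_line_start_position: 0 };
        t.position += 2;              // consume the CRLF pair as a unit
        t.seen_newline(input, false); // report once, with is_cr == false
        assert_eq!((t.current_line_number, t.current_line_start_position), (1, 3));
        assert_eq!(input[t.current_line_start_position], b'x'); // new line starts at 'x'
    }
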
@@ -921,24 +975,56 @@ unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
 
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     // This is only called after "url(", so the current position is a code point boundary.
-    for (offset, c) in tokenizer.input[tokenizer.position..].bytes().enumerate() {
-        match_byte! { c,
-            b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
+    let start_position = tokenizer.position;
+    let from_start = &tokenizer.input[tokenizer.position..];
+    let mut newlines = 0;
+    let mut last_newline = 0;
+    let mut found_printable_char = false;
+    let mut iter = from_start.bytes().enumerate();
+    loop {
+        let (offset, b) = if let Some(item) = iter.next() {
+            item
+        } else {
+            tokenizer.position = tokenizer.input.len();
+            break
+        };
+        match_byte! { b,
+            b' ' | b'\t' => {},
+            b'\n' | b'\x0C' => {
+                newlines += 1;
+                last_newline = offset;
+            }
+            b'\r' => {
+                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
+                    newlines += 1;
+                    last_newline = offset;
+                }
+            }
             b'"' | b'\'' => { return Err(()) }, // Do not advance
             b')' => {
                 tokenizer.advance(offset + 1);
-                return Ok(UnquotedUrl("".into()));
+                break
             }
             _ => {
                 tokenizer.advance(offset);
-                // This function only consumed ASCII (whitespace) bytes,
-                // so the current position is a code point boundary.
-                return Ok(consume_unquoted_url_internal(tokenizer))
+                found_printable_char = true;
+                break
             }
         }
     }
-    tokenizer.position = tokenizer.input.len();
-    return Ok(UnquotedUrl("".into()));
+
+    if newlines > 0 {
+        tokenizer.current_line_number += newlines;
+        tokenizer.current_line_start_position = start_position + last_newline + 1;
+    }
+
+    if found_printable_char {
+        // This function only consumed ASCII (whitespace) bytes,
+        // so the current position is a code point boundary.
+        return Ok(consume_unquoted_url_internal(tokenizer))
+    } else {
+        return Ok(UnquotedUrl("".into()))
+    }
 
     fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // This function is only called with start_pos at a code point boundary.
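
`consume_unquoted_url` takes a different tack from the other call sites: because it scans with a borrowed iterator rather than advancing the tokenizer byte by byte, it tallies newlines locally and applies the count and the last line's start offset in one batch afterwards. A standalone sketch of that batched pass (`count_newlines` is a hypothetical free function):

    // One pass over a whitespace run: how many newlines it contains and
    // the offset of the last one. A CR directly followed by LF is skipped;
    // the LF one byte later is the one that counts.
    fn count_newlines(ws: &[u8]) -> (u32, Option<usize>) {
        let mut newlines = 0;
        let mut last_newline = None;
        for (offset, &b) in ws.iter().enumerate() {
            match b {
                b'\n' | b'\x0C' => {
                    newlines += 1;
                    last_newline = Some(offset);
                }
                b'\r' if ws.get(offset + 1) != Some(&b'\n') => {
                    newlines += 1;
                    last_newline = Some(offset);
                }
                _ => {}
            }
        }
        (newlines, last_newline)
    }

    fn main() {
        let (n, last) = count_newlines(b" \r\n \n ");
        assert_eq!((n, last), (2, Some(4)));
        // As in the diff: current_line_start_position becomes
        // start_position + last_newline + 1, the byte after the newline.
    }
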
@@ -951,7 +1037,6 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
             match_byte! { tokenizer.next_byte_unchecked(),
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     let value = tokenizer.slice_from(start_pos);
-                    tokenizer.advance(1);
                     return consume_url_end(tokenizer, start_pos, value.into())
                 }
                 b')' => {
@@ -974,7 +1059,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                     break
                 }
                 _ => {
-                    tokenizer.consume_byte();
+                    tokenizer.advance(1);
                 }
             }
         }
@@ -983,6 +1068,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     // string_bytes is well-formed UTF-8, see other comments.
                     let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
+                    tokenizer.position -= 1;
                     return consume_url_end(tokenizer, start_pos, string)
                 }
                 b')' => {
@@ -1020,8 +1106,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                           -> Token<'a> {
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
-                b')' => { break },
+                b')' => {
+                    break
+                }
+                b' ' | b'\t' => {}
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {
                     return consume_bad_url(tokenizer, start_pos);
                 }
@@ -1034,12 +1128,20 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b')' => { break },
+                b')' => {
+                    break
+                }
                 b'\\' => {
                     if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                         tokenizer.advance(1); // Skip an escaped ')' or '\'
                     }
                 }
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {},
             }
         }
@@ -1080,15 +1182,22 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
         b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                match_byte! { tokenizer.next_byte_unchecked(),
+                    b' ' | b'\t' => {
+                        tokenizer.advance(1)
+                    }
+                    b'\n' | b'\x0C' => {
+                        tokenizer.advance(1);
+                        tokenizer.seen_newline(false)
+                    }
                     b'\r' => {
                         tokenizer.advance(1);
                         if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
+                        tokenizer.seen_newline(false)
                     }
-                    _ => ()
+                    _ => {}
                 }
             }
             static REPLACEMENT_CHAR: char = '\u{FFFD}';
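
With every newline-bearing construct now routed through `seen_newline` (or the batched URL path), the two fields stay accurate across whitespace, comments, strings, URLs, and escapes. The payoff, continuing the hypothetical `LineCounter` sketch from earlier, is constant-time location reporting:

    impl LineCounter {
        // Hypothetical accessor: line number plus byte column within the
        // line, derived from the two tracked fields with no rescanning.
        fn location(&self) -> (u64, usize) {
            (self.current_line_number,
             self.position - self.current_line_start_position)
        }
    }
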
