
Commit 65b570b

Use less UTF-8 logic when not needed.

1 parent: dbdf639
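
The change swaps the tokenizer's char-level dispatch for byte-level dispatch: every character that can start a CSS token is ASCII, so the match can run on the raw UTF-8 bytes and skip char decoding entirely, falling back to char decoding only in the catch-all path. A minimal standalone sketch of the distinction (hypothetical helpers, not this crate's API):

// Reading the dispatch byte is one indexed load, while producing a `char`
// means decoding a 1..=4 byte UTF-8 sequence.
fn peek_byte(input: &str, position: usize) -> u8 {
    input.as_bytes()[position] // plain bounds-checked read
}

fn peek_char(input: &str, position: usize) -> char {
    input[position..].chars().next().unwrap() // full UTF-8 decode
}

fn main() {
    let css = "état { color: red }";
    assert_eq!(peek_byte(css, 0), 0xC3); // first byte of 'é'
    assert_eq!(peek_char(css, 0), 'é');
}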

File tree

1 file changed (+77, -76 lines)


src/tokenizer.rs

Lines changed: 77 additions & 76 deletions
@@ -380,8 +380,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     #[inline]
-    fn starts_with(&self, needle: &str) -> bool {
-        self.input[self.position..].starts_with(needle)
+    fn starts_with(&self, needle: &[u8]) -> bool {
+        self.input.as_bytes()[self.position..].starts_with(needle)
     }
 }
 
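Switching `starts_with` to a `&[u8]` needle matters because slicing a `&str` at an arbitrary index must verify a char boundary (and panics off one), while slicing `&[u8]` is a plain bounds check; for the pure-ASCII needles used in this file the two comparisons agree. A quick standalone check of that equivalence (assumed sample input, not the crate's tests):

fn main() {
    let input = "a *= b";
    let position = 2;
    // Byte-slice prefix test: bounds check only, no UTF-8 boundary logic.
    assert!(input.as_bytes()[position..].starts_with(b"*="));
    // The &str version gives the same answer for an ASCII needle.
    assert!(input[position..].starts_with("*="));
}
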
@@ -405,88 +405,88 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
     if tokenizer.is_eof() {
         return None
     }
-    let c = tokenizer.next_char();
+    let c = tokenizer.next_byte_unchecked();
     let token = match c {
-        '\t' | '\n' | ' ' | '\r' | '\x0C' => {
+        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
             let start_position = tokenizer.position();
             tokenizer.advance(1);
             while !tokenizer.is_eof() {
-                match tokenizer.next_char() {
-                    ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.advance(1),
+                match tokenizer.next_byte_unchecked() {
+                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
                     _ => break,
                 }
             }
             WhiteSpace(tokenizer.slice_from(start_position))
         },
-        '"' => consume_string(tokenizer, false),
-        '#' => {
+        b'"' => consume_string(tokenizer, false),
+        b'#' => {
             tokenizer.advance(1);
             if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
-            else if !tokenizer.is_eof() && match tokenizer.next_char() {
-                'a'...'z' | 'A'...'Z' | '0'...'9' | '-' | '_' => true,
-                '\\' => !tokenizer.has_newline_at(1),
-                _ => c > '\x7F',  // Non-ASCII
+            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
+                b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'-' | b'_' => true,
+                b'\\' => !tokenizer.has_newline_at(1),
+                _ => !c.is_ascii(),
             } { Hash(consume_name(tokenizer)) }
-            else { Delim(c) }
+            else { Delim('#') }
         },
-        '$' => {
-            if tokenizer.starts_with("$=") { tokenizer.advance(2); SuffixMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'$' => {
+            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
+            else { tokenizer.advance(1); Delim('$') }
         },
-        '\'' => consume_string(tokenizer, true),
-        '(' => { tokenizer.advance(1); ParenthesisBlock },
-        ')' => { tokenizer.advance(1); CloseParenthesis },
-        '*' => {
-            if tokenizer.starts_with("*=") { tokenizer.advance(2); SubstringMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'\'' => consume_string(tokenizer, true),
+        b'(' => { tokenizer.advance(1); ParenthesisBlock },
+        b')' => { tokenizer.advance(1); CloseParenthesis },
+        b'*' => {
+            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
+            else { tokenizer.advance(1); Delim('*') }
         },
-        '+' => {
+        b'+' => {
             if (
                 tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
             ) || (
                 tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
             ) {
                 consume_numeric(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('+')
             }
         },
-        ',' => { tokenizer.advance(1); Comma },
-        '-' => {
+        b',' => { tokenizer.advance(1); Comma },
+        b'-' => {
             if (
                 tokenizer.has_at_least(1)
-                && matches!(tokenizer.char_at(1), '0'...'9')
+                && matches!(tokenizer.byte_at(1), b'0'...b'9')
             ) || (
                 tokenizer.has_at_least(2)
-                && tokenizer.char_at(1) == '.'
-                && matches!(tokenizer.char_at(2), '0'...'9')
+                && tokenizer.byte_at(1) == b'.'
+                && matches!(tokenizer.byte_at(2), b'0'...b'9')
             ) {
                 consume_numeric(tokenizer)
-            } else if tokenizer.starts_with("-->") {
+            } else if tokenizer.starts_with(b"-->") {
                 tokenizer.advance(3);
                 CDC
             } else if is_ident_start(tokenizer) {
                 consume_ident_like(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('-')
             }
         },
-        '.' => {
+        b'.' => {
             if tokenizer.has_at_least(1)
-            && matches!(tokenizer.char_at(1), '0'...'9'
+            && matches!(tokenizer.byte_at(1), b'0'...b'9'
             ) {
                 consume_numeric(tokenizer)
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('.')
             }
         }
-        '/' if tokenizer.starts_with("/*") => {
+        b'/' if tokenizer.starts_with(b"/*") => {
             tokenizer.advance(2); // consume "/*"
             let start_position = tokenizer.position();
             let content;
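
Throughout the rewritten match, byte literals (`b'+'`) and byte ranges (`b'0'...b'9'`) replace their char counterparts, and because `c` is now a `u8` while `Delim` still carries a `char`, each delimiter arm spells out its ASCII char literal (`Delim('+')`) instead of reusing `c`. The dispatch pattern in miniature (a hypothetical standalone function using modern `..=` range syntax, not the crate's API):

// Match on a byte, but build output from a known ASCII char literal
// rather than re-deriving it from the byte.
fn classify(byte: u8) -> &'static str {
    match byte {
        b'0'..=b'9' => "digit",
        b'+' | b'-' | b'.' => "maybe-numeric delimiter",
        _ if !byte.is_ascii() => "non-ASCII: decode as a char before use",
        _ => "other ASCII",
    }
}

fn main() {
    assert_eq!(classify(b'7'), "digit");
    assert_eq!(classify(b'+'), "maybe-numeric delimiter");
    assert_eq!(classify(0xC3), "non-ASCII: decode as a char before use"); // lead byte of 'é'
}
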
@@ -503,58 +503,59 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Option<Token<'a>> {
             }
             Comment(content)
         }
-        '0'...'9' => consume_numeric(tokenizer),
-        ':' => { tokenizer.advance(1); Colon },
-        ';' => { tokenizer.advance(1); Semicolon },
-        '<' => {
-            if tokenizer.starts_with("<!--") {
+        b'0'...b'9' => consume_numeric(tokenizer),
+        b':' => { tokenizer.advance(1); Colon },
+        b';' => { tokenizer.advance(1); Semicolon },
+        b'<' => {
+            if tokenizer.starts_with(b"<!--") {
                 tokenizer.advance(4);
                 CDO
             } else {
                 tokenizer.advance(1);
-                Delim(c)
+                Delim('<')
             }
         },
-        '@' => {
+        b'@' => {
             tokenizer.advance(1);
             if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
-            else { Delim(c) }
+            else { Delim('@') }
         },
-        'u' | 'U' => {
+        b'u' | b'U' => {
             if tokenizer.has_at_least(2)
-            && tokenizer.char_at(1) == '+'
-            && matches!(tokenizer.char_at(2), '0'...'9' | 'a'...'f' | 'A'...'F' | '?')
+            && tokenizer.byte_at(1) == b'+'
+            && matches!(tokenizer.byte_at(2), b'0'...b'9' | b'a'...b'f' | b'A'...b'F' | b'?')
             { consume_unicode_range(tokenizer) }
             else { consume_ident_like(tokenizer) }
         },
-        'a'...'z' | 'A'...'Z' | '_' | '\0' => consume_ident_like(tokenizer),
-        '[' => { tokenizer.advance(1); SquareBracketBlock },
-        '\\' => {
+        b'a'...b'z' | b'A'...b'Z' | b'_' | b'\0' => consume_ident_like(tokenizer),
+        b'[' => { tokenizer.advance(1); SquareBracketBlock },
+        b'\\' => {
             if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
-            else { tokenizer.advance(1); Delim(c) }
+            else { tokenizer.advance(1); Delim('\\') }
         },
-        ']' => { tokenizer.advance(1); CloseSquareBracket },
-        '^' => {
-            if tokenizer.starts_with("^=") { tokenizer.advance(2); PrefixMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b']' => { tokenizer.advance(1); CloseSquareBracket },
+        b'^' => {
+            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
+            else { tokenizer.advance(1); Delim('^') }
         },
-        '{' => { tokenizer.advance(1); CurlyBracketBlock },
-        '|' => {
-            if tokenizer.starts_with("|=") { tokenizer.advance(2); DashMatch }
-            else if tokenizer.starts_with("||") { tokenizer.advance(2); Column }
-            else { tokenizer.advance(1); Delim(c) }
+        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
+        b'|' => {
+            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
+            else if tokenizer.starts_with(b"||") { tokenizer.advance(2); Column }
+            else { tokenizer.advance(1); Delim('|') }
         },
-        '}' => { tokenizer.advance(1); CloseCurlyBracket },
-        '~' => {
-            if tokenizer.starts_with("~=") { tokenizer.advance(2); IncludeMatch }
-            else { tokenizer.advance(1); Delim(c) }
+        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
+        b'~' => {
+            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
+            else { tokenizer.advance(1); Delim('~') }
         },
         _ => {
-            if c > '\x7F' {  // Non-ASCII
+            if !c.is_ascii() {  // Non-ASCII
                 consume_ident_like(tokenizer)
             } else {
+                let ret = Delim(tokenizer.next_char());
                 tokenizer.advance(1);
-                Delim(c)
+                ret
             }
         },
     };
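
The catch-all arm is the one place that still decodes: an unmatched ASCII byte must become a `char` for `Delim`, so the new code calls `next_char()` and captures the result in `ret` before `advance(1)` moves the position that `next_char()` reads from. A sketch of why that ordering matters (hypothetical `Cursor` type, not the crate's):

// `next_char()` decodes at the current position, so it must run before
// the cursor moves.
struct Cursor<'a> { input: &'a str, position: usize }

impl<'a> Cursor<'a> {
    fn next_char(&self) -> char {
        // Decodes the UTF-8 sequence starting at `position`.
        self.input[self.position..].chars().next().unwrap()
    }
    fn advance(&mut self, n: usize) { self.position += n }
}

fn main() {
    let mut cursor = Cursor { input: "?rest", position: 0 };
    let delim = cursor.next_char(); // decode first...
    cursor.advance(1);              // ...then step past the one ASCII byte
    assert_eq!(delim, '?');
    assert_eq!(cursor.position, 1);
}
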
@@ -641,15 +642,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
 
 #[inline]
 fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
-    !tokenizer.is_eof() && match tokenizer.next_char() {
-        'a'...'z' | 'A'...'Z' | '_' | '\0' => true,
-        '-' => tokenizer.has_at_least(1) && match tokenizer.char_at(1) {
-            'a'...'z' | 'A'...'Z' | '-' | '_' | '\0' => true,
-            '\\' => !tokenizer.has_newline_at(1),
-            c => c > '\x7F',  // Non-ASCII
+    !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
+        b'a'...b'z' | b'A'...b'Z' | b'_' | b'\0' => true,
+        b'-' => tokenizer.has_at_least(1) && match tokenizer.byte_at(1) {
+            b'a'...b'z' | b'A'...b'Z' | b'-' | b'_' | b'\0' => true,
+            b'\\' => !tokenizer.has_newline_at(1),
+            c => !c.is_ascii(),
         },
-        '\\' => !tokenizer.has_newline_at(1),
-        c => c > '\x7F',  // Non-ASCII
+        b'\\' => !tokenizer.has_newline_at(1),
+        c => !c.is_ascii(),
     }
 }
 
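In `is_ident_start`, the non-ASCII test changes from `c > '\x7F'` on a decoded char to `!c.is_ascii()` on the lead byte; the two are equivalent because every scalar above U+007F is encoded with a lead byte of 0x80 or higher. A standalone check of that equivalence (assumed sample strings):

fn main() {
    // A UTF-8 lead byte is non-ASCII (>= 0x80) exactly when the decoded
    // scalar value is above U+007F, so both tests classify alike.
    for s in ["a", "\u{7F}", "é", "中"] {
        let first_byte = s.as_bytes()[0];
        let first_char = s.chars().next().unwrap();
        assert_eq!(!first_byte.is_ascii(), first_char > '\x7F');
    }
}
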