Commit 4507fe5

Author: bors-servo

Auto merge of servo#119 - servo:unsafe, r=emilio

Add comments to justify `unsafe` blocks.

2 parents: 570d83f + 6a6f157

1 file changed: src/tokenizer.rs (+63 −35)
@@ -576,6 +576,7 @@ fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
 fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                              -> Result<Cow<'a, str>, ()> {
     tokenizer.advance(1); // Skip the initial quote
+    // start_pos is at code point boundary, after " or '
     let start_pos = tokenizer.position();
     let mut string_bytes;
     loop {
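
The "code point boundary" comments added throughout this commit all rely on the same guarantee: a Rust `&str` can only be sliced at UTF-8 code point boundaries, so any position reached by skipping whole code points (or ASCII bytes) is safe to slice at. A minimal standalone sketch of that rule (illustrative code, not part of the diff):

    fn main() {
        let input = "a\u{e9}b"; // 'é' occupies two bytes in UTF-8
        assert!(input.is_char_boundary(1));  // before 'é'
        assert!(!input.is_char_boundary(2)); // inside 'é'
        assert!(input.is_char_boundary(3));  // after 'é'
        // Slicing at boundaries is fine; &input[1..2] would panic.
        assert_eq!(&input[1..3], "\u{e9}");
    }
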
@@ -598,6 +599,11 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 }
             }
             b'\\' | b'\0' => {
+                // * The tokenizer’s input is UTF-8 since it’s `&str`.
+                // * start_pos is at a code point boundary
+                // * so is the current position (which is before '\\' or '\0')
+                //
+                // So `string_bytes` is well-formed UTF-8.
                 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
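
The bullet list above is the soundness argument for the `unsafe` conversion later in the function: bytes copied out of a `&str` between two code point boundaries are well-formed UTF-8 by construction. The same argument as a standalone sketch, using the checked conversion so the invariant stays visible (illustrative, not from the commit):

    fn main() {
        let input = "caf\u{e9}\\6e";
        // '\\' is ASCII, so the position before it is a code point boundary.
        let end = input.find('\\').unwrap();
        let string_bytes = input[..end].as_bytes().to_owned();
        // The diff uses from_utf8_unchecked here; the checked form
        // demonstrates that the copied bytes are indeed well-formed UTF-8.
        assert_eq!(String::from_utf8(string_bytes).unwrap(), "caf\u{e9}");
    }
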
@@ -611,8 +617,8 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
             return Err(());
         }
-        let c = tokenizer.consume_byte();
-        match_byte! { c,
+        let b = tokenizer.consume_byte();
+        match_byte! { b,
             b'"' => {
                 if !single_quote {
                     break;
@@ -634,26 +640,29 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                                 tokenizer.advance(1);
                             }
                         }
+                        // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                     }
                 }
                 // else: escaped EOF, do nothing.
                 continue;
             }
             b'\0' => {
-                // string.push('\u{FFFD}'),
-                string_bytes.push(0xef);
-                string_bytes.push(0xbf);
-                string_bytes.push(0xbd);
+                string_bytes.extend("\u{FFFD}".as_bytes());
                 continue;
             }
             _ => {},
         }
 
-        string_bytes.push(c);
+        // If this byte is part of a multi-byte code point,
+        // we’ll end up copying the whole code point before this loop does something else.
+        string_bytes.push(b);
     }
 
-    Ok(Owned(to_utf8(string_bytes)))
+    Ok(Owned(
+        // string_bytes is well-formed UTF-8, see other comments.
+        unsafe { from_utf8_release_unchecked(string_bytes) }
+    ))
 }
 
 
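The replaced `push(0xef); push(0xbf); push(0xbd)` sequence and the new `extend("\u{FFFD}".as_bytes())` write identical bytes; the string literal simply lets the compiler supply the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER instead of hard-coding it. A quick standalone check (illustrative, not from the commit):

    fn main() {
        // The literal's UTF-8 bytes are exactly the hand-written triple.
        assert_eq!("\u{FFFD}".as_bytes(), &[0xef, 0xbf, 0xbd]);
        let mut string_bytes: Vec<u8> = Vec::new();
        string_bytes.extend("\u{FFFD}".as_bytes());
        assert_eq!(string_bytes, [0xef, 0xbf, 0xbd]);
    }
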
@@ -695,6 +704,7 @@ fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 }
 
 fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
+    // start_pos is the end of the previous token, therefore at a code point boundary
     let start_pos = tokenizer.position();
     let mut value_bytes;
     loop {
@@ -704,11 +714,16 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
         match_byte! { tokenizer.next_byte_unchecked(),
             b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => { tokenizer.advance(1) },
             b'\\' | b'\0' => {
+                // * The tokenizer’s input is UTF-8 since it’s `&str`.
+                // * start_pos is at a code point boundary
+                // * so is the current position (which is before '\\' or '\0')
+                //
+                // So `value_bytes` is well-formed UTF-8.
                 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
-            c => {
-                if c.is_ascii() {
+            b => {
+                if b.is_ascii() {
                     return Borrowed(tokenizer.slice_from(start_pos));
                 }
                 tokenizer.advance(1);
@@ -717,34 +732,37 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> Cow<'a, str> {
     }
 
     while !tokenizer.is_eof() {
-        let c = tokenizer.next_byte_unchecked();
-        match_byte! { c,
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
             b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'_' | b'-' => {
                 tokenizer.advance(1);
-                value_bytes.push(c)
+                value_bytes.push(b) // ASCII
             }
             b'\\' => {
                 if tokenizer.has_newline_at(1) { break }
                 tokenizer.advance(1);
+                // This pushes one well-formed code point
                 consume_escape_and_write(tokenizer, &mut value_bytes)
             }
             b'\0' => {
                 tokenizer.advance(1);
-                // value.push('\u{FFFD}')
-                value_bytes.push(0xef);
-                value_bytes.push(0xbf);
-                value_bytes.push(0xbd);
+                value_bytes.extend("\u{FFFD}".as_bytes());
             },
             _ => {
-                if c.is_ascii() {
+                if b.is_ascii() {
                     break;
                 }
                 tokenizer.advance(1);
-                value_bytes.push(c)
+                // This byte *is* part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                value_bytes.push(b)
             }
         }
     }
-    Owned(to_utf8(value_bytes))
+    Owned(
+        // value_bytes is well-formed UTF-8, see other comments.
+        unsafe { from_utf8_release_unchecked(value_bytes) }
+    )
 }
 
 
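For readers following the `Borrowed`/`Owned` split in `consume_name`: the function returns a zero-copy `Borrowed` slice as long as the output matches the input byte-for-byte, and allocates `value_bytes` only once an escape or a NUL byte forces the two to diverge. A simplified sketch of that pattern (hypothetical helper, not from the commit):

    use std::borrow::Cow::{self, Borrowed, Owned};

    fn name_like(input: &str) -> Cow<str> {
        match input.find('\\') {
            // Fast path: nothing needs rewriting, borrow the input as-is.
            None => Borrowed(input),
            Some(i) => {
                // Copy the clean prefix, then decode the remainder into the
                // buffer (a stand-in for the real escape handling).
                let mut value_bytes = input[..i].as_bytes().to_owned();
                value_bytes.extend("\u{FFFD}".as_bytes());
                Owned(String::from_utf8(value_bytes).unwrap())
            }
        }
    }
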
@@ -867,17 +885,16 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
 
 
 #[inline]
-fn to_utf8(string_bytes: Vec<u8>) -> String {
+unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
     if cfg!(debug_assertions) {
         String::from_utf8(string_bytes).unwrap()
     } else {
-        unsafe {
-            String::from_utf8_unchecked(string_bytes)
-        }
+        String::from_utf8_unchecked(string_bytes)
     }
 }
 
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
+    // This is only called after "url(", so the current position is a code point boundary.
     for (offset, c) in tokenizer.input[tokenizer.position..].bytes().enumerate() {
         match_byte! { c,
             b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
888905
}
889906
_ => {
890907
tokenizer.advance(offset);
908+
// This function only consumed ASCII (whitespace) bytes,
909+
// so the current position is a code point boundary.
891910
return Ok(consume_unquoted_url_internal(tokenizer))
892911
}
893912
}
@@ -896,6 +915,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     return Ok(UnquotedUrl(Borrowed("")));
 
     fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+        // This function is only called with start_pos at a code point boundary.
         let start_pos = tokenizer.position();
         let mut string_bytes: Vec<u8>;
         loop {
@@ -919,6 +939,11 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                     return consume_bad_url(tokenizer)
                 },
                 b'\\' | b'\0' => {
+                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
+                    // * start_pos is at a code point boundary
+                    // * so is the current position (which is before '\\' or '\0')
+                    //
+                    // So `string_bytes` is well-formed UTF-8.
                     string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
@@ -930,7 +955,10 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
-                    return consume_url_end(tokenizer, Owned(to_utf8(string_bytes)));
+                    return consume_url_end(tokenizer, Owned(
+                        // string_bytes is well-formed UTF-8, see other comments.
+                        unsafe { from_utf8_release_unchecked(string_bytes) }
+                    ))
                 }
                 b')' => {
                     break;
@@ -944,18 +972,21 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
                         return consume_bad_url(tokenizer)
                     }
 
+                    // This pushes one well-formed code point to string_bytes
                     consume_escape_and_write(tokenizer, &mut string_bytes)
                 },
                 b'\0' => {
-                    // string.push('\u{FFFD}');
-                    string_bytes.push(0xef);
-                    string_bytes.push(0xbf);
-                    string_bytes.push(0xbd);
+                    string_bytes.extend("\u{FFFD}".as_bytes());
                 }
-                c => { string_bytes.push(c) }
+                // If this byte is part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                b => { string_bytes.push(b) }
             }
         }
-        UnquotedUrl(Owned(to_utf8(string_bytes)))
+        UnquotedUrl(Owned(
+            // string_bytes is well-formed UTF-8, see other comments.
+            unsafe { from_utf8_release_unchecked(string_bytes) }
+        ))
     }
 
     fn consume_url_end<'a>(tokenizer: &mut Tokenizer<'a>, string: Cow<'a, str>) -> Token<'a> {
@@ -1039,11 +1070,8 @@ fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
 
 // Same constraints as consume_escape except it writes into `bytes` the result
 // instead of returning it.
-//
-// TODO: This could be made more efficient with char::encode_utf8, I guess.
 fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
-    use std::io::Write;
-    write!(bytes, "{}", consume_escape(tokenizer)).unwrap();
+    bytes.extend(consume_escape(tokenizer).encode_utf8(&mut [0; 4]).as_bytes())
 }
 
 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
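
This last hunk resolves the removed TODO: instead of routing the escaped code point through `std::io::Write` and the formatting machinery, `char::encode_utf8` writes it into a four-byte stack buffer (four bytes is the maximum UTF-8 length of any code point). Both paths produce identical bytes; a standalone comparison (illustrative, not from the commit):

    use std::io::Write;

    fn main() {
        let c = '\u{1F4A9}'; // a four-byte code point
        let mut via_write = Vec::new();
        write!(via_write, "{}", c).unwrap(); // old approach
        let mut via_encode = Vec::new();
        via_encode.extend(c.encode_utf8(&mut [0; 4]).as_bytes()); // new approach
        assert_eq!(via_write, via_encode);
    }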
