
Utf 16 columns #192

Merged 3 commits on Sep 1, 2017
1 change: 1 addition & 0 deletions src/parser.rs
@@ -811,6 +811,7 @@ pub fn parse_until_after<'i: 't, 't, F, T, E>(parser: &mut Parser<'i, 't>,
    let next_byte = (parser.input.tokenizer).next_byte();
    if next_byte.is_some() && !parser.stop_before.contains(Delimiters::from_byte(next_byte)) {
        debug_assert!(delimiters.contains(Delimiters::from_byte(next_byte)));
        // We know this byte is ASCII.
        (parser.input.tokenizer).advance(1);
        if next_byte == Some(b'{') {
            consume_until_end_of_block(BlockType::CurlyBracket, &mut parser.input.tokenizer);
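The comment added to parse_until_after documents the invariant that makes advance(1) safe here: the delimiter bytes handled at this point are ASCII punctuation, and an ASCII character occupies exactly one UTF-8 byte and one UTF-16 code unit, so skipping a single byte keeps both the byte position and the new UTF-16 column in sync. A minimal standalone sketch of that property (the characters listed are illustrative, not the exact Delimiters set):

fn main() {
    // Illustrative ASCII delimiter characters; the real set is defined by the
    // Delimiters bitflags in parser.rs.
    for c in ['{', '}', ';', '!', ','] {
        assert!(c.is_ascii());
        assert_eq!(c.len_utf8(), 1);  // advance(1) steps over the whole character
        assert_eq!(c.len_utf16(), 1); // and the UTF-16 column advances by one
    }
}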
50 changes: 50 additions & 0 deletions src/tests.rs
@@ -1049,3 +1049,53 @@ fn roundtrip_percentage_token() {
        }
    }
}

#[test]
fn utf16_columns() {
    // This particular test serves two purposes. First, it checks
    // that the column number computations are correct. Second, it
    // checks that tokenizer code paths correctly differentiate
    // between the different UTF-8 encoding bytes. In particular
    // different leader bytes and continuation bytes are treated
    // differently, so we make sure to include all lengths in the
    // tests, using the string "QΡ✈🆒". Also, remember that because
    // the column is in units of UTF-16, the 4-byte sequence results
    // in two columns.
    let tests = vec![
        ("", 0),
        ("ascii", 5),
        ("/*QΡ✈🆒*/", 9),
        ("'QΡ✈🆒*'", 8),
        ("\"\\\"'QΡ✈🆒*'", 11),
        ("\\Q\\Ρ\\✈\\🆒", 9),
        ("QΡ✈🆒", 5),
        ("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 14),
        ("newline\r\nQΡ✈🆒", 5),
        ("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 19),
        ("url(QΡ✈🆒)", 10),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 15),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 14),
        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 16),
        ("QΡ✈🆒()", 7),
        // Test that under/over-flow of current_line_start_position is
        // handled properly; see the special case in consume_4byte_intro.
        ("🆒", 2),
    ];

    for test in tests {
        let mut input = ParserInput::new(test.0);
        let mut parser = Parser::new(&mut input);

        // Read all tokens.
        loop {
            match parser.next() {
                Err(BasicParseError::EndOfInput) => { break; }
                Err(_) => { assert!(false); }
                Ok(_) => {}
            };
        }

        // Check the resulting column.
        assert_eq!(parser.current_source_location().column, test.1);
    }
}
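For reference, the expected value of 5 for the bare "QΡ✈🆒" entry follows directly from counting UTF-16 code units rather than bytes or scalar values: Q, Ρ, and ✈ each encode to a single code unit, while 🆒 lies outside the Basic Multilingual Plane and encodes to a surrogate pair. A small sketch of that arithmetic using only the standard library, assuming the column starts at 0 as in the test above:

fn main() {
    let s = "QΡ✈🆒";

    // Per-character UTF-16 widths: 1 + 1 + 1 + 2.
    let widths: Vec<usize> = s.chars().map(char::len_utf16).collect();
    assert_eq!(widths, vec![1, 1, 1, 2]);

    // Starting from column 0, the column after the whole string is the
    // total number of UTF-16 code units, matching the ("QΡ✈🆒", 5) case.
    assert_eq!(s.encode_utf16().count(), 5);
}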