From 5bab829437184f17dc3a546a55734fca435f9323 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:05:32 -0400
Subject: [PATCH 1/9] Add API to retrieve contents of current line being parsed.

---
 src/parser.rs    |  5 +++++
 src/tests.rs     | 19 +++++++++++++++++++
 src/tokenizer.rs | 11 +++++++++++
 3 files changed, 35 insertions(+)

diff --git a/src/parser.rs b/src/parser.rs
index 75ddf86a..2a2c0ff4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -188,6 +188,11 @@ impl<'i: 't, 't> Parser<'i, 't> {
         }
     }
 
+    /// Return the current line that is being parsed.
+    pub fn current_line(&self) -> &'i str {
+        self.tokenizer.0.current_source_line()
+    }
+
     /// Check whether the input is exhausted. That is, if `.next()` would return a token.
     ///
     /// This ignores whitespace and comments.
diff --git a/src/tests.rs b/src/tests.rs
index c05bd63d..8d067308 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -920,3 +920,22 @@ fn parse_until_before_stops_at_delimiter_or_end_of_input() {
         }
     }
 }
+
+#[test]
+fn parser_maintains_current_line() {
+    let mut input = ParserInput::new("ident ident;\nident ident ident;\nident");
+    let mut parser = Parser::new(&mut input);
+    assert_eq!(parser.current_line(), "ident ident;");
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Semicolon));
+
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.current_line(), "ident ident ident;");
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Semicolon));
+
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.current_line(), "ident");
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index ff15bb28..b1c6eb01 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -287,6 +287,17 @@ impl<'a> Tokenizer<'a> {
         self.source_location(position)
     }
 
+    pub fn current_source_line(&self) -> &'a str {
+        let current = self.position;
+        let start = self.input[0..current]
+            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
+            .map_or(0, |start| start + 1);
+        let end = self.input[current..]
+            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
+            .map_or(self.input.len(), |end| current + end);
+        &self.input[start..end]
+    }
+
     pub fn source_location(&self, position: SourcePosition) -> SourceLocation {
         let target = position.0;
         let mut location;
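
Patch 1 exposes the line the tokenizer is currently positioned on. The
tokenizer-level current_source_line() scans backward and forward from the
current position to the nearest line break ('\r', '\n', or form feed), so the
returned slice borrows straight from the original input with no allocation.
A minimal sketch of the intended use, quoting the offending line in a
diagnostic; the diagnostic() helper is hypothetical, not part of the crate:

    use cssparser::{Parser, ParserInput};

    // Hypothetical helper: format a message that quotes the line the
    // parser is currently looking at, via the new current_line() API.
    fn diagnostic(parser: &Parser, message: &str) -> String {
        let loc = parser.current_source_location();
        format!("{}:{}: {}\n    {}",
                loc.line, loc.column, message, parser.current_line())
    }

    fn main() {
        let mut input = ParserInput::new("color: red;\nbogus~~;");
        let mut parser = Parser::new(&mut input);
        // Drain the token stream; next() returns Err at end of input.
        while parser.next().is_ok() {}
        println!("{}", diagnostic(&parser, "unexpected end of input"));
    }
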
From f71a5a2a3669dafe5f10c4ef153ed609292d31f7 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:06:48 -0400
Subject: [PATCH 2/9] Differentiate between default errors for at-rule parsing.

---
 src/parser.rs                 | 4 +++-
 src/rules_and_declarations.rs | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index 2a2c0ff4..e831af74 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -30,7 +30,9 @@ pub enum BasicParseError<'a> {
     /// The end of the input was encountered unexpectedly.
     EndOfInput,
     /// An `@` rule was encountered that was invalid.
-    AtRuleInvalid,
+    AtRuleInvalid(CompactCowStr<'a>),
+    /// The body of an `@` rule was invalid.
+    AtRuleBodyInvalid,
     /// A qualified rule was encountered that was invalid.
     QualifiedRuleInvalid,
 }
diff --git a/src/rules_and_declarations.rs b/src/rules_and_declarations.rs
index 90f73409..975ee5b3 100644
--- a/src/rules_and_declarations.rs
+++ b/src/rules_and_declarations.rs
@@ -116,7 +116,7 @@ pub trait AtRuleParser<'i> {
                          -> Result<AtRuleType<Self::Prelude, Self::AtRule>, ParseError<'i, Self::Error>> {
         let _ = name;
         let _ = input;
-        Err(ParseError::Basic(BasicParseError::AtRuleInvalid))
+        Err(ParseError::Basic(BasicParseError::AtRuleInvalid(name)))
     }
 
     /// Parse the content of a `{ /* ... */ }` block for the body of the at-rule.
@@ -131,7 +131,7 @@ pub trait AtRuleParser<'i> {
                        -> Result<Self::AtRule, ParseError<'i, Self::Error>> {
         let _ = prelude;
         let _ = input;
-        Err(ParseError::Basic(BasicParseError::AtRuleInvalid))
+        Err(ParseError::Basic(BasicParseError::AtRuleBodyInvalid))
     }
 
     /// An `OptionalBlock` prelude was followed by `;`.

From 0c8a8a93e8483e81803bdf1c5c409501830b75b9 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:11:55 -0400
Subject: [PATCH 3/9] parse_entirely should not prioritize EndOfInput errors.

---
 src/parser.rs | 4 ++--
 src/tests.rs  | 10 ++++++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index e831af74..f7b61a43 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -364,9 +364,9 @@ impl<'i: 't, 't> Parser<'i, 't> {
     #[inline]
     pub fn parse_entirely<F, T, E>(&mut self, parse: F) -> Result<T, ParseError<'i, E>>
         where F: FnOnce(&mut Parser<'i, 't>) -> Result<T, ParseError<'i, E>> {
-        let result = parse(self);
+        let result = parse(self)?;
         self.expect_exhausted()?;
-        result
+        Ok(result)
     }
 
     /// Parse a list of comma-separated values, all with the same syntax.
diff --git a/src/tests.rs b/src/tests.rs
index 8d067308..d294b8a0 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -939,3 +939,13 @@ fn parser_maintains_current_line() {
     assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
     assert_eq!(parser.current_line(), "ident");
 }
+
+#[test]
+fn parse_entirely_reports_first_error() {
+    #[derive(PartialEq, Debug)]
+    enum E { Foo }
+    let mut input = ParserInput::new("ident");
+    let mut parser = Parser::new(&mut input);
+    let result: Result<(), _> = parser.parse_entirely(|_| Err(ParseError::Custom(E::Foo)));
+    assert_eq!(result, Err(ParseError::Custom(E::Foo)));
+}
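
Because parse(self) now carries the `?`, a failure inside the closure is
returned as-is, and expect_exhausted() only gets a chance to complain about
trailing input when the closure itself succeeded. A sketch of the observable
difference, along the lines of the new test above (the MyError type is
illustrative only):

    use cssparser::{ParseError, Parser, ParserInput};

    #[derive(Debug, PartialEq)]
    enum MyError { NotANumber }

    fn main() {
        let mut input = ParserInput::new("4px junk");
        let mut parser = Parser::new(&mut input);
        // The closure fails first, so parse_entirely reports that error
        // rather than an error about the unconsumed "junk".
        let result: Result<(), _> =
            parser.parse_entirely(|_| Err(ParseError::Custom(MyError::NotANumber)));
        assert_eq!(result, Err(ParseError::Custom(MyError::NotANumber)));
    }
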
From b0c8cdf2a90a27c48ae63706bb40e6c5890be37a Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:12:29 -0400
Subject: [PATCH 4/9] Style cleanup for existing parser code.

---
 src/parser.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index f7b61a43..edd797df 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -489,8 +489,7 @@ impl<'i: 't, 't> Parser<'i, 't> {
         match self.next()? {
             Token::UnquotedUrl(value) => Ok(value),
             Token::Function(ref name) if name.eq_ignore_ascii_case("url") => {
-                self.parse_nested_block(|input| input.expect_string()
-                    .map_err(|e| ParseError::Basic(e)))
+                self.parse_nested_block(|input| input.expect_string().map_err(ParseError::Basic))
                 .map_err(ParseError::<()>::basic)
             },
             t => Err(BasicParseError::UnexpectedToken(t))
@@ -504,7 +503,7 @@
             Token::UnquotedUrl(value) => Ok(value),
             Token::QuotedString(value) => Ok(value),
             Token::Function(ref name) if name.eq_ignore_ascii_case("url") => {
-                self.parse_nested_block(|input| input.expect_string().map_err(|e| ParseError::Basic(e)))
+                self.parse_nested_block(|input| input.expect_string().map_err(ParseError::Basic))
                 .map_err(ParseError::<()>::basic)
             },
             t => Err(BasicParseError::UnexpectedToken(t))

From 0d93af098f5875177e8c040981fa4f2721e0ba25 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:14:37 -0400
Subject: [PATCH 5/9] Report actual unexpected token in declarations.

---
 src/parser.rs                 | 2 --
 src/rules_and_declarations.rs | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/parser.rs b/src/parser.rs
index edd797df..5a38b7cc 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -25,8 +25,6 @@ pub struct SourcePosition {
 pub enum BasicParseError<'a> {
     /// An unexpected token was encountered.
     UnexpectedToken(Token<'a>),
-    /// A particular token was expected but not found.
-    ExpectedToken(Token<'a>),
     /// The end of the input was encountered unexpectedly.
     EndOfInput,
diff --git a/src/rules_and_declarations.rs b/src/rules_and_declarations.rs
index 975ee5b3..7fb8b788 100644
--- a/src/rules_and_declarations.rs
+++ b/src/rules_and_declarations.rs
@@ -257,9 +257,9 @@ where P: DeclarationParser<'i, Declaration = I, Error = E> +
             Ok(Token::AtKeyword(name)) => {
                 return Some(parse_at_rule(start_position, name, self.input, &mut self.parser))
             }
-            Ok(_) => {
+            Ok(t) => {
                 return Some(self.input.parse_until_after(Delimiter::Semicolon,
-                                                         |_| Err(ParseError::Basic(BasicParseError::ExpectedToken(Token::Semicolon))))
+                                                         |_| Err(ParseError::Basic(BasicParseError::UnexpectedToken(t))))
                             .map_err(|e| PreciseParseError {
                                 error: e,
                                 span: start_position..self.input.position()
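
Two notes on the pair of patches above. The cleanup in patch 4 leans on
tuple-variant constructors such as ParseError::Basic being ordinary
functions, so they can be passed to map_err directly instead of being
wrapped in a closure. Patch 5 then retires the vague
ExpectedToken(Token::Semicolon) in favor of reporting the token the
declaration parser actually saw. A self-contained illustration of the
constructor-as-function idiom (the Error enum is a stand-in, not the
crate's type):

    #[derive(Debug, PartialEq)]
    enum Error<E> {
        Basic(E),
    }

    fn main() {
        let r: Result<u32, &str> = Err("bad token");
        // The closure form and the bare constructor are equivalent:
        let a = r.map_err(|e| Error::Basic(e));
        let b = r.map_err(Error::Basic);
        assert_eq!(a, b);
    }
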
From c4c8f78e0f28280ca9f7f90cf0eba5a6fcaeefed Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:15:25 -0400
Subject: [PATCH 6/9] Propagate error from parsing at rules.

---
 src/rules_and_declarations.rs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/rules_and_declarations.rs b/src/rules_and_declarations.rs
index 7fb8b788..e7bdf29e 100644
--- a/src/rules_and_declarations.rs
+++ b/src/rules_and_declarations.rs
@@ -462,16 +462,14 @@ fn parse_at_rule<'i: 't, 't, P, E>(start_position: SourcePosition, name: Compact
                 _ => unreachable!()
             }
         }
-        Err(_) => {
+        Err(error) => {
             let end_position = input.position();
-            let error = match input.next() {
-                Ok(Token::CurlyBracketBlock) => BasicParseError::UnexpectedToken(Token::CurlyBracketBlock),
-                Ok(Token::Semicolon) => BasicParseError::UnexpectedToken(Token::Semicolon),
-                Err(e) => e,
+            match input.next() {
+                Ok(Token::CurlyBracketBlock) | Ok(Token::Semicolon) | Err(_) => {},
                 _ => unreachable!()
             };
             Err(PreciseParseError {
-                error: ParseError::Basic(error),
+                error: error,
                 span: start_position..end_position,
             })
         }
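
With patches 2 and 6 combined, a rejected at-rule now surfaces as
AtRuleInvalid carrying the rule's name, and parse_at_rule hands that error
through its recovery path unchanged. A sketch of what a consumer observes,
assuming the trait shapes quoted in patch 2; Noop is an illustrative parser
that leaves every default method in place:

    use cssparser::{AtRuleParser, BasicParseError, ParseError, Parser,
                    ParserInput, QualifiedRuleParser, RuleListParser};

    struct Noop;

    // Only the associated types are supplied; the default parse_prelude
    // rejects every at-rule with AtRuleInvalid(name).
    impl<'i> AtRuleParser<'i> for Noop {
        type Prelude = ();
        type AtRule = ();
        type Error = ();
    }

    impl<'i> QualifiedRuleParser<'i> for Noop {
        type Prelude = ();
        type QualifiedRule = ();
        type Error = ();
    }

    fn main() {
        let mut input = ParserInput::new("@media screen {}");
        let mut parser = Parser::new(&mut input);
        let mut rules = RuleListParser::new_for_stylesheet(&mut parser, Noop);
        let err = rules.next().unwrap().unwrap_err();
        match err.error {
            // The at-rule's name survives into the error value.
            ParseError::Basic(BasicParseError::AtRuleInvalid(name)) => {
                assert_eq!(&*name, "media")
            }
            _ => panic!("expected AtRuleInvalid"),
        }
    }
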
From 5a2d392ee3121c687e3d1d235d5750e6927e2a04 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:16:22 -0400
Subject: [PATCH 7/9] Store bad string and url values.

---
 src/serializer.rs |  8 ++++----
 src/tests.rs      |  4 ++--
 src/tokenizer.rs  | 24 ++++++++++++++++--------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/serializer.rs b/src/serializer.rs
index fcfeaaf3..7347463b 100644
--- a/src/serializer.rs
+++ b/src/serializer.rs
@@ -129,8 +129,8 @@ impl<'a> ToCss for Token<'a> {
             Token::SquareBracketBlock => dest.write_str("[")?,
             Token::CurlyBracketBlock => dest.write_str("{")?,
 
-            Token::BadUrl => dest.write_str("url(<bad url>)")?,
-            Token::BadString => dest.write_str("\"\n")?,
+            Token::BadUrl(_) => dest.write_str("url(<bad url>)")?,
+            Token::BadString(_) => dest.write_str("\"\n")?,
             Token::CloseParenthesis => dest.write_str(")")?,
             Token::CloseSquareBracket => dest.write_str("]")?,
             Token::CloseCurlyBracket => dest.write_str("}")?,
@@ -376,7 +376,7 @@ impl<'a> Token<'a> {
         TokenSerializationType(match *self {
             Token::Ident(_) => Ident,
             Token::AtKeyword(_) | Token::Hash(_) | Token::IDHash(_) => AtKeywordOrHash,
-            Token::UnquotedUrl(_) | Token::BadUrl => UrlOrBadUrl,
+            Token::UnquotedUrl(_) | Token::BadUrl(_) => UrlOrBadUrl,
             Token::Delim('#') => DelimHash,
             Token::Delim('@') => DelimAt,
             Token::Delim('.') | Token::Delim('+') => DelimDotOrPlus,
@@ -400,7 +400,7 @@ impl<'a> Token<'a> {
             Token::ParenthesisBlock => OpenParen,
             Token::SquareBracketBlock | Token::CurlyBracketBlock |
             Token::CloseParenthesis | Token::CloseSquareBracket | Token::CloseCurlyBracket |
-            Token::QuotedString(_) | Token::BadString |
+            Token::QuotedString(_) | Token::BadString(_) |
             Token::Delim(_) | Token::Colon | Token::Semicolon | Token::Comma | Token::CDO |
             Token::IncludeMatch | Token::PrefixMatch | Token::SuffixMatch => Other,
diff --git a/src/tests.rs b/src/tests.rs
index d294b8a0..010e722c 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -848,8 +848,8 @@ fn one_component_value_to_json(token: Token, input: &mut Parser) -> Json {
             v.extend(nested(input));
             v
         }),
-        Token::BadUrl => JArray!["error", "bad-url"],
-        Token::BadString => JArray!["error", "bad-string"],
+        Token::BadUrl(_) => JArray!["error", "bad-url"],
+        Token::BadString(_) => JArray!["error", "bad-string"],
         Token::CloseParenthesis => JArray!["error", ")"],
         Token::CloseSquareBracket => JArray!["error", "]"],
        Token::CloseCurlyBracket => JArray!["error", "}"],
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index b1c6eb01..157da76d 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -157,12 +157,12 @@ pub enum Token<'a> {
     /// A `<bad-url-token>`
     ///
     /// This token always indicates a parse error.
-    BadUrl,
+    BadUrl(CompactCowStr<'a>),
 
     /// A `<bad-string-token>`
     ///
     /// This token always indicates a parse error.
-    BadString,
+    BadString(CompactCowStr<'a>),
 
     /// A `<)-token>`
     ///
@@ -194,7 +194,7 @@ impl<'a> Token<'a> {
     pub fn is_parse_error(&self) -> bool {
         matches!(
             *self,
-            BadUrl | BadString | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
+            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
         )
     }
 }
@@ -567,14 +567,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
-        Err(()) => BadString
+        Err(value) => BadString(value)
     }
 }
 
 /// Return `Err(())` on syntax error (ie. unescaped newline)
 fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
-                             -> Result<CompactCowStr<'a>, ()> {
+                             -> Result<CompactCowStr<'a>, CompactCowStr<'a>> {
     tokenizer.advance(1);  // Skip the initial quote
     // start_pos is at code point boundary, after " or '
     let start_pos = tokenizer.position();
@@ -607,7 +607,9 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
-            b'\n' | b'\r' | b'\x0C' => { return Err(()) },
+            b'\n' | b'\r' | b'\x0C' => {
+                return Err(tokenizer.slice_from(start_pos).into())
+            },
             _ => {}
         }
         tokenizer.consume_byte();
@@ -614,7 +616,12 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
     }
     while !tokenizer.is_eof() {
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(());
+            return Err(
+                // string_bytes is well-formed UTF-8, see other comments.
+                unsafe {
+                    from_utf8_release_unchecked(string_bytes)
+                }.into()
+            );
         }
         let b = tokenizer.consume_byte();
         match_byte! { b,
@@ -1024,6 +1031,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
     }
 
     fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+        let start_pos = tokenizer.position();
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
@@ -1030,7 +1038,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 b')' => break,
                 _ => {},
             }
         }
-        BadUrl
+        BadUrl(tokenizer.slice_from(start_pos).into())
     }
 }
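
Tokens that used to be bare markers now carry the raw text consumed before
the error, which an error reporter can quote verbatim. A sketch of the new
shape; note that bad-string and bad-url tokens are still yielded as ordinary
tokens from next():

    use cssparser::{Parser, ParserInput, Token};

    fn main() {
        // An unescaped newline turns the string into a <bad-string-token>;
        // the token now carries the partial contents seen before the break.
        let mut input = ParserInput::new("\"abc\ndef");
        let mut parser = Parser::new(&mut input);
        match parser.next() {
            Ok(Token::BadString(raw)) => assert_eq!(&*raw, "abc"),
            other => panic!("expected BadString, got {:?}", other),
        }
    }
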
From a8e2252c1f7d68df17d90516d3798e4bba32e89c Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:16:43 -0400
Subject: [PATCH 8/9] Make column and line numbers match Gecko's CSS parser.

---
 src/tests.rs     | 18 +++++++++---------
 src/tokenizer.rs | 10 +++++-----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/tests.rs b/src/tests.rs
index 010e722c..454c1cb3 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -451,26 +451,26 @@ fn serialize_rgba_two_digit_float_if_roundtrips() {
 fn line_numbers() {
     let mut input = ParserInput::new("foo bar\nbaz\r\n\n\"a\\\r\nb\"");
     let mut input = Parser::new(&mut input);
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 0 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("foo".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 4 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 3 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace(" ")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 5 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 4 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("bar".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 8 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 7 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace("\n")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 2, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 0 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("baz".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 2, column: 4 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 3 });
 
     let position = input.position();
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace("\r\n\n")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 4, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 3, column: 0 });
 
-    assert_eq!(input.source_location(position), SourceLocation { line: 2, column: 4 });
+    assert_eq!(input.source_location(position), SourceLocation { line: 1, column: 3 });
 
     assert_eq!(input.next_including_whitespace(), Ok(Token::QuotedString("ab".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 5, column: 3 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 4, column: 2 });
     assert!(input.next_including_whitespace().is_err());
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 157da76d..2244e253 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -226,7 +226,7 @@ impl<'a> Tokenizer<'a> {
             input: input,
             position: 0,
             last_known_source_location: Cell::new((SourcePosition(0),
-                                                   SourceLocation { line: 1, column: 1 })),
+                                                   SourceLocation { line: 0, column: 0 })),
             var_functions: SeenStatus::DontCare,
             viewport_percentages: SeenStatus::DontCare,
         }
@@ -312,7 +312,7 @@ impl<'a> Tokenizer<'a> {
             // So if the requested position is before the last known one,
             // start over from the beginning.
             position = 0;
-            location = SourceLocation { line: 1, column: 1 };
+            location = SourceLocation { line: 0, column: 0 };
         }
         let mut source = &self.input[position..target];
         while let Some(newline_position) = source.find(|c| matches!(c, '\n' | '\r' | '\x0C')) {
@@ -321,7 +321,7 @@ impl<'a> Tokenizer<'a> {
            source = &source[offset..];
             position += offset;
             location.line += 1;
-            location.column = 1;
+            location.column = 0;
         }
         debug_assert!(position <= target);
         location.column += (target - position) as u32;
@@ -397,10 +397,10 @@ pub struct SourcePosition(usize);
 
 /// The line and column number for a given position within the input.
 #[derive(PartialEq, Eq, Debug, Clone, Copy)]
 pub struct SourceLocation {
-    /// The line number, starting at 1 for the first line.
+    /// The line number, starting at 0 for the first line.
     pub line: u32,
 
-    /// The column number within a line, starting at 1 for first the character of the line.
+    /// The column number within a line, starting at 0 for the first character of the line.
     pub column: u32,
 }

From c6156c00072fadc10855867a817bac5020480604 Mon Sep 17 00:00:00 2001
From: Josh Matthews <josh@joshmatthews.net>
Date: Tue, 4 Jul 2017 18:17:20 -0400
Subject: [PATCH 9/9] Increase package version.
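
After patch 8 both coordinates are zero-based, matching the convention
Gecko's CSS parser uses, so embedders no longer have to convert between the
two schemes. Anything formatting positions for human readers now has to add
one back itself; a small sketch:

    use cssparser::{Parser, ParserInput, SourceLocation};

    fn main() {
        let mut input = ParserInput::new("foo");
        let parser = Parser::new(&mut input);
        let loc = parser.current_source_location();
        assert_eq!(loc, SourceLocation { line: 0, column: 0 });
        // Diagnostics are conventionally displayed 1-based:
        println!("at line {}, column {}", loc.line + 1, loc.column + 1);
    }
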
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 355694c0..de9b769e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 
 name = "cssparser"
-version = "0.16.1"
+version = "0.17.0"
 authors = [ "Simon Sapin <simon.sapin@exyr.org>" ]
 
 description = "Rust implementation of CSS Syntax Level 3"