diff --git a/Cargo.toml b/Cargo.toml
index 355694c0..de9b769e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cssparser"
-version = "0.16.1"
+version = "0.17.0"
 authors = [ "Simon Sapin <simon.sapin@exyr.org>" ]
 description = "Rust implementation of CSS Syntax Level 3"
 documentation = "https://docs.rs/cssparser/"
 repository = "https://github.com/servo/rust-cssparser"
diff --git a/src/parser.rs b/src/parser.rs
index 75ddf86a..5a38b7cc 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -25,12 +25,12 @@ pub struct SourcePosition {
 pub enum BasicParseError<'a> {
     /// An unexpected token was encountered.
     UnexpectedToken(Token<'a>),
-    /// A particular token was expected but not found.
-    ExpectedToken(Token<'a>),
     /// The end of the input was encountered unexpectedly.
     EndOfInput,
     /// An `@` rule was encountered that was invalid.
-    AtRuleInvalid,
+    AtRuleInvalid(CompactCowStr<'a>),
+    /// The body of an '@' rule was invalid.
+    AtRuleBodyInvalid,
     /// A qualified rule was encountered that was invalid.
     QualifiedRuleInvalid,
 }
@@ -188,6 +188,11 @@ impl<'i: 't, 't> Parser<'i, 't> {
         }
     }
 
+    /// Return the current line that is being parsed.
+    pub fn current_line(&self) -> &'i str {
+        self.tokenizer.0.current_source_line()
+    }
+
     /// Check whether the input is exhausted. That is, if `.next()` would return a token.
     ///
     /// This ignores whitespace and comments.
@@ -357,9 +362,9 @@ impl<'i: 't, 't> Parser<'i, 't> {
     #[inline]
     pub fn parse_entirely<F, T, E>(&mut self, parse: F) -> Result<T, ParseError<'i, E>>
         where F: FnOnce(&mut Parser<'i, 't>) -> Result<T, ParseError<'i, E>> {
-        let result = parse(self);
+        let result = parse(self)?;
         self.expect_exhausted()?;
-        result
+        Ok(result)
     }
 
     /// Parse a list of comma-separated values, all with the same syntax.
@@ -482,8 +487,7 @@ impl<'i: 't, 't> Parser<'i, 't> {
         match self.next()? {
             Token::UnquotedUrl(value) => Ok(value),
             Token::Function(ref name) if name.eq_ignore_ascii_case("url") => {
-                self.parse_nested_block(|input| input.expect_string()
-                    .map_err(|e| ParseError::Basic(e)))
+                self.parse_nested_block(|input| input.expect_string().map_err(ParseError::Basic))
                     .map_err(ParseError::<()>::basic)
             },
             t => Err(BasicParseError::UnexpectedToken(t))
@@ -497,7 +501,7 @@ impl<'i: 't, 't> Parser<'i, 't> {
             Token::UnquotedUrl(value) => Ok(value),
             Token::QuotedString(value) => Ok(value),
             Token::Function(ref name) if name.eq_ignore_ascii_case("url") => {
-                self.parse_nested_block(|input| input.expect_string().map_err(|e| ParseError::Basic(e)))
+                self.parse_nested_block(|input| input.expect_string().map_err(ParseError::Basic))
                     .map_err(ParseError::<()>::basic)
             },
             t => Err(BasicParseError::UnexpectedToken(t))
diff --git a/src/rules_and_declarations.rs b/src/rules_and_declarations.rs
index 90f73409..e7bdf29e 100644
--- a/src/rules_and_declarations.rs
+++ b/src/rules_and_declarations.rs
@@ -116,7 +116,7 @@ pub trait AtRuleParser<'i> {
                          -> Result<AtRuleType<Self::Prelude, Self::AtRule>, ParseError<'i, Self::Error>> {
         let _ = name;
         let _ = input;
-        Err(ParseError::Basic(BasicParseError::AtRuleInvalid))
+        Err(ParseError::Basic(BasicParseError::AtRuleInvalid(name)))
     }
 
     /// Parse the content of a `{ /* ... */ }` block for the body of the at-rule.
@@ -131,7 +131,7 @@ pub trait AtRuleParser<'i> {
                        -> Result<Self::AtRule, ParseError<'i, Self::Error>> {
         let _ = prelude;
         let _ = input;
-        Err(ParseError::Basic(BasicParseError::AtRuleInvalid))
+        Err(ParseError::Basic(BasicParseError::AtRuleBodyInvalid))
     }
 
     /// An `OptionalBlock` prelude was followed by `;`.
@@ -257,9 +257,9 @@ where P: DeclarationParser<'i, Declaration = I, Error = E> +
             Ok(Token::AtKeyword(name)) => {
                 return Some(parse_at_rule(start_position, name, self.input, &mut self.parser))
             }
-            Ok(_) => {
+            Ok(t) => {
                 return Some(self.input.parse_until_after(Delimiter::Semicolon,
-                                                         |_| Err(ParseError::Basic(BasicParseError::ExpectedToken(Token::Semicolon))))
+                                                         |_| Err(ParseError::Basic(BasicParseError::UnexpectedToken(t))))
                             .map_err(|e| PreciseParseError {
                                 error: e,
                                 span: start_position..self.input.position()
@@ -462,16 +462,14 @@ fn parse_at_rule<'i: 't, 't, P, E>(start_position: SourcePosition, name: Compact
                 _ => unreachable!()
             }
         }
-        Err(_) => {
+        Err(error) => {
             let end_position = input.position();
-            let error = match input.next() {
-                Ok(Token::CurlyBracketBlock) => BasicParseError::UnexpectedToken(Token::CurlyBracketBlock),
-                Ok(Token::Semicolon) => BasicParseError::UnexpectedToken(Token::Semicolon),
-                Err(e) => e,
+            match input.next() {
+                Ok(Token::CurlyBracketBlock) | Ok(Token::Semicolon) | Err(_) => {},
                 _ => unreachable!()
             };
             Err(PreciseParseError {
-                error: ParseError::Basic(error),
+                error: error,
                 span: start_position..end_position,
             })
         }
diff --git a/src/serializer.rs b/src/serializer.rs
index fcfeaaf3..7347463b 100644
--- a/src/serializer.rs
+++ b/src/serializer.rs
@@ -129,8 +129,8 @@ impl<'a> ToCss for Token<'a> {
             Token::SquareBracketBlock => dest.write_str("[")?,
             Token::CurlyBracketBlock => dest.write_str("{")?,
 
-            Token::BadUrl => dest.write_str("url(<bad url>)")?,
-            Token::BadString => dest.write_str("\"<bad string>\n")?,
+            Token::BadUrl(_) => dest.write_str("url(<bad url>)")?,
+            Token::BadString(_) => dest.write_str("\"<bad string>\n")?,
             Token::CloseParenthesis => dest.write_str(")")?,
             Token::CloseSquareBracket => dest.write_str("]")?,
             Token::CloseCurlyBracket => dest.write_str("}")?,
@@ -376,7 +376,7 @@ impl<'a> Token<'a> {
         TokenSerializationType(match *self {
             Token::Ident(_) => Ident,
             Token::AtKeyword(_) | Token::Hash(_) | Token::IDHash(_) => AtKeywordOrHash,
-            Token::UnquotedUrl(_) | Token::BadUrl => UrlOrBadUrl,
+            Token::UnquotedUrl(_) | Token::BadUrl(_) => UrlOrBadUrl,
             Token::Delim('#') => DelimHash,
             Token::Delim('@') => DelimAt,
             Token::Delim('.') | Token::Delim('+') => DelimDotOrPlus,
@@ -400,7 +400,7 @@ impl<'a> Token<'a> {
             Token::ParenthesisBlock => OpenParen,
             Token::SquareBracketBlock | Token::CurlyBracketBlock |
             Token::CloseParenthesis | Token::CloseSquareBracket | Token::CloseCurlyBracket |
-            Token::QuotedString(_) | Token::BadString |
+            Token::QuotedString(_) | Token::BadString(_) |
             Token::Delim(_) | Token::Colon | Token::Semicolon | Token::Comma | Token::CDO |
             Token::IncludeMatch | Token::PrefixMatch | Token::SuffixMatch => Other,
         })
diff --git a/src/tests.rs b/src/tests.rs
index c05bd63d..454c1cb3 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -451,26 +451,26 @@ fn serialize_rgba_two_digit_float_if_roundtrips() {
 fn line_numbers() {
     let mut input = ParserInput::new("foo bar\nbaz\r\n\n\"a\\\r\nb\"");
     let mut input = Parser::new(&mut input);
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 0 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("foo".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 4 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 3 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace(" ")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 5 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 4 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("bar".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 8 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 0, column: 7 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace("\n")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 2, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 0 });
     assert_eq!(input.next_including_whitespace(), Ok(Token::Ident("baz".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 2, column: 4 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 1, column: 3 });
 
     let position = input.position();
     assert_eq!(input.next_including_whitespace(), Ok(Token::WhiteSpace("\r\n\n")));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 4, column: 1 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 3, column: 0 });
 
-    assert_eq!(input.source_location(position), SourceLocation { line: 2, column: 4 });
+    assert_eq!(input.source_location(position), SourceLocation { line: 1, column: 3 });
 
     assert_eq!(input.next_including_whitespace(), Ok(Token::QuotedString("ab".into())));
-    assert_eq!(input.current_source_location(), SourceLocation { line: 5, column: 3 });
+    assert_eq!(input.current_source_location(), SourceLocation { line: 4, column: 2 });
     assert!(input.next_including_whitespace().is_err());
 }
 
@@ -848,8 +848,8 @@ fn one_component_value_to_json(token: Token, input: &mut Parser) -> Json {
             v.extend(nested(input));
             v
         }),
-        Token::BadUrl => JArray!["error", "bad-url"],
-        Token::BadString => JArray!["error", "bad-string"],
+        Token::BadUrl(_) => JArray!["error", "bad-url"],
+        Token::BadString(_) => JArray!["error", "bad-string"],
         Token::CloseParenthesis => JArray!["error", ")"],
         Token::CloseSquareBracket => JArray!["error", "]"],
         Token::CloseCurlyBracket => JArray!["error", "}"],
@@ -920,3 +920,32 @@ fn parse_until_before_stops_at_delimiter_or_end_of_input() {
         }
     }
 }
+
+#[test]
+fn parser_maintains_current_line() {
+    let mut input = ParserInput::new("ident ident;\nident ident ident;\nident");
+    let mut parser = Parser::new(&mut input);
+    assert_eq!(parser.current_line(), "ident ident;");
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Semicolon));
+
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.current_line(), "ident ident ident;");
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.next(), Ok(Token::Semicolon));
+
+    assert_eq!(parser.next(), Ok(Token::Ident("ident".into())));
+    assert_eq!(parser.current_line(), "ident");
+}
+
+#[test]
+fn parse_entirely_reports_first_error() {
+    #[derive(PartialEq, Debug)]
+    enum E { Foo }
+    let mut input = ParserInput::new("ident");
+    let mut parser = Parser::new(&mut input);
+    let result: Result<(), _> = parser.parse_entirely(|_| Err(ParseError::Custom(E::Foo)));
+    assert_eq!(result, Err(ParseError::Custom(E::Foo)));
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index ff15bb28..2244e253 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -157,12 +157,12 @@ pub enum Token<'a> {
     /// A `<bad-url-token>`
     ///
     /// This token always indicates a parse error.
-    BadUrl,
+    BadUrl(CompactCowStr<'a>),
 
     /// A `<bad-string-token>`
     ///
     /// This token always indicates a parse error.
-    BadString,
+    BadString(CompactCowStr<'a>),
 
     /// A `<)-token>`
     ///
@@ -194,7 +194,7 @@ impl<'a> Token<'a> {
     pub fn is_parse_error(&self) -> bool {
         matches!(
             *self,
-            BadUrl | BadString | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
+            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
         )
     }
 }
@@ -226,7 +226,7 @@ impl<'a> Tokenizer<'a> {
             input: input,
             position: 0,
             last_known_source_location: Cell::new((SourcePosition(0),
-                                                   SourceLocation { line: 1, column: 1 })),
+                                                   SourceLocation { line: 0, column: 0 })),
             var_functions: SeenStatus::DontCare,
             viewport_percentages: SeenStatus::DontCare,
         }
@@ -287,6 +287,17 @@ impl<'a> Tokenizer<'a> {
         self.source_location(position)
     }
 
+    pub fn current_source_line(&self) -> &'a str {
+        let current = self.position;
+        let start = self.input[0..current]
+            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
+            .map_or(0, |start| start + 1);
+        let end = self.input[current..]
+            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
+            .map_or(self.input.len(), |end| current + end);
+        &self.input[start..end]
+    }
+
     pub fn source_location(&self, position: SourcePosition) -> SourceLocation {
         let target = position.0;
         let mut location;
@@ -301,7 +312,7 @@ impl<'a> Tokenizer<'a> {
             // So if the requested position is before the last known one,
             // start over from the beginning.
             position = 0;
-            location = SourceLocation { line: 1, column: 1 };
+            location = SourceLocation { line: 0, column: 0 };
         }
         let mut source = &self.input[position..target];
         while let Some(newline_position) = source.find(|c| matches!(c, '\n' | '\r' | '\x0C')) {
@@ -310,7 +321,7 @@ impl<'a> Tokenizer<'a> {
             source = &source[offset..];
             position += offset;
             location.line += 1;
-            location.column = 1;
+            location.column = 0;
         }
         debug_assert!(position <= target);
         location.column += (target - position) as u32;
@@ -386,10 +397,10 @@ pub struct SourcePosition(usize);
 
 /// The line and column number for a given position within the input.
 #[derive(PartialEq, Eq, Debug, Clone, Copy)]
 pub struct SourceLocation {
-    /// The line number, starting at 1 for the first line.
+    /// The line number, starting at 0 for the first line.
     pub line: u32,
 
-    /// The column number within a line, starting at 1 for the first character of the line.
+    /// The column number within a line, starting at 0 for the first character of the line.
     pub column: u32,
 }
@@ -556,14 +567,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
-        Err(()) => BadString
+        Err(value) => BadString(value)
     }
 }
 
 /// Return `Err(())` on syntax error (ie. unescaped newline)
 fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
-                             -> Result<CompactCowStr<'a>, ()> {
+                             -> Result<CompactCowStr<'a>, CompactCowStr<'a>> {
     tokenizer.advance(1);  // Skip the initial quote
     // start_pos is at code point boundary, after " or '
     let start_pos = tokenizer.position();
     let mut string_bytes;
@@ -596,7 +607,9 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                     string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
-                b'\n' | b'\r' | b'\x0C' => { return Err(()) },
+                b'\n' | b'\r' | b'\x0C' => {
+                    return Err(tokenizer.slice_from(start_pos).into())
+                },
                 _ => {}
             }
             tokenizer.consume_byte();
@@ -604,7 +617,12 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
 
     while !tokenizer.is_eof() {
         if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(());
+            return Err(
+                // string_bytes is well-formed UTF-8, see other comments.
+                unsafe {
+                    from_utf8_release_unchecked(string_bytes)
+                }.into()
+            );
         }
         let b = tokenizer.consume_byte();
         match_byte! { b,
@@ -1013,6 +1031,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
     }
 
     fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
+        let start_pos = tokenizer.position();
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
@@ -1023,7 +1042,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 _ => {},
             }
         }
-        BadUrl
+        BadUrl(tokenizer.slice_from(start_pos).into())
    }
 }
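
Taken together, this patch threads error context through the tokenizer: `BadUrl` and `BadString` now carry the text consumed so far, `AtRuleInvalid` carries the at-rule name, `SourceLocation` becomes zero-based, and `Parser::current_line` exposes the source line under the cursor. Below is a minimal sketch of consuming the new API, assuming a `cssparser = "0.17"` dependency; the `main` function is illustrative and not part of the patch.

    // Illustrative sketch, not part of the diff; assumes cssparser 0.17.
    extern crate cssparser;

    use cssparser::{Parser, ParserInput, Token};

    fn main() {
        // The unescaped newline makes this a <bad-string-token>; the
        // `BadString(CompactCowStr)` variant now carries the consumed contents.
        let mut input = ParserInput::new("\"not closed\n rest");
        let mut parser = Parser::new(&mut input);
        match parser.next_including_whitespace() {
            Ok(Token::BadString(contents)) => println!("bad string: {:?}", contents),
            other => println!("unexpected: {:?}", other),
        }
        // `current_line` returns the whole line being parsed, which is handy
        // when formatting error messages (locations are now 0-based).
        println!("while parsing line: {:?}", parser.current_line());
    }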