From b63e2a9f513eb38b5f10b96f9e213ac4d7dacfb7 Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Sat, 19 Aug 2017 17:35:59 +0200
Subject: [PATCH 1/2] Move check_for_source_map calls inside consume_comment

---
 src/tokenizer.rs | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 425de66b..c8923f41 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -514,9 +514,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         }
         b'/' => {
             if tokenizer.starts_with(b"/*") {
-                let contents = consume_comment(tokenizer);
-                check_for_source_map(tokenizer, contents);
-                Comment(contents)
+                Comment(consume_comment(tokenizer))
             } else {
                 tokenizer.advance(1);
                 Delim('/')
@@ -627,7 +625,9 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             tokenizer.advance(1);
             if tokenizer.next_byte() == Some(b'/') {
                 tokenizer.advance(1);
-                return tokenizer.slice(start_position..end_position)
+                let contents = tokenizer.slice(start_position..end_position);
+                check_for_source_map(tokenizer, contents);
+                return contents
             }
         }
         b'\n' | b'\x0C' => {
@@ -643,7 +643,9 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             }
         }
     }
-    tokenizer.slice_from(start_position)
+    let contents = tokenizer.slice_from(start_position);
+    check_for_source_map(tokenizer, contents);
+    contents
 }
 
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {

From e4624cad8bd7c05cb454b97049504a88706a6623 Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Sat, 19 Aug 2017 17:39:14 +0200
Subject: [PATCH 2/2] Add Parser::skip_whitespace (and use it in rule parsing
 to make it hopefully more efficient.)

---
 Cargo.toml                    |  2 +-
 src/parser.rs                 | 37 ++++++++++++++----
 src/rules_and_declarations.rs | 64 +++++++++++++++++++++-----------
 src/tokenizer.rs              | 70 +++++++++++++++++++++++++++++++++++
 4 files changed, 142 insertions(+), 31 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index b259cd99..8f1249b9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cssparser"
-version = "0.19.2"
+version = "0.19.3"
 authors = [ "Simon Sapin <simon.sapin@exyr.org>" ]
 description = "Rust implementation of CSS Syntax Level 3"
diff --git a/src/parser.rs b/src/parser.rs
index 9e957f17..534eefc5 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -288,6 +288,32 @@ impl<'i: 't, 't> Parser<'i, 't> {
         }
     }
 
+    /// Advance the input until the next token that’s not whitespace or a comment.
+    #[inline]
+    pub fn skip_whitespace(&mut self) {
+        // If we just consumed `{`, `[`, `(`, or `function(`, leave whitespace
+        // or comments inside the block or function up to the nested parser.
+        if self.at_start_of.is_some() {
+            return
+        }
+
+        self.input.tokenizer.skip_whitespace()
+    }
+
+    #[inline]
+    pub(crate) fn skip_cdc_and_cdo(&mut self) {
+        self.input.tokenizer.skip_cdc_and_cdo()
+    }
+
+    #[inline]
+    pub(crate) fn next_byte(&self) -> Option<u8> {
+        let byte = self.input.tokenizer.next_byte();
+        if self.stop_before.contains(Delimiters::from_byte(byte)) {
+            return None
+        }
+        byte
+    }
+
     /// Restore the internal state of the parser (including position within the input)
     /// to what was previously saved by the `Parser::position` method.
     ///
@@ -364,14 +390,8 @@ impl<'i: 't, 't> Parser<'i, 't> {
     ///
     /// This only returns a closing token when it is unmatched (and therefore an error).
     pub fn next(&mut self) -> Result<&Token<'i>, BasicParseError<'i>> {
-        loop {
-            match self.next_including_whitespace_and_comments() {
-                Err(e) => return Err(e),
-                Ok(&Token::WhiteSpace(_)) | Ok(&Token::Comment(_)) => {},
-                _ => break
-            }
-        }
-        Ok(self.input.cached_token_ref())
+        self.skip_whitespace();
+        self.next_including_whitespace_and_comments()
     }
 
     /// Same as `Parser::next`, but does not skip whitespace tokens.
@@ -459,6 +479,7 @@ impl<'i: 't, 't> Parser<'i, 't> {
         where F: for<'tt> FnMut(&mut Parser<'i, 'tt>) -> Result<T, ParseError<'i, E>> {
         let mut values = vec![];
         loop {
+            self.skip_whitespace();  // Unnecessary for correctness, but may help try() in parse_one rewind less.
             values.push(self.parse_until_before(Delimiter::Comma, &mut parse_one)?);
             match self.next() {
                 Err(_) => return Ok(values),
diff --git a/src/rules_and_declarations.rs b/src/rules_and_declarations.rs
index 8cccea2f..4e8ef438 100644
--- a/src/rules_and_declarations.rs
+++ b/src/rules_and_declarations.rs
@@ -342,15 +342,29 @@ where P: QualifiedRuleParser<'i, QualifiedRule = R, Error = E> +
 
     fn next(&mut self) -> Option<Result<R, PreciseParseError<'i, E>>> {
         loop {
+            if self.is_stylesheet {
+                self.input.skip_cdc_and_cdo()
+            } else {
+                self.input.skip_whitespace()
+            }
             let start = self.input.state();
-            // FIXME: remove intermediate variable when lifetimes are non-lexical
-            let at_keyword = match self.input.next_including_whitespace_and_comments() {
-                Ok(&Token::WhiteSpace(_)) | Ok(&Token::Comment(_)) => continue,
-                Ok(&Token::CDO) | Ok(&Token::CDC) if self.is_stylesheet => continue,
-                Ok(&Token::AtKeyword(ref name)) => Some(name.clone()),
-                Ok(_) => None,
-                Err(_) => return None,
-            };
+
+            let at_keyword;
+            match self.input.next_byte() {
+                Some(b'@') => {
+                    match self.input.next_including_whitespace_and_comments() {
+                        Ok(&Token::AtKeyword(ref name)) => at_keyword = Some(name.clone()),
+                        _ => at_keyword = None,
+                    }
+                    // FIXME: move this back inside `match` when lifetimes are non-lexical
+                    if at_keyword.is_none() {
+                        self.input.reset(&start)
+                    }
+                }
+                Some(_) => at_keyword = None,
+                None => return None
+            }
+
             if let Some(name) = at_keyword {
                 let first_stylesheet_rule = self.is_stylesheet && !self.any_rule_so_far;
                 self.any_rule_so_far = true;
@@ -362,7 +376,6 @@
                 }
             } else {
                 self.any_rule_so_far = true;
-                self.input.reset(&start);
                 return Some(parse_qualified_rule(self.input, &mut self.parser)
                             .map_err(|e| PreciseParseError {
                                 error: e,
@@ -400,20 +413,27 @@ pub fn parse_one_rule<'i, 't, R, P, E>(input: &mut Parser<'i, 't>, parser: &mut P)
 where P: QualifiedRuleParser<'i, QualifiedRule = R, Error = E> +
          AtRuleParser<'i, AtRule = R, Error = E> {
     input.parse_entirely(|input| {
-        loop {
-            let start = input.state();
-            // FIXME: remove intermediate variable when lifetimes are non-lexical
-            let at_keyword = match *input.next_including_whitespace_and_comments()? {
-                Token::WhiteSpace(_) | Token::Comment(_) => continue,
-                Token::AtKeyword(ref name) => Some(name.clone()),
-                _ => None
-            };
-            if let Some(name) = at_keyword {
-                return parse_at_rule(&start, name, input, parser).map_err(|e| e.error)
-            } else {
-                input.reset(&start);
-                return parse_qualified_rule(input, parser)
+        input.skip_whitespace();
+        let start = input.state();
+
+        let at_keyword;
+        if input.next_byte() == Some(b'@') {
+            match *input.next_including_whitespace_and_comments()? {
+                Token::AtKeyword(ref name) => at_keyword = Some(name.clone()),
+                _ => at_keyword = None,
+            }
+            // FIXME: move this back inside `match` when lifetimes are non-lexical
+            if at_keyword.is_none() {
+                input.reset(&start)
             }
+        } else {
+            at_keyword = None
+        }
+
+        if let Some(name) = at_keyword {
+            parse_at_rule(&start, name, input, parser).map_err(|e| e.error)
+        } else {
+            parse_qualified_rule(input, parser)
         }
     })
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index c8923f41..f7c13f43 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -411,6 +411,76 @@ impl<'a> Tokenizer<'a> {
     fn starts_with(&self, needle: &[u8]) -> bool {
         self.input.as_bytes()[self.position..].starts_with(needle)
     }
+
+    pub fn skip_whitespace(&mut self) {
+        while !self.is_eof() {
+            match_byte! { self.next_byte_unchecked(),
+                b' ' | b'\t' => {
+                    self.advance(1)
+                },
+                b'\n' | b'\x0C' => {
+                    self.advance(1);
+                    self.seen_newline(false);
+                },
+                b'\r' => {
+                    self.advance(1);
+                    self.seen_newline(true);
+                },
+                b'/' => {
+                    if self.starts_with(b"/*") {
+                        consume_comment(self);
+                    } else {
+                        return
+                    }
+                }
+                _ => {
+                    return
+                }
+            }
+        }
+    }
+
+    pub fn skip_cdc_and_cdo(&mut self) {
+        while !self.is_eof() {
+            match_byte! { self.next_byte_unchecked(),
+                b' ' | b'\t' => {
+                    self.advance(1)
+                },
+                b'\n' | b'\x0C' => {
+                    self.advance(1);
+                    self.seen_newline(false);
+                },
+                b'\r' => {
+                    self.advance(1);
+                    self.seen_newline(true);
+                },
+                b'/' => {
+                    if self.starts_with(b"/*") {
+                        consume_comment(self);
+                    } else {
+                        return
+                    }
+                }
+                b'<' => {
+                    if self.starts_with(b"<!--") {
+                        self.advance(4)
+                    } else {
+                        return
+                    }
+                }
+                b'-' => {
+                    if self.starts_with(b"-->") {
+                        self.advance(3)
+                    } else {
+                        return
+                    }
+                }
+                _ => {
+                    return
+                }
+            }
+        }
+    }
 }
 
 /// A position from the start of the input, counted in UTF-8 bytes.
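
For reference, a minimal sketch of the caller-visible behaviour these two patches combine to give (not part of the patches; it assumes the 0.19-era public API — `ParserInput::new`, `Parser::new`, `Parser::next`, `Token::Delim` — plus the `skip_whitespace` method added above; the `main` wrapper and the sample CSS string are made up for illustration):

    use cssparser::{Parser, ParserInput, Token};

    fn main() {
        let css = "/* comment */   .a { color: red }";
        let mut input = ParserInput::new(css);
        let mut parser = Parser::new(&mut input);

        // After PATCH 2, `Parser::next` is `skip_whitespace` followed by
        // `next_including_whitespace_and_comments`: the leading comment and
        // spaces are consumed byte-by-byte inside the tokenizer instead of
        // being built into `Comment`/`WhiteSpace` tokens that the old loop
        // returned and immediately discarded. The first token the caller
        // sees is the `.` of the selector.
        assert_eq!(parser.next(), Ok(&Token::Delim('.')));

        // PATCH 1 keeps source-map detection working on this fast path:
        // `check_for_source_map` now runs inside `consume_comment`, so
        // comments skipped here are still inspected even though no
        // `Comment` token is ever produced.
        parser.skip_whitespace();
    }

The design point is that rule parsing previously had to round-trip every whitespace and comment token through the `next_including_whitespace_and_comments` machinery just to throw it away; skipping them in the tokenizer (and peeking at the next byte for `@`, `<!--`, and `-->`) avoids that work entirely.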