diff --git a/Makefile.in b/Makefile.in index 99ec0a5b..dd90481c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -14,7 +14,7 @@ RUST_SRC=$(shell find $(VPATH)/. -type f -name '*.rs') $(COLOR_DATA_RS) all: libcssparser.dummy libcssparser.dummy: cssparser.rc $(RUST_SRC) - $(RUSTC) $(RUSTFLAGS) $< -o $@ + $(RUSTC) $(RUSTFLAGS) $< touch $@ cssparser-test: cssparser.rc $(RUST_SRC) diff --git a/README.md b/README.md index 5608c027..eb1c6ccf 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,14 @@ rust-cssparser ============== -WIP rust implementation of the 2013 version of +Rust implementation of the 2013 version of [css3-syntax](http://dev.w3.org/csswg/css3-syntax/) TODO ---- -* [x] Tokenization -* [x] Declaration and rule parsing -* [ ] Detect character encoding & decode from bytes -* [ ] Track line/column number for tokens. -* [ ] Figure out float and integer overflow -* [ ] Make it fast! +* Detect character encoding & decode from bytes +* Figure out float and integer overflow +* Serialize tokens back to CSS +* Make it fast! diff --git a/ast.rs b/ast.rs index 4d996a25..adf0f65a 100644 --- a/ast.rs +++ b/ast.rs @@ -3,6 +3,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ use std::str::ToStr; +use std::vec; #[deriving(Eq)] @@ -20,13 +21,16 @@ pub struct SourceLocation { } +pub type Node = (ComponentValue, SourceLocation); // TODO this is not a good name + + #[deriving(Eq)] pub enum ComponentValue { - // Preserved tokens. Same as in the tokenizer. + // Preserved tokens. Ident(~str), AtKeyword(~str), Hash(~str), - IDHash(~str), // Hash token that is a valid ID selector. + IDHash(~str), // Hash that is a valid ID selector. 
String(~str), URL(~str), Delim(char), @@ -39,7 +43,7 @@ pub enum ComponentValue { Colon, // : Semicolon, // ; Comma, // , - IncludeMath, // ~= + IncludeMatch, // ~= DashMatch, // |= PrefixMatch, // ^= SuffixMatch, // $= @@ -49,12 +53,12 @@ pub enum ComponentValue { CDC, // --> // Function - Function(~str, ~[(ComponentValue, SourceLocation)]), // name, arguments + Function(~str, ~[ComponentValue]), // name, arguments // Simple block - ParenthesisBlock(~[(ComponentValue, SourceLocation)]), // (…) - SquareBracketBlock(~[(ComponentValue, SourceLocation)]), // […] - CurlyBracketBlock(~[(ComponentValue, SourceLocation)]), // {…} + ParenthesisBlock(~[ComponentValue]), // (…) + SquareBracketBlock(~[ComponentValue]), // […] + CurlyBracketBlock(~[Node]), // {…} // These are always invalid BadURL, @@ -69,23 +73,23 @@ pub enum ComponentValue { pub struct Declaration { location: SourceLocation, name: ~str, - value: ~[(ComponentValue, SourceLocation)], + value: ~[ComponentValue], important: bool, } #[deriving(Eq)] pub struct QualifiedRule { location: SourceLocation, - prelude: ~[(ComponentValue, SourceLocation)], - block: ~[(ComponentValue, SourceLocation)], + prelude: ~[ComponentValue], + block: ~[Node], } #[deriving(Eq)] pub struct AtRule { location: SourceLocation, name: ~str, - prelude: ~[(ComponentValue, SourceLocation)], - block: Option<~[(ComponentValue, SourceLocation)]>, + prelude: ~[ComponentValue], + block: Option<~[Node]>, } #[deriving(Eq)] @@ -101,6 +105,12 @@ pub enum Rule { AtRule(AtRule), } +#[deriving(Eq)] +pub struct SyntaxError { + location: SourceLocation, + reason: ErrorReason, +} + #[deriving(Eq)] pub enum ErrorReason { ErrEmptyInput, // Parsing a single "thing", found only whitespace. 
@@ -111,6 +121,56 @@ pub enum ErrorReason { // This is meant to be extended } -impl ToStr for ErrorReason { - fn to_str(&self) -> ~str { fmt!("%?", self) } +impl ToStr for SyntaxError { + fn to_str(&self) -> ~str { + fmt!("%u:%u %?", self.location.line, self.location.column, self.reason) + } +} + + +pub trait SkipWhitespaceIterable<'self> { + pub fn skip_whitespace(self) -> SkipWhitespaceIterator<'self>; +} + +impl<'self> SkipWhitespaceIterable<'self> for &'self [ComponentValue] { + pub fn skip_whitespace(self) -> SkipWhitespaceIterator<'self> { + SkipWhitespaceIterator{ iter_with_whitespace: self.iter() } + } +} + +pub struct SkipWhitespaceIterator<'self> { + iter_with_whitespace: vec::VecIterator<'self, ComponentValue>, +} + +impl<'self> Iterator<&'self ComponentValue> for SkipWhitespaceIterator<'self> { + fn next(&mut self) -> Option<&'self ComponentValue> { + for component_value in self.iter_with_whitespace { + if component_value != &WhiteSpace { return Some(component_value) } + } + None + } +} + + +pub trait MoveSkipWhitespaceIterable { + pub fn move_skip_whitespace(self) -> MoveSkipWhitespaceIterator; +} + +impl MoveSkipWhitespaceIterable for ~[ComponentValue] { + pub fn move_skip_whitespace(self) -> MoveSkipWhitespaceIterator { + MoveSkipWhitespaceIterator{ iter_with_whitespace: self.move_iter() } + } +} + +pub struct MoveSkipWhitespaceIterator { + iter_with_whitespace: vec::MoveIterator, +} + +impl Iterator for MoveSkipWhitespaceIterator { + fn next(&mut self) -> Option { + for component_value in self.iter_with_whitespace { + if component_value != WhiteSpace { return Some(component_value) } + } + None + } } diff --git a/color.rs b/color.rs index 10bf8f13..356d82f8 100644 --- a/color.rs +++ b/color.rs @@ -1,3 +1,7 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + use std::libc::c_float; use std::ascii::to_ascii_lower; @@ -76,7 +80,7 @@ fn parse_color_hash(value: &str) -> Option { #[inline] -fn parse_color_function(name: &str, arguments: &[(ComponentValue, SourceLocation)]) +fn parse_color_function(name: &str, arguments: &[ComponentValue]) -> Option { let lower_name = to_ascii_lower(name); @@ -87,9 +91,7 @@ fn parse_color_function(name: &str, arguments: &[(ComponentValue, SourceLocation else if "hsla" == lower_name { (false, true) } else { return None }; - let mut iter = do arguments.iter().filter_map |&(ref c, _)| { - if c != &WhiteSpace { Some(c) } else { None } - }; + let mut iter = arguments.skip_whitespace(); macro_rules! expect_comma( () => ( if iter.next() != Some(&Comma) { return None } ); ) diff --git a/css-parsing-tests/An+B.json b/css-parsing-tests/An+B.json new file mode 100644 index 00000000..3f4e89b2 --- /dev/null +++ b/css-parsing-tests/An+B.json @@ -0,0 +1,151 @@ +[ + +"", null, +" \n", null, + +"odd", [2, 1], +"even", [2, 0], +"ödd", null, +"éven", null, +" /**/\t OdD /**/\n", [2, 1], +" /**/\t EveN /**/\n", [2, 0], + + +"3", [0, 3], +"+2 ", [0, 2], +" -14 ", [0, -14], +"+ 2 ", null, +"- 14 ", null, +"3.1", null, + +"3N", [3, 0], +"+2N ", [2, 0], +" -14n ", [-14, 0], +"+ 2N ", null, +"- 14N ", null, +"3.1N", null, +"3 n", null, + +" N", [1, 0], +" +n", [1, 0], +" -n", [-1, 0], +"+ n", null, +"- n", null, + + +"3N+1", [3, 1], +"+2n+1 ", [2, 1], +" -14n+1 ", [-14, 1], +"+ 2N+1 ", null, +"- 14n+1 ", null, +"3.1n+1", null, +"3 n+1", null, + +" n+1", [1, 1], +" +N+1", [1, 1], +" -n+1", [-1, 1], +"+ N+1", null, +"- N+1", null, + +"3n-1", [3, -1], +"+2N-1 ", [2, -1], +" -14n-1 ", [-14, -1], +"+ 2N-1 ", null, +"- 14N-1 ", null, +"3.1n-1", null, +"3 n-1", null, + +" n-1", [1, -1], +" +n-1", [1, -1], +" -n-1", [-1, -1], +"+ n-1", null, +"- n-1", null, + + +"3N +1", [3, 1], +"+2N +1 ", [2, 1], +" -14n +1 ", [-14, 1], +"+ 2N +1 ", null, +"- 14n +1 ", null, +"3.1N +1", null, +"3 n +1", null, + +" n +1", [1, 1], +" 
+N +1", [1, 1], +" -n +1", [-1, 1], +"+ n +1", null, +"- N +1", null, + +"3N -1", [3, -1], +"+2n -1 ", [2, -1], +" -14n -1 ", [-14, -1], +"+ 2n -1 ", null, +"- 14N -1 ", null, +"3.1N -1", null, +"3 N -1", null, + +" N -1", [1, -1], +" +N -1", [1, -1], +" -n -1", [-1, -1], +"+ n -1", null, +"- n -1", null, + + +"3n+ 1", [3, 1], +"+2n+ 1 ", [2, 1], +" -14n+ 1 ", [-14, 1], +"+ 2n+ 1 ", null, +"- 14N+ 1 ", null, +"3.1n+ 1", null, +"3 N+ 1", null, + +" N+ 1", [1, 1], +" +N+ 1", [1, 1], +" -N+ 1", [-1, 1], +"+ n+ 1", null, +"- N+ 1", null, + +"3n- 1", [3, -1], +"+2N- 1 ", [2, -1], +" -14N- 1 ", [-14, -1], +"+ 2N- 1 ", null, +"- 14n- 1 ", null, +"3.1n- 1", null, +"3 n- 1", null, + +" N- 1", [1, -1], +" +N- 1", [1, -1], +" -n- 1", [-1, -1], +"+ n- 1", null, +"- N- 1", null, + + +"3N + 1", [3, 1], +"+2N + 1 ", [2, 1], +" -14n + 1 ", [-14, 1], +"+ 2n + 1 ", null, +"- 14N + 1 ", null, +"3.1n + 1", null, +"3 N + 1", null, + +" n + 1", [1, 1], +" +n + 1", [1, 1], +" -N + 1", [-1, 1], +"+ N + 1", null, +"- N + 1", null, + +"3N - 1", [3, -1], +"+2n - 1 ", [2, -1], +" -14n - 1 ", [-14, -1], +"+ 2N - 1 ", null, +"- 14N - 1 ", null, +"3.1N - 1", null, +"3 n - 1", null, + +" N - 1", [1, -1], +" +n - 1", [1, -1], +" -n - 1", [-1, -1], +"+ N - 1", null, +"- N - 1", null + +] diff --git a/css-parsing-tests/README.rst b/css-parsing-tests/README.rst index 955e6359..7531ec53 100644 --- a/css-parsing-tests/README.rst +++ b/css-parsing-tests/README.rst @@ -102,6 +102,17 @@ associated with the expected result. are between 0 and 255. This file is generated the ``make_color3_keywords.py`` Python script. +``an+b.json`` + Tests the `an+b `_ + syntax defined in CSS Syntax Level 3. + This `differs `_ from the + `nth grammar rule `_ + in Selectors Level 3 only in that + ``-`` characters and digits can be escaped in some cases. + The Unicode input is represented by a JSON string, + the output as null for invalid syntax, + or an array of two integers ``[A, B]``.
+ Result representation ===================== diff --git a/cssparser.rc b/cssparser.rc index 303b4d8b..20dd3051 100644 --- a/cssparser.rc +++ b/cssparser.rc @@ -7,10 +7,17 @@ extern mod extra; -pub mod tokenizer; -pub mod parser; -pub mod ast; -pub mod color; +pub use ast::*; +pub use tokenizer::*; +pub use parser::*; +pub use color::*; +pub use nth::*; + +mod ast; +mod tokenizer; +mod parser; +mod color; +mod nth; #[cfg(test)] mod tests; diff --git a/make_color_data.py b/make_color_data.py index 7c5f9891..af7c957b 100644 --- a/make_color_data.py +++ b/make_color_data.py @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + COLORS = sorted([ ('transparent', (0, 0, 0, 0)), diff --git a/nth.rs b/nth.rs new file mode 100644 index 00000000..0d3e4257 --- /dev/null +++ b/nth.rs @@ -0,0 +1,123 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use std::i32; +use std::ascii::to_ascii_lower; +use ast::*; + + +/// Parse the An+B notation, as found in the ``:nth-child()`` selector. +/// The input is typically the arguments of a function component value. +/// Return Some((A, B)), or None for a syntax error. 
+pub fn parse_nth(input: &[ComponentValue]) -> Option<(i32, i32)> { + let iter = &mut input.skip_whitespace(); + match iter.next() { + Some(&Number(ref value)) => match value.int_value { + Some(b) => parse_end(iter, 0, b as i32), + _ => None, + }, + Some(&Dimension(ref value, ref unit)) => match value.int_value { + Some(a) => { + let unit: &str = to_ascii_lower(unit.as_slice()); + match unit { + "n" => parse_b(iter, a as i32), + "n-" => parse_signless_b(iter, a as i32, -1), + _ => match(parse_n_dash_digits(unit)) { + Some(b) => parse_end(iter, a as i32, b), + _ => None + }, + } + }, + _ => None, + }, + Some(&Ident(ref value)) => { + let ident: &str = to_ascii_lower(value.as_slice()); + match ident { + "even" => parse_end(iter, 2, 0), + "odd" => parse_end(iter, 2, 1), + "n" => parse_b(iter, 1), + "-n" => parse_b(iter, -1), + "n-" => parse_signless_b(iter, 1, -1), + "-n-" => parse_signless_b(iter, -1, -1), + _ if ident.starts_with("-") => match(parse_n_dash_digits(ident.slice_from(1))) { + Some(b) => parse_end(iter, -1, b), + _ => None + }, + _ => match(parse_n_dash_digits(ident)) { + Some(b) => parse_end(iter, 1, b), + _ => None + }, + } + }, + Some(&Delim('+')) => match iter.iter_with_whitespace.next() { + Some(&Ident(ref value)) => { + let ident: &str = to_ascii_lower(value.as_slice()); + match ident { + "n" => parse_b(iter, 1), + "n-" => parse_signless_b(iter, 1, -1), + _ => match(parse_n_dash_digits(ident)) { + Some(b) => parse_end(iter, 1, b), + _ => None + }, + } + }, + _ => None + }, + _ => None + } +} + + +type Nth = Option<(i32, i32)>; +type Iter<'self> = SkipWhitespaceIterator<'self>; + +fn parse_b(iter: &mut Iter, a: i32) -> Nth { + match iter.next() { + None => Some((a, 0)), + Some(&Delim('+')) => parse_signless_b(iter, a, 1), + Some(&Delim('-')) => parse_signless_b(iter, a, -1), + Some(&Number(ref value)) => match value.int_value { + Some(b) if has_sign(value) => parse_end(iter, a, b as i32), + _ => None, + }, + _ => None + } +} + +fn 
parse_signless_b(iter: &mut Iter, a: i32, b_sign: i32) -> Nth { + match iter.next() { + Some(&Number(ref value)) => match value.int_value { + Some(b) if !has_sign(value) => parse_end(iter, a, b_sign * (b as i32)), + _ => None, + }, + _ => None + } +} + +fn parse_end(iter: &mut Iter, a: i32, b: i32) -> Nth { + match iter.next() { + None => Some((a, b)), + Some(_) => None, + } +} + +fn parse_n_dash_digits(string: &str) -> Option { + if string.len() >= 3 + && string.starts_with("n-") + && string.slice_from(2).iter().all(|c| match c { '0'..'9' => true, _ => false }) + { + let result = i32::from_str(string.slice_from(1)); // Include the minus sign + assert!(result.is_some()); + result + } + else { None } +} + +#[inline] +fn has_sign(value: &NumericValue) -> bool { + match value.representation[0] as char { + '+' | '-' => true, + _ => false + } +} diff --git a/parser.rs b/parser.rs index d83ffc1f..38538da4 100644 --- a/parser.rs +++ b/parser.rs @@ -3,68 +3,110 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ // http://dev.w3.org/csswg/css-syntax/#parsing -// -// The input to the tree construction stage is a sequence of tokens -// from the tokenization stage. -// The output is a tree of items with a stylesheet at the root -// and all other nodes being at-rules, style rules, or declarations. + +/// The input to these functions needs to implement Iterator<(ComponentValue, SourceLocation)>. +/// The input is consumed to avoid doing a lot of copying. +/// A conforming input can be obtained: +/// +/// * From a string in CSS syntax, with tokenize() +/// * From a ~[(ComponentValue, SourceLocation)] vector +/// (as found in "nested" component values such as CurlyBracketBlock), +/// with v.consume_iter() use std::iterator::Iterator; -use std::vec; use std::ascii::eq_ignore_ascii_case; use ast::*; -use tokenizer::*; -// TODO: Use a trait? 
-enum ComponentValueIterator { - ParserIter(~Parser), - VectorIter(vec::ConsumeIterator<(ComponentValue, SourceLocation)>), +/// Parse top-level of a CSS stylesheet. +/// Return a Iterator> +#[inline] +pub fn parse_stylesheet_rules>(iter: T) -> StylesheetParser { + StylesheetParser{ iter: iter } } -impl ComponentValueIterator { - #[inline] - pub fn from_str(input: ~str) -> ComponentValueIterator { - ComponentValueIterator::from_parser(~Parser::from_str(input)) - } +/// Parse a non-top level list of rules eg. the content of an @media rule. +/// Return a Iterator> +#[inline] +pub fn parse_rule_list>(iter: T) -> RuleListParser { + RuleListParser{ iter: iter } +} - #[inline] - pub fn from_parser(parser: ~Parser) -> ComponentValueIterator { - ParserIter(parser) - } - #[inline] - pub fn from_vector(values: ~[(ComponentValue, SourceLocation)]) -> ComponentValueIterator { - VectorIter(values.consume_iter()) +/// Parse a list of declarations and at-rules, +/// like @page in CSS 2.1, all declaration lists in level 3 +/// Return a Iterator> +#[inline] +pub fn parse_declaration_list>(iter: T) -> DeclarationListParser { + DeclarationListParser{ iter: iter } +} + + +/// Parse a single rule. +/// Used eg. for CSSRuleList.insertRule() +pub fn parse_one_rule>(iter: T) -> Result { + let mut parser = RuleListParser{ iter: iter }; + match parser.next() { + None => error(START_LOCATION, ErrEmptyInput), + Some(result) => { + if result.is_err() { result } + else { match next_non_whitespace(&mut parser.iter) { + None => result, + Some((_component_value, location)) => error(location, ErrExtraInput), + }} + } } +} - #[inline] - pub fn next_non_whitespace(&mut self) -> Option<(ComponentValue, SourceLocation)> { - for (component_value, location) in *self { - if component_value != WhiteSpace { return Some((component_value, location)) } + +/// Parse a single declaration (not an at-rule) +/// Used eg. 
in @supports +pub fn parse_one_declaration>(mut iter: T) -> Result { + match next_non_whitespace(&mut iter) { + None => error(START_LOCATION, ErrEmptyInput), + Some((component_value, location)) => { + let result = parse_declaration(&mut iter, component_value, location); + if result.is_err() { result } + else { match next_non_whitespace(&mut iter) { + None => result, + Some((_component_value, location)) => error(location, ErrExtraInput), + }} } - None } } -impl Iterator<(ComponentValue, SourceLocation)> for ComponentValueIterator { - fn next(&mut self) -> Option<(ComponentValue, SourceLocation)> { - match self { - &ParserIter(ref mut parser) => next_component_value(*parser), - &VectorIter(ref mut iter) => iter.next() +/// Parse a single component value. +/// Used eg. in attr(foo, color) +pub fn parse_one_component_value>(mut iter: T) + -> Result { + match next_non_whitespace(&mut iter) { + None => error(START_LOCATION, ErrEmptyInput), + Some((component_value, _location)) => { + match next_non_whitespace(&mut iter) { + None => Ok(component_value), + Some((_component_value, location)) => error(location, ErrExtraInput), + } } } } +// *********** End of public API *********** + + +struct StylesheetParser{ iter: T } +struct RuleListParser{ iter: T } +struct DeclarationListParser{ iter: T } + + // Work around "error: cannot borrow `*iter` as mutable more than once at a time" // when using a normal for loop. macro_rules! for_iter( - ($iter: expr, $pattern: pat, $loop_body: expr) => ( + ($iter: ident, $pattern: pat, $loop_body: expr) => ( loop { match $iter.next() { None => break, Some($pattern) => $loop_body } } @@ -72,156 +114,124 @@ macro_rules! 
for_iter( ) -/// Call repeatedly for the top-level of a CSS stylesheet -pub fn parse_stylesheet_rule(iter: &mut ComponentValueIterator) -> Option> { - for_iter!(iter, (component_value, location), { - match component_value { - WhiteSpace | CDO | CDC => (), - AtKeyword(name) => return Some(Ok(AtRule(parse_at_rule(iter, name, location)))), - _ => return Some(match parse_qualified_rule(iter, (component_value, location)) { - Ok(rule) => Ok(QualifiedRule(rule)), - Err(reason) => Err(reason), - }), - } - }) - None -} - - -/// Call repeatedly for a non-top level list of rules eg. the content of an @media rule. -/// Same as parse_stylesheet() except for the handling of top-level CDO and CDC -pub fn parse_rule(iter: &mut ComponentValueIterator) -> Option> { - for_iter!(iter, (component_value, location), { - match component_value { - WhiteSpace => (), - AtKeyword(name) => return Some(Ok(AtRule(parse_at_rule(iter, name, location)))), - _ => return Some(match parse_qualified_rule(iter, (component_value, location)) { - Ok(rule) => Ok(QualifiedRule(rule)), - Err(reason) => Err(reason), - }), - } - }) - None -} - - -/// Used eg. for CSSRuleList.insertRule() -pub fn parse_one_rule(iter: &mut ComponentValueIterator) -> Result { - match parse_rule(iter) { - None => Err(ErrEmptyInput), - Some(result) => if result.is_err() || iter.next_non_whitespace().is_none() { result } - else { Err(ErrExtraInput) } +impl> Iterator> for StylesheetParser { + fn next(&mut self) -> Option> { + let iter = &mut self.iter; + for_iter!(iter, (component_value, location), { + match component_value { + WhiteSpace | CDO | CDC => (), + AtKeyword(name) => return Some(Ok(AtRule(parse_at_rule(iter, name, location)))), + _ => return Some(match parse_qualified_rule(iter, component_value, location) { + Ok(rule) => Ok(QualifiedRule(rule)), + Err(e) => Err(e), + }), + } + }) + None } } -/// Call repeatedly of a list of declarations. 
-/// @page in CSS 2.1, all declaration lists in level 3 -pub fn parse_declaration_or_at_rule(iter: &mut ComponentValueIterator) - -> Option> { - for_iter!(iter, (component_value, location), { - match component_value { - WhiteSpace | Semicolon => (), - AtKeyword(name) => return Some(Ok(Decl_AtRule(parse_at_rule(iter, name, location)))), - _ => return Some(match parse_declaration(iter, (component_value, location)) { - Ok(declaration) => Ok(Declaration(declaration)), - Err(reason) => { - // Find the end of the declaration - for (v, _) in *iter { if v == Semicolon { break } } - Err(reason) - } - }), - } - }) - None -} - - -/// Used eg. in @supports -pub fn parse_one_declaration(iter: &mut ComponentValueIterator) -> Result { - match iter.next_non_whitespace() { - None => Err(ErrEmptyInput), - Some(item) => { - let result = parse_declaration(iter, item); - if result.is_err() || iter.next_non_whitespace().is_none() { result } - else { Err(ErrExtraInput) } - } +impl> Iterator> for RuleListParser { + fn next(&mut self) -> Option> { + let iter = &mut self.iter; + for_iter!(iter, (component_value, location), { + match component_value { + WhiteSpace => (), + AtKeyword(name) => return Some(Ok(AtRule(parse_at_rule(iter, name, location)))), + _ => return Some(match parse_qualified_rule(iter, component_value, location) { + Ok(rule) => Ok(QualifiedRule(rule)), + Err(e) => Err(e), + }), + } + }) + None } } -/// Used eg. 
in attr(foo, color) -pub fn parse_one_component_value(iter: &mut ComponentValueIterator) - -> Result<(ComponentValue, SourceLocation), ErrorReason> { - match iter.next_non_whitespace() { - None => Err(ErrEmptyInput), - Some(item) => { - if iter.next_non_whitespace().is_none() { Ok(item) } - else { Err(ErrExtraInput) } - } +impl> Iterator> +for DeclarationListParser { + fn next(&mut self) -> Option> { + let iter = &mut self.iter; + for_iter!(iter, (component_value, location), { + match component_value { + WhiteSpace | Semicolon => (), + AtKeyword(name) + => return Some(Ok(Decl_AtRule(parse_at_rule(iter, name, location)))), + _ => return Some(match parse_declaration(iter, component_value, location) { + Ok(declaration) => Ok(Declaration(declaration)), + Err(e) => { + // Find the end of the declaration + for (v, _) in *iter { if v == Semicolon { break } } + Err(e) + } + }), + } + }) + None } } -// *********** End of public API *********** - - -fn parse_at_rule(iter: &mut ComponentValueIterator, name: ~str, location: SourceLocation) +fn parse_at_rule>(iter: &mut T, name: ~str, location: SourceLocation) -> AtRule { let mut prelude = ~[]; let mut block = None; - for_iter!(iter, (component_value, location), { + for_iter!(iter, (component_value, _location), { match component_value { CurlyBracketBlock(content) => { block = Some(content); break }, Semicolon => break, - component_value => prelude.push((component_value, location)), + component_value => prelude.push(component_value), } }) AtRule {location: location, name: name, prelude: prelude, block: block} } -fn parse_qualified_rule(iter: &mut ComponentValueIterator, first: (ComponentValue, SourceLocation)) - -> Result { +fn parse_qualified_rule>(iter: &mut T, first: ComponentValue, + location: SourceLocation) + -> Result { match first { - (CurlyBracketBlock(content), location) + CurlyBracketBlock(content) => return Ok(QualifiedRule { location: location, prelude: ~[], block: content }), _ => (), } let mut prelude = ~[first]; 
- for_iter!(iter, (component_value, location), { + for_iter!(iter, (component_value, _location), { match component_value { CurlyBracketBlock(content) => return Ok(QualifiedRule {location: location, prelude: prelude, block: content}), - component_value => prelude.push((component_value, location)), + component_value => prelude.push(component_value), } }) - Err(ErrMissingQualifiedRuleBlock) + error(location, ErrMissingQualifiedRuleBlock) } -fn parse_declaration(iter: &mut ComponentValueIterator, first: (ComponentValue, SourceLocation)) - -> Result { - let (name, location) = match first { - (Ident(name), location) => (name, location), - _ => return Err(ErrInvalidDeclarationSyntax) +fn parse_declaration>(iter: &mut T, first: ComponentValue, + location: SourceLocation) + -> Result { + let name = match first { + Ident(name) => name, + _ => return error(location, ErrInvalidDeclarationSyntax) }; - match iter.next_non_whitespace() { + match next_non_whitespace(iter) { Some((Colon, _)) => (), - _ => return Err(ErrInvalidDeclarationSyntax), + _ => return error(location, ErrInvalidDeclarationSyntax), } let mut value = ~[]; let mut important = false; - for_iter!(iter, (component_value, location), { + for_iter!(iter, (component_value, _location), { match component_value { Semicolon => break, Delim('!') => if parse_declaration_important(iter) { important = true; break } else { - return Err(ErrInvalidBangImportantSyntax) + return error(location, ErrInvalidBangImportantSyntax) }, - component_value => value.push((component_value, location)), + component_value => value.push(component_value), } }) Ok(Declaration{location: location, name: name, value: value, important: important}) @@ -229,15 +239,34 @@ fn parse_declaration(iter: &mut ComponentValueIterator, first: (ComponentValue, #[inline] -fn parse_declaration_important(iter: &mut ComponentValueIterator) -> bool { - let ident_value = match iter.next_non_whitespace() { +fn parse_declaration_important>(iter: &mut T) -> bool { + let 
ident_value = match next_non_whitespace(iter) { Some((Ident(value), _)) => value, _ => return false, }; if !eq_ignore_ascii_case(ident_value, "important") { return false } - match iter.next_non_whitespace() { + match next_non_whitespace(iter) { Some((Semicolon, _)) => true, None => true, _ => false } } + + +#[inline] +fn next_non_whitespace>(iter: &mut T) -> Option { + for (component_value, location) in *iter { + if component_value != WhiteSpace { return Some((component_value, location)) } + } + None +} + + +#[inline] +fn error(location: SourceLocation, reason: ErrorReason) -> Result { + Err(SyntaxError{location: location, reason: reason}) +} + + +// When parsing one thing on an empty input +static START_LOCATION: SourceLocation = SourceLocation{ line: 1, column: 1 }; diff --git a/tests.rs b/tests.rs index 2ce0243c..20f056e4 100644 --- a/tests.rs +++ b/tests.rs @@ -6,10 +6,7 @@ use std::{io, os, str, run, task}; use extra::{tempfile, json}; use extra::json::ToJson; -use ast::*; -use tokenizer::*; -use parser::*; -use color::*; +use super::*; fn write_whole_file(path: &Path, data: &str) { @@ -55,19 +52,19 @@ fn assert_json_eq(results: json::Json, expected: json::Json, message: ~str) { } -fn run_json_tests(json_data: &str, parse: &fn (input: ~str) -> json::Json) { +fn run_json_tests(json_data: &str, parse: &fn (input: ~str) -> T) { let items = match json::from_str(json_data) { Ok(json::List(items)) => items, _ => fail!("Invalid JSON") }; assert!(items.len() % 2 == 0); let mut input: Option<~str> = None; - for item in items.consume_iter() { + for item in items.move_iter() { match (&input, item) { (&None, json::String(string)) => input = Some(string), (&Some(_), expected) => { let input = input.take_unwrap(); - let result = parse(input.to_owned()); + let result = parse(input.to_owned()).to_json(); assert_json_eq(result, expected, input); }, _ => fail!("Unexpected JSON") @@ -79,15 +76,7 @@ fn run_json_tests(json_data: &str, parse: &fn (input: ~str) -> json::Json) { 
#[test] fn component_value_list() { do run_json_tests(include_str!("css-parsing-tests/component_value_list.json")) |input| { - let parser = &mut Parser::from_str(input); - let mut results = ~[]; - loop { - match next_component_value(parser) { - Some((c, _)) => results.push(c), - None => break, - } - } - results.to_json() + tokenize(input).map(|(c, _)| c).to_owned_vec() } } @@ -95,8 +84,7 @@ fn component_value_list() { #[test] fn one_component_value() { do run_json_tests(include_str!("css-parsing-tests/one_component_value.json")) |input| { - let iter = &mut ComponentValueIterator::from_str(input); - result_to_json(parse_one_component_value(iter).chain(|(c, _)| Ok(c))) + parse_one_component_value(tokenize(input)) } } @@ -104,15 +92,7 @@ fn one_component_value() { #[test] fn declaration_list() { do run_json_tests(include_str!("css-parsing-tests/declaration_list.json")) |input| { - let iter = &mut ComponentValueIterator::from_str(input); - let mut declarations = ~[]; - loop { - match parse_declaration_or_at_rule(iter) { - None => break, - Some(result) => declarations.push(result_to_json(result)), - } - } - json::List(declarations) + parse_declaration_list(tokenize(input)).to_owned_vec() } } @@ -120,7 +100,7 @@ fn declaration_list() { #[test] fn one_declaration() { do run_json_tests(include_str!("css-parsing-tests/one_declaration.json")) |input| { - result_to_json(parse_one_declaration(&mut ComponentValueIterator::from_str(input))) + parse_one_declaration(tokenize(input)) } } @@ -128,15 +108,15 @@ fn one_declaration() { #[test] fn rule_list() { do run_json_tests(include_str!("css-parsing-tests/rule_list.json")) |input| { - let iter = &mut ComponentValueIterator::from_str(input); - let mut rules = ~[]; - loop { - match parse_rule(iter) { - None => break, - Some(result) => rules.push(result_to_json(result)), - } - } - json::List(rules) + parse_rule_list(tokenize(input)).to_owned_vec() + } +} + + +#[test] +fn stylesheet() { + do 
run_json_tests(include_str!("css-parsing-tests/stylesheet.json")) |input| { + parse_stylesheet_rules(tokenize(input)).to_owned_vec() } } @@ -144,15 +124,15 @@ fn rule_list() { #[test] fn one_rule() { do run_json_tests(include_str!("css-parsing-tests/one_rule.json")) |input| { - result_to_json(parse_one_rule(&mut ComponentValueIterator::from_str(input))) + parse_one_rule(tokenize(input)) } } fn run_color_tests(json_data: &str, to_json: &fn(result: Option) -> json::Json) { do run_json_tests(json_data) |input| { - match parse_one_component_value(&mut ComponentValueIterator::from_str(input)) { - Ok((component_value, _location)) => to_json(parse_color(&component_value)), + match parse_one_component_value(tokenize(input)) { + Ok(component_value) => to_json(parse_color(&component_value)), Err(_reason) => json::Null, } } @@ -165,10 +145,10 @@ fn color3() { } -//#[test] -//fn color3_hsl() { -// run_color_tests(include_str!("css-parsing-tests/color3_hsl.json"), |c| c.to_json()) -//} +#[test] +fn color3_hsl() { + run_color_tests(include_str!("css-parsing-tests/color3_hsl.json"), |c| c.to_json()) +} /// color3_keywords.json is different: R, G and B are in 0..255 rather than 0..1 @@ -184,27 +164,57 @@ fn color3_keywords() { } -fn result_to_json(result: Result) -> json::Json { - match result { - Ok(ref a) => a.to_json(), - Err(ref b) => b.to_json(), +#[test] +fn nth() { + do run_json_tests(include_str!("css-parsing-tests/An+B.json")) |input| { + parse_nth(tokenize(input).map(|(c, _)| c).to_owned_vec()) } } -impl ToJson for Color { +impl ToJson for Result { fn to_json(&self) -> json::Json { match *self { - RGBA(r, g, b, a) => (~[r, g, b, a]).to_json(), - CurrentColor => json::String(~"currentColor"), + Ok(ref a) => a.to_json(), + Err(ref b) => b.to_json(), + } + } +} + + +impl ToJson for Result { + fn to_json(&self) -> json::Json { + match *self { + Ok(ref a) => a.to_json(), + Err(ref b) => b.to_json(), } } } -impl ToJson for ErrorReason { +impl ToJson for Result { fn 
to_json(&self) -> json::Json { - json::List(~[json::String(~"error"), json::String(match *self { + match *self { + Ok(ref a) => a.to_json(), + Err(ref b) => b.to_json(), + } + } +} + + +impl ToJson for Result { + fn to_json(&self) -> json::Json { + match *self { + Ok(ref a) => a.to_json(), + Err(ref b) => b.to_json(), + } + } +} + + +impl ToJson for SyntaxError { + fn to_json(&self) -> json::Json { + json::List(~[json::String(~"error"), json::String(match self.reason { ErrEmptyInput => ~"empty", ErrExtraInput => ~"extra-input", _ => ~"invalid", @@ -213,6 +223,16 @@ impl ToJson for ErrorReason { } +impl ToJson for Color { + fn to_json(&self) -> json::Json { + match *self { + RGBA(r, g, b, a) => (~[r, g, b, a]).to_json(), + CurrentColor => json::String(~"currentColor"), + } + } +} + + impl ToJson for Rule { fn to_json(&self) -> json::Json { match *self { @@ -247,8 +267,7 @@ impl ToJson for AtRule { match *self { AtRule{name: ref name, prelude: ref prelude, block: ref block, _} => json::List(~[json::String(~"at-rule"), name.to_json(), - json::List(list_to_json(prelude)), - block.map(list_to_json).to_json()]) + prelude.to_json(), block.map(list_to_json).to_json()]) } } } @@ -259,7 +278,7 @@ impl ToJson for QualifiedRule { match *self { QualifiedRule{prelude: ref prelude, block: ref block, _} => json::List(~[json::String(~"qualified rule"), - json::List(list_to_json(prelude)), json::List(list_to_json(block))]) + prelude.to_json(), json::List(list_to_json(block))]) } } } @@ -270,7 +289,7 @@ impl ToJson for Declaration { match *self { Declaration{name: ref name, value: ref value, important: ref important, _} => json::List(~[json::String(~"declaration"), name.to_json(), - json::List(list_to_json(value)), important.to_json()]) + value.to_json(), important.to_json()]) } } } @@ -313,7 +332,7 @@ impl ToJson for ComponentValue { Colon => JString(~":"), Semicolon => JString(~";"), Comma => JString(~","), - IncludeMath => JString(~"~="), + IncludeMatch => JString(~"~="), 
DashMatch => JString(~"|="), PrefixMatch => JString(~"^="), SuffixMatch => JString(~"$="), @@ -323,11 +342,11 @@ impl ToJson for ComponentValue { CDC => JString(~"-->"), Function(ref name, ref arguments) - => JList(~[JString(~"function"), name.to_json()] + list_to_json(arguments)), + => JList(~[JString(~"function"), name.to_json()] + arguments.map(|a| a.to_json())), ParenthesisBlock(ref content) - => JList(~[JString(~"()")] + list_to_json(content)), + => JList(~[JString(~"()")] + content.map(|c| c.to_json())), SquareBracketBlock(ref content) - => JList(~[JString(~"[]")] + list_to_json(content)), + => JList(~[JString(~"[]")] + content.map(|c| c.to_json())), CurlyBracketBlock(ref content) => JList(~[JString(~"{}")] + list_to_json(content)), diff --git a/tokenizer.rs b/tokenizer.rs index c0925854..96b71842 100644 --- a/tokenizer.rs +++ b/tokenizer.rs @@ -10,29 +10,86 @@ use std::ascii::eq_ignore_ascii_case; use ast::*; -struct Parser { - input: ~str, - length: uint, // All counted in bytes, not characters - position: uint, // All counted in bytes, not characters - line: uint, - last_line_start: uint, // All counted in bytes, not characters +/// Returns a Iterator<(ComponentValue, SourceLocation)> +pub fn tokenize(input: &str) -> Tokenizer { + let input = preprocess(input); + Tokenizer { + length: input.len(), + input: input, + position: 0, + line: 1, + last_line_start: 0, + } +} + +impl Iterator for Tokenizer { + #[inline] + pub fn next(&mut self) -> Option { next_component_value(self) } +} + + +// *********** End of public API *********** + + +#[inline] +fn preprocess(input: &str) -> ~str { + // TODO: Is this faster if done in one pass? 
+ input.replace("\r\n", "\n").replace("\r", "\n").replace("\x0C", "\n").replace("\x00", "\uFFFD") } -impl Parser { - pub fn from_str(input: &str) -> Parser { - let input = preprocess(input); - Parser { - length: input.len(), - input: input, - position: 0, - line: 1, - last_line_start: 0, +#[test] +fn test_preprocess() { + assert!(preprocess("") == ~""); + assert!(preprocess("Lorem\r\n\t\x00ipusm\ndoror\uFFFD\r") + == ~"Lorem\n\t\uFFFDipusm\ndoror\uFFFD\n"); +} + + +struct Tokenizer { + priv input: ~str, + priv length: uint, // All counted in bytes, not characters + priv position: uint, // All counted in bytes, not characters + priv line: uint, + priv last_line_start: uint, // All counted in bytes, not characters +} + + +impl Tokenizer { + #[inline] + fn is_eof(&self) -> bool { self.position >= self.length } + + // Assumes non-EOF + #[inline] + fn current_char(&self) -> char { self.char_at(0) } + + #[inline] + fn char_at(&self, offset: uint) -> char { + self.input.char_at(self.position + offset) + } + + #[inline] + fn consume_char(&mut self) -> char { + let range = self.input.char_range_at(self.position); + self.position = range.next; + range.ch + } + + #[inline] + fn starts_with(&self, needle: &str) -> bool { + self.input.slice_from(self.position).starts_with(needle) + } + + #[inline] + fn new_line(&mut self) { + if cfg!(test) { + assert!(self.input.char_at(self.position - 1) == '\n') } + self.line += 1; + self.last_line_start = self.position; } } - macro_rules! is_match( ($value:expr, $($pattern:pat)|+) => ( match $value { $($pattern)|+ => true, _ => false } @@ -40,152 +97,155 @@ macro_rules! 
is_match( ) -pub fn next_component_value(parser: &mut Parser) -> Option<(ComponentValue, SourceLocation)> { - consume_comments(parser); - if parser.is_eof() { +fn next_component_value(tokenizer: &mut Tokenizer) -> Option { + consume_comments(tokenizer); + if tokenizer.is_eof() { if cfg!(test) { - assert!(parser.line == parser.input.split_iter('\n').len_(), - "The tokenizer is missing a parser.new_line() call somewhere.") + assert!(tokenizer.line == tokenizer.input.split_iter('\n').len(), + "The tokenizer is missing a tokenizer.new_line() call somewhere.") } return None } let start_location = SourceLocation{ - line: parser.line, + line: tokenizer.line, // The start of the line is column 1: - column: parser.position - parser.last_line_start + 1, + column: tokenizer.position - tokenizer.last_line_start + 1, }; - let c = parser.current_char(); + let c = tokenizer.current_char(); let component_value = match c { '\t' | '\n' | ' ' => { - while !parser.is_eof() { - match parser.current_char() { - ' ' | '\t' => parser.position += 1, + while !tokenizer.is_eof() { + match tokenizer.current_char() { + ' ' | '\t' => tokenizer.position += 1, '\n' => { - parser.position += 1; - parser.new_line(); + tokenizer.position += 1; + tokenizer.new_line(); }, _ => break, } } WhiteSpace }, - '"' => consume_string(parser, false), + '"' => consume_string(tokenizer, false), '#' => { - parser.position += 1; - if is_ident_start(parser) { IDHash(consume_name(parser)) } - else if !parser.is_eof() && match parser.current_char() { + tokenizer.position += 1; + if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) } + else if !tokenizer.is_eof() && match tokenizer.current_char() { 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' => true, - '\\' => !parser.starts_with("\\\n"), + '\\' => !tokenizer.starts_with("\\\n"), _ => c > '\x7F', // Non-ASCII - } { Hash(consume_name(parser)) } + } { Hash(consume_name(tokenizer)) } else { Delim(c) } }, '$' => { - if parser.starts_with("$=") { parser.position += 
2; SuffixMatch } - else { parser.position += 1; Delim(c) } + if tokenizer.starts_with("$=") { tokenizer.position += 2; SuffixMatch } + else { tokenizer.position += 1; Delim(c) } }, - '\'' => consume_string(parser, true), - '(' => ParenthesisBlock(consume_block(parser, CloseParenthesis)), - ')' => { parser.position += 1; CloseParenthesis }, + '\'' => consume_string(tokenizer, true), + '(' => ParenthesisBlock(consume_block(tokenizer, CloseParenthesis)), + ')' => { tokenizer.position += 1; CloseParenthesis }, '*' => { - if parser.starts_with("*=") { parser.position += 2; SubstringMatch } - else { parser.position += 1; Delim(c) } + if tokenizer.starts_with("*=") { tokenizer.position += 2; SubstringMatch } + else { tokenizer.position += 1; Delim(c) } }, '+' => { if ( - parser.position + 1 < parser.length - && is_match!(parser.char_at(1), '0'..'9') + tokenizer.position + 1 < tokenizer.length + && is_match!(tokenizer.char_at(1), '0'..'9') ) || ( - parser.position + 2 < parser.length - && parser.char_at(1) == '.' - && is_match!(parser.char_at(2), '0'..'9') + tokenizer.position + 2 < tokenizer.length + && tokenizer.char_at(1) == '.' + && is_match!(tokenizer.char_at(2), '0'..'9') ) { - consume_numeric(parser) + consume_numeric(tokenizer) } else { - parser.position += 1; + tokenizer.position += 1; Delim(c) } }, - ',' => { parser.position += 1; Comma }, + ',' => { tokenizer.position += 1; Comma }, '-' => { if ( - parser.position + 1 < parser.length - && is_match!(parser.char_at(1), '0'..'9') + tokenizer.position + 1 < tokenizer.length + && is_match!(tokenizer.char_at(1), '0'..'9') ) || ( - parser.position + 2 < parser.length - && parser.char_at(1) == '.' - && is_match!(parser.char_at(2), '0'..'9') + tokenizer.position + 2 < tokenizer.length + && tokenizer.char_at(1) == '.' 
+ && is_match!(tokenizer.char_at(2), '0'..'9') ) { - consume_numeric(parser) - } else if is_ident_start(parser) { - consume_ident_like(parser) - } else if parser.starts_with("-->") { - parser.position += 3; + consume_numeric(tokenizer) + } else if is_ident_start(tokenizer) { + consume_ident_like(tokenizer) + } else if tokenizer.starts_with("-->") { + tokenizer.position += 3; CDC } else { - parser.position += 1; + tokenizer.position += 1; Delim(c) } }, '.' => { - if parser.position + 1 < parser.length && is_match!(parser.char_at(1), '0'..'9') { - consume_numeric(parser) + if ( + tokenizer.position + 1 < tokenizer.length + && is_match!(tokenizer.char_at(1), '0'..'9') + ) { + consume_numeric(tokenizer) } else { - parser.position += 1; + tokenizer.position += 1; Delim(c) } } - '0'..'9' => consume_numeric(parser), - ':' => { parser.position += 1; Colon }, - ';' => { parser.position += 1; Semicolon }, + '0'..'9' => consume_numeric(tokenizer), + ':' => { tokenizer.position += 1; Colon }, + ';' => { tokenizer.position += 1; Semicolon }, '<' => { - if parser.starts_with("