From dd699d47d0bb8f021837f966be0528f922138b7c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 30 Nov 2013 18:54:43 +0000 Subject: [PATCH 1/4] Add tests for character encodings / parsing from bytes. --- README.rst | 18 ++++++ stylesheet_bytes.json | 126 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 stylesheet_bytes.json diff --git a/README.rst b/README.rst index 9f4f5675..d54b0d8b 100644 --- a/README.rst +++ b/README.rst @@ -91,6 +91,24 @@ associated with the expected result. The Unicode input is represented by a JSON string, the output as a list of `qualified rules`_ or at-rules_. +``stylesheet_bytes.json`` + Tests `Parse a stylesheet + `_ + together with `The input byte stream + `_. + The input is represented as a JSON object containing: + + * A required ``css_bytes``, the input byte string, + represented as a JSON string where code points U+0000 to U+00FF + represent bytes of the same value. + * An optional ``protocol_encoding``, + a protocol encoding label as a JSON string, or null. + * An optional ``environment_encoding``, + an environment encoding label as a JSON string, or null. + * An optional ``comment`` that is ignored. + + The output is represented as a list of `qualified rules`_ or at-rules_. + ``color3.json`` Tests the ```` syntax `defined in CSS Color Level 3 `_. 
diff --git a/stylesheet_bytes.json b/stylesheet_bytes.json new file mode 100644 index 00000000..8b2c512e --- /dev/null +++ b/stylesheet_bytes.json @@ -0,0 +1,126 @@ +[ + +{"css_bytes": ""}, +[[], "utf-8"], + +{"css_bytes": "@\u00C3\u00A9", + "protocol_encoding": null, "environment_encoding": null}, +[[["at-rule", "é", [], null]], "utf-8"], + +{"css_bytes": "@\u00C3\u00A9"}, +[[["at-rule", "é", [], null]], "utf-8"], + +{"css_bytes": "@\u0000\u00E9\u0000", + "comment": "Untagged UTF-16, parsed as UTF-8"}, +[[["at-rule", "���", [], null]], "utf-8"], + +{"css_bytes": "\u00FF\u00FE@\u0000\u00E9\u0000", + "comment": "UTF-16 with a BOM"}, +[[["at-rule", "é", [], null]], "utf-16le"], + +{"css_bytes": "\u00FE\u00FF\u0000@\u0000\u00E9"}, +[[["at-rule", "é", [], null]], "utf-16be"], + +{"css_bytes": "@\u00E9"}, +[[["at-rule", "�", [], null]], "utf-8"], + + +{"css_bytes": "@\u00E9", "protocol_encoding": "ISO-8859-2"}, +[[["at-rule", "é", [], null]], "iso-8859-2"], + +{"css_bytes": "@\u00E9", "protocol_encoding": "ISO-8859-5"}, +[[["at-rule", "щ", [], null]], "iso-8859-5"], + +{"css_bytes": "@\u00C3\u00A9", "protocol_encoding": "ISO-8859-2"}, +[[["at-rule", "ĂŠ", [], null]], "iso-8859-2"], + +{"css_bytes": "\u00EF\u00BB\u00BF @\u00C3\u00A9", + "protocol_encoding": "ISO-8859-2", + "comment": "BOM takes precedence over protocol"}, +[[["at-rule", "é", [], null]], "utf-8"], + + +{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "щ", [], null]], + "iso-8859-5"], + +{"css_bytes": "@Charset \"ISO-8859-5\"; @\u00E9", + "comment": "@charset has to match an exact byte pattern"}, +[[["at-rule", "Charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "�", [], null]], + "utf-8"], + +{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9", + "comment": "@charset has to match an exact byte pattern"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "�", [], null]], + "utf-8"], + 
+{"css_bytes": "@charset 'ISO-8859-5'; @\u00E9", + "comment": "@charset has to match an exact byte pattern"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "�", [], null]], + "utf-8"], + + +{"css_bytes": "@\u0000c\u0000h\u0000a\u0000r\u0000s\u0000e\u0000t\u0000 \u0000\"\u0000U\u0000T\u0000F\u0000-\u00001\u00006\u0000L\u0000E\u0000\"\u0000;\u0000@\u0000\u00e9\u0000", + "comment": "@charset has to be ASCII-compatible itself"}, +[[["at-rule", "�c�h�a�r�s�e�t�", + [" ", ["ident", "�"], ["string", "�U�T�F�-�1�6�L�E�"], ["ident", "�"]], null], + ["error", "invalid"]], + "utf-8"], + +{"css_bytes": "@charset \"UTF-16LE\"; @\u00C3\u00A9", + "comment": "@charset can only specify ASCII-compatible encodings"}, +[[["at-rule", "charset", [" ", ["string", "UTF-16LE"]], null], + ["at-rule", "é", [], null]], + "utf-8"], + + +{"css_bytes": "\u00EF\u00BB\u00BF @charset \"ISO-8859-5\"; @\u00E9", + "comment": "BOM takes precedence over @charset"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "�", [], null]], + "utf-8"], + +{"css_bytes": "\u00EF\u00BB\u00BF @charset \"ISO-8859-5\"; @\u00C3\u00A9", + "comment": "BOM takes precedence over @charset"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "é", [], null]], + "utf-8"], + +{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9", + "protocol_encoding": " Iso-8859-2", + "comment": "Protocol takes precedence over @charset"}, +[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null], + ["at-rule", "é", [], null]], + "iso-8859-2"], + + +{"css_bytes": "@\u00E9", "environment_encoding": "ISO-8859-2"}, +[[["at-rule", "é", [], null]], "iso-8859-2"], + +{"css_bytes": "@\u00E9", "environment_encoding": "ISO-8859-5"}, +[[["at-rule", "щ", [], null]], "iso-8859-5"], + +{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9", + "environment_encoding": "ISO-8859-2", + "comment": "@charset takes precedence over environment"}, +[[["at-rule", "charset", [" 
", ["string", "ISO-8859-5"]], null], + ["at-rule", "щ", [], null]], + "iso-8859-5"], + +{"css_bytes": "@\u00E9", + "protocol_encoding": "ISO-8859-2", + "environment_encoding": "ISO-8859-5", + "comment": "protocol takes precedence over environment"}, +[[["at-rule", "é", [], null]], "iso-8859-2"], + +{"css_bytes": "\u00EF\u00BB\u00BF @\u00C3\u00A9", + "environment_encoding": "ISO-8859-5", + "comment": "BOM takes precedence over environment"}, +[[["at-rule", "é", [], null]], "utf-8"] + + +] From 022ccf7dde4f4a3ef8a9c9929b194a8986ed10cd Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 11 Dec 2013 12:50:44 +0000 Subject: [PATCH 2/4] Add parsing from bytes, with rust-encoding. New dependency: https://github.com/lifthrasiir/rust-encoding --- from_bytes.rs | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib.rs | 3 ++ parser.rs | 4 +- tests.rs | 58 ++++++++++++++++++++++++++-- 4 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 from_bytes.rs diff --git a/from_bytes.rs b/from_bytes.rs new file mode 100644 index 00000000..565bb20a --- /dev/null +++ b/from_bytes.rs @@ -0,0 +1,102 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use std::str; + +use encoding::label::encoding_from_whatwg_label; +use encoding::all::UTF_8; +use encoding::Encoding; +use encoding::DecodeReplace; +use encoding::decode; + +use tokenizer::{tokenize, Tokenizer}; +use parser::{parse_stylesheet_rules, StylesheetParser}; + + +/// Determine the character encoding of a CSS stylesheet and decode it. +/// +/// This is based on the presence of a :abbr:`BOM (Byte Order Mark)`, +/// an `@charset` rule, +/// and encoding meta-information. +/// +/// :param css_bytes: A byte string. +/// :param protocol_encoding: +/// The encoding label, if any, defined by HTTP or equivalent protocol. +/// (e.g. 
via the `charset` parameter of the `Content-Type` header.) +/// :param environment_encoding: +/// An optional `Encoding` object +/// for the `environment encoding +/// `_, +/// if any. +/// :returns: +/// A 2-tuple of a decoded Unicode string +/// and the `Encoding` object that was used. +pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>, + environment_encoding: Option<&'static Encoding>) + -> (~str, &'static Encoding) { + // http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream + match protocol_encoding_label { + None => (), + Some(label) => match encoding_from_whatwg_label(label) { + None => (), + Some(fallback) => return decode_replace(css, fallback) + } + } + if css.starts_with("@charset \"".as_bytes()) { + // 10 is "@charset \"".len() + // 100 is arbitrary so that no encoding label is more than 100-10 bytes. + match css.slice(10, css.len().min(&100)).position_elem(&('"' as u8)) { + None => (), + Some(label_length) + => if css.slice_from(10 + label_length).starts_with("\";".as_bytes()) { + let label = css.slice(10, 10 + label_length); + let label = str::from_chars(label.iter().map(|&b| b as char).to_owned_vec()); + match encoding_from_whatwg_label(label) { + None => (), + Some(fallback) => match fallback.name() { + "utf-16be" | "utf-16le" + => return decode_replace(css, UTF_8 as &'static Encoding), + _ => return decode_replace(css, fallback), + } + } + } + } + } + match environment_encoding { + None => (), + Some(fallback) => return decode_replace(css, fallback) + } + return decode_replace(css, UTF_8 as &'static Encoding) +} + + +#[inline] +fn decode_replace(input: &[u8], fallback_encoding: &'static Encoding)-> (~str, &'static Encoding) { + let (result, used_encoding) = decode(input, DecodeReplace, fallback_encoding); + (result.unwrap(), used_encoding) +} + + +/// Parse stylesheet from bytes. +/// +/// :param css_bytes: A byte string. 
+/// :param protocol_encoding: +/// The encoding label, if any, defined by HTTP or equivalent protocol. +/// (e.g. via the `charset` parameter of the `Content-Type` header.) +/// :param environment_encoding: +/// An optional `Encoding` object +/// for the `environment encoding +/// `_, +/// if any. +/// :returns: +/// A 2-tuple of a Iterator> +/// and the `Encoding` object that was used. +pub fn parse_stylesheet_rules_from_bytes( + css_bytes: &[u8], protocol_encoding_label: Option<&str>, + environment_encoding: Option<&'static Encoding>) + -> (StylesheetParser, &'static Encoding) { + let (css_unicode, encoding) = decode_stylesheet_bytes( + css_bytes, protocol_encoding_label, environment_encoding); + (parse_stylesheet_rules(tokenize(css_unicode)), encoding) +} diff --git a/lib.rs b/lib.rs index dd024dfb..f71d4645 100644 --- a/lib.rs +++ b/lib.rs @@ -7,10 +7,12 @@ #[feature(globs, macro_rules)]; extern mod extra; +extern mod encoding; // https://github.com/lifthrasiir/rust-encoding pub use tokenizer::tokenize; pub use parser::{parse_stylesheet_rules, parse_rule_list, parse_declaration_list, parse_one_rule, parse_one_declaration, parse_one_component_value}; +pub use from_bytes::{decode_stylesheet_bytes, parse_stylesheet_rules_from_bytes}; pub use color::{RGBA, Color, CurrentColor}; pub use nth::parse_nth; pub use serializer::{ToCss, serialize_identifier, serialize_string}; @@ -18,6 +20,7 @@ pub use serializer::{ToCss, serialize_identifier, serialize_string}; pub mod ast; mod tokenizer; mod parser; +mod from_bytes; mod color; mod nth; mod serializer; diff --git a/parser.rs b/parser.rs index 77b6b12f..7c2dcf3a 100644 --- a/parser.rs +++ b/parser.rs @@ -98,7 +98,9 @@ pub fn parse_one_component_value>(mut iter: T) // *********** End of public API *********** -struct StylesheetParser{ iter: T } +// used in from_bytes.rs but not reexported in the crate top-level +pub struct StylesheetParser{ iter: T } + struct RuleListParser{ iter: T } struct DeclarationListParser{ iter: T 
} diff --git a/tests.rs b/tests.rs index 1ed582df..0cb36574 100644 --- a/tests.rs +++ b/tests.rs @@ -9,6 +9,8 @@ use extra::{tempfile, json}; use extra::json::ToJson; use extra::test; +use encoding::label::encoding_from_whatwg_label; + use super::*; use ast::*; @@ -56,23 +58,34 @@ fn assert_json_eq(results: json::Json, expected: json::Json, message: ~str) { } -fn run_json_tests(json_data: &str, parse: &fn (input: ~str) -> T) { +fn run_raw_json_tests(json_data: &str, run: &fn (json::Json, json::Json)) { let items = match json::from_str(json_data) { Ok(json::List(items)) => items, _ => fail!("Invalid JSON") }; assert!(items.len() % 2 == 0); - let mut input: Option<~str> = None; + let mut input = None; for item in items.move_iter() { match (&input, item) { - (&None, json::String(string)) => input = Some(string), + (&None, json_obj) => input = Some(json_obj), (&Some(_), expected) => { let input = input.take_unwrap(); + run(input, expected) + }, + }; + } +} + + +fn run_json_tests(json_data: &str, parse: &fn (input: ~str) -> T) { + do run_raw_json_tests(json_data) |input, expected| { + match input { + json::String(input) => { let result = parse(input.to_owned()).to_json(); assert_json_eq(result, expected, input); }, _ => fail!("Unexpected JSON") - }; + } } } @@ -133,6 +146,43 @@ fn one_rule() { } +#[test] +fn stylesheet_from_bytes() { + do run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json")) + |input, expected| { + let map = match input { + json::Object(map) => map, + _ => fail!("Unexpected JSON") + }; + + let result = { + let css = get_string(map, &~"css_bytes").unwrap().iter().map(|c| { + assert!(c as u32 <= 0xFF); + c as u8 + }).to_owned_vec(); + let protocol_encoding_label = get_string(map, &~"protocol_encoding"); + let environment_encoding = get_string(map, &~"environment_encoding") + .and_then(encoding_from_whatwg_label); + + let (mut rules, used_encoding) = parse_stylesheet_rules_from_bytes( + css, protocol_encoding_label, 
environment_encoding); + + (rules.to_owned_vec(), used_encoding.name().to_owned()).to_json() + }; + assert_json_eq(result, expected, json::Object(map).to_str()); + } + + fn get_string<'a>(map: &'a json::Object, key: &~str) -> Option<&'a str> { + match map.find(key) { + Some(&json::String(ref s)) => Some(s.as_slice()), + Some(&json::Null) => None, + None => None, + _ => fail!("Unexpected JSON"), + } + } +} + + fn run_color_tests(json_data: &str, to_json: &fn(result: Option) -> json::Json) { do run_json_tests(json_data) |input| { match parse_one_component_value(tokenize(input)) { From 4eb52d5cb22e39c9edff848420c910b0b7ecafa5 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 11 Dec 2013 16:26:27 +0000 Subject: [PATCH 3/4] =?UTF-8?q?Use=20rust-encoding=E2=80=99s=20new=20Encod?= =?UTF-8?q?eObj=20type.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/mozilla-servo/rust-encoding/commit/b9e0332e4d362eccb919c74d3e59bf6850e0fe1c --- from_bytes.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/from_bytes.rs b/from_bytes.rs index 565bb20a..488c954d 100644 --- a/from_bytes.rs +++ b/from_bytes.rs @@ -6,9 +6,7 @@ use std::str; use encoding::label::encoding_from_whatwg_label; use encoding::all::UTF_8; -use encoding::Encoding; -use encoding::DecodeReplace; -use encoding::decode; +use encoding::{EncodingObj, DecodeReplace, decode}; use tokenizer::{tokenize, Tokenizer}; use parser::{parse_stylesheet_rules, StylesheetParser}; @@ -33,8 +31,8 @@ use parser::{parse_stylesheet_rules, StylesheetParser}; /// A 2-tuple of a decoded Unicode string /// and the `Encoding` object that was used. 
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>, - environment_encoding: Option<&'static Encoding>) - -> (~str, &'static Encoding) { + environment_encoding: Option) + -> (~str, EncodingObj) { // http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream match protocol_encoding_label { None => (), @@ -56,7 +54,7 @@ pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str> None => (), Some(fallback) => match fallback.name() { "utf-16be" | "utf-16le" - => return decode_replace(css, UTF_8 as &'static Encoding), + => return decode_replace(css, UTF_8 as EncodingObj), _ => return decode_replace(css, fallback), } } @@ -67,12 +65,12 @@ pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str> None => (), Some(fallback) => return decode_replace(css, fallback) } - return decode_replace(css, UTF_8 as &'static Encoding) + return decode_replace(css, UTF_8 as EncodingObj) } #[inline] -fn decode_replace(input: &[u8], fallback_encoding: &'static Encoding)-> (~str, &'static Encoding) { +fn decode_replace(input: &[u8], fallback_encoding: EncodingObj)-> (~str, EncodingObj) { let (result, used_encoding) = decode(input, DecodeReplace, fallback_encoding); (result.unwrap(), used_encoding) } @@ -94,8 +92,8 @@ fn decode_replace(input: &[u8], fallback_encoding: &'static Encoding)-> (~str, & /// and the `Encoding` object that was used. 
pub fn parse_stylesheet_rules_from_bytes( css_bytes: &[u8], protocol_encoding_label: Option<&str>, - environment_encoding: Option<&'static Encoding>) - -> (StylesheetParser, &'static Encoding) { + environment_encoding: Option) + -> (StylesheetParser, EncodingObj) { let (css_unicode, encoding) = decode_stylesheet_bytes( css_bytes, protocol_encoding_label, environment_encoding); (parse_stylesheet_rules(tokenize(css_unicode)), encoding) From 5178431109f4a51f38c257e36640702320c54958 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 13 Dec 2013 00:37:03 +0000 Subject: [PATCH 4/4] Update to upstream rust-encoding, that renamed EncodingObj to EncodingRef --- from_bytes.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/from_bytes.rs b/from_bytes.rs index 488c954d..c4a64ccd 100644 --- a/from_bytes.rs +++ b/from_bytes.rs @@ -6,7 +6,7 @@ use std::str; use encoding::label::encoding_from_whatwg_label; use encoding::all::UTF_8; -use encoding::{EncodingObj, DecodeReplace, decode}; +use encoding::{EncodingRef, DecodeReplace, decode}; use tokenizer::{tokenize, Tokenizer}; use parser::{parse_stylesheet_rules, StylesheetParser}; @@ -31,8 +31,8 @@ use parser::{parse_stylesheet_rules, StylesheetParser}; /// A 2-tuple of a decoded Unicode string /// and the `Encoding` object that was used. 
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>, - environment_encoding: Option) - -> (~str, EncodingObj) { + environment_encoding: Option) + -> (~str, EncodingRef) { // http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream match protocol_encoding_label { None => (), @@ -54,7 +54,7 @@ pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str> None => (), Some(fallback) => match fallback.name() { "utf-16be" | "utf-16le" - => return decode_replace(css, UTF_8 as EncodingObj), + => return decode_replace(css, UTF_8 as EncodingRef), _ => return decode_replace(css, fallback), } } @@ -65,12 +65,12 @@ pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str> None => (), Some(fallback) => return decode_replace(css, fallback) } - return decode_replace(css, UTF_8 as EncodingObj) + return decode_replace(css, UTF_8 as EncodingRef) } #[inline] -fn decode_replace(input: &[u8], fallback_encoding: EncodingObj)-> (~str, EncodingObj) { +fn decode_replace(input: &[u8], fallback_encoding: EncodingRef)-> (~str, EncodingRef) { let (result, used_encoding) = decode(input, DecodeReplace, fallback_encoding); (result.unwrap(), used_encoding) } @@ -92,8 +92,8 @@ fn decode_replace(input: &[u8], fallback_encoding: EncodingObj)-> (~str, Encodin /// and the `Encoding` object that was used. pub fn parse_stylesheet_rules_from_bytes( css_bytes: &[u8], protocol_encoding_label: Option<&str>, - environment_encoding: Option) - -> (StylesheetParser, EncodingObj) { + environment_encoding: Option) + -> (StylesheetParser, EncodingRef) { let (css_unicode, encoding) = decode_stylesheet_bytes( css_bytes, protocol_encoding_label, environment_encoding); (parse_stylesheet_rules(tokenize(css_unicode)), encoding)