From c33638269fadb22b4c2339ff3269070a683032f9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 9 Feb 2017 18:40:02 +0100 Subject: [PATCH 1/6] Fix a warning. --- build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.rs b/build.rs index 84e36be5..30e3d8b1 100644 --- a/build.rs +++ b/build.rs @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#[macro_use] extern crate quote; +extern crate quote; extern crate syn; use std::env; From 87a1eee8f84417a802ce9a80f2a3a330080a241c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 9 Feb 2017 19:37:13 +0100 Subject: [PATCH 2/6] Make encoding support generic. Use encoding-rs in tests. --- Cargo.toml | 7 +++-- src/from_bytes.rs | 69 +++++++++++++++++++++++------------------------ src/lib.rs | 10 ++++--- src/tests.rs | 37 +++++++++++++++++++------ 4 files changed, 73 insertions(+), 50 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9efd886e..a459df26 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cssparser" -version = "0.8.0" +version = "0.9.0" authors = [ "Simon Sapin " ] description = "Rust implementation of CSS Syntax Level 3" @@ -14,12 +14,15 @@ build = "build.rs" exclude = ["src/css-parsing-tests"] +[lib] +doctest = false + [dev-dependencies] rustc-serialize = "0.3" tempdir = "0.3" +encoding_rs = "0.3.2" [dependencies] -encoding = "0.2" heapsize = {version = ">=0.1.1, <0.4.0", optional = true} matches = "0.1" serde = {version = ">=0.6.6, <0.9", optional = true} diff --git a/src/from_bytes.rs b/src/from_bytes.rs index 705b65e3..d71d9efb 100644 --- a/src/from_bytes.rs +++ b/src/from_bytes.rs @@ -2,14 +2,23 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -use std::cmp; +/// Abstraction for avoiding a dependency from cssparser to an encoding library +pub trait EncodingSupport { + /// One character encoding + type Encoding; -use encoding::label::encoding_from_whatwg_label; -use encoding::all::UTF_8; -use encoding::{EncodingRef, DecoderTrap, decode}; + /// https://encoding.spec.whatwg.org/#concept-encoding-get + fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding>; + /// Return the UTF-8 encoding + fn utf8() -> Self::Encoding; -/// Determine the character encoding of a CSS stylesheet and decode it. + /// Whether the given encoding is UTF-16BE or UTF-16LE + fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool; +} + + +/// Determine the character encoding of a CSS stylesheet. /// /// This is based on the presence of a BOM (Byte Order Mark), an `@charset` rule, and /// encoding meta-information. @@ -20,48 +29,36 @@ use encoding::{EncodingRef, DecoderTrap, decode}; /// * `environment_encoding`: An optional `Encoding` object for the [environment encoding] /// (https://drafts.csswg.org/css-syntax/#environment-encoding), if any. /// -/// Returns a 2-tuple of a decoded Unicode string and the `Encoding` object that was used. -pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>, - environment_encoding: Option<EncodingRef>) - -> (String, EncodingRef) { +/// Returns the encoding to use.
+pub fn stylesheet_encoding<E>(css: &[u8], protocol_encoding_label: Option<&[u8]>, + environment_encoding: Option<E::Encoding>) + -> E::Encoding + where E: EncodingSupport { // https://drafts.csswg.org/css-syntax/#the-input-byte-stream match protocol_encoding_label { None => (), - Some(label) => match encoding_from_whatwg_label(label) { + Some(label) => match E::from_label(label) { None => (), - Some(fallback) => return decode_replace(css, fallback) + Some(protocol_encoding) => return protocol_encoding } } - if css.starts_with("@charset \"".as_bytes()) { - // 10 is "@charset \"".len() - // 100 is arbitrary so that no encoding label is more than 100-10 bytes. - match css[10..cmp::min(css.len(), 100)].iter().position(|&b| b == b'"') { + let prefix = b"@charset \""; + if css.starts_with(prefix) { + let rest = &css[prefix.len()..]; + match rest.iter().position(|&b| b == b'"') { None => (), - Some(label_length) - => if css[10 + label_length..].starts_with("\";".as_bytes()) { - let label = &css[10..10 + label_length]; - let label = label.iter().map(|&b| b as char).collect::<String>(); - match encoding_from_whatwg_label(&*label) { + Some(label_length) => if rest[label_length..].starts_with(b"\";") { + let label = &rest[..label_length]; + match E::from_label(label) { None => (), - Some(fallback) => match fallback.name() { - "utf-16be" | "utf-16le" - => return decode_replace(css, UTF_8 as EncodingRef), - _ => return decode_replace(css, fallback), + Some(charset_encoding) => if E::is_utf16_be_or_le(&charset_encoding) { + return E::utf8() + } else { + return charset_encoding } } } } } - match environment_encoding { - None => (), - Some(fallback) => return decode_replace(css, fallback) - } - return decode_replace(css, UTF_8 as EncodingRef) -} - - -#[inline] -fn decode_replace(input: &[u8], fallback_encoding: EncodingRef)-> (String, EncodingRef) { - let (result, used_encoding) = decode(input, DecoderTrap::Replace, fallback_encoding); - (result.unwrap(), used_encoding) +
environment_encoding.unwrap_or_else(E::utf8) } diff --git a/src/lib.rs b/src/lib.rs index 58859e88..fbc6c5a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,8 +15,10 @@ Implementation of [CSS Syntax Module Level 3](https://drafts.csswg.org/css-synta # Input Everything is based on `Parser` objects, which borrow a `&str` input. -If you have bytes (from a file, the network, or something), -see the `decode_stylesheet_bytes` function. +If you have bytes (from a file, the network, or something) +and want to support character encodings other than UTF-8, +see the `stylesheet_encoding` function, +which can be used together with rust-encoding or encoding-rs. # Conventions for parsing functions @@ -66,8 +68,8 @@ fn parse_border_spacing(_context: &ParserContext, input: &mut Parser) #![recursion_limit="200"] // For color::parse_color_keyword -extern crate encoding; #[macro_use] extern crate matches; +#[cfg(test)] extern crate encoding_rs; #[cfg(test)] extern crate tempdir; #[cfg(test)] extern crate rustc_serialize; #[cfg(feature = "serde")] extern crate serde; @@ -78,7 +80,7 @@ pub use rules_and_declarations::{parse_important}; pub use rules_and_declarations::{DeclarationParser, DeclarationListParser, parse_one_declaration}; pub use rules_and_declarations::{RuleListParser, parse_one_rule}; pub use rules_and_declarations::{AtRuleType, QualifiedRuleParser, AtRuleParser}; -pub use from_bytes::decode_stylesheet_bytes; +pub use from_bytes::{stylesheet_encoding, EncodingSupport}; pub use color::{RGBA, Color, parse_color_keyword}; pub use nth::parse_nth; pub use serializer::{ToCss, CssStringWriter, serialize_identifier, serialize_string, TokenSerializationType}; diff --git a/src/tests.rs b/src/tests.rs index 9800c416..326f6f07 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -5,6 +5,7 @@ #[cfg(feature = "bench")] extern crate test; +use encoding_rs; use std::borrow::Cow::{self, Borrowed}; use std::fs::File; use std::io::{self, Write}; @@ -16,17 +17,14 @@ use tempdir::TempDir; 
#[cfg(feature = "bench")] use self::test::Bencher; -use encoding::label::encoding_from_whatwg_label; - use super::{Parser, Delimiter, Token, NumericValue, PercentageValue, SourceLocation, DeclarationListParser, DeclarationParser, RuleListParser, AtRuleType, AtRuleParser, QualifiedRuleParser, parse_one_declaration, parse_one_rule, parse_important, - decode_stylesheet_bytes, + stylesheet_encoding, EncodingSupport, TokenSerializationType, Color, RGBA, parse_nth, ToCss}; - macro_rules! JArray { ($($e: expr,)*) => { JArray![ $( $e ),* ] }; ($($e: expr),*) => { Json::Array(vec!( $( $e.to_json() ),* )) } @@ -198,6 +196,26 @@ fn one_rule() { #[test] fn stylesheet_from_bytes() { + pub struct EncodingRs; + + impl EncodingSupport for EncodingRs { + type Encoding = &'static encoding_rs::Encoding; + + fn utf8() -> Self::Encoding { + encoding_rs::UTF_8 + } + + fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool { + *encoding == encoding_rs::UTF_16LE || + *encoding == encoding_rs::UTF_16BE + } + + fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding> { + encoding_rs::Encoding::for_label(ascii_label) + } + } + + run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json"), |input, expected| { let map = match input { @@ -210,17 +228,20 @@ fn stylesheet_from_bytes() { assert!(c as u32 <= 0xFF); c as u8 }).collect::<Vec<u8>>(); - let protocol_encoding_label = get_string(&map, "protocol_encoding"); + let protocol_encoding_label = get_string(&map, "protocol_encoding") + .map(|s| s.as_bytes()); let environment_encoding = get_string(&map, "environment_encoding") - .and_then(encoding_from_whatwg_label); + .map(|s| s.as_bytes()) + .and_then(EncodingRs::from_label); - let (css_unicode, encoding) = decode_stylesheet_bytes( + let encoding = stylesheet_encoding::<EncodingRs>( &css, protocol_encoding_label, environment_encoding); + let (css_unicode, used_encoding, _) = encoding.decode(&css); let input = &mut Parser::new(&css_unicode); let rules = RuleListParser::new_for_stylesheet(input, JsonParser)
.map(|result| result.unwrap_or(JArray!["error", "invalid"])) .collect::<Vec<_>>(); - JArray![rules, encoding.name()] + JArray![rules, used_encoding.name().to_lowercase()] }; assert_json_eq(result, expected, Json::Object(map).to_string()); }); From 46e0e80f8f7e8f6c64573c1af23fa25dff48a133 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 9 Feb 2017 20:03:23 +0100 Subject: [PATCH 3/6] Remove unicode-range tokens, per spec change. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/w3c/csswg-drafts/commit/01c55ee4c9a0bf565b88b6d581c24a0462d8257a They’re being replaced by a micro-syntax / parsing algorithm based on other tokens: https://drafts.csswg.org/css-syntax/#urange --- src/css-parsing-tests/README.rst | 4 - .../component_value_list.json | 74 ------------------- src/serializer.rs | 36 +-------- src/tests.rs | 2 - src/tokenizer.rs | 48 ------------ 5 files changed, 4 insertions(+), 160 deletions(-) diff --git a/src/css-parsing-tests/README.rst b/src/css-parsing-tests/README.rst index d54b0d8b..9779cf24 100644 --- a/src/css-parsing-tests/README.rst +++ b/src/css-parsing-tests/README.rst @@ -228,10 +228,6 @@ Component values the value as a number, the type as the string ``"integer"`` or ``"number"``, and the unit as a string. - - Array of length 3: the string ``"unicode-range"``, - followed by the *start* and *end* integers as two numbers. - The string ``"~="``.
diff --git a/src/css-parsing-tests/component_value_list.json b/src/css-parsing-tests/component_value_list.json index a3a2a8b3..42cac5fd 100644 --- a/src/css-parsing-tests/component_value_list.json +++ b/src/css-parsing-tests/component_value_list.json @@ -325,80 +325,6 @@ ["dimension", "12", 12, "integer", "rêd"] ], -"u+1 U+10 U+100 U+1000 U+10000 U+100000 U+1000000", [ - ["unicode-range", 1, 1], " ", - ["unicode-range", 16, 16], " ", - ["unicode-range", 256, 256], " ", - ["unicode-range", 4096, 4096], " ", - ["unicode-range", 65536, 65536], " ", - ["unicode-range", 1048576, 1048576], " ", - ["unicode-range", 1048576, 1048576], ["number", "0", 0, "integer"] -], - -"u+? u+1? U+10? U+100? U+1000? U+10000? U+100000?", [ - ["unicode-range", 0, 15], " ", - ["unicode-range", 16, 31], " ", - ["unicode-range", 256, 271], " ", - ["unicode-range", 4096, 4111], " ", - ["unicode-range", 65536, 65551], " ", - ["unicode-range", 1048576, 1048591], " ", - ["unicode-range", 1048576, 1048576], "?" -], - -"u+?? U+1?? U+10?? U+100?? U+1000?? U+10000??", [ - ["unicode-range", 0, 255], " ", - ["unicode-range", 256, 511], " ", - ["unicode-range", 4096, 4351], " ", - ["unicode-range", 65536, 65791], " ", - ["unicode-range", 1048576, 1048831], " ", - ["unicode-range", 1048576, 1048591], "?" -], - -"u+??? U+1??? U+10??? U+100??? U+1000???", [ - ["unicode-range", 0, 4095], " ", - ["unicode-range", 4096, 8191], " ", - ["unicode-range", 65536, 69631], " ", - ["unicode-range", 1048576, 1052671], " ", - ["unicode-range", 1048576, 1048831], "?" -], - -"u+???? U+1???? U+10???? U+100????", [ - ["unicode-range", 0, 65535], " ", - ["unicode-range", 65536, 131071], " ", - ["unicode-range", 1048576, 1114111], " ", - ["unicode-range", 1048576, 1052671], "?" -], - -"u+????? U+1????? U+10?????", [ - ["unicode-range", 0, 1048575], " ", - ["unicode-range", 1048576, 2097151], " ", - ["unicode-range", 1048576, 1114111], "?" -], - -"u+?????? 
U+1??????", [ - ["unicode-range", 0, 16777215], " ", - ["unicode-range", 1048576, 2097151], "?" -], - -"u+20-3F U+100000-2 U+1000000-2 U+10-200000", [ - ["unicode-range", 32, 63], " ", - ["unicode-range", 1048576, 2], " ", - ["unicode-range", 1048576, 1048576], ["number", "0", 0, "integer"], - ["number", "-2", -2, "integer"], " ", - ["unicode-range", 16, 2097152] -], - -"ù+12 Ü+12 u +12 U+ 12 U+12 - 20 U+1?2 U+1?-50 U+1- 2", [ - ["ident", "ù"], ["number", "+12", 12, "integer"], " ", - ["ident", "Ü"], ["number", "+12", 12, "integer"], " ", - ["ident", "u"], " ", ["number", "+12", 12, "integer"], " ", - ["ident", "U"], "+", " ", ["number", "12", 12, "integer"], " ", - ["unicode-range", 18, 18], " ", "-", " ", ["number", "20", 20, "integer"], " ", - ["unicode-range", 16, 31], ["number", "2", 2, "integer"], " ", - ["unicode-range", 16, 31], ["number", "-50", -50, "integer"], " ", - ["unicode-range", 1, 1], "-", " ", ["number", "2", 2, "integer"] -], - "~=|=^=$=*=|| |/**/| ~/**/=", [ "~=", "|=", "^=", "$=", "*=", "||", "