Skip to content

[Do not merge yet] Batched breaking changes #117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Make encoding support generic.
Use encoding-rs in tests.
  • Loading branch information
SimonSapin committed Feb 14, 2017
commit 5f0ba1d5ade8c6720ee6aedb666a77cfda84a5bb
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ build = "build.rs"

exclude = ["src/css-parsing-tests"]

[lib]
doctest = false

[dev-dependencies]
rustc-serialize = "0.3"
tempdir = "0.3"
encoding_rs = "0.3.2"

[dependencies]
encoding = "0.2"
heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
matches = "0.1"
serde = {version = ">=0.6.6, <0.9", optional = true}
Expand Down
69 changes: 33 additions & 36 deletions src/from_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,23 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use std::cmp;
/// Abstraction for avoiding a dependency from cssparser to an encoding library
pub trait EncodingSupport {
/// One character encoding
type Encoding;

use encoding::label::encoding_from_whatwg_label;
use encoding::all::UTF_8;
use encoding::{EncodingRef, DecoderTrap, decode};
/// Get an encoding from its (ASCII) label, if the label is recognized.
/// See https://encoding.spec.whatwg.org/#concept-encoding-get
fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding>;

/// Return the UTF-8 encoding
fn utf8() -> Self::Encoding;

/// Determine the character encoding of a CSS stylesheet and decode it.
/// Whether the given encoding is UTF-16BE or UTF-16LE
fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool;
}


/// Determine the character encoding of a CSS stylesheet.
///
/// This is based on the presence of a BOM (Byte Order Mark), an `@charset` rule, and
/// encoding meta-information.
Expand All @@ -20,48 +29,36 @@ use encoding::{EncodingRef, DecoderTrap, decode};
/// * `environment_encoding`: An optional `Encoding` object for the
///   [environment encoding](https://drafts.csswg.org/css-syntax/#environment-encoding), if any.
///
/// Returns a 2-tuple of a decoded Unicode string and the `Encoding` object that was used.
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>,
environment_encoding: Option<EncodingRef>)
-> (String, EncodingRef) {
/// Returns the encoding to use.
pub fn stylesheet_encoding<E>(css: &[u8], protocol_encoding_label: Option<&[u8]>,
environment_encoding: Option<E::Encoding>)
-> E::Encoding
where E: EncodingSupport {
// https://drafts.csswg.org/css-syntax/#the-input-byte-stream
match protocol_encoding_label {
None => (),
Some(label) => match encoding_from_whatwg_label(label) {
Some(label) => match E::from_label(label) {
None => (),
Some(fallback) => return decode_replace(css, fallback)
Some(protocol_encoding) => return protocol_encoding
}
}
if css.starts_with("@charset \"".as_bytes()) {
// 10 is "@charset \"".len()
// 100 is arbitrary so that no encoding label is more than 100-10 bytes.
match css[10..cmp::min(css.len(), 100)].iter().position(|&b| b == b'"') {
let prefix = b"@charset \"";
if css.starts_with(prefix) {
let rest = &css[prefix.len()..];
match rest.iter().position(|&b| b == b'"') {
None => (),
Some(label_length)
=> if css[10 + label_length..].starts_with("\";".as_bytes()) {
let label = &css[10..10 + label_length];
let label = label.iter().map(|&b| b as char).collect::<String>();
match encoding_from_whatwg_label(&*label) {
Some(label_length) => if rest[label_length..].starts_with(b"\";") {
let label = &rest[..label_length];
match E::from_label(label) {
None => (),
Some(fallback) => match fallback.name() {
"utf-16be" | "utf-16le"
=> return decode_replace(css, UTF_8 as EncodingRef),
_ => return decode_replace(css, fallback),
Some(charset_encoding) => if E::is_utf16_be_or_le(&charset_encoding) {
return E::utf8()
} else {
return charset_encoding
}
}
}
}
}
match environment_encoding {
None => (),
Some(fallback) => return decode_replace(css, fallback)
}
return decode_replace(css, UTF_8 as EncodingRef)
}


#[inline]
fn decode_replace(input: &[u8], fallback_encoding: EncodingRef)-> (String, EncodingRef) {
let (result, used_encoding) = decode(input, DecoderTrap::Replace, fallback_encoding);
(result.unwrap(), used_encoding)
environment_encoding.unwrap_or_else(E::utf8)
}
10 changes: 6 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ Implementation of [CSS Syntax Module Level 3](https://drafts.csswg.org/css-synta
# Input

Everything is based on `Parser` objects, which borrow a `&str` input.
If you have bytes (from a file, the network, or something),
see the `decode_stylesheet_bytes` function.
If you have bytes (from a file, the network, or something)
and want to support character encodings other than UTF-8,
see the `stylesheet_encoding` function,
which can be used together with rust-encoding or encoding-rs.

# Conventions for parsing functions

Expand Down Expand Up @@ -66,8 +68,8 @@ fn parse_border_spacing(_context: &ParserContext, input: &mut Parser)

#![recursion_limit="200"] // For color::parse_color_keyword

extern crate encoding;
#[macro_use] extern crate matches;
#[cfg(test)] extern crate encoding_rs;
#[cfg(test)] extern crate tempdir;
#[cfg(test)] extern crate rustc_serialize;
#[cfg(feature = "serde")] extern crate serde;
Expand All @@ -78,7 +80,7 @@ pub use rules_and_declarations::{parse_important};
pub use rules_and_declarations::{DeclarationParser, DeclarationListParser, parse_one_declaration};
pub use rules_and_declarations::{RuleListParser, parse_one_rule};
pub use rules_and_declarations::{AtRuleType, QualifiedRuleParser, AtRuleParser};
pub use from_bytes::decode_stylesheet_bytes;
pub use from_bytes::{stylesheet_encoding, EncodingSupport};
pub use color::{RGBA, Color, parse_color_keyword};
pub use nth::parse_nth;
pub use serializer::{ToCss, CssStringWriter, serialize_identifier, serialize_string, TokenSerializationType};
Expand Down
37 changes: 29 additions & 8 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#[cfg(feature = "bench")]
extern crate test;

use encoding_rs;
use std::borrow::Cow::{self, Borrowed};
use std::fs::File;
use std::io::{self, Write};
Expand All @@ -16,17 +17,14 @@ use tempdir::TempDir;
#[cfg(feature = "bench")]
use self::test::Bencher;

use encoding::label::encoding_from_whatwg_label;

use super::{Parser, Delimiter, Token, NumericValue, PercentageValue, SourceLocation,
DeclarationListParser, DeclarationParser, RuleListParser,
AtRuleType, AtRuleParser, QualifiedRuleParser,
parse_one_declaration, parse_one_rule, parse_important,
decode_stylesheet_bytes,
stylesheet_encoding, EncodingSupport,
TokenSerializationType,
Color, RGBA, parse_nth, ToCss};


macro_rules! JArray {
($($e: expr,)*) => { JArray![ $( $e ),* ] };
($($e: expr),*) => { Json::Array(vec!( $( $e.to_json() ),* )) }
Expand Down Expand Up @@ -198,6 +196,26 @@ fn one_rule() {

#[test]
fn stylesheet_from_bytes() {
/// Unit struct that implements `EncodingSupport` on top of the `encoding_rs` crate,
/// so the test can exercise `stylesheet_encoding` with a concrete encoding library.
pub struct EncodingRs;

impl EncodingSupport for EncodingRs {
    /// Encodings are represented by the static references that `encoding_rs` exposes.
    type Encoding = &'static encoding_rs::Encoding;

    /// Delegates label lookup to `encoding_rs::Encoding::for_label`.
    fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding> {
        encoding_rs::Encoding::for_label(ascii_label)
    }

    /// Returns `encoding_rs::UTF_8`.
    fn utf8() -> Self::Encoding {
        encoding_rs::UTF_8
    }

    /// True when `encoding` is either of the two UTF-16 encodings.
    fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool {
        let candidate = *encoding;
        candidate == encoding_rs::UTF_16LE || candidate == encoding_rs::UTF_16BE
    }
}


run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json"),
|input, expected| {
let map = match input {
Expand All @@ -210,17 +228,20 @@ fn stylesheet_from_bytes() {
assert!(c as u32 <= 0xFF);
c as u8
}).collect::<Vec<u8>>();
let protocol_encoding_label = get_string(&map, "protocol_encoding");
let protocol_encoding_label = get_string(&map, "protocol_encoding")
.map(|s| s.as_bytes());
let environment_encoding = get_string(&map, "environment_encoding")
.and_then(encoding_from_whatwg_label);
.map(|s| s.as_bytes())
.and_then(EncodingRs::from_label);

let (css_unicode, encoding) = decode_stylesheet_bytes(
let encoding = stylesheet_encoding::<EncodingRs>(
&css, protocol_encoding_label, environment_encoding);
let (css_unicode, used_encoding, _) = encoding.decode(&css);
let input = &mut Parser::new(&css_unicode);
let rules = RuleListParser::new_for_stylesheet(input, JsonParser)
.map(|result| result.unwrap_or(JArray!["error", "invalid"]))
.collect::<Vec<_>>();
JArray![rules, encoding.name()]
JArray![rules, used_encoding.name().to_lowercase()]
};
assert_json_eq(result, expected, Json::Object(map).to_string());
});
Expand Down