Skip to content

Commit 87a1eee

Browse files
committed
Make encoding support generic.
Use encoding-rs in tests.
1 parent c336382 commit 87a1eee

File tree

4 files changed

+73
-50
lines changed

4 files changed

+73
-50
lines changed

Cargo.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "cssparser"
4-
version = "0.8.0"
4+
version = "0.9.0"
55
authors = [ "Simon Sapin <simon.sapin@exyr.org>" ]
66

77
description = "Rust implementation of CSS Syntax Level 3"
@@ -14,12 +14,15 @@ build = "build.rs"
1414

1515
exclude = ["src/css-parsing-tests"]
1616

17+
[lib]
18+
doctest = false
19+
1720
[dev-dependencies]
1821
rustc-serialize = "0.3"
1922
tempdir = "0.3"
23+
encoding_rs = "0.3.2"
2024

2125
[dependencies]
22-
encoding = "0.2"
2326
heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
2427
matches = "0.1"
2528
serde = {version = ">=0.6.6, <0.9", optional = true}

src/from_bytes.rs

Lines changed: 33 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,23 @@
22
* License, v. 2.0. If a copy of the MPL was not distributed with this
33
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
44

5-
use std::cmp;
5+
/// Abstraction that lets cssparser avoid depending on an encoding library
6+
pub trait EncodingSupport {
7+
/// The type that represents a single character encoding
8+
type Encoding;
69

7-
use encoding::label::encoding_from_whatwg_label;
8-
use encoding::all::UTF_8;
9-
use encoding::{EncodingRef, DecoderTrap, decode};
10+
/// Get an encoding from its label: https://encoding.spec.whatwg.org/#concept-encoding-get
11+
fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding>;
1012

13+
/// Return the UTF-8 encoding
14+
fn utf8() -> Self::Encoding;
1115

12-
/// Determine the character encoding of a CSS stylesheet and decode it.
16+
/// Whether the given encoding is UTF-16BE or UTF-16LE
17+
fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool;
18+
}
19+
20+
21+
/// Determine the character encoding of a CSS stylesheet.
1322
///
1423
/// This is based on the presence of a BOM (Byte Order Mark), an `@charset` rule, and
1524
/// encoding meta-information.
@@ -20,48 +29,36 @@ use encoding::{EncodingRef, DecoderTrap, decode};
2029
/// * `environment_encoding`: An optional `Encoding` object for the [environment encoding]
2130
/// (https://drafts.csswg.org/css-syntax/#environment-encoding), if any.
2231
///
23-
/// Returns a 2-tuple of a decoded Unicode string and the `Encoding` object that was used.
24-
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>,
25-
environment_encoding: Option<EncodingRef>)
26-
-> (String, EncodingRef) {
32+
/// Returns the encoding to use.
33+
pub fn stylesheet_encoding<E>(css: &[u8], protocol_encoding_label: Option<&[u8]>,
34+
environment_encoding: Option<E::Encoding>)
35+
-> E::Encoding
36+
where E: EncodingSupport {
2737
// https://drafts.csswg.org/css-syntax/#the-input-byte-stream
2838
match protocol_encoding_label {
2939
None => (),
30-
Some(label) => match encoding_from_whatwg_label(label) {
40+
Some(label) => match E::from_label(label) {
3141
None => (),
32-
Some(fallback) => return decode_replace(css, fallback)
42+
Some(protocol_encoding) => return protocol_encoding
3343
}
3444
}
35-
if css.starts_with("@charset \"".as_bytes()) {
36-
// 10 is "@charset \"".len()
37-
// 100 is arbitrary so that no encoding label is more than 100-10 bytes.
38-
match css[10..cmp::min(css.len(), 100)].iter().position(|&b| b == b'"') {
45+
let prefix = b"@charset \"";
46+
if css.starts_with(prefix) {
47+
let rest = &css[prefix.len()..];
48+
match rest.iter().position(|&b| b == b'"') {
3949
None => (),
40-
Some(label_length)
41-
=> if css[10 + label_length..].starts_with("\";".as_bytes()) {
42-
let label = &css[10..10 + label_length];
43-
let label = label.iter().map(|&b| b as char).collect::<String>();
44-
match encoding_from_whatwg_label(&*label) {
50+
Some(label_length) => if rest[label_length..].starts_with(b"\";") {
51+
let label = &rest[..label_length];
52+
match E::from_label(label) {
4553
None => (),
46-
Some(fallback) => match fallback.name() {
47-
"utf-16be" | "utf-16le"
48-
=> return decode_replace(css, UTF_8 as EncodingRef),
49-
_ => return decode_replace(css, fallback),
54+
Some(charset_encoding) => if E::is_utf16_be_or_le(&charset_encoding) {
55+
return E::utf8()
56+
} else {
57+
return charset_encoding
5058
}
5159
}
5260
}
5361
}
5462
}
55-
match environment_encoding {
56-
None => (),
57-
Some(fallback) => return decode_replace(css, fallback)
58-
}
59-
return decode_replace(css, UTF_8 as EncodingRef)
60-
}
61-
62-
63-
#[inline]
64-
fn decode_replace(input: &[u8], fallback_encoding: EncodingRef)-> (String, EncodingRef) {
65-
let (result, used_encoding) = decode(input, DecoderTrap::Replace, fallback_encoding);
66-
(result.unwrap(), used_encoding)
63+
environment_encoding.unwrap_or_else(E::utf8)
6764
}

src/lib.rs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ Implementation of [CSS Syntax Module Level 3](https://drafts.csswg.org/css-synta
1515
# Input
1616
1717
Everything is based on `Parser` objects, which borrow a `&str` input.
18-
If you have bytes (from a file, the network, or something),
19-
see the `decode_stylesheet_bytes` function.
18+
If you have bytes (from a file, the network, or something)
19+
and want to support character encodings other than UTF-8,
20+
see the `stylesheet_encoding` function,
21+
which can be used together with rust-encoding or encoding-rs.
2022
2123
# Conventions for parsing functions
2224
@@ -66,8 +68,8 @@ fn parse_border_spacing(_context: &ParserContext, input: &mut Parser)
6668

6769
#![recursion_limit="200"] // For color::parse_color_keyword
6870

69-
extern crate encoding;
7071
#[macro_use] extern crate matches;
72+
#[cfg(test)] extern crate encoding_rs;
7173
#[cfg(test)] extern crate tempdir;
7274
#[cfg(test)] extern crate rustc_serialize;
7375
#[cfg(feature = "serde")] extern crate serde;
@@ -78,7 +80,7 @@ pub use rules_and_declarations::{parse_important};
7880
pub use rules_and_declarations::{DeclarationParser, DeclarationListParser, parse_one_declaration};
7981
pub use rules_and_declarations::{RuleListParser, parse_one_rule};
8082
pub use rules_and_declarations::{AtRuleType, QualifiedRuleParser, AtRuleParser};
81-
pub use from_bytes::decode_stylesheet_bytes;
83+
pub use from_bytes::{stylesheet_encoding, EncodingSupport};
8284
pub use color::{RGBA, Color, parse_color_keyword};
8385
pub use nth::parse_nth;
8486
pub use serializer::{ToCss, CssStringWriter, serialize_identifier, serialize_string, TokenSerializationType};

src/tests.rs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#[cfg(feature = "bench")]
66
extern crate test;
77

8+
use encoding_rs;
89
use std::borrow::Cow::{self, Borrowed};
910
use std::fs::File;
1011
use std::io::{self, Write};
@@ -16,17 +17,14 @@ use tempdir::TempDir;
1617
#[cfg(feature = "bench")]
1718
use self::test::Bencher;
1819

19-
use encoding::label::encoding_from_whatwg_label;
20-
2120
use super::{Parser, Delimiter, Token, NumericValue, PercentageValue, SourceLocation,
2221
DeclarationListParser, DeclarationParser, RuleListParser,
2322
AtRuleType, AtRuleParser, QualifiedRuleParser,
2423
parse_one_declaration, parse_one_rule, parse_important,
25-
decode_stylesheet_bytes,
24+
stylesheet_encoding, EncodingSupport,
2625
TokenSerializationType,
2726
Color, RGBA, parse_nth, ToCss};
2827

29-
3028
macro_rules! JArray {
3129
($($e: expr,)*) => { JArray![ $( $e ),* ] };
3230
($($e: expr),*) => { Json::Array(vec!( $( $e.to_json() ),* )) }
@@ -198,6 +196,26 @@ fn one_rule() {
198196

199197
#[test]
200198
fn stylesheet_from_bytes() {
199+
pub struct EncodingRs;
200+
201+
impl EncodingSupport for EncodingRs {
202+
type Encoding = &'static encoding_rs::Encoding;
203+
204+
fn utf8() -> Self::Encoding {
205+
encoding_rs::UTF_8
206+
}
207+
208+
fn is_utf16_be_or_le(encoding: &Self::Encoding) -> bool {
209+
*encoding == encoding_rs::UTF_16LE ||
210+
*encoding == encoding_rs::UTF_16BE
211+
}
212+
213+
fn from_label(ascii_label: &[u8]) -> Option<Self::Encoding> {
214+
encoding_rs::Encoding::for_label(ascii_label)
215+
}
216+
}
217+
218+
201219
run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json"),
202220
|input, expected| {
203221
let map = match input {
@@ -210,17 +228,20 @@ fn stylesheet_from_bytes() {
210228
assert!(c as u32 <= 0xFF);
211229
c as u8
212230
}).collect::<Vec<u8>>();
213-
let protocol_encoding_label = get_string(&map, "protocol_encoding");
231+
let protocol_encoding_label = get_string(&map, "protocol_encoding")
232+
.map(|s| s.as_bytes());
214233
let environment_encoding = get_string(&map, "environment_encoding")
215-
.and_then(encoding_from_whatwg_label);
234+
.map(|s| s.as_bytes())
235+
.and_then(EncodingRs::from_label);
216236

217-
let (css_unicode, encoding) = decode_stylesheet_bytes(
237+
let encoding = stylesheet_encoding::<EncodingRs>(
218238
&css, protocol_encoding_label, environment_encoding);
239+
let (css_unicode, used_encoding, _) = encoding.decode(&css);
219240
let input = &mut Parser::new(&css_unicode);
220241
let rules = RuleListParser::new_for_stylesheet(input, JsonParser)
221242
.map(|result| result.unwrap_or(JArray!["error", "invalid"]))
222243
.collect::<Vec<_>>();
223-
JArray![rules, encoding.name()]
244+
JArray![rules, used_encoding.name().to_lowercase()]
224245
};
225246
assert_json_eq(result, expected, Json::Object(map).to_string());
226247
});

0 commit comments

Comments
 (0)