Skip to content

Commit 6e3fbbe

Browse files
committed
Merge pull request servo#33 from SimonSapin/master
Add parsing from bytes, with rust-encoding.
2 parents de4812d + 5178431 commit 6e3fbbe

File tree

6 files changed

+304
-5
lines changed

6 files changed

+304
-5
lines changed

css-parsing-tests/README.rst

+18
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,24 @@ associated with the expected result.
9191
The Unicode input is represented by a JSON string,
9292
the output as a list of `qualified rules`_ or at-rules_.
9393

94+
``stylesheet_bytes.json``
95+
Tests `Parse a stylesheet
96+
<http://dev.w3.org/csswg/css-syntax-3/#parse-a-stylesheet>`_
97+
together with `The input byte stream
98+
<http://dev.w3.org/csswg/css-syntax/#input-byte-stream>`_.
99+
The input is represented as a JSON object containing:
100+
101+
* A required ``css_bytes``, the input byte string,
102+
represented as a JSON string where code points U+0000 to U+00FF
103+
represent bytes of the same value.
104+
* An optional ``protocol_encoding``,
105+
a protocol encoding label as a JSON string, or null.
106+
* An optional ``environment_encoding``,
107+
an environment encoding label as a JSON string, or null.
108+
* An optional ``comment`` that is ignored.
109+
110+
The output is represented as a list of two items: a list of `qualified rules`_ or at-rules_, and the name of the encoding that was used.
111+
94112
``color3.json``
95113
Tests the ``<color>`` syntax `defined in CSS Color Level 3
96114
<http://www.w3.org/TR/css3-color/#colorunits>`_.
+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
[
2+
3+
{"css_bytes": ""},
4+
[[], "utf-8"],
5+
6+
{"css_bytes": "@\u00C3\u00A9",
7+
"protocol_encoding": null, "environment_encoding": null},
8+
[[["at-rule", "é", [], null]], "utf-8"],
9+
10+
{"css_bytes": "@\u00C3\u00A9"},
11+
[[["at-rule", "é", [], null]], "utf-8"],
12+
13+
{"css_bytes": "@\u0000\u00E9\u0000",
14+
"comment": "Untagged UTF-16, parsed as UTF-8"},
15+
[[["at-rule", "���", [], null]], "utf-8"],
16+
17+
{"css_bytes": "\u00FF\u00FE@\u0000\u00E9\u0000",
18+
"comment": "UTF-16 with a BOM"},
19+
[[["at-rule", "é", [], null]], "utf-16le"],
20+
21+
{"css_bytes": "\u00FE\u00FF\u0000@\u0000\u00E9"},
22+
[[["at-rule", "é", [], null]], "utf-16be"],
23+
24+
{"css_bytes": "@\u00E9"},
25+
[[["at-rule", "�", [], null]], "utf-8"],
26+
27+
28+
{"css_bytes": "@\u00E9", "protocol_encoding": "ISO-8859-2"},
29+
[[["at-rule", "é", [], null]], "iso-8859-2"],
30+
31+
{"css_bytes": "@\u00E9", "protocol_encoding": "ISO-8859-5"},
32+
[[["at-rule", "щ", [], null]], "iso-8859-5"],
33+
34+
{"css_bytes": "@\u00C3\u00A9", "protocol_encoding": "ISO-8859-2"},
35+
[[["at-rule", "ĂŠ", [], null]], "iso-8859-2"],
36+
37+
{"css_bytes": "\u00EF\u00BB\u00BF @\u00C3\u00A9",
38+
"protocol_encoding": "ISO-8859-2",
39+
"comment": "BOM takes precedence over protocol"},
40+
[[["at-rule", "é", [], null]], "utf-8"],
41+
42+
43+
{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9"},
44+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
45+
["at-rule", "щ", [], null]],
46+
"iso-8859-5"],
47+
48+
{"css_bytes": "@Charset \"ISO-8859-5\"; @\u00E9",
49+
"comment": "@charset has to match an exact byte pattern"},
50+
[[["at-rule", "Charset", [" ", ["string", "ISO-8859-5"]], null],
51+
["at-rule", "�", [], null]],
52+
"utf-8"],
53+
54+
{"css_bytes": "@charset  \"ISO-8859-5\"; @\u00E9",
55+
"comment": "@charset has to match an exact byte pattern"},
56+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
57+
["at-rule", "�", [], null]],
58+
"utf-8"],
59+
60+
{"css_bytes": "@charset 'ISO-8859-5'; @\u00E9",
61+
"comment": "@charset has to match an exact byte pattern"},
62+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
63+
["at-rule", "�", [], null]],
64+
"utf-8"],
65+
66+
67+
{"css_bytes": "@\u0000c\u0000h\u0000a\u0000r\u0000s\u0000e\u0000t\u0000 \u0000\"\u0000U\u0000T\u0000F\u0000-\u00001\u00006\u0000L\u0000E\u0000\"\u0000;\u0000@\u0000\u00e9\u0000",
68+
"comment": "@charset has to be ASCII-compatible itself"},
69+
[[["at-rule", "�c�h�a�r�s�e�t�",
70+
[" ", ["ident", "�"], ["string", "�U�T�F�-�1�6�L�E�"], ["ident", "�"]], null],
71+
["error", "invalid"]],
72+
"utf-8"],
73+
74+
{"css_bytes": "@charset \"UTF-16LE\"; @\u00C3\u00A9",
75+
"comment": "@charset can only specify ASCII-compatible encodings"},
76+
[[["at-rule", "charset", [" ", ["string", "UTF-16LE"]], null],
77+
["at-rule", "é", [], null]],
78+
"utf-8"],
79+
80+
81+
{"css_bytes": "\u00EF\u00BB\u00BF @charset \"ISO-8859-5\"; @\u00E9",
82+
"comment": "BOM takes precedence over @charset"},
83+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
84+
["at-rule", "�", [], null]],
85+
"utf-8"],
86+
87+
{"css_bytes": "\u00EF\u00BB\u00BF @charset \"ISO-8859-5\"; @\u00C3\u00A9",
88+
"comment": "BOM takes precedence over @charset"},
89+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
90+
["at-rule", "é", [], null]],
91+
"utf-8"],
92+
93+
{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9",
94+
"protocol_encoding": " Iso-8859-2",
95+
"comment": "Protocol takes precedence over @charset"},
96+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
97+
["at-rule", "é", [], null]],
98+
"iso-8859-2"],
99+
100+
101+
{"css_bytes": "@\u00E9", "environment_encoding": "ISO-8859-2"},
102+
[[["at-rule", "é", [], null]], "iso-8859-2"],
103+
104+
{"css_bytes": "@\u00E9", "environment_encoding": "ISO-8859-5"},
105+
[[["at-rule", "щ", [], null]], "iso-8859-5"],
106+
107+
{"css_bytes": "@charset \"ISO-8859-5\"; @\u00E9",
108+
"environment_encoding": "ISO-8859-2",
109+
"comment": "@charset takes precedence over environment"},
110+
[[["at-rule", "charset", [" ", ["string", "ISO-8859-5"]], null],
111+
["at-rule", "щ", [], null]],
112+
"iso-8859-5"],
113+
114+
{"css_bytes": "@\u00E9",
115+
"protocol_encoding": "ISO-8859-2",
116+
"environment_encoding": "ISO-8859-5",
117+
"comment": "protocol takes precedence over environment"},
118+
[[["at-rule", "é", [], null]], "iso-8859-2"],
119+
120+
{"css_bytes": "\u00EF\u00BB\u00BF @\u00C3\u00A9",
121+
"environment_encoding": "ISO-8859-5",
122+
"comment": "BOM takes precedence over environment"},
123+
[[["at-rule", "é", [], null]], "utf-8"]
124+
125+
126+
]

from_bytes.rs

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/* This Source Code Form is subject to the terms of the Mozilla Public
2+
* License, v. 2.0. If a copy of the MPL was not distributed with this
3+
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4+
5+
use std::str;
6+
7+
use encoding::label::encoding_from_whatwg_label;
8+
use encoding::all::UTF_8;
9+
use encoding::{EncodingRef, DecodeReplace, decode};
10+
11+
use tokenizer::{tokenize, Tokenizer};
12+
use parser::{parse_stylesheet_rules, StylesheetParser};
13+
14+
15+
/// Determine the character encoding of a CSS stylesheet and decode it.
16+
///
17+
/// This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
18+
/// an `@charset` rule,
19+
/// and encoding meta-information.
20+
///
/// The fallback encoding handed to the decoder is the first of these
/// that applies: a recognized `protocol_encoding_label`;
/// an `@charset "…";` rule matched byte-for-byte at the very start of `css`;
/// `environment_encoding`; and finally UTF-8.
/// NOTE(review): no BOM check is visible in this function; presumably
/// `encoding::decode` detects the BOM itself and reports it through
/// the encoding it returns. Confirm against rust-encoding.
///
21+
/// :param css: A byte string.
22+
/// :param protocol_encoding_label:
23+
/// The encoding label, if any, defined by HTTP or equivalent protocol.
24+
/// (e.g. via the `charset` parameter of the `Content-Type` header.)
25+
/// :param environment_encoding:
26+
/// An optional `Encoding` object
27+
/// for the `environment encoding
28+
/// <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
29+
/// if any.
30+
/// :returns:
31+
/// A 2-tuple of a decoded Unicode string
32+
/// and the `Encoding` object that was used.
33+
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>,
34+
environment_encoding: Option<EncodingRef>)
35+
-> (~str, EncodingRef) {
36+
// http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream
// A protocol label that maps to a known encoding is used as the fallback
// right away; a missing or unrecognized label falls through to the next step.
37+
match protocol_encoding_label {
38+
None => (),
39+
Some(label) => match encoding_from_whatwg_label(label) {
40+
None => (),
41+
Some(fallback) => return decode_replace(css, fallback)
42+
}
43+
}
// Only this exact ASCII byte prefix is accepted as an @charset rule:
// different case, extra whitespace or single quotes do not match.
44+
if css.starts_with("@charset \"".as_bytes()) {
45+
// 10 is "@charset \"".len()
46+
// 100 is arbitrary so that no encoding label is more than 100-10 bytes.
47+
match css.slice(10, css.len().min(&100)).position_elem(&('"' as u8)) {
48+
None => (),
49+
Some(label_length)
// The closing quote must be immediately followed by `;`.
50+
=> if css.slice_from(10 + label_length).starts_with("\";".as_bytes()) {
51+
let label = css.slice(10, 10 + label_length);
// Turn the label bytes into a string by mapping each byte
// to the char with the same numeric value.
52+
let label = str::from_chars(label.iter().map(|&b| b as char).to_owned_vec());
53+
match encoding_from_whatwg_label(label) {
54+
None => (),
55+
Some(fallback) => match fallback.name() {
// An @charset rule that names UTF-16 is not honoured:
// UTF-8 is used as the fallback instead.
56+
"utf-16be" | "utf-16le"
57+
=> return decode_replace(css, UTF_8 as EncodingRef),
58+
_ => return decode_replace(css, fallback),
59+
}
60+
}
61+
}
62+
}
63+
}
// No usable protocol label or @charset rule: try the environment encoding.
64+
match environment_encoding {
65+
None => (),
66+
Some(fallback) => return decode_replace(css, fallback)
67+
}
// Last resort: UTF-8.
68+
return decode_replace(css, UTF_8 as EncodingRef)
69+
}
70+
71+
72+
/// Decode `input` with `encoding::decode`, using the `DecodeReplace` trap
/// and `fallback_encoding` as the fallback.
///
/// Returns the decoded string together with the encoding that `decode`
/// reports as actually used, which may differ from `fallback_encoding`.
/// NOTE(review): `DecodeReplace` presumably substitutes U+FFFD for malformed
/// input so that the `unwrap()` below is not expected to fail. Confirm
/// against rust-encoding.
#[inline]
73+
fn decode_replace(input: &[u8], fallback_encoding: EncodingRef)-> (~str, EncodingRef) {
74+
let (result, used_encoding) = decode(input, DecodeReplace, fallback_encoding);
75+
(result.unwrap(), used_encoding)
76+
}
77+
78+
79+
/// Parse stylesheet from bytes.
80+
///
/// The bytes are first decoded with `decode_stylesheet_bytes`;
/// the resulting string is then tokenized and passed to `parse_stylesheet_rules`.
///
81+
/// :param css_bytes: A byte string.
82+
/// :param protocol_encoding_label:
83+
/// The encoding label, if any, defined by HTTP or equivalent protocol.
84+
/// (e.g. via the `charset` parameter of the `Content-Type` header.)
85+
/// :param environment_encoding:
86+
/// An optional `Encoding` object
87+
/// for the `environment encoding
88+
/// <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
89+
/// if any.
90+
/// :returns:
91+
/// A 2-tuple of an Iterator<Result<Rule, SyntaxError>>
92+
/// and the `Encoding` object that was used.
93+
pub fn parse_stylesheet_rules_from_bytes(
94+
css_bytes: &[u8], protocol_encoding_label: Option<&str>,
95+
environment_encoding: Option<EncodingRef>)
96+
-> (StylesheetParser<Tokenizer>, EncodingRef) {
97+
let (css_unicode, encoding) = decode_stylesheet_bytes(
98+
css_bytes, protocol_encoding_label, environment_encoding);
99+
(parse_stylesheet_rules(tokenize(css_unicode)), encoding)
100+
}

lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,20 @@
77
#[feature(globs, macro_rules)];
88

99
extern mod extra;
10+
extern mod encoding; // https://github.com/lifthrasiir/rust-encoding
1011

1112
pub use tokenizer::tokenize;
1213
pub use parser::{parse_stylesheet_rules, parse_rule_list, parse_declaration_list,
1314
parse_one_rule, parse_one_declaration, parse_one_component_value};
15+
pub use from_bytes::{decode_stylesheet_bytes, parse_stylesheet_rules_from_bytes};
1416
pub use color::{RGBA, Color, CurrentColor};
1517
pub use nth::parse_nth;
1618
pub use serializer::{ToCss, serialize_identifier, serialize_string};
1719

1820
pub mod ast;
1921
mod tokenizer;
2022
mod parser;
23+
mod from_bytes;
2124
mod color;
2225
mod nth;
2326
mod serializer;

parser.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ pub fn parse_one_component_value<T: Iterator<Node>>(mut iter: T)
9898
// *********** End of public API ***********
9999

100100

101-
struct StylesheetParser<T>{ iter: T }
101+
// used in from_bytes.rs but not reexported in the crate top-level
102+
pub struct StylesheetParser<T>{ iter: T }
103+
102104
struct RuleListParser<T>{ iter: T }
103105
struct DeclarationListParser<T>{ iter: T }
104106

tests.rs

+54-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use extra::{tempfile, json};
99
use extra::json::ToJson;
1010
use extra::test;
1111

12+
use encoding::label::encoding_from_whatwg_label;
13+
1214
use super::*;
1315
use ast::*;
1416

@@ -56,23 +58,34 @@ fn assert_json_eq(results: json::Json, expected: json::Json, message: ~str) {
5658
}
5759
5860
59-
fn run_json_tests<T: ToJson>(json_data: &str, parse: &fn (input: ~str) -> T) {
61+
fn run_raw_json_tests(json_data: &str, run: &fn (json::Json, json::Json)) {
6062
let items = match json::from_str(json_data) {
6163
Ok(json::List(items)) => items,
6264
_ => fail!("Invalid JSON")
6365
};
6466
assert!(items.len() % 2 == 0);
65-
let mut input: Option<~str> = None;
67+
let mut input = None;
6668
for item in items.move_iter() {
6769
match (&input, item) {
68-
(&None, json::String(string)) => input = Some(string),
70+
(&None, json_obj) => input = Some(json_obj),
6971
(&Some(_), expected) => {
7072
let input = input.take_unwrap();
73+
run(input, expected)
74+
},
75+
};
76+
}
77+
}
78+
79+
80+
fn run_json_tests<T: ToJson>(json_data: &str, parse: &fn (input: ~str) -> T) {
81+
do run_raw_json_tests(json_data) |input, expected| {
82+
match input {
83+
json::String(input) => {
7184
let result = parse(input.to_owned()).to_json();
7285
assert_json_eq(result, expected, input);
7386
},
7487
_ => fail!("Unexpected JSON")
75-
};
88+
}
7689
}
7790
}
7891
@@ -133,6 +146,43 @@ fn one_rule() {
133146
}
134147
135148
149+
// Runs every (input, expected) pair of stylesheet_bytes.json through
// `parse_stylesheet_rules_from_bytes` and compares the JSON form of
// (rules, name of the encoding used) with the expected value.
#[test]
150+
fn stylesheet_from_bytes() {
151+
do run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json"))
152+
|input, expected| {
// Each input must be a JSON object; anything else is a broken fixture.
153+
let map = match input {
154+
json::Object(map) => map,
155+
_ => fail!("Unexpected JSON")
156+
};
157+
158+
let result = {
// `css_bytes` is required. Each of its chars must be at most U+00FF
// and stands for the byte with the same value.
159+
let css = get_string(map, &~"css_bytes").unwrap().iter().map(|c| {
160+
assert!(c as u32 <= 0xFF);
161+
c as u8
162+
}).to_owned_vec();
// The protocol encoding is passed on as a label; the environment encoding
// is resolved to an encoding object here (None if absent or unrecognized).
163+
let protocol_encoding_label = get_string(map, &~"protocol_encoding");
164+
let environment_encoding = get_string(map, &~"environment_encoding")
165+
.and_then(encoding_from_whatwg_label);
166+
167+
let (mut rules, used_encoding) = parse_stylesheet_rules_from_bytes(
168+
css, protocol_encoding_label, environment_encoding);
169+
170+
(rules.to_owned_vec(), used_encoding.name().to_owned()).to_json()
171+
};
// The serialized input object is used as the failure message.
172+
assert_json_eq(result, expected, json::Object(map).to_str());
173+
}
174+
// Look up `key` in a JSON object: Some(string) for a JSON string,
// None for a JSON null or a missing key, failure for any other value.
175+
fn get_string<'a>(map: &'a json::Object, key: &~str) -> Option<&'a str> {
176+
match map.find(key) {
177+
Some(&json::String(ref s)) => Some(s.as_slice()),
178+
Some(&json::Null) => None,
179+
None => None,
180+
_ => fail!("Unexpected JSON"),
181+
}
182+
}
183+
}
184+
185+
136186
fn run_color_tests(json_data: &str, to_json: &fn(result: Option<Color>) -> json::Json) {
137187
do run_json_tests(json_data) |input| {
138188
match parse_one_component_value(tokenize(input)) {

0 commit comments

Comments
 (0)