Skip to content

Commit 022ccf7

Browse files
committed
Add parsing from bytes, with rust-encoding.
New dependency: https://github.com/lifthrasiir/rust-encoding
1 parent c17ef84 commit 022ccf7

File tree

4 files changed

+162
-5
lines changed

4 files changed

+162
-5
lines changed

from_bytes.rs

+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/* This Source Code Form is subject to the terms of the Mozilla Public
2+
* License, v. 2.0. If a copy of the MPL was not distributed with this
3+
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4+
5+
use std::str;
6+
7+
use encoding::label::encoding_from_whatwg_label;
8+
use encoding::all::UTF_8;
9+
use encoding::Encoding;
10+
use encoding::DecodeReplace;
11+
use encoding::decode;
12+
13+
use tokenizer::{tokenize, Tokenizer};
14+
use parser::{parse_stylesheet_rules, StylesheetParser};
15+
16+
17+
/// Determine the character encoding of a CSS stylesheet and decode it.
18+
///
19+
/// This is based on the presence of a :abbr:`BOM (Byte Order Mark)`,
20+
/// an `@charset` rule,
21+
/// and encoding meta-information.
///
/// Precedence, as implemented in the body: a recognised
/// `protocol_encoding_label`, then a leading `@charset "<label>";` rule,
/// then `environment_encoding`, and finally UTF-8.
/// NOTE(review): no BOM check is visible in this function; presumably
/// `encoding::decode` (called through `decode_replace`) performs it -- confirm.
22+
///
23+
/// :param css: A byte string.
24+
/// :param protocol_encoding_label:
25+
/// The encoding label, if any, defined by HTTP or equivalent protocol.
26+
/// (e.g. via the `charset` parameter of the `Content-Type` header.)
27+
/// :param environment_encoding:
28+
/// An optional `Encoding` object
29+
/// for the `environment encoding
30+
/// <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
31+
/// if any.
32+
/// :returns:
33+
/// A 2-tuple of a decoded Unicode string
34+
/// and the `Encoding` object that was used.
35+
pub fn decode_stylesheet_bytes(css: &[u8], protocol_encoding_label: Option<&str>,
36+
environment_encoding: Option<&'static Encoding>)
37+
-> (~str, &'static Encoding) {
38+
// http://dev.w3.org/csswg/css-syntax/#the-input-byte-stream
// Step 1: a protocol-level label that maps to a known encoding wins outright.
// A missing or unrecognised label falls through to the next step.
39+
match protocol_encoding_label {
40+
None => (),
41+
Some(label) => match encoding_from_whatwg_label(label) {
42+
None => (),
43+
Some(fallback) => return decode_replace(css, fallback)
44+
}
45+
}
// Step 2: look for a literal `@charset "<label>";` at the very start of the bytes.
46+
if css.starts_with("@charset \"".as_bytes()) {
47+
// 10 is "@charset \"".len()
48+
// 100 is arbitrary so that no encoding label is more than 100-10 bytes.
// The closing quote is only searched for within bytes 10..min(len, 100);
// `label_length` is its offset relative to byte 10.
49+
match css.slice(10, css.len().min(&100)).position_elem(&('"' as u8)) {
50+
None => (),
51+
Some(label_length)
// The closing quote must be immediately followed by `;`.
52+
=> if css.slice_from(10 + label_length).starts_with("\";".as_bytes()) {
53+
let label = css.slice(10, 10 + label_length);
// Widen each byte to the char with the same code point, so the label
// becomes a string without any UTF-8 validation step.
54+
let label = str::from_chars(label.iter().map(|&b| b as char).to_owned_vec());
55+
match encoding_from_whatwg_label(label) {
56+
None => (),
57+
Some(fallback) => match fallback.name() {
// A declared UTF-16 variant is replaced by UTF-8.
// NOTE(review): presumably per the css-syntax algorithm linked
// above -- confirm against the spec.
58+
"utf-16be" | "utf-16le"
59+
=> return decode_replace(css, UTF_8 as &'static Encoding),
60+
_ => return decode_replace(css, fallback),
61+
}
62+
}
63+
}
64+
}
65+
}
// Step 3: fall back to the environment encoding when the caller supplied one.
66+
match environment_encoding {
67+
None => (),
68+
Some(fallback) => return decode_replace(css, fallback)
69+
}
// Step 4: nothing else applied, so use UTF-8 as the fallback.
70+
return decode_replace(css, UTF_8 as &'static Encoding)
71+
}
72+
73+
74+
/// Decode `input` by calling `encoding::decode` with the `DecodeReplace` trap
/// and `fallback_encoding` as the fallback.
///
/// :returns:
/// A 2-tuple of the decoded string
/// and the `Encoding` object that `decode` reports as used.
#[inline]
75+
fn decode_replace(input: &[u8], fallback_encoding: &'static Encoding)-> (~str, &'static Encoding) {
76+
let (result, used_encoding) = decode(input, DecodeReplace, fallback_encoding);
// NOTE(review): `unwrap()` presumably cannot fail here because `DecodeReplace`
// substitutes malformed input instead of reporting an error -- confirm
// against rust-encoding.
77+
(result.unwrap(), used_encoding)
78+
}
79+
80+
81+
/// Parse stylesheet from bytes.
82+
///
/// The bytes are first decoded with `decode_stylesheet_bytes`; the resulting
/// string is then tokenized and handed to `parse_stylesheet_rules`.
///
83+
/// :param css_bytes: A byte string.
84+
/// :param protocol_encoding_label:
85+
/// The encoding label, if any, defined by HTTP or equivalent protocol.
86+
/// (e.g. via the `charset` parameter of the `Content-Type` header.)
87+
/// :param environment_encoding:
88+
/// An optional `Encoding` object
89+
/// for the `environment encoding
90+
/// <http://www.w3.org/TR/css-syntax/#environment-encoding>`_,
91+
/// if any.
92+
/// :returns:
93+
/// A 2-tuple of an Iterator<Result<Rule, SyntaxError>>
94+
/// and the `Encoding` object that was used.
95+
pub fn parse_stylesheet_rules_from_bytes(
96+
css_bytes: &[u8], protocol_encoding_label: Option<&str>,
97+
environment_encoding: Option<&'static Encoding>)
98+
-> (StylesheetParser<Tokenizer>, &'static Encoding) {
// Both encoding hints are forwarded unchanged to the decoding step.
99+
let (css_unicode, encoding) = decode_stylesheet_bytes(
100+
css_bytes, protocol_encoding_label, environment_encoding);
// The encoding chosen during decoding is returned alongside the rule parser.
101+
(parse_stylesheet_rules(tokenize(css_unicode)), encoding)
102+
}

lib.rs

+3
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,20 @@
77
#[feature(globs, macro_rules)];
88

99
extern mod extra;
10+
extern mod encoding; // https://github.com/lifthrasiir/rust-encoding
1011

1112
pub use tokenizer::tokenize;
1213
pub use parser::{parse_stylesheet_rules, parse_rule_list, parse_declaration_list,
1314
parse_one_rule, parse_one_declaration, parse_one_component_value};
15+
pub use from_bytes::{decode_stylesheet_bytes, parse_stylesheet_rules_from_bytes};
1416
pub use color::{RGBA, Color, CurrentColor};
1517
pub use nth::parse_nth;
1618
pub use serializer::{ToCss, serialize_identifier, serialize_string};
1719

1820
pub mod ast;
1921
mod tokenizer;
2022
mod parser;
23+
mod from_bytes;
2124
mod color;
2225
mod nth;
2326
mod serializer;

parser.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@ pub fn parse_one_component_value<T: Iterator<Node>>(mut iter: T)
9898
// *********** End of public API ***********
9999

100100

101-
struct StylesheetParser<T>{ iter: T }
101+
// used in from_bytes.rs but not reexported in the crate top-level
102+
pub struct StylesheetParser<T>{ iter: T }
103+
102104
struct RuleListParser<T>{ iter: T }
103105
struct DeclarationListParser<T>{ iter: T }
104106

tests.rs

+54-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use extra::{tempfile, json};
99
use extra::json::ToJson;
1010
use extra::test;
1111

12+
use encoding::label::encoding_from_whatwg_label;
13+
1214
use super::*;
1315
use ast::*;
1416

@@ -56,23 +58,34 @@ fn assert_json_eq(results: json::Json, expected: json::Json, message: ~str) {
5658
}
5759
5860
59-
fn run_json_tests<T: ToJson>(json_data: &str, parse: &fn (input: ~str) -> T) {
61+
fn run_raw_json_tests(json_data: &str, run: &fn (json::Json, json::Json)) {
6062
let items = match json::from_str(json_data) {
6163
Ok(json::List(items)) => items,
6264
_ => fail!("Invalid JSON")
6365
};
6466
assert!(items.len() % 2 == 0);
65-
let mut input: Option<~str> = None;
67+
let mut input = None;
6668
for item in items.move_iter() {
6769
match (&input, item) {
68-
(&None, json::String(string)) => input = Some(string),
70+
(&None, json_obj) => input = Some(json_obj),
6971
(&Some(_), expected) => {
7072
let input = input.take_unwrap();
73+
run(input, expected)
74+
},
75+
};
76+
}
77+
}
78+
79+
80+
fn run_json_tests<T: ToJson>(json_data: &str, parse: &fn (input: ~str) -> T) {
81+
do run_raw_json_tests(json_data) |input, expected| {
82+
match input {
83+
json::String(input) => {
7184
let result = parse(input.to_owned()).to_json();
7285
assert_json_eq(result, expected, input);
7386
},
7487
_ => fail!("Unexpected JSON")
75-
};
88+
}
7689
}
7790
}
7891
@@ -133,6 +146,43 @@ fn one_rule() {
133146
}
134147
135148
149+
/// Run `parse_stylesheet_rules_from_bytes` over every case in
/// `css-parsing-tests/stylesheet_bytes.json` and compare the parsed rules,
/// together with the name of the encoding used, against the expected JSON.
#[test]
150+
fn stylesheet_from_bytes() {
151+
do run_raw_json_tests(include_str!("css-parsing-tests/stylesheet_bytes.json"))
152+
|input, expected| {
// Each test input must be a JSON object; anything else aborts the test.
153+
let map = match input {
154+
json::Object(map) => map,
155+
_ => fail!("Unexpected JSON")
156+
};
157+
158+
let result = {
// `css_bytes` is a JSON string in which every char stands for one byte:
// the assert rejects chars above 0xFF before each is narrowed to `u8`.
159+
let css = get_string(map, &~"css_bytes").unwrap().iter().map(|c| {
160+
assert!(c as u32 <= 0xFF);
161+
c as u8
162+
}).to_owned_vec();
// The protocol encoding is passed on as a label string, while the
// environment encoding label is resolved to an `Encoding` object first.
163+
let protocol_encoding_label = get_string(map, &~"protocol_encoding");
164+
let environment_encoding = get_string(map, &~"environment_encoding")
165+
.and_then(encoding_from_whatwg_label);
166+
167+
let (mut rules, used_encoding) = parse_stylesheet_rules_from_bytes(
168+
css, protocol_encoding_label, environment_encoding);
169+
// Collect the rules and pair them with the encoding name for comparison.
170+
(rules.to_owned_vec(), used_encoding.name().to_owned()).to_json()
171+
};
// The serialized input object is passed as the third (message) argument.
172+
assert_json_eq(result, expected, json::Object(map).to_str());
173+
}
174+
// Look up `key` in `map` as a string: a missing key or a JSON null yields
// `None`; a value of any other JSON type fails the test.
175+
fn get_string<'a>(map: &'a json::Object, key: &~str) -> Option<&'a str> {
176+
match map.find(key) {
177+
Some(&json::String(ref s)) => Some(s.as_slice()),
178+
Some(&json::Null) => None,
179+
None => None,
180+
_ => fail!("Unexpected JSON"),
181+
}
182+
}
183+
}
184+
185+
136186
fn run_color_tests(json_data: &str, to_json: &fn(result: Option<Color>) -> json::Json) {
137187
do run_json_tests(json_data) |input| {
138188
match parse_one_component_value(tokenize(input)) {

0 commit comments

Comments
 (0)