Skip to content

Commit d3ddb92

Browse files
committed
Better ASCII case-insensitive matching.
1 parent 3dc9f50 commit d3ddb92

File tree

3 files changed

+95
-23
lines changed

3 files changed

+95
-23
lines changed

cssparser.rc

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,94 @@ pub mod ast;
1313

1414
#[cfg(test)]
1515
mod tests;
16+
17+
18+
/// Return whether `string` is an ASCII case-insensitive match for `reference`,
19+
/// where `reference` is already in ASCII lower-case.
20+
pub fn eq_ascii_lower(string: &str, reference: &str) -> bool {
21+
#[inline]
22+
fn eq(string: &str, reference: &str) -> bool {
23+
for std::uint::range(0, string.len()) |i| {
24+
if ASCII_LOWER_MAP[string[i]] != reference[i] {
25+
return false
26+
}
27+
}
28+
true
29+
}
30+
31+
#[cfg(not(test))]
32+
#[inline]
33+
fn check_reference(_reference: &str) {}
34+
#[cfg(test)]
35+
#[inline]
36+
fn check_reference(reference: &str) {
37+
assert!(eq(reference, reference), ~"Reference must be ASCII lower case.");
38+
}
39+
check_reference(reference);
40+
41+
string.len() == reference.len() && eq(string, reference)
42+
}
43+
44+
45+
/// Return an ASCII lower-case copy of `string`:
46+
/// A-Z is replaced by a-z; every other character, including non-ASCII letters, is unchanged.
47+
pub fn to_ascii_lower(string: &str) -> ~str {
48+
let mut lower = string.to_owned();
49+
for std::uint::range(0, lower.len()) |i| {
50+
lower[i] = ASCII_LOWER_MAP[lower[i]];
51+
}
52+
lower
53+
}
54+
55+
56+
#[test]
57+
fn test_ascii_lower() {
58+
assert!(eq_ascii_lower("url()URL()uRl()Ürl", "url()url()url()Ürl"));
59+
assert!(to_ascii_lower("url()URL()uRl()Ürl") == ~"url()url()url()Ürl");
60+
// Dotted capital I, Kelvin sign, Sharp S.
61+
assert!(eq_ascii_lower("HİKß", "hİKß"));
62+
assert!(to_ascii_lower("HİKß") == ~"hİKß");
63+
64+
for std::uint::range(0, 128) |i| {
65+
let c = i as char;
66+
let lower = if 'A' <= c && c <= 'Z' { c + 'a' - 'A' } else { c };
67+
assert!(ASCII_LOWER_MAP[i] as char == lower)
68+
}
69+
}
70+
71+
72+
// This is UTF-8-safe: only the ASCII bytes A-Z are changed, and every byte of a multi-byte UTF-8 sequence is >= 0x80, which maps to itself.
73+
static ASCII_LOWER_MAP: &'static [u8] = &[
74+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
75+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
76+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
77+
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
78+
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
79+
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
80+
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
81+
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
82+
0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
83+
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
84+
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
85+
0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
86+
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
87+
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
88+
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
89+
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
90+
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
91+
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
92+
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
93+
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
94+
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
95+
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
96+
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
97+
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
98+
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
99+
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
100+
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
101+
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
102+
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
103+
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
104+
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
105+
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
106+
];

parser.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use std::vec;
1515

1616
use ast::*;
1717
use tokenizer::*;
18+
use super::eq_ascii_lower;
1819

1920

2021
// TODO: Use a trait?
@@ -217,7 +218,7 @@ fn parse_declaration_important(iter: &mut ComponentValueIterator) -> bool {
217218
Some(Ident(value)) => value,
218219
_ => return false,
219220
};
220-
if ascii_lower(ident_value) != ~"important" { return false }
221+
if !eq_ascii_lower(ident_value, "important") { return false }
221222
match iter.next_non_whitespace() {
222223
Some(Semicolon) => true,
223224
None => true,

tokenizer.rs

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use std::{str, u32, vec};
88

99
use ast::*;
10+
use super::eq_ascii_lower;
1011

1112

1213
pub struct Parser {
@@ -99,30 +100,9 @@ pub fn next_component_value(parser: &mut Parser) -> Option<ComponentValue> {
99100
}
100101

101102

102-
pub fn ascii_lower(string: &str) -> ~str {
103-
// Warning: premature optimization ahead ;)
104-
// TODO: would it be more efficient to work on bytes,
105-
// without decoding/re-encoding UTF-8?
106-
do string.map_chars |c| {
107-
match c {
108-
'A'..'Z' => c + 'a' - 'A',
109-
_ => c,
110-
}
111-
}
112-
}
113-
114-
115103
// *********** End of public API ***********
116104

117105

118-
#[test]
119-
fn test_ascii_lower() {
120-
assert!(ascii_lower("url()URL()uRl()Ürl") == ~"url()url()url()Ürl");
121-
// Dotted capital I, Kelvin sign, Sharp S.
122-
assert!(ascii_lower("HİKß") == ~"hİKß");
123-
}
124-
125-
126106
#[inline]
127107
fn preprocess(input: &str) -> ~str {
128108
// TODO: Is this faster if done in one pass?
@@ -281,7 +261,7 @@ fn consume_ident(parser: &mut Parser) -> ComponentValue {
281261
match parser.current_char() {
282262
'(' => {
283263
parser.position += 1;
284-
if ascii_lower(string) == ~"url" { consume_url(parser) }
264+
if eq_ascii_lower(string, "url") { consume_url(parser) }
285265
else { Function(string, consume_block(parser, CloseParenthesis)) }
286266
},
287267
_ => Ident(string)

0 commit comments

Comments
 (0)