
Commit cb8a26a

Use a procedural macro to create jump tables instead of doing branching.

This increases the performance of the stylesheet tokenization test by about 20%; one of the hottest instructions is now the sign extension Rust does to index into the array. A follow-up will add a way to compare both modes, so it's easy to measure when the jump table is worth it.

1 parent 65b570b commit cb8a26a
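
For context, a `match_byte!` invocation in `src/tokenizer.rs` has roughly the shape sketched below. The byte patterns and method names here are hypothetical; the sketch only illustrates the syntax that `parse_match_bytes_macro` (added in this commit) accepts: a value to match, a comma, then `pattern => { block }` arms.

```rust
match_byte! { self.next_byte_unchecked(),
    b'a'...b'z' | b'A'...b'Z' => { self.consume_ident_like() },
    b'0'...b'9' => { self.consume_numeric() },
    _ => { self.consume_delim() },
}
```

The build script rewrites each such invocation into a 256-entry jump table, so the generated code looks roughly like:

```rust
{
    enum Case { Case1 = 1, Case2 = 2, Case3 = 3 }
    static __CASES: [Case; 256] = [ /* one Case entry per byte value */ ];
    match __CASES[(self.next_byte_unchecked()) as usize] {
        Case::Case1 => { self.consume_ident_like() },
        Case::Case2 => { self.consume_numeric() },
        Case::Case3 => { self.consume_delim() },
    }
}
```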

File tree

7 files changed: +647 -86 lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions

```diff
@@ -10,6 +10,7 @@ repository = "https://github.com/servo/rust-cssparser"
 readme = "README.md"
 keywords = ["css", "syntax", "parser"]
 license = "MPL-2.0"
+build = "build.rs"
 
 
 [dev-dependencies]
@@ -22,6 +23,10 @@ heapsize = {version = ">=0.1.1, <0.4.0", optional = true}
 matches = "0.1"
 serde = {version = ">=0.6.6, <0.9", optional = true}
 
+[build-dependencies]
+syn = { version = "0.10.6", features = ["full", "visit"]}
+quote = "0.3"
+
 [features]
 serde-serialization = [ "serde" ]
 heap_size = [ "heapsize" ]
```

build.rs

Lines changed: 19 additions & 0 deletions

```rust
#[macro_use] extern crate quote;
extern crate syn;

use std::env;
use std::path::Path;

#[path = "src/macros/mod.rs"]
mod macros;

fn main() {
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();

    let tokenizer_rs = Path::new(&manifest_dir).join("src/tokenizer.rs");
    macros::match_byte::expand(&tokenizer_rs,
                               &Path::new(&env::var("OUT_DIR").unwrap()).join("tokenizer.rs"));

    println!("cargo:rerun-if-changed={}", tokenizer_rs.display());
}
```
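
Cargo sets both `CARGO_MANIFEST_DIR` and `OUT_DIR` for build scripts, so the expansion reads the checked-in `src/tokenizer.rs` and writes the generated module into the build directory; the `cargo:rerun-if-changed` line keeps Cargo from rerunning the expansion on every build.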

src/lib.rs

Lines changed: 3 additions & 1 deletion

```diff
@@ -137,7 +137,9 @@ macro_rules! match_ignore_ascii_case {
 }
 
 mod rules_and_declarations;
-mod tokenizer;
+mod tokenizer {
+    include!(concat!(env!("OUT_DIR"), "/tokenizer.rs"));
+}
 mod parser;
 mod from_bytes;
 mod color;
```
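
Because the generated file is included inside an inline `mod tokenizer { ... }`, the module path stays exactly what it was with the old `mod tokenizer;`, and no other code in the crate has to change.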

src/macros/match_byte.rs

Lines changed: 229 additions & 0 deletions

```rust
use quote::{ToTokens, Tokens};
use super::visit::{Visitor, RecursiveVisitor};
use std::fs::File;
use std::io::{Read, Write};
use std::mem;
use std::path::Path;
use std::vec;
use std::iter;
use syn;

pub fn expand(from: &Path, to: &Path) {
    let mut source = String::new();
    File::open(from).unwrap().read_to_string(&mut source).unwrap();
    let mut crate_ = syn::parse_crate(&source).expect("Parsing rules.rs module");
    let mut visitor = ExpanderVisitor;

    RecursiveVisitor { node_visitor: &mut visitor }.visit_crate(&mut crate_);

    let mut tokens = Tokens::new();
    crate_.to_tokens(&mut tokens);
    let code = tokens.to_string().replace("{ ", "{\n").replace(" }", "\n}");
    File::create(to).unwrap().write_all(code.as_bytes()).unwrap();
}

struct ExpanderVisitor;

impl Visitor for ExpanderVisitor {
    fn visit_expression(&mut self, expr: &mut syn::Expr) {
        let tokens = match expr.node {
            syn::ExprKind::Mac(ref mut macro_) if macro_.path == syn::Path::from("match_byte") => {
                mem::replace(&mut macro_.tts, vec![])
            }
            _ => return,
        };
        let (to_be_matched, table, cases, wildcard_binding) = parse_match_bytes_macro(tokens);
        *expr = expand_match_bytes_macro(to_be_matched, table, cases, wildcard_binding);
    }

    fn visit_statement(&mut self, stmt: &mut syn::Stmt) {
        let tokens = match *stmt {
            syn::Stmt::Mac(ref mut macro_) if macro_.0.path == syn::Path::from("match_byte") => {
                mem::replace(&mut macro_.0.tts, vec![])
            }
            _ => return,
        };
        let (to_be_matched, table, cases, wildcard_binding) = parse_match_bytes_macro(tokens);
        let expr = expand_match_bytes_macro(to_be_matched, table, cases, wildcard_binding);
        *stmt = syn::Stmt::Expr(Box::new(expr));
    }
}

fn parse_match_bytes_macro(tts: Vec<syn::TokenTree>) -> (Vec<syn::TokenTree>, Vec<u8>, Vec<Case>, Option<syn::Ident>) {
    use syn::TokenTree::Delimited;
    use syn::DelimToken::Brace;

    let mut tts = tts.into_iter();
    let inner_tts = match tts.next() {
        Some(Delimited(syn::Delimited { delim: Brace, tts })) => tts,
        other => panic!("expected one top-level {{}} block, got: {:?}", other),
    };

    assert_eq!(tts.next(), None);

    let mut tts = inner_tts.into_iter();

    // Grab the thing we're matching, until we find a comma.
    let mut left_hand_side = vec![];
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Comma)) => break,
            Some(other) => left_hand_side.push(other),
            None => panic!("Expected not to run out of tokens looking for a comma"),
        }
    }

    let mut cases = vec![];
    let mut table = vec![0; 256];

    let mut tts = tts.peekable();
    let mut case_id: u8 = 1;
    let mut binding = None;
    while tts.len() > 0 {
        cases.push(parse_case(&mut tts, &mut *table, &mut binding, case_id));

        // Allow an optional comma between cases.
        match tts.peek() {
            Some(&syn::TokenTree::Token(syn::Token::Comma)) => {
                tts.next();
            },
            _ => {},
        }

        case_id += 1;
    }

    (left_hand_side, table, cases, binding)
}

#[derive(Debug)]
struct Case(Vec<syn::TokenTree>);

/// Parses a single pattern => expression, and returns the case, filling in the
/// table with the case id for every byte that matched.
///
/// The `binding` parameter is the identifier that is used by the wildcard
/// pattern.
fn parse_case(tts: &mut iter::Peekable<vec::IntoIter<syn::TokenTree>>,
              table: &mut [u8],
              binding: &mut Option<syn::Ident>,
              case_id: u8)
              -> Case {
    // The last byte checked, as part of this pattern, to properly detect
    // ranges.
    let mut last_byte: Option<u8> = None;

    // Loop through the pattern filling with bytes the table.
    loop {
        match tts.next() {
            Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
                table[byte as usize] = case_id;
                last_byte = Some(byte);
            }
            Some(syn::TokenTree::Token(syn::Token::BinOp(syn::BinOpToken::Or))) => {
                last_byte = None; // This pattern is over.
            },
            Some(syn::TokenTree::Token(syn::Token::DotDotDot)) => {
                assert!(last_byte.is_some(), "Expected closed range!");
                match tts.next() {
                    Some(syn::TokenTree::Token(syn::Token::Literal(syn::Lit::Byte(byte)))) => {
                        for b in last_byte.take().unwrap()..byte {
                            if table[b as usize] == 0 {
                                table[b as usize] = case_id;
                            }
                        }
                        if table[byte as usize] == 0 {
                            table[byte as usize] = case_id;
                        }
                    }
                    other => panic!("Expected closed range, got: {:?}", other),
                }
            },
            Some(syn::TokenTree::Token(syn::Token::FatArrow)) => break,
            Some(syn::TokenTree::Token(syn::Token::Ident(ident))) => {
                assert_eq!(last_byte, None, "I don't support ranges with identifiers!");
                assert_eq!(*binding, None);
                for mut byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
                *binding = Some(ident)
            }
            Some(syn::TokenTree::Token(syn::Token::Underscore)) => {
                assert_eq!(last_byte, None);
                for mut byte in table.iter_mut() {
                    if *byte == 0 {
                        *byte = case_id;
                    }
                }
            },
            other => panic!("Expected literal byte, got: {:?}", other),
        }
    }

    match tts.next() {
        Some(syn::TokenTree::Delimited(syn::Delimited { delim: syn::DelimToken::Brace, tts })) => {
            Case(tts)
        }
        other => panic!("Expected case with braces after fat arrow, got: {:?}", other),
    }
}

fn expand_match_bytes_macro(to_be_matched: Vec<syn::TokenTree>,
                            table: Vec<u8>,
                            cases: Vec<Case>,
                            binding: Option<syn::Ident>)
                            -> syn::Expr {
    use std::fmt::Write;

    assert!(!to_be_matched.is_empty());
    assert_eq!(table.len(), 256);
    assert!(table.iter().all(|b| *b != 0), "Incomplete pattern? Bogus code!");

    // We build the expression with text since it's easier.
    let mut expr = "{\n".to_owned();
    expr.push_str("enum Case {\n");
    for (i, _) in cases.iter().enumerate() {
        write!(&mut expr, "Case{} = {},", i + 1, i + 1).unwrap();
    }
    expr.push_str("}\n"); // enum Case

    expr.push_str("static __CASES: [Case; 256] = [");
    for byte in &table {
        write!(&mut expr, "Case::Case{}, ", *byte).unwrap();
    }
    expr.push_str("];\n");

    let mut tokens = Tokens::new();
    let to_be_matched = syn::Delimited {
        delim: if binding.is_some() { syn::DelimToken::Brace } else { syn::DelimToken::Paren },
        tts: to_be_matched
    };
    to_be_matched.to_tokens(&mut tokens);

    if let Some(ref binding) = binding {
        write!(&mut expr, "let {} = {};\n", binding.to_string(), tokens.as_str()).unwrap();
    }

    write!(&mut expr, "match __CASES[{} as usize] {{", match binding {
        Some(binding) => binding.to_string(),
        None => tokens.to_string(),
    }).unwrap();

    for (i, case) in cases.into_iter().enumerate() {
        let mut case_tokens = Tokens::new();
        let case = syn::Delimited {
            delim: syn::DelimToken::Brace,
            tts: case.0
        };
        case.to_tokens(&mut case_tokens);
        write!(&mut expr, "Case::Case{} => {},\n", i + 1, case_tokens.as_str()).unwrap();
    }
    expr.push_str("}\n"); // match

    expr.push_str("}\n"); // top

    syn::parse_expr(&expr).expect("couldn't parse expression?")
}
```
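
One detail of `parse_case` worth calling out: a literal byte writes its table slot unconditionally, while ranges and the wildcard only claim slots that are still 0, so earlier cases win overlapping bytes. A minimal, self-contained sketch of that filling rule (the two-case setup is hypothetical):

```rust
fn main() {
    // Case 1 is the pattern b'0'...b'9'; case 2 is the trailing wildcard.
    let mut table = [0u8; 256];

    // Ranges only claim slots no earlier case has taken.
    for b in b'0'..=b'9' {
        if table[b as usize] == 0 {
            table[b as usize] = 1;
        }
    }

    // The wildcard claims everything still unassigned.
    for slot in table.iter_mut() {
        if *slot == 0 {
            *slot = 2;
        }
    }

    assert_eq!(table[b'5' as usize], 1); // digit bytes dispatch to case 1
    assert_eq!(table[b'x' as usize], 2); // everything else to the wildcard
}
```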

src/macros/mod.rs

Lines changed: 5 additions & 0 deletions

```rust
pub mod match_byte;
pub mod visit;
```
