Skip to content

Commit eb75a1b

Browse files
committed
Merge pull request #66 from ayosec/faster-preprocess
Function tokenizer::preprocess 3x faster
2 parents fd6c871 + 2f08b15 commit eb75a1b

File tree

1 file changed

+36
-4
lines changed

1 file changed

+36
-4
lines changed

src/tokenizer.rs

+36-4
Original file line number | Diff line number | Diff line change
@@ -34,16 +34,48 @@ impl Iterator<Node> for Tokenizer {
3434

3535
/// Returns a copy of `input` in which "\r\n", "\r" and "\x0C" are each
/// replaced by "\n", and "\x00" is replaced by U+FFFD.
///
/// In this diff the lines marked `-` are the previous implementation (four
/// chained `replace` calls); the lines marked `+` are the single-pass
/// byte-level implementation that replaces it.
#[inline]
3636
fn preprocess(input: &str) -> String {
37-
// TODO: Is this faster if done in one pass?
38-
input.replace("\r\n", "\n").replace("\r", "\n").replace("\x0C", "\n").replace("\x00", "\u{FFFD}")
37+
// Replace:
38+
// "\r\n" => "\n"
39+
// "\r" => "\n"
40+
// "\x0C" => "\n"
41+
// "\x00" => "\u{FFFD}"
42+
43+
let bytes = input.as_bytes();
44+
// Pre-size to the input length in bytes; the vector may still grow when
// NUL bytes are expanded to the multi-byte encoding of U+FFFD.
let mut result: Vec<u8> = Vec::with_capacity(bytes.len());
45+
// Previous input byte. Starts at 0 so a leading '\n' is never mistaken
// for the second half of a "\r\n" pair.
let mut last: u8 = 0;
46+
for byte in bytes.iter() {
47+
match *byte {
48+
// Second half of "\r\n": the '\r' arm already pushed the '\n'.
b'\n' if last == b'\r' => (),
49+
b'\r' | b'\x0C' => result.push(b'\n'),
50+
b'\0' => result.push_all("\u{FFFD}".as_bytes()),
51+
// Everything else, including the bytes of multi-byte UTF-8
// sequences, is copied through unchanged.
_ => result.push(*byte),
52+
}
53+
54+
last = *byte;
55+
}
56+
57+
// SAFETY: `input` is a `&str`, so `bytes` is valid UTF-8. The loop only
// drops or replaces the ASCII bytes 0x00, 0x0A, 0x0C and 0x0D, which never
// occur inside a multi-byte UTF-8 sequence, and everything pushed in their
// place ('\n' or the bytes of "\u{FFFD}") is itself valid UTF-8.
unsafe { String::from_utf8_unchecked(result) }
3958
}
4059

4160

4261
// Checks `preprocess` on the empty string and on one mixed input.
// The removed (`-`) assertion covered "\r\n", "\x00", an existing U+FFFD and
// a trailing "\r". The added (`+`) assertion additionally covers a "\n" that
// directly follows "\r\n" (it must be kept) and a multi-byte character ("á").
#[test]
4362
fn test_preprocess() {
4463
assert!("" == preprocess("").as_slice());
45-
assert!("Lorem\n\t\u{FFFD}ipusm\ndoror\u{FFFD}\n" ==
46-
preprocess("Lorem\r\n\t\x00ipusm\ndoror\u{FFFD}\r").as_slice());
64+
assert!("Lorem\n\n\t\u{FFFD}ipusm\ndoror\u{FFFD}á\n" ==
65+
preprocess("Lorem\r\n\n\t\x00ipusm\ndoror\u{FFFD}á\r").as_slice());
66+
}
67+
68+
// Benchmark module for `preprocess`; compiled only under `cfg(test)`.
#[cfg(test)]
69+
mod bench_preprocess {
70+
extern crate test;
71+
72+
// Repeatedly runs `preprocess` on a short fixed string.
// NOTE(review): `source` contains no "\r", "\x0C" or "\x00", so this only
// measures the pass-through path, not any of the replacements.
#[bench]
73+
fn bench_preprocess(b: &mut test::Bencher) {
74+
let source = "Lorem\n\t\u{FFFD}ipusm\ndoror\u{FFFD}á\n";
75+
b.iter(|| {
76+
// The result is discarded; only the call itself is being timed.
let _ = super::preprocess(source);
77+
});
78+
}
4779
}
4880

4981

0 commit comments

Comments
 (0)