Add parsing and serialization for <urange>

SimonSapin · SimonSapin · commit 067eab1d0d41 · 2017-02-10T00:26:04.000+01:00
https://drafts.csswg.org/css-syntax/#urange-syntax
diff --git a/src/lib.rs b/src/lib.rs
@@ -85,6 +85,7 @@ pub use color::{RGBA, Color, parse_color_keyword};
 pub use nth::parse_nth;
 pub use serializer::{ToCss, CssStringWriter, serialize_identifier, serialize_string, TokenSerializationType};
 pub use parser::{Parser, Delimiter, Delimiters, SourcePosition};
+pub use unicode_range::UnicodeRange;
 
 
 /**
@@ -163,6 +164,7 @@ mod from_bytes;
 mod color;
 mod nth;
 mod serializer;
+mod unicode_range;
 
 #[cfg(test)]
 mod tests;
diff --git a/src/unicode_range.rs b/src/unicode_range.rs
@@ -0,0 +1,253 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//! https://drafts.csswg.org/css-syntax/#urange
+
+use {Parser, ToCss};
+use std::char;
+use std::cmp;
+use std::fmt;
+use std::io::{self, Write};
+use tokenizer::{Token, NumericValue};
+
+/// One contiguous, inclusive range of Unicode code points.
+///
+/// Cannot be empty. Represents a single code point when `start == end`.
+///
+/// `UnicodeRange::parse` only returns values that satisfy the invariants
+/// documented on the fields below. The fields are public, so a value built
+/// by hand is not checked.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct UnicodeRange {
+    /// Inclusive start of the range. In [0, end].
+    pub start: u32,
+
+    /// Inclusive end of the range. In [0, 0x10FFFF].
+    pub end: u32,
+}
+
+impl UnicodeRange {
+    /// Parse a `<urange>` from `input`.
+    ///
+    /// Grammar, from https://drafts.csswg.org/css-syntax/#urange-syntax:
+    ///
+    /// ```text
+    /// <urange> =
+    ///   u '+' <ident-token> '?'* |
+    ///   u <dimension-token> '?'* |
+    ///   u <number-token> '?'* |
+    ///   u <number-token> <dimension-token> |
+    ///   u <number-token> <number-token> |
+    ///   u '+' '?'+
+    /// ```
+    ///
+    /// Returns `Err(())` when the tokens do not form a range, when the end
+    /// is above `char::MAX`, or when the start is greater than the end.
+    pub fn parse(input: &mut Parser) -> Result<Self, ()> {
+        // Neither bound can be above 0x10FFFF, so each has at most 6 hex digits:
+        // 6 digits, '-', 6 digits. Longer input would be rejected anyway.
+        const MAX_LENGTH_AFTER_U_PLUS: usize = 6 + 1 + 6;
+
+        input.expect_ident_matching("u")?;
+
+        let mut buffer = [0; MAX_LENGTH_AFTER_U_PLUS];
+        let written;
+        {
+            // `unused` is the not-yet-written tail of `buffer`;
+            // it shrinks as the tokens are written into it.
+            let mut unused = &mut buffer[..];
+            concatenate_tokens(input, &mut unused)?;
+            written = MAX_LENGTH_AFTER_U_PLUS - unused.len();
+        }
+
+        let range = parse_concatenated(&buffer[..written])?;
+        if range.start <= range.end && range.end <= char::MAX as u32 {
+            Ok(range)
+        } else {
+            Err(())
+        }
+    }
+}
+
+/// Consume the tokens that follow the `u` identifier of a `<urange>`
+/// and write their text, minus the leading '+', into `remaining`.
+///
+/// `remaining` is the unused tail of a fixed-size buffer: every write advances it,
+/// and a `write_all`/`write!` that does not fit fails, which is reported as `Err`.
+///
+/// Numeric tokens are written back from their integer value in decimal.
+/// Decimal digits are also hex digits, so the text can be re-read as hexadecimal,
+/// but leading zeros of the original token are not preserved.
+///
+/// Returns `Err` if the token sequence is not one of the shapes allowed by
+/// https://drafts.csswg.org/css-syntax/#urange-syntax
+fn concatenate_tokens(input: &mut Parser, remaining: &mut &mut [u8]) -> Result<(), Error> {
+    match input.next_including_whitespace()? {
+        // u '+' <ident-token> '?'*   or   u '+' '?'+
+        Token::Delim('+') => {
+            match input.next_including_whitespace()? {
+                Token::Ident(ident) => remaining.write_all(ident.as_bytes())?,
+                Token::Delim('?') => remaining.write_all(b"?")?,
+                _ => return Err(Error)
+            }
+            parse_question_marks(input, remaining)
+        }
+
+        // u <dimension-token> '?'*
+        Token::Dimension(ref value, ref unit) => {
+            // Require a '+' sign as part of the number
+            let int_value = positive_integer_with_plus_sign(value)?;
+            write!(remaining, "{}{}", int_value, unit)?;
+            parse_question_marks(input, remaining)
+        }
+
+        // u <number-token> followed by '?'*, a dimension, a number, or nothing
+        Token::Number(ref value) => {
+            // Require a '+' sign as part of the number
+            let int_value = positive_integer_with_plus_sign(value)?;
+            write!(remaining, "{}", int_value)?;
+
+            match input.next_including_whitespace() {
+                // EOF here is fine
+                Err(()) => {},
+
+                Ok(Token::Delim('?')) => {
+                    // If `remaining` is already full, `int_value` has too many digits
+                    // so we can use `result?` Rust syntax.
+                    remaining.write_all(b"?")?;
+                    parse_question_marks(input, remaining)
+                }
+
+                Ok(Token::Dimension(ref value, ref unit)) => {
+                    // `negative_integer` only lets values <= 0 through.
+                    let int_value = negative_integer(value)?;
+                    // Write the '-' separator explicitly, followed by the magnitude:
+                    // formatting a zero value on its own prints "0" and would drop the '-',
+                    // gluing both halves of the range into one hex number.
+                    // Widen to i64 so that negating i32::MIN cannot overflow.
+                    write!(remaining, "-{}{}", -i64::from(int_value), unit)?
+                }
+
+                Ok(Token::Number(ref value)) => {
+                    // Same as above: values <= 0 only, '-' always written.
+                    let int_value = negative_integer(value)?;
+                    write!(remaining, "-{}", -i64::from(int_value))?
+                }
+
+                _ => return Err(Error)
+            }
+        }
+
+        _ => return Err(Error)
+    }
+    Ok(())
+}
+
+/// Consume consecutive '?' delimiters from `input`, writing one `?` byte
+/// to `remaining` for each of them.
+///
+/// Stops at the first token that is not '?', at the end of the input,
+/// or when `remaining` has no room left. In each of those cases the failing
+/// attempt runs inside `input.try`, and nothing is reported to the caller.
+fn parse_question_marks(input: &mut Parser, remaining: &mut &mut [u8]) {
+    loop {
+        let wrote_one = input.try(|input| {
+            if let Ok(Token::Delim('?')) = input.next_including_whitespace() {
+                remaining.write_all(b"?").map_err(|_| ())
+            } else {
+                Err(())
+            }
+        }).is_ok();
+
+        if !wrote_one {
+            break
+        }
+    }
+}
+
+/// Return the integer value of `value` if it is an integer, was written with
+/// an explicit sign, and is not negative (zero is accepted).
+///
+/// Returns `Err(())` otherwise.
+fn positive_integer_with_plus_sign(value: &NumericValue) -> Result<i32, ()> {
+    match value.int_value {
+        Some(integer) if value.has_sign && integer >= 0 => Ok(integer),
+        _ => Err(()),
+    }
+}
+
+/// Return the integer value of `value` if it is an integer that is
+/// less than or equal to zero, `Err(())` otherwise.
+///
+/// A value below zero necessarily had a negative sign.
+/// NOTE(review): zero passes this check whatever sign it was written with;
+/// `has_sign` is not inspected here.
+fn negative_integer(value: &NumericValue) -> Result<i32, ()> {
+    match value.int_value {
+        Some(integer) if integer <= 0 => Ok(integer),
+        _ => Err(()),
+    }
+}
+
+/// Parse the text that followed "u+" into a range.
+///
+/// Accepted forms, where a "prefix" is 1 to 6 characters in total:
+///
+/// * hex digits only: a single code point,
+/// * optional hex digits followed by '?'s: every '?' covers one hex digit
+///   ranging over 0 to F,
+/// * hex digits, '-', then 1 to 6 hex digits: an explicit start and end.
+///
+/// Anything else is `Err(())`. The relative order of start and end and
+/// their upper bound are not checked here.
+fn parse_concatenated(mut text: &[u8]) -> Result<UnicodeRange, ()> {
+    let (first, first_digits) = consume_hex(&mut text);
+    let wildcards = consume_question_marks(&mut text);
+    let prefix_len = first_digits + wildcards;
+    if prefix_len < 1 || prefix_len > 6 {
+        return Err(())
+    }
+
+    if text.is_empty() {
+        // With no '?' the shift is zero and both bounds are `first` itself.
+        // `first` has at most 6 hex digits here, so none of this can overflow.
+        let shift = wildcards * 4;
+        return Ok(UnicodeRange {
+            start: first << shift,
+            end: ((first + 1) << shift) - 1,
+        })
+    }
+
+    // Something follows the prefix: only "-<hex>" after plain digits is valid.
+    if wildcards == 0 {
+        if let Some((&b'-', mut rest)) = text.split_first() {
+            let (second, second_digits) = consume_hex(&mut rest);
+            if second_digits >= 1 && second_digits <= 6 && rest.is_empty() {
+                return Ok(UnicodeRange {
+                    start: first,
+                    end: second,
+                })
+            }
+        }
+    }
+    Err(())
+}
+
+/// Consume the leading ASCII hex digits of `text`, advancing `text` past them.
+///
+/// Returns `(value, digits)`: the number these digits spell in hexadecimal
+/// and how many digits were consumed. `(0, 0)` if `text` does not start
+/// with a hex digit.
+///
+/// The value saturates at `u32::MAX` instead of overflowing when there are
+/// more digits than fit in a `u32`. The digit count stays exact, so callers
+/// can reject over-long input by looking at the count.
+fn consume_hex(text: &mut &[u8]) -> (u32, usize) {
+    let mut value: u32 = 0;
+    let mut digits = 0;
+    while let Some((&byte, rest)) = text.split_first() {
+        match (byte as char).to_digit(16) {
+            Some(digit_value) => {
+                value = value.saturating_mul(0x10).saturating_add(digit_value);
+                digits += 1;
+                *text = rest
+            }
+            None => break
+        }
+    }
+    (value, digits)
+}
+
+/// Consume the leading `?` bytes of `text`, advancing `text` past them,
+/// and return how many there were.
+fn consume_question_marks(text: &mut &[u8]) -> usize {
+    let all: &[u8] = *text;
+    let count = all.iter().take_while(|&&byte| byte == b'?').count();
+    *text = &all[count..];
+    count
+}
+
+impl ToCss for UnicodeRange {
+    /// Serialize the range as `U+` followed by one of:
+    ///
+    /// * a hex prefix and/or question marks, when the range is exactly
+    ///   "every value of the last N hex digits" for a shared prefix
+    ///   (a single code point is the N = 0 case),
+    /// * `start-end` in upper-case hexadecimal otherwise.
+    fn to_css<W>(&self, dest: &mut W) -> fmt::Result where W: fmt::Write {
+        dest.write_str("U+")?;
+
+        // How many bits are 0 at the end of start and also 1 at the end of end.
+        let bits = cmp::min(self.start.trailing_zeros(), (!self.end).trailing_zeros());
+
+        // Each question mark stands for one hex digit, that is 4 bits.
+        // A <urange> has at most 6 of them; the cap also keeps the shifts below
+        // under 32 bits when the public fields hold out-of-range values.
+        let question_marks = cmp::min(bits / 4, 6);
+
+        // How many lower bits can be represented as question marks
+        let bits = question_marks * 4;
+
+        let truncated_start = self.start >> bits;
+        let truncated_end = self.end >> bits;
+        if truncated_start == truncated_end {
+            // Bits not covered by question marks are the same in start and end,
+            // we can use the question mark syntax.
+            // A zero prefix may only be left out when question marks follow:
+            // the single code point zero must be written "U+0", not "U+".
+            if truncated_start != 0 || question_marks == 0 {
+                write!(dest, "{:X}", truncated_start)?;
+            }
+            for _ in 0..question_marks {
+                dest.write_str("?")?;
+            }
+        } else {
+            // The truncated values differ, so start and end differ as well.
+            write!(dest, "{:X}-{:X}", self.start, self.end)?;
+        }
+        Ok(())
+    }
+}
+
+/// Make conversions from io::Error implicit in `?` syntax.
+struct Error;
+
+impl From<Error> for () {
+    fn from(_: Error) -> Self { () }
+}
+
+impl From<()> for Error {
+    fn from(_: ()) -> Self { Error }
+}
+
+impl From<io::Error> for Error {
+    fn from(_: io::Error) -> Self { Error }
+}