Merge pull request servo#14 from SimonSapin/unicode-range

metajack · metajack · commit 56b3cb31695f · 2013-09-05T13:45:55.000-07:00
Fix tokenization of <unicode-range> tokens, per spec change.
diff --git a/ast.rs b/ast.rs
@@ -37,8 +37,7 @@ pub enum ComponentValue {
     Number(NumericValue),
     Percentage(NumericValue),
     Dimension(NumericValue, ~str),
-    UnicodeRange(char, char), // UnicodeRange {start: char, end: char},
-    EmptyUnicodeRange,
+    UnicodeRange { start: u32, end: u32 },
     WhiteSpace,
     Colon,  // :
     Semicolon,  // ;
diff --git a/css-parsing-tests/README.rst b/css-parsing-tests/README.rst
@@ -94,16 +94,16 @@ associated with the expected result.
 
 ``color3_hsl.json``
     Same as ``color3.json``.
-    This file is generated the ``make_color3_hsl.py`` Python script.
+    This file is generated by the ``make_color3_hsl.py`` Python script.
 
 ``color3_keywords.json``
     Same as ``color3.json``,
     except that the values for the Red, Green and Blue channel
     are between 0 and 255.
-    This file is generated the ``make_color3_keywords.py`` Python script.
+    This file is generated by the ``make_color3_keywords.py`` Python script.
 
-``an+b.json``
-    Tests the `an+b <http://dev.w3.org/csswg/css-syntax/#the-anb-type>`_
+``An+B.json``
+    Tests the `An+B <http://dev.w3.org/csswg/css-syntax/#the-anb-type>`_
     syntax defined in CSS Syntax Level 3.
     This `differs <http://dev.w3.org/csswg/css-syntax/#changes>`_ from the
     `nth grammar rule <http://www.w3.org/TR/css3-selectors/#nth-child-pseudo>`_
@@ -121,7 +121,7 @@ AST nodes (the results of parsing) are represented in JSON as follow.
 This representation was chosen to be compact
 (and thus less annoying to write by hand)
 while staying unambiguous.
-For example, the difference between @import and \@import is not lost:
+For example, the difference between ``@import`` and ``\@import`` is not lost:
 they are represented as ``["at-keyword", "import"]`` and ``["ident", "@import"]``,
 respectively.
 
@@ -200,8 +200,8 @@ Component values
     and the unit as a string.
 
 <unicode-range>
-    Array of length 2: the string ``"unicode-range"``, and the range as either
-    null for the empty range, or an array of two numbers.
+    Array of length 3: the string ``"unicode-range"``,
+    followed by the *start* and *end* integers as two numbers.
 
 <include-match>
     The string ``"~="``.
diff --git a/css-parsing-tests/component_value_list.json b/css-parsing-tests/component_value_list.json
@@ -317,6 +317,79 @@
 	["dimension", "12", 12, "integer", "rêd"]
 ],
 
+"u+1 U+10 U+100 U+1000 U+10000 U+100000 U+1000000", [
+    ["unicode-range", 1, 1], " ",
+    ["unicode-range", 16, 16], " ",
+    ["unicode-range", 256, 256], " ",
+    ["unicode-range", 4096, 4096], " ",
+    ["unicode-range", 65536, 65536], " ",
+    ["unicode-range", 1048576, 1048576], " ",
+    ["unicode-range", 1048576, 1048576], ["number", "0", 0, "integer"]
+],
+
+"u+? u+1? U+10? U+100? U+1000? U+10000? U+100000?", [
+    ["unicode-range", 0, 15], " ",
+    ["unicode-range", 16, 31], " ",
+    ["unicode-range", 256, 271], " ",
+    ["unicode-range", 4096, 4111], " ",
+    ["unicode-range", 65536, 65551], " ",
+    ["unicode-range", 1048576, 1048591], " ",
+    ["unicode-range", 1048576, 1048576], "?"
+],
+
+"u+?? U+1?? U+10?? U+100?? U+1000?? U+10000??", [
+    ["unicode-range", 0, 255], " ",
+    ["unicode-range", 256, 511], " ",
+    ["unicode-range", 4096, 4351], " ",
+    ["unicode-range", 65536, 65791], " ",
+    ["unicode-range", 1048576, 1048831], " ",
+    ["unicode-range", 1048576, 1048591], "?"
+],
+
+"u+??? U+1??? U+10??? U+100??? U+1000???", [
+    ["unicode-range", 0, 4095], " ",
+    ["unicode-range", 4096, 8191], " ",
+    ["unicode-range", 65536, 69631], " ",
+    ["unicode-range", 1048576, 1052671], " ",
+    ["unicode-range", 1048576, 1048831], "?"
+],
+
+"u+???? U+1???? U+10???? U+100????", [
+    ["unicode-range", 0, 65535], " ",
+    ["unicode-range", 65536, 131071], " ",
+    ["unicode-range", 1048576, 1114111], " ",
+    ["unicode-range", 1048576, 1052671], "?"
+],
+
+"u+????? U+1????? U+10?????", [
+    ["unicode-range", 0, 1048575], " ",
+    ["unicode-range", 1048576, 2097151], " ",
+    ["unicode-range", 1048576, 1114111], "?"
+],
+
+"u+?????? U+1??????", [
+    ["unicode-range", 0, 16777215], " ",
+    ["unicode-range", 1048576, 2097151], "?"
+],
+
+"u+1-2 U+100000-2 U+1000000-2 U+10-200000", [
+    ["unicode-range", 1, 2], " ",
+    ["unicode-range", 1048576, 2], " ",
+    ["unicode-range", 1048576, 1048576], ["number", "0", 0, "integer"],
+        ["number", "-2", -2, "integer"], " ",
+    ["unicode-range", 16, 2097152]
+],
+
+"ù+12 Ü+12 u +12 U+ 12 U+12 - 20 U+1?2 U+1?-50", [
+    ["ident", "ù"], ["number", "+12", 12, "integer"], " ",
+    ["ident", "Ü"], ["number", "+12", 12, "integer"], " ",
+    ["ident", "u"], " ", ["number", "+12", 12, "integer"], " ",
+    ["ident", "U"], "+", " ", ["number", "12", 12, "integer"], " ",
+    ["unicode-range", 18, 18], " ", "-", " ", ["number", "20", 20, "integer"], " ",
+    ["unicode-range", 16, 31], ["number", "2", 2, "integer"], " ",
+    ["unicode-range", 16, 31], ["number", "-50", -50, "integer"]
+],
+
 "~=|=^=$=*=||<!------> |/**/| ~/**/=", [
 	"~=", "|=", "^=", "$=", "*=", "||", "<!--", "-", "-", "-->",
 	" ", "|", "|", " ", "~", "="
diff --git a/tests.rs b/tests.rs
@@ -325,9 +325,8 @@ impl ToJson for ComponentValue {
             Dimension(ref value, ref unit)
             => JList(~[JString(~"dimension")] + numeric(value) + ~[unit.to_json()]),
 
-            // TODO:
-            UnicodeRange(_start, _end) => fail!(),
-            EmptyUnicodeRange => fail!(),
+            UnicodeRange { start: s, end: e }
+            => JList(~[JString(~"unicode-range"), s.to_json(), e.to_json()]),
 
             WhiteSpace => JString(~" "),
             Colon => JString(~":"),
diff --git a/tokenizer.rs b/tokenizer.rs
@@ -544,13 +544,13 @@ fn consume_unicode_range(tokenizer: &mut Tokenizer) -> ComponentValue {
         question_marks += 1;
         tokenizer.position += 1
     }
-    let start: char;
-    let end: char;
+    let start;
+    let end;
     if question_marks > 0 {
-        start = char_from_hex(hex + "0".repeat(question_marks));
-        end = char_from_hex(hex + "F".repeat(question_marks));
+        start = u32::from_str_radix(hex + "0".repeat(question_marks), 16).unwrap();
+        end = u32::from_str_radix(hex + "F".repeat(question_marks), 16).unwrap();
     } else {
-        start = char_from_hex(hex);
+        start = u32::from_str_radix(hex, 16).unwrap();
         hex = ~"";
         if !tokenizer.is_eof() && tokenizer.current_char() == '-' {
             tokenizer.position += 1;
@@ -563,21 +563,12 @@ fn consume_unicode_range(tokenizer: &mut Tokenizer) -> ComponentValue {
                 }
             }
         }
-        end = if hex.len() > 0 { char_from_hex(hex) } else { start }
-    }
-    if start > MAX_UNICODE || end < start {
-        EmptyUnicodeRange
-    } else {
-        let end = if end <= MAX_UNICODE { end } else { MAX_UNICODE };
-//        UnicodeRange {start: start, end: end}
-        UnicodeRange(start, end)
+        end = if hex.len() > 0 { u32::from_str_radix(hex, 16).unwrap() } else { start }
     }
+    UnicodeRange {start: start, end: end}
 }
 
 
-static MAX_UNICODE: char = '\U0010FFFF';
-
-
 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
 // and that the next input character has already been verified
 // to not be a newline.
@@ -602,16 +593,11 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
                     _ => ()
                 }
             }
-            let c = char_from_hex(hex);
+            let c = u32::from_str_radix(hex, 16).unwrap() as char as char;
+            static MAX_UNICODE: char = '\U0010FFFF';
             if '\x00' < c && c <= MAX_UNICODE { c }
             else { '\uFFFD' }  // Replacement character
         },
         c => c
     }
 }
-
-
-#[inline]
-fn char_from_hex(hex: &str) -> char {
-    u32::from_str_radix(hex, 16).unwrap() as char
-}