Error on UTF8 surrogates

ijl · ijl · commit 432b15994397 · 2019-01-03T23:28:00.000Z
diff --git a/README.md b/README.md
@@ -45,6 +45,8 @@ support subclasses.
 It raises `TypeError` on an unsupported type. This exception message
 describes the invalid object.
 
+It raises `TypeError` on a `str` that contains invalid UTF-8.
+
 It raises `TypeError` on an integer that exceeds 64 bits. This is the same
 as the standard library's `json` module.
 
@@ -100,6 +102,36 @@ b'{"bool":true,"\xf0\x9f\x90\x88":"\xe5\x93\x88\xe5\x93\x88","int":9223372036854
 '{"bool": true, "\\ud83d\\udc08": "\\u54c8\\u54c8", "int": 9223372036854775807, "float": 1.337e+40}'
 ```
 
+### UTF-8
+
+orjson raises an exception on invalid UTF-8. This is necessary because
+Python 3 `str` objects may contain surrogate code points, which cannot be
+encoded as UTF-8. The standard library's `json` module accepts invalid UTF-8.
+
+```python
+>>> import orjson, ujson, rapidjson, json
+>>> orjson.dumps('\ud800')
+TypeError: str is not valid UTF-8: surrogates not allowed
+>>> ujson.dumps('\ud800')
+UnicodeEncodeError: 'utf-8' codec ...
+>>> rapidjson.dumps('\ud800')
+UnicodeEncodeError: 'utf-8' codec ...
+>>> json.dumps('\ud800')
+'"\\ud800"'
+```
+
+```python
+>>> import orjson, ujson, rapidjson, json
+>>> orjson.loads('"\\ud800"')
+JSONDecodeError: unexpected end of hex escape at line 1 column 8: line 1 column 1 (char 0)
+>>> ujson.loads('"\\ud800"')
+''
+>>> rapidjson.loads('"\\ud800"')
+ValueError: Parse error at offset 1: The surrogate pair in string is invalid.
+>>> json.loads('"\\ud800"')
+'\ud800'
+```
+
 ## Testing
 
 The library has comprehensive tests. There are unit tests against the
@@ -108,7 +140,8 @@ roundtrip, jsonchecker, and fixtures files of the
 repository. It is tested to not crash against the
 [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings).
 It is tested to not leak memory. It is tested to be correct against
-input from the PyJFuzz JSON fuzzer. There are integration tests
+input from the PyJFuzz JSON fuzzer. It is tested to neither crash on
+nor accept invalid UTF-8. There are integration tests
 exercising the library's use in web servers (uwsgi and gunicorn,
 using multiprocess/forked workers) and when
 multithreaded. It also uses some tests from the ultrajson library.
diff --git a/src/decode.rs b/src/decode.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 use crate::typeref;
+use crate::exc::*;
 use pyo3::prelude::*;
 use serde::de::{self, DeserializeSeed, Deserializer, MapAccess, SeqAccess, Visitor};
 use smallvec::SmallVec;
@@ -9,23 +10,29 @@ use std::fmt;
 use std::marker::PhantomData;
 use std::os::raw::c_char;
 
-import_exception!(json, JSONDecodeError);
-
 pub fn deserialize(py: Python, ptr: *mut pyo3::ffi::PyObject) -> PyResult<PyObject> {
     let obj_type_ptr = unsafe { (*ptr).ob_type };
     let data: Cow<str>;
     if unsafe { obj_type_ptr == typeref::STR_PTR } {
         let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
+        let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8 };
+        if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
+            return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
+        }
         data = unsafe {
-            Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
-                pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8,
-                str_size as usize,
-            )))
+            Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize)))
         };
     } else if unsafe { obj_type_ptr == typeref::BYTES_PTR } {
         let buffer = unsafe { pyo3::ffi::PyBytes_AsString(ptr) as *const u8 };
         let length = unsafe { pyo3::ffi::PyBytes_Size(ptr) as usize };
-        data = unsafe { String::from_utf8_lossy(std::slice::from_raw_parts(buffer, length)) };
+        match String::from_utf8(unsafe { std::slice::from_raw_parts(buffer, length).to_vec() }) {
+            Ok(string) => {
+                data = Cow::Owned(string);
+            },
+            Err(_) => {
+                return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
+            }
+        }
     } else {
         return Err(pyo3::exceptions::TypeError::py_err(
             "Input must be str or bytes",
diff --git a/src/encode.rs b/src/encode.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 use crate::typeref::*;
+use crate::exc::*;
 use pyo3::prelude::*;
 use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
 use std::ffi::CStr;
@@ -34,10 +35,12 @@ impl<'p> Serialize for SerializePyObject {
         let obj_ptr = unsafe { (*self.ptr).ob_type };
         if unsafe { obj_ptr == STR_PTR } {
             let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
-            let data =
-                unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
+            let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
+            if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
+                return Err(ser::Error::custom(INVALID_STR));
+            }
             serializer.serialize_str(unsafe {
-                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, str_size as usize))
+                std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
             })
         } else if unsafe { obj_ptr == FLOAT_PTR } {
             serializer.serialize_f64(unsafe { pyo3::ffi::PyFloat_AsDouble(self.ptr) })
@@ -70,6 +73,9 @@ impl<'p> Serialize for SerializePyObject {
                     let data = unsafe {
                         pyo3::ffi::PyUnicode_AsUTF8AndSize(key, &mut str_size) as *const u8
                     };
+                    if unsafe { std::intrinsics::unlikely(data.is_null()) } {
+                        return Err(ser::Error::custom(INVALID_STR));
+                    }
                     map.serialize_entry(
                         unsafe {
                             std::str::from_utf8_unchecked(std::slice::from_raw_parts(
@@ -118,8 +124,17 @@ impl<'p> Serialize for SerializePyObject {
         } else if unsafe { obj_ptr == BYTES_PTR } {
             let buffer = unsafe { pyo3::ffi::PyBytes_AsString(self.ptr) as *const u8 };
             let length = unsafe { pyo3::ffi::PyBytes_Size(self.ptr) as usize };
+            let pystr = unsafe { pyo3::ffi::PyUnicode_FromStringAndSize(
+                buffer as *const c_char,
+                length as pyo3::ffi::Py_ssize_t,
+            ) };
+            if unsafe { std::intrinsics::unlikely(pystr.is_null()) } {
+                return Err(ser::Error::custom(INVALID_STR));
+            }
+            let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
+            let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(pystr, &mut str_size) as *const u8 };
             serializer.serialize_str(unsafe {
-                std::str::from_utf8_unchecked(std::slice::from_raw_parts(buffer, length))
+                std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
             })
         } else {
             Err(ser::Error::custom(format_args!(
diff --git a/src/exc.rs b/src/exc.rs
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+pub const INVALID_STR: &str = "str is not valid UTF-8: surrogates not allowed";
+
+import_exception!(json, JSONDecodeError);
diff --git a/src/lib.rs b/src/lib.rs
@@ -15,6 +15,7 @@ use pyo3::ToPyPointer;
 
 mod decode;
 mod encode;
+mod exc;
 mod typeref;
 
 #[pymodule]
@@ -23,7 +24,7 @@ fn orjson(py: Python, m: &PyModule) -> PyResult<()> {
     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
     m.add_wrapped(wrap_function!(dumps))?;
     m.add_wrapped(wrap_function!(loads))?;
-    m.add("JSONDecodeError", py.get_type::<decode::JSONDecodeError>())?;
+    m.add("JSONDecodeError", py.get_type::<exc::JSONDecodeError>())?;
     Ok(())
 }
 
diff --git a/test/test_type.py b/test/test_type.py
@@ -31,6 +31,32 @@ def test_str(self):
             self.assertEqual(orjson.dumps(obj), ref)
             self.assertEqual(orjson.loads(ref), obj)
 
+    def test_str_replacement(self):
+        """
+        str roundtrip �
+        """
+        self.assertEqual(orjson.dumps('�'), b'"\xef\xbf\xbd"')
+        self.assertEqual(orjson.loads(b'"\xef\xbf\xbd"'), '�')
+
+    def test_str_surrogates_loads(self):
+        """
+        str unicode surrogates loads()
+        """
+        self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\ud800"')
+        self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\ud83d\ude80"')
+        self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\udcff"')
+        self.assertRaises(orjson.JSONDecodeError, orjson.loads, b'"\xed\xa0\xbd\xed\xba\x80"') # \ud83d\ude80
+
+    def test_str_surrogates_dumps(self):
+        """
+        str unicode surrogates dumps()
+        """
+        self.assertRaises(TypeError, orjson.dumps, '\ud800')
+        self.assertRaises(TypeError, orjson.dumps, '\ud83d\ude80')
+        self.assertRaises(TypeError, orjson.dumps, '\udcff')
+        self.assertRaises(TypeError, orjson.dumps, {'\ud83d\ude80': None})
+        self.assertRaises(TypeError, orjson.dumps, b'\xed\xa0\xbd\xed\xba\x80') # \ud83d\ude80
+
     def test_bytes(self):
         """
         bytes