Skip to content

Commit 432b159

Browse files
committed
Error on UTF8 surrogates
1 parent a87e356 commit 432b159

6 files changed

Lines changed: 100 additions & 13 deletions

File tree

README.md

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ support subclasses.
4545
It raises `TypeError` on an unsupported type. This exception message
4646
describes the invalid object.
4747

48+
It raises `TypeError` on a `str` that contains invalid UTF-8.
49+
4850
It raises `TypeError` on an integer that exceeds 64 bits. This is the same
4951
as the standard library's `json` module.
5052

@@ -100,6 +102,36 @@ b'{"bool":true,"\xf0\x9f\x90\x88":"\xe5\x93\x88\xe5\x93\x88","int":9223372036854
100102
'{"bool": true, "\\ud83d\\udc08": "\\u54c8\\u54c8", "int": 9223372036854775807, "float": 1.337e+40}'
101103
```
102104

105+
### UTF-8
106+
107+
orjson raises an exception on invalid UTF-8. This is
108+
necessary because Python 3 `str` objects may contain lone UTF-16 surrogates,
109+
which cannot be encoded as valid UTF-8. The standard library's `json` module accepts such strings.
110+
111+
```python
112+
>>> import orjson, ujson, rapidjson, json
113+
>>> orjson.dumps('\ud800')
114+
TypeError: str is not valid UTF-8: surrogates not allowed
115+
>>> ujson.dumps('\ud800')
116+
UnicodeEncodeError: 'utf-8' codec ...
117+
>>> rapidjson.dumps('\ud800')
118+
UnicodeEncodeError: 'utf-8' codec ...
119+
>>> json.dumps('\ud800')
120+
'"\\ud800"'
121+
```
122+
123+
```python
124+
>>> import orjson, ujson, rapidjson, json
125+
>>> orjson.loads('"\\ud800"')
126+
JSONDecodeError: unexpected end of hex escape at line 1 column 8: line 1 column 1 (char 0)
127+
>>> ujson.loads('"\\ud800"')
128+
''
129+
>>> rapidjson.loads('"\\ud800"')
130+
ValueError: Parse error at offset 1: The surrogate pair in string is invalid.
131+
>>> json.loads('"\\ud800"')
132+
'\ud800'
133+
```
134+
103135
## Testing
104136

105137
The library has comprehensive tests. There are unit tests against the
@@ -108,7 +140,8 @@ roundtrip, jsonchecker, and fixtures files of the
108140
repository. It is tested to not crash against the
109141
[Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings).
110142
It is tested to not leak memory. It is tested to be correct against
111-
input from the PyJFuzz JSON fuzzer. There are integration tests
143+
input from the PyJFuzz JSON fuzzer. It is tested to not crash
144+
on, and to not accept, invalid UTF-8. There are integration tests
112145
exercising the library's use in web servers (uwsgi and gunicorn,
113146
using multiprocess/forked workers) and when
114147
multithreaded. It also uses some tests from the ultrajson library.

src/decode.rs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
22

33
use crate::typeref;
4+
use crate::exc::*;
45
use pyo3::prelude::*;
56
use serde::de::{self, DeserializeSeed, Deserializer, MapAccess, SeqAccess, Visitor};
67
use smallvec::SmallVec;
@@ -9,23 +10,29 @@ use std::fmt;
910
use std::marker::PhantomData;
1011
use std::os::raw::c_char;
1112

12-
import_exception!(json, JSONDecodeError);
13-
1413
pub fn deserialize(py: Python, ptr: *mut pyo3::ffi::PyObject) -> PyResult<PyObject> {
1514
let obj_type_ptr = unsafe { (*ptr).ob_type };
1615
let data: Cow<str>;
1716
if unsafe { obj_type_ptr == typeref::STR_PTR } {
1817
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
18+
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8 };
19+
if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
20+
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
21+
}
1922
data = unsafe {
20-
Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
21-
pyo3::ffi::PyUnicode_AsUTF8AndSize(ptr, &mut str_size) as *const u8,
22-
str_size as usize,
23-
)))
23+
Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize)))
2424
};
2525
} else if unsafe { obj_type_ptr == typeref::BYTES_PTR } {
2626
let buffer = unsafe { pyo3::ffi::PyBytes_AsString(ptr) as *const u8 };
2727
let length = unsafe { pyo3::ffi::PyBytes_Size(ptr) as usize };
28-
data = unsafe { String::from_utf8_lossy(std::slice::from_raw_parts(buffer, length)) };
28+
match String::from_utf8(unsafe { std::slice::from_raw_parts(buffer, length).to_vec() }) {
29+
Ok(string) => {
30+
data = Cow::Owned(string);
31+
},
32+
Err(_) => {
33+
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
34+
}
35+
}
2936
} else {
3037
return Err(pyo3::exceptions::TypeError::py_err(
3138
"Input must be str or bytes",

src/encode.rs

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
22

33
use crate::typeref::*;
4+
use crate::exc::*;
45
use pyo3::prelude::*;
56
use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
67
use std::ffi::CStr;
@@ -34,10 +35,12 @@ impl<'p> Serialize for SerializePyObject {
3435
let obj_ptr = unsafe { (*self.ptr).ob_type };
3536
if unsafe { obj_ptr == STR_PTR } {
3637
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
37-
let data =
38-
unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
38+
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size) as *const u8 };
39+
if unsafe { std::intrinsics::unlikely(uni.is_null()) } {
40+
return Err(ser::Error::custom(INVALID_STR));
41+
}
3942
serializer.serialize_str(unsafe {
40-
std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, str_size as usize))
43+
std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
4144
})
4245
} else if unsafe { obj_ptr == FLOAT_PTR } {
4346
serializer.serialize_f64(unsafe { pyo3::ffi::PyFloat_AsDouble(self.ptr) })
@@ -70,6 +73,9 @@ impl<'p> Serialize for SerializePyObject {
7073
let data = unsafe {
7174
pyo3::ffi::PyUnicode_AsUTF8AndSize(key, &mut str_size) as *const u8
7275
};
76+
if unsafe { std::intrinsics::unlikely(data.is_null()) } {
77+
return Err(ser::Error::custom(INVALID_STR));
78+
}
7379
map.serialize_entry(
7480
unsafe {
7581
std::str::from_utf8_unchecked(std::slice::from_raw_parts(
@@ -118,8 +124,17 @@ impl<'p> Serialize for SerializePyObject {
118124
} else if unsafe { obj_ptr == BYTES_PTR } {
119125
let buffer = unsafe { pyo3::ffi::PyBytes_AsString(self.ptr) as *const u8 };
120126
let length = unsafe { pyo3::ffi::PyBytes_Size(self.ptr) as usize };
127+
let pystr = unsafe { pyo3::ffi::PyUnicode_FromStringAndSize(
128+
buffer as *const c_char,
129+
length as pyo3::ffi::Py_ssize_t,
130+
) };
131+
if unsafe { std::intrinsics::unlikely(pystr.is_null()) } {
132+
return Err(ser::Error::custom(INVALID_STR));
133+
}
134+
let mut str_size: pyo3::ffi::Py_ssize_t = unsafe { std::mem::uninitialized() };
135+
let uni = unsafe { pyo3::ffi::PyUnicode_AsUTF8AndSize(pystr, &mut str_size) as *const u8 };
121136
serializer.serialize_str(unsafe {
122-
std::str::from_utf8_unchecked(std::slice::from_raw_parts(buffer, length))
137+
std::str::from_utf8_unchecked(std::slice::from_raw_parts(uni, str_size as usize))
123138
})
124139
} else {
125140
Err(ser::Error::custom(format_args!(

src/exc.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
2+
3+
pub const INVALID_STR: &str = "str is not valid UTF-8: surrogates not allowed";
4+
5+
import_exception!(json, JSONDecodeError);

src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use pyo3::ToPyPointer;
1515

1616
mod decode;
1717
mod encode;
18+
mod exc;
1819
mod typeref;
1920

2021
#[pymodule]
@@ -23,7 +24,7 @@ fn orjson(py: Python, m: &PyModule) -> PyResult<()> {
2324
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
2425
m.add_wrapped(wrap_function!(dumps))?;
2526
m.add_wrapped(wrap_function!(loads))?;
26-
m.add("JSONDecodeError", py.get_type::<decode::JSONDecodeError>())?;
27+
m.add("JSONDecodeError", py.get_type::<exc::JSONDecodeError>())?;
2728
Ok(())
2829
}
2930

test/test_type.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,32 @@ def test_str(self):
3131
self.assertEqual(orjson.dumps(obj), ref)
3232
self.assertEqual(orjson.loads(ref), obj)
3333

34+
def test_str_replacement(self):
35+
"""
36+
str roundtrip �
37+
"""
38+
self.assertEqual(orjson.dumps('�'), b'"\xef\xbf\xbd"')
39+
self.assertEqual(orjson.loads(b'"\xef\xbf\xbd"'), '�')
40+
41+
def test_str_surrogates_loads(self):
42+
"""
43+
str unicode surrogates loads()
44+
"""
45+
self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\ud800"')
46+
self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\ud83d\ude80"')
47+
self.assertRaises(orjson.JSONDecodeError, orjson.loads, '"\udcff"')
48+
self.assertRaises(orjson.JSONDecodeError, orjson.loads, b'"\xed\xa0\xbd\xed\xba\x80"') # \ud83d\ude80
49+
50+
def test_str_surrogates_dumps(self):
51+
"""
52+
str unicode surrogates dumps()
53+
"""
54+
self.assertRaises(TypeError, orjson.dumps, '\ud800')
55+
self.assertRaises(TypeError, orjson.dumps, '\ud83d\ude80')
56+
self.assertRaises(TypeError, orjson.dumps, '\udcff')
57+
self.assertRaises(TypeError, orjson.dumps, {'\ud83d\ude80': None})
58+
self.assertRaises(TypeError, orjson.dumps, b'\xed\xa0\xbd\xed\xba\x80') # \ud83d\ude80
59+
3460
def test_bytes(self):
3561
"""
3662
bytes

0 commit comments

Comments
 (0)