Implement native UUID serialization (#48) · ITCSsDeveloper/orjson@f9facf0 · GitHub
Skip to content

Commit f9facf0

Browse files
necarisijl
authored and committed
Implement native UUID serialization (#48)
Add an option `orjson.OPT_SERIALIZE_UUID` to detect Python `uuid.UUID` objects and serialize them natively, extracting the UUID integer bytes and stringifying them ourselves. Provides _substantial_ speedups over using `default=str` to stringify them in Python code.
1 parent 58c8642 commit f9facf0

9 files changed

Lines changed: 173 additions & 19 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ pyo3 = { version = "0.9.0-alpha.1", default_features = false, features = ["exten
5757
rand = { version = "0.7", default_features = false, features = ["getrandom", "std"] }
5858
serde = { version = "1", default_features = false }
5959
serde_json = { git = "https://github.com/ijl/json.git", rev = "83efe507487afe2332c5d6f83d28056159a58e5d", default_features = false, features = ["perfect_float"] }
60-
smallvec = { version = "1", default_features = false, features = ["union", "specialization"] }
60+
smallvec = { version = "1", default_features = false, features = ["union", "specialization", "write"] }
6161
wyhash = { version = "0.3" }
6262

6363
[profile.release]

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,9 @@ b'"1970-01-01T00:00:00+00:00"'
220220
)
221221
b'"1970-01-01T00:00:00Z"'
222222
```
223+
##### OPT_SERIALIZE_UUID
224+
225+
Serialize `uuid.UUID` instances. For more, see [uuid](#uuid).
223226

224227
### Deserialize
225228

@@ -424,6 +427,19 @@ ValueError: Parse error at offset 1: The surrogate pair in string is invalid.
424427
'\ud800'
425428
```
426429

430+
### uuid
431+
432+
If the `OPT_SERIALIZE_UUID` option is provided, orjson serializes
433+
`uuid.UUID` objects natively in ["canonical"](https://en.wikipedia.org/wiki/Universally_unique_identifier#Format)
434+
format: as 32 hexadecimal digits, in 5 groups separated by hyphens,
435+
in the form 8-4-4-4-12, for a total of 36 characters.
436+
437+
``` python
438+
>>> import orjson, uuid
439+
>>> orjson.dumps(uuid.UUID(int=0x12345678123456781234567812345678), option=orjson.OPT_SERIALIZE_UUID)
440+
b'"12345678-1234-5678-1234-567812345678"'
441+
```
442+
427443
## Testing
428444

429445
The library has comprehensive tests. There are tests against fixtures in the

src/encode.rs

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ use pyo3::prelude::*;
77
use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
88
use smallvec::SmallVec;
99
use std::ffi::CStr;
10-
use std::os::raw::c_char;
10+
use std::io::Write;
11+
use std::os::raw::{c_char, c_uchar};
1112
use std::ptr::NonNull;
1213

1314
// https://tools.ietf.org/html/rfc7159#section-6
@@ -19,6 +20,7 @@ const RECURSION_LIMIT: u8 = 255;
1920

2021
pub const STRICT_INTEGER: u8 = 1;
2122
pub const SERIALIZE_DATACLASS: u8 = 1 << 4;
23+
pub const SERIALIZE_UUID: u8 = 1 << 5;
2224

2325
macro_rules! obj_name {
2426
($obj:ident) => {
@@ -186,6 +188,44 @@ impl<'p> Serialize for SerializePyObject {
186188
let mut dt: SmallVec<[u8; 32]> = SmallVec::with_capacity(32);
187189
write_time(self.ptr, self.opts, &mut dt);
188190
serializer.serialize_str(str_from_slice!(dt.as_ptr(), dt.len()))
191+
} else if is_type!(obj_ptr, UUID_PTR) && (self.opts & SERIALIZE_UUID == SERIALIZE_UUID) {
192+
// In Python, `self.int` is the 128-bit integer value of the UUID;
193+
// we can assume this will not fail, as tested in `test_uuid.py`
194+
let py_int = ffi!(PyObject_GetAttr(self.ptr, INT_ATTR_STR));
195+
// Copied in from https://github.com/PyO3/pyo3/blob/fb17d5e82f302f09b6611ac608edd1ce37504703/src/types/num.rs#L95
196+
// because we don't have a `pyo3::Python` reference. However, because
197+
// we haven't yet Py_DECREF'd the `py_int` attribute, the reference
198+
// to `self.int` should be valid, and this should be safe to do.
199+
// We know it's a `PyLongObject` because `self.int` is a 128-bit int,
200+
// and know that _PyLong_AsByteArray won't error, as tested in test_uuid.py
201+
let buffer: [c_uchar; 16] = [0; 16];
202+
unsafe {
203+
pyo3::ffi::_PyLong_AsByteArray(
204+
py_int as *mut pyo3::ffi::PyLongObject,
205+
buffer.as_ptr() as *const c_uchar,
206+
16,
207+
1, // Return a little-endian array
208+
0, // Unsigned - UUIDs can't be negative
209+
)
210+
};
211+
ffi!(Py_DECREF(py_int));
212+
let value = u128::from_le_bytes(buffer);
213+
let mut hexadecimal: SmallVec<[u8; 32]> = SmallVec::with_capacity(32);
214+
write!(hexadecimal, "{:032x}", value).unwrap();
215+
// Now we manually format it in canonical form: 5 groups separated
216+
// by hyphens, 8-4-4-4-12
217+
// https://en.wikipedia.org/wiki/Universally_unique_identifier#Format
218+
let mut formatted: SmallVec<[u8; 36]> = SmallVec::with_capacity(36);
219+
formatted.extend_from_slice(&hexadecimal[..8]);
220+
formatted.push('-' as u8);
221+
formatted.extend_from_slice(&hexadecimal[8..12]);
222+
formatted.push('-' as u8);
223+
formatted.extend_from_slice(&hexadecimal[12..16]);
224+
formatted.push('-' as u8);
225+
formatted.extend_from_slice(&hexadecimal[16..20]);
226+
formatted.push('-' as u8);
227+
formatted.extend_from_slice(&hexadecimal[20..]);
228+
serializer.serialize_str(str_from_slice!(formatted.as_ptr(), 36))
189229
} else {
190230
if self.opts & SERIALIZE_DATACLASS == SERIALIZE_DATACLASS
191231
&& ffi!(PyObject_HasAttr(self.ptr, DATACLASS_FIELDS_STR)) == 1

src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ const MAX_OPT: i8 = (encode::STRICT_INTEGER
2323
| datetime::NAIVE_UTC
2424
| datetime::OMIT_MICROSECONDS
2525
| datetime::UTC_Z
26-
| encode::SERIALIZE_DATACLASS) as i8;
26+
| encode::SERIALIZE_DATACLASS
27+
| encode::SERIALIZE_UUID) as i8;
2728

2829
#[pymodule]
2930
fn orjson(py: Python, m: &PyModule) -> PyResult<()> {
@@ -58,6 +59,7 @@ fn orjson(py: Python, m: &PyModule) -> PyResult<()> {
5859
m.add("OPT_STRICT_INTEGER", encode::STRICT_INTEGER)?;
5960
m.add("OPT_UTC_Z", datetime::UTC_Z)?;
6061
m.add("OPT_SERIALIZE_DATACLASS", encode::SERIALIZE_DATACLASS)?;
62+
m.add("OPT_SERIALIZE_UUID", encode::SERIALIZE_UUID)?;
6163

6264
Ok(())
6365
}

src/typeref.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ pub static mut FLOAT_PTR: *mut pyo3::ffi::PyTypeObject = 0 as *mut pyo3::ffi::Py
2020
pub static mut DATETIME_PTR: *mut pyo3::ffi::PyTypeObject = 0 as *mut pyo3::ffi::PyTypeObject;
2121
pub static mut DATE_PTR: *mut pyo3::ffi::PyTypeObject = 0 as *mut pyo3::ffi::PyTypeObject;
2222
pub static mut TIME_PTR: *mut pyo3::ffi::PyTypeObject = 0 as *mut pyo3::ffi::PyTypeObject;
23+
pub static mut UUID_PTR: *mut pyo3::ffi::PyTypeObject = 0 as *mut pyo3::ffi::PyTypeObject;
24+
pub static mut INT_ATTR_STR: *mut pyo3::ffi::PyObject = 0 as *mut pyo3::ffi::PyObject;
2325
pub static mut UTCOFFSET_METHOD_STR: *mut pyo3::ffi::PyObject = 0 as *mut pyo3::ffi::PyObject;
2426
pub static mut NORMALIZE_METHOD_STR: *mut pyo3::ffi::PyObject = 0 as *mut pyo3::ffi::PyObject;
2527
pub static mut CONVERT_METHOD_STR: *mut pyo3::ffi::PyObject = 0 as *mut pyo3::ffi::PyObject;
@@ -30,6 +32,24 @@ static EMTPY_STR: &str = "";
3032

3133
static INIT: Once = Once::new();
3234

35+
/// Look up the `uuid.UUID` type pointer, which is defined in Python
36+
pub unsafe fn look_up_uuid_type() -> *mut pyo3::ffi::PyTypeObject {
37+
// Use the module-level `NAMESPACE_DNS` instance to get the type pointer
38+
// https://docs.python.org/3/library/uuid.html#uuid.NAMESPACE_DNS
39+
let uuid_mod = pyo3::ffi::PyImport_ImportModule("uuid\0".as_ptr() as *const c_char);
40+
let uuid_mod_dict = pyo3::ffi::PyModule_GetDict(uuid_mod);
41+
let uuid = pyo3::ffi::PyMapping_GetItemString(
42+
uuid_mod_dict,
43+
"NAMESPACE_DNS\0".as_ptr() as *const c_char,
44+
);
45+
let ptr = (*uuid).ob_type;
46+
// Ensure Python can garbage-collect everything
47+
pyo3::ffi::Py_DECREF(uuid);
48+
pyo3::ffi::Py_DECREF(uuid_mod_dict);
49+
pyo3::ffi::Py_DECREF(uuid_mod);
50+
ptr
51+
}
52+
3353
pub fn init_typerefs() {
3454
INIT.call_once(|| unsafe {
3555
pyo3::ffi::PyDateTime_IMPORT();
@@ -83,6 +103,10 @@ pub fn init_typerefs() {
83103
pyo3::ffi::PyDateTimeAPI.TimeType,
84104
);
85105
TIME_PTR = (*time).ob_type;
106+
UUID_PTR = look_up_uuid_type();
107+
// We'll be looking up the "int" attribute on UUIDs, so it's convenient
108+
// to define this ahead of time as a constant.
109+
INT_ATTR_STR = pyo3::ffi::PyUnicode_FromStringAndSize("int".as_ptr() as *const c_char, 3);
86110
UTCOFFSET_METHOD_STR =
87111
pyo3::ffi::PyUnicode_FromStringAndSize("utcoffset".as_ptr() as *const c_char, 9);
88112
NORMALIZE_METHOD_STR =

test/test_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_option_range_high(self):
8989
dumps() option out of range high
9090
"""
9191
with self.assertRaises(orjson.JSONEncodeError):
92-
orjson.dumps(True, option=1 << 5)
92+
orjson.dumps(True, option=1 << 6)
9393

9494
def test_opts_multiple(self):
9595
"""

test/test_default.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@
77
import orjson
88

99

10+
class Custom:
11+
def __init__(self):
12+
self.name = uuid.uuid4().hex
13+
14+
def __str__(self):
15+
return "%s(%s)" % (self.__class__.__name__, self.name)
16+
17+
1018
class Recursive:
1119
def __init__(self, cur):
1220
self.cur = cur
@@ -25,13 +33,13 @@ def test_default_not_callable(self):
2533
dumps() default not callable
2634
"""
2735
with self.assertRaises(orjson.JSONEncodeError):
28-
orjson.dumps(uuid.uuid4(), default=NotImplementedError)
36+
orjson.dumps(Custom(), default=NotImplementedError)
2937

3038
def test_default_func(self):
3139
"""
3240
dumps() default function
3341
"""
34-
ref = uuid.uuid4()
42+
ref = Custom()
3543

3644
def default(obj):
3745
return str(obj)
@@ -44,7 +52,7 @@ def test_default_func_none(self):
4452
"""
4553
dumps() default function None ok
4654
"""
47-
self.assertEqual(orjson.dumps(uuid.uuid4(), default=lambda x: None), b"null")
55+
self.assertEqual(orjson.dumps(Custom(), default=lambda x: None), b"null")
4856

4957
def test_default_func_exc(self):
5058
"""
@@ -55,13 +63,13 @@ def default(obj):
5563
raise NotImplementedError
5664

5765
with self.assertRaises(orjson.JSONEncodeError):
58-
orjson.dumps(uuid.uuid4(), default=default)
66+
orjson.dumps(Custom(), default=default)
5967

6068
def test_default_func_nested_str(self):
6169
"""
6270
dumps() default function nested str
6371
"""
64-
ref = uuid.uuid4()
72+
ref = Custom()
6573

6674
def default(obj):
6775
return str(obj)
@@ -75,10 +83,10 @@ def test_default_func_list(self):
7583
"""
7684
dumps() default function nested list
7785
"""
78-
ref = uuid.uuid4()
86+
ref = Custom()
7987

8088
def default(obj):
81-
if isinstance(obj, uuid.UUID):
89+
if isinstance(obj, Custom):
8290
return [str(obj)]
8391

8492
self.assertEqual(
@@ -90,7 +98,7 @@ def test_default_func_nested_list(self):
9098
"""
9199
dumps() default function list
92100
"""
93-
ref = uuid.uuid4()
101+
ref = Custom()
94102

95103
def default(obj):
96104
return str(obj)
@@ -105,7 +113,7 @@ def test_default_func_bytes(self):
105113
"""
106114
dumps() default function errors on non-str
107115
"""
108-
ref = uuid.uuid4()
116+
ref = Custom()
109117

110118
def default(obj):
111119
return bytes(obj)
@@ -117,7 +125,7 @@ def test_default_func_invalid_str(self):
117125
"""
118126
dumps() default function errors on invalid str
119127
"""
120-
ref = uuid.uuid4()
128+
ref = Custom()
121129

122130
def default(obj):
123131
return "\ud800"
@@ -129,7 +137,7 @@ def test_default_lambda_ok(self):
129137
"""
130138
dumps() default lambda
131139
"""
132-
ref = uuid.uuid4()
140+
ref = Custom()
133141
self.assertEqual(
134142
orjson.dumps(ref, default=lambda x: str(x)),
135143
b'"%s"' % str(ref).encode("utf-8"),
@@ -149,7 +157,7 @@ def __call__(self, obj):
149157
self._cache[obj] = str(obj)
150158
return self._cache[obj]
151159

152-
ref_obj = uuid.uuid4()
160+
ref_obj = Custom()
153161
ref_bytes = b'"%s"' % str(ref_obj).encode("utf-8")
154162
for obj in [ref_obj] * 100:
155163
self.assertEqual(orjson.dumps(obj, default=CustomSerializer()), ref_bytes)
@@ -176,7 +184,7 @@ def test_default_recursion_infinite(self):
176184
"""
177185
dumps() default infinite recursion
178186
"""
179-
ref = uuid.uuid4()
187+
ref = Custom()
180188

181189
def default(obj):
182190
return obj

test/test_memory.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import gc
66
import random
77
import unittest
8-
import uuid
98
from typing import List
109

1110
import orjson
@@ -81,7 +80,15 @@ def test_memory_dumps_default(self):
8180
proc = psutil.Process()
8281
gc.collect()
8382
fixture = orjson.loads(FIXTURE)
84-
fixture["custom"] = uuid.uuid4()
83+
84+
class Custom:
85+
def __init__(self, name):
86+
self.name = name
87+
88+
def __str__(self):
89+
return "%s(%s)" % (self.__class__.__name__, self.name)
90+
91+
fixture["custom"] = Custom("orjson")
8592
val = orjson.dumps(fixture, default=default)
8693
mem = proc.memory_info().rss
8794
for _ in range(10000):

test/test_uuid.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
22

3+
import re
34
import uuid
45
import unittest
56

7+
import orjson
8+
69

710
class UUIDTests(unittest.TestCase):
811
def test_uuid_immutable(self):
@@ -33,3 +36,57 @@ def test_uuid_overflow(self):
3336
uuid.UUID(int=2 ** 128)
3437
with self.assertRaises(ValueError):
3538
uuid.UUID(int=-1)
39+
40+
def test_all_ways_to_create_uuid_behave_equivalently(self):
41+
# Note that according to the docstring for the uuid.UUID class, all the
42+
# forms below are equivalent -- they end up with the same value for
43+
# `self.int`, which is all that really matters
44+
uuids = [
45+
uuid.UUID("{12345678-1234-5678-1234-567812345678}"),
46+
uuid.UUID("12345678123456781234567812345678"),
47+
uuid.UUID("urn:uuid:12345678-1234-5678-1234-567812345678"),
48+
uuid.UUID(bytes=b"\x12\x34\x56\x78" * 4),
49+
uuid.UUID(
50+
bytes_le=b"\x78\x56\x34\x12\x34\x12\x78\x56"
51+
+ b"\x12\x34\x56\x78\x12\x34\x56\x78"
52+
),
53+
uuid.UUID(fields=(0x12345678, 0x1234, 0x5678, 0x12, 0x34, 0x567812345678)),
54+
uuid.UUID(int=0x12345678123456781234567812345678),
55+
]
56+
result = orjson.dumps(uuids, option=orjson.OPT_SERIALIZE_UUID)
57+
canonical_uuids = ['"%s"' % str(u) for u in uuids]
58+
serialized = ("[%s]" % ",".join(canonical_uuids)).encode("utf8")
59+
self.assertEqual(result, serialized)
60+
61+
def test_serialize_natively_equivalent_to_str(self):
62+
uuid_ = uuid.uuid4()
63+
self.assertEqual(
64+
orjson.dumps([uuid_], option=orjson.OPT_SERIALIZE_UUID),
65+
orjson.dumps([uuid_], default=str),
66+
)
67+
68+
def test_does_not_serialize_without_opt(self):
69+
with self.assertRaises(orjson.JSONEncodeError):
70+
_ = orjson.dumps([uuid.uuid4()])
71+
72+
def test_serializes_correctly_with_leading_zeroes(self):
73+
instance = uuid.UUID(int=0x00345678123456781234567812345678)
74+
self.assertEqual(
75+
orjson.dumps(instance, option=orjson.OPT_SERIALIZE_UUID),
76+
('"%s"' % str(instance)).encode("utf8"),
77+
)
78+
79+
def test_all_uuid_creation_functions_create_serializable_uuids(self):
80+
all_versioned_uuids = [
81+
uuid.uuid1(),
82+
uuid.uuid3(uuid.NAMESPACE_DNS, "python.org"),
83+
uuid.uuid4(),
84+
uuid.uuid5(uuid.NAMESPACE_DNS, "python.org"),
85+
]
86+
serialized = orjson.dumps(all_versioned_uuids, option=orjson.OPT_SERIALIZE_UUID)
87+
# Ensure that all the creator functions produce UUID strings that match
88+
# our expected 8-4-4-4-12 hexadecimal format
89+
assert re.match(
90+
rb'\[("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",?){4}]',
91+
serialized,
92+
)

0 commit comments

Comments
 (0)