Skip to content

Commit e5807f1

Browse files
committed
Directly read from unicode, bytes objects
1 parent 0d46690 commit e5807f1

5 files changed

Lines changed: 90 additions & 11 deletions

File tree

src/bytes.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
2+
3+
use pyo3::ffi::*;
4+
use std::os::raw::c_char;
5+
6+
/// C-layout view of a Python `bytes` object, used to read its fields directly
/// through a raw `*mut PyObject` instead of calling into the interpreter.
///
/// NOTE(review): the field order is presumably meant to match CPython's own
/// `PyBytesObject` definition; confirm against the targeted CPython headers
/// before changing any field.
#[repr(C)]
7+
pub struct PyBytesObject {
8+
pub ob_refcnt: Py_ssize_t,
9+
pub ob_type: *mut PyTypeObject,
10+
// Returned as-is by `PyBytes_GET_SIZE`.
pub ob_size: Py_ssize_t,
11+
pub ob_shash: Py_hash_t,
12+
// Declared with a single element; `PyBytes_AS_STRING` hands out the address
// of this field as the start of the data.
pub ob_sval: [c_char; 1],
13+
}
14+
15+
#[allow(non_snake_case)]
16+
#[inline(always)]
17+
pub unsafe fn PyBytes_AS_STRING(op: *mut PyObject) -> *const c_char {
18+
&(*op.cast::<PyBytesObject>()).ob_sval as *const c_char
19+
}
20+
21+
#[allow(non_snake_case)]
22+
#[inline(always)]
23+
pub unsafe fn PyBytes_GET_SIZE(op: *mut PyObject) -> Py_ssize_t {
24+
(*op.cast::<PyBytesObject>()).ob_size
25+
}

src/decode.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
22

3+
use crate::bytes::*;
34
use crate::exc::*;
45
use crate::typeref::*;
6+
use crate::unicode::*;
57
use associative_cache::replacement::RoundRobinReplacement;
68
use associative_cache::*;
79
use lazy_static::lazy_static;
@@ -50,31 +52,31 @@ lazy_static! {
5052
}
5153

5254
pub fn deserialize(ptr: *mut pyo3::ffi::PyObject) -> PyResult<NonNull<pyo3::ffi::PyObject>> {
55+
let data: &str;
5356
let obj_type_ptr = unsafe { (*ptr).ob_type };
54-
let data: Cow<str>;
5557
if is_type!(obj_type_ptr, STR_PTR) {
5658
let mut str_size: pyo3::ffi::Py_ssize_t = 0;
57-
let uni = ffi!(PyUnicode_AsUTF8AndSize(ptr, &mut str_size)) as *const u8;
59+
let uni = read_utf8_from_str(ptr, &mut str_size);
5860
if unlikely!(uni.is_null()) {
5961
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
6062
}
61-
data = Cow::Borrowed(str_from_slice!(uni, str_size));
63+
data = str_from_slice!(uni, str_size);
6264
} else if is_type!(obj_type_ptr, BYTES_PTR) {
63-
let buffer = ffi!(PyBytes_AsString(ptr)) as *const u8;
64-
let length = ffi!(PyBytes_Size(ptr)) as usize;
65+
let buffer = unsafe { PyBytes_AS_STRING(ptr) as *const u8 };
66+
let length = unsafe { PyBytes_GET_SIZE(ptr) as usize };
6567
let slice = unsafe { std::slice::from_raw_parts(buffer, length) };
6668
if encoding_rs::Encoding::utf8_valid_up_to(slice) != length {
6769
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
6870
}
69-
data = Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(slice) });
71+
data = unsafe { std::str::from_utf8_unchecked(slice) };
7072
} else if is_type!(obj_type_ptr, BYTEARRAY_PTR) {
7173
let buffer = ffi!(PyByteArray_AsString(ptr)) as *const u8;
7274
let length = ffi!(PyByteArray_Size(ptr)) as usize;
7375
let slice = unsafe { std::slice::from_raw_parts(buffer, length) };
7476
if encoding_rs::Encoding::utf8_valid_up_to(slice) != length {
7577
return Err(JSONDecodeError::py_err((INVALID_STR, "", 0)));
7678
}
77-
data = Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(slice) });
79+
data = unsafe { std::str::from_utf8_unchecked(slice) };
7880
} else {
7981
return Err(JSONDecodeError::py_err((
8082
"Input must be str or bytes",
@@ -84,7 +86,7 @@ pub fn deserialize(ptr: *mut pyo3::ffi::PyObject) -> PyResult<NonNull<pyo3::ffi:
8486
}
8587

8688
let seed = JsonValue {};
87-
let mut deserializer = serde_json::Deserializer::from_str(&data);
89+
let mut deserializer = serde_json::Deserializer::from_str(data);
8890
match seed.deserialize(&mut deserializer) {
8991
Ok(obj) => {
9092
deserializer

src/encode.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
use crate::datetime::*;
44
use crate::exc::*;
55
use crate::typeref::*;
6+
use crate::unicode::*;
67
use crate::uuid::write_uuid;
78
use pyo3::prelude::*;
89
use serde::ser::{self, Serialize, SerializeMap, SerializeSeq, Serializer};
@@ -76,7 +77,7 @@ impl<'p> Serialize for SerializePyObject {
7677
let obj_ptr = unsafe { (*self.ptr).ob_type };
7778
if is_type!(obj_ptr, STR_PTR) {
7879
let mut str_size: pyo3::ffi::Py_ssize_t = 0;
79-
let uni = ffi!(PyUnicode_AsUTF8AndSize(self.ptr, &mut str_size)) as *const u8;
80+
let uni = read_utf8_from_str(self.ptr, &mut str_size);
8081
if unlikely!(uni.is_null()) {
8182
err!(INVALID_STR)
8283
}
@@ -128,7 +129,7 @@ impl<'p> Serialize for SerializePyObject {
128129
err!("Dict key must be str")
129130
}
130131
{
131-
let data = ffi!(PyUnicode_AsUTF8AndSize(key, &mut str_size)) as *const u8;
132+
let data = read_utf8_from_str(key, &mut str_size);
132133
if unlikely!(data.is_null()) {
133134
err!(INVALID_STR)
134135
}
@@ -210,7 +211,7 @@ impl<'p> Serialize for SerializePyObject {
210211
err!("Recursion limit reached")
211212
}
212213
{
213-
let data = ffi!(PyUnicode_AsUTF8AndSize(attr, &mut str_size)) as *const u8;
214+
let data = read_utf8_from_str(attr, &mut str_size);
214215
if unlikely!(data.is_null()) {
215216
err!(INVALID_STR);
216217
}

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ use std::ptr::NonNull;
1313
#[macro_use]
1414
mod util;
1515

16+
mod bytes;
1617
mod datetime;
1718
mod decode;
1819
mod encode;
1920
mod exc;
2021
mod typeref;
22+
mod unicode;
2123
mod uuid;
2224

2325
const MAX_OPT: i8 = (encode::STRICT_INTEGER

src/unicode.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
2+
3+
use pyo3::ffi::*;
4+
use std::os::raw::c_char;
5+
6+
// see unicodeobject.h for documentation
7+
8+
/// C-layout view of the header shared by Python `str` objects, used by
/// `read_utf8_from_str` to inspect a string without an interpreter call.
///
/// On its ASCII fast path `read_utf8_from_str` treats the memory located one
/// struct-length past this header (`offset(1)`) as the start of the text.
#[repr(C)]
9+
pub struct PyASCIIObject {
10+
pub ob_refcnt: Py_ssize_t,
11+
pub ob_type: *mut PyTypeObject,
12+
// Reported as the byte count on the ASCII fast path of `read_utf8_from_str`.
pub length: Py_ssize_t,
13+
pub hash: Py_hash_t,
14+
// Flag word tested against the `STATE_ASCII` / `STATE_COMPACT` masks.
pub state: u32,
15+
// NOTE(review): typed `*mut c_char` here but `*mut Py_UNICODE` in
// `PyCompactUnicodeObject`; both are pointers so the layout is the same, but
// confirm which pointee type is intended.
pub wstr: *mut c_char,
16+
}
17+
18+
/// C-layout view of a Python `str` object that extends the `PyASCIIObject`
/// header (same first six fields, in the same order) with a UTF-8 pointer and
/// its length.
///
/// `read_utf8_from_str` returns `utf8` / `utf8_length` from this struct when
/// the compact flag is set and `utf8` is non-null.
#[repr(C)]
19+
pub struct PyCompactUnicodeObject {
20+
pub ob_refcnt: Py_ssize_t,
21+
pub ob_type: *mut PyTypeObject,
22+
pub length: Py_ssize_t,
23+
pub hash: Py_hash_t,
24+
// Flag word tested against the `STATE_ASCII` / `STATE_COMPACT` masks.
pub state: u32,
25+
pub wstr: *mut Py_UNICODE,
26+
// Length reported alongside `utf8` by `read_utf8_from_str`.
pub utf8_length: Py_ssize_t,
27+
// May be null; `read_utf8_from_str` checks for null before using it.
pub utf8: *mut c_char,
28+
pub wstr_length: Py_ssize_t,
29+
}
30+
31+
// Masks for the `state` flag word of `PyASCIIObject`.
// NOTE(review): bit positions are taken to follow the bitfield order in
// CPython's unicodeobject.h (interned: 2 bits, kind: 3 bits, then compact,
// ascii); confirm against the targeted CPython version.
const STATE_ASCII: u32 = 1 << 6;
const STATE_COMPACT: u32 = 1 << 5;
33+
34+
#[inline]
35+
pub fn read_utf8_from_str(op: *mut PyObject, str_size: &mut Py_ssize_t) -> *const u8 {
36+
unsafe {
37+
if (*op.cast::<PyASCIIObject>()).state & STATE_ASCII == STATE_ASCII {
38+
*str_size = (*op.cast::<PyASCIIObject>()).length;
39+
op.cast::<PyASCIIObject>().offset(1) as *const u8
40+
} else if (*op.cast::<PyASCIIObject>()).state & STATE_COMPACT == STATE_COMPACT
41+
&& !(*op.cast::<PyCompactUnicodeObject>()).utf8.is_null()
42+
{
43+
*str_size = (*op.cast::<PyCompactUnicodeObject>()).utf8_length;
44+
(*op.cast::<PyCompactUnicodeObject>()).utf8 as *const u8
45+
} else {
46+
PyUnicode_AsUTF8AndSize(op, str_size) as *const u8
47+
}
48+
}
49+
}

0 commit comments

Comments
 (0)