Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add CompactCowStr
  • Loading branch information
SimonSapin committed Jun 16, 2017
commit 5daadfd5fdd9b3ec6c74b3a0b2ca2cb6b9b503e7
231 changes: 231 additions & 0 deletions src/compact_cow_str.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use std::borrow::{Borrow, Cow};
use std::cmp;
use std::fmt;
use std::hash;
use std::marker::PhantomData;
use std::mem;
use std::ops::Deref;
use std::slice;
use std::str;

// Only the highest bit of a `usize`; set in `tagged_len` to mark owned data.
const OWNED_TAG: usize = !(!0 >> 1);

// Every bit except the highest: the largest string length that can be stored.
const MAX_LEN: usize = !OWNED_TAG;

/// Like `Cow<'a, str>`, but with smaller `std::mem::size_of`. (Two words instead of four.)
///
/// The string is either borrowed for `'a` or owned; which of the two is recorded
/// in the highest bit of `tagged_len`, so the length itself is limited to `MAX_LEN`.
pub struct CompactCowStr<'a> {
// `tagged_len` is a tag in its highest bit, and the string length in the rest of the bits.
//
// * If the tag is 1, the memory pointed to by `ptr` is owned
// and the lifetime parameter is irrelevant.
// `ptr` and the length (`tagged_len` without its tag bit) are the components of a `Box<str>`.
//
// * If the tag is 0, the memory is borrowed.
// `ptr` and the length (`tagged_len` without its tag bit) are the components of a `&'a str`.

// FIXME: https://github.com/rust-lang/rust/issues/27730 use NonZero or Shared
// Address of the first byte of the string data.
ptr: *const u8,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Add a TODO for NonZero?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, done. Although I think we never use this in Option or other places where NonZero would make a difference.

// String length in the low bits, ownership tag (`OWNED_TAG`) in the highest bit.
tagged_len: usize,
// Ties the borrowed case to the lifetime `'a` without storing a reference.
phantom: PhantomData<&'a str>,
}

impl<'a> From<&'a str> for CompactCowStr<'a> {
/// Wraps a borrowed string slice without copying its bytes.
///
/// # Panics
///
/// Panics if `s.len()` exceeds `MAX_LEN`, because the highest bit of the
/// stored length is reserved for the ownership tag.
#[inline]
fn from(s: &'a str) -> Self {
let len = s.len();
// The length must leave the highest bit free for the tag.
assert!(len <= MAX_LEN);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick question, why add the tag to the length instead of the pointer?

Both seem fine, but with the second you get the benefit of not having the MAX_LENGTH restriction (though with the first you need to remove the tag).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At least in the current implementation of libstd, String and Vec in Rust are already restricted to a maximum capacity of isize::MAX, so this doesn't add any further restrictions on the length.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don’t know if any bit on the pointer is guaranteed to be available. Maybe not high bits since it seems like a userland process can get up to 3 GB on 32-bit. Not low bits since a subslice of &str can start at any byte.

CompactCowStr {
ptr: s.as_ptr(),
// `OWNED_TAG` is not set here: a tag of 0 marks the data as borrowed.
tagged_len: len,
phantom: PhantomData,
}
}
}

impl<'a> From<Box<str>> for CompactCowStr<'a> {
    /// Takes over the allocation of a boxed string without copying its bytes.
    ///
    /// # Panics
    ///
    /// Panics if the string is longer than `MAX_LEN` bytes, since the highest
    /// bit of the stored length is needed for the ownership tag.
    #[inline]
    fn from(s: Box<str>) -> Self {
        let byte_len = s.len();
        let data = s.as_ptr();
        assert!(byte_len <= MAX_LEN);
        // Do not run the box's destructor: the allocation now belongs to the
        // value built below, whose `Drop` impl reconstructs and frees the box.
        mem::forget(s);
        CompactCowStr {
            ptr: data,
            tagged_len: OWNED_TAG | byte_len,
            phantom: PhantomData,
        }
    }
}

impl<'a> CompactCowStr<'a> {
/// Whether this string refers to borrowed memory
/// (as opposed to owned, which would be freed when `CompactCowStr` goes out of scope).
#[inline]
pub fn is_borrowed(&self) -> bool {
(self.tagged_len & OWNED_TAG) == 0
}

/// The length of this string
#[inline]
pub fn len(&self) -> usize {
self.tagged_len & !OWNED_TAG
}

// Intentionally private since it is easy to use incorrectly.
#[inline]
fn as_raw_str(&self) -> *const str {
unsafe {
str::from_utf8_unchecked(slice::from_raw_parts(self.ptr, self.len()))
}
}

/// If this string is borrowed, return a slice with the original lifetime,
/// not borrowing `self`.
///
/// (`Deref` is implemented unconditionally, but returns a slice with a shorter lifetime.)
#[inline]
pub fn as_str(&self) -> Option<&'a str> {
if self.is_borrowed() {
Some(unsafe { &*self.as_raw_str() })
} else {
None
}
}
}

impl<'a> Clone for CompactCowStr<'a> {
    /// Copies the pointer and length for a borrowed string; allocates a fresh
    /// `Box<str>` with the same contents for an owned one.
    #[inline]
    fn clone(&self) -> Self {
        if !self.is_borrowed() {
            // Owned data must be duplicated so that each value frees its own allocation.
            let duplicate: Box<str> = Box::from(&**self);
            return Self::from(duplicate);
        }
        CompactCowStr {
            ptr: self.ptr,
            tagged_len: self.tagged_len,
            phantom: self.phantom,
        }
    }
}

impl<'a> Drop for CompactCowStr<'a> {
    /// Frees the string data if it is owned; does nothing for borrowed data.
    #[inline]
    fn drop(&mut self) {
        if self.is_borrowed() {
            return;
        }
        // SAFETY (relies on the struct invariant): with the tag set, the pointer
        // and length came from a `Box<str>`. Rebuilding the box and letting it go
        // out of scope releases the allocation.
        let _reclaimed: Box<str> = unsafe { Box::from_raw(self.as_raw_str() as *mut str) };
    }
}

impl<'a> Deref for CompactCowStr<'a> {
    type Target = str;

    /// Views the string as a `&str` borrowed from `self`, whether it is owned or borrowed.
    #[inline]
    fn deref(&self) -> &str {
        let raw = self.as_raw_str();
        // SAFETY: the data stays valid at least as long as `self` is alive.
        unsafe { &*raw }
    }
}

impl<'a> From<CompactCowStr<'a>> for Cow<'a, str> {
    /// Converts into a standard `Cow`, keeping borrowed data borrowed and
    /// handing the existing allocation of owned data over to a `String`.
    #[inline]
    fn from(cow: CompactCowStr<'a>) -> Self {
        let borrowed = cow.is_borrowed();
        let raw = cow.as_raw_str();
        // Skip `Drop`: ownership of any allocation moves into the result below.
        mem::forget(cow);
        if !borrowed {
            // SAFETY (relies on the struct invariant): owned data came from a `Box<str>`.
            let boxed: Box<str> = unsafe { Box::from_raw(raw as *mut str) };
            return Cow::Owned(boxed.into_string());
        }
        // SAFETY: borrowed data is a `&'a str`, so it is valid for `'a`.
        Cow::Borrowed(unsafe { &*raw })
    }
}

impl<'a> From<String> for CompactCowStr<'a> {
    /// Stores the string as owned data by first converting it to a `Box<str>`.
    #[inline]
    fn from(s: String) -> Self {
        let boxed: Box<str> = s.into_boxed_str();
        Self::from(boxed)
    }
}

impl<'a> From<Cow<'a, str>> for CompactCowStr<'a> {
    /// Converts a standard `Cow`, dispatching to the `&str` or `String`
    /// conversion so that the borrowed/owned state is preserved.
    #[inline]
    fn from(s: Cow<'a, str>) -> Self {
        match s {
            Cow::Owned(string) => string.into(),
            Cow::Borrowed(slice) => slice.into(),
        }
    }
}

impl<'a> AsRef<str> for CompactCowStr<'a> {
    /// Returns the string contents as a `&str` (through `Deref`).
    #[inline]
    fn as_ref(&self) -> &str {
        &**self
    }
}

impl<'a> Borrow<str> for CompactCowStr<'a> {
    /// Borrows the string contents as a `&str` (through `Deref`).
    #[inline]
    fn borrow(&self) -> &str {
        let text: &str = self;
        text
    }
}

impl<'a> Default for CompactCowStr<'a> {
    /// The default value is the empty string, held as borrowed data.
    #[inline]
    fn default() -> Self {
        let empty: &'static str = "";
        Self::from(empty)
    }
}

impl<'a> hash::Hash for CompactCowStr<'a> {
    /// Hashes exactly like the underlying `str`.
    #[inline]
    fn hash<H: hash::Hasher>(&self, hasher: &mut H) {
        let text: &str = self;
        hash::Hash::hash(text, hasher)
    }
}

impl<'a, T: AsRef<str>> PartialEq<T> for CompactCowStr<'a> {
    /// Compares the string contents with anything that can be viewed as a `str`.
    #[inline]
    fn eq(&self, other: &T) -> bool {
        let lhs: &str = self;
        let rhs: &str = other.as_ref();
        lhs == rhs
    }
}

impl<'a, T: AsRef<str>> PartialOrd<T> for CompactCowStr<'a> {
    /// Orders the string contents against anything that can be viewed as a `str`,
    /// using `str`'s own ordering.
    #[inline]
    fn partial_cmp(&self, other: &T) -> Option<cmp::Ordering> {
        let lhs: &str = self;
        let rhs: &str = other.as_ref();
        <str as PartialOrd>::partial_cmp(lhs, rhs)
    }
}

// Marker impl: equality is delegated to `str` comparison by the `PartialEq`
// impl in this file.
impl<'a> Eq for CompactCowStr<'a> {}

impl<'a> Ord for CompactCowStr<'a> {
    /// Total ordering of two values by their string contents, using `str`'s ordering.
    #[inline]
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        let lhs: &str = self;
        let rhs: &str = other;
        Ord::cmp(lhs, rhs)
    }
}

impl<'a> fmt::Display for CompactCowStr<'a> {
    /// Formats the string contents using `str`'s `Display` implementation.
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        let text: &str = self;
        fmt::Display::fmt(text, formatter)
    }
}

impl<'a> fmt::Debug for CompactCowStr<'a> {
    /// Formats the string contents using `str`'s `Debug` implementation.
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        let text: &str = self;
        fmt::Debug::fmt(text, formatter)
    }
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ pub use nth::parse_nth;
pub use serializer::{ToCss, CssStringWriter, serialize_identifier, serialize_string, TokenSerializationType};
pub use parser::{Parser, Delimiter, Delimiters, SourcePosition, ParseError, BasicParseError, ParserInput};
pub use unicode_range::UnicodeRange;
pub use compact_cow_str::CompactCowStr;

// For macros
#[doc(hidden)] pub use macros::_internal__to_lowercase;
Expand All @@ -116,6 +117,7 @@ mod color;
mod nth;
mod serializer;
mod unicode_range;
mod compact_cow_str;

#[cfg(test)] mod tests;
#[cfg(test)] mod size_of_tests;
2 changes: 2 additions & 0 deletions src/size_of_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use compact_cow_str::CompactCowStr;
use std::borrow::Cow;
use tokenizer::{Token, NumericValue, PercentageValue};

Expand Down Expand Up @@ -35,3 +36,4 @@ size_of_test!(token, Token, 56);
size_of_test!(numeric_value, NumericValue, 16);
size_of_test!(percentage_value, PercentageValue, 16);
size_of_test!(std_cow_str, Cow<'static, str>, 32);
size_of_test!(compact_cow_str, CompactCowStr, 16);
2 changes: 1 addition & 1 deletion src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use self::Token::*;

/// One of the pieces the CSS input is broken into.
///
/// Some components use `CowString` in order to borrow from the original input string
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
Expand Down