You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

potential_utf

Package Overview
Dependencies
Maintainers
1
Versions
4
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

potential-utf - cargo Package Compare versions

Comparing version
0.1.3
to
0.1.4
+6
potential_utf-0.1.4/.cargo_vcs_info.json
{
"git": {
"sha1": "29dfe2790b6cfdab94ca6a6b69f58ce54802dbf7"
},
"path_in_vcs": "utils/potential_utf"
}
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"proc-macro2",
"quote",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "potential_utf"
version = "0.1.4"
dependencies = [
"bincode",
"databake",
"serde_core",
"serde_json",
"writeable",
"zerovec",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"serde",
"zerofrom",
]
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.82"
name = "potential_utf"
version = "0.1.4"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Unvalidated string and character types"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[features]
alloc = [
"serde_core?/alloc",
"writeable/alloc",
"zerovec?/alloc",
]
databake = ["dep:databake"]
default = ["alloc"]
serde = ["dep:serde_core"]
writeable = ["dep:writeable"]
zerovec = ["dep:zerovec"]
[lib]
name = "potential_utf"
path = "src/lib.rs"
[dependencies.databake]
version = "0.2.0"
optional = true
default-features = false
[dependencies.serde_core]
version = "1.0.220"
optional = true
default-features = false
[dependencies.writeable]
version = "0.6.0"
optional = true
default-features = false
[dependencies.zerovec]
version = "0.11.3"
optional = true
default-features = false
[dev-dependencies.bincode]
version = "1.3.1"
[dev-dependencies.serde_json]
version = "1.0.45"

Sorry, the diff of this file is not supported yet

UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
# potential_utf [![crates.io](https://img.shields.io/crates/v/potential_utf)](https://crates.io/crates/potential_utf)
<!-- cargo-rdme start -->
A crate providing unvalidated string and character types.
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
//! A crate providing unvalidated string and character types.
#[cfg(feature = "alloc")]
extern crate alloc;
mod uchar;
mod ustr;
pub use uchar::PotentialCodePoint;
pub use ustr::PotentialUtf16;
pub use ustr::PotentialUtf8;
#[cfg(feature = "writeable")]
mod writeable;
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
use core::fmt;
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]); // low 24 bits of the code point, little-endian (see `from_char`)
impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // A char is at most U+10FFFF, so the high byte of its u32 form is
        // always zero and can be dropped; the low three bytes are kept in
        // little-endian order.
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
        Self([u0, u1, u2])
    }
    /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }
    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(b.try_to_char().is_err());
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(u32::from(self))
    }
    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }
    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        char::from_u32_unchecked(u32::from(self))
    }
    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        zerovec::ule::RawBytesULE(self.0)
    }
}
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;
    // Both conversions are plain byte copies: the stored bytes are already
    // in the unaligned little-endian layout the ULE type expects.
    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        zerovec::ule::RawBytesULE(self.0)
    }
    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        Self(unaligned.0)
    }
}
// Safety: PotentialCodePoint is always the little-endian representation of a char,
// which corresponds to its AsULE::ULE type
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
impl fmt::Debug for PotentialCodePoint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render as a char when the value is a valid Unicode scalar value;
        // otherwise fall back to showing the raw byte array.
        if let Ok(c) = self.try_to_char() {
            fmt::Debug::fmt(&c, f)
        } else {
            fmt::Debug::fmt(&self.0, f)
        }
    }
}
impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl PartialEq<char> for PotentialCodePoint {
    /// Compare against a `char` by converting it to the 3-byte representation.
    fn eq(&self, other: &char) -> bool {
        *self == Self::from_char(*other)
    }
}
impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        Some(self.cmp(&Self::from_char(*other)))
    }
}
impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self) == *other
    }
}
impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        Some(PotentialCodePoint::from_char(*self).cmp(other))
    }
}
impl Ord for PotentialCodePoint {
    /// Order by numeric code point value. A derived `Ord` would compare the
    /// little-endian byte array lexicographically, which sorts incorrectly.
    fn cmp(&self, other: &Self) -> Ordering {
        u32::from(*self).cmp(&u32::from(*other))
    }
}
impl From<PotentialCodePoint> for u32 {
fn from(x: PotentialCodePoint) -> Self {
let [a0, a1, a2] = x.0;
u32::from_le_bytes([a0, a1, a2, 0])
}
}
impl TryFrom<u32> for PotentialCodePoint {
type Error = ();
fn try_from(x: u32) -> Result<Self, ()> {
let [u0, u1, u2, u3] = x.to_le_bytes();
if u3 != 0 {
return Err(());
}
Ok(Self([u0, u1, u2]))
}
}
impl From<char> for PotentialCodePoint {
    /// Infallible: every `char` fits in the 24-bit representation.
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}
impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;
    /// Fallible conversion; see [`PotentialCodePoint::try_to_char`].
    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
        value.try_to_char()
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde_core::Serialize for PotentialCodePoint {
    /// Serializes as a `char` in human-readable formats and as the raw 3-byte
    /// array otherwise. Errors (in both formats) if the value is not a valid
    /// Unicode scalar value.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde_core::Serializer,
    {
        use serde_core::ser::Error;
        // Validate up front regardless of format, so invalid byte patterns
        // never serialize successfully.
        let c = match self.try_to_char() {
            Ok(c) => c,
            Err(_) => {
                return Err(S::Error::custom(
                    "invalid Unicode scalar value in PotentialCodePoint",
                ))
            }
        };
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            self.0.serialize(serializer)
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde_core::Deserialize<'de> for PotentialCodePoint {
    /// Deserializes from a `char` in human-readable formats and from a raw
    /// 3-byte array otherwise. The byte path performs no scalar-value validation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            char::deserialize(deserializer).map(PotentialCodePoint::from_char)
        } else {
            <[u8; 3]>::deserialize(deserializer).map(PotentialCodePoint)
        }
    }
}
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Bakes to a `from_char` call when valid, otherwise to a `from_u24` call
    /// carrying the raw 24-bit value.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both arms need the crate registered.
        env.insert("potential_utf");
        match self.try_to_char() {
            Ok(ch) => {
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            Err(_) => {
                let [b0, b1, b2] = self.0;
                let u24 = u32::from_le_bytes([b0, b1, b2, 0]);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    // Serialization must reject byte patterns that are not valid scalar values,
    // in both human-readable and binary formats.
    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    // Human-readable round trip: serialized as a JSON string containing the char.
    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();
        assert_eq!(json_ser, r#""🙃""#);
        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();
        assert_eq!(uc, json_de);
    }

    // Binary round trip: serialized as the raw 3-byte little-endian payload.
    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);
        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();
        assert_eq!(uc, bytes_de);
    }

    // Checks that a [PotentialCodePoint] slice and the corresponding ULE bytes
    // share the exact same in-memory representation.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();
        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        // Reinterpret the PotentialCodePoint slice as its underlying bytes.
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }
        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    // Bake must emit from_char for valid values and from_u24 for surrogates.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
use core::cmp::Ordering;
use core::fmt;
use core::ops::Deref;
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using a [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, u8> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf8(pub [u8]);
impl fmt::Debug for PotentialUtf8 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render as a string when the bytes happen to be valid UTF-8;
        // otherwise fall back to the raw byte slice.
        if let Ok(s) = self.try_as_str() {
            fmt::Debug::fmt(s, f)
        } else {
            fmt::Debug::fmt(&self.0, f)
        }
    }
}
impl PotentialUtf8 {
    /// Create a [`PotentialUtf8`] from a byte slice.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }
    /// Create a [`PotentialUtf8`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }
    /// Create a [`PotentialUtf8`] from boxed bytes.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }
    /// Create a [`PotentialUtf8`] from a boxed `str`.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }
    /// Get the bytes from a [`PotentialUtf8`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }
    /// Attempt to convert a [`PotentialUtf8`] to a `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialUtf8;
    ///
    /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // Note: this is const starting in 1.63
    #[inline]
    pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}
impl<'a> From<&'a str> for &'a PotentialUtf8 {
#[inline]
fn from(other: &'a str) -> Self {
PotentialUtf8::from_str(other)
}
}
impl PartialEq<str> for PotentialUtf8 {
fn eq(&self, other: &str) -> bool {
self.eq(Self::from_str(other))
}
}
impl PartialOrd<str> for PotentialUtf8 {
fn partial_cmp(&self, other: &str) -> Option<Ordering> {
self.partial_cmp(Self::from_str(other))
}
}
impl PartialEq<PotentialUtf8> for str {
fn eq(&self, other: &PotentialUtf8) -> bool {
PotentialUtf8::from_str(self).eq(other)
}
}
impl PartialOrd<PotentialUtf8> for str {
fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> {
PotentialUtf8::from_str(self).partial_cmp(other)
}
}
#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    /// Zero-copy conversion; see [`PotentialUtf8::from_boxed_str`].
    #[inline]
    fn from(other: Box<str>) -> Self {
        PotentialUtf8::from_boxed_str(other)
    }
}
impl Deref for PotentialUtf8 {
    // Derefs to the raw bytes; use `try_as_str()` to obtain a `str`.
    type Target = [u8];
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    type Container = zerovec::VarZeroVec<'a, PotentialUtf8>;
    type Slice = zerovec::VarZeroSlice<PotentialUtf8>;
    type GetType = PotentialUtf8;
    type OwnedType = Box<PotentialUtf8>;
}
// Safety (based on the safety checklist on the VarULE trait):
// 1. PotentialUtf8 does not include any uninitialized or padding bytes (transparent over a ULE)
// 2. PotentialUtf8 is aligned to 1 byte (transparent over a ULE)
// 3. The impl of `validate_bytes()` returns an error if any byte is not valid (impossible)
// 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (impossible)
// 5. The impl of `from_bytes_unchecked()` returns a reference to the same data (returns the argument directly)
// 6. All other methods are defaulted
// 7. `[T]` byte equality is semantic equality (transparent over a ULE)
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    // Every byte sequence is a valid PotentialUtf8, so validation never fails.
    #[inline]
    fn validate_bytes(_: &[u8]) -> Result<(), zerovec::ule::UleError> {
        Ok(())
    }
    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        PotentialUtf8::from_bytes(bytes)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde_core::Serialize for PotentialUtf8 {
    /// Serializes as a string in human-readable formats and as raw bytes
    /// otherwise. Errors (in both formats) if the contents are not valid UTF-8.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde_core::Serializer,
    {
        use serde_core::ser::Error;
        // Validate up front regardless of format.
        let s = match self.try_as_str() {
            Ok(s) => s,
            Err(_) => return Err(S::Error::custom("invalid UTF-8 in PotentialUtf8")),
        };
        if serializer.is_human_readable() {
            serializer.serialize_str(s)
        } else {
            serializer.serialize_bytes(s.as_bytes())
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde_core::Deserialize<'de> for Box<PotentialUtf8> {
    /// Owned deserialization: from a string in human-readable formats and from
    /// bytes otherwise. The byte path performs no UTF-8 validation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            Box::<str>::deserialize(deserializer).map(PotentialUtf8::from_boxed_str)
        } else {
            Box::<[u8]>::deserialize(deserializer).map(PotentialUtf8::from_boxed_bytes)
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde_core::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    /// Borrowing deserialization: from a borrowed `&str` in human-readable
    /// formats and a borrowed byte slice otherwise.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            <&str>::deserialize(deserializer).map(PotentialUtf8::from_str)
        } else {
            <&[u8]>::deserialize(deserializer).map(PotentialUtf8::from_bytes)
        }
    }
}
/// A `u16` slice that is expected to be UTF-16 but does not enforce that invariant.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf16(pub [u16]);
impl fmt::Debug for PotentialUtf16 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible; unpaired surrogates are printed as
        // `\0x<hex>` instead of aborting the formatting.
        for c in char::decode_utf16(self.0.iter().copied()) {
            match c {
                Ok(c) => write!(f, "{c}")?,
                Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?,
            }
        }
        Ok(())
    }
}
impl PotentialUtf16 {
    /// Create a [`PotentialUtf16`] from a u16 slice.
    #[inline]
    pub const fn from_slice(other: &[u16]) -> &Self {
        // Safety: PotentialUtf16 is transparent over [u16]
        unsafe { core::mem::transmute(other) }
    }
    /// Iterate over the `char`s of this string, replacing unpaired surrogates
    /// with [`char::REPLACEMENT_CHARACTER`].
    pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
        char::decode_utf16(self.0.iter().copied()).map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{PotentialUtf16, PotentialUtf8};
use core::fmt::Write;
use writeable::{LengthHint, Part, PartsWrite, TryWriteable};
use core::{char::DecodeUtf16Error, fmt, str::Utf8Error};
/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf8 {
    type Error = Utf8Error;
    /// Writes the bytes to the sink, substituting [`char::REPLACEMENT_CHARACTER`]
    /// (tagged with [`Part::ERROR`]) for each invalid UTF-8 sequence.
    ///
    /// The inner result carries the first [`Utf8Error`] encountered, if any;
    /// the full lossy output is written to the sink regardless.
    fn try_write_to_parts<S: PartsWrite + ?Sized>(
        &self,
        sink: &mut S,
    ) -> Result<Result<(), Self::Error>, fmt::Error> {
        let mut remaining = &self.0;
        let mut r = Ok(());
        loop {
            match core::str::from_utf8(remaining) {
                // Remainder is fully valid: write it out and finish.
                Ok(valid) => {
                    sink.write_str(valid)?;
                    return Ok(r);
                }
                Err(e) => {
                    // Write the valid prefix, then one replacement character
                    // for the invalid sequence.
                    // SAFETY: By Utf8Error invariants
                    let valid = unsafe {
                        core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to()))
                    };
                    sink.write_str(valid)?;
                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
                    // Only the first decode error is reported.
                    if r.is_ok() {
                        r = Err(e);
                    }
                    let Some(error_len) = e.error_len() else {
                        return Ok(r); // end of string
                    };
                    // Skip past the invalid sequence and continue decoding.
                    // SAFETY: By Utf8Error invariants
                    remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) }
                }
            }
        }
    }
    fn writeable_length_hint(&self) -> LengthHint {
        // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters.
        LengthHint::between(self.0.len(), self.0.len() * 3)
    }
}
/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf16 {
    type Error = DecodeUtf16Error;
    /// Writes the decoded code units to the sink, substituting
    /// [`char::REPLACEMENT_CHARACTER`] (tagged with [`Part::ERROR`]) for each
    /// unpaired surrogate. The inner result carries the first decode error, if any.
    fn try_write_to_parts<S: PartsWrite + ?Sized>(
        &self,
        sink: &mut S,
    ) -> Result<Result<(), Self::Error>, fmt::Error> {
        let mut first_err = Ok(());
        for decoded in core::char::decode_utf16(self.0.iter().copied()) {
            let c = match decoded {
                Ok(c) => c,
                Err(e) => {
                    // Remember only the first failure; keep writing lossily.
                    if first_err.is_ok() {
                        first_err = Err(e);
                    }
                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
                    continue;
                }
            };
            sink.write_char(c)?;
        }
        Ok(first_err)
    }
    fn writeable_length_hint(&self) -> LengthHint {
        // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character)
        LengthHint::between(self.0.len(), self.0.len() * 3)
    }
}
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    // Invalid UTF-8 sequences become replacement characters tagged Part::ERROR;
    // the inner result carries the first Utf8Error.
    #[test]
    fn test_utf8() {
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo Bar"),
            "Foo Bar",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar"),
            "Foo�Bar",
            Err(core::str::from_utf8(b"Foo\xFDBar").unwrap_err()),
            [(3, 6, Part::ERROR)]
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar\xff"),
            "Foo�Bar�",
            Err(core::str::from_utf8(b"Foo\xFDBar\xff").unwrap_err()),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    // A valid surrogate pair decodes normally; an unpaired surrogate becomes a
    // replacement character tagged Part::ERROR and yields the first decode error.
    #[test]
    fn test_utf16() {
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0xDD73]),
            "🥳",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0x20, 0xDD73]),
            "� �",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}
-6
{
"git": {
"sha1": "fbb3eebe2f65e64a69b3a1837dce8ed9cbbe677e"
},
"path_in_vcs": "utils/potential_utf"
}
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"proc-macro2",
"quote",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "potential_utf"
version = "0.1.3"
dependencies = [
"bincode",
"databake",
"serde",
"serde_json",
"writeable",
"zerovec",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.140"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "syn"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "writeable"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
[[package]]
name = "zerovec"
version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b"
dependencies = [
"zerofrom",
]
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.82"
name = "potential_utf"
version = "0.1.3"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Unvalidated string and character types"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[features]
alloc = [
"serde?/alloc",
"zerovec?/alloc",
]
databake = ["dep:databake"]
serde = ["dep:serde"]
writeable = [
"dep:writeable",
"alloc",
]
zerovec = ["dep:zerovec"]
[lib]
name = "potential_utf"
path = "src/lib.rs"
[dependencies.databake]
version = "0.2.0"
optional = true
default-features = false
[dependencies.serde]
version = "1.0.110"
optional = true
default-features = false
[dependencies.writeable]
version = "0.6.0"
optional = true
default-features = false
[dependencies.zerovec]
version = "0.11.3"
optional = true
default-features = false
[dev-dependencies.bincode]
version = "1.3.1"
[dev-dependencies.serde_json]
version = "1.0.45"

Sorry, the diff of this file is not supported yet

UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
# potential_utf [![crates.io](https://img.shields.io/crates/v/potential_utf)](https://crates.io/crates/potential_utf)
<!-- cargo-rdme start -->
A crate providing unvalidated string and character types.
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
//! A crate providing unvalidated string and character types.
#[cfg(feature = "alloc")]
extern crate alloc;
mod uchar;
mod ustr;
pub use uchar::PotentialCodePoint;
pub use ustr::PotentialUtf16;
pub use ustr::PotentialUtf8;
#[cfg(feature = "writeable")]
mod writeable;
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
use core::fmt;
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
// Storage: the three low bytes of the code point, in little-endian order.
pub struct PotentialCodePoint([u8; 3]);
impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // A Unicode scalar value fits in 24 bits, so the top byte is always 0.
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
        Self([u0, u1, u2])
    }
    /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }
    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(u32::from(self))
    }
    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }
    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        char::from_u32_unchecked(u32::from(self))
    }
    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        zerovec::ule::RawBytesULE(self.0)
    }
}
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;
    // Both conversions are free: the wrapper and the ULE share the same bytes.
    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        zerovec::ule::RawBytesULE(self.0)
    }
    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        Self(unaligned.0)
    }
}
// Safety: PotentialCodePoint is always the little-endian representation of a char,
// which corresponds to its AsULE::ULE type
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
impl fmt::Debug for PotentialCodePoint {
    /// Formats as the `char` debug form when the value is a valid scalar
    /// value, otherwise as the raw 3-byte array.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Ok(c) = self.try_to_char() {
            fmt::Debug::fmt(&c, f)
        } else {
            fmt::Debug::fmt(&self.0, f)
        }
    }
}
impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
// Mixed comparisons with `char` convert the `char` via `from_char` so both
// sides use the same 3-byte little-endian representation.
impl PartialEq<char> for PotentialCodePoint {
    fn eq(&self, other: &char) -> bool {
        self.eq(&Self::from_char(*other))
    }
}
impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        self.partial_cmp(&Self::from_char(*other))
    }
}
impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self).eq(other)
    }
}
impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        PotentialCodePoint::from_char(*self).partial_cmp(other)
    }
}
impl Ord for PotentialCodePoint {
    // custom implementation, as derived Ord would compare lexicographically
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare by numeric code point value (the bytes are little-endian).
        let a = u32::from(*self);
        let b = u32::from(*other);
        a.cmp(&b)
    }
}
impl From<PotentialCodePoint> for u32 {
fn from(x: PotentialCodePoint) -> Self {
let [a0, a1, a2] = x.0;
u32::from_le_bytes([a0, a1, a2, 0])
}
}
impl TryFrom<u32> for PotentialCodePoint {
    type Error = ();

    /// Fails when the value does not fit in 24 bits (i.e. the top byte of the
    /// little-endian representation is nonzero).
    fn try_from(x: u32) -> Result<Self, ()> {
        if x > 0xFF_FFFF {
            return Err(());
        }
        let [b0, b1, b2, _] = x.to_le_bytes();
        Ok(Self([b0, b1, b2]))
    }
}
// Infallible: every `char` is representable in 24 bits.
impl From<char> for PotentialCodePoint {
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}
// Fallible: the stored value may not be a valid Unicode scalar value.
impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;
    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
        value.try_to_char()
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Serialization refuses values that are not valid scalar values.
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;
        if serializer.is_human_readable() {
            // e.g. JSON: serialized as a one-character string
            serializer.serialize_char(c)
        } else {
            // binary formats: the raw 3 little-endian bytes
            self.0.serialize(serializer)
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable formats carry a validated `char`.
            let c = <char>::deserialize(deserializer)?;
            Ok(PotentialCodePoint::from_char(c))
        } else {
            // Binary formats carry the raw bytes; no validation is performed.
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(PotentialCodePoint(bytes))
        }
    }
}
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        match self.try_to_char() {
            // Valid scalar value: bake via the readable `from_char` form.
            Ok(ch) => {
                env.insert("potential_utf");
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            // Not a scalar value (e.g. a surrogate): fall back to `from_u24`.
            Err(_) => {
                env.insert("potential_utf");
                let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Serializing an invalid scalar value must fail in both formats.
    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }
    /// Human-readable round trip: serialized as a JSON string.
    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();
        assert_eq!(json_ser, r#""🙃""#);
        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();
        assert_eq!(uc, json_de);
    }
    /// Binary round trip: serialized as the 3 raw little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);
        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();
        assert_eq!(uc, bytes_de);
    }
    /// The in-memory layout must equal the ULE byte representation exactly.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();
        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }
        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }
    /// Bake output uses `from_char` for valid values, `from_u24` otherwise.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
use core::cmp::Ordering;
use core::fmt;
use core::ops::Deref;
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using an [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, u8> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf8(pub [u8]);

impl fmt::Debug for PotentialUtf8 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PotentialUtf8 {
    /// Create a [`PotentialUtf8`] from a byte slice.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }
    /// Create a [`PotentialUtf8`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }
    /// Create a [`PotentialUtf8`] from boxed bytes.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }
    /// Create a [`PotentialUtf8`] from a boxed `str`.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }
    /// Get the bytes from a [`PotentialUtf8`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }
    /// Attempt to convert a [`PotentialUtf8`] to a `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialUtf8;
    ///
    /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // `core::str::from_utf8` is const since Rust 1.63; the crate's MSRV is
    // 1.82 (Cargo.toml `rust-version`), so this can be a `const fn`.
    #[inline]
    pub const fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}
// Infallible borrow conversion: every `str` is a valid PotentialUtf8.
impl<'a> From<&'a str> for &'a PotentialUtf8 {
    #[inline]
    fn from(other: &'a str) -> Self {
        PotentialUtf8::from_str(other)
    }
}
// Comparisons against `str` are byte-wise, via the derived [u8] ordering.
impl PartialEq<str> for PotentialUtf8 {
    fn eq(&self, other: &str) -> bool {
        self.eq(Self::from_str(other))
    }
}
impl PartialOrd<str> for PotentialUtf8 {
    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
        self.partial_cmp(Self::from_str(other))
    }
}
impl PartialEq<PotentialUtf8> for str {
    fn eq(&self, other: &PotentialUtf8) -> bool {
        PotentialUtf8::from_str(self).eq(other)
    }
}
impl PartialOrd<PotentialUtf8> for str {
    fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> {
        PotentialUtf8::from_str(self).partial_cmp(other)
    }
}
#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    #[inline]
    fn from(other: Box<str>) -> Self {
        PotentialUtf8::from_boxed_str(other)
    }
}
// Deref to the underlying bytes (not to `str`, which would require validity).
impl Deref for PotentialUtf8 {
    type Target = [u8];
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    type Container = zerovec::VarZeroVec<'a, PotentialUtf8>;
    type Slice = zerovec::VarZeroSlice<PotentialUtf8>;
    type GetType = PotentialUtf8;
    type OwnedType = Box<PotentialUtf8>;
}
// Safety (based on the safety checklist on the VarULE trait):
// 1. PotentialUtf8 does not include any uninitialized or padding bytes (transparent over a ULE)
// 2. PotentialUtf8 is aligned to 1 byte (transparent over a ULE)
// 3. The impl of `validate_bytes()` returns an error if any byte is not valid (impossible)
// 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (impossible)
// 5. The impl of `from_bytes_unchecked()` returns a reference to the same data (returns the argument directly)
// 6. All other methods are defaulted
// 7. `[T]` byte equality is semantic equality (transparent over a ULE)
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    #[inline]
    fn validate_bytes(_: &[u8]) -> Result<(), zerovec::ule::UleError> {
        // Every byte sequence is a valid PotentialUtf8; validation is a no-op.
        Ok(())
    }
    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        PotentialUtf8::from_bytes(bytes)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialUtf8 {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Serialization requires the bytes to actually be valid UTF-8.
        let s = self
            .try_as_str()
            .map_err(|_| S::Error::custom("invalid UTF-8 in PotentialUtf8"))?;
        if serializer.is_human_readable() {
            serializer.serialize_str(s)
        } else {
            serializer.serialize_bytes(s.as_bytes())
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde::Deserialize<'de> for Box<PotentialUtf8> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // The binary path skips UTF-8 validation — that is the point of the type.
        if deserializer.is_human_readable() {
            let boxed_str = Box::<str>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_str(boxed_str))
        } else {
            let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_bytes(boxed_bytes))
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Zero-copy variant: borrows directly from the deserializer's input.
        if deserializer.is_human_readable() {
            let s = <&str>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_str(s))
        } else {
            let bytes = <&[u8]>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_bytes(bytes))
        }
    }
}
/// A `u16` slice that is expected to be well-formed UTF-16 but does not
/// enforce that invariant.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf16(pub [u16]);
impl fmt::Debug for PotentialUtf16 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        for c in char::decode_utf16(self.0.iter().copied()) {
            match c {
                Ok(c) => write!(f, "{c}")?,
                // NOTE(review): renders an unpaired surrogate as e.g. `\0xd83e`;
                // the `\0x` prefix looks like it may have been intended as plain
                // `0x` — confirm upstream before changing the output.
                Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?,
            }
        }
        Ok(())
    }
}
impl PotentialUtf16 {
    /// Create a [`PotentialUtf16`] from a u16 slice.
    #[inline]
    pub const fn from_slice(other: &[u16]) -> &Self {
        // Safety: PotentialUtf16 is transparent over [u16]
        unsafe { core::mem::transmute(other) }
    }
    /// Iterates over decoded `char`s, mapping each unpaired surrogate to
    /// U+FFFD (the replacement character).
    pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
        char::decode_utf16(self.0.iter().copied()).map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{PotentialUtf16, PotentialUtf8};
use alloc::borrow::Cow;
use core::fmt::Write;
use writeable::{LengthHint, Part, PartsWrite, TryWriteable};
use core::{char::DecodeUtf16Error, fmt, str::Utf8Error};
/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf8 {
    type Error = Utf8Error;

    /// Writes the bytes as UTF-8 to `sink`, substituting U+FFFD (tagged with
    /// `Part::ERROR`) for each invalid sequence. The first `Utf8Error`, if
    /// any, is returned inside the outer `Ok`.
    fn try_write_to_parts<S: PartsWrite + ?Sized>(
        &self,
        sink: &mut S,
    ) -> Result<Result<(), Self::Error>, fmt::Error> {
        let mut remaining = &self.0;
        // Tracks only the first error; writing continues past errors.
        let mut r = Ok(());
        loop {
            match core::str::from_utf8(remaining) {
                Ok(valid) => {
                    sink.write_str(valid)?;
                    return Ok(r);
                }
                Err(e) => {
                    // Write the valid prefix, then one replacement character
                    // for the invalid sequence, then continue after it.
                    // SAFETY: By Utf8Error invariants
                    let valid = unsafe {
                        core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to()))
                    };
                    sink.write_str(valid)?;
                    sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
                    if r.is_ok() {
                        r = Err(e);
                    }
                    // error_len() is None for a truncated sequence at the end
                    // of the input, in which case there is nothing left.
                    let Some(error_len) = e.error_len() else {
                        return Ok(r); // end of string
                    };
                    // SAFETY: By Utf8Error invariants
                    remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) }
                }
            }
        }
    }

    fn writeable_length_hint(&self) -> LengthHint {
        // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters.
        LengthHint::between(self.0.len(), self.0.len() * 3)
    }

    /// Borrows when the bytes are fully valid UTF-8; otherwise builds an owned
    /// string with replacement characters and returns it with the first error.
    fn try_write_to_string(&self) -> Result<Cow<'_, str>, (Self::Error, Cow<'_, str>)> {
        match core::str::from_utf8(&self.0) {
            Ok(valid) => Ok(Cow::Borrowed(valid)),
            Err(e) => {
                // SAFETY: By Utf8Error invariants
                let valid = unsafe {
                    core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to()))
                };
                // Let's assume this is the only error
                let mut out = alloc::string::String::with_capacity(
                    self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8()
                        - e.error_len().unwrap_or(0),
                );
                out.push_str(valid);
                out.push(char::REPLACEMENT_CHARACTER);
                // If there's more, we can use `try_write_to`
                if let Some(error_len) = e.error_len() {
                    // SAFETY: By Utf8Error invariants
                    let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) };
                    let _discard = PotentialUtf8::from_bytes(remaining).try_write_to(&mut out);
                }
                Err((e, Cow::Owned(out)))
            }
        }
    }
}
/// This impl requires enabling the optional `writeable` Cargo feature
impl TryWriteable for &'_ PotentialUtf16 {
type Error = DecodeUtf16Error;
fn try_write_to_parts<S: PartsWrite + ?Sized>(
&self,
sink: &mut S,
) -> Result<Result<(), Self::Error>, fmt::Error> {
let mut r = Ok(());
for c in core::char::decode_utf16(self.0.iter().copied()) {
match c {
Ok(c) => sink.write_char(c)?,
Err(e) => {
if r.is_ok() {
r = Err(e);
}
sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?;
}
}
}
Ok(r)
}
fn writeable_length_hint(&self) -> LengthHint {
// Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character)
LengthHint::between(self.0.len(), self.0.len() * 3)
}
}
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    /// UTF-8: valid input passes through; each invalid byte is rendered as
    /// U+FFFD annotated with `Part::ERROR` (ranges index the UTF-8 output).
    #[test]
    fn test_utf8() {
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo Bar"),
            "Foo Bar",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar"),
            "Foo�Bar",
            Err(core::str::from_utf8(b"Foo\xFDBar").unwrap_err()),
            [(3, 6, Part::ERROR)]
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo\xFDBar\xff"),
            "Foo�Bar�",
            Err(core::str::from_utf8(b"Foo\xFDBar\xff").unwrap_err()),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    /// UTF-16: a surrogate pair decodes to one char; an unpaired surrogate
    /// becomes U+FFFD with a `Part::ERROR` annotation.
    #[test]
    fn test_utf16() {
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0xDD73]),
            "🥳",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0x20, 0xDD73]),
            "� �",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}