potential_utf
Advanced tools
| { | ||
| "git": { | ||
| "sha1": "29dfe2790b6cfdab94ca6a6b69f58ce54802dbf7" | ||
| }, | ||
| "path_in_vcs": "utils/potential_utf" | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "bincode" | ||
| version = "1.3.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" | ||
| dependencies = [ | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.15" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.4" | ||
| dependencies = [ | ||
| "bincode", | ||
| "databake", | ||
| "serde_core", | ||
| "serde_json", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.103" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.41" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "ryu" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_core" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.145" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "ryu", | ||
| "serde", | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.108" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" | ||
| dependencies = [ | ||
| "serde", | ||
| "zerofrom", | ||
| ] |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.82" | ||
| name = "potential_utf" | ||
| version = "0.1.4" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "Unvalidated string and character types" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| categories = ["internationalization"] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [features] | ||
| alloc = [ | ||
| "serde_core?/alloc", | ||
| "writeable/alloc", | ||
| "zerovec?/alloc", | ||
| ] | ||
| databake = ["dep:databake"] | ||
| default = ["alloc"] | ||
| serde = ["dep:serde_core"] | ||
| writeable = ["dep:writeable"] | ||
| zerovec = ["dep:zerovec"] | ||
| [lib] | ||
| name = "potential_utf" | ||
| path = "src/lib.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.serde_core] | ||
| version = "1.0.220" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.writeable] | ||
| version = "0.6.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.3" | ||
| optional = true | ||
| default-features = false | ||
| [dev-dependencies.bincode] | ||
| version = "1.3.1" | ||
| [dev-dependencies.serde_json] | ||
| version = "1.0.45" |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # potential_utf [![crates.io](https://img.shields.io/crates/v/potential_utf)](https://crates.io/crates/potential_utf) | ||
| <!-- cargo-rdme start --> | ||
| A crate providing unvalidated string and character types. | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing, etc., please visit the [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| #![cfg_attr(not(any(test, doc)), no_std)] | ||
| #![cfg_attr( | ||
| not(test), | ||
| deny( | ||
| clippy::indexing_slicing, | ||
| clippy::unwrap_used, | ||
| clippy::expect_used, | ||
| clippy::panic, | ||
| clippy::exhaustive_structs, | ||
| clippy::exhaustive_enums, | ||
| clippy::trivially_copy_pass_by_ref, | ||
| missing_debug_implementations, | ||
| ) | ||
| )] | ||
| //! A crate providing unvalidated string and character types. | ||
| #[cfg(feature = "alloc")] | ||
| extern crate alloc; | ||
| mod uchar; | ||
| mod ustr; | ||
| pub use uchar::PotentialCodePoint; | ||
| pub use ustr::PotentialUtf16; | ||
| pub use ustr::PotentialUtf8; | ||
| #[cfg(feature = "writeable")] | ||
| mod writeable; |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use core::cmp::Ordering; | ||
| use core::fmt; | ||
/// A 24-bit integer that is *expected* to hold a Unicode scalar value, although that
/// expectation is never checked when the value is created.
///
/// Reach for this type in place of `char` when the data should consist of valid Unicode
/// scalar values but you want to decide for yourself when, or whether, to validate it.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct PotentialCodePoint([u8; 3]);
| impl PotentialCodePoint { | ||
| /// Create a [`PotentialCodePoint`] from a `char`. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// use potential_utf::PotentialCodePoint; | ||
| /// | ||
| /// let a = PotentialCodePoint::from_char('a'); | ||
| /// assert_eq!(a.try_to_char().unwrap(), 'a'); | ||
| /// ``` | ||
| #[inline] | ||
| pub const fn from_char(c: char) -> Self { | ||
| let [u0, u1, u2, _u3] = (c as u32).to_le_bytes(); | ||
| Self([u0, u1, u2]) | ||
| } | ||
| /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits. | ||
| #[inline] | ||
| pub const fn from_u24(c: u32) -> Self { | ||
| let [u0, u1, u2, _u3] = c.to_le_bytes(); | ||
| Self([u0, u1, u2]) | ||
| } | ||
| /// Attempt to convert a [`PotentialCodePoint`] to a `char`. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// use potential_utf::PotentialCodePoint; | ||
| /// use zerovec::ule::AsULE; | ||
| /// | ||
| /// let a = PotentialCodePoint::from_char('a'); | ||
| /// assert_eq!(a.try_to_char(), Ok('a')); | ||
| /// | ||
| /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); | ||
| /// assert!(b.try_to_char().is_err()); | ||
| /// ``` | ||
| #[inline] | ||
| pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> { | ||
| char::try_from(u32::from(self)) | ||
| } | ||
| /// Convert a [`PotentialCodePoint`] to a `char', returning [`char::REPLACEMENT_CHARACTER`] | ||
| /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// use potential_utf::PotentialCodePoint; | ||
| /// use zerovec::ule::AsULE; | ||
| /// | ||
| /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); | ||
| /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER); | ||
| /// ``` | ||
| #[inline] | ||
| pub fn to_char_lossy(self) -> char { | ||
| self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER) | ||
| } | ||
| /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is | ||
| /// a valid Unicode scalar value. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// use potential_utf::PotentialCodePoint; | ||
| /// | ||
| /// let a = PotentialCodePoint::from_char('a'); | ||
| /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a'); | ||
| /// ``` | ||
| #[inline] | ||
| pub unsafe fn to_char_unchecked(self) -> char { | ||
| char::from_u32_unchecked(u32::from(self)) | ||
| } | ||
| /// For converting to the ULE type in a const context | ||
| /// | ||
| /// Can be removed once const traits are a thing | ||
| #[inline] | ||
| #[cfg(feature = "zerovec")] | ||
| pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> { | ||
| zerovec::ule::RawBytesULE(self.0) | ||
| } | ||
| } | ||
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    /// Rewraps the three stored bytes unchanged.
    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }

    /// Rewraps the three raw bytes unchanged; no validation is performed.
    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let zerovec::ule::RawBytesULE(bytes) = unaligned;
        Self(bytes)
    }
}

// Safety: PotentialCodePoint stores exactly the three bytes held by its AsULE::ULE type,
// and both conversions above move those bytes across unchanged.
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
| impl fmt::Debug for PotentialCodePoint { | ||
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
| // Debug as a char if possible | ||
| match self.try_to_char() { | ||
| Ok(c) => fmt::Debug::fmt(&c, f), | ||
| Err(_) => fmt::Debug::fmt(&self.0, f), | ||
| } | ||
| } | ||
| } | ||
| impl PartialOrd for PotentialCodePoint { | ||
| fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | ||
| Some(self.cmp(other)) | ||
| } | ||
| } | ||
| impl PartialEq<char> for PotentialCodePoint { | ||
| fn eq(&self, other: &char) -> bool { | ||
| self.eq(&Self::from_char(*other)) | ||
| } | ||
| } | ||
| impl PartialOrd<char> for PotentialCodePoint { | ||
| fn partial_cmp(&self, other: &char) -> Option<Ordering> { | ||
| self.partial_cmp(&Self::from_char(*other)) | ||
| } | ||
| } | ||
| impl PartialEq<PotentialCodePoint> for char { | ||
| fn eq(&self, other: &PotentialCodePoint) -> bool { | ||
| PotentialCodePoint::from_char(*self).eq(other) | ||
| } | ||
| } | ||
| impl PartialOrd<PotentialCodePoint> for char { | ||
| fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> { | ||
| PotentialCodePoint::from_char(*self).partial_cmp(other) | ||
| } | ||
| } | ||
| impl Ord for PotentialCodePoint { | ||
| // custom implementation, as derived Ord would compare lexicographically | ||
| fn cmp(&self, other: &Self) -> Ordering { | ||
| let a = u32::from(*self); | ||
| let b = u32::from(*other); | ||
| a.cmp(&b) | ||
| } | ||
| } | ||
| impl From<PotentialCodePoint> for u32 { | ||
| fn from(x: PotentialCodePoint) -> Self { | ||
| let [a0, a1, a2] = x.0; | ||
| u32::from_le_bytes([a0, a1, a2, 0]) | ||
| } | ||
| } | ||
| impl TryFrom<u32> for PotentialCodePoint { | ||
| type Error = (); | ||
| fn try_from(x: u32) -> Result<Self, ()> { | ||
| let [u0, u1, u2, u3] = x.to_le_bytes(); | ||
| if u3 != 0 { | ||
| return Err(()); | ||
| } | ||
| Ok(Self([u0, u1, u2])) | ||
| } | ||
| } | ||
| impl From<char> for PotentialCodePoint { | ||
| #[inline] | ||
| fn from(value: char) -> Self { | ||
| Self::from_char(value) | ||
| } | ||
| } | ||
| impl TryFrom<PotentialCodePoint> for char { | ||
| type Error = core::char::CharTryFromError; | ||
| #[inline] | ||
| fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> { | ||
| value.try_to_char() | ||
| } | ||
| } | ||
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde_core::Serialize for PotentialCodePoint {
    /// Emits a `char` for human-readable formats and the raw bytes otherwise.
    ///
    /// Either way the value is validated first, so an invalid scalar value is an error.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde_core::Serializer,
    {
        use serde_core::ser::Error;

        let Ok(c) = self.try_to_char() else {
            return Err(S::Error::custom(
                "invalid Unicode scalar value in PotentialCodePoint",
            ));
        };
        if !serializer.is_human_readable() {
            return self.0.serialize(serializer);
        }
        serializer.serialize_char(c)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde_core::Deserialize<'de> for PotentialCodePoint {
    /// Reads a `char` from human-readable formats and three raw bytes otherwise.
    ///
    /// The raw-byte path stores the bytes as they are, without validating them.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return <[u8; 3]>::deserialize(deserializer).map(PotentialCodePoint);
        }
        <char>::deserialize(deserializer).map(PotentialCodePoint::from_char)
    }
}
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Bakes to a `from_char` call when the value is a valid `char`, and to a
    /// `from_u24` call otherwise.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both outcomes refer to this crate, so register it up front.
        env.insert("potential_utf");
        if let Ok(ch) = self.try_to_char() {
            let ch = ch.bake(env);
            databake::quote! {
                potential_utf::PotentialCodePoint::from_char(#ch)
            }
        } else {
            let u24 = u32::from(*self);
            databake::quote! {
                potential_utf::PotentialCodePoint::from_u24(#u24)
            }
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Bytes that are not a Unicode scalar value are rejected by both serializers.
    #[test]
    fn test_serde_fail() {
        let invalid = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&invalid).expect_err("serialize invalid char bytes");
        bincode::serialize(&invalid).expect_err("serialize invalid char bytes");
    }

    /// Human-readable round trip: the value travels as a one-character JSON string.
    #[test]
    fn test_serde_json() {
        let uc = PotentialCodePoint::from_char('🙃');
        let json_ser = serde_json::to_string(&uc).unwrap();
        assert_eq!(json_ser, r#""🙃""#);
        let json_de = serde_json::from_str::<PotentialCodePoint>(&json_ser).unwrap();
        assert_eq!(uc, json_de);
    }

    /// Binary round trip: the value travels as its three little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let uc = PotentialCodePoint::from_char('🙃');
        let bytes_ser = bincode::serialize(&uc).unwrap();
        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);
        let bytes_de = bincode::deserialize::<PotentialCodePoint>(&bytes_ser).unwrap();
        assert_eq!(uc, bytes_de);
    }

    /// The in-memory bytes of a `[PotentialCodePoint]` match its `ZeroVec` byte form.
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<PotentialCodePoint> = chars
            .iter()
            .map(|&c| PotentialCodePoint::from_char(c))
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.iter().copied().collect();

        let ule_bytes = zvec.as_bytes();
        // SAFETY: the pointer comes from a live slice of `uvchars`; the test relies on
        // that slice spanning `ule_bytes.len()` bytes, which the assertions below check.
        let uvbytes = unsafe {
            core::slice::from_raw_parts(uvchars.as_ptr().cast::<u8>(), ule_bytes.len())
        };

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    /// Baking picks `from_char` for valid scalar values and `from_u24` otherwise.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| #[cfg(feature = "alloc")] | ||
| use alloc::boxed::Box; | ||
| use core::cmp::Ordering; | ||
| use core::fmt; | ||
| use core::ops::Deref; | ||
/// A byte slice that is *expected* to hold a UTF-8 string, without that ever being enforced.
///
/// Prefer this type to `str` when UTF-8 validation during deserialization is unnecessary.
/// Strings used only as map keys, for instance, never have to be turned into a `str`.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// Its main advantage over a plain `[u8]` is that human-readable formats such as JSON
/// serialize it as a string.
///
/// # Examples
///
/// Using a [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, u8> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub struct PotentialUtf8(pub [u8]);
| impl fmt::Debug for PotentialUtf8 { | ||
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
| // Debug as a string if possible | ||
| match self.try_as_str() { | ||
| Ok(s) => fmt::Debug::fmt(s, f), | ||
| Err(_) => fmt::Debug::fmt(&self.0, f), | ||
| } | ||
| } | ||
| } | ||
| impl PotentialUtf8 { | ||
| /// Create a [`PotentialUtf8`] from a byte slice. | ||
| #[inline] | ||
| pub const fn from_bytes(other: &[u8]) -> &Self { | ||
| // Safety: PotentialUtf8 is transparent over [u8] | ||
| unsafe { core::mem::transmute(other) } | ||
| } | ||
| /// Create a [`PotentialUtf8`] from a string slice. | ||
| #[inline] | ||
| pub const fn from_str(s: &str) -> &Self { | ||
| Self::from_bytes(s.as_bytes()) | ||
| } | ||
| /// Create a [`PotentialUtf8`] from boxed bytes. | ||
| /// | ||
| /// ✨ *Enabled with the `alloc` Cargo feature.* | ||
| #[inline] | ||
| #[cfg(feature = "alloc")] | ||
| pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> { | ||
| // Safety: PotentialUtf8 is transparent over [u8] | ||
| unsafe { core::mem::transmute(other) } | ||
| } | ||
| /// Create a [`PotentialUtf8`] from a boxed `str`. | ||
| /// | ||
| /// ✨ *Enabled with the `alloc` Cargo feature.* | ||
| #[inline] | ||
| #[cfg(feature = "alloc")] | ||
| pub fn from_boxed_str(other: Box<str>) -> Box<Self> { | ||
| Self::from_boxed_bytes(other.into_boxed_bytes()) | ||
| } | ||
| /// Get the bytes from a [`PotentialUtf8]. | ||
| #[inline] | ||
| pub const fn as_bytes(&self) -> &[u8] { | ||
| &self.0 | ||
| } | ||
| /// Attempt to convert a [`PotentialUtf8`] to a `str`. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// use potential_utf::PotentialUtf8; | ||
| /// | ||
| /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc"); | ||
| /// | ||
| /// let b = A.try_as_str().unwrap(); | ||
| /// assert_eq!(b, "abc"); | ||
| /// ``` | ||
| // Note: this is const starting in 1.63 | ||
| #[inline] | ||
| pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> { | ||
| core::str::from_utf8(&self.0) | ||
| } | ||
| } | ||
| impl<'a> From<&'a str> for &'a PotentialUtf8 { | ||
| #[inline] | ||
| fn from(other: &'a str) -> Self { | ||
| PotentialUtf8::from_str(other) | ||
| } | ||
| } | ||
| impl PartialEq<str> for PotentialUtf8 { | ||
| fn eq(&self, other: &str) -> bool { | ||
| self.eq(Self::from_str(other)) | ||
| } | ||
| } | ||
| impl PartialOrd<str> for PotentialUtf8 { | ||
| fn partial_cmp(&self, other: &str) -> Option<Ordering> { | ||
| self.partial_cmp(Self::from_str(other)) | ||
| } | ||
| } | ||
| impl PartialEq<PotentialUtf8> for str { | ||
| fn eq(&self, other: &PotentialUtf8) -> bool { | ||
| PotentialUtf8::from_str(self).eq(other) | ||
| } | ||
| } | ||
| impl PartialOrd<PotentialUtf8> for str { | ||
| fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> { | ||
| PotentialUtf8::from_str(self).partial_cmp(other) | ||
| } | ||
| } | ||
#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    /// Converts the boxed string by way of its boxed bytes.
    #[inline]
    fn from(other: Box<str>) -> Self {
        PotentialUtf8::from_boxed_bytes(other.into_boxed_bytes())
    }
}
| impl Deref for PotentialUtf8 { | ||
| type Target = [u8]; | ||
| fn deref(&self) -> &Self::Target { | ||
| &self.0 | ||
| } | ||
| } | ||
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    // Stored as a variable-length element of a `VarZeroVec`, owned as a `Box`.
    type Container = zerovec::VarZeroVec<'a, Self>;
    type Slice = zerovec::VarZeroSlice<Self>;
    type GetType = Self;
    type OwnedType = Box<Self>;
}
// Safety (following the safety checklist on the VarULE trait):
//  1. PotentialUtf8 contains no uninitialized or padding bytes (transparent over a ULE)
//  2. PotentialUtf8 has an alignment of 1 byte (transparent over a ULE)
//  3. `validate_bytes()` would have to reject invalid bytes, but every byte is valid
//  4. `validate_bytes()` would have to reject slices that cannot be used in full, but
//     every slice can be
//  5. `from_bytes_unchecked()` returns a reference to the same data (the argument itself)
//  6. All other methods are left at their defaults
//  7. `[T]` byte equality is semantic equality (transparent over a ULE)
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    /// Accepts every byte slice.
    #[inline]
    fn validate_bytes(_: &[u8]) -> Result<(), zerovec::ule::UleError> {
        Ok(())
    }

    /// Wraps `bytes` without copying or checking them.
    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        Self::from_bytes(bytes)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde_core::Serialize for PotentialUtf8 {
    /// Emits a string for human-readable formats and a byte string otherwise.
    ///
    /// Either way the bytes are validated first, so invalid UTF-8 is an error.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde_core::Serializer,
    {
        use serde_core::ser::Error;

        match self.try_as_str() {
            Err(_) => Err(S::Error::custom("invalid UTF-8 in PotentialUtf8")),
            Ok(s) if serializer.is_human_readable() => serializer.serialize_str(s),
            Ok(s) => serializer.serialize_bytes(s.as_bytes()),
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde_core::Deserialize<'de> for Box<PotentialUtf8> {
    /// Reads a boxed string from human-readable formats and boxed bytes otherwise.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return Box::<[u8]>::deserialize(deserializer).map(PotentialUtf8::from_boxed_bytes);
        }
        Box::<str>::deserialize(deserializer).map(PotentialUtf8::from_boxed_str)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde_core::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    /// Borrows a `&str` from human-readable formats and a `&[u8]` otherwise.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde_core::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return <&[u8]>::deserialize(deserializer).map(PotentialUtf8::from_bytes);
        }
        <&str>::deserialize(deserializer).map(PotentialUtf8::from_str)
    }
}
/// A `u16` slice that is expected to hold a UTF-16 string, without that being enforced.
///
/// Use [`PotentialUtf16::chars`] to decode the code units, with invalid ones replaced.
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub struct PotentialUtf16(pub [u16]);
| impl fmt::Debug for PotentialUtf16 { | ||
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
| // Debug as a string if possible | ||
| for c in char::decode_utf16(self.0.iter().copied()) { | ||
| match c { | ||
| Ok(c) => write!(f, "{c}")?, | ||
| Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?, | ||
| } | ||
| } | ||
| Ok(()) | ||
| } | ||
| } | ||
| impl PotentialUtf16 { | ||
| /// Create a [`PotentialUtf16`] from a u16 slice. | ||
| #[inline] | ||
| pub const fn from_slice(other: &[u16]) -> &Self { | ||
| // Safety: PotentialUtf16 is transparent over [u16] | ||
| unsafe { core::mem::transmute(other) } | ||
| } | ||
| pub fn chars(&self) -> impl Iterator<Item = char> + '_ { | ||
| char::decode_utf16(self.0.iter().copied()).map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER)) | ||
| } | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use crate::{PotentialUtf16, PotentialUtf8}; | ||
| use core::fmt::Write; | ||
| use writeable::{LengthHint, Part, PartsWrite, TryWriteable}; | ||
| use core::{char::DecodeUtf16Error, fmt, str::Utf8Error}; | ||
| /// This impl requires enabling the optional `writeable` Cargo feature | ||
| impl TryWriteable for &'_ PotentialUtf8 { | ||
| type Error = Utf8Error; | ||
| fn try_write_to_parts<S: PartsWrite + ?Sized>( | ||
| &self, | ||
| sink: &mut S, | ||
| ) -> Result<Result<(), Self::Error>, fmt::Error> { | ||
| let mut remaining = &self.0; | ||
| let mut r = Ok(()); | ||
| loop { | ||
| match core::str::from_utf8(remaining) { | ||
| Ok(valid) => { | ||
| sink.write_str(valid)?; | ||
| return Ok(r); | ||
| } | ||
| Err(e) => { | ||
| // SAFETY: By Utf8Error invariants | ||
| let valid = unsafe { | ||
| core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to())) | ||
| }; | ||
| sink.write_str(valid)?; | ||
| sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; | ||
| if r.is_ok() { | ||
| r = Err(e); | ||
| } | ||
| let Some(error_len) = e.error_len() else { | ||
| return Ok(r); // end of string | ||
| }; | ||
| // SAFETY: By Utf8Error invariants | ||
| remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| fn writeable_length_hint(&self) -> LengthHint { | ||
| // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters. | ||
| LengthHint::between(self.0.len(), self.0.len() * 3) | ||
| } | ||
| } | ||
| /// This impl requires enabling the optional `writeable` Cargo feature | ||
| impl TryWriteable for &'_ PotentialUtf16 { | ||
| type Error = DecodeUtf16Error; | ||
| fn try_write_to_parts<S: PartsWrite + ?Sized>( | ||
| &self, | ||
| sink: &mut S, | ||
| ) -> Result<Result<(), Self::Error>, fmt::Error> { | ||
| let mut r = Ok(()); | ||
| for c in core::char::decode_utf16(self.0.iter().copied()) { | ||
| match c { | ||
| Ok(c) => sink.write_char(c)?, | ||
| Err(e) => { | ||
| if r.is_ok() { | ||
| r = Err(e); | ||
| } | ||
| sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; | ||
| } | ||
| } | ||
| } | ||
| Ok(r) | ||
| } | ||
| fn writeable_length_hint(&self) -> LengthHint { | ||
| // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character) | ||
| LengthHint::between(self.0.len(), self.0.len() * 3) | ||
| } | ||
| } | ||
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    /// Returns the error `core::str::from_utf8` reports for `bytes`.
    fn utf8_error(bytes: &[u8]) -> core::str::Utf8Error {
        core::str::from_utf8(bytes).unwrap_err()
    }

    /// Returns the error produced by decoding the single unit `unit`.
    fn lone_surrogate_error(unit: u16) -> core::char::DecodeUtf16Error {
        core::char::decode_utf16([unit])
            .next()
            .unwrap()
            .unwrap_err()
    }

    #[test]
    fn test_utf8() {
        // Fully valid input: no error, no parts.
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(b"Foo Bar"),
            "Foo Bar",
            Ok(()),
            []
        );
        // One invalid byte in the middle.
        let one_bad: &[u8] = b"Foo\xFDBar";
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(one_bad),
            "Foo�Bar",
            Err(utf8_error(one_bad)),
            [(3, 6, Part::ERROR)]
        );
        // Two invalid bytes: the reported error is the first one.
        let two_bad: &[u8] = b"Foo\xFDBar\xff";
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(two_bad),
            "Foo�Bar�",
            Err(utf8_error(two_bad)),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    #[test]
    fn test_utf16() {
        // A well-formed surrogate pair.
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0xDD73]),
            "🥳",
            Ok(()),
            []
        );
        // The same surrogates split by a space are both unpaired.
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(&[0xD83E, 0x20, 0xDD73]),
            "� �",
            Err(lone_surrogate_error(0xD83E)),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}
| { | ||
| "git": { | ||
| "sha1": "fbb3eebe2f65e64a69b3a1837dce8ed9cbbe677e" | ||
| }, | ||
| "path_in_vcs": "utils/potential_utf" | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "bincode" | ||
| version = "1.3.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" | ||
| dependencies = [ | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.15" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.7.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.3" | ||
| dependencies = [ | ||
| "bincode", | ||
| "databake", | ||
| "serde", | ||
| "serde_json", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.95" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.40" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "ryu" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.219" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.219" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.140" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "ryu", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.101" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" | ||
| dependencies = [ | ||
| "zerofrom", | ||
| ] |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.82" | ||
| name = "potential_utf" | ||
| version = "0.1.3" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "Unvalidated string and character types" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| categories = ["internationalization"] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [features] | ||
| alloc = [ | ||
| "serde?/alloc", | ||
| "zerovec?/alloc", | ||
| ] | ||
| databake = ["dep:databake"] | ||
| serde = ["dep:serde"] | ||
| writeable = [ | ||
| "dep:writeable", | ||
| "alloc", | ||
| ] | ||
| zerovec = ["dep:zerovec"] | ||
| [lib] | ||
| name = "potential_utf" | ||
| path = "src/lib.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.serde] | ||
| version = "1.0.110" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.writeable] | ||
| version = "0.6.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.3" | ||
| optional = true | ||
| default-features = false | ||
| [dev-dependencies.bincode] | ||
| version = "1.3.1" | ||
| [dev-dependencies.serde_json] | ||
| version = "1.0.45" |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # potential_utf [](https://crates.io/crates/potential_utf) | ||
| <!-- cargo-rdme start --> | ||
| A crate providing unvalidated string and character types. | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| #![cfg_attr(not(any(test, doc)), no_std)] | ||
| #![cfg_attr( | ||
| not(test), | ||
| deny( | ||
| clippy::indexing_slicing, | ||
| clippy::unwrap_used, | ||
| clippy::expect_used, | ||
| clippy::panic, | ||
| clippy::exhaustive_structs, | ||
| clippy::exhaustive_enums, | ||
| clippy::trivially_copy_pass_by_ref, | ||
| missing_debug_implementations, | ||
| ) | ||
| )] | ||
| //! A crate providing unvalidated string and character types. | ||
| #[cfg(feature = "alloc")] | ||
| extern crate alloc; | ||
| mod uchar; | ||
| mod ustr; | ||
| pub use uchar::PotentialCodePoint; | ||
| pub use ustr::PotentialUtf16; | ||
| pub use ustr::PotentialUtf8; | ||
| #[cfg(feature = "writeable")] | ||
| mod writeable; |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use core::cmp::Ordering; | ||
| use core::fmt; | ||
/// A 24-bit number that is expected to be a Unicode scalar value, but is stored
/// without being validated as one.
///
/// Reach for this type instead of `char` when the data should consist of valid
/// Unicode scalar values and you want to decide yourself when, or whether, that
/// assumption is checked.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);

impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // A `char` always fits in 24 bits, so nothing is lost here.
        Self::from_u24(c as u32)
    }

    /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [low, mid, high, _discarded] = c.to_le_bytes();
        Self([low, mid, high])
    }

    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        let scalar: u32 = self.into();
        char::try_from(scalar)
    }

    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        match self.try_to_char() {
            Ok(c) => c,
            Err(_) => char::REPLACEMENT_CHARACTER,
        }
    }

    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        let scalar = u32::from(self);
        char::from_u32_unchecked(scalar)
    }

    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }
}

/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }

    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let zerovec::ule::RawBytesULE(bytes) = unaligned;
        Self(bytes)
    }
}

// Safety: PotentialCodePoint is always the little-endian representation of a char,
// which corresponds to its AsULE::ULE type
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}

impl fmt::Debug for PotentialCodePoint {
    /// Formats as a `char` when the value is a valid scalar value, and as the
    /// raw three bytes otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Ok(c) = self.try_to_char() {
            return fmt::Debug::fmt(&c, f);
        }
        fmt::Debug::fmt(&self.0, f)
    }
}

impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl PartialEq<char> for PotentialCodePoint {
    fn eq(&self, other: &char) -> bool {
        *self == Self::from_char(*other)
    }
}

impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        let rhs = Self::from_char(*other);
        PartialOrd::<Self>::partial_cmp(self, &rhs)
    }
}

impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self) == *other
    }
}

impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        let lhs = PotentialCodePoint::from_char(*self);
        PartialOrd::<PotentialCodePoint>::partial_cmp(&lhs, other)
    }
}

impl Ord for PotentialCodePoint {
    // custom implementation, as derived Ord would compare lexicographically
    fn cmp(&self, other: &Self) -> Ordering {
        u32::from(*self).cmp(&u32::from(*other))
    }
}

impl From<PotentialCodePoint> for u32 {
    fn from(x: PotentialCodePoint) -> Self {
        // The three stored bytes are little-endian; the top byte is zero.
        let PotentialCodePoint([low, mid, high]) = x;
        u32::from(low) | (u32::from(mid) << 8) | (u32::from(high) << 16)
    }
}

impl TryFrom<u32> for PotentialCodePoint {
    type Error = ();

    /// Fails when `x` does not fit in 24 bits.
    fn try_from(x: u32) -> Result<Self, ()> {
        match x.to_le_bytes() {
            [low, mid, high, 0] => Ok(Self([low, mid, high])),
            _ => Err(()),
        }
    }
}

impl From<char> for PotentialCodePoint {
    #[inline]
    fn from(value: char) -> Self {
        PotentialCodePoint::from_char(value)
    }
}

impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;

    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
        PotentialCodePoint::try_to_char(value)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    /// Serializes as a `char` in human-readable formats and as the raw three
    /// bytes otherwise.
    ///
    /// Values that are not valid Unicode scalar values are rejected in both
    /// modes.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        match self.try_to_char() {
            Err(_) => Err(S::Error::custom(
                "invalid Unicode scalar value in PotentialCodePoint",
            )),
            Ok(c) if serializer.is_human_readable() => serializer.serialize_char(c),
            Ok(_) => self.0.serialize(serializer),
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    /// Reads a `char` from human-readable formats and three raw bytes from all
    /// other formats. The raw bytes are stored as-is, without validation.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return <[u8; 3]>::deserialize(deserializer).map(PotentialCodePoint);
        }
        <char>::deserialize(deserializer).map(PotentialCodePoint::from_char)
    }
}
/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Bakes a `from_char` call when the value is a valid scalar value, and a
    /// `from_u24` call otherwise.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both outputs reference the `potential_utf` crate.
        env.insert("potential_utf");
        if let Ok(ch) = self.try_to_char() {
            let ch = ch.bake(env);
            return databake::quote! {
                potential_utf::PotentialCodePoint::from_char(#ch)
            };
        }
        let [low, mid, high] = self.0;
        let u24 = u32::from_le_bytes([low, mid, high, 0]);
        databake::quote! {
            potential_utf::PotentialCodePoint::from_u24(#u24)
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Invalid scalar values must be rejected by both serializers.
    #[test]
    fn test_serde_fail() {
        let invalid = PotentialCodePoint([0xFF; 3]);
        serde_json::to_string(&invalid).expect_err("serialize invalid char bytes");
        bincode::serialize(&invalid).expect_err("serialize invalid char bytes");
    }

    /// JSON round trip goes through a one-character string.
    #[test]
    fn test_serde_json() {
        let original = PotentialCodePoint::from_char('🙃');
        let encoded = serde_json::to_string(&original).unwrap();
        assert_eq!(encoded, r#""🙃""#);
        let decoded: PotentialCodePoint = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// Bincode round trip goes through the three little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let original = PotentialCodePoint::from_char('🙃');
        let encoded = bincode::serialize(&original).unwrap();
        assert_eq!(encoded, [0x43, 0xF6, 0x01]);
        let decoded: PotentialCodePoint = bincode::deserialize(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<PotentialCodePoint> =
            chars.iter().map(|&c| PotentialCodePoint::from_char(c)).collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.iter().copied().collect();

        let ule_bytes = zvec.as_bytes();
        // View the Vec's storage as raw bytes of the same total length.
        let uvbytes = unsafe {
            core::slice::from_raw_parts(uvchars.as_ptr().cast::<u8>(), ule_bytes.len())
        };

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    #[test]
    fn test_char_bake() {
        // valid scalar value
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| #[cfg(feature = "alloc")] | ||
| use alloc::boxed::Box; | ||
| use core::cmp::Ordering; | ||
| use core::fmt; | ||
| use core::ops::Deref; | ||
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using a [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, u8> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf8(pub [u8]);

impl fmt::Debug for PotentialUtf8 {
    /// Formats as a string when the bytes are valid UTF-8, and as a byte
    /// slice otherwise.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PotentialUtf8 {
    /// Create a [`PotentialUtf8`] from a byte slice.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }

    /// Create a [`PotentialUtf8`] from a string slice.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }

    /// Create a [`PotentialUtf8`] from boxed bytes.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // Safety: PotentialUtf8 is transparent over [u8]
        unsafe { core::mem::transmute(other) }
    }

    /// Create a [`PotentialUtf8`] from a boxed `str`.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }

    /// Get the bytes from a [`PotentialUtf8`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Attempt to convert a [`PotentialUtf8`] to a `str`.
    ///
    /// This is a `const fn`, so the validation can also run at compile time.
    ///
    /// # Errors
    ///
    /// Returns the [`core::str::Utf8Error`] reported by
    /// [`core::str::from_utf8`] when the bytes are not valid UTF-8.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialUtf8;
    ///
    /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // `core::str::from_utf8` has been const since Rust 1.63, which is below
    // this crate's declared `rust-version`.
    #[inline]
    pub const fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}

impl<'a> From<&'a str> for &'a PotentialUtf8 {
    #[inline]
    fn from(other: &'a str) -> Self {
        PotentialUtf8::from_str(other)
    }
}

// Comparisons with `str` compare the underlying bytes.
impl PartialEq<str> for PotentialUtf8 {
    fn eq(&self, other: &str) -> bool {
        self.eq(Self::from_str(other))
    }
}

impl PartialOrd<str> for PotentialUtf8 {
    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
        self.partial_cmp(Self::from_str(other))
    }
}

impl PartialEq<PotentialUtf8> for str {
    fn eq(&self, other: &PotentialUtf8) -> bool {
        PotentialUtf8::from_str(self).eq(other)
    }
}

impl PartialOrd<PotentialUtf8> for str {
    fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> {
        PotentialUtf8::from_str(self).partial_cmp(other)
    }
}

#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    #[inline]
    fn from(other: Box<str>) -> Self {
        PotentialUtf8::from_boxed_str(other)
    }
}

impl Deref for PotentialUtf8 {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    // Stored in a variable-length zero-copy vector.
    type Container = zerovec::VarZeroVec<'a, Self>;
    type Slice = zerovec::VarZeroSlice<Self>;
    type GetType = Self;
    type OwnedType = Box<Self>;
}
// Safety (based on the safety checklist on the VarULE trait):
// 1. PotentialUtf8 does not include any uninitialized or padding bytes (transparent over a ULE)
// 2. PotentialUtf8 is aligned to 1 byte (transparent over a ULE)
// 3. The impl of `validate_bytes()` returns an error if any byte is not valid (impossible)
// 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (impossible)
// 5. The impl of `from_bytes_unchecked()` returns a reference to the same data (returns the argument directly)
// 6. All other methods are defaulted
// 7. `[T]` byte equality is semantic equality (transparent over a ULE)
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    /// Always succeeds: no byte sequence is rejected, including ill-formed UTF-8.
    #[inline]
    fn validate_bytes(_bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
        Ok(())
    }

    /// Wraps `bytes` via [`PotentialUtf8::from_bytes`], returning a view of the same data.
    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        Self::from_bytes(bytes)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialUtf8 {
    /// Serializes the content as a string for human-readable formats and as
    /// bytes otherwise.
    ///
    /// # Errors
    ///
    /// Fails with a custom serializer error when the content is not valid
    /// UTF-8; this check happens before the format is inspected, so it applies
    /// to both kinds of format.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        match self.try_as_str() {
            Err(_) => Err(S::Error::custom("invalid UTF-8 in PotentialUtf8")),
            Ok(text) if serializer.is_human_readable() => serializer.serialize_str(text),
            Ok(text) => serializer.serialize_bytes(text.as_bytes()),
        }
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde::Deserialize<'de> for Box<PotentialUtf8> {
    /// Reads an owned value: a boxed string from human-readable formats, a
    /// boxed byte slice from all other formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let owned = if deserializer.is_human_readable() {
            PotentialUtf8::from_boxed_str(Box::<str>::deserialize(deserializer)?)
        } else {
            PotentialUtf8::from_boxed_bytes(Box::<[u8]>::deserialize(deserializer)?)
        };
        Ok(owned)
    }
}
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    /// Reads a value borrowed from the input: a `&str` from human-readable
    /// formats, a `&[u8]` from all other formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let borrowed = if deserializer.is_human_readable() {
            PotentialUtf8::from_str(<&str>::deserialize(deserializer)?)
        } else {
            PotentialUtf8::from_bytes(<&[u8]>::deserialize(deserializer)?)
        };
        Ok(borrowed)
    }
}
/// A `u16` slice that is expected, but not guaranteed, to contain well-formed
/// UTF-16.
///
/// The wrapper is `#[repr(transparent)]` over `[u16]` and performs no
/// validation; comparisons and ordering are those of the underlying code units.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf16(pub [u16]);

impl fmt::Debug for PotentialUtf16 {
    /// Prints decodable code points as text and each unpaired surrogate as
    /// `\0x` followed by its lowercase hexadecimal value.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        char::decode_utf16(self.0.iter().copied()).try_for_each(|decoded| match decoded {
            Ok(ch) => write!(f, "{ch}"),
            Err(err) => write!(f, "\\0x{:x}", err.unpaired_surrogate()),
        })
    }
}

impl PotentialUtf16 {
    /// Create a [`PotentialUtf16`] from a u16 slice.
    #[inline]
    pub const fn from_slice(other: &[u16]) -> &Self {
        // SAFETY: `PotentialUtf16` is `#[repr(transparent)]` over `[u16]`, so a
        // pointer to one is a valid pointer to the other, with the same length
        // metadata and the same lifetime.
        unsafe { &*(other as *const [u16] as *const Self) }
    }

    /// Iterates over the decoded code points, yielding U+FFFD in place of each
    /// unpaired surrogate.
    pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
        let code_units = self.0.iter().copied();
        char::decode_utf16(code_units).map(|decoded| match decoded {
            Ok(ch) => ch,
            Err(_) => char::REPLACEMENT_CHARACTER,
        })
    }
}
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use crate::{PotentialUtf16, PotentialUtf8}; | ||
| use alloc::borrow::Cow; | ||
| use core::fmt::Write; | ||
| use writeable::{LengthHint, Part, PartsWrite, TryWriteable}; | ||
| use core::{char::DecodeUtf16Error, fmt, str::Utf8Error}; | ||
| /// This impl requires enabling the optional `writeable` Cargo feature | ||
| impl TryWriteable for &'_ PotentialUtf8 { | ||
| type Error = Utf8Error; | ||
| fn try_write_to_parts<S: PartsWrite + ?Sized>( | ||
| &self, | ||
| sink: &mut S, | ||
| ) -> Result<Result<(), Self::Error>, fmt::Error> { | ||
| let mut remaining = &self.0; | ||
| let mut r = Ok(()); | ||
| loop { | ||
| match core::str::from_utf8(remaining) { | ||
| Ok(valid) => { | ||
| sink.write_str(valid)?; | ||
| return Ok(r); | ||
| } | ||
| Err(e) => { | ||
| // SAFETY: By Utf8Error invariants | ||
| let valid = unsafe { | ||
| core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to())) | ||
| }; | ||
| sink.write_str(valid)?; | ||
| sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; | ||
| if r.is_ok() { | ||
| r = Err(e); | ||
| } | ||
| let Some(error_len) = e.error_len() else { | ||
| return Ok(r); // end of string | ||
| }; | ||
| // SAFETY: By Utf8Error invariants | ||
| remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| fn writeable_length_hint(&self) -> LengthHint { | ||
| // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters. | ||
| LengthHint::between(self.0.len(), self.0.len() * 3) | ||
| } | ||
| fn try_write_to_string(&self) -> Result<Cow<'_, str>, (Self::Error, Cow<'_, str>)> { | ||
| match core::str::from_utf8(&self.0) { | ||
| Ok(valid) => Ok(Cow::Borrowed(valid)), | ||
| Err(e) => { | ||
| // SAFETY: By Utf8Error invariants | ||
| let valid = unsafe { | ||
| core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to())) | ||
| }; | ||
| // Let's assume this is the only error | ||
| let mut out = alloc::string::String::with_capacity( | ||
| self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8() | ||
| - e.error_len().unwrap_or(0), | ||
| ); | ||
| out.push_str(valid); | ||
| out.push(char::REPLACEMENT_CHARACTER); | ||
| // If there's more, we can use `try_write_to` | ||
| if let Some(error_len) = e.error_len() { | ||
| // SAFETY: By Utf8Error invariants | ||
| let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) }; | ||
| let _discard = PotentialUtf8::from_bytes(remaining).try_write_to(&mut out); | ||
| } | ||
| Err((e, Cow::Owned(out))) | ||
| } | ||
| } | ||
| } | ||
| } | ||
| /// This impl requires enabling the optional `writeable` Cargo feature | ||
| impl TryWriteable for &'_ PotentialUtf16 { | ||
| type Error = DecodeUtf16Error; | ||
| fn try_write_to_parts<S: PartsWrite + ?Sized>( | ||
| &self, | ||
| sink: &mut S, | ||
| ) -> Result<Result<(), Self::Error>, fmt::Error> { | ||
| let mut r = Ok(()); | ||
| for c in core::char::decode_utf16(self.0.iter().copied()) { | ||
| match c { | ||
| Ok(c) => sink.write_char(c)?, | ||
| Err(e) => { | ||
| if r.is_ok() { | ||
| r = Err(e); | ||
| } | ||
| sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; | ||
| } | ||
| } | ||
| } | ||
| Ok(r) | ||
| } | ||
| fn writeable_length_hint(&self) -> LengthHint { | ||
| // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character) | ||
| LengthHint::between(self.0.len(), self.0.len() * 3) | ||
| } | ||
| } | ||
#[cfg(test)]
mod test {
    #![allow(invalid_from_utf8)] // only way to construct the error
    use super::*;
    use writeable::assert_try_writeable_parts_eq;

    // Named inputs; the macro arguments below stay plain expressions because
    // they may be evaluated more than once.
    const WELL_FORMED_UTF8: &[u8] = b"Foo Bar";
    const ONE_BAD_BYTE: &[u8] = b"Foo\xFDBar";
    const TWO_BAD_BYTES: &[u8] = b"Foo\xFDBar\xff";
    const SURROGATE_PAIR: &[u16] = &[0xD83E, 0xDD73];
    const SPLIT_SURROGATES: &[u16] = &[0xD83E, 0x20, 0xDD73];

    /// Valid input is written unchanged; each ill-formed byte becomes one
    /// U+FFFD tagged as an error part, and the first error is reported.
    #[test]
    fn test_utf8() {
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(WELL_FORMED_UTF8),
            "Foo Bar",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(ONE_BAD_BYTE),
            "Foo�Bar",
            Err(core::str::from_utf8(ONE_BAD_BYTE).unwrap_err()),
            [(3, 6, Part::ERROR)]
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf8::from_bytes(TWO_BAD_BYTES),
            "Foo�Bar�",
            Err(core::str::from_utf8(TWO_BAD_BYTES).unwrap_err()),
            [(3, 6, Part::ERROR), (9, 12, Part::ERROR)],
        );
    }

    /// A surrogate pair decodes to one code point; surrogates separated by
    /// another unit are each replaced and tagged as error parts.
    #[test]
    fn test_utf16() {
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(SURROGATE_PAIR),
            "🥳",
            Ok(()),
            []
        );
        assert_try_writeable_parts_eq!(
            PotentialUtf16::from_slice(SPLIT_SURROGATES),
            "� �",
            Err(core::char::decode_utf16([0xD83E].into_iter())
                .next()
                .unwrap()
                .unwrap_err()),
            [(0, 3, Part::ERROR), (4, 7, Part::ERROR)]
        );
    }
}