Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

icu_properties

Package Overview
Dependencies
Maintainers
1
Versions
23
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

icu-properties - npm Package Compare versions

Comparing version
2.1.1
to
2.1.2
+6
icu_properties-2.1.2/.cargo_vcs_info.json
{
"git": {
"sha1": "323738440f7ff810b8130b7881322466015f7fec"
},
"path_in_vcs": "components/properties"
}
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cobs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
dependencies = [
"thiserror",
]
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"databake-derive",
"proc-macro2",
"quote",
]
[[package]]
name = "databake-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "erased-serde"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b"
dependencies = [
"serde",
"serde_core",
"typeid",
]
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"databake",
"displaydoc",
"potential_utf",
"serde",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_properties"
version = "2.1.2"
dependencies = [
"databake",
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"serde",
"unicode-bidi",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"databake",
"displaydoc",
"erased-serde",
"icu_locale_core",
"postcard",
"serde",
"stable_deref_trait",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
dependencies = [
"serde_core",
]
[[package]]
name = "postcard"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
dependencies = [
"cobs",
"serde",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"serde_core",
"zerovec",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thiserror"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"serde_core",
"zerovec",
]
[[package]]
name = "typeid"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
[[package]]
name = "unicode-bidi"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde_core",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"databake",
"serde",
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.83"
name = "icu_properties"
version = "2.1.2"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Definitions for Unicode properties"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[package.metadata.cargo-semver-checks.lints]
workspace = true
[features]
alloc = [
"zerovec/alloc",
"icu_collections/alloc",
"serde?/alloc",
]
compiled_data = [
"dep:icu_properties_data",
"icu_provider/baked",
]
datagen = [
"serde",
"dep:databake",
"zerovec/databake",
"icu_collections/databake",
"icu_locale_core/databake",
"zerotrie/databake",
"icu_provider/export",
]
default = ["compiled_data"]
serde = [
"dep:serde",
"icu_locale_core/serde",
"zerovec/serde",
"icu_collections/serde",
"icu_provider/serde",
"zerotrie/serde",
]
unicode_bidi = ["dep:unicode-bidi"]
[lib]
name = "icu_properties"
path = "src/lib.rs"
[dependencies.databake]
version = "0.2.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.icu_collections]
version = "~2.1.1"
default-features = false
[dependencies.icu_locale_core]
version = "2.1.1"
features = ["zerovec"]
default-features = false
[dependencies.icu_properties_data]
version = "~2.1.2"
optional = true
default-features = false
[dependencies.icu_provider]
version = "2.1.1"
default-features = false
[dependencies.serde]
version = "1.0.220"
features = ["derive"]
optional = true
default-features = false
[dependencies.unicode-bidi]
version = "0.3.11"
optional = true
default-features = false
[dependencies.zerotrie]
version = "0.2.0"
features = [
"yoke",
"zerofrom",
]
default-features = false
[dependencies.zerovec]
version = "0.11.3"
features = [
"derive",
"yoke",
]
default-features = false
[dev-dependencies]

Sorry, the diff of this file is not supported yet

UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
# icu_properties [![crates.io](https://img.shields.io/crates/v/icu_properties)](https://crates.io/crates/icu_properties)
<!-- cargo-rdme start -->
Definitions of [Unicode Properties] and APIs for
retrieving property data in an appropriate data structure.
This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
APIs that return a `CodePointSetData` exist for binary properties and certain enumerated
properties.
APIs that return a `CodePointMapData` exist for certain enumerated properties.
## Examples
### Property data as `CodePointSetData`s
```rust
use icu::properties::{CodePointSetData, CodePointMapData};
use icu::properties::props::{GeneralCategory, Emoji};
// A binary property as a `CodePointSetData`
assert!(CodePointSetData::new::<Emoji>().contains('🎃')); // U+1F383 JACK-O-LANTERN
assert!(!CodePointSetData::new::<Emoji>().contains('木')); // U+6728
// An individual enumerated property value as a `CodePointSetData`
let line_sep_data = CodePointMapData::<GeneralCategory>::new()
.get_set_for_value(GeneralCategory::LineSeparator);
let line_sep = line_sep_data.as_borrowed();
assert!(line_sep.contains('\u{2028}'));
assert!(!line_sep.contains('\u{2029}'));
```
### Property data as `CodePointMapData`s
```rust
use icu::properties::CodePointMapData;
use icu::properties::props::Script;
assert_eq!(CodePointMapData::<Script>::new().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
assert_eq!(CodePointMapData::<Script>::new().get('木'), Script::Han); // U+6728
```
[`ICU4X`]: ../icu/index.html
[Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{props::EnumeratedProperty, provider::PropertyEnumBidiMirroringGlyphV1};
use icu_collections::codepointtrie::TrieValue;
use zerovec::ule::{AsULE, RawBytesULE};
/// This is a bitpacked combination of the `Bidi_Mirroring_Glyph`,
/// `Bidi_Mirrored`, and `Bidi_Paired_Bracket_Type` properties.
///
/// The three properties are exposed as separate public fields. The `AsULE`
/// implementation for this type stores a value in three bytes, using the
/// `u32` encoding provided by the type's `TrieValue` implementation.
#[derive(Debug, Eq, PartialEq, Clone, Copy, Default)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::props))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[allow(clippy::exhaustive_structs)] // needed for baked construction
pub struct BidiMirroringGlyph {
/// The `Bidi_Mirroring_Glyph` value for the code point, if there is one.
pub mirroring_glyph: Option<char>,
/// The `Bidi_Mirrored` value: whether the glyph is mirrored.
pub mirrored: bool,
/// The `Bidi_Paired_Bracket_Type` value (see [`BidiPairedBracketType`]).
pub paired_bracket_type: BidiPairedBracketType,
}
// Registers `BidiMirroringGlyph` as an enumerated property whose data is
// identified by the `PropertyEnumBidiMirroringGlyphV1` marker.
impl EnumeratedProperty for BidiMirroringGlyph {
    type DataMarker = PropertyEnumBidiMirroringGlyphV1;

    // Long and short property names, as byte strings.
    const NAME: &'static [u8] = b"Bidi_Mirroring_Glyph";
    const SHORT_NAME: &'static [u8] = b"bmg";

    // The baked singleton is only available when compiled data is enabled.
    #[cfg(feature = "compiled_data")]
    const SINGLETON: &'static crate::provider::PropertyCodePointMap<'static, Self> =
        crate::provider::Baked::SINGLETON_PROPERTY_ENUM_BIDI_MIRRORING_GLYPH_V1;
}
// Marker impl with no items: opts `BidiMirroringGlyph` into the crate-private
// `Sealed` trait.
impl crate::private::Sealed for BidiMirroringGlyph {}
impl AsULE for BidiMirroringGlyph {
type ULE = zerovec::ule::RawBytesULE<3>;
fn to_unaligned(self) -> Self::ULE {
let [a, b, c, _] = TrieValue::to_u32(self).to_le_bytes();
RawBytesULE([a, b, c])
}
fn from_unaligned(unaligned: Self::ULE) -> Self {
let [a, b, c] = unaligned.0;
TrieValue::try_from_u32(u32::from_le_bytes([a, b, c, 0])).unwrap_or_default()
}
}
/// The enum represents Bidi_Paired_Bracket_Type.
///
/// It does not implement [`EnumeratedProperty`], instead it can be obtained
/// through the bitpacked [`BidiMirroringGlyph`] property.
///
/// If you have a use case for this property without also needing the
/// [`BidiMirroringGlyph`] property, and need to optimize data size, please
/// file an issue.
#[derive(Debug, Eq, PartialEq, Copy, Clone, Default)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::props))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum BidiPairedBracketType {
/// Represents Bidi_Paired_Bracket_Type=Open.
Open,
/// Represents Bidi_Paired_Bracket_Type=Close.
Close,
/// Represents Bidi_Paired_Bracket_Type=None.
///
/// This is the `Default` value of the enum.
#[default]
None,
}
#[cfg(feature = "unicode_bidi")]
use crate::props::BidiClass;
/// ✨ *Enabled with the `unicode_bidi` Cargo feature.*
///
/// # Examples
///
///```
/// use icu::properties::CodePointMapData;
/// use icu::properties::props::BidiClass;
/// use unicode_bidi::BidiInfo;
///
/// // This example text is defined using `concat!` because some browsers
/// // and text editors have trouble displaying bidi strings.
/// let text = concat!["א", // RTL#1
/// "ב", // RTL#2
/// "ג", // RTL#3
/// "a", // LTR#1
/// "b", // LTR#2
/// "c", // LTR#3
/// ]; //
///
///
/// let bidi_map = CodePointMapData::<BidiClass>::new();
///
/// // Resolve embedding levels within the text. Pass `None` to detect the
/// // paragraph level automatically.
/// let bidi_info = BidiInfo::new_with_data_source(&bidi_map, text, None);
///
/// // This paragraph has embedding level 1 because its first strong character is RTL.
/// assert_eq!(bidi_info.paragraphs.len(), 1);
/// let para = &bidi_info.paragraphs[0];
/// assert_eq!(para.level.number(), 1);
/// assert!(para.level.is_rtl());
///
/// // Re-ordering is done after wrapping each paragraph into a sequence of
/// // lines. For this example, I'll just use a single line that spans the
/// // entire paragraph.
/// let line = para.range.clone();
///
/// let display = bidi_info.reorder_line(para, line);
/// assert_eq!(display, concat!["a", // LTR#1
/// "b", // LTR#2
/// "c", // LTR#3
/// "ג", // RTL#3
/// "ב", // RTL#2
/// "א", // RTL#1
/// ]);
/// ```
#[cfg(feature = "unicode_bidi")]
impl unicode_bidi::data_source::BidiDataSource for crate::CodePointMapDataBorrowed<'_, BidiClass> {
    // Looks up the ICU4X `BidiClass` of `c` in this map, then converts it to
    // the `unicode_bidi` representation through the `From` impl.
    fn bidi_class(&self, c: char) -> unicode_bidi::BidiClass {
        let icu_class: BidiClass = self.get(c);
        unicode_bidi::BidiClass::from(icu_class)
    }
}
#[cfg(feature = "unicode_bidi")]
impl From<BidiClass> for unicode_bidi::BidiClass {
    // Maps each ICU4X bidi class onto the `unicode_bidi` variant with the
    // matching abbreviation. `Self` is `unicode_bidi::BidiClass` here.
    fn from(value: BidiClass) -> Self {
        match value {
            BidiClass::LeftToRight => Self::L,
            BidiClass::RightToLeft => Self::R,
            BidiClass::EuropeanNumber => Self::EN,
            BidiClass::EuropeanSeparator => Self::ES,
            BidiClass::EuropeanTerminator => Self::ET,
            BidiClass::ArabicNumber => Self::AN,
            BidiClass::CommonSeparator => Self::CS,
            BidiClass::ParagraphSeparator => Self::B,
            BidiClass::SegmentSeparator => Self::S,
            BidiClass::WhiteSpace => Self::WS,
            BidiClass::OtherNeutral => Self::ON,
            BidiClass::LeftToRightEmbedding => Self::LRE,
            BidiClass::LeftToRightOverride => Self::LRO,
            BidiClass::ArabicLetter => Self::AL,
            BidiClass::RightToLeftEmbedding => Self::RLE,
            BidiClass::RightToLeftOverride => Self::RLO,
            BidiClass::PopDirectionalFormat => Self::PDF,
            BidiClass::NonspacingMark => Self::NSM,
            BidiClass::BoundaryNeutral => Self::BN,
            BidiClass::FirstStrongIsolate => Self::FSI,
            BidiClass::LeftToRightIsolate => Self::LRI,
            BidiClass::RightToLeftIsolate => Self::RLI,
            BidiClass::PopDirectionalIsolate => Self::PDI,
            // This must not happen; any value not listed above is treated as
            // Other Neutral.
            _ => Self::ON,
        }
    }
}
#[cfg(feature = "unicode_bidi")]
impl From<unicode_bidi::BidiClass> for BidiClass {
    // Maps each `unicode_bidi` variant onto the ICU4X bidi class with the
    // matching meaning. The match has no fallback arm, so it covers every
    // `unicode_bidi::BidiClass` variant explicitly.
    fn from(value: unicode_bidi::BidiClass) -> Self {
        use unicode_bidi::BidiClass as Ub;

        match value {
            Ub::L => Self::LeftToRight,
            Ub::R => Self::RightToLeft,
            Ub::EN => Self::EuropeanNumber,
            Ub::ES => Self::EuropeanSeparator,
            Ub::ET => Self::EuropeanTerminator,
            Ub::AN => Self::ArabicNumber,
            Ub::CS => Self::CommonSeparator,
            Ub::B => Self::ParagraphSeparator,
            Ub::S => Self::SegmentSeparator,
            Ub::WS => Self::WhiteSpace,
            Ub::ON => Self::OtherNeutral,
            Ub::LRE => Self::LeftToRightEmbedding,
            Ub::LRO => Self::LeftToRightOverride,
            Ub::AL => Self::ArabicLetter,
            Ub::RLE => Self::RightToLeftEmbedding,
            Ub::RLO => Self::RightToLeftOverride,
            Ub::PDF => Self::PopDirectionalFormat,
            Ub::NSM => Self::NonspacingMark,
            Ub::BN => Self::BoundaryNeutral,
            Ub::FSI => Self::FirstStrongIsolate,
            Ub::LRI => Self::LeftToRightIsolate,
            Ub::RLI => Self::RightToLeftIsolate,
            Ub::PDI => Self::PopDirectionalIsolate,
        }
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use crate::code_point_set::CodePointSetData;
use crate::props::GeneralCategory;
use crate::props::GeneralCategoryGroup;
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A wrapper around code point map data.
///
/// It is returned by APIs that return Unicode
/// property data in a map-like form, ex: enumerated property value data keyed
/// by code point. Access its data via the borrowed version,
/// [`CodePointMapDataBorrowed`].
#[derive(Debug, Clone)]
pub struct CodePointMapData<T: TrieValue> {
// Owned payload holding the property map; `as_borrowed()` hands out a
// reference obtained from it.
data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
}
impl<T: TrieValue> CodePointMapData<T> {
    /// Returns the compiled-data map for the [`EnumeratedProperty`] `T`.
    ///
    /// Note that this constructor yields the borrowed form,
    /// [`CodePointMapDataBorrowed`], rather than `Self`.
    ///
    /// See the documentation on [`EnumeratedProperty`] implementations for details.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> CodePointMapDataBorrowed<'static, T>
    where
        T: EnumeratedProperty,
    {
        CodePointMapDataBorrowed::new()
    }

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable(
        provider: &(impl DataProvider<T::DataMarker> + ?Sized),
    ) -> Result<Self, DataError>
    where
        T: EnumeratedProperty,
    {
        // Load with a default request, then erase the marker type of the payload.
        let response = provider.load(Default::default())?;
        Ok(Self {
            data: response.payload.cast(),
        })
    }

    /// Produces the borrowed, queryable view of this map.
    ///
    /// Doing this once up front avoids a potential small underlying cost on
    /// every API call (like `get()`).
    ///
    /// This owned version is returned by functions that use a runtime data provider.
    #[inline]
    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
        let map = self.data.get();
        CodePointMapDataBorrowed { map }
    }

    /// Convert this map to a map around another type
    ///
    /// Typically useful for type-erasing maps into maps around integers.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Panics
    /// Will panic if T and P are different sizes
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::CodePointMapData;
    /// use icu::properties::props::GeneralCategory;
    ///
    /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
    ///
    /// let gc = data.try_into_converted::<u8>().unwrap();
    /// let gc = gc.as_borrowed();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
    where
        P: TrieValue,
    {
        // Convert the map inside the payload, propagating any conversion error,
        // then wrap the converted payload again.
        let converted = self
            .data
            .try_map_project(|data, _| data.try_into_converted())?;
        Ok(CodePointMapData::from_data::<
            ErasedMarker<PropertyCodePointMap<'static, P>>,
        >(converted))
    }

    /// Wraps an already-loaded payload, erasing its marker type.
    ///
    /// This is crate-internal; public construction goes through constructors
    /// such as [`Self::try_new_unstable`] or [`Self::from_code_point_trie`].
    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
    where
        M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
    {
        let data = data.cast();
        Self { data }
    }

    /// Construct a new one from an owned [`CodePointTrie`]
    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        let map = PropertyCodePointMap::from_code_point_trie(trie);
        let payload =
            DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(map);
        CodePointMapData::from_data(payload)
    }

    /// Convert this type to a [`CodePointTrie`] as a borrowed value.
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
    /// added, and users may select which at data generation time.
    ///
    /// This method returns an `Option` in order to return `None` when the backing data provider
    /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
    /// constraint.
    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
        let map = self.data.get();
        map.as_code_point_trie()
    }

    /// Convert this type to a [`CodePointTrie`], borrowing if possible,
    /// otherwise allocating a new [`CodePointTrie`].
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
    /// added, and users may select which at data generation time.
    ///
    /// The performance of the conversion to this specific return type will vary
    /// depending on the data structure that is backing `self`.
    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        let map = self.data.get();
        map.to_code_point_trie()
    }
}
/// A borrowed wrapper around code point map data, returned by
/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
// Reference to the underlying property map; every query method forwards to it.
map: &'a PropertyCodePointMap<'a, T>,
}
impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
    /// Looks up the value that this map associates with the code point `ch`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::CodePointMapData;
    /// use icu::properties::props::GeneralCategory;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
    /// ```
    #[inline]
    pub fn get(self, ch: char) -> T {
        let Self { map } = self;
        map.get(ch)
    }

    /// Same lookup as [`Self::get`], for a code point given as a `u32`.
    #[inline]
    pub fn get32(self, ch: u32) -> T {
        let Self { map } = self;
        map.get32(ch)
    }

    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// let other_letter_set_data =
    ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
    /// let other_letter_set = other_letter_set_data.as_borrowed();
    ///
    /// assert!(other_letter_set.contains('木')); // U+6728
    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn get_set_for_value(self, value: T) -> CodePointSetData {
        CodePointSetData::from_code_point_inversion_list(self.map.get_set_for_value(value))
    }

    /// Yields an [`Iterator`] returning ranges of consecutive code points that
    /// share the same value in the [`CodePointMapData`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges();
    /// let next = ranges.next().unwrap();
    /// assert_eq!(next.range, 0..=31);
    /// assert_eq!(next.value, GeneralCategory::Control);
    /// let next = ranges.next().unwrap();
    /// assert_eq!(next.range, 32..=32);
    /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
    /// ```
    pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
        let Self { map } = self;
        map.iter_ranges()
    }

    /// Yields an [`Iterator`] returning ranges of consecutive code points that
    /// share the same value `v` in the [`CodePointMapData`].
    ///
    /// # Examples
    ///
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
    /// ```
    pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Keep only the ranges whose value equals `val`, dropping the value itself.
        self.map
            .iter_ranges()
            .filter_map(move |entry| (entry.value == val).then_some(entry.range))
    }

    /// Yields an [`Iterator`] returning ranges of consecutive code points that
    /// do *not* have the value `v` in the [`CodePointMapData`].
    pub fn iter_ranges_for_value_complemented(
        self,
        val: T,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Map every value to "differs from `val`", then keep only the ranges
        // for which that flag is true.
        self.map
            .iter_ranges_mapped(move |candidate| candidate != val)
            .filter_map(|entry| entry.value.then_some(entry.range))
    }

    /// Exposed for FFI needs, could be exposed in general in the future but we should
    /// have a use case first.
    ///
    /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
    #[doc(hidden)] // used by FFI code
    pub fn iter_ranges_mapped<U: Eq + 'a>(
        self,
        predicate: impl FnMut(T) -> U + Copy + 'a,
    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
        let Self { map } = self;
        map.iter_ranges_mapped(predicate)
    }
}
impl CodePointMapDataBorrowed<'_, GeneralCategory> {
    /// Builds a [`CodePointSetData`] holding every code point whose
    /// `General_Category` value belongs to the given [`GeneralCategoryGroup`].
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// let other_letter_set_data =
    ///     gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
    /// let other_letter_set = other_letter_set_data.as_borrowed();
    ///
    /// assert!(other_letter_set.contains('木')); // U+6728
    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
        // Keep only the ranges whose category is a member of the group, then
        // collect them into the inversion list that backs the resulting set.
        let group_ranges = self
            .iter_ranges()
            .filter_map(|entry| value.contains(entry.value).then_some(entry.range))
            .collect();
        CodePointSetData::from_code_point_inversion_list(group_ranges)
    }
}
// `Default` simply forwards to the compiled-data constructor `new()`, so this impl
// is only available when the `compiled_data` feature is enabled.
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
fn default() -> Self {
Self::new()
}
}
impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
/// Creates a new [`CodePointMapDataBorrowed`] for an [`EnumeratedProperty`].
///
/// The returned value wraps the property's `SINGLETON` map.
///
/// See the documentation on [`EnumeratedProperty`] implementations for details.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self
where
T: EnumeratedProperty,
{
CodePointMapDataBorrowed { map: T::SINGLETON }
}
/// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
///
/// The payload is built from the `'static` reference held by `self`.
///
/// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
/// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
pub const fn static_to_owned(self) -> CodePointMapData<T> {
CodePointMapData {
data: DataPayload::from_static_ref(self.map),
}
}
}
impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
    /// Yields an [`Iterator`] over the inclusive ranges of consecutive code points
    /// whose `General_Category` value is a member of the given [`GeneralCategoryGroup`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
    /// ```
    pub fn iter_ranges_for_group(
        self,
        group: GeneralCategoryGroup,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Map every category to "is in `group`", then keep only the `true` ranges.
        self.map
            .iter_ranges_mapped(move |category| group.contains(category))
            .filter_map(|entry| entry.value.then_some(entry.range))
    }
}
/// A Unicode character property that assigns a value to each code point.
///
/// The descriptions of most properties are taken from [`TR44`], the documentation for the
/// Unicode Character Database.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
///
/// [`TR44`]: https://www.unicode.org/reports/tr44
pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
/// Data marker whose data struct is the [`PropertyCodePointMap`] for this property.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
/// Static map for this property; only present with the `compiled_data` feature.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
/// The name of this property
const NAME: &'static [u8];
/// The abbreviated name of this property, if it exists, otherwise the name
const SHORT_NAME: &'static [u8];
/// Convenience method for `CodePointMapData::new().get(ch)`
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn for_char(ch: char) -> Self {
CodePointMapData::new().get(ch)
}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A set of Unicode code points. Access its data via the borrowed version,
/// [`CodePointSetDataBorrowed`].
///
/// # Example
///
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[derive(Debug)]
pub struct CodePointSetData {
// Payload stored under the erased marker, so one type serves every binary property.
data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
}
impl CodePointSetData {
/// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[expect(clippy::new_ret_no_self)]
#[cfg(feature = "compiled_data")]
pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
CodePointSetDataBorrowed::new::<P>()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable<P: BinaryProperty>(
provider: &(impl DataProvider<P::DataMarker> + ?Sized),
) -> Result<CodePointSetData, DataError> {
Ok(CodePointSetData::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This owned version is returned by functions that use a runtime data provider.
#[inline]
pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
CodePointSetDataBorrowed {
set: self.data.get(),
}
}
/// Construct a new one from loaded data
///
/// The payload's marker is cast to the erased marker stored by this type.
///
/// Typically it is preferable to use the public constructors (`new()`,
/// `try_new_unstable()`) instead
pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
where
M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
{
Self { data: data.cast() }
}
/// Construct a new owned [`CodePointSetData`] from a [`CodePointInversionList`]
pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
let set = PropertyCodePointSet::from_code_point_inversion_list(set);
CodePointSetData::from_data(
DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
)
}
/// Convert this type to a [`CodePointInversionList`] as a borrowed value.
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// This method returns an `Option` in order to return `None` when the backing data provider
/// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
/// constraint.
pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
self.data.get().as_code_point_inversion_list()
}
/// Convert this type to a [`CodePointInversionList`], borrowing if possible,
/// otherwise allocating a new [`CodePointInversionList`].
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// The performance of the conversion to this specific return type will vary
/// depending on the data structure that is backing `self`.
pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
self.data.get().to_code_point_inversion_list()
}
}
/// A borrowed wrapper around code point set data, returned by
/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct CodePointSetDataBorrowed<'a> {
// Reference to the set data that every query method delegates to.
set: &'a PropertyCodePointSet<'a>,
}
impl CodePointSetDataBorrowed<'static> {
/// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
///
/// The returned value wraps the property's `SINGLETON` set.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[inline]
#[cfg(feature = "compiled_data")]
pub const fn new<P: BinaryProperty>() -> Self {
CodePointSetDataBorrowed { set: P::SINGLETON }
}
/// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
///
/// The payload is built from the `'static` reference held by `self`.
///
/// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
/// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
pub const fn static_to_owned(self) -> CodePointSetData {
CodePointSetData {
data: DataPayload::from_static_ref(self.set),
}
}
}
impl<'a> CodePointSetDataBorrowed<'a> {
/// Check if the set contains a character
///
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[inline]
pub fn contains(self, ch: char) -> bool {
self.set.contains(ch)
}
/// See [`Self::contains`].
#[inline]
pub fn contains32(self, ch: u32) -> bool {
self.set.contains32(ch)
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// included in the [`CodePointSetData`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
///
/// # Example
///
/// ```
/// use icu::properties::props::Alphabetic;
/// use icu::properties::CodePointSetData;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
/// let mut ranges = alphabetic.iter_ranges();
///
/// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
/// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
/// ```
#[inline]
pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.set.iter_ranges()
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// *not* included in the [`CodePointSetData`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
///
/// # Example
///
/// ```
/// use icu::properties::props::Alphabetic;
/// use icu::properties::CodePointSetData;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
/// let mut ranges = alphabetic.iter_ranges_complemented();
///
/// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A'
/// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // between 'Z' and 'a'
/// ```
#[inline]
pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.set.iter_ranges_complemented()
}
}
/// A binary Unicode character property.
///
/// The descriptions of most properties are taken from [`TR44`], the documentation for the
/// Unicode Character Database. Some properties are instead defined in [`TR18`], the
/// documentation for Unicode regular expressions. In particular, Annex C of this document
/// defines properties for POSIX compatibility.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
///
/// [`TR44`]: https://www.unicode.org/reports/tr44
/// [`TR18`]: https://www.unicode.org/reports/tr18
pub trait BinaryProperty: crate::private::Sealed + Sized {
/// Data marker whose data struct is the [`PropertyCodePointSet`] for this property.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
/// Static set for this property; only present with the `compiled_data` feature.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyCodePointSet<'static>;
/// The name of this property
const NAME: &'static [u8];
/// The abbreviated name of this property, if it exists, otherwise the name
const SHORT_NAME: &'static [u8];
/// Convenience method for `CodePointSetData::new().contains(ch)`
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn for_char(ch: char) -> bool {
CodePointSetData::new::<Self>().contains(ch)
}
}
#[cfg(test)]
mod tests {
    /// The `Number` group contains digits from several scripts but no letters.
    #[test]
    fn test_general_category() {
        use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
        use icu::properties::CodePointMapData;

        let number_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value_group(GeneralCategoryGroup::Number);
        let numbers = number_data.as_borrowed();
        for digit in [
            '5',
            '\u{0665}', // U+0665 ARABIC-INDIC DIGIT FIVE
            '\u{096b}', // U+096B DEVANAGARI DIGIT FIVE
        ] {
            assert!(numbers.contains(digit));
        }
        assert!(!numbers.contains('A'));
    }

    /// The set for `Script::Thai` contains Thai letters and digits, but neither
    /// Latin letters nor the baht sign.
    #[test]
    fn test_script() {
        use icu::properties::props::Script;
        use icu::properties::CodePointMapData;

        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
        let thai = thai_data.as_borrowed();
        let expectations = [
            ('\u{0e01}', true), // U+0E01 THAI CHARACTER KO KAI
            ('\u{0e50}', true), // U+0E50 THAI DIGIT ZERO
            ('A', false),
            ('\u{0e3f}', false), // U+0E3F THAI CURRENCY SYMBOL BAHT
        ];
        for (ch, expected) in expectations {
            assert_eq!(thai.contains(ch), expected);
        }
    }

    /// Every `GeneralCategoryGroup` set equals the union of the sets of its
    /// member categories.
    #[test]
    fn test_gc_groupings() {
        use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
        use icu::properties::CodePointMapData;
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;

        /// Compares the set for `category` with the union of the per-value sets
        /// of `subcategories`.
        fn assert_group_is_union(
            category: GeneralCategoryGroup,
            subcategories: &[GeneralCategory],
        ) {
            let group_data =
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
            let group_list = group_data
                .as_code_point_inversion_list()
                .expect("The data should be valid");

            let mut union_builder = CodePointInversionListBuilder::new();
            for subcategory in subcategories.iter().copied() {
                let member_data =
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
                member_data
                    .as_borrowed()
                    .iter_ranges()
                    .for_each(|range| union_builder.add_range32(range));
            }
            let union_list = union_builder.build();

            println!("{category:?} {subcategories:?}");
            assert_eq!(
                group_list.get_inversion_list_vec(),
                union_list.get_inversion_list_vec()
            );
        }

        let cases: [(GeneralCategoryGroup, &[GeneralCategory]); 7] = [
            (
                GeneralCategoryGroup::Letter,
                &[
                    GeneralCategory::UppercaseLetter,
                    GeneralCategory::LowercaseLetter,
                    GeneralCategory::TitlecaseLetter,
                    GeneralCategory::ModifierLetter,
                    GeneralCategory::OtherLetter,
                ],
            ),
            (
                GeneralCategoryGroup::Other,
                &[
                    GeneralCategory::Control,
                    GeneralCategory::Format,
                    GeneralCategory::Unassigned,
                    GeneralCategory::PrivateUse,
                    GeneralCategory::Surrogate,
                ],
            ),
            (
                GeneralCategoryGroup::Mark,
                &[
                    GeneralCategory::SpacingMark,
                    GeneralCategory::EnclosingMark,
                    GeneralCategory::NonspacingMark,
                ],
            ),
            (
                GeneralCategoryGroup::Number,
                &[
                    GeneralCategory::DecimalNumber,
                    GeneralCategory::LetterNumber,
                    GeneralCategory::OtherNumber,
                ],
            ),
            (
                GeneralCategoryGroup::Punctuation,
                &[
                    GeneralCategory::ConnectorPunctuation,
                    GeneralCategory::DashPunctuation,
                    GeneralCategory::ClosePunctuation,
                    GeneralCategory::FinalPunctuation,
                    GeneralCategory::InitialPunctuation,
                    GeneralCategory::OtherPunctuation,
                    GeneralCategory::OpenPunctuation,
                ],
            ),
            (
                GeneralCategoryGroup::Symbol,
                &[
                    GeneralCategory::CurrencySymbol,
                    GeneralCategory::ModifierSymbol,
                    GeneralCategory::MathSymbol,
                    GeneralCategory::OtherSymbol,
                ],
            ),
            (
                GeneralCategoryGroup::Separator,
                &[
                    GeneralCategory::LineSeparator,
                    GeneralCategory::ParagraphSeparator,
                    GeneralCategory::SpaceSeparator,
                ],
            ),
        ];
        for (category, subcategories) in cases {
            assert_group_is_union(category, subcategories);
        }
    }

    /// Surrogate code points are only reachable through the `u32` query API.
    #[test]
    fn test_gc_surrogate() {
        use icu::properties::props::GeneralCategory;
        use icu::properties::CodePointMapData;

        let surrogate_data = CodePointMapData::<GeneralCategory>::new()
            .get_set_for_value(GeneralCategory::Surrogate);
        let surrogates = surrogate_data.as_borrowed();
        for code_point in [0xd800, 0xd900, 0xdfff] {
            assert!(surrogates.contains32(code_point));
        }
        assert!(!surrogates.contains('A'));
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::provider::*;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A wrapper around `UnicodeSet` data (characters and strings)
///
/// Query it through the borrowed version, [`EmojiSetDataBorrowed`], obtained
/// from [`EmojiSetData::as_borrowed()`].
#[derive(Debug)]
pub struct EmojiSetData {
// Payload stored under the erased marker, so one type serves every emoji set.
data: DataPayload<ErasedMarker<PropertyUnicodeSet<'static>>>,
}
impl EmojiSetData {
/// Creates a new [`EmojiSetDataBorrowed`] for a [`EmojiSet`].
///
/// See the documentation on [`EmojiSet`] implementations for details.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub const fn new<P: EmojiSet>() -> EmojiSetDataBorrowed<'static> {
EmojiSetDataBorrowed::new::<P>()
}
/// A version of `new()` that uses custom data provided by a [`DataProvider`].
///
/// Note that this will return an owned version of the data. Functionality is available on
/// the borrowed version, accessible through [`EmojiSetData::as_borrowed`].
pub fn try_new_unstable<P: EmojiSet>(
provider: &(impl DataProvider<P::DataMarker> + ?Sized),
) -> Result<EmojiSetData, DataError> {
Ok(EmojiSetData::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> EmojiSetDataBorrowed<'_> {
EmojiSetDataBorrowed {
set: self.data.get(),
}
}
/// Construct a new one from loaded data
///
/// The payload's marker is cast to the erased marker stored by this type.
///
/// Typically it is preferable to use getters instead
pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
where
M: DynamicDataMarker<DataStruct = PropertyUnicodeSet<'static>>,
{
Self { data: data.cast() }
}
/// Construct a new owned [`EmojiSetData`] from a [`CodePointInversionListAndStringList`]
pub fn from_code_point_inversion_list_string_list(
set: CodePointInversionListAndStringList<'static>,
) -> Self {
let set = PropertyUnicodeSet::from_code_point_inversion_list_string_list(set);
EmojiSetData::from_data(
DataPayload::<ErasedMarker<PropertyUnicodeSet<'static>>>::from_owned(set),
)
}
/// Convert this type to a [`CodePointInversionListAndStringList`] as a borrowed value.
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// This method returns an `Option` in order to return `None` when the backing data provider
/// cannot return a [`CodePointInversionListAndStringList`], or cannot do so within the expected constant time
/// constraint.
pub fn as_code_point_inversion_list_string_list(
&self,
) -> Option<&CodePointInversionListAndStringList<'_>> {
self.data.get().as_code_point_inversion_list_string_list()
}
/// Convert this type to a [`CodePointInversionListAndStringList`], borrowing if possible,
/// otherwise allocating a new [`CodePointInversionListAndStringList`].
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// The performance of the conversion to this specific return type will vary
/// depending on the data structure that is backing `self`.
pub fn to_code_point_inversion_list_string_list(
&self,
) -> CodePointInversionListAndStringList<'_> {
self.data.get().to_code_point_inversion_list_string_list()
}
}
/// A borrowed wrapper around emoji set data, returned by
/// [`EmojiSetData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct EmojiSetDataBorrowed<'a> {
// Reference to the set data that every query method delegates to.
set: &'a PropertyUnicodeSet<'a>,
}
impl EmojiSetDataBorrowed<'_> {
/// Check if the set contains the string. Strings consisting of one character
/// are treated as a character/code point.
///
/// This matches ICU behavior for ICU's `UnicodeSet`.
#[inline]
pub fn contains_str(self, s: &str) -> bool {
self.set.contains_str(s)
}
/// Check if the set contains the code point.
#[inline]
pub fn contains(self, ch: char) -> bool {
self.set.contains(ch)
}
/// See [`Self::contains`]. Takes the code point as a `u32` instead of a `char`.
#[inline]
pub fn contains32(self, cp: u32) -> bool {
self.set.contains32(cp)
}
}
impl EmojiSetDataBorrowed<'static> {
/// Creates a new [`EmojiSetDataBorrowed`] for a [`EmojiSet`].
///
/// The returned value wraps the set's `SINGLETON` data.
///
/// See the documentation on [`EmojiSet`] implementations for details.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[inline]
#[cfg(feature = "compiled_data")]
pub const fn new<P: EmojiSet>() -> Self {
EmojiSetDataBorrowed { set: P::SINGLETON }
}
/// Cheaply converts a [`EmojiSetDataBorrowed<'static>`] into a [`EmojiSetData`].
///
/// The payload is built from the `'static` reference held by `self`.
///
/// Note: Due to branching and indirection, using [`EmojiSetData`] might inhibit some
/// compile-time optimizations that are possible with [`EmojiSetDataBorrowed`].
pub const fn static_to_owned(self) -> EmojiSetData {
EmojiSetData {
data: DataPayload::from_static_ref(self.set),
}
}
}
/// An Emoji set as defined by [`Unicode Technical Standard #51`](https://unicode.org/reports/tr51/#Emoji_Sets).
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait EmojiSet: crate::private::Sealed {
/// Data marker whose data struct is the [`PropertyUnicodeSet`] for this set.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyUnicodeSet<'static>>;
/// Static data for this set; only present with the `compiled_data` feature.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyUnicodeSet<'static>;
/// The name of this property
const NAME: &'static [u8];
/// The abbreviated name of this property, if it exists, otherwise the name
const SHORT_NAME: &'static [u8];
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Definitions of [Unicode Properties] and APIs for
//! retrieving property data in an appropriate data structure.
//!
//! This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! APIs that return a [`CodePointSetData`] exist for binary properties and certain enumerated
//! properties.
//!
//! APIs that return a [`CodePointMapData`] exist for certain enumerated properties.
//!
//! # Examples
//!
//! ## Property data as `CodePointSetData`s
//!
//! ```
//! use icu::properties::{CodePointSetData, CodePointMapData};
//! use icu::properties::props::{GeneralCategory, Emoji};
//!
//! // A binary property as a `CodePointSetData`
//!
//! assert!(CodePointSetData::new::<Emoji>().contains('🎃')); // U+1F383 JACK-O-LANTERN
//! assert!(!CodePointSetData::new::<Emoji>().contains('木')); // U+6728
//!
//! // An individual enumerated property value as a `CodePointSetData`
//!
//! let line_sep_data = CodePointMapData::<GeneralCategory>::new()
//! .get_set_for_value(GeneralCategory::LineSeparator);
//! let line_sep = line_sep_data.as_borrowed();
//!
//! assert!(line_sep.contains('\u{2028}'));
//! assert!(!line_sep.contains('\u{2029}'));
//! ```
//!
//! ## Property data as `CodePointMapData`s
//!
//! ```
//! use icu::properties::CodePointMapData;
//! use icu::properties::props::Script;
//!
//! assert_eq!(CodePointMapData::<Script>::new().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
//! assert_eq!(CodePointMapData::<Script>::new().get('木'), Script::Han); // U+6728
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html
//! [`CodePointSetData`]: crate::CodePointSetData
//! [`CodePointMapData`]: crate::CodePointMapData
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
mod code_point_set;
pub use code_point_set::{CodePointSetData, CodePointSetDataBorrowed};
mod code_point_map;
pub use code_point_map::{CodePointMapData, CodePointMapDataBorrowed};
mod emoji;
pub use emoji::{EmojiSetData, EmojiSetDataBorrowed};
mod names;
pub use names::{
PropertyNamesLong, PropertyNamesLongBorrowed, PropertyNamesShort, PropertyNamesShortBorrowed,
PropertyParser, PropertyParserBorrowed,
};
mod runtime;
// NOTE: The Pernosco debugger has special knowledge
// of the `CanonicalCombiningClass` struct inside the `props`
// module. Please do not change the crate-module-qualified
// name of that struct without coordination.
pub mod props;
pub mod provider;
pub mod script;
mod bidi;
mod trievalue;
// The module is private while the trait inside it is `pub`, so the crate's public
// property traits can name `crate::private::Sealed` as a supertrait without code
// outside this crate being able to implement it.
mod private {
pub trait Sealed {}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::props::*;
use crate::provider::names::*;
use core::marker::PhantomData;
use icu_collections::codepointtrie::TrieValue;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use yoke::Yokeable;
use zerotrie::cursor::ZeroTrieSimpleAsciiCursor;
/// A struct capable of looking up a property value from a string name.
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyParserBorrowed`].
///
/// The name can be a short name (`Lu`), a long name (`Uppercase_Letter`),
/// or an alias.
///
/// Property names can be looked up using "strict" matching (looking for a name
/// that matches exactly), or "loose matching", where the name is allowed to deviate
/// in terms of ASCII casing, whitespace, underscores, and hyphens.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// // short name for value
/// assert_eq!(
/// lookup.get_strict("Lu"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_strict("Pd"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // long name for value
/// assert_eq!(
/// lookup.get_strict("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_strict("Dash_Punctuation"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // name has incorrect casing
/// assert_eq!(lookup.get_strict("dashpunctuation"), None);
/// // loose matching of name
/// assert_eq!(
/// lookup.get_loose("dash-punctuation"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // fake property
/// assert_eq!(lookup.get_strict("Animated_Gif"), None);
/// ```
#[derive(Debug)]
pub struct PropertyParser<T> {
// Name-to-enum map payload, stored under the erased marker.
map: DataPayload<ErasedMarker<PropertyValueNameToEnumMap<'static>>>,
// Ties the parser to the property type `T` without storing a `T`.
markers: PhantomData<fn() -> T>,
}
/// A borrowed wrapper around property value name-to-enum data, returned by
/// [`PropertyParser::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyParserBorrowed<'a, T> {
// Reference to the name-to-enum map that the lookup methods query.
map: &'a PropertyValueNameToEnumMap<'a>,
// Ties the parser to the property type `T` without storing a `T`.
markers: PhantomData<fn() -> T>,
}
// Written by hand with no bound on `T` (presumably because `#[derive(Clone)]`
// would add a `T: Clone` bound); the type is `Copy`, so cloning is a plain copy.
impl<T> Clone for PropertyParserBorrowed<'_, T> {
fn clone(&self) -> Self {
*self
}
}
// `Copy` for every `T`: the struct holds only a shared reference and a `PhantomData`.
impl<T> Copy for PropertyParserBorrowed<'_, T> {}
impl<T> PropertyParser<T> {
/// Creates a new instance of `PropertyParser<T>` using compiled data.
///
/// Note that this returns the borrowed form, [`PropertyParserBorrowed`].
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> PropertyParserBorrowed<'static, T>
where
T: ParseableEnumeratedProperty,
{
PropertyParserBorrowed::new()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<T::DataMarker> + ?Sized),
) -> Result<Self, DataError>
where
T: ParseableEnumeratedProperty,
{
Ok(Self {
map: provider.load(Default::default())?.payload.cast(),
markers: PhantomData,
})
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (like `get_strict()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> PropertyParserBorrowed<'_, T> {
PropertyParserBorrowed {
map: self.map.get(),
markers: PhantomData,
}
}
/// Converts this parser into a `PropertyParser<u16>`, re-casting the payload and
/// dropping the association with the property type `T`.
#[doc(hidden)] // used by FFI code
pub fn erase(self) -> PropertyParser<u16> {
PropertyParser {
map: self.map.cast(),
markers: PhantomData,
}
}
}
impl<T: TrieValue> PropertyParserBorrowed<'_, T> {
    /// Looks up `name` with strict (exact) matching and returns the property
    /// value as a raw `u16`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::PropertyParser;
    ///
    /// let lookup = PropertyParser::<GeneralCategory>::new();
    /// assert_eq!(
    ///     lookup.get_strict_u16("Lu"),
    ///     Some(GeneralCategory::UppercaseLetter as u16)
    /// );
    /// assert_eq!(
    ///     lookup.get_strict_u16("Uppercase_Letter"),
    ///     Some(GeneralCategory::UppercaseLetter as u16)
    /// );
    /// // does not do loose matching
    /// assert_eq!(lookup.get_strict_u16("UppercaseLetter"), None);
    /// ```
    #[inline]
    pub fn get_strict_u16(self, name: &str) -> Option<u16> {
        get_strict_u16(self.map, name)
    }

    /// Looks up `name` with strict (exact) matching and returns the property
    /// value as a `T`.
    ///
    /// Returns `None` when the name is not found or when the stored value cannot
    /// be converted into a `T`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::PropertyParser;
    ///
    /// let lookup = PropertyParser::<GeneralCategory>::new();
    /// assert_eq!(
    ///     lookup.get_strict("Lu"),
    ///     Some(GeneralCategory::UppercaseLetter)
    /// );
    /// assert_eq!(
    ///     lookup.get_strict("Uppercase_Letter"),
    ///     Some(GeneralCategory::UppercaseLetter)
    /// );
    /// // does not do loose matching
    /// assert_eq!(lookup.get_strict("UppercaseLetter"), None);
    /// ```
    #[inline]
    pub fn get_strict(self, name: &str) -> Option<T> {
        self.get_strict_u16(name)
            .and_then(|raw| T::try_from_u32(u32::from(raw)).ok())
    }

    /// Looks up `name` with loose matching (case-insensitive, ignoring ASCII
    /// hyphens, underscores, and whitespaces) and returns the property value as
    /// a raw `u16`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::PropertyParser;
    ///
    /// let lookup = PropertyParser::<GeneralCategory>::new();
    /// assert_eq!(
    ///     lookup.get_loose_u16("Lu"),
    ///     Some(GeneralCategory::UppercaseLetter as u16)
    /// );
    /// assert_eq!(
    ///     lookup.get_loose_u16("Uppercase_Letter"),
    ///     Some(GeneralCategory::UppercaseLetter as u16)
    /// );
    /// // does do loose matching
    /// assert_eq!(
    ///     lookup.get_loose_u16("UppercaseLetter"),
    ///     Some(GeneralCategory::UppercaseLetter as u16)
    /// );
    /// ```
    #[inline]
    pub fn get_loose_u16(self, name: &str) -> Option<u16> {
        get_loose_u16(self.map, name)
    }

    /// Looks up `name` with loose matching (case-insensitive, ignoring ASCII
    /// hyphens, underscores, and whitespaces) and returns the property value as
    /// a `T`.
    ///
    /// Returns `None` when the name is not found or when the stored value cannot
    /// be converted into a `T`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::PropertyParser;
    ///
    /// let lookup = PropertyParser::<GeneralCategory>::new();
    /// assert_eq!(
    ///     lookup.get_loose("Lu"),
    ///     Some(GeneralCategory::UppercaseLetter)
    /// );
    /// assert_eq!(
    ///     lookup.get_loose("Uppercase_Letter"),
    ///     Some(GeneralCategory::UppercaseLetter)
    /// );
    /// // does do loose matching
    /// assert_eq!(
    ///     lookup.get_loose("UppercaseLetter"),
    ///     Some(GeneralCategory::UppercaseLetter)
    /// );
    /// ```
    #[inline]
    pub fn get_loose(self, name: &str) -> Option<T> {
        self.get_loose_u16(name)
            .and_then(|raw| T::try_from_u32(u32::from(raw)).ok())
    }
}
#[cfg(feature = "compiled_data")]
impl<T: ParseableEnumeratedProperty> Default for PropertyParserBorrowed<'static, T> {
    /// Same as calling `new()`: a parser backed by compiled data.
    fn default() -> Self {
        PropertyParserBorrowed::new()
    }
}
impl<T: TrieValue> PropertyParserBorrowed<'static, T> {
    /// Builds a `PropertyParserBorrowed<T>` backed by compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self
    where
        T: ParseableEnumeratedProperty,
    {
        // The baked singleton for `T` is the whole data source.
        let map = T::SINGLETON;
        Self {
            map,
            markers: PhantomData,
        }
    }

    /// Cheaply turns a [`PropertyParserBorrowed<'static>`] into a [`PropertyParser`].
    ///
    /// Note: Because of branching and indirection, [`PropertyParser`] might inhibit some
    /// compile-time optimizations that are possible with [`PropertyParserBorrowed`].
    pub const fn static_to_owned(self) -> PropertyParser<T> {
        PropertyParser {
            map: DataPayload::from_static_ref(self.map),
            markers: PhantomData,
        }
    }
}
/// Strict (exact-match) lookup shared by all property types.
///
/// Kept as a free, non-generic function to avoid monomorphizing multiple copies.
/// Returns `None` if `name` is absent or the stored value does not fit in a `u16`.
fn get_strict_u16(payload: &PropertyValueNameToEnumMap<'_>, name: &str) -> Option<u16> {
    let index = payload.map.get(name)?;
    index.try_into().ok()
}
/// Avoid monomorphizing multiple copies of this function
///
/// Loose lookup of `name` in the name trie: every input byte is tried in both ASCII
/// cases, and ASCII whitespace, `_` and `-` are ignored both in the input and in the
/// trie. Returns `None` when nothing matches or the found value does not fit in a `u16`.
fn get_loose_u16(payload: &PropertyValueNameToEnumMap<'_>, name: &str) -> Option<u16> {
// Depth-first search: `cursor` is the current trie position, `rest` the unread input.
fn recurse(mut cursor: ZeroTrieSimpleAsciiCursor, mut rest: &[u8]) -> Option<usize> {
// A cursor that reports empty ends this branch without a match.
if cursor.is_empty() {
return None;
}
// Skip whitespace, underscore, hyphen in trie.
// Each ignorable byte is tried on a clone of the cursor with the input left as is
// (0x0B is vertical tab).
for skip in [b'\t', b'\n', b'\x0C', b'\r', b' ', 0x0B, b'_', b'-'] {
let mut skip_cursor = cursor.clone();
skip_cursor.step(skip);
if let Some(r) = recurse(skip_cursor, rest) {
return Some(r);
}
}
// Pull the next significant input byte; once the input is exhausted, the value at
// the current trie position (if any) is the result.
let ascii = loop {
let Some((&a, r)) = rest.split_first() else {
return cursor.take_value();
};
rest = r;
// Skip whitespace, underscore, hyphen in input
if !matches!(
a,
b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | 0x0B | b'_' | b'-'
) {
break a;
}
};
// Step one cursor with the byte as written and a second one with its case flipped
// (non-letters are unchanged by the flip, so both cursors take the same step).
let mut other_case_cursor = cursor.clone();
cursor.step(ascii);
other_case_cursor.step(if ascii.is_ascii_lowercase() {
ascii.to_ascii_uppercase()
} else {
ascii.to_ascii_lowercase()
});
// This uses the call stack as the DFS stack. The recursion will terminate as
// rest's length is strictly shrinking. The call stack's depth is limited by
// name.len().
// NOTE(review): the skip-character recursion above passes `rest` unchanged, so that
// path presumably ends through the `cursor.is_empty()` check instead — confirm.
recurse(cursor, rest).or_else(|| recurse(other_case_cursor, rest))
}
recurse(payload.map.cursor(), name.as_bytes()).and_then(|i| i.try_into().ok())
}
/// A struct capable of looking up a property's long name from a value.
///
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyNamesLongBorrowed`].
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::properties::PropertyNamesLong;
///
/// let names = PropertyNamesLong::<CanonicalCombiningClass>::new();
/// assert_eq!(
/// names.get(CanonicalCombiningClass::KanaVoicing),
/// Some("Kana_Voicing")
/// );
/// assert_eq!(
/// names.get(CanonicalCombiningClass::AboveLeft),
/// Some("Above_Left")
/// );
/// ```
pub struct PropertyNamesLong<T: NamedEnumeratedProperty> {
// Payload holding the long-name data struct of `T`.
map: DataPayload<ErasedMarker<T::DataStructLong>>,
}
impl<T: NamedEnumeratedProperty> core::fmt::Debug for PropertyNamesLong<T> {
    /// Writes only the type name; the `map` payload is not part of the output.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("PropertyNamesLong");
        out.finish()
    }
}
/// A borrowed wrapper around property value-to-long-name data, returned by
/// [`PropertyNamesLong::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyNamesLongBorrowed<'a, T: NamedEnumeratedProperty> {
// Reference to the long-name lookup structure of `T`.
map: &'a T::DataStructLongBorrowed<'a>,
}
// `Clone` is implemented through `Copy`: the struct only stores a shared reference,
// so cloning is a plain copy of `*self`.
impl<T: NamedEnumeratedProperty> Clone for PropertyNamesLongBorrowed<'_, T> {
fn clone(&self) -> Self {
*self
}
}
impl<T: NamedEnumeratedProperty> Copy for PropertyNamesLongBorrowed<'_, T> {}
impl<T: NamedEnumeratedProperty> PropertyNamesLong<T> {
    /// Creates a new instance of `PropertyNamesLongBorrowed<T>` from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub fn new() -> PropertyNamesLongBorrowed<'static, T> {
        PropertyNamesLongBorrowed {
            map: T::SINGLETON_LONG,
        }
    }

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable(
        provider: &(impl DataProvider<T::DataMarkerLong> + ?Sized),
    ) -> Result<Self, DataError> {
        // Load with the default request, then cast the payload to the stored marker type.
        let response = provider.load(Default::default())?;
        Ok(Self {
            map: response.payload.cast(),
        })
    }

    /// Construct a borrowed version of this type that can be queried.
    ///
    /// This avoids a potential small underlying cost per API call (like `get()`) by
    /// consolidating it up front.
    #[inline]
    pub fn as_borrowed(&self) -> PropertyNamesLongBorrowed<'_, T> {
        let yoked = self.map.get();
        PropertyNamesLongBorrowed {
            map: T::nep_long_identity(yoked),
        }
    }
}
impl<'a, T: NamedEnumeratedProperty> PropertyNamesLongBorrowed<'a, T> {
    /// Returns the long name of the given property value, if one is stored.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::properties::props::CanonicalCombiningClass;
    /// use icu::properties::PropertyNamesLong;
    ///
    /// let lookup = PropertyNamesLong::<CanonicalCombiningClass>::new();
    /// assert_eq!(
    ///     lookup.get(CanonicalCombiningClass::KanaVoicing),
    ///     Some("Kana_Voicing")
    /// );
    /// assert_eq!(
    ///     lookup.get(CanonicalCombiningClass::AboveLeft),
    ///     Some("Above_Left")
    /// );
    /// ```
    #[inline]
    pub fn get(self, property: T) -> Option<&'a str> {
        // The lookup structure is keyed on the `u32` form of the value.
        let value = property.to_u32();
        self.map.get(value)
    }
}
#[cfg(feature = "compiled_data")]
impl<T: NamedEnumeratedProperty> Default for PropertyNamesLongBorrowed<'static, T> {
    /// Same as calling `new()`: a lookup backed by compiled data.
    fn default() -> Self {
        PropertyNamesLongBorrowed::new()
    }
}
impl<T: NamedEnumeratedProperty> PropertyNamesLongBorrowed<'static, T> {
    /// Builds a `PropertyNamesLongBorrowed<T>` backed by compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        let map = T::SINGLETON_LONG;
        Self { map }
    }

    /// Cheaply turns a [`PropertyNamesLongBorrowed<'static>`] into a [`PropertyNamesLong`].
    ///
    /// Note: Because of branching and indirection, [`PropertyNamesLong`] might inhibit some
    /// compile-time optimizations that are possible with [`PropertyNamesLongBorrowed`].
    ///
    /// Unlike other `static_to_owned()` functions this one is currently not `const`, since
    /// doing that safely needs const traits.
    pub fn static_to_owned(self) -> PropertyNamesLong<T> {
        let data = T::nep_long_identity_static(self.map);
        PropertyNamesLong {
            map: DataPayload::from_static_ref(data),
        }
    }
}
/// A struct capable of looking up a property's short name from a value.
///
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyNamesShortBorrowed`].
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::properties::PropertyNamesShort;
///
/// let names = PropertyNamesShort::<CanonicalCombiningClass>::new();
/// assert_eq!(names.get(CanonicalCombiningClass::KanaVoicing), Some("KV"));
/// assert_eq!(names.get(CanonicalCombiningClass::AboveLeft), Some("AL"));
/// ```
pub struct PropertyNamesShort<T: NamedEnumeratedProperty> {
// Payload holding the short-name data struct of `T`.
map: DataPayload<ErasedMarker<T::DataStructShort>>,
}
impl<T: NamedEnumeratedProperty> core::fmt::Debug for PropertyNamesShort<T> {
    /// Writes only the type name; the `map` payload is not part of the output.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut out = f.debug_struct("PropertyNamesShort");
        out.finish()
    }
}
/// A borrowed wrapper around property value-to-short-name data, returned by
/// [`PropertyNamesShort::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyNamesShortBorrowed<'a, T: NamedEnumeratedProperty> {
// Reference to the short-name lookup structure of `T`.
map: &'a T::DataStructShortBorrowed<'a>,
}
// `Clone` is implemented through `Copy`: the struct only stores a shared reference,
// so cloning is a plain copy of `*self`.
impl<T: NamedEnumeratedProperty> Clone for PropertyNamesShortBorrowed<'_, T> {
fn clone(&self) -> Self {
*self
}
}
impl<T: NamedEnumeratedProperty> Copy for PropertyNamesShortBorrowed<'_, T> {}
impl<T: NamedEnumeratedProperty> PropertyNamesShort<T> {
    /// Creates a new instance of `PropertyNamesShortBorrowed<T>` from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub fn new() -> PropertyNamesShortBorrowed<'static, T> {
        PropertyNamesShortBorrowed {
            map: T::SINGLETON_SHORT,
        }
    }

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable(
        provider: &(impl DataProvider<T::DataMarkerShort> + ?Sized),
    ) -> Result<Self, DataError> {
        // Load with the default request, then cast the payload to the stored marker type.
        let response = provider.load(Default::default())?;
        Ok(Self {
            map: response.payload.cast(),
        })
    }

    /// Construct a borrowed version of this type that can be queried.
    ///
    /// This avoids a potential small underlying cost per API call (like `get()`) by
    /// consolidating it up front.
    #[inline]
    pub fn as_borrowed(&self) -> PropertyNamesShortBorrowed<'_, T> {
        let yoked = self.map.get();
        PropertyNamesShortBorrowed {
            map: T::nep_short_identity(yoked),
        }
    }
}
impl<'a, T: NamedEnumeratedProperty> PropertyNamesShortBorrowed<'a, T> {
    /// Returns the short name of the given property value, if one is stored.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::properties::props::CanonicalCombiningClass;
    /// use icu::properties::PropertyNamesShort;
    ///
    /// let lookup = PropertyNamesShort::<CanonicalCombiningClass>::new();
    /// assert_eq!(lookup.get(CanonicalCombiningClass::KanaVoicing), Some("KV"));
    /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("AL"));
    /// ```
    #[inline]
    pub fn get(self, property: T) -> Option<&'a str> {
        // The lookup structure is keyed on the `u32` form of the value.
        let value = property.to_u32();
        self.map.get(value)
    }
}
impl PropertyNamesShortBorrowed<'_, Script> {
    /// Returns the "name" of a script property value as an
    /// `icu::locale::subtags::Script`.
    ///
    /// This method exists only on `PropertyNamesShortBorrowed<Script>`.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::locale::subtags::script;
    /// use icu::properties::props::Script;
    /// use icu::properties::PropertyNamesShort;
    ///
    /// let lookup = PropertyNamesShort::<Script>::new();
    /// assert_eq!(
    ///     lookup.get_locale_script(Script::Brahmi),
    ///     Some(script!("Brah"))
    /// );
    /// assert_eq!(
    ///     lookup.get_locale_script(Script::Hangul),
    ///     Some(script!("Hang"))
    /// );
    /// ```
    ///
    /// For the reverse direction, use property parsing as normal:
    /// ```
    /// use icu::locale::subtags::script;
    /// use icu::properties::props::Script;
    /// use icu::properties::PropertyParser;
    ///
    /// let parser = PropertyParser::<Script>::new();
    /// assert_eq!(
    ///     parser.get_strict(script!("Brah").as_str()),
    ///     Some(Script::Brahmi)
    /// );
    /// assert_eq!(
    ///     parser.get_strict(script!("Hang").as_str()),
    ///     Some(Script::Hangul)
    /// );
    /// ```
    #[inline]
    pub fn get_locale_script(self, property: Script) -> Option<icu_locale_core::subtags::Script> {
        // The property value, converted to `usize`, indexes the script table.
        let index = usize::try_from(property.to_u32()).ok()?;
        let entry = self.map.map.get(index)?;
        entry.0
    }
}
#[cfg(feature = "compiled_data")]
impl<T: NamedEnumeratedProperty> Default for PropertyNamesShortBorrowed<'static, T> {
    /// Same as calling `new()`: a lookup backed by compiled data.
    fn default() -> Self {
        PropertyNamesShortBorrowed::new()
    }
}
impl<T: NamedEnumeratedProperty> PropertyNamesShortBorrowed<'static, T> {
    /// Builds a `PropertyNamesShortBorrowed<T>` backed by compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        let map = T::SINGLETON_SHORT;
        Self { map }
    }

    /// Cheaply turns a [`PropertyNamesShortBorrowed<'static>`] into a [`PropertyNamesShort`].
    ///
    /// Note: Because of branching and indirection, [`PropertyNamesShort`] might inhibit some
    /// compile-time optimizations that are possible with [`PropertyNamesShortBorrowed`].
    ///
    /// Unlike other `static_to_owned()` functions this one is currently not `const`, since
    /// doing that safely needs const traits.
    pub fn static_to_owned(self) -> PropertyNamesShort<T> {
        let data = T::nep_short_identity_static(self.map);
        PropertyNamesShort {
            map: DataPayload::from_static_ref(data),
        }
    }
}
/// A property whose value names can be parsed from strings.
///
/// The trait has `crate::private::Sealed` as a supertrait.
pub trait ParseableEnumeratedProperty: crate::private::Sealed + TrieValue {
// Data marker whose data struct is the name-to-value map used for parsing.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyValueNameToEnumMap<'static>>;
// Name-to-value map for this property; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyValueNameToEnumMap<'static>;
}
// Abstract over Linear/Sparse/Script representation
// This trait is implicitly sealed by not being exported.
pub trait PropertyEnumToValueNameLookup {
// Returns the name stored for the property value `prop`, or `None` if there is none.
fn get(&self, prop: u32) -> Option<&str>;
}
impl PropertyEnumToValueNameLookup for PropertyEnumToValueNameLinearMap<'_> {
    /// Linear representation: `prop`, converted to `usize`, is the index into the map.
    fn get(&self, prop: u32) -> Option<&str> {
        let index = usize::try_from(prop).ok()?;
        self.map.get(index)
    }
}
#[cfg(feature = "alloc")]
impl PropertyEnumToValueNameLookup for PropertyEnumToValueNameSparseMap<'_> {
    /// Sparse representation: `prop` is narrowed to `u16` and used as the map key.
    fn get(&self, prop: u32) -> Option<&str> {
        let key = u16::try_from(prop).ok()?;
        self.map.get(&key)
    }
}
impl PropertyEnumToValueNameLookup for PropertyScriptToIcuScriptMap<'_> {
    /// Script representation: `prop` indexes the table; an entry without a script
    /// yields `None`.
    fn get(&self, prop: u32) -> Option<&str> {
        let index = usize::try_from(prop).ok()?;
        let entry = self.map.get_ule_ref(index)?;
        let script = entry.as_ref()?;
        Some(script.as_str())
    }
}
/// A property whose value names can be represented as strings.
pub trait NamedEnumeratedProperty: ParseableEnumeratedProperty {
// Owned (`'static`) long-name data struct; its yoked output is `DataStructLongBorrowed`.
#[doc(hidden)]
type DataStructLong: 'static
+ for<'a> Yokeable<'a, Output = Self::DataStructLongBorrowed<'a>>
+ PropertyEnumToValueNameLookup;
// Owned (`'static`) short-name data struct; its yoked output is `DataStructShortBorrowed`.
#[doc(hidden)]
type DataStructShort: 'static
+ for<'a> Yokeable<'a, Output = Self::DataStructShortBorrowed<'a>>
+ PropertyEnumToValueNameLookup;
// Borrowed form of the long-name data struct.
#[doc(hidden)]
type DataStructLongBorrowed<'a>: PropertyEnumToValueNameLookup;
// Borrowed form of the short-name data struct.
#[doc(hidden)]
type DataStructShortBorrowed<'a>: PropertyEnumToValueNameLookup;
// Data marker carrying `DataStructLong`.
#[doc(hidden)]
type DataMarkerLong: DataMarker<DataStruct = Self::DataStructLong>;
// Data marker carrying `DataStructShort`.
#[doc(hidden)]
type DataMarkerShort: DataMarker<DataStruct = Self::DataStructShort>;
// Long-name data; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON_LONG: &'static Self::DataStructLongBorrowed<'static>;
// Short-name data; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON_SHORT: &'static Self::DataStructShortBorrowed<'static>;
// These wouldn't be necessary if Yoke used GATs (#6057)
// The four functions below only convert between the `Yokeable` output type and the
// `DataStruct*Borrowed` associated types.
#[doc(hidden)]
fn nep_long_identity<'a>(
stat: &'a <Self::DataStructLong as Yokeable<'a>>::Output,
) -> &'a Self::DataStructLongBorrowed<'a>;
#[doc(hidden)]
fn nep_long_identity_static(
stat: &'static Self::DataStructLongBorrowed<'static>,
) -> &'static Self::DataStructLong;
#[doc(hidden)]
fn nep_short_identity<'a>(
stat: &'a <Self::DataStructShort as Yokeable<'a>>::Output,
) -> &'a Self::DataStructShortBorrowed<'a>;
#[doc(hidden)]
fn nep_short_identity_static(
stat: &'static Self::DataStructShortBorrowed<'static>,
) -> &'static Self::DataStructShort;
/// Convenience method for `PropertyParser::new().get_loose(s)`
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn try_from_str(s: &str) -> Option<Self> {
PropertyParser::new().get_loose(s)
}
/// Convenience method for `PropertyNamesLong::new().get(*self)`.
///
/// Does not panic: if no long name is found, the literal `"unreachable"` is returned.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn long_name(&self) -> &'static str {
PropertyNamesLong::new().get(*self).unwrap_or("unreachable")
}
/// Convenience method for `PropertyNamesShort::new().get(*self)`.
///
/// Does not panic: if no short name is found, the literal `"unreachable"` is returned.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn short_name(&self) -> &'static str {
PropertyNamesShort::new()
.get(*self)
.unwrap_or("unreachable")
}
}
// Implements `ParseableEnumeratedProperty` for `$ty` and, when the optional second part
// is given, `NamedEnumeratedProperty` as well.
//
// Input shape:
//   impl Ty {
//       ParseMarker / PARSE_SINGLETON;
//       [attributes]
//       ShortDataStruct / ShortMarker / SHORT_SINGLETON;
//       LongDataStruct / LongMarker / LONG_SINGLETON;
//   }
// The attributes, if any, are attached to the generated `NamedEnumeratedProperty` impl.
macro_rules! impl_value_getter {
(
impl $ty:ident {
$marker_n2e:ident / $singleton_n2e:ident;
$(
$(#[$meta:meta])*
$data_struct_s:ident / $marker_e2sn:ident / $singleton_e2sn:ident;
$data_struct_l:ident / $marker_e2ln:ident / $singleton_e2ln:ident;
)?
}
) => {
// Parsing support is always generated.
impl ParseableEnumeratedProperty for $ty {
type DataMarker = $marker_n2e;
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyValueNameToEnumMap<'static> = crate::provider::Baked::$singleton_n2e;
}
// Name lookup is generated only when the short/long lines are present.
$(
$(#[$meta])*
impl NamedEnumeratedProperty for $ty {
type DataStructLong = $data_struct_l<'static>;
type DataStructShort = $data_struct_s<'static>;
type DataStructLongBorrowed<'a> = $data_struct_l<'a>;
type DataStructShortBorrowed<'a> = $data_struct_s<'a>;
type DataMarkerLong = crate::provider::$marker_e2ln;
type DataMarkerShort = crate::provider::$marker_e2sn;
#[cfg(feature = "compiled_data")]
const SINGLETON_LONG: &'static Self::DataStructLong = crate::provider::Baked::$singleton_e2ln;
#[cfg(feature = "compiled_data")]
const SINGLETON_SHORT: &'static Self::DataStructShort = crate::provider::Baked::$singleton_e2sn;
// The identity functions return their argument unchanged.
fn nep_long_identity<'a>(yoked: &'a $data_struct_l<'a>) -> &'a Self::DataStructLongBorrowed<'a> {
yoked
}
fn nep_long_identity_static(stat: &'static $data_struct_l<'static>) -> &'static $data_struct_l<'static> {
stat
}
fn nep_short_identity<'a>(yoked: &'a $data_struct_s<'a>) -> &'a Self::DataStructShortBorrowed<'a> {
yoked
}
fn nep_short_identity_static(stat: &'static $data_struct_s<'static>) -> &'static $data_struct_s<'static> {
stat
}
}
)?
};
}
// Per-property wiring. In each invocation the first line names the parse marker and its
// baked singleton; the optional second and third lines name the short-name and long-name
// data struct / marker / singleton.
impl_value_getter! {
impl BidiClass {
PropertyNameParseBidiClassV1 / SINGLETON_PROPERTY_NAME_PARSE_BIDI_CLASS_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortBidiClassV1 / SINGLETON_PROPERTY_NAME_SHORT_BIDI_CLASS_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongBidiClassV1 / SINGLETON_PROPERTY_NAME_LONG_BIDI_CLASS_V1;
}
}
impl_value_getter! {
impl GeneralCategory {
PropertyNameParseGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_PARSE_GENERAL_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_SHORT_GENERAL_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_LONG_GENERAL_CATEGORY_V1;
}
}
// Parse-only: no short/long lines, so only `ParseableEnumeratedProperty` is implemented.
impl_value_getter! {
impl GeneralCategoryGroup {
PropertyNameParseGeneralCategoryMaskV1 / SINGLETON_PROPERTY_NAME_PARSE_GENERAL_CATEGORY_MASK_V1;
}
}
// Short names for `Script` use the script-specific map; long names use the linear map.
impl_value_getter! {
impl Script {
PropertyNameParseScriptV1 / SINGLETON_PROPERTY_NAME_PARSE_SCRIPT_V1;
PropertyScriptToIcuScriptMap / PropertyNameShortScriptV1 / SINGLETON_PROPERTY_NAME_SHORT_SCRIPT_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongScriptV1 / SINGLETON_PROPERTY_NAME_LONG_SCRIPT_V1;
}
}
impl_value_getter! {
impl HangulSyllableType {
PropertyNameParseHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_PARSE_HANGUL_SYLLABLE_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_SHORT_HANGUL_SYLLABLE_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_LONG_HANGUL_SYLLABLE_TYPE_V1;
}
}
impl_value_getter! {
impl EastAsianWidth {
PropertyNameParseEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_PARSE_EAST_ASIAN_WIDTH_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_SHORT_EAST_ASIAN_WIDTH_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_LONG_EAST_ASIAN_WIDTH_V1;
}
}
impl_value_getter! {
impl LineBreak {
PropertyNameParseLineBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_LINE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortLineBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_LINE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongLineBreakV1 / SINGLETON_PROPERTY_NAME_LONG_LINE_BREAK_V1;
}
}
impl_value_getter! {
impl GraphemeClusterBreak {
PropertyNameParseGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_GRAPHEME_CLUSTER_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_GRAPHEME_CLUSTER_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_LONG_GRAPHEME_CLUSTER_BREAK_V1;
}
}
impl_value_getter! {
impl WordBreak {
PropertyNameParseWordBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_WORD_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortWordBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_WORD_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongWordBreakV1 / SINGLETON_PROPERTY_NAME_LONG_WORD_BREAK_V1;
}
}
impl_value_getter! {
impl SentenceBreak {
PropertyNameParseSentenceBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_SENTENCE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortSentenceBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_SENTENCE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongSentenceBreakV1 / SINGLETON_PROPERTY_NAME_LONG_SENTENCE_BREAK_V1;
}
}
// The sparse map is used here; the `alloc` cfg and the doc line below are forwarded to
// the generated `NamedEnumeratedProperty` impl.
impl_value_getter! {
impl CanonicalCombiningClass {
PropertyNameParseCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_PARSE_CANONICAL_COMBINING_CLASS_V1;
#[cfg(feature = "alloc")]
/// ✨ *Enabled with the `alloc` Cargo feature.*
PropertyEnumToValueNameSparseMap / PropertyNameShortCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_SHORT_CANONICAL_COMBINING_CLASS_V1;
PropertyEnumToValueNameSparseMap / PropertyNameLongCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_LONG_CANONICAL_COMBINING_CLASS_V1;
}
}
impl_value_getter! {
impl IndicSyllabicCategory {
PropertyNameParseIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_PARSE_INDIC_SYLLABIC_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_SHORT_INDIC_SYLLABIC_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_LONG_INDIC_SYLLABIC_CATEGORY_V1;
}
}
impl_value_getter! {
impl IndicConjunctBreak {
PropertyNameParseIndicConjunctBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_INDIC_CONJUNCT_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortIndicConjunctBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_INDIC_CONJUNCT_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongIndicConjunctBreakV1 / SINGLETON_PROPERTY_NAME_LONG_INDIC_CONJUNCT_BREAK_V1;
}
}
impl_value_getter! {
impl JoiningType {
PropertyNameParseJoiningTypeV1 / SINGLETON_PROPERTY_NAME_PARSE_JOINING_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortJoiningTypeV1 / SINGLETON_PROPERTY_NAME_SHORT_JOINING_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongJoiningTypeV1 / SINGLETON_PROPERTY_NAME_LONG_JOINING_TYPE_V1;
}
}
impl_value_getter! {
impl VerticalOrientation {
PropertyNameParseVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_PARSE_VERTICAL_ORIENTATION_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_SHORT_VERTICAL_ORIENTATION_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_LONG_VERTICAL_ORIENTATION_V1;
}
}

Sorry, the diff of this file is too big to display

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
pub mod names;
#[cfg(feature = "alloc")]
pub use names::{
PropertyNameLongCanonicalCombiningClassV1, PropertyNameShortCanonicalCombiningClassV1,
};
pub use names::{
PropertyNameLongBidiClassV1, PropertyNameLongEastAsianWidthV1,
PropertyNameLongGeneralCategoryV1, PropertyNameLongGraphemeClusterBreakV1,
PropertyNameLongHangulSyllableTypeV1, PropertyNameLongIndicConjunctBreakV1,
PropertyNameLongIndicSyllabicCategoryV1, PropertyNameLongJoiningTypeV1,
PropertyNameLongLineBreakV1, PropertyNameLongScriptV1, PropertyNameLongSentenceBreakV1,
PropertyNameLongVerticalOrientationV1, PropertyNameLongWordBreakV1,
PropertyNameParseBidiClassV1, PropertyNameParseCanonicalCombiningClassV1,
PropertyNameParseEastAsianWidthV1, PropertyNameParseGeneralCategoryMaskV1,
PropertyNameParseGeneralCategoryV1, PropertyNameParseGraphemeClusterBreakV1,
PropertyNameParseHangulSyllableTypeV1, PropertyNameParseIndicConjunctBreakV1,
PropertyNameParseIndicSyllabicCategoryV1, PropertyNameParseJoiningTypeV1,
PropertyNameParseLineBreakV1, PropertyNameParseScriptV1, PropertyNameParseSentenceBreakV1,
PropertyNameParseVerticalOrientationV1, PropertyNameParseWordBreakV1,
PropertyNameShortBidiClassV1, PropertyNameShortEastAsianWidthV1,
PropertyNameShortGeneralCategoryV1, PropertyNameShortGraphemeClusterBreakV1,
PropertyNameShortHangulSyllableTypeV1, PropertyNameShortIndicConjunctBreakV1,
PropertyNameShortIndicSyllabicCategoryV1, PropertyNameShortJoiningTypeV1,
PropertyNameShortLineBreakV1, PropertyNameShortScriptV1, PropertyNameShortSentenceBreakV1,
PropertyNameShortVerticalOrientationV1, PropertyNameShortWordBreakV1,
};
pub use crate::props::gc::GeneralCategoryULE;
use crate::props::*;
use crate::script::ScriptWithExt;
use core::ops::RangeInclusive;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::prelude::*;
use zerofrom::ZeroFrom;
use zerovec::{VarZeroVec, ZeroSlice};
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// Unit struct that is handed to `make_provider!` and the `impl_property_*!` macros in
/// this file; it only exists with the `compiled_data` Cargo feature.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub struct Baked;
// Anonymous const block: the glob import and the local `icu` module stay private to it
// while the macros below are expanded for `Baked`.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
use icu_properties_data::*;
// Local `icu` module aliasing this crate as `properties` and `icu_collections` as
// `collections`; presumably the paths the expanded macros refer to — confirm.
pub mod icu {
pub use crate as properties;
pub use icu_collections as collections;
}
make_provider!(Baked);
// Binary properties.
impl_property_binary_alnum_v1!(Baked);
impl_property_binary_alphabetic_v1!(Baked);
impl_property_binary_ascii_hex_digit_v1!(Baked);
impl_property_binary_basic_emoji_v1!(Baked);
impl_property_binary_bidi_control_v1!(Baked);
impl_property_binary_bidi_mirrored_v1!(Baked);
impl_property_binary_blank_v1!(Baked);
impl_property_binary_case_ignorable_v1!(Baked);
impl_property_binary_case_sensitive_v1!(Baked);
impl_property_binary_cased_v1!(Baked);
impl_property_binary_changes_when_casefolded_v1!(Baked);
impl_property_binary_changes_when_casemapped_v1!(Baked);
impl_property_binary_changes_when_lowercased_v1!(Baked);
impl_property_binary_changes_when_nfkc_casefolded_v1!(Baked);
impl_property_binary_changes_when_titlecased_v1!(Baked);
impl_property_binary_changes_when_uppercased_v1!(Baked);
impl_property_binary_dash_v1!(Baked);
impl_property_binary_default_ignorable_code_point_v1!(Baked);
impl_property_binary_deprecated_v1!(Baked);
impl_property_binary_diacritic_v1!(Baked);
impl_property_binary_emoji_component_v1!(Baked);
impl_property_binary_emoji_modifier_base_v1!(Baked);
impl_property_binary_emoji_modifier_v1!(Baked);
impl_property_binary_emoji_presentation_v1!(Baked);
impl_property_binary_emoji_v1!(Baked);
impl_property_binary_extended_pictographic_v1!(Baked);
impl_property_binary_extender_v1!(Baked);
impl_property_binary_full_composition_exclusion_v1!(Baked);
impl_property_binary_graph_v1!(Baked);
impl_property_binary_grapheme_base_v1!(Baked);
impl_property_binary_grapheme_extend_v1!(Baked);
impl_property_binary_grapheme_link_v1!(Baked);
impl_property_binary_hex_digit_v1!(Baked);
impl_property_binary_hyphen_v1!(Baked);
impl_property_binary_id_compat_math_continue_v1!(Baked);
impl_property_binary_id_compat_math_start_v1!(Baked);
impl_property_binary_id_continue_v1!(Baked);
impl_property_binary_id_start_v1!(Baked);
impl_property_binary_ideographic_v1!(Baked);
impl_property_binary_ids_binary_operator_v1!(Baked);
impl_property_binary_ids_trinary_operator_v1!(Baked);
impl_property_binary_ids_unary_operator_v1!(Baked);
impl_property_binary_join_control_v1!(Baked);
impl_property_binary_logical_order_exception_v1!(Baked);
impl_property_binary_lowercase_v1!(Baked);
impl_property_binary_math_v1!(Baked);
impl_property_binary_modifier_combining_mark_v1!(Baked);
impl_property_binary_nfc_inert_v1!(Baked);
impl_property_binary_nfd_inert_v1!(Baked);
impl_property_binary_nfkc_inert_v1!(Baked);
impl_property_binary_nfkd_inert_v1!(Baked);
impl_property_binary_noncharacter_code_point_v1!(Baked);
impl_property_binary_pattern_syntax_v1!(Baked);
impl_property_binary_pattern_white_space_v1!(Baked);
impl_property_binary_prepended_concatenation_mark_v1!(Baked);
impl_property_binary_print_v1!(Baked);
impl_property_binary_quotation_mark_v1!(Baked);
impl_property_binary_radical_v1!(Baked);
impl_property_binary_regional_indicator_v1!(Baked);
impl_property_binary_segment_starter_v1!(Baked);
impl_property_binary_sentence_terminal_v1!(Baked);
impl_property_binary_soft_dotted_v1!(Baked);
impl_property_binary_terminal_punctuation_v1!(Baked);
impl_property_binary_unified_ideograph_v1!(Baked);
impl_property_binary_uppercase_v1!(Baked);
impl_property_binary_variation_selector_v1!(Baked);
impl_property_binary_white_space_v1!(Baked);
impl_property_binary_xdigit_v1!(Baked);
impl_property_binary_xid_continue_v1!(Baked);
impl_property_binary_xid_start_v1!(Baked);
// Enumerated properties.
impl_property_enum_bidi_class_v1!(Baked);
impl_property_enum_bidi_mirroring_glyph_v1!(Baked);
impl_property_enum_canonical_combining_class_v1!(Baked);
impl_property_enum_east_asian_width_v1!(Baked);
impl_property_enum_general_category_v1!(Baked);
impl_property_enum_grapheme_cluster_break_v1!(Baked);
impl_property_enum_hangul_syllable_type_v1!(Baked);
impl_property_enum_indic_conjunct_break_v1!(Baked);
impl_property_enum_indic_syllabic_category_v1!(Baked);
impl_property_enum_joining_type_v1!(Baked);
impl_property_enum_line_break_v1!(Baked);
impl_property_enum_script_v1!(Baked);
impl_property_enum_sentence_break_v1!(Baked);
impl_property_enum_vertical_orientation_v1!(Baked);
impl_property_enum_word_break_v1!(Baked);
// Long value names (the canonical-combining-class entry needs `alloc`).
impl_property_name_long_bidi_class_v1!(Baked);
#[cfg(feature = "alloc")]
impl_property_name_long_canonical_combining_class_v1!(Baked);
impl_property_name_long_east_asian_width_v1!(Baked);
impl_property_name_long_general_category_v1!(Baked);
impl_property_name_long_grapheme_cluster_break_v1!(Baked);
impl_property_name_long_hangul_syllable_type_v1!(Baked);
impl_property_name_long_indic_syllabic_category_v1!(Baked);
impl_property_name_long_indic_conjunct_break_v1!(Baked);
impl_property_name_long_joining_type_v1!(Baked);
impl_property_name_long_line_break_v1!(Baked);
impl_property_name_long_script_v1!(Baked);
impl_property_name_long_sentence_break_v1!(Baked);
impl_property_name_long_vertical_orientation_v1!(Baked);
impl_property_name_long_word_break_v1!(Baked);
// Name parsing.
impl_property_name_parse_bidi_class_v1!(Baked);
impl_property_name_parse_canonical_combining_class_v1!(Baked);
impl_property_name_parse_east_asian_width_v1!(Baked);
impl_property_name_parse_general_category_mask_v1!(Baked);
impl_property_name_parse_general_category_v1!(Baked);
impl_property_name_parse_grapheme_cluster_break_v1!(Baked);
impl_property_name_parse_hangul_syllable_type_v1!(Baked);
impl_property_name_parse_indic_syllabic_category_v1!(Baked);
impl_property_name_parse_indic_conjunct_break_v1!(Baked);
impl_property_name_parse_joining_type_v1!(Baked);
impl_property_name_parse_line_break_v1!(Baked);
impl_property_name_parse_script_v1!(Baked);
impl_property_name_parse_sentence_break_v1!(Baked);
impl_property_name_parse_vertical_orientation_v1!(Baked);
impl_property_name_parse_word_break_v1!(Baked);
// Short value names (the canonical-combining-class entry needs `alloc`).
impl_property_name_short_bidi_class_v1!(Baked);
#[cfg(feature = "alloc")]
impl_property_name_short_canonical_combining_class_v1!(Baked);
impl_property_name_short_east_asian_width_v1!(Baked);
impl_property_name_short_general_category_v1!(Baked);
impl_property_name_short_grapheme_cluster_break_v1!(Baked);
impl_property_name_short_hangul_syllable_type_v1!(Baked);
impl_property_name_short_indic_syllabic_category_v1!(Baked);
impl_property_name_short_indic_conjunct_break_v1!(Baked);
impl_property_name_short_joining_type_v1!(Baked);
impl_property_name_short_line_break_v1!(Baked);
impl_property_name_short_script_v1!(Baked);
impl_property_name_short_sentence_break_v1!(Baked);
impl_property_name_short_vertical_orientation_v1!(Baked);
impl_property_name_short_word_break_v1!(Baked);
// Script with extensions.
impl_property_script_with_extensions_v1!(Baked);
};
// Declares the marker `PropertyBinaryAlnumV1`: data struct `PropertyCodePointSet<'static>`,
// flagged as a singleton.
icu_provider::data_marker!(
/// `PropertyBinaryAlnumV1`
PropertyBinaryAlnumV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryAlphabeticV1`
PropertyBinaryAlphabeticV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryAsciiHexDigitV1`
PropertyBinaryAsciiHexDigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryBidiControlV1`
PropertyBinaryBidiControlV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryBidiMirroredV1`
PropertyBinaryBidiMirroredV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryBlankV1`
PropertyBinaryBlankV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryCasedV1`
PropertyBinaryCasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryCaseIgnorableV1`
PropertyBinaryCaseIgnorableV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryCaseSensitiveV1`
PropertyBinaryCaseSensitiveV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenCasefoldedV1`
PropertyBinaryChangesWhenCasefoldedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenCasemappedV1`
PropertyBinaryChangesWhenCasemappedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenLowercasedV1`
PropertyBinaryChangesWhenLowercasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenNfkcCasefoldedV1`
PropertyBinaryChangesWhenNfkcCasefoldedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenTitlecasedV1`
PropertyBinaryChangesWhenTitlecasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryChangesWhenUppercasedV1`
PropertyBinaryChangesWhenUppercasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryDashV1`
PropertyBinaryDashV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryDefaultIgnorableCodePointV1`
PropertyBinaryDefaultIgnorableCodePointV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryDeprecatedV1`
PropertyBinaryDeprecatedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryDiacriticV1`
PropertyBinaryDiacriticV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryEmojiComponentV1`
PropertyBinaryEmojiComponentV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryEmojiModifierBaseV1`
PropertyBinaryEmojiModifierBaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryEmojiModifierV1`
PropertyBinaryEmojiModifierV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryEmojiPresentationV1`
PropertyBinaryEmojiPresentationV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryEmojiV1`
PropertyBinaryEmojiV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryExtendedPictographicV1`
PropertyBinaryExtendedPictographicV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryExtenderV1`
PropertyBinaryExtenderV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryFullCompositionExclusionV1`
PropertyBinaryFullCompositionExclusionV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryGraphemeBaseV1`
PropertyBinaryGraphemeBaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryGraphemeExtendV1`
PropertyBinaryGraphemeExtendV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryGraphemeLinkV1`
PropertyBinaryGraphemeLinkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryGraphV1`
PropertyBinaryGraphV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryHexDigitV1`
PropertyBinaryHexDigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryHyphenV1`
PropertyBinaryHyphenV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdCompatMathContinueV1`
PropertyBinaryIdCompatMathContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdCompatMathStartV1`
PropertyBinaryIdCompatMathStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdContinueV1`
PropertyBinaryIdContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdeographicV1`
PropertyBinaryIdeographicV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdsBinaryOperatorV1`
PropertyBinaryIdsBinaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdStartV1`
PropertyBinaryIdStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdsTrinaryOperatorV1`
PropertyBinaryIdsTrinaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryIdsUnaryOperatorV1`
PropertyBinaryIdsUnaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryJoinControlV1`
PropertyBinaryJoinControlV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryLogicalOrderExceptionV1`
PropertyBinaryLogicalOrderExceptionV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryLowercaseV1`
PropertyBinaryLowercaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryMathV1`
PropertyBinaryMathV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryModifierCombiningMarkV1`
PropertyBinaryModifierCombiningMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryNfcInertV1`
PropertyBinaryNfcInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryNfdInertV1`
PropertyBinaryNfdInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryNfkcInertV1`
PropertyBinaryNfkcInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryNfkdInertV1`
PropertyBinaryNfkdInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryNoncharacterCodePointV1`
PropertyBinaryNoncharacterCodePointV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryPatternSyntaxV1`
PropertyBinaryPatternSyntaxV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryPatternWhiteSpaceV1`
PropertyBinaryPatternWhiteSpaceV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryPrependedConcatenationMarkV1`
PropertyBinaryPrependedConcatenationMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryPrintV1`
PropertyBinaryPrintV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryQuotationMarkV1`
PropertyBinaryQuotationMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryRadicalV1`
PropertyBinaryRadicalV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryRegionalIndicatorV1`
PropertyBinaryRegionalIndicatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinarySegmentStarterV1`
PropertyBinarySegmentStarterV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinarySentenceTerminalV1`
PropertyBinarySentenceTerminalV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinarySoftDottedV1`
PropertyBinarySoftDottedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryTerminalPunctuationV1`
PropertyBinaryTerminalPunctuationV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryUnifiedIdeographV1`
PropertyBinaryUnifiedIdeographV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryUppercaseV1`
PropertyBinaryUppercaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryVariationSelectorV1`
PropertyBinaryVariationSelectorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryWhiteSpaceV1`
PropertyBinaryWhiteSpaceV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryXdigitV1`
PropertyBinaryXdigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryXidContinueV1`
PropertyBinaryXidContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyBinaryXidStartV1`
PropertyBinaryXidStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
// Data markers named `PropertyEnum*V1`.
//
// Every marker in this group is declared with `is_singleton = true` and uses
// `PropertyCodePointMap<'static, V>` as its payload type, where `V` is the
// value type written out in each invocation (all from `crate::props`, except
// `BidiMirroringGlyph`, which comes from `crate::bidi`).
icu_provider::data_marker!(
/// Data marker for the 'BidiClass' Unicode property
PropertyEnumBidiClassV1,
PropertyCodePointMap<'static, crate::props::BidiClass>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'CanonicalCombiningClass' Unicode property
PropertyEnumCanonicalCombiningClassV1,
PropertyCodePointMap<'static, crate::props::CanonicalCombiningClass>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'EastAsianWidth' Unicode property
PropertyEnumEastAsianWidthV1,
PropertyCodePointMap<'static, crate::props::EastAsianWidth>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'GeneralCategory' Unicode property
PropertyEnumGeneralCategoryV1,
PropertyCodePointMap<'static, crate::props::GeneralCategory>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'GraphemeClusterBreak' Unicode property
PropertyEnumGraphemeClusterBreakV1,
PropertyCodePointMap<'static, crate::props::GraphemeClusterBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'HangulSyllableType' Unicode property
PropertyEnumHangulSyllableTypeV1,
PropertyCodePointMap<'static, crate::props::HangulSyllableType>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'IndicConjunctBreak' Unicode property
PropertyEnumIndicConjunctBreakV1,
PropertyCodePointMap<'static, crate::props::IndicConjunctBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'IndicSyllabicCategory' Unicode property
PropertyEnumIndicSyllabicCategoryV1,
PropertyCodePointMap<'static, crate::props::IndicSyllabicCategory>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'JoiningType' Unicode property
PropertyEnumJoiningTypeV1,
PropertyCodePointMap<'static, crate::props::JoiningType>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'LineBreak' Unicode property
PropertyEnumLineBreakV1,
PropertyCodePointMap<'static, crate::props::LineBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'Script' Unicode property
PropertyEnumScriptV1,
PropertyCodePointMap<'static, crate::props::Script>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'SentenceBreak' Unicode property
PropertyEnumSentenceBreakV1,
PropertyCodePointMap<'static, crate::props::SentenceBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'Vertical_Orientation' Unicode property
PropertyEnumVerticalOrientationV1,
PropertyCodePointMap<'static, crate::props::VerticalOrientation>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'WordBreak' Unicode property
PropertyEnumWordBreakV1,
PropertyCodePointMap<'static, crate::props::WordBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'BidiMirroringGlyph' Unicode property
PropertyEnumBidiMirroringGlyphV1,
PropertyCodePointMap<'static, crate::bidi::BidiMirroringGlyph>,
is_singleton = true,
);
icu_provider::data_marker!(
/// `PropertyBinaryBasicEmojiV1`
///
/// Declared with `is_singleton = true`. Its payload type is `PropertyUnicodeSet`
/// (a set of characters and strings) rather than `PropertyCodePointSet`.
PropertyBinaryBasicEmojiV1,
PropertyUnicodeSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyScriptWithExtensionsV1`
///
/// Declared with `is_singleton = true`. Its payload type is
/// `ScriptWithExtensionsProperty`, which stores `Script` and
/// `Script_Extensions` property data.
PropertyScriptWithExtensionsV1,
ScriptWithExtensionsProperty<'static>,
is_singleton = true
);
/// All data keys in this module, listed as the `INFO` constant
/// (a `DataMarkerInfo`) of each data marker type.
///
/// The `PropertyNameLongCanonicalCombiningClassV1` and
/// `PropertyNameShortCanonicalCombiningClassV1` entries are present only when
/// the `alloc` feature is enabled, so the length of this slice depends on the
/// enabled features.
pub const MARKERS: &[DataMarkerInfo] = &[
// `PropertyNameLong*` markers
PropertyNameLongBidiClassV1::INFO,
#[cfg(feature = "alloc")]
PropertyNameLongCanonicalCombiningClassV1::INFO,
PropertyNameLongEastAsianWidthV1::INFO,
PropertyNameLongGeneralCategoryV1::INFO,
PropertyNameLongGraphemeClusterBreakV1::INFO,
PropertyNameLongHangulSyllableTypeV1::INFO,
PropertyNameLongIndicSyllabicCategoryV1::INFO,
PropertyNameLongIndicConjunctBreakV1::INFO,
PropertyNameLongJoiningTypeV1::INFO,
PropertyNameLongLineBreakV1::INFO,
PropertyNameLongScriptV1::INFO,
PropertyNameLongSentenceBreakV1::INFO,
PropertyNameLongVerticalOrientationV1::INFO,
PropertyNameLongWordBreakV1::INFO,
// `PropertyNameParse*` markers
PropertyNameParseBidiClassV1::INFO,
PropertyNameParseCanonicalCombiningClassV1::INFO,
PropertyNameParseEastAsianWidthV1::INFO,
PropertyNameParseGeneralCategoryMaskV1::INFO,
PropertyNameParseGeneralCategoryV1::INFO,
PropertyNameParseGraphemeClusterBreakV1::INFO,
PropertyNameParseHangulSyllableTypeV1::INFO,
PropertyNameParseIndicSyllabicCategoryV1::INFO,
PropertyNameParseIndicConjunctBreakV1::INFO,
PropertyNameParseJoiningTypeV1::INFO,
PropertyNameParseLineBreakV1::INFO,
PropertyNameParseScriptV1::INFO,
PropertyNameParseSentenceBreakV1::INFO,
PropertyNameParseVerticalOrientationV1::INFO,
PropertyNameParseWordBreakV1::INFO,
// `PropertyNameShort*` markers
PropertyNameShortBidiClassV1::INFO,
#[cfg(feature = "alloc")]
PropertyNameShortCanonicalCombiningClassV1::INFO,
PropertyNameShortEastAsianWidthV1::INFO,
PropertyNameShortGeneralCategoryV1::INFO,
PropertyNameShortGraphemeClusterBreakV1::INFO,
PropertyNameShortHangulSyllableTypeV1::INFO,
PropertyNameShortIndicSyllabicCategoryV1::INFO,
PropertyNameShortIndicConjunctBreakV1::INFO,
PropertyNameShortJoiningTypeV1::INFO,
PropertyNameShortLineBreakV1::INFO,
PropertyNameShortScriptV1::INFO,
PropertyNameShortSentenceBreakV1::INFO,
PropertyNameShortVerticalOrientationV1::INFO,
PropertyNameShortWordBreakV1::INFO,
// `PropertyBinary*` markers
PropertyBinaryAlnumV1::INFO,
PropertyBinaryAlphabeticV1::INFO,
PropertyBinaryAsciiHexDigitV1::INFO,
PropertyBinaryBidiControlV1::INFO,
PropertyBinaryBidiMirroredV1::INFO,
PropertyBinaryBlankV1::INFO,
PropertyBinaryCasedV1::INFO,
PropertyBinaryCaseIgnorableV1::INFO,
PropertyBinaryCaseSensitiveV1::INFO,
PropertyBinaryChangesWhenCasefoldedV1::INFO,
PropertyBinaryChangesWhenCasemappedV1::INFO,
PropertyBinaryChangesWhenLowercasedV1::INFO,
PropertyBinaryChangesWhenNfkcCasefoldedV1::INFO,
PropertyBinaryChangesWhenTitlecasedV1::INFO,
PropertyBinaryChangesWhenUppercasedV1::INFO,
PropertyBinaryDashV1::INFO,
PropertyBinaryDefaultIgnorableCodePointV1::INFO,
PropertyBinaryDeprecatedV1::INFO,
PropertyBinaryDiacriticV1::INFO,
PropertyBinaryEmojiComponentV1::INFO,
PropertyBinaryEmojiModifierBaseV1::INFO,
PropertyBinaryEmojiModifierV1::INFO,
PropertyBinaryEmojiPresentationV1::INFO,
PropertyBinaryEmojiV1::INFO,
PropertyBinaryExtendedPictographicV1::INFO,
PropertyBinaryExtenderV1::INFO,
PropertyBinaryFullCompositionExclusionV1::INFO,
PropertyBinaryGraphemeBaseV1::INFO,
PropertyBinaryGraphemeExtendV1::INFO,
PropertyBinaryGraphemeLinkV1::INFO,
PropertyBinaryGraphV1::INFO,
PropertyBinaryHexDigitV1::INFO,
PropertyBinaryHyphenV1::INFO,
PropertyBinaryIdCompatMathContinueV1::INFO,
PropertyBinaryIdCompatMathStartV1::INFO,
PropertyBinaryIdContinueV1::INFO,
PropertyBinaryIdeographicV1::INFO,
PropertyBinaryIdsBinaryOperatorV1::INFO,
PropertyBinaryIdStartV1::INFO,
PropertyBinaryIdsTrinaryOperatorV1::INFO,
PropertyBinaryIdsUnaryOperatorV1::INFO,
PropertyBinaryJoinControlV1::INFO,
PropertyBinaryLogicalOrderExceptionV1::INFO,
PropertyBinaryLowercaseV1::INFO,
PropertyBinaryMathV1::INFO,
PropertyBinaryModifierCombiningMarkV1::INFO,
PropertyBinaryNfcInertV1::INFO,
PropertyBinaryNfdInertV1::INFO,
PropertyBinaryNfkcInertV1::INFO,
PropertyBinaryNfkdInertV1::INFO,
PropertyBinaryNoncharacterCodePointV1::INFO,
PropertyBinaryPatternSyntaxV1::INFO,
PropertyBinaryPatternWhiteSpaceV1::INFO,
PropertyBinaryPrependedConcatenationMarkV1::INFO,
PropertyBinaryPrintV1::INFO,
PropertyBinaryQuotationMarkV1::INFO,
PropertyBinaryRadicalV1::INFO,
PropertyBinaryRegionalIndicatorV1::INFO,
PropertyBinarySegmentStarterV1::INFO,
PropertyBinarySentenceTerminalV1::INFO,
PropertyBinarySoftDottedV1::INFO,
PropertyBinaryTerminalPunctuationV1::INFO,
PropertyBinaryUnifiedIdeographV1::INFO,
PropertyBinaryUppercaseV1::INFO,
PropertyBinaryVariationSelectorV1::INFO,
PropertyBinaryWhiteSpaceV1::INFO,
PropertyBinaryXdigitV1::INFO,
PropertyBinaryXidContinueV1::INFO,
PropertyBinaryXidStartV1::INFO,
// `PropertyEnum*` markers
PropertyEnumBidiClassV1::INFO,
PropertyEnumCanonicalCombiningClassV1::INFO,
PropertyEnumEastAsianWidthV1::INFO,
PropertyEnumGeneralCategoryV1::INFO,
PropertyEnumGraphemeClusterBreakV1::INFO,
PropertyEnumHangulSyllableTypeV1::INFO,
PropertyEnumIndicConjunctBreakV1::INFO,
PropertyEnumIndicSyllabicCategoryV1::INFO,
PropertyEnumJoiningTypeV1::INFO,
PropertyEnumLineBreakV1::INFO,
PropertyEnumScriptV1::INFO,
PropertyEnumSentenceBreakV1::INFO,
PropertyEnumVerticalOrientationV1::INFO,
PropertyEnumWordBreakV1::INFO,
PropertyEnumBidiMirroringGlyphV1::INFO,
// Remaining markers
PropertyBinaryBasicEmojiV1::INFO,
PropertyScriptWithExtensionsV1::INFO,
];
/// A set of characters which share a particular property value.
///
/// This data enum is extensible: more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
///
/// The only backend currently defined is `InversionList`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only with the `datagen` feature;
// `Deserialize` only with the `serde` feature.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyCodePointSet<'data> {
/// The set of characters, represented as an inversion list
// `serde(borrow)` (applied only with the `serde` feature) asks serde to let
// the deserialized list borrow from the input for `'data`.
InversionList(#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionList<'data>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Passes the type to `icu_provider::data_struct!` together with a
// `#[cfg(feature = "datagen")]` attribute.
// NOTE(review): the attribute presumably gates datagen-only impls generated by
// the macro; confirm against the `data_struct!` documentation.
icu_provider::data_struct!(
PropertyCodePointSet<'_>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backend the enum holds.
// See CodePointSetData for documentation of these functions
impl<'data> PropertyCodePointSet<'data> {
    /// Forwards to the backend's `contains` for the character `ch`.
    #[inline]
    pub(crate) fn contains(&self, ch: char) -> bool {
        let Self::InversionList(ref list) = *self;
        list.contains(ch)
    }

    /// Forwards to the backend's `contains32` for the `u32` value `ch`.
    #[inline]
    pub(crate) fn contains32(&self, ch: u32) -> bool {
        let Self::InversionList(ref list) = *self;
        list.contains32(ch)
    }

    /// Returns the backend's `iter_ranges` iterator of inclusive `u32` ranges.
    #[inline]
    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
        let Self::InversionList(ref list) = *self;
        list.iter_ranges()
    }

    /// Returns the backend's `iter_ranges_complemented` iterator of inclusive
    /// `u32` ranges.
    #[inline]
    pub(crate) fn iter_ranges_complemented(
        &self,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
        let Self::InversionList(ref list) = *self;
        list.iter_ranges_complemented()
    }

    /// Wraps an inversion list in the `InversionList` variant.
    #[inline]
    pub(crate) fn from_code_point_inversion_list(l: CodePointInversionList<'static>) -> Self {
        Self::InversionList(l)
    }

    /// Borrows the underlying inversion list when the backend stores one.
    #[inline]
    pub(crate) fn as_code_point_inversion_list(
        &'_ self,
    ) -> Option<&'_ CodePointInversionList<'data>> {
        // any other backing data structure that cannot return a CPInvList in O(1) time should return None
        let Self::InversionList(ref list) = *self;
        Some(list)
    }

    /// Builds a `CodePointInversionList` from the backend via
    /// `ZeroFrom::zero_from`; the result borrows from `self`.
    #[inline]
    pub(crate) fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
        let Self::InversionList(ref list) = *self;
        ZeroFrom::zero_from(list)
    }
}
/// A map efficiently storing data about individual characters.
///
/// This data enum is extensible: more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
///
/// The only backend currently defined is `CodePointTrie`. The value type `T`
/// must implement `TrieValue`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only with the `datagen` feature;
// `Deserialize` only with the `serde` feature.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyCodePointMap<'data, T: TrieValue> {
/// A codepoint trie storing the data
// `serde(borrow)` (applied only with the `serde` feature) asks serde to let
// the deserialized trie borrow from the input for `'data`.
CodePointTrie(#[cfg_attr(feature = "serde", serde(borrow))] CodePointTrie<'data, T>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Passes the generic type (with its `T: TrieValue` bound) to
// `icu_provider::data_struct!` together with a `#[cfg(feature = "datagen")]`
// attribute.
// NOTE(review): the attribute presumably gates datagen-only impls generated by
// the macro; confirm against the `data_struct!` documentation.
icu_provider::data_struct!(
<T: TrieValue> PropertyCodePointMap<'_, T>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backend the enum holds.
// See CodePointMapData for documentation of these functions
impl<'data, T: TrieValue> PropertyCodePointMap<'data, T> {
    /// Forwards to the backend's `get32` for the `u32` value `ch`.
    #[inline]
    pub(crate) fn get32(&self, ch: u32) -> T {
        let Self::CodePointTrie(ref trie) = *self;
        trie.get32(ch)
    }

    /// Forwards to the backend's `get` for the character `c`.
    #[inline]
    pub(crate) fn get(&self, c: char) -> T {
        let Self::CodePointTrie(ref trie) = *self;
        trie.get(c)
    }

    /// Consumes the map and converts its backend to value type `P` via the
    /// backend's `try_into_converted`, re-wrapping the result on success.
    ///
    /// Only available with the `alloc` feature.
    #[inline]
    #[cfg(feature = "alloc")]
    pub(crate) fn try_into_converted<P>(
        self,
    ) -> Result<PropertyCodePointMap<'data, P>, zerovec::ule::UleError>
    where
        P: TrieValue,
    {
        let Self::CodePointTrie(trie) = self;
        trie.try_into_converted()
            .map(PropertyCodePointMap::CodePointTrie)
    }

    /// Forwards to the backend's `get_set_for_value` for `value`.
    ///
    /// Only available with the `alloc` feature.
    #[inline]
    #[cfg(feature = "alloc")]
    pub(crate) fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
        let Self::CodePointTrie(ref trie) = *self;
        trie.get_set_for_value(value)
    }

    /// Returns the backend's `iter_ranges` iterator of `CodePointMapRange<T>`.
    #[inline]
    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = CodePointMapRange<T>> + '_ {
        let Self::CodePointTrie(ref trie) = *self;
        trie.iter_ranges()
    }

    /// Returns the backend's `iter_ranges_mapped` iterator, passing `map`
    /// through unchanged.
    #[inline]
    pub(crate) fn iter_ranges_mapped<'a, U: Eq + 'a>(
        &'a self,
        map: impl FnMut(T) -> U + Copy + 'a,
    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
        let Self::CodePointTrie(ref trie) = *self;
        trie.iter_ranges_mapped(map)
    }

    /// Wraps a code point trie in the `CodePointTrie` variant.
    #[inline]
    pub(crate) fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        Self::CodePointTrie(trie)
    }

    /// Borrows the underlying code point trie when the backend stores one.
    #[inline]
    pub(crate) fn as_code_point_trie(&self) -> Option<&CodePointTrie<'data, T>> {
        // any other backing data structure that cannot return a CPT in O(1) time should return None
        let Self::CodePointTrie(ref trie) = *self;
        Some(trie)
    }

    /// Builds a `CodePointTrie` from the backend via `ZeroFrom::zero_from`;
    /// the result borrows from `self`.
    #[inline]
    pub(crate) fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        let Self::CodePointTrie(ref trie) = *self;
        ZeroFrom::zero_from(trie)
    }
}
/// A set of characters and strings which share a particular property value.
///
/// The only backend currently defined is `CPInversionListStrList`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only with the `datagen` feature;
// `Deserialize` only with the `serde` feature.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyUnicodeSet<'data> {
/// A set representing characters in an inversion list, and the strings in a list.
CPInversionListStrList(
// `serde(borrow)` (applied only with the `serde` feature) asks serde to let
// the deserialized value borrow from the input for `'data`.
#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionListAndStringList<'data>,
),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Passes the type to `icu_provider::data_struct!` together with a
// `#[cfg(feature = "datagen")]` attribute.
// NOTE(review): the attribute presumably gates datagen-only impls generated by
// the macro; confirm against the `data_struct!` documentation.
icu_provider::data_struct!(
PropertyUnicodeSet<'_>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backend the enum holds.
impl<'data> PropertyUnicodeSet<'data> {
    /// Forwards to the backend's `contains_str` for the string `s`.
    #[inline]
    pub(crate) fn contains_str(&self, s: &str) -> bool {
        let Self::CPInversionListStrList(ref set) = *self;
        set.contains_str(s)
    }

    /// Forwards to the backend's `contains32` for the `u32` value `cp`.
    #[inline]
    pub(crate) fn contains32(&self, cp: u32) -> bool {
        let Self::CPInversionListStrList(ref set) = *self;
        set.contains32(cp)
    }

    /// Forwards to the backend's `contains` for the character `ch`.
    #[inline]
    pub(crate) fn contains(&self, ch: char) -> bool {
        let Self::CPInversionListStrList(ref set) = *self;
        set.contains(ch)
    }

    /// Wraps a `CodePointInversionListAndStringList` in the
    /// `CPInversionListStrList` variant.
    #[inline]
    pub(crate) fn from_code_point_inversion_list_string_list(
        l: CodePointInversionListAndStringList<'static>,
    ) -> Self {
        Self::CPInversionListStrList(l)
    }

    /// Borrows the underlying `CodePointInversionListAndStringList` when the
    /// backend stores one.
    #[inline]
    pub(crate) fn as_code_point_inversion_list_string_list(
        &'_ self,
    ) -> Option<&'_ CodePointInversionListAndStringList<'data>> {
        // any other backing data structure that cannot return a CPInversionListStrList in O(1) time should return None
        let Self::CPInversionListStrList(ref set) = *self;
        Some(set)
    }

    /// Builds a `CodePointInversionListAndStringList` from the backend via
    /// `ZeroFrom::zero_from`; the result borrows from `self`.
    #[inline]
    pub(crate) fn to_code_point_inversion_list_string_list(
        &self,
    ) -> CodePointInversionListAndStringList<'_> {
        let Self::CPInversionListStrList(ref set) = *self;
        ZeroFrom::zero_from(set)
    }
}
/// A struct that efficiently stores `Script` and `Script_Extensions` property data.
///
/// It pairs a trie of `ScriptWithExt` values (`trie`) with a companion list of
/// script sub-arrays (`extensions`); see the field documentation for how the
/// two are combined.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only with the `datagen` feature;
// `Deserialize` only with the `serde` feature.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct ScriptWithExtensionsProperty<'data> {
/// Trie of `ScriptWithExt` values, looked up by code point.
///
/// Note: The `ScriptWithExt` values in this array will assume a 12-bit layout. The 2
/// higher order bits 11..10 will indicate how to deduce the Script value and
/// Script_Extensions value, nearly matching the representation
/// [in ICU](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uprops.h):
///
/// | High order 2 bits value | Script | Script_Extensions |
/// |-------------------------|--------------------------------------------------------|----------------------------------------------------------------|
/// | 3 | First value in sub-array, index given by lower 10 bits | Sub-array excluding first value, index given by lower 10 bits |
/// | 2 | Script=Inherited | Entire sub-array, index given by lower 10 bits |
/// | 1 | Script=Common | Entire sub-array, index given by lower 10 bits |
/// | 0 | Value in lower 10 bits | `[ Script value ]` single-element array |
///
/// When the lower 10 bits of the value are used as an index, that index is
/// used for the outer-level vector of the nested `extensions` structure.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, ScriptWithExt>,
/// This companion structure stores Script_Extensions values, which are
/// themselves arrays / vectors. This structure only stores the values for
/// cases in which `scx(cp) != [ sc(cp) ]`. Each sub-vector is distinct. The
/// sub-vector represents the Script_Extensions array value for a code point,
/// and may also indicate Script value, as described for the `trie` field.
#[cfg_attr(feature = "serde", serde(borrow))]
pub extensions: VarZeroVec<'data, ZeroSlice<Script>>,
}
// Passes the type to `icu_provider::data_struct!` together with a
// `#[cfg(feature = "datagen")]` attribute.
// NOTE(review): the attribute presumably gates datagen-only impls generated by
// the macro; confirm against the `data_struct!` documentation.
icu_provider::data_struct!(
ScriptWithExtensionsProperty<'_>,
#[cfg(feature = "datagen")]
);
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Unstable\] Property names-related data for this component
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
use icu_locale_core::subtags::Script;
use icu_provider::prelude::{yoke, zerofrom};
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ule::NichedOption;
use zerovec::{VarZeroVec, ZeroVec};
// Data markers named `PropertyNameParse*V1`.
//
// Every marker in this group is declared with `is_singleton = true` and uses
// `PropertyValueNameToEnumMap<'static>` as its payload type.
icu_provider::data_marker!(
/// `PropertyNameParseBidiClassV1`
PropertyNameParseBidiClassV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseCanonicalCombiningClassV1`
PropertyNameParseCanonicalCombiningClassV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseEastAsianWidthV1`
PropertyNameParseEastAsianWidthV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseGeneralCategoryMaskV1`
PropertyNameParseGeneralCategoryMaskV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseGeneralCategoryV1`
PropertyNameParseGeneralCategoryV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseGraphemeClusterBreakV1`
PropertyNameParseGraphemeClusterBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseHangulSyllableTypeV1`
PropertyNameParseHangulSyllableTypeV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseIndicSyllabicCategoryV1`
PropertyNameParseIndicSyllabicCategoryV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseIndicConjunctBreakV1`
PropertyNameParseIndicConjunctBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseJoiningTypeV1`
PropertyNameParseJoiningTypeV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseLineBreakV1`
PropertyNameParseLineBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseScriptV1`
PropertyNameParseScriptV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseSentenceBreakV1`
PropertyNameParseSentenceBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseVerticalOrientationV1`
PropertyNameParseVerticalOrientationV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameParseWordBreakV1`
PropertyNameParseWordBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
// Data markers for the long and short names of property values. Every marker in this
// group is declared with `PropertyEnumToValueNameLinearMap<'static>` as its data struct
// and `is_singleton = true`.
icu_provider::data_marker!(
/// `PropertyNameLongBidiClassV1`
PropertyNameLongBidiClassV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortBidiClassV1`
PropertyNameShortBidiClassV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongEastAsianWidthV1`
PropertyNameLongEastAsianWidthV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortEastAsianWidthV1`
PropertyNameShortEastAsianWidthV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongGeneralCategoryV1`
PropertyNameLongGeneralCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortGeneralCategoryV1`
PropertyNameShortGeneralCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongGraphemeClusterBreakV1`
PropertyNameLongGraphemeClusterBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortGraphemeClusterBreakV1`
PropertyNameShortGraphemeClusterBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongHangulSyllableTypeV1`
PropertyNameLongHangulSyllableTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortHangulSyllableTypeV1`
PropertyNameShortHangulSyllableTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongIndicSyllabicCategoryV1`
PropertyNameLongIndicSyllabicCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortIndicSyllabicCategoryV1`
PropertyNameShortIndicSyllabicCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongIndicConjunctBreakV1`
PropertyNameLongIndicConjunctBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortIndicConjunctBreakV1`
PropertyNameShortIndicConjunctBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongJoiningTypeV1`
PropertyNameLongJoiningTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortJoiningTypeV1`
PropertyNameShortJoiningTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongLineBreakV1`
PropertyNameLongLineBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortLineBreakV1`
PropertyNameShortLineBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongScriptV1`
PropertyNameLongScriptV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongSentenceBreakV1`
PropertyNameLongSentenceBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortSentenceBreakV1`
PropertyNameShortSentenceBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongVerticalOrientationV1`
PropertyNameLongVerticalOrientationV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortVerticalOrientationV1`
PropertyNameShortVerticalOrientationV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongWordBreakV1`
PropertyNameLongWordBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortWordBreakV1`
PropertyNameShortWordBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
// The two `CanonicalCombiningClass` name markers are declared with
// `PropertyEnumToValueNameSparseMap<'static>` as their data struct and are only compiled
// when the `alloc` feature is enabled.
#[cfg(feature = "alloc")]
icu_provider::data_marker!(
/// `PropertyNameLongCanonicalCombiningClassV1`
PropertyNameLongCanonicalCombiningClassV1,
PropertyEnumToValueNameSparseMap<'static>,
is_singleton = true,
);
#[cfg(feature = "alloc")]
icu_provider::data_marker!(
/// `PropertyNameShortCanonicalCombiningClassV1`
PropertyNameShortCanonicalCombiningClassV1,
PropertyEnumToValueNameSparseMap<'static>,
is_singleton = true,
);
// The short-name marker for `Script` is declared with `PropertyScriptToIcuScriptMap<'static>`
// as its data struct instead of one of the name-map structs.
icu_provider::data_marker!(
/// `PropertyNameShortScriptV1`
PropertyNameShortScriptV1,
PropertyScriptToIcuScriptMap<'static>,
is_singleton = true,
);
/// A mapping from property value names to their value discriminants. This is the data
/// struct used by the `PropertyNameParse*V1` data markers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct PropertyValueNameToEnumMap<'data> {
/// A map from names to their value discriminant, stored as an ASCII trie.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroTrieSimpleAscii<ZeroVec<'data, u8>>,
}
icu_provider::data_struct!(
PropertyValueNameToEnumMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping of property values to their names, keyed by the value discriminant.
/// A single instance of this map will only cover either long or short names,
/// determined whilst loading data.
///
/// This type is only available when the `alloc` Cargo feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
#[cfg(feature = "alloc")]
pub struct PropertyEnumToValueNameSparseMap<'data> {
/// A map from the value discriminant (a `u16` key) to the name
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: zerovec::ZeroMap<'data, u16, str>,
}
#[cfg(feature = "alloc")]
icu_provider::data_struct!(
PropertyEnumToValueNameSparseMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping of property values to their names, indexed by the value discriminant.
/// A single instance of this map will only cover either long or short names,
/// determined whilst loading data.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyEnumToValueNameLinearMap<'data> {
/// A map from the value discriminant (the index) to the names, for mostly
/// contiguous data. Empty strings count as missing.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: VarZeroVec<'data, str>,
}
icu_provider::data_struct!(
PropertyEnumToValueNameLinearMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping from value discriminants to scripts. This is the data struct of the
/// `PropertyNameShortScriptV1` data marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyScriptToIcuScriptMap<'data> {
/// A map from the value discriminant (the index) to an optional `Script`, for mostly
/// contiguous data.
// NOTE(review): presumably an absent (`None`) entry means the discriminant has no
// corresponding script — confirm against the code that reads this map.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroVec<'data, NichedOption<Script, 4>>,
}
icu_provider::data_struct!(
PropertyScriptToIcuScriptMap<'_>,
#[cfg(feature = "datagen")]
);
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you
//! have a use case for this!
//!
//! This module contains utilities for working with properties where the specific property in use
//! is not known at compile time.
//!
//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working
//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines.
use crate::provider::*;
use crate::CodePointSetData;
#[cfg(doc)]
use crate::{
props::{GeneralCategory, GeneralCategoryGroup, Script},
script, CodePointMapData, PropertyParser,
};
use icu_provider::prelude::*;
/// This type can represent any binary Unicode property.
///
/// This is intended to be used in situations where the exact unicode property needed is
/// only known at runtime, for example in regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum
// In ICU4C the values 65..=71 are assigned to the binary properties of strings
// (see `StringBinaryProperty`), so the code point properties added after
// `Extended_Pictographic` continue at 72.
#[non_exhaustive]
#[allow(missing_docs)]
#[allow(dead_code)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
enum BinaryProperty {
    Alnum = 44,
    Alphabetic = 0,
    AsciiHexDigit = 1,
    BidiControl = 2,
    BidiMirrored = 3,
    Blank = 45,
    Cased = 49,
    CaseIgnorable = 50,
    CaseSensitive = 34,
    ChangesWhenCasefolded = 54,
    ChangesWhenCasemapped = 55,
    ChangesWhenLowercased = 51,
    ChangesWhenNfkcCasefolded = 56,
    ChangesWhenTitlecased = 53,
    ChangesWhenUppercased = 52,
    Dash = 4,
    DefaultIgnorableCodePoint = 5,
    Deprecated = 6,
    Diacritic = 7,
    Emoji = 57,
    EmojiComponent = 61,
    EmojiModifier = 59,
    EmojiModifierBase = 60,
    EmojiPresentation = 58,
    ExtendedPictographic = 64,
    Extender = 8,
    FullCompositionExclusion = 9,
    Graph = 46,
    GraphemeBase = 10,
    GraphemeExtend = 11,
    GraphemeLink = 12,
    HexDigit = 13,
    Hyphen = 14,
    // UCHAR_ID_COMPAT_MATH_CONTINUE
    IdCompatMathContinue = 74,
    // UCHAR_ID_COMPAT_MATH_START
    IdCompatMathStart = 73,
    IdContinue = 15,
    Ideographic = 17,
    IdsBinaryOperator = 18,
    IdStart = 16,
    IdsTrinaryOperator = 19,
    // UCHAR_IDS_UNARY_OPERATOR
    IdsUnaryOperator = 72,
    JoinControl = 20,
    LogicalOrderException = 21,
    Lowercase = 22,
    Math = 23,
    // UCHAR_MODIFIER_COMBINING_MARK
    ModifierCombiningMark = 75,
    NfcInert = 39,
    NfdInert = 37,
    NfkcInert = 40,
    NfkdInert = 38,
    NoncharacterCodePoint = 24,
    PatternSyntax = 42,
    PatternWhiteSpace = 43,
    PrependedConcatenationMark = 63,
    Print = 47,
    QuotationMark = 25,
    Radical = 26,
    RegionalIndicator = 62,
    SegmentStarter = 41,
    SentenceTerminal = 35,
    SoftDotted = 27,
    TerminalPunctuation = 28,
    UnifiedIdeograph = 29,
    Uppercase = 30,
    VariationSelector = 36,
    WhiteSpace = 31,
    Xdigit = 48,
    XidContinue = 32,
    XidStart = 33,
}
/// Identifies a binary property of strings (as opposed to single code points).
///
/// Meant for callers that only learn at runtime which unicode property they need,
/// regex engines being the typical example.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending value order.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum StringBinaryProperty {
    BasicEmoji = 65,
    EmojiKeycapSequence = 66,
    RgiEmojiModifierSequence = 67,
    RgiEmojiFlagSequence = 68,
    RgiEmojiTagSequence = 69,
    RgiEmojiZWJSequence = 70,
    RgiEmoji = 71,
}
/// Identifies an enumerated Unicode property.
///
/// Meant for callers that only learn at runtime which unicode property they need,
/// regex engines being the typical example.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending value order.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum EnumeratedProperty {
    BidiClass = 0x1000,
    Block = 0x1001,
    CombiningClass = 0x1002,
    DecompositionType = 0x1003,
    EastAsianWidth = 0x1004,
    GeneralCategory = 0x1005,
    JoiningGroup = 0x1006,
    JoiningType = 0x1007,
    LineBreak = 0x1008,
    NumericType = 0x1009,
    Script = 0x100A,
    HangulSyllableType = 0x100B,
    NFDQuickCheck = 0x100C,
    NFKDQuickCheck = 0x100D,
    NFCQuickCheck = 0x100E,
    NFKCQuickCheck = 0x100F,
    LeadCanonicalCombiningClass = 0x1010,
    TrailCanonicalCombiningClass = 0x1011,
    GraphemeClusterBreak = 0x1012,
    SentenceBreak = 0x1013,
    WordBreak = 0x1014,
    BidiPairedBracketType = 0x1015,
    IndicPositionalCategory = 0x1016,
    IndicSyllabicCategory = 0x1017,
    VerticalOrientation = 0x1018,
    IndicConjunctBreak = 0x101A,
}
/// Identifies a Unicode mask property.
///
/// Meant for callers that only learn at runtime which unicode property they need,
/// regex engines being the typical example.
///
/// The values are intended to be identical to ICU4C's UProperty enum
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum MaskProperty {
    GeneralCategoryMask = 0x2000,
}
/// Identifies a numeric Unicode property.
///
/// Meant for callers that only learn at runtime which unicode property they need,
/// regex engines being the typical example.
///
/// The values are intended to be identical to ICU4C's UProperty enum
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum NumericProperty {
    NumericValue = 0x3000,
}
/// Identifies a Unicode string property.
///
/// Meant for callers that only learn at runtime which unicode property they need,
/// regex engines being the typical example.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending value order.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum StringProperty {
    Age = 0x4000,
    BidiMirroringGlyph = 0x4001,
    CaseFolding = 0x4002,
    ISOComment = 0x4003,
    LowercaseMapping = 0x4004,
    Name = 0x4005,
    SimpleCaseFolding = 0x4006,
    SimpleLowercaseMapping = 0x4007,
    SimpleTitlecaseMapping = 0x4008,
    SimpleUppercaseMapping = 0x4009,
    TitlecaseMapping = 0x400A,
    Unicode1Name = 0x400B,
    UppercaseMapping = 0x400C,
    BidiPairedBracket = 0x400D,
}
// Properties that do not fit any of the other runtime property enums in this module.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum MiscProperty {
    ScriptExtensions = 0x7000,
}
impl CodePointSetData {
/// Looks up the binary property named by `prop` and returns a type capable of looking up
/// values for it, as long as it is a [binary property listed in ECMA-262][ecma]. Names are
/// matched strictly against the names in the spec; `None` is returned for anything else.
///
/// This handles every property required by ECMA-262 `/u` regular expressions, except for:
///
/// - `Script` and `General_Category`: handle these directly using property values parsed via
///   [`PropertyParser<GeneralCategory>`] and [`PropertyParser<Script>`]
///   if necessary.
/// - `Script_Extensions`: handle this directly using APIs from [`crate::script::ScriptWithExtensions`]
/// - `General_Category` mask values: Handle this alongside `General_Category` using [`GeneralCategoryGroup`],
///   using property values parsed via [`PropertyParser<GeneralCategory>`] if necessary
/// - `Any`, `Assigned`, and `ASCII` pseudoproperties: Handle these using their equivalent sets:
///    - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
///    - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
///    - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
/// - `General_Category` property values can themselves be treated like properties using a shorthand in ECMA262,
///   simply create the corresponding `GeneralCategory` set.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// ```
/// use icu::properties::CodePointSetData;
///
/// let emoji = CodePointSetData::new_for_ecma262(b"Emoji")
///     .expect("is an ECMA-262 property");
///
/// assert!(emoji.contains('🔥')); // U+1F525 FIRE
/// assert!(!emoji.contains('V'));
/// ```
///
/// [ecma]: https://tc39.es/ecma262/#table-binary-unicode-properties
#[cfg(feature = "compiled_data")]
pub fn new_for_ecma262(prop: &[u8]) -> Option<crate::CodePointSetDataBorrowed<'static>> {
    use crate::props::*;

    // Expands to a `match` on `prop` with one arm per listed property type, in the
    // order given; each arm accepts either the long or the short name.
    macro_rules! compiled_set_for {
        ($($property:ident),+ $(,)?) => {
            match prop {
                $(
                    // Some properties have no short name, in which case the second
                    // alternative repeats the first.
                    #[allow(unreachable_patterns)]
                    $property::NAME | $property::SHORT_NAME => Self::new::<$property>(),
                )+
                // Not an ECMA-262 property
                _ => return None,
            }
        };
    }

    let set = compiled_set_for!(
        AsciiHexDigit,
        Alphabetic,
        BidiControl,
        BidiMirrored,
        CaseIgnorable,
        Cased,
        ChangesWhenCasefolded,
        ChangesWhenCasemapped,
        ChangesWhenLowercased,
        ChangesWhenNfkcCasefolded,
        ChangesWhenTitlecased,
        ChangesWhenUppercased,
        Dash,
        DefaultIgnorableCodePoint,
        Deprecated,
        Diacritic,
        Emoji,
        EmojiComponent,
        EmojiModifier,
        EmojiModifierBase,
        EmojiPresentation,
        ExtendedPictographic,
        Extender,
        GraphemeBase,
        GraphemeExtend,
        HexDigit,
        IdsBinaryOperator,
        IdsTrinaryOperator,
        IdContinue,
        IdStart,
        Ideographic,
        JoinControl,
        LogicalOrderException,
        Lowercase,
        Math,
        NoncharacterCodePoint,
        PatternSyntax,
        PatternWhiteSpace,
        QuotationMark,
        Radical,
        RegionalIndicator,
        SentenceTerminal,
        SoftDotted,
        TerminalPunctuation,
        UnifiedIdeograph,
        Uppercase,
        VariationSelector,
        WhiteSpace,
        XidContinue,
        XidStart,
    );
    Some(set)
}
// Invokes `icu_provider::gen_buffer_data_constructors!` for constructors taking
// `(prop: &[u8])` and returning `Option<Result<Self, DataError>>`. The compiled-data
// constructor `new_for_ecma262` is marked `skip` here.
// NOTE(review): presumably this generates `try_new_for_ecma262_with_buffer_provider`
// as a wrapper around `try_new_for_ecma262_unstable` — confirm against the macro docs.
icu_provider::gen_buffer_data_constructors!(
(prop: &[u8]) -> result: Option<Result<Self, DataError>>,
functions: [
new_for_ecma262: skip,
try_new_for_ecma262_with_buffer_provider,
try_new_for_ecma262_unstable,
Self,
]
);
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_ecma262)]
pub fn try_new_for_ecma262_unstable<P>(
    provider: &P,
    prop: &[u8],
) -> Option<Result<Self, DataError>>
where
    P: ?Sized
        + DataProvider<PropertyBinaryAsciiHexDigitV1>
        + DataProvider<PropertyBinaryAlphabeticV1>
        + DataProvider<PropertyBinaryBidiControlV1>
        + DataProvider<PropertyBinaryBidiMirroredV1>
        + DataProvider<PropertyBinaryCaseIgnorableV1>
        + DataProvider<PropertyBinaryCasedV1>
        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
        + DataProvider<PropertyBinaryDashV1>
        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
        + DataProvider<PropertyBinaryDeprecatedV1>
        + DataProvider<PropertyBinaryDiacriticV1>
        + DataProvider<PropertyBinaryEmojiV1>
        + DataProvider<PropertyBinaryEmojiComponentV1>
        + DataProvider<PropertyBinaryEmojiModifierV1>
        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
        + DataProvider<PropertyBinaryEmojiPresentationV1>
        + DataProvider<PropertyBinaryExtendedPictographicV1>
        + DataProvider<PropertyBinaryExtenderV1>
        + DataProvider<PropertyBinaryGraphemeBaseV1>
        + DataProvider<PropertyBinaryGraphemeExtendV1>
        + DataProvider<PropertyBinaryHexDigitV1>
        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
        + DataProvider<PropertyBinaryIdContinueV1>
        + DataProvider<PropertyBinaryIdStartV1>
        + DataProvider<PropertyBinaryIdeographicV1>
        + DataProvider<PropertyBinaryJoinControlV1>
        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
        + DataProvider<PropertyBinaryLowercaseV1>
        + DataProvider<PropertyBinaryMathV1>
        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
        + DataProvider<PropertyBinaryPatternSyntaxV1>
        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
        + DataProvider<PropertyBinaryQuotationMarkV1>
        + DataProvider<PropertyBinaryRadicalV1>
        + DataProvider<PropertyBinaryRegionalIndicatorV1>
        + DataProvider<PropertyBinarySentenceTerminalV1>
        + DataProvider<PropertyBinarySoftDottedV1>
        + DataProvider<PropertyBinaryTerminalPunctuationV1>
        + DataProvider<PropertyBinaryUnifiedIdeographV1>
        + DataProvider<PropertyBinaryUppercaseV1>
        + DataProvider<PropertyBinaryVariationSelectorV1>
        + DataProvider<PropertyBinaryWhiteSpaceV1>
        + DataProvider<PropertyBinaryXidContinueV1>
        + DataProvider<PropertyBinaryXidStartV1>,
{
    use crate::props::*;

    // Expands to a `match` on `prop` with one arm per listed property type, in the
    // order given; each arm accepts either the long or the short name and loads the
    // property's data from `provider`.
    macro_rules! load_set_for {
        ($($property:ident),+ $(,)?) => {
            match prop {
                $(
                    // Some properties have no short name, in which case the second
                    // alternative repeats the first.
                    #[allow(unreachable_patterns)]
                    $property::NAME | $property::SHORT_NAME => {
                        Self::try_new_unstable::<$property>(provider)
                    }
                )+
                // Not an ECMA-262 property
                _ => return None,
            }
        };
    }

    let loaded = load_set_for!(
        AsciiHexDigit,
        Alphabetic,
        BidiControl,
        BidiMirrored,
        CaseIgnorable,
        Cased,
        ChangesWhenCasefolded,
        ChangesWhenCasemapped,
        ChangesWhenLowercased,
        ChangesWhenNfkcCasefolded,
        ChangesWhenTitlecased,
        ChangesWhenUppercased,
        Dash,
        DefaultIgnorableCodePoint,
        Deprecated,
        Diacritic,
        Emoji,
        EmojiComponent,
        EmojiModifier,
        EmojiModifierBase,
        EmojiPresentation,
        ExtendedPictographic,
        Extender,
        GraphemeBase,
        GraphemeExtend,
        HexDigit,
        IdsBinaryOperator,
        IdsTrinaryOperator,
        IdContinue,
        IdStart,
        Ideographic,
        JoinControl,
        LogicalOrderException,
        Lowercase,
        Math,
        NoncharacterCodePoint,
        PatternSyntax,
        PatternWhiteSpace,
        QuotationMark,
        Radical,
        RegionalIndicator,
        SentenceTerminal,
        SoftDotted,
        TerminalPunctuation,
        UnifiedIdeograph,
        Uppercase,
        VariationSelector,
        WhiteSpace,
        XidContinue,
        XidStart,
    );
    Some(loaded)
}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Data and APIs for supporting Script_Extensions property
//! values in an efficient structure.
use crate::props::Script;
use crate::provider::*;
#[cfg(feature = "alloc")]
use core::iter::FromIterator;
use core::ops::RangeInclusive;
#[cfg(feature = "alloc")]
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_provider::prelude::*;
use zerovec::{ule::AsULE, ZeroSlice};
/// Width, in bits, of the low-order field of a `ScriptWithExt` value that holds
/// the `Script` value (or `extensions` index).
const SCRIPT_VAL_LENGTH: u16 = 10;
/// Mask that keeps only the low `SCRIPT_VAL_LENGTH` bits of a `ScriptWithExt` value,
/// i.e. the `Script` value (or `extensions` index).
const SCRIPT_X_SCRIPT_VAL: u16 = !(u16::MAX << SCRIPT_VAL_LENGTH);
/// An internal-use only pseudo-property that represents the values stored in
/// the trie of the special data structure [`ScriptWithExtensionsProperty`].
///
/// Note: This will assume a 12-bit layout. The 2 higher order bits in positions
/// 11..10 will indicate how to deduce the Script value and Script_Extensions,
/// and the lower 10 bits 9..0 indicate either the Script value or the index
/// into the `extensions` structure.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
#[repr(transparent)]
#[doc(hidden)]
// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
#[allow(clippy::exhaustive_structs)] // this type is stable
pub struct ScriptWithExt(pub u16);
#[allow(missing_docs)] // These constants don't need individual documentation.
#[allow(non_upper_case_globals)]
#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
impl ScriptWithExt {
pub const Unknown: ScriptWithExt = ScriptWithExt(0);
}
impl AsULE for ScriptWithExt {
type ULE = <u16 as AsULE>::ULE;
#[inline]
fn to_unaligned(self) -> Self::ULE {
Script(self.0).to_unaligned()
}
#[inline]
fn from_unaligned(unaligned: Self::ULE) -> Self {
ScriptWithExt(Script::from_unaligned(unaligned).0)
}
}
#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
impl ScriptWithExt {
    /// Reports whether this value carries the tag meaning "has
    /// Script_Extensions, and the Script value is [`Script::Common`]".
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::script::ScriptWithExt;
    ///
    /// assert!(ScriptWithExt(0x04FF).is_common());
    /// assert!(ScriptWithExt(0x0400).is_common());
    ///
    /// assert!(!ScriptWithExt(0x08FF).is_common());
    /// assert!(!ScriptWithExt(0x0800).is_common());
    ///
    /// assert!(!ScriptWithExt(0x0CFF).is_common());
    /// assert!(!ScriptWithExt(0x0C00).is_common());
    ///
    /// assert!(!ScriptWithExt(0xFF).is_common());
    /// assert!(!ScriptWithExt(0x0).is_common());
    /// ```
    pub fn is_common(&self) -> bool {
        matches!(self.0 >> SCRIPT_VAL_LENGTH, 1)
    }

    /// Reports whether this value carries the tag meaning "has
    /// Script_Extensions, and the Script value is [`Script::Inherited`]".
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::script::ScriptWithExt;
    ///
    /// assert!(!ScriptWithExt(0x04FF).is_inherited());
    /// assert!(!ScriptWithExt(0x0400).is_inherited());
    ///
    /// assert!(ScriptWithExt(0x08FF).is_inherited());
    /// assert!(ScriptWithExt(0x0800).is_inherited());
    ///
    /// assert!(!ScriptWithExt(0x0CFF).is_inherited());
    /// assert!(!ScriptWithExt(0x0C00).is_inherited());
    ///
    /// assert!(!ScriptWithExt(0xFF).is_inherited());
    /// assert!(!ScriptWithExt(0x0).is_inherited());
    /// ```
    pub fn is_inherited(&self) -> bool {
        matches!(self.0 >> SCRIPT_VAL_LENGTH, 2)
    }

    /// Reports whether this value carries the tag meaning "has
    /// Script_Extensions, and the Script value is neither [`Script::Common`]
    /// nor [`Script::Inherited`]".
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::script::ScriptWithExt;
    ///
    /// assert!(!ScriptWithExt(0x04FF).is_other());
    /// assert!(!ScriptWithExt(0x0400).is_other());
    ///
    /// assert!(!ScriptWithExt(0x08FF).is_other());
    /// assert!(!ScriptWithExt(0x0800).is_other());
    ///
    /// assert!(ScriptWithExt(0x0CFF).is_other());
    /// assert!(ScriptWithExt(0x0C00).is_other());
    ///
    /// assert!(!ScriptWithExt(0xFF).is_other());
    /// assert!(!ScriptWithExt(0x0).is_other());
    /// ```
    pub fn is_other(&self) -> bool {
        matches!(self.0 >> SCRIPT_VAL_LENGTH, 3)
    }

    /// Reports whether any bit above the low `SCRIPT_VAL_LENGTH` bits is set,
    /// which marks the value as having Script_Extensions.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::script::ScriptWithExt;
    ///
    /// assert!(ScriptWithExt(0x04FF).has_extensions());
    /// assert!(ScriptWithExt(0x0400).has_extensions());
    ///
    /// assert!(ScriptWithExt(0x08FF).has_extensions());
    /// assert!(ScriptWithExt(0x0800).has_extensions());
    ///
    /// assert!(ScriptWithExt(0x0CFF).has_extensions());
    /// assert!(ScriptWithExt(0x0C00).has_extensions());
    ///
    /// assert!(!ScriptWithExt(0xFF).has_extensions());
    /// assert!(!ScriptWithExt(0x0).has_extensions());
    /// ```
    pub fn has_extensions(&self) -> bool {
        self.0 >> SCRIPT_VAL_LENGTH != 0
    }
}
impl From<ScriptWithExt> for u32 {
fn from(swe: ScriptWithExt) -> Self {
swe.0 as u32
}
}
impl From<ScriptWithExt> for Script {
fn from(swe: ScriptWithExt) -> Self {
Script(swe.0)
}
}
/// A struct that wraps a borrowed [`Script`] array, such as in the return value for
/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
///
/// The set can be queried for membership, iterated, or accessed by index.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct ScriptExtensionsSet<'a> {
// The scripts in this set. `contains()` binary-searches this slice, so it is
// presumably stored in sorted order -- TODO confirm against the data generator.
values: &'a ZeroSlice<Script>,
}
impl<'a> ScriptExtensionsSet<'a> {
/// Returns whether this set contains the given script.
///
/// # Example
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
/// let swe = ScriptWithExtensions::new();
///
/// assert!(swe
/// .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA
/// .contains(&Script::Grantha));
/// ```
pub fn contains(&self, x: &Script) -> bool {
ZeroSlice::binary_search(self.values, x).is_ok()
}
/// Gets an iterator over the elements.
///
/// # Example
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
/// let swe = ScriptWithExtensions::new();
///
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
/// ```
pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a {
ZeroSlice::iter(self.values)
}
/// For accessing this set as an array instead of an iterator
#[doc(hidden)] // used by FFI code
pub fn array_len(&self) -> usize {
self.values.len()
}
/// For accessing this set as an array instead of an iterator
#[doc(hidden)] // used by FFI code
pub fn array_get(&self, index: usize) -> Option<Script> {
self.values.get(index)
}
}
/// A struct that represents the data for the Script and Script_Extensions properties.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`]
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
/// let swe = ScriptWithExtensions::new();
///
/// // get the `Script` property value
/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL
/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA
/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
///
/// // get the `Script_Extensions` property value
/// assert_eq!(
/// swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL
/// .iter().collect::<Vec<_>>(),
/// [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
/// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
/// Script::OldUyghur]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
/// .iter().collect::<Vec<_>>(),
/// [Script::Common]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
/// .iter().collect::<Vec<_>>(),
/// [Script::Inherited]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter().collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
///
/// // check containment of a `Script` value in the `Script_Extensions` value
/// // U+0650 ARABIC KASRA
/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
/// assert!(swe.has_script('\u{0650}', Script::Arabic));
/// assert!(swe.has_script('\u{0650}', Script::Syriac));
/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
///
/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
/// let syriac = swe.get_script_extensions_set(Script::Syriac);
/// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA
/// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO
/// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
/// ```
#[derive(Debug)]
pub struct ScriptWithExtensions {
// Data payload for the `PropertyScriptWithExtensionsV1` marker. `as_borrowed()`
// hands out a reference to its contents, which is what the query methods read.
data: DataPayload<PropertyScriptWithExtensionsV1>,
}
/// A borrowed wrapper around script extension data, returned by
/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
///
/// The type is `Copy`, which is why its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct ScriptWithExtensionsBorrowed<'a> {
// Borrowed property data; the query methods read its `trie` and `extensions` fields.
data: &'a ScriptWithExtensionsProperty<'a>,
}
impl ScriptWithExtensions {
/// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> ScriptWithExtensionsBorrowed<'static> {
ScriptWithExtensionsBorrowed::new()
}
icu_provider::gen_buffer_data_constructors!(
() -> result: Result<ScriptWithExtensions, DataError>,
functions: [
new: skip,
try_new_with_buffer_provider,
try_new_unstable,
Self,
]
);
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<PropertyScriptWithExtensionsV1> + ?Sized),
) -> Result<Self, DataError> {
Ok(ScriptWithExtensions::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
ScriptWithExtensionsBorrowed {
data: self.data.get(),
}
}
/// Construct a new one from loaded data
///
/// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead
pub(crate) fn from_data(data: DataPayload<PropertyScriptWithExtensionsV1>) -> Self {
Self { data }
}
}
impl<'a> ScriptWithExtensionsBorrowed<'a> {
/// Returns the `Script` property value for this code point.
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// // U+0640 ARABIC TATWEEL
/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value
/// assert_ne!(swe.get_script_val('ـ'), Script::Arabic);
/// assert_ne!(swe.get_script_val('ـ'), Script::Syriac);
/// assert_ne!(swe.get_script_val('ـ'), Script::Thaana);
///
/// // U+0650 ARABIC KASRA
/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic);
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac);
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana);
///
/// // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert_ne!(swe.get_script_val('٠'), Script::Common);
/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value
/// assert_ne!(swe.get_script_val('٠'), Script::Syriac);
/// assert_ne!(swe.get_script_val('٠'), Script::Thaana);
///
/// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Common);
/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac);
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana);
/// ```
pub fn get_script_val(self, ch: char) -> Script {
    // A `char` converts losslessly to its `u32` scalar value.
    let code_point = u32::from(ch);
    self.get_script_val32(code_point)
}
/// See [`Self::get_script_val`].
///
/// This variant takes the code point as a raw `u32`.
pub fn get_script_val32(self, code_point: u32) -> Script {
    let trie_value = self.data.trie.get32(code_point);
    // The three tagged cases are mutually exclusive, so they can be tested in any order.
    if trie_value.is_common() {
        return Script::Common;
    }
    if trie_value.is_inherited() {
        return Script::Inherited;
    }
    if !trie_value.is_other() {
        // No recognized tag: the trie value is the Script value itself.
        return Script(trie_value.0);
    }
    // "Other" case: the low bits index into `extensions`, and the Script value
    // is the first element of that entry. Fall back to `Unknown` if the entry
    // is missing or empty.
    let ext_idx = usize::from(trie_value.0 & SCRIPT_X_SCRIPT_VAL);
    self.data
        .extensions
        .get(ext_idx)
        .and_then(|scripts| scripts.get(0))
        .unwrap_or(Script::Unknown)
}
// Resolves the Script_Extensions value from a trie value the caller has
// already looked up.
// Shared by `get_script_extensions_val32`, `get_script_extensions_ranges` and
// `has_script32` so that the decoding logic lives in one place.
fn get_scx_val_using_trie_val(
    self,
    sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
) -> &'a ZeroSlice<Script> {
    let trie_value = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
    let is_other = trie_value.is_other();
    if !(is_other || trie_value.is_common() || trie_value.is_inherited()) {
        // Note: `Script` and `ScriptWithExt` are both represented as the same
        // u16 value when the `ScriptWithExt` has no higher-order bits set, so
        // the trie value itself can be viewed as a one-element script slice.
        return ZeroSlice::from_ule_slice(core::slice::from_ref(sc_with_ext_ule));
    }
    // All tagged cases use the low bits as an index into `extensions`.
    let ext_idx = usize::from(trie_value.0 & SCRIPT_X_SCRIPT_VAL);
    let stored = self.data.extensions.get(ext_idx);
    if is_other {
        // In the OTHER case the tag bits do not encode the Script value, so the
        // Script value is stored in the first position of the `extensions`
        // entry. Drop it to return the actual scx array value. A missing entry
        // yields an empty slice.
        let scx_only = stored
            .and_then(|entry| entry.as_ule_slice().get(1..))
            .unwrap_or_default();
        ZeroSlice::from_ule_slice(scx_only)
    } else {
        // Common / Inherited: the entry is the scx value as stored.
        stored.unwrap_or_default()
    }
}
/// Return the `Script_Extensions` property value for this code point.
///
/// If `code_point` has Script_Extensions, then return the Script codes in
/// the Script_Extensions. In this case, the Script property value
/// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`].
///
/// If the code point does not have Script_Extensions, then its one Script code
/// is the only element of the returned [`ScriptExtensionsSet`].
///
/// If the input is not a valid code point, then return an empty [`ScriptExtensionsSet`].
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// assert_eq!(
/// swe.get_script_extensions_val('𐓐') // U+104D0 OSAGE CAPITAL LETTER KHA
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Osage]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Common]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Inherited]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
/// ```
pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> {
    // A `char` converts losslessly to its `u32` scalar value.
    let code_point = u32::from(ch);
    self.get_script_extensions_val32(code_point)
}
/// See [`Self::get_script_extensions_val`].
///
/// This variant takes the code point as a raw `u32`.
pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> {
    let values = if let Some(ule_ref) = self.data.trie.get32_ule(code_point) {
        self.get_scx_val_using_trie_val(ule_ref)
    } else {
        // No trie entry for this input: report an empty set.
        ZeroSlice::from_ule_slice(&[])
    };
    ScriptExtensionsSet { values }
}
/// Returns whether `script` is contained in the Script_Extensions
/// property value if the code_point has Script_Extensions, otherwise
/// if the code point does not have Script_Extensions then returns
/// whether the Script property value matches.
///
/// Some characters are commonly used in multiple scripts. For more information,
/// see UAX #24: <http://www.unicode.org/reports/tr24/>.
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// // U+0650 ARABIC KASRA
/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
/// assert!(swe.has_script('\u{0650}', Script::Arabic));
/// assert!(swe.has_script('\u{0650}', Script::Syriac));
/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
///
/// // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert!(!swe.has_script('٠', Script::Common));
/// assert!(swe.has_script('٠', Script::Arabic)); // main Script value
/// assert!(!swe.has_script('٠', Script::Syriac));
/// assert!(swe.has_script('٠', Script::Thaana));
///
/// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert!(!swe.has_script('ﷲ', Script::Common));
/// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value
/// assert!(!swe.has_script('ﷲ', Script::Syriac));
/// assert!(swe.has_script('ﷲ', Script::Thaana));
/// ```
pub fn has_script(self, ch: char, script: Script) -> bool {
    // A `char` converts losslessly to its `u32` scalar value.
    let code_point = u32::from(ch);
    self.has_script32(code_point, script)
}
/// See [`Self::has_script`].
///
/// This variant takes the code point as a raw `u32`. It returns `false` when
/// the trie has no entry for the input.
pub fn has_script32(self, code_point: u32, script: Script) -> bool {
    let Some(sc_with_ext_ule) = self.data.trie.get32_ule(code_point) else {
        return false;
    };
    let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
    if sc_with_ext.has_extensions() {
        // With Script_Extensions, only the extensions decide membership.
        self.get_scx_val_using_trie_val(sc_with_ext_ule)
            .iter()
            .any(|sc| sc == script)
    } else {
        // Without Script_Extensions, the trie value is the Script value itself.
        script == Script(sc_with_ext.0)
    }
}
/// Returns all of the code point ranges (as `RangeInclusive<u32>`) for the given [`Script`]
/// in which `has_script` will return true for all of the contained code points.
///
/// # Examples
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
///
/// let swe = ScriptWithExtensions::new();
///
/// let syriac_script_extensions_ranges =
/// swe.get_script_extensions_ranges(Script::Syriac);
///
/// let exp_ranges = [
/// 0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON
/// 0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS
/// 0x030A..=0x030A, // COMBINING RING ABOVE
/// 0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW
/// 0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW
/// 0x0330..=0x0331, // COMBINING TILDE BELOW..COMBINING MACRON BELOW
/// 0x060C..=0x060C, // ARABIC COMMA
/// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
/// 0x061F..=0x061F, // ARABIC QUESTION MARK
/// 0x0640..=0x0640, // ARABIC TATWEEL
/// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
/// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
/// 0x0700..=0x070D, // Syriac block begins at U+0700
/// 0x070F..=0x074A, // Syriac block
/// 0x074D..=0x074F, // Syriac block ends at U+074F
/// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
/// 0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT
/// 0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT
/// ];
///
/// assert_eq!(
/// syriac_script_extensions_ranges.collect::<Vec<_>>(),
/// exp_ranges
/// );
/// ```
pub fn get_script_extensions_ranges(
self,
script: Script,
) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.data
.trie
.iter_ranges_mapped(move |value| {
let sc_with_ext = ScriptWithExt(value.0);
if sc_with_ext.has_extensions() {
self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
.iter()
.any(|sc| sc == script)
} else {
script == sc_with_ext.into()
}
})
.filter(|v| v.value)
.map(|v| v.range)
}
/// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
/// code points for which `has_script` will return true.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// let syriac = swe.get_script_extensions_set(Script::Syriac);
///
/// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK
/// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK
/// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH
///
/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
/// assert!(!syriac.contains('\u{074B}')); // unassigned
/// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE
/// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
///
/// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT
/// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW
/// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT
/// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK
/// ```
#[cfg(feature = "alloc")]
pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
    // Build the inversion list from the same ranges that
    // `get_script_extensions_ranges` reports.
    let ranges = self.get_script_extensions_ranges(script);
    CodePointInversionList::from_iter(ranges)
}
}
#[cfg(feature = "compiled_data")]
impl Default for ScriptWithExtensionsBorrowed<'static> {
    /// Equivalent to [`ScriptWithExtensionsBorrowed::new`], i.e. uses compiled data.
    fn default() -> Self {
        ScriptWithExtensionsBorrowed::new()
    }
}
impl ScriptWithExtensionsBorrowed<'static> {
    /// Returns a `ScriptWithExtensionsBorrowed` that points at the compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        let data = crate::provider::Baked::SINGLETON_PROPERTY_SCRIPT_WITH_EXTENSIONS_V1;
        Self { data }
    }

    /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`].
    ///
    /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some
    /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`].
    pub const fn static_to_owned(self) -> ScriptWithExtensions {
        // Pull out the `'static` reference and wrap it in a payload.
        let Self { data } = self;
        ScriptWithExtensions {
            data: DataPayload::from_static_ref(data),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for https://github.com/unicode-org/icu4x/issues/6041
    ///
    /// Pins the exact Script_Extensions reported for U+02BC.
    #[test]
    fn test_scx_regression_6041() {
        let expected = [
            Script::Bengali,
            Script::Cyrillic,
            Script::Devanagari,
            Script::Latin,
            Script::Thai,
            Script::Lisu,
            Script::Toto,
        ];
        let actual: Vec<Script> = ScriptWithExtensions::new()
            .get_script_extensions_val('\u{2bc}')
            .iter()
            .collect();
        assert_eq!(actual, expected);
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::bidi::BidiMirroringGlyph;
use crate::props::{
BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup,
GraphemeClusterBreak, HangulSyllableType, IndicConjunctBreak, IndicSyllabicCategory,
JoiningType, LineBreak, Script, SentenceBreak, VerticalOrientation, WordBreak,
};
use crate::script::ScriptWithExt;
use core::convert::TryInto;
use core::num::TryFromIntError;
use zerovec::ule::{AsULE, RawBytesULE};
use icu_collections::codepointtrie::TrieValue;
use core::convert::TryFrom;
/// `CanonicalCombiningClass` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for CanonicalCombiningClass {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `BidiClass` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for BidiClass {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `GeneralCategory` is stored in a trie as the `u32` widening of its `u8` discriminant.
impl TrieValue for GeneralCategory {
    type TryFromU32Error = &'static str;

    /// Parses a trie value; fails if it is not a known `GeneralCategory` value.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        // Values that do not fit in a u8 are mapped to u8::MAX, which is out of
        // range of the GeneralCategory enum and so is rejected below.
        let byte: u8 = i.try_into().unwrap_or(u8::MAX);
        match GeneralCategory::new_from_u8(byte) {
            Some(category) => Ok(category),
            None => Err("Cannot parse GeneralCategory from integer"),
        }
    }

    /// Widens the `u8` discriminant losslessly.
    fn to_u32(self) -> u32 {
        let discriminant = self as u8;
        u32::from(discriminant)
    }
}
impl TrieValue for Script {
type TryFromU32Error = TryFromIntError;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
u16::try_from(i).map(Script)
}
fn to_u32(self) -> u32 {
u32::from(self.0)
}
}
/// `HangulSyllableType` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for HangulSyllableType {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `ScriptWithExt` is stored in a trie as the `u32` widening of its `u16` value.
impl TrieValue for ScriptWithExt {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u16`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u16::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u16` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `EastAsianWidth` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for EastAsianWidth {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `LineBreak` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for LineBreak {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `GraphemeClusterBreak` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for GraphemeClusterBreak {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `WordBreak` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for WordBreak {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `SentenceBreak` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for SentenceBreak {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `IndicConjunctBreak` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for IndicConjunctBreak {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `IndicSyllabicCategory` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for IndicSyllabicCategory {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
/// `VerticalOrientation` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for VerticalOrientation {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
// GCG is not used inside tries, but it is used in the name lookup type, and we want
// to squeeze it into a u16 for storage. Its named mask values are specced so we can
// do this in code.
//
// This is done by:
// - Single-value masks are translated to their corresponding GeneralCategory values
// - we know all of the multi-value masks and we give them special values
// - Anything else goes to 0xFF00, though this code path shouldn't be hit unless working with malformed icuexportdata
//
// In the reverse direction, unknown values go to the empty mask, but this codepath should not be hit except
// with malformed ICU4X generated data.
impl AsULE for GeneralCategoryGroup {
    type ULE = RawBytesULE<2>;

    /// Packs the group into a `u16` and stores that in unaligned form.
    fn to_unaligned(self) -> Self::ULE {
        gcg_to_packed_u16(self).to_unaligned()
    }

    /// Reads the packed `u16` back and expands it into a group.
    fn from_unaligned(ule: Self::ULE) -> Self {
        packed_u16_to_gcg(ule.as_unsigned_int())
    }
}
/// Expands a packed `u16` into a `GeneralCategoryGroup`.
///
/// The values `0xFFF8..=0xFFFF` name the multi-value groups, values below 32
/// are single `GeneralCategory` values, and anything else (including values
/// below 32 that are not a valid `GeneralCategory`) produces the empty mask.
fn packed_u16_to_gcg(value: u16) -> GeneralCategoryGroup {
    const EMPTY: GeneralCategoryGroup = GeneralCategoryGroup(0);
    match value {
        0xFFF8 => GeneralCategoryGroup::Symbol,
        0xFFF9 => GeneralCategoryGroup::Punctuation,
        0xFFFA => GeneralCategoryGroup::Other,
        0xFFFB => GeneralCategoryGroup::Separator,
        0xFFFC => GeneralCategoryGroup::Number,
        0xFFFD => GeneralCategoryGroup::Mark,
        0xFFFE => GeneralCategoryGroup::Letter,
        0xFFFF => GeneralCategoryGroup::CasedLetter,
        single @ 0..=31 => match GeneralCategory::new_from_u8(single as u8) {
            Some(category) => category.into(),
            None => EMPTY,
        },
        // unknown values produce an empty mask
        _ => EMPTY,
    }
}
/// Packs a `GeneralCategoryGroup` into a `u16`.
///
/// A mask with exactly one bit set becomes the index of that bit. The named
/// multi-value groups get the values `0xFFF8..=0xFFFF`. Any other mask becomes
/// the sentinel `0xFF00`.
fn gcg_to_packed_u16(gcg: GeneralCategoryGroup) -> u16 {
    let mask = gcg.0;
    if mask.is_power_of_two() {
        // Single property: the bit index is the inverse of the bitshift that built the mask.
        return mask.trailing_zeros() as u16;
    }
    match gcg {
        GeneralCategoryGroup::CasedLetter => 0xFFFF,
        GeneralCategoryGroup::Letter => 0xFFFE,
        GeneralCategoryGroup::Mark => 0xFFFD,
        GeneralCategoryGroup::Number => 0xFFFC,
        GeneralCategoryGroup::Separator => 0xFFFB,
        GeneralCategoryGroup::Other => 0xFFFA,
        GeneralCategoryGroup::Punctuation => 0xFFF9,
        GeneralCategoryGroup::Symbol => 0xFFF8,
        _ => 0xFF00, // random sentinel value
    }
}
impl TrieValue for GeneralCategoryGroup {
type TryFromU32Error = TryFromIntError;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
// Even though we're dealing with u32s here, TrieValue is about converting
// trie storage types to the actual type. This type will always be a packed u16
// in our case since the names map upcasts from u16
u16::try_from(i).map(packed_u16_to_gcg)
}
fn to_u32(self) -> u32 {
u32::from(gcg_to_packed_u16(self))
}
}
impl TrieValue for BidiMirroringGlyph {
type TryFromU32Error = u32;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
let code_point = i & 0x1FFFFF;
let mirroring_glyph = if code_point == 0 {
None
} else {
Some(char::try_from_u32(code_point).map_err(|_| i)?)
};
let mirrored = ((i >> 21) & 0x1) == 1;
let paired_bracket_type = {
let value = ((i >> 22) & 0x3) as u8;
match value {
0 => crate::bidi::BidiPairedBracketType::None,
1 => crate::bidi::BidiPairedBracketType::Open,
2 => crate::bidi::BidiPairedBracketType::Close,
_ => return Err(i),
}
};
Ok(Self {
mirrored,
mirroring_glyph,
paired_bracket_type,
})
}
fn to_u32(self) -> u32 {
self.mirroring_glyph.unwrap_or_default() as u32
| ((self.mirrored as u32) << 21)
| (match self.paired_bracket_type {
crate::bidi::BidiPairedBracketType::None => 0,
crate::bidi::BidiPairedBracketType::Open => 1,
crate::bidi::BidiPairedBracketType::Close => 2,
} << 22)
}
}
/// `JoiningType` is stored in a trie as the `u32` widening of its `u8` value.
impl TrieValue for JoiningType {
    type TryFromU32Error = TryFromIntError;

    /// Narrows a trie value to `u8`; fails if it does not fit.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let narrowed = u8::try_from(i)?;
        Ok(Self(narrowed))
    }

    /// Widens the wrapped `u8` losslessly.
    fn to_u32(self) -> u32 {
        let Self(raw) = self;
        raw.into()
    }
}
-6
{
"git": {
"sha1": "38a49da495248dd1ded84cf306e4ca42e64d5bb3"
},
"path_in_vcs": "components/properties"
}
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cobs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
dependencies = [
"thiserror",
]
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"databake-derive",
"proc-macro2",
"quote",
]
[[package]]
name = "databake-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "erased-serde"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b"
dependencies = [
"serde",
"serde_core",
"typeid",
]
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"databake",
"displaydoc",
"potential_utf",
"serde",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_properties"
version = "2.1.1"
dependencies = [
"databake",
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"serde",
"unicode-bidi",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"databake",
"displaydoc",
"erased-serde",
"icu_locale_core",
"postcard",
"serde",
"stable_deref_trait",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
dependencies = [
"serde_core",
]
[[package]]
name = "postcard"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
dependencies = [
"cobs",
"serde",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"serde_core",
"zerovec",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thiserror"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"serde_core",
"zerovec",
]
[[package]]
name = "typeid"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
[[package]]
name = "unicode-bidi"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde_core",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"databake",
"serde",
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.83"
name = "icu_properties"
version = "2.1.1"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Definitions for Unicode properties"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[features]
alloc = [
"zerovec/alloc",
"icu_collections/alloc",
"serde?/alloc",
]
compiled_data = [
"dep:icu_properties_data",
"icu_provider/baked",
]
datagen = [
"serde",
"dep:databake",
"zerovec/databake",
"icu_collections/databake",
"icu_locale_core/databake",
"zerotrie/databake",
"icu_provider/export",
]
default = ["compiled_data"]
serde = [
"dep:serde",
"icu_locale_core/serde",
"zerovec/serde",
"icu_collections/serde",
"icu_provider/serde",
"zerotrie/serde",
]
unicode_bidi = ["dep:unicode-bidi"]
[lib]
name = "icu_properties"
path = "src/lib.rs"
[dependencies.databake]
version = "0.2.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.icu_collections]
version = "~2.1.1"
default-features = false
[dependencies.icu_locale_core]
version = "2.1.1"
features = ["zerovec"]
default-features = false
[dependencies.icu_properties_data]
version = "~2.1.1"
optional = true
default-features = false
[dependencies.icu_provider]
version = "2.1.1"
default-features = false
[dependencies.serde]
version = "1.0.220"
features = ["derive"]
optional = true
default-features = false
[dependencies.unicode-bidi]
version = "0.3.11"
optional = true
default-features = false
[dependencies.zerotrie]
version = "0.2.0"
features = [
"yoke",
"zerofrom",
]
default-features = false
[dependencies.zerovec]
version = "0.11.3"
features = [
"derive",
"yoke",
]
default-features = false
[dev-dependencies]

Sorry, the diff of this file is not supported yet

UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
# icu_properties [![crates.io](https://img.shields.io/crates/v/icu_properties)](https://crates.io/crates/icu_properties)
<!-- cargo-rdme start -->
Definitions of [Unicode Properties] and APIs for
retrieving property data in an appropriate data structure.
This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
APIs that return a `CodePointSetData` exist for binary properties and certain enumerated
properties.
APIs that return a `CodePointMapData` exist for certain enumerated properties.
## Examples
### Property data as `CodePointSetData`s
```rust
use icu::properties::{CodePointSetData, CodePointMapData};
use icu::properties::props::{GeneralCategory, Emoji};
// A binary property as a `CodePointSetData`
assert!(CodePointSetData::new::<Emoji>().contains('🎃')); // U+1F383 JACK-O-LANTERN
assert!(!CodePointSetData::new::<Emoji>().contains('木')); // U+6728
// An individual enumerated property value as a `CodePointSetData`
let line_sep_data = CodePointMapData::<GeneralCategory>::new()
.get_set_for_value(GeneralCategory::LineSeparator);
let line_sep = line_sep_data.as_borrowed();
assert!(line_sep.contains('\u{2028}'));
assert!(!line_sep.contains('\u{2029}'));
```
### Property data as `CodePointMapData`s
```rust
use icu::properties::CodePointMapData;
use icu::properties::props::Script;
assert_eq!(CodePointMapData::<Script>::new().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
assert_eq!(CodePointMapData::<Script>::new().get('木'), Script::Han); // U+6728
```
[`ICU4X`]: ../icu/index.html
[Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{props::EnumeratedProperty, provider::PropertyEnumBidiMirroringGlyphV1};
use icu_collections::codepointtrie::TrieValue;
use zerovec::ule::{AsULE, RawBytesULE};
/// This is a bitpacked combination of the `Bidi_Mirroring_Glyph`,
/// `Bidi_Mirrored`, and `Bidi_Paired_Bracket_Type` properties.
///
/// The three properties are exposed as separate fields; the packing only
/// concerns how a value is stored (see the `TrieValue` and `AsULE` impls).
#[derive(Debug, Eq, PartialEq, Clone, Copy, Default)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::props))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[allow(clippy::exhaustive_structs)] // needed for baked construction
pub struct BidiMirroringGlyph {
/// The mirroring glyph (`Bidi_Mirroring_Glyph`), if the code point has one
pub mirroring_glyph: Option<char>,
/// Whether the glyph is mirrored (`Bidi_Mirrored`)
pub mirrored: bool,
/// The paired bracket type (`Bidi_Paired_Bracket_Type`)
pub paired_bracket_type: BidiPairedBracketType,
}
// Registers `BidiMirroringGlyph` as an enumerated property: its data marker,
// its compiled-data singleton, and its property names.
impl EnumeratedProperty for BidiMirroringGlyph {
type DataMarker = PropertyEnumBidiMirroringGlyphV1;
// The baked singleton only exists when compiled data is built in.
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static crate::provider::PropertyCodePointMap<'static, Self> =
crate::provider::Baked::SINGLETON_PROPERTY_ENUM_BIDI_MIRRORING_GLYPH_V1;
const NAME: &'static [u8] = b"Bidi_Mirroring_Glyph";
// The short name is the same byte string as the long name.
const SHORT_NAME: &'static [u8] = b"Bidi_Mirroring_Glyph";
}
// Marker impl of the crate-private `Sealed` trait.
// NOTE(review): presumably what allows this type to implement the sealed
// `EnumeratedProperty` trait — confirm against the trait definition.
impl crate::private::Sealed for BidiMirroringGlyph {}
impl AsULE for BidiMirroringGlyph {
type ULE = zerovec::ule::RawBytesULE<3>;
fn to_unaligned(self) -> Self::ULE {
let [a, b, c, _] = TrieValue::to_u32(self).to_le_bytes();
RawBytesULE([a, b, c])
}
fn from_unaligned(unaligned: Self::ULE) -> Self {
let [a, b, c] = unaligned.0;
TrieValue::try_from_u32(u32::from_le_bytes([a, b, c, 0])).unwrap_or_default()
}
}
/// The enum represents Bidi_Paired_Bracket_Type.
///
/// It does not implement [`EnumeratedProperty`], instead it can be obtained
/// through the bitpacked [`BidiMirroringGlyph`] property.
///
/// If you have a use case for this property without also needing the
/// [`BidiMirroringGlyph`] property, and need to optimize data size, please
/// file an issue.
#[derive(Debug, Eq, PartialEq, Copy, Clone, Default)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::props))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum BidiPairedBracketType {
/// Represents Bidi_Paired_Bracket_Type=Open.
Open,
/// Represents Bidi_Paired_Bracket_Type=Close.
Close,
/// Represents Bidi_Paired_Bracket_Type=None.
///
/// This is the value returned by `Default`.
#[default]
None,
}
/// Lets a [`CodePointMapDataBorrowed<BidiClass>`](crate::CodePointMapDataBorrowed)
/// act as a [`unicode_bidi::BidiDataSource`], so the `unicode_bidi` crate can
/// run on ICU4X property data.
///
/// ✨ *Enabled with the `unicode_bidi` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::properties::CodePointMapData;
/// use icu::properties::props::BidiClass;
/// use unicode_bidi::BidiInfo;
///
/// // This example text is defined using `concat!` because some browsers
/// // and text editors have trouble displaying bidi strings.
/// let text = concat![
///     "א", // RTL#1
///     "ב", // RTL#2
///     "ג", // RTL#3
///     "a", // LTR#1
///     "b", // LTR#2
///     "c", // LTR#3
/// ];
///
/// let bidi_map = CodePointMapData::<BidiClass>::new();
///
/// // Resolve embedding levels within the text. Pass `None` to detect the
/// // paragraph level automatically.
/// let bidi_info = BidiInfo::new_with_data_source(&bidi_map, text, None);
///
/// // This paragraph has embedding level 1 because its first strong character is RTL.
/// assert_eq!(bidi_info.paragraphs.len(), 1);
/// let para = &bidi_info.paragraphs[0];
/// assert_eq!(para.level.number(), 1);
/// assert!(para.level.is_rtl());
///
/// // Re-ordering is done after wrapping each paragraph into a sequence of
/// // lines. For this example, I'll just use a single line that spans the
/// // entire paragraph.
/// let line = para.range.clone();
///
/// let display = bidi_info.reorder_line(para, line);
/// assert_eq!(
///     display,
///     concat![
///         "a", // LTR#1
///         "b", // LTR#2
///         "c", // LTR#3
///         "ג", // RTL#3
///         "ב", // RTL#2
///         "א", // RTL#1
///     ]
/// );
/// ```
#[cfg(feature = "unicode_bidi")]
impl unicode_bidi::data_source::BidiDataSource
    for crate::CodePointMapDataBorrowed<'_, crate::props::BidiClass>
{
    /// Looks up the ICU4X `Bidi_Class` of `c` and translates it to the
    /// matching `unicode_bidi` class.
    fn bidi_class(&self, c: char) -> unicode_bidi::BidiClass {
        // Two types share the name `BidiClass`; alias both to keep the table readable.
        use crate::props::BidiClass as Icu;
        use unicode_bidi::BidiClass as Ub;

        match self.get(c) {
            Icu::LeftToRight => Ub::L,
            Icu::RightToLeft => Ub::R,
            Icu::EuropeanNumber => Ub::EN,
            Icu::EuropeanSeparator => Ub::ES,
            Icu::EuropeanTerminator => Ub::ET,
            Icu::ArabicNumber => Ub::AN,
            Icu::CommonSeparator => Ub::CS,
            Icu::ParagraphSeparator => Ub::B,
            Icu::SegmentSeparator => Ub::S,
            Icu::WhiteSpace => Ub::WS,
            Icu::OtherNeutral => Ub::ON,
            Icu::LeftToRightEmbedding => Ub::LRE,
            Icu::LeftToRightOverride => Ub::LRO,
            Icu::ArabicLetter => Ub::AL,
            Icu::RightToLeftEmbedding => Ub::RLE,
            Icu::RightToLeftOverride => Ub::RLO,
            Icu::PopDirectionalFormat => Ub::PDF,
            Icu::NonspacingMark => Ub::NSM,
            Icu::BoundaryNeutral => Ub::BN,
            Icu::FirstStrongIsolate => Ub::FSI,
            Icu::LeftToRightIsolate => Ub::LRI,
            Icu::RightToLeftIsolate => Ub::RLI,
            Icu::PopDirectionalIsolate => Ub::PDI,
            // This must not happen; any unlisted value is treated as Other Neutral.
            _ => Ub::ON,
        }
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use crate::code_point_set::CodePointSetData;
use crate::props::GeneralCategory;
use crate::props::GeneralCategoryGroup;
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A wrapper around code point map data.
///
/// It is returned by APIs that return Unicode
/// property data in a map-like form, ex: enumerated property value data keyed
/// by code point. Access its data via the borrowed version,
/// [`CodePointMapDataBorrowed`], obtained with [`CodePointMapData::as_borrowed()`].
#[derive(Debug, Clone)]
pub struct CodePointMapData<T: TrieValue> {
// Type-erased payload holding the property's code point map.
data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
}
impl<T: TrieValue> CodePointMapData<T> {
/// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
///
/// See the documentation on [`EnumeratedProperty`] implementations for details.
///
/// Note that this returns the borrowed form, [`CodePointMapDataBorrowed`],
/// backed by compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub const fn new() -> CodePointMapDataBorrowed<'static, T>
where
T: EnumeratedProperty,
{
CodePointMapDataBorrowed::new()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<T::DataMarker> + ?Sized),
) -> Result<Self, DataError>
where
T: EnumeratedProperty,
{
Ok(Self {
// Load with the default request, then erase the property-specific marker.
data: provider.load(Default::default())?.payload.cast(),
})
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
/// up front.
///
/// This owned version is returned by functions that use a runtime data provider.
#[inline]
pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
CodePointMapDataBorrowed {
map: self.data.get(),
}
}
/// Convert this map to a map around another type
///
/// Typically useful for type-erasing maps into maps around integers.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Panics
/// Will panic if T and P are different sizes
///
/// # Example
///
/// ```
/// use icu::properties::CodePointMapData;
/// use icu::properties::props::GeneralCategory;
///
/// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
///
/// let gc = data.try_into_converted::<u8>().unwrap();
/// let gc = gc.as_borrowed();
///
/// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
/// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
/// ```
#[cfg(feature = "alloc")]
pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
where
P: TrieValue,
{
// Convert the payload's inner map, then re-wrap it for the new value type `P`.
self.data
.try_map_project(|data, _| data.try_into_converted())
.map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
}
/// Construct a new one from loaded data
///
/// Typically it is preferable to use getters like [`load_general_category()`] instead
pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
where
M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
{
Self { data: data.cast() }
}
/// Construct a new one from an owned [`CodePointTrie`]
pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
let set = PropertyCodePointMap::from_code_point_trie(trie);
CodePointMapData::from_data(
DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
)
}
/// Convert this type to a [`CodePointTrie`] as a borrowed value.
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointTrie`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// This method returns an `Option` in order to return `None` when the backing data provider
/// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
/// constraint.
pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
self.data.get().as_code_point_trie()
}
/// Convert this type to a [`CodePointTrie`], borrowing if possible,
/// otherwise allocating a new [`CodePointTrie`].
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointTrie`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// The performance of the conversion to this specific return type will vary
/// depending on the data structure that is backing `self`.
pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
self.data.get().to_code_point_trie()
}
}
/// A borrowed wrapper around code point map data, returned by
/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
// Reference to the underlying map; all queries are forwarded to it.
map: &'a PropertyCodePointMap<'a, T>,
}
impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
    /// Returns the value the map associates with the character `ch`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::CodePointMapData;
    /// use icu::properties::props::GeneralCategory;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
    /// ```
    #[inline]
    pub fn get(self, ch: char) -> T {
        let Self { map } = self;
        map.get(ch)
    }

    /// Same lookup as [`Self::get`], but keyed by a raw `u32` code point.
    #[inline]
    pub fn get32(self, ch: u32) -> T {
        let Self { map } = self;
        map.get32(ch)
    }

    /// Builds a [`CodePointSetData`] containing every code point whose value
    /// equals `value`.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// let other_letter_set_data =
    ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
    /// let other_letter_set = other_letter_set_data.as_borrowed();
    ///
    /// assert!(other_letter_set.contains('木')); // U+6728
    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn get_set_for_value(self, value: T) -> CodePointSetData {
        CodePointSetData::from_code_point_inversion_list(self.map.get_set_for_value(value))
    }

    /// Iterates over ranges of consecutive code points that share the same
    /// value in the [`CodePointMapData`], yielding each range with its value.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges();
    /// let next = ranges.next().unwrap();
    /// assert_eq!(next.range, 0..=31);
    /// assert_eq!(next.value, GeneralCategory::Control);
    /// let next = ranges.next().unwrap();
    /// assert_eq!(next.range, 32..=32);
    /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
    /// ```
    pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
        let Self { map } = self;
        map.iter_ranges()
    }

    /// Iterates over the ranges of consecutive code points whose value in the
    /// [`CodePointMapData`] is exactly `val`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::props::GeneralCategory;
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
    /// ```
    pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Keep a range only when its value matches, dropping the value itself.
        self.map
            .iter_ranges()
            .filter_map(move |entry| (entry.value == val).then_some(entry.range))
    }

    /// Iterates over the ranges of consecutive code points whose value in the
    /// [`CodePointMapData`] is *not* `val`.
    pub fn iter_ranges_for_value_complemented(
        self,
        val: T,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Map every value to "differs from `val`", then keep the `true` ranges.
        self.map
            .iter_ranges_mapped(move |value| value != val)
            .filter_map(|entry| entry.value.then_some(entry.range))
    }

    /// Exposed for FFI needs, could be exposed in general in the future but we should
    /// have a use case first.
    ///
    /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
    #[doc(hidden)] // used by FFI code
    pub fn iter_ranges_mapped<U: Eq + 'a>(
        self,
        predicate: impl FnMut(T) -> U + Copy + 'a,
    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
        let Self { map } = self;
        map.iter_ranges_mapped(predicate)
    }
}
impl CodePointMapDataBorrowed<'_, GeneralCategory> {
    /// Builds a [`CodePointSetData`] containing every code point whose
    /// `General_Category` belongs to the given group.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Example
    ///
    /// ```
    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    ///
    /// let other_letter_set_data =
    ///     gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
    /// let other_letter_set = other_letter_set_data.as_borrowed();
    ///
    /// assert!(other_letter_set.contains('木')); // U+6728
    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
    /// ```
    #[cfg(feature = "alloc")]
    pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
        let group_mask = value.0;
        let inversion_list = self
            .iter_ranges()
            .filter_map(|entry| {
                // Each category owns the bit at the position of its discriminant.
                let category_bit = 1 << entry.value as u32;
                (category_bit & group_mask != 0).then_some(entry.range)
            })
            .collect();
        CodePointSetData::from_code_point_inversion_list(inversion_list)
    }
}
// With compiled data available, the default value is the map backed by the
// property's baked singleton (the same value `new()` produces).
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    fn default() -> Self {
        CodePointMapDataBorrowed { map: T::SINGLETON }
    }
}
impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
    /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`],
    /// pointing at that property's compiled-data singleton.
    ///
    /// See the documentation on [`EnumeratedProperty`] implementations for details.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self
    where
        T: EnumeratedProperty,
    {
        Self { map: T::SINGLETON }
    }

    /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
    ///
    /// The static reference is wrapped in a payload as is.
    ///
    /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
    /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
    pub const fn static_to_owned(self) -> CodePointMapData<T> {
        let Self { map } = self;
        CodePointMapData {
            data: DataPayload::from_static_ref(map),
        }
    }
}
impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
    /// Returns an [`Iterator`] over the ranges of consecutive code points whose
    /// `General_Category` value is a member of the given [`GeneralCategoryGroup`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
    /// use icu::properties::CodePointMapData;
    ///
    /// let gc = CodePointMapData::<GeneralCategory>::new();
    /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
    /// ```
    pub fn iter_ranges_for_group(
        self,
        group: GeneralCategoryGroup,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
        // Map every category to "is it in the group?", then keep only the ranges
        // for which the answer is `true`.
        self.map
            .iter_ranges_mapped(move |value| group.contains(value))
            .filter_map(|mapped| mapped.value.then_some(mapped.range))
    }
}
/// A Unicode character property that assigns a value to each code point.
///
/// The descriptions of most properties are taken from [`TR44`], the documentation for the
/// Unicode Character Database.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
///
/// [`TR44`]: https://www.unicode.org/reports/tr44
pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
// Data marker whose data struct is this property's code point map.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
// Compiled-data instance of this property's map; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
/// The name of this property
const NAME: &'static [u8];
/// The abbreviated name of this property, if it exists, otherwise the name
const SHORT_NAME: &'static [u8];
/// Convenience method for `CodePointMapData::new().get(ch)`
///
/// Returns the value of this property for `ch`, looked up in compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn for_char(ch: char) -> Self {
CodePointMapData::new().get(ch)
}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::provider::*;
use core::ops::RangeInclusive;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A set of Unicode code points. Access its data via the borrowed version,
/// [`CodePointSetDataBorrowed`].
///
/// # Example
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[derive(Debug)]
pub struct CodePointSetData {
// Payload stored under an erased marker, so one type serves every binary property.
data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
}
impl CodePointSetData {
/// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[expect(clippy::new_ret_no_self)]
#[cfg(feature = "compiled_data")]
pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
CodePointSetDataBorrowed::new::<P>()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable<P: BinaryProperty>(
provider: &(impl DataProvider<P::DataMarker> + ?Sized),
) -> Result<CodePointSetData, DataError> {
// Load with a default request and wrap the returned payload; a load error
// is propagated to the caller unchanged.
Ok(CodePointSetData::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This owned version is returned by functions that use a runtime data provider.
#[inline]
pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
CodePointSetDataBorrowed {
set: self.data.get(),
}
}
/// Construct a new one from loaded data
///
/// The payload is cast to the erased marker used by this type.
///
/// Typically it is preferable to use a public constructor such as
/// [`Self::try_new_unstable`] instead
pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
where
M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
{
Self { data: data.cast() }
}
/// Construct a new [`CodePointSetData`] from an owned [`CodePointInversionList`]
pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
let set = PropertyCodePointSet::from_code_point_inversion_list(set);
CodePointSetData::from_data(
DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
)
}
/// Convert this type to a [`CodePointInversionList`] as a borrowed value.
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// This method returns an `Option` in order to return `None` when the backing data provider
/// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
/// constraint.
pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
self.data.get().as_code_point_inversion_list()
}
/// Convert this type to a [`CodePointInversionList`], borrowing if possible,
/// otherwise allocating a new [`CodePointInversionList`].
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// The performance of the conversion to this specific return type will vary
/// depending on the data structure that is backing `self`.
pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
self.data.get().to_code_point_inversion_list()
}
}
/// A borrowed wrapper around code point set data, returned by
/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
///
/// This type is `Copy`; its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct CodePointSetDataBorrowed<'a> {
// Reference to the set that all queries are forwarded to.
set: &'a PropertyCodePointSet<'a>,
}
impl CodePointSetDataBorrowed<'static> {
/// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
///
/// The returned value borrows the property's compiled-data singleton
/// (`P::SINGLETON`).
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[inline]
#[cfg(feature = "compiled_data")]
pub const fn new<P: BinaryProperty>() -> Self {
CodePointSetDataBorrowed { set: P::SINGLETON }
}
/// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
///
/// The resulting payload wraps the same `'static` reference via
/// `DataPayload::from_static_ref`.
///
/// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
/// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
pub const fn static_to_owned(self) -> CodePointSetData {
CodePointSetData {
data: DataPayload::from_static_ref(self.set),
}
}
}
impl<'a> CodePointSetDataBorrowed<'a> {
/// Check if the set contains a character
///
/// ```rust
/// use icu::properties::CodePointSetData;
/// use icu::properties::props::Alphabetic;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
///
/// assert!(!alphabetic.contains('3'));
/// assert!(!alphabetic.contains('੩')); // U+0A69 GURMUKHI DIGIT THREE
/// assert!(alphabetic.contains('A'));
/// assert!(alphabetic.contains('Ä')); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
/// ```
#[inline]
pub fn contains(self, ch: char) -> bool {
self.set.contains(ch)
}
/// See [`Self::contains`].
///
/// This variant takes the code point as a raw `u32` instead of a `char`.
#[inline]
pub fn contains32(self, ch: u32) -> bool {
self.set.contains32(ch)
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// included in the [`CodePointSetData`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
///
/// # Example
///
/// ```
/// use icu::properties::props::Alphabetic;
/// use icu::properties::CodePointSetData;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
/// let mut ranges = alphabetic.iter_ranges();
///
/// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
/// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
/// ```
#[inline]
pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.set.iter_ranges()
}
/// Yields an [`Iterator`] returning the ranges of the code points that are
/// *not* included in the [`CodePointSetData`]
///
/// Ranges are returned as [`RangeInclusive`], which is inclusive of its
/// `end` bound value. An end-inclusive behavior matches the ICU4C/J
/// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
///
/// # Example
///
/// ```
/// use icu::properties::props::Alphabetic;
/// use icu::properties::CodePointSetData;
///
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
/// let mut ranges = alphabetic.iter_ranges_complemented();
///
/// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A'
/// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // between 'Z' and 'a'
/// ```
#[inline]
pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.set.iter_ranges_complemented()
}
}
/// A binary Unicode character property.
///
/// The descriptions of most properties are taken from [`TR44`], the documentation for the
/// Unicode Character Database. Some properties are instead defined in [`TR18`], the
/// documentation for Unicode regular expressions. In particular, Annex C of this document
/// defines properties for POSIX compatibility.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
///
/// [`TR44`]: https://www.unicode.org/reports/tr44
/// [`TR18`]: https://www.unicode.org/reports/tr18
pub trait BinaryProperty: crate::private::Sealed + Sized {
// Data marker whose data struct is this property's code point set.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
// Compiled-data instance of this property's set; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyCodePointSet<'static>;
/// The name of this property
const NAME: &'static [u8];
/// The abbreviated name of this property, if it exists, otherwise the name
const SHORT_NAME: &'static [u8];
/// Convenience method for `CodePointSetData::new().contains(ch)`
///
/// Returns whether `ch` has this property, looked up in compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn for_char(ch: char) -> bool {
CodePointSetData::new::<Self>().contains(ch)
}
}
#[cfg(test)]
mod tests {
// The `Number` group set contains digits from several scripts but no letters.
#[test]
fn test_general_category() {
use icu::properties::props::GeneralCategory;
use icu::properties::props::GeneralCategoryGroup;
use icu::properties::CodePointMapData;
let digits_data = CodePointMapData::<GeneralCategory>::new()
.get_set_for_value_group(GeneralCategoryGroup::Number);
let digits = digits_data.as_borrowed();
assert!(digits.contains('5'));
assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE
assert!(!digits.contains('A'));
}
// The set for a single `Script` value contains exactly the characters with that script.
#[test]
fn test_script() {
use icu::properties::props::Script;
use icu::properties::CodePointMapData;
let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
let thai = thai_data.as_borrowed();
assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO
assert!(!thai.contains('A'));
assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
}
// Each category group's set must equal the union of the sets of its subcategories.
#[test]
fn test_gc_groupings() {
use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
use icu::properties::CodePointMapData;
use icu_collections::codepointinvlist::CodePointInversionListBuilder;
let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
let category_set =
CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
let category_set = category_set
.as_code_point_inversion_list()
.expect("The data should be valid");
// Rebuild the expected set by adding every range of every subcategory.
let mut builder = CodePointInversionListBuilder::new();
for &subcategory in subcategories {
let gc_set_data =
CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
let gc_set = gc_set_data.as_borrowed();
for range in gc_set.iter_ranges() {
builder.add_range32(range);
}
}
let combined_set = builder.build();
println!("{category:?} {subcategories:?}");
assert_eq!(
category_set.get_inversion_list_vec(),
combined_set.get_inversion_list_vec()
);
};
test_group(
GeneralCategoryGroup::Letter,
&[
GeneralCategory::UppercaseLetter,
GeneralCategory::LowercaseLetter,
GeneralCategory::TitlecaseLetter,
GeneralCategory::ModifierLetter,
GeneralCategory::OtherLetter,
],
);
test_group(
GeneralCategoryGroup::Other,
&[
GeneralCategory::Control,
GeneralCategory::Format,
GeneralCategory::Unassigned,
GeneralCategory::PrivateUse,
GeneralCategory::Surrogate,
],
);
test_group(
GeneralCategoryGroup::Mark,
&[
GeneralCategory::SpacingMark,
GeneralCategory::EnclosingMark,
GeneralCategory::NonspacingMark,
],
);
test_group(
GeneralCategoryGroup::Number,
&[
GeneralCategory::DecimalNumber,
GeneralCategory::LetterNumber,
GeneralCategory::OtherNumber,
],
);
test_group(
GeneralCategoryGroup::Punctuation,
&[
GeneralCategory::ConnectorPunctuation,
GeneralCategory::DashPunctuation,
GeneralCategory::ClosePunctuation,
GeneralCategory::FinalPunctuation,
GeneralCategory::InitialPunctuation,
GeneralCategory::OtherPunctuation,
GeneralCategory::OpenPunctuation,
],
);
test_group(
GeneralCategoryGroup::Symbol,
&[
GeneralCategory::CurrencySymbol,
GeneralCategory::ModifierSymbol,
GeneralCategory::MathSymbol,
GeneralCategory::OtherSymbol,
],
);
test_group(
GeneralCategoryGroup::Separator,
&[
GeneralCategory::LineSeparator,
GeneralCategory::ParagraphSeparator,
GeneralCategory::SpaceSeparator,
],
);
}
// Surrogates are not valid `char`s, so they are probed through `contains32`.
#[test]
fn test_gc_surrogate() {
use icu::properties::props::GeneralCategory;
use icu::properties::CodePointMapData;
let surrogates_data = CodePointMapData::<GeneralCategory>::new()
.get_set_for_value(GeneralCategory::Surrogate);
let surrogates = surrogates_data.as_borrowed();
assert!(surrogates.contains32(0xd800));
assert!(surrogates.contains32(0xd900));
assert!(surrogates.contains32(0xdfff));
assert!(!surrogates.contains('A'));
}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::provider::*;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
/// A wrapper around `UnicodeSet` data (characters and strings)
///
/// Query it through the borrowed version obtained from [`EmojiSetData::as_borrowed`].
#[derive(Debug)]
pub struct EmojiSetData {
// Payload stored under an erased marker, so one type serves every emoji set.
data: DataPayload<ErasedMarker<PropertyUnicodeSet<'static>>>,
}
impl EmojiSetData {
/// Creates a new [`EmojiSetDataBorrowed`] for an [`EmojiSet`].
///
/// See the documentation on [`EmojiSet`] implementations for details.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub const fn new<P: EmojiSet>() -> EmojiSetDataBorrowed<'static> {
EmojiSetDataBorrowed::new::<P>()
}
/// A version of `new()` that uses custom data provided by a [`DataProvider`].
///
/// Note that this will return an owned version of the data. Functionality is available on
/// the borrowed version, accessible through [`EmojiSetData::as_borrowed`].
///
/// # Errors
///
/// Returns the [`DataError`] reported by `provider` if the load fails.
pub fn try_new_unstable<P: EmojiSet>(
provider: &(impl DataProvider<P::DataMarker> + ?Sized),
) -> Result<EmojiSetData, DataError> {
Ok(EmojiSetData::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> EmojiSetDataBorrowed<'_> {
EmojiSetDataBorrowed {
set: self.data.get(),
}
}
/// Construct a new one from loaded data
///
/// The payload is cast to the erased marker used by this type.
///
/// Typically it is preferable to use getters instead
pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
where
M: DynamicDataMarker<DataStruct = PropertyUnicodeSet<'static>>,
{
Self { data: data.cast() }
}
/// Construct a new [`EmojiSetData`] from an owned [`CodePointInversionListAndStringList`]
pub fn from_code_point_inversion_list_string_list(
set: CodePointInversionListAndStringList<'static>,
) -> Self {
let set = PropertyUnicodeSet::from_code_point_inversion_list_string_list(set);
EmojiSetData::from_data(
DataPayload::<ErasedMarker<PropertyUnicodeSet<'static>>>::from_owned(set),
)
}
/// Convert this type to a [`CodePointInversionListAndStringList`] as a borrowed value.
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// This method returns an `Option` in order to return `None` when the backing data provider
/// cannot return a [`CodePointInversionListAndStringList`], or cannot do so within the expected constant time
/// constraint.
pub fn as_code_point_inversion_list_string_list(
&self,
) -> Option<&CodePointInversionListAndStringList<'_>> {
self.data.get().as_code_point_inversion_list_string_list()
}
/// Convert this type to a [`CodePointInversionListAndStringList`], borrowing if possible,
/// otherwise allocating a new [`CodePointInversionListAndStringList`].
///
/// The data backing this is extensible and supports multiple implementations.
/// Currently it is always [`CodePointInversionListAndStringList`]; however in the future more backends may be
/// added, and users may select which at data generation time.
///
/// The performance of the conversion to this specific return type will vary
/// depending on the data structure that is backing `self`.
pub fn to_code_point_inversion_list_string_list(
&self,
) -> CodePointInversionListAndStringList<'_> {
self.data.get().to_code_point_inversion_list_string_list()
}
}
/// A borrowed wrapper around emoji set data (code points and strings), returned by
/// [`EmojiSetData::as_borrowed()`]. More efficient to query.
///
/// This type is `Copy`; its query methods take `self` by value.
#[derive(Clone, Copy, Debug)]
pub struct EmojiSetDataBorrowed<'a> {
// Reference to the set that all queries are forwarded to.
set: &'a PropertyUnicodeSet<'a>,
}
impl EmojiSetDataBorrowed<'_> {
/// Check if the set contains the string. Strings consisting of one character
/// are treated as a character/code point.
///
/// This matches ICU behavior for ICU's `UnicodeSet`.
#[inline]
pub fn contains_str(self, s: &str) -> bool {
self.set.contains_str(s)
}
/// Check if the set contains the code point.
#[inline]
pub fn contains(self, ch: char) -> bool {
self.set.contains(ch)
}
/// See [`Self::contains`].
///
/// This variant takes the code point as a raw `u32` instead of a `char`.
#[inline]
pub fn contains32(self, cp: u32) -> bool {
self.set.contains32(cp)
}
}
impl EmojiSetDataBorrowed<'static> {
/// Creates a new [`EmojiSetDataBorrowed`] for an [`EmojiSet`].
///
/// The returned value borrows the set's compiled-data singleton (`P::SINGLETON`).
///
/// See the documentation on [`EmojiSet`] implementations for details.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[inline]
#[cfg(feature = "compiled_data")]
pub const fn new<P: EmojiSet>() -> Self {
EmojiSetDataBorrowed { set: P::SINGLETON }
}
/// Cheaply converts an [`EmojiSetDataBorrowed<'static>`] into an [`EmojiSetData`].
///
/// The resulting payload wraps the same `'static` reference via
/// `DataPayload::from_static_ref`.
///
/// Note: Due to branching and indirection, using [`EmojiSetData`] might inhibit some
/// compile-time optimizations that are possible with [`EmojiSetDataBorrowed`].
pub const fn static_to_owned(self) -> EmojiSetData {
EmojiSetData {
data: DataPayload::from_static_ref(self.set),
}
}
}
/// An Emoji set as defined by [`Unicode Technical Standard #51`](https://unicode.org/reports/tr51/#Emoji_Sets).
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait EmojiSet: crate::private::Sealed {
// Data marker whose data struct is this set's `PropertyUnicodeSet`.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyUnicodeSet<'static>>;
// Compiled-data instance of this set; only present with `compiled_data`.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyUnicodeSet<'static>;
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Definitions of [Unicode Properties] and APIs for
//! retrieving property data in an appropriate data structure.
//!
//! This module is published as its own crate ([`icu_properties`](https://docs.rs/icu_properties/latest/icu_properties/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! APIs that return a [`CodePointSetData`] exist for binary properties and certain enumerated
//! properties.
//!
//! APIs that return a [`CodePointMapData`] exist for certain enumerated properties.
//!
//! # Examples
//!
//! ## Property data as `CodePointSetData`s
//!
//! ```
//! use icu::properties::{CodePointSetData, CodePointMapData};
//! use icu::properties::props::{GeneralCategory, Emoji};
//!
//! // A binary property as a `CodePointSetData`
//!
//! assert!(CodePointSetData::new::<Emoji>().contains('🎃')); // U+1F383 JACK-O-LANTERN
//! assert!(!CodePointSetData::new::<Emoji>().contains('木')); // U+6728
//!
//! // An individual enumerated property value as a `CodePointSetData`
//!
//! let line_sep_data = CodePointMapData::<GeneralCategory>::new()
//! .get_set_for_value(GeneralCategory::LineSeparator);
//! let line_sep = line_sep_data.as_borrowed();
//!
//! assert!(line_sep.contains('\u{2028}'));
//! assert!(!line_sep.contains('\u{2029}'));
//! ```
//!
//! ## Property data as `CodePointMapData`s
//!
//! ```
//! use icu::properties::CodePointMapData;
//! use icu::properties::props::Script;
//!
//! assert_eq!(CodePointMapData::<Script>::new().get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
//! assert_eq!(CodePointMapData::<Script>::new().get('木'), Script::Han); // U+6728
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html
//! [`CodePointSetData`]: crate::CodePointSetData
//! [`CodePointMapData`]: crate::CodePointMapData
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
mod code_point_set;
pub use code_point_set::{CodePointSetData, CodePointSetDataBorrowed};
mod code_point_map;
pub use code_point_map::{CodePointMapData, CodePointMapDataBorrowed};
mod emoji;
pub use emoji::{EmojiSetData, EmojiSetDataBorrowed};
mod names;
pub use names::{
PropertyNamesLong, PropertyNamesLongBorrowed, PropertyNamesShort, PropertyNamesShortBorrowed,
PropertyParser, PropertyParserBorrowed,
};
mod runtime;
// NOTE: The Pernosco debugger has special knowledge
// of the `CanonicalCombiningClass` struct inside the `props`
// module. Please do not change the crate-module-qualified
// name of that struct without coordination.
pub mod props;
pub mod provider;
pub mod script;
mod bidi;
mod trievalue;
// Private module holding the `Sealed` supertrait. The module itself is not
// public, so code outside this crate cannot name `Sealed`, and therefore cannot
// implement the property traits that list it as a supertrait.
mod private {
pub trait Sealed {}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::props::*;
use crate::provider::names::*;
use core::marker::PhantomData;
use icu_collections::codepointtrie::TrieValue;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use yoke::Yokeable;
use zerotrie::cursor::ZeroTrieSimpleAsciiCursor;
/// A struct capable of looking up a property value from a string name.
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyParserBorrowed`].
///
/// The name can be a short name (`Lu`), a long name (`Uppercase_Letter`),
/// or an alias.
///
/// Property names can be looked up using "strict" matching (looking for a name
/// that matches exactly), or "loose matching", where the name is allowed to deviate
/// in terms of ASCII casing, whitespace, underscores, and hyphens.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// // short name for value
/// assert_eq!(
/// lookup.get_strict("Lu"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_strict("Pd"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // long name for value
/// assert_eq!(
/// lookup.get_strict("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_strict("Dash_Punctuation"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // name has incorrect casing
/// assert_eq!(lookup.get_strict("dashpunctuation"), None);
/// // loose matching of name
/// assert_eq!(
/// lookup.get_loose("dash-punctuation"),
/// Some(GeneralCategory::DashPunctuation)
/// );
/// // fake property
/// assert_eq!(lookup.get_strict("Animated_Gif"), None);
/// ```
#[derive(Debug)]
pub struct PropertyParser<T> {
// Name-to-value map, stored under an erased marker.
map: DataPayload<ErasedMarker<PropertyValueNameToEnumMap<'static>>>,
// Records the property type `T` without storing a value of it.
markers: PhantomData<fn() -> T>,
}
/// A borrowed wrapper around property value name-to-enum data, returned by
/// [`PropertyParser::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyParserBorrowed<'a, T> {
// Reference to the name-to-value map that lookups are run against.
map: &'a PropertyValueNameToEnumMap<'a>,
// Records the property type `T` without storing a value of it.
markers: PhantomData<fn() -> T>,
}
// `Clone` and `Copy` are written by hand so they hold for every `T`; deriving
// them would add `T: Clone` / `T: Copy` bounds, although no `T` value is stored.
impl<T> Clone for PropertyParserBorrowed<'_, T> {
fn clone(&self) -> Self {
// The type is `Copy` (see the impl below), so cloning is a plain copy.
*self
}
}
impl<T> Copy for PropertyParserBorrowed<'_, T> {}
impl<T> PropertyParser<T> {
/// Creates a new instance of `PropertyParser<T>` using compiled data.
///
/// Note that the returned value is the borrowed form, [`PropertyParserBorrowed`].
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> PropertyParserBorrowed<'static, T>
where
T: ParseableEnumeratedProperty,
{
PropertyParserBorrowed::new()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<T::DataMarker> + ?Sized),
) -> Result<Self, DataError>
where
T: ParseableEnumeratedProperty,
{
// Load with a default request and cast the payload to the erased marker;
// a load error is propagated to the caller unchanged.
Ok(Self {
map: provider.load(Default::default())?.payload.cast(),
markers: PhantomData,
})
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (like `get_strict()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> PropertyParserBorrowed<'_, T> {
PropertyParserBorrowed {
map: self.map.get(),
markers: PhantomData,
}
}
// Re-types the parser as `PropertyParser<u16>`: the payload is only cast, and
// `T` appears solely in the `PhantomData` marker.
#[doc(hidden)] // used by FFI code
pub fn erase(self) -> PropertyParser<u16> {
PropertyParser {
map: self.map.cast(),
markers: PhantomData,
}
}
}
impl<T: TrieValue> PropertyParserBorrowed<'_, T> {
/// Get the property value as a u16, doing a strict search looking for
/// names that match exactly
///
/// Returns `None` if no value is found for `name`.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// assert_eq!(
/// lookup.get_strict_u16("Lu"),
/// Some(GeneralCategory::UppercaseLetter as u16)
/// );
/// assert_eq!(
/// lookup.get_strict_u16("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter as u16)
/// );
/// // does not do loose matching
/// assert_eq!(lookup.get_strict_u16("UppercaseLetter"), None);
/// ```
#[inline]
pub fn get_strict_u16(self, name: &str) -> Option<u16> {
get_strict_u16(self.map, name)
}
/// Get the property value as a `T`, doing a strict search looking for
/// names that match exactly
///
/// Returns `None` if no value is found for `name`, or if the value found
/// cannot be converted to `T` via `T::try_from_u32`.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// assert_eq!(
/// lookup.get_strict("Lu"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_strict("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// // does not do loose matching
/// assert_eq!(lookup.get_strict("UppercaseLetter"), None);
/// ```
#[inline]
pub fn get_strict(self, name: &str) -> Option<T> {
T::try_from_u32(self.get_strict_u16(name)? as u32).ok()
}
/// Get the property value as a u16, doing a loose search looking for
/// names that match case-insensitively, ignoring ASCII hyphens, underscores, and
/// whitespaces.
///
/// Returns `None` if no value is found for `name`.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// assert_eq!(
/// lookup.get_loose_u16("Lu"),
/// Some(GeneralCategory::UppercaseLetter as u16)
/// );
/// assert_eq!(
/// lookup.get_loose_u16("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter as u16)
/// );
/// // does do loose matching
/// assert_eq!(
/// lookup.get_loose_u16("UppercaseLetter"),
/// Some(GeneralCategory::UppercaseLetter as u16)
/// );
/// ```
#[inline]
pub fn get_loose_u16(self, name: &str) -> Option<u16> {
get_loose_u16(self.map, name)
}
/// Get the property value as a `T`, doing a loose search looking for
/// names that match case-insensitively, ignoring ASCII hyphens, underscores, and
/// whitespaces.
///
/// Returns `None` if no value is found for `name`, or if the value found
/// cannot be converted to `T` via `T::try_from_u32`.
///
/// # Example
///
/// ```
/// use icu::properties::props::GeneralCategory;
/// use icu::properties::PropertyParser;
///
/// let lookup = PropertyParser::<GeneralCategory>::new();
/// assert_eq!(
/// lookup.get_loose("Lu"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// assert_eq!(
/// lookup.get_loose("Uppercase_Letter"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// // does do loose matching
/// assert_eq!(
/// lookup.get_loose("UppercaseLetter"),
/// Some(GeneralCategory::UppercaseLetter)
/// );
/// ```
#[inline]
pub fn get_loose(self, name: &str) -> Option<T> {
T::try_from_u32(self.get_loose_u16(name)? as u32).ok()
}
}
// The default value is the compiled-data instance produced by `Self::new()`,
// so this impl is only available with the `compiled_data` feature.
#[cfg(feature = "compiled_data")]
impl<T: ParseableEnumeratedProperty> Default for PropertyParserBorrowed<'static, T> {
fn default() -> Self {
Self::new()
}
}
impl<T: TrieValue> PropertyParserBorrowed<'static, T> {
/// Creates a new instance of `PropertyParserBorrowed<T>` using compiled data.
///
/// The returned value borrows the property's compiled-data singleton
/// (`T::SINGLETON`).
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new() -> Self
where
T: ParseableEnumeratedProperty,
{
Self {
map: T::SINGLETON,
markers: PhantomData,
}
}
/// Cheaply converts a [`PropertyParserBorrowed<'static>`] into a [`PropertyParser`].
///
/// The resulting payload wraps the same `'static` reference via
/// `DataPayload::from_static_ref`.
///
/// Note: Due to branching and indirection, using [`PropertyParser`] might inhibit some
/// compile-time optimizations that are possible with [`PropertyParserBorrowed`].
pub const fn static_to_owned(self) -> PropertyParser<T> {
PropertyParser {
map: DataPayload::from_static_ref(self.map),
markers: PhantomData,
}
}
}
/// Looks `name` up in the name-to-enum map exactly as given.
///
/// This is a free, non-generic function to avoid monomorphizing multiple copies of it.
/// Returns `None` when the name is absent or when the stored value does not fit in a `u16`.
fn get_strict_u16(payload: &PropertyValueNameToEnumMap<'_>, name: &str) -> Option<u16> {
    let value = payload.map.get(name)?;
    value.try_into().ok()
}
/// Loose lookup of `name` in the name-to-enum trie.
///
/// This is a free, non-generic function to avoid monomorphizing multiple copies of it.
///
/// Matching ignores ASCII case and skips ASCII whitespace, underscores, and hyphens, both in
/// `name` and in the names stored in the trie. Returns `None` when no stored name matches or
/// when the stored value does not fit in a `u16`.
fn get_loose_u16(payload: &PropertyValueNameToEnumMap<'_>, name: &str) -> Option<u16> {
    /// Bytes that loose matching ignores: ASCII whitespace, underscore, hyphen.
    /// The array order is the order in which they are tried when skipping inside the trie.
    const IGNORED: [u8; 8] = [b'\t', b'\n', b'\x0C', b'\r', b' ', 0x0B, b'_', b'-'];

    /// Depth-first search over the trie, using the call stack as the DFS stack.
    fn recurse(mut cursor: ZeroTrieSimpleAsciiCursor, mut rest: &[u8]) -> Option<usize> {
        if cursor.is_empty() {
            return None;
        }
        // Skip whitespace, underscore, hyphen in trie.
        for skip in IGNORED {
            let mut skip_cursor = cursor.clone();
            skip_cursor.step(skip);
            if let Some(r) = recurse(skip_cursor, rest) {
                return Some(r);
            }
        }
        // Skip whitespace, underscore, hyphen in input; once the input is exhausted the
        // answer is whatever value sits at the current trie position.
        let ascii = loop {
            let Some((&a, r)) = rest.split_first() else {
                return cursor.take_value();
            };
            rest = r;
            if !IGNORED.contains(&a) {
                break a;
            }
        };
        let other_case = if ascii.is_ascii_lowercase() {
            ascii.to_ascii_uppercase()
        } else {
            ascii.to_ascii_lowercase()
        };
        if other_case == ascii {
            // The byte has no ASCII case counterpart (digit, punctuation, non-ASCII), so a
            // second search would just repeat the first one. Search once.
            cursor.step(ascii);
            return recurse(cursor, rest);
        }
        let mut other_case_cursor = cursor.clone();
        cursor.step(ascii);
        other_case_cursor.step(other_case);
        // Termination: this branch consumed at least one byte of `rest`, so `rest` strictly
        // shrinks here. The trie-skip branch above leaves `rest` untouched but steps the
        // cursor one byte further into the trie, and a cursor that has run out of trie is
        // rejected by the `is_empty` check at the top.
        recurse(cursor, rest).or_else(|| recurse(other_case_cursor, rest))
    }
    recurse(payload.map.cursor(), name.as_bytes()).and_then(|i| i.try_into().ok())
}
/// A struct capable of looking up the long name of a property value from the value.
///
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyNamesLongBorrowed`].
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::properties::PropertyNamesLong;
///
/// let names = PropertyNamesLong::<CanonicalCombiningClass>::new();
/// assert_eq!(
/// names.get(CanonicalCombiningClass::KanaVoicing),
/// Some("Kana_Voicing")
/// );
/// assert_eq!(
/// names.get(CanonicalCombiningClass::AboveLeft),
/// Some("Above_Left")
/// );
/// ```
pub struct PropertyNamesLong<T: NamedEnumeratedProperty> {
// Payload holding the property's long-name data struct, stored behind an erased marker.
map: DataPayload<ErasedMarker<T::DataStructLong>>,
}
impl<T: NamedEnumeratedProperty> core::fmt::Debug for PropertyNamesLong<T> {
    /// Writes only the struct name; the `map` field is not included in the output.
    fn fmt(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = formatter.debug_struct("PropertyNamesLong");
        builder.finish()
    }
}
/// A borrowed wrapper around property value enum-to-long-name data, returned by
/// [`PropertyNamesLong::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyNamesLongBorrowed<'a, T: NamedEnumeratedProperty> {
// Borrowed long-name lookup structure; queried by `get`.
map: &'a T::DataStructLongBorrowed<'a>,
}
// `Clone` and `Copy` are written by hand rather than derived. The struct only holds a shared
// reference, so cloning is a plain bitwise copy via the `Copy` impl below.
impl<T: NamedEnumeratedProperty> Clone for PropertyNamesLongBorrowed<'_, T> {
fn clone(&self) -> Self {
*self
}
}
impl<T: NamedEnumeratedProperty> Copy for PropertyNamesLongBorrowed<'_, T> {}
impl<T: NamedEnumeratedProperty> PropertyNamesLong<T> {
/// Creates a new instance of `PropertyNamesLongBorrowed<T>`.
///
/// Note that this returns the borrowed type, not `Self`.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> PropertyNamesLongBorrowed<'static, T> {
PropertyNamesLongBorrowed::new()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<T::DataMarkerLong> + ?Sized),
) -> Result<Self, DataError> {
Ok(Self {
// Load with the default request and store the payload under the erased marker.
map: provider.load(Default::default())?.payload.cast(),
})
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (like `get_static()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> PropertyNamesLongBorrowed<'_, T> {
PropertyNamesLongBorrowed {
// `nep_long_identity` converts the yoked output type into the borrowed data struct type.
map: T::nep_long_identity(self.map.get()),
}
}
}
impl<'a, T: NamedEnumeratedProperty> PropertyNamesLongBorrowed<'a, T> {
    /// Returns the long name for the given property value, or `None` if the
    /// underlying data has no entry for it.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::properties::props::CanonicalCombiningClass;
    /// use icu::properties::PropertyNamesLong;
    ///
    /// let lookup = PropertyNamesLong::<CanonicalCombiningClass>::new();
    /// assert_eq!(
    ///     lookup.get(CanonicalCombiningClass::KanaVoicing),
    ///     Some("Kana_Voicing")
    /// );
    /// assert_eq!(
    ///     lookup.get(CanonicalCombiningClass::AboveLeft),
    ///     Some("Above_Left")
    /// );
    /// ```
    #[inline]
    pub fn get(self, property: T) -> Option<&'a str> {
        let code = property.to_u32();
        self.map.get(code)
    }
}
// `Default` simply delegates to `new()`, which is why this impl carries the same
// `compiled_data` feature gate as that constructor.
#[cfg(feature = "compiled_data")]
impl<T: NamedEnumeratedProperty> Default for PropertyNamesLongBorrowed<'static, T> {
fn default() -> Self {
Self::new()
}
}
// Constructors and conversions that are only available on the `'static` borrowed type.
impl<T: NamedEnumeratedProperty> PropertyNamesLongBorrowed<'static, T> {
/// Creates a new instance of `PropertyNamesLongBorrowed<T>`.
///
/// The instance points at `T::SINGLETON_LONG`.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new() -> Self {
Self {
map: T::SINGLETON_LONG,
}
}
/// Cheaply converts a [`PropertyNamesLongBorrowed<'static>`] into a [`PropertyNamesLong`].
///
/// Note: Due to branching and indirection, using [`PropertyNamesLong`] might inhibit some
/// compile-time optimizations that are possible with [`PropertyNamesLongBorrowed`].
///
/// This is currently not `const` unlike other `static_to_owned()` functions since it needs
/// const traits to do that safely
pub fn static_to_owned(self) -> PropertyNamesLong<T> {
PropertyNamesLong {
// The trait method call below is what keeps this function from being `const`.
map: DataPayload::from_static_ref(T::nep_long_identity_static(self.map)),
}
}
}
/// A struct capable of looking up the short name of a property value from the value.
///
/// Access its data by calling [`Self::as_borrowed()`] and using the methods on
/// [`PropertyNamesShortBorrowed`].
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::properties::PropertyNamesShort;
///
/// let names = PropertyNamesShort::<CanonicalCombiningClass>::new();
/// assert_eq!(names.get(CanonicalCombiningClass::KanaVoicing), Some("KV"));
/// assert_eq!(names.get(CanonicalCombiningClass::AboveLeft), Some("AL"));
/// ```
pub struct PropertyNamesShort<T: NamedEnumeratedProperty> {
// Payload holding the property's short-name data struct, stored behind an erased marker.
map: DataPayload<ErasedMarker<T::DataStructShort>>,
}
impl<T: NamedEnumeratedProperty> core::fmt::Debug for PropertyNamesShort<T> {
    /// Writes only the struct name; the `map` field is not included in the output.
    fn fmt(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut builder = formatter.debug_struct("PropertyNamesShort");
        builder.finish()
    }
}
/// A borrowed wrapper around property value enum-to-short-name data, returned by
/// [`PropertyNamesShort::as_borrowed()`]. More efficient to query.
#[derive(Debug)]
pub struct PropertyNamesShortBorrowed<'a, T: NamedEnumeratedProperty> {
// Borrowed short-name lookup structure; queried by `get`.
map: &'a T::DataStructShortBorrowed<'a>,
}
// `Clone` and `Copy` are written by hand rather than derived. The struct only holds a shared
// reference, so cloning is a plain bitwise copy via the `Copy` impl below.
impl<T: NamedEnumeratedProperty> Clone for PropertyNamesShortBorrowed<'_, T> {
fn clone(&self) -> Self {
*self
}
}
impl<T: NamedEnumeratedProperty> Copy for PropertyNamesShortBorrowed<'_, T> {}
impl<T: NamedEnumeratedProperty> PropertyNamesShort<T> {
/// Creates a new instance of `PropertyNamesShortBorrowed<T>`.
///
/// Note that this returns the borrowed type, not `Self`.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> PropertyNamesShortBorrowed<'static, T> {
PropertyNamesShortBorrowed::new()
}
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<T::DataMarkerShort> + ?Sized),
) -> Result<Self, DataError> {
Ok(Self {
// Load with the default request and store the payload under the erased marker.
map: provider.load(Default::default())?.payload.cast(),
})
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (like `get_static()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> PropertyNamesShortBorrowed<'_, T> {
PropertyNamesShortBorrowed {
// `nep_short_identity` converts the yoked output type into the borrowed data struct type.
map: T::nep_short_identity(self.map.get()),
}
}
}
impl<'a, T: NamedEnumeratedProperty> PropertyNamesShortBorrowed<'a, T> {
    /// Returns the short name for the given property value, or `None` if the
    /// underlying data has no entry for it.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::properties::props::CanonicalCombiningClass;
    /// use icu::properties::PropertyNamesShort;
    ///
    /// let lookup = PropertyNamesShort::<CanonicalCombiningClass>::new();
    /// assert_eq!(lookup.get(CanonicalCombiningClass::KanaVoicing), Some("KV"));
    /// assert_eq!(lookup.get(CanonicalCombiningClass::AboveLeft), Some("AL"));
    /// ```
    #[inline]
    pub fn get(self, property: T) -> Option<&'a str> {
        let code = property.to_u32();
        self.map.get(code)
    }
}
impl PropertyNamesShortBorrowed<'_, Script> {
    /// Returns the short "name" of a script property value as an
    /// `icu::locale::subtags::Script`.
    ///
    /// This method exists only on `PropertyNamesShortBorrowed<Script>`. It yields `None`
    /// when the value is outside the stored table or the table has no subtag for it.
    ///
    /// # Example
    ///
    /// ```rust
    /// use icu::locale::subtags::script;
    /// use icu::properties::props::Script;
    /// use icu::properties::PropertyNamesShort;
    ///
    /// let lookup = PropertyNamesShort::<Script>::new();
    /// assert_eq!(
    ///     lookup.get_locale_script(Script::Brahmi),
    ///     Some(script!("Brah"))
    /// );
    /// assert_eq!(
    ///     lookup.get_locale_script(Script::Hangul),
    ///     Some(script!("Hang"))
    /// );
    /// ```
    ///
    /// For the reverse direction, use property parsing as normal:
    /// ```
    /// use icu::locale::subtags::script;
    /// use icu::properties::props::Script;
    /// use icu::properties::PropertyParser;
    ///
    /// let parser = PropertyParser::<Script>::new();
    /// assert_eq!(
    ///     parser.get_strict(script!("Brah").as_str()),
    ///     Some(Script::Brahmi)
    /// );
    /// assert_eq!(
    ///     parser.get_strict(script!("Hang").as_str()),
    ///     Some(Script::Hangul)
    /// );
    /// ```
    #[inline]
    pub fn get_locale_script(self, property: Script) -> Option<icu_locale_core::subtags::Script> {
        let index = usize::try_from(property.to_u32()).ok()?;
        let entry = self.map.map.get(index)?;
        entry.0
    }
}
// `Default` simply delegates to `new()`, which is why this impl carries the same
// `compiled_data` feature gate as that constructor.
#[cfg(feature = "compiled_data")]
impl<T: NamedEnumeratedProperty> Default for PropertyNamesShortBorrowed<'static, T> {
fn default() -> Self {
Self::new()
}
}
// Constructors and conversions that are only available on the `'static` borrowed type.
impl<T: NamedEnumeratedProperty> PropertyNamesShortBorrowed<'static, T> {
/// Creates a new instance of `PropertyNamesShortBorrowed<T>`.
///
/// The instance points at `T::SINGLETON_SHORT`.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub fn new() -> Self {
Self {
map: T::SINGLETON_SHORT,
}
}
/// Cheaply converts a [`PropertyNamesShortBorrowed<'static>`] into a [`PropertyNamesShort`].
///
/// Note: Due to branching and indirection, using [`PropertyNamesShort`] might inhibit some
/// compile-time optimizations that are possible with [`PropertyNamesShortBorrowed`].
///
/// This is currently not `const` unlike other `static_to_owned()` functions since it needs
/// const traits to do that safely
pub fn static_to_owned(self) -> PropertyNamesShort<T> {
PropertyNamesShort {
// The trait method call below is what keeps this function from being `const`.
map: DataPayload::from_static_ref(T::nep_short_identity_static(self.map)),
}
}
}
/// A property whose value names can be parsed from strings.
///
/// The trait requires `crate::private::Sealed` as a supertrait.
pub trait ParseableEnumeratedProperty: crate::private::Sealed + TrieValue {
// Data marker whose data struct is the name-to-enum map for this property.
#[doc(hidden)]
type DataMarker: DataMarker<DataStruct = PropertyValueNameToEnumMap<'static>>;
// Name-to-enum map for this property; only present with the `compiled_data` feature.
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyValueNameToEnumMap<'static>;
}
// Abstract over Linear/Sparse/Script representation
// This trait is implicitly sealed by not being exported.
pub trait PropertyEnumToValueNameLookup {
// Returns the name stored for the enum value `prop`, or `None` if there is no entry.
fn get(&self, prop: u32) -> Option<&str>;
}
impl PropertyEnumToValueNameLookup for PropertyEnumToValueNameLinearMap<'_> {
    /// Linear representation: the enum value is used as a position into `map`.
    fn get(&self, prop: u32) -> Option<&str> {
        let index = usize::try_from(prop).ok()?;
        self.map.get(index)
    }
}
#[cfg(feature = "alloc")]
impl PropertyEnumToValueNameLookup for PropertyEnumToValueNameSparseMap<'_> {
    /// Sparse representation: the enum value, narrowed to `u16`, is used as the map key.
    /// Values that do not fit in a `u16` yield `None`.
    fn get(&self, prop: u32) -> Option<&str> {
        let key = u16::try_from(prop).ok()?;
        self.map.get(&key)
    }
}
impl PropertyEnumToValueNameLookup for PropertyScriptToIcuScriptMap<'_> {
    /// Script representation: the enum value is used as a position into `map`, and the
    /// entry found there, if it holds a script, is returned as a string slice.
    fn get(&self, prop: u32) -> Option<&str> {
        let index = usize::try_from(prop).ok()?;
        self.map
            .get_ule_ref(index)
            .and_then(|niched| niched.as_ref())
            .map(|script| script.as_str())
    }
}
/// A property whose value names can be represented as strings.
pub trait NamedEnumeratedProperty: ParseableEnumeratedProperty {
// Owned (`'static`) data struct used for long names; its yoked output is
// `DataStructLongBorrowed`.
#[doc(hidden)]
type DataStructLong: 'static
+ for<'a> Yokeable<'a, Output = Self::DataStructLongBorrowed<'a>>
+ PropertyEnumToValueNameLookup;
// Owned (`'static`) data struct used for short names; its yoked output is
// `DataStructShortBorrowed`.
#[doc(hidden)]
type DataStructShort: 'static
+ for<'a> Yokeable<'a, Output = Self::DataStructShortBorrowed<'a>>
+ PropertyEnumToValueNameLookup;
#[doc(hidden)]
type DataStructLongBorrowed<'a>: PropertyEnumToValueNameLookup;
#[doc(hidden)]
type DataStructShortBorrowed<'a>: PropertyEnumToValueNameLookup;
#[doc(hidden)]
type DataMarkerLong: DataMarker<DataStruct = Self::DataStructLong>;
#[doc(hidden)]
type DataMarkerShort: DataMarker<DataStruct = Self::DataStructShort>;
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON_LONG: &'static Self::DataStructLongBorrowed<'static>;
#[doc(hidden)]
#[cfg(feature = "compiled_data")]
const SINGLETON_SHORT: &'static Self::DataStructShortBorrowed<'static>;
// These wouldn't be necessary if Yoke used GATs (#6057)
// Each of the four functions below converts between the yoked/owned type and the
// borrowed GAT type.
#[doc(hidden)]
fn nep_long_identity<'a>(
stat: &'a <Self::DataStructLong as Yokeable<'a>>::Output,
) -> &'a Self::DataStructLongBorrowed<'a>;
#[doc(hidden)]
fn nep_long_identity_static(
stat: &'static Self::DataStructLongBorrowed<'static>,
) -> &'static Self::DataStructLong;
#[doc(hidden)]
fn nep_short_identity<'a>(
stat: &'a <Self::DataStructShort as Yokeable<'a>>::Output,
) -> &'a Self::DataStructShortBorrowed<'a>;
#[doc(hidden)]
fn nep_short_identity_static(
stat: &'static Self::DataStructShortBorrowed<'static>,
) -> &'static Self::DataStructShort;
/// Convenience method for `PropertyParser::new().get_loose(s)`
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn try_from_str(s: &str) -> Option<Self> {
PropertyParser::new().get_loose(s)
}
/// Convenience method for `PropertyNamesLong::new().get(*self)`.
///
/// This does not panic: if the lookup yields `None`, the literal string
/// `"unreachable"` is returned instead.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn long_name(&self) -> &'static str {
PropertyNamesLong::new().get(*self).unwrap_or("unreachable")
}
/// Convenience method for `PropertyNamesShort::new().get(*self)`.
///
/// This does not panic: if the lookup yields `None`, the literal string
/// `"unreachable"` is returned instead.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
#[cfg(feature = "compiled_data")]
fn short_name(&self) -> &'static str {
PropertyNamesShort::new()
.get(*self)
.unwrap_or("unreachable")
}
}
// Implements the name-lookup traits for one property type.
//
// Input shape:
//   impl <Type> {
//       <parse marker> / <parse singleton>;
//       // optional, both lines together, optionally preceded by attributes:
//       <short data struct> / <short marker> / <short singleton>;
//       <long data struct> / <long marker> / <long singleton>;
//   }
//
// `ParseableEnumeratedProperty` is always implemented. `NamedEnumeratedProperty` is only
// implemented when the optional short/long lines are given, and any attributes written
// before those lines are applied to that impl.
macro_rules! impl_value_getter {
(
impl $ty:ident {
$marker_n2e:ident / $singleton_n2e:ident;
$(
$(#[$meta:meta])*
$data_struct_s:ident / $marker_e2sn:ident / $singleton_e2sn:ident;
$data_struct_l:ident / $marker_e2ln:ident / $singleton_e2ln:ident;
)?
}
) => {
impl ParseableEnumeratedProperty for $ty {
type DataMarker = $marker_n2e;
#[cfg(feature = "compiled_data")]
const SINGLETON: &'static PropertyValueNameToEnumMap<'static> = crate::provider::Baked::$singleton_n2e;
}
$(
$(#[$meta])*
impl NamedEnumeratedProperty for $ty {
type DataStructLong = $data_struct_l<'static>;
type DataStructShort = $data_struct_s<'static>;
type DataStructLongBorrowed<'a> = $data_struct_l<'a>;
type DataStructShortBorrowed<'a> = $data_struct_s<'a>;
type DataMarkerLong = crate::provider::$marker_e2ln;
type DataMarkerShort = crate::provider::$marker_e2sn;
#[cfg(feature = "compiled_data")]
const SINGLETON_LONG: &'static Self::DataStructLong = crate::provider::Baked::$singleton_e2ln;
#[cfg(feature = "compiled_data")]
const SINGLETON_SHORT: &'static Self::DataStructShort = crate::provider::Baked::$singleton_e2sn;
// The identity functions return their argument unchanged; the owned and borrowed
// types are the same data struct with different lifetimes.
fn nep_long_identity<'a>(yoked: &'a $data_struct_l<'a>) -> &'a Self::DataStructLongBorrowed<'a> {
yoked
}
fn nep_long_identity_static(stat: &'static $data_struct_l<'static>) -> &'static $data_struct_l<'static> {
stat
}
fn nep_short_identity<'a>(yoked: &'a $data_struct_s<'a>) -> &'a Self::DataStructShortBorrowed<'a> {
yoked
}
fn nep_short_identity_static(stat: &'static $data_struct_s<'static>) -> &'static $data_struct_s<'static> {
stat
}
}
)?
};
}
// Parsing plus short/long name lookup for `BidiClass`, with linear maps for both name tables.
impl_value_getter! {
impl BidiClass {
PropertyNameParseBidiClassV1 / SINGLETON_PROPERTY_NAME_PARSE_BIDI_CLASS_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortBidiClassV1 / SINGLETON_PROPERTY_NAME_SHORT_BIDI_CLASS_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongBidiClassV1 / SINGLETON_PROPERTY_NAME_LONG_BIDI_CLASS_V1;
}
}
// Parsing plus short/long name lookup for `GeneralCategory`, with linear maps for both name tables.
impl_value_getter! {
impl GeneralCategory {
PropertyNameParseGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_PARSE_GENERAL_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_SHORT_GENERAL_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongGeneralCategoryV1 / SINGLETON_PROPERTY_NAME_LONG_GENERAL_CATEGORY_V1;
}
}
// `GeneralCategoryGroup` only supplies the parse line, so it gets
// `ParseableEnumeratedProperty` but not `NamedEnumeratedProperty`.
impl_value_getter! {
impl GeneralCategoryGroup {
PropertyNameParseGeneralCategoryMaskV1 / SINGLETON_PROPERTY_NAME_PARSE_GENERAL_CATEGORY_MASK_V1;
}
}
// `Script` uses `PropertyScriptToIcuScriptMap` for its short names and a linear map
// for its long names.
impl_value_getter! {
impl Script {
PropertyNameParseScriptV1 / SINGLETON_PROPERTY_NAME_PARSE_SCRIPT_V1;
PropertyScriptToIcuScriptMap / PropertyNameShortScriptV1 / SINGLETON_PROPERTY_NAME_SHORT_SCRIPT_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongScriptV1 / SINGLETON_PROPERTY_NAME_LONG_SCRIPT_V1;
}
}
// The six properties below all get parsing plus short/long name lookup, with linear maps
// for both name tables.
impl_value_getter! {
impl HangulSyllableType {
PropertyNameParseHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_PARSE_HANGUL_SYLLABLE_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_SHORT_HANGUL_SYLLABLE_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongHangulSyllableTypeV1 / SINGLETON_PROPERTY_NAME_LONG_HANGUL_SYLLABLE_TYPE_V1;
}
}
impl_value_getter! {
impl EastAsianWidth {
PropertyNameParseEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_PARSE_EAST_ASIAN_WIDTH_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_SHORT_EAST_ASIAN_WIDTH_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongEastAsianWidthV1 / SINGLETON_PROPERTY_NAME_LONG_EAST_ASIAN_WIDTH_V1;
}
}
impl_value_getter! {
impl LineBreak {
PropertyNameParseLineBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_LINE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortLineBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_LINE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongLineBreakV1 / SINGLETON_PROPERTY_NAME_LONG_LINE_BREAK_V1;
}
}
impl_value_getter! {
impl GraphemeClusterBreak {
PropertyNameParseGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_GRAPHEME_CLUSTER_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_GRAPHEME_CLUSTER_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongGraphemeClusterBreakV1 / SINGLETON_PROPERTY_NAME_LONG_GRAPHEME_CLUSTER_BREAK_V1;
}
}
impl_value_getter! {
impl WordBreak {
PropertyNameParseWordBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_WORD_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortWordBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_WORD_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongWordBreakV1 / SINGLETON_PROPERTY_NAME_LONG_WORD_BREAK_V1;
}
}
impl_value_getter! {
impl SentenceBreak {
PropertyNameParseSentenceBreakV1 / SINGLETON_PROPERTY_NAME_PARSE_SENTENCE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortSentenceBreakV1 / SINGLETON_PROPERTY_NAME_SHORT_SENTENCE_BREAK_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongSentenceBreakV1 / SINGLETON_PROPERTY_NAME_LONG_SENTENCE_BREAK_V1;
}
}
// `CanonicalCombiningClass` uses sparse maps for both name tables. The attributes written
// before the short/long lines are forwarded to the generated `NamedEnumeratedProperty` impl,
// so name lookup for this property requires the `alloc` feature; parsing does not.
impl_value_getter! {
impl CanonicalCombiningClass {
PropertyNameParseCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_PARSE_CANONICAL_COMBINING_CLASS_V1;
#[cfg(feature = "alloc")]
/// ✨ *Enabled with the `alloc` Cargo feature.*
PropertyEnumToValueNameSparseMap / PropertyNameShortCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_SHORT_CANONICAL_COMBINING_CLASS_V1;
PropertyEnumToValueNameSparseMap / PropertyNameLongCanonicalCombiningClassV1 / SINGLETON_PROPERTY_NAME_LONG_CANONICAL_COMBINING_CLASS_V1;
}
}
// The three properties below all get parsing plus short/long name lookup, with linear maps
// for both name tables.
impl_value_getter! {
impl IndicSyllabicCategory {
PropertyNameParseIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_PARSE_INDIC_SYLLABIC_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_SHORT_INDIC_SYLLABIC_CATEGORY_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongIndicSyllabicCategoryV1 / SINGLETON_PROPERTY_NAME_LONG_INDIC_SYLLABIC_CATEGORY_V1;
}
}
impl_value_getter! {
impl JoiningType {
PropertyNameParseJoiningTypeV1 / SINGLETON_PROPERTY_NAME_PARSE_JOINING_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortJoiningTypeV1 / SINGLETON_PROPERTY_NAME_SHORT_JOINING_TYPE_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongJoiningTypeV1 / SINGLETON_PROPERTY_NAME_LONG_JOINING_TYPE_V1;
}
}
impl_value_getter! {
impl VerticalOrientation {
PropertyNameParseVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_PARSE_VERTICAL_ORIENTATION_V1;
PropertyEnumToValueNameLinearMap / PropertyNameShortVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_SHORT_VERTICAL_ORIENTATION_V1;
PropertyEnumToValueNameLinearMap / PropertyNameLongVerticalOrientationV1 / SINGLETON_PROPERTY_NAME_LONG_VERTICAL_ORIENTATION_V1;
}
}

Sorry, the diff of this file is too big to display

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
pub mod names;
#[cfg(feature = "alloc")]
pub use names::{
PropertyNameLongCanonicalCombiningClassV1, PropertyNameShortCanonicalCombiningClassV1,
};
pub use names::{
PropertyNameLongBidiClassV1, PropertyNameLongEastAsianWidthV1,
PropertyNameLongGeneralCategoryV1, PropertyNameLongGraphemeClusterBreakV1,
PropertyNameLongHangulSyllableTypeV1, PropertyNameLongIndicSyllabicCategoryV1,
PropertyNameLongJoiningTypeV1, PropertyNameLongLineBreakV1, PropertyNameLongScriptV1,
PropertyNameLongSentenceBreakV1, PropertyNameLongVerticalOrientationV1,
PropertyNameLongWordBreakV1, PropertyNameParseBidiClassV1,
PropertyNameParseCanonicalCombiningClassV1, PropertyNameParseEastAsianWidthV1,
PropertyNameParseGeneralCategoryMaskV1, PropertyNameParseGeneralCategoryV1,
PropertyNameParseGraphemeClusterBreakV1, PropertyNameParseHangulSyllableTypeV1,
PropertyNameParseIndicSyllabicCategoryV1, PropertyNameParseJoiningTypeV1,
PropertyNameParseLineBreakV1, PropertyNameParseScriptV1, PropertyNameParseSentenceBreakV1,
PropertyNameParseVerticalOrientationV1, PropertyNameParseWordBreakV1,
PropertyNameShortBidiClassV1, PropertyNameShortEastAsianWidthV1,
PropertyNameShortGeneralCategoryV1, PropertyNameShortGraphemeClusterBreakV1,
PropertyNameShortHangulSyllableTypeV1, PropertyNameShortIndicSyllabicCategoryV1,
PropertyNameShortJoiningTypeV1, PropertyNameShortLineBreakV1, PropertyNameShortScriptV1,
PropertyNameShortSentenceBreakV1, PropertyNameShortVerticalOrientationV1,
PropertyNameShortWordBreakV1,
};
pub use crate::props::gc::GeneralCategoryULE;
use crate::props::*;
use crate::script::ScriptWithExt;
use core::ops::RangeInclusive;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::prelude::*;
use zerofrom::ZeroFrom;
use zerovec::{VarZeroVec, ZeroSlice};
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// This is a field-less struct; the data macros invoked in the `const _` block that
/// follows take it as their argument.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub struct Baked;
// Anonymous const scope: the glob import and the helper `icu` module below stay local to this
// block while the data macros are invoked on `Baked`.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
// NOTE(review): the `make_provider!` / `impl_*!` macros are presumably brought into scope
// by this glob import — confirm against `icu_properties_data`.
use icu_properties_data::*;
// NOTE(review): presumably lets paths such as `icu::properties` and `icu::collections`
// used by the macro-generated code resolve inside this crate — confirm.
pub mod icu {
pub use crate as properties;
pub use icu_collections as collections;
}
make_provider!(Baked);
// Binary properties.
impl_property_binary_alnum_v1!(Baked);
impl_property_binary_alphabetic_v1!(Baked);
impl_property_binary_ascii_hex_digit_v1!(Baked);
impl_property_binary_basic_emoji_v1!(Baked);
impl_property_binary_bidi_control_v1!(Baked);
impl_property_binary_bidi_mirrored_v1!(Baked);
impl_property_binary_blank_v1!(Baked);
impl_property_binary_case_ignorable_v1!(Baked);
impl_property_binary_case_sensitive_v1!(Baked);
impl_property_binary_cased_v1!(Baked);
impl_property_binary_changes_when_casefolded_v1!(Baked);
impl_property_binary_changes_when_casemapped_v1!(Baked);
impl_property_binary_changes_when_lowercased_v1!(Baked);
impl_property_binary_changes_when_nfkc_casefolded_v1!(Baked);
impl_property_binary_changes_when_titlecased_v1!(Baked);
impl_property_binary_changes_when_uppercased_v1!(Baked);
impl_property_binary_dash_v1!(Baked);
impl_property_binary_default_ignorable_code_point_v1!(Baked);
impl_property_binary_deprecated_v1!(Baked);
impl_property_binary_diacritic_v1!(Baked);
impl_property_binary_emoji_component_v1!(Baked);
impl_property_binary_emoji_modifier_base_v1!(Baked);
impl_property_binary_emoji_modifier_v1!(Baked);
impl_property_binary_emoji_presentation_v1!(Baked);
impl_property_binary_emoji_v1!(Baked);
impl_property_binary_extended_pictographic_v1!(Baked);
impl_property_binary_extender_v1!(Baked);
impl_property_binary_full_composition_exclusion_v1!(Baked);
impl_property_binary_graph_v1!(Baked);
impl_property_binary_grapheme_base_v1!(Baked);
impl_property_binary_grapheme_extend_v1!(Baked);
impl_property_binary_grapheme_link_v1!(Baked);
impl_property_binary_hex_digit_v1!(Baked);
impl_property_binary_hyphen_v1!(Baked);
impl_property_binary_id_compat_math_continue_v1!(Baked);
impl_property_binary_id_compat_math_start_v1!(Baked);
impl_property_binary_id_continue_v1!(Baked);
impl_property_binary_id_start_v1!(Baked);
impl_property_binary_ideographic_v1!(Baked);
impl_property_binary_ids_binary_operator_v1!(Baked);
impl_property_binary_ids_trinary_operator_v1!(Baked);
impl_property_binary_ids_unary_operator_v1!(Baked);
impl_property_binary_join_control_v1!(Baked);
impl_property_binary_logical_order_exception_v1!(Baked);
impl_property_binary_lowercase_v1!(Baked);
impl_property_binary_math_v1!(Baked);
impl_property_binary_modifier_combining_mark_v1!(Baked);
impl_property_binary_nfc_inert_v1!(Baked);
impl_property_binary_nfd_inert_v1!(Baked);
impl_property_binary_nfkc_inert_v1!(Baked);
impl_property_binary_nfkd_inert_v1!(Baked);
impl_property_binary_noncharacter_code_point_v1!(Baked);
impl_property_binary_pattern_syntax_v1!(Baked);
impl_property_binary_pattern_white_space_v1!(Baked);
impl_property_binary_prepended_concatenation_mark_v1!(Baked);
impl_property_binary_print_v1!(Baked);
impl_property_binary_quotation_mark_v1!(Baked);
impl_property_binary_radical_v1!(Baked);
impl_property_binary_regional_indicator_v1!(Baked);
impl_property_binary_segment_starter_v1!(Baked);
impl_property_binary_sentence_terminal_v1!(Baked);
impl_property_binary_soft_dotted_v1!(Baked);
impl_property_binary_terminal_punctuation_v1!(Baked);
impl_property_binary_unified_ideograph_v1!(Baked);
impl_property_binary_uppercase_v1!(Baked);
impl_property_binary_variation_selector_v1!(Baked);
impl_property_binary_white_space_v1!(Baked);
impl_property_binary_xdigit_v1!(Baked);
impl_property_binary_xid_continue_v1!(Baked);
impl_property_binary_xid_start_v1!(Baked);
// Enumerated properties.
impl_property_enum_bidi_class_v1!(Baked);
impl_property_enum_bidi_mirroring_glyph_v1!(Baked);
impl_property_enum_canonical_combining_class_v1!(Baked);
impl_property_enum_east_asian_width_v1!(Baked);
impl_property_enum_general_category_v1!(Baked);
impl_property_enum_grapheme_cluster_break_v1!(Baked);
impl_property_enum_hangul_syllable_type_v1!(Baked);
impl_property_enum_indic_conjunct_break_v1!(Baked);
impl_property_enum_indic_syllabic_category_v1!(Baked);
impl_property_enum_joining_type_v1!(Baked);
impl_property_enum_line_break_v1!(Baked);
impl_property_enum_script_v1!(Baked);
impl_property_enum_sentence_break_v1!(Baked);
impl_property_enum_vertical_orientation_v1!(Baked);
impl_property_enum_word_break_v1!(Baked);
// Long value names. The canonical combining class entry is gated on `alloc`.
impl_property_name_long_bidi_class_v1!(Baked);
#[cfg(feature = "alloc")]
impl_property_name_long_canonical_combining_class_v1!(Baked);
impl_property_name_long_east_asian_width_v1!(Baked);
impl_property_name_long_general_category_v1!(Baked);
impl_property_name_long_grapheme_cluster_break_v1!(Baked);
impl_property_name_long_hangul_syllable_type_v1!(Baked);
impl_property_name_long_indic_syllabic_category_v1!(Baked);
impl_property_name_long_joining_type_v1!(Baked);
impl_property_name_long_line_break_v1!(Baked);
impl_property_name_long_script_v1!(Baked);
impl_property_name_long_sentence_break_v1!(Baked);
impl_property_name_long_vertical_orientation_v1!(Baked);
impl_property_name_long_word_break_v1!(Baked);
// Value-name parsing.
impl_property_name_parse_bidi_class_v1!(Baked);
impl_property_name_parse_canonical_combining_class_v1!(Baked);
impl_property_name_parse_east_asian_width_v1!(Baked);
impl_property_name_parse_general_category_mask_v1!(Baked);
impl_property_name_parse_general_category_v1!(Baked);
impl_property_name_parse_grapheme_cluster_break_v1!(Baked);
impl_property_name_parse_hangul_syllable_type_v1!(Baked);
impl_property_name_parse_indic_syllabic_category_v1!(Baked);
impl_property_name_parse_joining_type_v1!(Baked);
impl_property_name_parse_line_break_v1!(Baked);
impl_property_name_parse_script_v1!(Baked);
impl_property_name_parse_sentence_break_v1!(Baked);
impl_property_name_parse_vertical_orientation_v1!(Baked);
impl_property_name_parse_word_break_v1!(Baked);
// Short value names. The canonical combining class entry is gated on `alloc`.
impl_property_name_short_bidi_class_v1!(Baked);
#[cfg(feature = "alloc")]
impl_property_name_short_canonical_combining_class_v1!(Baked);
impl_property_name_short_east_asian_width_v1!(Baked);
impl_property_name_short_general_category_v1!(Baked);
impl_property_name_short_grapheme_cluster_break_v1!(Baked);
impl_property_name_short_hangul_syllable_type_v1!(Baked);
impl_property_name_short_indic_syllabic_category_v1!(Baked);
impl_property_name_short_joining_type_v1!(Baked);
impl_property_name_short_line_break_v1!(Baked);
impl_property_name_short_script_v1!(Baked);
impl_property_name_short_sentence_break_v1!(Baked);
impl_property_name_short_vertical_orientation_v1!(Baked);
impl_property_name_short_word_break_v1!(Baked);
// Script-with-extensions data.
impl_property_script_with_extensions_v1!(Baked);
};
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryAlnumV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryAlnumV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryAlphabeticV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryAlphabeticV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryAsciiHexDigitV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryAsciiHexDigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryBidiControlV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryBidiControlV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryBidiMirroredV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryBidiMirroredV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryBlankV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryBlankV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryCasedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryCasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryCaseIgnorableV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryCaseIgnorableV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryCaseSensitiveV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryCaseSensitiveV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenCasefoldedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenCasefoldedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenCasemappedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenCasemappedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenLowercasedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenLowercasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenNfkcCasefoldedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenNfkcCasefoldedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenTitlecasedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenTitlecasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryChangesWhenUppercasedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryChangesWhenUppercasedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryDashV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryDashV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryDefaultIgnorableCodePointV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryDefaultIgnorableCodePointV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryDeprecatedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryDeprecatedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryDiacriticV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryDiacriticV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryEmojiComponentV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryEmojiComponentV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryEmojiModifierBaseV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryEmojiModifierBaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryEmojiModifierV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryEmojiModifierV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryEmojiPresentationV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryEmojiPresentationV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryEmojiV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryEmojiV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryExtendedPictographicV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryExtendedPictographicV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryExtenderV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryExtenderV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryFullCompositionExclusionV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryFullCompositionExclusionV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryGraphemeBaseV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryGraphemeBaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryGraphemeExtendV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryGraphemeExtendV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryGraphemeLinkV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryGraphemeLinkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryGraphV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryGraphV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryHexDigitV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryHexDigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryHyphenV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryHyphenV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdCompatMathContinueV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdCompatMathContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdCompatMathStartV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdCompatMathStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdContinueV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdeographicV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdeographicV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdsBinaryOperatorV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdsBinaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdStartV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdsTrinaryOperatorV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdsTrinaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryIdsUnaryOperatorV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryIdsUnaryOperatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryJoinControlV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryJoinControlV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryLogicalOrderExceptionV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryLogicalOrderExceptionV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryLowercaseV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryLowercaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryMathV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryMathV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryModifierCombiningMarkV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryModifierCombiningMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryNfcInertV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryNfcInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryNfdInertV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryNfdInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryNfkcInertV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryNfkcInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryNfkdInertV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryNfkdInertV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryNoncharacterCodePointV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryNoncharacterCodePointV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryPatternSyntaxV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryPatternSyntaxV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryPatternWhiteSpaceV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryPatternWhiteSpaceV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryPrependedConcatenationMarkV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryPrependedConcatenationMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryPrintV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryPrintV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryQuotationMarkV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryQuotationMarkV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryRadicalV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryRadicalV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryRegionalIndicatorV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryRegionalIndicatorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinarySegmentStarterV1`; its data struct is `PropertyCodePointSet`.
PropertyBinarySegmentStarterV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinarySentenceTerminalV1`; its data struct is `PropertyCodePointSet`.
PropertyBinarySentenceTerminalV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinarySoftDottedV1`; its data struct is `PropertyCodePointSet`.
PropertyBinarySoftDottedV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryTerminalPunctuationV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryTerminalPunctuationV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryUnifiedIdeographV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryUnifiedIdeographV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryUppercaseV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryUppercaseV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryVariationSelectorV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryVariationSelectorV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryWhiteSpaceV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryWhiteSpaceV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryXdigitV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryXdigitV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryXidContinueV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryXidContinueV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryXidStartV1`; its data struct is `PropertyCodePointSet`.
PropertyBinaryXidStartV1,
PropertyCodePointSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Data marker for the 'BidiClass' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `BidiClass` values.
PropertyEnumBidiClassV1,
PropertyCodePointMap<'static, crate::props::BidiClass>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'CanonicalCombiningClass' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `CanonicalCombiningClass` values.
PropertyEnumCanonicalCombiningClassV1,
PropertyCodePointMap<'static, crate::props::CanonicalCombiningClass>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'EastAsianWidth' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `EastAsianWidth` values.
PropertyEnumEastAsianWidthV1,
PropertyCodePointMap<'static, crate::props::EastAsianWidth>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'GeneralCategory' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `GeneralCategory` values.
PropertyEnumGeneralCategoryV1,
PropertyCodePointMap<'static, crate::props::GeneralCategory>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'GraphemeClusterBreak' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `GraphemeClusterBreak` values.
PropertyEnumGraphemeClusterBreakV1,
PropertyCodePointMap<'static, crate::props::GraphemeClusterBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'HangulSyllableType' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `HangulSyllableType` values.
PropertyEnumHangulSyllableTypeV1,
PropertyCodePointMap<'static, crate::props::HangulSyllableType>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'IndicConjunctBreak' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `IndicConjunctBreak` values.
PropertyEnumIndicConjunctBreakV1,
PropertyCodePointMap<'static, crate::props::IndicConjunctBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'IndicSyllabicCategory' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `IndicSyllabicCategory` values.
PropertyEnumIndicSyllabicCategoryV1,
PropertyCodePointMap<'static, crate::props::IndicSyllabicCategory>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'JoiningType' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `JoiningType` values.
PropertyEnumJoiningTypeV1,
PropertyCodePointMap<'static, crate::props::JoiningType>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'LineBreak' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `LineBreak` values.
PropertyEnumLineBreakV1,
PropertyCodePointMap<'static, crate::props::LineBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'Script' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `Script` values.
PropertyEnumScriptV1,
PropertyCodePointMap<'static, crate::props::Script>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'SentenceBreak' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `SentenceBreak` values.
PropertyEnumSentenceBreakV1,
PropertyCodePointMap<'static, crate::props::SentenceBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'Vertical_Orientation' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `VerticalOrientation` values.
PropertyEnumVerticalOrientationV1,
PropertyCodePointMap<'static, crate::props::VerticalOrientation>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'WordBreak' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of `WordBreak` values.
PropertyEnumWordBreakV1,
PropertyCodePointMap<'static, crate::props::WordBreak>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Data marker for the 'BidiMirroringGlyph' Unicode property.
///
/// Singleton marker; its data struct is a `PropertyCodePointMap` of
/// `crate::bidi::BidiMirroringGlyph` values.
PropertyEnumBidiMirroringGlyphV1,
PropertyCodePointMap<'static, crate::bidi::BidiMirroringGlyph>,
is_singleton = true,
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyBinaryBasicEmojiV1`; its data struct is `PropertyUnicodeSet`,
/// a set of both characters and strings.
PropertyBinaryBasicEmojiV1,
PropertyUnicodeSet<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyScriptWithExtensionsV1`; its data struct is
/// `ScriptWithExtensionsProperty`, which stores `Script` and `Script_Extensions` data.
PropertyScriptWithExtensionsV1,
ScriptWithExtensionsProperty<'static>,
is_singleton = true
);
/// All data markers in this module, listed as their `DataMarkerInfo` values.
///
/// Entries annotated with `#[cfg(feature = "alloc")]` are only part of the
/// slice when the `alloc` feature is enabled.
pub const MARKERS: &[DataMarkerInfo] = &[
PropertyNameLongBidiClassV1::INFO,
#[cfg(feature = "alloc")]
PropertyNameLongCanonicalCombiningClassV1::INFO,
PropertyNameLongEastAsianWidthV1::INFO,
PropertyNameLongGeneralCategoryV1::INFO,
PropertyNameLongGraphemeClusterBreakV1::INFO,
PropertyNameLongHangulSyllableTypeV1::INFO,
PropertyNameLongIndicSyllabicCategoryV1::INFO,
PropertyNameLongJoiningTypeV1::INFO,
PropertyNameLongLineBreakV1::INFO,
PropertyNameLongScriptV1::INFO,
PropertyNameLongSentenceBreakV1::INFO,
PropertyNameLongVerticalOrientationV1::INFO,
PropertyNameLongWordBreakV1::INFO,
PropertyNameParseBidiClassV1::INFO,
PropertyNameParseCanonicalCombiningClassV1::INFO,
PropertyNameParseEastAsianWidthV1::INFO,
PropertyNameParseGeneralCategoryMaskV1::INFO,
PropertyNameParseGeneralCategoryV1::INFO,
PropertyNameParseGraphemeClusterBreakV1::INFO,
PropertyNameParseHangulSyllableTypeV1::INFO,
PropertyNameParseIndicSyllabicCategoryV1::INFO,
PropertyNameParseJoiningTypeV1::INFO,
PropertyNameParseLineBreakV1::INFO,
PropertyNameParseScriptV1::INFO,
PropertyNameParseSentenceBreakV1::INFO,
PropertyNameParseVerticalOrientationV1::INFO,
PropertyNameParseWordBreakV1::INFO,
PropertyNameShortBidiClassV1::INFO,
#[cfg(feature = "alloc")]
PropertyNameShortCanonicalCombiningClassV1::INFO,
PropertyNameShortEastAsianWidthV1::INFO,
PropertyNameShortGeneralCategoryV1::INFO,
PropertyNameShortGraphemeClusterBreakV1::INFO,
PropertyNameShortHangulSyllableTypeV1::INFO,
PropertyNameShortIndicSyllabicCategoryV1::INFO,
PropertyNameShortJoiningTypeV1::INFO,
PropertyNameShortLineBreakV1::INFO,
PropertyNameShortScriptV1::INFO,
PropertyNameShortSentenceBreakV1::INFO,
PropertyNameShortVerticalOrientationV1::INFO,
PropertyNameShortWordBreakV1::INFO,
PropertyBinaryAlnumV1::INFO,
PropertyBinaryAlphabeticV1::INFO,
PropertyBinaryAsciiHexDigitV1::INFO,
PropertyBinaryBidiControlV1::INFO,
PropertyBinaryBidiMirroredV1::INFO,
PropertyBinaryBlankV1::INFO,
PropertyBinaryCasedV1::INFO,
PropertyBinaryCaseIgnorableV1::INFO,
PropertyBinaryCaseSensitiveV1::INFO,
PropertyBinaryChangesWhenCasefoldedV1::INFO,
PropertyBinaryChangesWhenCasemappedV1::INFO,
PropertyBinaryChangesWhenLowercasedV1::INFO,
PropertyBinaryChangesWhenNfkcCasefoldedV1::INFO,
PropertyBinaryChangesWhenTitlecasedV1::INFO,
PropertyBinaryChangesWhenUppercasedV1::INFO,
PropertyBinaryDashV1::INFO,
PropertyBinaryDefaultIgnorableCodePointV1::INFO,
PropertyBinaryDeprecatedV1::INFO,
PropertyBinaryDiacriticV1::INFO,
PropertyBinaryEmojiComponentV1::INFO,
PropertyBinaryEmojiModifierBaseV1::INFO,
PropertyBinaryEmojiModifierV1::INFO,
PropertyBinaryEmojiPresentationV1::INFO,
PropertyBinaryEmojiV1::INFO,
PropertyBinaryExtendedPictographicV1::INFO,
PropertyBinaryExtenderV1::INFO,
PropertyBinaryFullCompositionExclusionV1::INFO,
PropertyBinaryGraphemeBaseV1::INFO,
PropertyBinaryGraphemeExtendV1::INFO,
PropertyBinaryGraphemeLinkV1::INFO,
PropertyBinaryGraphV1::INFO,
PropertyBinaryHexDigitV1::INFO,
PropertyBinaryHyphenV1::INFO,
PropertyBinaryIdCompatMathContinueV1::INFO,
PropertyBinaryIdCompatMathStartV1::INFO,
PropertyBinaryIdContinueV1::INFO,
PropertyBinaryIdeographicV1::INFO,
PropertyBinaryIdsBinaryOperatorV1::INFO,
PropertyBinaryIdStartV1::INFO,
PropertyBinaryIdsTrinaryOperatorV1::INFO,
PropertyBinaryIdsUnaryOperatorV1::INFO,
PropertyBinaryJoinControlV1::INFO,
PropertyBinaryLogicalOrderExceptionV1::INFO,
PropertyBinaryLowercaseV1::INFO,
PropertyBinaryMathV1::INFO,
PropertyBinaryModifierCombiningMarkV1::INFO,
PropertyBinaryNfcInertV1::INFO,
PropertyBinaryNfdInertV1::INFO,
PropertyBinaryNfkcInertV1::INFO,
PropertyBinaryNfkdInertV1::INFO,
PropertyBinaryNoncharacterCodePointV1::INFO,
PropertyBinaryPatternSyntaxV1::INFO,
PropertyBinaryPatternWhiteSpaceV1::INFO,
PropertyBinaryPrependedConcatenationMarkV1::INFO,
PropertyBinaryPrintV1::INFO,
PropertyBinaryQuotationMarkV1::INFO,
PropertyBinaryRadicalV1::INFO,
PropertyBinaryRegionalIndicatorV1::INFO,
PropertyBinarySegmentStarterV1::INFO,
PropertyBinarySentenceTerminalV1::INFO,
PropertyBinarySoftDottedV1::INFO,
PropertyBinaryTerminalPunctuationV1::INFO,
PropertyBinaryUnifiedIdeographV1::INFO,
PropertyBinaryUppercaseV1::INFO,
PropertyBinaryVariationSelectorV1::INFO,
PropertyBinaryWhiteSpaceV1::INFO,
PropertyBinaryXdigitV1::INFO,
PropertyBinaryXidContinueV1::INFO,
PropertyBinaryXidStartV1::INFO,
PropertyEnumBidiClassV1::INFO,
PropertyEnumCanonicalCombiningClassV1::INFO,
PropertyEnumEastAsianWidthV1::INFO,
PropertyEnumGeneralCategoryV1::INFO,
PropertyEnumGraphemeClusterBreakV1::INFO,
PropertyEnumHangulSyllableTypeV1::INFO,
PropertyEnumIndicConjunctBreakV1::INFO,
PropertyEnumIndicSyllabicCategoryV1::INFO,
PropertyEnumJoiningTypeV1::INFO,
PropertyEnumLineBreakV1::INFO,
PropertyEnumScriptV1::INFO,
PropertyEnumSentenceBreakV1::INFO,
PropertyEnumVerticalOrientationV1::INFO,
PropertyEnumWordBreakV1::INFO,
PropertyEnumBidiMirroringGlyphV1::INFO,
PropertyBinaryBasicEmojiV1::INFO,
PropertyScriptWithExtensionsV1::INFO,
];
/// A set of characters which share a particular property value.
///
/// This data enum is extensible; more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only when the `datagen` feature is enabled.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
// `Deserialize` is derived only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyCodePointSet<'data> {
/// The set of characters, represented as an inversion list
InversionList(#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionList<'data>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Hooks this type into `icu_provider` through `data_struct!`. The `datagen`
// cfg attribute is handed to the macro (presumably to gate datagen-only
// impls; confirm against the macro definition).
icu_provider::data_struct!(
PropertyCodePointSet<'_>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backing structure is in
// use. See CodePointSetData for documentation of these functions.
impl<'data> PropertyCodePointSet<'data> {
    /// Tests whether `ch` belongs to the set by asking the backing structure.
    #[inline]
    pub(crate) fn contains(&self, ch: char) -> bool {
        match self {
            Self::InversionList(list) => list.contains(ch),
        }
    }

    /// Same membership test as `contains`, taking the code point as a raw `u32`.
    #[inline]
    pub(crate) fn contains32(&self, ch: u32) -> bool {
        match self {
            Self::InversionList(list) => list.contains32(ch),
        }
    }

    /// Forwards to the backing structure's `iter_ranges`, yielding inclusive
    /// `u32` ranges that borrow from `self`.
    #[inline]
    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
        match self {
            Self::InversionList(list) => list.iter_ranges(),
        }
    }

    /// Forwards to the backing structure's `iter_ranges_complemented`,
    /// yielding inclusive `u32` ranges that borrow from `self`.
    #[inline]
    pub(crate) fn iter_ranges_complemented(
        &self,
    ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
        match self {
            Self::InversionList(list) => list.iter_ranges_complemented(),
        }
    }

    /// Wraps an inversion list in the `InversionList` variant.
    #[inline]
    pub(crate) fn from_code_point_inversion_list(l: CodePointInversionList<'static>) -> Self {
        Self::InversionList(l)
    }

    /// Borrows the inversion list backing this set, if there is one.
    #[inline]
    pub(crate) fn as_code_point_inversion_list(
        &self,
    ) -> Option<&CodePointInversionList<'data>> {
        match self {
            Self::InversionList(list) => Some(list),
            // any other backing data structure that cannot return a CPInvList in O(1) time should return None
        }
    }

    /// Produces a `CodePointInversionList` borrowing from `self` via `ZeroFrom`.
    #[inline]
    pub(crate) fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
        match self {
            Self::InversionList(list) => ZeroFrom::zero_from(list),
        }
    }
}
/// A map efficiently storing data about individual characters.
///
/// This data enum is extensible; more backends may be added in the future.
/// Old data can be used with newer code but not vice versa.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Debug, Eq, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only when the `datagen` feature is enabled.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
// `Deserialize` is derived only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyCodePointMap<'data, T: TrieValue> {
/// A codepoint trie storing the data
CodePointTrie(#[cfg_attr(feature = "serde", serde(borrow))] CodePointTrie<'data, T>),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Hooks this generic type into `icu_provider` through `data_struct!`. The
// `datagen` cfg attribute is handed to the macro (presumably to gate
// datagen-only impls; confirm against the macro definition).
icu_provider::data_struct!(
<T: TrieValue> PropertyCodePointMap<'_, T>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backing structure is in
// use. See CodePointMapData for documentation of these functions.
impl<'data, T: TrieValue> PropertyCodePointMap<'data, T> {
    /// Looks up the value for a code point given as a raw `u32`.
    #[inline]
    pub(crate) fn get32(&self, ch: u32) -> T {
        match self {
            Self::CodePointTrie(trie) => trie.get32(ch),
        }
    }

    /// Looks up the value for the character `c`.
    #[inline]
    pub(crate) fn get(&self, c: char) -> T {
        match self {
            Self::CodePointTrie(trie) => trie.get(c),
        }
    }

    /// Consumes the map and re-types its values as `P`, forwarding any
    /// `UleError` reported by the backing structure's conversion.
    #[cfg(feature = "alloc")]
    #[inline]
    pub(crate) fn try_into_converted<P>(
        self,
    ) -> Result<PropertyCodePointMap<'data, P>, zerovec::ule::UleError>
    where
        P: TrieValue,
    {
        match self {
            Self::CodePointTrie(trie) => trie
                .try_into_converted()
                .map(PropertyCodePointMap::CodePointTrie),
        }
    }

    /// Forwards to the backing structure's `get_set_for_value`, returning an
    /// owned (`'static`) inversion list for `value`.
    #[cfg(feature = "alloc")]
    #[inline]
    pub(crate) fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
        match self {
            Self::CodePointTrie(trie) => trie.get_set_for_value(value),
        }
    }

    /// Forwards to the backing structure's `iter_ranges`.
    #[inline]
    pub(crate) fn iter_ranges(&self) -> impl Iterator<Item = CodePointMapRange<T>> + '_ {
        match self {
            Self::CodePointTrie(trie) => trie.iter_ranges(),
        }
    }

    /// Forwards to the backing structure's `iter_ranges_mapped`, passing `map`
    /// through unchanged.
    #[inline]
    pub(crate) fn iter_ranges_mapped<'a, U: Eq + 'a>(
        &'a self,
        map: impl FnMut(T) -> U + Copy + 'a,
    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
        match self {
            Self::CodePointTrie(trie) => trie.iter_ranges_mapped(map),
        }
    }

    /// Wraps a code point trie in the `CodePointTrie` variant.
    #[inline]
    pub(crate) fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        Self::CodePointTrie(trie)
    }

    /// Borrows the code point trie backing this map, if there is one.
    #[inline]
    pub(crate) fn as_code_point_trie(&self) -> Option<&CodePointTrie<'data, T>> {
        match self {
            Self::CodePointTrie(trie) => Some(trie),
            // any other backing data structure that cannot return a CPT in O(1) time should return None
        }
    }

    /// Produces a `CodePointTrie` borrowing from `self` via `ZeroFrom`.
    #[inline]
    pub(crate) fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        match self {
            Self::CodePointTrie(trie) => ZeroFrom::zero_from(trie),
        }
    }
}
/// A set of characters and strings which share a particular property value.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only when the `datagen` feature is enabled.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
// `Deserialize` is derived only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[non_exhaustive]
pub enum PropertyUnicodeSet<'data> {
/// A set representing characters in an inversion list, and the strings in a list.
CPInversionListStrList(
#[cfg_attr(feature = "serde", serde(borrow))] CodePointInversionListAndStringList<'data>,
),
// new variants should go BELOW existing ones
// Serde serializes based on variant name and index in the enum
// https://docs.rs/serde/latest/serde/trait.Serializer.html#tymethod.serialize_unit_variant
}
// Hooks this type into `icu_provider` through `data_struct!`. The `datagen`
// cfg attribute is handed to the macro (presumably to gate datagen-only
// impls; confirm against the macro definition).
icu_provider::data_struct!(
PropertyUnicodeSet<'_>,
#[cfg(feature = "datagen")]
);
// Crate-internal accessors that forward to whichever backing structure is in use.
impl<'data> PropertyUnicodeSet<'data> {
    /// Tests whether the string `s` belongs to the set.
    #[inline]
    pub(crate) fn contains_str(&self, s: &str) -> bool {
        match self {
            Self::CPInversionListStrList(list) => list.contains_str(s),
        }
    }

    /// Tests whether the code point `cp`, given as a raw `u32`, belongs to the set.
    #[inline]
    pub(crate) fn contains32(&self, cp: u32) -> bool {
        match self {
            Self::CPInversionListStrList(list) => list.contains32(cp),
        }
    }

    /// Tests whether the character `ch` belongs to the set.
    #[inline]
    pub(crate) fn contains(&self, ch: char) -> bool {
        match self {
            Self::CPInversionListStrList(list) => list.contains(ch),
        }
    }

    /// Wraps an inversion-list-plus-string-list in the `CPInversionListStrList` variant.
    #[inline]
    pub(crate) fn from_code_point_inversion_list_string_list(
        l: CodePointInversionListAndStringList<'static>,
    ) -> Self {
        Self::CPInversionListStrList(l)
    }

    /// Borrows the `CodePointInversionListAndStringList` backing this set, if there is one.
    #[inline]
    pub(crate) fn as_code_point_inversion_list_string_list(
        &self,
    ) -> Option<&CodePointInversionListAndStringList<'data>> {
        match self {
            Self::CPInversionListStrList(list) => Some(list),
            // any other backing data structure that cannot return a CPInversionListStrList in O(1) time should return None
        }
    }

    /// Produces a `CodePointInversionListAndStringList` borrowing from `self` via `ZeroFrom`.
    #[inline]
    pub(crate) fn to_code_point_inversion_list_string_list(
        &self,
    ) -> CodePointInversionListAndStringList<'_> {
        match self {
            Self::CPInversionListStrList(list) => ZeroFrom::zero_from(list),
        }
    }
}
/// A struct that efficiently stores `Script` and `Script_Extensions` property data.
///
/// The data is split across two fields: `trie` holds one `ScriptWithExt` value
/// per code point, and `extensions` holds the sub-arrays that some of those
/// values index into.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
// `Serialize` and `Bake` are derived only when the `datagen` feature is enabled.
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider))]
// `Deserialize` is derived only when the `serde` feature is enabled.
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct ScriptWithExtensionsProperty<'data> {
/// Note: The `ScriptWithExt` values in this array will assume a 12-bit layout. The 2
/// higher order bits 11..10 will indicate how to deduce the Script value and
/// Script_Extensions value, nearly matching the representation
/// [in ICU](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uprops.h):
///
/// | High order 2 bits value | Script | Script_Extensions |
/// |-------------------------|--------------------------------------------------------|----------------------------------------------------------------|
/// | 3 | First value in sub-array, index given by lower 10 bits | Sub-array excluding first value, index given by lower 10 bits |
/// | 2 | Script=Inherited | Entire sub-array, index given by lower 10 bits |
/// | 1 | Script=Common | Entire sub-array, index given by lower 10 bits |
/// | 0 | Value in lower 10 bits | `[ Script value ]` single-element array |
///
/// When the lower 10 bits of the value are used as an index, that index is
/// used for the outer-level vector of the nested `extensions` structure.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, ScriptWithExt>,
/// This companion structure stores Script_Extensions values, which are
/// themselves arrays / vectors. This structure only stores the values for
/// cases in which `scx(cp) != [ sc(cp) ]`. Each sub-vector is distinct. The
/// sub-vector represents the Script_Extensions array value for a code point,
/// and may also indicate Script value, as described for the `trie` field.
#[cfg_attr(feature = "serde", serde(borrow))]
pub extensions: VarZeroVec<'data, ZeroSlice<Script>>,
}
// Hooks this type into `icu_provider` through `data_struct!`. The `datagen`
// cfg attribute is handed to the macro (presumably to gate datagen-only
// impls; confirm against the macro definition).
icu_provider::data_struct!(
ScriptWithExtensionsProperty<'_>,
#[cfg(feature = "datagen")]
);
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Unstable\] Property names-related data for this component
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
use icu_locale_core::subtags::Script;
use icu_provider::prelude::{yoke, zerofrom};
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ule::NichedOption;
use zerovec::{VarZeroVec, ZeroVec};
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseBidiClassV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseBidiClassV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseCanonicalCombiningClassV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseCanonicalCombiningClassV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseEastAsianWidthV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseEastAsianWidthV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseGeneralCategoryMaskV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseGeneralCategoryMaskV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseGeneralCategoryV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseGeneralCategoryV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseGraphemeClusterBreakV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseGraphemeClusterBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseHangulSyllableTypeV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseHangulSyllableTypeV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseIndicSyllabicCategoryV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseIndicSyllabicCategoryV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseJoiningTypeV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseJoiningTypeV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseLineBreakV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseLineBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseScriptV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseScriptV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseSentenceBreakV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseSentenceBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseVerticalOrientationV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseVerticalOrientationV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Singleton data marker `PropertyNameParseWordBreakV1`; its data struct is `PropertyValueNameToEnumMap`.
PropertyNameParseWordBreakV1,
PropertyValueNameToEnumMap<'static>,
is_singleton = true
);
// Data markers for the value -> name direction, one `Long` and one `Short` marker per property.
// Every marker in this group carries a `PropertyEnumToValueNameLinearMap` payload, i.e. the
// names are stored in a vector indexed by the value discriminant.
icu_provider::data_marker!(
/// `PropertyNameLongBidiClassV1`
PropertyNameLongBidiClassV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortBidiClassV1`
PropertyNameShortBidiClassV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongEastAsianWidthV1`
PropertyNameLongEastAsianWidthV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortEastAsianWidthV1`
PropertyNameShortEastAsianWidthV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongGeneralCategoryV1`
PropertyNameLongGeneralCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortGeneralCategoryV1`
PropertyNameShortGeneralCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongGraphemeClusterBreakV1`
PropertyNameLongGraphemeClusterBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortGraphemeClusterBreakV1`
PropertyNameShortGraphemeClusterBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongHangulSyllableTypeV1`
PropertyNameLongHangulSyllableTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortHangulSyllableTypeV1`
PropertyNameShortHangulSyllableTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongIndicSyllabicCategoryV1`
PropertyNameLongIndicSyllabicCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortIndicSyllabicCategoryV1`
PropertyNameShortIndicSyllabicCategoryV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
// More value -> name markers backed by `PropertyEnumToValueNameLinearMap`.
// NOTE: `Script` only has a `Long` marker in this group; `PropertyNameShortScriptV1` is declared
// separately in this file with a `PropertyScriptToIcuScriptMap` payload.
icu_provider::data_marker!(
/// `PropertyNameLongJoiningTypeV1`
PropertyNameLongJoiningTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortJoiningTypeV1`
PropertyNameShortJoiningTypeV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongLineBreakV1`
PropertyNameLongLineBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortLineBreakV1`
PropertyNameShortLineBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongScriptV1`
PropertyNameLongScriptV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongSentenceBreakV1`
PropertyNameLongSentenceBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortSentenceBreakV1`
PropertyNameShortSentenceBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongVerticalOrientationV1`
PropertyNameLongVerticalOrientationV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortVerticalOrientationV1`
PropertyNameShortVerticalOrientationV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameLongWordBreakV1`
PropertyNameLongWordBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// `PropertyNameShortWordBreakV1`
PropertyNameShortWordBreakV1,
PropertyEnumToValueNameLinearMap<'static>,
is_singleton = true
);
// Canonical_Combining_Class names use the sparse (`ZeroMap`-backed) payload instead of the linear
// one. The payload type is itself gated on the `alloc` feature, so these markers are gated too.
#[cfg(feature = "alloc")]
icu_provider::data_marker!(
/// `PropertyNameLongCanonicalCombiningClassV1`
PropertyNameLongCanonicalCombiningClassV1,
PropertyEnumToValueNameSparseMap<'static>,
is_singleton = true,
);
#[cfg(feature = "alloc")]
icu_provider::data_marker!(
/// `PropertyNameShortCanonicalCombiningClassV1`
PropertyNameShortCanonicalCombiningClassV1,
PropertyEnumToValueNameSparseMap<'static>,
is_singleton = true,
);
// Short script names are not stored as strings: the payload is a `PropertyScriptToIcuScriptMap`.
icu_provider::data_marker!(
/// `PropertyNameShortScriptV1`
PropertyNameShortScriptV1,
PropertyScriptToIcuScriptMap<'static>,
is_singleton = true,
);
/// A mapping from property value names to the discriminants of the values they name.
///
/// This is the payload of the `PropertyNameParse*V1` data markers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct PropertyValueNameToEnumMap<'data> {
/// A map from names to their value discriminant, stored as a `ZeroTrieSimpleAscii`
/// over borrowed bytes.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroTrieSimpleAscii<ZeroVec<'data, u8>>,
}
// Registers the struct with the data-provider machinery; the inner `cfg` attribute is forwarded
// to the macro (presumably to gate the datagen-only impls; confirm in `icu_provider::data_struct!`).
icu_provider::data_struct!(
PropertyValueNameToEnumMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping of property values to their names. A single instance of this map will only cover
/// either long or short names, determined whilst loading data.
///
/// This is the sparse variant: names are looked up by discriminant in a `ZeroMap`, rather than
/// by position in a vector. In this file it is the payload of the
/// `PropertyName{Long,Short}CanonicalCombiningClassV1` markers. Only available with the
/// `alloc` Cargo feature.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
#[cfg(feature = "alloc")]
pub struct PropertyEnumToValueNameSparseMap<'data> {
/// A map from the value discriminant to the names
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: zerovec::ZeroMap<'data, u16, str>,
}
// Same feature gate as the struct itself, so the registration disappears together with the type.
#[cfg(feature = "alloc")]
icu_provider::data_struct!(
PropertyEnumToValueNameSparseMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping of property values to their names. A single instance of this map will only cover
/// either long or short names, determined whilst loading data.
///
/// This is the linear variant: the value discriminant is used as an index into a vector of
/// names. It is the payload of most `PropertyNameLong*V1` / `PropertyNameShort*V1` markers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyEnumToValueNameLinearMap<'data> {
/// A map from the value discriminant (the index) to the names, for mostly
/// contiguous data. Empty strings count as missing.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: VarZeroVec<'data, str>,
}
// Registers the struct with the data-provider machinery.
icu_provider::data_struct!(
PropertyEnumToValueNameLinearMap<'_>,
#[cfg(feature = "datagen")]
);
/// A mapping from `Script` property values to optional `Script` values.
///
/// Unlike the other value-to-name maps in this module, the entries are not strings: each entry
/// is a `NichedOption<Script, 4>`. This is the payload of the `PropertyNameShortScriptV1` marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::provider::names))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct PropertyScriptToIcuScriptMap<'data> {
/// A vector of optional `Script` entries, for mostly contiguous data.
///
/// NOTE(review): presumably indexed by the value discriminant, with `None` entries counting
/// as missing (mirroring the linear name map); which `Script` type this is depends on the
/// imports at the top of this module. Confirm both.
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroVec<'data, NichedOption<Script, 4>>,
}
// Registers the struct with the data-provider machinery.
icu_provider::data_struct!(
PropertyScriptToIcuScriptMap<'_>,
#[cfg(feature = "datagen")]
);
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Experimental\] This module is experimental and currently crate-private. Let us know if you
//! have a use case for this!
//!
//! This module contains utilities for working with properties where the specific property in use
//! is not known at compile time.
//!
//! For regex engines, [`crate::sets::load_for_ecma262_unstable()`] is a convenient API for working
//! with properties at runtime tailored for the use case of ECMA262-compatible regex engines.
use crate::provider::*;
use crate::CodePointSetData;
#[cfg(doc)]
use crate::{
props::{GeneralCategory, GeneralCategoryGroup, Script},
script, CodePointMapData, PropertyParser,
};
use icu_provider::prelude::*;
/// This type can represent any binary Unicode property.
///
/// This is intended to be used in situations where the exact unicode property needed is
/// only known at runtime, for example in regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum.
///
/// Note that in `UProperty` the range 65..=71 belongs to the emoji *string* properties
/// (see `StringBinaryProperty`), so the code point properties added after
/// `ExtendedPictographic` continue at 72.
#[non_exhaustive]
#[allow(missing_docs)]
#[allow(dead_code)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
enum BinaryProperty {
    Alnum = 44,
    Alphabetic = 0,
    AsciiHexDigit = 1,
    BidiControl = 2,
    BidiMirrored = 3,
    Blank = 45,
    Cased = 49,
    CaseIgnorable = 50,
    CaseSensitive = 34,
    ChangesWhenCasefolded = 54,
    ChangesWhenCasemapped = 55,
    ChangesWhenLowercased = 51,
    ChangesWhenNfkcCasefolded = 56,
    ChangesWhenTitlecased = 53,
    ChangesWhenUppercased = 52,
    Dash = 4,
    DefaultIgnorableCodePoint = 5,
    Deprecated = 6,
    Diacritic = 7,
    Emoji = 57,
    EmojiComponent = 61,
    EmojiModifier = 59,
    EmojiModifierBase = 60,
    EmojiPresentation = 58,
    ExtendedPictographic = 64,
    Extender = 8,
    FullCompositionExclusion = 9,
    Graph = 46,
    GraphemeBase = 10,
    GraphemeExtend = 11,
    GraphemeLink = 12,
    HexDigit = 13,
    Hyphen = 14,
    // UCHAR_ID_COMPAT_MATH_CONTINUE
    IdCompatMathContinue = 74,
    // UCHAR_ID_COMPAT_MATH_START
    IdCompatMathStart = 73,
    IdContinue = 15,
    Ideographic = 17,
    IdsBinaryOperator = 18,
    IdStart = 16,
    IdsTrinaryOperator = 19,
    // UCHAR_IDS_UNARY_OPERATOR
    IdsUnaryOperator = 72,
    JoinControl = 20,
    LogicalOrderException = 21,
    Lowercase = 22,
    Math = 23,
    // UCHAR_MODIFIER_COMBINING_MARK
    ModifierCombiningMark = 75,
    NfcInert = 39,
    NfdInert = 37,
    NfkcInert = 40,
    NfkdInert = 38,
    NoncharacterCodePoint = 24,
    PatternSyntax = 42,
    PatternWhiteSpace = 43,
    PrependedConcatenationMark = 63,
    Print = 47,
    QuotationMark = 25,
    Radical = 26,
    RegionalIndicator = 62,
    SegmentStarter = 41,
    SentenceTerminal = 35,
    SoftDotted = 27,
    TerminalPunctuation = 28,
    UnifiedIdeograph = 29,
    Uppercase = 30,
    VariationSelector = 36,
    WhiteSpace = 31,
    Xdigit = 48,
    XidContinue = 32,
    XidStart = 33,
}
/// A runtime identifier for a binary property of strings.
///
/// Meant for callers that only learn which Unicode property they need at runtime,
/// such as regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending order of that value.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum StringBinaryProperty {
    BasicEmoji = 65,
    EmojiKeycapSequence = 66,
    RgiEmojiModifierSequence = 67,
    RgiEmojiFlagSequence = 68,
    RgiEmojiTagSequence = 69,
    RgiEmojiZWJSequence = 70,
    RgiEmoji = 71,
}
/// A runtime identifier for an enumerated Unicode property.
///
/// Meant for callers that only learn which Unicode property they need at runtime,
/// such as regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending order of that value.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum EnumeratedProperty {
    BidiClass = 0x1000,
    Block = 0x1001,
    CombiningClass = 0x1002,
    DecompositionType = 0x1003,
    EastAsianWidth = 0x1004,
    GeneralCategory = 0x1005,
    JoiningGroup = 0x1006,
    JoiningType = 0x1007,
    LineBreak = 0x1008,
    NumericType = 0x1009,
    Script = 0x100A,
    HangulSyllableType = 0x100B,
    NFDQuickCheck = 0x100C,
    NFKDQuickCheck = 0x100D,
    NFCQuickCheck = 0x100E,
    NFKCQuickCheck = 0x100F,
    LeadCanonicalCombiningClass = 0x1010,
    TrailCanonicalCombiningClass = 0x1011,
    GraphemeClusterBreak = 0x1012,
    SentenceBreak = 0x1013,
    WordBreak = 0x1014,
    BidiPairedBracketType = 0x1015,
    IndicPositionalCategory = 0x1016,
    IndicSyllabicCategory = 0x1017,
    VerticalOrientation = 0x1018,
    IndicConjunctBreak = 0x101A,
}
/// A runtime identifier for a Unicode mask property.
///
/// Meant for callers that only learn which Unicode property they need at runtime,
/// such as regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum MaskProperty {
    GeneralCategoryMask = 0x2000,
}
/// A runtime identifier for a numeric Unicode property.
///
/// Meant for callers that only learn which Unicode property they need at runtime,
/// such as regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum NumericProperty {
    NumericValue = 0x3000,
}
/// A runtime identifier for a Unicode string property.
///
/// Meant for callers that only learn which Unicode property they need at runtime,
/// such as regex engines.
///
/// The values are intended to be identical to ICU4C's UProperty enum; variants are
/// listed in ascending order of that value.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum StringProperty {
    Age = 0x4000,
    BidiMirroringGlyph = 0x4001,
    CaseFolding = 0x4002,
    ISOComment = 0x4003,
    LowercaseMapping = 0x4004,
    Name = 0x4005,
    SimpleCaseFolding = 0x4006,
    SimpleLowercaseMapping = 0x4007,
    SimpleTitlecaseMapping = 0x4008,
    SimpleUppercaseMapping = 0x4009,
    TitlecaseMapping = 0x400A,
    Unicode1Name = 0x400B,
    UppercaseMapping = 0x400C,
    BidiPairedBracket = 0x400D,
}
/// A runtime identifier for properties that fit none of the other runtime property enums
/// in this module; currently only `Script_Extensions`.
#[non_exhaustive]
#[allow(dead_code, missing_docs)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum MiscProperty {
    ScriptExtensions = 0x7000,
}
impl CodePointSetData {
/// Returns a type capable of looking up values for a property specified as a string, as long as it is a
/// [binary property listed in ECMA-262][ecma], using strict matching on the names in the spec.
///
/// Both the long name and the short name of each supported property are accepted. Returns `None`
/// if `prop` is neither.
///
/// This handles every property required by ECMA-262 `/u` regular expressions, except for:
///
/// - `Script` and `General_Category`: handle these directly using property values parsed via
/// [`PropertyParser<GeneralCategory>`] and [`PropertyParser<Script>`]
/// if necessary.
/// - `Script_Extensions`: handle this directly using APIs from [`crate::script::ScriptWithExtensions`]
/// - `General_Category` mask values: Handle this alongside `General_Category` using [`GeneralCategoryGroup`],
/// using property values parsed via [`PropertyParser<GeneralCategory>`] if necessary
/// - `Assigned`, `Any`, and `ASCII` pseudoproperties: Handle these using their equivalent sets:
/// - `Any` can be expressed as the range `[\u{0}-\u{10FFFF}]`
/// - `Assigned` can be expressed as the inverse of the set `gc=Cn` (i.e., `\P{gc=Cn}`).
/// - `ASCII` can be expressed as the range `[\u{0}-\u{7F}]`
/// - `General_Category` property values, which ECMA-262 allows to be used like properties via a
/// shorthand: simply create the corresponding `GeneralCategory` set.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// ```
/// use icu::properties::CodePointSetData;
///
/// let emoji = CodePointSetData::new_for_ecma262(b"Emoji")
/// .expect("is an ECMA-262 property");
///
/// assert!(emoji.contains('🔥')); // U+1F525 FIRE
/// assert!(!emoji.contains('V'));
/// ```
///
/// [ecma]: https://tc39.es/ecma262/#table-binary-unicode-properties
#[cfg(feature = "compiled_data")]
pub fn new_for_ecma262(prop: &[u8]) -> Option<crate::CodePointSetDataBorrowed<'static>> {
use crate::props::*;
// Each arm matches a property's long name or its short name and returns the compiled-data
// set for that property. Where a property has no distinct short name the second pattern
// duplicates the first, hence the `unreachable_patterns` allowances.
Some(match prop {
AsciiHexDigit::NAME | AsciiHexDigit::SHORT_NAME => Self::new::<AsciiHexDigit>(),
Alphabetic::NAME | Alphabetic::SHORT_NAME => Self::new::<Alphabetic>(),
BidiControl::NAME | BidiControl::SHORT_NAME => Self::new::<BidiControl>(),
BidiMirrored::NAME | BidiMirrored::SHORT_NAME => Self::new::<BidiMirrored>(),
CaseIgnorable::NAME | CaseIgnorable::SHORT_NAME => Self::new::<CaseIgnorable>(),
#[allow(unreachable_patterns)] // no short name
Cased::NAME | Cased::SHORT_NAME => Self::new::<Cased>(),
ChangesWhenCasefolded::NAME | ChangesWhenCasefolded::SHORT_NAME => {
Self::new::<ChangesWhenCasefolded>()
}
ChangesWhenCasemapped::NAME | ChangesWhenCasemapped::SHORT_NAME => {
Self::new::<ChangesWhenCasemapped>()
}
ChangesWhenLowercased::NAME | ChangesWhenLowercased::SHORT_NAME => {
Self::new::<ChangesWhenLowercased>()
}
ChangesWhenNfkcCasefolded::NAME | ChangesWhenNfkcCasefolded::SHORT_NAME => {
Self::new::<ChangesWhenNfkcCasefolded>()
}
ChangesWhenTitlecased::NAME | ChangesWhenTitlecased::SHORT_NAME => {
Self::new::<ChangesWhenTitlecased>()
}
ChangesWhenUppercased::NAME | ChangesWhenUppercased::SHORT_NAME => {
Self::new::<ChangesWhenUppercased>()
}
#[allow(unreachable_patterns)] // no short name
Dash::NAME | Dash::SHORT_NAME => Self::new::<Dash>(),
DefaultIgnorableCodePoint::NAME | DefaultIgnorableCodePoint::SHORT_NAME => {
Self::new::<DefaultIgnorableCodePoint>()
}
Deprecated::NAME | Deprecated::SHORT_NAME => Self::new::<Deprecated>(),
Diacritic::NAME | Diacritic::SHORT_NAME => Self::new::<Diacritic>(),
#[allow(unreachable_patterns)] // no short name
Emoji::NAME | Emoji::SHORT_NAME => Self::new::<Emoji>(),
EmojiComponent::NAME | EmojiComponent::SHORT_NAME => Self::new::<EmojiComponent>(),
EmojiModifier::NAME | EmojiModifier::SHORT_NAME => Self::new::<EmojiModifier>(),
EmojiModifierBase::NAME | EmojiModifierBase::SHORT_NAME => {
Self::new::<EmojiModifierBase>()
}
EmojiPresentation::NAME | EmojiPresentation::SHORT_NAME => {
Self::new::<EmojiPresentation>()
}
ExtendedPictographic::NAME | ExtendedPictographic::SHORT_NAME => {
Self::new::<ExtendedPictographic>()
}
Extender::NAME | Extender::SHORT_NAME => Self::new::<Extender>(),
GraphemeBase::NAME | GraphemeBase::SHORT_NAME => Self::new::<GraphemeBase>(),
GraphemeExtend::NAME | GraphemeExtend::SHORT_NAME => Self::new::<GraphemeExtend>(),
HexDigit::NAME | HexDigit::SHORT_NAME => Self::new::<HexDigit>(),
IdsBinaryOperator::NAME | IdsBinaryOperator::SHORT_NAME => {
Self::new::<IdsBinaryOperator>()
}
IdsTrinaryOperator::NAME | IdsTrinaryOperator::SHORT_NAME => {
Self::new::<IdsTrinaryOperator>()
}
IdContinue::NAME | IdContinue::SHORT_NAME => Self::new::<IdContinue>(),
IdStart::NAME | IdStart::SHORT_NAME => Self::new::<IdStart>(),
Ideographic::NAME | Ideographic::SHORT_NAME => Self::new::<Ideographic>(),
JoinControl::NAME | JoinControl::SHORT_NAME => Self::new::<JoinControl>(),
LogicalOrderException::NAME | LogicalOrderException::SHORT_NAME => {
Self::new::<LogicalOrderException>()
}
Lowercase::NAME | Lowercase::SHORT_NAME => Self::new::<Lowercase>(),
#[allow(unreachable_patterns)] // no short name
Math::NAME | Math::SHORT_NAME => Self::new::<Math>(),
NoncharacterCodePoint::NAME | NoncharacterCodePoint::SHORT_NAME => {
Self::new::<NoncharacterCodePoint>()
}
PatternSyntax::NAME | PatternSyntax::SHORT_NAME => Self::new::<PatternSyntax>(),
PatternWhiteSpace::NAME | PatternWhiteSpace::SHORT_NAME => {
Self::new::<PatternWhiteSpace>()
}
QuotationMark::NAME | QuotationMark::SHORT_NAME => Self::new::<QuotationMark>(),
#[allow(unreachable_patterns)] // no short name
Radical::NAME | Radical::SHORT_NAME => Self::new::<Radical>(),
RegionalIndicator::NAME | RegionalIndicator::SHORT_NAME => {
Self::new::<RegionalIndicator>()
}
SentenceTerminal::NAME | SentenceTerminal::SHORT_NAME => {
Self::new::<SentenceTerminal>()
}
SoftDotted::NAME | SoftDotted::SHORT_NAME => Self::new::<SoftDotted>(),
TerminalPunctuation::NAME | TerminalPunctuation::SHORT_NAME => {
Self::new::<TerminalPunctuation>()
}
UnifiedIdeograph::NAME | UnifiedIdeograph::SHORT_NAME => {
Self::new::<UnifiedIdeograph>()
}
Uppercase::NAME | Uppercase::SHORT_NAME => Self::new::<Uppercase>(),
VariationSelector::NAME | VariationSelector::SHORT_NAME => {
Self::new::<VariationSelector>()
}
WhiteSpace::NAME | WhiteSpace::SHORT_NAME => Self::new::<WhiteSpace>(),
XidContinue::NAME | XidContinue::SHORT_NAME => Self::new::<XidContinue>(),
XidStart::NAME | XidStart::SHORT_NAME => Self::new::<XidStart>(),
// Not an ECMA-262 property
_ => return None,
})
}
// Declares the constructor family for the ECMA-262 lookup.
// NOTE(review): presumably this generates `try_new_for_ecma262_with_buffer_provider` as a wrapper
// around `try_new_for_ecma262_unstable`, and `new_for_ecma262: skip` tells the macro not to
// generate the compiled-data constructor because it is hand-written in this impl. Confirm against
// the `icu_provider::gen_buffer_data_constructors!` documentation.
icu_provider::gen_buffer_data_constructors!(
(prop: &[u8]) -> result: Option<Result<Self, DataError>>,
functions: [
new_for_ecma262: skip,
try_new_for_ecma262_with_buffer_provider,
try_new_for_ecma262_unstable,
Self,
]
);
// Provider-based counterpart of `new_for_ecma262`: looks `prop` up by long or short name and
// loads the matching set from `provider`.
//
// Returns `None` when `prop` is not one of the supported property names, and otherwise
// `Some` of whatever `Self::try_new_unstable` returns for that property.
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_ecma262)]
pub fn try_new_for_ecma262_unstable<P>(
provider: &P,
prop: &[u8],
) -> Option<Result<Self, DataError>>
where
// One `DataProvider` bound per property that can be selected below, because the property is
// only chosen at runtime.
P: ?Sized
+ DataProvider<PropertyBinaryAsciiHexDigitV1>
+ DataProvider<PropertyBinaryAlphabeticV1>
+ DataProvider<PropertyBinaryBidiControlV1>
+ DataProvider<PropertyBinaryBidiMirroredV1>
+ DataProvider<PropertyBinaryCaseIgnorableV1>
+ DataProvider<PropertyBinaryCasedV1>
+ DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenCasemappedV1>
+ DataProvider<PropertyBinaryChangesWhenLowercasedV1>
+ DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
+ DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
+ DataProvider<PropertyBinaryChangesWhenUppercasedV1>
+ DataProvider<PropertyBinaryDashV1>
+ DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
+ DataProvider<PropertyBinaryDeprecatedV1>
+ DataProvider<PropertyBinaryDiacriticV1>
+ DataProvider<PropertyBinaryEmojiV1>
+ DataProvider<PropertyBinaryEmojiComponentV1>
+ DataProvider<PropertyBinaryEmojiModifierV1>
+ DataProvider<PropertyBinaryEmojiModifierBaseV1>
+ DataProvider<PropertyBinaryEmojiPresentationV1>
+ DataProvider<PropertyBinaryExtendedPictographicV1>
+ DataProvider<PropertyBinaryExtenderV1>
+ DataProvider<PropertyBinaryGraphemeBaseV1>
+ DataProvider<PropertyBinaryGraphemeExtendV1>
+ DataProvider<PropertyBinaryHexDigitV1>
+ DataProvider<PropertyBinaryIdsBinaryOperatorV1>
+ DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
+ DataProvider<PropertyBinaryIdContinueV1>
+ DataProvider<PropertyBinaryIdStartV1>
+ DataProvider<PropertyBinaryIdeographicV1>
+ DataProvider<PropertyBinaryJoinControlV1>
+ DataProvider<PropertyBinaryLogicalOrderExceptionV1>
+ DataProvider<PropertyBinaryLowercaseV1>
+ DataProvider<PropertyBinaryMathV1>
+ DataProvider<PropertyBinaryNoncharacterCodePointV1>
+ DataProvider<PropertyBinaryPatternSyntaxV1>
+ DataProvider<PropertyBinaryPatternWhiteSpaceV1>
+ DataProvider<PropertyBinaryQuotationMarkV1>
+ DataProvider<PropertyBinaryRadicalV1>
+ DataProvider<PropertyBinaryRegionalIndicatorV1>
+ DataProvider<PropertyBinarySentenceTerminalV1>
+ DataProvider<PropertyBinarySoftDottedV1>
+ DataProvider<PropertyBinaryTerminalPunctuationV1>
+ DataProvider<PropertyBinaryUnifiedIdeographV1>
+ DataProvider<PropertyBinaryUppercaseV1>
+ DataProvider<PropertyBinaryVariationSelectorV1>
+ DataProvider<PropertyBinaryWhiteSpaceV1>
+ DataProvider<PropertyBinaryXidContinueV1>
+ DataProvider<PropertyBinaryXidStartV1>,
{
use crate::props::*;
// Same name table, in the same order, as `new_for_ecma262`; keep the two in sync.
Some(match prop {
AsciiHexDigit::NAME | AsciiHexDigit::SHORT_NAME => {
Self::try_new_unstable::<AsciiHexDigit>(provider)
}
Alphabetic::NAME | Alphabetic::SHORT_NAME => {
Self::try_new_unstable::<Alphabetic>(provider)
}
BidiControl::NAME | BidiControl::SHORT_NAME => {
Self::try_new_unstable::<BidiControl>(provider)
}
BidiMirrored::NAME | BidiMirrored::SHORT_NAME => {
Self::try_new_unstable::<BidiMirrored>(provider)
}
CaseIgnorable::NAME | CaseIgnorable::SHORT_NAME => {
Self::try_new_unstable::<CaseIgnorable>(provider)
}
#[allow(unreachable_patterns)] // no short name
Cased::NAME | Cased::SHORT_NAME => Self::try_new_unstable::<Cased>(provider),
ChangesWhenCasefolded::NAME | ChangesWhenCasefolded::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenCasefolded>(provider)
}
ChangesWhenCasemapped::NAME | ChangesWhenCasemapped::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenCasemapped>(provider)
}
ChangesWhenLowercased::NAME | ChangesWhenLowercased::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenLowercased>(provider)
}
ChangesWhenNfkcCasefolded::NAME | ChangesWhenNfkcCasefolded::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenNfkcCasefolded>(provider)
}
ChangesWhenTitlecased::NAME | ChangesWhenTitlecased::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenTitlecased>(provider)
}
ChangesWhenUppercased::NAME | ChangesWhenUppercased::SHORT_NAME => {
Self::try_new_unstable::<ChangesWhenUppercased>(provider)
}
#[allow(unreachable_patterns)] // no short name
Dash::NAME | Dash::SHORT_NAME => Self::try_new_unstable::<Dash>(provider),
DefaultIgnorableCodePoint::NAME | DefaultIgnorableCodePoint::SHORT_NAME => {
Self::try_new_unstable::<DefaultIgnorableCodePoint>(provider)
}
Deprecated::NAME | Deprecated::SHORT_NAME => {
Self::try_new_unstable::<Deprecated>(provider)
}
Diacritic::NAME | Diacritic::SHORT_NAME => {
Self::try_new_unstable::<Diacritic>(provider)
}
#[allow(unreachable_patterns)] // no short name
Emoji::NAME | Emoji::SHORT_NAME => Self::try_new_unstable::<Emoji>(provider),
EmojiComponent::NAME | EmojiComponent::SHORT_NAME => {
Self::try_new_unstable::<EmojiComponent>(provider)
}
EmojiModifier::NAME | EmojiModifier::SHORT_NAME => {
Self::try_new_unstable::<EmojiModifier>(provider)
}
EmojiModifierBase::NAME | EmojiModifierBase::SHORT_NAME => {
Self::try_new_unstable::<EmojiModifierBase>(provider)
}
EmojiPresentation::NAME | EmojiPresentation::SHORT_NAME => {
Self::try_new_unstable::<EmojiPresentation>(provider)
}
ExtendedPictographic::NAME | ExtendedPictographic::SHORT_NAME => {
Self::try_new_unstable::<ExtendedPictographic>(provider)
}
Extender::NAME | Extender::SHORT_NAME => Self::try_new_unstable::<Extender>(provider),
GraphemeBase::NAME | GraphemeBase::SHORT_NAME => {
Self::try_new_unstable::<GraphemeBase>(provider)
}
GraphemeExtend::NAME | GraphemeExtend::SHORT_NAME => {
Self::try_new_unstable::<GraphemeExtend>(provider)
}
HexDigit::NAME | HexDigit::SHORT_NAME => Self::try_new_unstable::<HexDigit>(provider),
IdsBinaryOperator::NAME | IdsBinaryOperator::SHORT_NAME => {
Self::try_new_unstable::<IdsBinaryOperator>(provider)
}
IdsTrinaryOperator::NAME | IdsTrinaryOperator::SHORT_NAME => {
Self::try_new_unstable::<IdsTrinaryOperator>(provider)
}
IdContinue::NAME | IdContinue::SHORT_NAME => {
Self::try_new_unstable::<IdContinue>(provider)
}
IdStart::NAME | IdStart::SHORT_NAME => Self::try_new_unstable::<IdStart>(provider),
Ideographic::NAME | Ideographic::SHORT_NAME => {
Self::try_new_unstable::<Ideographic>(provider)
}
JoinControl::NAME | JoinControl::SHORT_NAME => {
Self::try_new_unstable::<JoinControl>(provider)
}
LogicalOrderException::NAME | LogicalOrderException::SHORT_NAME => {
Self::try_new_unstable::<LogicalOrderException>(provider)
}
Lowercase::NAME | Lowercase::SHORT_NAME => {
Self::try_new_unstable::<Lowercase>(provider)
}
#[allow(unreachable_patterns)] // no short name
Math::NAME | Math::SHORT_NAME => Self::try_new_unstable::<Math>(provider),
NoncharacterCodePoint::NAME | NoncharacterCodePoint::SHORT_NAME => {
Self::try_new_unstable::<NoncharacterCodePoint>(provider)
}
PatternSyntax::NAME | PatternSyntax::SHORT_NAME => {
Self::try_new_unstable::<PatternSyntax>(provider)
}
PatternWhiteSpace::NAME | PatternWhiteSpace::SHORT_NAME => {
Self::try_new_unstable::<PatternWhiteSpace>(provider)
}
QuotationMark::NAME | QuotationMark::SHORT_NAME => {
Self::try_new_unstable::<QuotationMark>(provider)
}
#[allow(unreachable_patterns)] // no short name
Radical::NAME | Radical::SHORT_NAME => Self::try_new_unstable::<Radical>(provider),
RegionalIndicator::NAME | RegionalIndicator::SHORT_NAME => {
Self::try_new_unstable::<RegionalIndicator>(provider)
}
SentenceTerminal::NAME | SentenceTerminal::SHORT_NAME => {
Self::try_new_unstable::<SentenceTerminal>(provider)
}
SoftDotted::NAME | SoftDotted::SHORT_NAME => {
Self::try_new_unstable::<SoftDotted>(provider)
}
TerminalPunctuation::NAME | TerminalPunctuation::SHORT_NAME => {
Self::try_new_unstable::<TerminalPunctuation>(provider)
}
UnifiedIdeograph::NAME | UnifiedIdeograph::SHORT_NAME => {
Self::try_new_unstable::<UnifiedIdeograph>(provider)
}
Uppercase::NAME | Uppercase::SHORT_NAME => {
Self::try_new_unstable::<Uppercase>(provider)
}
VariationSelector::NAME | VariationSelector::SHORT_NAME => {
Self::try_new_unstable::<VariationSelector>(provider)
}
WhiteSpace::NAME | WhiteSpace::SHORT_NAME => {
Self::try_new_unstable::<WhiteSpace>(provider)
}
XidContinue::NAME | XidContinue::SHORT_NAME => {
Self::try_new_unstable::<XidContinue>(provider)
}
XidStart::NAME | XidStart::SHORT_NAME => Self::try_new_unstable::<XidStart>(provider),
// Not an ECMA-262 property
_ => return None,
})
}
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Data and APIs for supporting Script_Extensions property
//! values in an efficient structure.
use crate::props::Script;
use crate::provider::*;
#[cfg(feature = "alloc")]
use core::iter::FromIterator;
use core::ops::RangeInclusive;
#[cfg(feature = "alloc")]
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_provider::prelude::*;
use zerovec::{ule::AsULE, ZeroSlice};
/// Width, in bits, of the low-order field of a `ScriptWithExt` value that holds
/// the `Script` value (or the index into `extensions`).
const SCRIPT_VAL_LENGTH: u16 = 10;
/// Mask selecting that low-order field (the `Script` value or `extensions` index)
/// of a `ScriptWithExt` value: the low `SCRIPT_VAL_LENGTH` bits set, everything above clear.
const SCRIPT_X_SCRIPT_VAL: u16 = !(u16::MAX << SCRIPT_VAL_LENGTH);
/// An internal-use only pseudo-property that represents the values stored in
/// the trie of the special data structure [`ScriptWithExtensionsProperty`].
///
/// Note: This type assumes a 12-bit layout. The 2 higher order bits in positions
/// 11..10 indicate how to deduce the Script value and Script_Extensions,
/// and the lower 10 bits 9..0 indicate either the Script value or the index
/// into the `extensions` structure.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_properties::script))]
#[repr(transparent)]
#[doc(hidden)]
// `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
#[allow(clippy::exhaustive_structs)] // this type is stable
pub struct ScriptWithExt(pub u16);
#[allow(missing_docs)] // These constants don't need individual documentation.
#[allow(non_upper_case_globals)]
#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
impl ScriptWithExt {
pub const Unknown: ScriptWithExt = ScriptWithExt(0);
}
impl AsULE for ScriptWithExt {
type ULE = <u16 as AsULE>::ULE;
#[inline]
fn to_unaligned(self) -> Self::ULE {
Script(self.0).to_unaligned()
}
#[inline]
fn from_unaligned(unaligned: Self::ULE) -> Self {
ScriptWithExt(Script::from_unaligned(unaligned).0)
}
}
#[doc(hidden)] // `ScriptWithExt` not intended as public-facing but for `ScriptWithExtensionsProperty` constructor
impl ScriptWithExt {
/// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
/// also indicates a Script value of [`Script::Common`].
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExt;
///
/// assert!(ScriptWithExt(0x04FF).is_common());
/// assert!(ScriptWithExt(0x0400).is_common());
///
/// assert!(!ScriptWithExt(0x08FF).is_common());
/// assert!(!ScriptWithExt(0x0800).is_common());
///
/// assert!(!ScriptWithExt(0x0CFF).is_common());
/// assert!(!ScriptWithExt(0x0C00).is_common());
///
/// assert!(!ScriptWithExt(0xFF).is_common());
/// assert!(!ScriptWithExt(0x0).is_common());
/// ```
pub fn is_common(&self) -> bool {
    // Flag bits (everything above the low `SCRIPT_VAL_LENGTH` bits) equal to 1.
    let flag_bits = self.0 >> SCRIPT_VAL_LENGTH;
    flag_bits == 1
}
/// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
/// also indicates a Script value of [`Script::Inherited`].
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExt;
///
/// assert!(!ScriptWithExt(0x04FF).is_inherited());
/// assert!(!ScriptWithExt(0x0400).is_inherited());
///
/// assert!(ScriptWithExt(0x08FF).is_inherited());
/// assert!(ScriptWithExt(0x0800).is_inherited());
///
/// assert!(!ScriptWithExt(0x0CFF).is_inherited());
/// assert!(!ScriptWithExt(0x0C00).is_inherited());
///
/// assert!(!ScriptWithExt(0xFF).is_inherited());
/// assert!(!ScriptWithExt(0x0).is_inherited());
/// ```
pub fn is_inherited(&self) -> bool {
    // Flag bits (everything above the low `SCRIPT_VAL_LENGTH` bits) equal to 2.
    let flag_bits = self.0 >> SCRIPT_VAL_LENGTH;
    flag_bits == 2
}
/// Returns whether the [`ScriptWithExt`] value has Script_Extensions and
/// also indicates that the Script value is neither [`Script::Common`] nor
/// [`Script::Inherited`].
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExt;
///
/// assert!(!ScriptWithExt(0x04FF).is_other());
/// assert!(!ScriptWithExt(0x0400).is_other());
///
/// assert!(!ScriptWithExt(0x08FF).is_other());
/// assert!(!ScriptWithExt(0x0800).is_other());
///
/// assert!(ScriptWithExt(0x0CFF).is_other());
/// assert!(ScriptWithExt(0x0C00).is_other());
///
/// assert!(!ScriptWithExt(0xFF).is_other());
/// assert!(!ScriptWithExt(0x0).is_other());
/// ```
pub fn is_other(&self) -> bool {
    // Flag bits (everything above the low `SCRIPT_VAL_LENGTH` bits) equal to 3.
    let flag_bits = self.0 >> SCRIPT_VAL_LENGTH;
    flag_bits == 3
}
/// Returns whether the [`ScriptWithExt`] value has Script_Extensions.
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExt;
///
/// assert!(ScriptWithExt(0x04FF).has_extensions());
/// assert!(ScriptWithExt(0x0400).has_extensions());
///
/// assert!(ScriptWithExt(0x08FF).has_extensions());
/// assert!(ScriptWithExt(0x0800).has_extensions());
///
/// assert!(ScriptWithExt(0x0CFF).has_extensions());
/// assert!(ScriptWithExt(0x0C00).has_extensions());
///
/// assert!(!ScriptWithExt(0xFF).has_extensions());
/// assert!(!ScriptWithExt(0x0).has_extensions());
/// ```
pub fn has_extensions(&self) -> bool {
    // Any bit set above the low `SCRIPT_VAL_LENGTH` bits counts.
    (self.0 >> SCRIPT_VAL_LENGTH) != 0
}
}
impl From<ScriptWithExt> for u32 {
fn from(swe: ScriptWithExt) -> Self {
swe.0 as u32
}
}
impl From<ScriptWithExt> for Script {
fn from(swe: ScriptWithExt) -> Self {
Script(swe.0)
}
}
/// A struct that wraps a [`Script`] array, such as in the return value for
/// [`get_script_extensions_val()`](ScriptWithExtensionsBorrowed::get_script_extensions_val).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct ScriptExtensionsSet<'a> {
    // Borrowed script codes. `contains()` binary-searches this slice, so it is
    // presumably stored in sorted order — TODO confirm against the data producer.
    values: &'a ZeroSlice<Script>,
}
impl<'a> ScriptExtensionsSet<'a> {
/// Returns whether this set contains the given script.
///
/// # Example
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
/// let swe = ScriptWithExtensions::new();
///
/// assert!(swe
/// .get_script_extensions_val('\u{11303}') // GRANTHA SIGN VISARGA
/// .contains(&Script::Grantha));
/// ```
pub fn contains(&self, x: &Script) -> bool {
ZeroSlice::binary_search(self.values, x).is_ok()
}
/// Gets an iterator over the elements.
///
/// # Example
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
/// let swe = ScriptWithExtensions::new();
///
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
/// ```
pub fn iter(&self) -> impl DoubleEndedIterator<Item = Script> + 'a {
ZeroSlice::iter(self.values)
}
/// For accessing this set as an array instead of an iterator
#[doc(hidden)] // used by FFI code
pub fn array_len(&self) -> usize {
self.values.len()
}
/// For accessing this set as an array instead of an iterator
#[doc(hidden)] // used by FFI code
pub fn array_get(&self, index: usize) -> Option<Script> {
self.values.get(index)
}
}
/// A struct that represents the data for the Script and Script_Extensions properties.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// Most useful methods are on [`ScriptWithExtensionsBorrowed`] obtained by calling [`ScriptWithExtensions::as_borrowed()`]
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
/// let swe = ScriptWithExtensions::new();
///
/// // get the `Script` property value
/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // U+0640 ARABIC TATWEEL
/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // U+0650 ARABIC KASRA
/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
///
/// // get the `Script_Extensions` property value
/// assert_eq!(
/// swe.get_script_extensions_val('ـ') // U+0640 ARABIC TATWEEL
/// .iter().collect::<Vec<_>>(),
/// [Script::Arabic, Script::Syriac, Script::Mandaic, Script::Manichaean,
/// Script::PsalterPahlavi, Script::Adlam, Script::HanifiRohingya, Script::Sogdian,
/// Script::OldUyghur]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
/// .iter().collect::<Vec<_>>(),
/// [Script::Common]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
/// .iter().collect::<Vec<_>>(),
/// [Script::Inherited]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter().collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
///
/// // check containment of a `Script` value in the `Script_Extensions` value
/// // U+0650 ARABIC KASRA
/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
/// assert!(swe.has_script('\u{0650}', Script::Arabic));
/// assert!(swe.has_script('\u{0650}', Script::Syriac));
/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
///
/// // get a `CodePointInversionList` for when `Script` value is contained in `Script_Extensions` value
/// let syriac = swe.get_script_extensions_set(Script::Syriac);
/// assert!(syriac.contains('\u{0650}')); // ARABIC KASRA
/// assert!(!syriac.contains('٠')); // ARABIC-INDIC DIGIT ZERO
/// assert!(!syriac.contains('ﷲ')); // ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
/// ```
#[derive(Debug)]
pub struct ScriptWithExtensions {
    // Owned payload; `as_borrowed()` hands out `self.data.get()` for querying.
    data: DataPayload<PropertyScriptWithExtensionsV1>,
}
/// A borrowed wrapper around script extension data, returned by
/// [`ScriptWithExtensions::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct ScriptWithExtensionsBorrowed<'a> {
    // Property data whose `trie` and `extensions` fields are read by the
    // query methods on this type.
    data: &'a ScriptWithExtensionsProperty<'a>,
}
impl ScriptWithExtensions {
/// Creates a new instance of `ScriptWithExtensionsBorrowed` using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
#[expect(clippy::new_ret_no_self)]
pub fn new() -> ScriptWithExtensionsBorrowed<'static> {
    // Returns the borrowed form rather than `Self`; the work is done by the
    // borrowed type's own compiled-data constructor.
    ScriptWithExtensionsBorrowed::new()
}
// Macro-generated constructors for this type. `new` is marked `skip`,
// presumably because the compiled-data `new()` is hand-written in this impl
// — TODO confirm against the `icu_provider` macro documentation.
icu_provider::gen_buffer_data_constructors!(
() -> result: Result<ScriptWithExtensions, DataError>,
functions: [
new: skip,
try_new_with_buffer_provider,
try_new_unstable,
Self,
]
);
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable(
provider: &(impl DataProvider<PropertyScriptWithExtensionsV1> + ?Sized),
) -> Result<Self, DataError> {
Ok(ScriptWithExtensions::from_data(
provider.load(Default::default())?.payload,
))
}
/// Construct a borrowed version of this type that can be queried.
///
/// This avoids a potential small underlying cost per API call (ex: `contains()`) by consolidating it
/// up front.
#[inline]
pub fn as_borrowed(&self) -> ScriptWithExtensionsBorrowed<'_> {
    let data = self.data.get();
    ScriptWithExtensionsBorrowed { data }
}
/// Construct a new one from loaded data
///
/// Typically it is preferable to use getters like [`load_script_with_extensions_unstable()`] instead
pub(crate) fn from_data(data: DataPayload<PropertyScriptWithExtensionsV1>) -> Self {
Self { data }
}
}
impl<'a> ScriptWithExtensionsBorrowed<'a> {
/// Returns the `Script` property value for this code point.
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// // U+0640 ARABIC TATWEEL
/// assert_eq!(swe.get_script_val('ـ'), Script::Common); // main Script value
/// assert_ne!(swe.get_script_val('ـ'), Script::Arabic);
/// assert_ne!(swe.get_script_val('ـ'), Script::Syriac);
/// assert_ne!(swe.get_script_val('ـ'), Script::Thaana);
///
/// // U+0650 ARABIC KASRA
/// assert_eq!(swe.get_script_val('\u{0650}'), Script::Inherited); // main Script value
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Arabic);
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Syriac);
/// assert_ne!(swe.get_script_val('\u{0650}'), Script::Thaana);
///
/// // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert_ne!(swe.get_script_val('٠'), Script::Common);
/// assert_eq!(swe.get_script_val('٠'), Script::Arabic); // main Script value
/// assert_ne!(swe.get_script_val('٠'), Script::Syriac);
/// assert_ne!(swe.get_script_val('٠'), Script::Thaana);
///
/// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Common);
/// assert_eq!(swe.get_script_val('ﷲ'), Script::Arabic); // main Script value
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Syriac);
/// assert_ne!(swe.get_script_val('ﷲ'), Script::Thaana);
/// ```
pub fn get_script_val(self, ch: char) -> Script {
    let code_point = u32::from(ch);
    self.get_script_val32(code_point)
}
/// `u32` code point variant of [`Self::get_script_val`].
pub fn get_script_val32(self, code_point: u32) -> Script {
    let trie_val = self.data.trie.get32(code_point);
    // The three flagged cases are mutually exclusive (flag bits 1, 2 and 3).
    if trie_val.is_common() {
        return Script::Common;
    }
    if trie_val.is_inherited() {
        return Script::Inherited;
    }
    if !trie_val.is_other() {
        // No recognized flag: the raw value is the script code itself.
        return Script(trie_val.0);
    }
    // "Other": the low bits index an `extensions` entry whose first element
    // is the Script value; fall back to `Unknown` if the lookup fails.
    let ext_idx = usize::from(trie_val.0 & SCRIPT_X_SCRIPT_VAL);
    self.data
        .extensions
        .get(ext_idx)
        .and_then(|scx| scx.get(0))
        .unwrap_or(Script::Unknown)
}
// Resolves the Script_Extensions slice for a trie value that the caller has
// already looked up.
// Private helper shared by the Script_Extensions query methods so the
// decoding of the trie value lives in one place.
fn get_scx_val_using_trie_val(
    self,
    sc_with_ext_ule: &'a <ScriptWithExt as AsULE>::ULE,
) -> &'a ZeroSlice<Script> {
    let trie_val = ScriptWithExt::from_unaligned(*sc_with_ext_ule);
    let ext_idx = usize::from(trie_val.0 & SCRIPT_X_SCRIPT_VAL);
    if trie_val.is_common() || trie_val.is_inherited() {
        // The indexed `extensions` entry is returned as-is; a missing entry
        // yields the default (empty) slice.
        return self.data.extensions.get(ext_idx).unwrap_or_default();
    }
    if trie_val.is_other() {
        // In the OTHER case the flag bits cannot encode the Script value, so
        // it is stored in the first position of the `extensions` entry.
        // Skip that element to return only the actual scx values.
        let scx_ules = self
            .data
            .extensions
            .get(ext_idx)
            .map(|entry| entry.as_ule_slice())
            .and_then(|ules| ules.get(1..))
            .unwrap_or_default();
        return ZeroSlice::from_ule_slice(scx_ules);
    }
    // Note: `Script` and `ScriptWithExt` share the same u16 representation when
    // no higher-order bits are set, so the trie value itself is viewed as a
    // one-element `Script` slice.
    ZeroSlice::from_ule_slice(core::slice::from_ref(sc_with_ext_ule))
}
/// Return the `Script_Extensions` property value for this code point.
///
/// If `code_point` has Script_Extensions, then return the Script codes in
/// the Script_Extensions. In this case, the Script property value
/// (normally Common or Inherited) is not included in the [`ScriptExtensionsSet`].
///
/// If `code_point` does not have Script_Extensions, then the returned
/// [`ScriptExtensionsSet`] contains only its one Script code.
///
/// If `code_point` is not a valid code point, then return an empty [`ScriptExtensionsSet`].
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// assert_eq!(
/// swe.get_script_extensions_val('𐓐') // U+104D0 OSAGE CAPITAL LETTER KHA
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Osage]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('🥳') // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Common]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('\u{200D}') // ZERO WIDTH JOINER
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Inherited]
/// );
/// assert_eq!(
/// swe.get_script_extensions_val('௫') // U+0BEB TAMIL DIGIT FIVE
/// .iter()
/// .collect::<Vec<_>>(),
/// [Script::Tamil, Script::Grantha]
/// );
/// ```
pub fn get_script_extensions_val(self, ch: char) -> ScriptExtensionsSet<'a> {
    let code_point = u32::from(ch);
    self.get_script_extensions_val32(code_point)
}
/// `u32` code point variant of [`Self::get_script_extensions_val`].
pub fn get_script_extensions_val32(self, code_point: u32) -> ScriptExtensionsSet<'a> {
    // No trie value for this code point means an empty set.
    let values = self
        .data
        .trie
        .get32_ule(code_point)
        .map(|ule_ref| self.get_scx_val_using_trie_val(ule_ref))
        .unwrap_or_else(|| ZeroSlice::from_ule_slice(&[]));
    ScriptExtensionsSet { values }
}
/// Returns whether `script` is contained in the Script_Extensions
/// property value if the code_point has Script_Extensions, otherwise
/// if the code point does not have Script_Extensions then returns
/// whether the Script property value matches.
///
/// Some characters are commonly used in multiple scripts. For more information,
/// see UAX #24: <http://www.unicode.org/reports/tr24/>.
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// // U+0650 ARABIC KASRA
/// assert!(!swe.has_script('\u{0650}', Script::Inherited)); // main Script value
/// assert!(swe.has_script('\u{0650}', Script::Arabic));
/// assert!(swe.has_script('\u{0650}', Script::Syriac));
/// assert!(!swe.has_script('\u{0650}', Script::Thaana));
///
/// // U+0660 ARABIC-INDIC DIGIT ZERO
/// assert!(!swe.has_script('٠', Script::Common)); // main Script value
/// assert!(swe.has_script('٠', Script::Arabic));
/// assert!(!swe.has_script('٠', Script::Syriac));
/// assert!(swe.has_script('٠', Script::Thaana));
///
/// // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
/// assert!(!swe.has_script('ﷲ', Script::Common));
/// assert!(swe.has_script('ﷲ', Script::Arabic)); // main Script value
/// assert!(!swe.has_script('ﷲ', Script::Syriac));
/// assert!(swe.has_script('ﷲ', Script::Thaana));
/// ```
pub fn has_script(self, ch: char, script: Script) -> bool {
    let code_point = u32::from(ch);
    self.has_script32(code_point, script)
}
/// See [`Self::has_script`].
///
/// Returns `false` when the trie yields no value for `code_point`.
pub fn has_script32(self, code_point: u32, script: Script) -> bool {
    let Some(sc_with_ext_ule) = self.data.trie.get32_ule(code_point) else {
        return false;
    };
    let sc_with_ext = <ScriptWithExt as AsULE>::from_unaligned(*sc_with_ext_ule);
    if sc_with_ext.has_extensions() {
        // Search the Script_Extensions values; the main Script value is not
        // part of that slice.
        self.get_scx_val_using_trie_val(sc_with_ext_ule)
            .iter()
            .any(|sc| sc == script)
    } else {
        // Without extensions the raw trie value is the Script code itself.
        script == Script(sc_with_ext.0)
    }
}
/// Returns all of the matching `CodePointMapRange`s for the given [`Script`]
/// in which `has_script` will return true for all of the contained code points.
///
/// # Examples
///
/// ```
/// use icu::properties::props::Script;
/// use icu::properties::script::ScriptWithExtensions;
///
/// let swe = ScriptWithExtensions::new();
///
/// let syriac_script_extensions_ranges =
/// swe.get_script_extensions_ranges(Script::Syriac);
///
/// let exp_ranges = [
/// 0x0303..=0x0304, // COMBINING TILDE..COMBINING MACRON
/// 0x0307..=0x0308, // COMBINING DOT ABOVE..COMBINING DIAERESIS
/// 0x030A..=0x030A, // COMBINING RING ABOVE
/// 0x0323..=0x0325, // COMBINING DOT BELOW..COMBINING RING BELOW
/// 0x032D..=0x032E, // COMBINING CIRCUMFLEX ACCENT BELOW..COMBINING BREVE BELOW
/// 0x0330..=0x0331, // COMBINING TILDE BELOW..COMBINING MACRON BELOW
/// 0x060C..=0x060C, // ARABIC COMMA
/// 0x061B..=0x061C, // ARABIC SEMICOLON, ARABIC LETTER MARK
/// 0x061F..=0x061F, // ARABIC QUESTION MARK
/// 0x0640..=0x0640, // ARABIC TATWEEL
/// 0x064B..=0x0655, // ARABIC FATHATAN..ARABIC HAMZA BELOW
/// 0x0670..=0x0670, // ARABIC LETTER SUPERSCRIPT ALEF
/// 0x0700..=0x070D, // Syriac block begins at U+0700
/// 0x070F..=0x074A, // Syriac block
/// 0x074D..=0x074F, // Syriac block ends at U+074F
/// 0x0860..=0x086A, // Syriac Supplement block is U+0860..=U+086F
/// 0x1DF8..=0x1DF8, // COMBINING DOT ABOVE LEFT
/// 0x1DFA..=0x1DFA, // COMBINING DOT BELOW LEFT
/// ];
///
/// assert_eq!(
/// syriac_script_extensions_ranges.collect::<Vec<_>>(),
/// exp_ranges
/// );
/// ```
pub fn get_script_extensions_ranges(
self,
script: Script,
) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
self.data
.trie
.iter_ranges_mapped(move |value| {
let sc_with_ext = ScriptWithExt(value.0);
if sc_with_ext.has_extensions() {
self.get_scx_val_using_trie_val(&sc_with_ext.to_unaligned())
.iter()
.any(|sc| sc == script)
} else {
script == sc_with_ext.into()
}
})
.filter(|v| v.value)
.map(|v| v.range)
}
/// Returns a [`CodePointInversionList`] for the given [`Script`] which represents all
/// code points for which `has_script` will return true.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::properties::script::ScriptWithExtensions;
/// use icu::properties::props::Script;
///
/// let swe = ScriptWithExtensions::new();
///
/// let syriac = swe.get_script_extensions_set(Script::Syriac);
///
/// assert!(!syriac.contains('؞')); // ARABIC TRIPLE DOT PUNCTUATION MARK
/// assert!(syriac.contains('؟')); // ARABIC QUESTION MARK
/// assert!(!syriac.contains('ؠ')); // ARABIC LETTER KASHMIRI YEH
///
/// assert!(syriac.contains('܀')); // SYRIAC END OF PARAGRAPH
/// assert!(syriac.contains('\u{074A}')); // SYRIAC BARREKH
/// assert!(!syriac.contains('\u{074B}')); // unassigned
/// assert!(syriac.contains('ݏ')); // SYRIAC LETTER SOGDIAN FE
/// assert!(!syriac.contains('ݐ')); // ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW
///
/// assert!(syriac.contains('\u{1DF8}')); // COMBINING DOT ABOVE LEFT
/// assert!(!syriac.contains('\u{1DF9}')); // COMBINING WIDE INVERTED BRIDGE BELOW
/// assert!(syriac.contains('\u{1DFA}')); // COMBINING DOT BELOW LEFT
/// assert!(!syriac.contains('\u{1DFB}')); // COMBINING DELETION MARK
/// ```
#[cfg(feature = "alloc")]
pub fn get_script_extensions_set(self, script: Script) -> CodePointInversionList<'a> {
    let matching_ranges = self.get_script_extensions_ranges(script);
    CodePointInversionList::from_iter(matching_ranges)
}
}
#[cfg(feature = "compiled_data")]
impl Default for ScriptWithExtensionsBorrowed<'static> {
    /// Equivalent to [`ScriptWithExtensionsBorrowed::new()`].
    fn default() -> Self {
        Self::new()
    }
}
impl ScriptWithExtensionsBorrowed<'static> {
    /// Builds a `ScriptWithExtensionsBorrowed` backed by compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub fn new() -> Self {
        let data = crate::provider::Baked::SINGLETON_PROPERTY_SCRIPT_WITH_EXTENSIONS_V1;
        Self { data }
    }
    /// Cheaply converts a [`ScriptWithExtensionsBorrowed<'static>`] into a [`ScriptWithExtensions`].
    ///
    /// Note: Due to branching and indirection, using [`ScriptWithExtensions`] might inhibit some
    /// compile-time optimizations that are possible with [`ScriptWithExtensionsBorrowed`].
    pub const fn static_to_owned(self) -> ScriptWithExtensions {
        let Self { data } = self;
        ScriptWithExtensions {
            data: DataPayload::from_static_ref(data),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    /// Regression test for https://github.com/unicode-org/icu4x/issues/6041
    fn test_scx_regression_6041() {
        // Expected Script_Extensions of U+02BC, in the order the data yields them.
        let expected = [
            Script::Bengali,
            Script::Cyrillic,
            Script::Devanagari,
            Script::Latin,
            Script::Thai,
            Script::Lisu,
            Script::Toto,
        ];
        let actual: Vec<Script> = ScriptWithExtensions::new()
            .get_script_extensions_val('\u{2bc}')
            .iter()
            .collect();
        assert_eq!(actual, expected);
    }
}
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::bidi::BidiMirroringGlyph;
use crate::props::{
BidiClass, CanonicalCombiningClass, EastAsianWidth, GeneralCategory, GeneralCategoryGroup,
GraphemeClusterBreak, HangulSyllableType, IndicConjunctBreak, IndicSyllabicCategory,
JoiningType, LineBreak, Script, SentenceBreak, VerticalOrientation, WordBreak,
};
use crate::script::ScriptWithExt;
use core::convert::TryInto;
use core::num::TryFromIntError;
use zerovec::ule::{AsULE, RawBytesULE};
use icu_collections::codepointtrie::TrieValue;
use core::convert::TryFrom;
impl TrieValue for CanonicalCombiningClass {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for BidiClass {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for GeneralCategory {
type TryFromU32Error = &'static str;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
// If the u32 is out of range, fall back to u8::MAX, which is out of range of the GeneralCategory enum.
GeneralCategory::new_from_u8(i.try_into().unwrap_or(u8::MAX))
.ok_or("Cannot parse GeneralCategory from integer")
}
fn to_u32(self) -> u32 {
u32::from(self as u8)
}
}
impl TrieValue for Script {
type TryFromU32Error = TryFromIntError;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
u16::try_from(i).map(Script)
}
fn to_u32(self) -> u32 {
u32::from(self.0)
}
}
impl TrieValue for HangulSyllableType {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for ScriptWithExt {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u16`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let raw = u16::try_from(i)?;
        Ok(Self(raw))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for EastAsianWidth {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for LineBreak {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for GraphemeClusterBreak {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for WordBreak {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for SentenceBreak {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for IndicConjunctBreak {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for IndicSyllabicCategory {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
impl TrieValue for VerticalOrientation {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}
// GCG is not used inside tries, but it is used in the name lookup type, and we want
// to squeeze it into a u16 for storage. Its named mask values are specced so we can
// do this in code.
//
// This is done by:
// - Single-value masks are translated to their corresponding GeneralCategory values
// - we know all of the multi-value masks and we give them special values
// - Anything else goes to 0xFF00, though this code path shouldn't be hit unless working with malformed icuexportdata
//
// In the reverse direction, unknown values go to the empty mask, but this codepath should not be hit except
// with malformed ICU4X generated data.
impl AsULE for GeneralCategoryGroup {
    type ULE = RawBytesULE<2>;
    // Stored as the packed u16 form produced by `gcg_to_packed_u16`.
    fn to_unaligned(self) -> Self::ULE {
        gcg_to_packed_u16(self).to_unaligned()
    }
    fn from_unaligned(ule: Self::ULE) -> Self {
        packed_u16_to_gcg(ule.as_unsigned_int())
    }
}
// Decodes the packed u16 storage form back into a `GeneralCategoryGroup`.
fn packed_u16_to_gcg(value: u16) -> GeneralCategoryGroup {
    match value {
        // Reserved values for the named multi-category groups.
        0xFFFF => GeneralCategoryGroup::CasedLetter,
        0xFFFE => GeneralCategoryGroup::Letter,
        0xFFFD => GeneralCategoryGroup::Mark,
        0xFFFC => GeneralCategoryGroup::Number,
        0xFFFB => GeneralCategoryGroup::Separator,
        0xFFFA => GeneralCategoryGroup::Other,
        0xFFF9 => GeneralCategoryGroup::Punctuation,
        0xFFF8 => GeneralCategoryGroup::Symbol,
        // Small values are single `GeneralCategory` codes; an unrecognized
        // code yields the empty mask.
        0..=31 => GeneralCategory::new_from_u8(value as u8)
            .map_or(GeneralCategoryGroup(0), Into::into),
        // unknown values produce an empty mask
        _ => GeneralCategoryGroup(0),
    }
}
// Encodes a `GeneralCategoryGroup` mask into its packed u16 storage form.
fn gcg_to_packed_u16(gcg: GeneralCategoryGroup) -> u16 {
    let mask = gcg.0;
    // A mask with exactly one bit set is a single property: store the bit
    // index, which is the inverse operation of the bitshift that built it.
    if mask.is_power_of_two() {
        return mask.trailing_zeros() as u16;
    }
    match gcg {
        GeneralCategoryGroup::CasedLetter => 0xFFFF,
        GeneralCategoryGroup::Letter => 0xFFFE,
        GeneralCategoryGroup::Mark => 0xFFFD,
        GeneralCategoryGroup::Number => 0xFFFC,
        GeneralCategoryGroup::Separator => 0xFFFB,
        GeneralCategoryGroup::Other => 0xFFFA,
        GeneralCategoryGroup::Punctuation => 0xFFF9,
        GeneralCategoryGroup::Symbol => 0xFFF8,
        _ => 0xFF00, // random sentinel value
    }
}
impl TrieValue for GeneralCategoryGroup {
type TryFromU32Error = TryFromIntError;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
// Even though we're dealing with u32s here, TrieValue is about converting
// trie storage types to the actual type. This type will always be a packed u16
// in our case since the names map upcasts from u16
u16::try_from(i).map(packed_u16_to_gcg)
}
fn to_u32(self) -> u32 {
u32::from(gcg_to_packed_u16(self))
}
}
impl TrieValue for BidiMirroringGlyph {
type TryFromU32Error = u32;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
let code_point = i & 0x1FFFFF;
let mirroring_glyph = if code_point == 0 {
None
} else {
Some(char::try_from_u32(code_point).map_err(|_| i)?)
};
let mirrored = ((i >> 21) & 0x1) == 1;
let paired_bracket_type = {
let value = ((i >> 22) & 0x3) as u8;
match value {
0 => crate::bidi::BidiPairedBracketType::None,
1 => crate::bidi::BidiPairedBracketType::Open,
2 => crate::bidi::BidiPairedBracketType::Close,
_ => return Err(i),
}
};
Ok(Self {
mirrored,
mirroring_glyph,
paired_bracket_type,
})
}
fn to_u32(self) -> u32 {
self.mirroring_glyph.unwrap_or_default() as u32
| ((self.mirrored as u32) << 21)
| (match self.paired_bracket_type {
crate::bidi::BidiPairedBracketType::None => 0,
crate::bidi::BidiPairedBracketType::Open => 1,
crate::bidi::BidiPairedBracketType::Close => 2,
} << 22)
}
}
impl TrieValue for JoiningType {
    type TryFromU32Error = TryFromIntError;
    // Fails when `i` does not fit in the wrapped `u8`.
    fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
        let byte = u8::try_from(i)?;
        Ok(Self(byte))
    }
    fn to_u32(self) -> u32 {
        self.0.into()
    }
}