icu_normalizer
Advanced tools
| { | ||
| "git": { | ||
| "sha1": "c9fac4e625ccb2c6a7aa35079fff9709db4385ac" | ||
| }, | ||
| "path_in_vcs": "components/normalizer" | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{criterion_group, criterion_main}; | ||
| mod canonical_composition; | ||
| mod canonical_decomposition; | ||
| mod composing_normalizer_nfc; | ||
| mod composing_normalizer_nfkc; | ||
| mod decomposing_normalizer_nfd; | ||
| mod decomposing_normalizer_nfkd; | ||
| mod utf16_throughput; | ||
// Collects the `criterion_benchmark` entry point of every benchmark module
// declared above into a single Criterion group named `benches`.
| criterion_group!( | ||
| benches, | ||
| canonical_composition::criterion_benchmark, | ||
| canonical_decomposition::criterion_benchmark, | ||
| composing_normalizer_nfc::criterion_benchmark, | ||
| composing_normalizer_nfkc::criterion_benchmark, | ||
| decomposing_normalizer_nfd::criterion_benchmark, | ||
| decomposing_normalizer_nfkd::criterion_benchmark, | ||
| utf16_throughput::criterion_benchmark, | ||
| ); | ||
// Generates the benchmark binary's entry point, which runs the `benches` group.
| criterion_main!(benches); |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use detone::IterDecomposeVietnamese; | ||
| use icu_normalizer::properties::{ | ||
| CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed, | ||
| }; | ||
| use icu_normalizer::ComposingNormalizerBorrowed; | ||
/// One benchmark input for the canonical composition benchmark.
| struct BenchDataContent { | ||
/// Label that is interpolated into the Criterion benchmark ID.
| pub file_name: String, | ||
/// Character pairs handed to `compose` (as first and second argument) by the benchmark loop.
| pub pairs: Vec<(char, char)>, | ||
| } | ||
/// Removes header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are joined with
/// `\n`. Lines are split with [`str::lines`], so `\r\n` terminators become `\n` and the
/// result never ends with a line terminator.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices and let `join` do the single copy; cloning each surviving
    // line into its own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 16] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| [ | ||
| BenchDataContent { | ||
| file_name: "TestNames_Latin".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_h".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_h.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_k".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_k.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Korean".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-ar".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-ar.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-de".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-de.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-el".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-el.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-es".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-es.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-fr".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-fr.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-he".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-he.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-pl".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-pl.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-ru".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-ru.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_th".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_th.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "Carroll-11-tr".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("./data/Carroll-11-tr.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie_detone".to_owned(), | ||
| pairs: { | ||
| let result: Vec<(char, char)> = nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("data/wotw.txt"))) | ||
| .chars() | ||
| .filter_map(|c| { | ||
| let mut iter = std::iter::once(c).decompose_vietnamese_tones(true); | ||
| if let Some(base) = iter.next() { | ||
| iter.next().map(|tone| (base, tone)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| assert!(!result.is_empty()); | ||
| result | ||
| }, | ||
| }, | ||
| ] | ||
| } | ||
| fn function_under_bench( | ||
| canonical_composer: CanonicalCompositionBorrowed, | ||
| composable_points: &[(char, char)], | ||
| ) { | ||
| for pair in composable_points.iter() { | ||
| canonical_composer.compose(pair.0, pair.1); | ||
| } | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_composition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let composer = CanonicalCompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(composer, &bench_data_content.pairs)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } | ||
| fn decompose_data(nfc: &str) -> Vec<(char, char)> { | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| nfc.chars() | ||
| .map(|c| decomposer.decompose(c)) | ||
| .filter_map(|decomposed| { | ||
| if let Decomposed::Expansion(a, b) = decomposed { | ||
| Some((a, b)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect() | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::properties::CanonicalDecompositionBorrowed; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the canonical decomposition benchmark: a label plus the
/// header-stripped file text in each of the four normalization forms.
| struct BenchDataContent { | ||
/// Label that is interpolated into the Criterion benchmark IDs.
| pub file_name: String, | ||
/// Output of the NFC normalizer for the file text.
| pub nfc: String, | ||
/// Output of the NFD normalizer for the file text.
| pub nfd: String, | ||
/// Output of the NFKC normalizer for the file text.
| pub nfkc: String, | ||
/// Output of the NFKD normalizer for the file text.
| pub nfkd: String, | ||
| } | ||
/// Removes header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are joined with
/// `\n`. Lines are split with [`str::lines`], so `\r\n` terminators become `\n` and the
/// result never ends with a line terminator.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices and let `join` do the single copy; cloning each surviving
    // line into its own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "Carroll-11-ar", | ||
| &strip_headers(include_str!("./data/Carroll-11-ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "Carroll-11-de", | ||
| &strip_headers(include_str!("./data/Carroll-11-de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "Carroll-11-el", | ||
| &strip_headers(include_str!("./data/Carroll-11-el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "Carroll-11-es", | ||
| &strip_headers(include_str!("./data/Carroll-11-es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "Carroll-11-fr", | ||
| &strip_headers(include_str!("./data/Carroll-11-fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "Carroll-11-he", | ||
| &strip_headers(include_str!("./data/Carroll-11-he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "Carroll-11-pl", | ||
| &strip_headers(include_str!("./data/Carroll-11-pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "Carroll-11-ru", | ||
| &strip_headers(include_str!("./data/Carroll-11-ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "Carroll-11-th", | ||
| &strip_headers(include_str!("./data/Carroll-11-th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "Carroll-11-tr", | ||
| &strip_headers(include_str!("./data/Carroll-11-tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc_normalizer.normalize(raw_content).to_string(), | ||
| nfd: nfd_normalizer.normalize(raw_content).to_string(), | ||
| nfkc: nfkc_normalizer.normalize(raw_content).to_string(), | ||
| nfkd: nfkd_normalizer.normalize(raw_content).to_string(), | ||
| }) | ||
| } | ||
| #[cfg(debug_assertions)] | ||
| fn function_under_bench( | ||
| _canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| _decomposable_points: &str, | ||
| ) { | ||
| // using debug assertion fails some test. | ||
| // "cargo test --bench bench" will pass | ||
| // "cargo bench" will work as expected, because the profile doesn't include debug assertions. | ||
| } | ||
| #[cfg(not(debug_assertions))] | ||
| fn function_under_bench( | ||
| canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| decomposable_points: &str, | ||
| ) { | ||
| decomposable_points.chars().for_each(|point| { | ||
| canonical_decomposer.decompose(point); | ||
| }); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_decomposition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the NFC normalizer benchmark: a label plus the header-stripped
/// file text in each of the four normalization forms, both as `String` and as UTF-16.
| struct BenchDataContent { | ||
/// Label that is interpolated into the Criterion benchmark IDs.
| pub file_name: String, | ||
/// Output of the NFC normalizer for the file text.
| pub nfc: String, | ||
/// Output of the NFD normalizer for the file text.
| pub nfd: String, | ||
/// Output of the NFKC normalizer for the file text.
| pub nfkc: String, | ||
/// Output of the NFKD normalizer for the file text.
| pub nfkd: String, | ||
/// `nfc` encoded as UTF-16 code units.
| pub nfc_u16: Vec<u16>, | ||
/// `nfd` encoded as UTF-16 code units.
| pub nfd_u16: Vec<u16>, | ||
/// `nfkc` encoded as UTF-16 code units.
| pub nfkc_u16: Vec<u16>, | ||
/// `nfkd` encoded as UTF-16 code units.
| pub nfkd_u16: Vec<u16>, | ||
| } | ||
/// Removes header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are joined with
/// `\n`. Lines are split with [`str::lines`], so `\r\n` terminators become `\n` and the
/// result never ends with a line terminator.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices and let `join` do the single copy; cloning each surviving
    // line into its own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "Carroll-11-ar", | ||
| &strip_headers(include_str!("./data/Carroll-11-ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "Carroll-11-de", | ||
| &strip_headers(include_str!("./data/Carroll-11-de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "Carroll-11-el", | ||
| &strip_headers(include_str!("./data/Carroll-11-el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "Carroll-11-es", | ||
| &strip_headers(include_str!("./data/Carroll-11-es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "Carroll-11-fr", | ||
| &strip_headers(include_str!("./data/Carroll-11-fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "Carroll-11-he", | ||
| &strip_headers(include_str!("./data/Carroll-11-he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "Carroll-11-pl", | ||
| &strip_headers(include_str!("./data/Carroll-11-pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "Carroll-11-ru", | ||
| &strip_headers(include_str!("./data/Carroll-11-ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "Carroll-11-th", | ||
| &strip_headers(include_str!("./data/Carroll-11-th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "Carroll-11-tr", | ||
| &strip_headers(include_str!("./data/Carroll-11-tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF_16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkc_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkd_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the NFKC normalizer benchmark: a label plus the header-stripped
/// file text in each of the four normalization forms, both as `String` and as UTF-16.
| struct BenchDataContent { | ||
/// Label that is interpolated into the Criterion benchmark IDs.
| pub file_name: String, | ||
/// Output of the NFC normalizer for the file text.
| pub nfc: String, | ||
/// Output of the NFD normalizer for the file text.
| pub nfd: String, | ||
/// Output of the NFKC normalizer for the file text.
| pub nfkc: String, | ||
/// Output of the NFKD normalizer for the file text.
| pub nfkd: String, | ||
/// `nfc` encoded as UTF-16 code units.
| pub nfc_u16: Vec<u16>, | ||
/// `nfd` encoded as UTF-16 code units.
| pub nfd_u16: Vec<u16>, | ||
/// `nfkc` encoded as UTF-16 code units.
| pub nfkc_u16: Vec<u16>, | ||
/// `nfkd` encoded as UTF-16 code units.
| pub nfkd_u16: Vec<u16>, | ||
| } | ||
/// Removes header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are joined with
/// `\n`. Lines are split with [`str::lines`], so `\r\n` terminators become `\n` and the
/// result never ends with a line terminator.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices and let `join` do the single copy; cloning each surviving
    // line into its own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "Carroll-11-ar", | ||
| &strip_headers(include_str!("./data/Carroll-11-ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "Carroll-11-de", | ||
| &strip_headers(include_str!("./data/Carroll-11-de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "Carroll-11-el", | ||
| &strip_headers(include_str!("./data/Carroll-11-el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "Carroll-11-es", | ||
| &strip_headers(include_str!("./data/Carroll-11-es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "Carroll-11-fr", | ||
| &strip_headers(include_str!("./data/Carroll-11-fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "Carroll-11-he", | ||
| &strip_headers(include_str!("./data/Carroll-11-he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "Carroll-11-pl", | ||
| &strip_headers(include_str!("./data/Carroll-11-pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "Carroll-11-ru", | ||
| &strip_headers(include_str!("./data/Carroll-11-ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "Carroll-11-th", | ||
| &strip_headers(include_str!("./data/Carroll-11-th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "Carroll-11-tr", | ||
| &strip_headers(include_str!("./data/Carroll-11-tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfkc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/ar/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Arabic | ||
| لم يكن هناك شيء مميز للغاية في ذلك؛ ولم تعتقد أليس أنه من غير المألوف أن تسمع الأرنب يقول لنفسه، "يا إلهي! يا إلهي! سأتأخر!" (عندما فكرت في الأمر بعد ذلك، خطر ببالها أنها كان يجب أن تتساءل عن هذا، ولكن في ذلك الوقت بدا الأمر طبيعيًا تمامًا)؛ ولكن عندما أخرج الأرنب ساعة من جيب سترته ، ونظر إليها، ثم أسرع، نهضت أليس على قدميها، لأنها لم ترَ من قبل أرنبًا بجيب سترته، أو ساعة لتخرجها منه، وبدافع الفضول، ركضت عبر الحقل خلفه، ولحسن الحظ كانت في الوقت المناسب تمامًا لرؤيته ينزل إلى جحر أرنب كبير تحت السياج. | ||
| وفي لحظة أخرى سقطت أليس خلفه، ولم تفكر قط في كيفية خروجها مرة أخرى. | ||
| استمر جحر الأرنب بشكل مستقيم مثل نفق لبعض الوقت، ثم انخفض فجأة إلى أسفل، فجأة لدرجة أن أليس لم يكن لديها لحظة للتفكير في إيقاف نفسها قبل أن تجد نفسها تسقط في بئر عميق للغاية. | ||
| إما أن البئر كانت عميقة جدًا، أو أنها سقطت ببطء شديد، إذ كان لديها متسع من الوقت وهي تنزل لتنظر حولها وتتساءل عما سيحدث بعد ذلك. في البداية، حاولت أن تنظر إلى الأسفل لتكتشف ما الذي ينتظرها، لكن الظلام حال دون رؤية أي شيء؛ ثم نظرت إلى جوانب البئر، فلاحظت أنها كانت مليئة بالخزائن وأرفف الكتب؛ ورأت هنا وهناك خرائط وصورًا معلقة على أوتاد. أنزلت جرة من أحد الأرفف أثناء مرورها؛ كانت مكتوبًا عليها "مربى برتقال"، لكن لخيبة أملها الكبيرة كانت فارغة: لم ترغب في إسقاط الجرة خوفًا من قتل أحد تحتها، لذلك تمكنت من وضعها في إحدى الخزائن أثناء سقوطها. | ||
| "حسنًا!" فكرت أليس في نفسها، "بعد سقوط كهذا، لن أفكر في السقوط من على الدرج! يا لشجاعتي التي سيعتبرونها جميعًا في المنزل! حسنًا، لن أقول شيئًا عن ذلك، حتى لو سقطت من أعلى المنزل!" (وهذا صحيح على الأرجح). | ||
| إلى أسفل، إلى أسفل، إلى أسفل. ألن ينتهي السقوط أبدًا ؟ قالت بصوت عالٍ: "أتساءل كم ميلًا سقطتُ حتى الآن؟" "لا بد أنني اقتربتُ من مركز الأرض. دعيني أرى: أعتقد أن ذلك سيكون أربعة آلاف ميل إلى أسفل—" (لأن أليس، كما ترى، تعلمت أشياءً كثيرة من هذا القبيل في دروسها المدرسية، ومع أن هذه لم تكن فرصةً جيدةً لإظهار معرفتها، إذ لم يكن هناك من يستمع إليها، إلا أنه كان من الجيد تكرارها). "—نعم، هذه المسافة تقريبًا صحيحة—ولكنني أتساءل ما هو خط العرض أو خط الطول الذي وصلتُ إليه؟" (لم تكن أليس تعرف ما هو خط العرض، أو خط الطول أيضًا، لكنها رأت أنهما كلمتان جميلتان.) |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/de/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - German | ||
| Daran war nichts besonders Bemerkenswertes; und Alice fand es auch nicht besonders ungewöhnlich, das Kaninchen zu sich selbst sagen zu hören: „Oh je! Oh je! Ich werde zu spät kommen!" (als sie später darüber nachdachte, kam ihr der Gedanke, dass sie sich darüber hätte wundern sollen, aber in dem Moment schien ihr alles ganz natürlich); aber als das Kaninchen tatsächlich eine Uhr aus seiner Westentasche nahm , sie ansah und dann weitereilte, sprang Alice auf, denn ihr schoss durch den Kopf, dass sie noch nie zuvor ein Kaninchen mit einer Westentasche oder einer Uhr gesehen hatte, die es herausnehmen konnte, und brennend vor Neugier rannte sie hinter ihm über das Feld her und kam glücklicherweise gerade noch rechtzeitig, um zu sehen, wie es in ein großes Kaninchenloch unter der Hecke verschwand. | ||
| Im nächsten Moment stürzte Alice hinterher, ohne auch nur einmal darüber nachzudenken, wie sie um alles in der Welt wieder herauskommen sollte. | ||
| Das Kaninchenloch verlief eine Weile geradeaus wie ein Tunnel und ging dann plötzlich so plötzlich bergab, dass Alice keine Zeit hatte, darüber nachzudenken, sich zu stoppen, bevor sie feststellte, dass sie in einen sehr tiefen Brunnen fiel. | ||
| Entweder war der Brunnen sehr tief, oder sie fiel sehr langsam, denn sie hatte beim Hinuntergehen viel Zeit, sich umzusehen und sich zu fragen, was als Nächstes passieren würde. Zuerst versuchte sie nach unten zu schauen und zu erkennen, wohin sie kam, aber es war zu dunkel, um etwas zu erkennen; dann blickte sie an die Seiten des Brunnens und bemerkte, dass diese mit Schränken und Bücherregalen gefüllt waren; hier und da sah sie Karten und Bilder an Haken hängen. Im Vorbeigehen nahm sie ein Glas von einem der Regale; es trug die Aufschrift „ORANGENMARMELADE", aber zu ihrer großen Enttäuschung war es leer: Sie wollte das Glas nicht fallen lassen, aus Angst, jemanden darunter zu töten, und schaffte es daher, es in einen der Schränke zu stellen, als sie daran vorbeifiel. | ||
| „Na ja!", dachte Alice bei sich, „nach so einem Sturz werde ich mir nichts dabei denken, wenn ich die Treppe runterfalle! Wie tapfer werden mich alle zu Hause finden! Ich würde ja nichts davon sagen, selbst wenn ich vom Dach fallen würde!" (Was sehr wahrscheinlich stimmte.) |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/el/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Greek | ||
| Δεν υπήρχε τίποτα τόσο αξιοσημείωτο σε αυτό. Ούτε η Άλις πίστευε ότι ήταν τόσο παράξενο να ακούει το Κουνέλι να λέει στον εαυτό του: "Ω αγάπη μου! Ω αγαπητέ! Θα αργήσω!" (όταν το σκέφτηκε μετά, της πέρασε από το μυαλό ότι θα έπρεπε να αναρωτηθεί γι' αυτό, αλλά εκείνη τη στιγμή όλα φαινόταν αρκετά φυσιολογικά). αλλά όταν το κουνέλι έβγαλε ένα ρολόι από την τσέπη του γιλέκου του και το κοίταξε, και μετά βιάστηκε, η Αλίκη άρχισε να σηκώνεται, γιατί πέρασε από το μυαλό της ότι δεν είχε ξαναδεί κουνέλι με τσέπη γιλέκου ή ρολόι για να βγάλει από αυτό και καιγόταν από περιέργεια, έτρεξε λίγο μετά το κοίταξε στο χωράφι. κουνέλι-τρύπα κάτω από τον φράκτη. | ||
| Σε μια άλλη στιγμή, η Άλις έπεσε μετά από αυτό, χωρίς να σκεφτεί ούτε μια φορά πώς θα έβγαινε ξανά έξω. | ||
| Η τρύπα του κουνελιού προχώρησε κατευθείαν σαν τούνελ για κάποιο τρόπο, και μετά βυθίστηκε ξαφνικά κάτω, τόσο ξαφνικά που η Άλις δεν είχε ούτε μια στιγμή να σκεφτεί να σταματήσει τον εαυτό της πριν βρει τον εαυτό της να πέφτει σε ένα πολύ βαθύ πηγάδι. | ||
| Είτε το πηγάδι ήταν πολύ βαθύ, είτε έπεσε πολύ αργά, γιατί είχε άφθονο χρόνο καθώς κατέβαινε για να την κοιτάξει και να αναρωτηθεί τι επρόκειτο να συμβεί μετά. Πρώτα, προσπάθησε να κοιτάξει προς τα κάτω και να καταλάβει σε τι ερχόταν, αλλά ήταν πολύ σκοτάδι για να δει τίποτα. μετά κοίταξε τα πλαϊνά του πηγαδιού και παρατήρησε ότι ήταν γεμάτα με ντουλάπια και ράφια με βιβλία. εδώ κι εκεί έβλεπε χάρτες και εικόνες κρεμασμένες σε μανταλάκια. Κατέβασε ένα βάζο από ένα από τα ράφια καθώς περνούσε. έφερε την ετικέτα «ΜΑΡΜΑΛΑΔΑ ΠΟΡΤΟΚΑΛΙ», αλλά προς μεγάλη της απογοήτευση ήταν άδειο: δεν της άρεσε να πέσει το βάζο από φόβο μήπως σκοτώσει κάποιον από κάτω, οπότε κατάφερε να το βάλει σε ένα από τα ντουλάπια καθώς έπεσε δίπλα του. | ||
| "Λοιπόν!" σκέφτηκε η Αλίκη μέσα της, "μετά από μια τέτοια πτώση, δεν θα σκέφτομαι τίποτα να πέσω από σκάλες! Πόσο γενναία θα με νομίζουν όλοι στο σπίτι! Γιατί, δεν θα έλεγα τίποτα γι' αυτό, ακόμα κι αν έπεφτα από την κορυφή του σπιτιού!" (Το οποίο ήταν πολύ πιθανό να ήταν αλήθεια.) | ||
| Κάτω, κάτω, κάτω. Δεν θα τελείωνε ποτέ η πτώση; «Αναρωτιέμαι πόσα μίλια έχω πέσει αυτή τη φορά;» είπε δυνατά. "Πρέπει να φτάσω κάπου κοντά στο κέντρο της γης. Επιτρέψτε μου να δω: αυτό θα ήταν τέσσερις χιλιάδες μίλια κάτω, νομίζω..." (γιατί, βλέπετε, η Αλίκη είχε μάθει αρκετά πράγματα αυτού του είδους στα μαθήματά της στη σχολική αίθουσα, και παρόλο που αυτή δεν ήταν μια πολύ καλή ευκαιρία για να δείξει τις γνώσεις της, καθώς δεν υπήρχε κανείς να την ακούσει, ωστόσο ήταν καλή πρακτική να το πω...) Γεωγραφικό μήκος που πρέπει;» (Η Αλίκη δεν είχε ιδέα τι ήταν το Γεωγραφικό πλάτος ή το Γεωγραφικό μήκος, αλλά πίστευε ότι ήταν ωραία λόγια για να τα πει.) |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/es/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Spanish | ||
| No había nada de extraordinario en eso; ni a Alicia le pareció tan extraño oír al Conejo decirse a sí mismo: "¡Ay, Dios mío! ¡Ay, Dios mío! ¡Llegaré tarde!" (Cuando lo pensó después, se le ocurrió que debería haberse sorprendido, pero en ese momento todo le pareció bastante natural); pero cuando el Conejo sacó un reloj del bolsillo del chaleco , lo miró y luego se apresuró a seguir adelante, Alicia se puso de pie de un salto, pues recordó que nunca antes había visto un conejo con un bolsillo del chaleco ni un reloj que sacar de él, y, ardiendo de curiosidad, corrió por el campo tras él, y afortunadamente llegó justo a tiempo de verlo caer por una gran madriguera bajo el seto. | ||
| Un instante después, Alicia se lanzó tras él, sin pensar ni una sola vez en cómo iba a salir de nuevo. | ||
| La madriguera del conejo seguía recta como un túnel durante un trecho, y luego descendía de repente, tan de repente que Alicia no tuvo un momento para pensar en detenerse antes de encontrarse cayendo en un pozo muy profundo. | ||
| O bien el pozo era muy profundo, o bien cayó muy despacio, pues tuvo tiempo de sobra mientras bajaba para mirar a su alrededor y preguntarse qué sucedería después. Primero, intentó mirar hacia abajo y descifrar adónde se dirigía, pero estaba demasiado oscuro para ver nada; luego miró a los lados del pozo y notó que estaban llenos de armarios y estanterías; aquí y allá vio mapas y cuadros colgados de ganchos. Bajó un frasco de uno de los estantes al pasar; estaba etiquetado como «Mermelada de naranja», pero para su gran decepción estaba vacío: no quería dejar caer el frasco por miedo a matar a alguien que estuviera debajo, así que logró meterlo en uno de los armarios al caer junto a él. | ||
| —¡Vaya! —pensó Alicia—. ¡Después de una caída como esta, no me importará caerme por las escaleras! ¡Qué valiente me considerarán todos en casa! ¡Ni aunque me cayera del tejado! (Lo cual probablemente era cierto). | ||
| Abajo, abajo, abajo. ¿Acaso la caída nunca terminaría? «¿Cuántos kilómetros habré caído ya?», dijo en voz alta. «Debo estar acercándome al centro de la Tierra. Veamos: creo que serían seis mil kilómetros de caída...» (porque, verán, Alicia había aprendido varias cosas así en sus clases, y aunque esta no era una buena oportunidad para demostrar sus conocimientos, ya que no había nadie que la escuchara, era una buena práctica repetirlo). «Sí, esa es más o menos la distancia correcta, pero luego me pregunto a qué latitud o longitud habré llegado.» (Alicia no tenía ni idea de qué era latitud ni longitud, pero le parecían palabras bonitas y grandilocuentes). |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/fr/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - French | ||
| Il n'y avait rien de si remarquable à cela ; et Alice ne trouvait pas non plus si étrange d'entendre le Lapin se dire : « Oh là là ! Oh là là ! Je vais être en retard ! » (En y repensant plus tard, il lui vint à l'esprit qu'elle aurait dû s'en étonner, mais sur le moment tout semblait tout naturel) ; mais lorsque le Lapin sortit effectivement une montre de la poche de son gilet , la regarda et se hâta de partir, Alice se leva d'un bond, car il lui traversa l'esprit qu'elle n'avait jamais vu auparavant un lapin avec une poche de gilet ou une montre à y prendre, et, brûlante de curiosité, elle traversa le champ à sa poursuite, et heureusement, elle arriva juste à temps pour le voir sauter dans un grand terrier sous la haie. | ||
| Un instant plus tard, Alice se lança à sa poursuite, sans jamais se demander comment elle allait pouvoir s'en sortir à nouveau. | ||
| Le terrier du lapin continuait tout droit comme un tunnel sur une certaine distance, puis plongeait soudainement, si soudainement qu'Alice n'eut pas un instant pour penser à s'arrêter avant de se retrouver en train de tomber dans un puits très profond. | ||
| Soit le puits était très profond, soit elle tombait très lentement, car elle avait tout le temps, en descendant, de regarder autour d'elle et de se demander ce qui allait se passer ensuite. Elle essaya d'abord de regarder en bas pour distinguer où elle allait arriver, mais il faisait trop sombre pour voir quoi que ce soit ; puis elle regarda les côtés du puits et remarqua qu'ils étaient remplis d'armoires et d'étagères ; çà et là, elle aperçut des cartes et des tableaux accrochés à des patères. Elle prit un pot d'une des étagères en passant ; il était étiqueté « MARMELADE D'ORANGE », mais à sa grande déception, il était vide : elle ne voulait pas le laisser tomber de peur de tuer quelqu'un en dessous, alors elle réussit à le mettre dans un des placards en tombant. | ||
| « Eh bien ! » pensa Alice, « après une chute pareille, je n'hésiterai pas à dégringoler ! Comme on me trouvera courageuse à la maison ! Je n'en parlerais même pas, même si je tombais du toit ! » (Ce qui était très probablement vrai.) |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/iw/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Hebrew | ||
| לא היה בזה שום דבר מדהים כל כך; וגם אליס לא חשבה שזה כל כך מופרך לשמוע את הארנב אומר לעצמו, "אוי יקירי! הו יקירי! אני אאחר!" (כשחשבה על זה אחר כך, עלה בדעתה שהיא הייתה צריכה לתהות על כך, אבל בזמנו הכל נראה טבעי למדי); אבל כשהארנב באמת הוציא שעון מכיס החזייה שלו , והביט בו, ואז מיהר הלאה, אליס קמה על רגליה, כי הבהב במוחה שמעולם לא ראתה ארנב עם כיס חזייה, או שעון שצריך להוציא ממנו, ובוער מסקרנות, אחרי שהוא רצה בשדה, רק כדי לראות חור ארנב מתחת לגדר החיה. | ||
| עוד רגע ירדה אליס אחריה, אף פעם לא חשבה איך לעזאזל היא תצא שוב. | ||
| חור הארנב המשיך ישר כמו מנהרה לזמן מה, ואז צלל בפתאומיות למטה, כל כך פתאום שלאליס לא היה רגע לחשוב על לעצור את עצמה לפני שהיא מצאה את עצמה נופלת בבאר עמוקה מאוד. | ||
| או שהבאר הייתה עמוקה מאוד, או שהיא נפלה לאט מאוד, כי היה לה הרבה זמן כשירדה להסתכל סביבה ולתהות מה עומד לקרות אחר כך. ראשית, היא ניסתה להביט למטה ולהבין לאן היא באה, אבל היה חשוך מכדי לראות משהו; אחר כך הביטה בדפנות הבאר, והבחינה שהם מלאים בארונות ובמדפי ספרים; פה ושם ראתה מפות ותמונות תלויות על יתדות. היא הורידה צנצנת מאחד המדפים כשחלפה על פניה; היא כונתה "מרמלדת תפוזים", אבל לאכזבתה הגדולה היא הייתה ריקה: היא לא אהבה להפיל את הצנצנת מחשש להרוג מישהו מתחתיה, אז הצליחה להכניס אותה לאחד הארונות כשנפלה על פניה. | ||
| "טוֹב!" חשבה אליס לעצמה, "אחרי נפילה כזו, אני לא אחשוב על צניחה במורד מדרגות! כמה אמיץ כולם יחשבו עליי בבית! למה, אני לא אגיד שום דבר על זה, אפילו אם אפול מהחלק העליון של הבית!" (מה שהיה נכון מאוד.) | ||
| למטה, למטה, למטה. האם הנפילה לעולם לא תיגמר? "מעניין כמה קילומטרים נפלתי בזמן הזה?" אמרה בקול. "אני בטח מתקרב לאיזשהו מקום למרכז כדור הארץ. תן לי לראות: זה יהיה ארבעת אלפים מייל למטה, אני חושב -" (שהרי, אתה מבין, אליס למדה כמה דברים מהסוג הזה בשיעורים שלה בחדר בית הספר, ולמרות שזו לא הייתה הזדמנות טובה במיוחד להפגין את הידע שלה, כי לא היה מי שיקשיב לה, עדיין זה היה תרגול טוב להגיד את זה על זה) "-כן אני תוהה על מה המרחק הנכון - או על מה. אני חייב?" (לאליס לא היה מושג מה זה קו רוחב, או קו אורך, אבל חשבה שאלה מילים נהדרות לומר.) | ||
| עד מהרה היא התחילה שוב. "אני תוהה אם אני אפול ישר דרך האדמה! כמה מצחיק שזה ייראה לצאת בין האנשים שהולכים עם הראש כלפי מטה! האנטיפטיות, אני חושבת -" (היא שמחה שלא היה אף אחד שהקשיב, הפעם, כי זה לא נשמע בכלל המילה הנכונה) "- אבל אני אצטרך לשאול אותם מה שם המדינה, את יודעת?" בבקשה, גברתי או אוסטרליה, זה (והיא ניסתה לקצץ תוך כדי דיבורה - חשק לקצץ כשאתה נופל באוויר! אתה חושב שתוכל להסתדר עם זה?) "ואיזו ילדה קטנה ובורה היא תחשוב שאני שואלת! לא, לעולם לא יעזור לשאול: אולי אני אראה את זה כתוב איפשהו." | ||
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/pl/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Polish | ||
| Nie było w tym nic aż tak niezwykłego; Alicja też nie wydało się aż tak dziwne, że usłyszała Królika mówiącego do siebie: „Ojej! Ojej! Spóźnię się!" (kiedy później o tym pomyślała, przyszło jej do głowy, że powinna była się temu dziwić, ale wtedy wydawało się to całkiem naturalne); ale kiedy Królik rzeczywiście wyjął zegarek z kieszeni kamizelki i spojrzał na niego, a potem pospieszył dalej, Alicja zerwała się na równe nogi, bo błysnęła jej myśl, że nigdy wcześniej nie widziała królika z kieszenią kamizelki lub zegarkiem, który mógłby z niej wyjąć, i płonąc ciekawością, pobiegła za nim przez pole i na szczęście zdążyła zobaczyć, jak wpada do dużej króliczej nory pod żywopłotem. | ||
| W chwilę później Alicja ruszyła w jego stronę, nie zastanawiając się ani przez chwilę nad tym, jak w ogóle zdoła się stąd wydostać. | ||
| Królicza nora biegła prosto jak tunel przez jakiś czas, a potem nagle zapadła się w dół, tak niespodziewanie, że Alicja nie miała chwili na zastanowienie się nad zatrzymaniem się, gdy nagle wpadła do bardzo głębokiej studni. | ||
| Albo studnia była bardzo głęboka, albo spadała bardzo powoli, bo miała mnóstwo czasu, schodząc w dół, by rozejrzeć się i zastanowić, co będzie dalej. Najpierw próbowała spojrzeć w dół i dostrzec, dokąd zmierza, ale było zbyt ciemno, by cokolwiek dostrzec; potem spojrzała na ściany studni i zauważyła, że były wypełnione szafkami i regałami na książki; gdzieniegdzie dostrzegła mapy i obrazy wiszące na kołkach. Zdjęła słoik z jednej z półek, mijając go; widniał na nim napis „DARMOLADA POMARAŃCZOWA", ale ku jej wielkiemu rozczarowaniu był pusty: nie chciała upuścić słoika z obawy przed zabiciem kogoś pod spodem, więc udało jej się włożyć go do jednej z szafek, gdy spadała obok. | ||
| „No cóż!" – pomyślała Alicja – „po takim upadku, nie będę miała żadnych skrupułów, żeby spaść ze schodów! Wszyscy w domu będą mnie uważać za dzielną! Przecież nie pisnę o tym słowa, nawet gdybym spadła z dachu!" (Co było bardzo prawdopodobne). | ||
| W dół, w dół, w dół. Czy upadek nigdy się nie skończy? „Ciekawe, ile mil spadłam do tej pory?" powiedziała na głos. „Muszę być gdzieś w pobliżu środka Ziemi. Zobaczmy: to chyba cztery tysiące mil w dół…" (bo, widzicie, Alicja nauczyła się kilku takich rzeczy na lekcjach w klasie i choć nie była to najlepsza okazja do popisania się wiedzą, bo nikt jej nie słuchał, to jednak dobrze było to powtórzyć) „—tak, to mniej więcej odpowiednia odległość — ale zastanawiam się, do jakiej szerokości albo długości geograficznej doszłam?" (Alicja nie miała pojęcia, czym jest szerokość ani długość geograficzna, ale uważała to za piękne, wzniosłe słowa). |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/ru/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Russian | ||
| В этом не было ничего такого уж примечательного; и Алиса не считала чем-то таким уж необычным услышать, как Кролик сказал самому себе: «Ах! Ах! Я опоздаю!» (когда она потом обдумала это, ей пришло в голову, что ей следовало бы этому удивиться, но в тот момент все это казалось совершенно естественным); но когда Кролик действительно вынул часы из жилетного кармана , посмотрел на них и поспешил дальше, Алиса вскочила на ноги, потому что ей пришло в голову, что она никогда раньше не видела кролика ни с жилетным карманом, ни с часами, которые можно было бы из него вынуть, и, сгорая от любопытства, она побежала за ним через поле и, к счастью, успела как раз вовремя, чтобы увидеть, как он нырнул в большую кроличью нору под изгородью. | ||
| В следующий момент Алиса кинулась за ним вниз, ни разу не задумавшись о том, как ей теперь выбраться обратно. | ||
| Кроличья нора какое-то время шла прямо, как туннель, а затем внезапно обрывалась вниз, так внезапно, что Алиса не успела даже подумать об остановке, как обнаружила, что падает в очень глубокую яму. | ||
| Либо колодец был очень глубоким, либо она падала очень медленно, потому что у неё было достаточно времени, пока она спускалась, чтобы осмотреться и поразмыслить о том, что произойдёт дальше. Сначала она попыталась посмотреть вниз и понять, куда она идёт, но было слишком темно, чтобы что-либо увидеть; затем она посмотрела на стены колодца и заметила, что они были заполнены шкафами и книжными полками; тут и там она видела карты и картины, висящие на крючках. Она сняла банку с одной из полок, когда проходила мимо; на ней было написано «АПЕЛЬСИНОВЫЙ МАРМЕЛАД», но, к её великому разочарованию, она оказалась пустой: она не хотела ронять банку из-за страха убить кого-нибудь внизу, поэтому умудрилась поставить её в один из шкафов, когда падала мимо него. | ||
| «Ну что ж!» – подумала Алиса, – «после такого падения я без колебаний спущусь с лестницы! Дома все сочтут меня храброй! Да я бы и сама никому не сказала, даже если бы упала с крыши!» (Что, скорее всего, было правдой.) | ||
| Вниз, вниз, вниз. Неужели падение никогда не кончится? «Интересно, сколько миль я уже пролетела?» — спросила она вслух. «Должно быть, я уже где-то около центра Земли. Дай-ка подумать: это, кажется, четыре тысячи миль вниз…» (ведь, видите ли, Алиса уже многому научилась на уроках в классе, и хотя это был не самый подходящий случай блеснуть своими знаниями, ведь её никто не слушал, всё же было хорошей практикой повторить это снова.) «…да, это примерно то расстояние, но интересно, до какой широты или долготы я добралась?» (Алиса понятия не имела, что такое широта или долгота, но считала, что это красивые, высокие слова.) |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/th/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Thai | ||
| ไม่มีอะไรน่าสังเกต มาก นักในนั้น และอลิซก็ไม่ได้คิดว่ามันนอกลู่นอกทาง มาก นักที่จะได้ยินกระต่ายพูดกับตัวเองว่า "โอ้ พระเจ้า! โอ้ พระเจ้า! ฉันจะไปสาย!" (เมื่อเธอคิดดูทีหลัง เธอคิดได้ว่าเธอควรจะสงสัยในเรื่องนี้ แต่ตอนนั้นทุกอย่างดูเป็นเรื่องธรรมดา) แต่เมื่อกระต่าย หยิบนาฬิกาออกจากกระเป๋าเสื้อกั๊ก และมองดู แล้วรีบเดินต่อไป อลิซก็ลุกขึ้นยืน เพราะความคิดแวบเข้ามาในหัวของเธอว่าเธอไม่เคยเห็นกระต่ายที่มีกระเป๋าเสื้อกั๊กหรือนาฬิกาให้หยิบออกมาเลยมาก่อน และด้วยความอยากรู้อยากเห็น เธอจึงวิ่งข้ามทุ่งตามมันไป และโชคดีที่มาถึงทันเวลาพอดีที่จะเห็นมันกระโดดลงไปในโพรงกระต่ายขนาดใหญ่ใต้รั้ว | ||
| ในอีกชั่วขณะหนึ่ง อลิซก็ไล่ตามไปโดยไม่เคยคิดแม้แต่น้อยว่าจะออกไปได้อย่างไร | ||
| หลุมกระต่ายนั้นตรงไปเหมือนอุโมงค์ไปทางหนึ่ง จากนั้นก็จมลงไปอย่างกะทันหัน จนกระทั่งอลิซไม่มีเวลาแม้แต่น้อยที่จะคิดหยุดตัวเองก่อนที่จะพบว่าตัวเองกำลังตกลงไปในบ่อน้ำที่ลึกมาก | ||
| บ่อน้ำนั้นลึกมาก หรือไม่ก็เธอตกอย่างช้าๆ เพราะเธอมีเวลาเหลือเฟือที่จะมองไปรอบๆ และสงสัยว่าจะเกิดอะไรขึ้นต่อไป ตอนแรกเธอพยายามมองลงไปเพื่อดูว่ากำลังจะเจออะไร แต่มันมืดเกินกว่าจะมองเห็นอะไร จากนั้นเธอก็มองไปที่ด้านข้างของบ่อน้ำและสังเกตเห็นว่าเต็มไปด้วยตู้และชั้นหนังสือ ที่นั่นเธอเห็นแผนที่และรูปภาพแขวนอยู่บนตะขอ เธอหยิบโถใบหนึ่งลงมาจากชั้นหนึ่งขณะที่เดินผ่าน มันมีป้ายเขียนว่า "แยมส้ม" แต่เธอต้องผิดหวังอย่างมากที่มันว่างเปล่า เธอไม่ชอบทำโถหล่นเพราะกลัวว่าจะฆ่าคนข้างใต้ จึงจัดการเอาโถไปใส่ในตู้ใบหนึ่งขณะที่เธอตกผ่านไป | ||
| "เอาล่ะ!" อลิซคิดในใจ "หลังจากตกแบบนี้ ฉันคงไม่คิดอะไรมากถ้าจะตกบันได! ที่บ้านทุกคนจะคิดว่าฉันกล้าหาญมาก! ฉันไม่พูดอะไรเลย ถึงแม้ว่าฉันจะตกจากหลังคาบ้านก็ตาม!" (ซึ่งน่าจะเป็นเรื่องจริง) | ||
| ลง ลง ลง การตกจะ ไม่มี วันสิ้นสุดหรือ? "ฉันสงสัยว่าฉันตกลงไปกี่ไมล์แล้วเนี่ย" เธอพูดออกมาดังๆ "ฉันคงใกล้ถึงใจกลางโลกแล้วล่ะ ลองดูสิ ฉันคิดว่าน่าจะลงไปตั้งสี่พันไมล์—" (เพราะอย่างที่เห็น อลิซได้เรียนรู้อะไรแบบนี้หลายอย่างในห้องเรียน และถึงแม้นี่จะไม่ใช่โอกาส ดี ที่จะอวดความรู้ เพราะไม่มีใครฟังเธอ แต่มันก็ยังเป็นการฝึกฝนที่ดีที่จะพูดซ้ำ) "—ใช่ ระยะทางประมาณนั้น—แต่ฉันก็สงสัยว่าฉันจะต้องไปถึงละติจูดหรือลองจิจูดเท่าไหร่?" (อลิซเองก็ไม่รู้ว่าละติจูดหรือลองจิจูดคืออะไรเหมือนกัน แต่คิดว่ามันเป็นคำที่ดูดีทีเดียว) | ||
| ทันใดนั้นเธอก็เริ่มพูดอีกครั้ง "ฉันสงสัยว่าฉันจะร่วง ลงสู่ พื้นโลกไหม! มันจะดูตลกแค่ไหนนะที่ออกมาท่ามกลางผู้คนที่เดินก้มหน้า! พวกต่อต้านน่ะ ฉันคิด—" (คราวนี้เธอค่อนข้างดีใจที่ ไม่มี ใครฟังอยู่ เพราะฟังดูไม่ค่อยเหมาะเลย) "—แต่ฉันคงต้องถามพวกเขาก่อนว่าชื่อประเทศคืออะไร รู้ไหม ได้โปรดเถอะค่ะ คุณผู้หญิง ที่นี่นิวซีแลนด์หรือออสเตรเลีย" (แล้วเธอก็พยายามทำความเคารพไปด้วย—ทำท่าเหมือน กำลังทำความ เคารพขณะที่เธอกำลังร่วงหล่นกลางอากาศ! เธอคิดว่าจะทำได้ไหมนะ?) "แล้วเธอก็เป็นเด็กผู้หญิงโง่เขลาอะไรเช่นนี้ เธอจะคิดว่าฉันถามเหรอ! ไม่สิ มันไม่มีประโยชน์ที่จะถามหรอก บางทีฉันอาจจะเห็นมันเขียนไว้ที่ไหนสักแห่งก็ได้" |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # Source - unicode-org/test-corpora | ||
| # Files - gutenberg/Carroll-11/out/google/txt/tr/8860606395858576540_11-h-1.htm.txt (Continued into h-2, h-3) | ||
| # Language - Turkish | ||
| Bunda çok dikkat çekici bir şey yoktu; Alice, Tavşan'ın kendi kendine "Aman Tanrım! Aman Tanrım! Geç kalacağım!" demesini duymanın çok da garip olduğunu düşünmedi (sonradan bunu düşündüğünde, buna şaşırması gerektiği aklına geldi, ama o anda her şey çok doğal görünüyordu); ama Tavşan yelek cebinden bir saat çıkarıp baktığında ve sonra aceleyle ilerlediğinde, Alice ayağa fırladı, çünkü daha önce yelek cebinde veya içinden saat çıkarılabilecek bir tavşan görmediği aklına dank etti ve merakla onun peşinden tarlanın karşısına koştu ve neyse ki çitin altındaki büyük bir tavşan deliğinden aşağı düştüğünü görmek için tam zamanında yetişti. | ||
| Bir an sonra Alice de peşinden gitti, bir daha nasıl çıkacağını hiç düşünmedi. | ||
| Tavşan deliği bir süre tünel gibi dümdüz devam etti ve sonra aniden aşağı doğru indi, o kadar aniden ki Alice kendini çok derin bir kuyuya düşerken bulmadan önce durmayı düşünmeye fırsat bulamadı. | ||
| Ya kuyu çok derindi ya da çok yavaş düşüyordu, çünkü aşağı inerken etrafına bakmak ve bundan sonra ne olacağını merak etmek için bolca vakti vardı. Önce aşağıya bakıp neye geldiğini anlamaya çalıştı ama hiçbir şey göremeyecek kadar karanlıktı; sonra kuyunun kenarlarına baktı ve dolaplar ve kitap raflarıyla dolu olduklarını fark etti; yer yer askılara asılmış haritalar ve resimler gördü. Geçerken raflardan birinden bir kavanoz aldı; üzerinde "PORTAKAL MARMELAT" yazıyordu ama büyük bir hayal kırıklığına uğrayarak boştu: Altında birini öldürme korkusuyla kavanozu düşürmek istemedi, bu yüzden düşerken onu dolaplardan birine koymayı başardı. | ||
| "Eh!" diye düşündü Alice kendi kendine, "böyle bir düşüşten sonra merdivenlerden yuvarlanmayı hiç umursamam! Evdekiler beni ne kadar cesur sanacaklar! Evin tepesinden düşsem bile, bu konuda hiçbir şey söylemem!" (Bu büyük olasılıkla doğruydu.) | ||
| Aşağı, aşağı, aşağı. Düşüş hiç bitmeyecek miydi? "Acaba şu ana kadar kaç mil düştüm?" diye yüksek sesle söyledi. "Dünyanın merkezine yakın bir yere yaklaşıyor olmalıyım. Bir bakayım: bu dört bin mil aşağı olmalı, sanırım-" (çünkü, görüyorsunuz, Alice okuldaki derslerinde bu türden birkaç şey öğrenmişti ve bu, onu dinleyecek kimse olmadığı için bilgisini sergilemek için çok iyi bir fırsat olmasa da, yine de tekrarlamak iyi bir alıştırmaydı) "—evet, doğru mesafe bu—ama sonra hangi Enlem veya Boylam'a ulaştığımı merak ediyorum?" (Alice'in Enlem'in veya Boylam'ın ne olduğunu da bilmiyordu, ama bunları söylemesi güzel ve görkemli sözler olduğunu düşünüyordu.) |
| # Generating microbench data | ||
| The full versions of these files are located | ||
| [in the ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data). | ||
| ## Sanitizing the file | ||
| ```shell | ||
| sed -i '/^#/d' ${filename} | ||
| sed -i '/^$/d' ${filename} | ||
| ``` | ||
| ## Shuffling the file | ||
| ```shell | ||
| shuf -n 20 ${filename} -o ${filename} | ||
| ``` | ||
| ## Add back the header (if you plan on submitting the files) | ||
| ``` | ||
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ``` |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| かげやま,みのる | ||
| むらかみ,とおる | ||
| つじさわ,けい | ||
| やすい,たかゆき | ||
| むらさき,としお | ||
| はせがわ,ひであき | ||
| うるしばら,よしひこ | ||
| ままだ,ひろし | ||
| おおぼら,えいじろう | ||
| おおば,まさひで | ||
| きたばたけ,たかひこ | ||
| はまさき,あつし | ||
| ほりい,つねお | ||
| もり,だいいち | ||
| いとう,しんいち | ||
| くにもと,じゅんじ | ||
| おか,のりひと | ||
| たに,よしあき | ||
| しらがき,ひろあき | ||
| しらはま,たけひろ | ||
| むらかみ,やすひろ | ||
| うめはら,たかし | ||
| いわた,ひろし | ||
| すぎえ,かつとし | ||
| てらにし,ひろみつ | ||
| まつおか,だいすけ | ||
| もろほし,すすむ | ||
| いしはら,たかし | ||
| おしま,ひろお | ||
| なかお,ゆうじ | ||
| いかり,はるお | ||
| きまち,まさき | ||
| ふるかわ,みちお | ||
| かねこ,しゅうへい | ||
| なかがわ,ともみ | ||
| ささき,しんご | ||
| うちだ,たくじ | ||
| うめだ,さかえ | ||
| しばた,いくこ | ||
| まきした,けいこ | ||
| まつもと,しんいちろう | ||
| たかの,かずよし | ||
| いしわた,なおひさ | ||
| いうち,まこと | ||
| いまい,りほ | ||
| みずた,のりあき | ||
| かくたに,まなぶ | ||
| わだ,ほまれ | ||
| わかまつ,かずき | ||
| かわぐち,ひろき |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ホリモト,ユウジ | ||
| ハナミ,ヤスヒデ | ||
| イシザカ,タカユキ | ||
| ゼンケ,トシオ | ||
| ハトリ,ユウコ | ||
| ナガオカ,トモユキ | ||
| コウダ,ケンイチ | ||
| イシダ,ヒロシ | ||
| ミワ,シゲユキ | ||
| イシカワ,ヒロシ | ||
| スズキ,ユウスケ | ||
| オクダ,ヨシノリ | ||
| シムラ,サカエ | ||
| エビシマ,ヤスユキ | ||
| イブカ,ヨシテル | ||
| タノ,マコト | ||
| ドウゾノ,セイヤ | ||
| ヤマナカ,サツミ | ||
| トミイエ,ハヤト | ||
| アザミ,ツトム | ||
| タナカ,キョウコ | ||
| コジマ,アツシ | ||
| フミハラ,カオリ | ||
| スズキ,マサユキ | ||
| ナトリ,ケンヤ | ||
| スズキ,ユウコ | ||
| スズキ,ヒサエ | ||
| ナカガワ,カツヨシ | ||
| スズキ,マサフミ | ||
| マツヤマ,トシオ | ||
| ヨシナガ,チカエ | ||
| キタムラ,リカコ | ||
| アオキ,タクオ | ||
| ヤマグチ,ヤスヒロ | ||
| スギムラ,シゲオ | ||
| ウエスギ,マサミ | ||
| マツムラ,シンイチ | ||
| クバ,タカシ | ||
| スドウ,タカトシ | ||
| フジモト,ヒロシ | ||
| イトウ,シュウイチ | ||
| コバヤシ,カズミ | ||
| タナカ,ヒロカツ | ||
| イシダ,ツカサ | ||
| ヤマダ,マサコ | ||
| カミヤ,トミエ | ||
| タケモト,ユウジ | ||
| スミノ,コウジ | ||
| ヒロハタ,タクヤ | ||
| ミヒラ,リョウヘイ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| 김명희 | ||
| 홍차수 | ||
| 허순재 | ||
| 강영휘 | ||
| 김운주 | ||
| 이종환 | ||
| 이은국 | ||
| 강태호 | ||
| 강일래 | ||
| 김동현 | ||
| 곽기자 | ||
| 차재수 | ||
| 표봉기 | ||
| 문대원 | ||
| 이형기 | ||
| 최교표 | ||
| 박식현 | ||
| 홍종립 | ||
| 서창수 | ||
| 김쌍건 | ||
| 서말도 | ||
| 이병훈 | ||
| 김희수 | ||
| 박학태 | ||
| 강태종 | ||
| 조문란 | ||
| 신범균 | ||
| 백두진 | ||
| 이철정 | ||
| 김태중 | ||
| 이성현 | ||
| 김주조 | ||
| 김강행 | ||
| 이정길 | ||
| 김완일 | ||
| 권수자 | ||
| 이춘철 | ||
| 김판근 | ||
| 김곡리 | ||
| 이경형 | ||
| 이운만 | ||
| 손상철 | ||
| 유기숙 | ||
| 박정한 | ||
| 조윤래 | ||
| 유신호 | ||
| 이두수 | ||
| 김재률 | ||
| 김성홍 | ||
| 김혜경 |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| González, Joan | ||
| Reinders, Jim | ||
| Applebroog, Ida | ||
| Kidd, Joseph Bartholomew | ||
| Gulácsy, Lajos | ||
| Letendre, Rita | ||
| Zuccaro, Federico | ||
| Apt the Elder, Ulrich | ||
| Drummond, Arthur | ||
| Manley, Thomas | ||
| Broc, Jean | ||
| Ramunno, Tony | ||
| Simone dei Crocifissi | ||
| Lane, Theodore | ||
| Symonds, William Robert | ||
| Johnson, Frank Tenney | ||
| Cox, Gardner | ||
| Bunbury, Charles | ||
| Pedro de la Cuadra | ||
| Payne, William | ||
| Lucas, John Seymour | ||
| Holsman, Elizabeth T. | ||
| de Vries, Auke | ||
| Laszlo, Philip Alexius de | ||
| Shigemasa | ||
| Wolfe, Ruth Mitchell | ||
| Buck, John | ||
| Baselitz, Georg | ||
| Hook, Walter | ||
| Segall, Lasar | ||
| Brush, George deForest | ||
| Master of Jánosrét | ||
| Sutherland, Elizabeth Leveson-Gower, Countess of | ||
| Tuckerman, Jane | ||
| Varley, F.H. | ||
| Fosso, Samuel | ||
| Gardner, Daniel | ||
| Sadler, Walter Dendy | ||
| Clausen, Franciska | ||
| Coman, Charlotte Buell | ||
| Wakelin, Roland | ||
| Payne, Jon, CML | ||
| Campagna, Girolamo | ||
| Wiener, Phyllis | ||
| Sallee, Charles | ||
| Fitzgerald, John Anster | ||
| Gribbroek, Robert | ||
| Laporte, John | ||
| Lévy-Dhurmer, Lucien | ||
| Young, Stephen Scott |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ณรงค์ โต๊ะเงิน | ||
| กิตติ บุญวันต์ | ||
| สมหมาย ดาบทองดี | ||
| ธวัชชัย อิสระนิมิตร | ||
| วรรณา โสภณนรินทร์ | ||
| วินัย หมู่มิ่ง | ||
| พัชรี ชูจิรวงศ์ | ||
| สมปอง จิวไพโรจน์กิจ | ||
| บุญส่ง กวยรักษา | ||
| นิพนธ์ นิ่มใหม่ | ||
| พัชรี สุวพรศิลป์ | ||
| เจริญ นววัฒนทรัพย์ | ||
| อรพินท์ แซ่เจี่ย | ||
| ชัยพร สมใจนึก | ||
| ประนอม โคศิลา | ||
| ฉวีวรรณ ศรสังข์ทอง | ||
| วัชรา เจริญรัตนพร | ||
| สุภัท นกศิริ | ||
| อู๋ มาลาเล็ก | ||
| ประยูร ไชโย | ||
| ละออ อยู่ยืนยง | ||
| สมใจ วิวัฒน์วานิช | ||
| จุมพล จันทรศรีเกษร | ||
| พุฒ ดอกไม้จีน | ||
| บุญชัย วรกิจพรสิน | ||
| สมาน ธูปเทียน | ||
| พงศ์ศักดิ์ แซ่แต้ | ||
| อำนาจ ไวจงเจริญ | ||
| พรทิพย์ แซ่ลี้ | ||
| อุไรวรรณ สาครสินธุ์ | ||
| อำพล วีระตะนนท์ | ||
| สมจิตร ใจวังโลก | ||
| สุเทพ ตันวินิจ | ||
| สวาท ทรัพย์มาก | ||
| สมศักดิ์ เจือจันทร์ | ||
| ดัสซันซิงห์ กุลาตี | ||
| ธีร ศรแก้ว | ||
| พรรณยุพา ฮ่อสกุล | ||
| สำราญ จันทร์เอี่ยม | ||
| พจน์ มั่นกันนาน | ||
| สุธี บุณยเกียรติ | ||
| บุญโชติ ทิพย์ประเสริฐสิน | ||
| ประดิษฐ์ ทองพสิฐสมบัติ | ||
| จำเนียร เพ็งเจริญ | ||
| สมศักดิ์ อรุณรัตน์ | ||
| อนุชา จารุหิรัญสกุล | ||
| พิกุล มโนภิญโญภิญญะ | ||
| ผ่องศรี นกแก้ว | ||
| อารี วิไลวรรณ | ||
| ณรงค์วิทย์ วิทสัทธาวรกุล |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # The contents of this file have been translated by "Google Translate". | ||
| Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này | ||
| đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh | ||
| lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con | ||
| người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và | ||
| nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể | ||
| xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong | ||
| một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới | ||
| này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc | ||
| chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển | ||
| vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là | ||
| nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý | ||
| tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra. | ||
| Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã | ||
| qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có | ||
| những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào | ||
| đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian, | ||
| những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối | ||
| với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm, | ||
| nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra | ||
| những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng | ||
| lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt | ||
| trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà | ||
| nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được. | ||
| Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu | ||
| đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng | ||
| nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó. | ||
| Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng | ||
| tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có | ||
| không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại | ||
| sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù | ||
| phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào | ||
| bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa, | ||
| hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta | ||
| cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng | ||
| ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất | ||
| yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết | ||
| thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng | ||
| ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn | ||
| vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó, | ||
| nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta. | ||
| Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã | ||
| thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp | ||
| của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó. | ||
| Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành | ||
| một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần | ||
| thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái | ||
| tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng | ||
| ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm | ||
| về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng | ||
| ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều | ||
| mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây | ||
| trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân. |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input, pre-normalized into each of the four normalization
/// forms, with every form held both as UTF-8 and as UTF-16.
struct BenchDataContent {
    /// Label of the data file; used to build the benchmark ids.
    pub file_name: String,
    /// NFC form of the input, UTF-8.
    pub nfc: String,
    /// NFD form of the input, UTF-8.
    pub nfd: String,
    /// NFKC form of the input, UTF-8.
    pub nfkc: String,
    /// NFKD form of the input, UTF-8.
    pub nfkd: String,
    /// NFC form of the input, UTF-16 code units.
    pub nfc_u16: Vec<u16>,
    /// NFD form of the input, UTF-16 code units.
    pub nfd_u16: Vec<u16>,
    /// NFKC form of the input, UTF-16 code units.
    pub nfkc_u16: Vec<u16>,
    /// NFKD form of the input, UTF-16 code units.
    pub nfkd_u16: Vec<u16>,
}
/// Drops every line that starts with `#` (the license/header lines of the
/// data files) and rejoins the remaining lines with `\n`.
///
/// Lines are split with [`str::lines`], so `\r\n` endings come out as `\n`
/// and a trailing line break is not preserved.
fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed slices are enough for `join`; no per-line `String` needed.
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "Carroll-11-ar", | ||
| &strip_headers(include_str!("./data/Carroll-11-ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "Carroll-11-de", | ||
| &strip_headers(include_str!("./data/Carroll-11-de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "Carroll-11-el", | ||
| &strip_headers(include_str!("./data/Carroll-11-el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "Carroll-11-es", | ||
| &strip_headers(include_str!("./data/Carroll-11-es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "Carroll-11-fr", | ||
| &strip_headers(include_str!("./data/Carroll-11-fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "Carroll-11-he", | ||
| &strip_headers(include_str!("./data/Carroll-11-he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "Carroll-11-pl", | ||
| &strip_headers(include_str!("./data/Carroll-11-pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "Carroll-11-ru", | ||
| &strip_headers(include_str!("./data/Carroll-11-ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "Carroll-11-th", | ||
| &strip_headers(include_str!("./data/Carroll-11-th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "Carroll-11-tr", | ||
| &strip_headers(include_str!("./data/Carroll-11-tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input, pre-normalized into each of the four normalization
/// forms, with every form held both as UTF-8 and as UTF-16.
struct BenchDataContent {
    /// Label of the data file; used to build the benchmark ids.
    pub file_name: String,
    /// NFC form of the input, UTF-8.
    pub nfc: String,
    /// NFD form of the input, UTF-8.
    pub nfd: String,
    /// NFKC form of the input, UTF-8.
    pub nfkc: String,
    /// NFKD form of the input, UTF-8.
    pub nfkd: String,
    /// NFC form of the input, UTF-16 code units.
    pub nfc_u16: Vec<u16>,
    /// NFD form of the input, UTF-16 code units.
    pub nfd_u16: Vec<u16>,
    /// NFKC form of the input, UTF-16 code units.
    pub nfkc_u16: Vec<u16>,
    /// NFKD form of the input, UTF-16 code units.
    pub nfkd_u16: Vec<u16>,
}
/// Drops every line that starts with `#` (the license/header lines of the
/// data files) and rejoins the remaining lines with `\n`.
///
/// Lines are split with [`str::lines`], so `\r\n` endings come out as `\n`
/// and a trailing line break is not preserved.
fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed slices are enough for `join`; no per-line `String` needed.
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "Carroll-11-ar", | ||
| &strip_headers(include_str!("./data/Carroll-11-ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "Carroll-11-de", | ||
| &strip_headers(include_str!("./data/Carroll-11-de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "Carroll-11-el", | ||
| &strip_headers(include_str!("./data/Carroll-11-el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "Carroll-11-es", | ||
| &strip_headers(include_str!("./data/Carroll-11-es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "Carroll-11-fr", | ||
| &strip_headers(include_str!("./data/Carroll-11-fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "Carroll-11-he", | ||
| &strip_headers(include_str!("./data/Carroll-11-he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "Carroll-11-pl", | ||
| &strip_headers(include_str!("./data/Carroll-11-pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "Carroll-11-ru", | ||
| &strip_headers(include_str!("./data/Carroll-11-ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "Carroll-11-th", | ||
| &strip_headers(include_str!("./data/Carroll-11-th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "Carroll-11-tr", | ||
| &strip_headers(include_str!("./data/Carroll-11-tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfkd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
| use criterion::{black_box, Criterion, Throughput}; | ||
| use smallvec::SmallVec; | ||
| //use detone::IterDecomposeVietnamese; | ||
| // 2048 times size of u16 fits on one 4KB memory page, which maximizes | ||
| // the run to take average over without introducing cross-page effects. | ||
| const INPUT_SIZE: usize = 2048; | ||
| fn generate_bmp_input_nfc(s: &str) -> Vec<u16> { | ||
| ComposingNormalizerBorrowed::new_nfc() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
| fn generate_bmp_input_nfd(s: &str) -> Vec<u16> { | ||
| DecomposingNormalizerBorrowed::new_nfd() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
/// Removes the header lines (those starting with `#`) and joins the
/// remaining lines with a single space instead of a line feed.
///
/// Do not use for languages that don't use spaces!
fn prepare_file_contents(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed slices are enough for `join`; no per-line `String` needed.
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Passes a UTF-16 slice through `black_box` and hands the same slice back,
/// so the benchmarked call receives an input the optimizer treats as opaque.
fn slice_as_slice(s: &[u16]) -> &[u16] {
    let opaque: &[u16] = black_box(s);
    opaque
}
| fn bench_lang(name: &str, data: &str, c: &mut Criterion) { | ||
| let input_nfc = generate_bmp_input_nfc(data); | ||
| let input_nfd = generate_bmp_input_nfd(data); | ||
| let nfc = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd = DecomposingNormalizerBorrowed::new_nfd(); | ||
| // Appending to this output is infallible (does not return `Err`) and | ||
| // this is sized to be large enough not to actually take the the heap | ||
| // allocation path. | ||
| let mut output: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new(); | ||
| { | ||
| let mut group_name = "utf16_throughput_nfc_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfc.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfc.split_normalized_utf16(slice_as_slice(&input_nfc)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| { | ||
| let mut group_name = "utf16_throughput_nfd_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfd.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfd.split_normalized_utf16(slice_as_slice(&input_nfd)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| } | ||
| static EL: &str = include_str!("./data/Carroll-11-el.txt"); | ||
| static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. "; | ||
| static FR: &str = include_str!("./data/Carroll-11-fr.txt"); | ||
| static VI: &str = include_str!("./data/wotw.txt"); | ||
| static ZH: &str = "單父人呂公善沛令,辟仇,從之客,因家焉。沛中豪傑吏聞令有重客,皆往賀。"; | ||
| // zh text from https://www.gutenberg.org/cache/epub/23841/pg23841.txt | ||
| // metadata at https://www.gutenberg.org/ebooks/23841 | ||
| // If you replace this text, be sure not to include ASCII spaces and be sure | ||
| // to include punctuation using code points actually used for punctuation in | ||
| // Chinese. | ||
| // TODO: Add: | ||
| // * Japanese with realistic proportion of kana voicing marks | ||
| // * Korean, since Hangul is special-cased in the normalizer | ||
| // * Kannada or some other non-Korean BMP language that uses | ||
| // backward-combining starters (with realistic proportion of such | ||
| // characters). | ||
| // * Chakma or some other living non-BMP language. | ||
| // * Vietnamese in the orthographic form (i.e. as produced by | ||
| // the official non-IME keyboard layout that's less common | ||
| // than the NFC-producing IME.) | ||
| pub fn criterion_benchmark(c: &mut Criterion) { | ||
| bench_lang("el", prepare_file_contents(EL).as_str(), c); | ||
| bench_lang("en", EN, c); | ||
| bench_lang("fr", prepare_file_contents(FR).as_str(), c); | ||
| bench_lang("vi", prepare_file_contents(VI).as_str(), c); | ||
| bench_lang("zh", ZH, c); | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "aho-corasick" | ||
| version = "1.1.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" | ||
| dependencies = [ | ||
| "memchr", | ||
| ] | ||
| [[package]] | ||
| name = "anes" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" | ||
| [[package]] | ||
| name = "anstyle" | ||
| version = "1.0.14" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" | ||
| [[package]] | ||
| name = "arraystring" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185" | ||
| dependencies = [ | ||
| "typenum", | ||
| ] | ||
| [[package]] | ||
| name = "arrayvec" | ||
| version = "0.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" | ||
| [[package]] | ||
| name = "atoi" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" | ||
| dependencies = [ | ||
| "num-traits", | ||
| ] | ||
| [[package]] | ||
| name = "autocfg" | ||
| version = "1.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" | ||
| [[package]] | ||
| name = "bumpalo" | ||
| version = "3.20.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" | ||
| [[package]] | ||
| name = "cast" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" | ||
| [[package]] | ||
| name = "cfg-if" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" | ||
| [[package]] | ||
| name = "ciborium" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "ciborium-ll", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "ciborium-io" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" | ||
| [[package]] | ||
| name = "ciborium-ll" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "half", | ||
| ] | ||
| [[package]] | ||
| name = "clap" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" | ||
| dependencies = [ | ||
| "clap_builder", | ||
| ] | ||
| [[package]] | ||
| name = "clap_builder" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" | ||
| dependencies = [ | ||
| "anstyle", | ||
| "clap_lex", | ||
| ] | ||
| [[package]] | ||
| name = "clap_lex" | ||
| version = "0.6.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" | ||
| [[package]] | ||
| name = "cobs" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" | ||
| dependencies = [ | ||
| "thiserror", | ||
| ] | ||
| [[package]] | ||
| name = "criterion" | ||
| version = "0.5.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" | ||
| dependencies = [ | ||
| "anes", | ||
| "cast", | ||
| "ciborium", | ||
| "clap", | ||
| "criterion-plot", | ||
| "is-terminal", | ||
| "itertools", | ||
| "num-traits", | ||
| "once_cell", | ||
| "oorandom", | ||
| "plotters", | ||
| "rayon", | ||
| "regex", | ||
| "serde", | ||
| "serde_derive", | ||
| "serde_json", | ||
| "tinytemplate", | ||
| "walkdir", | ||
| ] | ||
| [[package]] | ||
| name = "criterion-plot" | ||
| version = "0.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" | ||
| dependencies = [ | ||
| "cast", | ||
| "itertools", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-deque" | ||
| version = "0.8.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | ||
| dependencies = [ | ||
| "crossbeam-epoch", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-epoch" | ||
| version = "0.9.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | ||
| dependencies = [ | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-utils" | ||
| version = "0.8.21" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | ||
| [[package]] | ||
| name = "crunchy" | ||
| version = "0.2.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "74d4b1db5ca40636726f1f73daff0d626accbd49bcd8136fcade87d7cf1e6bbb" | ||
| dependencies = [ | ||
| "databake-derive", | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "databake-derive" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "72b537745234cbf0e296a3bd836d70a614dff4cb522b14e2680ef006bb1ed5ff" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "detone" | ||
| version = "1.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09" | ||
| [[package]] | ||
| name = "displaydoc" | ||
| version = "0.2.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "either" | ||
| version = "1.15.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | ||
| [[package]] | ||
| name = "erased-serde" | ||
| version = "0.4.10" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d2add8a07dd6a8d93ff627029c51de145e12686fbc36ecb298ac22e74cf02dec" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_core", | ||
| "typeid", | ||
| ] | ||
| [[package]] | ||
| name = "half" | ||
| version = "2.4.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "crunchy", | ||
| ] | ||
| [[package]] | ||
| name = "harfbuzz-traits" | ||
| version = "0.6.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2b2b09c6c4711837cc213910511345490ee139c6dbb368800f775b6a53b373ec" | ||
| [[package]] | ||
| name = "hermit-abi" | ||
| version = "0.5.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" | ||
| [[package]] | ||
| name = "icu_collections" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "potential_utf", | ||
| "serde", | ||
| "utf8_iter", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_locale_core" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde", | ||
| "tinystr", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer" | ||
| version = "2.2.0" | ||
| dependencies = [ | ||
| "arraystring", | ||
| "arrayvec", | ||
| "atoi", | ||
| "criterion", | ||
| "databake", | ||
| "detone", | ||
| "harfbuzz-traits", | ||
| "icu_collections", | ||
| "icu_normalizer_data", | ||
| "icu_properties", | ||
| "icu_provider", | ||
| "serde", | ||
| "smallvec", | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| "write16", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer_data" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" | ||
| [[package]] | ||
| name = "icu_properties" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" | ||
| dependencies = [ | ||
| "databake", | ||
| "icu_collections", | ||
| "icu_locale_core", | ||
| "icu_properties_data", | ||
| "icu_provider", | ||
| "serde", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_properties_data" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" | ||
| [[package]] | ||
| name = "icu_provider" | ||
| version = "2.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "erased-serde", | ||
| "icu_locale_core", | ||
| "postcard", | ||
| "serde", | ||
| "stable_deref_trait", | ||
| "writeable", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "is-terminal" | ||
| version = "0.4.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" | ||
| dependencies = [ | ||
| "hermit-abi", | ||
| "libc", | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "itertools" | ||
| version = "0.10.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" | ||
| dependencies = [ | ||
| "either", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" | ||
| [[package]] | ||
| name = "js-sys" | ||
| version = "0.3.91" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" | ||
| dependencies = [ | ||
| "once_cell", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "libc" | ||
| version = "0.2.183" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" | ||
| [[package]] | ||
| name = "litemap" | ||
| version = "0.8.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" | ||
| dependencies = [ | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.8.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" | ||
| [[package]] | ||
| name = "num-traits" | ||
| version = "0.2.19" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" | ||
| dependencies = [ | ||
| "autocfg", | ||
| ] | ||
| [[package]] | ||
| name = "once_cell" | ||
| version = "1.21.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" | ||
| [[package]] | ||
| name = "oorandom" | ||
| version = "11.1.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" | ||
| [[package]] | ||
| name = "plotters" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" | ||
| dependencies = [ | ||
| "num-traits", | ||
| "plotters-backend", | ||
| "plotters-svg", | ||
| "wasm-bindgen", | ||
| "web-sys", | ||
| ] | ||
| [[package]] | ||
| name = "plotters-backend" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" | ||
| [[package]] | ||
| name = "plotters-svg" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" | ||
| dependencies = [ | ||
| "plotters-backend", | ||
| ] | ||
| [[package]] | ||
| name = "postcard" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" | ||
| dependencies = [ | ||
| "cobs", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.106" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.45" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "rayon" | ||
| version = "1.10.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" | ||
| dependencies = [ | ||
| "either", | ||
| "rayon-core", | ||
| ] | ||
| [[package]] | ||
| name = "rayon-core" | ||
| version = "1.12.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" | ||
| dependencies = [ | ||
| "crossbeam-deque", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "regex" | ||
| version = "1.12.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-automata", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-automata" | ||
| version = "0.4.14" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-syntax" | ||
| version = "0.8.10" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" | ||
| [[package]] | ||
| name = "rustversion" | ||
| version = "1.0.22" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" | ||
| [[package]] | ||
| name = "same-file" | ||
| version = "1.0.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" | ||
| dependencies = [ | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_core" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.149" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "serde", | ||
| "serde_core", | ||
| "zmij", | ||
| ] | ||
| [[package]] | ||
| name = "smallvec" | ||
| version = "1.15.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" | ||
| [[package]] | ||
| name = "stable_deref_trait" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.117" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "synstructure" | ||
| version = "0.13.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror" | ||
| version = "2.0.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" | ||
| dependencies = [ | ||
| "thiserror-impl", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror-impl" | ||
| version = "2.0.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "tinystr" | ||
| version = "0.8.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" | ||
| dependencies = [ | ||
| "displaydoc", | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "tinytemplate" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_json", | ||
| ] | ||
| [[package]] | ||
| name = "typeid" | ||
| version = "1.0.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" | ||
| [[package]] | ||
| name = "typenum" | ||
| version = "1.19.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.24" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" | ||
| [[package]] | ||
| name = "utf16_iter" | ||
| version = "1.0.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" | ||
| [[package]] | ||
| name = "utf8_iter" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | ||
| [[package]] | ||
| name = "walkdir" | ||
| version = "2.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" | ||
| dependencies = [ | ||
| "same-file", | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen" | ||
| version = "0.2.114" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "once_cell", | ||
| "rustversion", | ||
| "wasm-bindgen-macro", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro" | ||
| version = "0.2.114" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" | ||
| dependencies = [ | ||
| "quote", | ||
| "wasm-bindgen-macro-support", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro-support" | ||
| version = "0.2.114" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" | ||
| dependencies = [ | ||
| "bumpalo", | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-shared" | ||
| version = "0.2.114" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "web-sys" | ||
| version = "0.3.91" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" | ||
| dependencies = [ | ||
| "js-sys", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "winapi-util" | ||
| version = "0.1.11" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" | ||
| dependencies = [ | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "windows-link" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" | ||
| [[package]] | ||
| name = "windows-sys" | ||
| version = "0.61.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" | ||
| dependencies = [ | ||
| "windows-link", | ||
| ] | ||
| [[package]] | ||
| name = "write16" | ||
| version = "1.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" | ||
| dependencies = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" | ||
| [[package]] | ||
| name = "yoke" | ||
| version = "0.8.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" | ||
| dependencies = [ | ||
| "stable_deref_trait", | ||
| "yoke-derive", | ||
| "zerofrom", | ||
| ] | ||
| [[package]] | ||
| name = "yoke-derive" | ||
| version = "0.8.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" | ||
| dependencies = [ | ||
| "zerofrom-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom-derive" | ||
| version = "0.1.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerotrie" | ||
| version = "0.2.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde_core", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" | ||
| dependencies = [ | ||
| "databake", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec-derive" | ||
| version = "0.11.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "zmij" | ||
| version = "1.0.21" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.86" | ||
| name = "icu_normalizer" | ||
| version = "2.2.0" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "API for normalizing text into Unicode Normalization Forms" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| keywords = [ | ||
| "unicode", | ||
| "normalization", | ||
| "text-processing", | ||
| ] | ||
| categories = [ | ||
| "internationalization", | ||
| "localization", | ||
| "no-std", | ||
| "embedded", | ||
| ] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [package.metadata.docs.rs] | ||
| all-features = true | ||
| [package.metadata.cargo-all-features] | ||
| max_combination_size = 3 | ||
| [features] | ||
| compiled_data = [ | ||
| "dep:icu_normalizer_data", | ||
| "icu_properties?/compiled_data", | ||
| "icu_provider/baked", | ||
| ] | ||
| datagen = [ | ||
| "serde", | ||
| "dep:databake", | ||
| "icu_properties", | ||
| "icu_collections/databake", | ||
| "zerovec/databake", | ||
| "icu_properties?/datagen", | ||
| "icu_provider/export", | ||
| ] | ||
| default = [ | ||
| "compiled_data", | ||
| "utf8_iter", | ||
| "utf16_iter", | ||
| ] | ||
| harfbuzz_traits = ["dep:harfbuzz-traits"] | ||
| icu_properties = ["dep:icu_properties"] | ||
| serde = [ | ||
| "dep:serde", | ||
| "icu_collections/serde", | ||
| "zerovec/serde", | ||
| "icu_properties?/serde", | ||
| "icu_provider/serde", | ||
| ] | ||
| utf16_iter = [ | ||
| "dep:utf16_iter", | ||
| "dep:write16", | ||
| ] | ||
| utf8_iter = ["dep:utf8_iter"] | ||
| write16 = [] | ||
| [lib] | ||
| name = "icu_normalizer" | ||
| path = "src/lib.rs" | ||
| [[test]] | ||
| name = "tests" | ||
| path = "tests/tests.rs" | ||
| [[bench]] | ||
| name = "bench" | ||
| path = "benches/bench.rs" | ||
| harness = false | ||
| required-features = [ | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| ] | ||
| [[bench]] | ||
| name = "canonical_composition" | ||
| path = "benches/canonical_composition.rs" | ||
| [[bench]] | ||
| name = "canonical_decomposition" | ||
| path = "benches/canonical_decomposition.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfc" | ||
| path = "benches/composing_normalizer_nfc.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfkc" | ||
| path = "benches/composing_normalizer_nfkc.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfd" | ||
| path = "benches/decomposing_normalizer_nfd.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfkd" | ||
| path = "benches/decomposing_normalizer_nfkd.rs" | ||
| [[bench]] | ||
| name = "utf16_throughput" | ||
| path = "benches/utf16_throughput.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| features = ["derive"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.harfbuzz-traits] | ||
| version = "0.6.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_collections] | ||
| version = "~2.2.0" | ||
| default-features = false | ||
| [dependencies.icu_normalizer_data] | ||
| version = "~2.2.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_properties] | ||
| version = "~2.2.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_provider] | ||
| version = "2.2.0" | ||
| default-features = false | ||
| [dependencies.serde] | ||
| version = "1.0.220" | ||
| features = [ | ||
| "derive", | ||
| "alloc", | ||
| ] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.smallvec] | ||
| version = "1.10.0" | ||
| default-features = false | ||
| [dependencies.utf16_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.utf8_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.write16] | ||
| version = "1.0.0" | ||
| features = ["alloc"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.6" | ||
| default-features = false | ||
| [dev-dependencies.arraystring] | ||
| version = "0.3.0" | ||
| [dev-dependencies.arrayvec] | ||
| version = "0.7.2" | ||
| default-features = false | ||
| [dev-dependencies.atoi] | ||
| version = "2.0.0" | ||
| [dev-dependencies.detone] | ||
| version = "1.0.0" | ||
| [dev-dependencies.write16] | ||
| version = "1.0.0" | ||
| features = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| default-features = false | ||
| [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion] | ||
| version = "0.5.0" | ||
| [lints.clippy] | ||
| alloc-instead-of-core = "warn" | ||
| branches-sharing-code = "warn" | ||
| collection_is_never_read = "warn" | ||
| crosspointer_transmute = "warn" | ||
| dbg_macro = "warn" | ||
| debug_assert_with_mut_call = "warn" | ||
| doc_markdown = "warn" | ||
| exhaustive_enums = "deny" | ||
| exhaustive_structs = "deny" | ||
| fn_to_numeric_cast_any = "warn" | ||
| infinite_loop = "warn" | ||
| large_stack_arrays = "warn" | ||
| mismatching_type_param_order = "warn" | ||
| missing_fields_in_debug = "warn" | ||
| missing_transmute_annotations = "warn" | ||
| negative_feature_names = "warn" | ||
| or-fun-call = "warn" | ||
| same_functions_in_if_condition = "warn" | ||
| todo = "warn" | ||
| transmute_bytes_to_str = "warn" | ||
| transmute_int_to_bool = "warn" | ||
| transmute_int_to_non_zero = "warn" | ||
| transmute_ptr_to_ptr = "warn" | ||
| transmute_ptr_to_ref = "warn" | ||
| transmute_undefined_repr = "warn" | ||
| transmutes_expressible_as_ptr_casts = "warn" | ||
| trivially_copy_pass_by_ref = "deny" | ||
| unnecessary-wraps = "warn" | ||
| useless_transmute = "warn" | ||
| wildcard_dependencies = "warn" | ||
| [lints.rust] | ||
| missing_debug_implementations = "deny" | ||
| trivial_numeric_casts = "deny" | ||
| unused_lifetimes = "warn" | ||
| unused_macro_rules = "warn" | ||
| unused_qualifications = "warn" | ||
| [lints.rust.unexpected_cfgs] | ||
| level = "warn" | ||
| priority = 0 | ||
| check-cfg = [ | ||
| "cfg(icu4c_enable_renaming)", | ||
| "cfg(needs_alloc_error_handler)", | ||
| "cfg(icu4x_run_size_tests)", | ||
| "cfg(icu4x_unstable_fast_trie_only)", | ||
| ] |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # icu_normalizer [crates.io](https://crates.io/crates/icu_normalizer) | ||
| <!-- cargo-rdme start --> | ||
| Normalizing text into Unicode Normalization Forms. | ||
| This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) | ||
| and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. | ||
| ## Functionality | ||
| The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode | ||
| Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. | ||
| Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, | ||
| and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. | ||
| The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA | ||
| Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by | ||
| applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the | ||
| [`idna`](https://docs.rs/idna/latest/idna/) crate. | ||
| The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and | ||
| the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class | ||
| property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/); the types | ||
| [`CanonicalComposition`](properties::CanonicalComposition), [`CanonicalDecomposition`](properties::CanonicalDecomposition), | ||
| and [`CanonicalCombiningClassMap`](properties::CanonicalCombiningClassMap) implement the [`harfbuzz_traits`] if | ||
| the `harfbuzz_traits` Cargo feature is enabled. | ||
| Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in | ||
| addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive | ||
| non-“maybe” answer. | ||
| ## Examples | ||
| ```rust | ||
| let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); | ||
| assert_eq!(nfc.normalize("a\u{0308}"), "ä"); | ||
| assert!(nfc.is_normalized("ä")); | ||
| let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); | ||
| assert_eq!(nfd.normalize("ä"), "a\u{0308}"); | ||
| assert!(!nfd.is_normalized("ä")); | ||
| ``` | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use crate::properties::{ | ||
| CanonicalCombiningClassMap, CanonicalCombiningClassMapBorrowed, CanonicalComposition, | ||
| CanonicalCompositionBorrowed, CanonicalDecomposition, CanonicalDecompositionBorrowed, | ||
| Decomposed, | ||
| }; | ||
| use harfbuzz_traits::{CombiningClassFunc, ComposeFunc, DecomposeFunc}; | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl ComposeFunc for CanonicalCompositionBorrowed<'_> { | ||
| fn compose(&self, a: char, b: char) -> Option<char> { | ||
| CanonicalCompositionBorrowed::compose(*self, a, b) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl ComposeFunc for CanonicalComposition { | ||
| fn compose(&self, a: char, b: char) -> Option<char> { | ||
| ComposeFunc::compose(&self.as_borrowed(), a, b) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl ComposeFunc for &'_ CanonicalComposition { | ||
| fn compose(&self, a: char, b: char) -> Option<char> { | ||
| ComposeFunc::compose(&self.as_borrowed(), a, b) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl DecomposeFunc for CanonicalDecompositionBorrowed<'_> { | ||
| fn decompose(&self, ab: char) -> Option<(char, char)> { | ||
| match CanonicalDecompositionBorrowed::decompose(self, ab) { | ||
| Decomposed::Default => None, | ||
| Decomposed::Expansion(first, second) => Some((first, second)), | ||
| Decomposed::Singleton(single) => Some((single, '\0')), | ||
| } | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl DecomposeFunc for CanonicalDecomposition { | ||
| fn decompose(&self, ab: char) -> Option<(char, char)> { | ||
| DecomposeFunc::decompose(&self.as_borrowed(), ab) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl DecomposeFunc for &'_ CanonicalDecomposition { | ||
| fn decompose(&self, ab: char) -> Option<(char, char)> { | ||
| DecomposeFunc::decompose(&self.as_borrowed(), ab) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl CombiningClassFunc for CanonicalCombiningClassMapBorrowed<'_> { | ||
| fn combining_class(&self, ch: char) -> u8 { | ||
| self.get_u8(ch) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl CombiningClassFunc for CanonicalCombiningClassMap { | ||
| fn combining_class(&self, ch: char) -> u8 { | ||
| CombiningClassFunc::combining_class(&self.as_borrowed(), ch) | ||
| } | ||
| } | ||
| /// ✨ *Enabled with the `harfbuzz_traits` Cargo feature.* | ||
| impl CombiningClassFunc for &'_ CanonicalCombiningClassMap { | ||
| fn combining_class(&self, ch: char) -> u8 { | ||
| CombiningClassFunc::combining_class(&self.as_borrowed(), ch) | ||
| } | ||
| } |
Sorry, the diff of this file is too big to display
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Access to the Unicode properties or property-based operations that | ||
| //! are required for NFC and NFD. | ||
| //! | ||
| //! Applications should generally use the full normalizers that are | ||
| //! provided at the top level of this crate. However, the APIs in this | ||
| //! module are provided for callers such as HarfBuzz that specifically | ||
| //! want access to the raw canonical composition operation e.g. for use in a | ||
| //! glyph-availability-guided custom normalizer. | ||
| use crate::char_from_u16; | ||
| use crate::char_from_u32; | ||
| use crate::in_inclusive_range; | ||
| use crate::provider::CanonicalCompositions; | ||
| use crate::provider::DecompositionData; | ||
| use crate::provider::DecompositionTables; | ||
| use crate::provider::NonRecursiveDecompositionSupplement; | ||
| use crate::provider::NormalizerNfcV1; | ||
| use crate::provider::NormalizerNfdDataV1; | ||
| use crate::provider::NormalizerNfdSupplementV1; | ||
| use crate::provider::NormalizerNfdTablesV1; | ||
| use crate::trie_value_has_ccc; | ||
| use crate::CanonicalCombiningClass; | ||
| use crate::BACKWARD_COMBINING_MARKER; | ||
| use crate::FDFA_MARKER; | ||
| use crate::HANGUL_L_BASE; | ||
| use crate::HANGUL_N_COUNT; | ||
| use crate::HANGUL_S_BASE; | ||
| use crate::HANGUL_S_COUNT; | ||
| use crate::HANGUL_T_BASE; | ||
| use crate::HANGUL_T_COUNT; | ||
| use crate::HANGUL_V_BASE; | ||
| use crate::HIGH_ZEROS_MASK; | ||
| use crate::LOW_ZEROS_MASK; | ||
| use crate::NON_ROUND_TRIP_MARKER; | ||
| use icu_provider::prelude::*; | ||
| /// Borrowed version of the raw canonical composition operation. | ||
| /// | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to the raw canonical composition operation e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug, Copy, Clone)] | ||
| pub struct CanonicalCompositionBorrowed<'a> { | ||
| canonical_compositions: &'a CanonicalCompositions<'a>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCompositionBorrowed<'static> {
    /// Same as [`CanonicalCompositionBorrowed::new`].
    fn default() -> Self {
        CanonicalCompositionBorrowed::new()
    }
}
| impl CanonicalCompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalComposition { | ||
| CanonicalComposition { | ||
| canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalComposition` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Self { | ||
| canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCompositionBorrowed<'_> { | ||
| /// Performs canonical composition (including Hangul) on a pair of | ||
| /// characters or returns `None` if these characters don't compose. | ||
| /// Composition exclusions are taken into account. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters | ||
| /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä')); | ||
| /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ')); | ||
| /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion | ||
| /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter | ||
| /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV | ||
| /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT | ||
| /// ``` | ||
| #[inline(always)] | ||
| pub fn compose(self, starter: char, second: char) -> Option<char> { | ||
| crate::compose( | ||
| self.canonical_compositions.canonical_compositions.iter(), | ||
| starter, | ||
| second, | ||
| ) | ||
| } | ||
| } | ||
| /// The raw canonical composition operation. | ||
| /// | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to the raw canonical composition operation e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalComposition { | ||
| canonical_compositions: DataPayload<NormalizerNfcV1>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
    /// Builds the owned type from compiled data: `new()` yields the borrowed
    /// `'static` form, which is then converted with `static_to_owned`.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl CanonicalComposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> { | ||
| CanonicalCompositionBorrowed { | ||
| canonical_compositions: self.canonical_compositions.get(), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCompositionBorrowed<'static> { | ||
| CanonicalCompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfcV1> + ?Sized, | ||
| { | ||
| let canonical_compositions: DataPayload<NormalizerNfcV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalComposition { | ||
| canonical_compositions, | ||
| }) | ||
| } | ||
| } | ||
/// Result of applying non-recursive canonical decomposition to one character.
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, PartialEq, Eq)]
pub enum Decomposed {
    /// The character does not decompose: it is its own canonical decomposition.
    Default,
    /// The character decomposes to exactly one character that differs from it.
    Singleton(char),
    /// The character decomposes to a pair of characters.
    Expansion(char, char),
}
| /// Borrowed version of the raw (non-recursive) canonical decomposition operation. | ||
| /// | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to non-recursive canonical decomposition e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecompositionBorrowed<'a> { | ||
| decompositions: &'a DecompositionData<'a>, | ||
| tables: &'a DecompositionTables<'a>, | ||
| non_recursive: &'a NonRecursiveDecompositionSupplement<'a>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecompositionBorrowed<'static> {
    /// Same as [`CanonicalDecompositionBorrowed::new`].
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new()
    }
}
| impl CanonicalDecompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalDecomposition { | ||
| CanonicalDecomposition { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| tables: DataPayload::from_static_ref(self.tables), | ||
| non_recursive: DataPayload::from_static_ref(self.non_recursive), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| const _: () = assert!( | ||
| crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars16 | ||
| .const_len() | ||
| + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars24 | ||
| .const_len() | ||
| <= 0xFFF, | ||
| "future extension" | ||
| ); | ||
| Self { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, | ||
| non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalDecompositionBorrowed<'_> { | ||
| /// Performs non-recursive canonical decomposition (including for Hangul): precomposed Hangul syllables are split arithmetically, every other character is looked up in the data. | ||
| /// | ||
| /// ``` | ||
| /// use icu::normalizer::properties::Decomposed; | ||
| /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(decomp.decompose('e'), Decomposed::Default); | ||
| /// assert_eq!( | ||
| /// decomp.decompose('ệ'), | ||
| /// Decomposed::Expansion('ẹ', '\u{0302}') | ||
| /// ); | ||
| /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ')); | ||
| /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia | ||
| /// ``` | ||
| #[inline] | ||
| pub fn decompose(&self, c: char) -> Decomposed { | ||
| let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE); | ||
| if lvt >= HANGUL_S_COUNT { | ||
| return self.decompose_non_hangul(c); | ||
| } | ||
| // Invariant: lvt < HANGUL_S_COUNT. NOTE(review): this comment used to quote 1172; the Unicode Hangul syllable count is presumably 11172, confirm against the constant. | ||
| let t = lvt % HANGUL_T_COUNT; | ||
| // Invariant: t < HANGUL_T_COUNT (28), because t is a remainder modulo HANGUL_T_COUNT | ||
| if t == 0 { | ||
| let l = lvt / HANGUL_N_COUNT; | ||
| // Invariant: l ≤ (HANGUL_S_COUNT - 1) / HANGUL_N_COUNT, where HANGUL_N_COUNT is 588 | ||
| let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT; | ||
| // Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21) | ||
| return Decomposed::Expansion( | ||
| // Safety: HANGUL_L_BASE and HANGUL_V_BASE are 0x1nnn; adding l and v, which are bounded | ||
| // by the invariants above, keeps the sums far below the surrogate range starting at 0xD800 | ||
| unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) }, | ||
| ); | ||
| } | ||
| let lv = lvt - t; | ||
| // Invariant: lv < lvt < HANGUL_S_COUNT and 0 < t < HANGUL_T_COUNT | ||
| // Safe because values known to be in range | ||
| Decomposed::Expansion( | ||
| // Safety: HANGUL_S_BASE + lv lies between HANGUL_S_BASE and `c` (presumably all inside the Hangul Syllables block, below 0xD800; confirm); | ||
| // HANGUL_T_BASE is 0x1nnn and t < HANGUL_T_COUNT, so that sum also stays below 0xD800 | ||
| unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) }, | ||
| ) | ||
| } | ||
| /// Performs non-recursive canonical decomposition except Hangul syllables | ||
| /// are reported as `Decomposed::Default`. | ||
| #[inline(always)] | ||
| fn decompose_non_hangul(&self, c: char) -> Decomposed { | ||
| let decomposition = self.decompositions.trie.get(c); | ||
| // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, | ||
| // and that flag needs to be ignored here. | ||
| if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { | ||
| return Decomposed::Default; | ||
| } | ||
| // This `loop` never iterates: each `break` is a forward goto to the supplementary-trie lookup below the loop, and every other path returns | ||
| #[expect(clippy::never_loop)] | ||
| loop { | ||
| let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0; | ||
| let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0; | ||
| if !high_zeros && !low_zeros { | ||
| // Decomposition into two BMP characters: starter and non-starter | ||
| if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') { | ||
| // Look in the other trie due to oxia singleton | ||
| // mappings to corresponding character with tonos. | ||
| break; | ||
| } | ||
| let starter = char_from_u32(decomposition & 0x7FFF); | ||
| let combining = char_from_u32((decomposition >> 15) & 0x7FFF); | ||
| return Decomposed::Expansion(starter, combining); | ||
| } | ||
| if high_zeros { | ||
| // Decomposition into one BMP character or non-starter | ||
| if trie_value_has_ccc(decomposition) { | ||
| // Non-starter: only the seven characters matched below decompose; they are hard-coded instead of read from data | ||
| if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') { | ||
| return Decomposed::Default; | ||
| } | ||
| return match c { | ||
| '\u{0340}' => { | ||
| // COMBINING GRAVE TONE MARK | ||
| Decomposed::Singleton('\u{0300}') | ||
| } | ||
| '\u{0341}' => { | ||
| // COMBINING ACUTE TONE MARK | ||
| Decomposed::Singleton('\u{0301}') | ||
| } | ||
| '\u{0343}' => { | ||
| // COMBINING GREEK KORONIS | ||
| Decomposed::Singleton('\u{0313}') | ||
| } | ||
| '\u{0344}' => { | ||
| // COMBINING GREEK DIALYTIKA TONOS | ||
| Decomposed::Expansion('\u{0308}', '\u{0301}') | ||
| } | ||
| '\u{0F73}' => { | ||
| // TIBETAN VOWEL SIGN II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F72}') | ||
| } | ||
| '\u{0F75}' => { | ||
| // TIBETAN VOWEL SIGN UU | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F74}') | ||
| } | ||
| '\u{0F81}' => { | ||
| // TIBETAN VOWEL SIGN REVERSED II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F80}') | ||
| } | ||
| _ => Decomposed::Default, | ||
| }; | ||
| } | ||
| let singleton = decomposition as u16; | ||
| debug_assert_ne!( | ||
| singleton, FDFA_MARKER, | ||
| "How come we got the U+FDFA NFKD marker here?" | ||
| ); | ||
| return Decomposed::Singleton(char_from_u16(singleton)); | ||
| } | ||
| if c == '\u{212B}' { | ||
| // ANGSTROM SIGN | ||
| return Decomposed::Singleton('\u{00C5}'); | ||
| } | ||
| // The offset is stored plus one in bits 16..=29 (hence the `- 1`). Only 12 of 14 bits used as of Unicode 16. | ||
| let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1; | ||
| // The low four bits encode the length. Only 3 of 4 bits used as of Unicode 16. | ||
| let len_bits = decomposition & 0b1111; | ||
| let tables = self.tables; | ||
| if offset < tables.scalars16.len() { | ||
| if len_bits != 0 { | ||
| // i.e. logical len isn't 2 | ||
| break; | ||
| } | ||
| if let Some(first) = tables.scalars16.get(offset) { | ||
| if let Some(second) = tables.scalars16.get(offset + 1) { | ||
| // Two BMP starters | ||
| return Decomposed::Expansion(char_from_u16(first), char_from_u16(second)); | ||
| } | ||
| } | ||
| // GIGO case: the two-unit entry does not fit inside `scalars16` | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let len = len_bits + 1; | ||
| if len > 2 { | ||
| break; | ||
| } | ||
| let offset24 = offset - tables.scalars16.len(); | ||
| if let Some(first_c) = tables.scalars24.get(offset24) { | ||
| if len == 1 { | ||
| return Decomposed::Singleton(first_c); | ||
| } | ||
| if let Some(second_c) = tables.scalars24.get(offset24 + 1) { | ||
| return Decomposed::Expansion(first_c, second_c); | ||
| } | ||
| } | ||
| // GIGO case: the entry does not fit inside `scalars24` | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let non_recursive = self.non_recursive; | ||
| let non_recursive_decomposition = non_recursive.trie.get(c); | ||
| if non_recursive_decomposition == 0 { | ||
| // GIGO case: the main trie sent us here but the supplementary trie has no entry for `c` | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let trail_or_complex = (non_recursive_decomposition >> 16) as u16; | ||
| let lead = non_recursive_decomposition as u16; | ||
| if lead != 0 && trail_or_complex != 0 { | ||
| // Decomposition into two BMP characters | ||
| return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex)); | ||
| } | ||
| if lead != 0 { | ||
| // Decomposition into one BMP character | ||
| return Decomposed::Singleton(char_from_u16(lead)); | ||
| } | ||
| // Decomposition into two non-BMP characters | ||
| // `trail_or_complex` (the upper 16 bits) is an offset into `scalars24` plus one to keep it non-zero. | ||
| let offset = usize::from(trail_or_complex - 1); | ||
| if let Some(first) = non_recursive.scalars24.get(offset) { | ||
| if let Some(second) = non_recursive.scalars24.get(offset + 1) { | ||
| return Decomposed::Expansion(first, second); | ||
| } | ||
| } | ||
| // GIGO case: the pair does not fit inside the supplementary `scalars24` | ||
| debug_assert!(false); | ||
| Decomposed::Default | ||
| } | ||
| } | ||
| /// The raw (non-recursive) canonical decomposition operation. | ||
| /// | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to non-recursive canonical decomposition e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecomposition { | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| tables: DataPayload<NormalizerNfdTablesV1>, | ||
| non_recursive: DataPayload<NormalizerNfdSupplementV1>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecomposition {
    /// Builds the owned type from compiled data: `new()` yields the borrowed
    /// `'static` form, which is then converted with `static_to_owned`.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl CanonicalDecomposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> { | ||
| CanonicalDecompositionBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| tables: self.tables.get(), | ||
| non_recursive: self.non_recursive.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalDecompositionBorrowed<'static> { | ||
| CanonicalDecompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfdSupplementV1> | ||
| + ?Sized, | ||
| { | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; | ||
| if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { | ||
| // The data is from a future where there exists a normalization flavor whose | ||
| // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points | ||
| // of space. If a good use case from such a decomposition flavor arises, we can | ||
| // dynamically change the bit masks so that the length mask becomes 0x1FFF instead | ||
| // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, | ||
| // since for now the masks are hard-coded, error out. | ||
| return Err(DataError::custom("future extension")); | ||
| } | ||
| let non_recursive: DataPayload<NormalizerNfdSupplementV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalDecomposition { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| }) | ||
| } | ||
| } | ||
| /// Borrowed version of lookup of the `Canonical_Combining_Class` Unicode property. | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// use icu::properties::props::CanonicalCombiningClass; | ||
| /// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed; | ||
| /// | ||
| /// let map = CanonicalCombiningClassMapBorrowed::new(); | ||
| /// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A | ||
| /// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMapBorrowed<'a> { | ||
| /// The data trie | ||
| decompositions: &'a DecompositionData<'a>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMapBorrowed<'static> {
    /// Same as [`CanonicalCombiningClassMapBorrowed::new`].
    fn default() -> Self {
        CanonicalCombiningClassMapBorrowed::new()
    }
}
| impl CanonicalCombiningClassMapBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalCombiningClassMap { | ||
| CanonicalCombiningClassMap { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'_> { | ||
| /// Look up the canonical combining class for a scalar value. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class, | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| #[inline(always)] | ||
| pub fn get_u8(&self, c: char) -> u8 { | ||
| let trie_value = self.decompositions.trie.get(c); | ||
| if trie_value_has_ccc(trie_value) { | ||
| trie_value as u8 | ||
| } else { | ||
| ccc!(NotReordered, 0).to_icu4c_value() | ||
| } | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `Not_Reordered` is returned. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class, | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| pub fn get32_u8(&self, c: u32) -> u8 { | ||
| let trie_value = self.decompositions.trie.get32(c); | ||
| if trie_value_has_ccc(trie_value) { | ||
| trie_value as u8 | ||
| } else { | ||
| ccc!(NotReordered, 0).to_icu4c_value() | ||
| } | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[inline(always)] | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get(&self, c: char) -> CanonicalCombiningClass { | ||
| CanonicalCombiningClass::from_icu4c_value(self.get_u8(c)) | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `CanonicalCombiningClass::NotReordered` is returned. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get32(&self, c: u32) -> CanonicalCombiningClass { | ||
| CanonicalCombiningClass::from_icu4c_value(self.get32_u8(c)) | ||
| } | ||
| } | ||
| /// Lookup of the `Canonical_Combining_Class` Unicode property. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMap { | ||
| /// The data trie | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
    /// Builds the owned map from compiled data: `new()` yields the borrowed
    /// `'static` form, which is then converted with `static_to_owned`.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl CanonicalCombiningClassMap { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> { | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> { | ||
| CanonicalCombiningClassMapBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ]); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> + ?Sized, | ||
| { | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalCombiningClassMap { decompositions }) | ||
| } | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. | ||
| //! | ||
| //! <div class="stab unstable"> | ||
| //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| //! including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| //! to be stable, their Rust representation might not be. Use with caution. | ||
| //! </div> | ||
| //! | ||
| //! Read more about data providers: [`icu_provider`] | ||
| // Provider structs must be stable | ||
| #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] | ||
| use icu_collections::char16trie::Char16Trie; | ||
| use icu_collections::codepointtrie::CodePointTrie; | ||
| use icu_provider::prelude::*; | ||
| use zerovec::ZeroVec; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[derive(Debug)] | ||
| /// Baked data: the unit type that the compiled-data provider macros below are applied to. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. In particular, the `DataProvider` implementations are only | ||
| /// guaranteed to match with this version's `*_unstable` providers. Use with caution. | ||
| /// </div> | ||
| pub struct Baked; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(unused_imports)] | ||
| const _: () = { | ||
| use icu_normalizer_data::*; | ||
| pub mod icu { | ||
| pub use crate as normalizer; | ||
| pub use icu_collections as collections; | ||
| } | ||
| make_provider!(Baked); | ||
| impl_normalizer_nfc_v1!(Baked); | ||
| impl_normalizer_nfd_data_v1!(Baked); | ||
| impl_normalizer_nfd_supplement_v1!(Baked); | ||
| impl_normalizer_nfd_tables_v1!(Baked); | ||
| impl_normalizer_nfkd_data_v1!(Baked); | ||
| impl_normalizer_nfkd_tables_v1!(Baked); | ||
| impl_normalizer_uts46_data_v1!(Baked); | ||
| }; | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for canonical decomposition. | ||
| NormalizerNfdDataV1, | ||
| "normalizer/nfd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for canonical decomposition. | ||
| NormalizerNfdTablesV1, | ||
| "normalizer/nfd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for compatibility decomposition. | ||
| NormalizerNfkdDataV1, | ||
| "normalizer/nfkd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for compatibility decomposition. | ||
| NormalizerNfkdTablesV1, | ||
| "normalizer/nfkd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for UTS-46 decomposition. | ||
| NormalizerUts46DataV1, | ||
| "normalizer/uts46/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for composition. | ||
| NormalizerNfcV1, | ||
| "normalizer/nfc/v1", | ||
| CanonicalCompositions<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for non-recursive decomposition. | ||
| NormalizerNfdSupplementV1, | ||
| "normalizer/nfd/supplement/v1", | ||
| NonRecursiveDecompositionSupplement<'static>, | ||
| is_singleton = true | ||
| ); | ||
| #[cfg(feature = "datagen")] | ||
| /// The latest minimum set of data markers required by this component (`datagen` Cargo feature only). | ||
| pub const MARKERS: &[DataMarkerInfo] = &[ | ||
| NormalizerNfcV1::INFO, | ||
| NormalizerNfdDataV1::INFO, | ||
| NormalizerNfdTablesV1::INFO, | ||
| NormalizerNfkdDataV1::INFO, | ||
| NormalizerNfkdTablesV1::INFO, | ||
| NormalizerNfdSupplementV1::INFO, | ||
| NormalizerUts46DataV1::INFO, | ||
| ]; | ||
| /// Decomposition data: a code point trie plus a passthrough cap. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionData<'data> { | ||
| /// Trie for decomposition; each code point maps to a `u32` trie value. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// The passthrough bounds of NFD/NFC are lowered to this | ||
| /// maximum instead. (16-bit, because it cannot be higher | ||
| /// than 0x0300, which is the bound for NFC.) | ||
| pub passthrough_cap: u16, | ||
| } | ||
| icu_provider::data_struct!( | ||
| DecompositionData<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// The expansion tables for cases where the decomposition isn't | ||
| /// contained in the trie value itself. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionTables<'data> { | ||
| /// Decompositions that are fully within the BMP, stored as `u16` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars16: ZeroVec<'data, u16>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP, stored as `char` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| DecompositionTables<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-Hangul canonical compositions, stored in a `Char16Trie`. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct CanonicalCompositions<'data> { | ||
| /// Trie keys are two-`char` strings with the second | ||
| /// character coming first. The value, if any, is the | ||
| /// (non-Hangul) canonical composition of that pair. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub canonical_compositions: Char16Trie<'data>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| CanonicalCompositions<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-recursive canonical decompositions that differ from | ||
| /// those in `DecompositionData`. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct NonRecursiveDecompositionSupplement<'data> { | ||
| /// Trie for the supplementary non-recursive decompositions, with `u32` trie values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP, stored as `char` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| NonRecursiveDecompositionSupplement<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Bundles the part of UTS 46 that makes sense to implement as a | ||
| //! normalization. | ||
| //! | ||
| //! This is meant to be used as a building block of an UTS 46 | ||
| //! implementation, such as the `idna` crate. | ||
| use crate::ComposingNormalizer; | ||
| use crate::ComposingNormalizerBorrowed; | ||
| use crate::NormalizerNfcV1; | ||
| use crate::NormalizerNfdTablesV1; | ||
| use crate::NormalizerNfkdTablesV1; | ||
| use crate::NormalizerUts46DataV1; | ||
| use icu_provider::DataError; | ||
| use icu_provider::DataProvider; | ||
| // Implementation note: Despite merely wrapping a `ComposingNormalizer`, | ||
| // having a `Uts46Mapper` serves two purposes: | ||
| // | ||
| // 1. Denying public access to parts of the `ComposingNormalizer` API | ||
| // that don't work when the data contains markers for ignorables. | ||
| // 2. Providing a place where additional iterator pre-processing or | ||
| // post-processing can take place if needed in the future. (When | ||
| // writing this, it looked like such processing was needed but | ||
| // now isn't needed after all.) | ||
| /// A borrowed version of a mapper that knows how to perform the | ||
| /// subsets of UTS 46 processing documented on the methods. | ||
| #[derive(Debug)] | ||
| pub struct Uts46MapperBorrowed<'a> { | ||
| normalizer: ComposingNormalizerBorrowed<'a>, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Same as [`Uts46MapperBorrowed::new`]: a mapper backed by compiled data.
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
| impl Uts46MapperBorrowed<'static> { | ||
| /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`]. | ||
| pub const fn static_to_owned(self) -> Uts46Mapper { | ||
| Uts46Mapper { | ||
| normalizer: self.normalizer.static_to_owned(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Uts46MapperBorrowed { | ||
| normalizer: ComposingNormalizerBorrowed::new_uts46(), | ||
| } | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'_> { | ||
| /// Returns `true` iff the canonical combining class of `c` is 9 (Virama). | ||
| /// | ||
| /// This method uses the UTS 46 data and does not add a dependency on NFD | ||
| /// data like `CanonicalCombiningClassMapBorrowed` does. | ||
| #[inline] | ||
| pub fn is_virama(&self, c: char) -> bool { | ||
| let trie_val = self | ||
| .normalizer | ||
| .decomposing_normalizer | ||
| .decompositions | ||
| .trie | ||
| .get(c); | ||
| if crate::trie_value_has_ccc(trie_val) { | ||
| (trie_val as u8) == 9 | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the "Map" and "Normalize" steps of the "Processing" | ||
| /// section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The `ignored` characters are ignored. | ||
| /// 2. The `mapped` characters are mapped. | ||
| /// 3. The `disallowed` characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The `deviation` characters are treated as `mapped` or `valid` | ||
| /// as appropriate. | ||
| /// 5. The `disallowed_STD3_valid` characters are treated as allowed. | ||
| /// 6. The `disallowed_STD3_mapped` characters are treated as | ||
| /// `mapped`. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) | ||
| } | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the NFC check and statucs steps of the "Validity | ||
| /// Criteria" section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The `ignored` characters are treated as `disallowed`. | ||
| /// 2. The `mapped` characters are mapped. | ||
| /// 3. The `disallowed` characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The `deviation` characters are treated as `mapped` or `valid` | ||
| /// as appropriate. | ||
| /// 5. The `disallowed_STD3_valid` characters are treated as allowed. | ||
| /// 6. The `disallowed_STD3_mapped` characters are treated as | ||
| /// `mapped`. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| /// * The output needs to be compared with input to see if anything | ||
| /// changed. This check catches failures to adhere to the normalization | ||
| /// and status requirements. In particular, this comparison results | ||
| /// in _mapped_ characters resulting in error like "Validity Criteria" | ||
| /// requires. | ||
| pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) | ||
| } | ||
| } | ||
| /// A mapper that knows how to perform the subsets of UTS 46 processing | ||
| /// documented on the methods. | ||
| #[derive(Debug)] | ||
| pub struct Uts46Mapper { | ||
| normalizer: ComposingNormalizer, | ||
| } | ||
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Builds the compiled-data borrowed mapper and converts it to the owned type.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl Uts46Mapper { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> { | ||
| Uts46MapperBorrowed { | ||
| normalizer: self.normalizer.as_borrowed(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> Uts46MapperBorrowed<'static> { | ||
| Uts46MapperBorrowed::new() | ||
| } | ||
| /// Construct with provider. | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerUts46DataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfkdTablesV1> | ||
| // UTS 46 tables merged into NormalizerNfkdTablesV1 | ||
| + DataProvider<NormalizerNfcV1> | ||
| + ?Sized, | ||
| { | ||
| let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; | ||
| Ok(Uts46Mapper { normalizer }) | ||
| } | ||
| } |
| # This is a placeholder in the interest of keeping the repository size smaller. | ||
| # Replace this file with the contents of | ||
| # https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually | ||
| # run the conformance test. |
| The test data comes from | ||
| https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt |
Sorry, the diff of this file is too big to display
| { | ||
| "git": { | ||
| "sha1": "38a49da495248dd1ded84cf306e4ca42e64d5bb3" | ||
| }, | ||
| "path_in_vcs": "components/normalizer" | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{criterion_group, criterion_main}; | ||
| mod canonical_composition; | ||
| mod canonical_decomposition; | ||
| mod composing_normalizer_nfc; | ||
| mod composing_normalizer_nfkc; | ||
| mod decomposing_normalizer_nfd; | ||
| mod decomposing_normalizer_nfkd; | ||
| mod utf16_throughput; | ||
| criterion_group!( | ||
| benches, | ||
| canonical_composition::criterion_benchmark, | ||
| canonical_decomposition::criterion_benchmark, | ||
| composing_normalizer_nfc::criterion_benchmark, | ||
| composing_normalizer_nfkc::criterion_benchmark, | ||
| decomposing_normalizer_nfd::criterion_benchmark, | ||
| decomposing_normalizer_nfkd::criterion_benchmark, | ||
| utf16_throughput::criterion_benchmark, | ||
| ); | ||
| criterion_main!(benches); |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use detone::IterDecomposeVietnamese; | ||
| use icu_normalizer::properties::{ | ||
| CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed, | ||
| }; | ||
| use icu_normalizer::ComposingNormalizerBorrowed; | ||
/// One benchmark input for the canonical composition benchmark.
struct BenchDataContent {
    /// Data set label; becomes part of the benchmark ID.
    pub file_name: String,
    /// Character pairs that are handed to the composer under benchmark.
    pub pairs: Vec<(char, char)>,
}
/// Removes every line that starts with `#` and joins the remaining lines with `\n`.
///
/// Line terminators are dropped by `str::lines`, so the result never ends
/// with a newline.
fn strip_headers(content: &str) -> String {
    let kept: Vec<&str> = content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect();
    kept.join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 16] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| [ | ||
| BenchDataContent { | ||
| file_name: "TestNames_Latin".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_h".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_h.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_k".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_k.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Korean".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ar".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ar.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_de".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_de.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_el".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_el.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_es".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_es.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_fr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_fr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_he".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_he.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_pl".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_pl.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ru".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ru.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_th".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_th.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_tr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_tr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie_detone".to_owned(), | ||
| pairs: { | ||
| let result: Vec<(char, char)> = nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("data/wotw.txt"))) | ||
| .chars() | ||
| .filter_map(|c| { | ||
| let mut iter = std::iter::once(c).decompose_vietnamese_tones(true); | ||
| if let Some(base) = iter.next() { | ||
| iter.next().map(|tone| (base, tone)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| assert!(!result.is_empty()); | ||
| result | ||
| }, | ||
| }, | ||
| ] | ||
| } | ||
| fn function_under_bench( | ||
| canonical_composer: &CanonicalCompositionBorrowed, | ||
| composable_points: &[(char, char)], | ||
| ) { | ||
| for pair in composable_points.iter() { | ||
| canonical_composer.compose(pair.0, pair.1); | ||
| } | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_composition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let composer = CanonicalCompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } | ||
| fn decompose_data(nfc: &str) -> Vec<(char, char)> { | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| nfc.chars() | ||
| .map(|c| decomposer.decompose(c)) | ||
| .filter_map(|decomposed| { | ||
| if let Decomposed::Expansion(a, b) = decomposed { | ||
| Some((a, b)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect() | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::properties::CanonicalDecompositionBorrowed; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the canonical decomposition benchmark: a label
/// plus the same text in each of the four normalization forms.
struct BenchDataContent {
    /// Data set label; becomes part of the benchmark ID.
    pub file_name: String,
    /// The text normalized to NFC.
    pub nfc: String,
    /// The text normalized to NFD.
    pub nfd: String,
    /// The text normalized to NFKC.
    pub nfkc: String,
    /// The text normalized to NFKD.
    pub nfkd: String,
}
/// Returns `content` without the lines that start with `#`; the surviving
/// lines are separated by a single `\n` and there is no trailing newline.
fn strip_headers(content: &str) -> String {
    let mut stripped = String::with_capacity(content.len());
    let mut is_first = true;
    for line in content.lines() {
        if line.starts_with('#') {
            continue;
        }
        // The separator goes between kept lines only, never before the first one.
        if !is_first {
            stripped.push('\n');
        }
        is_first = false;
        stripped.push_str(line);
    }
    stripped
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc_normalizer.normalize(raw_content).to_string(), | ||
| nfd: nfd_normalizer.normalize(raw_content).to_string(), | ||
| nfkc: nfkc_normalizer.normalize(raw_content).to_string(), | ||
| nfkd: nfkd_normalizer.normalize(raw_content).to_string(), | ||
| }) | ||
| } | ||
| #[cfg(debug_assertions)] | ||
| fn function_under_bench( | ||
| _canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| _decomposable_points: &str, | ||
| ) { | ||
| // Intentionally empty: running the real body with debug assertions enabled fails some tests. | ||
| // "cargo test --bench bench" builds this no-op variant and will pass. | ||
| // "cargo bench" will work as expected, because its profile doesn't include debug assertions. | ||
| } | ||
/// Decomposes every character of `decomposable_points` with `canonical_decomposer`.
///
/// Each result is passed through `black_box` so the optimizer cannot discard
/// the otherwise unused `decompose` calls, which would leave the benchmark
/// measuring little more than the `chars()` iteration.
#[cfg(not(debug_assertions))]
fn function_under_bench(
    canonical_decomposer: &CanonicalDecompositionBorrowed,
    decomposable_points: &str,
) {
    for point in decomposable_points.chars() {
        black_box(canonical_decomposer.decompose(point));
    }
}
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_decomposition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the NFC composing normalizer benchmark: a label
/// plus the text in four normalization forms, each as `String` and as `u16` data.
struct BenchDataContent {
    /// Data set label.
    pub file_name: String,
    /// The text in NFC.
    pub nfc: String,
    /// The text in NFD.
    pub nfd: String,
    /// The text in NFKC.
    pub nfkc: String,
    /// The text in NFKD.
    pub nfkd: String,
    /// NOTE(review): presumably the UTF-16 code units of `nfc` — confirm against the constructor.
    pub nfc_u16: Vec<u16>,
    /// NOTE(review): presumably the UTF-16 code units of `nfd` — confirm against the constructor.
    pub nfd_u16: Vec<u16>,
    /// NOTE(review): presumably the UTF-16 code units of `nfkc` — confirm against the constructor.
    pub nfkc_u16: Vec<u16>,
    /// NOTE(review): presumably the UTF-16 code units of `nfkd` — confirm against the constructor.
    pub nfkd_u16: Vec<u16>,
}
/// Drops the lines beginning with `#` from `content` and returns the rest,
/// joined with `\n` (no trailing newline).
fn strip_headers(content: &str) -> String {
    let is_body_line = |line: &&str| !line.starts_with('#');
    let body_lines: Vec<&str> = content.lines().filter(is_body_line).collect();
    body_lines.join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF_16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkc_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkd_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One data file prepared for benchmarking: its text in all four
/// normalization forms, as UTF-8 strings and as UTF-16 code-unit vectors.
struct BenchDataContent {
    /// Label of the data file; it becomes part of every benchmark id.
    pub file_name: String,

    // UTF-8 variants, named after the normalizer that produced them.
    /// Text as produced by the NFC normalizer.
    pub nfc: String,
    /// Text as produced by the NFD normalizer.
    pub nfd: String,
    /// Text as produced by the NFKC normalizer.
    pub nfkc: String,
    /// Text as produced by the NFKD normalizer.
    pub nfkd: String,

    // UTF-16 variants of the four texts above.
    /// `nfc` encoded as UTF-16.
    pub nfc_u16: Vec<u16>,
    /// `nfd` encoded as UTF-16.
    pub nfd_u16: Vec<u16>,
    /// `nfkc` encoded as UTF-16.
    pub nfkc_u16: Vec<u16>,
    /// `nfkd` encoded as UTF-16.
    pub nfkd_u16: Vec<u16>,
}
/// Removes header lines from `content`.
///
/// Lines beginning with `#` are discarded. The lines that remain are joined
/// with a single `\n`; a line terminator at the very end of the input is not
/// reproduced.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices and join them once, rather than cloning every
    // surviving line into its own `String` first.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfkc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| # Generating microbench data | ||
| The full versions of these files are located | ||
| [in the ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data). | ||
| ## Sanitizing the file | ||
| ```shell | ||
| sed -i '/^#/d' ${filename} | ||
| sed -i '/^$/d' ${filename} | ||
| ``` | ||
| ## Shuffling the file | ||
| ```shell | ||
| shuf -n 20 ${filename} -o ${filename} | ||
| ``` | ||
| ## Adding back the header (if you plan on submitting the files) | ||
| ``` | ||
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ``` |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| かげやま,みのる | ||
| むらかみ,とおる | ||
| つじさわ,けい | ||
| やすい,たかゆき | ||
| むらさき,としお | ||
| はせがわ,ひであき | ||
| うるしばら,よしひこ | ||
| ままだ,ひろし | ||
| おおぼら,えいじろう | ||
| おおば,まさひで | ||
| きたばたけ,たかひこ | ||
| はまさき,あつし | ||
| ほりい,つねお | ||
| もり,だいいち | ||
| いとう,しんいち | ||
| くにもと,じゅんじ | ||
| おか,のりひと | ||
| たに,よしあき | ||
| しらがき,ひろあき | ||
| しらはま,たけひろ | ||
| むらかみ,やすひろ | ||
| うめはら,たかし | ||
| いわた,ひろし | ||
| すぎえ,かつとし | ||
| てらにし,ひろみつ | ||
| まつおか,だいすけ | ||
| もろほし,すすむ | ||
| いしはら,たかし | ||
| おしま,ひろお | ||
| なかお,ゆうじ | ||
| いかり,はるお | ||
| きまち,まさき | ||
| ふるかわ,みちお | ||
| かねこ,しゅうへい | ||
| なかがわ,ともみ | ||
| ささき,しんご | ||
| うちだ,たくじ | ||
| うめだ,さかえ | ||
| しばた,いくこ | ||
| まきした,けいこ | ||
| まつもと,しんいちろう | ||
| たかの,かずよし | ||
| いしわた,なおひさ | ||
| いうち,まこと | ||
| いまい,りほ | ||
| みずた,のりあき | ||
| かくたに,まなぶ | ||
| わだ,ほまれ | ||
| わかまつ,かずき | ||
| かわぐち,ひろき |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ホリモト,ユウジ | ||
| ハナミ,ヤスヒデ | ||
| イシザカ,タカユキ | ||
| ゼンケ,トシオ | ||
| ハトリ,ユウコ | ||
| ナガオカ,トモユキ | ||
| コウダ,ケンイチ | ||
| イシダ,ヒロシ | ||
| ミワ,シゲユキ | ||
| イシカワ,ヒロシ | ||
| スズキ,ユウスケ | ||
| オクダ,ヨシノリ | ||
| シムラ,サカエ | ||
| エビシマ,ヤスユキ | ||
| イブカ,ヨシテル | ||
| タノ,マコト | ||
| ドウゾノ,セイヤ | ||
| ヤマナカ,サツミ | ||
| トミイエ,ハヤト | ||
| アザミ,ツトム | ||
| タナカ,キョウコ | ||
| コジマ,アツシ | ||
| フミハラ,カオリ | ||
| スズキ,マサユキ | ||
| ナトリ,ケンヤ | ||
| スズキ,ユウコ | ||
| スズキ,ヒサエ | ||
| ナカガワ,カツヨシ | ||
| スズキ,マサフミ | ||
| マツヤマ,トシオ | ||
| ヨシナガ,チカエ | ||
| キタムラ,リカコ | ||
| アオキ,タクオ | ||
| ヤマグチ,ヤスヒロ | ||
| スギムラ,シゲオ | ||
| ウエスギ,マサミ | ||
| マツムラ,シンイチ | ||
| クバ,タカシ | ||
| スドウ,タカトシ | ||
| フジモト,ヒロシ | ||
| イトウ,シュウイチ | ||
| コバヤシ,カズミ | ||
| タナカ,ヒロカツ | ||
| イシダ,ツカサ | ||
| ヤマダ,マサコ | ||
| カミヤ,トミエ | ||
| タケモト,ユウジ | ||
| スミノ,コウジ | ||
| ヒロハタ,タクヤ | ||
| ミヒラ,リョウヘイ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| 김명희 | ||
| 홍차수 | ||
| 허순재 | ||
| 강영휘 | ||
| 김운주 | ||
| 이종환 | ||
| 이은국 | ||
| 강태호 | ||
| 강일래 | ||
| 김동현 | ||
| 곽기자 | ||
| 차재수 | ||
| 표봉기 | ||
| 문대원 | ||
| 이형기 | ||
| 최교표 | ||
| 박식현 | ||
| 홍종립 | ||
| 서창수 | ||
| 김쌍건 | ||
| 서말도 | ||
| 이병훈 | ||
| 김희수 | ||
| 박학태 | ||
| 강태종 | ||
| 조문란 | ||
| 신범균 | ||
| 백두진 | ||
| 이철정 | ||
| 김태중 | ||
| 이성현 | ||
| 김주조 | ||
| 김강행 | ||
| 이정길 | ||
| 김완일 | ||
| 권수자 | ||
| 이춘철 | ||
| 김판근 | ||
| 김곡리 | ||
| 이경형 | ||
| 이운만 | ||
| 손상철 | ||
| 유기숙 | ||
| 박정한 | ||
| 조윤래 | ||
| 유신호 | ||
| 이두수 | ||
| 김재률 | ||
| 김성홍 | ||
| 김혜경 |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| González, Joan | ||
| Reinders, Jim | ||
| Applebroog, Ida | ||
| Kidd, Joseph Bartholomew | ||
| Gulácsy, Lajos | ||
| Letendre, Rita | ||
| Zuccaro, Federico | ||
| Apt the Elder, Ulrich | ||
| Drummond, Arthur | ||
| Manley, Thomas | ||
| Broc, Jean | ||
| Ramunno, Tony | ||
| Simone dei Crocifissi | ||
| Lane, Theodore | ||
| Symonds, William Robert | ||
| Johnson, Frank Tenney | ||
| Cox, Gardner | ||
| Bunbury, Charles | ||
| Pedro de la Cuadra | ||
| Payne, William | ||
| Lucas, John Seymour | ||
| Holsman, Elizabeth T. | ||
| de Vries, Auke | ||
| Laszlo, Philip Alexius de | ||
| Shigemasa | ||
| Wolfe, Ruth Mitchell | ||
| Buck, John | ||
| Baselitz, Georg | ||
| Hook, Walter | ||
| Segall, Lasar | ||
| Brush, George deForest | ||
| Master of Jánosrét | ||
| Sutherland, Elizabeth Leveson-Gower, Countess of | ||
| Tuckerman, Jane | ||
| Varley, F.H. | ||
| Fosso, Samuel | ||
| Gardner, Daniel | ||
| Sadler, Walter Dendy | ||
| Clausen, Franciska | ||
| Coman, Charlotte Buell | ||
| Wakelin, Roland | ||
| Payne, Jon, CML | ||
| Campagna, Girolamo | ||
| Wiener, Phyllis | ||
| Sallee, Charles | ||
| Fitzgerald, John Anster | ||
| Gribbroek, Robert | ||
| Laporte, John | ||
| Lévy-Dhurmer, Lucien | ||
| Young, Stephen Scott |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ณรงค์ โต๊ะเงิน | ||
| กิตติ บุญวันต์ | ||
| สมหมาย ดาบทองดี | ||
| ธวัชชัย อิสระนิมิตร | ||
| วรรณา โสภณนรินทร์ | ||
| วินัย หมู่มิ่ง | ||
| พัชรี ชูจิรวงศ์ | ||
| สมปอง จิวไพโรจน์กิจ | ||
| บุญส่ง กวยรักษา | ||
| นิพนธ์ นิ่มใหม่ | ||
| พัชรี สุวพรศิลป์ | ||
| เจริญ นววัฒนทรัพย์ | ||
| อรพินท์ แซ่เจี่ย | ||
| ชัยพร สมใจนึก | ||
| ประนอม โคศิลา | ||
| ฉวีวรรณ ศรสังข์ทอง | ||
| วัชรา เจริญรัตนพร | ||
| สุภัท นกศิริ | ||
| อู๋ มาลาเล็ก | ||
| ประยูร ไชโย | ||
| ละออ อยู่ยืนยง | ||
| สมใจ วิวัฒน์วานิช | ||
| จุมพล จันทรศรีเกษร | ||
| พุฒ ดอกไม้จีน | ||
| บุญชัย วรกิจพรสิน | ||
| สมาน ธูปเทียน | ||
| พงศ์ศักดิ์ แซ่แต้ | ||
| อำนาจ ไวจงเจริญ | ||
| พรทิพย์ แซ่ลี้ | ||
| อุไรวรรณ สาครสินธุ์ | ||
| อำพล วีระตะนนท์ | ||
| สมจิตร ใจวังโลก | ||
| สุเทพ ตันวินิจ | ||
| สวาท ทรัพย์มาก | ||
| สมศักดิ์ เจือจันทร์ | ||
| ดัสซันซิงห์ กุลาตี | ||
| ธีร ศรแก้ว | ||
| พรรณยุพา ฮ่อสกุล | ||
| สำราญ จันทร์เอี่ยม | ||
| พจน์ มั่นกันนาน | ||
| สุธี บุณยเกียรติ | ||
| บุญโชติ ทิพย์ประเสริฐสิน | ||
| ประดิษฐ์ ทองพสิฐสมบัติ | ||
| จำเนียร เพ็งเจริญ | ||
| สมศักดิ์ อรุณรัตน์ | ||
| อนุชา จารุหิรัญสกุล | ||
| พิกุล มโนภิญโญภิญญะ | ||
| ผ่องศรี นกแก้ว | ||
| อารี วิไลวรรณ | ||
| ณรงค์วิทย์ วิทสัทธาวรกุล |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ممارسة مراعاة | ||
| العنصرية | ||
| حدود والشيخوخة | ||
| بالحكم كهذا ينتفع | ||
| البلاد | ||
| تربية | ||
| الغير التقدم والعدل | ||
| نحو بالتعليم والحرية | ||
| تأمين متساو | ||
| للتعليم فيها | ||
| آذت اعتداء للتعليم | ||
| ليس المتأصلة | ||
| والمساهمة الضروري تتناقض | ||
| وتأسيس | ||
| رضى | ||
| شرعي الطبية | ||
| لكيلا الجمعية والحرية | ||
| للرجال التزوج | ||
| بالكرامة | ||
| حرية بين | ||
| هذه العيش تنظر | ||
| قيد | ||
| يقررها والصداقة | ||
| اعتُمد وينبغي اجتماعي | ||
| حرمان | ||
| للإدراك بأجر إنتاجه | ||
| التربية القانون | ||
| لإنصافه وتأسيس وسمعته | ||
| أساسه للرجال | ||
| كافة | ||
| المجهود دولي أينما | ||
| وإلى | ||
| بنشاط تجري | ||
| والأمم مثل لحقوق | ||
| الإنسان بشروط بحماية | ||
| شرفه | ||
| كما الوظائف | ||
| حياته ديسمبر | ||
| ولما | ||
| هذه | ||
| غاية جديد إنسان | ||
| حرية | ||
| متهم الوطنية قدمًا | ||
| التملك وضع | ||
| شرعية ويعبر تأدية | ||
| بنظام عمل والأخلاق | ||
| التملك لشخصيته يلجأ | ||
| بحال يضطر ولا | ||
| الانضمام بالكرامة | ||
| عضوا |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| Herrschaft Freiheiten Not | ||
| Gewalt | ||
| stets anderer begründet | ||
| erhobenen innerstaatliche | ||
| Heiratsfähige freie | ||
| offenstehen Begrenzung grausamer | ||
| Maßnahmen höchste | ||
| unentbehrlich privat | ||
| erniedrigender | ||
| Verachtung freie | ||
| innezuhaben innerstaatlichen | ||
| kommen | ||
| werden gleichgültig | ||
| Würde überall höchste | ||
| Schutzmaßnahmen den Pflichten | ||
| Wille Bestimmung | ||
| Leibeigenschaft einschließlich für | ||
| gleiche bekräftigt Gewissens | ||
| Wohles | ||
| Generalversammlung | ||
| Volkes | ||
| Völkern gegenwärtig Zusammenarbeit | ||
| Heiratsfähige sowie Jeder | ||
| Stellung | ||
| Lebensstandard | ||
| seinem | ||
| Rede strafbaren Sicherheit | ||
| mit | ||
| Kulthandlungen Grund | ||
| ärztlicher | ||
| Auflösung Anforderungen anzugehören | ||
| Furcht | ||
| keine Geburt | ||
| Wohles Furcht genügen | ||
| befriedigende Medien | ||
| anzugehören Urlaub Vereinigungen | ||
| hinzuwirken verboten Resolution | ||
| kommen | ||
| sozialer vor irgendein | ||
| Bestimmung Bestimmung | ||
| Fall natürliche kein | ||
| Geschlecht Aufhetzung eigenen | ||
| seinen | ||
| über | ||
| Unterlassung Berücksichtigung | ||
| war | ||
| Rufes stets | ||
| Volkes anderer Beschränkungen | ||
| Handlungen dessen | ||
| Die |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| προάγει αλληλογραφία | ||
| λογική έχει | ||
| ιδρύει ζωή τεχνική | ||
| δυνατότητες | ||
| περιορισμό συνόλου | ||
| ασκεί παραγνώριση συναφθεί | ||
| αναγνωρίζουν ποινικής εκδηλώνει | ||
| κοινότητας διακυβέρνηση στα | ||
| απέναντι υψηλή | ||
| περιστάσεων αξιόποινη | ||
| σεβασμό | ||
| συντήρησής κατά εξασφαλίσουν | ||
| παραβιάζουν συμπληρώνεται νόμο | ||
| άμεσα | ||
| σημαίνει καθεστώς | ||
| ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων | ||
| ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση | ||
| μέσο | ||
| ίση Εχει | ||
| ειρήνης Κάθε | ||
| μέλη μορφή | ||
| όσο | ||
| κρατείται Στο Διακηρύσσει | ||
| οικονομικών έκφρασης εξασφαλίζεται | ||
| κάθε | ||
| περίπτωση απολαμβάνουν | ||
| ποινικό γεροντική | ||
| είναι μαζί δικαστήρια | ||
| μαζί προοπτική | ||
| δική | ||
| βαρβαρότητας | ||
| οικονομικών εξασφαλίσει | ||
| υποχρεώσεις οδήγησαν | ||
| Οικουμενική Διακήρυξης γονείς | ||
| στις μυστική αντιπροσώπους | ||
| Διακήρυξης άδειες βιοτικό | ||
| αναπηρία ομάδα | ||
| πραγματικό | ||
| καλύτερες | ||
| ανάπαυση | ||
| δίκαιες ένα δικαίου | ||
| μετέχει στους | ||
| θρησκευτικών ποινικής | ||
| Κανείς ίσα | ||
| πεποιθήσεις | ||
| πολιτικές ανάλογα δουλεία | ||
| πολιτικές ιατρική ωσότου | ||
| ηθικής χωρίς | ||
| ανδρών ικανό | ||
| καθώς |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| duración común | ||
| delito reconocimiento alimentación | ||
| inalienables | ||
| entre seguridad escogidos | ||
| comportarse dignidad | ||
| autónomo gobierno tiempo | ||
| omisiones | ||
| comisión | ||
| Derechos territorios | ||
| debe | ||
| han | ||
| regresar inalienables | ||
| regresar | ||
| desempleo científico | ||
| arbitrariamente proclamada | ||
| están contraerse esposos | ||
| cualesquiera | ||
| salir carácter desarrollo | ||
| solamente justas | ||
| personalidad una | ||
| cuanto | ||
| garantice resolución | ||
| concepción | ||
| tomar impondrá | ||
| cualquier reconocimiento | ||
| obligatoria obligatoria satisfactoria | ||
| acusación sin | ||
| artísticas penal culturales | ||
| pagadas examen | ||
| Además Organización dignidad | ||
| opresión esposos ejercidos | ||
| barbarie están mientras | ||
| por | ||
| idioma | ||
| recursos pagadas | ||
| materia Nada ella | ||
| con injerencias | ||
| inspirándose | ||
| organización | ||
| gozar jurisdicción | ||
| que | ||
| asegurar | ||
| humana libertad | ||
| nadie equivalente | ||
| escoger remuneración | ||
| torturas | ||
| individuos poder | ||
| disfruten seres Preámbulo | ||
| desempleo | ||
| liberados |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| conforme êtres fonctions | ||
| non tout généralisé | ||
| premier lui | ||
| faire hommes d’égalité | ||
| peuple volonté bénéficier | ||
| générale nationales | ||
| cruels plus | ||
| d’encourager opinions | ||
| genre l’esprit | ||
| d’origine effectif | ||
| exigences auront | ||
| résultent situation recevoir | ||
| peuples Chacun | ||
| sont d’égalité | ||
| jouissent | ||
| auront l’esprit | ||
| pays telle | ||
| publiquement | ||
| mariage foi | ||
| travail démocratique religieux | ||
| rémunération | ||
| omissions telles | ||
| L’éducation | ||
| raison complétée donner | ||
| invoqué auront arbitraires | ||
| l’amitié suffisant affaires | ||
| travaille l’accomplissement l’intermédiaire | ||
| race | ||
| opinions celles | ||
| assurer par privée | ||
| valeur | ||
| violant traite premier | ||
| inhérente | ||
| bienfaits l’avènement | ||
| Unies s’il actions | ||
| inquiété l’esclavage | ||
| inquiété | ||
| esclaves lieu | ||
| salaire | ||
| par | ||
| toute | ||
| innocente procédure membres | ||
| arts l’idéal envers | ||
| suffrage territoires inhumains | ||
| d’immixtions l’organisation progrès | ||
| comme égalité Unies | ||
| maternité | ||
| violerait suprême sécurité | ||
| impliquant eux loisirs | ||
| nationalité |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| זקנה משפילים | ||
| ינתן חברתי עניניו | ||
| הפוב | ||
| ולהיות זכויות הישגים | ||
| יאסרו מטעמי וללא | ||
| ספרותית השלם | ||
| למנוחה חינם | ||
| וההתאגדות | ||
| לטפח | ||
| באלה במלואן | ||
| יהנו | ||
| ולרווחתם לגבר האדם | ||
| בכבודו שבארצות כבוד | ||
| ובינלאומיים | ||
| בכך לתנאי אישי | ||
| שאינן | ||
| שרירותי | ||
| במשפט | ||
| ולעקרונותיהן מטעם | ||
| שרירותית האשמה יהיה | ||
| החינוך ולבטחון | ||
| סובלנות אשמתו במגילה | ||
| המאוחדות חיוני | ||
| חשוב במקרה | ||
| כלתי העולם | ||
| שמקורה כציבור | ||
| לשויון | ||
| לתקנה | ||
| תלוי ההתאספות | ||
| הדיבור שהוא | ||
| והבלתי והבסיסית | ||
| ולעקרונותיהן יהא וישאף | ||
| ביתנ הבינלאומי | ||
| והזלזול להקנות | ||
| בגלל כולם שיושלם | ||
| לחיים | ||
| בדבר | ||
| לשירות | ||
| זכויות | ||
| לפני | ||
| אדם ולא מזזמנות | ||
| קנינו שהיה ההתאספות | ||
| בינלאומי חיוניות לבקש | ||
| תהיינה | ||
| ובזכות בכורה מהגנה | ||
| מתוך | ||
| ובמצפון מזומנות לאגד | ||
| והחמריים סוציאלי | ||
| אנושיים ובהצבעה | ||
| פראיים |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| uciskowi posiadania prawo | ||
| społecznego największych skazany | ||
| czy | ||
| potrzeby samodzielnie przystępowania | ||
| Krzewi też dokonania | ||
| pełną prawo | ||
| buntu | ||
| moralności | ||
| zapewnienia znaczenie | ||
| nieludzki wypadek Nikt | ||
| zasadności jakikolwiek Każdy | ||
| samowolnie krajem | ||
| międzynarodowego | ||
| członek wielu | ||
| rozwój wynikających obalenia | ||
| rasy | ||
| grudnia która | ||
| jedynie urlopu ani | ||
| małżeńskie stanowi ustaniu | ||
| człowieka postępowych | ||
| prześladowania | ||
| politycznej które zawarcia | ||
| Deklaracja | ||
| ingerować wyłącznie | ||
| studia Nikt | ||
| innego uprawianie zrozumienie | ||
| wybranych swobodę wyznania | ||
| wolni osobowości | ||
| ograniczenie Nie | ||
| równej społecznego uciekać | ||
| będącą POWSZECHNA | ||
| niezdolności poszukiwania międzynarodowej | ||
| konieczne potrzeby posiada | ||
| opinii wychowywania 1948 | ||
| międzynarodowej zatrzymać | ||
| przedstawicieli | ||
| przeciw | ||
| wynikających organy pracę | ||
| człowiek grupami | ||
| niezbędnych | ||
| wolności podstawowym | ||
| opinii małżonków wolność | ||
| postępować zdecydowanie komórką | ||
| odniesieniu | ||
| pokoju azyl | ||
| zawodowych powrócić człowiek | ||
| konstytucję | ||
| takiej postaciach powszechnego | ||
| wygnać wygnać | ||
| wspólny poszanowania |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| областях | ||
| будут должен | ||
| обеспечиваются нежели | ||
| котором Уставе | ||
| социального моральных | ||
| совершеннолетия предоставление | ||
| том независимо | ||
| существование | ||
| вмешательства какому ограниченной | ||
| распространять | ||
| находить помощь | ||
| искусством | ||
| унижающим положения искать | ||
| изгнанию член совершеннолетия | ||
| обществом имуществом государственной | ||
| идеи братства | ||
| наслаждаться значение социальной | ||
| осуществления юрисдикцией наказанию | ||
| достойное свою III | ||
| жизнь расторжения инвалидности | ||
| терпимости этого | ||
| целях равны | ||
| обеспечиваются законным | ||
| принуждаем правосубъектности | ||
| пыткам доступа неприкосновенность | ||
| Брак против | ||
| прибегать независимой | ||
| человека человеческой | ||
| быть независимо религии | ||
| публичным | ||
| членам против | ||
| разумом результатом семью | ||
| Принята участие | ||
| беспристрастным тем | ||
| частным основной | ||
| правового | ||
| страной обслуживание | ||
| было свободу полное | ||
| рабочего свободны | ||
| состоянии помощь религиозными | ||
| полное | ||
| владеть власти морали | ||
| меньшей | ||
| братства социальному убежища | ||
| государств | ||
| равны который дети | ||
| терпимости | ||
| получать бесплатным полного | ||
| богослужении | ||
| отдельным |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| คิด ใตัอำ เคลื่อนไหว | ||
| บังคับ บาก | ||
| สิ่ง สิ้น | ||
| วัตถุ | ||
| ชาย อาศัย เท่านั้น | ||
| สิน | ||
| เกา | ||
| ดูแล พิธีกรรม | ||
| ภายใน | ||
| เพศ | ||
| หนัก ประสงค์ | ||
| เหตุ | ||
| งาน รักษา | ||
| เพศ ภาษา | ||
| นี้ | ||
| คู่ สัญชาติ ต้องการ | ||
| วิธี ระหว่าง ตกลง | ||
| ทำนอง | ||
| สืบ กับ ศิลปกรรม | ||
| เหนือ วรรณกรรม | ||
| คิด การก หน้าที่ | ||
| ชาติ ศิลปกรรม แต่ | ||
| สามัญ สอด | ||
| เหยียด วิธี จุด | ||
| หน้า ถ้า เบื้อง | ||
| ประชุม | ||
| ศิลปกรรม | ||
| เสรีภาพ โหด ก่อ | ||
| เกียรติศักดิ์ ป่วย เอกราช | ||
| ประหัต มโนธรรม การ | ||
| แทน | ||
| ขัดขืน เวลา เสียง | ||
| กฎบัตร พยายาม | ||
| สิน หน้า | ||
| จำเป็น | ||
| ประชาธิปไตย หน่วย | ||
| กรณี จริงจัง | ||
| ทำนอง | ||
| ทาษ | ||
| เพิ่ม | ||
| บรรดา ขวาง | ||
| กักขัง | ||
| มนุษย์ | ||
| ชาย ประกัน มนุษยธรรม | ||
| จะบัน มูลฐาน เถื่อน | ||
| พฤติ | ||
| มิได้ | ||
| หญิง คู่ | ||
| สมา ปฏิบัติ อนึ่ง | ||
| สิ่ง ทาษ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| mecburidir ilim | ||
| isnadın sınırları suç | ||
| tutuklanamaz diğer | ||
| memleket korunmasi kullanılamaz | ||
| İnsanlık ilerlemeyi | ||
| bir mülk menfaatlerinin | ||
| usul zümreye herhangi | ||
| mahkeme vicdana ilerleyişe | ||
| zulüm zalimane | ||
| ilim öncelikle çocuk | ||
| mevzubahis ancak | ||
| muamelesi dinlenmeye | ||
| eşitlikle prensiplerine ülkenin | ||
| öğretim bulunmalarına yardım | ||
| memleketler amacıyla | ||
| birbirlerine | ||
| olmalıdır | ||
| bırakılamaz serbestisine | ||
| hürriyetin iyi | ||
| hükmü işbu zalimane | ||
| evlenme memleketi tedbirlerle | ||
| evlenmek ahalisi işini | ||
| hürriyetler | ||
| belirlenmiş kere | ||
| elde cürüme | ||
| tanınan dünyaca yüksek | ||
| müddetinin ailesine | ||
| vicdan kırıcı itibariyle | ||
| geniş inanma | ||
| kendi görevleri Teşkilatı | ||
| yaymak | ||
| öğretim vesayet | ||
| renk kişiliğinin | ||
| tamamlanan | ||
| haklara bulunma | ||
| hükmü uygulanabilecek | ||
| etmiş geliştirilmesini hoşgörü | ||
| sahiptir temel | ||
| giyim | ||
| Bundan temeli | ||
| icaplarını | ||
| mülk karışma tekmil | ||
| vicdana hürriyetine işini | ||
| Herkesin vahşiliklere | ||
| dolaşma dünyanın | ||
| davasının Uluslararasında idamesi | ||
| eşittir | ||
| haklardan hakkı | ||
| kovuşturmalar hürriyetlerden gözönünde | ||
| Evrensel fiilli beyannamesi |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # The contents of this file have been translated by "Google Translate". | ||
| Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này | ||
| đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh | ||
| lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con | ||
| người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và | ||
| nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể | ||
| xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong | ||
| một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới | ||
| này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc | ||
| chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển | ||
| vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là | ||
| nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý | ||
| tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra. | ||
| Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã | ||
| qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có | ||
| những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào | ||
| đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian, | ||
| những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối | ||
| với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm, | ||
| nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra | ||
| những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng | ||
| lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt | ||
| trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà | ||
| nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được. | ||
| Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu | ||
| đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng | ||
| nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó. | ||
| Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng | ||
| tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có | ||
| không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại | ||
| sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù | ||
| phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào | ||
| bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa, | ||
| hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta | ||
| cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng | ||
| ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất | ||
| yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết | ||
| thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng | ||
| ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn | ||
| vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó, | ||
| nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta. | ||
| Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã | ||
| thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp | ||
| của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó. | ||
| Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành | ||
| một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần | ||
| thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái | ||
| tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng | ||
| ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm | ||
| về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng | ||
| ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều | ||
| mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây | ||
| trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân. |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark corpus together with its four normalized renditions.
///
/// Every normalization form is kept twice: once as UTF-8 (`String`) and once
/// as UTF-16 code units (`Vec<u16>`), so both normalizer entry points can be
/// fed without converting inside the timed section.
struct BenchDataContent {
    /// Label of the corpus; it becomes part of each benchmark id.
    pub file_name: String,
    /// Corpus text in NFC, UTF-8.
    pub nfc: String,
    /// Corpus text in NFD, UTF-8.
    pub nfd: String,
    /// Corpus text in NFKC, UTF-8.
    pub nfkc: String,
    /// Corpus text in NFKD, UTF-8.
    pub nfkd: String,
    /// UTF-16 code units of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 code units of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 code units of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 code units of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Removes the header lines of a benchmark data file.
///
/// Every line that starts with `#` is dropped; the remaining lines are joined
/// with `\n`. Line terminators are normalized by `str::lines`, and the result
/// carries no trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrowed line slices are enough for `join`; copying each line into its
    // own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark corpus together with its four normalized renditions.
///
/// Every normalization form is kept twice: once as UTF-8 (`String`) and once
/// as UTF-16 code units (`Vec<u16>`), so both normalizer entry points can be
/// fed without converting inside the timed section.
struct BenchDataContent {
    /// Label of the corpus; it becomes part of each benchmark id.
    pub file_name: String,
    /// Corpus text in NFC, UTF-8.
    pub nfc: String,
    /// Corpus text in NFD, UTF-8.
    pub nfd: String,
    /// Corpus text in NFKC, UTF-8.
    pub nfkc: String,
    /// Corpus text in NFKD, UTF-8.
    pub nfkd: String,
    /// UTF-16 code units of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 code units of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 code units of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 code units of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Removes the header lines of a benchmark data file.
///
/// Every line that starts with `#` is dropped; the remaining lines are joined
/// with `\n`. Line terminators are normalized by `str::lines`, and the result
/// carries no trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrowed line slices are enough for `join`; copying each line into its
    // own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfkd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
| use criterion::{black_box, Criterion, Throughput}; | ||
| use smallvec::SmallVec; | ||
| //use detone::IterDecomposeVietnamese; | ||
/// Number of UTF-16 code units in each generated benchmark input.
///
/// This many `u16` values (2048) fit on one 4KB memory page, which maximizes
/// the run to take the average over without introducing cross-page effects.
const INPUT_SIZE: usize = 4096 / std::mem::size_of::<u16>();
| fn generate_bmp_input_nfc(s: &str) -> Vec<u16> { | ||
| ComposingNormalizerBorrowed::new_nfc() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
| fn generate_bmp_input_nfd(s: &str) -> Vec<u16> { | ||
| DecomposingNormalizerBorrowed::new_nfd() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
/// Removes headers and replaces line feed with space.
/// Do not use for languages that don't use spaces!
///
/// Header lines are the ones starting with `#`. The remaining lines are
/// joined with a single ASCII space, with no trailing separator.
fn prepare_file_contents(content: &str) -> String {
    // Borrowed line slices are enough for `join`; copying each line into its
    // own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Passes a UTF-16 slice through `black_box` and returns it unchanged, so the
/// benchmarked call receives an input the optimizer cannot reason about.
fn slice_as_slice(s: &[u16]) -> &[u16] {
    black_box::<&[u16]>(s)
}
| fn bench_lang(name: &str, data: &str, c: &mut Criterion) { | ||
| let input_nfc = generate_bmp_input_nfc(data); | ||
| let input_nfd = generate_bmp_input_nfd(data); | ||
| let nfc = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd = DecomposingNormalizerBorrowed::new_nfd(); | ||
| // Appending to this output is infallible (does not return `Err`) and | ||
| // this is sized to be large enough not to actually take the the heap | ||
| // allocation path. | ||
| let mut output: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new(); | ||
| { | ||
| let mut group_name = "utf16_throughput_nfc_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfc.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfc.split_normalized_utf16(slice_as_slice(&input_nfc)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| { | ||
| let mut group_name = "utf16_throughput_nfd_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfd.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfd.split_normalized_utf16(slice_as_slice(&input_nfd)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| } | ||
| static EL: &str = include_str!("./data/TestRandomWordsUDHR_el.txt"); | ||
| static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. "; | ||
| static FR: &str = include_str!("./data/TestRandomWordsUDHR_fr.txt"); | ||
| static VI: &str = include_str!("./data/wotw.txt"); | ||
| static ZH: &str = "單父人呂公善沛令,辟仇,從之客,因家焉。沛中豪傑吏聞令有重客,皆往賀。"; | ||
| // zh text from https://www.gutenberg.org/cache/epub/23841/pg23841.txt | ||
| // metadata at https://www.gutenberg.org/ebooks/23841 | ||
| // If you replace this text, be sure not to include ASCII spaces and be sure | ||
| // to include punctuation using code points actually used for punctuation in | ||
| // Chinese. | ||
| // TODO: Add: | ||
| // * Japanese with realistic proportion of kana voicing marks | ||
| // * Korean, since Hangul is special-cased in the normalizer | ||
| // * Kannada or some other non-Korean BMP language that uses | ||
| // backward-combining starters (with realistic proportion of such | ||
| // characters). | ||
| // * Chakma or some other living non-BMP language. | ||
| // * Vietnamese in the orthographic form (i.e. as produced by | ||
| // the official non-IME keyboard layout that's less common | ||
| // than the NFC-producing IME.) | ||
| pub fn criterion_benchmark(c: &mut Criterion) { | ||
| bench_lang("el", prepare_file_contents(EL).as_str(), c); | ||
| bench_lang("en", EN, c); | ||
| bench_lang("fr", prepare_file_contents(FR).as_str(), c); | ||
| bench_lang("vi", prepare_file_contents(VI).as_str(), c); | ||
| bench_lang("zh", ZH, c); | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "aho-corasick" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" | ||
| dependencies = [ | ||
| "memchr", | ||
| ] | ||
| [[package]] | ||
| name = "anes" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" | ||
| [[package]] | ||
| name = "anstyle" | ||
| version = "1.0.13" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" | ||
| [[package]] | ||
| name = "arraystring" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185" | ||
| dependencies = [ | ||
| "typenum", | ||
| ] | ||
| [[package]] | ||
| name = "arrayvec" | ||
| version = "0.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" | ||
| [[package]] | ||
| name = "atoi" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" | ||
| dependencies = [ | ||
| "num-traits", | ||
| ] | ||
| [[package]] | ||
| name = "autocfg" | ||
| version = "1.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" | ||
| [[package]] | ||
| name = "bumpalo" | ||
| version = "3.19.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" | ||
| [[package]] | ||
| name = "cast" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" | ||
| [[package]] | ||
| name = "cfg-if" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" | ||
| [[package]] | ||
| name = "ciborium" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "ciborium-ll", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "ciborium-io" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" | ||
| [[package]] | ||
| name = "ciborium-ll" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "half", | ||
| ] | ||
| [[package]] | ||
| name = "clap" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" | ||
| dependencies = [ | ||
| "clap_builder", | ||
| ] | ||
| [[package]] | ||
| name = "clap_builder" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" | ||
| dependencies = [ | ||
| "anstyle", | ||
| "clap_lex", | ||
| ] | ||
| [[package]] | ||
| name = "clap_lex" | ||
| version = "0.6.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" | ||
| [[package]] | ||
| name = "cobs" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" | ||
| dependencies = [ | ||
| "thiserror", | ||
| ] | ||
| [[package]] | ||
| name = "criterion" | ||
| version = "0.5.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" | ||
| dependencies = [ | ||
| "anes", | ||
| "cast", | ||
| "ciborium", | ||
| "clap", | ||
| "criterion-plot", | ||
| "is-terminal", | ||
| "itertools", | ||
| "num-traits", | ||
| "once_cell", | ||
| "oorandom", | ||
| "plotters", | ||
| "rayon", | ||
| "regex", | ||
| "serde", | ||
| "serde_derive", | ||
| "serde_json", | ||
| "tinytemplate", | ||
| "walkdir", | ||
| ] | ||
| [[package]] | ||
| name = "criterion-plot" | ||
| version = "0.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" | ||
| dependencies = [ | ||
| "cast", | ||
| "itertools", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-deque" | ||
| version = "0.8.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | ||
| dependencies = [ | ||
| "crossbeam-epoch", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-epoch" | ||
| version = "0.9.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | ||
| dependencies = [ | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-utils" | ||
| version = "0.8.21" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | ||
| [[package]] | ||
| name = "crunchy" | ||
| version = "0.2.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" | ||
| dependencies = [ | ||
| "databake-derive", | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "databake-derive" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "detone" | ||
| version = "1.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09" | ||
| [[package]] | ||
| name = "displaydoc" | ||
| version = "0.2.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "either" | ||
| version = "1.15.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | ||
| [[package]] | ||
| name = "erased-serde" | ||
| version = "0.4.8" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_core", | ||
| "typeid", | ||
| ] | ||
| [[package]] | ||
| name = "half" | ||
| version = "2.4.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "crunchy", | ||
| ] | ||
| [[package]] | ||
| name = "hermit-abi" | ||
| version = "0.5.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" | ||
| [[package]] | ||
| name = "icu_collections" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "potential_utf", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_locale_core" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde", | ||
| "tinystr", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer" | ||
| version = "2.1.1" | ||
| dependencies = [ | ||
| "arraystring", | ||
| "arrayvec", | ||
| "atoi", | ||
| "criterion", | ||
| "databake", | ||
| "detone", | ||
| "icu_collections", | ||
| "icu_normalizer_data", | ||
| "icu_properties", | ||
| "icu_provider", | ||
| "serde", | ||
| "smallvec", | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| "write16", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer_data" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" | ||
| [[package]] | ||
| name = "icu_properties" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" | ||
| dependencies = [ | ||
| "databake", | ||
| "icu_collections", | ||
| "icu_locale_core", | ||
| "icu_properties_data", | ||
| "icu_provider", | ||
| "serde", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_properties_data" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" | ||
| [[package]] | ||
| name = "icu_provider" | ||
| version = "2.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "erased-serde", | ||
| "icu_locale_core", | ||
| "postcard", | ||
| "serde", | ||
| "stable_deref_trait", | ||
| "writeable", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "is-terminal" | ||
| version = "0.4.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" | ||
| dependencies = [ | ||
| "hermit-abi", | ||
| "libc", | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "itertools" | ||
| version = "0.10.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" | ||
| dependencies = [ | ||
| "either", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.15" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||
| [[package]] | ||
| name = "js-sys" | ||
| version = "0.3.81" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" | ||
| dependencies = [ | ||
| "once_cell", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "libc" | ||
| version = "0.2.177" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" | ||
| [[package]] | ||
| name = "litemap" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" | ||
| dependencies = [ | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "log" | ||
| version = "0.4.28" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" | ||
| [[package]] | ||
| name = "num-traits" | ||
| version = "0.2.19" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" | ||
| dependencies = [ | ||
| "autocfg", | ||
| ] | ||
| [[package]] | ||
| name = "once_cell" | ||
| version = "1.21.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" | ||
| [[package]] | ||
| name = "oorandom" | ||
| version = "11.1.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" | ||
| [[package]] | ||
| name = "plotters" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" | ||
| dependencies = [ | ||
| "num-traits", | ||
| "plotters-backend", | ||
| "plotters-svg", | ||
| "wasm-bindgen", | ||
| "web-sys", | ||
| ] | ||
| [[package]] | ||
| name = "plotters-backend" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" | ||
| [[package]] | ||
| name = "plotters-svg" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" | ||
| dependencies = [ | ||
| "plotters-backend", | ||
| ] | ||
| [[package]] | ||
| name = "postcard" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" | ||
| dependencies = [ | ||
| "cobs", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.103" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.41" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "rayon" | ||
| version = "1.10.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" | ||
| dependencies = [ | ||
| "either", | ||
| "rayon-core", | ||
| ] | ||
| [[package]] | ||
| name = "rayon-core" | ||
| version = "1.12.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" | ||
| dependencies = [ | ||
| "crossbeam-deque", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "regex" | ||
| version = "1.12.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-automata", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-automata" | ||
| version = "0.4.13" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-syntax" | ||
| version = "0.8.8" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" | ||
| [[package]] | ||
| name = "rustversion" | ||
| version = "1.0.22" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" | ||
| [[package]] | ||
| name = "ryu" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" | ||
| [[package]] | ||
| name = "same-file" | ||
| version = "1.0.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" | ||
| dependencies = [ | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_core" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.145" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "ryu", | ||
| "serde", | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "smallvec" | ||
| version = "1.15.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" | ||
| [[package]] | ||
| name = "stable_deref_trait" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.108" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "synstructure" | ||
| version = "0.13.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror" | ||
| version = "2.0.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" | ||
| dependencies = [ | ||
| "thiserror-impl", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror-impl" | ||
| version = "2.0.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "tinystr" | ||
| version = "0.8.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" | ||
| dependencies = [ | ||
| "displaydoc", | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "tinytemplate" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_json", | ||
| ] | ||
| [[package]] | ||
| name = "typeid" | ||
| version = "1.0.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" | ||
| [[package]] | ||
| name = "typenum" | ||
| version = "1.19.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" | ||
| [[package]] | ||
| name = "utf16_iter" | ||
| version = "1.0.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" | ||
| [[package]] | ||
| name = "utf8_iter" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | ||
| [[package]] | ||
| name = "walkdir" | ||
| version = "2.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" | ||
| dependencies = [ | ||
| "same-file", | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "once_cell", | ||
| "rustversion", | ||
| "wasm-bindgen-macro", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-backend" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" | ||
| dependencies = [ | ||
| "bumpalo", | ||
| "log", | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" | ||
| dependencies = [ | ||
| "quote", | ||
| "wasm-bindgen-macro-support", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro-support" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-backend", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-shared" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "web-sys" | ||
| version = "0.3.81" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" | ||
| dependencies = [ | ||
| "js-sys", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "winapi-util" | ||
| version = "0.1.11" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" | ||
| dependencies = [ | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "windows-link" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" | ||
| [[package]] | ||
| name = "windows-sys" | ||
| version = "0.61.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" | ||
| dependencies = [ | ||
| "windows-link", | ||
| ] | ||
| [[package]] | ||
| name = "write16" | ||
| version = "1.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" | ||
| dependencies = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" | ||
| [[package]] | ||
| name = "yoke" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" | ||
| dependencies = [ | ||
| "stable_deref_trait", | ||
| "yoke-derive", | ||
| "zerofrom", | ||
| ] | ||
| [[package]] | ||
| name = "yoke-derive" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" | ||
| dependencies = [ | ||
| "zerofrom-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom-derive" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerotrie" | ||
| version = "0.2.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde_core", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" | ||
| dependencies = [ | ||
| "databake", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec-derive" | ||
| version = "0.11.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.83" | ||
| name = "icu_normalizer" | ||
| version = "2.1.1" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "API for normalizing text into Unicode Normalization Forms" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| categories = ["internationalization"] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [package.metadata.docs.rs] | ||
| all-features = true | ||
| [features] | ||
| compiled_data = [ | ||
| "dep:icu_normalizer_data", | ||
| "icu_properties?/compiled_data", | ||
| "icu_provider/baked", | ||
| ] | ||
| datagen = [ | ||
| "serde", | ||
| "dep:databake", | ||
| "icu_properties", | ||
| "icu_collections/databake", | ||
| "zerovec/databake", | ||
| "icu_properties?/datagen", | ||
| "icu_provider/export", | ||
| ] | ||
| default = [ | ||
| "compiled_data", | ||
| "utf8_iter", | ||
| "utf16_iter", | ||
| ] | ||
| experimental = [] | ||
| icu_properties = ["dep:icu_properties"] | ||
| serde = [ | ||
| "dep:serde", | ||
| "icu_collections/serde", | ||
| "zerovec/serde", | ||
| "icu_properties?/serde", | ||
| "icu_provider/serde", | ||
| ] | ||
| utf16_iter = [ | ||
| "dep:utf16_iter", | ||
| "dep:write16", | ||
| ] | ||
| utf8_iter = ["dep:utf8_iter"] | ||
| write16 = [] | ||
| [lib] | ||
| name = "icu_normalizer" | ||
| path = "src/lib.rs" | ||
| [[test]] | ||
| name = "tests" | ||
| path = "tests/tests.rs" | ||
| [[bench]] | ||
| name = "bench" | ||
| path = "benches/bench.rs" | ||
| harness = false | ||
| required-features = [ | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| ] | ||
| [[bench]] | ||
| name = "canonical_composition" | ||
| path = "benches/canonical_composition.rs" | ||
| [[bench]] | ||
| name = "canonical_decomposition" | ||
| path = "benches/canonical_decomposition.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfc" | ||
| path = "benches/composing_normalizer_nfc.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfkc" | ||
| path = "benches/composing_normalizer_nfkc.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfd" | ||
| path = "benches/decomposing_normalizer_nfd.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfkd" | ||
| path = "benches/decomposing_normalizer_nfkd.rs" | ||
| [[bench]] | ||
| name = "utf16_throughput" | ||
| path = "benches/utf16_throughput.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| features = ["derive"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_collections] | ||
| version = "~2.1.1" | ||
| default-features = false | ||
| [dependencies.icu_normalizer_data] | ||
| version = "~2.1.1" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_properties] | ||
| version = "~2.1.1" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_provider] | ||
| version = "2.1.1" | ||
| default-features = false | ||
| [dependencies.serde] | ||
| version = "1.0.220" | ||
| features = [ | ||
| "derive", | ||
| "alloc", | ||
| ] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.smallvec] | ||
| version = "1.10.0" | ||
| default-features = false | ||
| [dependencies.utf16_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.utf8_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.write16] | ||
| version = "1.0.0" | ||
| features = ["alloc"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.3" | ||
| default-features = false | ||
| [dev-dependencies.arraystring] | ||
| version = "0.3.0" | ||
| [dev-dependencies.arrayvec] | ||
| version = "0.7.2" | ||
| default-features = false | ||
| [dev-dependencies.atoi] | ||
| version = "2.0.0" | ||
| [dev-dependencies.detone] | ||
| version = "1.0.0" | ||
| [dev-dependencies.write16] | ||
| version = "1.0.0" | ||
| features = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| default-features = false | ||
| [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion] | ||
| version = "0.5.0" | ||
| [lints.rust.unexpected_cfgs] | ||
| level = "warn" | ||
| priority = 0 | ||
| check-cfg = ["cfg(icu4x_unstable_fast_trie_only)"] |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer) | ||
| <!-- cargo-rdme start --> | ||
| Normalizing text into Unicode Normalization Forms. | ||
| This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) | ||
| and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. | ||
| ## Functionality | ||
| The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode | ||
| Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. | ||
| Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, | ||
| and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. | ||
| The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA | ||
| Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by | ||
| applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the | ||
| [`idna`](https://docs.rs/idna/latest/idna/) crate. | ||
| The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and | ||
| the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class | ||
| property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the | ||
| [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate. | ||
| Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in | ||
| addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive | ||
| non-“maybe” answer. | ||
| ## Examples | ||
| ```rust | ||
| let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); | ||
| assert_eq!(nfc.normalize("a\u{0308}"), "ä"); | ||
| assert!(nfc.is_normalized("ä")); | ||
| let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); | ||
| assert_eq!(nfd.normalize("ä"), "a\u{0308}"); | ||
| assert!(!nfd.is_normalized("ä")); | ||
| ``` | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing etc. please visit the [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
Sorry, the diff of this file is too big to display
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Access to the Unicode properties or property-based operations that | ||
| //! are required for NFC and NFD. | ||
| //! | ||
| //! Applications should generally use the full normalizers that are | ||
| //! provided at the top level of this crate. However, the APIs in this | ||
| //! module are provided for callers such as HarfBuzz that specifically | ||
| //! want access to the raw canonical composition operation e.g. for use in a | ||
| //! glyph-availability-guided custom normalizer. | ||
| use crate::char_from_u16; | ||
| use crate::char_from_u32; | ||
| use crate::in_inclusive_range; | ||
| use crate::provider::CanonicalCompositions; | ||
| use crate::provider::DecompositionData; | ||
| use crate::provider::DecompositionTables; | ||
| use crate::provider::NonRecursiveDecompositionSupplement; | ||
| use crate::provider::NormalizerNfcV1; | ||
| use crate::provider::NormalizerNfdDataV1; | ||
| use crate::provider::NormalizerNfdSupplementV1; | ||
| use crate::provider::NormalizerNfdTablesV1; | ||
| use crate::trie_value_has_ccc; | ||
| use crate::CanonicalCombiningClass; | ||
| use crate::BACKWARD_COMBINING_MARKER; | ||
| use crate::FDFA_MARKER; | ||
| use crate::HANGUL_L_BASE; | ||
| use crate::HANGUL_N_COUNT; | ||
| use crate::HANGUL_S_BASE; | ||
| use crate::HANGUL_S_COUNT; | ||
| use crate::HANGUL_T_BASE; | ||
| use crate::HANGUL_T_COUNT; | ||
| use crate::HANGUL_V_BASE; | ||
| use crate::HIGH_ZEROS_MASK; | ||
| use crate::LOW_ZEROS_MASK; | ||
| use crate::NON_ROUND_TRIP_MARKER; | ||
| use icu_provider::prelude::*; | ||
| /// Borrowed version of the raw canonical composition operation. | ||
| /// | ||
| /// Holds a reference to the `CanonicalCompositions` data that `compose` consults. | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API; it is | ||
| /// provided for callers such as HarfBuzz that specifically want access to the raw | ||
| /// canonical composition operation, e.g. for use in a glyph-availability-guided custom normalizer. | ||
| #[derive(Debug, Copy, Clone)] | ||
| pub struct CanonicalCompositionBorrowed<'a> { | ||
| canonical_compositions: &'a CanonicalCompositions<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalCompositionBorrowed<'static> { | ||
| /// Same as [`CanonicalCompositionBorrowed::new`]: backed by compiled data. | ||
| fn default() -> Self { | ||
| CanonicalCompositionBorrowed::new() | ||
| } | ||
| } | ||
| impl CanonicalCompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`] | ||
| /// by wrapping the static data reference with `DataPayload::from_static_ref`. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalComposition { | ||
| let Self { | ||
| canonical_compositions, | ||
| } = self; | ||
| CanonicalComposition { | ||
| canonical_compositions: DataPayload::from_static_ref(canonical_compositions), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| CanonicalCompositionBorrowed { | ||
| canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCompositionBorrowed<'_> { | ||
| /// Canonically composes `starter` with `second` (Hangul included). | ||
| /// | ||
| /// Returns `None` if the two characters don't compose. | ||
| /// Composition exclusions are taken into account. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters | ||
| /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä')); | ||
| /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ')); | ||
| /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion | ||
| /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter | ||
| /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV | ||
| /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT | ||
| /// ``` | ||
| #[inline(always)] | ||
| pub fn compose(self, starter: char, second: char) -> Option<char> { | ||
| // Hand the composition data iterator to the crate-level helper. | ||
| let compositions = self.canonical_compositions.canonical_compositions.iter(); | ||
| crate::compose(compositions, starter, second) | ||
| } | ||
| } | ||
| /// The raw canonical composition operation (owning its data as a `DataPayload`). | ||
| /// | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to the raw canonical composition operation, e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. Use `as_borrowed` to obtain the query API. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalComposition { | ||
| canonical_compositions: DataPayload<NormalizerNfcV1>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalComposition { | ||
| /// Builds the owned type from the compiled-data borrowed instance. | ||
| fn default() -> Self { | ||
| let borrowed = CanonicalCompositionBorrowed::new(); | ||
| borrowed.static_to_owned() | ||
| } | ||
| } | ||
| impl CanonicalComposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> { | ||
| let canonical_compositions = self.canonical_compositions.get(); | ||
| CanonicalCompositionBorrowed { | ||
| canonical_compositions, | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCompositionBorrowed<'static> { | ||
| CanonicalCompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfcV1> + ?Sized, | ||
| { | ||
| // Load the composition data; a load failure is propagated to the caller. | ||
| let canonical_compositions = | ||
| DataProvider::<NormalizerNfcV1>::load(provider, Default::default())?.payload; | ||
| Ok(Self { | ||
| canonical_compositions, | ||
| }) | ||
| } | ||
| } | ||
| /// The outcome of non-recursive canonical decomposition of a single character. | ||
| #[allow(clippy::exhaustive_enums)] | ||
| #[derive(Debug, PartialEq, Eq)] | ||
| pub enum Decomposed { | ||
| /// The character is its own canonical decomposition (nothing to replace). | ||
| Default, | ||
| /// The character decomposes to a single different character. | ||
| Singleton(char), | ||
| /// The character decomposes to two characters, given in order (first, second). | ||
| Expansion(char, char), | ||
| } | ||
| /// Borrowed version of the raw (non-recursive) canonical decomposition operation. | ||
| /// | ||
| /// Holds references to the decomposition trie data, its tables, and the non-recursive supplement. | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API; it is | ||
| /// provided for callers such as HarfBuzz that specifically want access to | ||
| /// non-recursive canonical decomposition, e.g. for use in a glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecompositionBorrowed<'a> { | ||
| decompositions: &'a DecompositionData<'a>, | ||
| tables: &'a DecompositionTables<'a>, | ||
| non_recursive: &'a NonRecursiveDecompositionSupplement<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalDecompositionBorrowed<'static> { | ||
| /// Same as [`CanonicalDecompositionBorrowed::new`]: backed by compiled data. | ||
| fn default() -> Self { | ||
| CanonicalDecompositionBorrowed::new() | ||
| } | ||
| } | ||
| impl CanonicalDecompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`] | ||
| /// by wrapping each static data reference with `DataPayload::from_static_ref`. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalDecomposition { | ||
| let Self { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| } = self; | ||
| CanonicalDecomposition { | ||
| decompositions: DataPayload::from_static_ref(decompositions), | ||
| tables: DataPayload::from_static_ref(tables), | ||
| non_recursive: DataPayload::from_static_ref(non_recursive), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| // Compile-time guard: the combined length of the two compiled tables must not exceed 0xFFF. | ||
| const _: () = { | ||
| let tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1; | ||
| assert!( | ||
| tables.scalars16.const_len() + tables.scalars24.const_len() <= 0xFFF, | ||
| "future extension" | ||
| ); | ||
| }; | ||
| CanonicalDecompositionBorrowed { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, | ||
| non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalDecompositionBorrowed<'_> { | ||
| /// Performs non-recursive canonical decomposition (including for Hangul). | ||
| /// | ||
| /// ``` | ||
| /// use icu::normalizer::properties::Decomposed; | ||
| /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(decomp.decompose('e'), Decomposed::Default); | ||
| /// assert_eq!( | ||
| /// decomp.decompose('ệ'), | ||
| /// Decomposed::Expansion('ẹ', '\u{0302}') | ||
| /// ); | ||
| /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ')); | ||
| /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia | ||
| /// ``` | ||
| #[inline] | ||
| pub fn decompose(&self, c: char) -> Decomposed { | ||
| let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE); | ||
| if lvt >= HANGUL_S_COUNT { | ||
| return self.decompose_non_hangul(c); | ||
| } | ||
| // Invariant: lvt < HANGUL_S_COUNT (11172 in the Unicode Standard's Hangul algorithm), so `c` is a precomposed Hangul syllable | ||
| let t = lvt % HANGUL_T_COUNT; | ||
| // Invariant: t < HANGUL_T_COUNT = 28, because t is a remainder modulo HANGUL_T_COUNT | ||
| if t == 0 { | ||
| let l = lvt / HANGUL_N_COUNT; | ||
| // Invariant: l < (HANGUL_S_COUNT / HANGUL_N_COUNT = 11172 / 588 = 19) | ||
| let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT; | ||
| // Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21) | ||
| return Decomposed::Expansion( | ||
| // SAFETY: HANGUL_L_BASE and HANGUL_V_BASE are 0x1nnn; adding l (< 19) and v (< 21) | ||
| // keeps both sums far below 0xD800, where the surrogate range starts | ||
| unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) }, | ||
| ); | ||
| } | ||
| let lv = lvt - t; | ||
| // Invariant: lv < lvt < HANGUL_S_COUNT and 0 < t < HANGUL_T_COUNT | ||
| // Both sums below are therefore known to be valid scalar values | ||
| Decomposed::Expansion( | ||
| // SAFETY: HANGUL_S_BASE + lv equals `c` minus t, i.e. an earlier Hangul syllable (below 0xD800); | ||
| // HANGUL_T_BASE is 0x1nnn and t < 28 is added to it, which also stays below 0xD800 | ||
| unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) }, | ||
| ) | ||
| } | ||
| /// Performs non-recursive canonical decomposition except Hangul syllables | ||
| /// are reported as `Decomposed::Default`. | ||
| #[inline(always)] | ||
| fn decompose_non_hangul(&self, c: char) -> Decomposed { | ||
| let decomposition = self.decompositions.trie.get(c); | ||
| // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, | ||
| // and that flag needs to be ignored here. | ||
| if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { | ||
| return Decomposed::Default; | ||
| } | ||
| // The loop body runs at most once: `break` serves as a forward goto to the supplementary-trie lookup after the loop | ||
| #[expect(clippy::never_loop)] | ||
| loop { | ||
| let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0; | ||
| let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0; | ||
| if !high_zeros && !low_zeros { | ||
| // Decomposition into two BMP characters: starter and non-starter | ||
| if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') { | ||
| // Look in the other trie due to oxia singleton | ||
| // mappings to corresponding character with tonos. | ||
| break; | ||
| } | ||
| let starter = char_from_u32(decomposition & 0x7FFF); | ||
| let combining = char_from_u32((decomposition >> 15) & 0x7FFF); | ||
| return Decomposed::Expansion(starter, combining); | ||
| } | ||
| if high_zeros { | ||
| // Decomposition into one BMP character or non-starter | ||
| if trie_value_has_ccc(decomposition) { | ||
| // Non-starter | ||
| if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') { | ||
| return Decomposed::Default; | ||
| } | ||
| return match c { | ||
| '\u{0340}' => { | ||
| // COMBINING GRAVE TONE MARK | ||
| Decomposed::Singleton('\u{0300}') | ||
| } | ||
| '\u{0341}' => { | ||
| // COMBINING ACUTE TONE MARK | ||
| Decomposed::Singleton('\u{0301}') | ||
| } | ||
| '\u{0343}' => { | ||
| // COMBINING GREEK KORONIS | ||
| Decomposed::Singleton('\u{0313}') | ||
| } | ||
| '\u{0344}' => { | ||
| // COMBINING GREEK DIALYTIKA TONOS | ||
| Decomposed::Expansion('\u{0308}', '\u{0301}') | ||
| } | ||
| '\u{0F73}' => { | ||
| // TIBETAN VOWEL SIGN II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F72}') | ||
| } | ||
| '\u{0F75}' => { | ||
| // TIBETAN VOWEL SIGN UU | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F74}') | ||
| } | ||
| '\u{0F81}' => { | ||
| // TIBETAN VOWEL SIGN REVERSED II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F80}') | ||
| } | ||
| _ => Decomposed::Default, | ||
| }; | ||
| } | ||
| let singleton = decomposition as u16; | ||
| debug_assert_ne!( | ||
| singleton, FDFA_MARKER, | ||
| "How come we got the U+FDFA NFKD marker here?" | ||
| ); | ||
| return Decomposed::Singleton(char_from_u16(singleton)); | ||
| } | ||
| if c == '\u{212B}' { | ||
| // ANGSTROM SIGN | ||
| return Decomposed::Singleton('\u{00C5}'); | ||
| } | ||
| // Only 12 of 14 bits used as of Unicode 16. | ||
| let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1; | ||
| // Only 3 of 4 bits used as of Unicode 16. | ||
| let len_bits = decomposition & 0b1111; | ||
| let tables = self.tables; | ||
| if offset < tables.scalars16.len() { | ||
| if len_bits != 0 { | ||
| // i.e. logical len isn't 2 | ||
| break; | ||
| } | ||
| if let Some(first) = tables.scalars16.get(offset) { | ||
| if let Some(second) = tables.scalars16.get(offset + 1) { | ||
| // Two BMP starters | ||
| return Decomposed::Expansion(char_from_u16(first), char_from_u16(second)); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let len = len_bits + 1; | ||
| if len > 2 { | ||
| break; | ||
| } | ||
| let offset24 = offset - tables.scalars16.len(); | ||
| if let Some(first_c) = tables.scalars24.get(offset24) { | ||
| if len == 1 { | ||
| return Decomposed::Singleton(first_c); | ||
| } | ||
| if let Some(second_c) = tables.scalars24.get(offset24 + 1) { | ||
| return Decomposed::Expansion(first_c, second_c); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let non_recursive = self.non_recursive; | ||
| let non_recursive_decomposition = non_recursive.trie.get(c); | ||
| if non_recursive_decomposition == 0 { | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let trail_or_complex = (non_recursive_decomposition >> 16) as u16; | ||
| let lead = non_recursive_decomposition as u16; | ||
| if lead != 0 && trail_or_complex != 0 { | ||
| // Decomposition into two BMP characters | ||
| return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex)); | ||
| } | ||
| if lead != 0 { | ||
| // Decomposition into one BMP character | ||
| return Decomposed::Singleton(char_from_u16(lead)); | ||
| } | ||
| // Decomposition into two non-BMP characters | ||
| // `trail_or_complex` (the high half) is the offset into a table plus one to keep it non-zero. | ||
| let offset = usize::from(trail_or_complex - 1); | ||
| if let Some(first) = non_recursive.scalars24.get(offset) { | ||
| if let Some(second) = non_recursive.scalars24.get(offset + 1) { | ||
| return Decomposed::Expansion(first, second); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| Decomposed::Default | ||
| } | ||
| } | ||
| /// The raw (non-recursive) canonical decomposition operation (owning its data as `DataPayload`s). | ||
| /// | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to non-recursive canonical decomposition, e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. Use `as_borrowed` to obtain the query API. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecomposition { | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| tables: DataPayload<NormalizerNfdTablesV1>, | ||
| non_recursive: DataPayload<NormalizerNfdSupplementV1>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalDecomposition { | ||
| /// Builds the owned type from the compiled-data borrowed instance. | ||
| fn default() -> Self { | ||
| let borrowed = CanonicalDecompositionBorrowed::new(); | ||
| borrowed.static_to_owned() | ||
| } | ||
| } | ||
| impl CanonicalDecomposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> { | ||
| let decompositions = self.decompositions.get(); | ||
| let tables = self.tables.get(); | ||
| let non_recursive = self.non_recursive.get(); | ||
| CanonicalDecompositionBorrowed { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalDecompositionBorrowed<'static> { | ||
| CanonicalDecompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfdSupplementV1> | ||
| + ?Sized, | ||
| { | ||
| let decompositions = | ||
| DataProvider::<NormalizerNfdDataV1>::load(provider, Default::default())?.payload; | ||
| let tables = | ||
| DataProvider::<NormalizerNfdTablesV1>::load(provider, Default::default())?.payload; | ||
| let combined_len = tables.get().scalars16.len() + tables.get().scalars24.len(); | ||
| if combined_len > 0xFFF { | ||
| // The data is from a future where there exists a normalization flavor whose | ||
| // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points | ||
| // of space. If a good use case from such a decomposition flavor arises, we can | ||
| // dynamically change the bit masks so that the length mask becomes 0x1FFF instead | ||
| // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, | ||
| // since for now the masks are hard-coded, error out. | ||
| return Err(DataError::custom("future extension")); | ||
| } | ||
| // The supplement is loaded only after the table size check has passed. | ||
| let non_recursive = | ||
| DataProvider::<NormalizerNfdSupplementV1>::load(provider, Default::default())?.payload; | ||
| Ok(Self { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| }) | ||
| } | ||
| } | ||
| /// Borrowed version of lookup of the Canonical_Combining_Class Unicode property. | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// use icu::properties::props::CanonicalCombiningClass; | ||
| /// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed; | ||
| /// | ||
| /// let map = CanonicalCombiningClassMapBorrowed::new(); | ||
| /// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A | ||
| /// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMapBorrowed<'a> { | ||
| /// The decomposition data whose trie values the lookups read the combining class from | ||
| decompositions: &'a DecompositionData<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalCombiningClassMapBorrowed<'static> { | ||
| /// Same as [`CanonicalCombiningClassMapBorrowed::new`]: backed by compiled data. | ||
| fn default() -> Self { | ||
| CanonicalCombiningClassMapBorrowed::new() | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`] | ||
| /// by wrapping the static data reference with `DataPayload::from_static_ref`. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalCombiningClassMap { | ||
| let Self { decompositions } = self; | ||
| CanonicalCombiningClassMap { | ||
| decompositions: DataPayload::from_static_ref(decompositions), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Self { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'_> { | ||
| /// Returns the canonical combining class of the scalar value `c` as a raw `u8`. | ||
| /// | ||
| /// You may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass` instead. | ||
| #[inline(always)] | ||
| pub fn get_u8(&self, c: char) -> u8 { | ||
| let scalar = u32::from(c); | ||
| self.get32_u8(scalar) | ||
| } | ||
| /// Returns the canonical combining class, as a raw `u8`, of a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `Not_Reordered` is returned. | ||
| /// | ||
| /// You may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass` instead. | ||
| pub fn get32_u8(&self, c: u32) -> u8 { | ||
| let trie_value = self.decompositions.trie.get32(c); | ||
| if !trie_value_has_ccc(trie_value) { | ||
| // No combining class stored in this trie value: report Not_Reordered. | ||
| return ccc!(NotReordered, 0).to_icu4c_value(); | ||
| } | ||
| // The class is taken from the low byte of the trie value. | ||
| trie_value as u8 | ||
| } | ||
| /// Returns the typed canonical combining class of the scalar value `c`. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[inline(always)] | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get(&self, c: char) -> CanonicalCombiningClass { | ||
| let raw = self.get_u8(c); | ||
| CanonicalCombiningClass::from_icu4c_value(raw) | ||
| } | ||
| /// Returns the typed canonical combining class of a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `CanonicalCombiningClass::NotReordered` is returned. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get32(&self, c: u32) -> CanonicalCombiningClass { | ||
| let raw = self.get32_u8(c); | ||
| CanonicalCombiningClass::from_icu4c_value(raw) | ||
| } | ||
| } | ||
| /// Lookup of the Canonical_Combining_Class Unicode property (owning its data; query via `as_borrowed`). | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMap { | ||
| /// The decomposition data payload that `as_borrowed` hands out a reference to | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalCombiningClassMap { | ||
| /// Builds the owned type from the compiled-data borrowed instance. | ||
| fn default() -> Self { | ||
| let borrowed = CanonicalCombiningClassMapBorrowed::new(); | ||
| borrowed.static_to_owned() | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMap { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> { | ||
| let decompositions = self.decompositions.get(); | ||
| CanonicalCombiningClassMapBorrowed { decompositions } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> { | ||
| CanonicalCombiningClassMapBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ]); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> + ?Sized, | ||
| { | ||
| // Load the decomposition data; a load failure is propagated to the caller. | ||
| let decompositions = | ||
| DataProvider::<NormalizerNfdDataV1>::load(provider, Default::default())?.payload; | ||
| Ok(Self { decompositions }) | ||
| } | ||
| } | ||
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. | ||
| //! | ||
| //! <div class="stab unstable"> | ||
| //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| //! including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| //! to be stable, their Rust representation might not be. Use with caution. | ||
| //! </div> | ||
| //! | ||
| //! Read more about data providers: [`icu_provider`] | ||
| // Provider structs must be stable | ||
| #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] | ||
| use icu_collections::char16trie::Char16Trie; | ||
| use icu_collections::codepointtrie::CodePointTrie; | ||
| use icu_provider::prelude::*; | ||
| use zerovec::ZeroVec; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[derive(Debug)] | ||
| /// Baked data: the unit struct passed to the `make_provider!` and `impl_*!` macros in the anonymous `const` that follows. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. In particular, the `DataProvider` implementations are only | ||
| /// guaranteed to match with this version's `*_unstable` providers. Use with caution. | ||
| /// </div> | ||
| pub struct Baked; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(unused_imports)] | ||
| const _: () = { | ||
| use icu_normalizer_data::*; | ||
| pub mod icu { | ||
| pub use crate as normalizer; | ||
| pub use icu_collections as collections; | ||
| } | ||
| make_provider!(Baked); | ||
| impl_normalizer_nfc_v1!(Baked); | ||
| impl_normalizer_nfd_data_v1!(Baked); | ||
| impl_normalizer_nfd_supplement_v1!(Baked); | ||
| impl_normalizer_nfd_tables_v1!(Baked); | ||
| impl_normalizer_nfkd_data_v1!(Baked); | ||
| impl_normalizer_nfkd_tables_v1!(Baked); | ||
| impl_normalizer_uts46_data_v1!(Baked); | ||
| }; | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for canonical decomposition. | ||
| NormalizerNfdDataV1, | ||
| "normalizer/nfd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data (decomposition tables) for canonical decomposition. | ||
| NormalizerNfdTablesV1, | ||
| "normalizer/nfd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for compatibility decomposition. | ||
| NormalizerNfkdDataV1, | ||
| "normalizer/nfkd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data (decomposition tables) for compatibility decomposition. | ||
| NormalizerNfkdTablesV1, | ||
| "normalizer/nfkd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for UTS-46 decomposition. | ||
| NormalizerUts46DataV1, | ||
| "normalizer/uts46/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for composition. | ||
| NormalizerNfcV1, | ||
| "normalizer/nfc/v1", | ||
| CanonicalCompositions<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for non-recursive decomposition. | ||
| NormalizerNfdSupplementV1, | ||
| "normalizer/nfd/supplement/v1", | ||
| NonRecursiveDecompositionSupplement<'static>, | ||
| is_singleton = true | ||
| ); | ||
| #[cfg(feature = "datagen")] | ||
| /// The latest minimum set of markers required by this component. | ||
| /// | ||
| /// Contains the `INFO` constant of each of the seven data markers declared in this | ||
| /// module. Only available when the `datagen` feature is enabled. | ||
| pub const MARKERS: &[DataMarkerInfo] = &[ | ||
| NormalizerNfcV1::INFO, | ||
| NormalizerNfdDataV1::INFO, | ||
| NormalizerNfdTablesV1::INFO, | ||
| NormalizerNfkdDataV1::INFO, | ||
| NormalizerNfkdTablesV1::INFO, | ||
| NormalizerNfdSupplementV1::INFO, | ||
| NormalizerUts46DataV1::INFO, | ||
| ]; | ||
| /// Decomposition data | ||
| /// | ||
| /// This is the payload type of the `NormalizerNfdDataV1`, `NormalizerNfkdDataV1`, and | ||
| /// `NormalizerUts46DataV1` markers. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionData<'data> { | ||
| /// Trie for decomposition (a `CodePointTrie` with `u32` values). | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// The passthrough bounds of NFD/NFC are lowered to this | ||
| /// maximum instead. (16-bit, because cannot be higher | ||
| /// than 0x0300, which is the bound for NFC.) | ||
| pub passthrough_cap: u16, | ||
| } | ||
| // NOTE(review): presumably registers the struct for use as a data marker payload, with the | ||
| // `datagen`-gated part covering export support — confirm against `icu_provider::data_struct!`. | ||
| icu_provider::data_struct!( | ||
| DecompositionData<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// The expansion tables for cases where the decomposition isn't | ||
| /// contained in the trie value | ||
| /// | ||
| /// This is the payload type of the `NormalizerNfdTablesV1` and `NormalizerNfkdTablesV1` | ||
| /// markers. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionTables<'data> { | ||
| /// Decompositions that are fully within the BMP | ||
| /// (stored as `u16` units). | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars16: ZeroVec<'data, u16>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP (stored as `char`s). | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| // NOTE(review): presumably registers the struct for use as a data marker payload, with the | ||
| // `datagen`-gated part covering export support — confirm against `icu_provider::data_struct!`. | ||
| icu_provider::data_struct!( | ||
| DecompositionTables<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-Hangul canonical compositions | ||
| /// | ||
| /// This is the payload type of the `NormalizerNfcV1` marker. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct CanonicalCompositions<'data> { | ||
| /// Trie keys are two-`char` strings with the second | ||
| /// character coming first. The value, if any, is the | ||
| /// (non-Hangul) canonical composition. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub canonical_compositions: Char16Trie<'data>, | ||
| } | ||
| // NOTE(review): presumably registers the struct for use as a data marker payload, with the | ||
| // `datagen`-gated part covering export support — confirm against `icu_provider::data_struct!`. | ||
| icu_provider::data_struct!( | ||
| CanonicalCompositions<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-recursive canonical decompositions that differ from | ||
| /// `DecompositionData`. | ||
| /// | ||
| /// This is the payload type of the `NormalizerNfdSupplementV1` marker. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct NonRecursiveDecompositionSupplement<'data> { | ||
| /// Trie for the supplementary non-recursive decompositions | ||
| /// (a `CodePointTrie` with `u32` values). | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP (stored as `char`s). | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| // NOTE(review): presumably registers the struct for use as a data marker payload, with the | ||
| // `datagen`-gated part covering export support — confirm against `icu_provider::data_struct!`. | ||
| icu_provider::data_struct!( | ||
| NonRecursiveDecompositionSupplement<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Bundles the part of UTS 46 that makes sense to implement as a | ||
| //! normalization. | ||
| //! | ||
| //! This is meant to be used as a building block of a UTS 46 | ||
| //! implementation, such as the `idna` crate. | ||
| use crate::ComposingNormalizer; | ||
| use crate::ComposingNormalizerBorrowed; | ||
| use crate::NormalizerNfcV1; | ||
| use crate::NormalizerNfdTablesV1; | ||
| use crate::NormalizerNfkdTablesV1; | ||
| use crate::NormalizerUts46DataV1; | ||
| use icu_provider::DataError; | ||
| use icu_provider::DataProvider; | ||
| // Implementation note: Despite merely wrapping a `ComposingNormalizer`, | ||
| // having a `Uts46Mapper` serves two purposes: | ||
| // | ||
| // 1. Denying public access to parts of the `ComposingNormalizer` API | ||
| // that don't work when the data contains markers for ignorables. | ||
| // 2. Providing a place where additional iterator pre-processing or | ||
| // post-processing can take place if needed in the future. (When | ||
| // writing this, it looked like such processing was needed but | ||
| // now isn't needed after all.) | ||
| /// A borrowed version of a mapper that knows how to perform the | ||
| /// subsets of UTS 46 processing documented on the methods. | ||
| #[derive(Debug)] | ||
| pub struct Uts46MapperBorrowed<'a> { | ||
| // The wrapped normalizer that every method of this type delegates to. | ||
| normalizer: ComposingNormalizerBorrowed<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for Uts46MapperBorrowed<'static> { | ||
| /// Equivalent to `Uts46MapperBorrowed::new()`, i.e. constructs the mapper with | ||
| /// compiled data. Only available when the `compiled_data` feature is enabled. | ||
| fn default() -> Self { | ||
| Self::new() | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'static> { | ||
| /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`]. | ||
| /// | ||
| /// The conversion consumes `self` and wraps the result of calling | ||
| /// `static_to_owned()` on the inner normalizer. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`]. | ||
| pub const fn static_to_owned(self) -> Uts46Mapper { | ||
| Uts46Mapper { | ||
| normalizer: self.normalizer.static_to_owned(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| /// | ||
| /// The inner normalizer is obtained from `ComposingNormalizerBorrowed::new_uts46()`. | ||
| /// Only available when the `compiled_data` feature is enabled. | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Uts46MapperBorrowed { | ||
| normalizer: ComposingNormalizerBorrowed::new_uts46(), | ||
| } | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'_> { | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the "Map" and "Normalize" steps of the "Processing" | ||
| /// section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are ignored. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| /// | ||
| /// The returned iterator borrows `self` and owns `iter` for the `'delegate` lifetime. | ||
| pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| // `IgnorableBehavior::Ignored` is what distinguishes this method from | ||
| // `normalize_validate` (item 1 of the list above). | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) | ||
| } | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the NFC check and status steps of the "Validity | ||
| /// Criteria" section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are treated as _disallowed_. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| /// * The output needs to be compared with input to see if anything | ||
| /// changed. This check catches failures to adhere to the normalization | ||
| /// and status requirements. In particular, this comparison results | ||
| /// in _mapped_ characters resulting in error like "Validity Criteria" | ||
| /// requires. | ||
| /// | ||
| /// The returned iterator borrows `self` and owns `iter` for the `'delegate` lifetime. | ||
| pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| // `IgnorableBehavior::ReplacementCharacter` is what distinguishes this method | ||
| // from `map_normalize` (item 1 of the list above). | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) | ||
| } | ||
| } | ||
| /// A mapper that knows how to perform the subsets of UTS 46 processing | ||
| /// documented on the methods. | ||
| /// | ||
| /// The processing methods live on [`Uts46MapperBorrowed`]; use `as_borrowed()` to reach them. | ||
| #[derive(Debug)] | ||
| pub struct Uts46Mapper { | ||
| // The wrapped owned normalizer; `as_borrowed()` borrows it. | ||
| normalizer: ComposingNormalizer, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for Uts46Mapper { | ||
| /// Constructs the mapper with compiled data. | ||
| /// | ||
| /// `Self::new()` returns the borrowed `'static` mapper, so it is converted to the | ||
| /// owned type with `static_to_owned()`. | ||
| fn default() -> Self { | ||
| Self::new().static_to_owned() | ||
| } | ||
| } | ||
| impl Uts46Mapper { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> { | ||
| Uts46MapperBorrowed { | ||
| normalizer: self.normalizer.as_borrowed(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| /// | ||
| /// Note that this returns a [`Uts46MapperBorrowed<'static>`] rather than `Self`; | ||
| /// call `static_to_owned()` on the result to obtain a [`Uts46Mapper`]. | ||
| #[cfg(feature = "compiled_data")] | ||
| // Returning the borrowed type from `new` is deliberate, hence the lint expectation. | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> Uts46MapperBorrowed<'static> { | ||
| Uts46MapperBorrowed::new() | ||
| } | ||
| /// Construct with provider. | ||
| /// | ||
| /// # Errors | ||
| /// | ||
| /// Returns the [`DataError`] from `ComposingNormalizer::try_new_uts46_unstable` | ||
| /// if constructing the inner normalizer from `provider` fails. | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerUts46DataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfkdTablesV1> | ||
| // UTS 46 tables merged into NormalizerNfkdTablesV1 | ||
| + DataProvider<NormalizerNfcV1> | ||
| + ?Sized, | ||
| { | ||
| let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; | ||
| Ok(Uts46Mapper { normalizer }) | ||
| } | ||
| } | ||
| # This is a placeholder in the interest of keeping the repository size smaller. | ||
| # Replace this file with the contents of | ||
| # https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually | ||
| # run the conformance test. |
| The test data comes from | ||
| https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt |
Sorry, the diff of this file is too big to display