icu_normalizer
Advanced tools
| { | ||
| "git": { | ||
| "sha1": "29dfe2790b6cfdab94ca6a6b69f58ce54802dbf7" | ||
| }, | ||
| "path_in_vcs": "components/normalizer" | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{criterion_group, criterion_main}; | ||
// One module per benchmark suite; the group below references a `criterion_benchmark`
// function from each of them.
| mod canonical_composition; | ||
| mod canonical_decomposition; | ||
| mod composing_normalizer_nfc; | ||
| mod composing_normalizer_nfkc; | ||
| mod decomposing_normalizer_nfd; | ||
| mod decomposing_normalizer_nfkd; | ||
| mod utf16_throughput; | ||
// Declares the Criterion group `benches`, listing every suite's entry point as a target.
| criterion_group!( | ||
| benches, | ||
| canonical_composition::criterion_benchmark, | ||
| canonical_decomposition::criterion_benchmark, | ||
| composing_normalizer_nfc::criterion_benchmark, | ||
| composing_normalizer_nfkc::criterion_benchmark, | ||
| decomposing_normalizer_nfd::criterion_benchmark, | ||
| decomposing_normalizer_nfkd::criterion_benchmark, | ||
| utf16_throughput::criterion_benchmark, | ||
| ); | ||
// Expands to the benchmark binary's entry point for the `benches` group.
| criterion_main!(benches); |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use detone::IterDecomposeVietnamese; | ||
| use icu_normalizer::properties::{ | ||
| CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed, | ||
| }; | ||
| use icu_normalizer::ComposingNormalizerBorrowed; | ||
/// One benchmark input for the canonical-composition suite.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark id.
    pub file_name: String,
    /// Character pairs handed to the composer, one `compose` call per pair.
    pub pairs: Vec<(char, char)>,
}
/// Returns `content` without its header lines.
///
/// A header line is any line that starts with `#`. The remaining lines are joined with a
/// single `\n`; line terminators of the input (including `\r\n`) and any trailing newline
/// are not carried over.
fn strip_headers(content: &str) -> String {
    // Joining the borrowed line slices directly avoids allocating a `String` per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 16] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| [ | ||
| BenchDataContent { | ||
| file_name: "TestNames_Latin".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_h".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_h.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_k".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_k.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Korean".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ar".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ar.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_de".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_de.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_el".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_el.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_es".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_es.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_fr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_fr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_he".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_he.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_pl".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_pl.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ru".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ru.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_th".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_th.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_tr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_tr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie_detone".to_owned(), | ||
| pairs: { | ||
| let result: Vec<(char, char)> = nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("data/wotw.txt"))) | ||
| .chars() | ||
| .filter_map(|c| { | ||
| let mut iter = std::iter::once(c).decompose_vietnamese_tones(true); | ||
| if let Some(base) = iter.next() { | ||
| iter.next().map(|tone| (base, tone)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| assert!(!result.is_empty()); | ||
| result | ||
| }, | ||
| }, | ||
| ] | ||
| } | ||
| fn function_under_bench( | ||
| canonical_composer: &CanonicalCompositionBorrowed, | ||
| composable_points: &[(char, char)], | ||
| ) { | ||
| for pair in composable_points.iter() { | ||
| canonical_composer.compose(pair.0, pair.1); | ||
| } | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_composition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let composer = CanonicalCompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } | ||
| fn decompose_data(nfc: &str) -> Vec<(char, char)> { | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| nfc.chars() | ||
| .map(|c| decomposer.decompose(c)) | ||
| .filter_map(|decomposed| { | ||
| if let Decomposed::Expansion(a, b) = decomposed { | ||
| Some((a, b)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect() | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::properties::CanonicalDecompositionBorrowed; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the canonical-decomposition suite.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark id.
    pub file_name: String,
    /// The data set normalized with the NFC normalizer.
    pub nfc: String,
    /// The data set normalized with the NFD normalizer.
    pub nfd: String,
    /// The data set normalized with the NFKC normalizer.
    pub nfkc: String,
    /// The data set normalized with the NFKD normalizer.
    pub nfkd: String,
}
/// Drops every line of `content` that begins with `#` and re-joins the rest with `\n`.
///
/// The result has no trailing newline, and `\r\n` terminators in the input come out as
/// plain `\n` separators.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices; the single allocation happens in `join`.
    let kept: Vec<&str> = content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect();
    kept.join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc_normalizer.normalize(raw_content).to_string(), | ||
| nfd: nfd_normalizer.normalize(raw_content).to_string(), | ||
| nfkc: nfkc_normalizer.normalize(raw_content).to_string(), | ||
| nfkd: nfkd_normalizer.normalize(raw_content).to_string(), | ||
| }) | ||
| } | ||
| #[cfg(debug_assertions)] | ||
| fn function_under_bench( | ||
| _canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| _decomposable_points: &str, | ||
| ) { | ||
| // using debug assertion fails some test. | ||
| // "cargo test --bench bench" will pass | ||
| // "cargo bench" will work as expected, because the profile doesn't include debug assertions. | ||
| } | ||
| #[cfg(not(debug_assertions))] | ||
| fn function_under_bench( | ||
| canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| decomposable_points: &str, | ||
| ) { | ||
| decomposable_points.chars().for_each(|point| { | ||
| canonical_decomposer.decompose(point); | ||
| }); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_decomposition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the NFC composing-normalizer suite.
///
/// Holds the same data set in four normalization forms, each as a `String` and as UTF-16
/// code units.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark id.
    pub file_name: String,
    /// NFC form of the data set.
    pub nfc: String,
    /// NFD form of the data set.
    pub nfd: String,
    /// NFKC form of the data set.
    pub nfkc: String,
    /// NFKD form of the data set.
    pub nfkd: String,
    /// UTF-16 encoding of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Returns `content` without its header lines.
///
/// A header line is any line that starts with `#`. The remaining lines are joined with a
/// single `\n`; line terminators of the input (including `\r\n`) and any trailing newline
/// are not carried over.
fn strip_headers(content: &str) -> String {
    // Joining the borrowed line slices directly avoids allocating a `String` per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF_16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkc_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkd_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input for the NFKC composing-normalizer suite.
///
/// Holds the same data set in four normalization forms, each as a `String` and as UTF-16
/// code units.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark id.
    pub file_name: String,
    /// NFC form of the data set.
    pub nfc: String,
    /// NFD form of the data set.
    pub nfd: String,
    /// NFKC form of the data set.
    pub nfkc: String,
    /// NFKD form of the data set.
    pub nfkd: String,
    /// UTF-16 encoding of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Drops every line of `content` that begins with `#` and re-joins the rest with `\n`.
///
/// The result has no trailing newline, and `\r\n` terminators in the input come out as
/// plain `\n` separators.
fn strip_headers(content: &str) -> String {
    // Collect borrowed slices; the single allocation happens in `join`.
    let kept: Vec<&str> = content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect();
    kept.join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfkc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| # Generating microbench data | ||
| The full versions of these files are located | ||
| [in the ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data). | ||
| ## Sanitizing the file | ||
| ```shell | ||
| sed -i '/^#/d' ${filename} | ||
| sed -i '/^$/d' ${filename} | ||
| ``` | ||
| ## Shuffling the file | ||
| ```shell | ||
| shuf -n 20 ${filename} -o ${filename} | ||
| ``` | ||
| ## Adding back the header (if you plan on submitting the files) | ||
| ``` | ||
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ``` |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| かげやま,みのる | ||
| むらかみ,とおる | ||
| つじさわ,けい | ||
| やすい,たかゆき | ||
| むらさき,としお | ||
| はせがわ,ひであき | ||
| うるしばら,よしひこ | ||
| ままだ,ひろし | ||
| おおぼら,えいじろう | ||
| おおば,まさひで | ||
| きたばたけ,たかひこ | ||
| はまさき,あつし | ||
| ほりい,つねお | ||
| もり,だいいち | ||
| いとう,しんいち | ||
| くにもと,じゅんじ | ||
| おか,のりひと | ||
| たに,よしあき | ||
| しらがき,ひろあき | ||
| しらはま,たけひろ | ||
| むらかみ,やすひろ | ||
| うめはら,たかし | ||
| いわた,ひろし | ||
| すぎえ,かつとし | ||
| てらにし,ひろみつ | ||
| まつおか,だいすけ | ||
| もろほし,すすむ | ||
| いしはら,たかし | ||
| おしま,ひろお | ||
| なかお,ゆうじ | ||
| いかり,はるお | ||
| きまち,まさき | ||
| ふるかわ,みちお | ||
| かねこ,しゅうへい | ||
| なかがわ,ともみ | ||
| ささき,しんご | ||
| うちだ,たくじ | ||
| うめだ,さかえ | ||
| しばた,いくこ | ||
| まきした,けいこ | ||
| まつもと,しんいちろう | ||
| たかの,かずよし | ||
| いしわた,なおひさ | ||
| いうち,まこと | ||
| いまい,りほ | ||
| みずた,のりあき | ||
| かくたに,まなぶ | ||
| わだ,ほまれ | ||
| わかまつ,かずき | ||
| かわぐち,ひろき |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ホリモト,ユウジ | ||
| ハナミ,ヤスヒデ | ||
| イシザカ,タカユキ | ||
| ゼンケ,トシオ | ||
| ハトリ,ユウコ | ||
| ナガオカ,トモユキ | ||
| コウダ,ケンイチ | ||
| イシダ,ヒロシ | ||
| ミワ,シゲユキ | ||
| イシカワ,ヒロシ | ||
| スズキ,ユウスケ | ||
| オクダ,ヨシノリ | ||
| シムラ,サカエ | ||
| エビシマ,ヤスユキ | ||
| イブカ,ヨシテル | ||
| タノ,マコト | ||
| ドウゾノ,セイヤ | ||
| ヤマナカ,サツミ | ||
| トミイエ,ハヤト | ||
| アザミ,ツトム | ||
| タナカ,キョウコ | ||
| コジマ,アツシ | ||
| フミハラ,カオリ | ||
| スズキ,マサユキ | ||
| ナトリ,ケンヤ | ||
| スズキ,ユウコ | ||
| スズキ,ヒサエ | ||
| ナカガワ,カツヨシ | ||
| スズキ,マサフミ | ||
| マツヤマ,トシオ | ||
| ヨシナガ,チカエ | ||
| キタムラ,リカコ | ||
| アオキ,タクオ | ||
| ヤマグチ,ヤスヒロ | ||
| スギムラ,シゲオ | ||
| ウエスギ,マサミ | ||
| マツムラ,シンイチ | ||
| クバ,タカシ | ||
| スドウ,タカトシ | ||
| フジモト,ヒロシ | ||
| イトウ,シュウイチ | ||
| コバヤシ,カズミ | ||
| タナカ,ヒロカツ | ||
| イシダ,ツカサ | ||
| ヤマダ,マサコ | ||
| カミヤ,トミエ | ||
| タケモト,ユウジ | ||
| スミノ,コウジ | ||
| ヒロハタ,タクヤ | ||
| ミヒラ,リョウヘイ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| 김명희 | ||
| 홍차수 | ||
| 허순재 | ||
| 강영휘 | ||
| 김운주 | ||
| 이종환 | ||
| 이은국 | ||
| 강태호 | ||
| 강일래 | ||
| 김동현 | ||
| 곽기자 | ||
| 차재수 | ||
| 표봉기 | ||
| 문대원 | ||
| 이형기 | ||
| 최교표 | ||
| 박식현 | ||
| 홍종립 | ||
| 서창수 | ||
| 김쌍건 | ||
| 서말도 | ||
| 이병훈 | ||
| 김희수 | ||
| 박학태 | ||
| 강태종 | ||
| 조문란 | ||
| 신범균 | ||
| 백두진 | ||
| 이철정 | ||
| 김태중 | ||
| 이성현 | ||
| 김주조 | ||
| 김강행 | ||
| 이정길 | ||
| 김완일 | ||
| 권수자 | ||
| 이춘철 | ||
| 김판근 | ||
| 김곡리 | ||
| 이경형 | ||
| 이운만 | ||
| 손상철 | ||
| 유기숙 | ||
| 박정한 | ||
| 조윤래 | ||
| 유신호 | ||
| 이두수 | ||
| 김재률 | ||
| 김성홍 | ||
| 김혜경 |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| González, Joan | ||
| Reinders, Jim | ||
| Applebroog, Ida | ||
| Kidd, Joseph Bartholomew | ||
| Gulácsy, Lajos | ||
| Letendre, Rita | ||
| Zuccaro, Federico | ||
| Apt the Elder, Ulrich | ||
| Drummond, Arthur | ||
| Manley, Thomas | ||
| Broc, Jean | ||
| Ramunno, Tony | ||
| Simone dei Crocifissi | ||
| Lane, Theodore | ||
| Symonds, William Robert | ||
| Johnson, Frank Tenney | ||
| Cox, Gardner | ||
| Bunbury, Charles | ||
| Pedro de la Cuadra | ||
| Payne, William | ||
| Lucas, John Seymour | ||
| Holsman, Elizabeth T. | ||
| de Vries, Auke | ||
| Laszlo, Philip Alexius de | ||
| Shigemasa | ||
| Wolfe, Ruth Mitchell | ||
| Buck, John | ||
| Baselitz, Georg | ||
| Hook, Walter | ||
| Segall, Lasar | ||
| Brush, George deForest | ||
| Master of Jánosrét | ||
| Sutherland, Elizabeth Leveson-Gower, Countess of | ||
| Tuckerman, Jane | ||
| Varley, F.H. | ||
| Fosso, Samuel | ||
| Gardner, Daniel | ||
| Sadler, Walter Dendy | ||
| Clausen, Franciska | ||
| Coman, Charlotte Buell | ||
| Wakelin, Roland | ||
| Payne, Jon, CML | ||
| Campagna, Girolamo | ||
| Wiener, Phyllis | ||
| Sallee, Charles | ||
| Fitzgerald, John Anster | ||
| Gribbroek, Robert | ||
| Laporte, John | ||
| Lévy-Dhurmer, Lucien | ||
| Young, Stephen Scott |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ณรงค์ โต๊ะเงิน | ||
| กิตติ บุญวันต์ | ||
| สมหมาย ดาบทองดี | ||
| ธวัชชัย อิสระนิมิตร | ||
| วรรณา โสภณนรินทร์ | ||
| วินัย หมู่มิ่ง | ||
| พัชรี ชูจิรวงศ์ | ||
| สมปอง จิวไพโรจน์กิจ | ||
| บุญส่ง กวยรักษา | ||
| นิพนธ์ นิ่มใหม่ | ||
| พัชรี สุวพรศิลป์ | ||
| เจริญ นววัฒนทรัพย์ | ||
| อรพินท์ แซ่เจี่ย | ||
| ชัยพร สมใจนึก | ||
| ประนอม โคศิลา | ||
| ฉวีวรรณ ศรสังข์ทอง | ||
| วัชรา เจริญรัตนพร | ||
| สุภัท นกศิริ | ||
| อู๋ มาลาเล็ก | ||
| ประยูร ไชโย | ||
| ละออ อยู่ยืนยง | ||
| สมใจ วิวัฒน์วานิช | ||
| จุมพล จันทรศรีเกษร | ||
| พุฒ ดอกไม้จีน | ||
| บุญชัย วรกิจพรสิน | ||
| สมาน ธูปเทียน | ||
| พงศ์ศักดิ์ แซ่แต้ | ||
| อำนาจ ไวจงเจริญ | ||
| พรทิพย์ แซ่ลี้ | ||
| อุไรวรรณ สาครสินธุ์ | ||
| อำพล วีระตะนนท์ | ||
| สมจิตร ใจวังโลก | ||
| สุเทพ ตันวินิจ | ||
| สวาท ทรัพย์มาก | ||
| สมศักดิ์ เจือจันทร์ | ||
| ดัสซันซิงห์ กุลาตี | ||
| ธีร ศรแก้ว | ||
| พรรณยุพา ฮ่อสกุล | ||
| สำราญ จันทร์เอี่ยม | ||
| พจน์ มั่นกันนาน | ||
| สุธี บุณยเกียรติ | ||
| บุญโชติ ทิพย์ประเสริฐสิน | ||
| ประดิษฐ์ ทองพสิฐสมบัติ | ||
| จำเนียร เพ็งเจริญ | ||
| สมศักดิ์ อรุณรัตน์ | ||
| อนุชา จารุหิรัญสกุล | ||
| พิกุล มโนภิญโญภิญญะ | ||
| ผ่องศรี นกแก้ว | ||
| อารี วิไลวรรณ | ||
| ณรงค์วิทย์ วิทสัทธาวรกุล |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ممارسة مراعاة | ||
| العنصرية | ||
| حدود والشيخوخة | ||
| بالحكم كهذا ينتفع | ||
| البلاد | ||
| تربية | ||
| الغير التقدم والعدل | ||
| نحو بالتعليم والحرية | ||
| تأمين متساو | ||
| للتعليم فيها | ||
| آذت اعتداء للتعليم | ||
| ليس المتأصلة | ||
| والمساهمة الضروري تتناقض | ||
| وتأسيس | ||
| رضى | ||
| شرعي الطبية | ||
| لكيلا الجمعية والحرية | ||
| للرجال التزوج | ||
| بالكرامة | ||
| حرية بين | ||
| هذه العيش تنظر | ||
| قيد | ||
| يقررها والصداقة | ||
| اعتُمد وينبغي اجتماعي | ||
| حرمان | ||
| للإدراك بأجر إنتاجه | ||
| التربية القانون | ||
| لإنصافه وتأسيس وسمعته | ||
| أساسه للرجال | ||
| كافة | ||
| المجهود دولي أينما | ||
| وإلى | ||
| بنشاط تجري | ||
| والأمم مثل لحقوق | ||
| الإنسان بشروط بحماية | ||
| شرفه | ||
| كما الوظائف | ||
| حياته ديسمبر | ||
| ولما | ||
| هذه | ||
| غاية جديد إنسان | ||
| حرية | ||
| متهم الوطنية قدمًا | ||
| التملك وضع | ||
| شرعية ويعبر تأدية | ||
| بنظام عمل والأخلاق | ||
| التملك لشخصيته يلجأ | ||
| بحال يضطر ولا | ||
| الانضمام بالكرامة | ||
| عضوا |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| Herrschaft Freiheiten Not | ||
| Gewalt | ||
| stets anderer begründet | ||
| erhobenen innerstaatliche | ||
| Heiratsfähige freie | ||
| offenstehen Begrenzung grausamer | ||
| Maßnahmen höchste | ||
| unentbehrlich privat | ||
| erniedrigender | ||
| Verachtung freie | ||
| innezuhaben innerstaatlichen | ||
| kommen | ||
| werden gleichgültig | ||
| Würde überall höchste | ||
| Schutzmaßnahmen den Pflichten | ||
| Wille Bestimmung | ||
| Leibeigenschaft einschließlich für | ||
| gleiche bekräftigt Gewissens | ||
| Wohles | ||
| Generalversammlung | ||
| Volkes | ||
| Völkern gegenwärtig Zusammenarbeit | ||
| Heiratsfähige sowie Jeder | ||
| Stellung | ||
| Lebensstandard | ||
| seinem | ||
| Rede strafbaren Sicherheit | ||
| mit | ||
| Kulthandlungen Grund | ||
| ärztlicher | ||
| Auflösung Anforderungen anzugehören | ||
| Furcht | ||
| keine Geburt | ||
| Wohles Furcht genügen | ||
| befriedigende Medien | ||
| anzugehören Urlaub Vereinigungen | ||
| hinzuwirken verboten Resolution | ||
| kommen | ||
| sozialer vor irgendein | ||
| Bestimmung Bestimmung | ||
| Fall natürliche kein | ||
| Geschlecht Aufhetzung eigenen | ||
| seinen | ||
| über | ||
| Unterlassung Berücksichtigung | ||
| war | ||
| Rufes stets | ||
| Volkes anderer Beschränkungen | ||
| Handlungen dessen | ||
| Die |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| προάγει αλληλογραφία | ||
| λογική έχει | ||
| ιδρύει ζωή τεχνική | ||
| δυνατότητες | ||
| περιορισμό συνόλου | ||
| ασκεί παραγνώριση συναφθεί | ||
| αναγνωρίζουν ποινικής εκδηλώνει | ||
| κοινότητας διακυβέρνηση στα | ||
| απέναντι υψηλή | ||
| περιστάσεων αξιόποινη | ||
| σεβασμό | ||
| συντήρησής κατά εξασφαλίσουν | ||
| παραβιάζουν συμπληρώνεται νόμο | ||
| άμεσα | ||
| σημαίνει καθεστώς | ||
| ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων | ||
| ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση | ||
| μέσο | ||
| ίση Εχει | ||
| ειρήνης Κάθε | ||
| μέλη μορφή | ||
| όσο | ||
| κρατείται Στο Διακηρύσσει | ||
| οικονομικών έκφρασης εξασφαλίζεται | ||
| κάθε | ||
| περίπτωση απολαμβάνουν | ||
| ποινικό γεροντική | ||
| είναι μαζί δικαστήρια | ||
| μαζί προοπτική | ||
| δική | ||
| βαρβαρότητας | ||
| οικονομικών εξασφαλίσει | ||
| υποχρεώσεις οδήγησαν | ||
| Οικουμενική Διακήρυξης γονείς | ||
| στις μυστική αντιπροσώπους | ||
| Διακήρυξης άδειες βιοτικό | ||
| αναπηρία ομάδα | ||
| πραγματικό | ||
| καλύτερες | ||
| ανάπαυση | ||
| δίκαιες ένα δικαίου | ||
| μετέχει στους | ||
| θρησκευτικών ποινικής | ||
| Κανείς ίσα | ||
| πεποιθήσεις | ||
| πολιτικές ανάλογα δουλεία | ||
| πολιτικές ιατρική ωσότου | ||
| ηθικής χωρίς | ||
| ανδρών ικανό | ||
| καθώς |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| duración común | ||
| delito reconocimiento alimentación | ||
| inalienables | ||
| entre seguridad escogidos | ||
| comportarse dignidad | ||
| autónomo gobierno tiempo | ||
| omisiones | ||
| comisión | ||
| Derechos territorios | ||
| debe | ||
| han | ||
| regresar inalienables | ||
| regresar | ||
| desempleo científico | ||
| arbitrariamente proclamada | ||
| están contraerse esposos | ||
| cualesquiera | ||
| salir carácter desarrollo | ||
| solamente justas | ||
| personalidad una | ||
| cuanto | ||
| garantice resolución | ||
| concepción | ||
| tomar impondrá | ||
| cualquier reconocimiento | ||
| obligatoria obligatoria satisfactoria | ||
| acusación sin | ||
| artísticas penal culturales | ||
| pagadas examen | ||
| Además Organización dignidad | ||
| opresión esposos ejercidos | ||
| barbarie están mientras | ||
| por | ||
| idioma | ||
| recursos pagadas | ||
| materia Nada ella | ||
| con injerencias | ||
| inspirándose | ||
| organización | ||
| gozar jurisdicción | ||
| que | ||
| asegurar | ||
| humana libertad | ||
| nadie equivalente | ||
| escoger remuneración | ||
| torturas | ||
| individuos poder | ||
| disfruten seres Preámbulo | ||
| desempleo | ||
| liberados |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| conforme êtres fonctions | ||
| non tout généralisé | ||
| premier lui | ||
| faire hommes d’égalité | ||
| peuple volonté bénéficier | ||
| générale nationales | ||
| cruels plus | ||
| d’encourager opinions | ||
| genre l’esprit | ||
| d’origine effectif | ||
| exigences auront | ||
| résultent situation recevoir | ||
| peuples Chacun | ||
| sont d’égalité | ||
| jouissent | ||
| auront l’esprit | ||
| pays telle | ||
| publiquement | ||
| mariage foi | ||
| travail démocratique religieux | ||
| rémunération | ||
| omissions telles | ||
| L’éducation | ||
| raison complétée donner | ||
| invoqué auront arbitraires | ||
| l’amitié suffisant affaires | ||
| travaille l’accomplissement l’intermédiaire | ||
| race | ||
| opinions celles | ||
| assurer par privée | ||
| valeur | ||
| violant traite premier | ||
| inhérente | ||
| bienfaits l’avènement | ||
| Unies s’il actions | ||
| inquiété l’esclavage | ||
| inquiété | ||
| esclaves lieu | ||
| salaire | ||
| par | ||
| toute | ||
| innocente procédure membres | ||
| arts l’idéal envers | ||
| suffrage territoires inhumains | ||
| d’immixtions l’organisation progrès | ||
| comme égalité Unies | ||
| maternité | ||
| violerait suprême sécurité | ||
| impliquant eux loisirs | ||
| nationalité |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| זקנה משפילים | ||
| ינתן חברתי עניניו | ||
| הפוב | ||
| ולהיות זכויות הישגים | ||
| יאסרו מטעמי וללא | ||
| ספרותית השלם | ||
| למנוחה חינם | ||
| וההתאגדות | ||
| לטפח | ||
| באלה במלואן | ||
| יהנו | ||
| ולרווחתם לגבר האדם | ||
| בכבודו שבארצות כבוד | ||
| ובינלאומיים | ||
| בכך לתנאי אישי | ||
| שאינן | ||
| שרירותי | ||
| במשפט | ||
| ולעקרונותיהן מטעם | ||
| שרירותית האשמה יהיה | ||
| החינוך ולבטחון | ||
| סובלנות אשמתו במגילה | ||
| המאוחדות חיוני | ||
| חשוב במקרה | ||
| כלתי העולם | ||
| שמקורה כציבור | ||
| לשויון | ||
| לתקנה | ||
| תלוי ההתאספות | ||
| הדיבור שהוא | ||
| והבלתי והבסיסית | ||
| ולעקרונותיהן יהא וישאף | ||
| ביתנ הבינלאומי | ||
| והזלזול להקנות | ||
| בגלל כולם שיושלם | ||
| לחיים | ||
| בדבר | ||
| לשירות | ||
| זכויות | ||
| לפני | ||
| אדם ולא מזזמנות | ||
| קנינו שהיה ההתאספות | ||
| בינלאומי חיוניות לבקש | ||
| תהיינה | ||
| ובזכות בכורה מהגנה | ||
| מתוך | ||
| ובמצפון מזומנות לאגד | ||
| והחמריים סוציאלי | ||
| אנושיים ובהצבעה | ||
| פראיים |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| uciskowi posiadania prawo | ||
| społecznego największych skazany | ||
| czy | ||
| potrzeby samodzielnie przystępowania | ||
| Krzewi też dokonania | ||
| pełną prawo | ||
| buntu | ||
| moralności | ||
| zapewnienia znaczenie | ||
| nieludzki wypadek Nikt | ||
| zasadności jakikolwiek Każdy | ||
| samowolnie krajem | ||
| międzynarodowego | ||
| członek wielu | ||
| rozwój wynikających obalenia | ||
| rasy | ||
| grudnia która | ||
| jedynie urlopu ani | ||
| małżeńskie stanowi ustaniu | ||
| człowieka postępowych | ||
| prześladowania | ||
| politycznej które zawarcia | ||
| Deklaracja | ||
| ingerować wyłącznie | ||
| studia Nikt | ||
| innego uprawianie zrozumienie | ||
| wybranych swobodę wyznania | ||
| wolni osobowości | ||
| ograniczenie Nie | ||
| równej społecznego uciekać | ||
| będącą POWSZECHNA | ||
| niezdolności poszukiwania międzynarodowej | ||
| konieczne potrzeby posiada | ||
| opinii wychowywania 1948 | ||
| międzynarodowej zatrzymać | ||
| przedstawicieli | ||
| przeciw | ||
| wynikających organy pracę | ||
| człowiek grupami | ||
| niezbędnych | ||
| wolności podstawowym | ||
| opinii małżonków wolność | ||
| postępować zdecydowanie komórką | ||
| odniesieniu | ||
| pokoju azyl | ||
| zawodowych powrócić człowiek | ||
| konstytucję | ||
| takiej postaciach powszechnego | ||
| wygnać wygnać | ||
| wspólny poszanowania |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| областях | ||
| будут должен | ||
| обеспечиваются нежели | ||
| котором Уставе | ||
| социального моральных | ||
| совершеннолетия предоставление | ||
| том независимо | ||
| существование | ||
| вмешательства какому ограниченной | ||
| распространять | ||
| находить помощь | ||
| искусством | ||
| унижающим положения искать | ||
| изгнанию член совершеннолетия | ||
| обществом имуществом государственной | ||
| идеи братства | ||
| наслаждаться значение социальной | ||
| осуществления юрисдикцией наказанию | ||
| достойное свою III | ||
| жизнь расторжения инвалидности | ||
| терпимости этого | ||
| целях равны | ||
| обеспечиваются законным | ||
| принуждаем правосубъектности | ||
| пыткам доступа неприкосновенность | ||
| Брак против | ||
| прибегать независимой | ||
| человека человеческой | ||
| быть независимо религии | ||
| публичным | ||
| членам против | ||
| разумом результатом семью | ||
| Принята участие | ||
| беспристрастным тем | ||
| частным основной | ||
| правового | ||
| страной обслуживание | ||
| было свободу полное | ||
| рабочего свободны | ||
| состоянии помощь религиозными | ||
| полное | ||
| владеть власти морали | ||
| меньшей | ||
| братства социальному убежища | ||
| государств | ||
| равны который дети | ||
| терпимости | ||
| получать бесплатным полного | ||
| богослужении | ||
| отдельным |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| คิด ใตัอำ เคลื่อนไหว | ||
| บังคับ บาก | ||
| สิ่ง สิ้น | ||
| วัตถุ | ||
| ชาย อาศัย เท่านั้น | ||
| สิน | ||
| เกา | ||
| ดูแล พิธีกรรม | ||
| ภายใน | ||
| เพศ | ||
| หนัก ประสงค์ | ||
| เหตุ | ||
| งาน รักษา | ||
| เพศ ภาษา | ||
| นี้ | ||
| คู่ สัญชาติ ต้องการ | ||
| วิธี ระหว่าง ตกลง | ||
| ทำนอง | ||
| สืบ กับ ศิลปกรรม | ||
| เหนือ วรรณกรรม | ||
| คิด การก หน้าที่ | ||
| ชาติ ศิลปกรรม แต่ | ||
| สามัญ สอด | ||
| เหยียด วิธี จุด | ||
| หน้า ถ้า เบื้อง | ||
| ประชุม | ||
| ศิลปกรรม | ||
| เสรีภาพ โหด ก่อ | ||
| เกียรติศักดิ์ ป่วย เอกราช | ||
| ประหัต มโนธรรม การ | ||
| แทน | ||
| ขัดขืน เวลา เสียง | ||
| กฎบัตร พยายาม | ||
| สิน หน้า | ||
| จำเป็น | ||
| ประชาธิปไตย หน่วย | ||
| กรณี จริงจัง | ||
| ทำนอง | ||
| ทาษ | ||
| เพิ่ม | ||
| บรรดา ขวาง | ||
| กักขัง | ||
| มนุษย์ | ||
| ชาย ประกัน มนุษยธรรม | ||
| จะบัน มูลฐาน เถื่อน | ||
| พฤติ | ||
| มิได้ | ||
| หญิง คู่ | ||
| สมา ปฏิบัติ อนึ่ง | ||
| สิ่ง ทาษ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| mecburidir ilim | ||
| isnadın sınırları suç | ||
| tutuklanamaz diğer | ||
| memleket korunmasi kullanılamaz | ||
| İnsanlık ilerlemeyi | ||
| bir mülk menfaatlerinin | ||
| usul zümreye herhangi | ||
| mahkeme vicdana ilerleyişe | ||
| zulüm zalimane | ||
| ilim öncelikle çocuk | ||
| mevzubahis ancak | ||
| muamelesi dinlenmeye | ||
| eşitlikle prensiplerine ülkenin | ||
| öğretim bulunmalarına yardım | ||
| memleketler amacıyla | ||
| birbirlerine | ||
| olmalıdır | ||
| bırakılamaz serbestisine | ||
| hürriyetin iyi | ||
| hükmü işbu zalimane | ||
| evlenme memleketi tedbirlerle | ||
| evlenmek ahalisi işini | ||
| hürriyetler | ||
| belirlenmiş kere | ||
| elde cürüme | ||
| tanınan dünyaca yüksek | ||
| müddetinin ailesine | ||
| vicdan kırıcı itibariyle | ||
| geniş inanma | ||
| kendi görevleri Teşkilatı | ||
| yaymak | ||
| öğretim vesayet | ||
| renk kişiliğinin | ||
| tamamlanan | ||
| haklara bulunma | ||
| hükmü uygulanabilecek | ||
| etmiş geliştirilmesini hoşgörü | ||
| sahiptir temel | ||
| giyim | ||
| Bundan temeli | ||
| icaplarını | ||
| mülk karışma tekmil | ||
| vicdana hürriyetine işini | ||
| Herkesin vahşiliklere | ||
| dolaşma dünyanın | ||
| davasının Uluslararasında idamesi | ||
| eşittir | ||
| haklardan hakkı | ||
| kovuşturmalar hürriyetlerden gözönünde | ||
| Evrensel fiilli beyannamesi |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # The contents of this file have been translated by "Google Translate". | ||
| Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này | ||
| đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh | ||
| lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con | ||
| người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và | ||
| nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể | ||
| xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong | ||
| một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới | ||
| này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc | ||
| chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển | ||
| vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là | ||
| nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý | ||
| tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra. | ||
| Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã | ||
| qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có | ||
| những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào | ||
| đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian, | ||
| những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối | ||
| với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm, | ||
| nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra | ||
| những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng | ||
| lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt | ||
| trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà | ||
| nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được. | ||
| Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu | ||
| đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng | ||
| nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó. | ||
| Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng | ||
| tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có | ||
| không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại | ||
| sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù | ||
| phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào | ||
| bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa, | ||
| hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta | ||
| cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng | ||
| ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất | ||
| yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết | ||
| thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng | ||
| ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn | ||
| vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó, | ||
| nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta. | ||
| Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã | ||
| thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp | ||
| của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó. | ||
| Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành | ||
| một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần | ||
| thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái | ||
| tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng | ||
| ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm | ||
| về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng | ||
| ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều | ||
| mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây | ||
| trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân. |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input file, pre-normalized into every normalization form.
///
/// Each form is stored twice: as a UTF-8 `String` and as the UTF-16 code
/// units of that same string, so both the `&str` and the `&[u16]` entry
/// points can be benchmarked on the same text.
struct BenchDataContent {
    /// Label used to build benchmark ids.
    pub file_name: String,

    // NFC
    pub nfc: String,
    pub nfc_u16: Vec<u16>,

    // NFD
    pub nfd: String,
    pub nfd_u16: Vec<u16>,

    // NFKC
    pub nfkc: String,
    pub nfkc_u16: Vec<u16>,

    // NFKD
    pub nfkd: String,
    pub nfkd_u16: Vec<u16>,
}
/// Returns `content` with every line that starts with `#` removed.
///
/// The remaining lines are joined with a single `\n`; the result carries no
/// trailing newline. Only a `#` in the very first column marks a header
/// line, so indented `#` lines are kept.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines instead of copying each one into its own
    // `String`: `join` performs the single copy that is actually needed.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark input file, pre-normalized into every normalization form.
///
/// Each form is stored twice: as a UTF-8 `String` and as the UTF-16 code
/// units of that same string, so both the `&str` and the `&[u16]` entry
/// points can be benchmarked on the same text.
struct BenchDataContent {
    /// Label used to build benchmark ids.
    pub file_name: String,

    // NFC
    pub nfc: String,
    pub nfc_u16: Vec<u16>,

    // NFD
    pub nfd: String,
    pub nfd_u16: Vec<u16>,

    // NFKC
    pub nfkc: String,
    pub nfkc_u16: Vec<u16>,

    // NFKD
    pub nfkd: String,
    pub nfkd_u16: Vec<u16>,
}
/// Returns `content` with every line that starts with `#` removed.
///
/// The remaining lines are joined with a single `\n`; the result carries no
/// trailing newline. Only a `#` in the very first column marks a header
/// line, so indented `#` lines are kept.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines instead of copying each one into its own
    // `String`: `join` performs the single copy that is actually needed.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfkd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
| use criterion::{black_box, Criterion, Throughput}; | ||
| use smallvec::SmallVec; | ||
| //use detone::IterDecomposeVietnamese; | ||
/// Number of UTF-16 code units in each generated benchmark input.
///
/// This many `u16` values fit on one 4KB memory page, which maximizes the run
/// to take the average over without introducing cross-page effects.
const INPUT_SIZE: usize = 4096 / std::mem::size_of::<u16>();
| fn generate_bmp_input_nfc(s: &str) -> Vec<u16> { | ||
| ComposingNormalizerBorrowed::new_nfc() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
| fn generate_bmp_input_nfd(s: &str) -> Vec<u16> { | ||
| DecomposingNormalizerBorrowed::new_nfd() | ||
| .normalize_iter(s.chars().cycle()) | ||
| .take(INPUT_SIZE) | ||
| .map(|c| { | ||
| if c <= '\u{FFFF}' { | ||
| c as u16 | ||
| } else { | ||
| unreachable!("Data should stay on the BMP!") | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
/// Removes headers and replaces line feed with space.
/// Do not use for languages that don't use spaces!
///
/// Lines starting with `#` are treated as headers and dropped; the remaining
/// lines are joined with a single ASCII space, so the result has no trailing
/// separator.
fn prepare_file_contents(content: &str) -> String {
    // Borrowed line slices can be joined directly; copying each line into its
    // own `String` first would only add one allocation per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Returns `s` unchanged after passing it through `black_box`, so the
/// optimizer cannot make assumptions about the benchmark input.
fn slice_as_slice(s: &[u16]) -> &[u16] {
    let opaque: &[u16] = black_box(s);
    opaque
}
| fn bench_lang(name: &str, data: &str, c: &mut Criterion) { | ||
| let input_nfc = generate_bmp_input_nfc(data); | ||
| let input_nfd = generate_bmp_input_nfd(data); | ||
| let nfc = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd = DecomposingNormalizerBorrowed::new_nfd(); | ||
| // Appending to this output is infallible (does not return `Err`) and | ||
| // this is sized to be large enough not to actually take the the heap | ||
| // allocation path. | ||
| let mut output: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new(); | ||
| { | ||
| let mut group_name = "utf16_throughput_nfc_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfc.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfc.split_normalized_utf16(slice_as_slice(&input_nfc)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfc), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| { | ||
| let mut group_name = "utf16_throughput_nfd_".to_string(); | ||
| group_name.push_str(name); | ||
| let mut group = c.benchmark_group(&group_name); | ||
| group.throughput(Throughput::Elements(input_nfd.len() as u64)); | ||
| group.bench_function("read", |b| { | ||
| b.iter(|| { | ||
| let _ = black_box( | ||
| nfd.split_normalized_utf16(slice_as_slice(&input_nfd)) | ||
| .0 | ||
| .len(), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfd", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfd.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.bench_function("writing_to_nfc", |b| { | ||
| b.iter(|| { | ||
| output.clear(); // Should be trivial and OK to do from within here. | ||
| let _ = black_box( | ||
| nfc.normalize_utf16_to(slice_as_slice(&input_nfd), black_box(&mut output)), | ||
| ); | ||
| }) | ||
| }); | ||
| group.finish(); | ||
| } | ||
| } | ||
| static EL: &str = include_str!("./data/TestRandomWordsUDHR_el.txt"); | ||
| static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. "; | ||
| static FR: &str = include_str!("./data/TestRandomWordsUDHR_fr.txt"); | ||
| static VI: &str = include_str!("./data/wotw.txt"); | ||
| static ZH: &str = "單父人呂公善沛令,辟仇,從之客,因家焉。沛中豪傑吏聞令有重客,皆往賀。"; | ||
| // zh text from https://www.gutenberg.org/cache/epub/23841/pg23841.txt | ||
| // metadata at https://www.gutenberg.org/ebooks/23841 | ||
| // If you replace this text, be sure not to include ASCII spaces and be sure | ||
| // to include punctuation using code points actually used for punctuation in | ||
| // Chinese. | ||
| // TODO: Add: | ||
| // * Japanese with realistic proportion of kana voicing marks | ||
| // * Korean, since Hangul is special-cased in the normalizer | ||
| // * Kannada or some other non-Korean BMP language that uses | ||
| // backward-combining starters (with realistic proportion of such | ||
| // characters). | ||
| // * Chakma or some other living non-BMP language. | ||
| // * Vietnamese in the orthographic form (i.e. as produced by | ||
| // the official non-IME keyboard layout that's less common | ||
| // than the NFC-producing IME.) | ||
| pub fn criterion_benchmark(c: &mut Criterion) { | ||
| bench_lang("el", prepare_file_contents(EL).as_str(), c); | ||
| bench_lang("en", EN, c); | ||
| bench_lang("fr", prepare_file_contents(FR).as_str(), c); | ||
| bench_lang("vi", prepare_file_contents(VI).as_str(), c); | ||
| bench_lang("zh", ZH, c); | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "aho-corasick" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" | ||
| dependencies = [ | ||
| "memchr", | ||
| ] | ||
| [[package]] | ||
| name = "anes" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" | ||
| [[package]] | ||
| name = "anstyle" | ||
| version = "1.0.13" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" | ||
| [[package]] | ||
| name = "arraystring" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185" | ||
| dependencies = [ | ||
| "typenum", | ||
| ] | ||
| [[package]] | ||
| name = "arrayvec" | ||
| version = "0.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" | ||
| [[package]] | ||
| name = "atoi" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" | ||
| dependencies = [ | ||
| "num-traits", | ||
| ] | ||
| [[package]] | ||
| name = "autocfg" | ||
| version = "1.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" | ||
| [[package]] | ||
| name = "bumpalo" | ||
| version = "3.19.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" | ||
| [[package]] | ||
| name = "cast" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" | ||
| [[package]] | ||
| name = "cfg-if" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" | ||
| [[package]] | ||
| name = "ciborium" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "ciborium-ll", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "ciborium-io" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" | ||
| [[package]] | ||
| name = "ciborium-ll" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "half", | ||
| ] | ||
| [[package]] | ||
| name = "clap" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" | ||
| dependencies = [ | ||
| "clap_builder", | ||
| ] | ||
| [[package]] | ||
| name = "clap_builder" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" | ||
| dependencies = [ | ||
| "anstyle", | ||
| "clap_lex", | ||
| ] | ||
| [[package]] | ||
| name = "clap_lex" | ||
| version = "0.6.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" | ||
| [[package]] | ||
| name = "cobs" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" | ||
| dependencies = [ | ||
| "thiserror", | ||
| ] | ||
| [[package]] | ||
| name = "criterion" | ||
| version = "0.5.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" | ||
| dependencies = [ | ||
| "anes", | ||
| "cast", | ||
| "ciborium", | ||
| "clap", | ||
| "criterion-plot", | ||
| "is-terminal", | ||
| "itertools", | ||
| "num-traits", | ||
| "once_cell", | ||
| "oorandom", | ||
| "plotters", | ||
| "rayon", | ||
| "regex", | ||
| "serde", | ||
| "serde_derive", | ||
| "serde_json", | ||
| "tinytemplate", | ||
| "walkdir", | ||
| ] | ||
| [[package]] | ||
| name = "criterion-plot" | ||
| version = "0.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" | ||
| dependencies = [ | ||
| "cast", | ||
| "itertools", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-deque" | ||
| version = "0.8.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | ||
| dependencies = [ | ||
| "crossbeam-epoch", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-epoch" | ||
| version = "0.9.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | ||
| dependencies = [ | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-utils" | ||
| version = "0.8.21" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | ||
| [[package]] | ||
| name = "crunchy" | ||
| version = "0.2.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" | ||
| dependencies = [ | ||
| "databake-derive", | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "databake-derive" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "detone" | ||
| version = "1.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09" | ||
| [[package]] | ||
| name = "displaydoc" | ||
| version = "0.2.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "either" | ||
| version = "1.15.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | ||
| [[package]] | ||
| name = "erased-serde" | ||
| version = "0.4.8" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_core", | ||
| "typeid", | ||
| ] | ||
| [[package]] | ||
| name = "half" | ||
| version = "2.4.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "crunchy", | ||
| ] | ||
| [[package]] | ||
| name = "hermit-abi" | ||
| version = "0.5.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" | ||
| [[package]] | ||
| name = "icu_collections" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f578a71f2bfaf7ceb30b519a645ae48024b45f9eecbe060a31a004d7b4ba9462" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "potential_utf", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_locale_core" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4c219b62bf5a06801012446193fdfcbd7970e876823aba4c62def2ce957dcb44" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde", | ||
| "tinystr", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer" | ||
| version = "2.1.0" | ||
| dependencies = [ | ||
| "arraystring", | ||
| "arrayvec", | ||
| "atoi", | ||
| "criterion", | ||
| "databake", | ||
| "detone", | ||
| "icu_collections", | ||
| "icu_normalizer_data", | ||
| "icu_properties", | ||
| "icu_provider", | ||
| "serde", | ||
| "smallvec", | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| "write16", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer_data" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d6ce2d23e1b3c45624ba6a23e2c767e01c9680e0c0800b39c7abfff9565175d8" | ||
| [[package]] | ||
| name = "icu_properties" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6d70f9b6574c79f7a83ea5ce72cc88d271a3e77355c5f7748a107e751d8617fb" | ||
| dependencies = [ | ||
| "databake", | ||
| "icu_collections", | ||
| "icu_locale_core", | ||
| "icu_properties_data", | ||
| "icu_provider", | ||
| "serde", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_properties_data" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "17fa55bf868e28e638ed132bcee1e5c21ba2c1e52c15e7c78b781858e7b54342" | ||
| [[package]] | ||
| name = "icu_provider" | ||
| version = "2.1.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f64958e359123591ae1f17a27b5fc9ebdb50c98b04e0401146154de1d8fe3e44" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "erased-serde", | ||
| "icu_locale_core", | ||
| "postcard", | ||
| "serde", | ||
| "stable_deref_trait", | ||
| "writeable", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "is-terminal" | ||
| version = "0.4.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" | ||
| dependencies = [ | ||
| "hermit-abi", | ||
| "libc", | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "itertools" | ||
| version = "0.10.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" | ||
| dependencies = [ | ||
| "either", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.15" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||
| [[package]] | ||
| name = "js-sys" | ||
| version = "0.3.81" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" | ||
| dependencies = [ | ||
| "once_cell", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "libc" | ||
| version = "0.2.177" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" | ||
| [[package]] | ||
| name = "litemap" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" | ||
| dependencies = [ | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "log" | ||
| version = "0.4.28" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" | ||
| [[package]] | ||
| name = "num-traits" | ||
| version = "0.2.19" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" | ||
| dependencies = [ | ||
| "autocfg", | ||
| ] | ||
| [[package]] | ||
| name = "once_cell" | ||
| version = "1.21.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" | ||
| [[package]] | ||
| name = "oorandom" | ||
| version = "11.1.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" | ||
| [[package]] | ||
| name = "plotters" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" | ||
| dependencies = [ | ||
| "num-traits", | ||
| "plotters-backend", | ||
| "plotters-svg", | ||
| "wasm-bindgen", | ||
| "web-sys", | ||
| ] | ||
| [[package]] | ||
| name = "plotters-backend" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" | ||
| [[package]] | ||
| name = "plotters-svg" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" | ||
| dependencies = [ | ||
| "plotters-backend", | ||
| ] | ||
| [[package]] | ||
| name = "postcard" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" | ||
| dependencies = [ | ||
| "cobs", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.103" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.41" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "rayon" | ||
| version = "1.10.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" | ||
| dependencies = [ | ||
| "either", | ||
| "rayon-core", | ||
| ] | ||
| [[package]] | ||
| name = "rayon-core" | ||
| version = "1.12.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" | ||
| dependencies = [ | ||
| "crossbeam-deque", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "regex" | ||
| version = "1.12.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-automata", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-automata" | ||
| version = "0.4.13" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-syntax" | ||
| version = "0.8.8" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" | ||
| [[package]] | ||
| name = "rustversion" | ||
| version = "1.0.22" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" | ||
| [[package]] | ||
| name = "ryu" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" | ||
| [[package]] | ||
| name = "same-file" | ||
| version = "1.0.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" | ||
| dependencies = [ | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" | ||
| dependencies = [ | ||
| "serde_core", | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_core" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.228" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.145" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "ryu", | ||
| "serde", | ||
| "serde_core", | ||
| ] | ||
| [[package]] | ||
| name = "smallvec" | ||
| version = "1.15.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" | ||
| [[package]] | ||
| name = "stable_deref_trait" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.108" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "synstructure" | ||
| version = "0.13.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror" | ||
| version = "2.0.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" | ||
| dependencies = [ | ||
| "thiserror-impl", | ||
| ] | ||
| [[package]] | ||
| name = "thiserror-impl" | ||
| version = "2.0.17" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "tinystr" | ||
| version = "0.8.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" | ||
| dependencies = [ | ||
| "displaydoc", | ||
| "serde_core", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "tinytemplate" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_json", | ||
| ] | ||
| [[package]] | ||
| name = "typeid" | ||
| version = "1.0.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" | ||
| [[package]] | ||
| name = "typenum" | ||
| version = "1.19.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" | ||
| [[package]] | ||
| name = "utf16_iter" | ||
| version = "1.0.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" | ||
| [[package]] | ||
| name = "utf8_iter" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | ||
| [[package]] | ||
| name = "walkdir" | ||
| version = "2.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" | ||
| dependencies = [ | ||
| "same-file", | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "once_cell", | ||
| "rustversion", | ||
| "wasm-bindgen-macro", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-backend" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" | ||
| dependencies = [ | ||
| "bumpalo", | ||
| "log", | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" | ||
| dependencies = [ | ||
| "quote", | ||
| "wasm-bindgen-macro-support", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro-support" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-backend", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-shared" | ||
| version = "0.2.104" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "web-sys" | ||
| version = "0.3.81" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" | ||
| dependencies = [ | ||
| "js-sys", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "winapi-util" | ||
| version = "0.1.11" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" | ||
| dependencies = [ | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "windows-link" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" | ||
| [[package]] | ||
| name = "windows-sys" | ||
| version = "0.61.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" | ||
| dependencies = [ | ||
| "windows-link", | ||
| ] | ||
| [[package]] | ||
| name = "write16" | ||
| version = "1.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" | ||
| dependencies = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" | ||
| [[package]] | ||
| name = "yoke" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" | ||
| dependencies = [ | ||
| "stable_deref_trait", | ||
| "yoke-derive", | ||
| "zerofrom", | ||
| ] | ||
| [[package]] | ||
| name = "yoke-derive" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" | ||
| dependencies = [ | ||
| "zerofrom-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom-derive" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerotrie" | ||
| version = "0.2.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap", | ||
| "serde_core", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" | ||
| dependencies = [ | ||
| "databake", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec-derive" | ||
| version = "0.11.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.83" | ||
| name = "icu_normalizer" | ||
| version = "2.1.0" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "API for normalizing text into Unicode Normalization Forms" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| categories = ["internationalization"] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [package.metadata.docs.rs] | ||
| all-features = true | ||
| [features] | ||
| compiled_data = [ | ||
| "dep:icu_normalizer_data", | ||
| "icu_properties?/compiled_data", | ||
| "icu_provider/baked", | ||
| ] | ||
| datagen = [ | ||
| "serde", | ||
| "dep:databake", | ||
| "icu_properties", | ||
| "icu_collections/databake", | ||
| "zerovec/databake", | ||
| "icu_properties?/datagen", | ||
| "icu_provider/export", | ||
| ] | ||
| default = [ | ||
| "compiled_data", | ||
| "utf8_iter", | ||
| "utf16_iter", | ||
| ] | ||
| experimental = [] | ||
| icu_properties = ["dep:icu_properties"] | ||
| serde = [ | ||
| "dep:serde", | ||
| "icu_collections/serde", | ||
| "zerovec/serde", | ||
| "icu_properties?/serde", | ||
| "icu_provider/serde", | ||
| ] | ||
| utf16_iter = [ | ||
| "dep:utf16_iter", | ||
| "dep:write16", | ||
| ] | ||
| utf8_iter = ["dep:utf8_iter"] | ||
| write16 = [] | ||
| [lib] | ||
| name = "icu_normalizer" | ||
| path = "src/lib.rs" | ||
| [[test]] | ||
| name = "tests" | ||
| path = "tests/tests.rs" | ||
| [[bench]] | ||
| name = "bench" | ||
| path = "benches/bench.rs" | ||
| harness = false | ||
| required-features = [ | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| ] | ||
| [[bench]] | ||
| name = "canonical_composition" | ||
| path = "benches/canonical_composition.rs" | ||
| [[bench]] | ||
| name = "canonical_decomposition" | ||
| path = "benches/canonical_decomposition.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfc" | ||
| path = "benches/composing_normalizer_nfc.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfkc" | ||
| path = "benches/composing_normalizer_nfkc.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfd" | ||
| path = "benches/decomposing_normalizer_nfd.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfkd" | ||
| path = "benches/decomposing_normalizer_nfkd.rs" | ||
| [[bench]] | ||
| name = "utf16_throughput" | ||
| path = "benches/utf16_throughput.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| features = ["derive"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_collections] | ||
| version = "~2.1.0" | ||
| default-features = false | ||
| [dependencies.icu_normalizer_data] | ||
| version = "~2.1.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_properties] | ||
| version = "~2.1.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_provider] | ||
| version = "2.0.0" | ||
| default-features = false | ||
| [dependencies.serde] | ||
| version = "1.0.220" | ||
| features = [ | ||
| "derive", | ||
| "alloc", | ||
| ] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.smallvec] | ||
| version = "1.10.0" | ||
| default-features = false | ||
| [dependencies.utf16_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.utf8_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.write16] | ||
| version = "1.0.0" | ||
| features = ["alloc"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.3" | ||
| default-features = false | ||
| [dev-dependencies.arraystring] | ||
| version = "0.3.0" | ||
| [dev-dependencies.arrayvec] | ||
| version = "0.7.2" | ||
| default-features = false | ||
| [dev-dependencies.atoi] | ||
| version = "2.0.0" | ||
| [dev-dependencies.detone] | ||
| version = "1.0.0" | ||
| [dev-dependencies.write16] | ||
| version = "1.0.0" | ||
| features = [ | ||
| "arrayvec", | ||
| "smallvec", | ||
| ] | ||
| default-features = false | ||
| [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion] | ||
| version = "0.5.0" | ||
| [lints.rust.unexpected_cfgs] | ||
| level = "warn" | ||
| priority = 0 | ||
| check-cfg = ["cfg(icu4x_unstable_fast_trie_only)"] |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer) | ||
| <!-- cargo-rdme start --> | ||
| Normalizing text into Unicode Normalization Forms. | ||
| This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) | ||
| and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. | ||
| ## Functionality | ||
| The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode | ||
| Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. | ||
| Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, | ||
| and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. | ||
| The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA | ||
| Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by | ||
| applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the | ||
| [`idna`](https://docs.rs/idna/latest/idna/) crate. | ||
| The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and | ||
| the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class | ||
| property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the | ||
| [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate. | ||
| Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in | ||
| addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive | ||
| non-“maybe” answer. | ||
| ## Examples | ||
| ```rust | ||
| let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); | ||
| assert_eq!(nfc.normalize("a\u{0308}"), "ä"); | ||
| assert!(nfc.is_normalized("ä")); | ||
| let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); | ||
| assert_eq!(nfd.normalize("ä"), "a\u{0308}"); | ||
| assert!(!nfd.is_normalized("ä")); | ||
| ``` | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
Sorry, the diff of this file is too big to display
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Access to the Unicode properties or property-based operations that | ||
| //! are required for NFC and NFD. | ||
| //! | ||
| //! Applications should generally use the full normalizers that are | ||
| //! provided at the top level of this crate. However, the APIs in this | ||
| //! module are provided for callers such as HarfBuzz that specifically | ||
| //! want access to the raw canonical composition operation e.g. for use in a | ||
| //! glyph-availability-guided custom normalizer. | ||
| use crate::char_from_u16; | ||
| use crate::char_from_u32; | ||
| use crate::in_inclusive_range; | ||
| use crate::provider::CanonicalCompositions; | ||
| use crate::provider::DecompositionData; | ||
| use crate::provider::DecompositionTables; | ||
| use crate::provider::NonRecursiveDecompositionSupplement; | ||
| use crate::provider::NormalizerNfcV1; | ||
| use crate::provider::NormalizerNfdDataV1; | ||
| use crate::provider::NormalizerNfdSupplementV1; | ||
| use crate::provider::NormalizerNfdTablesV1; | ||
| use crate::trie_value_has_ccc; | ||
| use crate::CanonicalCombiningClass; | ||
| use crate::BACKWARD_COMBINING_MARKER; | ||
| use crate::FDFA_MARKER; | ||
| use crate::HANGUL_L_BASE; | ||
| use crate::HANGUL_N_COUNT; | ||
| use crate::HANGUL_S_BASE; | ||
| use crate::HANGUL_S_COUNT; | ||
| use crate::HANGUL_T_BASE; | ||
| use crate::HANGUL_T_COUNT; | ||
| use crate::HANGUL_V_BASE; | ||
| use crate::HIGH_ZEROS_MASK; | ||
| use crate::LOW_ZEROS_MASK; | ||
| use crate::NON_ROUND_TRIP_MARKER; | ||
| use icu_provider::prelude::*; | ||
| /// Borrowed version of the raw canonical composition operation. | ||
| /// | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to the raw canonical composition operation e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug, Copy, Clone)] | ||
| pub struct CanonicalCompositionBorrowed<'a> { | ||
| canonical_compositions: &'a CanonicalCompositions<'a>, | ||
| } | ||
// `Default` simply forwards to the compiled-data constructor, so it is only
// available when the `compiled_data` Cargo feature is enabled.
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCompositionBorrowed<'static> {
    fn default() -> Self {
        CanonicalCompositionBorrowed::new()
    }
}
| impl CanonicalCompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalComposition { | ||
| CanonicalComposition { | ||
| canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalComposition` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Self { | ||
| canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCompositionBorrowed<'_> { | ||
| /// Performs canonical composition (including Hangul) on a pair of | ||
| /// characters or returns `None` if these characters don't compose. | ||
| /// Composition exclusions are taken into account. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters | ||
| /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä')); | ||
| /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ')); | ||
| /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion | ||
| /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter | ||
| /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV | ||
| /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT | ||
| /// ``` | ||
| #[inline(always)] | ||
| pub fn compose(self, starter: char, second: char) -> Option<char> { | ||
| crate::compose( | ||
| self.canonical_compositions.canonical_compositions.iter(), | ||
| starter, | ||
| second, | ||
| ) | ||
| } | ||
| } | ||
| /// The raw canonical composition operation. | ||
| /// | ||
| /// Callers should generally use `ComposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to the raw canonical composition operation e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalComposition { | ||
| canonical_compositions: DataPayload<NormalizerNfcV1>, | ||
| } | ||
// The default value is the compiled-data instance converted to its owning form,
// so it is only available when the `compiled_data` Cargo feature is enabled.
#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
    fn default() -> Self {
        CanonicalCompositionBorrowed::new().static_to_owned()
    }
}
| impl CanonicalComposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> { | ||
| CanonicalCompositionBorrowed { | ||
| canonical_compositions: self.canonical_compositions.get(), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCompositionBorrowed<'static> { | ||
| CanonicalCompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfcV1> + ?Sized, | ||
| { | ||
| let canonical_compositions: DataPayload<NormalizerNfcV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalComposition { | ||
| canonical_compositions, | ||
| }) | ||
| } | ||
| } | ||
/// The outcome of non-recursive canonical decomposition of a character.
///
/// The type only carries `char`s, so it is `Copy`: a result can be matched on,
/// compared and passed along by value without giving up the original.
// The enum is deliberately exhaustive, hence the lint opt-out below.
// NOTE(review): presumably because a single non-recursive canonical decomposition
// step never yields more than two characters — confirm against the crate's
// stability policy.
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Decomposed {
    /// The character is its own canonical decomposition.
    Default,
    /// The character decomposes to a single different character.
    Singleton(char),
    /// The character decomposes to two characters.
    Expansion(char, char),
}
| /// Borrowed version of the raw (non-recursive) canonical decomposition operation. | ||
| /// | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to non-recursive canonical decomposition e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecompositionBorrowed<'a> { | ||
| decompositions: &'a DecompositionData<'a>, | ||
| tables: &'a DecompositionTables<'a>, | ||
| non_recursive: &'a NonRecursiveDecompositionSupplement<'a>, | ||
| } | ||
// `Default` simply forwards to the compiled-data constructor, so it is only
// available when the `compiled_data` Cargo feature is enabled.
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecompositionBorrowed<'static> {
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new()
    }
}
| impl CanonicalDecompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalDecomposition { | ||
| CanonicalDecomposition { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| tables: DataPayload::from_static_ref(self.tables), | ||
| non_recursive: DataPayload::from_static_ref(self.non_recursive), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| const _: () = assert!( | ||
| crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars16 | ||
| .const_len() | ||
| + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars24 | ||
| .const_len() | ||
| <= 0xFFF, | ||
| "future extension" | ||
| ); | ||
| Self { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, | ||
| non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalDecompositionBorrowed<'_> { | ||
| /// Performs non-recursive canonical decomposition (including for Hangul). | ||
| /// | ||
| /// ``` | ||
| /// use icu::normalizer::properties::Decomposed; | ||
| /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(decomp.decompose('e'), Decomposed::Default); | ||
| /// assert_eq!( | ||
| /// decomp.decompose('ệ'), | ||
| /// Decomposed::Expansion('ẹ', '\u{0302}') | ||
| /// ); | ||
| /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ')); | ||
| /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN | ||
| /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia | ||
| /// ``` | ||
| #[inline] | ||
| pub fn decompose(&self, c: char) -> Decomposed { | ||
| let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE); | ||
| if lvt >= HANGUL_S_COUNT { | ||
| return self.decompose_non_hangul(c); | ||
| } | ||
| // Invariant: lvt ≤ HANGUL_S_COUNT = 1172 | ||
| let t = lvt % HANGUL_T_COUNT; | ||
| // Invariant: t ≤ (1172 / HANGUL_T_COUNT = 1172 / 28 = 41) | ||
| if t == 0 { | ||
| let l = lvt / HANGUL_N_COUNT; | ||
| // Invariant: v ≤ (1172 / HANGUL_N_COUNT = 1172 / 588 ≈ 2) | ||
| let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT; | ||
| // Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21) | ||
| return Decomposed::Expansion( | ||
| // Safety: HANGUL_*_BASE are 0x1nnn, addding numbers that are 21 and 41 | ||
| // max will keep it in range, less than 0xD800 | ||
| unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) }, | ||
| ); | ||
| } | ||
| let lv = lvt - t; | ||
| // Invariant: lvt < 1172 | ||
| // Safe because values known to be in range | ||
| Decomposed::Expansion( | ||
| // Safety: HANGUL_*_BASE are 0x1nnn, addding numbers that are 1172 and 41 | ||
| // max will keep it in range, less than 0xD800 | ||
| unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }, | ||
| unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) }, | ||
| ) | ||
| } | ||
| /// Performs non-recursive canonical decomposition except Hangul syllables | ||
| /// are reported as `Decomposed::Default`. | ||
| /// | ||
| /// The main decomposition trie is consulted first. Every path inside the | ||
| /// loop below either returns a result or `break`s; the `break` paths are | ||
| /// resolved from the supplementary `non_recursive` data after the loop. | ||
| #[inline(always)] | ||
| fn decompose_non_hangul(&self, c: char) -> Decomposed { | ||
| let decomposition = self.decompositions.trie.get(c); | ||
| // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, | ||
| // and that flag needs to be ignored here. | ||
| // A trie value that is zero once both marker flags are masked off | ||
| // is reported as `Decomposed::Default`. | ||
| if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { | ||
| return Decomposed::Default; | ||
| } | ||
| // The loop is only broken out of as goto forward | ||
| // (it never iterates twice: each path returns or breaks). | ||
| #[expect(clippy::never_loop)] | ||
| loop { | ||
| // Whether the bits selected by each mask are all clear. | ||
| let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0; | ||
| let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0; | ||
| if !high_zeros && !low_zeros { | ||
| // Decomposition into two BMP characters: starter and non-starter | ||
| if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') { | ||
| // Look in the other trie due to oxia singleton | ||
| // mappings to corresponding character with tonos. | ||
| break; | ||
| } | ||
| // The starter is read from the low 15 bits and the combining | ||
| // character from the following 15 bits. | ||
| let starter = char_from_u32(decomposition & 0x7FFF); | ||
| let combining = char_from_u32((decomposition >> 15) & 0x7FFF); | ||
| return Decomposed::Expansion(starter, combining); | ||
| } | ||
| if high_zeros { | ||
| // Decomposition into one BMP character or non-starter | ||
| if trie_value_has_ccc(decomposition) { | ||
| // Non-starter | ||
| // All special cases matched below lie in U+0340..=U+0F81, so | ||
| // anything outside that range is `Default` without matching. | ||
| if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') { | ||
| return Decomposed::Default; | ||
| } | ||
| return match c { | ||
| '\u{0340}' => { | ||
| // COMBINING GRAVE TONE MARK | ||
| Decomposed::Singleton('\u{0300}') | ||
| } | ||
| '\u{0341}' => { | ||
| // COMBINING ACUTE TONE MARK | ||
| Decomposed::Singleton('\u{0301}') | ||
| } | ||
| '\u{0343}' => { | ||
| // COMBINING GREEK KORONIS | ||
| Decomposed::Singleton('\u{0313}') | ||
| } | ||
| '\u{0344}' => { | ||
| // COMBINING GREEK DIALYTIKA TONOS | ||
| Decomposed::Expansion('\u{0308}', '\u{0301}') | ||
| } | ||
| '\u{0F73}' => { | ||
| // TIBETAN VOWEL SIGN II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F72}') | ||
| } | ||
| '\u{0F75}' => { | ||
| // TIBETAN VOWEL SIGN UU | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F74}') | ||
| } | ||
| '\u{0F81}' => { | ||
| // TIBETAN VOWEL SIGN REVERSED II | ||
| Decomposed::Expansion('\u{0F71}', '\u{0F80}') | ||
| } | ||
| _ => Decomposed::Default, | ||
| }; | ||
| } | ||
| // `as u16` keeps the low 16 bits of the trie value. | ||
| let singleton = decomposition as u16; | ||
| debug_assert_ne!( | ||
| singleton, FDFA_MARKER, | ||
| "How come we got the U+FDFA NFKD marker here?" | ||
| ); | ||
| return Decomposed::Singleton(char_from_u16(singleton)); | ||
| } | ||
| // Hard-coded instead of being read from the tables. | ||
| if c == '\u{212B}' { | ||
| // ANGSTROM SIGN | ||
| return Decomposed::Singleton('\u{00C5}'); | ||
| } | ||
| // Only 12 of 14 bits used as of Unicode 16. | ||
| // Clear the two highest bits, take the bits from bit 16 upwards, | ||
| // and subtract one to get a zero-based table offset. | ||
| let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1; | ||
| // Only 3 of 4 bits used as of Unicode 16. | ||
| let len_bits = decomposition & 0b1111; | ||
| let tables = self.tables; | ||
| // Offsets below `scalars16.len()` index `scalars16`; larger offsets | ||
| // index `scalars24` after subtracting `scalars16.len()`. | ||
| if offset < tables.scalars16.len() { | ||
| if len_bits != 0 { | ||
| // i.e. logical len isn't 2 | ||
| break; | ||
| } | ||
| if let Some(first) = tables.scalars16.get(offset) { | ||
| if let Some(second) = tables.scalars16.get(offset + 1) { | ||
| // Two BMP starters | ||
| return Decomposed::Expansion(char_from_u16(first), char_from_u16(second)); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| let len = len_bits + 1; | ||
| // Only lengths 1 and 2 are answered from `scalars24` here; longer | ||
| // ones go to the supplementary lookup after the loop. | ||
| if len > 2 { | ||
| break; | ||
| } | ||
| let offset24 = offset - tables.scalars16.len(); | ||
| if let Some(first_c) = tables.scalars24.get(offset24) { | ||
| if len == 1 { | ||
| return Decomposed::Singleton(first_c); | ||
| } | ||
| if let Some(second_c) = tables.scalars24.get(offset24 + 1) { | ||
| return Decomposed::Expansion(first_c, second_c); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| // Reached only via `break`: look `c` up in the supplementary trie. | ||
| let non_recursive = self.non_recursive; | ||
| let non_recursive_decomposition = non_recursive.trie.get(c); | ||
| if non_recursive_decomposition == 0 { | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| return Decomposed::Default; | ||
| } | ||
| // Split the supplementary trie value into its upper and lower 16 bits. | ||
| let trail_or_complex = (non_recursive_decomposition >> 16) as u16; | ||
| let lead = non_recursive_decomposition as u16; | ||
| if lead != 0 && trail_or_complex != 0 { | ||
| // Decomposition into two BMP characters | ||
| return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex)); | ||
| } | ||
| if lead != 0 { | ||
| // Decomposition into one BMP character | ||
| return Decomposed::Singleton(char_from_u16(lead)); | ||
| } | ||
| // Decomposition into two non-BMP characters | ||
| // Low is offset into a table plus one to keep it non-zero. | ||
| // NOTE(review): the value used here is `trail_or_complex`, i.e. the | ||
| // upper 16 bits of the trie value; `lead` (the lower half) is zero on | ||
| // this path. The wording "Low" above may be stale; confirm. | ||
| let offset = usize::from(trail_or_complex - 1); | ||
| if let Some(first) = non_recursive.scalars24.get(offset) { | ||
| if let Some(second) = non_recursive.scalars24.get(offset + 1) { | ||
| return Decomposed::Expansion(first, second); | ||
| } | ||
| } | ||
| // GIGO case | ||
| debug_assert!(false); | ||
| Decomposed::Default | ||
| } | ||
| } | ||
| /// The raw (non-recursive) canonical decomposition operation. | ||
| /// | ||
| /// Callers should generally use `DecomposingNormalizer` instead of this API. | ||
| /// However, this API is provided for callers such as HarfBuzz that specifically | ||
| /// want access to non-recursive canonical decomposition e.g. for use in a | ||
| /// glyph-availability-guided custom normalizer. | ||
| /// | ||
| /// This type owns its data payloads; `as_borrowed` yields the borrowed | ||
| /// counterpart, `CanonicalDecompositionBorrowed`. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalDecomposition { | ||
| // Payload for the `NormalizerNfdDataV1` marker. | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| // Payload for the `NormalizerNfdTablesV1` marker. | ||
| tables: DataPayload<NormalizerNfdTablesV1>, | ||
| // Payload for the `NormalizerNfdSupplementV1` marker. | ||
| non_recursive: DataPayload<NormalizerNfdSupplementV1>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalDecomposition { | ||
| fn default() -> Self { | ||
| // `Self::new()` returns the `'static` borrowed form backed by compiled | ||
| // data; `static_to_owned` converts it into the owned type. | ||
| Self::new().static_to_owned() | ||
| } | ||
| } | ||
| impl CanonicalDecomposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> { | ||
| // Borrow the data out of each of the three payloads. | ||
| CanonicalDecompositionBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| tables: self.tables.get(), | ||
| non_recursive: self.non_recursive.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| // Note: returns the borrowed type rather than `Self`, hence the lint | ||
| // expectation below. | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalDecompositionBorrowed<'static> { | ||
| CanonicalDecompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfdSupplementV1> | ||
| + ?Sized, | ||
| { | ||
| // The payloads are loaded in order (data, tables, supplement); the type | ||
| // annotation on each binding selects the marker, and `?` returns the | ||
| // first load error to the caller. | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; | ||
| // Reject tables whose combined length exceeds 0xFFF; this check runs | ||
| // before the supplement is loaded. | ||
| if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { | ||
| // The data is from a future where there exists a normalization flavor whose | ||
| // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points | ||
| // of space. If a good use case from such a decomposition flavor arises, we can | ||
| // dynamically change the bit masks so that the length mask becomes 0x1FFF instead | ||
| // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, | ||
| // since for now the masks are hard-coded, error out. | ||
| return Err(DataError::custom("future extension")); | ||
| } | ||
| let non_recursive: DataPayload<NormalizerNfdSupplementV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalDecomposition { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| }) | ||
| } | ||
| } | ||
| /// Borrowed version of lookup of the Canonical_Combining_Class Unicode property. | ||
| /// | ||
| /// # Example | ||
| /// | ||
| /// ``` | ||
| /// use icu::properties::props::CanonicalCombiningClass; | ||
| /// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed; | ||
| /// | ||
| /// let map = CanonicalCombiningClassMapBorrowed::new(); | ||
| /// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A | ||
| /// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMapBorrowed<'a> { | ||
| /// The data trie | ||
| // Borrowed decomposition data; the combining class is read from the | ||
| // values of its `trie` field. | ||
| decompositions: &'a DecompositionData<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalCombiningClassMapBorrowed<'static> { | ||
| fn default() -> Self { | ||
| // Same as `new()`: backed by compiled data. | ||
| Self::new() | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalCombiningClassMap { | ||
| // Wraps the `'static` reference in a payload via | ||
| // `DataPayload::from_static_ref`. | ||
| CanonicalCombiningClassMap { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| // Uses the baked NFD decomposition data singleton. | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'_> { | ||
| /// Look up the canonical combining class for a scalar value. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class; | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| #[inline(always)] | ||
| pub fn get_u8(&self, c: char) -> u8 { | ||
| // Delegates to the `u32` variant. | ||
| self.get32_u8(u32::from(c)) | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `Not_Reordered` is returned. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class; | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| pub fn get32_u8(&self, c: u32) -> u8 { | ||
| let trie_value = self.decompositions.trie.get32(c); | ||
| // When the trie value is flagged as carrying a combining class, the | ||
| // class is its low 8 bits; otherwise report `NotReordered`. | ||
| if trie_value_has_ccc(trie_value) { | ||
| trie_value as u8 | ||
| } else { | ||
| ccc!(NotReordered, 0).to_icu4c_value() | ||
| } | ||
| } | ||
| /// Look up the canonical combining class for a scalar value. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[inline(always)] | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get(&self, c: char) -> CanonicalCombiningClass { | ||
| // Typed wrapper around `get_u8`. | ||
| CanonicalCombiningClass::from_icu4c_value(self.get_u8(c)) | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `CanonicalCombiningClass::NotReordered` is returned. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get32(&self, c: u32) -> CanonicalCombiningClass { | ||
| // Typed wrapper around `get32_u8`. | ||
| CanonicalCombiningClass::from_icu4c_value(self.get32_u8(c)) | ||
| } | ||
| } | ||
| /// Lookup of the Canonical_Combining_Class Unicode property. | ||
| /// | ||
| /// Owned counterpart of `CanonicalCombiningClassMapBorrowed`; queries go | ||
| /// through the borrowed type obtained from `as_borrowed`. | ||
| #[derive(Debug)] | ||
| pub struct CanonicalCombiningClassMap { | ||
| /// The data trie | ||
| // Payload for the `NormalizerNfdDataV1` marker. | ||
| decompositions: DataPayload<NormalizerNfdDataV1>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for CanonicalCombiningClassMap { | ||
| fn default() -> Self { | ||
| // `Self::new()` returns the `'static` borrowed form backed by compiled | ||
| // data; `static_to_owned` converts it into the owned type. | ||
| Self::new().static_to_owned() | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMap { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> { | ||
| // Borrow the data out of the payload. | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| // Note: returns the borrowed type rather than `Self`, hence the lint | ||
| // expectation below. | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> { | ||
| CanonicalCombiningClassMapBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ]); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> + ?Sized, | ||
| { | ||
| // Load the single payload this type needs; `?` returns a load error | ||
| // to the caller. | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalCombiningClassMap { decompositions }) | ||
| } | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. | ||
| //! | ||
| //! <div class="stab unstable"> | ||
| //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| //! including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| //! to be stable, their Rust representation might not be. Use with caution. | ||
| //! </div> | ||
| //! | ||
| //! Read more about data providers: [`icu_provider`] | ||
| // Provider structs must be stable | ||
| #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] | ||
| use icu_collections::char16trie::Char16Trie; | ||
| use icu_collections::codepointtrie::CodePointTrie; | ||
| use icu_provider::prelude::*; | ||
| use zerovec::ZeroVec; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[derive(Debug)] | ||
| /// Baked data | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. In particular, the `DataProvider` implementations are only | ||
| /// guaranteed to match with this version's `*_unstable` providers. Use with caution. | ||
| /// </div> | ||
| // Unit struct: it holds no fields and is only compiled with the | ||
| // `compiled_data` feature. | ||
| pub struct Baked; | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(unused_imports)] | ||
| // Anonymous const block: the glob import and the helper `icu` module below | ||
| // are scoped to this block. | ||
| const _: () = { | ||
| use icu_normalizer_data::*; | ||
| // Re-exports this crate as `icu::normalizer` and `icu_collections` as | ||
| // `icu::collections` (presumably so that `icu::…` paths used by the | ||
| // macros below resolve; confirm against `icu_normalizer_data`). | ||
| pub mod icu { | ||
| pub use crate as normalizer; | ||
| pub use icu_collections as collections; | ||
| } | ||
| // One macro invocation per data marker, each applied to `Baked`. | ||
| make_provider!(Baked); | ||
| impl_normalizer_nfc_v1!(Baked); | ||
| impl_normalizer_nfd_data_v1!(Baked); | ||
| impl_normalizer_nfd_supplement_v1!(Baked); | ||
| impl_normalizer_nfd_tables_v1!(Baked); | ||
| impl_normalizer_nfkd_data_v1!(Baked); | ||
| impl_normalizer_nfkd_tables_v1!(Baked); | ||
| impl_normalizer_uts46_data_v1!(Baked); | ||
| }; | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for canonical decomposition. | ||
| NormalizerNfdDataV1, | ||
| "normalizer/nfd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for canonical decomposition. | ||
| NormalizerNfdTablesV1, | ||
| "normalizer/nfd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for compatibility decomposition. | ||
| NormalizerNfkdDataV1, | ||
| "normalizer/nfkd/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for compatibility decomposition. | ||
| NormalizerNfkdTablesV1, | ||
| "normalizer/nfkd/tables/v1", | ||
| DecompositionTables<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for UTS-46 decomposition. | ||
| NormalizerUts46DataV1, | ||
| "normalizer/uts46/data/v1", | ||
| DecompositionData<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for data for composition. | ||
| NormalizerNfcV1, | ||
| "normalizer/nfc/v1", | ||
| CanonicalCompositions<'static>, | ||
| is_singleton = true | ||
| ); | ||
| icu_provider::data_marker!( | ||
| /// Marker for additional data for non-recursive decomposition. | ||
| NormalizerNfdSupplementV1, | ||
| "normalizer/nfd/supplement/v1", | ||
| NonRecursiveDecompositionSupplement<'static>, | ||
| is_singleton = true | ||
| ); | ||
| #[cfg(feature = "datagen")] | ||
| /// The latest minimum set of markers required by this component. | ||
| /// | ||
| /// Only compiled when the `datagen` Cargo feature is enabled. | ||
| pub const MARKERS: &[DataMarkerInfo] = &[ | ||
| NormalizerNfcV1::INFO, | ||
| NormalizerNfdDataV1::INFO, | ||
| NormalizerNfdTablesV1::INFO, | ||
| NormalizerNfkdDataV1::INFO, | ||
| NormalizerNfkdTablesV1::INFO, | ||
| NormalizerNfdSupplementV1::INFO, | ||
| NormalizerUts46DataV1::INFO, | ||
| ]; | ||
| /// Decomposition data | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| // `Serialize` and `Bake` are derived only with the `datagen` feature, | ||
| // `Deserialize` only with the `serde` feature. | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionData<'data> { | ||
| /// Trie for decomposition. | ||
| // Maps code points to `u32` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// The passthrough bounds of NFD/NFC are lowered to this | ||
| /// maximum instead. (16-bit, because cannot be higher | ||
| /// than 0x0300, which is the bound for NFC.) | ||
| pub passthrough_cap: u16, | ||
| } | ||
| icu_provider::data_struct!( | ||
| DecompositionData<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// The expansion tables for cases where the decomposition isn't | ||
| /// contained in the trie value | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| // `Serialize` and `Bake` are derived only with the `datagen` feature, | ||
| // `Deserialize` only with the `serde` feature. | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct DecompositionTables<'data> { | ||
| /// Decompositions that are fully within the BMP | ||
| // Stored as 16-bit units. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars16: ZeroVec<'data, u16>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP | ||
| // Stored as `char` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| DecompositionTables<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-Hangul canonical compositions | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| // `Serialize` and `Bake` are derived only with the `datagen` feature, | ||
| // `Deserialize` only with the `serde` feature. | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct CanonicalCompositions<'data> { | ||
| /// Trie keys are two-`char` strings with the second | ||
| /// character coming first. The value, if any, is the | ||
| /// (non-Hangul) canonical composition. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub canonical_compositions: Char16Trie<'data>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| CanonicalCompositions<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); | ||
| /// Non-recursive canonical decompositions that differ from | ||
| /// `DecompositionData`. | ||
| /// | ||
| /// <div class="stab unstable"> | ||
| /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| /// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| /// to be stable, their Rust representation might not be. Use with caution. | ||
| /// </div> | ||
| // `Serialize` and `Bake` are derived only with the `datagen` feature, | ||
| // `Deserialize` only with the `serde` feature. | ||
| #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] | ||
| #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] | ||
| #[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] | ||
| #[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
| pub struct NonRecursiveDecompositionSupplement<'data> { | ||
| /// Trie for the supplementary non-recursive decompositions | ||
| // Maps code points to `u32` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub trie: CodePointTrie<'data, u32>, | ||
| /// Decompositions with at least one character outside | ||
| /// the BMP | ||
| // Stored as `char` values. | ||
| #[cfg_attr(feature = "serde", serde(borrow))] | ||
| pub scalars24: ZeroVec<'data, char>, | ||
| } | ||
| icu_provider::data_struct!( | ||
| NonRecursiveDecompositionSupplement<'_>, | ||
| #[cfg(feature = "datagen")] | ||
| ); |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Bundles the part of UTS 46 that makes sense to implement as a | ||
| //! normalization. | ||
| //! | ||
| //! This is meant to be used as a building block of a UTS 46 | ||
| //! implementation, such as the `idna` crate. | ||
| use crate::ComposingNormalizer; | ||
| use crate::ComposingNormalizerBorrowed; | ||
| use crate::NormalizerNfcV1; | ||
| use crate::NormalizerNfdTablesV1; | ||
| use crate::NormalizerNfkdTablesV1; | ||
| use crate::NormalizerUts46DataV1; | ||
| use icu_provider::DataError; | ||
| use icu_provider::DataProvider; | ||
| // Implementation note: Despite merely wrapping a `ComposingNormalizer`, | ||
| // having a `Uts46Mapper` serves two purposes: | ||
| // | ||
| // 1. Denying public access to parts of the `ComposingNormalizer` API | ||
| // that don't work when the data contains markers for ignorables. | ||
| // 2. Providing a place where additional iterator pre-processing or | ||
| // post-processing can take place if needed in the future. (When | ||
| // writing this, it looked like such processing was needed but | ||
| // now isn't needed after all.) | ||
| /// A borrowed version of a mapper that knows how to perform the | ||
| /// subsets of UTS 46 processing documented on the methods. | ||
| #[derive(Debug)] | ||
| pub struct Uts46MapperBorrowed<'a> { | ||
| // The wrapped composing normalizer that does the actual work. | ||
| normalizer: ComposingNormalizerBorrowed<'a>, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for Uts46MapperBorrowed<'static> { | ||
| fn default() -> Self { | ||
| // Same as `new()`: backed by compiled data. | ||
| Self::new() | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'static> { | ||
| /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`]. | ||
| pub const fn static_to_owned(self) -> Uts46Mapper { | ||
| // Converts the wrapped normalizer with its own `static_to_owned`. | ||
| Uts46Mapper { | ||
| normalizer: self.normalizer.static_to_owned(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| // Wraps the UTS 46 flavor of the composing normalizer. | ||
| Uts46MapperBorrowed { | ||
| normalizer: ComposingNormalizerBorrowed::new_uts46(), | ||
| } | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'_> { | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the "Map" and "Normalize" steps of the "Processing" | ||
| /// section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are ignored. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| // Delegates to the wrapped normalizer with `IgnorableBehavior::Ignored`. | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) | ||
| } | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the NFC check and status steps of the "Validity | ||
| /// Criteria" section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are treated as _disallowed_. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| /// * The output needs to be compared with input to see if anything | ||
| /// changed. This check catches failures to adhere to the normalization | ||
| /// and status requirements. In particular, this comparison results | ||
| /// in _mapped_ characters resulting in error like "Validity Criteria" | ||
| /// requires. | ||
| pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| // Delegates to the wrapped normalizer with | ||
| // `IgnorableBehavior::ReplacementCharacter`. | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) | ||
| } | ||
| } | ||
| /// A mapper that knows how to perform the subsets of UTS 46 processing | ||
| /// documented on the methods. | ||
| #[derive(Debug)] | ||
| pub struct Uts46Mapper { | ||
| // The wrapped (owned) composing normalizer. | ||
| normalizer: ComposingNormalizer, | ||
| } | ||
| #[cfg(feature = "compiled_data")] | ||
| impl Default for Uts46Mapper { | ||
| fn default() -> Self { | ||
| // `Self::new()` returns the `'static` borrowed form backed by compiled | ||
| // data; `static_to_owned` converts it into the owned type. | ||
| Self::new().static_to_owned() | ||
| } | ||
| } | ||
| impl Uts46Mapper { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> { | ||
| // Borrow the wrapped normalizer. | ||
| Uts46MapperBorrowed { | ||
| normalizer: self.normalizer.as_borrowed(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| // Note: returns the borrowed type rather than `Self`, hence the lint | ||
| // expectation below. | ||
| #[cfg(feature = "compiled_data")] | ||
| #[expect(clippy::new_ret_no_self)] | ||
| pub const fn new() -> Uts46MapperBorrowed<'static> { | ||
| Uts46MapperBorrowed::new() | ||
| } | ||
| /// Construct with provider. | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerUts46DataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfkdTablesV1> | ||
| // UTS 46 tables merged into NormalizerNfkdTablesV1 | ||
| + DataProvider<NormalizerNfcV1> | ||
| + ?Sized, | ||
| { | ||
| // All loading is delegated to the composing normalizer's UTS 46 | ||
| // constructor; `?` returns its error to the caller. | ||
| let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; | ||
| Ok(Uts46Mapper { normalizer }) | ||
| } | ||
| } |
| # This is a placeholder in the interest of keeping the repository size smaller. | ||
| # Replace this file with the contents of | ||
| # https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually | ||
| # run the conformance test. |
| The test data comes from | ||
| https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt |
Sorry, the diff of this file is too big to display
| { | ||
| "git": { | ||
| "sha1": "31e2bfa8e39e069dcef6de3f6914c5d722e90d00" | ||
| }, | ||
| "path_in_vcs": "components/normalizer" | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{criterion_group, criterion_main}; | ||
| mod canonical_composition; | ||
| mod canonical_decomposition; | ||
| mod composing_normalizer_nfc; | ||
| mod composing_normalizer_nfkc; | ||
| mod decomposing_normalizer_nfd; | ||
| mod decomposing_normalizer_nfkd; | ||
| criterion_group!( | ||
| benches, | ||
| canonical_composition::criterion_benchmark, | ||
| canonical_decomposition::criterion_benchmark, | ||
| composing_normalizer_nfc::criterion_benchmark, | ||
| composing_normalizer_nfkc::criterion_benchmark, | ||
| decomposing_normalizer_nfd::criterion_benchmark, | ||
| decomposing_normalizer_nfkd::criterion_benchmark, | ||
| ); | ||
| criterion_main!(benches); |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use detone::IterDecomposeVietnamese; | ||
| use icu_normalizer::properties::{ | ||
| CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed, | ||
| }; | ||
| use icu_normalizer::ComposingNormalizerBorrowed; | ||
| // One benchmark input: a data file name and the `(char, char)` pairs | ||
| // prepared from that file's content. | ||
| struct BenchDataContent { | ||
| pub file_name: String, | ||
| pub pairs: Vec<(char, char)>, | ||
| } | ||
/// Returns `content` with every line that starts with `#` removed.
///
/// The surviving lines are joined with `\n`. Line terminators recognized by
/// [`str::lines`] (`\n` and `\r\n`) are not preserved, and the result has no
/// trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines rather than allocating a `String` for each
    // one; `join` then performs the single allocation for the result.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 16] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| [ | ||
| BenchDataContent { | ||
| file_name: "TestNames_Latin".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_h".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_h.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Japanese_k".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestNames_Japanese_k.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestNames_Korean".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ar".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ar.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_de".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_de.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_el".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_el.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_es".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_es.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_fr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_fr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_he".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_he.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_pl".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_pl.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_ru".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_ru.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_th".to_owned(), | ||
| #[cfg(debug_assertions)] | ||
| pairs: Vec::new(), | ||
| #[cfg(not(debug_assertions))] | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_th.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "TestRandomWordsUDHR_tr".to_owned(), | ||
| pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( | ||
| "./data/TestRandomWordsUDHR_tr.txt" | ||
| )))), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie".to_owned(), | ||
| pairs: decompose_data( | ||
| &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))), | ||
| ), | ||
| }, | ||
| BenchDataContent { | ||
| file_name: "udhr_vie_detone".to_owned(), | ||
| pairs: { | ||
| let result: Vec<(char, char)> = nfc_normalizer | ||
| .normalize(&strip_headers(include_str!("data/wotw.txt"))) | ||
| .chars() | ||
| .filter_map(|c| { | ||
| let mut iter = std::iter::once(c).decompose_vietnamese_tones(true); | ||
| if let Some(base) = iter.next() { | ||
| iter.next().map(|tone| (base, tone)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| assert!(!result.is_empty()); | ||
| result | ||
| }, | ||
| }, | ||
| ] | ||
| } | ||
| fn function_under_bench( | ||
| canonical_composer: &CanonicalCompositionBorrowed, | ||
| composable_points: &[(char, char)], | ||
| ) { | ||
| for pair in composable_points.iter() { | ||
| canonical_composer.compose(pair.0, pair.1); | ||
| } | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_composition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let composer = CanonicalCompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } | ||
| fn decompose_data(nfc: &str) -> Vec<(char, char)> { | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| nfc.chars() | ||
| .map(|c| decomposer.decompose(c)) | ||
| .filter_map(|decomposed| { | ||
| if let Decomposed::Expansion(a, b) = decomposed { | ||
| Some((a, b)) | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect() | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::properties::CanonicalDecompositionBorrowed; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark data set, stored in four normalization forms.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark ID.
    pub file_name: String,
    /// The text as produced by the NFC normalizer.
    pub nfc: String,
    /// The text as produced by the NFD normalizer.
    pub nfd: String,
    /// The text as produced by the NFKC normalizer.
    pub nfkc: String,
    /// The text as produced by the NFKD normalizer.
    pub nfkd: String,
}
/// Drops every line of `content` that starts with `#` (the license header in
/// the bundled data files) and joins the remaining lines with `\n`.
///
/// Lines are split with [`str::lines`], so `\r\n` terminators are normalized
/// and the result carries no trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines; `join` performs the only allocation needed,
    // instead of one owned `String` per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc_normalizer.normalize(raw_content).to_string(), | ||
| nfd: nfd_normalizer.normalize(raw_content).to_string(), | ||
| nfkc: nfkc_normalizer.normalize(raw_content).to_string(), | ||
| nfkd: nfkd_normalizer.normalize(raw_content).to_string(), | ||
| }) | ||
| } | ||
| #[cfg(debug_assertions)] | ||
| fn function_under_bench( | ||
| _canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| _decomposable_points: &str, | ||
| ) { | ||
| // using debug assertion fails some test. | ||
| // "cargo test --bench bench" will pass | ||
| // "cargo bench" will work as expected, because the profile doesn't include debug assertions. | ||
| } | ||
| #[cfg(not(debug_assertions))] | ||
| fn function_under_bench( | ||
| canonical_decomposer: &CanonicalDecompositionBorrowed, | ||
| decomposable_points: &str, | ||
| ) { | ||
| decomposable_points.chars().for_each(|point| { | ||
| canonical_decomposer.decompose(point); | ||
| }); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "canonical_decomposition"; | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| let decomposer = CanonicalDecompositionBorrowed::new(); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)), | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)), | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark data set, stored in four normalization forms as both UTF-8
/// and UTF-16.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark ID.
    pub file_name: String,
    /// The text as produced by the NFC normalizer.
    pub nfc: String,
    /// The text as produced by the NFD normalizer.
    pub nfd: String,
    /// The text as produced by the NFKC normalizer.
    pub nfkc: String,
    /// The text as produced by the NFKD normalizer.
    pub nfkd: String,
    /// UTF-16 encoding of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Drops every line of `content` that starts with `#` (the license header in
/// the bundled data files) and joins the remaining lines with `\n`.
///
/// Lines are split with [`str::lines`], so `\r\n` terminators are normalized
/// and the result carries no trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines; `join` performs the only allocation needed,
    // instead of one owned `String` per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF_16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkc_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkc_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!( | ||
| "from_nfkd_{}_utf_16", | ||
| bench_data_content.file_name | ||
| )), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_utf16( | ||
| &normalizer_under_bench, | ||
| &bench_data_content.nfkd_u16, | ||
| ) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark data set, stored in four normalization forms as both UTF-8
/// and UTF-16.
struct BenchDataContent {
    /// Label of the data set; it becomes part of the benchmark ID.
    pub file_name: String,
    /// The text as produced by the NFC normalizer.
    pub nfc: String,
    /// The text as produced by the NFD normalizer.
    pub nfd: String,
    /// The text as produced by the NFKC normalizer.
    pub nfkc: String,
    /// The text as produced by the NFKD normalizer.
    pub nfkd: String,
    /// UTF-16 encoding of `nfc`.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfd`.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkc`.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 encoding of `nfkd`.
    pub nfkd_u16: Vec<u16>,
}
/// Drops every line of `content` that starts with `#` (the license header in
/// the bundled data files) and joins the remaining lines with `\n`.
///
/// Lines are split with [`str::lines`], so `\r\n` terminators are normalized
/// and the result carries no trailing newline.
fn strip_headers(content: &str) -> String {
    // Borrow the surviving lines; `join` performs the only allocation needed,
    // instead of one owned `String` per line.
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect::<Vec<&str>>()
        .join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "composing_normalizer_nfkc"; | ||
| let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| # Generating microbench data | ||
| The full versions of these files are located in the | ||
| [ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data). | ||
| ## Sanitizing the file | ||
| ```shell | ||
| sed -i '/^#/d' ${filename} | ||
| sed -i '/^$/d' ${filename} | ||
| ``` | ||
| ## Shuffling the file | ||
| ```shell | ||
| shuf -n 20 ${filename} -o ${filename} | ||
| ``` | ||
| ## Add back the header (if you plan on submitting the files) | ||
| ``` | ||
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ``` |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| かげやま,みのる | ||
| むらかみ,とおる | ||
| つじさわ,けい | ||
| やすい,たかゆき | ||
| むらさき,としお | ||
| はせがわ,ひであき | ||
| うるしばら,よしひこ | ||
| ままだ,ひろし | ||
| おおぼら,えいじろう | ||
| おおば,まさひで | ||
| きたばたけ,たかひこ | ||
| はまさき,あつし | ||
| ほりい,つねお | ||
| もり,だいいち | ||
| いとう,しんいち | ||
| くにもと,じゅんじ | ||
| おか,のりひと | ||
| たに,よしあき | ||
| しらがき,ひろあき | ||
| しらはま,たけひろ | ||
| むらかみ,やすひろ | ||
| うめはら,たかし | ||
| いわた,ひろし | ||
| すぎえ,かつとし | ||
| てらにし,ひろみつ | ||
| まつおか,だいすけ | ||
| もろほし,すすむ | ||
| いしはら,たかし | ||
| おしま,ひろお | ||
| なかお,ゆうじ | ||
| いかり,はるお | ||
| きまち,まさき | ||
| ふるかわ,みちお | ||
| かねこ,しゅうへい | ||
| なかがわ,ともみ | ||
| ささき,しんご | ||
| うちだ,たくじ | ||
| うめだ,さかえ | ||
| しばた,いくこ | ||
| まきした,けいこ | ||
| まつもと,しんいちろう | ||
| たかの,かずよし | ||
| いしわた,なおひさ | ||
| いうち,まこと | ||
| いまい,りほ | ||
| みずた,のりあき | ||
| かくたに,まなぶ | ||
| わだ,ほまれ | ||
| わかまつ,かずき | ||
| かわぐち,ひろき |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ホリモト,ユウジ | ||
| ハナミ,ヤスヒデ | ||
| イシザカ,タカユキ | ||
| ゼンケ,トシオ | ||
| ハトリ,ユウコ | ||
| ナガオカ,トモユキ | ||
| コウダ,ケンイチ | ||
| イシダ,ヒロシ | ||
| ミワ,シゲユキ | ||
| イシカワ,ヒロシ | ||
| スズキ,ユウスケ | ||
| オクダ,ヨシノリ | ||
| シムラ,サカエ | ||
| エビシマ,ヤスユキ | ||
| イブカ,ヨシテル | ||
| タノ,マコト | ||
| ドウゾノ,セイヤ | ||
| ヤマナカ,サツミ | ||
| トミイエ,ハヤト | ||
| アザミ,ツトム | ||
| タナカ,キョウコ | ||
| コジマ,アツシ | ||
| フミハラ,カオリ | ||
| スズキ,マサユキ | ||
| ナトリ,ケンヤ | ||
| スズキ,ユウコ | ||
| スズキ,ヒサエ | ||
| ナカガワ,カツヨシ | ||
| スズキ,マサフミ | ||
| マツヤマ,トシオ | ||
| ヨシナガ,チカエ | ||
| キタムラ,リカコ | ||
| アオキ,タクオ | ||
| ヤマグチ,ヤスヒロ | ||
| スギムラ,シゲオ | ||
| ウエスギ,マサミ | ||
| マツムラ,シンイチ | ||
| クバ,タカシ | ||
| スドウ,タカトシ | ||
| フジモト,ヒロシ | ||
| イトウ,シュウイチ | ||
| コバヤシ,カズミ | ||
| タナカ,ヒロカツ | ||
| イシダ,ツカサ | ||
| ヤマダ,マサコ | ||
| カミヤ,トミエ | ||
| タケモト,ユウジ | ||
| スミノ,コウジ | ||
| ヒロハタ,タクヤ | ||
| ミヒラ,リョウヘイ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| 김명희 | ||
| 홍차수 | ||
| 허순재 | ||
| 강영휘 | ||
| 김운주 | ||
| 이종환 | ||
| 이은국 | ||
| 강태호 | ||
| 강일래 | ||
| 김동현 | ||
| 곽기자 | ||
| 차재수 | ||
| 표봉기 | ||
| 문대원 | ||
| 이형기 | ||
| 최교표 | ||
| 박식현 | ||
| 홍종립 | ||
| 서창수 | ||
| 김쌍건 | ||
| 서말도 | ||
| 이병훈 | ||
| 김희수 | ||
| 박학태 | ||
| 강태종 | ||
| 조문란 | ||
| 신범균 | ||
| 백두진 | ||
| 이철정 | ||
| 김태중 | ||
| 이성현 | ||
| 김주조 | ||
| 김강행 | ||
| 이정길 | ||
| 김완일 | ||
| 권수자 | ||
| 이춘철 | ||
| 김판근 | ||
| 김곡리 | ||
| 이경형 | ||
| 이운만 | ||
| 손상철 | ||
| 유기숙 | ||
| 박정한 | ||
| 조윤래 | ||
| 유신호 | ||
| 이두수 | ||
| 김재률 | ||
| 김성홍 | ||
| 김혜경 |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| González, Joan | ||
| Reinders, Jim | ||
| Applebroog, Ida | ||
| Kidd, Joseph Bartholomew | ||
| Gulácsy, Lajos | ||
| Letendre, Rita | ||
| Zuccaro, Federico | ||
| Apt the Elder, Ulrich | ||
| Drummond, Arthur | ||
| Manley, Thomas | ||
| Broc, Jean | ||
| Ramunno, Tony | ||
| Simone dei Crocifissi | ||
| Lane, Theodore | ||
| Symonds, William Robert | ||
| Johnson, Frank Tenney | ||
| Cox, Gardner | ||
| Bunbury, Charles | ||
| Pedro de la Cuadra | ||
| Payne, William | ||
| Lucas, John Seymour | ||
| Holsman, Elizabeth T. | ||
| de Vries, Auke | ||
| Laszlo, Philip Alexius de | ||
| Shigemasa | ||
| Wolfe, Ruth Mitchell | ||
| Buck, John | ||
| Baselitz, Georg | ||
| Hook, Walter | ||
| Segall, Lasar | ||
| Brush, George deForest | ||
| Master of Jánosrét | ||
| Sutherland, Elizabeth Leveson-Gower, Countess of | ||
| Tuckerman, Jane | ||
| Varley, F.H. | ||
| Fosso, Samuel | ||
| Gardner, Daniel | ||
| Sadler, Walter Dendy | ||
| Clausen, Franciska | ||
| Coman, Charlotte Buell | ||
| Wakelin, Roland | ||
| Payne, Jon, CML | ||
| Campagna, Girolamo | ||
| Wiener, Phyllis | ||
| Sallee, Charles | ||
| Fitzgerald, John Anster | ||
| Gribbroek, Robert | ||
| Laporte, John | ||
| Lévy-Dhurmer, Lucien | ||
| Young, Stephen Scott |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ณรงค์ โต๊ะเงิน | ||
| กิตติ บุญวันต์ | ||
| สมหมาย ดาบทองดี | ||
| ธวัชชัย อิสระนิมิตร | ||
| วรรณา โสภณนรินทร์ | ||
| วินัย หมู่มิ่ง | ||
| พัชรี ชูจิรวงศ์ | ||
| สมปอง จิวไพโรจน์กิจ | ||
| บุญส่ง กวยรักษา | ||
| นิพนธ์ นิ่มใหม่ | ||
| พัชรี สุวพรศิลป์ | ||
| เจริญ นววัฒนทรัพย์ | ||
| อรพินท์ แซ่เจี่ย | ||
| ชัยพร สมใจนึก | ||
| ประนอม โคศิลา | ||
| ฉวีวรรณ ศรสังข์ทอง | ||
| วัชรา เจริญรัตนพร | ||
| สุภัท นกศิริ | ||
| อู๋ มาลาเล็ก | ||
| ประยูร ไชโย | ||
| ละออ อยู่ยืนยง | ||
| สมใจ วิวัฒน์วานิช | ||
| จุมพล จันทรศรีเกษร | ||
| พุฒ ดอกไม้จีน | ||
| บุญชัย วรกิจพรสิน | ||
| สมาน ธูปเทียน | ||
| พงศ์ศักดิ์ แซ่แต้ | ||
| อำนาจ ไวจงเจริญ | ||
| พรทิพย์ แซ่ลี้ | ||
| อุไรวรรณ สาครสินธุ์ | ||
| อำพล วีระตะนนท์ | ||
| สมจิตร ใจวังโลก | ||
| สุเทพ ตันวินิจ | ||
| สวาท ทรัพย์มาก | ||
| สมศักดิ์ เจือจันทร์ | ||
| ดัสซันซิงห์ กุลาตี | ||
| ธีร ศรแก้ว | ||
| พรรณยุพา ฮ่อสกุล | ||
| สำราญ จันทร์เอี่ยม | ||
| พจน์ มั่นกันนาน | ||
| สุธี บุณยเกียรติ | ||
| บุญโชติ ทิพย์ประเสริฐสิน | ||
| ประดิษฐ์ ทองพสิฐสมบัติ | ||
| จำเนียร เพ็งเจริญ | ||
| สมศักดิ์ อรุณรัตน์ | ||
| อนุชา จารุหิรัญสกุล | ||
| พิกุล มโนภิญโญภิญญะ | ||
| ผ่องศรี นกแก้ว | ||
| อารี วิไลวรรณ | ||
| ณรงค์วิทย์ วิทสัทธาวรกุล |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| ممارسة مراعاة | ||
| العنصرية | ||
| حدود والشيخوخة | ||
| بالحكم كهذا ينتفع | ||
| البلاد | ||
| تربية | ||
| الغير التقدم والعدل | ||
| نحو بالتعليم والحرية | ||
| تأمين متساو | ||
| للتعليم فيها | ||
| آذت اعتداء للتعليم | ||
| ليس المتأصلة | ||
| والمساهمة الضروري تتناقض | ||
| وتأسيس | ||
| رضى | ||
| شرعي الطبية | ||
| لكيلا الجمعية والحرية | ||
| للرجال التزوج | ||
| بالكرامة | ||
| حرية بين | ||
| هذه العيش تنظر | ||
| قيد | ||
| يقررها والصداقة | ||
| اعتُمد وينبغي اجتماعي | ||
| حرمان | ||
| للإدراك بأجر إنتاجه | ||
| التربية القانون | ||
| لإنصافه وتأسيس وسمعته | ||
| أساسه للرجال | ||
| كافة | ||
| المجهود دولي أينما | ||
| وإلى | ||
| بنشاط تجري | ||
| والأمم مثل لحقوق | ||
| الإنسان بشروط بحماية | ||
| شرفه | ||
| كما الوظائف | ||
| حياته ديسمبر | ||
| ولما | ||
| هذه | ||
| غاية جديد إنسان | ||
| حرية | ||
| متهم الوطنية قدمًا | ||
| التملك وضع | ||
| شرعية ويعبر تأدية | ||
| بنظام عمل والأخلاق | ||
| التملك لشخصيته يلجأ | ||
| بحال يضطر ولا | ||
| الانضمام بالكرامة | ||
| عضوا |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| Herrschaft Freiheiten Not | ||
| Gewalt | ||
| stets anderer begründet | ||
| erhobenen innerstaatliche | ||
| Heiratsfähige freie | ||
| offenstehen Begrenzung grausamer | ||
| Maßnahmen höchste | ||
| unentbehrlich privat | ||
| erniedrigender | ||
| Verachtung freie | ||
| innezuhaben innerstaatlichen | ||
| kommen | ||
| werden gleichgültig | ||
| Würde überall höchste | ||
| Schutzmaßnahmen den Pflichten | ||
| Wille Bestimmung | ||
| Leibeigenschaft einschließlich für | ||
| gleiche bekräftigt Gewissens | ||
| Wohles | ||
| Generalversammlung | ||
| Volkes | ||
| Völkern gegenwärtig Zusammenarbeit | ||
| Heiratsfähige sowie Jeder | ||
| Stellung | ||
| Lebensstandard | ||
| seinem | ||
| Rede strafbaren Sicherheit | ||
| mit | ||
| Kulthandlungen Grund | ||
| ärztlicher | ||
| Auflösung Anforderungen anzugehören | ||
| Furcht | ||
| keine Geburt | ||
| Wohles Furcht genügen | ||
| befriedigende Medien | ||
| anzugehören Urlaub Vereinigungen | ||
| hinzuwirken verboten Resolution | ||
| kommen | ||
| sozialer vor irgendein | ||
| Bestimmung Bestimmung | ||
| Fall natürliche kein | ||
| Geschlecht Aufhetzung eigenen | ||
| seinen | ||
| über | ||
| Unterlassung Berücksichtigung | ||
| war | ||
| Rufes stets | ||
| Volkes anderer Beschränkungen | ||
| Handlungen dessen | ||
| Die |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| προάγει αλληλογραφία | ||
| λογική έχει | ||
| ιδρύει ζωή τεχνική | ||
| δυνατότητες | ||
| περιορισμό συνόλου | ||
| ασκεί παραγνώριση συναφθεί | ||
| αναγνωρίζουν ποινικής εκδηλώνει | ||
| κοινότητας διακυβέρνηση στα | ||
| απέναντι υψηλή | ||
| περιστάσεων αξιόποινη | ||
| σεβασμό | ||
| συντήρησής κατά εξασφαλίσουν | ||
| παραβιάζουν συμπληρώνεται νόμο | ||
| άμεσα | ||
| σημαίνει καθεστώς | ||
| ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων | ||
| ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση | ||
| μέσο | ||
| ίση Εχει | ||
| ειρήνης Κάθε | ||
| μέλη μορφή | ||
| όσο | ||
| κρατείται Στο Διακηρύσσει | ||
| οικονομικών έκφρασης εξασφαλίζεται | ||
| κάθε | ||
| περίπτωση απολαμβάνουν | ||
| ποινικό γεροντική | ||
| είναι μαζί δικαστήρια | ||
| μαζί προοπτική | ||
| δική | ||
| βαρβαρότητας | ||
| οικονομικών εξασφαλίσει | ||
| υποχρεώσεις οδήγησαν | ||
| Οικουμενική Διακήρυξης γονείς | ||
| στις μυστική αντιπροσώπους | ||
| Διακήρυξης άδειες βιοτικό | ||
| αναπηρία ομάδα | ||
| πραγματικό | ||
| καλύτερες | ||
| ανάπαυση | ||
| δίκαιες ένα δικαίου | ||
| μετέχει στους | ||
| θρησκευτικών ποινικής | ||
| Κανείς ίσα | ||
| πεποιθήσεις | ||
| πολιτικές ανάλογα δουλεία | ||
| πολιτικές ιατρική ωσότου | ||
| ηθικής χωρίς | ||
| ανδρών ικανό | ||
| καθώς |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| duración común | ||
| delito reconocimiento alimentación | ||
| inalienables | ||
| entre seguridad escogidos | ||
| comportarse dignidad | ||
| autónomo gobierno tiempo | ||
| omisiones | ||
| comisión | ||
| Derechos territorios | ||
| debe | ||
| han | ||
| regresar inalienables | ||
| regresar | ||
| desempleo científico | ||
| arbitrariamente proclamada | ||
| están contraerse esposos | ||
| cualesquiera | ||
| salir carácter desarrollo | ||
| solamente justas | ||
| personalidad una | ||
| cuanto | ||
| garantice resolución | ||
| concepción | ||
| tomar impondrá | ||
| cualquier reconocimiento | ||
| obligatoria obligatoria satisfactoria | ||
| acusación sin | ||
| artísticas penal culturales | ||
| pagadas examen | ||
| Además Organización dignidad | ||
| opresión esposos ejercidos | ||
| barbarie están mientras | ||
| por | ||
| idioma | ||
| recursos pagadas | ||
| materia Nada ella | ||
| con injerencias | ||
| inspirándose | ||
| organización | ||
| gozar jurisdicción | ||
| que | ||
| asegurar | ||
| humana libertad | ||
| nadie equivalente | ||
| escoger remuneración | ||
| torturas | ||
| individuos poder | ||
| disfruten seres Preámbulo | ||
| desempleo | ||
| liberados |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| conforme êtres fonctions | ||
| non tout généralisé | ||
| premier lui | ||
| faire hommes d’égalité | ||
| peuple volonté bénéficier | ||
| générale nationales | ||
| cruels plus | ||
| d’encourager opinions | ||
| genre l’esprit | ||
| d’origine effectif | ||
| exigences auront | ||
| résultent situation recevoir | ||
| peuples Chacun | ||
| sont d’égalité | ||
| jouissent | ||
| auront l’esprit | ||
| pays telle | ||
| publiquement | ||
| mariage foi | ||
| travail démocratique religieux | ||
| rémunération | ||
| omissions telles | ||
| L’éducation | ||
| raison complétée donner | ||
| invoqué auront arbitraires | ||
| l’amitié suffisant affaires | ||
| travaille l’accomplissement l’intermédiaire | ||
| race | ||
| opinions celles | ||
| assurer par privée | ||
| valeur | ||
| violant traite premier | ||
| inhérente | ||
| bienfaits l’avènement | ||
| Unies s’il actions | ||
| inquiété l’esclavage | ||
| inquiété | ||
| esclaves lieu | ||
| salaire | ||
| par | ||
| toute | ||
| innocente procédure membres | ||
| arts l’idéal envers | ||
| suffrage territoires inhumains | ||
| d’immixtions l’organisation progrès | ||
| comme égalité Unies | ||
| maternité | ||
| violerait suprême sécurité | ||
| impliquant eux loisirs | ||
| nationalité |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| זקנה משפילים | ||
| ינתן חברתי עניניו | ||
| הפוב | ||
| ולהיות זכויות הישגים | ||
| יאסרו מטעמי וללא | ||
| ספרותית השלם | ||
| למנוחה חינם | ||
| וההתאגדות | ||
| לטפח | ||
| באלה במלואן | ||
| יהנו | ||
| ולרווחתם לגבר האדם | ||
| בכבודו שבארצות כבוד | ||
| ובינלאומיים | ||
| בכך לתנאי אישי | ||
| שאינן | ||
| שרירותי | ||
| במשפט | ||
| ולעקרונותיהן מטעם | ||
| שרירותית האשמה יהיה | ||
| החינוך ולבטחון | ||
| סובלנות אשמתו במגילה | ||
| המאוחדות חיוני | ||
| חשוב במקרה | ||
| כלתי העולם | ||
| שמקורה כציבור | ||
| לשויון | ||
| לתקנה | ||
| תלוי ההתאספות | ||
| הדיבור שהוא | ||
| והבלתי והבסיסית | ||
| ולעקרונותיהן יהא וישאף | ||
| ביתנ הבינלאומי | ||
| והזלזול להקנות | ||
| בגלל כולם שיושלם | ||
| לחיים | ||
| בדבר | ||
| לשירות | ||
| זכויות | ||
| לפני | ||
| אדם ולא מזזמנות | ||
| קנינו שהיה ההתאספות | ||
| בינלאומי חיוניות לבקש | ||
| תהיינה | ||
| ובזכות בכורה מהגנה | ||
| מתוך | ||
| ובמצפון מזומנות לאגד | ||
| והחמריים סוציאלי | ||
| אנושיים ובהצבעה | ||
| פראיים |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| uciskowi posiadania prawo | ||
| społecznego największych skazany | ||
| czy | ||
| potrzeby samodzielnie przystępowania | ||
| Krzewi też dokonania | ||
| pełną prawo | ||
| buntu | ||
| moralności | ||
| zapewnienia znaczenie | ||
| nieludzki wypadek Nikt | ||
| zasadności jakikolwiek Każdy | ||
| samowolnie krajem | ||
| międzynarodowego | ||
| członek wielu | ||
| rozwój wynikających obalenia | ||
| rasy | ||
| grudnia która | ||
| jedynie urlopu ani | ||
| małżeńskie stanowi ustaniu | ||
| człowieka postępowych | ||
| prześladowania | ||
| politycznej które zawarcia | ||
| Deklaracja | ||
| ingerować wyłącznie | ||
| studia Nikt | ||
| innego uprawianie zrozumienie | ||
| wybranych swobodę wyznania | ||
| wolni osobowości | ||
| ograniczenie Nie | ||
| równej społecznego uciekać | ||
| będącą POWSZECHNA | ||
| niezdolności poszukiwania międzynarodowej | ||
| konieczne potrzeby posiada | ||
| opinii wychowywania 1948 | ||
| międzynarodowej zatrzymać | ||
| przedstawicieli | ||
| przeciw | ||
| wynikających organy pracę | ||
| człowiek grupami | ||
| niezbędnych | ||
| wolności podstawowym | ||
| opinii małżonków wolność | ||
| postępować zdecydowanie komórką | ||
| odniesieniu | ||
| pokoju azyl | ||
| zawodowych powrócić człowiek | ||
| konstytucję | ||
| takiej postaciach powszechnego | ||
| wygnać wygnać | ||
| wspólny poszanowania |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| областях | ||
| будут должен | ||
| обеспечиваются нежели | ||
| котором Уставе | ||
| социального моральных | ||
| совершеннолетия предоставление | ||
| том независимо | ||
| существование | ||
| вмешательства какому ограниченной | ||
| распространять | ||
| находить помощь | ||
| искусством | ||
| унижающим положения искать | ||
| изгнанию член совершеннолетия | ||
| обществом имуществом государственной | ||
| идеи братства | ||
| наслаждаться значение социальной | ||
| осуществления юрисдикцией наказанию | ||
| достойное свою III | ||
| жизнь расторжения инвалидности | ||
| терпимости этого | ||
| целях равны | ||
| обеспечиваются законным | ||
| принуждаем правосубъектности | ||
| пыткам доступа неприкосновенность | ||
| Брак против | ||
| прибегать независимой | ||
| человека человеческой | ||
| быть независимо религии | ||
| публичным | ||
| членам против | ||
| разумом результатом семью | ||
| Принята участие | ||
| беспристрастным тем | ||
| частным основной | ||
| правового | ||
| страной обслуживание | ||
| было свободу полное | ||
| рабочего свободны | ||
| состоянии помощь религиозными | ||
| полное | ||
| владеть власти морали | ||
| меньшей | ||
| братства социальному убежища | ||
| государств | ||
| равны который дети | ||
| терпимости | ||
| получать бесплатным полного | ||
| богослужении | ||
| отдельным |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| คิด ใตัอำ เคลื่อนไหว | ||
| บังคับ บาก | ||
| สิ่ง สิ้น | ||
| วัตถุ | ||
| ชาย อาศัย เท่านั้น | ||
| สิน | ||
| เกา | ||
| ดูแล พิธีกรรม | ||
| ภายใน | ||
| เพศ | ||
| หนัก ประสงค์ | ||
| เหตุ | ||
| งาน รักษา | ||
| เพศ ภาษา | ||
| นี้ | ||
| คู่ สัญชาติ ต้องการ | ||
| วิธี ระหว่าง ตกลง | ||
| ทำนอง | ||
| สืบ กับ ศิลปกรรม | ||
| เหนือ วรรณกรรม | ||
| คิด การก หน้าที่ | ||
| ชาติ ศิลปกรรม แต่ | ||
| สามัญ สอด | ||
| เหยียด วิธี จุด | ||
| หน้า ถ้า เบื้อง | ||
| ประชุม | ||
| ศิลปกรรม | ||
| เสรีภาพ โหด ก่อ | ||
| เกียรติศักดิ์ ป่วย เอกราช | ||
| ประหัต มโนธรรม การ | ||
| แทน | ||
| ขัดขืน เวลา เสียง | ||
| กฎบัตร พยายาม | ||
| สิน หน้า | ||
| จำเป็น | ||
| ประชาธิปไตย หน่วย | ||
| กรณี จริงจัง | ||
| ทำนอง | ||
| ทาษ | ||
| เพิ่ม | ||
| บรรดา ขวาง | ||
| กักขัง | ||
| มนุษย์ | ||
| ชาย ประกัน มนุษยธรรม | ||
| จะบัน มูลฐาน เถื่อน | ||
| พฤติ | ||
| มิได้ | ||
| หญิง คู่ | ||
| สมา ปฏิบัติ อนึ่ง | ||
| สิ่ง ทาษ |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| mecburidir ilim | ||
| isnadın sınırları suç | ||
| tutuklanamaz diğer | ||
| memleket korunmasi kullanılamaz | ||
| İnsanlık ilerlemeyi | ||
| bir mülk menfaatlerinin | ||
| usul zümreye herhangi | ||
| mahkeme vicdana ilerleyişe | ||
| zulüm zalimane | ||
| ilim öncelikle çocuk | ||
| mevzubahis ancak | ||
| muamelesi dinlenmeye | ||
| eşitlikle prensiplerine ülkenin | ||
| öğretim bulunmalarına yardım | ||
| memleketler amacıyla | ||
| birbirlerine | ||
| olmalıdır | ||
| bırakılamaz serbestisine | ||
| hürriyetin iyi | ||
| hükmü işbu zalimane | ||
| evlenme memleketi tedbirlerle | ||
| evlenmek ahalisi işini | ||
| hürriyetler | ||
| belirlenmiş kere | ||
| elde cürüme | ||
| tanınan dünyaca yüksek | ||
| müddetinin ailesine | ||
| vicdan kırıcı itibariyle | ||
| geniş inanma | ||
| kendi görevleri Teşkilatı | ||
| yaymak | ||
| öğretim vesayet | ||
| renk kişiliğinin | ||
| tamamlanan | ||
| haklara bulunma | ||
| hükmü uygulanabilecek | ||
| etmiş geliştirilmesini hoşgörü | ||
| sahiptir temel | ||
| giyim | ||
| Bundan temeli | ||
| icaplarını | ||
| mülk karışma tekmil | ||
| vicdana hürriyetine işini | ||
| Herkesin vahşiliklere | ||
| dolaşma dünyanın | ||
| davasının Uluslararasında idamesi | ||
| eşittir | ||
| haklardan hakkı | ||
| kovuşturmalar hürriyetlerden gözönünde | ||
| Evrensel fiilli beyannamesi |
| # This file is part of ICU4X. For terms of use, please see the file | ||
| # called LICENSE at the top level of the ICU4X source tree | ||
| # (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| # The contents of this file have been translated by "Google Translate". | ||
| Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này | ||
| đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh | ||
| lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con | ||
| người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và | ||
| nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể | ||
| xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong | ||
| một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới | ||
| này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc | ||
| chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển | ||
| vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là | ||
| nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý | ||
| tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra. | ||
| Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã | ||
| qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có | ||
| những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào | ||
| đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian, | ||
| những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối | ||
| với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm, | ||
| nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra | ||
| những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng | ||
| lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt | ||
| trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà | ||
| nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được. | ||
| Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu | ||
| đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng | ||
| nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó. | ||
| Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng | ||
| tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có | ||
| không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại | ||
| sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù | ||
| phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào | ||
| bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa, | ||
| hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta | ||
| cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng | ||
| ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất | ||
| yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết | ||
| thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng | ||
| ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn | ||
| vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó, | ||
| nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta. | ||
| Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã | ||
| thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp | ||
| của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó. | ||
| Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành | ||
| một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần | ||
| thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái | ||
| tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng | ||
| ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm | ||
| về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng | ||
| ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều | ||
| mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây | ||
| trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân. |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// One benchmark corpus, held in every normalization form the benchmarks
/// feed to the normalizer, both as UTF-8 and as UTF-16.
struct BenchDataContent {
    /// Label of the corpus; it becomes part of each benchmark ID.
    pub file_name: String,
    /// Corpus text normalized to NFC (UTF-8).
    pub nfc: String,
    /// Corpus text normalized to NFD (UTF-8).
    pub nfd: String,
    /// Corpus text normalized to NFKC (UTF-8).
    pub nfkc: String,
    /// Corpus text normalized to NFKD (UTF-8).
    pub nfkd: String,
    /// UTF-16 code units of the NFC text.
    pub nfc_u16: Vec<u16>,
    /// UTF-16 code units of the NFD text.
    pub nfd_u16: Vec<u16>,
    /// UTF-16 code units of the NFKC text.
    pub nfkc_u16: Vec<u16>,
    /// UTF-16 code units of the NFKD text.
    pub nfkd_u16: Vec<u16>,
}
/// Returns `content` without the lines that start with `#`.
///
/// The surviving lines are re-joined with a single `\n`; the result carries
/// no trailing newline. Only a `#` in the very first column marks a line for
/// removal, so indented `#` lines are kept.
fn strip_headers(content: &str) -> String {
    let mut kept = String::new();
    let mut is_first = true;
    for line in content.lines() {
        if line.starts_with('#') {
            continue;
        }
        // Separator goes *between* kept lines, never before the first one.
        if !is_first {
            kept.push('\n');
        }
        is_first = false;
        kept.push_str(line);
    }
    kept
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| use criterion::{black_box, BenchmarkId, Criterion}; | ||
| use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; | ||
/// Pre-computed benchmark input for a single data file.
///
/// The text is kept in all four Unicode normalization forms so that the
/// normalizer under test can be timed against each kind of input, in both
/// UTF-8 (`String`) and UTF-16 (`Vec<u16>`) encodings.
struct BenchDataContent {
    /// Name identifying the data file in benchmark IDs.
    pub file_name: String,
    /// NFC form of the text, UTF-8.
    pub nfc: String,
    /// NFD form of the text, UTF-8.
    pub nfd: String,
    /// NFKC form of the text, UTF-8.
    pub nfkc: String,
    /// NFKD form of the text, UTF-8.
    pub nfkd: String,
    /// NFC form of the text, UTF-16.
    pub nfc_u16: Vec<u16>,
    /// NFD form of the text, UTF-16.
    pub nfd_u16: Vec<u16>,
    /// NFKC form of the text, UTF-16.
    pub nfkc_u16: Vec<u16>,
    /// NFKD form of the text, UTF-16.
    pub nfkd_u16: Vec<u16>,
}
/// Removes header lines (those whose first character is `#`) from `content`.
///
/// The remaining lines are joined with `\n`, without a trailing newline.
fn strip_headers(content: &str) -> String {
    let body_lines: Vec<&str> = content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect();
    body_lines.join("\n")
}
| fn normalizer_bench_data() -> [BenchDataContent; 15] { | ||
| let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); | ||
| let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); | ||
| let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); | ||
| let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let content_latin: (&str, &str) = ( | ||
| "TestNames_Latin", | ||
| &strip_headers(include_str!("./data/TestNames_Latin.txt")), | ||
| ); | ||
| let content_jp_h: (&str, &str) = ( | ||
| "TestNames_Japanese_h", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), | ||
| ); | ||
| let content_jp_k: (&str, &str) = ( | ||
| "TestNames_Japanese_k", | ||
| &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), | ||
| ); | ||
| let content_korean: (&str, &str) = ( | ||
| "TestNames_Korean", | ||
| &strip_headers(include_str!("./data/TestNames_Korean.txt")), | ||
| ); | ||
| let content_random_words_ar: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ar", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), | ||
| ); | ||
| let content_random_words_de: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_de", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), | ||
| ); | ||
| let content_random_words_el: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_el", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), | ||
| ); | ||
| let content_random_words_es: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_es", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), | ||
| ); | ||
| let content_random_words_fr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_fr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), | ||
| ); | ||
| let content_random_words_he: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_he", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), | ||
| ); | ||
| let content_random_words_pl: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_pl", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), | ||
| ); | ||
| let content_random_words_ru: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_ru", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), | ||
| ); | ||
| let content_random_words_th: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_th", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), | ||
| ); | ||
| let content_random_words_tr: (&str, &str) = ( | ||
| "TestRandomWordsUDHR_tr", | ||
| &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), | ||
| ); | ||
| let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); | ||
| [ | ||
| content_latin, | ||
| content_viet, | ||
| content_jp_k, | ||
| content_jp_h, | ||
| content_korean, | ||
| content_random_words_ru, | ||
| content_random_words_ar, | ||
| content_random_words_el, | ||
| content_random_words_es, | ||
| content_random_words_fr, | ||
| content_random_words_tr, | ||
| content_random_words_th, | ||
| content_random_words_pl, | ||
| content_random_words_he, | ||
| content_random_words_de, | ||
| ] | ||
| .map(|(file_name, raw_content)| { | ||
| let nfc = &nfc_normalizer.normalize(raw_content); | ||
| let nfd = &nfd_normalizer.normalize(raw_content); | ||
| let nfkc = &nfkc_normalizer.normalize(raw_content); | ||
| let nfkd = &nfkd_normalizer.normalize(raw_content); | ||
| BenchDataContent { | ||
| file_name: file_name.to_owned(), | ||
| nfc: nfc.to_string(), | ||
| nfd: nfd.to_string(), | ||
| nfkc: nfkc.to_string(), | ||
| nfkd: nfkd.to_string(), | ||
| nfc_u16: nfc.encode_utf16().collect(), | ||
| nfd_u16: nfd.encode_utf16().collect(), | ||
| nfkc_u16: nfkc.encode_utf16().collect(), | ||
| nfkd_u16: nfkd.encode_utf16().collect(), | ||
| } | ||
| }) | ||
| } | ||
| fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { | ||
| normalizer.normalize(text); | ||
| } | ||
| fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { | ||
| normalizer.normalize_utf16(text); | ||
| } | ||
| pub fn criterion_benchmark(criterion: &mut Criterion) { | ||
| let group_name = "decomposing_normalizer_nfkd"; | ||
| let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd(); | ||
| let mut group = criterion.benchmark_group(group_name); | ||
| for bench_data_content in black_box(normalizer_bench_data()) { | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher | ||
| .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) | ||
| }) | ||
| }, | ||
| ); | ||
| // UTF 16 | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| group.bench_function( | ||
| BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), | ||
| |bencher| { | ||
| bencher.iter(|| { | ||
| function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
| group.finish(); | ||
| } |
| # This file is automatically @generated by Cargo. | ||
| # It is not intended for manual editing. | ||
| version = 3 | ||
| [[package]] | ||
| name = "aho-corasick" | ||
| version = "1.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" | ||
| dependencies = [ | ||
| "memchr", | ||
| ] | ||
| [[package]] | ||
| name = "anes" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" | ||
| [[package]] | ||
| name = "anstyle" | ||
| version = "1.0.10" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" | ||
| [[package]] | ||
| name = "arraystring" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185" | ||
| dependencies = [ | ||
| "typenum", | ||
| ] | ||
| [[package]] | ||
| name = "arrayvec" | ||
| version = "0.7.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" | ||
| [[package]] | ||
| name = "atoi" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" | ||
| dependencies = [ | ||
| "num-traits", | ||
| ] | ||
| [[package]] | ||
| name = "autocfg" | ||
| version = "1.4.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" | ||
| [[package]] | ||
| name = "bumpalo" | ||
| version = "3.17.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" | ||
| [[package]] | ||
| name = "cast" | ||
| version = "0.3.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" | ||
| [[package]] | ||
| name = "cfg-if" | ||
| version = "1.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" | ||
| [[package]] | ||
| name = "ciborium" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "ciborium-ll", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "ciborium-io" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" | ||
| [[package]] | ||
| name = "ciborium-ll" | ||
| version = "0.2.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" | ||
| dependencies = [ | ||
| "ciborium-io", | ||
| "half", | ||
| ] | ||
| [[package]] | ||
| name = "clap" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" | ||
| dependencies = [ | ||
| "clap_builder", | ||
| ] | ||
| [[package]] | ||
| name = "clap_builder" | ||
| version = "4.4.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" | ||
| dependencies = [ | ||
| "anstyle", | ||
| "clap_lex", | ||
| ] | ||
| [[package]] | ||
| name = "clap_lex" | ||
| version = "0.6.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" | ||
| [[package]] | ||
| name = "cobs" | ||
| version = "0.2.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" | ||
| [[package]] | ||
| name = "criterion" | ||
| version = "0.5.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" | ||
| dependencies = [ | ||
| "anes", | ||
| "cast", | ||
| "ciborium", | ||
| "clap", | ||
| "criterion-plot", | ||
| "is-terminal", | ||
| "itertools", | ||
| "num-traits", | ||
| "once_cell", | ||
| "oorandom", | ||
| "plotters", | ||
| "rayon", | ||
| "regex", | ||
| "serde", | ||
| "serde_derive", | ||
| "serde_json", | ||
| "tinytemplate", | ||
| "walkdir", | ||
| ] | ||
| [[package]] | ||
| name = "criterion-plot" | ||
| version = "0.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" | ||
| dependencies = [ | ||
| "cast", | ||
| "itertools", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-deque" | ||
| version = "0.8.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | ||
| dependencies = [ | ||
| "crossbeam-epoch", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-epoch" | ||
| version = "0.9.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | ||
| dependencies = [ | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "crossbeam-utils" | ||
| version = "0.8.21" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | ||
| [[package]] | ||
| name = "crunchy" | ||
| version = "0.2.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" | ||
| [[package]] | ||
| name = "databake" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" | ||
| dependencies = [ | ||
| "databake-derive", | ||
| "proc-macro2", | ||
| "quote", | ||
| ] | ||
| [[package]] | ||
| name = "databake-derive" | ||
| version = "0.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "detone" | ||
| version = "1.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09" | ||
| [[package]] | ||
| name = "displaydoc" | ||
| version = "0.2.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "either" | ||
| version = "1.15.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | ||
| [[package]] | ||
| name = "erased-serde" | ||
| version = "0.4.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e004d887f51fcb9fef17317a2f3525c887d8aa3f4f50fed920816a688284a5b7" | ||
| dependencies = [ | ||
| "serde", | ||
| "typeid", | ||
| ] | ||
| [[package]] | ||
| name = "half" | ||
| version = "2.4.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "crunchy", | ||
| ] | ||
| [[package]] | ||
| name = "hermit-abi" | ||
| version = "0.5.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" | ||
| [[package]] | ||
| name = "icu_collections" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "potential_utf", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_locale_core" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap 0.8.1", | ||
| "serde", | ||
| "tinystr", | ||
| "writeable", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer" | ||
| version = "2.0.1" | ||
| dependencies = [ | ||
| "arraystring", | ||
| "arrayvec", | ||
| "atoi", | ||
| "criterion", | ||
| "databake", | ||
| "detone", | ||
| "displaydoc", | ||
| "icu_collections", | ||
| "icu_normalizer_data", | ||
| "icu_properties", | ||
| "icu_provider", | ||
| "serde", | ||
| "smallvec", | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| "write16", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_normalizer_data" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" | ||
| [[package]] | ||
| name = "icu_properties" | ||
| version = "2.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "icu_collections", | ||
| "icu_locale_core", | ||
| "icu_properties_data", | ||
| "icu_provider", | ||
| "potential_utf", | ||
| "serde", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "icu_properties_data" | ||
| version = "2.0.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" | ||
| [[package]] | ||
| name = "icu_provider" | ||
| version = "2.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "erased-serde", | ||
| "icu_locale_core", | ||
| "postcard", | ||
| "serde", | ||
| "stable_deref_trait", | ||
| "tinystr", | ||
| "writeable", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerotrie", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "is-terminal" | ||
| version = "0.4.16" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" | ||
| dependencies = [ | ||
| "hermit-abi", | ||
| "libc", | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "itertools" | ||
| version = "0.10.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" | ||
| dependencies = [ | ||
| "either", | ||
| ] | ||
| [[package]] | ||
| name = "itoa" | ||
| version = "1.0.15" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" | ||
| [[package]] | ||
| name = "js-sys" | ||
| version = "0.3.77" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" | ||
| dependencies = [ | ||
| "once_cell", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "libc" | ||
| version = "0.2.172" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" | ||
| [[package]] | ||
| name = "litemap" | ||
| version = "0.7.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" | ||
| dependencies = [ | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "litemap" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" | ||
| [[package]] | ||
| name = "log" | ||
| version = "0.4.27" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" | ||
| [[package]] | ||
| name = "memchr" | ||
| version = "2.7.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" | ||
| [[package]] | ||
| name = "num-traits" | ||
| version = "0.2.19" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" | ||
| dependencies = [ | ||
| "autocfg", | ||
| ] | ||
| [[package]] | ||
| name = "once_cell" | ||
| version = "1.21.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" | ||
| [[package]] | ||
| name = "oorandom" | ||
| version = "11.1.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" | ||
| [[package]] | ||
| name = "plotters" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" | ||
| dependencies = [ | ||
| "num-traits", | ||
| "plotters-backend", | ||
| "plotters-svg", | ||
| "wasm-bindgen", | ||
| "web-sys", | ||
| ] | ||
| [[package]] | ||
| name = "plotters-backend" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" | ||
| [[package]] | ||
| name = "plotters-svg" | ||
| version = "0.3.7" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" | ||
| dependencies = [ | ||
| "plotters-backend", | ||
| ] | ||
| [[package]] | ||
| name = "postcard" | ||
| version = "1.1.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "170a2601f67cc9dba8edd8c4870b15f71a6a2dc196daec8c83f72b59dff628a8" | ||
| dependencies = [ | ||
| "cobs", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "potential_utf" | ||
| version = "0.1.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" | ||
| dependencies = [ | ||
| "databake", | ||
| "serde", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "proc-macro2" | ||
| version = "1.0.95" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "quote" | ||
| version = "1.0.40" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| ] | ||
| [[package]] | ||
| name = "rayon" | ||
| version = "1.10.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" | ||
| dependencies = [ | ||
| "either", | ||
| "rayon-core", | ||
| ] | ||
| [[package]] | ||
| name = "rayon-core" | ||
| version = "1.12.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" | ||
| dependencies = [ | ||
| "crossbeam-deque", | ||
| "crossbeam-utils", | ||
| ] | ||
| [[package]] | ||
| name = "regex" | ||
| version = "1.11.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-automata", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-automata" | ||
| version = "0.4.9" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" | ||
| dependencies = [ | ||
| "aho-corasick", | ||
| "memchr", | ||
| "regex-syntax", | ||
| ] | ||
| [[package]] | ||
| name = "regex-syntax" | ||
| version = "0.8.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" | ||
| [[package]] | ||
| name = "rustversion" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" | ||
| [[package]] | ||
| name = "ryu" | ||
| version = "1.0.20" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" | ||
| [[package]] | ||
| name = "same-file" | ||
| version = "1.0.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" | ||
| dependencies = [ | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "serde" | ||
| version = "1.0.219" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" | ||
| dependencies = [ | ||
| "serde_derive", | ||
| ] | ||
| [[package]] | ||
| name = "serde_derive" | ||
| version = "1.0.219" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "serde_json" | ||
| version = "1.0.140" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" | ||
| dependencies = [ | ||
| "itoa", | ||
| "memchr", | ||
| "ryu", | ||
| "serde", | ||
| ] | ||
| [[package]] | ||
| name = "smallvec" | ||
| version = "1.15.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" | ||
| [[package]] | ||
| name = "stable_deref_trait" | ||
| version = "1.2.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" | ||
| [[package]] | ||
| name = "syn" | ||
| version = "2.0.101" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "synstructure" | ||
| version = "0.13.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] | ||
| [[package]] | ||
| name = "tinystr" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" | ||
| dependencies = [ | ||
| "displaydoc", | ||
| "serde", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "tinytemplate" | ||
| version = "1.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" | ||
| dependencies = [ | ||
| "serde", | ||
| "serde_json", | ||
| ] | ||
| [[package]] | ||
| name = "typeid" | ||
| version = "1.0.3" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" | ||
| [[package]] | ||
| name = "typenum" | ||
| version = "1.18.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" | ||
| [[package]] | ||
| name = "unicode-ident" | ||
| version = "1.0.18" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" | ||
| [[package]] | ||
| name = "utf16_iter" | ||
| version = "1.0.5" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" | ||
| [[package]] | ||
| name = "utf8_iter" | ||
| version = "1.0.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" | ||
| [[package]] | ||
| name = "walkdir" | ||
| version = "2.5.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" | ||
| dependencies = [ | ||
| "same-file", | ||
| "winapi-util", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen" | ||
| version = "0.2.100" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" | ||
| dependencies = [ | ||
| "cfg-if", | ||
| "once_cell", | ||
| "rustversion", | ||
| "wasm-bindgen-macro", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-backend" | ||
| version = "0.2.100" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" | ||
| dependencies = [ | ||
| "bumpalo", | ||
| "log", | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro" | ||
| version = "0.2.100" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" | ||
| dependencies = [ | ||
| "quote", | ||
| "wasm-bindgen-macro-support", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-macro-support" | ||
| version = "0.2.100" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "wasm-bindgen-backend", | ||
| "wasm-bindgen-shared", | ||
| ] | ||
| [[package]] | ||
| name = "wasm-bindgen-shared" | ||
| version = "0.2.100" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" | ||
| dependencies = [ | ||
| "unicode-ident", | ||
| ] | ||
| [[package]] | ||
| name = "web-sys" | ||
| version = "0.3.77" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" | ||
| dependencies = [ | ||
| "js-sys", | ||
| "wasm-bindgen", | ||
| ] | ||
| [[package]] | ||
| name = "winapi-util" | ||
| version = "0.1.9" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" | ||
| dependencies = [ | ||
| "windows-sys", | ||
| ] | ||
| [[package]] | ||
| name = "windows-sys" | ||
| version = "0.59.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" | ||
| dependencies = [ | ||
| "windows-targets", | ||
| ] | ||
| [[package]] | ||
| name = "windows-targets" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" | ||
| dependencies = [ | ||
| "windows_aarch64_gnullvm", | ||
| "windows_aarch64_msvc", | ||
| "windows_i686_gnu", | ||
| "windows_i686_gnullvm", | ||
| "windows_i686_msvc", | ||
| "windows_x86_64_gnu", | ||
| "windows_x86_64_gnullvm", | ||
| "windows_x86_64_msvc", | ||
| ] | ||
| [[package]] | ||
| name = "windows_aarch64_gnullvm" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" | ||
| [[package]] | ||
| name = "windows_aarch64_msvc" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" | ||
| [[package]] | ||
| name = "windows_i686_gnu" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" | ||
| [[package]] | ||
| name = "windows_i686_gnullvm" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" | ||
| [[package]] | ||
| name = "windows_i686_msvc" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" | ||
| [[package]] | ||
| name = "windows_x86_64_gnu" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" | ||
| [[package]] | ||
| name = "windows_x86_64_gnullvm" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" | ||
| [[package]] | ||
| name = "windows_x86_64_msvc" | ||
| version = "0.52.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" | ||
| [[package]] | ||
| name = "write16" | ||
| version = "1.0.0" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" | ||
| dependencies = [ | ||
| "arrayvec", | ||
| ] | ||
| [[package]] | ||
| name = "writeable" | ||
| version = "0.6.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" | ||
| [[package]] | ||
| name = "yoke" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" | ||
| dependencies = [ | ||
| "stable_deref_trait", | ||
| "yoke-derive", | ||
| "zerofrom", | ||
| ] | ||
| [[package]] | ||
| name = "yoke-derive" | ||
| version = "0.8.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" | ||
| dependencies = [ | ||
| "zerofrom-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerofrom-derive" | ||
| version = "0.1.6" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| "synstructure", | ||
| ] | ||
| [[package]] | ||
| name = "zerotrie" | ||
| version = "0.2.1" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "8b7a6cf4865aac8394f19ad46e37f60b929c1ba5eed798b96a32820aa9392929" | ||
| dependencies = [ | ||
| "databake", | ||
| "displaydoc", | ||
| "litemap 0.7.5", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec" | ||
| version = "0.11.4" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" | ||
| dependencies = [ | ||
| "databake", | ||
| "serde", | ||
| "yoke", | ||
| "zerofrom", | ||
| "zerovec-derive", | ||
| ] | ||
| [[package]] | ||
| name = "zerovec-derive" | ||
| version = "0.11.2" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" | ||
| dependencies = [ | ||
| "proc-macro2", | ||
| "quote", | ||
| "syn", | ||
| ] |
| # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| # | ||
| # When uploading crates to the registry Cargo will automatically | ||
| # "normalize" Cargo.toml files for maximal compatibility | ||
| # with all versions of Cargo and also rewrite `path` dependencies | ||
| # to registry (e.g., crates.io) dependencies. | ||
| # | ||
| # If you are reading this file be aware that the original Cargo.toml | ||
| # will likely look very different (and much more reasonable). | ||
| # See Cargo.toml.orig for the original contents. | ||
| [package] | ||
| edition = "2021" | ||
| rust-version = "1.82" | ||
| name = "icu_normalizer" | ||
| version = "2.0.1" | ||
| authors = ["The ICU4X Project Developers"] | ||
| build = false | ||
| include = [ | ||
| "data/**/*", | ||
| "src/**/*", | ||
| "examples/**/*", | ||
| "benches/**/*", | ||
| "tests/**/*", | ||
| "Cargo.toml", | ||
| "LICENSE", | ||
| "README.md", | ||
| "build.rs", | ||
| ] | ||
| autolib = false | ||
| autobins = false | ||
| autoexamples = false | ||
| autotests = false | ||
| autobenches = false | ||
| description = "API for normalizing text into Unicode Normalization Forms" | ||
| homepage = "https://icu4x.unicode.org" | ||
| readme = "README.md" | ||
| categories = ["internationalization"] | ||
| license = "Unicode-3.0" | ||
| repository = "https://github.com/unicode-org/icu4x" | ||
| [package.metadata.docs.rs] | ||
| all-features = true | ||
| [features] | ||
| compiled_data = [ | ||
| "dep:icu_normalizer_data", | ||
| "icu_properties?/compiled_data", | ||
| "icu_provider/baked", | ||
| ] | ||
| datagen = [ | ||
| "serde", | ||
| "dep:databake", | ||
| "icu_properties", | ||
| "icu_collections/databake", | ||
| "zerovec/databake", | ||
| "icu_properties?/datagen", | ||
| "icu_provider/export", | ||
| ] | ||
| default = [ | ||
| "compiled_data", | ||
| "utf8_iter", | ||
| "utf16_iter", | ||
| ] | ||
| experimental = [] | ||
| icu_properties = ["dep:icu_properties"] | ||
| serde = [ | ||
| "dep:serde", | ||
| "icu_collections/serde", | ||
| "zerovec/serde", | ||
| "icu_properties?/serde", | ||
| "icu_provider/serde", | ||
| ] | ||
| utf16_iter = [ | ||
| "dep:utf16_iter", | ||
| "write16", | ||
| ] | ||
| utf8_iter = ["dep:utf8_iter"] | ||
| [lib] | ||
| name = "icu_normalizer" | ||
| path = "src/lib.rs" | ||
| [[test]] | ||
| name = "tests" | ||
| path = "tests/tests.rs" | ||
| [[bench]] | ||
| name = "bench" | ||
| path = "benches/bench.rs" | ||
| harness = false | ||
| required-features = [ | ||
| "utf16_iter", | ||
| "utf8_iter", | ||
| ] | ||
| [[bench]] | ||
| name = "canonical_composition" | ||
| path = "benches/canonical_composition.rs" | ||
| [[bench]] | ||
| name = "canonical_decomposition" | ||
| path = "benches/canonical_decomposition.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfc" | ||
| path = "benches/composing_normalizer_nfc.rs" | ||
| [[bench]] | ||
| name = "composing_normalizer_nfkc" | ||
| path = "benches/composing_normalizer_nfkc.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfd" | ||
| path = "benches/decomposing_normalizer_nfd.rs" | ||
| [[bench]] | ||
| name = "decomposing_normalizer_nfkd" | ||
| path = "benches/decomposing_normalizer_nfkd.rs" | ||
| [dependencies.databake] | ||
| version = "0.2.0" | ||
| features = ["derive"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.displaydoc] | ||
| version = "0.2.3" | ||
| default-features = false | ||
| [dependencies.icu_collections] | ||
| version = "~2.0.0" | ||
| default-features = false | ||
| [dependencies.icu_normalizer_data] | ||
| version = "~2.0.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_properties] | ||
| version = "~2.0.0" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.icu_provider] | ||
| version = "2.0.0" | ||
| features = ["alloc"] | ||
| default-features = false | ||
| [dependencies.serde] | ||
| version = "1.0.110" | ||
| features = [ | ||
| "derive", | ||
| "alloc", | ||
| ] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.smallvec] | ||
| version = "1.10.0" | ||
| default-features = false | ||
| [dependencies.utf16_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.utf8_iter] | ||
| version = "1.0.2" | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.write16] | ||
| version = "1.0.0" | ||
| features = ["alloc"] | ||
| optional = true | ||
| default-features = false | ||
| [dependencies.zerovec] | ||
| version = "0.11.1" | ||
| default-features = false | ||
| [dev-dependencies.arraystring] | ||
| version = "0.3.0" | ||
| [dev-dependencies.arrayvec] | ||
| version = "0.7.2" | ||
| default-features = false | ||
| [dev-dependencies.atoi] | ||
| version = "2.0.0" | ||
| [dev-dependencies.detone] | ||
| version = "1.0.0" | ||
| [dev-dependencies.write16] | ||
| version = "1.0.0" | ||
| features = ["arrayvec"] | ||
| default-features = false | ||
| [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion] | ||
| version = "0.5.0" |
Sorry, the diff of this file is not supported yet
| UNICODE LICENSE V3 | ||
| COPYRIGHT AND PERMISSION NOTICE | ||
| Copyright © 2020-2024 Unicode, Inc. | ||
| NOTICE TO USER: Carefully read the following legal agreement. BY | ||
| DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR | ||
| SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE | ||
| TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT | ||
| DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. | ||
| Permission is hereby granted, free of charge, to any person obtaining a | ||
| copy of data files and any associated documentation (the "Data Files") or | ||
| software and any associated documentation (the "Software") to deal in the | ||
| Data Files or Software without restriction, including without limitation | ||
| the rights to use, copy, modify, merge, publish, distribute, and/or sell | ||
| copies of the Data Files or Software, and to permit persons to whom the | ||
| Data Files or Software are furnished to do so, provided that either (a) | ||
| this copyright and permission notice appear with all copies of the Data | ||
| Files or Software, or (b) this copyright and permission notice appear in | ||
| associated Documentation. | ||
| THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
| KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | ||
| THIRD PARTY RIGHTS. | ||
| IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE | ||
| BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, | ||
| OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | ||
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | ||
| ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA | ||
| FILES OR SOFTWARE. | ||
| Except as contained in this notice, the name of a copyright holder shall | ||
| not be used in advertising or otherwise to promote the sale, use or other | ||
| dealings in these Data Files or Software without prior written | ||
| authorization of the copyright holder. | ||
| SPDX-License-Identifier: Unicode-3.0 | ||
| — | ||
| Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. | ||
| ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. |
| # icu_normalizer [crates.io](https://crates.io/crates/icu_normalizer) | ||
| <!-- cargo-rdme start --> | ||
| Normalizing text into Unicode Normalization Forms. | ||
| This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) | ||
| and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. | ||
| ## Functionality | ||
| The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode | ||
| Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. | ||
| Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, | ||
| and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. | ||
| The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA | ||
| Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by | ||
| applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the | ||
| [`idna`](https://docs.rs/idna/latest/idna/) crate. | ||
| The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and | ||
| the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class | ||
| property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the | ||
| [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate. | ||
| Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in | ||
| addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive | ||
| non-“maybe” answer. | ||
| ## Examples | ||
| ```rust | ||
| let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); | ||
| assert_eq!(nfc.normalize("a\u{0308}"), "ä"); | ||
| assert!(nfc.is_normalized("ä")); | ||
| let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); | ||
| assert_eq!(nfd.normalize("ä"), "a\u{0308}"); | ||
| assert!(!nfd.is_normalized("ä")); | ||
| ``` | ||
| <!-- cargo-rdme end --> | ||
| ## More Information | ||
| For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). |
Sorry, the diff of this file is too big to display
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Access to the Unicode properties or property-based operations that | ||
| //! are required for NFC and NFD. | ||
| //! | ||
| //! Applications should generally use the full normalizers that are | ||
| //! provided at the top level of this crate. However, the APIs in this | ||
| //! module are provided for callers such as HarfBuzz that specifically | ||
| //! want access to the raw canonical composition operation e.g. for use in a | ||
| //! glyph-availability-guided custom normalizer. | ||
| use crate::char_from_u16; | ||
| use crate::char_from_u32; | ||
| use crate::in_inclusive_range; | ||
| use crate::provider::CanonicalCompositions; | ||
| use crate::provider::DecompositionData; | ||
| use crate::provider::DecompositionTables; | ||
| use crate::provider::NonRecursiveDecompositionSupplement; | ||
| use crate::provider::NormalizerNfcV1; | ||
| use crate::provider::NormalizerNfdDataV1; | ||
| use crate::provider::NormalizerNfdSupplementV1; | ||
| use crate::provider::NormalizerNfdTablesV1; | ||
| use crate::trie_value_has_ccc; | ||
| use crate::CanonicalCombiningClass; | ||
| use crate::BACKWARD_COMBINING_MARKER; | ||
| use crate::FDFA_MARKER; | ||
| use crate::HANGUL_L_BASE; | ||
| use crate::HANGUL_N_COUNT; | ||
| use crate::HANGUL_S_BASE; | ||
| use crate::HANGUL_S_COUNT; | ||
| use crate::HANGUL_T_BASE; | ||
| use crate::HANGUL_T_COUNT; | ||
| use crate::HANGUL_V_BASE; | ||
| use crate::HIGH_ZEROS_MASK; | ||
| use crate::LOW_ZEROS_MASK; | ||
| use crate::NON_ROUND_TRIP_MARKER; | ||
| use icu_provider::prelude::*; | ||
/// Borrowed version of the raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
///
/// The type is `Copy`: it holds only a shared reference to the composition data.
#[derive(Debug, Copy, Clone)]
pub struct CanonicalCompositionBorrowed<'a> {
    /// Borrowed composition data; its `canonical_compositions` field is what
    /// `compose` iterates over.
    canonical_compositions: &'a CanonicalCompositions<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCompositionBorrowed<'static> {
    /// Equivalent to [`CanonicalCompositionBorrowed::new`]: uses compiled data.
    fn default() -> Self {
        CanonicalCompositionBorrowed::new()
    }
}
| impl CanonicalCompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalComposition { | ||
| CanonicalComposition { | ||
| canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalComposition` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Self { | ||
| canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCompositionBorrowed<'_> { | ||
| /// Performs canonical composition (including Hangul) on a pair of | ||
| /// characters or returns `None` if these characters don't compose. | ||
| /// Composition exclusions are taken into account. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new(); | ||
| /// | ||
| /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters | ||
| /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä')); | ||
| /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ')); | ||
| /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion | ||
| /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter | ||
| /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV | ||
| /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT | ||
| /// ``` | ||
| #[inline(always)] | ||
| pub fn compose(self, starter: char, second: char) -> Option<char> { | ||
| crate::compose( | ||
| self.canonical_compositions.canonical_compositions.iter(), | ||
| starter, | ||
| second, | ||
| ) | ||
| } | ||
| } | ||
/// The raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
///
/// This is the owning counterpart of [`CanonicalCompositionBorrowed`]; call
/// `as_borrowed` to obtain the borrowed form.
#[derive(Debug)]
pub struct CanonicalComposition {
    /// Payload holding the composition data (`NormalizerNfcV1` marker).
    canonical_compositions: DataPayload<NormalizerNfcV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
    /// Builds the owned type from compiled data by converting the
    /// `'static` borrowed instance.
    fn default() -> Self {
        CanonicalCompositionBorrowed::new().static_to_owned()
    }
}
| impl CanonicalComposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> { | ||
| CanonicalCompositionBorrowed { | ||
| canonical_compositions: self.canonical_compositions.get(), | ||
| } | ||
| } | ||
| /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCompositionBorrowed<'static> { | ||
| CanonicalCompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfcV1> + ?Sized, | ||
| { | ||
| let canonical_compositions: DataPayload<NormalizerNfcV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalComposition { | ||
| canonical_compositions, | ||
| }) | ||
| } | ||
| } | ||
/// The outcome of non-recursive canonical decomposition of a character.
///
/// Returned by `CanonicalDecompositionBorrowed::decompose`.
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, PartialEq, Eq)]
pub enum Decomposed {
    /// The character is its own canonical decomposition.
    Default,
    /// The character decomposes to a single different character.
    Singleton(char),
    /// The character decomposes to two characters, in this order.
    Expansion(char, char),
}
/// Borrowed version of the raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalDecompositionBorrowed<'a> {
    /// Main decomposition data; its trie is the first lookup for a character.
    decompositions: &'a DecompositionData<'a>,
    /// Side tables (`scalars16` / `scalars24`) that trie values index into
    /// for decompositions that do not fit in the trie value itself.
    tables: &'a DecompositionTables<'a>,
    /// Supplementary trie and table consulted when the main data cannot
    /// express the non-recursive decomposition directly.
    non_recursive: &'a NonRecursiveDecompositionSupplement<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecompositionBorrowed<'static> {
    /// Equivalent to [`CanonicalDecompositionBorrowed::new`]: uses compiled data.
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new()
    }
}
| impl CanonicalDecompositionBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalDecomposition { | ||
| CanonicalDecomposition { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| tables: DataPayload::from_static_ref(self.tables), | ||
| non_recursive: DataPayload::from_static_ref(self.non_recursive), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| const _: () = assert!( | ||
| crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars16 | ||
| .const_len() | ||
| + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 | ||
| .scalars24 | ||
| .const_len() | ||
| <= 0xFFF, | ||
| "future extension" | ||
| ); | ||
| Self { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, | ||
| non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1, | ||
| } | ||
| } | ||
| } | ||
impl CanonicalDecompositionBorrowed<'_> {
    /// Performs non-recursive canonical decomposition (including for Hangul).
    ///
    /// Hangul syllables are decomposed arithmetically; every other character
    /// is delegated to `decompose_non_hangul`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::normalizer::properties::Decomposed;
    /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new();
    ///
    /// assert_eq!(decomp.decompose('e'), Decomposed::Default);
    /// assert_eq!(
    ///     decomp.decompose('ệ'),
    ///     Decomposed::Expansion('ẹ', '\u{0302}')
    /// );
    /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
    /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
    /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
    /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
    /// ```
    #[inline]
    pub fn decompose(&self, c: char) -> Decomposed {
        // Index of `c` within the Hangul syllable block. `wrapping_sub` makes
        // code points below `HANGUL_S_BASE` wrap to large values, so a single
        // comparison rejects everything outside the block.
        let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
        if lvt >= HANGUL_S_COUNT {
            return self.decompose_non_hangul(c);
        }
        // Invariant: lvt < HANGUL_S_COUNT
        // NOTE(review): an earlier comment gave this count as 1172; the Unicode
        // Standard's SCount is 11172 — confirm against the constant's definition.
        let t = lvt % HANGUL_T_COUNT;
        // Invariant: t < HANGUL_T_COUNT (28)
        if t == 0 {
            // No trailing consonant: an LV syllable decomposes to L + V.
            let l = lvt / HANGUL_N_COUNT;
            // Invariant: l ≤ (HANGUL_S_COUNT - 1) / HANGUL_N_COUNT
            let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
            // Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21)
            return Decomposed::Expansion(
                // Safety: `l` and `v` are small (bounded as noted above). Assuming
                // HANGUL_L_BASE and HANGUL_V_BASE are the 0x11nn Jamo bases from the
                // Unicode Standard (TODO confirm at their definitions), both sums
                // stay far below the surrogate range starting at 0xD800.
                unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
                unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
            );
        }
        // LVT syllable: decomposes to the corresponding LV syllable + T.
        let lv = lvt - t;
        // Invariant: lv ≤ lvt < HANGUL_S_COUNT
        Decomposed::Expansion(
            // Safety: `HANGUL_S_BASE + lv` is at most `u32::from(c)`, and
            // `t < HANGUL_T_COUNT`. Assuming the constants carry the Unicode
            // Standard values (SBase 0xAC00, SCount 11172, TBase 0x11A7 — TODO
            // confirm at their definitions), both sums are below 0xD800.
            unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
            unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
        )
    }

    /// Performs non-recursive canonical decomposition except Hangul syllables
    /// are reported as `Decomposed::Default`.
    ///
    /// The main trie value for `c` is examined first. Cases it can answer
    /// directly return from inside the `loop`; the remaining cases `break`
    /// out and are resolved from the `non_recursive` supplement.
    #[inline(always)]
    fn decompose_non_hangul(&self, c: char) -> Decomposed {
        let decomposition = self.decompositions.trie.get(c);
        // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
        // and that flag needs to be ignored here.
        if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
            return Decomposed::Default;
        }
        // The loop never iterates: it exists only so that `break` can act as a
        // forward goto to the supplement lookup after the loop.
        #[allow(clippy::never_loop)]
        loop {
            let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
            let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
            if !high_zeros && !low_zeros {
                // Decomposition into two BMP characters: starter and non-starter
                if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
                    // Look in the other trie due to oxia singleton
                    // mappings to corresponding character with tonos.
                    break;
                }
                // The starter is in the low 15 bits and the combining
                // character in the next 15 bits.
                let starter = char_from_u32(decomposition & 0x7FFF);
                let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
                return Decomposed::Expansion(starter, combining);
            }
            if high_zeros {
                // Decomposition into one BMP character or non-starter
                if trie_value_has_ccc(decomposition) {
                    // Non-starter
                    if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
                        return Decomposed::Default;
                    }
                    // Within that range, only the characters listed below
                    // are given a decomposition; they are hard-coded here.
                    return match c {
                        '\u{0340}' => {
                            // COMBINING GRAVE TONE MARK
                            Decomposed::Singleton('\u{0300}')
                        }
                        '\u{0341}' => {
                            // COMBINING ACUTE TONE MARK
                            Decomposed::Singleton('\u{0301}')
                        }
                        '\u{0343}' => {
                            // COMBINING GREEK KORONIS
                            Decomposed::Singleton('\u{0313}')
                        }
                        '\u{0344}' => {
                            // COMBINING GREEK DIALYTIKA TONOS
                            Decomposed::Expansion('\u{0308}', '\u{0301}')
                        }
                        '\u{0F73}' => {
                            // TIBETAN VOWEL SIGN II
                            Decomposed::Expansion('\u{0F71}', '\u{0F72}')
                        }
                        '\u{0F75}' => {
                            // TIBETAN VOWEL SIGN UU
                            Decomposed::Expansion('\u{0F71}', '\u{0F74}')
                        }
                        '\u{0F81}' => {
                            // TIBETAN VOWEL SIGN REVERSED II
                            Decomposed::Expansion('\u{0F71}', '\u{0F80}')
                        }
                        _ => Decomposed::Default,
                    };
                }
                // Singleton decomposition: the low 16 bits are the target
                // BMP character.
                let singleton = decomposition as u16;
                debug_assert_ne!(
                    singleton, FDFA_MARKER,
                    "How come we got the U+FDFA NFKD marker here?"
                );
                return Decomposed::Singleton(char_from_u16(singleton));
            }
            // From here on the trie value refers to the side tables.
            if c == '\u{212B}' {
                // ANGSTROM SIGN
                return Decomposed::Singleton('\u{00C5}');
            }
            // Only 12 of 14 bits used as of Unicode 16.
            // The top two bits are masked off; the stored value is offset by
            // one, hence the `- 1`.
            let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
            // Only 3 of 4 bits used as of Unicode 16.
            let len_bits = decomposition & 0b1111;
            let tables = self.tables;
            // Offsets below `scalars16.len()` address the 16-bit table;
            // larger offsets address `scalars24` (see `offset24` below).
            if offset < tables.scalars16.len() {
                if len_bits != 0 {
                    // i.e. logical len isn't 2
                    break;
                }
                if let Some(first) = tables.scalars16.get(offset) {
                    if let Some(second) = tables.scalars16.get(offset + 1) {
                        // Two BMP starters
                        return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
                    }
                }
                // GIGO case
                debug_assert!(false);
                return Decomposed::Default;
            }
            // In the 24-bit table the logical length is `len_bits + 1`.
            let len = len_bits + 1;
            if len > 2 {
                // Longer than a pair: resolve via the supplement instead.
                break;
            }
            // Cannot underflow: `offset >= tables.scalars16.len()` here.
            let offset24 = offset - tables.scalars16.len();
            if let Some(first_c) = tables.scalars24.get(offset24) {
                if len == 1 {
                    return Decomposed::Singleton(first_c);
                }
                if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
                    return Decomposed::Expansion(first_c, second_c);
                }
            }
            // GIGO case
            debug_assert!(false);
            return Decomposed::Default;
        }
        // Reached only via `break`: consult the non-recursive supplement.
        let non_recursive = self.non_recursive;
        let non_recursive_decomposition = non_recursive.trie.get(c);
        if non_recursive_decomposition == 0 {
            // GIGO case
            debug_assert!(false);
            return Decomposed::Default;
        }
        // Upper and lower 16-bit halves of the supplement trie value.
        let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
        let lead = non_recursive_decomposition as u16;
        if lead != 0 && trail_or_complex != 0 {
            // Decomposition into two BMP characters
            return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
        }
        if lead != 0 {
            // Decomposition into one BMP character
            return Decomposed::Singleton(char_from_u16(lead));
        }
        // Decomposition into two non-BMP characters
        // `trail_or_complex` (the upper half of the trie value) is an offset
        // into `scalars24` plus one to keep it non-zero. It cannot be zero
        // here, because `lead` is zero and the whole value is non-zero.
        let offset = usize::from(trail_or_complex - 1);
        if let Some(first) = non_recursive.scalars24.get(offset) {
            if let Some(second) = non_recursive.scalars24.get(offset + 1) {
                return Decomposed::Expansion(first, second);
            }
        }
        // GIGO case
        debug_assert!(false);
        Decomposed::Default
    }
}
/// The raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
///
/// This is the owning counterpart of [`CanonicalDecompositionBorrowed`]; call
/// `as_borrowed` to obtain the borrowed form.
#[derive(Debug)]
pub struct CanonicalDecomposition {
    /// Payload for the main decomposition data (`NormalizerNfdDataV1`).
    decompositions: DataPayload<NormalizerNfdDataV1>,
    /// Payload for the side tables (`NormalizerNfdTablesV1`).
    tables: DataPayload<NormalizerNfdTablesV1>,
    /// Payload for the non-recursive supplement (`NormalizerNfdSupplementV1`).
    non_recursive: DataPayload<NormalizerNfdSupplementV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecomposition {
    /// Builds the owned type from compiled data by converting the
    /// `'static` borrowed instance.
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new().static_to_owned()
    }
}
| impl CanonicalDecomposition { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> { | ||
| CanonicalDecompositionBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| tables: self.tables.get(), | ||
| non_recursive: self.non_recursive.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalDecompositionBorrowed<'static> { | ||
| CanonicalDecompositionBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ] | ||
| ); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfdSupplementV1> | ||
| + ?Sized, | ||
| { | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; | ||
| if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { | ||
| // The data is from a future where there exists a normalization flavor whose | ||
| // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points | ||
| // of space. If a good use case from such a decomposition flavor arises, we can | ||
| // dynamically change the bit masks so that the length mask becomes 0x1FFF instead | ||
| // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, | ||
| // since for now the masks are hard-coded, error out. | ||
| return Err(DataError::custom("future extension")); | ||
| } | ||
| let non_recursive: DataPayload<NormalizerNfdSupplementV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalDecomposition { | ||
| decompositions, | ||
| tables, | ||
| non_recursive, | ||
| }) | ||
| } | ||
| } | ||
/// Borrowed version of the lookup of the Canonical_Combining_Class Unicode property.
///
/// The lookup methods read the property value out of the trie of the
/// referenced [`DecompositionData`].
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed;
///
/// let map = CanonicalCombiningClassMapBorrowed::new();
/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
#[derive(Debug)]
pub struct CanonicalCombiningClassMapBorrowed<'a> {
    /// The decomposition data whose trie is queried for the combining class
    decompositions: &'a DecompositionData<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMapBorrowed<'static> {
    /// Equivalent to the compiled-data constructor
    /// [`CanonicalCombiningClassMapBorrowed::new`].
    fn default() -> Self {
        CanonicalCombiningClassMapBorrowed::new()
    }
}
| impl CanonicalCombiningClassMapBorrowed<'static> { | ||
| /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`]. | ||
| pub const fn static_to_owned(self) -> CanonicalCombiningClassMap { | ||
| CanonicalCombiningClassMap { | ||
| decompositions: DataPayload::from_static_ref(self.decompositions), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, | ||
| } | ||
| } | ||
| } | ||
| impl CanonicalCombiningClassMapBorrowed<'_> { | ||
| /// Look up the canonical combining class for a scalar value. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class, | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| #[inline(always)] | ||
| pub fn get_u8(&self, c: char) -> u8 { | ||
| self.get32_u8(u32::from(c)) | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `Not_Reordered` is returned. | ||
| /// | ||
| /// The return value is a u8 representing the canonical combining class, | ||
| /// you may enable the `"icu_properties"` feature if you would like to use a typed | ||
| /// `CanonicalCombiningClass`. | ||
| pub fn get32_u8(&self, c: u32) -> u8 { | ||
| let trie_value = self.decompositions.trie.get32(c); | ||
| if trie_value_has_ccc(trie_value) { | ||
| trie_value as u8 | ||
| } else { | ||
| ccc!(NotReordered, 0).to_icu4c_value() | ||
| } | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[inline(always)] | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get(&self, c: char) -> CanonicalCombiningClass { | ||
| CanonicalCombiningClass::from_icu4c_value(self.get_u8(c)) | ||
| } | ||
| /// Look up the canonical combining class for a scalar value | ||
| /// represented as `u32`. If the argument is outside the scalar | ||
| /// value range, `CanonicalCombiningClass::NotReordered` is returned. | ||
| /// | ||
| /// ✨ *Enabled with the `icu_properties` Cargo feature.* | ||
| #[cfg(feature = "icu_properties")] | ||
| pub fn get32(&self, c: u32) -> CanonicalCombiningClass { | ||
| CanonicalCombiningClass::from_icu4c_value(self.get32_u8(c)) | ||
| } | ||
| } | ||
/// Lookup of the Canonical_Combining_Class Unicode property.
///
/// This is the owned counterpart of [`CanonicalCombiningClassMapBorrowed`]:
/// it holds the data as a `DataPayload` instead of a reference.
#[derive(Debug)]
pub struct CanonicalCombiningClassMap {
    /// Payload holding the decomposition data whose trie backs the lookup
    decompositions: DataPayload<NormalizerNfdDataV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
    /// Produces the owned type by calling [`CanonicalCombiningClassMap::new`]
    /// and converting the borrowed value it returns with `static_to_owned`.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl CanonicalCombiningClassMap { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> { | ||
| CanonicalCombiningClassMapBorrowed { | ||
| decompositions: self.decompositions.get(), | ||
| } | ||
| } | ||
| /// Construct from compiled data. | ||
| /// | ||
| /// ✨ *Enabled with the `compiled_data` Cargo feature.* | ||
| /// | ||
| /// [📚 Help choosing a constructor](icu_provider::constructors) | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(clippy::new_ret_no_self)] | ||
| pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> { | ||
| CanonicalCombiningClassMapBorrowed::new() | ||
| } | ||
| icu_provider::gen_buffer_data_constructors!(() -> error: DataError, | ||
| functions: [ | ||
| new: skip, | ||
| try_new_with_buffer_provider, | ||
| try_new_unstable, | ||
| Self, | ||
| ]); | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerNfdDataV1> + ?Sized, | ||
| { | ||
| let decompositions: DataPayload<NormalizerNfdDataV1> = | ||
| provider.load(Default::default())?.payload; | ||
| Ok(CanonicalCombiningClassMap { decompositions }) | ||
| } | ||
| } |
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. | ||
| //! | ||
| //! <div class="stab unstable"> | ||
| //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
| //! including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
| //! to be stable, their Rust representation might not be. Use with caution. | ||
| //! </div> | ||
| //! | ||
| //! Read more about data providers: [`icu_provider`] | ||
| // Provider structs must be stable | ||
| #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] | ||
| use icu_collections::char16trie::Char16Trie; | ||
| use icu_collections::codepointtrie::CodePointTrie; | ||
| use icu_provider::prelude::*; | ||
| use zerovec::ZeroVec; | ||
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// Unit struct that serves as the data provider for the data compiled into
/// this crate. It only exists when the `compiled_data` Cargo feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub struct Baked;
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
// Anonymous const scope: the glob import and the helper `icu` module below
// stay local to this block instead of leaking into the module namespace.
const _: () = {
    // Brings the `make_provider!` and `impl_normalizer_*!` macros into scope.
    use icu_normalizer_data::*;
    // Local `icu` module that exposes this crate as `icu::normalizer` and
    // `icu_collections` as `icu::collections`.
    // NOTE(review): presumably the macro-generated code names types through
    // these `icu::…` paths — confirm against `icu_normalizer_data`.
    pub mod icu {
        pub use crate as normalizer;
        pub use icu_collections as collections;
    }
    make_provider!(Baked);
    // One macro invocation per data marker declared in this module.
    impl_normalizer_nfc_v1!(Baked);
    impl_normalizer_nfd_data_v1!(Baked);
    impl_normalizer_nfd_supplement_v1!(Baked);
    impl_normalizer_nfd_tables_v1!(Baked);
    impl_normalizer_nfkd_data_v1!(Baked);
    impl_normalizer_nfkd_tables_v1!(Baked);
    impl_normalizer_uts46_data_v1!(Baked);
};
icu_provider::data_marker!(
    /// Marker for data for canonical decomposition.
    NormalizerNfdDataV1,
    "normalizer/nfd/data/v1",
    DecompositionData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for additional data for canonical decomposition.
    NormalizerNfdTablesV1,
    "normalizer/nfd/tables/v1",
    DecompositionTables<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for data for compatibility decomposition.
    NormalizerNfkdDataV1,
    "normalizer/nfkd/data/v1",
    DecompositionData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for additional data for compatibility decomposition.
    NormalizerNfkdTablesV1,
    "normalizer/nfkd/tables/v1",
    DecompositionTables<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for data for UTS-46 decomposition.
    NormalizerUts46DataV1,
    "normalizer/uts46/data/v1",
    DecompositionData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for data for composition.
    NormalizerNfcV1,
    "normalizer/nfc/v1",
    CanonicalCompositions<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Marker for additional data for non-recursive decomposition.
    NormalizerNfdSupplementV1,
    "normalizer/nfd/supplement/v1",
    NonRecursiveDecompositionSupplement<'static>,
    is_singleton = true
);
#[cfg(feature = "datagen")]
/// The latest minimum set of markers required by this component.
///
/// Lists the `INFO` of each of the seven data markers declared in this
/// module. Only available with the `datagen` Cargo feature.
pub const MARKERS: &[DataMarkerInfo] = &[
    NormalizerNfcV1::INFO,
    NormalizerNfdDataV1::INFO,
    NormalizerNfdTablesV1::INFO,
    NormalizerNfkdDataV1::INFO,
    NormalizerNfkdTablesV1::INFO,
    NormalizerNfdSupplementV1::INFO,
    NormalizerUts46DataV1::INFO,
];
/// Decomposition data
///
/// This is the payload type of the [`NormalizerNfdDataV1`],
/// [`NormalizerNfkdDataV1`], and [`NormalizerUts46DataV1`] markers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionData<'data> {
    /// Trie for decomposition.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
    /// Cap on the passthrough bounds: the passthrough bounds of NFD/NFC
    /// are lowered to this maximum instead. (Stored as 16 bits, because
    /// it cannot be higher than 0x0300, which is the bound for NFC.)
    pub passthrough_cap: u16,
}
icu_provider::data_struct!(
    DecompositionData<'_>,
    #[cfg(feature = "datagen")]
);
/// The expansion tables for cases where the decomposition isn't
/// contained in the trie value
///
/// This is the payload type of the [`NormalizerNfdTablesV1`] and
/// [`NormalizerNfkdTablesV1`] markers.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionTables<'data> {
    /// Decompositions that are fully within the BMP
    /// (stored as 16-bit units)
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars16: ZeroVec<'data, u16>,
    /// Decompositions with at least one character outside
    /// the BMP (stored as `char`s)
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars24: ZeroVec<'data, char>,
}
icu_provider::data_struct!(
    DecompositionTables<'_>,
    #[cfg(feature = "datagen")]
);
/// Non-Hangul canonical compositions
///
/// This is the payload type of the [`NormalizerNfcV1`] marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CanonicalCompositions<'data> {
    /// Trie keys are two-`char` strings in reversed order: the second
    /// character comes first. The value, if any, is the
    /// (non-Hangul) canonical composition.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub canonical_compositions: Char16Trie<'data>,
}
icu_provider::data_struct!(
    CanonicalCompositions<'_>,
    #[cfg(feature = "datagen")]
);
/// Non-recursive canonical decompositions that differ from
/// `DecompositionData`.
///
/// This is the payload type of the [`NormalizerNfdSupplementV1`] marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct NonRecursiveDecompositionSupplement<'data> {
    /// Trie for the supplementary non-recursive decompositions
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
    /// Decompositions with at least one character outside
    /// the BMP (stored as `char`s)
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars24: ZeroVec<'data, char>,
}
icu_provider::data_struct!(
    NonRecursiveDecompositionSupplement<'_>,
    #[cfg(feature = "datagen")]
);
| // This file is part of ICU4X. For terms of use, please see the file | ||
| // called LICENSE at the top level of the ICU4X source tree | ||
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
| //! Bundles the part of UTS 46 that makes sense to implement as a | ||
| //! normalization. | ||
| //! | ||
| //! This is meant to be used as a building block of an UTS 46 | ||
| //! implementation, such as the `idna` crate. | ||
| use crate::ComposingNormalizer; | ||
| use crate::ComposingNormalizerBorrowed; | ||
| use crate::NormalizerNfcV1; | ||
| use crate::NormalizerNfdTablesV1; | ||
| use crate::NormalizerNfkdTablesV1; | ||
| use crate::NormalizerUts46DataV1; | ||
| use icu_provider::DataError; | ||
| use icu_provider::DataProvider; | ||
| // Implementation note: Despite merely wrapping a `ComposingNormalizer`, | ||
| // having a `Uts46Mapper` serves two purposes: | ||
| // | ||
| // 1. Denying public access to parts of the `ComposingNormalizer` API | ||
| // that don't work when the data contains markers for ignorables. | ||
| // 2. Providing a place where additional iterator pre-processing or | ||
| // post-processing can take place if needed in the future. (When | ||
| // writing this, it looked like such processing was needed but | ||
| // now isn't needed after all.) | ||
/// A borrowed version of a mapper that knows how to perform the
/// subsets of UTS 46 processing documented on the methods.
#[derive(Debug)]
pub struct Uts46MapperBorrowed<'a> {
    // The wrapped normalizer that every method of this type delegates to.
    normalizer: ComposingNormalizerBorrowed<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Equivalent to the compiled-data constructor
    /// [`Uts46MapperBorrowed::new`].
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
| impl Uts46MapperBorrowed<'static> { | ||
| /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`]. | ||
| /// | ||
| /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some | ||
| /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`]. | ||
| pub const fn static_to_owned(self) -> Uts46Mapper { | ||
| Uts46Mapper { | ||
| normalizer: self.normalizer.static_to_owned(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| #[cfg(feature = "compiled_data")] | ||
| pub const fn new() -> Self { | ||
| Uts46MapperBorrowed { | ||
| normalizer: ComposingNormalizerBorrowed::new_uts46(), | ||
| } | ||
| } | ||
| } | ||
| impl Uts46MapperBorrowed<'_> { | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the "Map" and "Normalize" steps of the "Processing" | ||
| /// section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are ignored. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) | ||
| } | ||
| /// Returns an iterator adaptor that turns an `Iterator` over `char` | ||
| /// into an iterator yielding a `char` sequence that gets the following | ||
| /// operations from the NFC check and statucs steps of the "Validity | ||
| /// Criteria" section of UTS 46 lazily applied to it: | ||
| /// | ||
| /// 1. The _ignored_ characters are treated as _disallowed_. | ||
| /// 2. The _mapped_ characters are mapped. | ||
| /// 3. The _disallowed_ characters are replaced with U+FFFD, | ||
| /// which itself is a disallowed character. | ||
| /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ | ||
| /// as appropriate. | ||
| /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. | ||
| /// 6. The _disallowed_STD3_mapped_ characters are treated as | ||
| /// _mapped_. | ||
| /// 7. The result is normalized to NFC. | ||
| /// | ||
| /// Notably: | ||
| /// | ||
| /// * The STD3 or WHATWG ASCII deny list should be implemented as a | ||
| /// post-processing step. | ||
| /// * Transitional processing is not performed. Transitional mapping | ||
| /// would be a pre-processing step, but transitional processing is | ||
| /// deprecated, and none of Firefox, Safari, or Chrome use it. | ||
| /// * The output needs to be compared with input to see if anything | ||
| /// changed. This check catches failures to adhere to the normalization | ||
| /// and status requirements. In particular, this comparison results | ||
| /// in _mapped_ characters resulting in error like "Validity Criteria" | ||
| /// requires. | ||
| pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( | ||
| &'delegate self, | ||
| iter: I, | ||
| ) -> impl Iterator<Item = char> + 'delegate { | ||
| self.normalizer | ||
| .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) | ||
| } | ||
| } | ||
/// A mapper that knows how to perform the subsets of UTS 46 processing
/// documented on the methods.
///
/// This is the owned counterpart of [`Uts46MapperBorrowed`]; call
/// [`Uts46Mapper::as_borrowed`] to obtain the borrowed form.
#[derive(Debug)]
pub struct Uts46Mapper {
    // The wrapped normalizer; `as_borrowed` borrows from it.
    normalizer: ComposingNormalizer,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Produces the owned type by calling [`Uts46Mapper::new`] and converting
    /// the borrowed value it returns with `static_to_owned`.
    fn default() -> Self {
        let borrowed = Self::new();
        borrowed.static_to_owned()
    }
}
| impl Uts46Mapper { | ||
| /// Constructs a borrowed version of this type for more efficient querying. | ||
| pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> { | ||
| Uts46MapperBorrowed { | ||
| normalizer: self.normalizer.as_borrowed(), | ||
| } | ||
| } | ||
| /// Construct with compiled data. | ||
| #[cfg(feature = "compiled_data")] | ||
| #[allow(clippy::new_ret_no_self)] | ||
| pub const fn new() -> Uts46MapperBorrowed<'static> { | ||
| Uts46MapperBorrowed::new() | ||
| } | ||
| /// Construct with provider. | ||
| #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] | ||
| pub fn try_new<D>(provider: &D) -> Result<Self, DataError> | ||
| where | ||
| D: DataProvider<NormalizerUts46DataV1> | ||
| + DataProvider<NormalizerNfdTablesV1> | ||
| + DataProvider<NormalizerNfkdTablesV1> | ||
| // UTS 46 tables merged into NormalizerNfkdTablesV1 | ||
| + DataProvider<NormalizerNfcV1> | ||
| + ?Sized, | ||
| { | ||
| let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; | ||
| Ok(Uts46Mapper { normalizer }) | ||
| } | ||
| } |
| # This is a placeholder in the interest of keeping the repository size smaller. | ||
| # Replace this file with the contents of | ||
| # https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually | ||
| # run the conformance test. |
| The test data comes from | ||
| https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt |
Sorry, the diff of this file is too big to display