icu_collections/codepointtrie/cptrie.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::codepointtrie::error::Error;
6use crate::codepointtrie::impl_const::*;
7
8#[cfg(feature = "alloc")]
9use crate::codepointinvlist::CodePointInversionList;
10use core::char::CharTryFromError;
11use core::convert::Infallible;
12use core::convert::TryFrom;
13use core::fmt::Display;
14#[cfg(feature = "alloc")]
15use core::iter::FromIterator;
16use core::num::TryFromIntError;
17use core::ops::RangeInclusive;
18use yoke::Yokeable;
19use zerofrom::ZeroFrom;
20use zerovec::ule::AsULE;
21#[cfg(feature = "alloc")]
22use zerovec::ule::UleError;
23use zerovec::ZeroSlice;
24use zerovec::ZeroVec;
25
26/// The type of trie represents whether the trie has an optimization that
27/// would make it smaller or faster.
28///
29/// Regarding performance, a trie being a small or fast type affects the number of array lookups
30/// needed for code points in the range `[0x1000, 0x10000)`. In this range, `Small` tries use 4 array lookups,
31/// while `Fast` tries use 2 array lookups.
32/// Code points before the interval (in `[0, 0x1000)`) will always use 2 array lookups.
33/// Code points after the interval (in `[0x10000, 0x10FFFF]`) will always use 4 array lookups.
34///
35/// Regarding size, `Fast` type tries are larger than `Small` type tries because the minimum size of
36/// the index array is larger. The minimum size is the "fast max" limit, which is the limit of the range
37/// of code points with 2 array lookups.
38///
39/// See the document [Unicode Properties and Code Point Tries in ICU4X](https://github.com/unicode-org/icu4x/blob/main/documents/design/properties_code_point_trie.md).
40///
41/// Also see [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
42#[derive(Clone, Copy, PartialEq, Debug, Eq)]
43#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
44#[cfg_attr(feature = "databake", derive(databake::Bake))]
45#[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
46#[allow(clippy::exhaustive_enums)] // based on a stable serialized form
47pub enum TrieType {
48 /// Represents the "fast" type code point tries for the
49 /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`.
50 Fast = 0,
51 /// Represents the "small" type code point tries for the
52 /// [`TrieType`] trait. The "fast max" limit is set to `0x0fff`.
53 Small = 1,
54}
55
56// TrieValue trait
57
58// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"
59
60/// A trait representing the values stored in the data array of a [`CodePointTrie`].
61/// This trait is used as a type parameter in constructing a `CodePointTrie`.
62///
63/// This trait can be implemented on anything that can be represented as a u32s worth of data.
64pub trait TrieValue: Copy + Eq + PartialEq + AsULE + 'static {
65 /// Last-resort fallback value to return if we cannot read data from the trie.
66 ///
67 /// In most cases, the error value is read from the last element of the `data` array,
68 /// this value is used for empty codepointtrie arrays
69 /// Error type when converting from a u32 to this `TrieValue`.
70 type TryFromU32Error: Display;
71 /// A parsing function that is primarily motivated by deserialization contexts.
72 /// When the serialization type width is smaller than 32 bits, then it is expected
73 /// that the call site will widen the value to a `u32` first.
74 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>;
75
76 /// A method for converting back to a `u32` that can roundtrip through
77 /// [`Self::try_from_u32()`]. The default implementation of this trait
78 /// method panics in debug mode and returns 0 in release mode.
79 ///
80 /// This method is allowed to have GIGO behavior when fed a value that has
81 /// no corresponding `u32` (since such values cannot be stored in the trie)
82 fn to_u32(self) -> u32;
83}
84
85macro_rules! impl_primitive_trie_value {
86 ($primitive:ty, $error:ty) => {
87 impl TrieValue for $primitive {
88 type TryFromU32Error = $error;
89 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
90 Self::try_from(i)
91 }
92
93 #[allow(trivial_numeric_casts)]
94 fn to_u32(self) -> u32 {
95 // bitcast when the same size, zero-extend/sign-extend
96 // when not the same size
97 self as u32
98 }
99 }
100 };
101}
102
103impl_primitive_trie_value!(u8, TryFromIntError);
104impl_primitive_trie_value!(u16, TryFromIntError);
105impl_primitive_trie_value!(u32, Infallible);
106impl_primitive_trie_value!(i8, TryFromIntError);
107impl_primitive_trie_value!(i16, TryFromIntError);
108impl_primitive_trie_value!(i32, TryFromIntError);
109impl_primitive_trie_value!(char, CharTryFromError);
110
111/// Helper function used by [`get_range`]. Converts occurrences of trie's null
112/// value into the provided `null_value`.
113///
114/// Note: the ICU version of this helper function uses a `ValueFilter` function
115/// to apply a transform on a non-null value. But currently, this implementation
116/// stops short of that functionality, and instead leaves the non-null trie value
117/// untouched. This is equivalent to having a `ValueFilter` function that is the
118/// identity function.
119fn maybe_filter_value<T: TrieValue>(value: T, trie_null_value: T, null_value: T) -> T {
120 if value == trie_null_value {
121 null_value
122 } else {
123 value
124 }
125}
126
127/// This struct represents a de-serialized [`CodePointTrie`] that was exported from
128/// ICU binary data.
129///
130/// For more information:
131/// - [ICU Site design doc](https://unicode-org.github.io/icu/design/struct/utrie)
132/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
133// serde impls in crate::serde
134#[derive(Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
135pub struct CodePointTrie<'trie, T: TrieValue> {
136 /// # Safety Invariant
137 ///
138 /// The value of `header.trie_type` must not change after construction.
139 pub(crate) header: CodePointTrieHeader,
140 /// # Safety Invariant
141 ///
142 /// If `header.trie_type == TrieType::Fast`, `index.len()` must be greater
143 /// than `FAST_TYPE_FAST_INDEXING_MAX`. Otherwise, `index.len()`
144 /// must be greater than `SMALL_TYPE_FAST_INDEXING_MAX`. Furthermore,
145 /// this field must not change after construction. (Strictly: It must
146 /// not become shorter than the length requirement stated above and the
147 /// values within the prefix up to the length requirement must not change.)
148 pub(crate) index: ZeroVec<'trie, u16>,
149 /// # Safety Invariant
150 ///
151 /// If `header.trie_type == TrieType::Fast`, `data.len()` must be greater
152 /// than `FAST_TYPE_DATA_MASK` plus the largest value in
153 /// `index[0..FAST_TYPE_FAST_INDEXING_MAX + 1]`. Otherwise, `data.len()`
154 /// must be greater than `FAST_TYPE_DATA_MASK` plus the largest value in
155 /// `index[0..SMALL_TYPE_FAST_INDEXING_MAX + 1]`. Furthermore, this field
156 /// must not change after construction. (Strictly: The stated length
157 /// requirement must continue to hold.)
158 pub(crate) data: ZeroVec<'trie, T>,
159 // serde impl skips this field
160 #[zerofrom(clone)] // TrieValue is Copy, this allows us to avoid
161 // a T: ZeroFrom bound
162 pub(crate) error_value: T,
163}
164
165/// This struct contains the fixed-length header fields of a [`CodePointTrie`].
166///
167/// # Safety Invariant
168///
169/// The `trie_type` field must not change after construction.
170///
171/// (In practice, all the fields here remain unchanged during the lifetime
172/// of the trie.)
173#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
174#[cfg_attr(feature = "databake", derive(databake::Bake))]
175#[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
176#[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
177#[allow(clippy::exhaustive_structs)] // based on a stable serialized form
178pub struct CodePointTrieHeader {
179 /// The code point of the start of the last range of the trie. A
180 /// range is defined as a partition of the code point space such that the
181 /// value in this trie associated with all code points of the same range is
182 /// the same.
183 ///
184 /// For the property value data for many Unicode properties,
185 /// often times, `high_start` is `U+10000` or lower. In such cases, not
186 /// reserving space in the `index` array for duplicate values is a large
187 /// savings. The "highValue" associated with the `high_start` range is
188 /// stored at the second-to-last position of the `data` array.
189 /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.)
190 pub high_start: u32,
191 /// A version of the `high_start` value that is right-shifted 12 spaces,
192 /// but is rounded up to a multiple `0x1000` for easy testing from UTF-8
193 /// lead bytes.
194 pub shifted12_high_start: u16,
195 /// Offset for the null block in the "index-3" table of the `index` array.
196 /// Set to an impossibly high value (e.g., `0xffff`) if there is no
197 /// dedicated index-3 null block.
198 pub index3_null_offset: u16,
199 /// Internal data null block offset, not shifted.
200 /// Set to an impossibly high value (e.g., `0xfffff`) if there is no
201 /// dedicated data null block.
202 pub data_null_offset: u32,
203 /// The value stored in the trie that represents a null value being
204 /// associated to a code point.
205 pub null_value: u32,
206 /// The enum value representing the type of trie, where trie type is as it
207 /// is defined in ICU (ex: Fast, Small).
208 ///
209 /// # Safety Invariant
210 ///
211 /// Must not change after construction.
212 pub trie_type: TrieType,
213}
214
215impl TryFrom<u8> for TrieType {
216 type Error = Error;
217
218 fn try_from(trie_type_int: u8) -> Result<TrieType, Error> {
219 match trie_type_int {
220 0 => Ok(TrieType::Fast),
221 1 => Ok(TrieType::Small),
222 _ => Err(Error::FromDeserialized {
223 reason: "Cannot parse value for trie_type",
224 }),
225 }
226 }
227}
228
229// Helper macro that turns arithmetic into wrapping-in-release, checked-in-debug arithmetic
230//
231// This is rustc's default behavior anyway, however some projects like Android deliberately
232// enable overflow checks. CodePointTrie::get() is intended to be used in Android bionic which
233// cares about codesize and we don't want the pile of panicking infrastructure brought in by overflow
234// checks, so we force wrapping in release.
235// See #6052
236macro_rules! w(
237 // Note: these matchers are not perfect since you cannot have an operator after an expr matcher
238 // Use variables if you need complex first operands.
239 ($a:tt + $b:expr) => {
240 {
241 #[allow(unused_parens)]
242 let a = $a;
243 let b = $b;
244 debug_assert!(a.checked_add(b).is_some());
245 $a.wrapping_add($b)
246 }
247 };
248 ($a:tt - $b:expr) => {
249
250 {
251 #[allow(unused_parens)]
252 let a = $a;
253 let b = $b;
254 debug_assert!(a.checked_sub(b).is_some());
255 $a.wrapping_sub($b)
256 }
257 };
258 ($a:tt * $b:expr) => {
259 {
260 #[allow(unused_parens)]
261 let a = $a;
262 let b = $b;
263 debug_assert!(a.checked_mul(b).is_some());
264 $a.wrapping_mul($b)
265 }
266 };
267);
268
269impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
270 #[doc(hidden)] // databake internal
271 /// # Safety
272 ///
273 /// `header.trie_type`, `index`, and `data` must
274 /// satisfy the invariants for the fields of the
275 /// same names on `CodePointTrie`.
276 pub const unsafe fn from_parts_unstable_unchecked_v1(
277 header: CodePointTrieHeader,
278 index: ZeroVec<'trie, u16>,
279 data: ZeroVec<'trie, T>,
280 error_value: T,
281 ) -> Self {
282 // Field invariants upheld: The caller is responsible.
283 // In practice, this means that datagen in the databake
284 // mode upholds these invariants when constructing the
285 // `CodePointTrie` that is then baked.
286 Self {
287 header,
288 index,
289 data,
290 error_value,
291 }
292 }
293
294 /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
295 /// array and `data` array, whose data values have width `W`.
296 pub fn try_new(
297 header: CodePointTrieHeader,
298 index: ZeroVec<'trie, u16>,
299 data: ZeroVec<'trie, T>,
300 ) -> Result<CodePointTrie<'trie, T>, Error> {
301 let error_value = Self::validate_fields(&header, &index, &data)?;
302 // Field invariants upheld: Checked by `validate_fields` above.
303 let trie: CodePointTrie<'trie, T> = CodePointTrie {
304 header,
305 index,
306 data,
307 error_value,
308 };
309 Ok(trie)
310 }
311
312 /// Checks the invariant on the fields that fast-path access relies on for
313 /// safety in order to omit slice bound checks and upon success returns the
314 /// `error_value` for the trie.
315 ///
316 /// # Safety Usable Invariant
317 ///
318 /// Iff this function returns `Ok(T)`, the arguments satisfy the invariants
319 /// for corresponding fields of `CodePointTrie`. (Other than proving that
320 /// nothing else changes the fields subsequently.)
321 pub(crate) fn validate_fields(
322 header: &CodePointTrieHeader,
323 index: &ZeroSlice<u16>,
324 data: &ZeroSlice<T>,
325 ) -> Result<T, Error> {
326 let error_value = data.last().ok_or(Error::EmptyDataVector)?;
327
328 // `CodePointTrie` lookup has two stages: fast and small (the trie types
329 // are also fast and small; they affect where the boundary between fast
330 // and small lookups is).
331 //
332 // The length requirements for `index` and `data` are checked here only
333 // for the fast lookup case so that the fast lookup can omit bound checks
334 // at the time of access. In the small lookup case, bounds are checked at
335 // the time of access.
336 //
337 // The fast lookup happens on the prefixes of `index` and `data` with
338 // more items for the small lookup case afterwards. The check here
339 // only looks at the prefixes relevant to the fast case.
340 //
341 // In the fast lookup case, the bits of the of the code point are
342 // partitioned into a bit prefix and a bit suffix. First, a value
343 // is read from `index` by indexing into it using the bit prefix.
344 // Then `data` is accessed by the value just read from `index` plus
345 // the bit suffix.
346 //
347 // Therefore, the length of `index` needs to accommodate access
348 // by the maximum possible bit prefix, and the length of `data`
349 // needs to accommodate access by the largest value stored in the part
350 // of `data` reachable by the bit prefix plus the maximum possible bit
351 // suffix.
352 //
353 // The maximum possible bit prefix depends on the trie type.
354
355 // The maximum code point that can be used for fast-path access:
356 let fast_max = match header.trie_type {
357 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
358 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
359 };
360 // Keep only the prefix bits:
361 let max_bit_prefix = fast_max >> FAST_TYPE_SHIFT;
362 // Attempt slice the part of `index` that the fast path can index into.
363 // Since `max_bit_prefix` is the largest possible value used for
364 // indexing (inclusive bound), we need to add one to get the exclusive
365 // bound, which is what `get_subslice` wants.
366 let fast_index = index
367 .get_subslice(0..(max_bit_prefix as usize) + 1)
368 .ok_or(Error::IndexTooShortForFastAccess)?;
369 // Invariant upheld for `index`: If we got this far, the length of `index`
370 // satisfies its length invariant on the assumption that `header.trie_type`
371 // will not change subsequently.
372
373 // Now find the largest offset in the part of `index` reachable by the
374 // bit prefix. `max` can never actually return `None`, since we already
375 // know the slice isn't empty. Hence, reusing the error kind instead of
376 // minting a new one for this check.
377 let max_offset = fast_index
378 .iter()
379 .max()
380 .ok_or(Error::IndexTooShortForFastAccess)?;
381 // `FAST_TYPE_DATA_MASK` is the maximum possible bit suffix, since the
382 // maximum is when all the bits in the suffix are set, and the mask
383 // has that many bits set.
384 if (max_offset) as usize + (FAST_TYPE_DATA_MASK as usize) >= data.len() {
385 return Err(Error::DataTooShortForFastAccess);
386 }
387
388 // Invariant upheld for `data`: If we got this far, the length of `data`
389 // satisfies `data`'s length invariant on the assumption that the contents
390 // of `fast_index` subslice of `index` and `header.trie_type` will not
391 // change subsequently.
392
393 Ok(error_value)
394 }
395
396 /// Turns this trie into a version whose trie type is encoded in the Rust type.
397 #[inline]
398 pub fn to_typed(self) -> Typed<FastCodePointTrie<'trie, T>, SmallCodePointTrie<'trie, T>> {
399 match self.header.trie_type {
400 TrieType::Fast => Typed::Fast(FastCodePointTrie { inner: self }),
401 TrieType::Small => Typed::Small(SmallCodePointTrie { inner: self }),
402 }
403 }
404
405 /// Obtains a reference to this trie as a Rust type that encodes the trie type in the Rust type.
406 #[inline]
407 pub fn as_typed_ref(
408 &self,
409 ) -> Typed<&FastCodePointTrie<'trie, T>, &SmallCodePointTrie<'trie, T>> {
410 // SAFETY: `FastCodePointTrie` and `SmallCodePointTrie` are `repr(transparent)`
411 // with `CodePointTrie`, so transmuting between the references is OK when the
412 // actual trie type agrees with the semantics of the typed wrapper.
413 match self.header.trie_type {
414 TrieType::Fast => Typed::Fast(unsafe {
415 &*(self as *const CodePointTrie<'trie, T> as *const FastCodePointTrie<'trie, T>)
416 }),
417 TrieType::Small => Typed::Small(unsafe {
418 &*(self as *const CodePointTrie<'trie, T> as *const SmallCodePointTrie<'trie, T>)
419 }),
420 }
421 }
422
423 /// Returns the position in the data array containing the trie's stored
424 /// error value.
425 #[inline(always)] // `always` was based on previous normalizer benchmarking
426 fn trie_error_val_index(&self) -> u32 {
427 // We use wrapping_sub here to avoid panicky overflow checks.
428 // len should always be > 1, but if it isn't this will just cause GIGO behavior of producing
429 // None on `.get()`
430 debug_assert!(self.data.len() as u32 >= ERROR_VALUE_NEG_DATA_OFFSET);
431 w!((self.data.len() as u32) - ERROR_VALUE_NEG_DATA_OFFSET)
432 }
433
434 fn internal_small_index(&self, code_point: u32) -> u32 {
435 // We use wrapping arithmetic here to avoid overflow checks making their way into binaries
436 // with overflow checks enabled. Ultimately this code ends up as a checked index, so any
437 // bugs here will cause GIGO
438 let mut index1_pos: u32 = code_point >> SHIFT_1;
439 if self.header.trie_type == TrieType::Fast {
440 debug_assert!(
441 FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
442 );
443 index1_pos = w!(index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH);
444 } else {
445 assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT);
446 index1_pos = w!(index1_pos + SMALL_INDEX_LENGTH);
447 }
448 let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) {
449 index1_val
450 } else {
451 return self.trie_error_val_index();
452 };
453 let index3_block_idx: u32 =
454 w!((index1_val as u32) + (code_point >> SHIFT_2) & INDEX_2_MASK);
455 let mut index3_block: u32 =
456 if let Some(index3_block) = self.index.get(index3_block_idx as usize) {
457 index3_block as u32
458 } else {
459 return self.trie_error_val_index();
460 };
461 let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK;
462 let mut data_block: u32;
463 if index3_block & 0x8000 == 0 {
464 // 16-bit indexes
465 data_block =
466 if let Some(data_block) = self.index.get(w!(index3_block + index3_pos) as usize) {
467 data_block as u32
468 } else {
469 return self.trie_error_val_index();
470 };
471 } else {
472 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
473 index3_block = w!((index3_block & 0x7fff) + w!((index3_pos & !7) + index3_pos >> 3));
474 index3_pos &= 7;
475 data_block = if let Some(data_block) = self.index.get(index3_block as usize) {
476 data_block as u32
477 } else {
478 return self.trie_error_val_index();
479 };
480 data_block = (data_block << w!(2u32 + w!(2u32 * index3_pos))) & 0x30000;
481 index3_block += 1;
482 data_block =
483 if let Some(index3_val) = self.index.get(w!(index3_block + index3_pos) as usize) {
484 data_block | (index3_val as u32)
485 } else {
486 return self.trie_error_val_index();
487 };
488 }
489 // Returns data_pos == data_block (offset) +
490 // portion of code_point bit field for last (4th) lookup
491 w!(data_block + code_point & SMALL_DATA_MASK)
492 }
493
494 /// Returns the position in the `data` array for the given code point,
495 /// where this code point is at or above the fast limit associated for the
496 /// `trie_type`. We will refer to that limit as "`fastMax`" here.
497 ///
498 /// A lookup of the value in the code point trie for a code point in the
499 /// code point space range [`fastMax`, `high_start`) will be a 4-step
500 /// lookup: 3 lookups in the `index` array and one lookup in the `data`
501 /// array. Lookups for code points in the range [`high_start`,
502 /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see
503 /// [`CodePointTrieHeader::high_start`].
504 fn small_index(&self, code_point: u32) -> u32 {
505 if code_point >= self.header.high_start {
506 w!((self.data.len() as u32) - HIGH_VALUE_NEG_DATA_OFFSET)
507 } else {
508 self.internal_small_index(code_point) // helper fn
509 }
510 }
511
512 /// Returns the position in the `data` array for the given code point,
513 /// where this code point is below the fast limit associated for the
514 /// `trie type`. We will refer to that limit as "`fastMax`" here.
515 ///
516 /// A lookup of the value in the code point trie for a code point in the
517 /// code point space range [0, `fastMax`) will be a 2-step lookup: 1
518 /// lookup in the `index` array and one lookup in the `data` array. By
519 /// design, for trie type `T`, there is an element allocated in the `index`
520 /// array for each block of code points in [0, `fastMax`), which in
521 /// turn guarantees that those code points are represented and only need 1
522 /// lookup.
523 fn fast_index(&self, code_point: u32) -> u32 {
524 let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT;
525 let index_array_val: u16 =
526 if let Some(index_array_val) = self.index.get(index_array_pos as usize) {
527 index_array_val
528 } else {
529 return self.trie_error_val_index();
530 };
531 let masked_cp = code_point & FAST_TYPE_DATA_MASK;
532 let index_array_val = index_array_val as u32;
533 let fast_index_val: u32 = w!(index_array_val + masked_cp);
534 fast_index_val
535 }
536
537 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
538 /// if `code_point` uses fast-path lookup or `None` if `code_point`
539 /// should use small-path lookup or is above the supported range.
540 #[inline(always)] // "always" to make the `Option` collapse away
541 fn get32_by_fast_index(&self, code_point: u32) -> Option<T> {
542 let fast_max = match self.header.trie_type {
543 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
544 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
545 };
546 if code_point <= fast_max {
547 // SAFETY: We just checked the invariant of
548 // `get32_assuming_fast_index`,
549 // which is
550 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
551 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
552 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
553 Some(unsafe { self.get32_assuming_fast_index(code_point) })
554 } else {
555 // The caller needs to call `get32_by_small_index` or determine
556 // that the argument is above the permitted range.
557 None
558 }
559 }
560
561 /// Performs the actual fast-mode lookup
562 ///
563 /// # Safety
564 ///
565 /// If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
566 /// `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
567 /// TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`.
568 #[inline(always)]
569 unsafe fn get32_assuming_fast_index(&self, code_point: u32) -> T {
570 // Check the safety invariant of this method.
571 debug_assert!(
572 code_point
573 <= match self.header.trie_type {
574 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
575 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
576 }
577 );
578
579 let bit_prefix = (code_point as usize) >> FAST_TYPE_SHIFT;
580 debug_assert!(bit_prefix < self.index.len());
581 // SAFETY: Relying on the length invariant of `self.index` having
582 // been checked and on the unchangedness invariant of `self.index`
583 // and `self.header.trie_type` after construction.
584 let base_offset_to_data: usize = usize::from(u16::from_unaligned(*unsafe {
585 self.index.as_ule_slice().get_unchecked(bit_prefix)
586 }));
587 let bit_suffix = (code_point & FAST_TYPE_DATA_MASK) as usize;
588 // SAFETY: Cannot overflow with supported (32-bit and 64-bit) `usize`
589 // sizes, since `base_offset_to_data` was extended from `u16` and
590 // `bit_suffix` is at most `FAST_TYPE_DATA_MASK`, which is well
591 // under what it takes to reach the 32-bit (or 64-bit) max with
592 // additon from the max of `u16`.
593 let offset_to_data = w!(base_offset_to_data + bit_suffix);
594 debug_assert!(offset_to_data < self.data.len());
595 // SAFETY: Relying on the length invariant of `self.data` having
596 // been checked and on the unchangedness invariant of `self.data`,
597 // `self.index`, and `self.header.trie_type` after construction.
598 T::from_unaligned(*unsafe { self.data.as_ule_slice().get_unchecked(offset_to_data) })
599 }
600
601 /// Coldness wrapper for `get32_by_small_index` to also allow
602 /// calls without the effects of `#[cold]`.
603 #[cold]
604 #[inline(always)]
605 fn get32_by_small_index_cold(&self, code_point: u32) -> T {
606 self.get32_by_small_index(code_point)
607 }
608
609 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
610 /// assuming that the small index path should be used.
611 ///
612 /// # Intended Precondition
613 ///
614 /// `code_point` must be at most `CODE_POINT_MAX` AND greter than
615 /// `FAST_TYPE_FAST_INDEXING_MAX` if the trie type is fast or greater
616 /// than `SMALL_TYPE_FAST_INDEXING_MAX` if the trie type is small.
617 /// This is checked when debug assertions are enabled. If this
618 /// precondition is violated, the behavior of this method is
619 /// memory-safe, but the returned value may be bogus (not
620 /// necessarily the designated error value).
621 #[inline(never)]
622 fn get32_by_small_index(&self, code_point: u32) -> T {
623 debug_assert!(code_point <= CODE_POINT_MAX);
624 debug_assert!(
625 code_point
626 > match self.header.trie_type {
627 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
628 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
629 }
630 );
631 self.data
632 .get(self.small_index(code_point) as usize)
633 .unwrap_or(self.error_value)
634 }
635
636 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`].
637 ///
638 /// # Examples
639 ///
640 /// ```
641 /// use icu::collections::codepointtrie::planes;
642 /// let trie = planes::get_planes_trie();
643 ///
644 /// assert_eq!(0, trie.get32(0x41)); // 'A' as u32
645 /// assert_eq!(0, trie.get32(0x13E0)); // 'Ꮰ' as u32
646 /// assert_eq!(1, trie.get32(0x10044)); // '𐁄' as u32
647 /// ```
648 #[inline(always)] // `always` based on normalizer benchmarking
649 pub fn get32(&self, code_point: u32) -> T {
650 if let Some(v) = self.get32_by_fast_index(code_point) {
651 v
652 } else if code_point <= CODE_POINT_MAX {
653 self.get32_by_small_index_cold(code_point)
654 } else {
655 self.error_value
656 }
657 }
658
659 /// Returns the value that is associated with `char` in this [`CodePointTrie`].
660 ///
661 /// # Examples
662 ///
663 /// ```
664 /// use icu::collections::codepointtrie::planes;
665 /// let trie = planes::get_planes_trie();
666 ///
667 /// assert_eq!(0, trie.get('A')); // 'A' as u32
668 /// assert_eq!(0, trie.get('Ꮰ')); // 'Ꮰ' as u32
669 /// assert_eq!(1, trie.get('𐁄')); // '𐁄' as u32
670 /// ```
671 #[inline(always)]
672 pub fn get(&self, c: char) -> T {
673 // LLVM's optimizations have been observed not to be 100%
674 // reliable around collapsing away unnecessary parts of
675 // `get32`, so not just calling `get32` here.
676 let code_point = u32::from(c);
677 if let Some(v) = self.get32_by_fast_index(code_point) {
678 v
679 } else {
680 self.get32_by_small_index_cold(code_point)
681 }
682 }
683
684 /// Returns the value that is associated with `bmp` in this [`CodePointTrie`].
685 #[inline(always)]
686 pub fn get16(&self, bmp: u16) -> T {
687 // LLVM's optimizations have been observed not to be 100%
688 // reliable around collapsing away unnecessary parts of
689 // `get32`, so not just calling `get32` here.
690 let code_point = u32::from(bmp);
691 if let Some(v) = self.get32_by_fast_index(code_point) {
692 v
693 } else {
694 self.get32_by_small_index_cold(code_point)
695 }
696 }
697
698 /// Lookup trie value by non-Basic Multilingual Plane Scalar Value.
699 ///
700 /// The return value may be bogus (not necessarily `error_value`) is the argument is actually in
701 /// the Basic Multilingual Plane or above the Unicode Scalar Value
702 /// range (panics instead with debug assertions enabled).
703 #[inline(always)]
704 pub fn get32_supplementary(&self, supplementary: u32) -> T {
705 debug_assert!(supplementary > 0xFFFF);
706 debug_assert!(supplementary <= CODE_POINT_MAX);
707 self.get32_by_small_index(supplementary)
708 }
709
710 /// Returns a reference to the ULE of the value that is associated with `code_point` in this [`CodePointTrie`].
711 ///
712 /// # Examples
713 ///
714 /// ```
715 /// use icu::collections::codepointtrie::planes;
716 /// let trie = planes::get_planes_trie();
717 ///
718 /// assert_eq!(Some(&0), trie.get32_ule(0x41)); // 'A' as u32
719 /// assert_eq!(Some(&0), trie.get32_ule(0x13E0)); // 'Ꮰ' as u32
720 /// assert_eq!(Some(&1), trie.get32_ule(0x10044)); // '𐁄' as u32
721 /// ```
722 #[inline(always)] // `always` was based on previous normalizer benchmarking
723 pub fn get32_ule(&self, code_point: u32) -> Option<&T::ULE> {
724 // All code points up to the fast max limit are represented
725 // individually in the `index` array to hold their `data` array position, and
726 // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
727 // Code points above the "fast max" limit require 4 lookups.
728 let fast_max = match self.header.trie_type {
729 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
730 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
731 };
732 let data_pos: u32 = if code_point <= fast_max {
733 Self::fast_index(self, code_point)
734 } else if code_point <= CODE_POINT_MAX {
735 Self::small_index(self, code_point)
736 } else {
737 self.trie_error_val_index()
738 };
739 // Returns the trie value (or trie's error value).
740 self.data.as_ule_slice().get(data_pos as usize)
741 }
742
743 /// Converts the [`CodePointTrie`] into one that returns another type of the same size.
744 ///
745 /// Borrowed data remains borrowed, and owned data remains owned.
746 ///
747 /// If the old and new types are not the same size, use
748 /// [`CodePointTrie::try_alloc_map_value`].
749 ///
750 /// # Panics
751 ///
752 /// Panics if `T` and `P` are different sizes.
753 ///
754 /// More specifically, panics if [`ZeroVec::try_into_converted()`] panics when converting
755 /// `ZeroVec<T>` into `ZeroVec<P>`, which happens if `T::ULE` and `P::ULE` differ in size.
756 ///
757 /// ✨ *Enabled with the `alloc` Cargo feature.*
758 ///
759 /// # Examples
760 ///
761 /// ```no_run
762 /// use icu::collections::codepointtrie::planes;
763 /// use icu::collections::codepointtrie::CodePointTrie;
764 ///
765 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
766 /// let planes_trie_i8: CodePointTrie<i8> =
767 /// planes_trie_u8.try_into_converted().expect("infallible");
768 ///
769 /// assert_eq!(planes_trie_i8.get32(0x30000), 3);
770 /// ```
771 #[cfg(feature = "alloc")]
772 pub fn try_into_converted<P>(self) -> Result<CodePointTrie<'trie, P>, UleError>
773 where
774 P: TrieValue,
775 {
776 let converted_data = self.data.try_into_converted()?;
777 let error_ule = self.error_value.to_unaligned();
778 let slice = &[error_ule];
779 let error_vec = ZeroVec::<T>::new_borrowed(slice);
780 let error_converted = error_vec.try_into_converted::<P>()?;
781 #[expect(clippy::expect_used)] // we know this cannot fail
782 Ok(CodePointTrie {
783 header: self.header,
784 index: self.index,
785 data: converted_data,
786 error_value: error_converted
787 .get(0)
788 .expect("vector known to have one element"),
789 })
790 }
791
792 /// Maps the [`CodePointTrie`] into one that returns a different type.
793 ///
794 /// This function returns owned data.
795 ///
796 /// If the old and new types are the same size, use the more efficient
797 /// [`CodePointTrie::try_into_converted`].
798 ///
799 /// ✨ *Enabled with the `alloc` Cargo feature.*
800 ///
801 /// # Examples
802 ///
803 /// ```
804 /// use icu::collections::codepointtrie::planes;
805 /// use icu::collections::codepointtrie::CodePointTrie;
806 ///
807 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
808 /// let planes_trie_u16: CodePointTrie<u16> = planes_trie_u8
809 /// .try_alloc_map_value(TryFrom::try_from)
810 /// .expect("infallible");
811 ///
812 /// assert_eq!(planes_trie_u16.get32(0x30000), 3);
813 /// ```
814 #[cfg(feature = "alloc")]
815 pub fn try_alloc_map_value<P, E>(
816 &self,
817 mut f: impl FnMut(T) -> Result<P, E>,
818 ) -> Result<CodePointTrie<'trie, P>, E>
819 where
820 P: TrieValue,
821 {
822 let error_converted = f(self.error_value)?;
823 let converted_data = self.data.iter().map(f).collect::<Result<ZeroVec<P>, E>>()?;
824 Ok(CodePointTrie {
825 header: self.header,
826 index: self.index.clone(),
827 data: converted_data,
828 error_value: error_converted,
829 })
830 }
831
832 /// Returns a [`CodePointMapRange`] struct which represents a range of code
833 /// points associated with the same trie value. The returned range will be
834 /// the longest stretch of consecutive code points starting at `start` that
835 /// share this value.
836 ///
837 /// This method is designed to use the internal details of
838 /// the structure of [`CodePointTrie`] to be optimally efficient. This will
839 /// outperform a naive approach that just uses [`CodePointTrie::get()`].
840 ///
841 /// This method provides lower-level functionality that can be used in the
842 /// implementation of other methods that are more convenient to the user.
843 /// To obtain an optimal partition of the code point space for
844 /// this trie resulting in the fewest number of ranges, see
845 /// [`CodePointTrie::iter_ranges()`].
846 ///
847 /// # Examples
848 ///
849 /// ```
850 /// use icu::collections::codepointtrie::planes;
851 ///
852 /// let trie = planes::get_planes_trie();
853 ///
854 /// const CODE_POINT_MAX: u32 = 0x10ffff;
855 /// let start = 0x1_0000;
856 /// let exp_end = 0x1_ffff;
857 ///
858 /// let start_val = trie.get32(start);
859 /// assert_eq!(trie.get32(exp_end), start_val);
860 /// assert_ne!(trie.get32(exp_end + 1), start_val);
861 ///
862 /// use icu::collections::codepointtrie::CodePointMapRange;
863 ///
864 /// let cpm_range: CodePointMapRange<u8> = trie.get_range(start).unwrap();
865 ///
866 /// assert_eq!(cpm_range.range.start(), &start);
867 /// assert_eq!(cpm_range.range.end(), &exp_end);
868 /// assert_eq!(cpm_range.value, start_val);
869 ///
870 /// // `start` can be any code point, whether or not it lies on the boundary
871 /// // of a maximally large range that still contains `start`
872 ///
873 /// let submaximal_1_start = start + 0x1234;
874 /// let submaximal_1 = trie.get_range(submaximal_1_start).unwrap();
875 /// assert_eq!(submaximal_1.range.start(), &0x1_1234);
876 /// assert_eq!(submaximal_1.range.end(), &0x1_ffff);
877 /// assert_eq!(submaximal_1.value, start_val);
878 ///
879 /// let submaximal_2_start = start + 0xffff;
880 /// let submaximal_2 = trie.get_range(submaximal_2_start).unwrap();
881 /// assert_eq!(submaximal_2.range.start(), &0x1_ffff);
882 /// assert_eq!(submaximal_2.range.end(), &0x1_ffff);
883 /// assert_eq!(submaximal_2.value, start_val);
884 /// ```
885 pub fn get_range(&self, start: u32) -> Option<CodePointMapRange<T>> {
886 // Exit early if the start code point is out of range, or if it is
887 // in the last range of code points in high_start..=CODE_POINT_MAX
888 // (start- and end-inclusive) that all share the same trie value.
889 if CODE_POINT_MAX < start {
890 return None;
891 }
892 if start >= self.header.high_start {
893 let di: usize = self.data.len() - (HIGH_VALUE_NEG_DATA_OFFSET as usize);
894 let value: T = self.data.get(di)?;
895 return Some(CodePointMapRange {
896 range: start..=CODE_POINT_MAX,
897 value,
898 });
899 }
900
901 let null_value: T = T::try_from_u32(self.header.null_value).ok()?;
902
903 let mut prev_i3_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
904 let mut prev_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
905 let mut c: u32 = start;
906 let mut trie_value: T = self.error_value();
907 let mut value: T = self.error_value();
908 let mut have_value: bool = false;
909
910 loop {
911 let i3_block: u32;
912 let mut i3: u32;
913 let i3_block_length: u32;
914 let data_block_length: u32;
915
916 // Initialize values before beginning the iteration in the subsequent
917 // `loop` block. In particular, use the "i3*" local variables
918 // (representing the `index` array position's offset + increment
919 // for a 3rd-level trie lookup) to help initialize the data block
920 // variable `block` in the loop for the `data` array.
921 //
922 // When a lookup code point is <= the trie's *_FAST_INDEXING_MAX that
923 // corresponds to its `trie_type`, the lookup only takes 2 steps
924 // (once into the `index`, once into the `data` array); otherwise,
925 // takes 4 steps (3 iterative lookups into the `index`, once more
926 // into the `data` array). So for convenience's sake, when we have the
927 // 2-stage lookup, reuse the "i3*" variable names for the first lookup.
928 if c <= 0xffff
929 && (self.header.trie_type == TrieType::Fast || c <= SMALL_TYPE_FAST_INDEXING_MAX)
930 {
931 i3_block = 0;
932 i3 = c >> FAST_TYPE_SHIFT;
933 i3_block_length = if self.header.trie_type == TrieType::Fast {
934 BMP_INDEX_LENGTH
935 } else {
936 SMALL_INDEX_LENGTH
937 };
938 data_block_length = FAST_TYPE_DATA_BLOCK_LENGTH;
939 } else {
940 // Use the multi-stage index.
941 let mut i1: u32 = c >> SHIFT_1;
942 if self.header.trie_type == TrieType::Fast {
943 debug_assert!(0xffff < c && c < self.header.high_start);
944 i1 = i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH;
945 } else {
946 debug_assert!(
947 c < self.header.high_start && self.header.high_start > SMALL_LIMIT
948 );
949 i1 += SMALL_INDEX_LENGTH;
950 }
951 let i2: u16 = self.index.get(i1 as usize)?;
952 let i3_block_idx: u32 = (i2 as u32) + ((c >> SHIFT_2) & INDEX_2_MASK);
953 i3_block = if let Some(i3b) = self.index.get(i3_block_idx as usize) {
954 i3b as u32
955 } else {
956 return None;
957 };
958 if i3_block == prev_i3_block && (c - start) >= CP_PER_INDEX_2_ENTRY {
959 // The index-3 block is the same as the previous one, and filled with value.
960 debug_assert!((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0);
961 c += CP_PER_INDEX_2_ENTRY;
962
963 if c >= self.header.high_start {
964 break;
965 } else {
966 continue;
967 }
968 }
969 prev_i3_block = i3_block;
970 if i3_block == self.header.index3_null_offset as u32 {
971 // This is the index-3 null block.
972 // All of the `data` array blocks pointed to by the values
973 // in this block of the `index` 3rd-stage subarray will
974 // contain this trie's null_value. So if we are in the middle
975 // of a range, end it and return early, otherwise start a new
976 // range of null values.
977 if have_value {
978 if null_value != value {
979 return Some(CodePointMapRange {
980 range: start..=(c - 1),
981 value,
982 });
983 }
984 } else {
985 trie_value = T::try_from_u32(self.header.null_value).ok()?;
986 value = null_value;
987 have_value = true;
988 }
989 prev_block = self.header.data_null_offset;
990 c = (c + CP_PER_INDEX_2_ENTRY) & !(CP_PER_INDEX_2_ENTRY - 1);
991
992 if c >= self.header.high_start {
993 break;
994 } else {
995 continue;
996 }
997 }
998 i3 = (c >> SHIFT_3) & INDEX_3_MASK;
999 i3_block_length = INDEX_3_BLOCK_LENGTH;
1000 data_block_length = SMALL_DATA_BLOCK_LENGTH;
1001 }
1002
1003 // Enumerate data blocks for one index-3 block.
1004 loop {
1005 let mut block: u32;
1006 if (i3_block & 0x8000) == 0 {
1007 block = if let Some(b) = self.index.get((i3_block + i3) as usize) {
1008 b as u32
1009 } else {
1010 return None;
1011 };
1012 } else {
1013 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
1014 let mut group: u32 = (i3_block & 0x7fff) + (i3 & !7) + (i3 >> 3);
1015 let gi: u32 = i3 & 7;
1016 let gi_val: u32 = if let Some(giv) = self.index.get(group as usize) {
1017 giv.into()
1018 } else {
1019 return None;
1020 };
1021 block = (gi_val << (2 + (2 * gi))) & 0x30000;
1022 group += 1;
1023 let ggi_val: u32 = if let Some(ggiv) = self.index.get((group + gi) as usize) {
1024 ggiv as u32
1025 } else {
1026 return None;
1027 };
1028 block |= ggi_val;
1029 }
1030
1031 // If our previous and current return values of the 3rd-stage `index`
1032 // lookup yield the same `data` block offset, and if we already know that
1033 // the entire `data` block / subarray starting at that offset stores
1034 // `value` and nothing else, then we can extend our range by the length
1035 // of a data block and continue.
1036 // Otherwise, we have to iterate over the values stored in the
1037 // new data block to see if they differ from `value`.
1038 if block == prev_block && (c - start) >= data_block_length {
1039 // The block is the same as the previous one, and filled with value.
1040 debug_assert!((c & (data_block_length - 1)) == 0);
1041 c += data_block_length;
1042 } else {
1043 let data_mask: u32 = data_block_length - 1;
1044 prev_block = block;
1045 if block == self.header.data_null_offset {
1046 // This is the data null block.
1047 // If we are in the middle of a range, end it and
1048 // return early, otherwise start a new range of null
1049 // values.
1050 if have_value {
1051 if null_value != value {
1052 return Some(CodePointMapRange {
1053 range: start..=(c - 1),
1054 value,
1055 });
1056 }
1057 } else {
1058 trie_value = T::try_from_u32(self.header.null_value).ok()?;
1059 value = null_value;
1060 have_value = true;
1061 }
1062 c = (c + data_block_length) & !data_mask;
1063 } else {
1064 let mut di: u32 = block + (c & data_mask);
1065 let mut trie_value_2: T = self.data.get(di as usize)?;
1066 if have_value {
1067 if trie_value_2 != trie_value {
1068 if maybe_filter_value(
1069 trie_value_2,
1070 T::try_from_u32(self.header.null_value).ok()?,
1071 null_value,
1072 ) != value
1073 {
1074 return Some(CodePointMapRange {
1075 range: start..=(c - 1),
1076 value,
1077 });
1078 }
1079 // `trie_value` stores the previous value that was retrieved
1080 // from the trie.
1081 // `value` stores the value associated for the range (return
1082 // value) that we are currently building, which is computed
1083 // as a transformation by applying maybe_filter_value()
1084 // to the trie value.
1085 // The current trie value `trie_value_2` within this data block
1086 // differs here from the previous value in `trie_value`.
1087 // But both map to `value` after applying `maybe_filter_value`.
1088 // It is not clear whether the previous or the current trie value
1089 // (or neither) is more likely to match potential subsequent trie
1090 // values that would extend the range by mapping to `value`.
1091 // On the assumption of locality -- often times consecutive
1092 // characters map to the same trie values -- remembering the new
1093 // one might make it faster to extend this range further
1094 // (by increasing the chance that the next `trie_value_2 !=
1095 // trie_value` test will be false).
1096 trie_value = trie_value_2; // may or may not help
1097 }
1098 } else {
1099 trie_value = trie_value_2;
1100 value = maybe_filter_value(
1101 trie_value_2,
1102 T::try_from_u32(self.header.null_value).ok()?,
1103 null_value,
1104 );
1105 have_value = true;
1106 }
1107
1108 c += 1;
1109 while (c & data_mask) != 0 {
1110 di += 1;
1111 trie_value_2 = self.data.get(di as usize)?;
1112 if trie_value_2 != trie_value {
1113 if maybe_filter_value(
1114 trie_value_2,
1115 T::try_from_u32(self.header.null_value).ok()?,
1116 null_value,
1117 ) != value
1118 {
1119 return Some(CodePointMapRange {
1120 range: start..=(c - 1),
1121 value,
1122 });
1123 }
1124 // `trie_value` stores the previous value that was retrieved
1125 // from the trie.
1126 // `value` stores the value associated for the range (return
1127 // value) that we are currently building, which is computed
1128 // as a transformation by applying maybe_filter_value()
1129 // to the trie value.
1130 // The current trie value `trie_value_2` within this data block
1131 // differs here from the previous value in `trie_value`.
1132 // But both map to `value` after applying `maybe_filter_value`.
1133 // It is not clear whether the previous or the current trie value
1134 // (or neither) is more likely to match potential subsequent trie
1135 // values that would extend the range by mapping to `value`.
1136 // On the assumption of locality -- often times consecutive
1137 // characters map to the same trie values -- remembering the new
1138 // one might make it faster to extend this range further
1139 // (by increasing the chance that the next `trie_value_2 !=
1140 // trie_value` test will be false).
1141 trie_value = trie_value_2; // may or may not help
1142 }
1143
1144 c += 1;
1145 }
1146 }
1147 }
1148
1149 i3 += 1;
1150 if i3 >= i3_block_length {
1151 break;
1152 }
1153 }
1154
1155 if c >= self.header.high_start {
1156 break;
1157 }
1158 }
1159
1160 debug_assert!(have_value);
1161
1162 // Now that c >= high_start, compare `value` to `high_value` to see
1163 // if we can merge our current range with the high_value range
1164 // high_start..=CODE_POINT_MAX (start- and end-inclusive), otherwise
1165 // stop at high_start - 1.
1166 let di: u32 = self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET;
1167 let high_value: T = self.data.get(di as usize)?;
1168 if maybe_filter_value(
1169 high_value,
1170 T::try_from_u32(self.header.null_value).ok()?,
1171 null_value,
1172 ) != value
1173 {
1174 c -= 1;
1175 } else {
1176 c = CODE_POINT_MAX;
1177 }
1178 Some(CodePointMapRange {
1179 range: start..=c,
1180 value,
1181 })
1182 }
1183
1184 /// Yields an [`Iterator`] returning ranges of consecutive code points that
1185 /// share the same value in the [`CodePointTrie`], as given by
1186 /// [`CodePointTrie::get_range()`].
1187 ///
1188 /// # Examples
1189 ///
1190 /// ```
1191 /// use core::ops::RangeInclusive;
1192 /// use icu::collections::codepointtrie::planes;
1193 /// use icu::collections::codepointtrie::CodePointMapRange;
1194 ///
1195 /// let planes_trie = planes::get_planes_trie();
1196 ///
1197 /// let mut ranges = planes_trie.iter_ranges();
1198 ///
1199 /// for plane in 0..=16 {
1200 /// let exp_start = plane * 0x1_0000;
1201 /// let exp_end = exp_start + 0xffff;
1202 /// assert_eq!(
1203 /// ranges.next(),
1204 /// Some(CodePointMapRange {
1205 /// range: exp_start..=exp_end,
1206 /// value: plane as u8
1207 /// })
1208 /// );
1209 /// }
1210 ///
1211 /// // Hitting the end of the iterator returns `None`, as will subsequent
1212 /// // calls to .next().
1213 /// assert_eq!(ranges.next(), None);
1214 /// assert_eq!(ranges.next(), None);
1215 /// ```
1216 pub fn iter_ranges(&self) -> CodePointMapRangeIterator<'_, T> {
1217 let init_range = Some(CodePointMapRange {
1218 range: u32::MAX..=u32::MAX,
1219 value: self.error_value(),
1220 });
1221 CodePointMapRangeIterator::<T> {
1222 cpt: self,
1223 cpm_range: init_range,
1224 }
1225 }
1226
1227 /// Yields an [`Iterator`] returning the ranges of the code points whose values
1228 /// match `value` in the [`CodePointTrie`].
1229 ///
1230 /// # Examples
1231 ///
1232 /// ```
1233 /// use icu::collections::codepointtrie::planes;
1234 ///
1235 /// let trie = planes::get_planes_trie();
1236 ///
1237 /// let plane_val = 2;
1238 /// let mut sip_range_iter = trie.iter_ranges_for_value(plane_val as u8);
1239 ///
1240 /// let start = plane_val * 0x1_0000;
1241 /// let end = start + 0xffff;
1242 ///
1243 /// let sip_range = sip_range_iter.next()
1244 /// .expect("Plane 2 (SIP) should exist in planes data");
1245 /// assert_eq!(start..=end, sip_range);
1246 ///
1247 /// assert!(sip_range_iter.next().is_none());
1248 pub fn iter_ranges_for_value(
1249 &self,
1250 value: T,
1251 ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
1252 self.iter_ranges()
1253 .filter(move |cpm_range| cpm_range.value == value)
1254 .map(|cpm_range| cpm_range.range)
1255 }
1256
1257 /// Yields an [`Iterator`] returning the ranges of the code points after passing
1258 /// the value through a mapping function.
1259 ///
1260 /// This is preferable to calling `.get_ranges().map()` since it will coalesce
1261 /// adjacent ranges into one.
1262 ///
1263 /// # Examples
1264 ///
1265 /// ```
1266 /// use icu::collections::codepointtrie::planes;
1267 ///
1268 /// let trie = planes::get_planes_trie();
1269 ///
1270 /// let plane_val = 2;
1271 /// let mut sip_range_iter = trie.iter_ranges_mapped(|value| value != plane_val as u8).filter(|range| range.value);
1272 ///
1273 /// let end = plane_val * 0x1_0000 - 1;
1274 ///
1275 /// let sip_range = sip_range_iter.next()
1276 /// .expect("Complemented planes data should have at least one entry");
1277 /// assert_eq!(0..=end, sip_range.range);
1278 pub fn iter_ranges_mapped<'a, U: Eq + 'a>(
1279 &'a self,
1280 mut map: impl FnMut(T) -> U + Copy + 'a,
1281 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
1282 crate::iterator_utils::RangeListIteratorCoalescer::new(self.iter_ranges().map(
1283 move |range| CodePointMapRange {
1284 range: range.range,
1285 value: map(range.value),
1286 },
1287 ))
1288 }
1289
1290 /// Returns a [`CodePointInversionList`] for the code points that have the given
1291 /// [`TrieValue`] in the trie.
1292 ///
1293 /// ✨ *Enabled with the `alloc` Cargo feature.*
1294 ///
1295 /// # Examples
1296 ///
1297 /// ```
1298 /// use icu::collections::codepointtrie::planes;
1299 ///
1300 /// let trie = planes::get_planes_trie();
1301 ///
1302 /// let plane_val = 2;
1303 /// let sip = trie.get_set_for_value(plane_val as u8);
1304 ///
1305 /// let start = plane_val * 0x1_0000;
1306 /// let end = start + 0xffff;
1307 ///
1308 /// assert!(!sip.contains32(start - 1));
1309 /// assert!(sip.contains32(start));
1310 /// assert!(sip.contains32(end));
1311 /// assert!(!sip.contains32(end + 1));
1312 /// ```
1313 #[cfg(feature = "alloc")]
1314 pub fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
1315 let value_ranges = self.iter_ranges_for_value(value);
1316 CodePointInversionList::from_iter(value_ranges)
1317 }
1318
1319 /// Returns the value used as an error value for this trie
1320 #[inline]
1321 pub fn error_value(&self) -> T {
1322 self.error_value
1323 }
1324}
1325
1326#[cfg(feature = "databake")]
1327impl<T: TrieValue + databake::Bake> databake::Bake for CodePointTrie<'_, T> {
1328 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
1329 let header = self.header.bake(env);
1330 let index = self.index.bake(env);
1331 let data = self.data.bake(env);
1332 let error_value = self.error_value.bake(env);
1333 databake::quote! { unsafe { icu_collections::codepointtrie::CodePointTrie::from_parts_unstable_unchecked_v1(#header, #index, #data, #error_value) } }
1334 }
1335}
1336
1337#[cfg(feature = "databake")]
1338impl<T: TrieValue + databake::Bake> databake::BakeSize for CodePointTrie<'_, T> {
1339 fn borrows_size(&self) -> usize {
1340 self.header.borrows_size() + self.index.borrows_size() + self.data.borrows_size()
1341 }
1342}
1343
1344impl<T: TrieValue + Into<u32>> CodePointTrie<'_, T> {
1345 /// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
1346 /// as a `u32`.
1347 ///
1348 /// # Examples
1349 ///
1350 /// ```
1351 /// use icu::collections::codepointtrie::planes;
1352 /// let trie = planes::get_planes_trie();
1353 ///
1354 /// let cp = '𑖎' as u32;
1355 /// assert_eq!(cp, 0x1158E);
1356 ///
1357 /// let plane_num: u8 = trie.get32(cp);
1358 /// assert_eq!(trie.get32_u32(cp), plane_num as u32);
1359 /// ```
1360 // Note: This API method maintains consistency with the corresponding
1361 // original ICU APIs.
1362 pub fn get32_u32(&self, code_point: u32) -> u32 {
1363 self.get32(code_point).into()
1364 }
1365}
1366
1367impl<T: TrieValue> Clone for CodePointTrie<'_, T>
1368where
1369 <T as AsULE>::ULE: Clone,
1370{
1371 fn clone(&self) -> Self {
1372 CodePointTrie {
1373 header: self.header,
1374 index: self.index.clone(),
1375 data: self.data.clone(),
1376 error_value: self.error_value,
1377 }
1378 }
1379}
1380
1381/// Represents a range of consecutive code points sharing the same value in a
1382/// code point map.
1383///
1384/// The start and end of the interval is represented as a
1385/// `RangeInclusive<u32>`, and the value is represented as `T`.
1386#[derive(PartialEq, Eq, Debug, Clone)]
1387#[allow(clippy::exhaustive_structs)] // based on a stable serialized form
1388pub struct CodePointMapRange<T> {
1389 /// Range of code points from start to end (inclusive).
1390 pub range: RangeInclusive<u32>,
1391 /// Trie value associated with this range.
1392 pub value: T,
1393}
1394
1395/// A custom [`Iterator`] type specifically for a code point trie that returns
1396/// [`CodePointMapRange`]s.
1397#[derive(Debug)]
1398pub struct CodePointMapRangeIterator<'a, T: TrieValue> {
1399 cpt: &'a CodePointTrie<'a, T>,
1400 // Initialize `range` to Some(CodePointMapRange{ start: u32::MAX, end: u32::MAX, value: 0}).
1401 // When `range` is Some(...) and has a start value different from u32::MAX, then we have
1402 // returned at least one code point range due to a call to `next()`.
1403 // When `range` == `None`, it means that we have hit the end of iteration. It would occur
1404 // after a call to `next()` returns a None <=> we attempted to call `get_range()`
1405 // with a start code point that is > CODE_POINT_MAX.
1406 cpm_range: Option<CodePointMapRange<T>>,
1407}
1408
1409impl<T: TrieValue> Iterator for CodePointMapRangeIterator<'_, T> {
1410 type Item = CodePointMapRange<T>;
1411
1412 fn next(&mut self) -> Option<Self::Item> {
1413 self.cpm_range = match &self.cpm_range {
1414 Some(cpmr) => {
1415 if *cpmr.range.start() == u32::MAX {
1416 self.cpt.get_range(0)
1417 } else {
1418 self.cpt.get_range(cpmr.range.end() + 1)
1419 }
1420 }
1421 None => None,
1422 };
1423 // Note: Clone is cheap. We can't Copy because RangeInclusive does not impl Copy.
1424 self.cpm_range.clone()
1425 }
1426}
1427
1428/// For sealing `TypedCodePointTrie`
1429///
1430/// # Safety Usable Invariant
1431///
1432/// All implementations of `TypedCodePointTrie` are reviewable in this module.
1433trait Seal {}
1434
1435/// Trait for writing trait bounds for monomorphizing over either
1436/// `FastCodePointTrie` or `SmallCodePointTrie`.
1437#[allow(private_bounds)] // Permit sealing
1438pub trait TypedCodePointTrie<'trie, T: TrieValue>: Seal {
1439 /// The `TrieType` associated with this `TypedCodePointTrie`
1440 ///
1441 /// # Safety Usable Invariant
1442 ///
1443 /// This constant matches `self.as_untyped_ref().header.trie_type`.
1444 const TRIE_TYPE: TrieType;
1445
1446 /// Lookup trie value as `u32` by Unicode Scalar Value without branching on trie type.
1447 #[inline(always)]
1448 fn get32_u32(&self, code_point: u32) -> u32 {
1449 self.get32(code_point).to_u32()
1450 }
1451
1452 /// Lookup trie value by Basic Multilingual Plane Code Point without branching on trie type.
1453 #[inline(always)]
1454 fn get16(&self, bmp: u16) -> T {
1455 // LLVM's optimizations have been observed not to be 100%
1456 // reliable around collapsing away unnecessary parts of
1457 // `get32`, so not just calling `get32` here.
1458 let code_point = u32::from(bmp);
1459 if let Some(v) = self.get32_by_fast_index(code_point) {
1460 v
1461 } else {
1462 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1463 }
1464 }
1465
1466 /// Lookup trie value by non-Basic Multilingual Plane Scalar Value without branching on trie type.
1467 #[inline(always)]
1468 fn get32_supplementary(&self, supplementary: u32) -> T {
1469 self.as_untyped_ref().get32_supplementary(supplementary)
1470 }
1471
1472 /// Lookup trie value by Unicode Scalar Value without branching on trie type.
1473 #[inline(always)]
1474 fn get(&self, c: char) -> T {
1475 // LLVM's optimizations have been observed not to be 100%
1476 // reliable around collapsing away unnecessary parts of
1477 // `get32`, so not just calling `get32` here.
1478 let code_point = u32::from(c);
1479 if let Some(v) = self.get32_by_fast_index(code_point) {
1480 v
1481 } else {
1482 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1483 }
1484 }
1485
1486 /// Lookup trie value by Unicode Code Point without branching on trie type.
1487 #[inline(always)]
1488 fn get32(&self, code_point: u32) -> T {
1489 if let Some(v) = self.get32_by_fast_index(code_point) {
1490 v
1491 } else if code_point <= CODE_POINT_MAX {
1492 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1493 } else {
1494 self.as_untyped_ref().error_value
1495 }
1496 }
1497
1498 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
1499 /// if `code_point` uses fast-path lookup or `None` if `code_point`
1500 /// should use small-path lookup or is above the supported range.
1501 #[inline(always)] // "always" to make the `Option` collapse away
1502 fn get32_by_fast_index(&self, code_point: u32) -> Option<T> {
1503 debug_assert_eq!(Self::TRIE_TYPE, self.as_untyped_ref().header.trie_type);
1504 let fast_max = match Self::TRIE_TYPE {
1505 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
1506 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
1507 };
1508 if code_point <= fast_max {
1509 // SAFETY: We just checked the invariant of
1510 // `get32_assuming_fast_index`,
1511 // which is
1512 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
1513 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
1514 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
1515 // ... assuming that `Self::TRIE_TYPE` always matches
1516 // `self.as_untyped_ref().header.trie_type`, i.e. we're relying on
1517 // `CodePointTrie::to_typed` and `CodePointTrie::as_typed_ref` being correct
1518 // and the exclusive ways of obtaining `Self`.
1519 Some(unsafe { self.as_untyped_ref().get32_assuming_fast_index(code_point) })
1520 } else {
1521 // The caller needs to call `get32_by_small_index` or determine
1522 // that the argument is above the permitted range.
1523 None
1524 }
1525 }
1526
1527 /// Returns a reference to the wrapped `CodePointTrie`.
1528 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T>;
1529
1530 /// Extracts the wrapped `CodePointTrie`.
1531 fn to_untyped(self) -> CodePointTrie<'trie, T>;
1532}
1533
1534/// Type-safe wrapper for a fast trie guaranteeing
1535/// the the getters don't branch on the trie type
1536/// and for guarenteeing that `get16` is branchless
1537/// in release builds.
1538#[derive(Debug)]
1539#[repr(transparent)]
1540pub struct FastCodePointTrie<'trie, T: TrieValue> {
1541 inner: CodePointTrie<'trie, T>,
1542}
1543
1544impl<'trie, T: TrieValue> TypedCodePointTrie<'trie, T> for FastCodePointTrie<'trie, T> {
1545 const TRIE_TYPE: TrieType = TrieType::Fast;
1546
1547 /// Returns a reference to the wrapped `CodePointTrie`.
1548 #[inline(always)]
1549 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T> {
1550 &self.inner
1551 }
1552
1553 /// Extracts the wrapped `CodePointTrie`.
1554 #[inline(always)]
1555 fn to_untyped(self) -> CodePointTrie<'trie, T> {
1556 self.inner
1557 }
1558
1559 /// Lookup trie value by Basic Multilingual Plane Code Point without branching on trie type.
1560 #[inline(always)]
1561 fn get16(&self, bmp: u16) -> T {
1562 debug_assert!(u32::from(u16::MAX) <= FAST_TYPE_FAST_INDEXING_MAX);
1563 debug_assert_eq!(Self::TRIE_TYPE, TrieType::Fast);
1564 debug_assert_eq!(self.as_untyped_ref().header.trie_type, TrieType::Fast);
1565 let code_point = u32::from(bmp);
1566 // SAFETY: With `TrieType::Fast`, the `u16` range satisfies
1567 // the invariant of `get32_assuming_fast_index`, which is
1568 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
1569 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
1570 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
1571 //
1572 // We're relying on `CodePointTrie::to_typed` and `CodePointTrie::as_typed_ref`
1573 // being correct and the exclusive ways of obtaining `Self`.
1574 unsafe { self.as_untyped_ref().get32_assuming_fast_index(code_point) }
1575 }
1576}
1577
1578impl<'trie, T: TrieValue> Seal for FastCodePointTrie<'trie, T> {}
1579
1580impl<'trie, T: TrieValue> TryFrom<&'trie CodePointTrie<'trie, T>>
1581 for &'trie FastCodePointTrie<'trie, T>
1582{
1583 type Error = TypedCodePointTrieError;
1584
1585 fn try_from(
1586 reference: &'trie CodePointTrie<'trie, T>,
1587 ) -> Result<&'trie FastCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1588 match reference.as_typed_ref() {
1589 Typed::Fast(trie) => Ok(trie),
1590 Typed::Small(_) => Err(TypedCodePointTrieError),
1591 }
1592 }
1593}
1594
1595impl<'trie, T: TrieValue> TryFrom<CodePointTrie<'trie, T>> for FastCodePointTrie<'trie, T> {
1596 type Error = TypedCodePointTrieError;
1597
1598 fn try_from(
1599 value: CodePointTrie<'trie, T>,
1600 ) -> Result<FastCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1601 match value.to_typed() {
1602 Typed::Fast(trie) => Ok(trie),
1603 Typed::Small(_) => Err(TypedCodePointTrieError),
1604 }
1605 }
1606}
1607
1608/// Type-safe wrapper for a small trie guaranteeing
1609/// the the getters don't branch on the trie type.
1610#[derive(Debug)]
1611#[repr(transparent)]
1612pub struct SmallCodePointTrie<'trie, T: TrieValue> {
1613 inner: CodePointTrie<'trie, T>,
1614}
1615
1616impl<'trie, T: TrieValue> TypedCodePointTrie<'trie, T> for SmallCodePointTrie<'trie, T> {
1617 const TRIE_TYPE: TrieType = TrieType::Small;
1618
1619 /// Returns a reference to the wrapped `CodePointTrie`.
1620 #[inline(always)]
1621 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T> {
1622 &self.inner
1623 }
1624
1625 /// Extracts the wrapped `CodePointTrie`.
1626 #[inline(always)]
1627 fn to_untyped(self) -> CodePointTrie<'trie, T> {
1628 self.inner
1629 }
1630}
1631
1632impl<'trie, T: TrieValue> Seal for SmallCodePointTrie<'trie, T> {}
1633
1634impl<'trie, T: TrieValue> TryFrom<&'trie CodePointTrie<'trie, T>>
1635 for &'trie SmallCodePointTrie<'trie, T>
1636{
1637 type Error = TypedCodePointTrieError;
1638
1639 fn try_from(
1640 reference: &'trie CodePointTrie<'trie, T>,
1641 ) -> Result<&'trie SmallCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1642 match reference.as_typed_ref() {
1643 Typed::Fast(_) => Err(TypedCodePointTrieError),
1644 Typed::Small(trie) => Ok(trie),
1645 }
1646 }
1647}
1648
1649impl<'trie, T: TrieValue> TryFrom<CodePointTrie<'trie, T>> for SmallCodePointTrie<'trie, T> {
1650 type Error = TypedCodePointTrieError;
1651
1652 fn try_from(
1653 value: CodePointTrie<'trie, T>,
1654 ) -> Result<SmallCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1655 match value.to_typed() {
1656 Typed::Fast(_) => Err(TypedCodePointTrieError),
1657 Typed::Small(trie) => Ok(trie),
1658 }
1659 }
1660}
1661
1662/// Error indicating that the `TrieType` of an untyped trie
1663/// does not match the requested typed trie type.
1664#[derive(Debug)]
1665#[non_exhaustive]
1666pub struct TypedCodePointTrieError;
1667
1668/// Holder for either fast or small trie with the trie
1669/// type encoded into the Rust type.
1670#[allow(clippy::exhaustive_enums)]
1671#[derive(Debug)]
1672pub enum Typed<F, S> {
1673 /// The trie type is fast.
1674 Fast(F),
1675 /// The trie type is small.
1676 Small(S),
1677}
1678
1679#[cfg(test)]
1680mod tests {
1681 use super::*;
1682 use crate::codepointtrie::planes;
1683 use alloc::vec::Vec;
1684
1685 #[test]
1686 #[cfg(feature = "serde")]
1687 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1688 let trie = planes::get_planes_trie();
1689 let trie_serialized: Vec<u8> = postcard::to_allocvec(&trie).unwrap();
1690
1691 // Assert an expected (golden data) version of the serialized trie.
1692 const EXP_TRIE_SERIALIZED: &[u8] = &[
1693 128, 128, 64, 128, 2, 2, 0, 0, 1, 160, 18, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1695 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1696 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1697 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136,
1698 2, 144, 2, 144, 2, 144, 2, 176, 2, 176, 2, 176, 2, 176, 2, 208, 2, 208, 2, 208, 2, 208,
1699 2, 240, 2, 240, 2, 240, 2, 240, 2, 16, 3, 16, 3, 16, 3, 16, 3, 48, 3, 48, 3, 48, 3, 48,
1700 3, 80, 3, 80, 3, 80, 3, 80, 3, 112, 3, 112, 3, 112, 3, 112, 3, 144, 3, 144, 3, 144, 3,
1701 144, 3, 176, 3, 176, 3, 176, 3, 176, 3, 208, 3, 208, 3, 208, 3, 208, 3, 240, 3, 240, 3,
1702 240, 3, 240, 3, 16, 4, 16, 4, 16, 4, 16, 4, 48, 4, 48, 4, 48, 4, 48, 4, 80, 4, 80, 4,
1703 80, 4, 80, 4, 112, 4, 112, 4, 112, 4, 112, 4, 0, 0, 16, 0, 32, 0, 48, 0, 64, 0, 80, 0,
1704 96, 0, 112, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32,
1705 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48,
1706 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 128, 0, 128, 0, 128, 0, 128,
1707 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1708 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1709 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1710 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1711 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1712 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1713 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1714 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1715 0, 160, 0, 160, 0, 160, 0, 160, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1716 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1717 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1718 0, 176, 0, 176, 0, 176, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1719 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1720 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1721 0, 192, 0, 192, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1722 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1723 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1724 0, 208, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1725 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1726 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1727 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1728 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1729 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 0,
1730 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
1731 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1732 1, 0, 1, 0, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1,
1733 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16,
1734 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 32, 1, 32, 1, 32, 1,
1735 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32,
1736 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1,
1737 32, 1, 32, 1, 32, 1, 32, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48,
1738 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1,
1739 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 64, 1, 64,
1740 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1,
1741 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64,
1742 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1743 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80,
1744 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1745 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96,
1746 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1,
1747 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 128, 0, 136, 0, 136, 0, 136, 0, 136,
1748 0, 136, 0, 136, 0, 136, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
1749 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
1750 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1751 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1752 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1753 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1754 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1755 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1756 200, 0, 200, 0, 200, 0, 200, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1757 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1758 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1759 232, 0, 232, 0, 232, 0, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8,
1760 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1,
1761 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1762 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1,
1763 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1764 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1,
1765 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72,
1766 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1767 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1768 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1769 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1770 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1771 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1772 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1773 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1774 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1775 168, 1, 168, 1, 168, 1, 168, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1776 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1777 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1778 200, 1, 200, 1, 200, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1779 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1780 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1781 232, 1, 232, 1, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2,
1782 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8,
1783 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40,
1784 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2,
1785 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 72,
1786 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2,
1787 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72,
1788 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1789 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1790 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1791 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 244, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1792 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1793 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1794 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1795 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1796 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1797 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
1798 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
1799 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
1800 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10,
1801 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
1802 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
1803 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
1804 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1805 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 0,
1806 ];
1807 assert_eq!(trie_serialized, EXP_TRIE_SERIALIZED);
1808
1809 let trie_deserialized = postcard::from_bytes::<CodePointTrie<u8>>(&trie_serialized)?;
1810
1811 assert_eq!(&trie.index, &trie_deserialized.index);
1812 assert_eq!(&trie.data, &trie_deserialized.data);
1813
1814 assert!(!trie_deserialized.index.is_owned());
1815 assert!(!trie_deserialized.data.is_owned());
1816
1817 Ok(())
1818 }
1819
1820 #[test]
1821 fn test_typed() {
1822 let untyped = planes::get_planes_trie();
1823 assert_eq!(untyped.get('\u{10000}'), 1);
1824 let small_ref = <&SmallCodePointTrie<_>>::try_from(&untyped).unwrap();
1825 assert_eq!(small_ref.get('\u{10000}'), 1);
1826 let _ = <&FastCodePointTrie<_>>::try_from(&untyped).is_err();
1827 let small = <SmallCodePointTrie<_>>::try_from(untyped).unwrap();
1828 assert_eq!(small.get('\u{10000}'), 1);
1829 }
1830
1831 #[test]
1832 fn test_get_range() {
1833 let planes_trie = planes::get_planes_trie();
1834
1835 let first_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x0);
1836 assert_eq!(
1837 first_range,
1838 Some(CodePointMapRange {
1839 range: 0x0..=0xffff,
1840 value: 0
1841 })
1842 );
1843
1844 let second_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x1_0000);
1845 assert_eq!(
1846 second_range,
1847 Some(CodePointMapRange {
1848 range: 0x10000..=0x1ffff,
1849 value: 1
1850 })
1851 );
1852
1853 let penultimate_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0xf_0000);
1854 assert_eq!(
1855 penultimate_range,
1856 Some(CodePointMapRange {
1857 range: 0xf_0000..=0xf_ffff,
1858 value: 15
1859 })
1860 );
1861
1862 let last_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x10_0000);
1863 assert_eq!(
1864 last_range,
1865 Some(CodePointMapRange {
1866 range: 0x10_0000..=0x10_ffff,
1867 value: 16
1868 })
1869 );
1870 }
1871
1872 #[test]
1873 #[allow(unused_unsafe)] // `unsafe` below is both necessary and unnecessary
1874 fn databake() {
1875 databake::test_bake!(
1876 CodePointTrie<'static, u32>,
1877 const,
1878 unsafe {
1879 crate::codepointtrie::CodePointTrie::from_parts_unstable_unchecked_v1(
1880 crate::codepointtrie::CodePointTrieHeader {
1881 high_start: 1u32,
1882 shifted12_high_start: 2u16,
1883 index3_null_offset: 3u16,
1884 data_null_offset: 4u32,
1885 null_value: 5u32,
1886 trie_type: crate::codepointtrie::TrieType::Small,
1887 },
1888 zerovec::ZeroVec::new(),
1889 zerovec::ZeroVec::new(),
1890 0u32,
1891 )
1892 },
1893 icu_collections,
1894 [zerovec],
1895 );
1896 }
1897}