unicode_truncate/
lib.rs

1// Copyright 2019 Aetf <aetf at unlimitedcodeworks dot xyz>.
2// See the COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10#![forbid(missing_docs, unsafe_code)]
11#![warn(clippy::arithmetic_side_effects)]
12#![cfg_attr(not(feature = "std"), no_std)]
13
14//! Unicode-aware algorithm to pad or truncate `str` in terms of displayed width.
15//!
16//! See the [`UnicodeTruncateStr`](crate::UnicodeTruncateStr) trait for new methods available on
17//! `str`.
18//!
19//! # Examples
//! Safely truncate a string to a display width, even when the cut does not fall on a character
//! boundary.
21//! ```rust
22//! use unicode_truncate::UnicodeTruncateStr;
23//! assert_eq!("你好吗".unicode_truncate(5), ("你好", 4));
24//! ```
25#![cfg_attr(
26    feature = "std",
27    doc = r##"
Make sure the string is displayed in exactly the requested number of columns by
combining padding and truncating.
30
31```rust
32use unicode_truncate::UnicodeTruncateStr;
33use unicode_truncate::Alignment;
34use unicode_width::UnicodeWidthStr;
35
36let str = "你好吗".unicode_pad(5, Alignment::Left, true);
37assert_eq!(str, "你好 ");
38assert_eq!(str.width(), 5);
39```
40"##
41)]
42
43use itertools::{merge_join_by, Either};
44use unicode_segmentation::UnicodeSegmentation;
45use unicode_width::UnicodeWidthStr;
46
/// Selects which side of a string is kept when truncating and which side receives padding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Alignment {
    /// Content sits at the left edge.
    Left,
    /// Content sits in the middle.
    Center,
    /// Content sits at the right edge.
    Right,
}
57
58/// Methods for padding or truncating using displayed width of Unicode strings.
59pub trait UnicodeTruncateStr {
60    /// Truncates a string to be at most `width` in terms of display width by removing the end
61    /// characters.
62    ///
63    /// For wide characters, it may not always be possible to truncate at exact width. In this case,
64    /// the longest possible string is returned. To help the caller determine the situation, the
65    /// display width of the returned string slice is also returned.
66    ///
67    /// Zero-width characters decided by [`unicode_width`] are always included when deciding the
68    /// truncation point.
69    ///
70    /// # Arguments
71    /// * `max_width` - the maximum display width
72    fn unicode_truncate(&self, max_width: usize) -> (&str, usize);
73
74    /// Truncates a string to be at most `width` in terms of display width by removing the start
75    /// characters.
76    ///
77    /// For wide characters, it may not always be possible to truncate at exact width. In this case,
78    /// the longest possible string is returned. To help the caller determine the situation, the
79    /// display width of the returned string slice is also returned.
80    ///
81    /// Zero-width characters decided by [`unicode_width`] are always removed when deciding the
82    /// truncation point.
83    ///
84    /// # Arguments
85    /// * `max_width` - the maximum display width
86    fn unicode_truncate_start(&self, max_width: usize) -> (&str, usize);
87
88    /// Truncates a string to be at most `width` in terms of display width by removing
89    /// characters at both start and end.
90    ///
91    /// For wide characters, it may not always be possible to truncate at exact width. In this case,
92    /// the longest possible string is returned. To help the caller determine the situation, the
93    /// display width of the returned string slice is also returned.
94    ///
95    /// Zero-width characters decided by [`unicode_width`] are included if they are at end, or
96    /// removed if they are at the beginning when deciding the truncation point.
97    ///
98    /// # Arguments
99    /// * `max_width` - the maximum display width
100    fn unicode_truncate_centered(&self, max_width: usize) -> (&str, usize);
101
102    /// Truncates a string to be at most `width` in terms of display width by removing
103    /// characters.
104    ///
105    /// Depending on the alignment characters are removed. When left aligned characters from the end
106    /// are removed. When right aligned characters from the start are removed. When centered
107    /// characters from both sides are removed.
108    ///
109    /// For wide characters, it may not always be possible to truncate at exact width. In this case,
110    /// the longest possible string is returned. To help the caller determine the situation, the
111    /// display width of the returned string slice is also returned.
112    ///
113    /// Zero-width characters decided by [`unicode_width`] are included if they are at end, or
114    /// removed if they are at the beginning when deciding the truncation point.
115    ///
116    /// # Arguments
117    /// * `max_width` - the maximum display width
118    /// * `align` - alignment for truncation
119    #[inline]
120    fn unicode_truncate_aligned(&self, max_width: usize, align: Alignment) -> (&str, usize) {
121        match align {
122            Alignment::Left => self.unicode_truncate(max_width),
123            Alignment::Center => self.unicode_truncate_centered(max_width),
124            Alignment::Right => self.unicode_truncate_start(max_width),
125        }
126    }
127
128    /// Pads a string to be `width` in terms of display width. Only available when the `std` feature
129    /// of this library is activated, and it is activated by default.
130    ///
131    /// When `truncate` is true, the string is truncated to `width` if necessary. In case of wide
132    /// characters and truncation point not at character boundary, the longest possible string is
133    /// used, and padded to exact `width` according to `align`.
134    /// See [`unicode_truncate`](crate::UnicodeTruncateStr::unicode_truncate) for the behavior of
135    /// truncation.
136    ///
137    /// # Arguments
138    /// * `target_width` - the display width to pad to
139    /// * `align` - alignment for truncation and padding
140    /// * `truncate` - whether to truncate string if necessary
141    #[cfg(feature = "std")]
142    fn unicode_pad(
143        &self,
144        target_width: usize,
145        align: Alignment,
146        truncate: bool,
147    ) -> std::borrow::Cow<'_, str>;
148}
149
150impl UnicodeTruncateStr for str {
151    #[inline]
152    fn unicode_truncate(&self, max_width: usize) -> (&str, usize) {
153        let (byte_index, new_width) = self
154            .grapheme_indices(true)
155            // map to byte index and the width of grapheme at the index
156            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
157            // chain a final element representing the position past the last char
158            .chain(core::iter::once((self.len(), 0)))
159            // fold to byte index and the width up to the index
160            .scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
161                // byte_index is the start while the grapheme_width is at the end. Current width is
162                // the sum until now while the next byte_index is including the current
163                // grapheme_width.
164                let current_width = *sum;
165                *sum = sum.checked_add(grapheme_width)?;
166                Some((byte_index, current_width))
167            })
168            // take the longest but still shorter than requested
169            .take_while(|&(_, current_width)| current_width <= max_width)
170            .last()
171            .unwrap_or((0, 0));
172
173        // unwrap is safe as the index comes from grapheme_indices
174        let result = self.get(..byte_index).unwrap();
175        debug_assert_eq!(result.width(), new_width);
176        (result, new_width)
177    }
178
179    #[inline]
180    fn unicode_truncate_start(&self, max_width: usize) -> (&str, usize) {
181        let (byte_index, new_width) = self
182            .grapheme_indices(true)
183            // instead of start checking from the start do so from the end
184            .rev()
185            // map to byte index and the width of grapheme start at the index
186            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
187            // fold to byte index and the width from end to the index
188            .scan(0, |sum: &mut usize, (byte_index, grapheme_width)| {
189                *sum = sum.checked_add(grapheme_width)?;
190                Some((byte_index, *sum))
191            })
192            .take_while(|&(_, current_width)| current_width <= max_width)
193            .last()
194            .unwrap_or((self.len(), 0));
195
196        // unwrap is safe as the index comes from grapheme_indices
197        let result = self.get(byte_index..).unwrap();
198        debug_assert_eq!(result.width(), new_width);
199        (result, new_width)
200    }
201
202    #[inline]
203    fn unicode_truncate_centered(&self, max_width: usize) -> (&str, usize) {
204        if max_width == 0 {
205            return ("", 0);
206        }
207
208        let original_width = self.width();
209        if original_width <= max_width {
210            return (self, original_width);
211        }
212
213        // We need to remove at least this much
214        // unwrap is safe as original_width > max_width
215        let min_removal_width = original_width.checked_sub(max_width).unwrap();
216
217        // Around the half to improve performance. In order to ensure the center grapheme stays
218        // remove its max possible length. This assumes a grapheme width is always <= 10 (4 people
219        // family emoji has width 8). This might end up not perfect on graphemes wider than this but
220        // performance is more important here.
221        let less_than_half = min_removal_width.saturating_sub(10) / 2;
222
223        let from_start = self
224            .grapheme_indices(true)
225            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
226            // fold to byte index and the width from start to the index (not including the current
227            // grapheme width)
228            .scan(
229                (0usize, 0usize),
230                |(sum, prev_width), (byte_index, grapheme_width)| {
231                    *sum = sum.checked_add(*prev_width)?;
232                    *prev_width = grapheme_width;
233                    Some((byte_index, *sum))
234                },
235            )
236            // fast forward to around the half
237            .skip_while(|&(_, removed)| removed < less_than_half);
238
239        let from_end = self
240            .grapheme_indices(true)
241            .map(|(byte_index, grapheme)| (byte_index, grapheme.width()))
242            .rev()
243            // fold to byte index and the width from end to the index (including the current
244            // grapheme width)
245            .scan(0usize, |sum, (byte_index, grapheme_width)| {
246                *sum = sum.checked_add(grapheme_width)?;
247                Some((byte_index, *sum))
248            })
249            // fast forward to around the half
250            .skip_while(|&(_, removed)| removed < less_than_half);
251
252        let (start_index, end_index, removed_width) = merge_join_by(
253            from_start,
254            from_end,
255            // taking from either left or right iter depending on which side has less removed width
256            |&(_, start_removed), &(_, end_removed)| start_removed < end_removed,
257        )
258        // remember the last left or right and combine them to one sequence of operations
259        .scan(
260            (0usize, 0usize, 0usize, 0usize),
261            |(start_removed, end_removed, start_index, end_index), position| {
262                match position {
263                    Either::Left((idx, removed)) => {
264                        *start_index = idx;
265                        *start_removed = removed;
266                    }
267                    Either::Right((idx, removed)) => {
268                        *end_index = idx;
269                        *end_removed = removed;
270                    }
271                }
272                // unwrap is safe as total length was also <= usize::MAX
273                let total_removed = start_removed.checked_add(*end_removed).unwrap();
274                Some((*start_index, *end_index, total_removed))
275            },
276        )
277        .find(|&(_, _, removed)| removed >= min_removal_width)
278        // should not happen as the removed width is not larger than the original width
279        // but a sane default is to remove everything (i.e. min_removal_width too large)
280        .unwrap_or((0, 0, original_width));
281
282        // unwrap is safe as the index comes from grapheme_indices
283        let result = self.get(start_index..end_index).unwrap();
284        // unwrap is safe as removed is always smaller than total width
285        let result_width = original_width.checked_sub(removed_width).unwrap();
286        debug_assert_eq!(result.width(), result_width);
287        (result, result_width)
288    }
289
290    #[cfg(feature = "std")]
291    #[inline]
292    fn unicode_pad(
293        &self,
294        target_width: usize,
295        align: Alignment,
296        truncate: bool,
297    ) -> std::borrow::Cow<'_, str> {
298        use std::borrow::Cow;
299
300        if !truncate && self.width() >= target_width {
301            return Cow::Borrowed(self);
302        }
303
304        let (truncated, columns) = self.unicode_truncate(target_width);
305        if columns == target_width {
306            return Cow::Borrowed(truncated);
307        }
308
309        // the string is less than width, or truncated to less than width
310        let diff = target_width.saturating_sub(columns);
311        let (left_pad, right_pad) = match align {
312            Alignment::Left => (0, diff),
313            Alignment::Right => (diff, 0),
314            Alignment::Center => (diff / 2, diff.saturating_sub(diff / 2)),
315        };
316        debug_assert_eq!(diff, left_pad.saturating_add(right_pad));
317
318        let new_len = truncated
319            .len()
320            .checked_add(diff)
321            .expect("Padded result should fit in a new String");
322        let mut result = String::with_capacity(new_len);
323        for _ in 0..left_pad {
324            result.push(' ');
325        }
326        result += truncated;
327        for _ in 0..right_pad {
328            result.push(' ');
329        }
330        Cow::Owned(result)
331    }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    // The "family" inputs below use the man-woman-girl-boy emoji. It is spelled with explicit
    // escapes so the zero-width joiners (U+200D) that make it a single grapheme stay visible.

    /// Truncation that removes characters from the end.
    mod truncate_end {
        use super::*;

        #[test]
        fn empty() {
            assert_eq!("".unicode_truncate(4), ("", 0));
        }

        #[test]
        fn zero_width() {
            assert_eq!("ab".unicode_truncate(0), ("", 0));
            assert_eq!("你好".unicode_truncate(0), ("", 0));
        }

        #[test]
        fn less_than_limit() {
            assert_eq!("abc".unicode_truncate(4), ("abc", 3));
            assert_eq!("你".unicode_truncate(4), ("你", 2));
        }

        #[test]
        fn at_boundary() {
            assert_eq!("boundary".unicode_truncate(5), ("bound", 5));
            assert_eq!("你好吗".unicode_truncate(4), ("你好", 4));
        }

        #[test]
        fn not_boundary() {
            assert_eq!("你好吗".unicode_truncate(3), ("你", 2));
            assert_eq!("你好吗".unicode_truncate(1), ("", 0));
        }

        #[test]
        fn zero_width_char_in_middle() {
            // a zero width character in the middle stays intact
            assert_eq!("y\u{0306}es".unicode_truncate(2), ("y\u{0306}e", 2));
        }

        #[test]
        fn keep_zero_width_char_at_boundary() {
            // a zero width character at the end is preserved
            assert_eq!(
                "y\u{0306}ey\u{0306}s".unicode_truncate(3),
                ("y\u{0306}ey\u{0306}", 3)
            );
        }

        #[test]
        fn control_char() {
            use unicode_width::UnicodeWidthChar;
            // the `str` width and the `char` width of a control character differ
            assert_eq!("\u{0019}".width(), 1);
            assert_eq!('\u{0019}'.width(), None);
            assert_eq!("\u{0019}".unicode_truncate(2), ("\u{0019}", 1));
        }

        #[test]
        fn family_stays_together() {
            let input = "123\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}456";
            assert_eq!(input.unicode_truncate(4), ("123", 3));
            assert_eq!(input.unicode_truncate(8), ("123", 3));
            assert_eq!(
                input.unicode_truncate(12),
                (
                    "123\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}4",
                    12
                )
            );
            assert_eq!(input.unicode_truncate(20), (input, 14));
        }
    }

    /// Truncation that removes characters from the start.
    mod truncate_start {
        use super::*;

        #[test]
        fn empty() {
            assert_eq!("".unicode_truncate_start(4), ("", 0));
        }

        #[test]
        fn zero_width() {
            assert_eq!("ab".unicode_truncate_start(0), ("", 0));
            assert_eq!("你好".unicode_truncate_start(0), ("", 0));
        }

        #[test]
        fn less_than_limit() {
            assert_eq!("abc".unicode_truncate_start(4), ("abc", 3));
            assert_eq!("你".unicode_truncate_start(4), ("你", 2));
        }

        #[test]
        fn at_boundary() {
            assert_eq!("boundary".unicode_truncate_start(5), ("ndary", 5));
            assert_eq!("你好吗".unicode_truncate_start(4), ("好吗", 4));
        }

        #[test]
        fn not_boundary() {
            assert_eq!("你好吗".unicode_truncate_start(3), ("吗", 2));
            assert_eq!("你好吗".unicode_truncate_start(1), ("", 0));
        }

        #[test]
        fn zero_width_char_in_middle() {
            // a zero width character in the middle is preserved
            assert_eq!(
                "y\u{0306}ey\u{0306}s".unicode_truncate_start(2),
                ("y\u{0306}s", 2)
            );
        }

        #[test]
        fn remove_zero_width_char_at_boundary() {
            // a zero width character in the middle at the cutting boundary is removed
            assert_eq!("y\u{0306}es".unicode_truncate_start(2), ("es", 2));
        }

        #[test]
        fn family_stays_together() {
            let input = "123\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}456";
            assert_eq!(input.unicode_truncate_start(4), ("456", 3));
            assert_eq!(input.unicode_truncate_start(8), ("456", 3));
            assert_eq!(
                input.unicode_truncate_start(12),
                (
                    "3\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}456",
                    12
                )
            );
            assert_eq!(input.unicode_truncate_start(20), (input, 14));
        }
    }

    /// Truncation that removes characters from both sides.
    mod truncate_centered {
        use super::*;

        #[test]
        fn empty() {
            assert_eq!("".unicode_truncate_centered(4), ("", 0));
        }

        #[test]
        fn zero_width() {
            assert_eq!("ab".unicode_truncate_centered(0), ("", 0));
            assert_eq!("你好".unicode_truncate_centered(0), ("", 0));
        }

        #[test]
        fn less_than_limit() {
            assert_eq!("abc".unicode_truncate_centered(4), ("abc", 3));
            assert_eq!("你".unicode_truncate_centered(4), ("你", 2));
        }

        /// The source code has special handling for small `min_removal_width` (half-point)
        #[test]
        fn truncate_exactly_one() {
            assert_eq!("abcd".unicode_truncate_centered(3), ("abc", 3));
        }

        #[test]
        fn at_boundary() {
            assert_eq!(
                "boundaryboundary".unicode_truncate_centered(5),
                ("arybo", 5)
            );
            assert_eq!(
                "你好吗你好吗你好吗".unicode_truncate_centered(4),
                ("你好", 4)
            );
        }

        #[test]
        fn not_boundary() {
            assert_eq!("你好吗你好吗".unicode_truncate_centered(3), ("吗", 2));
            assert_eq!("你好吗你好吗".unicode_truncate_centered(1), ("", 0));
        }

        #[test]
        fn zero_width_char_in_middle() {
            // a zero width character in the middle is preserved
            assert_eq!(
                "yy\u{0306}es".unicode_truncate_centered(2),
                ("y\u{0306}e", 2)
            );
        }

        #[test]
        fn zero_width_char_at_boundary() {
            // zero width characters at the cutting boundary are removed at the start
            // but kept at the end.
            assert_eq!(
                "y\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}"
                    .unicode_truncate_centered(2),
                ("b\u{0306}y\u{0306}", 2)
            );
            assert_eq!(
                "ay\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}"
                    .unicode_truncate_centered(2),
                ("a\u{0306}b\u{0306}", 2)
            );
            assert_eq!(
                "y\u{0306}ea\u{0306}b\u{0306}y\u{0306}ea\u{0306}b\u{0306}a"
                    .unicode_truncate_centered(2),
                ("b\u{0306}y\u{0306}", 2)
            );
        }

        #[test]
        fn family_stays_together() {
            let input = "123\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}456";
            assert_eq!(input.unicode_truncate_centered(4), ("", 0));
            assert_eq!(
                input.unicode_truncate_centered(8),
                (
                    "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}",
                    8
                )
            );
            assert_eq!(
                input.unicode_truncate_centered(12),
                (
                    "23\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}45",
                    12
                )
            );
            assert_eq!(input.unicode_truncate_centered(20), (input, 14));
        }
    }

    #[test]
    fn truncate_aligned() {
        assert_eq!("abc".unicode_truncate_aligned(1, Alignment::Left), ("a", 1));
        assert_eq!(
            "abc".unicode_truncate_aligned(1, Alignment::Center),
            ("b", 1)
        );
        assert_eq!(
            "abc".unicode_truncate_aligned(1, Alignment::Right),
            ("c", 1)
        );
    }

    /// Padding, with and without truncation.
    #[cfg(feature = "std")]
    mod pad {
        use super::*;

        #[test]
        fn zero_width() {
            assert_eq!("你好".unicode_pad(0, Alignment::Left, true), "");
            assert_eq!("你好".unicode_pad(0, Alignment::Left, false), "你好");
        }

        #[test]
        fn less_than_limit() {
            assert_eq!("你".unicode_pad(4, Alignment::Left, true), "你  ");
            assert_eq!("你".unicode_pad(4, Alignment::Left, false), "你  ");
        }

        #[test]
        fn width_at_boundary() {
            assert_eq!("你好吗".unicode_pad(4, Alignment::Left, true), "你好");
            assert_eq!("你好吗".unicode_pad(4, Alignment::Left, false), "你好吗");
        }

        #[test]
        fn width_not_boundary() {
            // above limit, wide chars not at boundary
            assert_eq!("你好吗".unicode_pad(3, Alignment::Left, true), "你 ");
            assert_eq!("你好吗".unicode_pad(1, Alignment::Left, true), " ");
            assert_eq!("你好吗".unicode_pad(3, Alignment::Left, false), "你好吗");

            // truncation always happens at the end; `align` only moves the padding
            assert_eq!("你好吗".unicode_pad(3, Alignment::Center, true), "你 ");

            assert_eq!("你好吗".unicode_pad(3, Alignment::Right, true), " 你");
        }
    }
}
unicode_truncate/lib.rs

unicode_truncate/
lib.rs