polars_arrow/bitmap/utils/
mod.rs

1//! General utilities for bitmaps representing items where LSB is the first item.
2mod chunk_iterator;
3mod chunks_exact_mut;
4mod fmt;
5mod iterator;
6mod slice_iterator;
7mod zip_validity;
8
9pub(crate) use chunk_iterator::merge_reversed;
10pub use chunk_iterator::{BitChunk, BitChunkIterExact, BitChunks, BitChunksExact};
11pub use chunks_exact_mut::BitChunksExactMut;
12pub use fmt::fmt;
13pub use iterator::BitmapIter;
14use polars_utils::slice::load_padded_le_u64;
15pub use slice_iterator::SlicesIterator;
16pub use zip_validity::{ZipValidity, ZipValidityIter};
17
18use crate::bitmap::aligned::AlignedBitmapSlice;
19
20/// Returns whether bit at position `i` in `byte` is set or not
21#[inline]
22pub fn is_set(byte: u8, i: usize) -> bool {
23    debug_assert!(i < 8);
24    byte & (1 << i) != 0
25}
26
27/// Sets bit at position `i` in `byte`.
28#[inline(always)]
29pub fn set_bit_in_byte(byte: u8, i: usize, value: bool) -> u8 {
30    debug_assert!(i < 8);
31    let mask = !(1 << i);
32    let insert = (value as u8) << i;
33    (byte & mask) | insert
34}
35
36/// Returns whether bit at position `i` in `bytes` is set or not.
37///
38/// # Safety
39/// `i >= bytes.len() * 8` results in undefined behavior.
40#[inline(always)]
41pub unsafe fn get_bit_unchecked(bytes: &[u8], i: usize) -> bool {
42    let byte = *bytes.get_unchecked(i / 8);
43    let bit = (byte >> (i % 8)) & 1;
44    bit != 0
45}
46
47/// Sets bit at position `i` in `bytes` without doing bound checks.
48/// # Safety
49/// `i >= bytes.len() * 8` results in undefined behavior.
50#[inline(always)]
51pub unsafe fn set_bit_unchecked(bytes: &mut [u8], i: usize, value: bool) {
52    let byte = bytes.get_unchecked_mut(i / 8);
53    *byte = set_bit_in_byte(*byte, i % 8, value);
54}
55
56/// Returns the number of bytes required to hold `bits` bits.
57#[inline]
58pub fn bytes_for(bits: usize) -> usize {
59    bits.saturating_add(7) / 8
60}
61
62/// Returns the number of zero bits in the slice offsetted by `offset` and a length of `length`.
63/// # Panics
64/// This function panics iff `offset + len > 8 * slice.len()``.
65pub fn count_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
66    if len == 0 {
67        return 0;
68    }
69
70    assert!(8 * slice.len() >= offset + len);
71
72    // Fast-path: fits in a single u64 load.
73    let first_byte_idx = offset / 8;
74    let offset_in_byte = offset % 8;
75    if offset_in_byte + len <= 64 {
76        let mut word = load_padded_le_u64(&slice[first_byte_idx..]);
77        word >>= offset_in_byte;
78        word <<= 64 - len;
79        return len - word.count_ones() as usize;
80    }
81
82    let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
83    let ones_in_prefix = aligned.prefix().count_ones() as usize;
84    let ones_in_bulk: usize = aligned.bulk_iter().map(|w| w.count_ones() as usize).sum();
85    let ones_in_suffix = aligned.suffix().count_ones() as usize;
86    len - ones_in_prefix - ones_in_bulk - ones_in_suffix
87}
88
89/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and
90/// a length of `length`.
91///
92/// # Panics
93/// This function panics iff `offset + len > 8 * slice.len()``.
94pub fn leading_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
95    if len == 0 {
96        return 0;
97    }
98
99    assert!(8 * slice.len() >= offset + len);
100
101    let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
102    let leading_zeros_in_prefix =
103        (aligned.prefix().trailing_zeros() as usize).min(aligned.prefix_bitlen());
104    if leading_zeros_in_prefix < aligned.prefix_bitlen() {
105        return leading_zeros_in_prefix;
106    }
107    if let Some(full_zero_bulk_words) = aligned.bulk_iter().position(|w| w != 0) {
108        return aligned.prefix_bitlen()
109            + full_zero_bulk_words * 64
110            + aligned.bulk()[full_zero_bulk_words].trailing_zeros() as usize;
111    }
112
113    aligned.prefix_bitlen()
114        + aligned.bulk_bitlen()
115        + (aligned.suffix().trailing_zeros() as usize).min(aligned.suffix_bitlen())
116}
117
118/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and
119/// a length of `length`.
120///
121/// # Panics
122/// This function panics iff `offset + len > 8 * slice.len()``.
123pub fn leading_ones(slice: &[u8], offset: usize, len: usize) -> usize {
124    if len == 0 {
125        return 0;
126    }
127
128    assert!(8 * slice.len() >= offset + len);
129
130    let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
131    let leading_ones_in_prefix = aligned.prefix().trailing_ones() as usize;
132    if leading_ones_in_prefix < aligned.prefix_bitlen() {
133        return leading_ones_in_prefix;
134    }
135    if let Some(full_one_bulk_words) = aligned.bulk_iter().position(|w| w != u64::MAX) {
136        return aligned.prefix_bitlen()
137            + full_one_bulk_words * 64
138            + aligned.bulk()[full_one_bulk_words].trailing_ones() as usize;
139    }
140
141    aligned.prefix_bitlen() + aligned.bulk_bitlen() + aligned.suffix().trailing_ones() as usize
142}
143
144/// Returns the number of zero bits before seeing a one bit in the slice offsetted by `offset` and
145/// a length of `length`.
146///
147/// # Panics
148/// This function panics iff `offset + len > 8 * slice.len()``.
149pub fn trailing_zeros(slice: &[u8], offset: usize, len: usize) -> usize {
150    if len == 0 {
151        return 0;
152    }
153
154    assert!(8 * slice.len() >= offset + len);
155
156    let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
157    let trailing_zeros_in_suffix = ((aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64))
158        .leading_zeros() as usize)
159        .min(aligned.suffix_bitlen());
160    if trailing_zeros_in_suffix < aligned.suffix_bitlen() {
161        return trailing_zeros_in_suffix;
162    }
163    if let Some(full_zero_bulk_words) = aligned.bulk_iter().rev().position(|w| w != 0) {
164        return aligned.suffix_bitlen()
165            + full_zero_bulk_words * 64
166            + aligned.bulk()[aligned.bulk().len() - full_zero_bulk_words - 1].leading_zeros()
167                as usize;
168    }
169
170    let trailing_zeros_in_prefix = ((aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64))
171        .leading_zeros() as usize)
172        .min(aligned.prefix_bitlen());
173    aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_zeros_in_prefix
174}
175
176/// Returns the number of one bits before seeing a zero bit in the slice offsetted by `offset` and
177/// a length of `length`.
178///
179/// # Panics
180/// This function panics iff `offset + len > 8 * slice.len()``.
181pub fn trailing_ones(slice: &[u8], offset: usize, len: usize) -> usize {
182    if len == 0 {
183        return 0;
184    }
185
186    assert!(8 * slice.len() >= offset + len);
187
188    let aligned = AlignedBitmapSlice::<u64>::new(slice, offset, len);
189    let trailing_ones_in_suffix =
190        (aligned.suffix() << ((64 - aligned.suffix_bitlen()) % 64)).leading_ones() as usize;
191    if trailing_ones_in_suffix < aligned.suffix_bitlen() {
192        return trailing_ones_in_suffix;
193    }
194    if let Some(full_one_bulk_words) = aligned.bulk_iter().rev().position(|w| w != u64::MAX) {
195        return aligned.suffix_bitlen()
196            + full_one_bulk_words * 64
197            + aligned.bulk()[aligned.bulk().len() - full_one_bulk_words - 1].leading_ones()
198                as usize;
199    }
200
201    let trailing_ones_in_prefix =
202        (aligned.prefix() << ((64 - aligned.prefix_bitlen()) % 64)).leading_ones() as usize;
203    aligned.suffix_bitlen() + aligned.bulk_bitlen() + trailing_ones_in_prefix
204}
205
206#[cfg(test)]
207mod tests {
208    use rand::Rng;
209
210    use super::*;
211    use crate::bitmap::Bitmap;
212
213    #[test]
214    fn leading_trailing() {
215        macro_rules! testcase {
216            ($slice:expr, $offset:expr, $length:expr => lz=$lz:expr,lo=$lo:expr,tz=$tz:expr,to=$to:expr) => {
217                assert_eq!(
218                    leading_zeros($slice, $offset, $length),
219                    $lz,
220                    "leading_zeros"
221                );
222                assert_eq!(leading_ones($slice, $offset, $length), $lo, "leading_ones");
223                assert_eq!(
224                    trailing_zeros($slice, $offset, $length),
225                    $tz,
226                    "trailing_zeros"
227                );
228                assert_eq!(
229                    trailing_ones($slice, $offset, $length),
230                    $to,
231                    "trailing_ones"
232                );
233            };
234        }
235
236        testcase!(&[], 0, 0 => lz=0,lo=0,tz=0,to=0);
237        testcase!(&[0], 0, 1 => lz=1,lo=0,tz=1,to=0);
238        testcase!(&[1], 0, 1 => lz=0,lo=1,tz=0,to=1);
239
240        testcase!(&[0b010], 0, 3 => lz=1,lo=0,tz=1,to=0);
241        testcase!(&[0b101], 0, 3 => lz=0,lo=1,tz=0,to=1);
242        testcase!(&[0b100], 0, 3 => lz=2,lo=0,tz=0,to=1);
243        testcase!(&[0b110], 0, 3 => lz=1,lo=0,tz=0,to=2);
244        testcase!(&[0b001], 0, 3 => lz=0,lo=1,tz=2,to=0);
245        testcase!(&[0b011], 0, 3 => lz=0,lo=2,tz=1,to=0);
246
247        testcase!(&[0b010], 1, 2 => lz=0,lo=1,tz=1,to=0);
248        testcase!(&[0b101], 1, 2 => lz=1,lo=0,tz=0,to=1);
249        testcase!(&[0b100], 1, 2 => lz=1,lo=0,tz=0,to=1);
250        testcase!(&[0b110], 1, 2 => lz=0,lo=2,tz=0,to=2);
251        testcase!(&[0b001], 1, 2 => lz=2,lo=0,tz=2,to=0);
252        testcase!(&[0b011], 1, 2 => lz=0,lo=1,tz=1,to=0);
253    }
254
255    #[ignore = "Fuzz test. Too slow"]
256    #[test]
257    fn leading_trailing_fuzz() {
258        let mut rng = rand::thread_rng();
259
260        const SIZE: usize = 1000;
261        const REPEATS: usize = 10_000;
262
263        let mut v = Vec::<bool>::with_capacity(SIZE);
264
265        for _ in 0..REPEATS {
266            v.clear();
267            let offset = rng.gen_range(0..SIZE);
268            let length = rng.gen_range(0..SIZE - offset);
269            let extra_padding = rng.gen_range(0..64);
270
271            let mut num_remaining = usize::min(SIZE, offset + length + extra_padding);
272            while num_remaining > 0 {
273                let chunk_size = rng.gen_range(1..=num_remaining);
274                v.extend(
275                    rng.clone()
276                        .sample_iter(rand::distributions::Slice::new(&[false, true]).unwrap())
277                        .take(chunk_size),
278                );
279                num_remaining -= chunk_size;
280            }
281
282            let v_slice = &v[offset..offset + length];
283            let lz = v_slice.iter().take_while(|&v| !*v).count();
284            let lo = v_slice.iter().take_while(|&v| *v).count();
285            let tz = v_slice.iter().rev().take_while(|&v| !*v).count();
286            let to = v_slice.iter().rev().take_while(|&v| *v).count();
287
288            let bm = Bitmap::from_iter(v.iter().copied());
289            let (slice, _, _) = bm.as_slice();
290
291            assert_eq!(leading_zeros(slice, offset, length), lz);
292            assert_eq!(leading_ones(slice, offset, length), lo);
293            assert_eq!(trailing_zeros(slice, offset, length), tz);
294            assert_eq!(trailing_ones(slice, offset, length), to);
295        }
296    }
297}