polars_arrow/bitmap/iterator.rs

use polars_utils::slice::load_padded_le_u64;

use super::bitmask::BitMask;
use super::Bitmap;
use crate::trusted_len::TrustedLen;

/// Calculates how many iterations are remaining, assuming:
///  - We have length elements left.
///  - We need max(consume, min_length_for_iter) elements to start a new iteration.
///  - On each iteration we consume the given amount of elements.
fn calc_iters_remaining(length: usize, min_length_for_iter: usize, consume: usize) -> usize {
    let min_length_for_iter = min_length_for_iter.max(consume);
    if length < min_length_for_iter {
        return 0;
    }

    let obvious_part = length - min_length_for_iter;
    let obvious_iters = obvious_part / consume;
    // let obvious_part_remaining = obvious_part % consume;
    // let total_remaining = min_length_for_iter + obvious_part_remaining;
    // assert!(total_remaining >= min_length_for_iter);          // We have at least 1 more iter.
    // assert!(obvious_part_remaining < consume);                // Basic modulo property.
    // assert!(total_remaining < min_length_for_iter + consume); // Add min_length_for_iter to both sides.
    // assert!(total_remaining - consume < min_length_for_iter); // Not enough remaining after 1 iter.
    1 + obvious_iters // Thus always exactly 1 more iter.
}
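// Worked example (added for illustration, not part of the upstream file):
// with 100 bits left, needing 64 bits to start an iteration and consuming
// 32 bits per iteration, we get 100 -> 68 -> 36 and then stop (36 < 64),
// i.e. exactly two iterations remain.
#[cfg(test)]
#[test]
fn calc_iters_remaining_worked_example() {
    assert_eq!(calc_iters_remaining(100, 64, 32), 2);
    assert_eq!(calc_iters_remaining(63, 64, 32), 0); // Not enough to start.
    assert_eq!(calc_iters_remaining(64, 64, 32), 1); // Exactly one iteration.
}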

#[derive(Clone)]
pub struct TrueIdxIter<'a> {
    mask: BitMask<'a>,
    first_unknown: usize,
    i: usize,
    len: usize,
    remaining: usize,
}

impl<'a> TrueIdxIter<'a> {
    #[inline]
    pub fn new(len: usize, validity: Option<&'a Bitmap>) -> Self {
        if let Some(bitmap) = validity {
            assert!(len == bitmap.len());
            Self {
                mask: BitMask::from_bitmap(bitmap),
                first_unknown: 0,
                i: 0,
                remaining: bitmap.len() - bitmap.unset_bits(),
                len,
            }
        } else {
            Self {
                mask: BitMask::default(),
                first_unknown: len,
                i: 0,
                remaining: len,
                len,
            }
        }
    }
}

impl Iterator for TrueIdxIter<'_> {
    type Item = usize;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Fast path for many non-nulls in a row.
        if self.i < self.first_unknown {
            let ret = self.i;
            self.i += 1;
            self.remaining -= 1;
            return Some(ret);
        }

        while self.i < self.len {
            let mask = self.mask.get_u32(self.i);
            let num_null = mask.trailing_zeros();
            self.i += num_null as usize;
            if num_null < 32 {
                self.first_unknown = self.i + (mask >> num_null).trailing_ones() as usize;
                let ret = self.i;
                self.i += 1;
                self.remaining -= 1;
                return Some(ret);
            }
        }

        None
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.remaining, Some(self.remaining))
    }
}

unsafe impl TrustedLen for TrueIdxIter<'_> {}
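// Illustrative sketch (added, not part of the upstream file): TrueIdxIter
// yields the indices of the set bits, and with no validity bitmap every index
// is yielded. Assumes `Bitmap` implements `FromIterator<bool>`, as in
// arrow2-style bitmaps.
#[cfg(test)]
#[test]
fn true_idx_iter_sketch() {
    let bitmap: Bitmap = [true, false, true, true, false].iter().copied().collect();
    let idxs: Vec<usize> = TrueIdxIter::new(bitmap.len(), Some(&bitmap)).collect();
    assert_eq!(idxs, vec![0, 2, 3]);

    let all: Vec<usize> = TrueIdxIter::new(3, None).collect();
    assert_eq!(all, vec![0, 1, 2]);
}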

pub struct FastU32BitmapIter<'a> {
    bytes: &'a [u8],
    shift: u32,
    bits_left: usize,
}

impl<'a> FastU32BitmapIter<'a> {
    pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
        assert!(bytes.len() * 8 >= offset + len);
        let shift = (offset % 8) as u32;
        let bytes = &bytes[offset / 8..];
        Self {
            bytes,
            shift,
            bits_left: len,
        }
    }

    // The iteration logic that would normally follow the fast-path.
    fn next_remainder(&mut self) -> Option<u32> {
        if self.bits_left > 0 {
            let word = load_padded_le_u64(self.bytes);
            let mask;
            if self.bits_left >= 32 {
                mask = u32::MAX;
                self.bits_left -= 32;
                self.bytes = unsafe { self.bytes.get_unchecked(4..) };
            } else {
                mask = (1 << self.bits_left) - 1;
                self.bits_left = 0;
            }

            return Some((word >> self.shift) as u32 & mask);
        }

        None
    }

    /// Returns the remainder bits and how many there are,
    /// assuming the iterator was fully consumed.
    pub fn remainder(mut self) -> (u64, usize) {
        let bits_left = self.bits_left;
        let lo = self.next_remainder().unwrap_or(0);
        let hi = self.next_remainder().unwrap_or(0);
        (((hi as u64) << 32) | (lo as u64), bits_left)
    }
}

impl Iterator for FastU32BitmapIter<'_> {
    type Item = u32;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Fast path, can load a whole u64.
        if self.bits_left >= 64 {
            let chunk;
            unsafe {
                // SAFETY: bits_left ensures this is in-bounds.
                chunk = self.bytes.get_unchecked(0..8);
                self.bytes = self.bytes.get_unchecked(4..);
            }
            self.bits_left -= 32;
            let word = u64::from_le_bytes(chunk.try_into().unwrap());
            return Some((word >> self.shift) as u32);
        }

        None
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let hint = calc_iters_remaining(self.bits_left, 64, 32);
        (hint, Some(hint))
    }
}

unsafe impl TrustedLen for FastU32BitmapIter<'_> {}
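// Illustrative sketch (added, not part of the upstream file): the fast path
// only runs while at least 64 bits remain, so a 70-bit window over an
// all-ones buffer yields one full 32-bit word from `next()` and the
// remaining 38 bits from `remainder()`.
#[cfg(test)]
#[test]
fn fast_u32_bitmap_iter_sketch() {
    let bytes = [0xFFu8; 16];
    let mut iter = FastU32BitmapIter::new(&bytes, 3, 70);
    assert_eq!(iter.next(), Some(u32::MAX));
    assert_eq!(iter.next(), None); // Fewer than 64 bits left.
    let (rem, rem_len) = iter.remainder();
    assert_eq!(rem_len, 38);
    assert_eq!(rem, (1u64 << 38) - 1);
}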

#[derive(Clone)]
pub struct FastU56BitmapIter<'a> {
    bytes: &'a [u8],
    shift: u32,
    bits_left: usize,
}

impl<'a> FastU56BitmapIter<'a> {
    pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
        assert!(bytes.len() * 8 >= offset + len);
        let shift = (offset % 8) as u32;
        let bytes = &bytes[offset / 8..];
        Self {
            bytes,
            shift,
            bits_left: len,
        }
    }

    // The iteration logic that would normally follow the fast-path.
    fn next_remainder(&mut self) -> Option<u64> {
        if self.bits_left > 0 {
            let word = load_padded_le_u64(self.bytes);
            let mask;
            if self.bits_left >= 56 {
                mask = (1 << 56) - 1;
                self.bits_left -= 56;
                self.bytes = unsafe { self.bytes.get_unchecked(7..) };
            } else {
                mask = (1 << self.bits_left) - 1;
                self.bits_left = 0;
            };

            return Some((word >> self.shift) & mask);
        }

        None
    }

    /// Returns the remainder bits and how many there are,
    /// assuming the iterator was fully consumed. Output is safe but
    /// not specified if the iterator wasn't fully consumed.
    pub fn remainder(mut self) -> (u64, usize) {
        let bits_left = self.bits_left;
        let lo = self.next_remainder().unwrap_or(0);
        let hi = self.next_remainder().unwrap_or(0);
        ((hi << 56) | lo, bits_left)
    }
}

impl Iterator for FastU56BitmapIter<'_> {
    type Item = u64;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Fast path, can load a whole u64.
        if self.bits_left >= 64 {
            let chunk;
            unsafe {
                // SAFETY: bits_left ensures this is in-bounds.
                chunk = self.bytes.get_unchecked(0..8);
                self.bytes = self.bytes.get_unchecked(7..);
                self.bits_left -= 56;
            }

            let word = u64::from_le_bytes(chunk.try_into().unwrap());
            let mask = (1 << 56) - 1;
            return Some((word >> self.shift) & mask);
        }

        None
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let hint = calc_iters_remaining(self.bits_left, 64, 56);
        (hint, Some(hint))
    }
}

unsafe impl TrustedLen for FastU56BitmapIter<'_> {}
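// Illustrative sketch (added, not part of the upstream file): 64 bits over an
// all-ones buffer yield one 56-bit word from `next()`; the last 8 bits only
// come out via `remainder()`.
#[cfg(test)]
#[test]
fn fast_u56_bitmap_iter_sketch() {
    let bytes = [0xFFu8; 16];
    let mut iter = FastU56BitmapIter::new(&bytes, 0, 64);
    assert_eq!(iter.next(), Some((1 << 56) - 1));
    assert_eq!(iter.next(), None); // Fewer than 64 bits left.
    let (rem, rem_len) = iter.remainder();
    assert_eq!(rem_len, 8);
    assert_eq!(rem, 0xFF);
}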

pub struct FastU64BitmapIter<'a> {
    bytes: &'a [u8],
    shift: u32,
    bits_left: usize,
    next_word: u64,
}

impl<'a> FastU64BitmapIter<'a> {
    pub fn new(bytes: &'a [u8], offset: usize, len: usize) -> Self {
        assert!(bytes.len() * 8 >= offset + len);
        let shift = (offset % 8) as u32;
        let bytes = &bytes[offset / 8..];
        let next_word = load_padded_le_u64(bytes);
        let bytes = bytes.get(8..).unwrap_or(&[]);
        Self {
            bytes,
            shift,
            bits_left: len,
            next_word,
        }
    }

    #[inline]
    fn combine(&self, lo: u64, hi: u64) -> u64 {
        // Compiles to 128-bit SHRD instruction on x86-64.
        // Yes, the % 64 is important for the compiler to generate optimal code.
        let wide = ((hi as u128) << 64) | lo as u128;
        (wide >> (self.shift % 64)) as u64
    }

    // The iteration logic that would normally follow the fast-path.
    fn next_remainder(&mut self) -> Option<u64> {
        if self.bits_left > 0 {
            let lo = self.next_word;
            let hi = load_padded_le_u64(self.bytes);
            let mask;
            if self.bits_left >= 64 {
                mask = u64::MAX;
                self.bits_left -= 64;
                self.bytes = self.bytes.get(8..).unwrap_or(&[]);
            } else {
                mask = (1 << self.bits_left) - 1;
                self.bits_left = 0;
            };
            self.next_word = hi;

            return Some(self.combine(lo, hi) & mask);
        }

        None
    }

    /// Returns the remainder bits and how many there are,
    /// assuming the iterator was fully consumed. Output is safe but
    /// not specified if the iterator wasn't fully consumed.
    pub fn remainder(mut self) -> ([u64; 2], usize) {
        let bits_left = self.bits_left;
        let lo = self.next_remainder().unwrap_or(0);
        let hi = self.next_remainder().unwrap_or(0);
        ([lo, hi], bits_left)
    }
}

impl Iterator for FastU64BitmapIter<'_> {
    type Item = u64;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Fast path: can load two u64s in a row.
        // (Note that we already loaded one in the form of self.next_word).
        if self.bits_left >= 128 {
            let chunk;
            unsafe {
                // SAFETY: bits_left ensures this is in-bounds.
                chunk = self.bytes.get_unchecked(0..8);
                self.bytes = self.bytes.get_unchecked(8..);
            }
            let lo = self.next_word;
            let hi = u64::from_le_bytes(chunk.try_into().unwrap());
            self.next_word = hi;
            self.bits_left -= 64;

            return Some(self.combine(lo, hi));
        }

        None
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let hint = calc_iters_remaining(self.bits_left, 128, 64);
        (hint, Some(hint))
    }
}

unsafe impl TrustedLen for FastU64BitmapIter<'_> {}
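// Illustrative sketch (added, not part of the upstream file): the fast path
// needs at least 128 bits (one buffered word plus one more full load), so a
// 70-bit window yields nothing from `next()` and everything from
// `remainder()`: a full low word plus 6 high bits.
#[cfg(test)]
#[test]
fn fast_u64_bitmap_iter_sketch() {
    let bytes = [0xFFu8; 16];
    let mut iter = FastU64BitmapIter::new(&bytes, 0, 70);
    assert_eq!(iter.size_hint(), (0, Some(0)));
    assert_eq!(iter.next(), None);
    let ([lo, hi], rem_len) = iter.remainder();
    assert_eq!(rem_len, 70);
    assert_eq!(lo, u64::MAX);
    assert_eq!(hi, (1 << 6) - 1);
}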

/// This crate's equivalent of [`std::vec::IntoIter`] for [`Bitmap`].
#[derive(Debug, Clone)]
pub struct IntoIter {
    values: Bitmap,
    index: usize,
    end: usize,
}

impl IntoIter {
    /// Creates a new [`IntoIter`] from a [`Bitmap`].
    #[inline]
    pub fn new(values: Bitmap) -> Self {
        let end = values.len();
        Self {
            values,
            index: 0,
            end,
        }
    }
}

impl Iterator for IntoIter {
    type Item = bool;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.index == self.end {
            return None;
        }
        let old = self.index;
        self.index += 1;
        Some(unsafe { self.values.get_bit_unchecked(old) })
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.end - self.index, Some(self.end - self.index))
    }

    #[inline]
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        let new_index = self.index + n;
        if new_index > self.end {
            self.index = self.end;
            None
        } else {
            self.index = new_index;
            self.next()
        }
    }
}

impl DoubleEndedIterator for IntoIter {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.index == self.end {
            None
        } else {
            self.end -= 1;
            Some(unsafe { self.values.get_bit_unchecked(self.end) })
        }
    }
}

unsafe impl TrustedLen for IntoIter {}
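// Illustrative sketch (added, not part of the upstream file): IntoIter walks
// the bitmap bit by bit and also supports iteration from the back. Assumes
// `Bitmap` implements `FromIterator<bool>`, as in arrow2-style bitmaps.
#[cfg(test)]
#[test]
fn bitmap_into_iter_sketch() {
    let bitmap: Bitmap = [true, false, true].iter().copied().collect();
    let mut iter = IntoIter::new(bitmap);
    assert_eq!(iter.next(), Some(true));
    assert_eq!(iter.next_back(), Some(true));
    assert_eq!(iter.next(), Some(false));
    assert_eq!(iter.next(), None);
}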