polars_arrow/bitmap/
builder.rs

1use polars_utils::slice::load_padded_le_u64;
2
3use crate::bitmap::{Bitmap, MutableBitmap};
4use crate::storage::SharedStorage;
5use crate::trusted_len::TrustedLen;
6
7/// Used to build bitmaps bool-by-bool in sequential order.
8#[derive(Default, Clone)]
9pub struct BitmapBuilder {
10    buf: u64,                 // A buffer containing the last self.bit_len % 64 bits.
11    bit_len: usize,           // Length in bits.
12    bit_cap: usize,           // Capacity in bits (always multiple of 64).
13    set_bits_in_bytes: usize, // The number of bits set in self.bytes, not including self.buf.
14    bytes: Vec<u8>,
15}
16
17impl BitmapBuilder {
18    pub fn new() -> Self {
19        Self::default()
20    }
21
22    #[inline(always)]
23    pub fn len(&self) -> usize {
24        self.bit_len
25    }
26
27    #[inline(always)]
28    pub fn is_empty(&self) -> bool {
29        self.bit_len == 0
30    }
31
32    #[inline(always)]
33    pub fn capacity(&self) -> usize {
34        self.bit_cap
35    }
36
37    #[inline(always)]
38    pub fn set_bits(&self) -> usize {
39        self.set_bits_in_bytes + self.buf.count_ones() as usize
40    }
41
42    #[inline(always)]
43    pub fn unset_bits(&self) -> usize {
44        self.bit_len - self.set_bits()
45    }
46
47    pub fn with_capacity(bits: usize) -> Self {
48        let bytes = Vec::with_capacity(bits.div_ceil(64) * 8);
49        let words_available = bytes.capacity() / 8;
50        Self {
51            buf: 0,
52            bit_len: 0,
53            bit_cap: words_available * 64,
54            set_bits_in_bytes: 0,
55            bytes,
56        }
57    }
58
59    #[inline(always)]
60    pub fn reserve(&mut self, additional: usize) {
61        if self.bit_len + additional > self.bit_cap {
62            self.reserve_slow(additional)
63        }
64    }
65
66    #[cold]
67    #[inline(never)]
68    fn reserve_slow(&mut self, additional: usize) {
69        let bytes_needed = (self.bit_len + additional).div_ceil(64) * 8;
70        self.bytes.reserve(bytes_needed - self.bytes.len());
71        let words_available = self.bytes.capacity() / 8;
72        self.bit_cap = words_available * 64;
73    }
74
75    #[inline(always)]
76    pub fn push(&mut self, x: bool) {
77        self.reserve(1);
78        unsafe { self.push_unchecked(x) }
79    }
80
81    /// Does not update len/set_bits, simply writes to the output buffer.
82    /// # Safety
83    /// self.bytes.len() + 8 <= self.bytes.capacity() must hold.
84    #[inline(always)]
85    unsafe fn flush_word_unchecked(&mut self, w: u64) {
86        let cur_len = self.bytes.len();
87        let p = self.bytes.as_mut_ptr().add(cur_len).cast::<u64>();
88        p.write_unaligned(w.to_le());
89        self.bytes.set_len(cur_len + 8);
90    }
91
92    /// # Safety
93    /// self.len() < self.capacity() must hold.
94    #[inline(always)]
95    pub unsafe fn push_unchecked(&mut self, x: bool) {
96        debug_assert!(self.bit_len < self.bit_cap);
97        self.buf |= (x as u64) << (self.bit_len % 64);
98        self.bit_len += 1;
99        if self.bit_len % 64 == 0 {
100            self.flush_word_unchecked(self.buf);
101            self.set_bits_in_bytes += self.buf.count_ones() as usize;
102            self.buf = 0;
103        }
104    }
105
106    #[inline(always)]
107    pub fn extend_constant(&mut self, length: usize, value: bool) {
108        // Fast path if the extension still fits in buf with room left to spare.
109        let bits_in_buf = self.bit_len % 64;
110        if bits_in_buf + length < 64 {
111            let bit_block = ((value as u64) << length) - (value as u64);
112            self.buf |= bit_block << bits_in_buf;
113            self.bit_len += length;
114        } else {
115            self.extend_constant_slow(length, value);
116        }
117    }
118
119    #[cold]
120    fn extend_constant_slow(&mut self, length: usize, value: bool) {
121        unsafe {
122            let value_spread = if value { u64::MAX } else { 0 }; // Branchless neg.
123
124            // Extend and flush current buf.
125            self.reserve(length);
126            let bits_in_buf = self.bit_len % 64;
127            let ext_buf = self.buf | (value_spread << bits_in_buf);
128            self.flush_word_unchecked(ext_buf);
129            self.set_bits_in_bytes += ext_buf.count_ones() as usize;
130
131            // Write complete words.
132            let remaining_bits = length - (64 - bits_in_buf);
133            let remaining_words = remaining_bits / 64;
134            for _ in 0..remaining_words {
135                self.flush_word_unchecked(value_spread);
136            }
137            self.set_bits_in_bytes += (remaining_words * 64) & value_spread as usize;
138
139            // Put remainder in buf and update length.
140            self.buf = ((value as u64) << (remaining_bits % 64)) - (value as u64);
141            self.bit_len += length;
142        }
143    }
144
145    /// Pushes the first length bits from the given word, assuming the rest of
146    /// the bits are zero.
147    /// # Safety
148    /// self.len + length <= self.cap and length <= 64 must hold.
149    pub unsafe fn push_word_with_len_unchecked(&mut self, word: u64, length: usize) {
150        debug_assert!(self.bit_len + length <= self.bit_cap);
151        debug_assert!(length <= 64);
152        debug_assert!(length == 64 || (word >> length) == 0);
153        let bits_in_buf = self.bit_len % 64;
154        self.buf |= word << bits_in_buf;
155        if bits_in_buf + length >= 64 {
156            self.flush_word_unchecked(self.buf);
157            self.set_bits_in_bytes += self.buf.count_ones() as usize;
158            self.buf = if bits_in_buf > 0 {
159                word >> (64 - bits_in_buf)
160            } else {
161                0
162            };
163        }
164        self.bit_len += length;
165    }
166
167    /// # Safety
168    /// self.len() + length <= self.capacity() must hold, as well as
169    /// offset + length <= 8 * slice.len().
170    unsafe fn extend_from_slice_unchecked(
171        &mut self,
172        mut slice: &[u8],
173        mut offset: usize,
174        mut length: usize,
175    ) {
176        if length == 0 {
177            return;
178        }
179
180        // Deal with slice offset so it's aligned to bytes.
181        let slice_bit_offset = offset % 8;
182        if slice_bit_offset > 0 {
183            let bits_in_first_byte = (8 - slice_bit_offset).min(length);
184            let first_byte = *slice.get_unchecked(offset / 8) >> slice_bit_offset;
185            self.push_word_with_len_unchecked(
186                first_byte as u64 & ((1 << bits_in_first_byte) - 1),
187                bits_in_first_byte,
188            );
189            length -= bits_in_first_byte;
190            offset += bits_in_first_byte;
191        }
192        slice = slice.get_unchecked(offset / 8..);
193
194        // Write word-by-word.
195        let bits_in_buf = self.bit_len % 64;
196        if bits_in_buf > 0 {
197            while length >= 64 {
198                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
199                self.buf |= word << bits_in_buf;
200                self.flush_word_unchecked(self.buf);
201                self.set_bits_in_bytes += self.buf.count_ones() as usize;
202                self.buf = word >> (64 - bits_in_buf);
203                self.bit_len += 64;
204                length -= 64;
205                slice = slice.get_unchecked(8..);
206            }
207        } else {
208            while length >= 64 {
209                let word = u64::from_le_bytes(slice.get_unchecked(0..8).try_into().unwrap());
210                self.flush_word_unchecked(word);
211                self.set_bits_in_bytes += word.count_ones() as usize;
212                self.bit_len += 64;
213                length -= 64;
214                slice = slice.get_unchecked(8..);
215            }
216        }
217
218        // Just the last word left.
219        if length > 0 {
220            let word = load_padded_le_u64(slice);
221            self.push_word_with_len_unchecked(word & ((1 << length) - 1), length);
222        }
223    }
224
225    pub fn extend_from_slice(&mut self, slice: &[u8], offset: usize, length: usize) {
226        assert!(8 * slice.len() >= offset + length);
227        self.reserve(length);
228        unsafe {
229            self.extend_from_slice_unchecked(slice, offset, length);
230        }
231    }
232
233    pub fn extend_from_bitmap(&mut self, bitmap: &Bitmap) {
234        // TODO: we can perhaps use the bitmaps bitcount here instead of
235        // recomputing it if it has a known bitcount.
236        let (slice, offset, length) = bitmap.as_slice();
237        self.extend_from_slice(slice, offset, length);
238    }
239
240    /// # Safety
241    /// May only be called once at the end.
242    unsafe fn finish(&mut self) {
243        if self.bit_len % 64 != 0 {
244            self.bytes.extend_from_slice(&self.buf.to_le_bytes());
245            self.set_bits_in_bytes += self.buf.count_ones() as usize;
246            self.buf = 0;
247        }
248    }
249
250    /// Converts this BitmapBuilder into a mutable bitmap.
251    pub fn into_mut(mut self) -> MutableBitmap {
252        unsafe {
253            self.finish();
254            MutableBitmap::from_vec(self.bytes, self.bit_len)
255        }
256    }
257
258    /// The same as into_mut, but returns None if the bitmap is all-ones.
259    pub fn into_opt_mut_validity(mut self) -> Option<MutableBitmap> {
260        unsafe {
261            self.finish();
262            if self.set_bits_in_bytes == self.bit_len {
263                return None;
264            }
265            Some(MutableBitmap::from_vec(self.bytes, self.bit_len))
266        }
267    }
268
269    /// Freezes this BitmapBuilder into an immutable Bitmap.
270    pub fn freeze(mut self) -> Bitmap {
271        unsafe {
272            self.finish();
273            let storage = SharedStorage::from_vec(self.bytes);
274            Bitmap::from_inner_unchecked(
275                storage,
276                0,
277                self.bit_len,
278                Some(self.bit_len - self.set_bits_in_bytes),
279            )
280        }
281    }
282
283    /// The same as freeze, but returns None if the bitmap is all-ones.
284    pub fn into_opt_validity(mut self) -> Option<Bitmap> {
285        unsafe {
286            self.finish();
287            if self.set_bits_in_bytes == self.bit_len {
288                return None;
289            }
290            let storage = SharedStorage::from_vec(self.bytes);
291            let bitmap = Bitmap::from_inner_unchecked(
292                storage,
293                0,
294                self.bit_len,
295                Some(self.bit_len - self.set_bits_in_bytes),
296            );
297            Some(bitmap)
298        }
299    }
300
301    pub fn extend_trusted_len_iter<I>(&mut self, iterator: I)
302    where
303        I: Iterator<Item = bool> + TrustedLen,
304    {
305        self.reserve(iterator.size_hint().1.unwrap());
306        for b in iterator {
307            // SAFETY: we reserved and the iterator's length is trusted.
308            unsafe {
309                self.push_unchecked(b);
310            }
311        }
312    }
313
314    #[inline]
315    pub fn from_trusted_len_iter<I>(iterator: I) -> Self
316    where
317        I: Iterator<Item = bool> + TrustedLen,
318    {
319        let mut builder = Self::new();
320        builder.extend_trusted_len_iter(iterator);
321        builder
322    }
323}