polars_row/
row.rs

use arrow::array::{BinaryArray, BinaryViewArray};
use arrow::datatypes::ArrowDataType;
use arrow::ffi::mmap;
use arrow::offset::{Offsets, OffsetsBuffer};
use polars_compute::cast::binary_to_binview;

const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;

/// Additional context provided to row encoding regarding a column.
///
/// This allows communication based on the Polars datatype instead of the Arrow datatype. Since
/// polars-row is used under polars-core, we don't have access to the actual datatypes.
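///
/// For example, a struct column whose first field is a decimal could be
/// described as (illustrative only, precision chosen arbitrarily):
/// `RowEncodingContext::Struct(vec![Some(RowEncodingContext::Decimal(38)), None])`.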
#[derive(Debug, Clone)]
pub enum RowEncodingContext {
    Struct(Vec<Option<RowEncodingContext>>),
    /// Categorical / Enum
    Categorical(RowEncodingCategoricalContext),
    /// Decimal with given precision
    Decimal(usize),
}

#[derive(Debug, Clone)]
pub struct RowEncodingCategoricalContext {
    /// The number of categories currently known for this categorical / enum.
    pub num_known_categories: u32,
    pub is_enum: bool,

    /// The mapping from key to lexical sort index
    pub lexical_sort_idxs: Option<Vec<u32>>,
}

impl RowEncodingCategoricalContext {
    /// The number of bits used by the row encoding for the category indices.
    pub fn needed_num_bits(&self) -> usize {
        if self.num_known_categories == 0 {
            0
        } else {
            let max_category_index = self.num_known_categories - 1;
            (max_category_index.next_power_of_two().trailing_zeros() + 1) as usize
        }
    }
}
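
// A quick illustration of the bit-width computation above (hypothetical test,
// not part of the original source): with 5 known categories the maximum index
// is 4; `4.next_power_of_two()` is 4 with 2 trailing zeros, so 3 bits are used,
// enough to encode indices 0..=4.
#[test]
fn categorical_context_bit_width_example() {
    let ctx = RowEncodingCategoricalContext {
        num_known_categories: 5,
        is_enum: false,
        lexical_sort_idxs: None,
    };
    assert_eq!(ctx.needed_num_bits(), 3);
}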

bitflags::bitflags! {
    /// Options for the Polars Row Encoding.
    ///
    /// The row encoding provides a method to combine several columns into one binary column which
    /// has the same sort-order as the original columns.
    ///
    /// By default, the row encoding provides the ascending, nulls first sort-order of the columns.
    #[derive(Debug, Clone, Copy, Default)]
    pub struct RowEncodingOptions: u8 {
        /// Sort in descending order instead of ascending order
        const DESCENDING               = 0x01;
        /// Sort such that nulls / missing values are last
        const NULLS_LAST               = 0x02;

        /// Ignore all order-related flags and don't encode in an order-preserving way. This still
        /// keeps uniqueness.
        ///
        /// This is faster for several encodings.
        const NO_ORDER                 = 0x04;
    }
}

const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
const EMPTY_STR_TOKEN: u8 = 0x01;

impl RowEncodingOptions {
    pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
        let mut slf = Self::default();
        slf.set(Self::DESCENDING, descending);
        slf.set(Self::NULLS_LAST, nulls_last);
        slf
    }

    pub fn new_unsorted() -> Self {
        Self::NO_ORDER
    }

    pub fn null_sentinel(self) -> u8 {
        if self.contains(Self::NULLS_LAST) {
            0xFF
        } else {
            0x00
        }
    }

    pub(crate) fn bool_true_sentinel(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !BOOLEAN_TRUE_SENTINEL
        } else {
            BOOLEAN_TRUE_SENTINEL
        }
    }

    pub(crate) fn bool_false_sentinel(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !BOOLEAN_FALSE_SENTINEL
        } else {
            BOOLEAN_FALSE_SENTINEL
        }
    }

    pub fn list_null_sentinel(self) -> u8 {
        self.null_sentinel()
    }

    pub fn list_continuation_token(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !LIST_CONTINUATION_TOKEN
        } else {
            LIST_CONTINUATION_TOKEN
        }
    }

    pub fn list_termination_token(self) -> u8 {
        !self.list_continuation_token()
    }

    pub fn empty_str_token(self) -> u8 {
        if self.contains(Self::DESCENDING) {
            !EMPTY_STR_TOKEN
        } else {
            EMPTY_STR_TOKEN
        }
    }
}
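
// Sketch of how the order flags affect the sentinel bytes (hypothetical test,
// not part of the original source). DESCENDING inverts every sentinel with a
// bitwise NOT, which reverses the byte-wise comparison order; NULLS_LAST moves
// the null sentinel from the smallest byte (0x00) to the largest (0xFF).
#[test]
fn order_flags_flip_sentinels() {
    let asc = RowEncodingOptions::new_sorted(false, false);
    let desc = RowEncodingOptions::new_sorted(true, false);

    // In ascending order, true (0x03) sorts after false (0x02).
    assert!(asc.bool_true_sentinel() > asc.bool_false_sentinel());
    // In descending order, the inverted sentinels swap the comparison.
    assert_eq!(desc.bool_true_sentinel(), !asc.bool_true_sentinel());
    assert!(desc.bool_true_sentinel() < desc.bool_false_sentinel());

    // Nulls-first uses 0x00, nulls-last uses 0xFF.
    assert_eq!(asc.null_sentinel(), 0x00);
    assert_eq!(RowEncodingOptions::new_sorted(false, true).null_sentinel(), 0xFF);
}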

/// The output of the row encoding: all rows concatenated into `values`, with
/// `offsets` delimiting the start and end of each row.
#[derive(Default, Clone)]
pub struct RowsEncoded {
    pub(crate) values: Vec<u8>,
    pub(crate) offsets: Vec<usize>,
}

unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
    let offsets = if size_of::<usize>() == size_of::<i64>() {
        assert!(
            (*offsets.last().unwrap() as u64) < i64::MAX as u64,
            "row encoding output overflowed"
        );

        // SAFETY: we checked overflow and size. On 64-bit targets this
        // reinterprets the allocation in place, so the conversion is free.
        bytemuck::cast_vec::<usize, i64>(offsets)
    } else {
        offsets.into_iter().map(|v| v as i64).collect()
    };

    // SAFETY: monotonically increasing
    let offsets = Offsets::new_unchecked(offsets);

    BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
}

impl RowsEncoded {
    pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
        RowsEncoded { values, offsets }
    }

    pub fn iter(&self) -> RowsEncodedIter {
        let iter = self.offsets[1..].iter();
        let offset = self.offsets[0];
        RowsEncodedIter {
            offset,
            end: iter,
            values: &self.values,
        }
    }

    /// Borrows the buffers and returns a [`BinaryArray`].
    ///
    /// # Safety
    /// The returned `BinaryArray` borrows the buffers of `self`, but this is
    /// not reflected in its lifetime. The caller must ensure `self` outlives
    /// any use of the returned array.
    pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
        let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
        let offsets = if size_of::<usize>() == size_of::<i64>() {
            assert!(
                (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
                "row encoding output overflowed"
            );

            let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
            let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
            offsets
        } else {
            self.offsets.iter().map(|&v| v as i64).collect()
        };
        // SAFETY: monotonically increasing
        let offsets = OffsetsBuffer::new_unchecked(offsets);

        BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
    }

    /// This conversion is free on 64-bit targets; on other targets the
    /// offsets are widened to `i64`, which allocates.
    pub fn into_array(self) -> BinaryArray<i64> {
        unsafe { rows_to_array(self.values, self.offsets) }
    }

    /// This does allocate views.
    pub fn into_binview(self) -> BinaryViewArray {
        binary_to_binview(&self.into_array())
    }

    #[cfg(test)]
    pub fn get(&self, i: usize) -> &[u8] {
        let start = self.offsets[i];
        let end = self.offsets[i + 1];
        &self.values[start..end]
    }
}

pub struct RowsEncodedIter<'a> {
    offset: usize,
    end: std::slice::Iter<'a, usize>,
    values: &'a [u8],
}

impl<'a> Iterator for RowsEncodedIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        let new_offset = *self.end.next()?;
        // SAFETY: the offsets of a `RowsEncoded` are monotonically increasing
        // and bounded by `values.len()`.
        let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
        self.offset = new_offset;
        Some(payload)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.end.size_hint()
    }
}
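
// Illustrative round-trip for `RowsEncoded` (hypothetical test, not part of
// the original source). `offsets` holds one more entry than there are rows;
// row `i` spans `values[offsets[i]..offsets[i + 1]]`.
#[test]
fn rows_encoded_iter_example() {
    let rows = RowsEncoded::new(b"abcde".to_vec(), vec![0, 2, 5]);

    let collected: Vec<&[u8]> = rows.iter().collect();
    assert_eq!(collected, vec![b"ab".as_slice(), b"cde".as_slice()]);

    let array = rows.into_array();
    assert_eq!(array.value(0), b"ab");
    assert_eq!(array.value(1), b"cde");
}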