1use arrow::array::{BinaryArray, BinaryViewArray};
2use arrow::datatypes::ArrowDataType;
3use arrow::ffi::mmap;
4use arrow::offset::{Offsets, OffsetsBuffer};
5use polars_compute::cast::binary_to_binview;
6
7const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
8const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;
9
10#[derive(Debug, Clone)]
15pub enum RowEncodingContext {
16 Struct(Vec<Option<RowEncodingContext>>),
17 Categorical(RowEncodingCategoricalContext),
19 Decimal(usize),
21}
22
23#[derive(Debug, Clone)]
24pub struct RowEncodingCategoricalContext {
25 pub num_known_categories: u32,
27 pub is_enum: bool,
28
29 pub lexical_sort_idxs: Option<Vec<u32>>,
31}
32
33impl RowEncodingCategoricalContext {
34 pub fn needed_num_bits(&self) -> usize {
35 if self.num_known_categories == 0 {
36 0
37 } else {
38 let max_category_index = self.num_known_categories - 1;
39 (max_category_index.next_power_of_two().trailing_zeros() + 1) as usize
40 }
41 }
42}
43
44bitflags::bitflags! {
45 #[derive(Debug, Clone, Copy, Default)]
52 pub struct RowEncodingOptions: u8 {
53 const DESCENDING = 0x01;
55 const NULLS_LAST = 0x02;
57
58 const NO_ORDER = 0x04;
63 }
64}
65
66const LIST_CONTINUATION_TOKEN: u8 = 0xFE;
67const EMPTY_STR_TOKEN: u8 = 0x01;
68
69impl RowEncodingOptions {
70 pub fn new_sorted(descending: bool, nulls_last: bool) -> Self {
71 let mut slf = Self::default();
72 slf.set(Self::DESCENDING, descending);
73 slf.set(Self::NULLS_LAST, nulls_last);
74 slf
75 }
76
77 pub fn new_unsorted() -> Self {
78 Self::NO_ORDER
79 }
80
81 pub fn null_sentinel(self) -> u8 {
82 if self.contains(Self::NULLS_LAST) {
83 0xFF
84 } else {
85 0x00
86 }
87 }
88
89 pub(crate) fn bool_true_sentinel(self) -> u8 {
90 if self.contains(Self::DESCENDING) {
91 !BOOLEAN_TRUE_SENTINEL
92 } else {
93 BOOLEAN_TRUE_SENTINEL
94 }
95 }
96
97 pub(crate) fn bool_false_sentinel(self) -> u8 {
98 if self.contains(Self::DESCENDING) {
99 !BOOLEAN_FALSE_SENTINEL
100 } else {
101 BOOLEAN_FALSE_SENTINEL
102 }
103 }
104
105 pub fn list_null_sentinel(self) -> u8 {
106 self.null_sentinel()
107 }
108
109 pub fn list_continuation_token(self) -> u8 {
110 if self.contains(Self::DESCENDING) {
111 !LIST_CONTINUATION_TOKEN
112 } else {
113 LIST_CONTINUATION_TOKEN
114 }
115 }
116
117 pub fn list_termination_token(self) -> u8 {
118 !self.list_continuation_token()
119 }
120
121 pub fn empty_str_token(self) -> u8 {
122 if self.contains(Self::DESCENDING) {
123 !EMPTY_STR_TOKEN
124 } else {
125 EMPTY_STR_TOKEN
126 }
127 }
128}
129
130#[derive(Default, Clone)]
131pub struct RowsEncoded {
132 pub(crate) values: Vec<u8>,
133 pub(crate) offsets: Vec<usize>,
134}
135
136unsafe fn rows_to_array(buf: Vec<u8>, offsets: Vec<usize>) -> BinaryArray<i64> {
137 let offsets = if size_of::<usize>() == size_of::<i64>() {
138 assert!(
139 (*offsets.last().unwrap() as u64) < i64::MAX as u64,
140 "row encoding output overflowed"
141 );
142
143 bytemuck::cast_vec::<usize, i64>(offsets)
145 } else {
146 offsets.into_iter().map(|v| v as i64).collect()
147 };
148
149 let offsets = Offsets::new_unchecked(offsets);
151
152 BinaryArray::new(ArrowDataType::LargeBinary, offsets.into(), buf.into(), None)
153}
154
155impl RowsEncoded {
156 pub(crate) fn new(values: Vec<u8>, offsets: Vec<usize>) -> Self {
157 RowsEncoded { values, offsets }
158 }
159
160 pub fn iter(&self) -> RowsEncodedIter {
161 let iter = self.offsets[1..].iter();
162 let offset = self.offsets[0];
163 RowsEncodedIter {
164 offset,
165 end: iter,
166 values: &self.values,
167 }
168 }
169
170 pub unsafe fn borrow_array(&self) -> BinaryArray<i64> {
176 let (_, values, _) = unsafe { mmap::slice(&self.values) }.into_inner();
177 let offsets = if size_of::<usize>() == size_of::<i64>() {
178 assert!(
179 (*self.offsets.last().unwrap() as u64) < i64::MAX as u64,
180 "row encoding output overflowed"
181 );
182
183 let offsets = bytemuck::cast_slice::<usize, i64>(self.offsets.as_slice());
184 let (_, offsets, _) = unsafe { mmap::slice(offsets) }.into_inner();
185 offsets
186 } else {
187 self.offsets.iter().map(|&v| v as i64).collect()
188 };
189 let offsets = OffsetsBuffer::new_unchecked(offsets);
190
191 BinaryArray::new(ArrowDataType::LargeBinary, offsets, values, None)
192 }
193
194 pub fn into_array(self) -> BinaryArray<i64> {
196 unsafe { rows_to_array(self.values, self.offsets) }
197 }
198
199 pub fn into_binview(self) -> BinaryViewArray {
201 binary_to_binview(&self.into_array())
202 }
203
204 #[cfg(test)]
205 pub fn get(&self, i: usize) -> &[u8] {
206 let start = self.offsets[i];
207 let end = self.offsets[i + 1];
208 &self.values[start..end]
209 }
210}
211
212pub struct RowsEncodedIter<'a> {
213 offset: usize,
214 end: std::slice::Iter<'a, usize>,
215 values: &'a [u8],
216}
217
218impl<'a> Iterator for RowsEncodedIter<'a> {
219 type Item = &'a [u8];
220
221 fn next(&mut self) -> Option<Self::Item> {
222 let new_offset = *self.end.next()?;
223 let payload = unsafe { self.values.get_unchecked(self.offset..new_offset) };
224 self.offset = new_offset;
225 Some(payload)
226 }
227
228 fn size_hint(&self) -> (usize, Option<usize>) {
229 self.end.size_hint()
230 }
231}