polars_arrow/array/binview/
view.rs1use std::cmp::Ordering;
2use std::fmt::{self, Display, Formatter};
3use std::ops::Add;
4
5use bytemuck::{Pod, Zeroable};
6use polars_error::*;
7use polars_utils::min_max::MinMax;
8use polars_utils::nulls::IsNull;
9use polars_utils::total_ord::{TotalEq, TotalOrd};
10
11use crate::buffer::Buffer;
12use crate::datatypes::PrimitiveType;
13use crate::types::{Bytes16Alignment4, NativeType};
14
15#[derive(Copy, Clone, Default)]
21#[repr(C)]
22pub struct View {
23 pub length: u32,
25 pub prefix: u32,
27 pub buffer_idx: u32,
29 pub offset: u32,
31}
32
33impl fmt::Debug for View {
34 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
35 if self.length <= Self::MAX_INLINE_SIZE {
36 fmt.debug_struct("View")
37 .field("length", &self.length)
38 .field("content", &unsafe {
39 std::slice::from_raw_parts(
40 (self as *const _ as *const u8).add(4),
41 self.length as usize,
42 )
43 })
44 .finish()
45 } else {
46 fmt.debug_struct("View")
47 .field("length", &self.length)
48 .field("prefix", &self.prefix.to_be_bytes())
49 .field("buffer_idx", &self.buffer_idx)
50 .field("offset", &self.offset)
51 .finish()
52 }
53 }
54}
55
56impl View {
57 pub const MAX_INLINE_SIZE: u32 = 12;
58
59 #[inline(always)]
60 pub fn as_u128(self) -> u128 {
61 unsafe { std::mem::transmute(self) }
62 }
63
64 #[inline]
70 pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
71 debug_assert!(bytes.len() <= u32::MAX as usize);
72 debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
73
74 let mut view = Self {
75 length: bytes.len() as u32,
76 ..Default::default()
77 };
78
79 let view_ptr = &mut view as *mut _ as *mut u8;
80
81 unsafe {
88 let inline_data_ptr = view_ptr.add(4);
89 core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
90 }
91 view
92 }
93
94 #[inline]
100 pub fn new_inline(bytes: &[u8]) -> Self {
101 assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
102 unsafe { Self::new_inline_unchecked(bytes) }
103 }
104
105 #[inline]
111 pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
112 debug_assert!(bytes.len() <= u32::MAX as usize);
113 debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
114
115 let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
117 Self {
118 length: bytes.len() as u32,
119 prefix,
120 buffer_idx,
121 offset,
122 }
123 }
124
125 #[inline]
126 pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
127 debug_assert!(bytes.len() <= u32::MAX as usize);
128
129 unsafe {
131 if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
132 Self::new_inline_unchecked(bytes)
133 } else {
134 Self::new_noninline_unchecked(bytes, buffer_idx, offset)
135 }
136 }
137 }
138
139 pub unsafe fn get_slice_unchecked<'a>(&'a self, buffers: &'a [Buffer<u8>]) -> &'a [u8] {
144 unsafe {
145 if self.length <= Self::MAX_INLINE_SIZE {
146 let ptr = self as *const View as *const u8;
147 std::slice::from_raw_parts(ptr.add(4), self.length as usize)
148 } else {
149 let data = buffers.get_unchecked(self.buffer_idx as usize);
150 let offset = self.offset as usize;
151 data.get_unchecked(offset..offset + self.length as usize)
152 }
153 }
154 }
155
156 pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
162 debug_assert!(self.length <= View::MAX_INLINE_SIZE);
163
164 let ptr = self as *const View as *const u8;
165 unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
167 }
168
169 pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
179 macro_rules! dispatch {
180 ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
181 match $match {
182 $(
183 $v => {
184 const $n: usize = $v;
185
186 $block
187 }
188 )+
189 _ => $otherwise,
190 }
191 }
192 }
193
194 let width = width as usize;
195
196 assert!(width > 0);
197 assert!(width <= View::MAX_INLINE_SIZE as usize);
198
199 assert_eq!(src.len() % width, 0);
200
201 let num_values = src.len() / width;
202
203 views.reserve(num_values);
204
205 #[allow(unused_mut)]
206 let mut src = src;
207
208 dispatch! {
209 N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
210 #[cfg(feature = "simd")]
211 {
212 macro_rules! repeat_with {
213 ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
214 $({
215 const $i: usize = $v;
216
217 $block
218 })+
219 }
220 }
221
222 use std::simd::*;
223
224 let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
227
228 let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
229
230 const BLOCKS_PER_LOAD: usize = 16 / N;
231 const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
232
233 let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
234
235 for _ in 0..num_loops {
236 let loaded = u8x16::from_array(unsafe {
238 src.get_unchecked(..16).try_into().unwrap()
239 });
240 src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
241
242 repeat_with!(
244 I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
245 if I < BLOCKS_PER_LOAD {
246 let zero = u8x16::default();
247 const SWIZZLE: [usize; 16] = const {
248 let mut swizzle = [16usize; 16];
249
250 let mut i = 0;
251 while i < N {
252 let idx = i + I * N;
253 if idx < 16 {
254 swizzle[4+i] = idx;
255 }
256 i += 1;
257 }
258
259 swizzle
260 };
261
262 let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
263 let view_bytes = (scattered | length_mask).to_array();
264
265 unsafe {
268 core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
269 dst = dst.add(16);
270 }
271 }
272 }
273 );
274 }
275
276 unsafe {
277 views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
278 }
279 }
280
281 views.extend(src.chunks_exact(N).map(|slice| unsafe {
282 View::new_inline_unchecked(slice)
283 }));
284 },
285 otherwise = unreachable!()
286 }
287 }
288}
289
290impl IsNull for View {
291 const HAS_NULLS: bool = false;
292 type Inner = Self;
293
294 fn is_null(&self) -> bool {
295 false
296 }
297
298 fn unwrap_inner(self) -> Self::Inner {
299 self
300 }
301}
302
303impl Display for View {
304 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
305 write!(f, "{:?}", self)
306 }
307}
308
309unsafe impl Zeroable for View {}
310
311unsafe impl Pod for View {}
312
313impl Add<Self> for View {
314 type Output = View;
315
316 fn add(self, _rhs: Self) -> Self::Output {
317 unimplemented!()
318 }
319}
320
321impl num_traits::Zero for View {
322 fn zero() -> Self {
323 Default::default()
324 }
325
326 fn is_zero(&self) -> bool {
327 *self == Self::zero()
328 }
329}
330
331impl PartialEq for View {
332 fn eq(&self, other: &Self) -> bool {
333 self.as_u128() == other.as_u128()
334 }
335}
336
337impl TotalOrd for View {
338 fn tot_cmp(&self, _other: &Self) -> Ordering {
339 unimplemented!()
340 }
341}
342
343impl TotalEq for View {
344 fn tot_eq(&self, other: &Self) -> bool {
345 self.eq(other)
346 }
347}
348
349impl MinMax for View {
350 fn nan_min_lt(&self, _other: &Self) -> bool {
351 unimplemented!()
352 }
353
354 fn nan_max_lt(&self, _other: &Self) -> bool {
355 unimplemented!()
356 }
357}
358
359impl NativeType for View {
360 const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
361
362 type Bytes = [u8; 16];
363 type AlignedBytes = Bytes16Alignment4;
364
365 #[inline]
366 fn to_le_bytes(&self) -> Self::Bytes {
367 self.as_u128().to_le_bytes()
368 }
369
370 #[inline]
371 fn to_be_bytes(&self) -> Self::Bytes {
372 self.as_u128().to_be_bytes()
373 }
374
375 #[inline]
376 fn from_le_bytes(bytes: Self::Bytes) -> Self {
377 Self::from(u128::from_le_bytes(bytes))
378 }
379
380 #[inline]
381 fn from_be_bytes(bytes: Self::Bytes) -> Self {
382 Self::from(u128::from_be_bytes(bytes))
383 }
384}
385
386impl From<u128> for View {
387 #[inline]
388 fn from(value: u128) -> Self {
389 unsafe { std::mem::transmute(value) }
390 }
391}
392
393impl From<View> for u128 {
394 #[inline]
395 fn from(value: View) -> Self {
396 value.as_u128()
397 }
398}
399
400fn validate_view<F>(views: &[View], buffers: &[Buffer<u8>], validate_bytes: F) -> PolarsResult<()>
401where
402 F: Fn(&[u8]) -> PolarsResult<()>,
403{
404 for view in views {
405 let len = view.length;
406 if len <= View::MAX_INLINE_SIZE {
407 if len < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + len * 8) != 0 {
408 polars_bail!(ComputeError: "view contained non-zero padding in prefix");
409 }
410
411 validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?;
412 } else {
413 let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
414 polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
415 })?;
416
417 let start = view.offset as usize;
418 let end = start + len as usize;
419 let b = data
420 .as_slice()
421 .get(start..end)
422 .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
423
424 polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
425 validate_bytes(b)?;
426 };
427 }
428
429 Ok(())
430}
431
432pub(super) fn validate_binary_view(views: &[View], buffers: &[Buffer<u8>]) -> PolarsResult<()> {
433 validate_view(views, buffers, |_| Ok(()))
434}
435
436fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
437 match simdutf8::basic::from_utf8(b) {
438 Ok(_) => Ok(()),
439 Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
440 }
441}
442
443pub fn validate_utf8_view(views: &[View], buffers: &[Buffer<u8>]) -> PolarsResult<()> {
444 validate_view(views, buffers, validate_utf8)
445}
446
447pub(super) unsafe fn validate_utf8_only(
450 views: &[View],
451 buffers_to_check: &[Buffer<u8>],
452 all_buffers: &[Buffer<u8>],
453) -> PolarsResult<()> {
454 if all_buffers.is_empty() {
456 for view in views {
457 let len = view.length;
458 validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
459 }
460 return Ok(());
461 }
462
463 if buffers_to_check.iter().all(|buf| buf.is_ascii()) {
465 for view in views {
466 let len = view.length;
467 if len <= 12 {
468 validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
469 }
470 }
471 } else {
472 for view in views {
473 let len = view.length;
474 if len <= 12 {
475 validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
476 } else {
477 let buffer_idx = view.buffer_idx;
478 let offset = view.offset;
479 let data = all_buffers.get_unchecked(buffer_idx as usize);
480
481 let start = offset as usize;
482 let end = start + len as usize;
483 let b = &data.as_slice().get_unchecked(start..end);
484 validate_utf8(b)?;
485 };
486 }
487 }
488
489 Ok(())
490}