polars_arrow/array/binview/
view.rs

1use std::cmp::Ordering;
2use std::fmt::{self, Display, Formatter};
3use std::ops::Add;
4
5use bytemuck::{Pod, Zeroable};
6use polars_error::*;
7use polars_utils::min_max::MinMax;
8use polars_utils::nulls::IsNull;
9use polars_utils::total_ord::{TotalEq, TotalOrd};
10
11use crate::buffer::Buffer;
12use crate::datatypes::PrimitiveType;
13use crate::types::{Bytes16Alignment4, NativeType};
14
15// We use this instead of u128 because we want alignment of <= 8 bytes.
16/// A reference to a set of bytes.
17///
18/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
19/// If `length > 12`, these fields specify a slice of a buffer.
20#[derive(Copy, Clone, Default)]
21#[repr(C)]
22pub struct View {
23    /// The length of the string/bytes.
24    pub length: u32,
25    /// First 4 bytes of string/bytes data.
26    pub prefix: u32,
27    /// The buffer index.
28    pub buffer_idx: u32,
29    /// The offset into the buffer.
30    pub offset: u32,
31}
32
33impl fmt::Debug for View {
34    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
35        if self.length <= Self::MAX_INLINE_SIZE {
36            fmt.debug_struct("View")
37                .field("length", &self.length)
38                .field("content", &unsafe {
39                    std::slice::from_raw_parts(
40                        (self as *const _ as *const u8).add(4),
41                        self.length as usize,
42                    )
43                })
44                .finish()
45        } else {
46            fmt.debug_struct("View")
47                .field("length", &self.length)
48                .field("prefix", &self.prefix.to_be_bytes())
49                .field("buffer_idx", &self.buffer_idx)
50                .field("offset", &self.offset)
51                .finish()
52        }
53    }
54}
55
56impl View {
57    pub const MAX_INLINE_SIZE: u32 = 12;
58
59    #[inline(always)]
60    pub fn as_u128(self) -> u128 {
61        unsafe { std::mem::transmute(self) }
62    }
63
64    /// Create a new inline view without verifying the length
65    ///
66    /// # Safety
67    ///
68    /// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`.
69    #[inline]
70    pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self {
71        debug_assert!(bytes.len() <= u32::MAX as usize);
72        debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
73
74        let mut view = Self {
75            length: bytes.len() as u32,
76            ..Default::default()
77        };
78
79        let view_ptr = &mut view as *mut _ as *mut u8;
80
81        // SAFETY:
82        // - bytes length <= 12,
83        // - size_of::<View> == 16
84        // - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
85        // - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
86        // offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
87        unsafe {
88            let inline_data_ptr = view_ptr.add(4);
89            core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
90        }
91        view
92    }
93
94    /// Create a new inline view
95    ///
96    /// # Panics
97    ///
98    /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`.
99    #[inline]
100    pub fn new_inline(bytes: &[u8]) -> Self {
101        assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE);
102        unsafe { Self::new_inline_unchecked(bytes) }
103    }
104
105    /// Create a new inline view
106    ///
107    /// # Safety
108    ///
109    /// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`.
110    #[inline]
111    pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
112        debug_assert!(bytes.len() <= u32::MAX as usize);
113        debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE);
114
115        // SAFETY: The invariant of this function guarantees that this is safe.
116        let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) };
117        Self {
118            length: bytes.len() as u32,
119            prefix,
120            buffer_idx,
121            offset,
122        }
123    }
124
125    #[inline]
126    pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
127        debug_assert!(bytes.len() <= u32::MAX as usize);
128
129        // SAFETY: We verify the invariant with the outer if statement
130        unsafe {
131            if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
132                Self::new_inline_unchecked(bytes)
133            } else {
134                Self::new_noninline_unchecked(bytes, buffer_idx, offset)
135            }
136        }
137    }
138
139    /// Constructs a byteslice from this view.
140    ///
141    /// # Safety
142    /// Assumes that this view is valid for the given buffers.
143    pub unsafe fn get_slice_unchecked<'a>(&'a self, buffers: &'a [Buffer<u8>]) -> &'a [u8] {
144        unsafe {
145            if self.length <= Self::MAX_INLINE_SIZE {
146                let ptr = self as *const View as *const u8;
147                std::slice::from_raw_parts(ptr.add(4), self.length as usize)
148            } else {
149                let data = buffers.get_unchecked(self.buffer_idx as usize);
150                let offset = self.offset as usize;
151                data.get_unchecked(offset..offset + self.length as usize)
152            }
153        }
154    }
155
156    /// Construct a byte slice from an inline view.
157    ///
158    /// # Safety
159    ///
160    /// Assumes that this view is inlinable.
161    pub unsafe fn get_inlined_slice_unchecked(&self) -> &[u8] {
162        debug_assert!(self.length <= View::MAX_INLINE_SIZE);
163
164        let ptr = self as *const View as *const u8;
165        // SAFETY: Invariant of function
166        unsafe { std::slice::from_raw_parts(ptr.add(4), self.length as usize) }
167    }
168
169    /// Extend a `Vec<View>` with inline views slices of `src` with `width`.
170    ///
171    /// This tries to use SIMD to optimize the copying and can be massively faster than doing a
172    /// `views.extend(src.chunks_exact(width).map(View::new_inline))`.
173    ///
174    /// # Panics
175    ///
176    /// This function panics if `src.len()` is not divisible by `width`, `width >
177    /// View::MAX_INLINE_SIZE` or `width == 0`.
178    pub fn extend_with_inlinable_strided(views: &mut Vec<Self>, src: &[u8], width: u8) {
179        macro_rules! dispatch {
180            ($n:ident = $match:ident in [$($v:literal),+ $(,)?] => $block:block, otherwise = $otherwise:expr) => {
181                match $match {
182                    $(
183                        $v => {
184                            const $n: usize = $v;
185
186                            $block
187                        }
188                    )+
189                    _ => $otherwise,
190                }
191            }
192        }
193
194        let width = width as usize;
195
196        assert!(width > 0);
197        assert!(width <= View::MAX_INLINE_SIZE as usize);
198
199        assert_eq!(src.len() % width, 0);
200
201        let num_values = src.len() / width;
202
203        views.reserve(num_values);
204
205        #[allow(unused_mut)]
206        let mut src = src;
207
208        dispatch! {
209            N = width in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] => {
210                #[cfg(feature = "simd")]
211                {
212                    macro_rules! repeat_with {
213                        ($i:ident = [$($v:literal),+ $(,)?] => $block:block) => {
214                            $({
215                                const $i: usize = $v;
216
217                                $block
218                            })+
219                        }
220                    }
221
222                    use std::simd::*;
223
224                    // SAFETY: This is always allowed, since views.len() is always in the Vec
225                    // buffer.
226                    let mut dst = unsafe { views.as_mut_ptr().add(views.len()).cast::<u8>() };
227
228                    let length_mask = u8x16::from_array([N as u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
229
230                    const BLOCKS_PER_LOAD: usize = 16 / N;
231                    const BYTES_PER_LOOP: usize = N * BLOCKS_PER_LOAD;
232
233                    let num_loops = (src.len() / BYTES_PER_LOOP).saturating_sub(1);
234
235                    for _ in 0..num_loops {
236                        // SAFETY: The num_loops calculates how many times we can do this.
237                        let loaded = u8x16::from_array(unsafe {
238                            src.get_unchecked(..16).try_into().unwrap()
239                        });
240                        src = unsafe { src.get_unchecked(BYTES_PER_LOOP..) };
241
242                        // This way we can reuse the same load for multiple views.
243                        repeat_with!(
244                            I = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => {
245                                if I < BLOCKS_PER_LOAD {
246                                    let zero = u8x16::default();
247                                    const SWIZZLE: [usize; 16] = const {
248                                        let mut swizzle = [16usize; 16];
249
250                                        let mut i = 0;
251                                        while i < N {
252                                            let idx = i + I * N;
253                                            if idx < 16 {
254                                                swizzle[4+i] = idx;
255                                            }
256                                            i += 1;
257                                        }
258
259                                        swizzle
260                                    };
261
262                                    let scattered = simd_swizzle!(loaded, zero, SWIZZLE);
263                                    let view_bytes = (scattered | length_mask).to_array();
264
265                                    // SAFETY: dst has the capacity reserved and view_bytes is 16
266                                    // bytes long.
267                                    unsafe {
268                                        core::ptr::copy_nonoverlapping(view_bytes.as_ptr(), dst, 16);
269                                        dst = dst.add(16);
270                                    }
271                                }
272                            }
273                        );
274                    }
275
276                    unsafe {
277                        views.set_len(views.len() + num_loops * BLOCKS_PER_LOAD);
278                    }
279                }
280
281                views.extend(src.chunks_exact(N).map(|slice| unsafe {
282                    View::new_inline_unchecked(slice)
283                }));
284            },
285            otherwise = unreachable!()
286        }
287    }
288}
289
290impl IsNull for View {
291    const HAS_NULLS: bool = false;
292    type Inner = Self;
293
294    fn is_null(&self) -> bool {
295        false
296    }
297
298    fn unwrap_inner(self) -> Self::Inner {
299        self
300    }
301}
302
303impl Display for View {
304    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
305        write!(f, "{:?}", self)
306    }
307}
308
309unsafe impl Zeroable for View {}
310
311unsafe impl Pod for View {}
312
313impl Add<Self> for View {
314    type Output = View;
315
316    fn add(self, _rhs: Self) -> Self::Output {
317        unimplemented!()
318    }
319}
320
321impl num_traits::Zero for View {
322    fn zero() -> Self {
323        Default::default()
324    }
325
326    fn is_zero(&self) -> bool {
327        *self == Self::zero()
328    }
329}
330
331impl PartialEq for View {
332    fn eq(&self, other: &Self) -> bool {
333        self.as_u128() == other.as_u128()
334    }
335}
336
337impl TotalOrd for View {
338    fn tot_cmp(&self, _other: &Self) -> Ordering {
339        unimplemented!()
340    }
341}
342
343impl TotalEq for View {
344    fn tot_eq(&self, other: &Self) -> bool {
345        self.eq(other)
346    }
347}
348
349impl MinMax for View {
350    fn nan_min_lt(&self, _other: &Self) -> bool {
351        unimplemented!()
352    }
353
354    fn nan_max_lt(&self, _other: &Self) -> bool {
355        unimplemented!()
356    }
357}
358
359impl NativeType for View {
360    const PRIMITIVE: PrimitiveType = PrimitiveType::UInt128;
361
362    type Bytes = [u8; 16];
363    type AlignedBytes = Bytes16Alignment4;
364
365    #[inline]
366    fn to_le_bytes(&self) -> Self::Bytes {
367        self.as_u128().to_le_bytes()
368    }
369
370    #[inline]
371    fn to_be_bytes(&self) -> Self::Bytes {
372        self.as_u128().to_be_bytes()
373    }
374
375    #[inline]
376    fn from_le_bytes(bytes: Self::Bytes) -> Self {
377        Self::from(u128::from_le_bytes(bytes))
378    }
379
380    #[inline]
381    fn from_be_bytes(bytes: Self::Bytes) -> Self {
382        Self::from(u128::from_be_bytes(bytes))
383    }
384}
385
386impl From<u128> for View {
387    #[inline]
388    fn from(value: u128) -> Self {
389        unsafe { std::mem::transmute(value) }
390    }
391}
392
393impl From<View> for u128 {
394    #[inline]
395    fn from(value: View) -> Self {
396        value.as_u128()
397    }
398}
399
400fn validate_view<F>(views: &[View], buffers: &[Buffer<u8>], validate_bytes: F) -> PolarsResult<()>
401where
402    F: Fn(&[u8]) -> PolarsResult<()>,
403{
404    for view in views {
405        let len = view.length;
406        if len <= View::MAX_INLINE_SIZE {
407            if len < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + len * 8) != 0 {
408                polars_bail!(ComputeError: "view contained non-zero padding in prefix");
409            }
410
411            validate_bytes(&view.to_le_bytes()[4..4 + len as usize])?;
412        } else {
413            let data = buffers.get(view.buffer_idx as usize).ok_or_else(|| {
414                polars_err!(OutOfBounds: "view index out of bounds\n\nGot: {} buffers and index: {}", buffers.len(), view.buffer_idx)
415            })?;
416
417            let start = view.offset as usize;
418            let end = start + len as usize;
419            let b = data
420                .as_slice()
421                .get(start..end)
422                .ok_or_else(|| polars_err!(OutOfBounds: "buffer slice out of bounds"))?;
423
424            polars_ensure!(b.starts_with(&view.prefix.to_le_bytes()), ComputeError: "prefix does not match string data");
425            validate_bytes(b)?;
426        };
427    }
428
429    Ok(())
430}
431
432pub(super) fn validate_binary_view(views: &[View], buffers: &[Buffer<u8>]) -> PolarsResult<()> {
433    validate_view(views, buffers, |_| Ok(()))
434}
435
436fn validate_utf8(b: &[u8]) -> PolarsResult<()> {
437    match simdutf8::basic::from_utf8(b) {
438        Ok(_) => Ok(()),
439        Err(_) => Err(polars_err!(ComputeError: "invalid utf8")),
440    }
441}
442
443pub fn validate_utf8_view(views: &[View], buffers: &[Buffer<u8>]) -> PolarsResult<()> {
444    validate_view(views, buffers, validate_utf8)
445}
446
447/// # Safety
448/// The views and buffers must uphold the invariants of BinaryView otherwise we will go OOB.
449pub(super) unsafe fn validate_utf8_only(
450    views: &[View],
451    buffers_to_check: &[Buffer<u8>],
452    all_buffers: &[Buffer<u8>],
453) -> PolarsResult<()> {
454    // If we have no buffers, we don't have to branch.
455    if all_buffers.is_empty() {
456        for view in views {
457            let len = view.length;
458            validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
459        }
460        return Ok(());
461    }
462
463    // Fast path if all buffers are ascii
464    if buffers_to_check.iter().all(|buf| buf.is_ascii()) {
465        for view in views {
466            let len = view.length;
467            if len <= 12 {
468                validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
469            }
470        }
471    } else {
472        for view in views {
473            let len = view.length;
474            if len <= 12 {
475                validate_utf8(view.to_le_bytes().get_unchecked(4..4 + len as usize))?;
476            } else {
477                let buffer_idx = view.buffer_idx;
478                let offset = view.offset;
479                let data = all_buffers.get_unchecked(buffer_idx as usize);
480
481                let start = offset as usize;
482                let end = start + len as usize;
483                let b = &data.as_slice().get_unchecked(start..end);
484                validate_utf8(b)?;
485            };
486        }
487    }
488
489    Ok(())
490}