polars_arrow/array/list/
mod.rs

1use super::specification::try_check_offsets_bounds;
2use super::{new_empty_array, Array, Splitable};
3use crate::bitmap::Bitmap;
4use crate::datatypes::{ArrowDataType, Field};
5use crate::offset::{Offset, Offsets, OffsetsBuffer};
6
7mod ffi;
8pub(super) mod fmt;
9mod iterator;
10pub use iterator::*;
11mod mutable;
12pub use mutable::*;
13use polars_error::{polars_bail, PolarsResult};
14use polars_utils::pl_str::PlSmallStr;
15
16/// An [`Array`] semantically equivalent to `Vec<Option<Vec<Option<T>>>>` with Arrow's in-memory.
17#[derive(Clone)]
18pub struct ListArray<O: Offset> {
19    dtype: ArrowDataType,
20    offsets: OffsetsBuffer<O>,
21    values: Box<dyn Array>,
22    validity: Option<Bitmap>,
23}
24
25impl<O: Offset> ListArray<O> {
26    /// Creates a new [`ListArray`].
27    ///
28    /// # Errors
29    /// This function returns an error iff:
30    /// * The last offset is not equal to the values' length.
31    /// * the validity's length is not equal to `offsets.len()`.
32    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
33    /// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
34    /// # Implementation
35    /// This function is `O(1)`
36    pub fn try_new(
37        dtype: ArrowDataType,
38        offsets: OffsetsBuffer<O>,
39        values: Box<dyn Array>,
40        validity: Option<Bitmap>,
41    ) -> PolarsResult<Self> {
42        try_check_offsets_bounds(&offsets, values.len())?;
43
44        if validity
45            .as_ref()
46            .is_some_and(|validity| validity.len() != offsets.len_proxy())
47        {
48            polars_bail!(ComputeError: "validity mask length must match the number of values")
49        }
50
51        let child_dtype = Self::try_get_child(&dtype)?.dtype();
52        let values_dtype = values.dtype();
53        if child_dtype != values_dtype {
54            polars_bail!(ComputeError: "ListArray's child's DataType must match. However, the expected DataType is {child_dtype:?} while it got {values_dtype:?}.");
55        }
56
57        Ok(Self {
58            dtype,
59            offsets,
60            values,
61            validity,
62        })
63    }
64
65    /// Creates a new [`ListArray`].
66    ///
67    /// # Panics
68    /// This function panics iff:
69    /// * The last offset is not equal to the values' length.
70    /// * the validity's length is not equal to `offsets.len()`.
71    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
72    /// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
73    /// # Implementation
74    /// This function is `O(1)`
75    pub fn new(
76        dtype: ArrowDataType,
77        offsets: OffsetsBuffer<O>,
78        values: Box<dyn Array>,
79        validity: Option<Bitmap>,
80    ) -> Self {
81        Self::try_new(dtype, offsets, values, validity).unwrap()
82    }
83
84    /// Returns a new empty [`ListArray`].
85    pub fn new_empty(dtype: ArrowDataType) -> Self {
86        let values = new_empty_array(Self::get_child_type(&dtype).clone());
87        Self::new(dtype, OffsetsBuffer::default(), values, None)
88    }
89
90    /// Returns a new null [`ListArray`].
91    #[inline]
92    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
93        let child = Self::get_child_type(&dtype).clone();
94        Self::new(
95            dtype,
96            Offsets::new_zeroed(length).into(),
97            new_empty_array(child),
98            Some(Bitmap::new_zeroed(length)),
99        )
100    }
101}
102
103impl<O: Offset> ListArray<O> {
104    /// Slices this [`ListArray`].
105    /// # Panics
106    /// panics iff `offset + length > self.len()`
107    pub fn slice(&mut self, offset: usize, length: usize) {
108        assert!(
109            offset + length <= self.len(),
110            "the offset of the new Buffer cannot exceed the existing length"
111        );
112        unsafe { self.slice_unchecked(offset, length) }
113    }
114
115    /// Slices this [`ListArray`].
116    ///
117    /// # Safety
118    /// The caller must ensure that `offset + length < self.len()`.
119    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
120        self.validity = self
121            .validity
122            .take()
123            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
124            .filter(|bitmap| bitmap.unset_bits() > 0);
125        self.offsets.slice_unchecked(offset, length + 1);
126    }
127
128    impl_sliced!();
129    impl_mut_validity!();
130    impl_into_array!();
131
132    pub fn trim_to_normalized_offsets_recursive(&self) -> Self {
133        let offsets = self.offsets();
134        let values = self.values();
135
136        let first_idx = *offsets.first();
137        let len = offsets.range().to_usize();
138
139        if first_idx.to_usize() == 0 && values.len() == len {
140            return self.clone();
141        }
142
143        let offsets = if first_idx.to_usize() == 0 {
144            offsets.clone()
145        } else {
146            let v = offsets.iter().map(|x| *x - first_idx).collect::<Vec<_>>();
147            unsafe { OffsetsBuffer::<O>::new_unchecked(v.into()) }
148        };
149
150        let values = values.sliced(first_idx.to_usize(), len);
151
152        let values = match values.dtype() {
153            ArrowDataType::List(_) => {
154                let inner: &ListArray<i32> = values.as_ref().as_any().downcast_ref().unwrap();
155                Box::new(inner.trim_to_normalized_offsets_recursive()) as Box<dyn Array>
156            },
157            ArrowDataType::LargeList(_) => {
158                let inner: &ListArray<i64> = values.as_ref().as_any().downcast_ref().unwrap();
159                Box::new(inner.trim_to_normalized_offsets_recursive()) as Box<dyn Array>
160            },
161            _ => values,
162        };
163
164        assert_eq!(offsets.first().to_usize(), 0);
165        assert_eq!(values.len(), offsets.range().to_usize());
166
167        Self::new(
168            self.dtype().clone(),
169            offsets,
170            values,
171            self.validity().cloned(),
172        )
173    }
174}
175
176// Accessors
177impl<O: Offset> ListArray<O> {
178    /// Returns the length of this array
179    #[inline]
180    pub fn len(&self) -> usize {
181        self.offsets.len_proxy()
182    }
183
184    /// Returns the element at index `i`
185    /// # Panic
186    /// Panics iff `i >= self.len()`
187    #[inline]
188    pub fn value(&self, i: usize) -> Box<dyn Array> {
189        assert!(i < self.len());
190        // SAFETY: invariant of this function
191        unsafe { self.value_unchecked(i) }
192    }
193
194    /// Returns the element at index `i` as &str
195    ///
196    /// # Safety
197    /// Assumes that the `i < self.len`.
198    #[inline]
199    pub unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
200        // SAFETY: the invariant of the function
201        let (start, end) = self.offsets.start_end_unchecked(i);
202        let length = end - start;
203
204        // SAFETY: the invariant of the struct
205        self.values.sliced_unchecked(start, length)
206    }
207
208    /// The optional validity.
209    #[inline]
210    pub fn validity(&self) -> Option<&Bitmap> {
211        self.validity.as_ref()
212    }
213
214    /// The offsets [`Buffer`].
215    #[inline]
216    pub fn offsets(&self) -> &OffsetsBuffer<O> {
217        &self.offsets
218    }
219
220    /// The values.
221    #[inline]
222    pub fn values(&self) -> &Box<dyn Array> {
223        &self.values
224    }
225}
226
227impl<O: Offset> ListArray<O> {
228    /// Returns a default [`ArrowDataType`]: inner field is named "item" and is nullable
229    pub fn default_datatype(dtype: ArrowDataType) -> ArrowDataType {
230        let field = Box::new(Field::new(PlSmallStr::from_static("item"), dtype, true));
231        if O::IS_LARGE {
232            ArrowDataType::LargeList(field)
233        } else {
234            ArrowDataType::List(field)
235        }
236    }
237
238    /// Returns a the inner [`Field`]
239    /// # Panics
240    /// Panics iff the logical type is not consistent with this struct.
241    pub fn get_child_field(dtype: &ArrowDataType) -> &Field {
242        Self::try_get_child(dtype).unwrap()
243    }
244
245    /// Returns a the inner [`Field`]
246    /// # Errors
247    /// Panics iff the logical type is not consistent with this struct.
248    pub fn try_get_child(dtype: &ArrowDataType) -> PolarsResult<&Field> {
249        if O::IS_LARGE {
250            match dtype.to_logical_type() {
251                ArrowDataType::LargeList(child) => Ok(child.as_ref()),
252                _ => polars_bail!(ComputeError: "ListArray<i64> expects DataType::LargeList"),
253            }
254        } else {
255            match dtype.to_logical_type() {
256                ArrowDataType::List(child) => Ok(child.as_ref()),
257                _ => polars_bail!(ComputeError: "ListArray<i32> expects DataType::List"),
258            }
259        }
260    }
261
262    /// Returns a the inner [`ArrowDataType`]
263    /// # Panics
264    /// Panics iff the logical type is not consistent with this struct.
265    pub fn get_child_type(dtype: &ArrowDataType) -> &ArrowDataType {
266        Self::get_child_field(dtype).dtype()
267    }
268}
269
270impl<O: Offset> Array for ListArray<O> {
271    impl_common_array!();
272
273    fn validity(&self) -> Option<&Bitmap> {
274        self.validity.as_ref()
275    }
276
277    #[inline]
278    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
279        Box::new(self.clone().with_validity(validity))
280    }
281}
282
283impl<O: Offset> Splitable for ListArray<O> {
284    fn check_bound(&self, offset: usize) -> bool {
285        offset <= self.len()
286    }
287
288    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
289        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
290        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
291
292        (
293            Self {
294                dtype: self.dtype.clone(),
295                offsets: lhs_offsets,
296                validity: lhs_validity,
297                values: self.values.clone(),
298            },
299            Self {
300                dtype: self.dtype.clone(),
301                offsets: rhs_offsets,
302                validity: rhs_validity,
303                values: self.values.clone(),
304            },
305        )
306    }
307}