polars_core/chunked_array/
from.rs

1use super::*;
2
3#[allow(clippy::all)]
4fn from_chunks_list_dtype(chunks: &mut Vec<ArrayRef>, dtype: DataType) -> DataType {
5    // ensure we don't get List<null>
6    let dtype = if let Some(arr) = chunks.get(0) {
7        DataType::from_arrow_dtype(arr.dtype())
8    } else {
9        dtype
10    };
11
12    match dtype {
13        #[cfg(feature = "dtype-categorical")]
14        // arrow dictionaries are not nested as dictionaries, but only by their keys, so we must
15        // change the list-value array to the keys and store the dictionary values in the datatype.
16        // if a global string cache is set, we also must modify the keys.
17        DataType::List(inner)
18            if matches!(
19                *inner,
20                DataType::Categorical(None, _) | DataType::Enum(None, _)
21            ) =>
22        {
23            let array = concatenate_owned_unchecked(chunks).unwrap();
24            let list_arr = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
25            let values_arr = list_arr.values();
26            let cat = unsafe {
27                Series::_try_from_arrow_unchecked(
28                    PlSmallStr::EMPTY,
29                    vec![values_arr.clone()],
30                    values_arr.dtype(),
31                )
32                .unwrap()
33            };
34
35            // we nest only the physical representation
36            // the mapping is still in our rev-map
37            let arrow_dtype = ListArray::<i64>::default_datatype(ArrowDataType::UInt32);
38            let new_array = ListArray::new(
39                arrow_dtype,
40                list_arr.offsets().clone(),
41                cat.array_ref(0).clone(),
42                list_arr.validity().cloned(),
43            );
44            chunks.clear();
45            chunks.push(Box::new(new_array));
46            DataType::List(Box::new(cat.dtype().clone()))
47        },
48        #[cfg(all(feature = "dtype-array", feature = "dtype-categorical"))]
49        DataType::Array(inner, width)
50            if matches!(
51                *inner,
52                DataType::Categorical(None, _) | DataType::Enum(None, _)
53            ) =>
54        {
55            let array = concatenate_owned_unchecked(chunks).unwrap();
56            let list_arr = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
57            let values_arr = list_arr.values();
58            let cat = unsafe {
59                Series::_try_from_arrow_unchecked(
60                    PlSmallStr::EMPTY,
61                    vec![values_arr.clone()],
62                    values_arr.dtype(),
63                )
64                .unwrap()
65            };
66
67            // we nest only the physical representation
68            // the mapping is still in our rev-map
69            let arrow_dtype = FixedSizeListArray::default_datatype(ArrowDataType::UInt32, width);
70            let new_array = FixedSizeListArray::new(
71                arrow_dtype,
72                values_arr.len(),
73                cat.array_ref(0).clone(),
74                list_arr.validity().cloned(),
75            );
76            chunks.clear();
77            chunks.push(Box::new(new_array));
78            DataType::Array(Box::new(cat.dtype().clone()), width)
79        },
80        _ => dtype,
81    }
82}
83
84impl<T, A> From<A> for ChunkedArray<T>
85where
86    T: PolarsDataType<Array = A>,
87    A: Array,
88{
89    fn from(arr: A) -> Self {
90        Self::with_chunk(PlSmallStr::EMPTY, arr)
91    }
92}
93
94impl<T> ChunkedArray<T>
95where
96    T: PolarsDataType,
97{
98    pub fn with_chunk<A>(name: PlSmallStr, arr: A) -> Self
99    where
100        A: Array,
101        T: PolarsDataType<Array = A>,
102    {
103        unsafe { Self::from_chunks(name, vec![Box::new(arr)]) }
104    }
105
106    pub fn with_chunk_like<A>(ca: &Self, arr: A) -> Self
107    where
108        A: Array,
109        T: PolarsDataType<Array = A>,
110    {
111        Self::from_chunk_iter_like(ca, std::iter::once(arr))
112    }
113
114    pub fn from_chunk_iter<I>(name: PlSmallStr, iter: I) -> Self
115    where
116        I: IntoIterator,
117        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
118        <I as IntoIterator>::Item: Array,
119    {
120        let chunks = iter
121            .into_iter()
122            .map(|x| Box::new(x) as Box<dyn Array>)
123            .collect();
124        unsafe { Self::from_chunks(name, chunks) }
125    }
126
127    pub fn from_chunk_iter_like<I>(ca: &Self, iter: I) -> Self
128    where
129        I: IntoIterator,
130        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
131        <I as IntoIterator>::Item: Array,
132    {
133        let chunks = iter
134            .into_iter()
135            .map(|x| Box::new(x) as Box<dyn Array>)
136            .collect();
137        unsafe {
138            Self::from_chunks_and_dtype_unchecked(ca.name().clone(), chunks, ca.dtype().clone())
139        }
140    }
141
142    pub fn try_from_chunk_iter<I, A, E>(name: PlSmallStr, iter: I) -> Result<Self, E>
143    where
144        I: IntoIterator<Item = Result<A, E>>,
145        T: PolarsDataType<Array = A>,
146        A: Array,
147    {
148        let chunks: Result<_, _> = iter
149            .into_iter()
150            .map(|x| Ok(Box::new(x?) as Box<dyn Array>))
151            .collect();
152        unsafe { Ok(Self::from_chunks(name, chunks?)) }
153    }
154
155    pub(crate) fn from_chunk_iter_and_field<I>(field: Arc<Field>, chunks: I) -> Self
156    where
157        I: IntoIterator,
158        T: PolarsDataType<Array = <I as IntoIterator>::Item>,
159        <I as IntoIterator>::Item: Array,
160    {
161        assert_eq!(
162            std::mem::discriminant(&T::get_dtype()),
163            std::mem::discriminant(&field.dtype)
164        );
165
166        let mut length = 0;
167        let mut null_count = 0;
168        let chunks = chunks
169            .into_iter()
170            .map(|x| {
171                length += x.len();
172                null_count += x.null_count();
173                Box::new(x) as Box<dyn Array>
174            })
175            .collect();
176
177        unsafe { ChunkedArray::new_with_dims(field, chunks, length, null_count) }
178    }
179
180    /// Create a new [`ChunkedArray`] from existing chunks.
181    ///
182    /// # Safety
183    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
184    pub unsafe fn from_chunks(name: PlSmallStr, mut chunks: Vec<ArrayRef>) -> Self {
185        let dtype = match T::get_dtype() {
186            dtype @ DataType::List(_) => from_chunks_list_dtype(&mut chunks, dtype),
187            #[cfg(feature = "dtype-array")]
188            dtype @ DataType::Array(_, _) => from_chunks_list_dtype(&mut chunks, dtype),
189            #[cfg(feature = "dtype-struct")]
190            dtype @ DataType::Struct(_) => from_chunks_list_dtype(&mut chunks, dtype),
191            dt => dt,
192        };
193        Self::from_chunks_and_dtype(name, chunks, dtype)
194    }
195
196    /// # Safety
197    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
198    pub unsafe fn with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
199        ChunkedArray::new_with_compute_len(self.field.clone(), chunks)
200    }
201
202    /// Create a new [`ChunkedArray`] from existing chunks.
203    ///
204    /// # Safety
205    ///
206    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
207    pub unsafe fn from_chunks_and_dtype(
208        name: PlSmallStr,
209        chunks: Vec<ArrayRef>,
210        dtype: DataType,
211    ) -> Self {
212        // assertions in debug mode
213        // that check if the data types in the arrays are as expected
214        #[cfg(debug_assertions)]
215        {
216            if !chunks.is_empty() && !chunks[0].is_empty() && dtype.is_primitive() {
217                assert_eq!(chunks[0].dtype(), &dtype.to_arrow(CompatLevel::newest()))
218            }
219        }
220
221        Self::from_chunks_and_dtype_unchecked(name, chunks, dtype)
222    }
223
224    /// Create a new [`ChunkedArray`] from existing chunks.
225    ///
226    /// # Safety
227    ///
228    /// The Arrow datatype of all chunks must match the [`PolarsDataType`] `T`.
229    pub(crate) unsafe fn from_chunks_and_dtype_unchecked(
230        name: PlSmallStr,
231        chunks: Vec<ArrayRef>,
232        dtype: DataType,
233    ) -> Self {
234        let field = Arc::new(Field::new(name, dtype));
235        ChunkedArray::new_with_compute_len(field, chunks)
236    }
237
238    pub fn full_null_like(ca: &Self, length: usize) -> Self {
239        let chunks = std::iter::once(T::Array::full_null(
240            length,
241            ca.dtype().to_arrow(CompatLevel::newest()),
242        ));
243        Self::from_chunk_iter_like(ca, chunks)
244    }
245}
246
247impl<T> ChunkedArray<T>
248where
249    T: PolarsNumericType,
250{
251    /// Create a new ChunkedArray by taking ownership of the Vec. This operation is zero copy.
252    pub fn from_vec(name: PlSmallStr, v: Vec<T::Native>) -> Self {
253        Self::with_chunk(name, to_primitive::<T>(v, None))
254    }
255
256    /// Create a new ChunkedArray from a Vec and a validity mask.
257    pub fn from_vec_validity(
258        name: PlSmallStr,
259        values: Vec<T::Native>,
260        buffer: Option<Bitmap>,
261    ) -> Self {
262        let arr = to_array::<T>(values, buffer);
263        ChunkedArray::new_with_compute_len(Arc::new(Field::new(name, T::get_dtype())), vec![arr])
264    }
265
266    /// Create a temporary [`ChunkedArray`] from a slice.
267    ///
268    /// # Safety
269    /// The lifetime will be bound to the lifetime of the slice.
270    /// This will not be checked by the borrowchecker.
271    pub unsafe fn mmap_slice(name: PlSmallStr, values: &[T::Native]) -> Self {
272        Self::with_chunk(name, arrow::ffi::mmap::slice(values))
273    }
274}
275
276impl BooleanChunked {
277    /// Create a temporary [`ChunkedArray`] from a slice.
278    ///
279    /// # Safety
280    /// The lifetime will be bound to the lifetime of the slice.
281    /// This will not be checked by the borrowchecker.
282    pub unsafe fn mmap_slice(name: PlSmallStr, values: &[u8], offset: usize, len: usize) -> Self {
283        let arr = arrow::ffi::mmap::bitmap(values, offset, len).unwrap();
284        Self::with_chunk(name, arr)
285    }
286}
287
288impl<'a, T> From<&'a ChunkedArray<T>> for Vec<Option<T::Physical<'a>>>
289where
290    T: PolarsDataType,
291{
292    fn from(ca: &'a ChunkedArray<T>) -> Self {
293        let mut out = Vec::with_capacity(ca.len());
294        for arr in ca.downcast_iter() {
295            out.extend(arr.iter())
296        }
297        out
298    }
299}
300impl From<StringChunked> for Vec<Option<String>> {
301    fn from(ca: StringChunked) -> Self {
302        ca.iter().map(|opt| opt.map(|s| s.to_string())).collect()
303    }
304}
305
306impl From<BooleanChunked> for Vec<Option<bool>> {
307    fn from(ca: BooleanChunked) -> Self {
308        let mut out = Vec::with_capacity(ca.len());
309        for arr in ca.downcast_iter() {
310            out.extend(arr.iter())
311        }
312        out
313    }
314}