polars_compute/cast/
binary_to.rs

1use std::sync::Arc;
2
3use arrow::array::*;
4use arrow::buffer::Buffer;
5use arrow::datatypes::ArrowDataType;
6use arrow::offset::{Offset, Offsets};
7use arrow::types::NativeType;
8use polars_error::PolarsResult;
9
10use super::CastOptionsImpl;
11
12pub(super) trait Parse {
13    fn parse(val: &[u8]) -> Option<Self>
14    where
15        Self: Sized;
16}
17
18macro_rules! impl_parse {
19    ($primitive_type:ident) => {
20        impl Parse for $primitive_type {
21            fn parse(val: &[u8]) -> Option<Self> {
22                atoi_simd::parse_skipped(val).ok()
23            }
24        }
25    };
26}
27impl_parse!(i8);
28impl_parse!(i16);
29impl_parse!(i32);
30impl_parse!(i64);
31
32impl_parse!(u8);
33impl_parse!(u16);
34impl_parse!(u32);
35impl_parse!(u64);
36
37#[cfg(feature = "dtype-i128")]
38impl_parse!(i128);
39
40impl Parse for f32 {
41    fn parse(val: &[u8]) -> Option<Self>
42    where
43        Self: Sized,
44    {
45        fast_float2::parse(val).ok()
46    }
47}
48impl Parse for f64 {
49    fn parse(val: &[u8]) -> Option<Self>
50    where
51        Self: Sized,
52    {
53        fast_float2::parse(val).ok()
54    }
55}
56
57/// Conversion of binary
58pub fn binary_to_large_binary(
59    from: &BinaryArray<i32>,
60    to_dtype: ArrowDataType,
61) -> BinaryArray<i64> {
62    let values = from.values().clone();
63    BinaryArray::<i64>::new(
64        to_dtype,
65        from.offsets().into(),
66        values,
67        from.validity().cloned(),
68    )
69}
70
71/// Conversion of binary
72pub fn binary_large_to_binary(
73    from: &BinaryArray<i64>,
74    to_dtype: ArrowDataType,
75) -> PolarsResult<BinaryArray<i32>> {
76    let values = from.values().clone();
77    let offsets = from.offsets().try_into()?;
78    Ok(BinaryArray::<i32>::new(
79        to_dtype,
80        offsets,
81        values,
82        from.validity().cloned(),
83    ))
84}
85
86/// Conversion to utf8
87pub fn binary_to_utf8<O: Offset>(
88    from: &BinaryArray<O>,
89    to_dtype: ArrowDataType,
90) -> PolarsResult<Utf8Array<O>> {
91    Utf8Array::<O>::try_new(
92        to_dtype,
93        from.offsets().clone(),
94        from.values().clone(),
95        from.validity().cloned(),
96    )
97}
98
99/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
100pub(super) fn binary_to_primitive<O: Offset, T>(
101    from: &BinaryArray<O>,
102    to: &ArrowDataType,
103) -> PrimitiveArray<T>
104where
105    T: NativeType + Parse,
106{
107    let iter = from.iter().map(|x| x.and_then::<T, _>(|x| T::parse(x)));
108
109    PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
110}
111
112pub(super) fn binary_to_primitive_dyn<O: Offset, T>(
113    from: &dyn Array,
114    to: &ArrowDataType,
115    options: CastOptionsImpl,
116) -> PolarsResult<Box<dyn Array>>
117where
118    T: NativeType + Parse,
119{
120    let from = from.as_any().downcast_ref().unwrap();
121    if options.partial {
122        unimplemented!()
123    } else {
124        Ok(Box::new(binary_to_primitive::<O, T>(from, to)))
125    }
126}
127
128/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.
129/// # Errors
130/// This function errors if the maximum key is smaller than the number of distinct elements
131/// in the array.
132pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(
133    from: &BinaryArray<O>,
134) -> PolarsResult<DictionaryArray<K>> {
135    let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::new();
136    array.reserve(from.len());
137    array.try_extend(from.iter())?;
138
139    Ok(array.into())
140}
141
142pub(super) fn binary_to_dictionary_dyn<O: Offset, K: DictionaryKey>(
143    from: &dyn Array,
144) -> PolarsResult<Box<dyn Array>> {
145    let values = from.as_any().downcast_ref().unwrap();
146    binary_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)
147}
148
149fn fixed_size_to_offsets<O: Offset>(values_len: usize, fixed_size: usize) -> Offsets<O> {
150    let offsets = (0..(values_len + 1))
151        .step_by(fixed_size)
152        .map(|v| O::from_as_usize(v))
153        .collect();
154    // SAFETY:
155    // * every element is `>= 0`
156    // * element at position `i` is >= than element at position `i-1`.
157    unsafe { Offsets::new_unchecked(offsets) }
158}
159
160/// Conversion of `FixedSizeBinary` to `Binary`.
161pub fn fixed_size_binary_binary<O: Offset>(
162    from: &FixedSizeBinaryArray,
163    to_dtype: ArrowDataType,
164) -> BinaryArray<O> {
165    let values = from.values().clone();
166    let offsets = fixed_size_to_offsets(values.len(), from.size());
167    BinaryArray::<O>::new(to_dtype, offsets.into(), values, from.validity().cloned())
168}
169
170pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray {
171    let datatype = <[u8] as ViewType>::DATA_TYPE;
172
173    // Fast path: all the views are inlineable
174    if from.size() <= View::MAX_INLINE_SIZE as usize {
175        // @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that
176        // prevents it from properly SIMD-ing this loop. It insists on memcpying while it should
177        // know that the size is really small. Dispatching over the `from.size()` and making it
178        // constant does make loop SIMD, but it does not actually speed anything up and the code it
179        // generates is still horrible.
180        //
181        // This is really slow, and I don't think it has to be.
182
183        // SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before
184        let mut views = Vec::new();
185        View::extend_with_inlinable_strided(
186            &mut views,
187            from.values().as_slice(),
188            from.size() as u8,
189        );
190        let views = Buffer::from(views);
191        return BinaryViewArray::try_new(datatype, views, Arc::default(), from.validity().cloned())
192            .unwrap();
193    }
194
195    const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize;
196
197    let size = from.size();
198    let num_bytes = from.len() * size;
199    let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER);
200    assert!(num_buffers < u32::MAX as usize);
201
202    let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size;
203    // This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division
204    let split_point = num_elements_per_buffer * size;
205
206    // This is zero-copy for the buffer since split just increases the data since
207    let mut buffer = from.values().clone();
208    let mut buffers = Vec::with_capacity(num_buffers);
209
210    if let Some(num_buffers) = num_buffers.checked_sub(1) {
211        for _ in 0..num_buffers {
212            let slice;
213            (slice, buffer) = buffer.split_at(split_point);
214            buffers.push(slice);
215        }
216        buffers.push(buffer);
217    }
218
219    let mut iter = from.values_iter();
220    let iter = iter.by_ref();
221    let mut views = Vec::with_capacity(from.len());
222    for buffer_idx in 0..num_buffers {
223        views.extend(
224            iter.take(num_elements_per_buffer)
225                .enumerate()
226                .map(|(i, slice)| {
227                    // SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before
228                    unsafe {
229                        View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32)
230                    }
231                }),
232        );
233    }
234    let views = views.into();
235
236    BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap()
237}
238
239/// Conversion of binary
240pub fn binary_to_list<O: Offset>(from: &BinaryArray<O>, to_dtype: ArrowDataType) -> ListArray<O> {
241    let values = from.values().clone();
242    let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None);
243    ListArray::<O>::new(
244        to_dtype,
245        from.offsets().clone(),
246        values.boxed(),
247        from.validity().cloned(),
248    )
249}