polars_compute/cast/
utf8_to.rs1use std::sync::Arc;
2
3use arrow::array::*;
4use arrow::buffer::Buffer;
5use arrow::datatypes::ArrowDataType;
6use arrow::offset::Offset;
7use arrow::types::NativeType;
8use polars_error::PolarsResult;
9use polars_utils::vec::PushUnchecked;
10
/// Date-time format pattern named after RFC 3339: `YYYY-MM-DD`, a literal `T`,
/// `HH:MM:SS`, a fractional-seconds part (`%.f`) and a colon-separated UTC
/// offset (`%:z`). Visible to the parent module only.
///
/// NOTE(review): the specifiers look like chrono's `strftime` syntax, and the
/// constant is not used in this part of the file; confirm against the call sites.
11pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
12
13pub(super) fn utf8_to_dictionary_dyn<O: Offset, K: DictionaryKey>(
14 from: &dyn Array,
15) -> PolarsResult<Box<dyn Array>> {
16 let values = from.as_any().downcast_ref().unwrap();
17 utf8_to_dictionary::<O, K>(values).map(|x| Box::new(x) as Box<dyn Array>)
18}
19
20pub fn utf8_to_dictionary<O: Offset, K: DictionaryKey>(
25 from: &Utf8Array<O>,
26) -> PolarsResult<DictionaryArray<K>> {
27 let mut array = MutableDictionaryArray::<K, MutableUtf8Array<O>>::new();
28 array.reserve(from.len());
29 array.try_extend(from.iter())?;
30
31 Ok(array.into())
32}
33
34pub fn utf8_to_large_utf8(from: &Utf8Array<i32>) -> Utf8Array<i64> {
36 let dtype = Utf8Array::<i64>::default_dtype();
37 let validity = from.validity().cloned();
38 let values = from.values().clone();
39
40 let offsets = from.offsets().into();
41 unsafe { Utf8Array::<i64>::new_unchecked(dtype, offsets, values, validity) }
43}
44
45pub fn utf8_large_to_utf8(from: &Utf8Array<i64>) -> PolarsResult<Utf8Array<i32>> {
47 let dtype = Utf8Array::<i32>::default_dtype();
48 let validity = from.validity().cloned();
49 let values = from.values().clone();
50 let offsets = from.offsets().try_into()?;
51
52 Ok(unsafe { Utf8Array::<i32>::new_unchecked(dtype, offsets, values, validity) })
54}
55
56pub fn utf8_to_binary<O: Offset>(from: &Utf8Array<O>, to_dtype: ArrowDataType) -> BinaryArray<O> {
58 BinaryArray::<O>::new(
60 to_dtype,
61 from.offsets().clone(),
62 from.values().clone(),
63 from.validity().cloned(),
64 )
65}
66
// Integer type that bounds the byte offset a view may store into its data
// buffer: `binary_to_binview` starts a new buffer when `OffsetType::try_from`
// rejects an offset, and `truncate_buffer` caps buffer lengths at twice
// `OffsetType::MAX`. Outside of tests this is `u32`.
67#[cfg(not(test))]
69type OffsetType = u32;
70
// Under `cfg(test)` the alias shrinks to `i8`, presumably so that small inputs
// already overflow it and exercise the buffer-splitting path (see the
// `overflowing_utf8_to_binview` test, which expects four data buffers).
71#[cfg(test)]
73type OffsetType = i8;
74
75fn truncate_buffer(buf: &Buffer<u8>) -> Buffer<u8> {
78 buf.clone().sliced(
80 0,
81 std::cmp::min(buf.len(), ((OffsetType::MAX as u64) * 2) as usize),
82 )
83}
84
85pub fn binary_to_binview<O: Offset>(arr: &BinaryArray<O>) -> BinaryViewArray {
86 #[cfg(not(debug_assertions))]
88 let _ = std::mem::transmute::<OffsetType, u32>;
89
90 let mut views = Vec::with_capacity(arr.len());
91 let mut uses_buffer = false;
92
93 let mut base_buffer = arr.values().clone();
94 let mut base_ptr = base_buffer.as_ptr() as usize;
96
97 let mut buffer_idx = 0_u32;
99
100 let mut buffers = vec![truncate_buffer(&base_buffer)];
103
104 for bytes in arr.values_iter() {
105 let len: u32 = bytes
106 .len()
107 .try_into()
108 .expect("max string/binary length exceeded");
109
110 let mut payload = [0; 16];
111 payload[0..4].copy_from_slice(&len.to_le_bytes());
112
113 if len <= 12 {
114 payload[4..4 + bytes.len()].copy_from_slice(bytes);
115 } else {
116 uses_buffer = true;
117
118 unsafe { payload[4..8].copy_from_slice(bytes.get_unchecked(0..4)) };
120 payload[0..4].copy_from_slice(&len.to_le_bytes());
121
122 let current_bytes_ptr = bytes.as_ptr() as usize;
123 let offset = current_bytes_ptr - base_ptr;
124
125 if let Ok(offset) = OffsetType::try_from(offset) {
127 #[allow(clippy::unnecessary_cast)]
128 let offset = offset as u32;
129 payload[12..16].copy_from_slice(&offset.to_le_bytes());
130 payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
131 } else {
132 let len = base_buffer.len() - offset;
133
134 base_buffer = base_buffer.clone().sliced(offset, len);
136 base_ptr = base_buffer.as_ptr() as usize;
137
138 buffers.push(truncate_buffer(&base_buffer));
140 buffer_idx = buffer_idx.checked_add(1).expect("max buffers exceeded");
141
142 let offset = 0u32;
143 payload[12..16].copy_from_slice(&offset.to_le_bytes());
144 payload[8..12].copy_from_slice(&buffer_idx.to_le_bytes());
145 }
146 }
147
148 let value = View::from_le_bytes(payload);
149 unsafe { views.push_unchecked(value) };
150 }
151 let buffers = if uses_buffer {
152 Arc::from(buffers)
153 } else {
154 Arc::from([])
155 };
156 unsafe {
157 BinaryViewArray::new_unchecked_unknown_md(
158 ArrowDataType::BinaryView,
159 views.into(),
160 buffers,
161 arr.validity().cloned(),
162 None,
163 )
164 }
165}
166
167pub fn utf8_to_utf8view<O: Offset>(arr: &Utf8Array<O>) -> Utf8ViewArray {
168 unsafe { binary_to_binview(&arr.to_binary()).to_utf8view_unchecked() }
169}
170
#[cfg(test)]
mod test {
    use super::*;

    /// With the tiny test `OffsetType`, the long values force the conversion to
    /// spill into several data buffers; the values must still round-trip.
    #[test]
    fn overflowing_utf8_to_binview() {
        // Longer than 12 bytes, so it cannot be stored inline in a view.
        const LONG: &str =
            "lksafjdlkakjslkjsafkjdalkjfalkdsalkjfaslkfjlkakdsjfkajfksdajfkasjdflkasjdf";

        let values = [
            LONG, "123", LONG, LONG, LONG, "234", LONG, LONG, LONG, LONG, "324",
        ];
        let array = Utf8Array::<i64>::from_slice(values);

        let view = utf8_to_utf8view(&array);
        assert_eq!(view.data_buffers().len(), 4);

        let round_tripped: Vec<_> = view.values_iter().collect();
        assert_eq!(round_tripped, values);
    }
}