polars_arrow/array/growable/
dictionary.rs

1use std::sync::Arc;
2
3use super::{make_growable, Growable};
4use crate::array::growable::utils::{extend_validity, prepare_validity};
5use crate::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray};
6use crate::bitmap::BitmapBuilder;
7use crate::datatypes::ArrowDataType;
8
9/// Concrete [`Growable`] for the [`DictionaryArray`].
10/// # Implementation
11/// This growable does not perform collision checks and instead concatenates
12/// the values of each [`DictionaryArray`] one after the other.
13pub struct GrowableDictionary<'a, K: DictionaryKey> {
14    dtype: ArrowDataType,
15    keys: Vec<&'a PrimitiveArray<K>>,
16    key_values: Vec<K>,
17    validity: Option<BitmapBuilder>,
18    offsets: Vec<usize>,
19    values: Box<dyn Array>,
20}
21
22fn concatenate_values<K: DictionaryKey>(
23    arrays_keys: &[&PrimitiveArray<K>],
24    arrays_values: &[&dyn Array],
25    capacity: usize,
26) -> (Box<dyn Array>, Vec<usize>) {
27    let mut mutable = make_growable(arrays_values, false, capacity);
28    let mut offsets = Vec::with_capacity(arrays_keys.len() + 1);
29    offsets.push(0);
30    for (i, values) in arrays_values.iter().enumerate() {
31        unsafe { mutable.extend(i, 0, values.len()) };
32        offsets.push(offsets[i] + values.len());
33    }
34    (mutable.as_box(), offsets)
35}
36
37impl<'a, T: DictionaryKey> GrowableDictionary<'a, T> {
38    /// Creates a new [`GrowableDictionary`] bound to `arrays` with a pre-allocated `capacity`.
39    /// # Panics
40    /// If `arrays` is empty.
41    pub fn new(arrays: &[&'a DictionaryArray<T>], mut use_validity: bool, capacity: usize) -> Self {
42        let dtype = arrays[0].dtype().clone();
43
44        // if any of the arrays has nulls, insertions from any array requires setting bits
45        // as there is at least one array with nulls.
46        if arrays.iter().any(|array| array.null_count() > 0) {
47            use_validity = true;
48        };
49
50        let arrays_keys = arrays.iter().map(|array| array.keys()).collect::<Vec<_>>();
51        let arrays_values = arrays
52            .iter()
53            .map(|array| array.values().as_ref())
54            .collect::<Vec<_>>();
55
56        let (values, offsets) = concatenate_values(&arrays_keys, &arrays_values, capacity);
57
58        Self {
59            dtype,
60            offsets,
61            values,
62            keys: arrays_keys,
63            key_values: Vec::with_capacity(capacity),
64            validity: prepare_validity(use_validity, capacity),
65        }
66    }
67
68    #[inline]
69    fn to(&mut self) -> DictionaryArray<T> {
70        let validity = self.validity.take();
71        let key_values = std::mem::take(&mut self.key_values);
72
73        #[cfg(debug_assertions)]
74        {
75            crate::array::specification::check_indexes(&key_values, self.values.len()).unwrap();
76        }
77        let keys = PrimitiveArray::<T>::new(
78            T::PRIMITIVE.into(),
79            key_values.into(),
80            validity.map(|v| v.freeze()),
81        );
82
83        // SAFETY: the invariant of this struct ensures that this is up-held
84        unsafe {
85            DictionaryArray::<T>::try_new_unchecked(self.dtype.clone(), keys, self.values.clone())
86                .unwrap()
87        }
88    }
89}
90
91impl<'a, T: DictionaryKey> Growable<'a> for GrowableDictionary<'a, T> {
92    #[inline]
93    unsafe fn extend(&mut self, index: usize, start: usize, len: usize) {
94        let keys_array = *self.keys.get_unchecked(index);
95        extend_validity(&mut self.validity, keys_array, start, len);
96
97        let values = &keys_array.values().get_unchecked(start..start + len);
98        let offset = self.offsets.get_unchecked(index);
99        self.key_values.extend(
100            values
101                .iter()
102                // `.unwrap_or(0)` because this operation does not check for null values, which may contain any key.
103                .map(|x| {
104                    let x: usize = offset + (*x).try_into().unwrap_or(0);
105                    let x: T = match x.try_into() {
106                        Ok(key) => key,
107                        // todo: convert this to an error.
108                        Err(_) => {
109                            panic!("The maximum key is too small")
110                        },
111                    };
112                    x
113                }),
114        );
115    }
116
117    #[inline]
118    fn len(&self) -> usize {
119        self.key_values.len()
120    }
121
122    #[inline]
123    fn extend_validity(&mut self, additional: usize) {
124        self.key_values
125            .resize(self.key_values.len() + additional, T::default());
126        if let Some(validity) = &mut self.validity {
127            validity.extend_constant(additional, false);
128        }
129    }
130
131    #[inline]
132    fn as_arc(&mut self) -> Arc<dyn Array> {
133        Arc::new(self.to())
134    }
135
136    #[inline]
137    fn as_box(&mut self) -> Box<dyn Array> {
138        Box::new(self.to())
139    }
140}
141
142impl<'a, T: DictionaryKey> From<GrowableDictionary<'a, T>> for DictionaryArray<T> {
143    #[inline]
144    fn from(mut val: GrowableDictionary<'a, T>) -> Self {
145        val.to()
146    }
147}