// polars_arrow/compute/aggregate/memory.rs

use crate::array::*;
use crate::bitmap::Bitmap;
use crate::datatypes::PhysicalType;
pub use crate::types::PrimitiveType;
use crate::{match_integer_type, with_match_primitive_type_full};

6fn validity_size(validity: Option<&Bitmap>) -> usize {
7    validity.as_ref().map(|b| b.as_slice().0.len()).unwrap_or(0)
8}
9
/// Estimates the byte size of an offset-based array (Binary/Utf8 style).
///
/// * `$array` - expression yielding something with `as_any()`.
/// * `$ty` - concrete array type to downcast to.
/// * `$o` - offset integer type, used to size the offsets buffer.
///
/// The result is the visible span of the values, plus the offsets, plus the
/// validity bitmap. Panics if the downcast to `$ty` fails.
macro_rules! dyn_binary {
    ($array:expr, $ty:ty, $o:ty) => {{
        let typed = $array.as_any().downcast_ref::<$ty>().unwrap();
        let offsets = typed.offsets().buffer();

        // In case of Binary/Utf8/List the offsets are sliced, not the values
        // buffer, so the visible values are bounded by the first and last offset.
        let first = offsets[0] as usize;
        let last = offsets[offsets.len() - 1] as usize;

        let values_bytes = last - first;
        let offsets_bytes = offsets.len() * size_of::<$o>();

        values_bytes + offsets_bytes + validity_size(typed.validity())
    }};
}
25
26fn binview_size<T: ViewType + ?Sized>(array: &BinaryViewArrayGeneric<T>) -> usize {
27    // We choose the optimal usage as data can be shared across buffers.
28    // If we would sum all buffers we overestimate memory usage and trigger OOC when not needed.
29    array.total_bytes_len()
30}
31
32/// Returns the total (heap) allocated size of the array in bytes.
33/// # Implementation
34/// This estimation is the sum of the size of its buffers, validity, including nested arrays.
35/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
36/// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
37///
38/// When an array is sliced, its allocated size remains constant because the buffer unchanged.
39/// However, this function will yield a smaller number. This is because this function returns
40/// the visible size of the buffer, not its total capacity.
41///
42/// FFI buffers are included in this estimation.
43pub fn estimated_bytes_size(array: &dyn Array) -> usize {
44    use PhysicalType::*;
45    match array.dtype().to_physical_type() {
46        Null => 0,
47        Boolean => {
48            let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
49            array.values().as_slice().0.len() + validity_size(array.validity())
50        },
51        Primitive(PrimitiveType::DaysMs) => {
52            let array = array.as_any().downcast_ref::<DaysMsArray>().unwrap();
53            array.values().len() * size_of::<i32>() * 2 + validity_size(array.validity())
54        },
55        Primitive(primitive) => with_match_primitive_type_full!(primitive, |$T| {
56            let array = array
57                .as_any()
58                .downcast_ref::<PrimitiveArray<$T>>()
59                .unwrap();
60
61            array.values().len() * size_of::<$T>() + validity_size(array.validity())
62        }),
63        Binary => dyn_binary!(array, BinaryArray<i32>, i32),
64        FixedSizeBinary => {
65            let array = array
66                .as_any()
67                .downcast_ref::<FixedSizeBinaryArray>()
68                .unwrap();
69            array.values().len() + validity_size(array.validity())
70        },
71        LargeBinary => dyn_binary!(array, BinaryArray<i64>, i64),
72        Utf8 => dyn_binary!(array, Utf8Array<i32>, i32),
73        LargeUtf8 => dyn_binary!(array, Utf8Array<i64>, i64),
74        List => {
75            let array = array.as_any().downcast_ref::<ListArray<i32>>().unwrap();
76            estimated_bytes_size(array.values().as_ref())
77                + array.offsets().len_proxy() * size_of::<i32>()
78                + validity_size(array.validity())
79        },
80        FixedSizeList => {
81            let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
82            estimated_bytes_size(array.values().as_ref()) + validity_size(array.validity())
83        },
84        LargeList => {
85            let array = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
86            estimated_bytes_size(array.values().as_ref())
87                + array.offsets().len_proxy() * size_of::<i64>()
88                + validity_size(array.validity())
89        },
90        Struct => {
91            let array = array.as_any().downcast_ref::<StructArray>().unwrap();
92            array
93                .values()
94                .iter()
95                .map(|x| x.as_ref())
96                .map(estimated_bytes_size)
97                .sum::<usize>()
98                + validity_size(array.validity())
99        },
100        Union => {
101            let array = array.as_any().downcast_ref::<UnionArray>().unwrap();
102            let types = array.types().len() * size_of::<i8>();
103            let offsets = array
104                .offsets()
105                .as_ref()
106                .map(|x| x.len() * size_of::<i32>())
107                .unwrap_or_default();
108            let fields = array
109                .fields()
110                .iter()
111                .map(|x| x.as_ref())
112                .map(estimated_bytes_size)
113                .sum::<usize>();
114            types + offsets + fields
115        },
116        Dictionary(key_type) => match_integer_type!(key_type, |$T| {
117            let array = array
118                .as_any()
119                .downcast_ref::<DictionaryArray<$T>>()
120                .unwrap();
121            estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
122        }),
123        Utf8View => binview_size::<str>(array.as_any().downcast_ref().unwrap()),
124        BinaryView => binview_size::<[u8]>(array.as_any().downcast_ref().unwrap()),
125        Map => {
126            let array = array.as_any().downcast_ref::<MapArray>().unwrap();
127            let offsets = array.offsets().len_proxy() * size_of::<i32>();
128            offsets + estimated_bytes_size(array.field().as_ref()) + validity_size(array.validity())
129        },
130    }
131}