polars_compute/
cardinality.rs

1use arrow::array::{
2    Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,
3    Utf8Array, Utf8ViewArray,
4};
5use arrow::datatypes::PhysicalType;
6use arrow::types::Offset;
7use arrow::with_match_primitive_type_full;
8use polars_utils::total_ord::ToTotalOrd;
9
10use crate::hyperloglogplus::HyperLogLog;
11
12/// Get an estimate for the *cardinality* of the array (i.e. the number of unique values)
13///
14/// This is not currently implemented for nested types.
15pub fn estimate_cardinality(array: &dyn Array) -> usize {
16    if array.is_empty() {
17        return 0;
18    }
19
20    if array.null_count() == array.len() {
21        return 1;
22    }
23
24    // Estimate the cardinality with HyperLogLog
25    use PhysicalType as PT;
26    match array.dtype().to_physical_type() {
27        PT::Null => 1,
28
29        PT::Boolean => {
30            let mut cardinality = 0;
31
32            let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
33
34            cardinality += usize::from(array.has_nulls());
35
36            if let Some(unset_bits) = array.values().lazy_unset_bits() {
37                cardinality += 1 + usize::from(unset_bits != array.len());
38            } else {
39                cardinality += 2;
40            }
41
42            cardinality
43        },
44
45        PT::Primitive(primitive_type) => with_match_primitive_type_full!(primitive_type, |$T| {
46             let mut hll = HyperLogLog::new();
47
48             let array = array
49                 .as_any()
50                 .downcast_ref::<PrimitiveArray<$T>>()
51                 .unwrap();
52
53             if array.has_nulls() {
54                 for v in array.iter() {
55                     let v = v.copied().unwrap_or_default();
56                     hll.add(&v.to_total_ord());
57                 }
58             } else {
59                 for v in array.values_iter() {
60                     hll.add(&v.to_total_ord());
61                 }
62             }
63
64             hll.count()
65        }),
66        PT::FixedSizeBinary => {
67            let mut hll = HyperLogLog::new();
68
69            let array = array
70                .as_any()
71                .downcast_ref::<FixedSizeBinaryArray>()
72                .unwrap();
73
74            if array.has_nulls() {
75                for v in array.iter() {
76                    let v = v.unwrap_or_default();
77                    hll.add(v);
78                }
79            } else {
80                for v in array.values_iter() {
81                    hll.add(v);
82                }
83            }
84
85            hll.count()
86        },
87        PT::Binary => {
88            binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap())
89        },
90        PT::LargeBinary => {
91            binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap())
92        },
93        PT::Utf8 => binary_offset_array_estimate(
94            &array
95                .as_any()
96                .downcast_ref::<Utf8Array<i32>>()
97                .unwrap()
98                .to_binary(),
99        ),
100        PT::LargeUtf8 => binary_offset_array_estimate(
101            &array
102                .as_any()
103                .downcast_ref::<Utf8Array<i64>>()
104                .unwrap()
105                .to_binary(),
106        ),
107        PT::BinaryView => {
108            binary_view_array_estimate(array.as_any().downcast_ref::<BinaryViewArray>().unwrap())
109        },
110        PT::Utf8View => binary_view_array_estimate(
111            &array
112                .as_any()
113                .downcast_ref::<Utf8ViewArray>()
114                .unwrap()
115                .to_binview(),
116        ),
117        PT::List => unimplemented!(),
118        PT::FixedSizeList => unimplemented!(),
119        PT::LargeList => unimplemented!(),
120        PT::Struct => unimplemented!(),
121        PT::Union => unimplemented!(),
122        PT::Map => unimplemented!(),
123        PT::Dictionary(_) => unimplemented!(),
124    }
125}
126
127fn binary_offset_array_estimate<O: Offset>(array: &BinaryArray<O>) -> usize {
128    let mut hll = HyperLogLog::new();
129
130    if array.has_nulls() {
131        for v in array.iter() {
132            let v = v.unwrap_or_default();
133            hll.add(v);
134        }
135    } else {
136        for v in array.values_iter() {
137            hll.add(v);
138        }
139    }
140
141    hll.count()
142}
143
144fn binary_view_array_estimate(array: &BinaryViewArray) -> usize {
145    let mut hll = HyperLogLog::new();
146
147    if array.has_nulls() {
148        for v in array.iter() {
149            let v = v.unwrap_or_default();
150            hll.add(v);
151        }
152    } else {
153        for v in array.values_iter() {
154            hll.add(v);
155        }
156    }
157
158    hll.count()
159}