polars_compute/
cardinality.rs1use arrow::array::{
2 Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,
3 Utf8Array, Utf8ViewArray,
4};
5use arrow::datatypes::PhysicalType;
6use arrow::types::Offset;
7use arrow::with_match_primitive_type_full;
8use polars_utils::total_ord::ToTotalOrd;
9
10use crate::hyperloglogplus::HyperLogLog;
11
12pub fn estimate_cardinality(array: &dyn Array) -> usize {
16 if array.is_empty() {
17 return 0;
18 }
19
20 if array.null_count() == array.len() {
21 return 1;
22 }
23
24 use PhysicalType as PT;
26 match array.dtype().to_physical_type() {
27 PT::Null => 1,
28
29 PT::Boolean => {
30 let mut cardinality = 0;
31
32 let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
33
34 cardinality += usize::from(array.has_nulls());
35
36 if let Some(unset_bits) = array.values().lazy_unset_bits() {
37 cardinality += 1 + usize::from(unset_bits != array.len());
38 } else {
39 cardinality += 2;
40 }
41
42 cardinality
43 },
44
45 PT::Primitive(primitive_type) => with_match_primitive_type_full!(primitive_type, |$T| {
46 let mut hll = HyperLogLog::new();
47
48 let array = array
49 .as_any()
50 .downcast_ref::<PrimitiveArray<$T>>()
51 .unwrap();
52
53 if array.has_nulls() {
54 for v in array.iter() {
55 let v = v.copied().unwrap_or_default();
56 hll.add(&v.to_total_ord());
57 }
58 } else {
59 for v in array.values_iter() {
60 hll.add(&v.to_total_ord());
61 }
62 }
63
64 hll.count()
65 }),
66 PT::FixedSizeBinary => {
67 let mut hll = HyperLogLog::new();
68
69 let array = array
70 .as_any()
71 .downcast_ref::<FixedSizeBinaryArray>()
72 .unwrap();
73
74 if array.has_nulls() {
75 for v in array.iter() {
76 let v = v.unwrap_or_default();
77 hll.add(v);
78 }
79 } else {
80 for v in array.values_iter() {
81 hll.add(v);
82 }
83 }
84
85 hll.count()
86 },
87 PT::Binary => {
88 binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap())
89 },
90 PT::LargeBinary => {
91 binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap())
92 },
93 PT::Utf8 => binary_offset_array_estimate(
94 &array
95 .as_any()
96 .downcast_ref::<Utf8Array<i32>>()
97 .unwrap()
98 .to_binary(),
99 ),
100 PT::LargeUtf8 => binary_offset_array_estimate(
101 &array
102 .as_any()
103 .downcast_ref::<Utf8Array<i64>>()
104 .unwrap()
105 .to_binary(),
106 ),
107 PT::BinaryView => {
108 binary_view_array_estimate(array.as_any().downcast_ref::<BinaryViewArray>().unwrap())
109 },
110 PT::Utf8View => binary_view_array_estimate(
111 &array
112 .as_any()
113 .downcast_ref::<Utf8ViewArray>()
114 .unwrap()
115 .to_binview(),
116 ),
117 PT::List => unimplemented!(),
118 PT::FixedSizeList => unimplemented!(),
119 PT::LargeList => unimplemented!(),
120 PT::Struct => unimplemented!(),
121 PT::Union => unimplemented!(),
122 PT::Map => unimplemented!(),
123 PT::Dictionary(_) => unimplemented!(),
124 }
125}
126
127fn binary_offset_array_estimate<O: Offset>(array: &BinaryArray<O>) -> usize {
128 let mut hll = HyperLogLog::new();
129
130 if array.has_nulls() {
131 for v in array.iter() {
132 let v = v.unwrap_or_default();
133 hll.add(v);
134 }
135 } else {
136 for v in array.values_iter() {
137 hll.add(v);
138 }
139 }
140
141 hll.count()
142}
143
144fn binary_view_array_estimate(array: &BinaryViewArray) -> usize {
145 let mut hll = HyperLogLog::new();
146
147 if array.has_nulls() {
148 for v in array.iter() {
149 let v = v.unwrap_or_default();
150 hll.add(v);
151 }
152 } else {
153 for v in array.values_iter() {
154 hll.add(v);
155 }
156 }
157
158 hll.count()
159}