polars_compute/filter/
mod.rs

1//! Contains operators to filter arrays such as [`filter`].
2mod boolean;
3mod primitive;
4mod scalar;
5
6#[cfg(all(target_arch = "x86_64", feature = "simd"))]
7mod avx512;
8
9use arrow::array::growable::make_growable;
10use arrow::array::{new_empty_array, Array, BinaryViewArray, BooleanArray, PrimitiveArray};
11use arrow::bitmap::utils::SlicesIterator;
12use arrow::bitmap::Bitmap;
13use arrow::with_match_primitive_type_full;
14pub use boolean::filter_boolean_kernel;
15
16pub fn filter(array: &dyn Array, mask: &BooleanArray) -> Box<dyn Array> {
17    assert_eq!(array.len(), mask.len());
18
19    // Treat null mask values as false.
20    if let Some(validities) = mask.validity() {
21        let combined_mask = mask.values() & validities;
22        filter_with_bitmap(array, &combined_mask)
23    } else {
24        filter_with_bitmap(array, mask.values())
25    }
26}
27
28pub fn filter_with_bitmap(array: &dyn Array, mask: &Bitmap) -> Box<dyn Array> {
29    // Many filters involve filtering values in a subsection of the array. When we trim the leading
30    // and trailing filtered items, we can close in on those items and not have to perform and
31    // thinking about those. The overhead for when there are no leading or trailing filtered values
32    // is very minimal: only a clone of the mask and the array.
33    //
34    // This also allows dispatching to the fast paths way, way, way more often.
35    let mut mask = mask.clone();
36    let leading_zeros = mask.take_leading_zeros();
37    mask.take_trailing_zeros();
38    let array = array.sliced(leading_zeros, mask.len());
39
40    let mask = &mask;
41    let array = array.as_ref();
42
43    // Fast-path: completely empty or completely full mask.
44    let false_count = mask.unset_bits();
45    if false_count == mask.len() {
46        return new_empty_array(array.dtype().clone());
47    }
48    if false_count == 0 {
49        return array.to_boxed();
50    }
51
52    use arrow::datatypes::PhysicalType::*;
53    match array.dtype().to_physical_type() {
54        Primitive(primitive) => with_match_primitive_type_full!(primitive, |$T| {
55            let array: &PrimitiveArray<$T> = array.as_any().downcast_ref().unwrap();
56            let (values, validity) = primitive::filter_values_and_validity::<$T>(array.values(), array.validity(), mask);
57            Box::new(PrimitiveArray::from_vec(values).with_validity(validity))
58        }),
59        Boolean => {
60            let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
61            let (values, validity) =
62                boolean::filter_bitmap_and_validity(array.values(), array.validity(), mask);
63            BooleanArray::new(array.dtype().clone(), values, validity).boxed()
64        },
65        BinaryView => {
66            let array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
67            let views = array.views();
68            let validity = array.validity();
69            let (views, validity) = primitive::filter_values_and_validity(views, validity, mask);
70            unsafe {
71                BinaryViewArray::new_unchecked_unknown_md(
72                    array.dtype().clone(),
73                    views.into(),
74                    array.data_buffers().clone(),
75                    validity,
76                    Some(array.total_buffer_len()),
77                )
78            }
79            .boxed()
80        },
81        // Should go via BinaryView
82        Utf8View => {
83            unreachable!()
84        },
85        _ => {
86            let iter = SlicesIterator::new(mask);
87            let mut mutable = make_growable(&[array], false, iter.slots());
88            // SAFETY:
89            // we are in bounds
90            iter.for_each(|(start, len)| unsafe { mutable.extend(0, start, len) });
91            mutable.as_box()
92        },
93    }
94}