polars_arrow/legacy/kernels/
sort_partition.rs

1use std::fmt::Debug;
2
3use polars_utils::IdxSize;
4
5use crate::types::NativeType;
6
7/// Find partition indexes such that every partition contains unique groups.
8fn find_partition_points<T>(values: &[T], n: usize, descending: bool) -> Vec<usize>
9where
10    T: Debug + NativeType,
11{
12    let len = values.len();
13    if n > len {
14        return find_partition_points(values, len / 2, descending);
15    }
16    if n < 2 {
17        return vec![];
18    }
19    let chunk_size = len / n;
20
21    let mut partition_points = Vec::with_capacity(n + 1);
22
23    let mut start_idx = 0;
24    loop {
25        let end_idx = start_idx + chunk_size;
26        if end_idx >= len {
27            break;
28        }
29        // first take that partition as a slice
30        // and then find the location where the group of the latest value starts
31        let part = &values[start_idx..end_idx];
32
33        let latest_val = values[end_idx];
34        let idx = if descending {
35            part.partition_point(|v| v.tot_gt(&latest_val))
36        } else {
37            part.partition_point(|v| v.tot_lt(&latest_val))
38        };
39
40        if idx != 0 {
41            partition_points.push(idx + start_idx)
42        }
43
44        start_idx += chunk_size;
45    }
46    partition_points
47}
48
49pub fn create_clean_partitions<T>(values: &[T], n: usize, descending: bool) -> Vec<&[T]>
50where
51    T: Debug + NativeType,
52{
53    let part_idx = find_partition_points(values, n, descending);
54    let mut out = Vec::with_capacity(n + 1);
55
56    let mut start_idx = 0_usize;
57    for end_idx in part_idx {
58        if end_idx != start_idx {
59            out.push(&values[start_idx..end_idx]);
60            start_idx = end_idx;
61        }
62    }
63    let latest = &values[start_idx..];
64    if !latest.is_empty() {
65        out.push(latest)
66    }
67
68    out
69}
70
71pub fn partition_to_groups_amortized<T>(
72    values: &[T],
73    first_group_offset: IdxSize,
74    nulls_first: bool,
75    offset: IdxSize,
76    out: &mut Vec<[IdxSize; 2]>,
77) where
78    T: Debug + NativeType,
79{
80    if let Some(mut first) = values.first() {
81        out.clear();
82        if nulls_first && first_group_offset > 0 {
83            out.push([0, first_group_offset])
84        }
85
86        let mut first_idx = if nulls_first { first_group_offset } else { 0 } + offset;
87
88        for val in values {
89            // new group reached
90            if val.tot_ne(first) {
91                let val_ptr = val as *const T;
92                let first_ptr = first as *const T;
93
94                // SAFETY:
95                // all pointers suffice the invariants
96                let len = unsafe { val_ptr.offset_from(first_ptr) } as IdxSize;
97                out.push([first_idx, len]);
98                first_idx += len;
99                first = val;
100            }
101        }
102        // add last group
103        if nulls_first {
104            out.push([
105                first_idx,
106                values.len() as IdxSize + first_group_offset - first_idx,
107            ]);
108        } else {
109            out.push([first_idx, values.len() as IdxSize - (first_idx - offset)]);
110        }
111
112        if !nulls_first && first_group_offset > 0 {
113            out.push([values.len() as IdxSize + offset, first_group_offset])
114        }
115    }
116}
117
118/// Take a clean-partitioned slice and return the groups slices
119/// With clean-partitioned we mean that the slice contains all groups and are not spilled to another partition.
120///
121/// `first_group_offset` can be used to add insert the `null` values group.
122pub fn partition_to_groups<T>(
123    values: &[T],
124    first_group_offset: IdxSize,
125    nulls_first: bool,
126    offset: IdxSize,
127) -> Vec<[IdxSize; 2]>
128where
129    T: Debug + NativeType,
130{
131    if values.is_empty() {
132        return vec![];
133    }
134    let mut out = Vec::with_capacity(values.len() / 10);
135    partition_to_groups_amortized(values, first_group_offset, nulls_first, offset, &mut out);
136    out
137}
138
139#[cfg(test)]
140mod test {
141    use super::*;
142
143    #[test]
144    fn test_partition_points() {
145        let values = &[1, 3, 3, 3, 3, 5, 5, 5, 9, 9, 10];
146
147        assert_eq!(find_partition_points(values, 4, false), &[1, 5, 8, 10]);
148        assert_eq!(
149            partition_to_groups(values, 0, true, 0),
150            &[[0, 1], [1, 4], [5, 3], [8, 2], [10, 1]]
151        );
152        assert_eq!(
153            partition_to_groups(values, 5, true, 0),
154            &[[0, 5], [5, 1], [6, 4], [10, 3], [13, 2], [15, 1]]
155        );
156        assert_eq!(
157            partition_to_groups(values, 5, false, 0),
158            &[[0, 1], [1, 4], [5, 3], [8, 2], [10, 1], [11, 5]]
159        );
160    }
161}