// polars_core/frame/chunks.rs

1use arrow::record_batch::RecordBatch;
2use rayon::prelude::*;
3
4use crate::prelude::*;
5use crate::utils::{_split_offsets, accumulate_dataframes_vertical_unchecked, split_df_as_ref};
6use crate::POOL;
7
8impl TryFrom<(RecordBatch, &ArrowSchema)> for DataFrame {
9    type Error = PolarsError;
10
11    fn try_from(arg: (RecordBatch, &ArrowSchema)) -> PolarsResult<DataFrame> {
12        let columns: PolarsResult<Vec<Column>> = arg
13            .0
14            .columns()
15            .iter()
16            .zip(arg.1.iter_values())
17            .map(|(arr, field)| Series::try_from((field, arr.clone())).map(Column::from))
18            .collect();
19
20        DataFrame::new(columns?)
21    }
22}
23
24impl DataFrame {
25    pub fn split_chunks(&mut self) -> impl Iterator<Item = DataFrame> + '_ {
26        self.align_chunks_par();
27
28        (0..self.first_col_n_chunks()).map(move |i| unsafe {
29            let columns = self
30                .get_columns()
31                .iter()
32                .map(|column| column.as_materialized_series().select_chunk(i))
33                .map(Column::from)
34                .collect::<Vec<_>>();
35
36            let height = Self::infer_height(&columns);
37            DataFrame::new_no_checks(height, columns)
38        })
39    }
40
41    pub fn split_chunks_by_n(self, n: usize, parallel: bool) -> Vec<DataFrame> {
42        let split = _split_offsets(self.height(), n);
43
44        let split_fn = |(offset, len)| self.slice(offset as i64, len);
45
46        if parallel {
47            // Parallel so that null_counts run in parallel
48            POOL.install(|| split.into_par_iter().map(split_fn).collect())
49        } else {
50            split.into_iter().map(split_fn).collect()
51        }
52    }
53}
54
/// Split DataFrame into chunks in preparation for writing. The chunks have a
/// maximum number of rows per chunk to ensure reasonable memory efficiency when
/// reading the resulting file, and a minimum size per chunk to ensure
/// reasonable performance when writing.
///
/// Returns `Cow::Borrowed` when the frame can be written as-is (fewer rows
/// than one row group), otherwise an owned, re-chunked copy.
pub fn chunk_df_for_writing(
    df: &mut DataFrame,
    row_group_size: usize,
) -> PolarsResult<std::borrow::Cow<DataFrame>> {
    // ensures all chunks are aligned.
    df.align_chunks_par();

    // Accumulate many small chunks to the row group size.
    // See: #16403
    // Only the first few chunk lengths are probed: if the first 5 are all
    // smaller than the row group size the frame is treated as fragmented
    // and its chunks are coalesced into row-group-sized pieces.
    if !df.get_columns().is_empty()
        && df.get_columns()[0]
            .as_materialized_series()
            .chunk_lengths()
            .take(5)
            .all(|len| len < row_group_size)
    {
        // Merge everything gathered in `scratch` into one single-chunk frame
        // and push it onto `new_chunks`; leaves `scratch` empty.
        fn finish(scratch: &mut Vec<DataFrame>, new_chunks: &mut Vec<DataFrame>) {
            let mut new = accumulate_dataframes_vertical_unchecked(scratch.drain(..));
            new.as_single_chunk_par();
            new_chunks.push(new);
        }

        let mut new_chunks = Vec::with_capacity(df.first_col_n_chunks()); // upper limit;
        let mut scratch = vec![];
        // Rows still needed to fill the current row group.
        let mut remaining = row_group_size;

        for df in df.split_chunks() {
            remaining = remaining.saturating_sub(df.height());
            scratch.push(df);

            // Current row group is full (or overshot): flush it.
            if remaining == 0 {
                remaining = row_group_size;
                finish(&mut scratch, &mut new_chunks);
            }
        }
        // Flush any leftover rows as a final (possibly undersized) chunk.
        if !scratch.is_empty() {
            finish(&mut scratch, &mut new_chunks);
        }
        return Ok(std::borrow::Cow::Owned(
            accumulate_dataframes_vertical_unchecked(new_chunks),
        ));
    }

    // Chunks are large: split the frame into `height / row_group_size` slices.
    let n_splits = df.height() / row_group_size;
    let result = if n_splits > 0 {
        let mut splits = split_df_as_ref(df, n_splits, false);

        for df in splits.iter_mut() {
            // If the chunks are small enough, writing many small chunks
            // leads to slow writing performance, so in that case we
            // merge them.
            let n_chunks = df.first_col_n_chunks();
            if n_chunks > 1 && (df.estimated_size() / n_chunks < 128 * 1024) {
                df.as_single_chunk_par();
            }
        }

        std::borrow::Cow::Owned(accumulate_dataframes_vertical_unchecked(splits))
    } else {
        // Fewer rows than a single row group: no re-chunking needed.
        std::borrow::Cow::Borrowed(df)
    };
    Ok(result)
}
121}