polars_core/frame/mod.rs
1//! DataFrame module.
2use std::sync::OnceLock;
3use std::{mem, ops};
4
5use arrow::datatypes::ArrowSchemaRef;
6use polars_row::ArrayRef;
7use polars_schema::schema::debug_ensure_matching_schema_names;
8use polars_utils::itertools::Itertools;
9use rayon::prelude::*;
10
11use crate::chunked_array::flags::StatisticsFlags;
12#[cfg(feature = "algorithm_group_by")]
13use crate::chunked_array::ops::unique::is_unique_helper;
14use crate::prelude::*;
15#[cfg(feature = "row_hash")]
16use crate::utils::split_df;
17use crate::utils::{slice_offsets, try_get_supertype, Container, NoNull};
18use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
19
20#[cfg(feature = "dataframe_arithmetic")]
21mod arithmetic;
22mod chunks;
23pub use chunks::chunk_df_for_writing;
24pub mod column;
25pub mod explode;
26mod from;
27#[cfg(feature = "algorithm_group_by")]
28pub mod group_by;
29pub(crate) mod horizontal;
30#[cfg(any(feature = "rows", feature = "object"))]
31pub mod row;
32mod top_k;
33mod upstream_traits;
34
35use arrow::record_batch::{RecordBatch, RecordBatchT};
36use polars_utils::pl_str::PlSmallStr;
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39use strum_macros::IntoStaticStr;
40
41#[cfg(feature = "row_hash")]
42use crate::hashing::_df_rows_to_hashes_threaded_vertical;
43use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};
44use crate::series::IsSorted;
45use crate::POOL;
46
47#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
48#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
49#[strum(serialize_all = "snake_case")]
50pub enum UniqueKeepStrategy {
51 /// Keep the first unique row.
52 First,
53 /// Keep the last unique row.
54 Last,
55 /// Keep None of the unique rows.
56 None,
57 /// Keep any of the unique rows
58 /// This allows more optimizations
59 #[default]
60 Any,
61}
62
63fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
64where
65 F: for<'a> FnMut(&'a T) -> &'a str,
66{
67 // Always unique.
68 if items.len() <= 1 {
69 return Ok(());
70 }
71
72 if items.len() <= 4 {
73 // Too small to be worth spawning a hashmap for, this is at most 6 comparisons.
74 for i in 0..items.len() - 1 {
75 let name = get_name(&items[i]);
76 for other in items.iter().skip(i + 1) {
77 if name == get_name(other) {
78 polars_bail!(duplicate = name);
79 }
80 }
81 }
82 } else {
83 let mut names = PlHashSet::with_capacity(items.len());
84 for item in items {
85 let name = get_name(item);
86 if !names.insert(name) {
87 polars_bail!(duplicate = name);
88 }
89 }
90 }
91 Ok(())
92}
93
94/// A contiguous growable collection of `Series` that have the same length.
95///
96/// ## Use declarations
97///
98/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
99///
100/// ```rust
101/// use polars_core::prelude::*; // if the crate polars-core is used directly
102/// // use polars::prelude::*; if the crate polars is used
103/// ```
104///
105/// # Initialization
106/// ## Default
107///
108/// A `DataFrame` can be initialized empty:
109///
110/// ```rust
111/// # use polars_core::prelude::*;
112/// let df = DataFrame::default();
113/// assert!(df.is_empty());
114/// ```
115///
116/// ## Wrapping a `Vec<Series>`
117///
118/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
119///
120/// ```rust
121/// # use polars_core::prelude::*;
122/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
123/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
124///
125/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
126/// ```
127///
128/// ## Using a macro
129///
130/// The [`df!`] macro is a convenient method:
131///
132/// ```rust
133/// # use polars_core::prelude::*;
134/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
135/// "Color" => ["Red", "Yellow", "Green"]);
136/// ```
137///
138/// ## Using a CSV file
139///
140/// See the `polars_io::csv::CsvReader`.
141///
142/// # Indexing
143/// ## By a number
144///
145/// The `Index<usize>` is implemented for the `DataFrame`.
146///
147/// ```rust
148/// # use polars_core::prelude::*;
149/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
150/// "Color" => ["Red", "Yellow", "Green"])?;
151///
152/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
153/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
154/// # Ok::<(), PolarsError>(())
155/// ```
156///
157/// ## By a `Series` name
158///
159/// ```rust
160/// # use polars_core::prelude::*;
161/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
162/// "Color" => ["Red", "Yellow", "Green"])?;
163///
164/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
165/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
166/// # Ok::<(), PolarsError>(())
167/// ```
168#[derive(Clone)]
169pub struct DataFrame {
170 height: usize,
171 // invariant: columns[i].len() == height for each 0 >= i > columns.len()
172 pub(crate) columns: Vec<Column>,
173
174 /// A cached schema. This might not give correct results if the DataFrame was modified in place
175 /// between schema and reading.
176 cached_schema: OnceLock<SchemaRef>,
177}
178
179impl DataFrame {
180 pub fn clear_schema(&mut self) {
181 self.cached_schema = OnceLock::new();
182 }
183
184 #[inline]
185 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
186 self.columns.iter().map(Column::as_materialized_series)
187 }
188
189 #[inline]
190 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
191 self.columns.par_iter().map(Column::as_materialized_series)
192 }
193
194 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
195 ///
196 /// # Implementation
197 /// This estimation is the sum of the size of its buffers, validity, including nested arrays.
198 /// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the
199 /// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.
200 ///
201 /// When an array is sliced, its allocated size remains constant because the buffer unchanged.
202 /// However, this function will yield a smaller number. This is because this function returns
203 /// the visible size of the buffer, not its total capacity.
204 ///
205 /// FFI buffers are included in this estimation.
206 pub fn estimated_size(&self) -> usize {
207 self.columns.iter().map(Column::estimated_size).sum()
208 }
209
210 // Reduce monomorphization.
211 fn try_apply_columns(
212 &self,
213 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
214 ) -> PolarsResult<Vec<Column>> {
215 self.columns.iter().map(func).collect()
216 }
217 // Reduce monomorphization.
218 pub fn _apply_columns(&self, func: &(dyn Fn(&Column) -> Column)) -> Vec<Column> {
219 self.columns.iter().map(func).collect()
220 }
221 // Reduce monomorphization.
222 fn try_apply_columns_par(
223 &self,
224 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
225 ) -> PolarsResult<Vec<Column>> {
226 POOL.install(|| self.columns.par_iter().map(func).collect())
227 }
228 // Reduce monomorphization.
229 pub fn _apply_columns_par(
230 &self,
231 func: &(dyn Fn(&Column) -> Column + Send + Sync),
232 ) -> Vec<Column> {
233 POOL.install(|| self.columns.par_iter().map(func).collect())
234 }
235
236 /// Get the index of the column.
237 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
238 self.get_column_index(name)
239 .ok_or_else(|| polars_err!(col_not_found = name))
240 }
241
242 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
243 polars_ensure!(
244 self.columns.iter().all(|s| s.name().as_str() != name),
245 Duplicate: "column with name {:?} is already present in the DataFrame", name
246 );
247 Ok(())
248 }
249
250 /// Reserve additional slots into the chunks of the series.
251 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
252 for s in &mut self.columns {
253 if let Column::Series(s) = s {
254 // SAFETY:
255 // do not modify the data, simply resize.
256 unsafe { s.chunks_mut().reserve(additional) }
257 }
258 }
259 }
260
261 /// Create a DataFrame from a Vector of Series.
262 ///
263 /// # Example
264 ///
265 /// ```
266 /// # use polars_core::prelude::*;
267 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
268 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
269 ///
270 /// let df = DataFrame::new(vec![s0, s1])?;
271 /// # Ok::<(), PolarsError>(())
272 /// ```
273 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
274 ensure_names_unique(&columns, |s| s.name().as_str())?;
275
276 let Some(fst) = columns.first() else {
277 return Ok(DataFrame {
278 height: 0,
279 columns,
280 cached_schema: OnceLock::new(),
281 });
282 };
283
284 let height = fst.len();
285 for col in &columns[1..] {
286 polars_ensure!(
287 col.len() == height,
288 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
289 columns[0].name(), height, col.name(), col.len()
290 );
291 }
292
293 Ok(DataFrame {
294 height,
295 columns,
296 cached_schema: OnceLock::new(),
297 })
298 }
299
300 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
301 /// columns to match the other columns.
302 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
303 // The length of the longest non-unit length column determines the
304 // broadcast length. If all columns are unit-length the broadcast length
305 // is one.
306 let broadcast_len = columns
307 .iter()
308 .map(|s| s.len())
309 .filter(|l| *l != 1)
310 .max()
311 .unwrap_or(1);
312 Self::new_with_broadcast_len(columns, broadcast_len)
313 }
314
315 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
316 /// columns to broadcast_len.
317 pub fn new_with_broadcast_len(
318 columns: Vec<Column>,
319 broadcast_len: usize,
320 ) -> PolarsResult<Self> {
321 ensure_names_unique(&columns, |s| s.name().as_str())?;
322 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
323 }
324
325 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
326 /// columns to match the other columns.
327 ///
328 /// # Safety
329 /// Does not check that the column names are unique (which they must be).
330 pub unsafe fn new_with_broadcast_no_namecheck(
331 mut columns: Vec<Column>,
332 broadcast_len: usize,
333 ) -> PolarsResult<Self> {
334 for col in &mut columns {
335 // Length not equal to the broadcast len, needs broadcast or is an error.
336 let len = col.len();
337 if len != broadcast_len {
338 if len != 1 {
339 let name = col.name().to_owned();
340 let extra_info =
341 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
342 format!(" (matching column '{}')", c.name())
343 } else {
344 String::new()
345 };
346 polars_bail!(
347 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
348 );
349 }
350 *col = col.new_from_index(0, broadcast_len);
351 }
352 }
353
354 let length = if columns.is_empty() { 0 } else { broadcast_len };
355
356 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
357 }
358
359 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
360 ///
361 /// # Example
362 ///
363 /// ```rust
364 /// use polars_core::prelude::DataFrame;
365 /// static EMPTY: DataFrame = DataFrame::empty();
366 /// ```
367 pub const fn empty() -> Self {
368 DataFrame {
369 height: 0,
370 columns: vec![],
371 cached_schema: OnceLock::new(),
372 }
373 }
374
375 /// Create an empty `DataFrame` with empty columns as per the `schema`.
376 pub fn empty_with_schema(schema: &Schema) -> Self {
377 let cols = schema
378 .iter()
379 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
380 .collect();
381 unsafe { DataFrame::new_no_checks(0, cols) }
382 }
383
384 /// Create an empty `DataFrame` with empty columns as per the `schema`.
385 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
386 let cols = schema
387 .iter_values()
388 .map(|fld| {
389 Column::from(Series::new_empty(
390 fld.name.clone(),
391 &(DataType::from_arrow_field(fld)),
392 ))
393 })
394 .collect();
395 unsafe { DataFrame::new_no_checks(0, cols) }
396 }
397
398 /// Create a new `DataFrame` with the given schema, only containing nulls.
399 pub fn full_null(schema: &Schema, height: usize) -> Self {
400 let columns = schema
401 .iter_fields()
402 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
403 .collect();
404 unsafe { DataFrame::new_no_checks(height, columns) }
405 }
406
407 /// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.
408 ///
409 /// # Example
410 ///
411 /// ```rust
412 /// # use polars_core::prelude::*;
413 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
414 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
415 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
416 ///
417 /// assert_eq!(df.pop(), Some(s2));
418 /// assert_eq!(df.pop(), Some(s1));
419 /// assert_eq!(df.pop(), None);
420 /// assert!(df.is_empty());
421 /// # Ok::<(), PolarsError>(())
422 /// ```
423 pub fn pop(&mut self) -> Option<Column> {
424 self.clear_schema();
425
426 self.columns.pop()
427 }
428
429 /// Add a new column at index 0 that counts the rows.
430 ///
431 /// # Example
432 ///
433 /// ```
434 /// # use polars_core::prelude::*;
435 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
436 /// assert_eq!(df1.shape(), (4, 1));
437 ///
438 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
439 /// assert_eq!(df2.shape(), (4, 2));
440 /// println!("{}", df2);
441 ///
442 /// # Ok::<(), PolarsError>(())
443 /// ```
444 ///
445 /// Output:
446 ///
447 /// ```text
448 /// shape: (4, 2)
449 /// +-----+----------+
450 /// | Id | Name |
451 /// | --- | --- |
452 /// | u32 | str |
453 /// +=====+==========+
454 /// | 0 | James |
455 /// +-----+----------+
456 /// | 1 | Mary |
457 /// +-----+----------+
458 /// | 2 | John |
459 /// +-----+----------+
460 /// | 3 | Patricia |
461 /// +-----+----------+
462 /// ```
463 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
464 let mut columns = Vec::with_capacity(self.columns.len() + 1);
465 let offset = offset.unwrap_or(0);
466
467 let mut ca = IdxCa::from_vec(
468 name,
469 (offset..(self.height() as IdxSize) + offset).collect(),
470 );
471 ca.set_sorted_flag(IsSorted::Ascending);
472 columns.push(ca.into_series().into());
473
474 columns.extend_from_slice(&self.columns);
475 DataFrame::new(columns)
476 }
477
478 /// Add a row index column in place.
479 pub fn with_row_index_mut(&mut self, name: PlSmallStr, offset: Option<IdxSize>) -> &mut Self {
480 let offset = offset.unwrap_or(0);
481 let mut ca = IdxCa::from_vec(
482 name,
483 (offset..(self.height() as IdxSize) + offset).collect(),
484 );
485 ca.set_sorted_flag(IsSorted::Ascending);
486
487 self.clear_schema();
488 self.columns.insert(0, ca.into_series().into());
489 self
490 }
491
492 /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
493 /// `Series`.
494 ///
495 /// Calculates the height from the first column or `0` if no columns are given.
496 ///
497 /// # Safety
498 ///
499 /// It is the callers responsibility to uphold the contract of all `Series`
500 /// having an equal length and a unique name, if not this may panic down the line.
501 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
502 let height = columns.first().map_or(0, Column::len);
503 unsafe { Self::new_no_checks(height, columns) }
504 }
505
506 /// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
507 /// `Series`.
508 ///
509 /// It is advised to use [DataFrame::new] in favor of this method.
510 ///
511 /// # Safety
512 ///
513 /// It is the callers responsibility to uphold the contract of all `Series`
514 /// having an equal length and a unique name, if not this may panic down the line.
515 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
516 if cfg!(debug_assertions) {
517 ensure_names_unique(&columns, |s| s.name().as_str()).unwrap();
518
519 for col in &columns {
520 assert_eq!(col.len(), height);
521 }
522 }
523
524 unsafe { Self::_new_no_checks_impl(height, columns) }
525 }
526
527 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
528 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
529 /// constructed with this method is generally highly unsafe and should not be long-lived.
530 #[allow(clippy::missing_safety_doc)]
531 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
532 DataFrame {
533 height,
534 columns,
535 cached_schema: OnceLock::new(),
536 }
537 }
538
539 /// Create a new `DataFrame` but does not check the length of the `Series`,
540 /// only check for duplicates.
541 ///
542 /// It is advised to use [DataFrame::new] in favor of this method.
543 ///
544 /// # Safety
545 ///
546 /// It is the callers responsibility to uphold the contract of all `Series`
547 /// having an equal length, if not this may panic down the line.
548 pub unsafe fn new_no_length_checks(columns: Vec<Column>) -> PolarsResult<DataFrame> {
549 ensure_names_unique(&columns, |s| s.name().as_str())?;
550
551 Ok(if cfg!(debug_assertions) {
552 Self::new(columns).unwrap()
553 } else {
554 let height = Self::infer_height(&columns);
555 DataFrame {
556 height,
557 columns,
558 cached_schema: OnceLock::new(),
559 }
560 })
561 }
562
563 /// Shrink the capacity of this DataFrame to fit its length.
564 pub fn shrink_to_fit(&mut self) {
565 // Don't parallelize this. Memory overhead
566 for s in &mut self.columns {
567 s.shrink_to_fit();
568 }
569 }
570
571 /// Aggregate all the chunks in the DataFrame to a single chunk.
572 pub fn as_single_chunk(&mut self) -> &mut Self {
573 // Don't parallelize this. Memory overhead
574 for s in &mut self.columns {
575 if let Column::Series(s) = s {
576 *s = s.rechunk().into();
577 }
578 }
579 self
580 }
581
582 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
583 /// This may lead to more peak memory consumption.
584 pub fn as_single_chunk_par(&mut self) -> &mut Self {
585 if self.columns.iter().any(|c| c.n_chunks() > 1) {
586 self.columns = self._apply_columns_par(&|s| s.rechunk());
587 }
588 self
589 }
590
591 /// Rechunks all columns to only have a single chunk.
592 pub fn rechunk_mut(&mut self) {
593 // SAFETY: We never adjust the length or names of the columns.
594 let columns = unsafe { self.get_columns_mut() };
595
596 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
597 *col = col.rechunk();
598 }
599 }
600
601 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
602 pub fn rechunk_to_record_batch(
603 self,
604 compat_level: CompatLevel,
605 ) -> RecordBatchT<Box<dyn Array>> {
606 let height = self.height();
607
608 let (schema, arrays) = self
609 .columns
610 .into_iter()
611 .map(|col| {
612 let mut series = col.take_materialized_series();
613 // Rechunk to one chunk if necessary
614 if series.n_chunks() > 1 {
615 series = series.rechunk();
616 }
617 (
618 series.field().to_arrow(compat_level),
619 series.to_arrow(0, compat_level),
620 )
621 })
622 .collect();
623
624 RecordBatchT::new(height, Arc::new(schema), arrays)
625 }
626
627 /// Returns true if the chunks of the columns do not align and re-chunking should be done
628 pub fn should_rechunk(&self) -> bool {
629 // Fast check. It is also needed for correctness, as code below doesn't check if the number
630 // of chunks is equal.
631 if !self
632 .get_columns()
633 .iter()
634 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
635 .all_equal()
636 {
637 return true;
638 }
639
640 // From here we check chunk lengths.
641 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
642 match chunk_lengths.next() {
643 None => false,
644 Some(first_column_chunk_lengths) => {
645 // Fast Path for single Chunk Series
646 if first_column_chunk_lengths.size_hint().0 == 1 {
647 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
648 }
649 // Always rechunk if we have more chunks than rows.
650 // except when we have an empty df containing a single chunk
651 let height = self.height();
652 let n_chunks = first_column_chunk_lengths.size_hint().0;
653 if n_chunks > height && !(height == 0 && n_chunks == 1) {
654 return true;
655 }
656 // Slow Path for multi Chunk series
657 let v: Vec<_> = first_column_chunk_lengths.collect();
658 for cl in chunk_lengths {
659 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
660 return true;
661 }
662 }
663 false
664 },
665 }
666 }
667
668 /// Ensure all the chunks in the [`DataFrame`] are aligned.
669 pub fn align_chunks_par(&mut self) -> &mut Self {
670 if self.should_rechunk() {
671 self.as_single_chunk_par()
672 } else {
673 self
674 }
675 }
676
677 pub fn align_chunks(&mut self) -> &mut Self {
678 if self.should_rechunk() {
679 self.as_single_chunk()
680 } else {
681 self
682 }
683 }
684
685 /// Get the [`DataFrame`] schema.
686 ///
687 /// # Example
688 ///
689 /// ```rust
690 /// # use polars_core::prelude::*;
691 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
692 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
693 ///
694 /// let f1: Field = Field::new("Thing".into(), DataType::String);
695 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
696 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
697 ///
698 /// assert_eq!(&**df.schema(), &sc);
699 /// # Ok::<(), PolarsError>(())
700 /// ```
701 pub fn schema(&self) -> &SchemaRef {
702 let out = self.cached_schema.get_or_init(|| {
703 Arc::new(
704 self.columns
705 .iter()
706 .map(|x| (x.name().clone(), x.dtype().clone()))
707 .collect(),
708 )
709 });
710
711 debug_assert_eq!(out.len(), self.width());
712
713 out
714 }
715
716 /// Get a reference to the [`DataFrame`] columns.
717 ///
718 /// # Example
719 ///
720 /// ```rust
721 /// # use polars_core::prelude::*;
722 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
723 /// "Symbol" => ["A", "C", "G", "T"])?;
724 /// let columns: &[Column] = df.get_columns();
725 ///
726 /// assert_eq!(columns[0].name(), "Name");
727 /// assert_eq!(columns[1].name(), "Symbol");
728 /// # Ok::<(), PolarsError>(())
729 /// ```
730 #[inline]
731 pub fn get_columns(&self) -> &[Column] {
732 &self.columns
733 }
734
735 #[inline]
736 /// Get mutable access to the underlying columns.
737 ///
738 /// # Safety
739 ///
740 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
741 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
742 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
743 /// calling [`DataFrame::clear_schema`].
744 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
745 &mut self.columns
746 }
747
748 #[inline]
749 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
750 pub fn clear_columns(&mut self) {
751 unsafe { self.get_columns_mut() }.clear();
752 self.clear_schema();
753 }
754
755 #[inline]
756 /// Extend the columns without checking for name collisions or height.
757 ///
758 /// # Safety
759 ///
760 /// The caller needs to ensure that:
761 /// - Column names are unique within the resulting [`DataFrame`].
762 /// - The length of each appended column matches the height of the [`DataFrame`]. For
763 /// `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
764 /// with [`DataFrame::set_height`].
765 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
766 unsafe { self.get_columns_mut() }.extend(iter);
767 self.clear_schema();
768 }
769
770 /// Take ownership of the underlying columns vec.
771 pub fn take_columns(self) -> Vec<Column> {
772 self.columns
773 }
774
775 /// Iterator over the columns as [`Series`].
776 ///
777 /// # Example
778 ///
779 /// ```rust
780 /// # use polars_core::prelude::*;
781 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
782 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
783 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
784 ///
785 /// let mut iterator = df.iter();
786 ///
787 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
788 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
789 /// assert_eq!(iterator.next(), None);
790 /// # Ok::<(), PolarsError>(())
791 /// ```
792 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
793 self.materialized_column_iter()
794 }
795
796 /// # Example
797 ///
798 /// ```rust
799 /// # use polars_core::prelude::*;
800 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
801 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
802 ///
803 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
804 /// # Ok::<(), PolarsError>(())
805 /// ```
806 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
807 self.columns.iter().map(|s| s.name()).collect()
808 }
809
810 /// Get the [`Vec<PlSmallStr>`] representing the column names.
811 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
812 self.columns.iter().map(|s| s.name().clone()).collect()
813 }
814
815 pub fn get_column_names_str(&self) -> Vec<&str> {
816 self.columns.iter().map(|s| s.name().as_str()).collect()
817 }
818
819 /// Set the column names.
820 /// # Example
821 ///
822 /// ```rust
823 /// # use polars_core::prelude::*;
824 /// let mut df: DataFrame = df!("Mathematical set" => ["ā", "ā¤", "š»", "ā", "ā", "ā"])?;
825 /// df.set_column_names(["Set"])?;
826 ///
827 /// assert_eq!(df.get_column_names(), &["Set"]);
828 /// # Ok::<(), PolarsError>(())
829 /// ```
830 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
831 where
832 I: IntoIterator<Item = S>,
833 S: Into<PlSmallStr>,
834 {
835 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
836 self._set_column_names_impl(names.as_slice())
837 }
838
839 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
840 polars_ensure!(
841 names.len() == self.width(),
842 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
843 names.len(), self.width()
844 );
845 ensure_names_unique(names, |s| s.as_str())?;
846
847 let columns = mem::take(&mut self.columns);
848 self.columns = columns
849 .into_iter()
850 .zip(names)
851 .map(|(s, name)| {
852 let mut s = s;
853 s.rename(name.clone());
854 s
855 })
856 .collect();
857 self.clear_schema();
858 Ok(())
859 }
860
861 /// Get the data types of the columns in the [`DataFrame`].
862 ///
863 /// # Example
864 ///
865 /// ```rust
866 /// # use polars_core::prelude::*;
867 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
868 /// "Fraction" => [0.965, 0.035])?;
869 ///
870 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
871 /// # Ok::<(), PolarsError>(())
872 /// ```
873 pub fn dtypes(&self) -> Vec<DataType> {
874 self.columns.iter().map(|s| s.dtype().clone()).collect()
875 }
876
877 pub(crate) fn first_series_column(&self) -> Option<&Series> {
878 self.columns.iter().find_map(|col| col.as_series())
879 }
880
881 /// The number of chunks for the first column.
882 pub fn first_col_n_chunks(&self) -> usize {
883 match self.first_series_column() {
884 None if self.columns.is_empty() => 0,
885 None => 1,
886 Some(s) => s.n_chunks(),
887 }
888 }
889
890 /// The highest number of chunks for any column.
891 pub fn max_n_chunks(&self) -> usize {
892 self.columns
893 .iter()
894 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
895 .max()
896 .unwrap_or(0)
897 }
898
899 /// Get a reference to the schema fields of the [`DataFrame`].
900 ///
901 /// # Example
902 ///
903 /// ```rust
904 /// # use polars_core::prelude::*;
905 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
906 /// "Fraction" => [0.708, 0.292])?;
907 ///
908 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
909 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
910 ///
911 /// assert_eq!(earth.fields(), &[f1, f2]);
912 /// # Ok::<(), PolarsError>(())
913 /// ```
914 pub fn fields(&self) -> Vec<Field> {
915 self.columns
916 .iter()
917 .map(|s| s.field().into_owned())
918 .collect()
919 }
920
921 /// Get (height, width) of the [`DataFrame`].
922 ///
923 /// # Example
924 ///
925 /// ```rust
926 /// # use polars_core::prelude::*;
927 /// let df0: DataFrame = DataFrame::default();
928 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
929 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
930 /// "2" => [1, 2, 3, 4, 5])?;
931 ///
932 /// assert_eq!(df0.shape(), (0 ,0));
933 /// assert_eq!(df1.shape(), (5, 1));
934 /// assert_eq!(df2.shape(), (5, 2));
935 /// # Ok::<(), PolarsError>(())
936 /// ```
937 pub fn shape(&self) -> (usize, usize) {
938 (self.height, self.columns.len())
939 }
940
941 /// Get the width of the [`DataFrame`] which is the number of columns.
942 ///
943 /// # Example
944 ///
945 /// ```rust
946 /// # use polars_core::prelude::*;
947 /// let df0: DataFrame = DataFrame::default();
948 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
949 /// let df2: DataFrame = df!("Series 1" => [0; 0],
950 /// "Series 2" => [0; 0])?;
951 ///
952 /// assert_eq!(df0.width(), 0);
953 /// assert_eq!(df1.width(), 1);
954 /// assert_eq!(df2.width(), 2);
955 /// # Ok::<(), PolarsError>(())
956 /// ```
957 pub fn width(&self) -> usize {
958 self.columns.len()
959 }
960
961 /// Get the height of the [`DataFrame`] which is the number of rows.
962 ///
963 /// # Example
964 ///
965 /// ```rust
966 /// # use polars_core::prelude::*;
967 /// let df0: DataFrame = DataFrame::default();
968 /// let df1: DataFrame = df!("Currency" => ["ā¬", "$"])?;
969 /// let df2: DataFrame = df!("Currency" => ["ā¬", "$", "Ā„", "Ā£", "āæ"])?;
970 ///
971 /// assert_eq!(df0.height(), 0);
972 /// assert_eq!(df1.height(), 2);
973 /// assert_eq!(df2.height(), 5);
974 /// # Ok::<(), PolarsError>(())
975 /// ```
976 pub fn height(&self) -> usize {
977 self.height
978 }
979
980 /// Returns the size as number of rows * number of columns
981 pub fn size(&self) -> usize {
982 let s = self.shape();
983 s.0 * s.1
984 }
985
986 /// Returns `true` if the [`DataFrame`] contains no rows.
987 ///
988 /// # Example
989 ///
990 /// ```rust
991 /// # use polars_core::prelude::*;
992 /// let df1: DataFrame = DataFrame::default();
993 /// assert!(df1.is_empty());
994 ///
995 /// let df2: DataFrame = df!("First name" => ["Forever"],
996 /// "Last name" => ["Alone"])?;
997 /// assert!(!df2.is_empty());
998 /// # Ok::<(), PolarsError>(())
999 /// ```
1000 pub fn is_empty(&self) -> bool {
1001 matches!(self.shape(), (0, _) | (_, 0))
1002 }
1003
1004 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1005 ///
1006 /// # Safety
1007 ///
1008 /// This needs to be equal to the length of all the columns.
1009 pub unsafe fn set_height(&mut self, height: usize) {
1010 self.height = height;
1011 }
1012
1013 /// Add multiple [`Series`] to a [`DataFrame`].
1014 /// The added `Series` are required to have the same length.
1015 ///
1016 /// # Example
1017 ///
1018 /// ```rust
1019 /// # use polars_core::prelude::*;
1020 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1021 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1022 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1023 ///
1024 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1025 /// assert_eq!(df2.shape(), (3, 3));
1026 /// println!("{}", df2);
1027 /// # Ok::<(), PolarsError>(())
1028 /// ```
1029 ///
1030 /// Output:
1031 ///
1032 /// ```text
1033 /// shape: (3, 3)
1034 /// +---------+--------+----------+
1035 /// | Element | Proton | Electron |
1036 /// | --- | --- | --- |
1037 /// | str | i32 | i32 |
1038 /// +=========+========+==========+
1039 /// | Copper | 29 | 29 |
1040 /// +---------+--------+----------+
1041 /// | Silver | 47 | 47 |
1042 /// +---------+--------+----------+
1043 /// | Gold | 79 | 79 |
1044 /// +---------+--------+----------+
1045 /// ```
1046 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1047 let mut new_cols = self.columns.clone();
1048 new_cols.extend_from_slice(columns);
1049 DataFrame::new(new_cols)
1050 }
1051
1052 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1053 ///
1054 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1055 ///
1056 /// # Example
1057 ///
1058 /// ```rust
1059 /// # use polars_core::prelude::*;
1060 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1061 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1062 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1063 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1064 ///
1065 /// let df3: DataFrame = df1.vstack(&df2)?;
1066 ///
1067 /// assert_eq!(df3.shape(), (5, 2));
1068 /// println!("{}", df3);
1069 /// # Ok::<(), PolarsError>(())
1070 /// ```
1071 ///
1072 /// Output:
1073 ///
1074 /// ```text
1075 /// shape: (5, 2)
1076 /// +-----------+-------------------+
1077 /// | Element | Melting Point (K) |
1078 /// | --- | --- |
1079 /// | str | f64 |
1080 /// +===========+===================+
1081 /// | Copper | 1357.77 |
1082 /// +-----------+-------------------+
1083 /// | Silver | 1234.93 |
1084 /// +-----------+-------------------+
1085 /// | Gold | 1337.33 |
1086 /// +-----------+-------------------+
1087 /// | Platinum | 2041.4 |
1088 /// +-----------+-------------------+
1089 /// | Palladium | 1828.05 |
1090 /// +-----------+-------------------+
1091 /// ```
1092 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1093 let mut df = self.clone();
1094 df.vstack_mut(other)?;
1095 Ok(df)
1096 }
1097
1098 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1099 ///
1100 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1101 ///
1102 /// # Example
1103 ///
1104 /// ```rust
1105 /// # use polars_core::prelude::*;
1106 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1107 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1108 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1109 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1110 ///
1111 /// df1.vstack_mut(&df2)?;
1112 ///
1113 /// assert_eq!(df1.shape(), (5, 2));
1114 /// println!("{}", df1);
1115 /// # Ok::<(), PolarsError>(())
1116 /// ```
1117 ///
1118 /// Output:
1119 ///
1120 /// ```text
1121 /// shape: (5, 2)
1122 /// +-----------+-------------------+
1123 /// | Element | Melting Point (K) |
1124 /// | --- | --- |
1125 /// | str | f64 |
1126 /// +===========+===================+
1127 /// | Copper | 1357.77 |
1128 /// +-----------+-------------------+
1129 /// | Silver | 1234.93 |
1130 /// +-----------+-------------------+
1131 /// | Gold | 1337.33 |
1132 /// +-----------+-------------------+
1133 /// | Platinum | 2041.4 |
1134 /// +-----------+-------------------+
1135 /// | Palladium | 1828.05 |
1136 /// +-----------+-------------------+
1137 /// ```
1138 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1139 if self.width() != other.width() {
1140 polars_ensure!(
1141 self.width() == 0,
1142 ShapeMismatch:
1143 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1144 self.width(), other.width(),
1145 );
1146 self.columns.clone_from(&other.columns);
1147 self.height = other.height;
1148 return Ok(self);
1149 }
1150
1151 self.columns
1152 .iter_mut()
1153 .zip(other.columns.iter())
1154 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1155 ensure_can_extend(&*left, right)?;
1156 left.append(right).map_err(|e| {
1157 e.context(format!("failed to vstack column '{}'", right.name()).into())
1158 })?;
1159 Ok(())
1160 })?;
1161 self.height += other.height;
1162 Ok(self)
1163 }
1164
1165 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1166 ///
1167 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1168 ///
1169 /// # Panics
1170 /// Panics if the schema's don't match.
1171 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1172 self.columns
1173 .iter_mut()
1174 .zip(other.columns.iter())
1175 .for_each(|(left, right)| {
1176 left.append(right)
1177 .map_err(|e| {
1178 e.context(format!("failed to vstack column '{}'", right.name()).into())
1179 })
1180 .expect("should not fail");
1181 });
1182 self.height += other.height;
1183 }
1184
1185 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1186 ///
1187 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1188 ///
1189 /// # Panics
1190 /// Panics if the schema's don't match.
1191 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1192 self.columns
1193 .iter_mut()
1194 .zip(other.columns)
1195 .for_each(|(left, right)| {
1196 left.append_owned(right).expect("should not fail");
1197 });
1198 self.height += other.height;
1199 }
1200
1201 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1202 ///
1203 /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]
1204 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1205 ///
1206 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1207 /// and thus will yield faster queries.
1208 ///
1209 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1210 /// online operations where you add `n` rows and rerun a query.
1211 ///
1212 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1213 /// when you read in multiple files and when to store them in a single `DataFrame`. In the latter case, finish the sequence
1214 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
1215 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1216 polars_ensure!(
1217 self.width() == other.width(),
1218 ShapeMismatch:
1219 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1220 self.width(), other.width(),
1221 );
1222
1223 self.columns
1224 .iter_mut()
1225 .zip(other.columns.iter())
1226 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1227 ensure_can_extend(&*left, right)?;
1228 left.extend(right).map_err(|e| {
1229 e.context(format!("failed to extend column '{}'", right.name()).into())
1230 })?;
1231 Ok(())
1232 })?;
1233 self.height += other.height;
1234 self.clear_schema();
1235 Ok(())
1236 }
1237
1238 /// Remove a column by name and return the column removed.
1239 ///
1240 /// # Example
1241 ///
1242 /// ```rust
1243 /// # use polars_core::prelude::*;
1244 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1245 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1246 ///
1247 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1248 /// assert!(s1.is_err());
1249 ///
1250 /// let s2: Column = df.drop_in_place("Animal")?;
1251 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1252 /// # Ok::<(), PolarsError>(())
1253 /// ```
1254 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1255 let idx = self.check_name_to_idx(name)?;
1256 self.clear_schema();
1257 Ok(self.columns.remove(idx))
1258 }
1259
1260 /// Return a new [`DataFrame`] where all null values are dropped.
1261 ///
1262 /// # Example
1263 ///
1264 /// ```no_run
1265 /// # use polars_core::prelude::*;
1266 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1267 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1268 /// assert_eq!(df1.shape(), (3, 2));
1269 ///
1270 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1271 /// assert_eq!(df2.shape(), (1, 2));
1272 /// println!("{}", df2);
1273 /// # Ok::<(), PolarsError>(())
1274 /// ```
1275 ///
1276 /// Output:
1277 ///
1278 /// ```text
1279 /// shape: (1, 2)
1280 /// +---------+---------------------+
1281 /// | Country | Tax revenue (% GDP) |
1282 /// | --- | --- |
1283 /// | str | f64 |
1284 /// +=========+=====================+
1285 /// | Malta | 32.7 |
1286 /// +---------+---------------------+
1287 /// ```
1288 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1289 where
1290 for<'a> &'a S: Into<PlSmallStr>,
1291 {
1292 if let Some(v) = subset {
1293 let v = self.select_columns(v)?;
1294 self._drop_nulls_impl(v.as_slice())
1295 } else {
1296 self._drop_nulls_impl(self.columns.as_slice())
1297 }
1298 }
1299
1300 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1301 // fast path for no nulls in df
1302 if subset.iter().all(|s| !s.has_nulls()) {
1303 return Ok(self.clone());
1304 }
1305
1306 let mut iter = subset.iter();
1307
1308 let mask = iter
1309 .next()
1310 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1311 let mut mask = mask.is_not_null();
1312
1313 for c in iter {
1314 mask = mask & c.is_not_null();
1315 }
1316 self.filter(&mask)
1317 }
1318
1319 /// Drop a column by name.
1320 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1321 /// the current one in place.
1322 ///
1323 /// # Example
1324 ///
1325 /// ```rust
1326 /// # use polars_core::prelude::*;
1327 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1328 /// let df2: DataFrame = df1.drop("Ray type")?;
1329 ///
1330 /// assert!(df2.is_empty());
1331 /// # Ok::<(), PolarsError>(())
1332 /// ```
1333 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1334 let idx = self.check_name_to_idx(name)?;
1335 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1336
1337 self.columns.iter().enumerate().for_each(|(i, s)| {
1338 if i != idx {
1339 new_cols.push(s.clone())
1340 }
1341 });
1342
1343 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1344 }
1345
1346 /// Drop columns that are in `names`.
1347 pub fn drop_many<I, S>(&self, names: I) -> Self
1348 where
1349 I: IntoIterator<Item = S>,
1350 S: Into<PlSmallStr>,
1351 {
1352 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1353 self.drop_many_amortized(&names)
1354 }
1355
1356 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1357 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1358 if names.is_empty() {
1359 return self.clone();
1360 }
1361 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1362 self.columns.iter().for_each(|s| {
1363 if !names.contains(s.name()) {
1364 new_cols.push(s.clone())
1365 }
1366 });
1367
1368 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1369 }
1370
1371 /// Insert a new column at a given index without checking for duplicates.
1372 /// This can leave the [`DataFrame`] at an invalid state
1373 fn insert_column_no_name_check(
1374 &mut self,
1375 index: usize,
1376 column: Column,
1377 ) -> PolarsResult<&mut Self> {
1378 polars_ensure!(
1379 self.width() == 0 || column.len() == self.height(),
1380 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1381 column.len(), self.height(),
1382 );
1383
1384 if self.width() == 0 {
1385 self.height = column.len();
1386 }
1387
1388 self.columns.insert(index, column);
1389 self.clear_schema();
1390 Ok(self)
1391 }
1392
1393 /// Insert a new column at a given index.
1394 pub fn insert_column<S: IntoColumn>(
1395 &mut self,
1396 index: usize,
1397 column: S,
1398 ) -> PolarsResult<&mut Self> {
1399 let column = column.into_column();
1400 self.check_already_present(column.name().as_str())?;
1401 self.insert_column_no_name_check(index, column)
1402 }
1403
1404 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1405 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1406 self.replace_column(idx, column)?;
1407 } else {
1408 if self.width() == 0 {
1409 self.height = column.len();
1410 }
1411
1412 self.columns.push(column);
1413 self.clear_schema();
1414 }
1415 Ok(())
1416 }
1417
1418 /// Add a new column to this [`DataFrame`] or replace an existing one.
1419 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1420 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1421 let height = df.height();
1422 if column.len() == 1 && height > 1 {
1423 column = column.new_from_index(0, height);
1424 }
1425
1426 if column.len() == height || df.get_columns().is_empty() {
1427 df.add_column_by_search(column)?;
1428 Ok(df)
1429 }
1430 // special case for literals
1431 else if height == 0 && column.len() == 1 {
1432 let s = column.clear();
1433 df.add_column_by_search(s)?;
1434 Ok(df)
1435 } else {
1436 polars_bail!(
1437 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1438 column.len(), height,
1439 );
1440 }
1441 }
1442 let column = column.into_column();
1443 inner(self, column)
1444 }
1445
1446 /// Adds a column to the [`DataFrame`] without doing any checks
1447 /// on length or duplicates.
1448 ///
1449 /// # Safety
1450 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1451 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1452 debug_assert!(self.width() == 0 || self.height() == column.len());
1453 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1454
1455 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1456 // properly for `width` == 0.
1457 if self.width() == 0 {
1458 unsafe { self.set_height(column.len()) };
1459 }
1460 unsafe { self.get_columns_mut() }.push(column);
1461 self.clear_schema();
1462
1463 self
1464 }
1465
1466 // Note: Schema can be both input or output_schema
1467 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1468 let name = c.name();
1469 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1470 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1471 // Given schema is output_schema and we can push.
1472 if idx == self.columns.len() {
1473 if self.width() == 0 {
1474 self.height = c.len();
1475 }
1476
1477 self.columns.push(c);
1478 self.clear_schema();
1479 }
1480 // Schema is incorrect fallback to search
1481 else {
1482 debug_assert!(false);
1483 self.add_column_by_search(c)?;
1484 }
1485 } else {
1486 self.replace_column(idx, c)?;
1487 }
1488 } else {
1489 if self.width() == 0 {
1490 self.height = c.len();
1491 }
1492
1493 self.columns.push(c);
1494 self.clear_schema();
1495 }
1496
1497 Ok(())
1498 }
1499
1500 // Note: Schema can be both input or output_schema
1501 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1502 for (i, s) in series.into_iter().enumerate() {
1503 // we need to branch here
1504 // because users can add multiple columns with the same name
1505 if i == 0 || schema.get(s.name().as_str()).is_some() {
1506 self.with_column_and_schema(s.into_column(), schema)?;
1507 } else {
1508 self.with_column(s.clone().into_column())?;
1509 }
1510 }
1511 Ok(())
1512 }
1513
1514 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1515 for (i, s) in columns.into_iter().enumerate() {
1516 // we need to branch here
1517 // because users can add multiple columns with the same name
1518 if i == 0 || schema.get(s.name().as_str()).is_some() {
1519 self.with_column_and_schema(s, schema)?;
1520 } else {
1521 self.with_column(s.clone())?;
1522 }
1523 }
1524
1525 Ok(())
1526 }
1527
1528 /// Add a new column to this [`DataFrame`] or replace an existing one.
1529 /// Uses an existing schema to amortize lookups.
1530 /// If the schema is incorrect, we will fallback to linear search.
1531 ///
1532 /// Note: Schema can be both input or output_schema
1533 pub fn with_column_and_schema<C: IntoColumn>(
1534 &mut self,
1535 column: C,
1536 schema: &Schema,
1537 ) -> PolarsResult<&mut Self> {
1538 let mut column = column.into_column();
1539
1540 let height = self.height();
1541 if column.len() == 1 && height > 1 {
1542 column = column.new_from_index(0, height);
1543 }
1544
1545 if column.len() == height || self.columns.is_empty() {
1546 self.add_column_by_schema(column, schema)?;
1547 Ok(self)
1548 }
1549 // special case for literals
1550 else if height == 0 && column.len() == 1 {
1551 let s = column.clear();
1552 self.add_column_by_schema(s, schema)?;
1553 Ok(self)
1554 } else {
1555 polars_bail!(
1556 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1557 column.len(), height,
1558 );
1559 }
1560 }
1561
1562 /// Get a row in the [`DataFrame`]. Beware this is slow.
1563 ///
1564 /// # Example
1565 ///
1566 /// ```
1567 /// # use polars_core::prelude::*;
1568 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1569 /// df.get(idx)
1570 /// }
1571 /// ```
1572 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue>> {
1573 match self.columns.first() {
1574 Some(s) => {
1575 if s.len() <= idx {
1576 return None;
1577 }
1578 },
1579 None => return None,
1580 }
1581 // SAFETY: we just checked bounds
1582 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1583 }
1584
1585 /// Select a [`Series`] by index.
1586 ///
1587 /// # Example
1588 ///
1589 /// ```rust
1590 /// # use polars_core::prelude::*;
1591 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1592 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1593 ///
1594 /// let s1: Option<&Column> = df.select_at_idx(0);
1595 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1596 ///
1597 /// assert_eq!(s1, Some(&s2));
1598 /// # Ok::<(), PolarsError>(())
1599 /// ```
1600 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1601 self.columns.get(idx)
1602 }
1603
1604 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1605 ///
1606 /// # Examples
1607 ///
1608 /// ```rust
1609 /// # use polars_core::prelude::*;
1610 /// let df = df! {
1611 /// "0" => [0, 0, 0],
1612 /// "1" => [1, 1, 1],
1613 /// "2" => [2, 2, 2]
1614 /// }?;
1615 ///
1616 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1617 /// assert!(df.equals(&df.select_by_range(..)?));
1618 /// # Ok::<(), PolarsError>(())
1619 /// ```
1620 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1621 where
1622 R: ops::RangeBounds<usize>,
1623 {
1624 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1625 // because it is the nightly feature. We should change here if this function were stable.
1626 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1627 where
1628 R: ops::RangeBounds<usize>,
1629 {
1630 let len = bounds.end;
1631
1632 let start: ops::Bound<&usize> = range.start_bound();
1633 let start = match start {
1634 ops::Bound::Included(&start) => start,
1635 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1636 panic!("attempted to index slice from after maximum usize");
1637 }),
1638 ops::Bound::Unbounded => 0,
1639 };
1640
1641 let end: ops::Bound<&usize> = range.end_bound();
1642 let end = match end {
1643 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1644 panic!("attempted to index slice up to maximum usize");
1645 }),
1646 ops::Bound::Excluded(&end) => end,
1647 ops::Bound::Unbounded => len,
1648 };
1649
1650 if start > end {
1651 panic!("slice index starts at {start} but ends at {end}");
1652 }
1653 if end > len {
1654 panic!("range end index {end} out of range for slice of length {len}",);
1655 }
1656
1657 ops::Range { start, end }
1658 }
1659
1660 let colnames = self.get_column_names_owned();
1661 let range = get_range(range, ..colnames.len());
1662
1663 self._select_impl(&colnames[range])
1664 }
1665
1666 /// Get column index of a [`Series`] by name.
1667 /// # Example
1668 ///
1669 /// ```rust
1670 /// # use polars_core::prelude::*;
1671 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1672 /// "Health" => [100, 200, 500],
1673 /// "Mana" => [250, 100, 0],
1674 /// "Strength" => [30, 150, 300])?;
1675 ///
1676 /// assert_eq!(df.get_column_index("Name"), Some(0));
1677 /// assert_eq!(df.get_column_index("Health"), Some(1));
1678 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1679 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1680 /// assert_eq!(df.get_column_index("Haste"), None);
1681 /// # Ok::<(), PolarsError>(())
1682 /// ```
1683 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1684 let schema = self.schema();
1685 if let Some(idx) = schema.index_of(name) {
1686 if self
1687 .get_columns()
1688 .get(idx)
1689 .is_some_and(|c| c.name() == name)
1690 {
1691 return Some(idx);
1692 }
1693 }
1694
1695 self.columns.iter().position(|s| s.name().as_str() == name)
1696 }
1697
1698 /// Get column index of a [`Series`] by name.
1699 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1700 self.get_column_index(name)
1701 .ok_or_else(|| polars_err!(col_not_found = name))
1702 }
1703
1704 /// Select a single column by name.
1705 ///
1706 /// # Example
1707 ///
1708 /// ```rust
1709 /// # use polars_core::prelude::*;
1710 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1711 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1712 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1713 ///
1714 /// assert_eq!(df.column("Password")?, &s1);
1715 /// # Ok::<(), PolarsError>(())
1716 /// ```
1717 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1718 let idx = self.try_get_column_index(name)?;
1719 Ok(self.select_at_idx(idx).unwrap())
1720 }
1721
1722 /// Selected multiple columns by name.
1723 ///
1724 /// # Example
1725 ///
1726 /// ```rust
1727 /// # use polars_core::prelude::*;
1728 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1729 /// "Max weight (kg)" => [16.0, 35.89])?;
1730 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1731 ///
1732 /// assert_eq!(&df[0], sv[0]);
1733 /// assert_eq!(&df[1], sv[1]);
1734 /// # Ok::<(), PolarsError>(())
1735 /// ```
1736 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1737 where
1738 I: IntoIterator<Item = S>,
1739 S: AsRef<str>,
1740 {
1741 names
1742 .into_iter()
1743 .map(|name| self.column(name.as_ref()))
1744 .collect()
1745 }
1746
1747 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1748 ///
1749 /// # Examples
1750 ///
1751 /// ```
1752 /// # use polars_core::prelude::*;
1753 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1754 /// df.select(["foo", "bar"])
1755 /// }
1756 /// ```
1757 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1758 where
1759 I: IntoIterator<Item = S>,
1760 S: Into<PlSmallStr>,
1761 {
1762 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1763 self._select_impl(cols.as_slice())
1764 }
1765
1766 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1767 ensure_names_unique(cols, |s| s.as_str())?;
1768 self._select_impl_unchecked(cols)
1769 }
1770
1771 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1772 let selected = self.select_columns_impl(cols)?;
1773 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1774 }
1775
1776 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1777 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1778 where
1779 I: IntoIterator<Item = S>,
1780 S: Into<PlSmallStr>,
1781 {
1782 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1783 self._select_with_schema_impl(&cols, schema, true)
1784 }
1785
1786 /// Select with a known schema without checking for duplicates in `selection`.
1787 /// The schema names must match the column names of this DataFrame.
1788 pub fn select_with_schema_unchecked<I, S>(
1789 &self,
1790 selection: I,
1791 schema: &Schema,
1792 ) -> PolarsResult<Self>
1793 where
1794 I: IntoIterator<Item = S>,
1795 S: Into<PlSmallStr>,
1796 {
1797 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1798 self._select_with_schema_impl(&cols, schema, false)
1799 }
1800
1801 /// The schema names must match the column names of this DataFrame.
1802 pub fn _select_with_schema_impl(
1803 &self,
1804 cols: &[PlSmallStr],
1805 schema: &Schema,
1806 check_duplicates: bool,
1807 ) -> PolarsResult<Self> {
1808 if check_duplicates {
1809 ensure_names_unique(cols, |s| s.as_str())?;
1810 }
1811
1812 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1813 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1814 }
1815
1816 /// A non-generic implementation to reduce compiler bloat.
1817 fn select_columns_impl_with_schema(
1818 &self,
1819 cols: &[PlSmallStr],
1820 schema: &Schema,
1821 ) -> PolarsResult<Vec<Column>> {
1822 debug_ensure_matching_schema_names(schema, self.schema())?;
1823
1824 cols.iter()
1825 .map(|name| {
1826 let index = schema.try_get_full(name.as_str())?.0;
1827 Ok(self.columns[index].clone())
1828 })
1829 .collect()
1830 }
1831
1832 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1833 where
1834 I: IntoIterator<Item = S>,
1835 S: Into<PlSmallStr>,
1836 {
1837 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1838 self.select_physical_impl(&cols)
1839 }
1840
1841 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1842 ensure_names_unique(cols, |s| s.as_str())?;
1843 let selected = self.select_columns_physical_impl(cols)?;
1844 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1845 }
1846
1847 /// Select column(s) from this [`DataFrame`] and return them as a [`Vec`].
1848 ///
1849 /// # Example
1850 ///
1851 /// ```rust
1852 /// # use polars_core::prelude::*;
1853 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1854 /// "Carbon" => [1, 2, 3],
1855 /// "Hydrogen" => [4, 6, 8])?;
1856 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1857 ///
1858 /// assert_eq!(df["Carbon"], sv[0]);
1859 /// assert_eq!(df["Hydrogen"], sv[1]);
1860 /// # Ok::<(), PolarsError>(())
1861 /// ```
1862 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1863 let cols = selection.into_vec();
1864 self.select_columns_impl(&cols)
1865 }
1866
1867 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1868 self.columns
1869 .iter()
1870 .enumerate()
1871 .map(|(i, s)| (s.name().as_str(), i))
1872 .collect()
1873 }
1874
1875 /// A non-generic implementation to reduce compiler bloat.
1876 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1877 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1878 let name_to_idx = self._names_to_idx_map();
1879 cols.iter()
1880 .map(|name| {
1881 let idx = *name_to_idx
1882 .get(name.as_str())
1883 .ok_or_else(|| polars_err!(col_not_found = name))?;
1884 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1885 })
1886 .collect::<PolarsResult<Vec<_>>>()?
1887 } else {
1888 cols.iter()
1889 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1890 .collect::<PolarsResult<Vec<_>>>()?
1891 };
1892
1893 Ok(selected)
1894 }
1895
1896 /// A non-generic implementation to reduce compiler bloat.
1897 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1898 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1899 // We hash because there are users that have millions of columns.
1900 // # https://github.com/pola-rs/polars/issues/1023
1901 let name_to_idx = self._names_to_idx_map();
1902
1903 cols.iter()
1904 .map(|name| {
1905 let idx = *name_to_idx
1906 .get(name.as_str())
1907 .ok_or_else(|| polars_err!(col_not_found = name))?;
1908 Ok(self.select_at_idx(idx).unwrap().clone())
1909 })
1910 .collect::<PolarsResult<Vec<_>>>()?
1911 } else {
1912 cols.iter()
1913 .map(|c| self.column(c.as_str()).cloned())
1914 .collect::<PolarsResult<Vec<_>>>()?
1915 };
1916
1917 Ok(selected)
1918 }
1919
1920 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
1921 // If there is a filtered column, just check how many rows are left.
1922 if let Some(fst) = filtered.first() {
1923 return fst.len();
1924 }
1925
1926 // Otherwise, count the number of values that would be kept and return that as the height.
1927 let num_trues = mask.num_trues();
1928 if mask.len() == self.height() {
1929 num_trues
1930 } else {
1931 // This is for broadcasting masks
1932 debug_assert!(num_trues == 0 || num_trues == 1);
1933 self.height() * num_trues
1934 }
1935 }
1936
1937 /// Take the [`DataFrame`] rows by a boolean mask.
1938 ///
1939 /// # Example
1940 ///
1941 /// ```
1942 /// # use polars_core::prelude::*;
1943 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1944 /// let mask = df.column("sepal_width")?.is_not_null();
1945 /// df.filter(&mask)
1946 /// }
1947 /// ```
1948 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1949 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
1950 let height = self.filter_height(&new_col, mask);
1951
1952 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1953 }
1954
1955 /// Same as `filter` but does not parallelize.
1956 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
1957 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
1958 let height = self.filter_height(&new_col, mask);
1959
1960 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
1961 }
1962
1963 /// Take [`DataFrame`] rows by index values.
1964 ///
1965 /// # Example
1966 ///
1967 /// ```
1968 /// # use polars_core::prelude::*;
1969 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1970 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
1971 /// df.take(&idx)
1972 /// }
1973 /// ```
1974 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
1975 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
1976
1977 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
1978 }
1979
1980 /// # Safety
1981 /// The indices must be in-bounds.
1982 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
1983 self.take_unchecked_impl(idx, true)
1984 }
1985
1986 /// # Safety
1987 /// The indices must be in-bounds.
1988 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
1989 let cols = if allow_threads {
1990 POOL.install(|| self._apply_columns_par(&|c| c.take_unchecked(idx)))
1991 } else {
1992 self._apply_columns(&|s| s.take_unchecked(idx))
1993 };
1994 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
1995 }
1996
1997 /// # Safety
1998 /// The indices must be in-bounds.
1999 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2000 self.take_slice_unchecked_impl(idx, true)
2001 }
2002
2003 /// # Safety
2004 /// The indices must be in-bounds.
2005 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2006 let cols = if allow_threads {
2007 POOL.install(|| self._apply_columns_par(&|s| s.take_slice_unchecked(idx)))
2008 } else {
2009 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2010 };
2011 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2012 }
2013
2014 /// Rename a column in the [`DataFrame`].
2015 ///
2016 /// # Example
2017 ///
2018 /// ```
2019 /// # use polars_core::prelude::*;
2020 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2021 /// let original_name = "foo";
2022 /// let new_name = "bar";
2023 /// df.rename(original_name, new_name.into())
2024 /// }
2025 /// ```
2026 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2027 if column == name.as_str() {
2028 return Ok(self);
2029 }
2030 polars_ensure!(
2031 !self.schema().contains(&name),
2032 Duplicate: "column rename attempted with already existing name \"{name}\""
2033 );
2034
2035 self.get_column_index(column)
2036 .and_then(|idx| self.columns.get_mut(idx))
2037 .ok_or_else(|| polars_err!(col_not_found = column))
2038 .map(|c| c.rename(name))?;
2039 Ok(self)
2040 }
2041
2042 /// Sort [`DataFrame`] in place.
2043 ///
2044 /// See [`DataFrame::sort`] for more instruction.
2045 pub fn sort_in_place(
2046 &mut self,
2047 by: impl IntoVec<PlSmallStr>,
2048 sort_options: SortMultipleOptions,
2049 ) -> PolarsResult<&mut Self> {
2050 let by_column = self.select_columns(by)?;
2051 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2052 Ok(self)
2053 }
2054
2055 #[doc(hidden)]
2056 /// This is the dispatch of `Self::sort`, and exists to reduce compile-time bloat caused by monomorphization.
2057 pub fn sort_impl(
2058 &self,
2059 by_column: Vec<Column>,
2060 mut sort_options: SortMultipleOptions,
2061 slice: Option<(i64, usize)>,
2062 ) -> PolarsResult<Self> {
2063 if by_column.is_empty() {
2064 // If no columns selected, any order (including original order) is correct.
2065 return if let Some((offset, len)) = slice {
2066 Ok(self.slice(offset, len))
2067 } else {
2068 Ok(self.clone())
2069 };
2070 }
2071
2072 // Note that the `by_column` argument can also contain evaluated expressions from
2073 // polars-lazy that may not even be present in this dataframe. Therefore, when we
2074 // try to set the first column as sorted, we ignore the error, as those expressions
2075 // are not present (they are renamed to _POLARS_SORT_COLUMN_i).
2076 let first_descending = sort_options.descending[0];
2077 let first_by_column = by_column[0].name().to_string();
2078
2079 let set_sorted = |df: &mut DataFrame| {
2080 // Mark the first sort column as sorted; if the column does not exist it
2081 // is ok, because we sorted by an expression not present in the dataframe
2082 let _ = df.apply(&first_by_column, |s| {
2083 let mut s = s.clone();
2084 if first_descending {
2085 s.set_sorted_flag(IsSorted::Descending)
2086 } else {
2087 s.set_sorted_flag(IsSorted::Ascending)
2088 }
2089 s
2090 });
2091 };
2092 if self.is_empty() {
2093 let mut out = self.clone();
2094 set_sorted(&mut out);
2095 return Ok(out);
2096 }
2097
2098 if let Some((0, k)) = slice {
2099 if k < self.len() {
2100 return self.bottom_k_impl(k, by_column, sort_options);
2101 }
2102 }
2103 // Check if the required column is already sorted; if so, we can exit early.
2104 // We can only do this when there is a single sort column; for multiple columns
2105 // it would be more complicated.
2106 #[cfg(feature = "dtype-categorical")]
2107 let is_not_categorical_enum =
2108 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2109 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2110
2111 #[cfg(not(feature = "dtype-categorical"))]
2112 #[allow(non_upper_case_globals)]
2113 const is_not_categorical_enum: bool = true;
2114
2115 if by_column.len() == 1 && is_not_categorical_enum {
2116 let required_sorting = if sort_options.descending[0] {
2117 IsSorted::Descending
2118 } else {
2119 IsSorted::Ascending
2120 };
2121 // If the null count is 0 then `nulls_last` doesn't matter.
2122 // Safe to get the value at the last position since the dataframe is not empty (handled above).
2123 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2124 && ((by_column[0].null_count() == 0)
2125 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2126 == sort_options.nulls_last[0]);
2127
2128 if no_sorting_required {
2129 return if let Some((offset, len)) = slice {
2130 Ok(self.slice(offset, len))
2131 } else {
2132 Ok(self.clone())
2133 };
2134 }
2135 }
2136
2137 #[cfg(feature = "dtype-struct")]
2138 let has_struct = by_column
2139 .iter()
2140 .any(|s| matches!(s.dtype(), DataType::Struct(_)));
2141
2142 #[cfg(not(feature = "dtype-struct"))]
2143 #[allow(non_upper_case_globals)]
2144 const has_struct: bool = false;
2145
2146 // a lot of indirection in both sorting and take
2147 let mut df = self.clone();
2148 let df = df.as_single_chunk_par();
2149 let mut take = match (by_column.len(), has_struct) {
2150 (1, false) => {
2151 let s = &by_column[0];
2152 let options = SortOptions {
2153 descending: sort_options.descending[0],
2154 nulls_last: sort_options.nulls_last[0],
2155 multithreaded: sort_options.multithreaded,
2156 maintain_order: sort_options.maintain_order,
2157 limit: sort_options.limit,
2158 };
2159 // fast path for a frame with a single series
2160 // no need to compute the sort indices and then take by these indices
2161 // simply sort and return as frame
2162 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2163 let mut out = s.sort_with(options)?;
2164 if let Some((offset, len)) = slice {
2165 out = out.slice(offset, len);
2166 }
2167 return Ok(out.into_frame());
2168 }
2169 s.arg_sort(options)
2170 },
2171 _ => {
2172 if sort_options.nulls_last.iter().all(|&x| x)
2173 || has_struct
2174 || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
2175 {
2176 argsort_multiple_row_fmt(
2177 &by_column,
2178 sort_options.descending,
2179 sort_options.nulls_last,
2180 sort_options.multithreaded,
2181 )?
2182 } else {
2183 let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
2184 first
2185 .as_materialized_series()
2186 .arg_sort_multiple(&other, &sort_options)?
2187 }
2188 },
2189 };
2190
2191 if let Some((offset, len)) = slice {
2192 take = take.slice(offset, len);
2193 }
2194
2195 // SAFETY:
2196 // the created indices are in bounds
2197 let mut df = unsafe { df.take_unchecked_impl(&take, sort_options.multithreaded) };
2198 set_sorted(&mut df);
2199 Ok(df)
2200 }
2201
2202 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2203 ///
2204 /// This dataframe does not necessarily have a specified schema and may be changed at any
2205 /// point. It is primarily used for debugging.
2206 pub fn _to_metadata(&self) -> DataFrame {
2207 let num_columns = self.columns.len();
2208
2209 let mut column_names =
2210 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2211 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2212 let mut sorted_asc_ca =
2213 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2214 let mut sorted_dsc_ca =
2215 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2216 let mut fast_explode_list_ca =
2217 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2218 let mut materialized_at_ca =
2219 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2220
2221 for col in &self.columns {
2222 let flags = col.get_flags();
2223
2224 let (repr, materialized_at) = match col {
2225 Column::Series(s) => ("series", s.materialized_at()),
2226 Column::Partitioned(_) => ("partitioned", None),
2227 Column::Scalar(_) => ("scalar", None),
2228 };
2229 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2230 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2231 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2232
2233 column_names.append_value(col.name().clone());
2234 repr_ca.append_value(repr);
2235 sorted_asc_ca.append_value(sorted_asc);
2236 sorted_dsc_ca.append_value(sorted_dsc);
2237 fast_explode_list_ca.append_value(fast_explode_list);
2238 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2239 }
2240
2241 unsafe {
2242 DataFrame::new_no_checks(
2243 self.width(),
2244 vec![
2245 column_names.finish().into_column(),
2246 repr_ca.finish().into_column(),
2247 sorted_asc_ca.finish().into_column(),
2248 sorted_dsc_ca.finish().into_column(),
2249 fast_explode_list_ca.finish().into_column(),
2250 materialized_at_ca.finish().into_column(),
2251 ],
2252 )
2253 }
2254 }
2255
2256 /// Return a sorted clone of this [`DataFrame`].
2257 ///
2258 /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2259 /// # Example
2260 ///
2261 /// Sort by a single column with default options:
2262 /// ```
2263 /// # use polars_core::prelude::*;
2264 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2265 /// df.sort(["sepal_width"], Default::default())
2266 /// }
2267 /// ```
2268 /// Sort by a single column with specific order:
2269 /// ```
2270 /// # use polars_core::prelude::*;
2271 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2272 /// df.sort(
2273 /// ["sepal_width"],
2274 /// SortMultipleOptions::new()
2275 /// .with_order_descending(descending)
2276 /// )
2277 /// }
2278 /// ```
2279 /// Sort by multiple columns, specifying the order for each column:
2280 /// ```
2281 /// # use polars_core::prelude::*;
2282 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2283 /// df.sort(
2284 /// ["sepal_width", "sepal_length"],
2285 /// SortMultipleOptions::new()
2286 /// .with_order_descending_multi([false, true])
2287 /// )
2288 /// }
2289 /// ```
2290 /// See [`SortMultipleOptions`] for more options.
2291 ///
2292 /// Also see [`DataFrame::sort_in_place`].
2293 pub fn sort(
2294 &self,
2295 by: impl IntoVec<PlSmallStr>,
2296 sort_options: SortMultipleOptions,
2297 ) -> PolarsResult<Self> {
2298 let mut df = self.clone();
2299 df.sort_in_place(by, sort_options)?;
2300 Ok(df)
2301 }
2302
2303 /// Replace a column with a [`Series`].
2304 ///
2305 /// # Example
2306 ///
2307 /// ```rust
2308 /// # use polars_core::prelude::*;
2309 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2310 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2311 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2312 ///
2313 /// assert!(df.replace("Nation", s.clone()).is_err());
2314 /// assert!(df.replace("Country", s).is_ok());
2315 /// # Ok::<(), PolarsError>(())
2316 /// ```
2317 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2318 self.apply(column, |_| new_col.into_series())
2319 }
2320
2321 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2322 /// is that the `column` argument determines the name of the column, not the name of the
2323 /// `Series` passed to this method.
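///
/// A minimal sketch (column names are illustrative):
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3])?;
/// // The resulting column is named "b"; the name of the passed `Series` is ignored.
/// df.replace_or_add("b".into(), Series::new("ignored".into(), [4, 5, 6]))?;
/// assert_eq!(df.get_column_names(), &["a", "b"]);
/// # Ok::<(), PolarsError>(())
/// ```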
2324 pub fn replace_or_add<S: IntoSeries>(
2325 &mut self,
2326 column: PlSmallStr,
2327 new_col: S,
2328 ) -> PolarsResult<&mut Self> {
2329 let mut new_col = new_col.into_series();
2330 new_col.rename(column);
2331 self.with_column(new_col)
2332 }
2333
2334 /// Replace column at index `idx` with a [`Series`].
2335 ///
2336 /// # Example
2337 ///
2338 /// ```ignore
2339 /// # use polars_core::prelude::*;
2340 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2341 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2342 /// let mut df = DataFrame::new(vec![s0, s1])?;
2343 ///
2344 /// // Add 32 to get lowercase ascii values
2345 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2346 /// # Ok::<(), PolarsError>(())
2347 /// ```
2348 pub fn replace_column<C: IntoColumn>(
2349 &mut self,
2350 index: usize,
2351 new_column: C,
2352 ) -> PolarsResult<&mut Self> {
2353 polars_ensure!(
2354 index < self.width(),
2355 ShapeMismatch:
2356 "unable to replace at index {}, the DataFrame has only {} columns",
2357 index, self.width(),
2358 );
2359 let mut new_column = new_column.into_column();
2360 polars_ensure!(
2361 new_column.len() == self.height(),
2362 ShapeMismatch:
2363 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2364 new_column.len(), self.height(),
2365 );
2366 let old_col = &mut self.columns[index];
2367 mem::swap(old_col, &mut new_column);
2368 self.clear_schema();
2369 Ok(self)
2370 }
2371
2372 /// Apply a closure to a column. This is the recommended way to do in place modification.
2373 ///
2374 /// # Example
2375 ///
2376 /// ```rust
2377 /// # use polars_core::prelude::*;
2378 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2379 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2380 /// let mut df = DataFrame::new(vec![s0, s1])?;
2381 ///
2382 /// fn str_to_len(str_val: &Column) -> Column {
2383 /// str_val.str()
2384 /// .unwrap()
2385 /// .into_iter()
2386 /// .map(|opt_name: Option<&str>| {
2387 /// opt_name.map(|name: &str| name.len() as u32)
2388 /// })
2389 /// .collect::<UInt32Chunked>()
2390 /// .into_column()
2391 /// }
2392 ///
2393 /// // Replace the names column by the length of the names.
2394 /// df.apply("names", str_to_len);
2395 /// # Ok::<(), PolarsError>(())
2396 /// ```
2397 /// Results in:
2398 ///
2399 /// ```text
2400 /// +--------+-------+
2401 /// | foo    | names |
2402 /// | ---    | ---   |
2403 /// | str | u32 |
2404 /// +========+=======+
2405 /// | "ham" | 4 |
2406 /// +--------+-------+
2407 /// | "spam" | 6 |
2408 /// +--------+-------+
2409 /// | "egg" | 3 |
2410 /// +--------+-------+
2411 /// ```
2412 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2413 where
2414 F: FnOnce(&Column) -> C,
2415 C: IntoColumn,
2416 {
2417 let idx = self.check_name_to_idx(name)?;
2418 self.apply_at_idx(idx, f)
2419 }
2420
2421 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2422 /// modification.
2423 ///
2424 /// # Example
2425 ///
2426 /// ```rust
2427 /// # use polars_core::prelude::*;
2428 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2429 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2430 /// let mut df = DataFrame::new(vec![s0, s1])?;
2431 ///
2432 /// // Add 32 to get lowercase ascii values
2433 /// df.apply_at_idx(1, |s| s + 32);
2434 /// # Ok::<(), PolarsError>(())
2435 /// ```
2436 /// Results in:
2437 ///
2438 /// ```text
2439 /// +--------+-------+
2440 /// | foo | ascii |
2441 /// | --- | --- |
2442 /// | str | i32 |
2443 /// +========+=======+
2444 /// | "ham" | 102 |
2445 /// +--------+-------+
2446 /// | "spam" | 111 |
2447 /// +--------+-------+
2448 /// | "egg" | 111 |
2449 /// +--------+-------+
2450 /// ```
2451 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2452 where
2453 F: FnOnce(&Column) -> C,
2454 C: IntoColumn,
2455 {
2456 let df_height = self.height();
2457 let width = self.width();
2458 let col = self.columns.get_mut(idx).ok_or_else(|| {
2459 polars_err!(
2460 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2461 idx, width
2462 )
2463 })?;
2464 let name = col.name().clone();
2465 let new_col = f(col).into_column();
2466 match new_col.len() {
2467 1 => {
2468 let new_col = new_col.new_from_index(0, df_height);
2469 let _ = mem::replace(col, new_col);
2470 },
2471 len if (len == df_height) => {
2472 let _ = mem::replace(col, new_col);
2473 },
2474 len => polars_bail!(
2475 ShapeMismatch:
2476 "resulting Series has length {} while the DataFrame has height {}",
2477 len, df_height
2478 ),
2479 }
2480
2481 // make sure the name remains the same after applying the closure
2482 unsafe {
2483 let col = self.columns.get_unchecked_mut(idx);
2484 col.rename(name);
2485 }
2486 Ok(self)
2487 }
2488
2489 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2490 /// modification.
2491 ///
2492 /// # Example
2493 ///
2494 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2495 ///
2496 /// ```rust
2497 /// # use polars_core::prelude::*;
2498 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2499 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2500 /// let mut df = DataFrame::new(vec![s0, s1])?;
2501 ///
2502 /// let idx = vec![0, 1, 4];
2503 ///
2504 /// df.try_apply("foo", |c| {
2505 /// c.str()?
2506 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2507 /// });
2508 /// # Ok::<(), PolarsError>(())
2509 /// ```
2510 /// Results in:
2511 ///
2512 /// ```text
2513 /// +---------------------+--------+
2514 /// | foo | values |
2515 /// | --- | --- |
2516 /// | str | i32 |
2517 /// +=====================+========+
2518 /// | "ham-is-modified" | 1 |
2519 /// +---------------------+--------+
2520 /// | "spam-is-modified" | 2 |
2521 /// +---------------------+--------+
2522 /// | "egg" | 3 |
2523 /// +---------------------+--------+
2524 /// | "bacon" | 4 |
2525 /// +---------------------+--------+
2526 /// | "quack-is-modified" | 5 |
2527 /// +---------------------+--------+
2528 /// ```
2529 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2530 where
2531 F: FnOnce(&Column) -> PolarsResult<C>,
2532 C: IntoColumn,
2533 {
2534 let width = self.width();
2535 let col = self.columns.get_mut(idx).ok_or_else(|| {
2536 polars_err!(
2537 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2538 idx, width
2539 )
2540 })?;
2541 let name = col.name().clone();
2542
2543 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2544
2545 // make sure the name remains the same after applying the closure
2546 unsafe {
2547 let col = self.columns.get_unchecked_mut(idx);
2548 col.rename(name);
2549 }
2550 Ok(self)
2551 }
2552
2553 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2554 /// modification.
2555 ///
2556 /// # Example
2557 ///
2558 /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2559 ///
2560 /// ```rust
2561 /// # use polars_core::prelude::*;
2562 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2563 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2564 /// let mut df = DataFrame::new(vec![s0, s1])?;
2565 ///
2566 /// // create a mask
2567 /// let values = df.column("values")?.as_materialized_series();
2568 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2569 ///
2570 /// df.try_apply("foo", |c| {
2571 /// c.str()?
2572 /// .set(&mask, Some("not_within_bounds"))
2573 /// });
2574 /// # Ok::<(), PolarsError>(())
2575 /// ```
2576 /// Results in:
2577 ///
2578 /// ```text
2579 /// +---------------------+--------+
2580 /// | foo | values |
2581 /// | --- | --- |
2582 /// | str | i32 |
2583 /// +=====================+========+
2584 /// | "not_within_bounds" | 1 |
2585 /// +---------------------+--------+
2586 /// | "spam" | 2 |
2587 /// +---------------------+--------+
2588 /// | "egg" | 3 |
2589 /// +---------------------+--------+
2590 /// | "bacon" | 4 |
2591 /// +---------------------+--------+
2592 /// | "not_within_bounds" | 5 |
2593 /// +---------------------+--------+
2594 /// ```
2595 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2596 where
2597 F: FnOnce(&Series) -> PolarsResult<C>,
2598 C: IntoColumn,
2599 {
2600 let idx = self.try_get_column_index(column)?;
2601 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2602 }
2603
2604 /// Slice the [`DataFrame`] along the rows.
2605 ///
2606 /// # Example
2607 ///
2608 /// ```rust
2609 /// # use polars_core::prelude::*;
2610 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2611 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2612 /// let sl: DataFrame = df.slice(2, 3);
2613 ///
2614 /// assert_eq!(sl.shape(), (3, 2));
2615 /// println!("{}", sl);
2616 /// # Ok::<(), PolarsError>(())
2617 /// ```
2618 /// Output:
2619 /// ```text
2620 /// shape: (3, 2)
2621 /// +-------+-------+
2622 /// | Fruit | Color |
2623 /// | --- | --- |
2624 /// | str | str |
2625 /// +=======+=======+
2626 /// | Grape | White |
2627 /// +-------+-------+
2628 /// | Fig | White |
2629 /// +-------+-------+
2630 /// | Fig | Red |
2631 /// +-------+-------+
2632 /// ```
2633 #[must_use]
2634 pub fn slice(&self, offset: i64, length: usize) -> Self {
2635 if offset == 0 && length == self.height() {
2636 return self.clone();
2637 }
2638 if length == 0 {
2639 return self.clear();
2640 }
2641 let col = self
2642 .columns
2643 .iter()
2644 .map(|s| s.slice(offset, length))
2645 .collect::<Vec<_>>();
2646
2647 let height = if let Some(fst) = col.first() {
2648 fst.len()
2649 } else {
2650 let (_, length) = slice_offsets(offset, length, self.height());
2651 length
2652 };
2653
2654 unsafe { DataFrame::new_no_checks(height, col) }
2655 }
2656
2657 /// Split [`DataFrame`] at the given `offset`.
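///
/// A minimal sketch: for a non-negative `offset`, the left frame holds the first `offset` rows.
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3, 4, 5])?;
/// let (left, right) = df.split_at(2);
/// assert_eq!(left.height(), 2);
/// assert_eq!(right.height(), 3);
/// # Ok::<(), PolarsError>(())
/// ```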
2658 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2659 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2660
2661 let (idx, _) = slice_offsets(offset, 0, self.height());
2662
2663 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2664 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2665 (a, b)
2666 }
2667
2668 pub fn clear(&self) -> Self {
2669 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2670 unsafe { DataFrame::new_no_checks(0, col) }
2671 }
2672
2673 #[must_use]
2674 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2675 if offset == 0 && length == self.height() {
2676 return self.clone();
2677 }
2678 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2679 unsafe { DataFrame::new_no_checks(length, columns) }
2680 }
2681
2682 #[must_use]
2683 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2684 if offset == 0 && length == self.height() {
2685 return self.clone();
2686 }
2687 // @scalar-opt
2688 let columns = self._apply_columns(&|s| {
2689 let mut out = s.slice(offset, length);
2690 out.shrink_to_fit();
2691 out
2692 });
2693 unsafe { DataFrame::new_no_checks(length, columns) }
2694 }
2695
2696 /// Get the head of the [`DataFrame`].
2697 ///
2698 /// # Example
2699 ///
2700 /// ```rust
2701 /// # use polars_core::prelude::*;
2702 /// let countries: DataFrame =
2703 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2704 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2705 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2706 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2707 /// assert_eq!(countries.shape(), (5, 4));
2708 ///
2709 /// println!("{}", countries.head(Some(3)));
2710 /// # Ok::<(), PolarsError>(())
2711 /// ```
2712 ///
2713 /// Output:
2714 ///
2715 /// ```text
2716 /// shape: (3, 4)
2717 /// +--------------------+---------------+---------------+------------+
2718 /// | Rank by GDP (2021) | Continent | Country | Capital |
2719 /// | --- | --- | --- | --- |
2720 /// | i32 | str | str | str |
2721 /// +====================+===============+===============+============+
2722 /// | 1 | North America | United States | Washington |
2723 /// +--------------------+---------------+---------------+------------+
2724 /// | 2 | Asia | China | Beijing |
2725 /// +--------------------+---------------+---------------+------------+
2726 /// | 3 | Asia | Japan | Tokyo |
2727 /// +--------------------+---------------+---------------+------------+
2728 /// ```
2729 #[must_use]
2730 pub fn head(&self, length: Option<usize>) -> Self {
2731 let col = self
2732 .columns
2733 .iter()
2734 .map(|c| c.head(length))
2735 .collect::<Vec<_>>();
2736
2737 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2738 let height = usize::min(height, self.height());
2739 unsafe { DataFrame::new_no_checks(height, col) }
2740 }
2741
2742 /// Get the tail of the [`DataFrame`].
2743 ///
2744 /// # Example
2745 ///
2746 /// ```rust
2747 /// # use polars_core::prelude::*;
2748 /// let countries: DataFrame =
2749 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2750 ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2751 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2752 /// assert_eq!(countries.shape(), (5, 3));
2753 ///
2754 /// println!("{}", countries.tail(Some(2)));
2755 /// # Ok::<(), PolarsError>(())
2756 /// ```
2757 ///
2758 /// Output:
2759 ///
2760 /// ```text
2761 /// shape: (2, 3)
2762 /// +-------------+--------------------+---------+
2763 /// | Rank (2021) | Apple Price (€/kg) | Country |
2764 /// | --- | --- | --- |
2765 /// | i32 | f64 | str |
2766 /// +=============+====================+=========+
2767 /// | 108         | 0.65               | Syria   |
2768 /// +-------------+--------------------+---------+
2769 /// | 109         | 0.52               | Turkey  |
2770 /// +-------------+--------------------+---------+
2771 /// ```
2772 #[must_use]
2773 pub fn tail(&self, length: Option<usize>) -> Self {
2774 let col = self
2775 .columns
2776 .iter()
2777 .map(|c| c.tail(length))
2778 .collect::<Vec<_>>();
2779
2780 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2781 let height = usize::min(height, self.height());
2782 unsafe { DataFrame::new_no_checks(height, col) }
2783 }
2784
2785 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2786 ///
2787 /// # Panics
2788 ///
2789 /// Panics if the [`DataFrame`] is not rechunked.
2790 ///
2791 /// This responsibility is left to the caller as we don't want to take mutable references here,
2792 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2793 /// as well.
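///
/// A minimal sketch; the freshly created frame below consists of a single chunk:
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [1, 2, 3])?;
/// let batches: Vec<_> = df.iter_chunks(CompatLevel::newest(), false).collect();
/// assert_eq!(batches.len(), 1);
/// # Ok::<(), PolarsError>(())
/// ```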
2794 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter {
2795 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2796 // If we must convert to an older `compat_level` and any of the columns is a binview
2797 // (String/Binary) column, we allow parallelism, as we must allocate new arrow strings/binaries.
2798 let must_convert = compat_level.0 == 0;
2799 let parallel = parallel
2800 && must_convert
2801 && self.columns.len() > 1
2802 && self
2803 .columns
2804 .iter()
2805 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2806
2807 RecordBatchIter {
2808 columns: &self.columns,
2809 schema: Arc::new(
2810 self.columns
2811 .iter()
2812 .map(|c| c.field().to_arrow(compat_level))
2813 .collect(),
2814 ),
2815 idx: 0,
2816 n_chunks: self.first_col_n_chunks(),
2817 compat_level,
2818 parallel,
2819 }
2820 }
2821
2822 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches of physical values.
2823 ///
2824 /// # Panics
2825 ///
2826 /// Panics if the [`DataFrame`] is not rechunked.
2827 ///
2828 /// This responsibility is left to the caller as we don't want to take mutable references here,
2829 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2830 /// as well.
2831 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
2832 PhysRecordBatchIter {
2833 schema: Arc::new(
2834 self.get_columns()
2835 .iter()
2836 .map(|c| c.field().to_arrow(CompatLevel::newest()))
2837 .collect(),
2838 ),
2839 arr_iters: self
2840 .materialized_column_iter()
2841 .map(|s| s.chunks().iter())
2842 .collect(),
2843 }
2844 }
2845
2846 /// Get a [`DataFrame`] with all the columns in reversed order.
2847 #[must_use]
2848 pub fn reverse(&self) -> Self {
2849 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
2850 unsafe { DataFrame::new_no_checks(self.height(), col) }
2851 }
2852
2853 /// Shift the values by a given period and fill the parts that will be empty due to this operation
2854 /// with `Nones`.
2855 ///
2856 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
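///
/// A minimal sketch (the column name is illustrative):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [1, 2, 3])?;
/// let shifted = df.shift(1);
/// // The first value is now null and the last original value is dropped.
/// assert!(shifted.column("x")?.get(0).unwrap().is_null());
/// # Ok::<(), PolarsError>(())
/// ```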
2857 #[must_use]
2858 pub fn shift(&self, periods: i64) -> Self {
2859 let col = self._apply_columns_par(&|s| s.shift(periods));
2860 unsafe { DataFrame::new_no_checks(self.height(), col) }
2861 }
2862
2863 /// Replace None values with one of the following strategies:
2864 /// * Forward fill (replace None with the previous value)
2865 /// * Backward fill (replace None with the next value)
2866 /// * Mean fill (replace None with the mean of the whole array)
2867 /// * Min fill (replace None with the minimum of the whole array)
2868 /// * Max fill (replace None with the maximum of the whole array)
2869 ///
2870 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
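///
/// A minimal sketch using the max-fill strategy (the column name is illustrative and
/// `FillNullStrategy::Max` is assumed to be available as shown):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("x" => [Some(1), None, Some(3)])?;
/// let filled = df.fill_null(FillNullStrategy::Max)?;
/// assert_eq!(filled.column("x")?.null_count(), 0);
/// # Ok::<(), PolarsError>(())
/// ```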
2871 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
2872 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
2873
2874 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
2875 }
2876
2877 /// Pipe different functions/closures that work on a `DataFrame` together.
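///
/// A minimal sketch with an illustrative helper function:
///
/// ```
/// # use polars_core::prelude::*;
/// fn keep_first_five(df: DataFrame) -> PolarsResult<DataFrame> {
///     Ok(df.head(Some(5)))
/// }
///
/// let df = df!("x" => [1, 2, 3])?;
/// let df = df.pipe(keep_first_five)?;
/// assert_eq!(df.height(), 3);
/// # Ok::<(), PolarsError>(())
/// ```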
2878 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
2879 where
2880 F: Fn(DataFrame) -> PolarsResult<B>,
2881 {
2882 f(self)
2883 }
2884
2885 /// Pipe different functions/closures that work on a `DataFrame` together.
2886 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
2887 where
2888 F: Fn(&mut DataFrame) -> PolarsResult<B>,
2889 {
2890 f(self)
2891 }
2892
2893 /// Pipe different functions/closures that work on a `DataFrame` together.
2894 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
2895 where
2896 F: Fn(DataFrame, Args) -> PolarsResult<B>,
2897 {
2898 f(self, args)
2899 }
2900
2901 /// Drop duplicate rows from a [`DataFrame`].
2902 /// *This fails when there is a column of type List in the DataFrame.*
2903 ///
2904 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
2905 ///
2906 /// # Example
2907 ///
2908 /// ```no_run
2909 /// # use polars_core::prelude::*;
2910 /// let df = df! {
2911 /// "flt" => [1., 1., 2., 2., 3., 3.],
2912 /// "int" => [1, 1, 2, 2, 3, 3, ],
2913 /// "str" => ["a", "a", "b", "b", "c", "c"]
2914 /// }?;
2915 ///
2916 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
2917 /// # Ok::<(), PolarsError>(())
2918 /// ```
2919 /// Returns
2920 ///
2921 /// ```text
2922 /// +-----+-----+-----+
2923 /// | flt | int | str |
2924 /// | --- | --- | --- |
2925 /// | f64 | i32 | str |
2926 /// +=====+=====+=====+
2927 /// | 1 | 1 | "a" |
2928 /// +-----+-----+-----+
2929 /// | 2 | 2 | "b" |
2930 /// +-----+-----+-----+
2931 /// | 3 | 3 | "c" |
2932 /// +-----+-----+-----+
2933 /// ```
2934 #[cfg(feature = "algorithm_group_by")]
2935 pub fn unique_stable(
2936 &self,
2937 subset: Option<&[String]>,
2938 keep: UniqueKeepStrategy,
2939 slice: Option<(i64, usize)>,
2940 ) -> PolarsResult<DataFrame> {
2941 self.unique_impl(
2942 true,
2943 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2944 keep,
2945 slice,
2946 )
2947 }
2948
2949 /// Unstable distinct. See [`DataFrame::unique_stable`].
2950 #[cfg(feature = "algorithm_group_by")]
2951 pub fn unique<I, S>(
2952 &self,
2953 subset: Option<&[String]>,
2954 keep: UniqueKeepStrategy,
2955 slice: Option<(i64, usize)>,
2956 ) -> PolarsResult<DataFrame> {
2957 self.unique_impl(
2958 false,
2959 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
2960 keep,
2961 slice,
2962 )
2963 }
2964
2965 #[cfg(feature = "algorithm_group_by")]
2966 pub fn unique_impl(
2967 &self,
2968 maintain_order: bool,
2969 subset: Option<Vec<PlSmallStr>>,
2970 keep: UniqueKeepStrategy,
2971 slice: Option<(i64, usize)>,
2972 ) -> PolarsResult<Self> {
2973 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
2974 let mut df = self.clone();
2975 // take on multiple chunks is terrible
2976 df.as_single_chunk_par();
2977
2978 let columns = match (keep, maintain_order) {
2979 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
2980 let gb = df.group_by_stable(names)?;
2981 let groups = gb.get_groups();
2982 let (offset, len) = slice.unwrap_or((0, groups.len()));
2983 let groups = groups.slice(offset, len);
2984 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
2985 },
2986 (UniqueKeepStrategy::Last, true) => {
2987 // maintain order by last values, so the sorted groups are not correct as they
2988 // are sorted by the first value
2989 let gb = df.group_by(names)?;
2990 let groups = gb.get_groups();
2991
2992 let func = |g: GroupsIndicator| match g {
2993 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
2994 GroupsIndicator::Slice([first, len]) => first + len - 1,
2995 };
2996
2997 let last_idx: NoNull<IdxCa> = match slice {
2998 None => groups.iter().map(func).collect(),
2999 Some((offset, len)) => {
3000 let (offset, len) = slice_offsets(offset, len, groups.len());
3001 groups.iter().skip(offset).take(len).map(func).collect()
3002 },
3003 };
3004
3005 let last_idx = last_idx.sort(false);
3006 return Ok(unsafe { df.take_unchecked(&last_idx) });
3007 },
3008 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3009 let gb = df.group_by(names)?;
3010 let groups = gb.get_groups();
3011 let (offset, len) = slice.unwrap_or((0, groups.len()));
3012 let groups = groups.slice(offset, len);
3013 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3014 },
3015 (UniqueKeepStrategy::Last, false) => {
3016 let gb = df.group_by(names)?;
3017 let groups = gb.get_groups();
3018 let (offset, len) = slice.unwrap_or((0, groups.len()));
3019 let groups = groups.slice(offset, len);
3020 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3021 },
3022 (UniqueKeepStrategy::None, _) => {
3023 let df_part = df.select(names)?;
3024 let mask = df_part.is_unique()?;
3025 let mask = match slice {
3026 None => mask,
3027 Some((offset, len)) => mask.slice(offset, len),
3028 };
3029 return df.filter(&mask);
3030 },
3031 };
3032
3033 let height = Self::infer_height(&columns);
3034 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3035 }
3036
3037 /// Get a mask of all the unique rows in the [`DataFrame`].
3038 ///
3039 /// # Example
3040 ///
3041 /// ```no_run
3042 /// # use polars_core::prelude::*;
3043 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3044 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3045 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3046 ///
3047 /// assert!(ca.all());
3048 /// # Ok::<(), PolarsError>(())
3049 /// ```
3050 #[cfg(feature = "algorithm_group_by")]
3051 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3052 let gb = self.group_by(self.get_column_names_owned())?;
3053 let groups = gb.get_groups();
3054 Ok(is_unique_helper(
3055 groups,
3056 self.height() as IdxSize,
3057 true,
3058 false,
3059 ))
3060 }
3061
3062 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3063 ///
3064 /// # Example
3065 ///
3066 /// ```no_run
3067 /// # use polars_core::prelude::*;
3068 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3069 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3070 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3071 ///
3072 /// assert!(!ca.all());
3073 /// # Ok::<(), PolarsError>(())
3074 /// ```
3075 #[cfg(feature = "algorithm_group_by")]
3076 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3077 let gb = self.group_by(self.get_column_names_owned())?;
3078 let groups = gb.get_groups();
3079 Ok(is_unique_helper(
3080 groups,
3081 self.height() as IdxSize,
3082 false,
3083 true,
3084 ))
3085 }
3086
3087 /// Create a new [`DataFrame`] that shows the null counts per column.
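///
/// A minimal sketch (column names are illustrative):
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("a" => [Some(1), None, Some(3)],
///                         "b" => [Some("x"), Some("y"), None])?;
/// let counts = df.null_count();
/// assert_eq!(counts.shape(), (1, 2));
/// # Ok::<(), PolarsError>(())
/// ```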
3088 #[must_use]
3089 pub fn null_count(&self) -> Self {
3090 let cols = self
3091 .columns
3092 .iter()
3093 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3094 .collect();
3095 unsafe { Self::new_no_checks(1, cols) }
3096 }
3097
3098 /// Hash and combine the row values
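///
/// A minimal sketch (requires the `row_hash` feature; passing `None` lets the
/// implementation pick a default hasher):
///
/// ```
/// # use polars_core::prelude::*;
/// let mut df = df!("a" => [1, 2, 3], "b" => ["x", "y", "z"])?;
/// let hashes = df.hash_rows(None)?;
/// assert_eq!(hashes.len(), df.height());
/// # Ok::<(), PolarsError>(())
/// ```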
3099 #[cfg(feature = "row_hash")]
3100 pub fn hash_rows(
3101 &mut self,
3102 hasher_builder: Option<PlRandomState>,
3103 ) -> PolarsResult<UInt64Chunked> {
3104 let dfs = split_df(self, POOL.current_num_threads(), false);
3105 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3106
3107 let mut iter = cas.into_iter();
3108 let mut acc_ca = iter.next().unwrap();
3109 for ca in iter {
3110 acc_ca.append(&ca)?;
3111 }
3112 Ok(acc_ca.rechunk())
3113 }
3114
3115 /// Get the supertype of the columns in this DataFrame
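///
/// A minimal sketch: the supertype of an integer column and a float column is the float type.
///
/// ```
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("ints" => [1, 2], "floats" => [1.0, 2.0])?;
/// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
/// # Ok::<(), PolarsError>(())
/// ```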
3116 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3117 self.columns
3118 .iter()
3119 .map(|s| Ok(s.dtype().clone()))
3120 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3121 }
3122
3123 /// Take by index values given by the slice `idx`.
3124 /// # Warning
3125 /// Be careful with allowing threads when calling this in a large hot loop:
3126 /// every thread split may be on the rayon stack and lead to a stack overflow (SO).
3127 #[doc(hidden)]
3128 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3129 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3130 }
3131
3132 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3133 /// if the index values in `idx` are sorted. This will maintain sorted flags.
3134 ///
3135 /// # Warning
3136 /// Be careful with allowing threads when calling this in a large hot loop:
3137 /// every thread split may be on the rayon stack and lead to a stack overflow (SO).
3138 #[doc(hidden)]
3139 pub unsafe fn _take_unchecked_slice_sorted(
3140 &self,
3141 idx: &[IdxSize],
3142 allow_threads: bool,
3143 sorted: IsSorted,
3144 ) -> Self {
3145 #[cfg(debug_assertions)]
3146 {
3147 if idx.len() > 2 {
3148 match sorted {
3149 IsSorted::Ascending => {
3150 assert!(idx[0] <= idx[idx.len() - 1]);
3151 },
3152 IsSorted::Descending => {
3153 assert!(idx[0] >= idx[idx.len() - 1]);
3154 },
3155 _ => {},
3156 }
3157 }
3158 }
3159 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3160 ca.set_sorted_flag(sorted);
3161 self.take_unchecked_impl(&ca, allow_threads)
3162 }
3163
3164 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3165 #[doc(hidden)]
3166 pub fn _partition_by_impl(
3167 &self,
3168 cols: &[PlSmallStr],
3169 stable: bool,
3170 include_key: bool,
3171 ) -> PolarsResult<Vec<DataFrame>> {
3172 let groups = if stable {
3173 self.group_by_stable(cols.iter().cloned())?.take_groups()
3174 } else {
3175 self.group_by(cols.iter().cloned())?.take_groups()
3176 };
3177
3178 // drop key columns prior to calculation if requested
3179 let df = if include_key {
3180 self.clone()
3181 } else {
3182 self.drop_many(cols.iter().cloned())
3183 };
3184
3185 // Don't parallelize this:
3186 // there is a lot of parallelization in `take` and this may easily cause a stack overflow (SO).
3187 POOL.install(|| {
3188 match groups.as_ref() {
3189 GroupsType::Idx(idx) => {
3190 // Rechunk as the gather may rechunk for every group #17562.
3191 let mut df = df.clone();
3192 df.as_single_chunk_par();
3193 Ok(idx
3194 .into_par_iter()
3195 .map(|(_, group)| {
3196 // groups are in bounds
3197 unsafe {
3198 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3199 }
3200 })
3201 .collect())
3202 },
3203 GroupsType::Slice { groups, .. } => Ok(groups
3204 .into_par_iter()
3205 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3206 .collect()),
3207 }
3208 })
3209 }
3210
3211 /// Split into multiple DataFrames partitioned by groups
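///
/// A minimal sketch (requires the `partition_by` feature; group keys and values are illustrative):
///
/// ```no_run
/// # use polars_core::prelude::*;
/// let df: DataFrame = df!("group" => ["a", "a", "b"],
///                         "value" => [1, 2, 3])?;
/// let parts: Vec<DataFrame> = df.partition_by(["group"], true)?;
/// assert_eq!(parts.len(), 2);
/// # Ok::<(), PolarsError>(())
/// ```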
3212 #[cfg(feature = "partition_by")]
3213 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3214 where
3215 I: IntoIterator<Item = S>,
3216 S: Into<PlSmallStr>,
3217 {
3218 let cols = cols
3219 .into_iter()
3220 .map(Into::into)
3221 .collect::<Vec<PlSmallStr>>();
3222 self._partition_by_impl(cols.as_slice(), false, include_key)
3223 }
3224
3225 /// Split into multiple DataFrames, partitioned by groups.
3226 /// The order of the groups is maintained.
3227 #[cfg(feature = "partition_by")]
3228 pub fn partition_by_stable<I, S>(
3229 &self,
3230 cols: I,
3231 include_key: bool,
3232 ) -> PolarsResult<Vec<DataFrame>>
3233 where
3234 I: IntoIterator<Item = S>,
3235 S: Into<PlSmallStr>,
3236 {
3237 let cols = cols
3238 .into_iter()
3239 .map(Into::into)
3240 .collect::<Vec<PlSmallStr>>();
3241 self._partition_by_impl(cols.as_slice(), true, include_key)
3242 }
3243
3244 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3245 /// inserted as columns.
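///
/// A minimal, compile-only sketch, where `"coords"` is assumed to be a `Struct` column
/// (requires the `dtype-struct` feature):
///
/// ```
/// # use polars_core::prelude::*;
/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
///     // The fields of the "coords" struct column become top-level columns.
///     df.unnest(["coords"])
/// }
/// ```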
3246 #[cfg(feature = "dtype-struct")]
3247 pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {
3248 let cols = cols.into_vec();
3249 self.unnest_impl(cols.into_iter().collect())
3250 }
3251
3252 #[cfg(feature = "dtype-struct")]
3253 fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {
3254 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3255 let mut count = 0;
3256 for s in &self.columns {
3257 if cols.contains(s.name()) {
3258 let ca = s.struct_()?.clone();
3259 new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));
3260 count += 1;
3261 } else {
3262 new_cols.push(s.clone())
3263 }
3264 }
3265 if count != cols.len() {
3266 // one or more columns not found
3267 // the code below will return an error with the missing name
3268 let schema = self.schema();
3269 for col in cols {
3270 let _ = schema
3271 .get(col.as_str())
3272 .ok_or_else(|| polars_err!(col_not_found = col))?;
3273 }
3274 }
3275 DataFrame::new(new_cols)
3276 }
3277
3278 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3279 cols.first().map_or(0, Column::len)
3280 }
3281
3282 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3283 polars_ensure!(
3284 rb.arrays().len() == self.width(),
3285 InvalidOperation: "attempt to extend dataframe of width {} with record batch of width {}",
3286 self.width(),
3287 rb.arrays().len(),
3288 );
3289
3290 if rb.height() == 0 {
3291 return Ok(());
3292 }
3293
3294 // SAFETY:
3295 // - we don't adjust the names of the columns
3296 // - each column gets appended the same number of rows, which is an invariant of
3297 // record_batch.
3298 let columns = unsafe { self.get_columns_mut() };
3299 for (col, arr) in columns.iter_mut().zip(rb.into_arrays()) {
3300 let arr_series = Series::from_arrow_chunks(PlSmallStr::EMPTY, vec![arr])?.into_column();
3301 col.append(&arr_series)?;
3302 }
3303
3304 Ok(())
3305 }
3306}
3307
3308pub struct RecordBatchIter<'a> {
3309 columns: &'a Vec<Column>,
3310 schema: ArrowSchemaRef,
3311 idx: usize,
3312 n_chunks: usize,
3313 compat_level: CompatLevel,
3314 parallel: bool,
3315}
3316
3317impl Iterator for RecordBatchIter<'_> {
3318 type Item = RecordBatch;
3319
3320 fn next(&mut self) -> Option<Self::Item> {
3321 if self.idx >= self.n_chunks {
3322 return None;
3323 }
3324
3325 // Create a batch of the columns with the same chunk no.
3326 let batch_cols: Vec<ArrayRef> = if self.parallel {
3327 let iter = self
3328 .columns
3329 .par_iter()
3330 .map(Column::as_materialized_series)
3331 .map(|s| s.to_arrow(self.idx, self.compat_level));
3332 POOL.install(|| iter.collect())
3333 } else {
3334 self.columns
3335 .iter()
3336 .map(Column::as_materialized_series)
3337 .map(|s| s.to_arrow(self.idx, self.compat_level))
3338 .collect()
3339 };
3340 self.idx += 1;
3341
3342 let length = batch_cols.first().map_or(0, |arr| arr.len());
3343 Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
3344 }
3345
3346 fn size_hint(&self) -> (usize, Option<usize>) {
3347 let n = self.n_chunks - self.idx;
3348 (n, Some(n))
3349 }
3350}
3351
3352pub struct PhysRecordBatchIter<'a> {
3353 schema: ArrowSchemaRef,
3354 arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
3355}
3356
3357impl Iterator for PhysRecordBatchIter<'_> {
3358 type Item = RecordBatch;
3359
3360 fn next(&mut self) -> Option<Self::Item> {
3361 let arrs = self
3362 .arr_iters
3363 .iter_mut()
3364 .map(|phys_iter| phys_iter.next().cloned())
3365 .collect::<Option<Vec<_>>>()?;
3366
3367 let length = arrs.first().map_or(0, |arr| arr.len());
3368 Some(RecordBatch::new(length, self.schema.clone(), arrs))
3369 }
3370
3371 fn size_hint(&self) -> (usize, Option<usize>) {
3372 if let Some(iter) = self.arr_iters.first() {
3373 iter.size_hint()
3374 } else {
3375 (0, None)
3376 }
3377 }
3378}
3379
3380impl Default for DataFrame {
3381 fn default() -> Self {
3382 DataFrame::empty()
3383 }
3384}
3385
3386impl From<DataFrame> for Vec<Column> {
3387 fn from(df: DataFrame) -> Self {
3388 df.columns
3389 }
3390}
3391
3392// utility to test if we can vstack/extend the columns
3393fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
3394 polars_ensure!(
3395 left.name() == right.name(),
3396 ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
3397 left.name(), right.name(),
3398 );
3399 Ok(())
3400}
3401
3402#[cfg(test)]
3403mod test {
3404 use super::*;
3405
3406 fn create_frame() -> DataFrame {
3407 let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
3408 let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
3409 DataFrame::new(vec![s0, s1]).unwrap()
3410 }
3411
3412 #[test]
3413 #[cfg_attr(miri, ignore)]
3414 fn test_recordbatch_iterator() {
3415 let df = df!(
3416 "foo" => [1, 2, 3, 4, 5]
3417 )
3418 .unwrap();
3419 let mut iter = df.iter_chunks(CompatLevel::newest(), false);
3420 assert_eq!(5, iter.next().unwrap().len());
3421 assert!(iter.next().is_none());
3422 }
3423
3424 #[test]
3425 #[cfg_attr(miri, ignore)]
3426 fn test_select() {
3427 let df = create_frame();
3428 assert_eq!(
3429 df.column("days")
3430 .unwrap()
3431 .as_series()
3432 .unwrap()
3433 .equal(1)
3434 .unwrap()
3435 .sum(),
3436 Some(1)
3437 );
3438 }
3439
3440 #[test]
3441 #[cfg_attr(miri, ignore)]
3442 fn test_filter_broadcast_on_string_col() {
3443 let col_name = "some_col";
3444 let v = vec!["test".to_string()];
3445 let s0 = Column::new(PlSmallStr::from_str(col_name), v);
3446 let mut df = DataFrame::new(vec![s0]).unwrap();
3447
3448 df = df
3449 .filter(
3450 &df.column(col_name)
3451 .unwrap()
3452 .as_materialized_series()
3453 .equal("")
3454 .unwrap(),
3455 )
3456 .unwrap();
3457 assert_eq!(
3458 df.column(col_name)
3459 .unwrap()
3460 .as_materialized_series()
3461 .n_chunks(),
3462 1
3463 );
3464 }
3465
3466 #[test]
3467 #[cfg_attr(miri, ignore)]
3468 fn test_filter_broadcast_on_list_col() {
3469 let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
3470 let ll: ListChunked = [&s1].iter().copied().collect();
3471
3472 let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
3473 let new = ll.filter(&mask).unwrap();
3474
3475 assert_eq!(new.chunks.len(), 1);
3476 assert_eq!(new.len(), 0);
3477 }
3478
3479 #[test]
3480 fn slice() {
3481 let df = create_frame();
3482 let sliced_df = df.slice(0, 2);
3483 assert_eq!(sliced_df.shape(), (2, 2));
3484 }
3485
3486 #[test]
3487 fn rechunk_false() {
3488 let df = create_frame();
3489 assert!(!df.should_rechunk())
3490 }
3491
3492 #[test]
3493 fn rechunk_true() -> PolarsResult<()> {
3494 let mut base = df!(
3495 "a" => [1, 2, 3],
3496 "b" => [1, 2, 3]
3497 )?;
3498
3499 // Create a series with multiple chunks
3500 let mut s = Series::new("foo".into(), 0..2);
3501 let s2 = Series::new("bar".into(), 0..1);
3502 s.append(&s2)?;
3503
3504 // Append series to frame
3505 let out = base.with_column(s)?;
3506
3507 // Now we should rechunk
3508 assert!(out.should_rechunk());
3509 Ok(())
3510 }
3511
3512 #[test]
3513 fn test_duplicate_column() {
3514 let mut df = df! {
3515 "foo" => [1, 2, 3]
3516 }
3517 .unwrap();
3518 // check if column is replaced
3519 assert!(df
3520 .with_column(Series::new("foo".into(), &[1, 2, 3]))
3521 .is_ok());
3522 assert!(df
3523 .with_column(Series::new("bar".into(), &[1, 2, 3]))
3524 .is_ok());
3525 assert!(df.column("bar").is_ok())
3526 }
3527
3528 #[test]
3529 #[cfg_attr(miri, ignore)]
3530 fn distinct() {
3531 let df = df! {
3532 "flt" => [1., 1., 2., 2., 3., 3.],
3533 "int" => [1, 1, 2, 2, 3, 3, ],
3534 "str" => ["a", "a", "b", "b", "c", "c"]
3535 }
3536 .unwrap();
3537 let df = df
3538 .unique_stable(None, UniqueKeepStrategy::First, None)
3539 .unwrap()
3540 .sort(["flt"], SortMultipleOptions::default())
3541 .unwrap();
3542 let valid = df! {
3543 "flt" => [1., 2., 3.],
3544 "int" => [1, 2, 3],
3545 "str" => ["a", "b", "c"]
3546 }
3547 .unwrap();
3548 assert!(df.equals(&valid));
3549 }
3550
3551 #[test]
3552 fn test_vstack() {
3553 // check that it does not accidentally rechunk
3554 let mut df = df! {
3555 "flt" => [1., 1., 2., 2., 3., 3.],
3556 "int" => [1, 1, 2, 2, 3, 3, ],
3557 "str" => ["a", "a", "b", "b", "c", "c"]
3558 }
3559 .unwrap();
3560
3561 df.vstack_mut(&df.slice(0, 3)).unwrap();
3562 assert_eq!(df.first_col_n_chunks(), 2)
3563 }
3564
3565 #[test]
3566 fn test_vstack_on_empty_dataframe() {
3567 let mut df = DataFrame::empty();
3568
3569 let df_data = df! {
3570 "flt" => [1., 1., 2., 2., 3., 3.],
3571 "int" => [1, 1, 2, 2, 3, 3, ],
3572 "str" => ["a", "a", "b", "b", "c", "c"]
3573 }
3574 .unwrap();
3575
3576 df.vstack_mut(&df_data).unwrap();
3577 assert_eq!(df.height, 6)
3578 }
3579
3580 #[test]
3581 fn test_replace_or_add() -> PolarsResult<()> {
3582 let mut df = df!(
3583 "a" => [1, 2, 3],
3584 "b" => [1, 2, 3]
3585 )?;
3586
3587 // check that the new column is "c" and not "bar".
3588 df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;
3589
3590 assert_eq!(df.get_column_names(), &["a", "b", "c"]);
3591 Ok(())
3592 }
3593}