// polars_arrow/datatypes/mod.rs

//! Contains all metadata, such as [`PhysicalType`], [`ArrowDataType`], [`Field`] and [`ArrowSchema`].
2
3mod field;
4mod physical_type;
5pub mod reshape;
6mod schema;
7
8use std::collections::BTreeMap;
9use std::sync::Arc;
10
11pub use field::{Field, DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES};
12pub use physical_type::*;
13use polars_utils::pl_str::PlSmallStr;
14pub use schema::{ArrowSchema, ArrowSchemaRef};
15#[cfg(feature = "serde")]
16use serde::{Deserialize, Serialize};
17
18/// typedef for [BTreeMap<PlSmallStr, PlSmallStr>] denoting [`Field`]'s and [`ArrowSchema`]'s metadata.
19pub type Metadata = BTreeMap<PlSmallStr, PlSmallStr>;
20/// typedef for [Option<(PlSmallStr, Option<PlSmallStr>)>] descr
21pub(crate) type Extension = Option<(PlSmallStr, Option<PlSmallStr>)>;
22
23/// The set of supported logical types in this crate.
24///
25/// Each variant uniquely identifies a logical type, which define specific semantics to the data
26/// (e.g. how it should be represented).
27/// Each variant has a corresponding [`PhysicalType`], obtained via [`ArrowDataType::to_physical_type`],
28/// which declares the in-memory representation of data.
29/// The [`ArrowDataType::Extension`] is special in that it augments a [`ArrowDataType`] with metadata to support custom types.
30/// Use `to_logical_type` to desugar such type and return its corresponding logical type.
31#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
32#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
33pub enum ArrowDataType {
34    /// Null type
35    #[default]
36    Null,
37    /// `true` and `false`.
38    Boolean,
39    /// An [`i8`]
40    Int8,
41    /// An [`i16`]
42    Int16,
43    /// An [`i32`]
44    Int32,
45    /// An [`i64`]
46    Int64,
47    /// An [`i128`]
48    Int128,
49    /// An [`u8`]
50    UInt8,
51    /// An [`u16`]
52    UInt16,
53    /// An [`u32`]
54    UInt32,
55    /// An [`u64`]
56    UInt64,
57    /// An 16-bit float
58    Float16,
59    /// A [`f32`]
60    Float32,
61    /// A [`f64`]
62    Float64,
63    /// A [`i64`] representing a timestamp measured in [`TimeUnit`] with an optional timezone.
64    ///
65    /// Time is measured as a Unix epoch, counting the seconds from
66    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
67    /// as a 64-bit signed integer.
68    ///
69    /// The time zone is a string indicating the name of a time zone, one of:
70    ///
71    /// * As used in the Olson time zone database (the "tz database" or
72    ///   "tzdata"), such as "America/New_York"
73    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
74    ///
75    /// When the timezone is not specified, the timestamp is considered to have no timezone
76    /// and is represented _as is_
77    Timestamp(TimeUnit, Option<PlSmallStr>),
78    /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01)
79    /// in days.
80    Date32,
81    /// An [`i64`] representing the elapsed time since UNIX epoch (1970-01-01)
82    /// in milliseconds. Values are evenly divisible by 86400000.
83    Date64,
84    /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
85    /// Only [`TimeUnit::Second`] and [`TimeUnit::Millisecond`] are supported on this variant.
86    Time32(TimeUnit),
87    /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
88    /// Only [`TimeUnit::Microsecond`] and [`TimeUnit::Nanosecond`] are supported on this variant.
89    Time64(TimeUnit),
90    /// Measure of elapsed time. This elapsed time is a physical duration (i.e. 1s as defined in S.I.)
91    Duration(TimeUnit),
92    /// A "calendar" interval modeling elapsed time that takes into account calendar shifts.
93    /// For example an interval of 1 day may represent more than 24 hours.
94    Interval(IntervalUnit),
95    /// Opaque binary data of variable length whose offsets are represented as [`i32`].
96    Binary,
97    /// Opaque binary data of fixed size.
98    /// Enum parameter specifies the number of bytes per value.
99    FixedSizeBinary(usize),
100    /// Opaque binary data of variable length whose offsets are represented as [`i64`].
101    LargeBinary,
102    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i32`].
103    Utf8,
104    /// A variable-length UTF-8 encoded string whose offsets are represented as [`i64`].
105    LargeUtf8,
106    /// A list of some logical data type whose offsets are represented as [`i32`].
107    List(Box<Field>),
108    /// A list of some logical data type with a fixed number of elements.
109    FixedSizeList(Box<Field>, usize),
110    /// A list of some logical data type whose offsets are represented as [`i64`].
111    LargeList(Box<Field>),
112    /// A nested [`ArrowDataType`] with a given number of [`Field`]s.
113    Struct(Vec<Field>),
114    /// A nested type that is represented as
115    ///
116    /// List<entries: Struct<key: K, value: V>>
117    ///
118    /// In this layout, the keys and values are each respectively contiguous. We do
119    /// not constrain the key and value types, so the application is responsible
120    /// for ensuring that the keys are hashable and unique. Whether the keys are sorted
121    /// may be set in the metadata for this field.
122    ///
123    /// In a field with Map type, the field has a child Struct field, which then
124    /// has two children: key type and the second the value type. The names of the
125    /// child fields may be respectively "entries", "key", and "value", but this is
126    /// not enforced.
127    ///
128    /// Map
129    /// ```text
130    ///   - child[0] entries: Struct
131    ///     - child[0] key: K
132    ///     - child[1] value: V
133    /// ```
134    /// Neither the "entries" field nor the "key" field may be nullable.
135    ///
136    /// The metadata is structured so that Arrow systems without special handling
137    /// for Map can make Map an alias for List. The "layout" attribute for the Map
138    /// field must have the same contents as a List.
139    /// - Field
140    /// - ordered
141    Map(Box<Field>, bool),
142    /// A dictionary encoded array (`key_type`, `value_type`), where
143    /// each array element is an index of `key_type` into an
144    /// associated dictionary of `value_type`.
145    ///
146    /// Dictionary arrays are used to store columns of `value_type`
147    /// that contain many repeated values using less memory, but with
148    /// a higher CPU overhead for some operations.
149    ///
150    /// This type mostly used to represent low cardinality string
151    /// arrays or a limited set of primitive types as integers.
152    ///
153    /// The `bool` value indicates the `Dictionary` is sorted if set to `true`.
154    Dictionary(IntegerType, Box<ArrowDataType>, bool),
155    /// Decimal value with precision and scale
156    /// precision is the number of digits in the number and
157    /// scale is the number of decimal places.
158    /// The number 999.99 has a precision of 5 and scale of 2.
159    Decimal(usize, usize),
160    /// Decimal backed by 256 bits
161    Decimal256(usize, usize),
162    /// Extension type.
163    Extension(Box<ExtensionType>),
164    /// A binary type that inlines small values
165    /// and can intern bytes.
166    BinaryView,
167    /// A string type that inlines small values
168    /// and can intern strings.
169    Utf8View,
170    /// A type unknown to Arrow.
171    Unknown,
172    /// A nested datatype that can represent slots of differing types.
173    /// Third argument represents mode
174    #[cfg_attr(feature = "serde", serde(skip))]
175    Union(Box<UnionType>),
176}
177
178#[derive(Debug, Clone, PartialEq, Eq, Hash)]
179#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
180pub struct ExtensionType {
181    pub name: PlSmallStr,
182    pub inner: ArrowDataType,
183    pub metadata: Option<PlSmallStr>,
184}
185
186#[derive(Debug, Clone, PartialEq, Eq, Hash)]
187pub struct UnionType {
188    pub fields: Vec<Field>,
189    pub ids: Option<Vec<i32>>,
190    pub mode: UnionMode,
191}
192
/// Mode of [`ArrowDataType::Union`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum UnionMode {
    /// Dense union
    Dense,
    /// Sparse union
    Sparse,
}

impl UnionMode {
    /// Constructs a [`UnionMode::Sparse`] if `is_sparse` is `true`,
    /// and a [`UnionMode::Dense`] otherwise.
    pub fn sparse(is_sparse: bool) -> Self {
        if is_sparse {
            Self::Sparse
        } else {
            Self::Dense
        }
    }

    /// Returns `true` if the mode is [`UnionMode::Sparse`].
    pub fn is_sparse(&self) -> bool {
        matches!(self, Self::Sparse)
    }

    /// Returns `true` if the mode is [`UnionMode::Dense`].
    pub fn is_dense(&self) -> bool {
        matches!(self, Self::Dense)
    }
}
224
/// The time units defined in Arrow.
///
/// Used as the resolution of the temporal [`ArrowDataType`] variants
/// (`Timestamp`, `Time32`, `Time64` and `Duration`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TimeUnit {
    /// Time in seconds.
    Second,
    /// Time in milliseconds.
    Millisecond,
    /// Time in microseconds.
    Microsecond,
    /// Time in nanoseconds.
    Nanosecond,
}
238
/// Interval units defined in Arrow, used by [`ArrowDataType::Interval`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum IntervalUnit {
    /// The number of elapsed whole months.
    YearMonth,
    /// The number of elapsed days and milliseconds,
    /// stored as 2 contiguous `i32`.
    DayTime,
    /// The number of elapsed months (i32), days (i32) and nanoseconds (i64).
    MonthDayNano,
}
251
252impl ArrowDataType {
253    /// the [`PhysicalType`] of this [`ArrowDataType`].
254    pub fn to_physical_type(&self) -> PhysicalType {
255        use ArrowDataType::*;
256        match self {
257            Null => PhysicalType::Null,
258            Boolean => PhysicalType::Boolean,
259            Int8 => PhysicalType::Primitive(PrimitiveType::Int8),
260            Int16 => PhysicalType::Primitive(PrimitiveType::Int16),
261            Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => {
262                PhysicalType::Primitive(PrimitiveType::Int32)
263            },
264            Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => {
265                PhysicalType::Primitive(PrimitiveType::Int64)
266            },
267            Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128),
268            Decimal256(_, _) => PhysicalType::Primitive(PrimitiveType::Int256),
269            UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8),
270            UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16),
271            UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32),
272            UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64),
273            Float16 => PhysicalType::Primitive(PrimitiveType::Float16),
274            Float32 => PhysicalType::Primitive(PrimitiveType::Float32),
275            Float64 => PhysicalType::Primitive(PrimitiveType::Float64),
276            Int128 => PhysicalType::Primitive(PrimitiveType::Int128),
277            Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs),
278            Interval(IntervalUnit::MonthDayNano) => {
279                PhysicalType::Primitive(PrimitiveType::MonthDayNano)
280            },
281            Binary => PhysicalType::Binary,
282            FixedSizeBinary(_) => PhysicalType::FixedSizeBinary,
283            LargeBinary => PhysicalType::LargeBinary,
284            Utf8 => PhysicalType::Utf8,
285            LargeUtf8 => PhysicalType::LargeUtf8,
286            BinaryView => PhysicalType::BinaryView,
287            Utf8View => PhysicalType::Utf8View,
288            List(_) => PhysicalType::List,
289            FixedSizeList(_, _) => PhysicalType::FixedSizeList,
290            LargeList(_) => PhysicalType::LargeList,
291            Struct(_) => PhysicalType::Struct,
292            Union(_) => PhysicalType::Union,
293            Map(_, _) => PhysicalType::Map,
294            Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
295            Extension(ext) => ext.inner.to_physical_type(),
296            Unknown => unimplemented!(),
297        }
298    }
299
300    // The datatype underlying this (possibly logical) arrow data type.
301    pub fn underlying_physical_type(&self) -> ArrowDataType {
302        use ArrowDataType::*;
303        match self {
304            Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
305            Date64
306            | Timestamp(_, _)
307            | Time64(_)
308            | Duration(_)
309            | Interval(IntervalUnit::DayTime) => Int64,
310            Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
311            Binary => Binary,
312            List(field) => List(Box::new(Field {
313                dtype: field.dtype.underlying_physical_type(),
314                ..*field.clone()
315            })),
316            LargeList(field) => LargeList(Box::new(Field {
317                dtype: field.dtype.underlying_physical_type(),
318                ..*field.clone()
319            })),
320            FixedSizeList(field, width) => FixedSizeList(
321                Box::new(Field {
322                    dtype: field.dtype.underlying_physical_type(),
323                    ..*field.clone()
324                }),
325                *width,
326            ),
327            Struct(fields) => Struct(
328                fields
329                    .iter()
330                    .map(|field| Field {
331                        dtype: field.dtype.underlying_physical_type(),
332                        ..field.clone()
333                    })
334                    .collect(),
335            ),
336            Dictionary(keys, _, _) => (*keys).into(),
337            Union(_) => unimplemented!(),
338            Map(_, _) => unimplemented!(),
339            Extension(ext) => ext.inner.underlying_physical_type(),
340            _ => self.clone(),
341        }
342    }
343
344    /// Returns `&self` for all but [`ArrowDataType::Extension`]. For [`ArrowDataType::Extension`],
345    /// (recursively) returns the inner [`ArrowDataType`].
346    /// Never returns the variant [`ArrowDataType::Extension`].
347    pub fn to_logical_type(&self) -> &ArrowDataType {
348        use ArrowDataType::*;
349        match self {
350            Extension(ext) => ext.inner.to_logical_type(),
351            _ => self,
352        }
353    }
354
355    pub fn inner_dtype(&self) -> Option<&ArrowDataType> {
356        match self {
357            ArrowDataType::List(inner) => Some(inner.dtype()),
358            ArrowDataType::LargeList(inner) => Some(inner.dtype()),
359            ArrowDataType::FixedSizeList(inner, _) => Some(inner.dtype()),
360            _ => None,
361        }
362    }
363
364    pub fn is_nested(&self) -> bool {
365        use ArrowDataType as D;
366
367        matches!(
368            self,
369            D::List(_)
370                | D::LargeList(_)
371                | D::FixedSizeList(_, _)
372                | D::Struct(_)
373                | D::Union(_)
374                | D::Map(_, _)
375                | D::Dictionary(_, _, _)
376                | D::Extension(_)
377        )
378    }
379
380    pub fn is_view(&self) -> bool {
381        matches!(self, ArrowDataType::Utf8View | ArrowDataType::BinaryView)
382    }
383
384    pub fn is_numeric(&self) -> bool {
385        use ArrowDataType as D;
386        matches!(
387            self,
388            D::Int8
389                | D::Int16
390                | D::Int32
391                | D::Int64
392                | D::Int128
393                | D::UInt8
394                | D::UInt16
395                | D::UInt32
396                | D::UInt64
397                | D::Float32
398                | D::Float64
399                | D::Decimal(_, _)
400                | D::Decimal256(_, _)
401        )
402    }
403
404    pub fn to_fixed_size_list(self, size: usize, is_nullable: bool) -> ArrowDataType {
405        ArrowDataType::FixedSizeList(
406            Box::new(Field::new(
407                PlSmallStr::from_static("item"),
408                self,
409                is_nullable,
410            )),
411            size,
412        )
413    }
414
415    /// Check (recursively) whether datatype contains an [`ArrowDataType::Dictionary`] type.
416    pub fn contains_dictionary(&self) -> bool {
417        use ArrowDataType as D;
418        match self {
419            D::Null
420            | D::Boolean
421            | D::Int8
422            | D::Int16
423            | D::Int32
424            | D::Int64
425            | D::UInt8
426            | D::UInt16
427            | D::UInt32
428            | D::UInt64
429            | D::Int128
430            | D::Float16
431            | D::Float32
432            | D::Float64
433            | D::Timestamp(_, _)
434            | D::Date32
435            | D::Date64
436            | D::Time32(_)
437            | D::Time64(_)
438            | D::Duration(_)
439            | D::Interval(_)
440            | D::Binary
441            | D::FixedSizeBinary(_)
442            | D::LargeBinary
443            | D::Utf8
444            | D::LargeUtf8
445            | D::Decimal(_, _)
446            | D::Decimal256(_, _)
447            | D::BinaryView
448            | D::Utf8View
449            | D::Unknown => false,
450            D::List(field)
451            | D::FixedSizeList(field, _)
452            | D::Map(field, _)
453            | D::LargeList(field) => field.dtype().contains_dictionary(),
454            D::Struct(fields) => fields.iter().any(|f| f.dtype().contains_dictionary()),
455            D::Union(union) => union.fields.iter().any(|f| f.dtype().contains_dictionary()),
456            D::Dictionary(_, _, _) => true,
457            D::Extension(ext) => ext.inner.contains_dictionary(),
458        }
459    }
460}
461
462impl From<IntegerType> for ArrowDataType {
463    fn from(item: IntegerType) -> Self {
464        match item {
465            IntegerType::Int8 => ArrowDataType::Int8,
466            IntegerType::Int16 => ArrowDataType::Int16,
467            IntegerType::Int32 => ArrowDataType::Int32,
468            IntegerType::Int64 => ArrowDataType::Int64,
469            IntegerType::Int128 => ArrowDataType::Int128,
470            IntegerType::UInt8 => ArrowDataType::UInt8,
471            IntegerType::UInt16 => ArrowDataType::UInt16,
472            IntegerType::UInt32 => ArrowDataType::UInt32,
473            IntegerType::UInt64 => ArrowDataType::UInt64,
474        }
475    }
476}
477
478impl From<PrimitiveType> for ArrowDataType {
479    fn from(item: PrimitiveType) -> Self {
480        match item {
481            PrimitiveType::Int8 => ArrowDataType::Int8,
482            PrimitiveType::Int16 => ArrowDataType::Int16,
483            PrimitiveType::Int32 => ArrowDataType::Int32,
484            PrimitiveType::Int64 => ArrowDataType::Int64,
485            PrimitiveType::UInt8 => ArrowDataType::UInt8,
486            PrimitiveType::UInt16 => ArrowDataType::UInt16,
487            PrimitiveType::UInt32 => ArrowDataType::UInt32,
488            PrimitiveType::UInt64 => ArrowDataType::UInt64,
489            PrimitiveType::Int128 => ArrowDataType::Int128,
490            PrimitiveType::Int256 => ArrowDataType::Decimal256(32, 32),
491            PrimitiveType::Float16 => ArrowDataType::Float16,
492            PrimitiveType::Float32 => ArrowDataType::Float32,
493            PrimitiveType::Float64 => ArrowDataType::Float64,
494            PrimitiveType::DaysMs => ArrowDataType::Interval(IntervalUnit::DayTime),
495            PrimitiveType::MonthDayNano => ArrowDataType::Interval(IntervalUnit::MonthDayNano),
496            PrimitiveType::UInt128 => unimplemented!(),
497        }
498    }
499}
500
501/// typedef for [`Arc<ArrowSchema>`].
502pub type SchemaRef = Arc<ArrowSchema>;
503
504/// support get extension for metadata
505pub fn get_extension(metadata: &Metadata) -> Extension {
506    if let Some(name) = metadata.get(&PlSmallStr::from_static("ARROW:extension:name")) {
507        let metadata = metadata
508            .get(&PlSmallStr::from_static("ARROW:extension:metadata"))
509            .cloned();
510        Some((name.clone(), metadata))
511    } else {
512        None
513    }
514}
515
516#[cfg(not(feature = "bigidx"))]
517pub type IdxArr = super::array::UInt32Array;
518#[cfg(feature = "bigidx")]
519pub type IdxArr = super::array::UInt64Array;