statrs/distribution/empirical.rs

1use crate::distribution::{Continuous, ContinuousCDF, Uniform};
2use crate::statistics::*;
3use crate::{Result, StatsError};
4use ::num_traits::float::Float;
5use core::cmp::Ordering;
6use rand::Rng;
7use std::collections::BTreeMap;
8
// Newtype wrapper used as a `BTreeMap` key: callers only ever construct it
// from values that have passed an `is_nan()` check (see `Empirical::add`),
// which makes the `Eq`/`Ord` impls below sound.
#[derive(Clone, Debug, PartialEq)]
struct NonNAN<T>(T);
11
12impl<T: PartialEq> Eq for NonNAN<T> {}
13
impl<T: PartialOrd> PartialOrd for NonNAN<T> {
    // Plain delegation to the inner value's ordering.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.0.partial_cmp(&other.0)
    }
}
19
20impl<T: PartialOrd> Ord for NonNAN<T> {
21    fn cmp(&self, other: &Self) -> Ordering {
22        self.partial_cmp(other).unwrap()
23    }
24}
25
/// Implements the [Empirical
/// Distribution](https://en.wikipedia.org/wiki/Empirical_distribution_function)
///
/// # Examples
///
/// ```
/// use statrs::distribution::{Continuous, Empirical};
/// use statrs::statistics::Distribution;
///
/// let samples = vec![0.0, 5.0, 10.0];
///
/// let empirical = Empirical::from_vec(samples);
/// assert_eq!(empirical.mean().unwrap(), 5.0);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct Empirical {
    // number of (non-NaN) data points added so far, kept as f64 for the
    // streaming mean/variance arithmetic
    sum: f64,
    // running mean and running *sum of squared deviations* from the mean
    // (not yet divided by n - 1); `None` while the distribution is empty
    mean_and_var: Option<(f64, f64)>,
    // keys are data points, values are number of data points with equal value
    data: BTreeMap<NonNAN<f64>, u64>,
}
47
impl Empirical {
    /// Constructs a new, empty empirical distribution.
    ///
    /// Note: construction currently cannot fail; the `Result` return type is
    /// kept for signature consistency with the other distribution
    /// constructors in this crate.
    ///
    /// # Examples
    ///
    /// ```
    /// use statrs::distribution::Empirical;
    ///
    /// let mut result = Empirical::new();
    /// assert!(result.is_ok());
    ///
    /// ```
    pub fn new() -> Result<Empirical> {
        Ok(Empirical {
            sum: 0.,
            mean_and_var: None,
            data: BTreeMap::new(),
        })
    }
    /// Builds an empirical distribution from the given samples.
    ///
    /// NaN samples are silently ignored (see [`Empirical::add`]).
    pub fn from_vec(src: Vec<f64>) -> Empirical {
        let mut empirical = Empirical::new().unwrap();
        for elt in src.into_iter() {
            empirical.add(elt);
        }
        empirical
    }
    /// Adds one data point, updating the count, running mean, and running
    /// sum of squared deviations incrementally (Welford-style update).
    ///
    /// NaN data points are ignored.
    pub fn add(&mut self, data_point: f64) {
        if !data_point.is_nan() {
            // `sum` counts the (non-NaN) data points seen so far.
            self.sum += 1.;
            match self.mean_and_var {
                Some((mean, var)) => {
                    let sum = self.sum;
                    // `var` accumulates the sum of squared deviations from the
                    // mean; `variance()` divides it by (n - 1) on demand.
                    let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum;
                    let mean = mean + (data_point - mean) / sum;
                    self.mean_and_var = Some((mean, var));
                }
                None => {
                    // First data point: the mean is the point itself and
                    // there is no spread yet.
                    self.mean_and_var = Some((data_point, 0.));
                }
            }
            *self.data.entry(NonNAN(data_point)).or_insert(0) += 1;
        }
    }
    /// Removes one occurrence of `data_point`, undoing the streaming
    /// mean/variance update performed by [`Empirical::add`].
    ///
    /// Data points that are NaN or not present are ignored. Because the
    /// update is the algebraic inverse of the streaming form, repeated
    /// add/remove cycles may accumulate floating-point rounding error
    /// (see the note in the tests below).
    pub fn remove(&mut self, data_point: f64) {
        if !data_point.is_nan() {
            if let (Some(val), Some((mean, var))) =
                (self.data.remove(&NonNAN(data_point)), self.mean_and_var)
            {
                // Removing the last remaining data point resets the
                // distribution to its empty state.
                if val == 1 && self.data.is_empty() {
                    self.mean_and_var = None;
                    self.sum = 0.;
                    return;
                };
                // reset mean and var: invert the add() update. `mean` here is
                // the mean of the remaining points; the var update must use
                // this post-removal mean with the pre-removal count.
                let mean = (self.sum * mean - data_point) / (self.sum - 1.);
                let var =
                    var - (self.sum - 1.) * (data_point - mean) * (data_point - mean) / self.sum;
                self.sum -= 1.;
                // Re-insert with a decremented multiplicity unless this was
                // the last occurrence of the value.
                if val != 1 {
                    self.data.insert(NonNAN(data_point), val - 1);
                };
                self.mean_and_var = Some((mean, var));
            }
        }
    }
    // Due to issues with rounding and floating-point accuracy the default
    // implementation may be ill-behaved.
    // Specialized inverse cdfs should be used whenever possible.
    // Performs a binary search on the domain of `cdf` to obtain an approximation
    // of `F^-1(p) := inf { x | F(x) >= p }`. Needless to say, performance may
    // may be lacking.
    // This function is identical to the default method implementation in the
    // `ContinuousCDF` trait and is used to implement the rand trait `Distribution`.
    fn __inverse_cdf(&self, p: f64) -> f64 {
        // Exact endpoints avoid a pointless search at the extremes.
        if p == 0.0 {
            return self.min();
        };
        if p == 1.0 {
            return self.max();
        };
        // Grow the bracket [low, high] geometrically until it contains F^-1(p).
        let mut high = 2.0;
        let mut low = -high;
        while self.cdf(low) > p {
            low = low + low;
        }
        while self.cdf(high) < p {
            high = high + high;
        }
        // Fixed 16 rounds of bisection; precision is limited accordingly.
        let mut i = 16;
        while i != 0 {
            let mid = (high + low) / 2.0;
            if self.cdf(mid) >= p {
                high = mid;
            } else {
                low = mid;
            }
            i -= 1;
        }
        (high + low) / 2.0
    }
}
150
151impl ::rand::distributions::Distribution<f64> for Empirical {
152    fn sample<R: ?Sized + Rng>(&self, rng: &mut R) -> f64 {
153        let uniform = Uniform::new(0.0, 1.0).unwrap();
154        self.__inverse_cdf(uniform.sample(rng))
155    }
156}
157
158/// Panics if number of samples is zero
159impl Max<f64> for Empirical {
160    fn max(&self) -> f64 {
161        self.data.iter().rev().map(|(key, _)| key.0).next().unwrap()
162    }
163}
164
165/// Panics if number of samples is zero
166impl Min<f64> for Empirical {
167    fn min(&self) -> f64 {
168        self.data.iter().map(|(key, _)| key.0).next().unwrap()
169    }
170}
171
172impl Distribution<f64> for Empirical {
173    fn mean(&self) -> Option<f64> {
174        self.mean_and_var.map(|(mean, _)| mean)
175    }
176    fn variance(&self) -> Option<f64> {
177        self.mean_and_var.map(|(_, var)| var / (self.sum - 1.))
178    }
179}
180
181impl ContinuousCDF<f64, f64> for Empirical {
182    fn cdf(&self, x: f64) -> f64 {
183        let mut sum = 0;
184        for (keys, values) in &self.data {
185            if keys.0 > x {
186                return sum as f64 / self.sum;
187            }
188            sum += values;
189        }
190        sum as f64 / self.sum
191    }
192
193    fn sf(&self, x: f64) -> f64 {
194        let mut sum = 0;
195        for (keys, values) in self.data.iter().rev() {
196            if keys.0 <= x {
197                return sum as f64 / self.sum;
198            }
199            sum += values;
200        }
201        sum as f64 / self.sum
202    }
203}
204
// Tests exercise the CDF/SF, min/max, and the streaming add/remove
// bookkeeping (including its exact-reversibility for these inputs).
#[cfg(all(test, feature = "nightly"))]
mod tests {
    use super::*;
    #[test]
    fn test_cdf() {
        let samples = vec![5.0, 10.0];
        let mut empirical = Empirical::from_vec(samples);
        assert_eq!(empirical.cdf(0.0), 0.0);
        assert_eq!(empirical.cdf(5.0), 0.5);
        assert_eq!(empirical.cdf(5.5), 0.5);
        assert_eq!(empirical.cdf(6.0), 0.5);
        assert_eq!(empirical.cdf(10.0), 1.0);
        assert_eq!(empirical.min(), 5.0);
        assert_eq!(empirical.max(), 10.0);
        // Adding duplicates must be reflected in the step heights.
        empirical.add(2.0);
        empirical.add(2.0);
        assert_eq!(empirical.cdf(0.0), 0.0);
        assert_eq!(empirical.cdf(5.0), 0.75);
        assert_eq!(empirical.cdf(5.5), 0.75);
        assert_eq!(empirical.cdf(6.0), 0.75);
        assert_eq!(empirical.cdf(10.0), 1.0);
        assert_eq!(empirical.min(), 2.0);
        assert_eq!(empirical.max(), 10.0);
        let unchanged = empirical.clone();
        empirical.add(2.0);
        empirical.remove(2.0);
        // because of rounding errors, this doesn't hold in general
        // due to the mean and variance being calculated in a streaming way
        assert_eq!(unchanged, empirical);
    }

    #[test]
    fn test_sf() {
        let samples = vec![5.0, 10.0];
        let mut empirical = Empirical::from_vec(samples);
        assert_eq!(empirical.sf(0.0), 1.0);
        assert_eq!(empirical.sf(5.0), 0.5);
        assert_eq!(empirical.sf(5.5), 0.5);
        assert_eq!(empirical.sf(6.0), 0.5);
        assert_eq!(empirical.sf(10.0), 0.0);
        assert_eq!(empirical.min(), 5.0);
        assert_eq!(empirical.max(), 10.0);
        // Adding duplicates must be reflected in the step heights.
        empirical.add(2.0);
        empirical.add(2.0);
        assert_eq!(empirical.sf(0.0), 1.0);
        assert_eq!(empirical.sf(5.0), 0.25);
        assert_eq!(empirical.sf(5.5), 0.25);
        assert_eq!(empirical.sf(6.0), 0.25);
        assert_eq!(empirical.sf(10.0), 0.0);
        assert_eq!(empirical.min(), 2.0);
        assert_eq!(empirical.max(), 10.0);
        let unchanged = empirical.clone();
        empirical.add(2.0);
        empirical.remove(2.0);
        // because of rounding errors, this doesn't hold in general
        // due to the mean and variance being calculated in a streaming way
        assert_eq!(unchanged, empirical);
    }
}