statrs/distribution/
empirical.rs1use crate::distribution::{Continuous, ContinuousCDF, Uniform};
2use crate::statistics::*;
3use crate::{Result, StatsError};
4use ::num_traits::float::Float;
5use core::cmp::Ordering;
6use rand::Rng;
7use std::collections::BTreeMap;
8
9#[derive(Clone, Debug, PartialEq)]
10struct NonNAN<T>(T);
11
12impl<T: PartialEq> Eq for NonNAN<T> {}
13
14impl<T: PartialOrd> PartialOrd for NonNAN<T> {
15 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
16 self.0.partial_cmp(&other.0)
17 }
18}
19
20impl<T: PartialOrd> Ord for NonNAN<T> {
21 fn cmp(&self, other: &Self) -> Ordering {
22 self.partial_cmp(other).unwrap()
23 }
24}
25
26#[derive(Debug, Clone, PartialEq)]
41pub struct Empirical {
42 sum: f64,
43 mean_and_var: Option<(f64, f64)>,
44 data: BTreeMap<NonNAN<f64>, u64>,
46}
47
48impl Empirical {
49 pub fn new() -> Result<Empirical> {
62 Ok(Empirical {
63 sum: 0.,
64 mean_and_var: None,
65 data: BTreeMap::new(),
66 })
67 }
68 pub fn from_vec(src: Vec<f64>) -> Empirical {
69 let mut empirical = Empirical::new().unwrap();
70 for elt in src.into_iter() {
71 empirical.add(elt);
72 }
73 empirical
74 }
75 pub fn add(&mut self, data_point: f64) {
76 if !data_point.is_nan() {
77 self.sum += 1.;
78 match self.mean_and_var {
79 Some((mean, var)) => {
80 let sum = self.sum;
81 let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum;
82 let mean = mean + (data_point - mean) / sum;
83 self.mean_and_var = Some((mean, var));
84 }
85 None => {
86 self.mean_and_var = Some((data_point, 0.));
87 }
88 }
89 *self.data.entry(NonNAN(data_point)).or_insert(0) += 1;
90 }
91 }
92 pub fn remove(&mut self, data_point: f64) {
93 if !data_point.is_nan() {
94 if let (Some(val), Some((mean, var))) =
95 (self.data.remove(&NonNAN(data_point)), self.mean_and_var)
96 {
97 if val == 1 && self.data.is_empty() {
98 self.mean_and_var = None;
99 self.sum = 0.;
100 return;
101 };
102 let mean = (self.sum * mean - data_point) / (self.sum - 1.);
104 let var =
105 var - (self.sum - 1.) * (data_point - mean) * (data_point - mean) / self.sum;
106 self.sum -= 1.;
107 if val != 1 {
108 self.data.insert(NonNAN(data_point), val - 1);
109 };
110 self.mean_and_var = Some((mean, var));
111 }
112 }
113 }
114 fn __inverse_cdf(&self, p: f64) -> f64 {
123 if p == 0.0 {
124 return self.min();
125 };
126 if p == 1.0 {
127 return self.max();
128 };
129 let mut high = 2.0;
130 let mut low = -high;
131 while self.cdf(low) > p {
132 low = low + low;
133 }
134 while self.cdf(high) < p {
135 high = high + high;
136 }
137 let mut i = 16;
138 while i != 0 {
139 let mid = (high + low) / 2.0;
140 if self.cdf(mid) >= p {
141 high = mid;
142 } else {
143 low = mid;
144 }
145 i -= 1;
146 }
147 (high + low) / 2.0
148 }
149}
150
151impl ::rand::distributions::Distribution<f64> for Empirical {
152 fn sample<R: ?Sized + Rng>(&self, rng: &mut R) -> f64 {
153 let uniform = Uniform::new(0.0, 1.0).unwrap();
154 self.__inverse_cdf(uniform.sample(rng))
155 }
156}
157
158impl Max<f64> for Empirical {
160 fn max(&self) -> f64 {
161 self.data.iter().rev().map(|(key, _)| key.0).next().unwrap()
162 }
163}
164
165impl Min<f64> for Empirical {
167 fn min(&self) -> f64 {
168 self.data.iter().map(|(key, _)| key.0).next().unwrap()
169 }
170}
171
172impl Distribution<f64> for Empirical {
173 fn mean(&self) -> Option<f64> {
174 self.mean_and_var.map(|(mean, _)| mean)
175 }
176 fn variance(&self) -> Option<f64> {
177 self.mean_and_var.map(|(_, var)| var / (self.sum - 1.))
178 }
179}
180
181impl ContinuousCDF<f64, f64> for Empirical {
182 fn cdf(&self, x: f64) -> f64 {
183 let mut sum = 0;
184 for (keys, values) in &self.data {
185 if keys.0 > x {
186 return sum as f64 / self.sum;
187 }
188 sum += values;
189 }
190 sum as f64 / self.sum
191 }
192
193 fn sf(&self, x: f64) -> f64 {
194 let mut sum = 0;
195 for (keys, values) in self.data.iter().rev() {
196 if keys.0 <= x {
197 return sum as f64 / self.sum;
198 }
199 sum += values;
200 }
201 sum as f64 / self.sum
202 }
203}
204
205#[cfg(all(test, feature = "nightly"))]
206mod tests {
207 use super::*;
208 #[test]
209 fn test_cdf() {
210 let samples = vec![5.0, 10.0];
211 let mut empirical = Empirical::from_vec(samples);
212 assert_eq!(empirical.cdf(0.0), 0.0);
213 assert_eq!(empirical.cdf(5.0), 0.5);
214 assert_eq!(empirical.cdf(5.5), 0.5);
215 assert_eq!(empirical.cdf(6.0), 0.5);
216 assert_eq!(empirical.cdf(10.0), 1.0);
217 assert_eq!(empirical.min(), 5.0);
218 assert_eq!(empirical.max(), 10.0);
219 empirical.add(2.0);
220 empirical.add(2.0);
221 assert_eq!(empirical.cdf(0.0), 0.0);
222 assert_eq!(empirical.cdf(5.0), 0.75);
223 assert_eq!(empirical.cdf(5.5), 0.75);
224 assert_eq!(empirical.cdf(6.0), 0.75);
225 assert_eq!(empirical.cdf(10.0), 1.0);
226 assert_eq!(empirical.min(), 2.0);
227 assert_eq!(empirical.max(), 10.0);
228 let unchanged = empirical.clone();
229 empirical.add(2.0);
230 empirical.remove(2.0);
231 assert_eq!(unchanged, empirical);
234 }
235
236 #[test]
237 fn test_sf() {
238 let samples = vec![5.0, 10.0];
239 let mut empirical = Empirical::from_vec(samples);
240 assert_eq!(empirical.sf(0.0), 1.0);
241 assert_eq!(empirical.sf(5.0), 0.5);
242 assert_eq!(empirical.sf(5.5), 0.5);
243 assert_eq!(empirical.sf(6.0), 0.5);
244 assert_eq!(empirical.sf(10.0), 0.0);
245 assert_eq!(empirical.min(), 5.0);
246 assert_eq!(empirical.max(), 10.0);
247 empirical.add(2.0);
248 empirical.add(2.0);
249 assert_eq!(empirical.sf(0.0), 1.0);
250 assert_eq!(empirical.sf(5.0), 0.25);
251 assert_eq!(empirical.sf(5.5), 0.25);
252 assert_eq!(empirical.sf(6.0), 0.25);
253 assert_eq!(empirical.sf(10.0), 0.0);
254 assert_eq!(empirical.min(), 2.0);
255 assert_eq!(empirical.max(), 10.0);
256 let unchanged = empirical.clone();
257 empirical.add(2.0);
258 empirical.remove(2.0);
259 assert_eq!(unchanged, empirical);
262 }
263}