vector/sinks/util/
statistic.rs

1use std::cmp::Ordering;
2
3use snafu::Snafu;
4
5use crate::event::metric::Sample;
6
/// Error produced when a list of quantiles fails validation.
#[derive(Debug, Snafu)]
pub enum ValidationError {
    /// At least one quantile lies outside the closed interval `[0.0, 1.0]`.
    #[snafu(display("Quantiles must be in range [0.0,1.0]"))]
    QuantileOutOfRange,
}
12
/// Summary statistics of a weighted sample set, as built by
/// `DistributionStatistic::from_samples`.
#[derive(Debug)]
pub struct DistributionStatistic {
    /// Smallest sample value.
    pub min: f64,
    /// Largest sample value.
    pub max: f64,
    /// Value found at quantile `0.5`.
    pub median: f64,
    /// Weighted mean, i.e. `sum / count`.
    pub avg: f64,
    /// Sum of `value * rate` over all samples.
    pub sum: f64,
    /// Sum of the sample rates.
    pub count: u64,
    /// One `(quantile, value)` pair per requested quantile, in request order.
    pub quantiles: Vec<(f64, f64)>,
}
24
25impl DistributionStatistic {
26    pub fn from_samples(source: &[Sample], quantiles: &[f64]) -> Option<Self> {
27        let mut bins = source
28            .iter()
29            .filter(|sample| sample.rate > 0)
30            .copied()
31            .collect::<Vec<_>>();
32
33        match bins.len() {
34            0 => None,
35            1 => Some({
36                let val = bins[0].value;
37                let count = bins[0].rate;
38                Self {
39                    min: val,
40                    max: val,
41                    median: val,
42                    avg: val,
43                    sum: val * count as f64,
44                    count: count as u64,
45                    quantiles: quantiles.iter().map(|&p| (p, val)).collect(),
46                }
47            }),
48            _ => Some({
49                bins.sort_unstable_by(|a, b| {
50                    a.value.partial_cmp(&b.value).unwrap_or(Ordering::Equal)
51                });
52
53                let min = bins.first().unwrap().value;
54                let max = bins.last().unwrap().value;
55                let sum = bins
56                    .iter()
57                    .map(|sample| sample.value * sample.rate as f64)
58                    .sum::<f64>();
59
60                for i in 1..bins.len() {
61                    bins[i].rate += bins[i - 1].rate;
62                }
63
64                let count = bins.last().unwrap().rate;
65                let avg = sum / count as f64;
66
67                let median = find_quantile(&bins, 0.5);
68                let quantiles = quantiles
69                    .iter()
70                    .map(|&p| (p, find_quantile(&bins, p)))
71                    .collect();
72
73                Self {
74                    min,
75                    max,
76                    median,
77                    avg,
78                    sum,
79                    count: count as u64,
80                    quantiles,
81                }
82            }),
83        }
84    }
85}
86
87/// `bins` is a cumulative histogram
88/// We are using R-3 (without choosing the even integer in the case of a tie),
89/// it might be preferable to use a more common function, such as R-7.
90///
91/// List of quantile functions:
92/// <https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample>
93fn find_quantile(bins: &[Sample], p: f64) -> f64 {
94    let count = bins.last().expect("bins is empty").rate;
95    find_sample(bins, (p * count as f64).round() as u32)
96}
97
98/// `bins` is a cumulative histogram
99/// Return the i-th smallest value,
100/// i starts from 1 (i == 1 mean the smallest value).
101/// i == 0 is equivalent to i == 1.
102fn find_sample(bins: &[Sample], i: u32) -> f64 {
103    let index = match bins.binary_search_by_key(&i, |sample| sample.rate) {
104        Ok(index) => index,
105        Err(index) => index,
106    };
107    bins[index].value
108}
109
110pub fn validate_quantiles(quantiles: &[f64]) -> Result<(), ValidationError> {
111    if quantiles
112        .iter()
113        .all(|&quantile| (0.0..=1.0).contains(&quantile))
114    {
115        Ok(())
116    } else {
117        Err(ValidationError::QuantileOutOfRange)
118    }
119}
120
#[cfg(test)]
mod test {
    use super::*;

    // Field-by-field equality so whole statistics can be compared with `assert_eq!`.
    impl PartialEq for DistributionStatistic {
        fn eq(&self, other: &Self) -> bool {
            self.min == other.min
                && self.max == other.max
                && self.median == other.median
                && self.avg == other.avg
                && self.sum == other.sum
                && self.count == other.count
                // Compare the vectors as a whole: zipping the two iterators would stop at
                // the shorter one and treat a missing or extra quantile as equal.
                && self.quantiles == other.quantiles
        }
    }

    // Marker only; none of the fixtures below contains a NaN.
    impl Eq for DistributionStatistic {}

    /// Builds samples from `(value, rate)` pairs.
    fn samples(v: &[(f64, u32)]) -> Vec<Sample> {
        v.iter()
            .map(|&(value, rate)| Sample { value, rate })
            .collect()
    }

    #[test]
    fn test_distribution() {
        // should return None on empty input
        assert_eq!(DistributionStatistic::from_samples(&[], &[0.5]), None);
        // a sample with a zero rate does not count as input
        assert_eq!(
            DistributionStatistic::from_samples(&samples(&[(0.0, 0)]), &[0.5]),
            None
        );

        // test len == 1 case
        assert_eq!(
            DistributionStatistic::from_samples(&samples(&[(0.9, 100)]), &[0.5],).unwrap(),
            DistributionStatistic {
                min: 0.9,
                max: 0.9,
                median: 0.9,
                avg: 0.9,
                sum: 90.0,
                count: 100,
                quantiles: vec![(0.5, 0.9)],
            }
        );

        // odd number of equally weighted samples, no quantiles requested
        assert_eq!(
            DistributionStatistic::from_samples(
                &samples(&[(1.0, 1), (2.0, 1), (3.0, 1), (4.0, 1), (5.0, 1)]),
                &[]
            )
            .unwrap(),
            DistributionStatistic {
                min: 1.0,
                max: 5.0,
                median: 3.0,
                avg: 3.0,
                sum: 15.0,
                count: 5,
                quantiles: Vec::new(),
            }
        );

        // unsorted input, boundary quantiles 0.0 and 1.0
        assert_eq!(
            DistributionStatistic::from_samples(
                &samples(&[(1.0, 1), (2.0, 1), (4.0, 1), (3.0, 1)]),
                &[0.0, 1.0, 0.9]
            )
            .unwrap(),
            DistributionStatistic {
                min: 1.0,
                max: 4.0,
                median: 2.0,
                avg: 2.5,
                sum: 10.0,
                count: 4,
                quantiles: vec![(0.0, 1.0), (1.0, 4.0), (0.9, 4.0)],
            }
        );

        // weighted samples, quantiles falling on and between bin boundaries
        assert_eq!(
            DistributionStatistic::from_samples(
                &samples(&[(1.0, 2), (2.0, 1), (3.0, 4), (4.0, 3)]),
                &[0.75, 0.3, 0.31, 0.29, 0.24],
            )
            .unwrap(),
            DistributionStatistic {
                min: 1.0,
                max: 4.0,
                median: 3.0,
                avg: 2.8,
                sum: 28.0,
                count: 10,
                quantiles: vec![
                    (0.75, 4.0),
                    (0.3, 2.0),
                    (0.31, 2.0),
                    (0.29, 2.0),
                    (0.24, 1.0)
                ],
            }
        );
    }

    #[test]
    fn test_equality_checks_quantile_count() {
        // Statistics that differ only in how many quantiles they carry must not be equal,
        // otherwise the assertions above could not detect a missing quantile.
        let with_quantile =
            DistributionStatistic::from_samples(&samples(&[(0.9, 100)]), &[0.5]).unwrap();
        let without_quantile =
            DistributionStatistic::from_samples(&samples(&[(0.9, 100)]), &[]).unwrap();
        assert_ne!(with_quantile, without_quantile);
    }
}