vrl/stdlib/
shannon_entropy.rs

1use std::{collections::HashMap, str::FromStr};
2
3use unicode_segmentation::UnicodeSegmentation;
4
5use crate::compiler::function::EnumVariant;
6use crate::compiler::prelude::*;
7use std::sync::LazyLock;
8
9static DEFAULT_SEGMENTATION: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("byte")));
10
11static SEGMENTATION_ENUM: &[EnumVariant] = &[
12    EnumVariant {
13        value: "byte",
14        description: "Considers individual bytes when calculating entropy",
15    },
16    EnumVariant {
17        value: "codepoint",
18        description: "Considers codepoints when calculating entropy",
19    },
20    EnumVariant {
21        value: "grapheme",
22        description: "Considers graphemes when calculating entropy",
23    },
24];
25
26static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
27    vec![
28        Parameter::required("value", kind::BYTES, "The input string."),
29        Parameter::optional(
30            "segmentation",
31            kind::BYTES,
32            "Defines how to split the string to calculate entropy, based on occurrences of
33segments.
34
35Byte segmentation is the fastest, but it might give undesired results when handling
36UTF-8 strings, while grapheme segmentation is the slowest, but most correct in these
37cases.",
38        )
39        .default(&DEFAULT_SEGMENTATION)
40        .enum_variants(SEGMENTATION_ENUM),
41    ]
42});
43
44// Casting to f64 in this function is only done to enable proper division (when calculating probability)
45// Since numbers being casted represent lenghts of input strings and number of character occurences,
46// we can assume that there will never really be precision loss here, because that would mean that
47// the string is at least 2^52 bytes in size (4.5 PB)
48#[allow(clippy::cast_precision_loss)]
49fn shannon_entropy(value: &Value, segmentation: &Segmentation) -> Resolved {
50    let (occurence_counts, total_length): (Vec<usize>, usize) = match segmentation {
51        Segmentation::Byte => {
52            // Optimized version for bytes, since there is a limited number of options, that could
53            // easily be kept track of
54            let bytes = value.clone().try_bytes()?;
55            let mut counts = [0usize; 256];
56            let total_len = bytes.len() as f64;
57
58            for b in bytes {
59                counts[b as usize] += 1;
60            }
61
62            let mut entropy = 0.0;
63
64            for count in counts {
65                if count == 0 {
66                    continue;
67                }
68
69                let p = count as f64 / total_len;
70
71                entropy -= p * p.log2();
72            }
73
74            return Ok(Value::from_f64_or_zero(entropy));
75        }
76        Segmentation::Codepoint => {
77            let string = value.try_bytes_utf8_lossy()?;
78            let chars = string.chars();
79            let mut counts = HashMap::new();
80            let mut total_len = 0;
81
82            for char in chars {
83                counts.entry(char).and_modify(|c| *c += 1).or_insert(1);
84                total_len += 1;
85            }
86
87            (counts.into_values().collect(), total_len)
88        }
89        Segmentation::Grapheme => {
90            let string = value.try_bytes_utf8_lossy()?;
91            let graphemes = string.graphemes(true);
92            let mut counts = HashMap::new();
93            let mut total_len = 0;
94
95            for grapheme in graphemes {
96                counts.entry(grapheme).and_modify(|c| *c += 1).or_insert(1);
97                total_len += 1;
98            }
99
100            (counts.into_values().collect(), total_len)
101        }
102    };
103
104    Ok(Value::from_f64_or_zero(
105        occurence_counts
106            .iter()
107            // Calculate probability of each item by diving occurence count by total length
108            .map(|occurence_count| *occurence_count as f64 / total_length as f64)
109            // Calculate entropy using definition: sum(-p * log2(p))
110            .fold(0f64, |acc, p| acc - (p * p.log2())),
111    ))
112}
113
114#[derive(Default, Debug, Clone)]
115enum Segmentation {
116    #[default]
117    Byte,
118    Codepoint,
119    Grapheme,
120}
121
122impl FromStr for Segmentation {
123    type Err = ();
124
125    fn from_str(s: &str) -> Result<Self, Self::Err> {
126        match s {
127            "byte" => Ok(Self::Byte),
128            "codepoint" => Ok(Self::Codepoint),
129            "grapheme" => Ok(Self::Grapheme),
130            _ => Err(()),
131        }
132    }
133}
134
135#[derive(Clone, Copy, Debug)]
136pub struct ShannonEntropy;
137
138impl Function for ShannonEntropy {
139    fn identifier(&self) -> &'static str {
140        "shannon_entropy"
141    }
142
143    fn usage(&self) -> &'static str {
144        "Generates [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) from given string. It can generate it based on string bytes, codepoints, or graphemes."
145    }
146
147    fn category(&self) -> &'static str {
148        Category::String.as_ref()
149    }
150
151    fn return_kind(&self) -> u16 {
152        kind::FLOAT
153    }
154
155    fn parameters(&self) -> &'static [Parameter] {
156        PARAMETERS.as_slice()
157    }
158
159    fn examples(&self) -> &'static [Example] {
160        &[
161            example! {
162                title: "Simple byte segmentation example",
163                source: r#"floor(shannon_entropy("vector.dev"), precision: 4)"#,
164                result: Ok("2.9219"),
165            },
166            example! {
167                title: "UTF-8 string with bytes segmentation",
168                source: r#"floor(shannon_entropy("test123%456.فوائد.net."), precision: 4)"#,
169                result: Ok("4.0784"),
170            },
171            example! {
172                title: "UTF-8 string with grapheme segmentation",
173                source: r#"floor(shannon_entropy("test123%456.فوائد.net.", segmentation: "grapheme"), precision: 4)"#,
174                result: Ok("3.9362"),
175            },
176            example! {
177                title: "UTF-8 emoji (7 Unicode scalar values) with grapheme segmentation",
178                source: r#"shannon_entropy("👨‍👩‍👧‍👦", segmentation: "grapheme")"#,
179                result: Ok("0.0"),
180            },
181        ]
182    }
183
184    fn compile(
185        &self,
186        state: &state::TypeState,
187        _ctx: &mut FunctionCompileContext,
188        arguments: ArgumentList,
189    ) -> Compiled {
190        let value = arguments.required("value");
191        let segmentation = arguments
192            .optional_enum(
193                "segmentation",
194                &["byte".into(), "codepoint".into(), "grapheme".into()],
195                state,
196            )?
197            .unwrap_or_else(|| DEFAULT_SEGMENTATION.clone())
198            .try_bytes_utf8_lossy()
199            .map(|s| Segmentation::from_str(&s).expect("validated enum"))
200            .expect("segmentation not bytes");
201
202        Ok(ShannonEntropyFn {
203            value,
204            segmentation,
205        }
206        .as_expr())
207    }
208}
209
210#[derive(Debug, Clone)]
211struct ShannonEntropyFn {
212    value: Box<dyn Expression>,
213    segmentation: Segmentation,
214}
215
216impl FunctionExpression for ShannonEntropyFn {
217    fn resolve(&self, ctx: &mut Context) -> Resolved {
218        let value = self.value.resolve(ctx)?;
219
220        shannon_entropy(&value, &self.segmentation)
221    }
222
223    fn type_def(&self, _: &state::TypeState) -> TypeDef {
224        TypeDef::float().infallible()
225    }
226}
227
228#[cfg(test)]
229mod tests {
230    use std::collections::BTreeMap;
231
232    use super::*;
233    use crate::{stdlib::util::round_to_precision, value};
234
235    #[test]
236    fn simple_example() {
237        assert_eq!(
238            value!(2.9219),
239            execute_function_with_precision(
240                &ShannonEntropyFn {
241                    value: expr!("vector.dev"),
242                    segmentation: Segmentation::default()
243                },
244                4
245            )
246        );
247    }
248
249    #[test]
250    fn longer_example() {
251        assert_eq!(
252            value!(3.737),
253            execute_function_with_precision(
254                &ShannonEntropyFn {
255                    value: expr!("Supercalifragilisticexpialidocious"),
256                    segmentation: Segmentation::default()
257                },
258                4
259            )
260        );
261    }
262
263    #[test]
264    fn fancy_foo_example() {
265        assert_eq!(
266            value!(1.5),
267            execute_function(&ShannonEntropyFn {
268                value: expr!("ƒoo"),
269                segmentation: Segmentation::default()
270            })
271        );
272    }
273
274    #[test]
275    fn fancy_foo_codepoint_segmentation_example() {
276        assert_eq!(
277            value!(0.9183),
278            execute_function_with_precision(
279                &ShannonEntropyFn {
280                    value: expr!("ƒoo"),
281                    segmentation: Segmentation::Codepoint
282                },
283                4
284            )
285        );
286    }
287
288    #[test]
289    fn utf_8_byte_segmentation_example() {
290        assert_eq!(
291            value!(4.0784),
292            execute_function_with_precision(
293                &ShannonEntropyFn {
294                    value: expr!("test123%456.فوائد.net."),
295                    segmentation: Segmentation::default()
296                },
297                4
298            )
299        );
300    }
301
302    #[test]
303    fn utf_8_codepoint_segmentation_example() {
304        assert_eq!(
305            value!(3.9363),
306            execute_function_with_precision(
307                &ShannonEntropyFn {
308                    value: expr!("test123%456.فوائد.net."),
309                    segmentation: Segmentation::Codepoint
310                },
311                4
312            )
313        );
314    }
315
316    #[test]
317    fn utf_8_example() {
318        assert_eq!(
319            value!(3.9363),
320            execute_function_with_precision(
321                &ShannonEntropyFn {
322                    value: expr!("test123%456.فوائد.net."),
323                    segmentation: Segmentation::Grapheme
324                },
325                4
326            )
327        );
328    }
329
330    fn prepare_function(function: &ShannonEntropyFn) -> Resolved {
331        let tz = TimeZone::default();
332        let mut object: Value = Value::Object(BTreeMap::new());
333        let mut runtime_state = state::RuntimeState::default();
334        let mut ctx = Context::new(&mut object, &mut runtime_state, &tz);
335        function.resolve(&mut ctx)
336    }
337
338    fn execute_function(function: &ShannonEntropyFn) -> Value {
339        prepare_function(function)
340            .map_err(|e| format!("{:#}", anyhow::anyhow!(e)))
341            .unwrap()
342    }
343
344    fn execute_function_with_precision(function: &ShannonEntropyFn, precision: i64) -> Value {
345        Value::from_f64_or_zero(round_to_precision(
346            execute_function(function).try_float().unwrap(),
347            precision,
348            f64::round,
349        ))
350    }
351}