1use std::{collections::HashMap, str::FromStr};
2
3use unicode_segmentation::UnicodeSegmentation;
4
5use crate::compiler::function::EnumVariant;
6use crate::compiler::prelude::*;
7use std::sync::LazyLock;
8
9static DEFAULT_SEGMENTATION: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("byte")));
10
11static SEGMENTATION_ENUM: &[EnumVariant] = &[
12 EnumVariant {
13 value: "byte",
14 description: "Considers individual bytes when calculating entropy",
15 },
16 EnumVariant {
17 value: "codepoint",
18 description: "Considers codepoints when calculating entropy",
19 },
20 EnumVariant {
21 value: "grapheme",
22 description: "Considers graphemes when calculating entropy",
23 },
24];
25
26static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
27 vec![
28 Parameter::required("value", kind::BYTES, "The input string."),
29 Parameter::optional(
30 "segmentation",
31 kind::BYTES,
32 "Defines how to split the string to calculate entropy, based on occurrences of
33segments.
34
35Byte segmentation is the fastest, but it might give undesired results when handling
36UTF-8 strings, while grapheme segmentation is the slowest, but most correct in these
37cases.",
38 )
39 .default(&DEFAULT_SEGMENTATION)
40 .enum_variants(SEGMENTATION_ENUM),
41 ]
42});
43
44#[allow(clippy::cast_precision_loss)]
49fn shannon_entropy(value: &Value, segmentation: &Segmentation) -> Resolved {
50 let (occurence_counts, total_length): (Vec<usize>, usize) = match segmentation {
51 Segmentation::Byte => {
52 let bytes = value.clone().try_bytes()?;
55 let mut counts = [0usize; 256];
56 let total_len = bytes.len() as f64;
57
58 for b in bytes {
59 counts[b as usize] += 1;
60 }
61
62 let mut entropy = 0.0;
63
64 for count in counts {
65 if count == 0 {
66 continue;
67 }
68
69 let p = count as f64 / total_len;
70
71 entropy -= p * p.log2();
72 }
73
74 return Ok(Value::from_f64_or_zero(entropy));
75 }
76 Segmentation::Codepoint => {
77 let string = value.try_bytes_utf8_lossy()?;
78 let chars = string.chars();
79 let mut counts = HashMap::new();
80 let mut total_len = 0;
81
82 for char in chars {
83 counts.entry(char).and_modify(|c| *c += 1).or_insert(1);
84 total_len += 1;
85 }
86
87 (counts.into_values().collect(), total_len)
88 }
89 Segmentation::Grapheme => {
90 let string = value.try_bytes_utf8_lossy()?;
91 let graphemes = string.graphemes(true);
92 let mut counts = HashMap::new();
93 let mut total_len = 0;
94
95 for grapheme in graphemes {
96 counts.entry(grapheme).and_modify(|c| *c += 1).or_insert(1);
97 total_len += 1;
98 }
99
100 (counts.into_values().collect(), total_len)
101 }
102 };
103
104 Ok(Value::from_f64_or_zero(
105 occurence_counts
106 .iter()
107 .map(|occurence_count| *occurence_count as f64 / total_length as f64)
109 .fold(0f64, |acc, p| acc - (p * p.log2())),
111 ))
112}
113
114#[derive(Default, Debug, Clone)]
115enum Segmentation {
116 #[default]
117 Byte,
118 Codepoint,
119 Grapheme,
120}
121
122impl FromStr for Segmentation {
123 type Err = ();
124
125 fn from_str(s: &str) -> Result<Self, Self::Err> {
126 match s {
127 "byte" => Ok(Self::Byte),
128 "codepoint" => Ok(Self::Codepoint),
129 "grapheme" => Ok(Self::Grapheme),
130 _ => Err(()),
131 }
132 }
133}
134
135#[derive(Clone, Copy, Debug)]
136pub struct ShannonEntropy;
137
138impl Function for ShannonEntropy {
139 fn identifier(&self) -> &'static str {
140 "shannon_entropy"
141 }
142
143 fn usage(&self) -> &'static str {
144 "Generates [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) from given string. It can generate it based on string bytes, codepoints, or graphemes."
145 }
146
147 fn category(&self) -> &'static str {
148 Category::String.as_ref()
149 }
150
151 fn return_kind(&self) -> u16 {
152 kind::FLOAT
153 }
154
155 fn parameters(&self) -> &'static [Parameter] {
156 PARAMETERS.as_slice()
157 }
158
159 fn examples(&self) -> &'static [Example] {
160 &[
161 example! {
162 title: "Simple byte segmentation example",
163 source: r#"floor(shannon_entropy("vector.dev"), precision: 4)"#,
164 result: Ok("2.9219"),
165 },
166 example! {
167 title: "UTF-8 string with bytes segmentation",
168 source: r#"floor(shannon_entropy("test123%456.فوائد.net."), precision: 4)"#,
169 result: Ok("4.0784"),
170 },
171 example! {
172 title: "UTF-8 string with grapheme segmentation",
173 source: r#"floor(shannon_entropy("test123%456.فوائد.net.", segmentation: "grapheme"), precision: 4)"#,
174 result: Ok("3.9362"),
175 },
176 example! {
177 title: "UTF-8 emoji (7 Unicode scalar values) with grapheme segmentation",
178 source: r#"shannon_entropy("👨👩👧👦", segmentation: "grapheme")"#,
179 result: Ok("0.0"),
180 },
181 ]
182 }
183
184 fn compile(
185 &self,
186 state: &state::TypeState,
187 _ctx: &mut FunctionCompileContext,
188 arguments: ArgumentList,
189 ) -> Compiled {
190 let value = arguments.required("value");
191 let segmentation = arguments
192 .optional_enum(
193 "segmentation",
194 &["byte".into(), "codepoint".into(), "grapheme".into()],
195 state,
196 )?
197 .unwrap_or_else(|| DEFAULT_SEGMENTATION.clone())
198 .try_bytes_utf8_lossy()
199 .map(|s| Segmentation::from_str(&s).expect("validated enum"))
200 .expect("segmentation not bytes");
201
202 Ok(ShannonEntropyFn {
203 value,
204 segmentation,
205 }
206 .as_expr())
207 }
208}
209
210#[derive(Debug, Clone)]
211struct ShannonEntropyFn {
212 value: Box<dyn Expression>,
213 segmentation: Segmentation,
214}
215
216impl FunctionExpression for ShannonEntropyFn {
217 fn resolve(&self, ctx: &mut Context) -> Resolved {
218 let value = self.value.resolve(ctx)?;
219
220 shannon_entropy(&value, &self.segmentation)
221 }
222
223 fn type_def(&self, _: &state::TypeState) -> TypeDef {
224 TypeDef::float().infallible()
225 }
226}
227
228#[cfg(test)]
229mod tests {
230 use std::collections::BTreeMap;
231
232 use super::*;
233 use crate::{stdlib::util::round_to_precision, value};
234
235 #[test]
236 fn simple_example() {
237 assert_eq!(
238 value!(2.9219),
239 execute_function_with_precision(
240 &ShannonEntropyFn {
241 value: expr!("vector.dev"),
242 segmentation: Segmentation::default()
243 },
244 4
245 )
246 );
247 }
248
249 #[test]
250 fn longer_example() {
251 assert_eq!(
252 value!(3.737),
253 execute_function_with_precision(
254 &ShannonEntropyFn {
255 value: expr!("Supercalifragilisticexpialidocious"),
256 segmentation: Segmentation::default()
257 },
258 4
259 )
260 );
261 }
262
263 #[test]
264 fn fancy_foo_example() {
265 assert_eq!(
266 value!(1.5),
267 execute_function(&ShannonEntropyFn {
268 value: expr!("ƒoo"),
269 segmentation: Segmentation::default()
270 })
271 );
272 }
273
274 #[test]
275 fn fancy_foo_codepoint_segmentation_example() {
276 assert_eq!(
277 value!(0.9183),
278 execute_function_with_precision(
279 &ShannonEntropyFn {
280 value: expr!("ƒoo"),
281 segmentation: Segmentation::Codepoint
282 },
283 4
284 )
285 );
286 }
287
288 #[test]
289 fn utf_8_byte_segmentation_example() {
290 assert_eq!(
291 value!(4.0784),
292 execute_function_with_precision(
293 &ShannonEntropyFn {
294 value: expr!("test123%456.فوائد.net."),
295 segmentation: Segmentation::default()
296 },
297 4
298 )
299 );
300 }
301
302 #[test]
303 fn utf_8_codepoint_segmentation_example() {
304 assert_eq!(
305 value!(3.9363),
306 execute_function_with_precision(
307 &ShannonEntropyFn {
308 value: expr!("test123%456.فوائد.net."),
309 segmentation: Segmentation::Codepoint
310 },
311 4
312 )
313 );
314 }
315
316 #[test]
317 fn utf_8_example() {
318 assert_eq!(
319 value!(3.9363),
320 execute_function_with_precision(
321 &ShannonEntropyFn {
322 value: expr!("test123%456.فوائد.net."),
323 segmentation: Segmentation::Grapheme
324 },
325 4
326 )
327 );
328 }
329
330 fn prepare_function(function: &ShannonEntropyFn) -> Resolved {
331 let tz = TimeZone::default();
332 let mut object: Value = Value::Object(BTreeMap::new());
333 let mut runtime_state = state::RuntimeState::default();
334 let mut ctx = Context::new(&mut object, &mut runtime_state, &tz);
335 function.resolve(&mut ctx)
336 }
337
338 fn execute_function(function: &ShannonEntropyFn) -> Value {
339 prepare_function(function)
340 .map_err(|e| format!("{:#}", anyhow::anyhow!(e)))
341 .unwrap()
342 }
343
344 fn execute_function_with_precision(function: &ShannonEntropyFn, precision: i64) -> Value {
345 Value::from_f64_or_zero(round_to_precision(
346 execute_function(function).try_float().unwrap(),
347 precision,
348 f64::round,
349 ))
350 }
351}