vrl/stdlib/
sieve.rs

1use crate::compiler::prelude::*;
2use std::sync::LazyLock;
3
4static DEFAULT_REPLACE_SINGLE: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
5static DEFAULT_REPLACE_REPEATED: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
6
7static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
8    vec![
9        Parameter::required("value", kind::BYTES, "The original string."),
10        Parameter::required(
11            "permitted_characters",
12            kind::REGEX,
13            "Keep all matches of this pattern.",
14        ),
15        Parameter::optional(
16            "replace_single",
17            kind::BYTES,
18            "The string to use to replace single rejected characters.",
19        )
20        .default(&DEFAULT_REPLACE_SINGLE),
21        Parameter::optional(
22            "replace_repeated",
23            kind::BYTES,
24            "The string to use to replace multiple sequential instances of rejected characters.",
25        )
26        .default(&DEFAULT_REPLACE_REPEATED),
27    ]
28});
29
30fn sieve(
31    value: &Value,
32    permitted_characters: Value,
33    replace_single: &Value,
34    replace_repeated: &Value,
35) -> Resolved {
36    let value = value.try_bytes_utf8_lossy()?;
37    let replace_single = replace_single.try_bytes_utf8_lossy()?;
38    let replace_repeated = replace_repeated.try_bytes_utf8_lossy()?;
39
40    match permitted_characters {
41        Value::Regex(regex) => {
42            let mut result = String::with_capacity(value.len());
43            let mut last_end = 0;
44            for m in regex.find_iter(&value) {
45                match m.start() - last_end {
46                    l if l > 1 => result += &replace_repeated,
47                    1 => result += &replace_single,
48                    _ => (),
49                }
50                last_end = m.end();
51                result += m.as_str();
52            }
53            Ok(result.into())
54        }
55        value => Err(ValueError::Expected {
56            got: value.kind(),
57            expected: Kind::regex(),
58        }
59        .into()),
60    }
61}
62
/// Marker type for the `sieve` function; carries no state.
#[derive(Debug, Clone, Copy)]
pub struct Sieve;
65
66impl Function for Sieve {
67    fn identifier(&self) -> &'static str {
68        "sieve"
69    }
70
71    fn usage(&self) -> &'static str {
72        indoc! {"
73            Keeps only matches of `pattern` in `value`.
74
75            This can be used to define patterns that are allowed in the string and
76            remove everything else.
77        "}
78    }
79
80    fn category(&self) -> &'static str {
81        Category::String.as_ref()
82    }
83
84    fn return_kind(&self) -> u16 {
85        kind::BYTES
86    }
87
88    fn parameters(&self) -> &'static [Parameter] {
89        PARAMETERS.as_slice()
90    }
91
92    fn examples(&self) -> &'static [Example] {
93        &[
94            example! {
95                title: "Keep only lowercase letters",
96                source: r#"sieve("vector.dev/lowerUPPER", permitted_characters: r'[a-z]')"#,
97                result: Ok("vectordevlower"),
98            },
99            example! {
100                title: "Sieve with regex",
101                source: r#"sieve("test123%456.فوائد.net.", r'[a-z0-9.]')"#,
102                result: Ok("test123456..net."),
103            },
104            example! {
105                title: "Custom replacements",
106                source: r#"sieve("test123%456.فوائد.net.", r'[a-z.0-9]', replace_single: "X", replace_repeated: "<REMOVED>")"#,
107                result: Ok("test123X456.<REMOVED>.net."),
108            },
109        ]
110    }
111
112    fn compile(
113        &self,
114        _state: &state::TypeState,
115        _ctx: &mut FunctionCompileContext,
116        arguments: ArgumentList,
117    ) -> Compiled {
118        let value = arguments.required("value");
119        let permitted_characters = arguments.required("permitted_characters");
120        let replace_single = arguments.optional("replace_single");
121        let replace_repeated = arguments.optional("replace_repeated");
122
123        Ok(SieveFn {
124            value,
125            permitted_characters,
126            replace_single,
127            replace_repeated,
128        }
129        .as_expr())
130    }
131}
132
133#[derive(Debug, Clone)]
134struct SieveFn {
135    value: Box<dyn Expression>,
136    permitted_characters: Box<dyn Expression>,
137    replace_single: Option<Box<dyn Expression>>,
138    replace_repeated: Option<Box<dyn Expression>>,
139}
140
141impl FunctionExpression for SieveFn {
142    fn resolve(&self, ctx: &mut Context) -> Resolved {
143        let value = self.value.resolve(ctx)?;
144        let permitted_characters = self.permitted_characters.resolve(ctx)?;
145        let replace_single = self
146            .replace_single
147            .map_resolve_with_default(ctx, || DEFAULT_REPLACE_SINGLE.clone())?;
148        let replace_repeated = self
149            .replace_repeated
150            .map_resolve_with_default(ctx, || DEFAULT_REPLACE_REPEATED.clone())?;
151
152        sieve(
153            &value,
154            permitted_characters,
155            &replace_single,
156            &replace_repeated,
157        )
158    }
159
160    fn type_def(&self, _: &state::TypeState) -> TypeDef {
161        TypeDef::bytes().infallible()
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        sieve => Sieve;

        // Default replacements are empty, so the rejected "." simply disappears.
        lowercase_letters_only {
            args: func_args![
                value: value!("vector.dev"),
                permitted_characters: regex::Regex::new("[a-z]").unwrap()
            ],
            want: Ok(value!("vectordev")),
            tdef: TypeDef::bytes().infallible(),
        }

        // The non-ASCII label between the dots is rejected and dropped.
        alphanumeric_and_dots {
            args: func_args![
                value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."),
                permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap()
            ],
            want: Ok(value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z..net.")),
            tdef: TypeDef::bytes().infallible(),
        }

        // Both replacement strings supplied: "%" is a single rejected
        // character, the non-ASCII label is a repeated run.
        all_options {
            args: func_args![
                value: value!("test123%456.فوائد.net."),
                permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap(),
                replace_single: "X",
                replace_repeated: "<REMOVED>"
            ],
            want: Ok(value!("test123X456.<REMOVED>.net.")),
            tdef: TypeDef::bytes().infallible(),
        }

        // Only dots are permitted; every run between them is replaced.
        replace_repeated {
            args: func_args![
                value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."),
                permitted_characters: regex::Regex::new(r"[\.]").unwrap(),
                replace_repeated: "<REMOVED>"
            ],
            want: Ok(value!("<REMOVED>.<REMOVED>.<REMOVED>.")),
            tdef: TypeDef::bytes().infallible(),
        }
    ];
}