1use crate::compiler::prelude::*;
2use std::sync::LazyLock;
3
4static DEFAULT_REPLACE_SINGLE: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
5static DEFAULT_REPLACE_REPEATED: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
6
7static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
8 vec![
9 Parameter::required("value", kind::BYTES, "The original string."),
10 Parameter::required(
11 "permitted_characters",
12 kind::REGEX,
13 "Keep all matches of this pattern.",
14 ),
15 Parameter::optional(
16 "replace_single",
17 kind::BYTES,
18 "The string to use to replace single rejected characters.",
19 )
20 .default(&DEFAULT_REPLACE_SINGLE),
21 Parameter::optional(
22 "replace_repeated",
23 kind::BYTES,
24 "The string to use to replace multiple sequential instances of rejected characters.",
25 )
26 .default(&DEFAULT_REPLACE_REPEATED),
27 ]
28});
29
30fn sieve(
31 value: &Value,
32 permitted_characters: Value,
33 replace_single: &Value,
34 replace_repeated: &Value,
35) -> Resolved {
36 let value = value.try_bytes_utf8_lossy()?;
37 let replace_single = replace_single.try_bytes_utf8_lossy()?;
38 let replace_repeated = replace_repeated.try_bytes_utf8_lossy()?;
39
40 match permitted_characters {
41 Value::Regex(regex) => {
42 let mut result = String::with_capacity(value.len());
43 let mut last_end = 0;
44 for m in regex.find_iter(&value) {
45 match m.start() - last_end {
46 l if l > 1 => result += &replace_repeated,
47 1 => result += &replace_single,
48 _ => (),
49 }
50 last_end = m.end();
51 result += m.as_str();
52 }
53 Ok(result.into())
54 }
55 value => Err(ValueError::Expected {
56 got: value.kind(),
57 expected: Kind::regex(),
58 }
59 .into()),
60 }
61}
62
/// Marker type under which the `sieve` function is registered.
#[derive(Debug, Copy, Clone)]
pub struct Sieve;
65
66impl Function for Sieve {
67 fn identifier(&self) -> &'static str {
68 "sieve"
69 }
70
71 fn usage(&self) -> &'static str {
72 indoc! {"
73 Keeps only matches of `pattern` in `value`.
74
75 This can be used to define patterns that are allowed in the string and
76 remove everything else.
77 "}
78 }
79
80 fn category(&self) -> &'static str {
81 Category::String.as_ref()
82 }
83
84 fn return_kind(&self) -> u16 {
85 kind::BYTES
86 }
87
88 fn parameters(&self) -> &'static [Parameter] {
89 PARAMETERS.as_slice()
90 }
91
92 fn examples(&self) -> &'static [Example] {
93 &[
94 example! {
95 title: "Keep only lowercase letters",
96 source: r#"sieve("vector.dev/lowerUPPER", permitted_characters: r'[a-z]')"#,
97 result: Ok("vectordevlower"),
98 },
99 example! {
100 title: "Sieve with regex",
101 source: r#"sieve("test123%456.فوائد.net.", r'[a-z0-9.]')"#,
102 result: Ok("test123456..net."),
103 },
104 example! {
105 title: "Custom replacements",
106 source: r#"sieve("test123%456.فوائد.net.", r'[a-z.0-9]', replace_single: "X", replace_repeated: "<REMOVED>")"#,
107 result: Ok("test123X456.<REMOVED>.net."),
108 },
109 ]
110 }
111
112 fn compile(
113 &self,
114 _state: &state::TypeState,
115 _ctx: &mut FunctionCompileContext,
116 arguments: ArgumentList,
117 ) -> Compiled {
118 let value = arguments.required("value");
119 let permitted_characters = arguments.required("permitted_characters");
120 let replace_single = arguments.optional("replace_single");
121 let replace_repeated = arguments.optional("replace_repeated");
122
123 Ok(SieveFn {
124 value,
125 permitted_characters,
126 replace_single,
127 replace_repeated,
128 }
129 .as_expr())
130 }
131}
132
133#[derive(Debug, Clone)]
134struct SieveFn {
135 value: Box<dyn Expression>,
136 permitted_characters: Box<dyn Expression>,
137 replace_single: Option<Box<dyn Expression>>,
138 replace_repeated: Option<Box<dyn Expression>>,
139}
140
141impl FunctionExpression for SieveFn {
142 fn resolve(&self, ctx: &mut Context) -> Resolved {
143 let value = self.value.resolve(ctx)?;
144 let permitted_characters = self.permitted_characters.resolve(ctx)?;
145 let replace_single = self
146 .replace_single
147 .map_resolve_with_default(ctx, || DEFAULT_REPLACE_SINGLE.clone())?;
148 let replace_repeated = self
149 .replace_repeated
150 .map_resolve_with_default(ctx, || DEFAULT_REPLACE_REPEATED.clone())?;
151
152 sieve(
153 &value,
154 permitted_characters,
155 &replace_single,
156 &replace_repeated,
157 )
158 }
159
160 fn type_def(&self, _: &state::TypeState) -> TypeDef {
161 TypeDef::bytes().infallible()
162 }
163}
164
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        sieve => Sieve;

        // Default replacements: the rejected `.` is dropped and nothing is inserted for it.
        lowercase_letters_only {
            args: func_args![value: value!("vector.dev"), permitted_characters: regex::Regex::new("[a-z]").unwrap()],
            want: Ok(value!("vectordev")),
            tdef: TypeDef::bytes().infallible(),
        }

        // Default replacements: the non-ASCII label between the dots is removed, leaving the two
        // surrounding dots adjacent.
        alphanumeric_and_dots {
            args: func_args![value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."), permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap()],
            want: Ok(value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z..net.")),
            tdef: TypeDef::bytes().infallible(),
        }

        // Both replacements set: the lone `%` becomes `replace_single`, the run of non-ASCII
        // letters becomes `replace_repeated`.
        all_options {
            args: func_args![value: value!("test123%456.فوائد.net."), permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap(), replace_single: "X", replace_repeated: "<REMOVED>"],
            want: Ok(value!("test123X456.<REMOVED>.net.")),
            tdef: TypeDef::bytes().infallible(),
        }

        // Only dots are permitted and only `replace_repeated` is set: every run between dots is
        // longer than one character, so each one becomes `<REMOVED>`.
        replace_repeated {
            args: func_args![value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."), permitted_characters: regex::Regex::new(r"[\.]").unwrap(), replace_repeated: "<REMOVED>"],
            want: Ok(value!("<REMOVED>.<REMOVED>.<REMOVED>.")),
            tdef: TypeDef::bytes().infallible(),
        }
    ];
}