vrl/stdlib/
redact.rs

1use crate::compiler::prelude::*;
2use std::{
3    borrow::Cow,
4    convert::{TryFrom, TryInto},
5    sync::LazyLock,
6};
7
8// https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s12.html
9// (converted to non-lookaround version given `regex` does not support lookarounds)
10// See also: https://www.ssa.gov/history/ssn/geocard.html
11static US_SOCIAL_SECURITY_NUMBER: LazyLock<regex::Regex> = LazyLock::new(|| {
12    regex::Regex::new(
13        "(?x)                                                               # Ignore whitespace and comments in the regex expression.
14    (?:00[1-9]|0[1-9][0-9]|[1-578][0-9]{2}|6[0-57-9][0-9]|66[0-57-9])-    # Area number: 001-899 except 666
15    (?:0[1-9]|[1-9]0|[1-9][1-9])-                                         # Group number: 01-99
16    (?:000[1-9]|00[1-9]0|0[1-9]00|[1-9]000|[1-9]{4})                      # Serial number: 0001-9999
17    ").unwrap()
18});
19
/// The `redact` function: its `Function` implementation supplies the identifier, usage text,
/// parameters, examples and compile step.
#[derive(Clone, Copy, Debug)]
pub struct Redact;
22
23impl Function for Redact {
24    fn identifier(&self) -> &'static str {
25        "redact"
26    }
27
28    fn usage(&self) -> &'static str {
29        indoc! {"
30            Redact sensitive data in `value` such as:
31
32            - [US social security card numbers](https://www.ssa.gov/history/ssn/geocard.html)
33            - Other forms of personally identifiable information with custom patterns
34
35            This can help achieve compliance by ensuring sensitive data does not leave your network.
36        "}
37    }
38
39    fn category(&self) -> &'static str {
40        Category::String.as_ref()
41    }
42
43    fn return_kind(&self) -> u16 {
44        kind::BYTES | kind::OBJECT | kind::ARRAY
45    }
46
47    fn parameters(&self) -> &'static [Parameter] {
48        const PARAMETERS: &[Parameter] = &[
49            Parameter::required(
50                "value",
51                kind::BYTES | kind::OBJECT | kind::ARRAY,
52                "The value to redact sensitive data from.
53
54The function's behavior depends on `value`'s type:
55
56- For strings, the sensitive data is redacted and a new string is returned.
57- For arrays, the sensitive data is redacted in each string element.
58- For objects, the sensitive data in each string value is masked, but the keys are not masked.
59
60For arrays and objects, the function recurses into any nested arrays or objects. Any non-string elements are
61skipped.
62
63Redacted text is replaced with `[REDACTED]`.",
64            ),
65            Parameter::required(
66                "filters",
67                kind::ARRAY,
68                "List of filters applied to `value`.
69
70Each filter can be specified in the following ways:
71
72- As a regular expression, which is used to redact text that match it.
73- As an object with a `type` key that corresponds to a named filter and additional keys for customizing that filter.
74- As a named filter, if it has no required parameters.
75
76Named filters can be a:
77
78- `pattern`: Redacts text matching any regular expressions specified in the `patterns`
79	key, which is required. This is the expanded version of just passing a regular expression as a filter.
80- `us_social_security_number`: Redacts US social security card numbers.
81
82See examples for more details.
83
84This parameter must be a static expression so that the argument can be validated at compile-time
85to avoid runtime errors. You cannot use variables or other dynamic expressions with it.",
86            ),
87            // TODO: Should default to Full
88            Parameter::optional(
89                "redactor",
90                kind::OBJECT | kind::BYTES,
91                "Specifies what to replace the redacted strings with.
92
93It is given as an object with a \"type\" key specifying the type of redactor to use
94and additional keys depending on the type. The following types are supported:
95
96- `full`: The default. Replace with the string \"[REDACTED]\".
97- `text`: Replace with a custom string. The `replacement` key is required, and must
98  contain the string that is used as a replacement.
99- `sha2`: Hash the redacted text with SHA-2 as with [`sha2`](https://en.wikipedia.org/wiki/SHA-2). Supports two optional parameters:
100	- `variant`: The variant of the algorithm to use. Defaults to SHA-512/256.
101	- `encoding`: How to encode the hash as text. Can be base16 or base64.
102		Defaults to base64.
103- `sha3`: Hash the redacted text with SHA-3 as with [`sha3`](https://en.wikipedia.org/wiki/SHA-3). Supports two optional parameters:
104	- `variant`: The variant of the algorithm to use. Defaults to SHA3-512.
105	- `encoding`: How to encode the hash as text. Can be base16 or base64.
106		Defaults to base64.
107
108
109As a convenience you can use a string as a shorthand for common redactor patterns:
110
111- `\"full\"` is equivalent to `{\"type\": \"full\"}`
112- `\"sha2\"` is equivalent to `{\"type\": \"sha2\", \"variant\": \"SHA-512/256\", \"encoding\": \"base64\"}`
113- `\"sha3\"` is equivalent to `{\"type\": \"sha3\", \"variant\": \"SHA3-512\", \"encoding\": \"base64\"}`
114
115This parameter must be a static expression so that the argument can be validated at compile-time
116to avoid runtime errors. You cannot use variables or other dynamic expressions with it.",
117            ),
118        ];
119        PARAMETERS
120    }
121
122    fn examples(&self) -> &'static [Example] {
123        &[
124            example! {
125                title: "Replace text using a regex",
126                source: r#"redact("my id is 123456", filters: [r'\d+'])"#,
127                result: Ok("my id is [REDACTED]"),
128            },
129            example! {
130                title: "Replace us social security numbers in any field",
131                source: r#"redact({ "name": "John Doe", "ssn": "123-12-1234"}, filters: ["us_social_security_number"])"#,
132                result: Ok(r#"{ "name": "John Doe", "ssn": "[REDACTED]" }"#),
133            },
134            example! {
135                title: "Replace with custom text",
136                source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: {"type": "text", "replacement": "***"})"#,
137                result: Ok("my id is ***"),
138            },
139            example! {
140                title: "Replace with SHA-2 hash",
141                source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: "sha2")"#,
142                result: Ok("my id is GEtTedW1p6tC094dDKH+3B8P+xSnZz69AmpjaXRd63I="),
143            },
144            example! {
145                title: "Replace with SHA-3 hash",
146                source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: "sha3")"#,
147                result: Ok(
148                    "my id is ZNCdmTDI7PeeUTFnpYjLdUObdizo+bIupZdl8yqnTKGdLx6X3JIqPUlUWUoFBikX+yTR+OcvLtAqWO11NPlNJw==",
149                ),
150            },
151            example! {
152                title: "Replace with SHA-256 hash using hex encoding",
153                source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: {"type": "sha2", "variant": "SHA-256", "encoding": "base16"})"#,
154                result: Ok(
155                    "my id is 8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92",
156                ),
157            },
158        ]
159    }
160
161    fn compile(
162        &self,
163        state: &state::TypeState,
164        _ctx: &mut FunctionCompileContext,
165        arguments: ArgumentList,
166    ) -> Compiled {
167        let value = arguments.required("value");
168
169        let filters = arguments
170            .required_array("filters")?
171            .into_iter()
172            .map(|expr| {
173                expr.resolve_constant(state)
174                    .ok_or(function::Error::ExpectedStaticExpression {
175                        keyword: "filters",
176                        expr,
177                    })
178            })
179            .map(|value| {
180                value.and_then(|value| {
181                    value
182                        .clone()
183                        .try_into()
184                        .map_err(|error| function::Error::InvalidArgument {
185                            keyword: "filters",
186                            value,
187                            error,
188                        })
189                })
190            })
191            .collect::<std::result::Result<Vec<Filter>, _>>()?;
192
193        let redactor = arguments
194            .optional_literal("redactor", state)?
195            .map(|value| {
196                value
197                    .clone()
198                    .try_into()
199                    .map_err(|error| function::Error::InvalidArgument {
200                        keyword: "redactor",
201                        value,
202                        error,
203                    })
204            })
205            .transpose()?
206            .unwrap_or(Redactor::Full);
207
208        Ok(RedactFn {
209            value,
210            filters,
211            redactor,
212        }
213        .as_expr())
214    }
215}
216
217//-----------------------------------------------------------------------------
218
/// Compiled form of a `redact` call: the value expression plus the filters and redactor that
/// were validated at compile time.
#[derive(Clone, Debug)]
struct RedactFn {
    /// Expression producing the value to redact; resolved on every call.
    value: Box<dyn Expression>,
    /// Filters applied, in order, to every string found in the value.
    filters: Vec<Filter>,
    /// Strategy used to produce the replacement for each match.
    redactor: Redactor,
}
225
226fn redact(value: Value, filters: &[Filter], redactor: &Redactor) -> Value {
227    // possible optimization. match the redactor here, and use different calls depending on
228    // the value, so that we don't have to do the comparision in the loop of replacment.
229    // that would complicate the code though.
230    match value {
231        Value::Bytes(bytes) => {
232            let input = String::from_utf8_lossy(&bytes);
233            let output = filters.iter().fold(input, |input, filter| {
234                filter.redact(&input, redactor).into_owned().into()
235            });
236            Value::Bytes(output.into_owned().into())
237        }
238        Value::Array(values) => {
239            let values = values
240                .into_iter()
241                .map(|value| redact(value, filters, redactor))
242                .collect();
243            Value::Array(values)
244        }
245        Value::Object(map) => {
246            let map = map
247                .into_iter()
248                .map(|(key, value)| (key, redact(value, filters, redactor)))
249                .collect();
250            Value::Object(map)
251        }
252        _ => value,
253    }
254}
255
256impl FunctionExpression for RedactFn {
257    fn resolve(&self, ctx: &mut Context) -> Resolved {
258        let value = self.value.resolve(ctx)?;
259        let filters = &self.filters;
260        let redactor = &self.redactor;
261
262        Ok(redact(value, filters, redactor))
263    }
264
265    fn type_def(&self, state: &state::TypeState) -> TypeDef {
266        self.value.type_def(state).infallible()
267    }
268}
269
270//-----------------------------------------------------------------------------
271
/// The redaction filter to apply to the given value.
#[derive(Debug, Clone)]
enum Filter {
    /// Redacts text matching any of the listed patterns, applied one after another.
    Pattern(Vec<Pattern>),
    /// Redacts text matching the built-in US social security number regex.
    UsSocialSecurityNumber,
}
278
/// A single matcher inside a [`Filter::Pattern`].
#[derive(Debug, Clone)]
enum Pattern {
    /// Matches with a regular expression.
    Regex(regex::Regex),
    /// Matches a literal substring.
    String(String),
}
284
285impl TryFrom<Value> for Filter {
286    type Error = &'static str;
287
288    fn try_from(value: Value) -> std::result::Result<Self, Self::Error> {
289        match value {
290            Value::Object(object) => {
291                let r#type = match object
292                    .get("type")
293                    .ok_or("filters specified as objects must have type parameter")?
294                {
295                    Value::Bytes(bytes) => Ok(bytes.clone()),
296                    _ => Err("type key in filters must be a string"),
297                }?;
298
299                match r#type.as_ref() {
300                    b"us_social_security_number" => Ok(Filter::UsSocialSecurityNumber),
301                    b"pattern" => {
302                        let patterns = match object
303                            .get("patterns")
304                            .ok_or("pattern filter must have `patterns` specified")?
305                        {
306                            Value::Array(array) => Ok(array
307                                .iter()
308                                .map(|value| match value {
309                                    Value::Regex(regex) => Ok(Pattern::Regex((**regex).clone())),
310                                    Value::Bytes(bytes) => Ok(Pattern::String(
311                                        String::from_utf8_lossy(bytes).into_owned(),
312                                    )),
313                                    _ => Err("`patterns` must be regular expressions"),
314                                })
315                                .collect::<std::result::Result<Vec<_>, _>>()?),
316                            _ => Err("`patterns` must be array of regular expression literals"),
317                        }?;
318                        Ok(Filter::Pattern(patterns))
319                    }
320                    _ => Err("unknown filter name"),
321                }
322            }
323            Value::Bytes(bytes) => match bytes.as_ref() {
324                b"pattern" => Err("pattern cannot be used without arguments"),
325                b"us_social_security_number" => Ok(Filter::UsSocialSecurityNumber),
326                _ => Err("unknown filter name"),
327            },
328            Value::Regex(regex) => Ok(Filter::Pattern(vec![Pattern::Regex((*regex).clone())])),
329            _ => Err("unknown literal for filter, must be a regex, filter name, or object"),
330        }
331    }
332}
333
334impl Filter {
335    fn redact<'t>(&self, input: &'t str, redactor: &Redactor) -> Cow<'t, str> {
336        match &self {
337            Filter::Pattern(patterns) => {
338                patterns
339                    .iter()
340                    .fold(Cow::Borrowed(input), |input, pattern| match pattern {
341                        Pattern::Regex(regex) => {
342                            regex.replace_all(&input, redactor).into_owned().into()
343                        }
344                        Pattern::String(pattern) => str_replace(&input, pattern, redactor).into(),
345                    })
346            }
347            Filter::UsSocialSecurityNumber => {
348                US_SOCIAL_SECURITY_NUMBER.replace_all(input, redactor)
349            }
350        }
351    }
352}
353
354fn str_replace(haystack: &str, pattern: &str, redactor: &Redactor) -> String {
355    let mut result = String::new();
356    let mut last_end = 0;
357    for (start, original) in haystack.match_indices(pattern) {
358        result.push_str(&haystack[last_end..start]);
359        redactor.replace_str(original, &mut result);
360        last_end = start + original.len();
361    }
362    result.push_str(&haystack[last_end..]);
363    result
364}
365
// The derived `PartialEq` compares the `hasher` function pointer held by `Hash`, which is what
// this lint flags.
#[allow(unpredictable_function_pointer_comparisons)]
/// The recipe for redacting the matched filters.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
enum Redactor {
    /// Replace with the fixed `REDACTED` marker. This is the default.
    #[default]
    Full,
    /// Replace with a fixed string
    Text(String), // possible optimization: use Arc<str> instead of String to speed up cloning
    // Using function pointers simplifies the code,
    // but the Debug implementation probably isn't very useful.
    // Alternatively we could have a separate variant for each hash algorithm/variant combination.
    // We could also create a custom Debug implementation that compares the fn pointer
    // to the function pointers we might use.
    /// Replace with a hash of the redacted content
    Hash {
        /// Text encoding applied to the raw digest.
        encoder: Encoder,
        /// Hashes the matched bytes and encodes the digest with the given encoder.
        hasher: fn(Encoder, &[u8]) -> String,
    },
}
385
/// Replacement text emitted by [`Redactor::Full`].
const REDACTED: &str = "[REDACTED]";
387
388impl Redactor {
389    fn replace_str(&self, original: &str, dst: &mut String) {
390        match self {
391            Redactor::Full => {
392                dst.push_str(REDACTED);
393            }
394            Redactor::Text(s) => {
395                dst.push_str(s);
396            }
397            Redactor::Hash { encoder, hasher } => {
398                dst.push_str(&hasher(*encoder, original.as_bytes()));
399            }
400        }
401    }
402
403    fn from_object(obj: &ObjectMap) -> std::result::Result<Self, &'static str> {
404        let r#type = match obj.get("type").ok_or(
405            "redactor specified as objects must have type
406        parameter",
407        )? {
408            Value::Bytes(bytes) => Ok(bytes.clone()),
409            _ => Err("type key in redactor must be a string"),
410        }?;
411
412        match r#type.as_ref() {
413            b"full" => Ok(Redactor::Full),
414            b"text" => {
415                match obj.get("replacement").ok_or(
416                    "text redactor must have
417                `replacement` specified",
418                )? {
419                    Value::Bytes(bytes) => {
420                        Ok(Redactor::Text(String::from_utf8_lossy(bytes).into_owned()))
421                    }
422                    _ => Err("`replacement` must be a string"),
423                }
424            }
425            b"sha2" => {
426                let hasher = if let Some(variant) = obj.get("variant") {
427                    match variant
428                        .as_bytes()
429                        .ok_or("`variant` must be a string")?
430                        .as_ref()
431                    {
432                        b"SHA-224" => encoded_hash::<sha_2::Sha224>,
433                        b"SHA-256" => encoded_hash::<sha_2::Sha256>,
434                        b"SHA-384" => encoded_hash::<sha_2::Sha384>,
435                        b"SHA-512" => encoded_hash::<sha_2::Sha512>,
436                        b"SHA-512/224" => encoded_hash::<sha_2::Sha512_224>,
437                        b"SHA-512/256" => encoded_hash::<sha_2::Sha512_256>,
438                        _ => return Err("invalid sha2 variant"),
439                    }
440                } else {
441                    encoded_hash::<sha_2::Sha512_256>
442                };
443                let encoder = obj
444                    .get("encoding")
445                    .map(Encoder::try_from)
446                    .transpose()?
447                    .unwrap_or(Encoder::Base64);
448                Ok(Redactor::Hash { hasher, encoder })
449            }
450            b"sha3" => {
451                let hasher = if let Some(variant) = obj.get("variant") {
452                    match variant
453                        .as_bytes()
454                        .ok_or("`variant must be a string")?
455                        .as_ref()
456                    {
457                        b"SHA3-224" => encoded_hash::<sha_3::Sha3_224>,
458                        b"SHA3-256" => encoded_hash::<sha_3::Sha3_256>,
459                        b"SHA3-384" => encoded_hash::<sha_3::Sha3_384>,
460                        b"SHA3-512" => encoded_hash::<sha_3::Sha3_512>,
461                        _ => return Err("invalid sha2 variant"),
462                    }
463                } else {
464                    encoded_hash::<sha_3::Sha3_512>
465                };
466                let encoder = obj
467                    .get("encoding")
468                    .map(Encoder::try_from)
469                    .transpose()?
470                    .unwrap_or(Encoder::Base64);
471                Ok(Redactor::Hash { hasher, encoder })
472            }
473            _ => Err("unknown `type` for `redactor`"),
474        }
475    }
476}
477
478impl regex::Replacer for &Redactor {
479    fn replace_append(&mut self, caps: &regex::Captures, dst: &mut String) {
480        self.replace_str(&caps[0], dst);
481    }
482
483    fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
484        match self {
485            Redactor::Full => Some(REDACTED.into()),
486            Redactor::Text(s) => Some(s.into()),
487            Redactor::Hash { .. } => None,
488        }
489    }
490}
491
492impl TryFrom<Value> for Redactor {
493    type Error = &'static str;
494
495    fn try_from(value: Value) -> std::result::Result<Self, Self::Error> {
496        match value {
497            Value::Object(object) => Redactor::from_object(&object),
498            Value::Bytes(bytes) => match bytes.as_ref() {
499                b"full" => Ok(Redactor::Full),
500                b"sha2" => Ok(Redactor::Hash {
501                    hasher: encoded_hash::<sha_2::Sha512_256>,
502                    encoder: Encoder::Base64,
503                }),
504                b"sha3" => Ok(Redactor::Hash {
505                    hasher: encoded_hash::<sha_3::Sha3_512>,
506                    encoder: Encoder::Base64,
507                }),
508                _ => Err("unknown name of redactor"),
509            },
510            _ => Err("unknown literal for redactor, must be redactor name or object"),
511        }
512    }
513}
514
/// Text encoding applied to a raw hash digest.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Encoder {
    /// Base64 encoding.
    Base64,
    /// Lower-case hexadecimal encoding.
    Base16,
}
520
521impl TryFrom<&Value> for Encoder {
522    type Error = &'static str;
523
524    fn try_from(value: &Value) -> std::result::Result<Self, Self::Error> {
525        match value.as_bytes().ok_or("encoding must be string")?.as_ref() {
526            b"base64" => Ok(Self::Base64),
527            b"base16" | b"hex" => Ok(Self::Base16),
528            _ => Err("unexpected encoding"),
529        }
530    }
531}
532
533impl Encoder {
534    fn encode(self, data: &[u8]) -> String {
535        use Encoder::{Base16, Base64};
536        match self {
537            Base64 => base64_simd::STANDARD.encode_to_string(data),
538            Base16 => base16::encode_lower(data),
539        }
540    }
541}
542
543/// Compute the hash of `data` using `T` as the digest, then encode it using `encoder`
544/// to get a String
545fn encoded_hash<T: digest::Digest>(encoder: Encoder, data: &[u8]) -> String {
546    encoder.encode(&T::digest(data))
547}
548
// Table-driven tests for `redact`. Each case lists the call arguments (`args`), the expected
// result (`want`) and the expected type definition (`tdef`).
#[cfg(test)]
mod test {
    use super::*;
    use crate::{btreemap, value};
    use regex::Regex;

    test_function![
        redact => Redact;

        // A bare regex filter with the default redactor.
        regex {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec![Regex::new(r"\d+").unwrap()],
             ],
             want: Ok("hello [REDACTED] world"),
             tdef: TypeDef::bytes().infallible(),
        }

        // The object form of the `pattern` filter, here with a literal string pattern.
        patterns {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec![
                     value!({
                         "type": "pattern",
                         "patterns": ["123456"]
                     })
                 ],
             ],
             want: Ok("hello [REDACTED] world"),
             tdef: TypeDef::bytes().infallible(),
        }

        // The named filter given as a plain string.
        us_social_security_number{
             args: func_args![
                 value: "hello 123-12-1234 world",
                 filters: vec!["us_social_security_number"],
             ],
             want: Ok("hello [REDACTED] world"),
             tdef: TypeDef::bytes().infallible(),
        }

        // An unknown filter name is rejected.
        invalid_filter {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec!["not a filter"],
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }

        // A `pattern` filter object without the required `patterns` key is rejected.
        missing_patterns {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec![
                     value!({
                         "type": "pattern",
                     })
                 ],
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }

        // The `text` redactor substitutes the supplied replacement string.
        text_redactor {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: btreemap!{"type" => "text", "replacement" => "***"},
            ],
            want: Ok("my id is ***"),
            tdef: TypeDef::bytes().infallible(),
        }

        // The `"sha2"` string shorthand.
        sha2 {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: "sha2",
            ],
            want: Ok("my id is GEtTedW1p6tC094dDKH+3B8P+xSnZz69AmpjaXRd63I="),
            tdef: TypeDef::bytes().infallible(),
        }

        // The `"sha3"` string shorthand.
        sha3 {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: "sha3",
            ],
            want: Ok("my id is ZNCdmTDI7PeeUTFnpYjLdUObdizo+bIupZdl8yqnTKGdLx6X3JIqPUlUWUoFBikX+yTR+OcvLtAqWO11NPlNJw=="),
            tdef: TypeDef::bytes().infallible(),
        }

        // The object form of the hash redactor with an explicit variant and encoding.
        sha256_hex {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: btreemap!{"type" => "sha2", "variant" => "SHA-256", "encoding" =>
                    "base16"},
            ],
            want: Ok("my id is 8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92"),
            tdef: TypeDef::bytes().infallible(),
        }

        // An unknown redactor name is rejected.
        invalid_redactor {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec!["us_social_security_number"],
                 redactor: "not a redactor"
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }

        // A redactor object with an unknown `type` is rejected.
        invalid_redactor_obj {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec!["us_social_security_number"],
                 redactor: btreemap!{"type" => "wrongtype"},
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }

        // A redactor object without a `type` key is rejected.
        invalid_redactor_no_type {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec!["us_social_security_number"],
                 redactor: btreemap!{"key" => "value"},
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }

        // A hash redactor with an unsupported `variant` is rejected.
        invalid_hash_variant {
             args: func_args![
                 value: "hello 123456 world",
                 filters: vec!["us_social_security_number"],
                 redactor: btreemap!{"type" => "sha2", "variant" => "MD5"},
             ],
             want: Err("invalid argument"),
             tdef: TypeDef::bytes().infallible(),
        }
    ];
}