1use crate::compiler::prelude::*;
2use std::{
3 borrow::Cow,
4 convert::{TryFrom, TryInto},
5 sync::LazyLock,
6};
7
8static US_SOCIAL_SECURITY_NUMBER: LazyLock<regex::Regex> = LazyLock::new(|| {
12 regex::Regex::new(
13 "(?x) # Ignore whitespace and comments in the regex expression.
14 (?:00[1-9]|0[1-9][0-9]|[1-578][0-9]{2}|6[0-57-9][0-9]|66[0-57-9])- # Area number: 001-899 except 666
15 (?:0[1-9]|[1-9]0|[1-9][1-9])- # Group number: 01-99
16 (?:000[1-9]|00[1-9]0|0[1-9]00|[1-9]000|[1-9]{4}) # Serial number: 0001-9999
17 ").unwrap()
18});
19
/// Marker type implementing the `redact` function (see its `Function` impl for the
/// identifier, parameters, examples and compilation logic).
#[derive(Clone, Copy, Debug)]
pub struct Redact;
22
23impl Function for Redact {
24 fn identifier(&self) -> &'static str {
25 "redact"
26 }
27
28 fn usage(&self) -> &'static str {
29 indoc! {"
30 Redact sensitive data in `value` such as:
31
32 - [US social security card numbers](https://www.ssa.gov/history/ssn/geocard.html)
33 - Other forms of personally identifiable information with custom patterns
34
35 This can help achieve compliance by ensuring sensitive data does not leave your network.
36 "}
37 }
38
39 fn category(&self) -> &'static str {
40 Category::String.as_ref()
41 }
42
43 fn return_kind(&self) -> u16 {
44 kind::BYTES | kind::OBJECT | kind::ARRAY
45 }
46
47 fn parameters(&self) -> &'static [Parameter] {
48 const PARAMETERS: &[Parameter] = &[
49 Parameter::required(
50 "value",
51 kind::BYTES | kind::OBJECT | kind::ARRAY,
52 "The value to redact sensitive data from.
53
54The function's behavior depends on `value`'s type:
55
56- For strings, the sensitive data is redacted and a new string is returned.
57- For arrays, the sensitive data is redacted in each string element.
58- For objects, the sensitive data in each string value is masked, but the keys are not masked.
59
60For arrays and objects, the function recurses into any nested arrays or objects. Any non-string elements are
61skipped.
62
63Redacted text is replaced with `[REDACTED]`.",
64 ),
65 Parameter::required(
66 "filters",
67 kind::ARRAY,
68 "List of filters applied to `value`.
69
70Each filter can be specified in the following ways:
71
72- As a regular expression, which is used to redact text that match it.
73- As an object with a `type` key that corresponds to a named filter and additional keys for customizing that filter.
74- As a named filter, if it has no required parameters.
75
76Named filters can be a:
77
78- `pattern`: Redacts text matching any regular expressions specified in the `patterns`
79 key, which is required. This is the expanded version of just passing a regular expression as a filter.
80- `us_social_security_number`: Redacts US social security card numbers.
81
82See examples for more details.
83
84This parameter must be a static expression so that the argument can be validated at compile-time
85to avoid runtime errors. You cannot use variables or other dynamic expressions with it.",
86 ),
87 Parameter::optional(
89 "redactor",
90 kind::OBJECT | kind::BYTES,
91 "Specifies what to replace the redacted strings with.
92
93It is given as an object with a \"type\" key specifying the type of redactor to use
94and additional keys depending on the type. The following types are supported:
95
96- `full`: The default. Replace with the string \"[REDACTED]\".
97- `text`: Replace with a custom string. The `replacement` key is required, and must
98 contain the string that is used as a replacement.
99- `sha2`: Hash the redacted text with SHA-2 as with [`sha2`](https://en.wikipedia.org/wiki/SHA-2). Supports two optional parameters:
100 - `variant`: The variant of the algorithm to use. Defaults to SHA-512/256.
101 - `encoding`: How to encode the hash as text. Can be base16 or base64.
102 Defaults to base64.
103- `sha3`: Hash the redacted text with SHA-3 as with [`sha3`](https://en.wikipedia.org/wiki/SHA-3). Supports two optional parameters:
104 - `variant`: The variant of the algorithm to use. Defaults to SHA3-512.
105 - `encoding`: How to encode the hash as text. Can be base16 or base64.
106 Defaults to base64.
107
108
109As a convenience you can use a string as a shorthand for common redactor patterns:
110
111- `\"full\"` is equivalent to `{\"type\": \"full\"}`
112- `\"sha2\"` is equivalent to `{\"type\": \"sha2\", \"variant\": \"SHA-512/256\", \"encoding\": \"base64\"}`
113- `\"sha3\"` is equivalent to `{\"type\": \"sha3\", \"variant\": \"SHA3-512\", \"encoding\": \"base64\"}`
114
115This parameter must be a static expression so that the argument can be validated at compile-time
116to avoid runtime errors. You cannot use variables or other dynamic expressions with it.",
117 ),
118 ];
119 PARAMETERS
120 }
121
122 fn examples(&self) -> &'static [Example] {
123 &[
124 example! {
125 title: "Replace text using a regex",
126 source: r#"redact("my id is 123456", filters: [r'\d+'])"#,
127 result: Ok("my id is [REDACTED]"),
128 },
129 example! {
130 title: "Replace us social security numbers in any field",
131 source: r#"redact({ "name": "John Doe", "ssn": "123-12-1234"}, filters: ["us_social_security_number"])"#,
132 result: Ok(r#"{ "name": "John Doe", "ssn": "[REDACTED]" }"#),
133 },
134 example! {
135 title: "Replace with custom text",
136 source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: {"type": "text", "replacement": "***"})"#,
137 result: Ok("my id is ***"),
138 },
139 example! {
140 title: "Replace with SHA-2 hash",
141 source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: "sha2")"#,
142 result: Ok("my id is GEtTedW1p6tC094dDKH+3B8P+xSnZz69AmpjaXRd63I="),
143 },
144 example! {
145 title: "Replace with SHA-3 hash",
146 source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: "sha3")"#,
147 result: Ok(
148 "my id is ZNCdmTDI7PeeUTFnpYjLdUObdizo+bIupZdl8yqnTKGdLx6X3JIqPUlUWUoFBikX+yTR+OcvLtAqWO11NPlNJw==",
149 ),
150 },
151 example! {
152 title: "Replace with SHA-256 hash using hex encoding",
153 source: r#"redact("my id is 123456", filters: [r'\d+'], redactor: {"type": "sha2", "variant": "SHA-256", "encoding": "base16"})"#,
154 result: Ok(
155 "my id is 8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92",
156 ),
157 },
158 ]
159 }
160
161 fn compile(
162 &self,
163 state: &state::TypeState,
164 _ctx: &mut FunctionCompileContext,
165 arguments: ArgumentList,
166 ) -> Compiled {
167 let value = arguments.required("value");
168
169 let filters = arguments
170 .required_array("filters")?
171 .into_iter()
172 .map(|expr| {
173 expr.resolve_constant(state)
174 .ok_or(function::Error::ExpectedStaticExpression {
175 keyword: "filters",
176 expr,
177 })
178 })
179 .map(|value| {
180 value.and_then(|value| {
181 value
182 .clone()
183 .try_into()
184 .map_err(|error| function::Error::InvalidArgument {
185 keyword: "filters",
186 value,
187 error,
188 })
189 })
190 })
191 .collect::<std::result::Result<Vec<Filter>, _>>()?;
192
193 let redactor = arguments
194 .optional_literal("redactor", state)?
195 .map(|value| {
196 value
197 .clone()
198 .try_into()
199 .map_err(|error| function::Error::InvalidArgument {
200 keyword: "redactor",
201 value,
202 error,
203 })
204 })
205 .transpose()?
206 .unwrap_or(Redactor::Full);
207
208 Ok(RedactFn {
209 value,
210 filters,
211 redactor,
212 }
213 .as_expr())
214 }
215}
216
/// Compiled form of a `redact` call, produced by `Redact::compile`.
#[derive(Clone, Debug)]
struct RedactFn {
    /// Expression yielding the value to redact.
    value: Box<dyn Expression>,
    /// Filters resolved at compile time, applied in order.
    filters: Vec<Filter>,
    /// Strategy used to replace each match.
    redactor: Redactor,
}
225
226fn redact(value: Value, filters: &[Filter], redactor: &Redactor) -> Value {
227 match value {
231 Value::Bytes(bytes) => {
232 let input = String::from_utf8_lossy(&bytes);
233 let output = filters.iter().fold(input, |input, filter| {
234 filter.redact(&input, redactor).into_owned().into()
235 });
236 Value::Bytes(output.into_owned().into())
237 }
238 Value::Array(values) => {
239 let values = values
240 .into_iter()
241 .map(|value| redact(value, filters, redactor))
242 .collect();
243 Value::Array(values)
244 }
245 Value::Object(map) => {
246 let map = map
247 .into_iter()
248 .map(|(key, value)| (key, redact(value, filters, redactor)))
249 .collect();
250 Value::Object(map)
251 }
252 _ => value,
253 }
254}
255
256impl FunctionExpression for RedactFn {
257 fn resolve(&self, ctx: &mut Context) -> Resolved {
258 let value = self.value.resolve(ctx)?;
259 let filters = &self.filters;
260 let redactor = &self.redactor;
261
262 Ok(redact(value, filters, redactor))
263 }
264
265 fn type_def(&self, state: &state::TypeState) -> TypeDef {
266 self.value.type_def(state).infallible()
267 }
268}
269
/// A single redaction filter, built from a `filters` argument element.
#[derive(Debug, Clone)]
enum Filter {
    /// Redacts text matching any of the given patterns, applied in order.
    Pattern(Vec<Pattern>),
    /// Redacts text matching the built-in US social security number regex.
    UsSocialSecurityNumber,
}
278
/// One entry of a `pattern` filter.
#[derive(Debug, Clone)]
enum Pattern {
    /// Matches by regular expression.
    Regex(regex::Regex),
    /// Matches occurrences of a literal substring.
    String(String),
}
284
285impl TryFrom<Value> for Filter {
286 type Error = &'static str;
287
288 fn try_from(value: Value) -> std::result::Result<Self, Self::Error> {
289 match value {
290 Value::Object(object) => {
291 let r#type = match object
292 .get("type")
293 .ok_or("filters specified as objects must have type parameter")?
294 {
295 Value::Bytes(bytes) => Ok(bytes.clone()),
296 _ => Err("type key in filters must be a string"),
297 }?;
298
299 match r#type.as_ref() {
300 b"us_social_security_number" => Ok(Filter::UsSocialSecurityNumber),
301 b"pattern" => {
302 let patterns = match object
303 .get("patterns")
304 .ok_or("pattern filter must have `patterns` specified")?
305 {
306 Value::Array(array) => Ok(array
307 .iter()
308 .map(|value| match value {
309 Value::Regex(regex) => Ok(Pattern::Regex((**regex).clone())),
310 Value::Bytes(bytes) => Ok(Pattern::String(
311 String::from_utf8_lossy(bytes).into_owned(),
312 )),
313 _ => Err("`patterns` must be regular expressions"),
314 })
315 .collect::<std::result::Result<Vec<_>, _>>()?),
316 _ => Err("`patterns` must be array of regular expression literals"),
317 }?;
318 Ok(Filter::Pattern(patterns))
319 }
320 _ => Err("unknown filter name"),
321 }
322 }
323 Value::Bytes(bytes) => match bytes.as_ref() {
324 b"pattern" => Err("pattern cannot be used without arguments"),
325 b"us_social_security_number" => Ok(Filter::UsSocialSecurityNumber),
326 _ => Err("unknown filter name"),
327 },
328 Value::Regex(regex) => Ok(Filter::Pattern(vec![Pattern::Regex((*regex).clone())])),
329 _ => Err("unknown literal for filter, must be a regex, filter name, or object"),
330 }
331 }
332}
333
334impl Filter {
335 fn redact<'t>(&self, input: &'t str, redactor: &Redactor) -> Cow<'t, str> {
336 match &self {
337 Filter::Pattern(patterns) => {
338 patterns
339 .iter()
340 .fold(Cow::Borrowed(input), |input, pattern| match pattern {
341 Pattern::Regex(regex) => {
342 regex.replace_all(&input, redactor).into_owned().into()
343 }
344 Pattern::String(pattern) => str_replace(&input, pattern, redactor).into(),
345 })
346 }
347 Filter::UsSocialSecurityNumber => {
348 US_SOCIAL_SECURITY_NUMBER.replace_all(input, redactor)
349 }
350 }
351 }
352}
353
354fn str_replace(haystack: &str, pattern: &str, redactor: &Redactor) -> String {
355 let mut result = String::new();
356 let mut last_end = 0;
357 for (start, original) in haystack.match_indices(pattern) {
358 result.push_str(&haystack[last_end..start]);
359 redactor.replace_str(original, &mut result);
360 last_end = start + original.len();
361 }
362 result.push_str(&haystack[last_end..]);
363 result
364}
365
/// Strategy for producing the text that replaces a redacted match.
// NOTE(review): the `allow` is presumably needed because the derived `PartialEq`
// compares the `hasher` function pointer — confirm.
#[allow(unpredictable_function_pointer_comparisons)]
#[derive(Debug, Default, Clone, PartialEq, Eq)]
enum Redactor {
    /// Replace the match with the fixed `REDACTED` marker.
    #[default]
    Full,
    /// Replace the match with the given custom string.
    Text(String),
    /// Replace the match with an encoded hash of the matched text.
    Hash {
        /// Text encoding applied to the digest.
        encoder: Encoder,
        /// Function that hashes the matched bytes and encodes the digest.
        hasher: fn(Encoder, &[u8]) -> String,
    },
}
385
/// Replacement text emitted by the `Redactor::Full` redactor.
const REDACTED: &str = "[REDACTED]";
387
388impl Redactor {
389 fn replace_str(&self, original: &str, dst: &mut String) {
390 match self {
391 Redactor::Full => {
392 dst.push_str(REDACTED);
393 }
394 Redactor::Text(s) => {
395 dst.push_str(s);
396 }
397 Redactor::Hash { encoder, hasher } => {
398 dst.push_str(&hasher(*encoder, original.as_bytes()));
399 }
400 }
401 }
402
403 fn from_object(obj: &ObjectMap) -> std::result::Result<Self, &'static str> {
404 let r#type = match obj.get("type").ok_or(
405 "redactor specified as objects must have type
406 parameter",
407 )? {
408 Value::Bytes(bytes) => Ok(bytes.clone()),
409 _ => Err("type key in redactor must be a string"),
410 }?;
411
412 match r#type.as_ref() {
413 b"full" => Ok(Redactor::Full),
414 b"text" => {
415 match obj.get("replacement").ok_or(
416 "text redactor must have
417 `replacement` specified",
418 )? {
419 Value::Bytes(bytes) => {
420 Ok(Redactor::Text(String::from_utf8_lossy(bytes).into_owned()))
421 }
422 _ => Err("`replacement` must be a string"),
423 }
424 }
425 b"sha2" => {
426 let hasher = if let Some(variant) = obj.get("variant") {
427 match variant
428 .as_bytes()
429 .ok_or("`variant` must be a string")?
430 .as_ref()
431 {
432 b"SHA-224" => encoded_hash::<sha_2::Sha224>,
433 b"SHA-256" => encoded_hash::<sha_2::Sha256>,
434 b"SHA-384" => encoded_hash::<sha_2::Sha384>,
435 b"SHA-512" => encoded_hash::<sha_2::Sha512>,
436 b"SHA-512/224" => encoded_hash::<sha_2::Sha512_224>,
437 b"SHA-512/256" => encoded_hash::<sha_2::Sha512_256>,
438 _ => return Err("invalid sha2 variant"),
439 }
440 } else {
441 encoded_hash::<sha_2::Sha512_256>
442 };
443 let encoder = obj
444 .get("encoding")
445 .map(Encoder::try_from)
446 .transpose()?
447 .unwrap_or(Encoder::Base64);
448 Ok(Redactor::Hash { hasher, encoder })
449 }
450 b"sha3" => {
451 let hasher = if let Some(variant) = obj.get("variant") {
452 match variant
453 .as_bytes()
454 .ok_or("`variant must be a string")?
455 .as_ref()
456 {
457 b"SHA3-224" => encoded_hash::<sha_3::Sha3_224>,
458 b"SHA3-256" => encoded_hash::<sha_3::Sha3_256>,
459 b"SHA3-384" => encoded_hash::<sha_3::Sha3_384>,
460 b"SHA3-512" => encoded_hash::<sha_3::Sha3_512>,
461 _ => return Err("invalid sha2 variant"),
462 }
463 } else {
464 encoded_hash::<sha_3::Sha3_512>
465 };
466 let encoder = obj
467 .get("encoding")
468 .map(Encoder::try_from)
469 .transpose()?
470 .unwrap_or(Encoder::Base64);
471 Ok(Redactor::Hash { hasher, encoder })
472 }
473 _ => Err("unknown `type` for `redactor`"),
474 }
475 }
476}
477
478impl regex::Replacer for &Redactor {
479 fn replace_append(&mut self, caps: ®ex::Captures, dst: &mut String) {
480 self.replace_str(&caps[0], dst);
481 }
482
483 fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
484 match self {
485 Redactor::Full => Some(REDACTED.into()),
486 Redactor::Text(s) => Some(s.into()),
487 Redactor::Hash { .. } => None,
488 }
489 }
490}
491
492impl TryFrom<Value> for Redactor {
493 type Error = &'static str;
494
495 fn try_from(value: Value) -> std::result::Result<Self, Self::Error> {
496 match value {
497 Value::Object(object) => Redactor::from_object(&object),
498 Value::Bytes(bytes) => match bytes.as_ref() {
499 b"full" => Ok(Redactor::Full),
500 b"sha2" => Ok(Redactor::Hash {
501 hasher: encoded_hash::<sha_2::Sha512_256>,
502 encoder: Encoder::Base64,
503 }),
504 b"sha3" => Ok(Redactor::Hash {
505 hasher: encoded_hash::<sha_3::Sha3_512>,
506 encoder: Encoder::Base64,
507 }),
508 _ => Err("unknown name of redactor"),
509 },
510 _ => Err("unknown literal for redactor, must be redactor name or object"),
511 }
512 }
513}
514
/// Text encoding applied to a hash digest.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Encoder {
    /// Standard base64.
    Base64,
    /// Lowercase hexadecimal.
    Base16,
}
520
521impl TryFrom<&Value> for Encoder {
522 type Error = &'static str;
523
524 fn try_from(value: &Value) -> std::result::Result<Self, Self::Error> {
525 match value.as_bytes().ok_or("encoding must be string")?.as_ref() {
526 b"base64" => Ok(Self::Base64),
527 b"base16" | b"hex" => Ok(Self::Base16),
528 _ => Err("unexpected encoding"),
529 }
530 }
531}
532
533impl Encoder {
534 fn encode(self, data: &[u8]) -> String {
535 use Encoder::{Base16, Base64};
536 match self {
537 Base64 => base64_simd::STANDARD.encode_to_string(data),
538 Base16 => base16::encode_lower(data),
539 }
540 }
541}
542
543fn encoded_hash<T: digest::Digest>(encoder: Encoder, data: &[u8]) -> String {
546 encoder.encode(&T::digest(data))
547}
548
// Table-driven tests for `redact`: each case lists the call arguments, the expected
// result (`want`) and the expected type definition (`tdef`).
#[cfg(test)]
mod test {
    use super::*;
    use crate::{btreemap, value};
    use regex::Regex;

    test_function![
        redact => Redact;

        // A bare regex used as a filter.
        regex {
            args: func_args![
                value: "hello 123456 world",
                filters: vec![Regex::new(r"\d+").unwrap()],
            ],
            want: Ok("hello [REDACTED] world"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Object form of the `pattern` filter with a literal string pattern.
        patterns {
            args: func_args![
                value: "hello 123456 world",
                filters: vec![
                    value!({
                        "type": "pattern",
                        "patterns": ["123456"]
                    })
                ],
            ],
            want: Ok("hello [REDACTED] world"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Named filter given as a plain string.
        us_social_security_number{
            args: func_args![
                value: "hello 123-12-1234 world",
                filters: vec!["us_social_security_number"],
            ],
            want: Ok("hello [REDACTED] world"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Unknown filter names are rejected.
        invalid_filter {
            args: func_args![
                value: "hello 123456 world",
                filters: vec!["not a filter"],
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }

        // A `pattern` filter object without `patterns` is rejected.
        missing_patterns {
            args: func_args![
                value: "hello 123456 world",
                filters: vec![
                    value!({
                        "type": "pattern",
                    })
                ],
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Custom replacement text.
        text_redactor {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: btreemap!{"type" => "text", "replacement" => "***"},
            ],
            want: Ok("my id is ***"),
            tdef: TypeDef::bytes().infallible(),
        }

        // String shorthand for the default SHA-2 redactor.
        sha2 {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: "sha2",
            ],
            want: Ok("my id is GEtTedW1p6tC094dDKH+3B8P+xSnZz69AmpjaXRd63I="),
            tdef: TypeDef::bytes().infallible(),
        }

        // String shorthand for the default SHA-3 redactor.
        sha3 {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: "sha3",
            ],
            want: Ok("my id is ZNCdmTDI7PeeUTFnpYjLdUObdizo+bIupZdl8yqnTKGdLx6X3JIqPUlUWUoFBikX+yTR+OcvLtAqWO11NPlNJw=="),
            tdef: TypeDef::bytes().infallible(),
        }

        // Explicit SHA-2 variant with hexadecimal encoding.
        sha256_hex {
            args: func_args![
                value: "my id is 123456",
                filters: vec![Regex::new(r"\d+").unwrap()],
                redactor: btreemap!{"type" => "sha2", "variant" => "SHA-256", "encoding" =>
                    "base16"},
            ],
            want: Ok("my id is 8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Unknown redactor name.
        invalid_redactor {
            args: func_args![
                value: "hello 123456 world",
                filters: vec!["us_social_security_number"],
                redactor: "not a redactor"
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Redactor object with an unknown `type`.
        invalid_redactor_obj {
            args: func_args![
                value: "hello 123456 world",
                filters: vec!["us_social_security_number"],
                redactor: btreemap!{"type" => "wrongtype"},
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Redactor object without a `type` key.
        invalid_redactor_no_type {
            args: func_args![
                value: "hello 123456 world",
                filters: vec!["us_social_security_number"],
                redactor: btreemap!{"key" => "value"},
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }

        // Hash redactor with an unsupported variant.
        invalid_hash_variant {
            args: func_args![
                value: "hello 123456 world",
                filters: vec!["us_social_security_number"],
                redactor: btreemap!{"type" => "sha2", "variant" => "MD5"},
            ],
            want: Err("invalid argument"),
            tdef: TypeDef::bytes().infallible(),
        }
    ];
}