vrl/stdlib/
parse_groks.rs

1use crate::compiler::prelude::*;
2
3#[cfg(not(target_arch = "wasm32"))]
4mod non_wasm {
5    use crate::compiler::prelude::*;
6    use crate::datadog_grok::{parse_grok, parse_grok_rules::GrokRule};
7    use crate::diagnostic::{Label, Span};
8    use std::fmt;
9
10    #[derive(Debug)]
11    pub(crate) enum Error {
12        InvalidGrokPattern(crate::datadog_grok::parse_grok_rules::Error),
13    }
14
15    impl fmt::Display for Error {
16        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
17            match self {
18                Error::InvalidGrokPattern(err) => err.fmt(f),
19            }
20        }
21    }
22
23    impl std::error::Error for Error {}
24
25    impl DiagnosticMessage for Error {
26        fn code(&self) -> usize {
27            109
28        }
29
30        fn labels(&self) -> Vec<Label> {
31            match self {
32                Error::InvalidGrokPattern(err) => {
33                    vec![Label::primary(
34                        format!("grok pattern error: {err}"),
35                        Span::default(),
36                    )]
37                }
38            }
39        }
40    }
41
42    #[derive(Clone, Debug)]
43    pub(super) struct ParseGroksFn {
44        pub(super) value: Box<dyn Expression>,
45        pub(super) grok_rules: Vec<GrokRule>,
46    }
47
48    impl FunctionExpression for ParseGroksFn {
49        fn resolve(&self, ctx: &mut Context) -> Resolved {
50            let value = self.value.resolve(ctx)?;
51            let bytes = value.try_bytes_utf8_lossy()?;
52
53            let v = parse_grok::parse_grok(bytes.as_ref(), &self.grok_rules)
54                .map_err(|err| format!("unable to parse grok: {err}"))?
55                .parsed;
56
57            Ok(v)
58        }
59
60        fn type_def(&self, _: &state::TypeState) -> TypeDef {
61            TypeDef::object(Collection::any()).fallible()
62        }
63    }
64}
65
66#[allow(clippy::wildcard_imports)]
67#[cfg(not(target_arch = "wasm32"))]
68use non_wasm::*;
69use std::sync::LazyLock;
70#[cfg(not(target_arch = "wasm32"))]
71use std::{fs::File, io::BufReader, path::Path};
72
73static DEFAULT_ALIASES: LazyLock<Value> =
74    LazyLock::new(|| Value::Object(std::collections::BTreeMap::new()));
75static DEFAULT_ALIAS_SOURCES: LazyLock<Value> = LazyLock::new(|| Value::Array(vec![]));
76
77static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
78    vec![
79        Parameter::required("value", kind::BYTES, "The string to parse."),
80        Parameter::required(
81            "patterns",
82            kind::ARRAY,
83            "The [Grok patterns](https://github.com/daschl/grok/tree/master/patterns), which are tried in order until the first match.",
84        ),
85        Parameter::optional("aliases", kind::OBJECT, "The shared set of grok aliases that can be referenced in the patterns to simplify them.")
86            .default(&DEFAULT_ALIASES),
87        Parameter::optional("alias_sources", kind::ARRAY, "Path to the file containing aliases in a JSON format.")
88            .default(&DEFAULT_ALIAS_SOURCES),
89    ]
90});
91
/// Marker type for the `parse_groks` function; all behavior lives in its
/// `Function` implementation.
#[derive(Clone, Copy, Debug)]
pub struct ParseGroks;
94
95impl Function for ParseGroks {
96    fn identifier(&self) -> &'static str {
97        "parse_groks"
98    }
99
100    fn usage(&self) -> &'static str {
101        "Parses the `value` using multiple [`grok`](https://github.com/daschl/grok/tree/master/patterns) patterns. All patterns [listed here](https://github.com/daschl/grok/tree/master/patterns) are supported."
102    }
103
104    fn category(&self) -> &'static str {
105        Category::Parse.as_ref()
106    }
107
108    fn internal_failure_reasons(&self) -> &'static [&'static str] {
109        &[
110            "`value` fails to parse using the provided `pattern`.",
111            "`patterns` is not an array.",
112            "`aliases` is not an object.",
113            "`alias_sources` is not a string array or doesn't point to a valid file.",
114        ]
115    }
116
117    fn return_kind(&self) -> u16 {
118        kind::OBJECT
119    }
120
121    fn notices(&self) -> &'static [&'static str] {
122        &[indoc! {"
123            We recommend using community-maintained Grok patterns when possible, as they're more
124            likely to be properly vetted and improved over time than bespoke patterns.
125        "}]
126    }
127
128    fn parameters(&self) -> &'static [Parameter] {
129        PARAMETERS.as_slice()
130    }
131
132    fn examples(&self) -> &'static [Example] {
133        &[
134            example! {
135                title: "Parse using multiple Grok patterns",
136                source: indoc! {r#"
137                    parse_groks!(
138                        "2020-10-02T23:22:12.223222Z info Hello world",
139                        patterns: [
140                            "%{common_prefix} %{_status} %{_message}",
141                            "%{common_prefix} %{_message}",
142                        ],
143                        aliases: {
144                            "common_prefix": "%{_timestamp} %{_loglevel}",
145                            "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
146                            "_loglevel": "%{LOGLEVEL:level}",
147                            "_status": "%{POSINT:status}",
148                            "_message": "%{GREEDYDATA:message}"
149                        }
150                    )
151                "#},
152                result: Ok(indoc! {r#"
153                    {
154                        "timestamp": "2020-10-02T23:22:12.223222Z",
155                        "level": "info",
156                        "message": "Hello world"
157                    }
158                "#}),
159            },
160            example! {
161                title: "Parse using aliases from file",
162                source: indoc! {r#"
163                    parse_groks!(
164                      "username=foo",
165                      patterns: [ "%{PATTERN_A}" ],
166                      alias_sources: [ "tests/data/grok/aliases.json" ]
167                    )
168                    # aliases.json contents:
169                    # {
170                    #   "PATTERN_A": "%{PATTERN_B}",
171                    #   "PATTERN_B": "username=%{USERNAME:username}"
172                    # }
173                "#},
174                result: Ok(r#"{"username": "foo"}"#),
175            },
176        ]
177    }
178
179    #[cfg(not(target_arch = "wasm32"))]
180    fn compile(
181        &self,
182        state: &state::TypeState,
183        _ctx: &mut FunctionCompileContext,
184        arguments: ArgumentList,
185    ) -> Compiled {
186        use std::collections::BTreeMap;
187
188        let value = arguments.required("value");
189
190        let patterns = arguments
191            .required_array("patterns")?
192            .into_iter()
193            .map(|expr| {
194                let pattern = expr
195                    .clone()
196                    .resolve_constant(state)
197                    .ok_or(function::Error::ExpectedStaticExpression {
198                        keyword: "patterns",
199                        expr: expr.clone(),
200                    })?
201                    .try_bytes_utf8_lossy()
202                    .map_err(|_| function::Error::InvalidArgument {
203                        keyword: "patterns",
204                        value: format!("{expr:?}").into(),
205                        error: "grok pattern should be a string",
206                    })?
207                    .into_owned();
208                Ok(pattern)
209            })
210            .collect::<std::result::Result<Vec<String>, function::Error>>()?;
211
212        let mut aliases = arguments
213            .optional_object("aliases")?
214            .unwrap_or_default()
215            .into_iter()
216            .map(|(key, expr)| {
217                let alias = expr
218                    .clone()
219                    .resolve_constant(state)
220                    .ok_or(function::Error::ExpectedStaticExpression {
221                        keyword: "aliases",
222                        expr: expr.clone(),
223                    })?
224                    .try_bytes_utf8_lossy()
225                    .map_err(|_| function::Error::InvalidArgument {
226                        keyword: "aliases",
227                        value: format!("{expr:?}").into(),
228                        error: "alias pattern should be a string",
229                    })?
230                    .into_owned();
231                Ok((key, alias))
232            })
233            .collect::<std::result::Result<BTreeMap<KeyString, String>, function::Error>>()?;
234
235        let alias_sources = arguments
236            .optional_array("alias_sources")?
237            .unwrap_or_default();
238
239        // With enable_system_functions feature disabled, alias_sources is not allowed
240        // to be used because it uses file operations.
241        #[cfg(not(feature = "enable_system_functions"))]
242        if !alias_sources.is_empty() {
243            return Err(function::Error::InvalidArgument {
244                keyword: "alias_sources",
245                value: "alias_sources".into(),
246                error: "alias_sources is disabled when enable_system_functions feature is disabled",
247            }
248            .into());
249        }
250
251        let alias_sources = alias_sources
252            .into_iter()
253            .map(|expr| {
254                let path = expr
255                    .clone()
256                    .resolve_constant(state)
257                    .ok_or(function::Error::ExpectedStaticExpression {
258                        keyword: "alias_sources",
259                        expr: expr.clone(),
260                    })?
261                    .try_bytes_utf8_lossy()
262                    .map_err(|_| function::Error::InvalidArgument {
263                        keyword: "alias_sources",
264                        value: format!("{expr:?}").into(),
265                        error: "alias source should be a string",
266                    })?
267                    .into_owned();
268                Ok(path)
269            })
270            .collect::<std::result::Result<Vec<String>, function::Error>>()?;
271
272        for src in alias_sources {
273            let path = Path::new(&src);
274            let file = File::open(path).map_err(|_| function::Error::InvalidArgument {
275                keyword: "alias_sources",
276                value: format!("{}", path.display()).into(),
277                error: "Unable to open alias source file",
278            })?;
279            let reader = BufReader::new(file);
280            let mut src_aliases =
281                serde_json::from_reader(reader).map_err(|_| function::Error::InvalidArgument {
282                    keyword: "alias_sources",
283                    value: format!("{}", path.display()).into(),
284                    error: "Unable to read alias source",
285                })?;
286
287            aliases.append(&mut src_aliases);
288        }
289
290        // we use a datadog library here because it is a superset of grok
291        let grok_rules = crate::datadog_grok::parse_grok_rules::parse_grok_rules(
292            &patterns, aliases,
293        )
294        .map_err(|e| Box::new(Error::InvalidGrokPattern(e)) as Box<dyn DiagnosticMessage>)?;
295
296        Ok(ParseGroksFn { value, grok_rules }.as_expr())
297    }
298
299    #[cfg(target_arch = "wasm32")]
300    fn compile(
301        &self,
302        _state: &state::TypeState,
303        ctx: &mut FunctionCompileContext,
304        _: ArgumentList,
305    ) -> Compiled {
306        Ok(super::WasmUnsupportedFunction::new(
307            ctx.span(),
308            TypeDef::object(Collection::any()).fallible(),
309        )
310        .as_expr())
311    }
312}
313
314#[cfg(test)]
315mod test {
316    use crate::btreemap;
317    use crate::value;
318    use crate::value::Value;
319
320    use super::*;
321
    // Table-driven cases for `ParseGroks`; each entry lists the call arguments,
    // the expected result (`want`) and the expected type definition (`tdef`).
    test_function![
        parse_grok => ParseGroks;

        // A pattern referencing an undefined name is rejected.
        invalid_grok {
            args: func_args![ value: "foo",
                              patterns: vec!["%{NOG}"]],
            want: Err("failed to parse grok expression '(?m)\\A%{NOG}\\z': The given pattern definition name \"NOG\" could not be found in the definition map"),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Input that matches none of the patterns.
        error {
            args: func_args![ value: "an ungrokkable message",
                              patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
            want: Err("unable to parse grok: value does not match any rule"),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Input with a leading timestamp that still does not match the full pattern.
        error2 {
            args: func_args![ value: "2020-10-02T23:22:12.223222Z an ungrokkable message",
                              patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
            want: Err("unable to parse grok: value does not match any rule"),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // An alias whose value is not a string is an invalid argument.
        error3 {
            args: func_args![ value: "2020-10-02T23:22:12.223222Z info Hello world",
                              patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"],
                              aliases: value!({
                                  "TEST": 3
                              })],
            want: Err("invalid argument"),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Single pattern, all three captures present.
        parsed {
            args: func_args![ value: "2020-10-02T23:22:12.223222Z info Hello world",
                              patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
            want: Ok(Value::from(btreemap! {
                "timestamp" => "2020-10-02T23:22:12.223222Z",
                "level" => "info",
                "message" => "Hello world",
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Alternation: only the branch that matched contributes a field.
        parsed2 {
            args: func_args![ value: "2020-10-02T23:22:12.223222Z",
                              patterns: vec!["(%{TIMESTAMP_ISO8601:timestamp}|%{LOGLEVEL:level})"]],
            want: Ok(Value::from(btreemap! {
                "timestamp" => "2020-10-02T23:22:12.223222Z",
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Input carries a status code, so the first pattern applies.
        multiple_patterns_and_aliases_first_pattern_matches {
            args: func_args![
                value: "2020-10-02T23:22:12.223222Z info 200 hello world",
                patterns: Value::Array(vec![
                    "%{common_prefix} %{_status} %{_message}".into(),
                    "%{common_prefix} %{_message}".into(),
                    ]),
                aliases: value!({
                    "common_prefix": "%{_timestamp} %{_loglevel}",
                    "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
                    "_loglevel": "%{LOGLEVEL:level}",
                    "_status": "%{POSINT:status}",
                    "_message": "%{GREEDYDATA:message}"
                })
            ],
            want: Ok(Value::from(btreemap! {
                "timestamp" => "2020-10-02T23:22:12.223222Z",
                "level" => "info",
                "status" => "200",
                "message" => "hello world"
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Same as above, with an explicit empty `alias_sources` array.
        presence_of_alias_sources_argument {
            args: func_args![
                value: "2020-10-02T23:22:12.223222Z info 200 hello world",
                patterns: Value::Array(vec![
                    "%{common_prefix} %{_status} %{_message}".into(),
                    "%{common_prefix} %{_message}".into(),
                    ]),
                aliases: value!({
                    "common_prefix": "%{_timestamp} %{_loglevel}",
                    "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
                    "_loglevel": "%{LOGLEVEL:level}",
                    "_status": "%{POSINT:status}",
                    "_message": "%{GREEDYDATA:message}"
                }),
                alias_sources: Value::Array(vec![]),
            ],
            want: Ok(Value::from(btreemap! {
                "timestamp" => "2020-10-02T23:22:12.223222Z",
                "level" => "info",
                "status" => "200",
                "message" => "hello world"
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Input has no status code, so the result comes from the second pattern.
        multiple_patterns_and_aliases_second_pattern_matches {
            args: func_args![
                value: "2020-10-02T23:22:12.223222Z info hello world",
                patterns: Value::Array(vec![
                    "%{common_prefix} %{_status} %{_message}".into(),
                    "%{common_prefix} %{_message}".into(),
                    ]),
                aliases: value!({
                    "common_prefix": "%{_timestamp} %{_loglevel}",
                    "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
                    "_loglevel": "%{LOGLEVEL:level}",
                    "_status": "%{POSINT:status}",
                    "_message": "%{GREEDYDATA:message}"
                })
            ],
            want: Ok(Value::from(btreemap! {
                "timestamp" => "2020-10-02T23:22:12.223222Z",
                "level" => "info",
                "message" => "hello world"
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }

        // Datadog-style rules: matchers, filters (`scale`, `nullIf`) and nested
        // field paths on an nginx access-log line.
        datadog_nginx {
            args: func_args![
                value: r#"127.0.0.1 - frank [13/Jul/2016:10:55:36] "GET /apache_pb.gif HTTP/1.0" 200 2326 0.202 "http://www.perdu.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" "-""#,
                patterns: Value::Array(vec![
                    "%{access_common}".into(),
                    r#"%{access_common} (%{number:duration:scale(1000000000)} )?"%{_referer}" "%{_user_agent}"( "%{_x_forwarded_for}")?.*"#.into(),
                    ]),
                aliases: value!({
                    "access_common": r#"%{_client_ip} %{_ident} %{_auth} \[%{_date_access}\] "(?>%{_method} |)%{_url}(?> %{_version}|)" %{_status_code} (?>%{_bytes_written}|-)"#,
                    "_auth": r#"%{notSpace:http.auth:nullIf("-")}"#,
                    "_bytes_written": "%{integer:network.bytes_written}",
                    "_client_ip": "%{ipOrHost:network.client.ip}",
                    "_version": r#"HTTP\/%{regex("\\d+\\.\\d+"):http.version}"#,
                    "_url": "%{notSpace:http.url}",
                    "_ident": "%{notSpace:http.ident}",
                    "_user_agent": r#"%{regex("[^\\\"]*"):http.useragent}"#,
                    "_referer": "%{notSpace:http.referer}",
                    "_status_code": "%{integer:http.status_code}",
                    "_method": "%{word:http.method}",
                    "_date_access": "%{notSpace:date_access}",
                    "_x_forwarded_for": r#"%{regex("[^\\\"]*"):http._x_forwarded_for:nullIf("-")}"#
                })
            ],
            want: Ok(Value::Object(btreemap! {
                "date_access" => "13/Jul/2016:10:55:36",
                "duration" => 202_000_000,
                "http" => btreemap! {
                    "auth" => "frank",
                    "ident" => "-",
                    "method" => "GET",
                    "status_code" => 200,
                    "url" => "/apache_pb.gif",
                    "version" => "1.0",
                    "referer" => "http://www.perdu.com/",
                    "useragent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                },
                "network" => btreemap! {
                    "bytes_written" => 2326,
                    "client" => btreemap! {
                        "ip" => "127.0.0.1"
                    }
                }
            })),
            tdef: TypeDef::object(Collection::any()).fallible(),
        }
    ];
494
495    // Test that alias_sources errors when enable_system_functions is NOT enabled
496    #[cfg(not(feature = "enable_system_functions"))]
497    #[test]
498    fn alias_sources_errors_without_enable_flag() {
499        use crate::compiler::{CompileConfig, TypeState, compile_with_state};
500        use crate::diagnostic::Formatter;
501
502        let src = r#"
503            parse_groks!(
504                "username=foo",
505                patterns: ["%{PATTERN_A}"],
506                alias_sources: ["tests/data/grok/aliases.json"]
507            )
508        "#;
509
510        let fns = crate::stdlib::all();
511        let state = TypeState::default();
512        let config = CompileConfig::default();
513        let result = compile_with_state(src, &fns, &state, config);
514        assert!(
515            result.is_err(),
516            "Expected compilation to fail when alias_sources is used without enable_system_functions"
517        );
518
519        let diagnostics = result.err().unwrap();
520        let err = Formatter::new(src, diagnostics).to_string();
521        assert!(
522            err.contains("alias_sources is disabled"),
523            "Expected error about alias_sources being disabled, got: {err}"
524        );
525    }
526}