1use crate::compiler::prelude::*;
2
3#[cfg(not(target_arch = "wasm32"))]
4mod non_wasm {
5 use crate::compiler::prelude::*;
6 use crate::datadog_grok::{parse_grok, parse_grok_rules::GrokRule};
7 use crate::diagnostic::{Label, Span};
8 use std::fmt;
9
10 #[derive(Debug)]
11 pub(crate) enum Error {
12 InvalidGrokPattern(crate::datadog_grok::parse_grok_rules::Error),
13 }
14
15 impl fmt::Display for Error {
16 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
17 match self {
18 Error::InvalidGrokPattern(err) => err.fmt(f),
19 }
20 }
21 }
22
23 impl std::error::Error for Error {}
24
25 impl DiagnosticMessage for Error {
26 fn code(&self) -> usize {
27 109
28 }
29
30 fn labels(&self) -> Vec<Label> {
31 match self {
32 Error::InvalidGrokPattern(err) => {
33 vec![Label::primary(
34 format!("grok pattern error: {err}"),
35 Span::default(),
36 )]
37 }
38 }
39 }
40 }
41
42 #[derive(Clone, Debug)]
43 pub(super) struct ParseGroksFn {
44 pub(super) value: Box<dyn Expression>,
45 pub(super) grok_rules: Vec<GrokRule>,
46 }
47
48 impl FunctionExpression for ParseGroksFn {
49 fn resolve(&self, ctx: &mut Context) -> Resolved {
50 let value = self.value.resolve(ctx)?;
51 let bytes = value.try_bytes_utf8_lossy()?;
52
53 let v = parse_grok::parse_grok(bytes.as_ref(), &self.grok_rules)
54 .map_err(|err| format!("unable to parse grok: {err}"))?
55 .parsed;
56
57 Ok(v)
58 }
59
60 fn type_def(&self, _: &state::TypeState) -> TypeDef {
61 TypeDef::object(Collection::any()).fallible()
62 }
63 }
64}
65
66#[allow(clippy::wildcard_imports)]
67#[cfg(not(target_arch = "wasm32"))]
68use non_wasm::*;
69use std::sync::LazyLock;
70#[cfg(not(target_arch = "wasm32"))]
71use std::{fs::File, io::BufReader, path::Path};
72
73static DEFAULT_ALIASES: LazyLock<Value> =
74 LazyLock::new(|| Value::Object(std::collections::BTreeMap::new()));
75static DEFAULT_ALIAS_SOURCES: LazyLock<Value> = LazyLock::new(|| Value::Array(vec![]));
76
77static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
78 vec, which are tried in order until the first match.",
84 ),
85 Parameter::optional("aliases", kind::OBJECT, "The shared set of grok aliases that can be referenced in the patterns to simplify them.")
86 .default(&DEFAULT_ALIASES),
87 Parameter::optional("alias_sources", kind::ARRAY, "Path to the file containing aliases in a JSON format.")
88 .default(&DEFAULT_ALIAS_SOURCES),
89 ]
90});
91
/// Marker type implementing the `parse_groks` function.
///
/// It carries no state; all behavior lives in its `Function` implementation.
#[derive(Clone, Copy, Debug)]
pub struct ParseGroks;
94
95impl Function for ParseGroks {
 /// Returns the function's identifier, `parse_groks`.
 fn identifier(&self) -> &'static str {
 "parse_groks"
 }
99
 /// Returns the static usage text describing the function (Markdown with
 /// links to the supported grok pattern collection).
 fn usage(&self) -> &'static str {
 "Parses the `value` using multiple [`grok`](https://github.com/daschl/grok/tree/master/patterns) patterns. All patterns [listed here](https://github.com/daschl/grok/tree/master/patterns) are supported."
 }
103
 /// Reports the function's category: `Category::Parse`, exposed as a `&str`.
 fn category(&self) -> &'static str {
 Category::Parse.as_ref()
 }
107
108 fn internal_failure_reasons(&self) -> &'static [&'static str] {
109 &[
110 "`value` fails to parse using the provided `pattern`.",
111 "`patterns` is not an array.",
112 "`aliases` is not an object.",
113 "`alias_sources` is not a string array or doesn't point to a valid file.",
114 ]
115 }
116
 /// Declares the kind of the returned value: an object (`kind::OBJECT`).
 fn return_kind(&self) -> u16 {
 kind::OBJECT
 }
120
 /// Returns a single static notice recommending community-maintained Grok
 /// patterns over bespoke ones.
 fn notices(&self) -> &'static [&'static str] {
 &[indoc! {"
 We recommend using community-maintained Grok patterns when possible, as they're more
 likely to be properly vetted and improved over time than bespoke patterns.
 "}]
 }
127
 /// Exposes the parameter list held in the file-level `PARAMETERS` static
 /// as a slice.
 fn parameters(&self) -> &'static [Parameter] {
 PARAMETERS.as_slice()
 }
131
 /// Returns the two documented usage examples for `parse_groks`.
 ///
 /// Each example pairs a source snippet with the result it is expected to
 /// produce.
 fn examples(&self) -> &'static [Example] {
 &[
 // Two patterns sharing inline aliases; the expected result has no
 // `status` key because the sample message carries no numeric status.
 example! {
 title: "Parse using multiple Grok patterns",
 source: indoc! {r#"
 parse_groks!(
 "2020-10-02T23:22:12.223222Z info Hello world",
 patterns: [
 "%{common_prefix} %{_status} %{_message}",
 "%{common_prefix} %{_message}",
 ],
 aliases: {
 "common_prefix": "%{_timestamp} %{_loglevel}",
 "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
 "_loglevel": "%{LOGLEVEL:level}",
 "_status": "%{POSINT:status}",
 "_message": "%{GREEDYDATA:message}"
 }
 )
 "#},
 result: Ok(indoc! {r#"
 {
 "timestamp": "2020-10-02T23:22:12.223222Z",
 "level": "info",
 "message": "Hello world"
 }
 "#}),
 },
 // Aliases come from a JSON file named in `alias_sources`; the file's
 // contents are reproduced as comments inside the snippet itself.
 example! {
 title: "Parse using aliases from file",
 source: indoc! {r#"
 parse_groks!(
 "username=foo",
 patterns: [ "%{PATTERN_A}" ],
 alias_sources: [ "tests/data/grok/aliases.json" ]
 )
 # aliases.json contents:
 # {
 # "PATTERN_A": "%{PATTERN_B}",
 # "PATTERN_B": "username=%{USERNAME:username}"
 # }
 "#},
 result: Ok(r#"{"username": "foo"}"#),
 },
 ]
 }
178
179 #[cfg(not(target_arch = "wasm32"))]
180 fn compile(
181 &self,
182 state: &state::TypeState,
183 _ctx: &mut FunctionCompileContext,
184 arguments: ArgumentList,
185 ) -> Compiled {
186 use std::collections::BTreeMap;
187
188 let value = arguments.required("value");
189
190 let patterns = arguments
191 .required_array("patterns")?
192 .into_iter()
193 .map(|expr| {
194 let pattern = expr
195 .clone()
196 .resolve_constant(state)
197 .ok_or(function::Error::ExpectedStaticExpression {
198 keyword: "patterns",
199 expr: expr.clone(),
200 })?
201 .try_bytes_utf8_lossy()
202 .map_err(|_| function::Error::InvalidArgument {
203 keyword: "patterns",
204 value: format!("{expr:?}").into(),
205 error: "grok pattern should be a string",
206 })?
207 .into_owned();
208 Ok(pattern)
209 })
210 .collect::<std::result::Result<Vec<String>, function::Error>>()?;
211
212 let mut aliases = arguments
213 .optional_object("aliases")?
214 .unwrap_or_default()
215 .into_iter()
216 .map(|(key, expr)| {
217 let alias = expr
218 .clone()
219 .resolve_constant(state)
220 .ok_or(function::Error::ExpectedStaticExpression {
221 keyword: "aliases",
222 expr: expr.clone(),
223 })?
224 .try_bytes_utf8_lossy()
225 .map_err(|_| function::Error::InvalidArgument {
226 keyword: "aliases",
227 value: format!("{expr:?}").into(),
228 error: "alias pattern should be a string",
229 })?
230 .into_owned();
231 Ok((key, alias))
232 })
233 .collect::<std::result::Result<BTreeMap<KeyString, String>, function::Error>>()?;
234
235 let alias_sources = arguments
236 .optional_array("alias_sources")?
237 .unwrap_or_default();
238
239 #[cfg(not(feature = "enable_system_functions"))]
242 if !alias_sources.is_empty() {
243 return Err(function::Error::InvalidArgument {
244 keyword: "alias_sources",
245 value: "alias_sources".into(),
246 error: "alias_sources is disabled when enable_system_functions feature is disabled",
247 }
248 .into());
249 }
250
251 let alias_sources = alias_sources
252 .into_iter()
253 .map(|expr| {
254 let path = expr
255 .clone()
256 .resolve_constant(state)
257 .ok_or(function::Error::ExpectedStaticExpression {
258 keyword: "alias_sources",
259 expr: expr.clone(),
260 })?
261 .try_bytes_utf8_lossy()
262 .map_err(|_| function::Error::InvalidArgument {
263 keyword: "alias_sources",
264 value: format!("{expr:?}").into(),
265 error: "alias source should be a string",
266 })?
267 .into_owned();
268 Ok(path)
269 })
270 .collect::<std::result::Result<Vec<String>, function::Error>>()?;
271
272 for src in alias_sources {
273 let path = Path::new(&src);
274 let file = File::open(path).map_err(|_| function::Error::InvalidArgument {
275 keyword: "alias_sources",
276 value: format!("{}", path.display()).into(),
277 error: "Unable to open alias source file",
278 })?;
279 let reader = BufReader::new(file);
280 let mut src_aliases =
281 serde_json::from_reader(reader).map_err(|_| function::Error::InvalidArgument {
282 keyword: "alias_sources",
283 value: format!("{}", path.display()).into(),
284 error: "Unable to read alias source",
285 })?;
286
287 aliases.append(&mut src_aliases);
288 }
289
290 let grok_rules = crate::datadog_grok::parse_grok_rules::parse_grok_rules(
292 &patterns, aliases,
293 )
294 .map_err(|e| Box::new(Error::InvalidGrokPattern(e)) as Box<dyn DiagnosticMessage>)?;
295
296 Ok(ParseGroksFn { value, grok_rules }.as_expr())
297 }
298
299 #[cfg(target_arch = "wasm32")]
300 fn compile(
301 &self,
302 _state: &state::TypeState,
303 ctx: &mut FunctionCompileContext,
304 _: ArgumentList,
305 ) -> Compiled {
306 Ok(super::WasmUnsupportedFunction::new(
307 ctx.span(),
308 TypeDef::object(Collection::any()).fallible(),
309 )
310 .as_expr())
311 }
312}
313
314#[cfg(test)]
315mod test {
316 use crate::btreemap;
317 use crate::value;
318 use crate::value::Value;
319
320 use super::*;
321
 // Table-driven cases for `ParseGroks`: each entry lists the call arguments,
 // the expected outcome (`want`) and the expected type definition (`tdef`).
 test_function![
 parse_grok => ParseGroks;

 // A pattern referencing an undefined name (`NOG`) is reported as an error.
 invalid_grok {
 args: func_args![ value: "foo",
 patterns: vec!["%{NOG}"]],
 want: Err("failed to parse grok expression '(?m)\\A%{NOG}\\z': The given pattern definition name \"NOG\" could not be found in the definition map"),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Input that matches none of the patterns.
 error {
 args: func_args![ value: "an ungrokkable message",
 patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
 want: Err("unable to parse grok: value does not match any rule"),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Input that starts with a timestamp but still fails to match the full pattern.
 error2 {
 args: func_args![ value: "2020-10-02T23:22:12.223222Z an ungrokkable message",
 patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
 want: Err("unable to parse grok: value does not match any rule"),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // An alias whose value is not a string (the integer 3) is an invalid argument.
 error3 {
 args: func_args![ value: "2020-10-02T23:22:12.223222Z info Hello world",
 patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"],
 aliases: value!({
 "TEST": 3
 })],
 want: Err("invalid argument"),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Single pattern, full match: all three named captures are returned.
 parsed {
 args: func_args![ value: "2020-10-02T23:22:12.223222Z info Hello world",
 patterns: vec!["%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"]],
 want: Ok(Value::from(btreemap! {
 "timestamp" => "2020-10-02T23:22:12.223222Z",
 "level" => "info",
 "message" => "Hello world",
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Alternation: only the timestamp branch matches, so `level` is absent from the result.
 parsed2 {
 args: func_args![ value: "2020-10-02T23:22:12.223222Z",
 patterns: vec!["(%{TIMESTAMP_ISO8601:timestamp}|%{LOGLEVEL:level})"]],
 want: Ok(Value::from(btreemap! {
 "timestamp" => "2020-10-02T23:22:12.223222Z",
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // The input includes a numeric status, so the expected result contains `status`.
 multiple_patterns_and_aliases_first_pattern_matches {
 args: func_args![
 value: "2020-10-02T23:22:12.223222Z info 200 hello world",
 patterns: Value::Array(vec![
 "%{common_prefix} %{_status} %{_message}".into(),
 "%{common_prefix} %{_message}".into(),
 ]),
 aliases: value!({
 "common_prefix": "%{_timestamp} %{_loglevel}",
 "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
 "_loglevel": "%{LOGLEVEL:level}",
 "_status": "%{POSINT:status}",
 "_message": "%{GREEDYDATA:message}"
 })
 ],
 want: Ok(Value::from(btreemap! {
 "timestamp" => "2020-10-02T23:22:12.223222Z",
 "level" => "info",
 "status" => "200",
 "message" => "hello world"
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Same as the previous case, with an explicit but empty `alias_sources` array.
 presence_of_alias_sources_argument {
 args: func_args![
 value: "2020-10-02T23:22:12.223222Z info 200 hello world",
 patterns: Value::Array(vec![
 "%{common_prefix} %{_status} %{_message}".into(),
 "%{common_prefix} %{_message}".into(),
 ]),
 aliases: value!({
 "common_prefix": "%{_timestamp} %{_loglevel}",
 "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
 "_loglevel": "%{LOGLEVEL:level}",
 "_status": "%{POSINT:status}",
 "_message": "%{GREEDYDATA:message}"
 }),
 alias_sources: Value::Array(vec![]),
 ],
 want: Ok(Value::from(btreemap! {
 "timestamp" => "2020-10-02T23:22:12.223222Z",
 "level" => "info",
 "status" => "200",
 "message" => "hello world"
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // The input has no numeric status, so the expected result has no `status` key.
 multiple_patterns_and_aliases_second_pattern_matches {
 args: func_args![
 value: "2020-10-02T23:22:12.223222Z info hello world",
 patterns: Value::Array(vec![
 "%{common_prefix} %{_status} %{_message}".into(),
 "%{common_prefix} %{_message}".into(),
 ]),
 aliases: value!({
 "common_prefix": "%{_timestamp} %{_loglevel}",
 "_timestamp": "%{TIMESTAMP_ISO8601:timestamp}",
 "_loglevel": "%{LOGLEVEL:level}",
 "_status": "%{POSINT:status}",
 "_message": "%{GREEDYDATA:message}"
 })
 ],
 want: Ok(Value::from(btreemap! {
 "timestamp" => "2020-10-02T23:22:12.223222Z",
 "level" => "info",
 "message" => "hello world"
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }

 // Access-log style input parsed with nested aliases, dotted field names
 // (expected as nested objects) and the `scale` / `nullIf` filters.
 datadog_nginx {
 args: func_args![
 value: r#"127.0.0.1 - frank [13/Jul/2016:10:55:36] "GET /apache_pb.gif HTTP/1.0" 200 2326 0.202 "http://www.perdu.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" "-""#,
 patterns: Value::Array(vec![
 "%{access_common}".into(),
 r#"%{access_common} (%{number:duration:scale(1000000000)} )?"%{_referer}" "%{_user_agent}"( "%{_x_forwarded_for}")?.*"#.into(),
 ]),
 aliases: value!({
 "access_common": r#"%{_client_ip} %{_ident} %{_auth} \[%{_date_access}\] "(?>%{_method} |)%{_url}(?> %{_version}|)" %{_status_code} (?>%{_bytes_written}|-)"#,
 "_auth": r#"%{notSpace:http.auth:nullIf("-")}"#,
 "_bytes_written": "%{integer:network.bytes_written}",
 "_client_ip": "%{ipOrHost:network.client.ip}",
 "_version": r#"HTTP\/%{regex("\\d+\\.\\d+"):http.version}"#,
 "_url": "%{notSpace:http.url}",
 "_ident": "%{notSpace:http.ident}",
 "_user_agent": r#"%{regex("[^\\\"]*"):http.useragent}"#,
 "_referer": "%{notSpace:http.referer}",
 "_status_code": "%{integer:http.status_code}",
 "_method": "%{word:http.method}",
 "_date_access": "%{notSpace:date_access}",
 "_x_forwarded_for": r#"%{regex("[^\\\"]*"):http._x_forwarded_for:nullIf("-")}"#
 })
 ],
 want: Ok(Value::Object(btreemap! {
 "date_access" => "13/Jul/2016:10:55:36",
 "duration" => 202_000_000,
 "http" => btreemap! {
 "auth" => "frank",
 "ident" => "-",
 "method" => "GET",
 "status_code" => 200,
 "url" => "/apache_pb.gif",
 "version" => "1.0",
 "referer" => "http://www.perdu.com/",
 "useragent" => "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
 },
 "network" => btreemap! {
 "bytes_written" => 2326,
 "client" => btreemap! {
 "ip" => "127.0.0.1"
 }
 }
 })),
 tdef: TypeDef::object(Collection::any()).fallible(),
 }
 ];
494
495 #[cfg(not(feature = "enable_system_functions"))]
497 #[test]
498 fn alias_sources_errors_without_enable_flag() {
499 use crate::compiler::{CompileConfig, TypeState, compile_with_state};
500 use crate::diagnostic::Formatter;
501
502 let src = r#"
503 parse_groks!(
504 "username=foo",
505 patterns: ["%{PATTERN_A}"],
506 alias_sources: ["tests/data/grok/aliases.json"]
507 )
508 "#;
509
510 let fns = crate::stdlib::all();
511 let state = TypeState::default();
512 let config = CompileConfig::default();
513 let result = compile_with_state(src, &fns, &state, config);
514 assert!(
515 result.is_err(),
516 "Expected compilation to fail when alias_sources is used without enable_system_functions"
517 );
518
519 let diagnostics = result.err().unwrap();
520 let err = Formatter::new(src, diagnostics).to_string();
521 assert!(
522 err.contains("alias_sources is disabled"),
523 "Expected error about alias_sources being disabled, got: {err}"
524 );
525 }
526}