vrl/datadog/grok/
parse_grok_rules.rs

1use crate::path::OwnedValuePath;
2use crate::value::{KeyString, Value};
3use std::sync::LazyLock;
4use std::{
5    collections::{BTreeMap, HashMap},
6    convert::TryFrom,
7};
8use tracing::error;
9
10use super::grok::Grok;
11use super::{
12    ast::{self, Destination, GrokPattern},
13    grok_filter::GrokFilter,
14    matchers::{date, date::DateFilter},
15    parse_grok_pattern::parse_grok_pattern,
16};
17
18static GROK_PATTERN_RE: LazyLock<onig::Regex> = LazyLock::new(|| {
19    onig::Regex::new(r#"%\{(?:[^"\}]|(?<!\\)"(?:\\"|[^"])*(?<!\\)")+\}"#).unwrap()
20});
21
22/// The result of parsing a grok rule with a final regular expression and the
23/// related field information, needed at runtime.
24#[derive(Clone, Debug)]
25pub struct GrokRule {
26    /// a compiled regex pattern
27    pub pattern: super::grok::Pattern,
28    /// a map of capture names(grok0, grok1, ...) to field information.
29    pub fields: HashMap<String, GrokField>,
30}
31
32/// A grok field, that should be extracted, with its lookup path and
33/// post-processing filters to apply.
34#[derive(Debug, Clone)]
35pub struct GrokField {
36    pub lookup: OwnedValuePath,
37    pub filters: Vec<GrokFilter>,
38}
39
40/// The context used to parse grok rules.
41#[derive(Debug, Clone)]
42pub struct GrokRuleParseContext {
43    /// a currently built regular expression
44    pub regex: String,
45    /// a map of capture names(grok0, grok1, ...) to field information.
46    pub fields: HashMap<String, GrokField>,
47    /// aliases and their definitions
48    pub aliases: BTreeMap<KeyString, String>,
49    /// used to detect cycles in alias definitions
50    pub alias_stack: Vec<String>,
51}
52
53impl GrokRuleParseContext {
54    /// appends to the rule's regular expression
55    fn append_regex(&mut self, regex: &str) {
56        self.regex.push_str(regex);
57    }
58
59    /// registers a given grok field under a given grok name(used in a regex)
60    fn register_grok_field(&mut self, grok_name: &str, field: GrokField) {
61        self.fields.insert(grok_name.to_string(), field);
62    }
63
64    /// adds a filter to a field, associated with this grok alias
65    fn register_filter(&mut self, grok_name: &str, filter: GrokFilter) {
66        self.fields
67            .entry(grok_name.to_string())
68            .and_modify(|v| v.filters.insert(0, filter));
69    }
70
71    fn new(aliases: BTreeMap<KeyString, String>) -> Self {
72        Self {
73            regex: String::new(),
74            fields: HashMap::new(),
75            aliases,
76            alias_stack: vec![],
77        }
78    }
79
80    /// Generates a grok-safe name for a given field(grok0, grok1 ...)
81    fn generate_grok_compliant_name(&mut self) -> String {
82        format!("grok{}", self.fields.len())
83    }
84}
85
86#[derive(thiserror::Error, Debug, PartialEq, Eq)]
87pub enum Error {
88    #[error("failed to parse grok expression '{}': {}", .0, .1)]
89    InvalidGrokExpression(String, String),
90    #[error("invalid arguments for the function '{}'", .0)]
91    InvalidFunctionArguments(String),
92    #[error("unknown filter '{}'", .0)]
93    UnknownFilter(String),
94    #[error("Circular dependency found in the alias '{}'", .0)]
95    CircularDependencyInAliasDefinition(String),
96}
97
98///
99/// Parses DD grok rules.
100///
101/// Here is an example:
102/// patterns:
103///  %{access.common} \[%{_date_access}\] "(?>%{_method} |)%{_url}(?> %{_version}|)" %{_status_code} (?>%{_bytes_written}|-)
104///  %{access.common} (%{number:duration:scale(1000000000)} )?"%{_referer}" "%{_user_agent}"( "%{_x_forwarded_for}")?.*"#
105/// aliases:
106///  "access.common" : %{_client_ip} %{_ident} %{_auth}
107///
108/// You can write grok patterns with the %{MATCHER:EXTRACT:FILTER} syntax:
109/// - Matcher: A rule (possibly a reference to another token rule) that describes what to expect (number, word, notSpace, etc.)
110/// - Extract (optional): An identifier representing the capture destination for the piece of text matched by the Matcher.
111/// - Filter (optional): A post-processor of the match to transform it.
112///
113/// Rules can reference aliases as %{alias_name}, aliases can reference each other themselves, cross-references or circular dependencies are not allowed and result in an error.
114/// Only one can match any given log. The first one that matches, from top to bottom, is the one that does the parsing.
115/// For further documentation and the full list of available matcher and filters check out <https://docs.datadoghq.com/logs/processing/parsing>
116pub fn parse_grok_rules(
117    patterns: &[String],
118    aliases: BTreeMap<KeyString, String>,
119) -> Result<Vec<GrokRule>, Error> {
120    let mut grok = Grok::with_patterns();
121
122    patterns
123        .iter()
124        .filter(|&r| !r.is_empty())
125        .map(|r| {
126            parse_pattern(
127                r,
128                &mut GrokRuleParseContext::new(aliases.clone()),
129                &mut grok,
130            )
131        })
132        .collect::<Result<Vec<GrokRule>, Error>>()
133}
134
135///
136/// Parses alias definitions.
137///
138/// # Arguments
139///
140/// - `name` - the name of the alias
141/// - `definition` - the definition of the alias
142/// - `context` - the context required to parse the current grok rule
143fn parse_alias(
144    name: &str,
145    definition: &str,
146    context: &mut GrokRuleParseContext,
147) -> Result<(), Error> {
148    // track circular dependencies
149    if context.alias_stack.iter().any(|a| a == name) {
150        return Err(Error::CircularDependencyInAliasDefinition(
151            context.alias_stack.first().unwrap().to_string(),
152        ));
153    } else {
154        context.alias_stack.push(name.to_string());
155    }
156
157    parse_grok_rule(definition, context)?;
158
159    context.alias_stack.pop();
160
161    Ok(())
162}
163
164///
165/// Parses pattern definitions.
166///
167/// # Arguments
168///
169/// - `pattern` - the definition of the pattern
170/// - `context` - the context required to parse the current grok rule
171/// - `grok` - an instance of Grok parser
172fn parse_pattern(
173    pattern: &str,
174    context: &mut GrokRuleParseContext,
175    grok: &mut Grok,
176) -> Result<GrokRule, Error> {
177    parse_grok_rule(pattern, context)?;
178    let pattern = [
179        // In Oniguruma the (?m) modifier is used to enable the DOTALL mode(dot includes newlines),
180        // as opposed to the (?s) modifier in other regex flavors.
181        // \A, \z - parses from the beginning to the end of string, not line(until \n)
182        r"(?m)\A", // (?m) enables the DOTALL mode by default
183        &context
184            .regex
185            .replace("(?s)", "(?m)")
186            .replace("(?-s)", "(?-m)"),
187        r"\z",
188    ]
189    .concat();
190
191    // compile pattern
192    let pattern = grok
193        .compile(&pattern, true)
194        .map_err(|e| Error::InvalidGrokExpression(pattern, e.to_string()))?;
195
196    Ok(GrokRule {
197        pattern,
198        fields: context.fields.clone(),
199    })
200}
201
202/// Parses a given rule to a pure grok pattern with a set of post-processing filters.
203///
204/// # Arguments
205///
206/// - `rule` - the definition of a grok rule(can be a pattern or an alias)
207/// - `aliases` - all aliases and their definitions
208/// - `context` - the context required to parse the current grok rule
209fn parse_grok_rule(rule: &str, context: &mut GrokRuleParseContext) -> Result<(), Error> {
210    let mut regex_i = 0;
211    for (start, end) in GROK_PATTERN_RE.find_iter(rule) {
212        context.append_regex(&rule[regex_i..start]);
213        regex_i = end;
214        let pattern = parse_grok_pattern(&rule[start..end])
215            .map_err(|e| Error::InvalidGrokExpression(rule[start..end].to_string(), e))?;
216        resolve_grok_pattern(&pattern, context)?;
217    }
218    context.append_regex(&rule[regex_i..]);
219
220    Ok(())
221}
222
223/// Converts each rule to a pure grok rule:
224///  - strips filters and collects them to apply later
225///  - replaces references to aliases with their definitions
226///  - replaces match functions with corresponding regex groups.
227///
228/// # Arguments
229///
230/// - `pattern` - a parsed grok pattern
231/// - `context` - the context required to parse the current grok rule
232fn resolve_grok_pattern(
233    pattern: &GrokPattern,
234    context: &mut GrokRuleParseContext,
235) -> Result<(), Error> {
236    let grok_alias = pattern
237        .destination
238        .as_ref()
239        .map(|_| context.generate_grok_compliant_name());
240    match pattern {
241        GrokPattern {
242            destination:
243                Some(Destination {
244                    path,
245                    filter_fn: Some(filter),
246                }),
247            ..
248        } => {
249            context.register_grok_field(
250                grok_alias.as_ref().expect("grok alias is not defined"),
251                GrokField {
252                    lookup: path.clone(),
253                    filters: vec![GrokFilter::try_from(filter)?],
254                },
255            );
256        }
257        GrokPattern {
258            destination:
259                Some(Destination {
260                    path,
261                    filter_fn: None,
262                }),
263            ..
264        } => {
265            context.register_grok_field(
266                grok_alias.as_ref().expect("grok alias is not defined"),
267                GrokField {
268                    lookup: path.clone(),
269                    filters: vec![],
270                },
271            );
272        }
273        _ => {}
274    }
275
276    let match_name = &pattern.match_fn.name;
277    match context.aliases.get(match_name.as_str()).cloned() {
278        Some(alias_def) => match &grok_alias {
279            Some(grok_alias) => {
280                context.append_regex("(?<");
281                context.append_regex(grok_alias);
282                context.append_regex(">");
283                parse_alias(match_name, &alias_def, context)?;
284                context.append_regex(")");
285            }
286            None => {
287                parse_alias(match_name, &alias_def, context)?;
288            }
289        },
290        None if match_name == "regex" || match_name == "date" || match_name == "boolean" => {
291            // these patterns will be converted to named capture groups e.g. (?<http.status_code>[0-9]{3})
292            match &grok_alias {
293                Some(grok_alias) => {
294                    context.append_regex("(?<");
295                    context.append_regex(grok_alias);
296                    context.append_regex(">");
297                }
298                None => {
299                    context.append_regex("(?:"); // non-capturing group
300                }
301            }
302            resolves_match_function(grok_alias, pattern, context)?;
303            context.append_regex(")");
304        }
305        None => {
306            // these will be converted to "pure" grok patterns %{PATTERN:DESTINATION} but without filters
307            context.append_regex("%{");
308            resolves_match_function(grok_alias.clone(), pattern, context)?;
309
310            if let Some(grok_alias) = &grok_alias {
311                context.append_regex(&format!(":{grok_alias}"));
312            }
313            context.append_regex("}");
314        }
315    }
316
317    Ok(())
318}
319
320/// Process a match function from a given pattern:
321/// - returns a grok expression(a grok pattern or a regular expression) corresponding to a given match function
322/// - some match functions(e.g. number) implicitly introduce a filter to be applied to an extracted value - stores it to `fields`.
323fn resolves_match_function(
324    grok_alias: Option<String>,
325    pattern: &ast::GrokPattern,
326    context: &mut GrokRuleParseContext,
327) -> Result<(), Error> {
328    let match_fn = &pattern.match_fn;
329    match match_fn.name.as_ref() {
330        "regex" => match match_fn.args.as_ref() {
331            Some(args) if !args.is_empty() => {
332                if let ast::FunctionArgument::Arg(Value::Bytes(ref b)) = args[0] {
333                    context.append_regex(&String::from_utf8_lossy(b));
334                    return Ok(());
335                }
336                Err(Error::InvalidFunctionArguments(match_fn.name.clone()))
337            }
338            _ => Err(Error::InvalidFunctionArguments(match_fn.name.clone())),
339        },
340        "integer" => {
341            if let Some(grok_alias) = &grok_alias {
342                context.register_filter(grok_alias, GrokFilter::Integer);
343            }
344            context.append_regex("integerStr");
345            Ok(())
346        }
347        "integerExt" => {
348            if let Some(grok_alias) = &grok_alias {
349                context.register_filter(grok_alias, GrokFilter::IntegerExt);
350            }
351            context.append_regex("integerExtStr");
352            Ok(())
353        }
354        "number" => {
355            if let Some(grok_alias) = &grok_alias {
356                context.register_filter(grok_alias, GrokFilter::Number);
357            }
358            context.append_regex("numberStr");
359            Ok(())
360        }
361        "numberExt" => {
362            if let Some(grok_alias) = &grok_alias {
363                context.register_filter(grok_alias, GrokFilter::NumberExt);
364            }
365            context.append_regex("numberExtStr");
366            Ok(())
367        }
368        "date" => {
369            match match_fn.args.as_ref() {
370                Some(args) if !args.is_empty() && args.len() <= 2 => {
371                    if let ast::FunctionArgument::Arg(Value::Bytes(b)) = &args[0] {
372                        let format = String::from_utf8_lossy(b);
373                        // get regex with captures, so that we can extract timezone and fraction char in the filter
374                        let result = date::time_format_to_regex(&format, true)
375                            .map_err(|_e| Error::InvalidFunctionArguments(match_fn.name.clone()))?;
376                        let filter_re = regex::Regex::new(&result.regex).map_err(|error| {
377                            error!(message = "Error compiling regex", regex = %result.regex, %error);
378                            Error::InvalidFunctionArguments(match_fn.name.clone())
379                        })?;
380
381                        let strp_format = date::convert_time_format(&format).map_err(|error| {
382                            error!(message = "Error compiling regex", regex = %result.regex, %error);
383                            Error::InvalidFunctionArguments(match_fn.name.clone())
384                        })?;
385                        let mut target_tz = None;
386                        if args.len() == 2
387                            && let ast::FunctionArgument::Arg(Value::Bytes(b)) = &args[1]
388                        {
389                            let tz = String::from_utf8_lossy(b);
390                            date::parse_timezone(&tz).map_err(|error| {
391                                error!(message = "Invalid(unrecognized) timezone", %error);
392                                Error::InvalidFunctionArguments(match_fn.name.clone())
393                            })?;
394                            target_tz = Some(tz.to_string());
395                        }
396                        let filter = GrokFilter::Date(DateFilter {
397                            original_format: format.to_string(),
398                            strp_format,
399                            regex: filter_re,
400                            target_tz,
401                            tz_aware: result.with_tz,
402                            with_tz_capture: result.with_tz_capture,
403                            with_fraction_second: result.with_fraction_second,
404                        });
405                        // get the regex without captures, so that we can append it to the grok pattern
406                        let grok_re = date::time_format_to_regex(&format, false)
407                            .map_err(|error| {
408                                error!(message = "Invalid time format", format = %format, %error);
409                                Error::InvalidFunctionArguments(match_fn.name.clone())
410                            })?
411                            .regex;
412                        if let Some(grok_alias) = &grok_alias {
413                            context.register_filter(grok_alias, filter);
414                        }
415                        context.append_regex(&grok_re);
416                        return Ok(());
417                    }
418                    Err(Error::InvalidFunctionArguments(match_fn.name.clone()))
419                }
420                _ => Err(Error::InvalidFunctionArguments(match_fn.name.clone())),
421            }
422        }
423        // otherwise just add it as is, it should be a known grok pattern
424        grok_pattern_name => {
425            context.append_regex(grok_pattern_name);
426            Ok(())
427        }
428    }
429}
430
// only some tricky cases are covered here, more high-level tests are in parse_grok
#[cfg(test)]
mod tests {
    use super::*;

    /// A quoted filter argument may itself contain escaped double quotes.
    #[test]
    fn supports_escaped_quotes() {
        let rule = r#"%{notSpace:field:nullIf("with \"escaped\" quotes")}"#.to_string();
        let rules = parse_grok_rules(&[rule], BTreeMap::new()).expect("couldn't parse rules");

        let (_, field) = rules[0]
            .fields
            .iter()
            .next()
            .expect("invalid grok pattern");

        assert!(matches!(
            &field.filters[0],
            GrokFilter::NullIf(v) if *v == r#"with "escaped" quotes"#
        ));
    }
}