// vrl/stdlib/parse_regex.rs

1use crate::compiler::prelude::*;
2use regex::Regex;
3
4use super::util;
5use std::sync::LazyLock;
6
7static DEFAULT_NUMERIC_GROUPS: LazyLock<Value> = LazyLock::new(|| Value::Boolean(false));
8
9static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
10    vec![
11        Parameter::required("value", kind::BYTES, "The string to search."),
12        Parameter::required(
13            "pattern",
14            kind::REGEX,
15            "The regular expression pattern to search against.",
16        ),
17        Parameter::optional(
18            "numeric_groups",
19            kind::BOOLEAN,
20            "If true, the index of each group in the regular expression is also captured. Index `0`
21contains the whole match.",
22        )
23        .default(&DEFAULT_NUMERIC_GROUPS),
24    ]
25});
26
27fn parse_regex(value: &Value, numeric_groups: bool, pattern: &Regex) -> Resolved {
28    let value = value.try_bytes_utf8_lossy()?;
29    let parsed = pattern
30        .captures(&value)
31        .map(|capture| util::capture_regex_to_map(pattern, &capture, numeric_groups))
32        .ok_or("could not find any pattern matches")?;
33    Ok(parsed.into())
34}
35
/// Marker type for the `parse_regex` function; it carries no state.
#[derive(Debug, Clone, Copy)]
pub struct ParseRegex;
38
39impl Function for ParseRegex {
40    fn identifier(&self) -> &'static str {
41        "parse_regex"
42    }
43
44    fn usage(&self) -> &'static str {
45        indoc! {"
46            Parses the `value` using the provided [Regex](https://en.wikipedia.org/wiki/Regular_expression) `pattern`.
47
48            This function differs from the `parse_regex_all` function in that it returns only the first match.
49        "}
50    }
51
52    fn category(&self) -> &'static str {
53        Category::Parse.as_ref()
54    }
55
56    fn internal_failure_reasons(&self) -> &'static [&'static str] {
57        &["`value` fails to parse using the provided `pattern`."]
58    }
59
60    fn return_kind(&self) -> u16 {
61        kind::OBJECT
62    }
63
64    fn return_rules(&self) -> &'static [&'static str] {
65        &[
66            "Matches return all capture groups corresponding to the leftmost matches in the text.",
67            "Raises an error if no match is found.",
68        ]
69    }
70
71    fn notices(&self) -> &'static [&'static str] {
72        &[
73            indoc! {"
74                VRL aims to provide purpose-specific [parsing functions](/docs/reference/vrl/functions/#parse-functions)
75                for common log formats. Before reaching for the `parse_regex` function, see if a VRL
76                [`parse_*` function](/docs/reference/vrl/functions/#parse-functions) already exists
77                for your format. If not, we recommend
78                [opening an issue](https://github.com/vectordotdev/vector/issues/new?labels=type%3A+new+feature)
79                to request support for the desired format.
80            "},
81            indoc! {"
82                All values are returned as strings. We recommend manually coercing values to desired
83                types as you see fit.
84            "},
85        ]
86    }
87
88    fn parameters(&self) -> &'static [Parameter] {
89        PARAMETERS.as_slice()
90    }
91
92    fn compile(
93        &self,
94        state: &state::TypeState,
95        _ctx: &mut FunctionCompileContext,
96        arguments: ArgumentList,
97    ) -> Compiled {
98        let value = arguments.required("value");
99        let pattern = arguments.required_regex("pattern", state)?;
100        let numeric_groups = arguments.optional("numeric_groups");
101
102        Ok(ParseRegexFn {
103            value,
104            pattern,
105            numeric_groups,
106        }
107        .as_expr())
108    }
109
110    fn examples(&self) -> &'static [Example] {
111        &[
112            example! {
113                title: "Parse using Regex (with capture groups)",
114                source: r#"parse_regex!("first group and second group.", r'(?P<number>.*?) group')"#,
115                result: Ok(r#"{"number": "first"}"#),
116            },
117            example! {
118                title: "Parse using Regex (without capture groups)",
119                source: r#"parse_regex!("first group and second group.", r'(\w+) group', numeric_groups: true)"#,
120                result: Ok(indoc! { r#"{
121                "0": "first group",
122                "1": "first"
123            }"# }),
124            },
125            example! {
126                title: "Parse using Regex with simple match",
127                source: r#"parse_regex!("8.7.6.5 - zorp", r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)')"#,
128                result: Ok(indoc! { r#"{
129                "host": "8.7.6.5",
130                "user": "zorp"
131            }"# }),
132            },
133            example! {
134                title: "Parse using Regex with all numeric groups",
135                source: r#"parse_regex!("8.7.6.5 - zorp", r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)', numeric_groups: true)"#,
136                result: Ok(indoc! { r#"{
137                "0": "8.7.6.5 - zorp",
138                "1": "8.7.6.5",
139                "2": "zorp",
140                "host": "8.7.6.5",
141                "user": "zorp"
142            }"# }),
143            },
144            example! {
145                title: "Parse using Regex with variables",
146                source: indoc! {r#"
147                    variable = r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)';
148                    parse_regex!("8.7.6.5 - zorp", variable)
149                "#},
150                result: Ok(indoc! { r#"{
151                "host": "8.7.6.5",
152                "user": "zorp"
153            }"# }),
154            },
155        ]
156    }
157}
158
159#[derive(Debug, Clone)]
160pub(crate) struct ParseRegexFn {
161    value: Box<dyn Expression>,
162    pattern: Regex,
163    numeric_groups: Option<Box<dyn Expression>>,
164}
165
166impl FunctionExpression for ParseRegexFn {
167    fn resolve(&self, ctx: &mut Context) -> Resolved {
168        let value = self.value.resolve(ctx)?;
169        let numeric_groups = self
170            .numeric_groups
171            .map_resolve_with_default(ctx, || DEFAULT_NUMERIC_GROUPS.clone())?;
172        let pattern = &self.pattern;
173
174        parse_regex(&value, numeric_groups.try_boolean()?, pattern)
175    }
176
177    fn type_def(&self, _: &state::TypeState) -> TypeDef {
178        TypeDef::object(util::regex_kind(&self.pattern)).fallible()
179    }
180}
181
// NOTE(review): the reason for allowing `clippy::trivial_regex` is not visible
// in this module — confirm it is still required.
#[cfg(test)]
#[allow(clippy::trivial_regex)]
mod tests {
    use super::*;
    use crate::{btreemap, value};

    test_function![
        find => ParseRegex;

        // A full access-log line with `numeric_groups` enabled: the result
        // holds every named group plus the numeric keys "0" through "8".
        numeric_groups {
            args: func_args! [
                value: "5.86.210.12 - zieme4647 5667 [19/06/2019:17:20:49 -0400] \"GET /embrace/supply-chains/dynamic/vertical\" 201 20574",
                pattern: Regex::new(r#"^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$"#)
                    .unwrap(),
                numeric_groups: true,
            ],
            want: Ok(value!({
                "bytes_in": "5667",
                "host": "5.86.210.12",
                "user": "zieme4647",
                "timestamp": "19/06/2019:17:20:49 -0400",
                "method": "GET",
                "path": "/embrace/supply-chains/dynamic/vertical",
                "status": "201",
                "bytes_out": "20574",
                "0": "5.86.210.12 - zieme4647 5667 [19/06/2019:17:20:49 -0400] \"GET /embrace/supply-chains/dynamic/vertical\" 201 20574",
                "1": "5.86.210.12",
                "2": "zieme4647",
                "3": "5667",
                "4": "19/06/2019:17:20:49 -0400",
                "5": "GET",
                "6": "/embrace/supply-chains/dynamic/vertical",
                "7": "201",
                "8": "20574",
            })),
            tdef: TypeDef::object(btreemap! {
                Field::from("bytes_in") => Kind::bytes(),
                Field::from("host") => Kind::bytes(),
                Field::from("user") => Kind::bytes(),
                Field::from("timestamp") => Kind::bytes(),
                Field::from("method") => Kind::bytes(),
                Field::from("path") => Kind::bytes(),
                Field::from("status") => Kind::bytes(),
                Field::from("bytes_out") => Kind::bytes(),
                Field::from("0") => Kind::bytes() | Kind::null(),
                Field::from("1") => Kind::bytes() | Kind::null(),
                Field::from("2") => Kind::bytes() | Kind::null(),
                Field::from("3") => Kind::bytes() | Kind::null(),
                Field::from("4") => Kind::bytes() | Kind::null(),
                Field::from("5") => Kind::bytes() | Kind::null(),
                Field::from("6") => Kind::bytes() | Kind::null(),
                Field::from("7") => Kind::bytes() | Kind::null(),
                Field::from("8") => Kind::bytes() | Kind::null(),
            }).fallible(),
        }

        // `numeric_groups` is not passed: `want` holds only the named group,
        // while the expected type definition still lists the keys "0" and "1".
        single_match {
            args: func_args! [
                value: "first group and second group",
                pattern: Regex::new("(?P<number>.*?) group").unwrap()
            ],
            want: Ok(value!({"number": "first"})),
            tdef: TypeDef::object(btreemap! {
                Field::from("number") => Kind::bytes(),
                Field::from("0") => Kind::bytes() | Kind::null(),
                Field::from("1") => Kind::bytes() | Kind::null(),
            }).fallible(),
        }

        // Input that the pattern cannot match yields the "no matches" error;
        // the expected type definition is the same as for a matching input.
        no_match {
            args: func_args! [
                value: "I don't match",
                pattern: Regex::new(r#"^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$"#)
                    .unwrap()
            ],
            want: Err("could not find any pattern matches"),
            tdef: TypeDef::object(btreemap! {
                Field::from("host") => Kind::bytes(),
                Field::from("user") => Kind::bytes(),
                Field::from("bytes_in") => Kind::bytes(),
                Field::from("timestamp") => Kind::bytes(),
                Field::from("method") => Kind::bytes(),
                Field::from("path") => Kind::bytes(),
                Field::from("status") => Kind::bytes(),
                Field::from("bytes_out") => Kind::bytes(),
                Field::from("0") => Kind::bytes() | Kind::null(),
                Field::from("1") => Kind::bytes() | Kind::null(),
                Field::from("2") => Kind::bytes() | Kind::null(),
                Field::from("3") => Kind::bytes() | Kind::null(),
                Field::from("4") => Kind::bytes() | Kind::null(),
                Field::from("5") => Kind::bytes() | Kind::null(),
                Field::from("6") => Kind::bytes() | Kind::null(),
                Field::from("7") => Kind::bytes() | Kind::null(),
                Field::from("8") => Kind::bytes() | Kind::null(),
            }).fallible(),
        }
    ];
}