1use crate::compiler::prelude::*;
2use regex::Regex;
3
4use super::util;
5use std::sync::LazyLock;
6
7static DEFAULT_NUMERIC_GROUPS: LazyLock<Value> = LazyLock::new(|| Value::Boolean(false));
8
9static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
10 vec![
11 Parameter::required("value", kind::BYTES, "The string to search."),
12 Parameter::required(
13 "pattern",
14 kind::REGEX,
15 "The regular expression pattern to search against.",
16 ),
17 Parameter::optional(
18 "numeric_groups",
19 kind::BOOLEAN,
20 "If true, the index of each group in the regular expression is also captured. Index `0`
21contains the whole match.",
22 )
23 .default(&DEFAULT_NUMERIC_GROUPS),
24 ]
25});
26
27fn parse_regex(value: &Value, numeric_groups: bool, pattern: &Regex) -> Resolved {
28 let value = value.try_bytes_utf8_lossy()?;
29 let parsed = pattern
30 .captures(&value)
31 .map(|capture| util::capture_regex_to_map(pattern, &capture, numeric_groups))
32 .ok_or("could not find any pattern matches")?;
33 Ok(parsed.into())
34}
35
/// Marker type implementing the `parse_regex` function.
#[derive(Debug, Clone, Copy)]
pub struct ParseRegex;
38
39impl Function for ParseRegex {
40 fn identifier(&self) -> &'static str {
41 "parse_regex"
42 }
43
44 fn usage(&self) -> &'static str {
45 indoc! {"
46 Parses the `value` using the provided [Regex](https://en.wikipedia.org/wiki/Regular_expression) `pattern`.
47
48 This function differs from the `parse_regex_all` function in that it returns only the first match.
49 "}
50 }
51
52 fn category(&self) -> &'static str {
53 Category::Parse.as_ref()
54 }
55
56 fn internal_failure_reasons(&self) -> &'static [&'static str] {
57 &["`value` fails to parse using the provided `pattern`."]
58 }
59
60 fn return_kind(&self) -> u16 {
61 kind::OBJECT
62 }
63
64 fn return_rules(&self) -> &'static [&'static str] {
65 &[
66 "Matches return all capture groups corresponding to the leftmost matches in the text.",
67 "Raises an error if no match is found.",
68 ]
69 }
70
71 fn notices(&self) -> &'static [&'static str] {
72 &[
73 indoc! {"
74 VRL aims to provide purpose-specific [parsing functions](/docs/reference/vrl/functions/#parse-functions)
75 for common log formats. Before reaching for the `parse_regex` function, see if a VRL
76 [`parse_*` function](/docs/reference/vrl/functions/#parse-functions) already exists
77 for your format. If not, we recommend
78 [opening an issue](https://github.com/vectordotdev/vector/issues/new?labels=type%3A+new+feature)
79 to request support for the desired format.
80 "},
81 indoc! {"
82 All values are returned as strings. We recommend manually coercing values to desired
83 types as you see fit.
84 "},
85 ]
86 }
87
88 fn parameters(&self) -> &'static [Parameter] {
89 PARAMETERS.as_slice()
90 }
91
92 fn compile(
93 &self,
94 state: &state::TypeState,
95 _ctx: &mut FunctionCompileContext,
96 arguments: ArgumentList,
97 ) -> Compiled {
98 let value = arguments.required("value");
99 let pattern = arguments.required_regex("pattern", state)?;
100 let numeric_groups = arguments.optional("numeric_groups");
101
102 Ok(ParseRegexFn {
103 value,
104 pattern,
105 numeric_groups,
106 }
107 .as_expr())
108 }
109
110 fn examples(&self) -> &'static [Example] {
111 &[
112 example! {
113 title: "Parse using Regex (with capture groups)",
114 source: r#"parse_regex!("first group and second group.", r'(?P<number>.*?) group')"#,
115 result: Ok(r#"{"number": "first"}"#),
116 },
117 example! {
118 title: "Parse using Regex (without capture groups)",
119 source: r#"parse_regex!("first group and second group.", r'(\w+) group', numeric_groups: true)"#,
120 result: Ok(indoc! { r#"{
121 "0": "first group",
122 "1": "first"
123 }"# }),
124 },
125 example! {
126 title: "Parse using Regex with simple match",
127 source: r#"parse_regex!("8.7.6.5 - zorp", r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)')"#,
128 result: Ok(indoc! { r#"{
129 "host": "8.7.6.5",
130 "user": "zorp"
131 }"# }),
132 },
133 example! {
134 title: "Parse using Regex with all numeric groups",
135 source: r#"parse_regex!("8.7.6.5 - zorp", r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)', numeric_groups: true)"#,
136 result: Ok(indoc! { r#"{
137 "0": "8.7.6.5 - zorp",
138 "1": "8.7.6.5",
139 "2": "zorp",
140 "host": "8.7.6.5",
141 "user": "zorp"
142 }"# }),
143 },
144 example! {
145 title: "Parse using Regex with variables",
146 source: indoc! {r#"
147 variable = r'^(?P<host>[\w\.]+) - (?P<user>[\w]+)';
148 parse_regex!("8.7.6.5 - zorp", variable)
149 "#},
150 result: Ok(indoc! { r#"{
151 "host": "8.7.6.5",
152 "user": "zorp"
153 }"# }),
154 },
155 ]
156 }
157}
158
159#[derive(Debug, Clone)]
160pub(crate) struct ParseRegexFn {
161 value: Box<dyn Expression>,
162 pattern: Regex,
163 numeric_groups: Option<Box<dyn Expression>>,
164}
165
166impl FunctionExpression for ParseRegexFn {
167 fn resolve(&self, ctx: &mut Context) -> Resolved {
168 let value = self.value.resolve(ctx)?;
169 let numeric_groups = self
170 .numeric_groups
171 .map_resolve_with_default(ctx, || DEFAULT_NUMERIC_GROUPS.clone())?;
172 let pattern = &self.pattern;
173
174 parse_regex(&value, numeric_groups.try_boolean()?, pattern)
175 }
176
177 fn type_def(&self, _: &state::TypeState) -> TypeDef {
178 TypeDef::object(util::regex_kind(&self.pattern)).fallible()
179 }
180}
181
#[cfg(test)]
#[allow(clippy::trivial_regex)]
mod tests {
    use super::*;
    use crate::{btreemap, value};

    /// Access-log pattern shared by the `numeric_groups` and `no_match` cases.
    const ACCESS_LOG_PATTERN: &str = r#"^(?P<host>[\w\.]+) - (?P<user>[\w]+) (?P<bytes_in>[\d]+) \[(?P<timestamp>.*)\] "(?P<method>[\w]+) (?P<path>.*)" (?P<status>[\d]+) (?P<bytes_out>[\d]+)$"#;

    /// Type definition expected for `ACCESS_LOG_PATTERN`: eight named groups plus
    /// the numeric groups `0` through `8`.
    fn access_log_type_def() -> TypeDef {
        TypeDef::object(btreemap! {
            Field::from("host") => Kind::bytes(),
            Field::from("user") => Kind::bytes(),
            Field::from("bytes_in") => Kind::bytes(),
            Field::from("timestamp") => Kind::bytes(),
            Field::from("method") => Kind::bytes(),
            Field::from("path") => Kind::bytes(),
            Field::from("status") => Kind::bytes(),
            Field::from("bytes_out") => Kind::bytes(),
            Field::from("0") => Kind::bytes() | Kind::null(),
            Field::from("1") => Kind::bytes() | Kind::null(),
            Field::from("2") => Kind::bytes() | Kind::null(),
            Field::from("3") => Kind::bytes() | Kind::null(),
            Field::from("4") => Kind::bytes() | Kind::null(),
            Field::from("5") => Kind::bytes() | Kind::null(),
            Field::from("6") => Kind::bytes() | Kind::null(),
            Field::from("7") => Kind::bytes() | Kind::null(),
            Field::from("8") => Kind::bytes() | Kind::null(),
        })
        .fallible()
    }

    test_function![
        find => ParseRegex;

        numeric_groups {
            args: func_args! [
                value: r#"5.86.210.12 - zieme4647 5667 [19/06/2019:17:20:49 -0400] "GET /embrace/supply-chains/dynamic/vertical" 201 20574"#,
                pattern: Regex::new(ACCESS_LOG_PATTERN).unwrap(),
                numeric_groups: true,
            ],
            want: Ok(value!({
                "host": "5.86.210.12",
                "user": "zieme4647",
                "bytes_in": "5667",
                "timestamp": "19/06/2019:17:20:49 -0400",
                "method": "GET",
                "path": "/embrace/supply-chains/dynamic/vertical",
                "status": "201",
                "bytes_out": "20574",
                "0": r#"5.86.210.12 - zieme4647 5667 [19/06/2019:17:20:49 -0400] "GET /embrace/supply-chains/dynamic/vertical" 201 20574"#,
                "1": "5.86.210.12",
                "2": "zieme4647",
                "3": "5667",
                "4": "19/06/2019:17:20:49 -0400",
                "5": "GET",
                "6": "/embrace/supply-chains/dynamic/vertical",
                "7": "201",
                "8": "20574",
            })),
            tdef: access_log_type_def(),
        }

        single_match {
            args: func_args! [
                value: "first group and second group",
                pattern: Regex::new("(?P<number>.*?) group").unwrap()
            ],
            want: Ok(value!({"number": "first"})),
            tdef: TypeDef::object(btreemap! {
                Field::from("number") => Kind::bytes(),
                Field::from("0") => Kind::bytes() | Kind::null(),
                Field::from("1") => Kind::bytes() | Kind::null(),
            }).fallible(),
        }

        no_match {
            args: func_args! [
                value: "I don't match",
                pattern: Regex::new(ACCESS_LOG_PATTERN).unwrap()
            ],
            want: Err("could not find any pattern matches"),
            tdef: access_log_type_def(),
        }
    ];
}