// vrl/stdlib/parse_apache_log.rs

1use super::log_util;
2use crate::compiler::function::EnumVariant;
3use crate::compiler::prelude::*;
4use crate::value;
5use std::collections::BTreeMap;
6use std::sync::LazyLock;
7
8static DEFAULT_TIMESTAMP_FORMAT: LazyLock<Value> =
9    LazyLock::new(|| Value::Bytes(Bytes::from("%d/%b/%Y:%T %z")));
10
/// Documented values accepted by the `format` parameter, each with a short
/// human-readable description. Attached to the parameter via `enum_variants`.
static FORMAT_ENUM: &[EnumVariant] = &[
    EnumVariant {
        value: "common",
        description: "Common format",
    },
    EnumVariant {
        value: "combined",
        description: "Apache combined format",
    },
    EnumVariant {
        value: "error",
        description: "Default Apache error format",
    },
];
25
26static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
27    vec![
28        Parameter::required("value", kind::BYTES, "The string to parse."),
29        Parameter::required("format", kind::BYTES, "The format to use for parsing the log.")
30            .enum_variants(FORMAT_ENUM),
31        Parameter::optional("timestamp_format", kind::BYTES, "The [date/time format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) to use for
32encoding the timestamp. The time is parsed in local time if the timestamp does not specify a timezone.")
33            .default(&DEFAULT_TIMESTAMP_FORMAT),
34    ]
35});
36
37fn parse_apache_log(
38    bytes: &Value,
39    timestamp_format: &Value,
40    format: &Bytes,
41    ctx: &Context,
42) -> Resolved {
43    let message = bytes.try_bytes_utf8_lossy()?;
44    let timestamp_format = timestamp_format.try_bytes_utf8_lossy()?.to_string();
45    let regexes = match format.as_ref() {
46        b"common" => &*log_util::REGEX_APACHE_COMMON_LOG,
47        b"combined" => &*log_util::REGEX_APACHE_COMBINED_LOG,
48        b"error" => &*log_util::REGEX_APACHE_ERROR_LOG,
49        _ => unreachable!(),
50    };
51
52    log_util::parse_message(
53        regexes,
54        &message,
55        &timestamp_format,
56        *ctx.timezone(),
57        std::str::from_utf8(format.as_ref()).unwrap(),
58    )
59    .map_err(Into::into)
60}
61
62fn variants() -> Vec<Value> {
63    vec![value!("common"), value!("combined"), value!("error")]
64}
65
/// Function definition for `parse_apache_log`: carries no state and exists to
/// provide the `Function` implementation (metadata, parameters, compilation).
#[derive(Clone, Copy, Debug)]
pub struct ParseApacheLog;
68
69impl Function for ParseApacheLog {
70    fn identifier(&self) -> &'static str {
71        "parse_apache_log"
72    }
73
74    fn usage(&self) -> &'static str {
75        indoc! {"
76            Parses Apache access and error log lines. Lines can be in [`common`](https://httpd.apache.org/docs/current/logs.html#common),
77            [`combined`](https://httpd.apache.org/docs/current/logs.html#combined), or the default [`error`](https://httpd.apache.org/docs/current/logs.html#errorlog) format.
78        "}
79    }
80
81    fn category(&self) -> &'static str {
82        Category::Parse.as_ref()
83    }
84
85    fn internal_failure_reasons(&self) -> &'static [&'static str] {
86        &[
87            "`value` does not match the specified format.",
88            "`timestamp_format` is not a valid format string.",
89            "The timestamp in `value` fails to parse using the provided `timestamp_format`.",
90        ]
91    }
92
93    fn return_kind(&self) -> u16 {
94        kind::OBJECT
95    }
96
97    fn notices(&self) -> &'static [&'static str] {
98        &[
99            "Missing information in the log message may be indicated by `-`. These fields are omitted in the result.",
100        ]
101    }
102
103    fn parameters(&self) -> &'static [Parameter] {
104        PARAMETERS.as_slice()
105    }
106
107    fn compile(
108        &self,
109        state: &state::TypeState,
110        _ctx: &mut FunctionCompileContext,
111        arguments: ArgumentList,
112    ) -> Compiled {
113        let value = arguments.required("value");
114        let format = arguments
115            .required_enum("format", &variants(), state)?
116            .try_bytes()
117            .expect("format not bytes");
118
119        let timestamp_format = arguments.optional("timestamp_format");
120
121        Ok(ParseApacheLogFn {
122            value,
123            format,
124            timestamp_format,
125        }
126        .as_expr())
127    }
128
129    fn examples(&self) -> &'static [Example] {
130        &[
131            example! {
132                title: "Parse using Apache log format (common)",
133                source: r#"parse_apache_log!(s'127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326', format: "common")"#,
134                result: Ok(indoc!{
135                    r#"
136                        {
137                          "host": "127.0.0.1",
138                          "identity": "bob",
139                          "message": "GET /apache_pb.gif HTTP/1.0",
140                          "method": "GET",
141                          "path": "/apache_pb.gif",
142                          "protocol": "HTTP/1.0",
143                          "size": 2326,
144                          "status": 200,
145                          "timestamp": "2000-10-10T20:55:36Z",
146                          "user": "frank"
147                        }
148                    "#,
149                }),
150            },
151            example! {
152                title: "Parse using Apache log format (combined)",
153                source: indoc! {r#"
154                    parse_apache_log!(
155                        s'127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth" "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0"',
156                        "combined",
157                    )
158                "#},
159                result: Ok(indoc!{
160                    r#"
161                        {
162                          "agent": "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0",
163                          "host": "127.0.0.1",
164                          "identity": "bob",
165                          "message": "GET /apache_pb.gif HTTP/1.0",
166                          "method": "GET",
167                          "path": "/apache_pb.gif",
168                          "protocol": "HTTP/1.0",
169                          "referrer": "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth",
170                          "size": 2326,
171                          "status": 200,
172                          "timestamp": "2000-10-10T20:55:36Z",
173                          "user": "frank"
174                        }
175                    "#,
176                }),
177            },
178            example! {
179                title: "Parse using Apache log format (error)",
180                source: indoc! {r#"
181                    parse_apache_log!(
182                        s'[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client 147.159.108.175:24259] I will bypass the haptic COM bandwidth, that should matrix the CSS driver!',
183                        "error"
184                    )
185                "#},
186                result: Ok(indoc!{
187                    r#"
188                        {
189                          "client": "147.159.108.175",
190                          "message": "I will bypass the haptic COM bandwidth, that should matrix the CSS driver!",
191                          "module": "ab",
192                          "pid": 4803,
193                          "port": 24259,
194                          "severity": "alert",
195                          "thread": "3814",
196                          "timestamp": "2021-03-01T12:00:19Z"
197                        }
198                    "#,
199                }),
200            },
201        ]
202    }
203}
204
/// Compiled form of a `parse_apache_log` call.
#[derive(Debug, Clone)]
struct ParseApacheLogFn {
    /// Expression producing the log line to parse.
    value: Box<dyn Expression>,
    /// Log format name, already validated against the allowed variants at
    /// compile time.
    format: Bytes,
    /// Optional expression producing the timestamp format; when absent,
    /// `DEFAULT_TIMESTAMP_FORMAT` is used at resolve time.
    timestamp_format: Option<Box<dyn Expression>>,
}
211
212impl FunctionExpression for ParseApacheLogFn {
213    fn resolve(&self, ctx: &mut Context) -> Resolved {
214        let bytes = self.value.resolve(ctx)?;
215        let timestamp_format = self
216            .timestamp_format
217            .map_resolve_with_default(ctx, || DEFAULT_TIMESTAMP_FORMAT.clone())?;
218
219        parse_apache_log(&bytes, &timestamp_format, &self.format, ctx)
220    }
221
222    fn type_def(&self, _: &state::TypeState) -> TypeDef {
223        TypeDef::object(match self.format.as_ref() {
224            b"common" => kind_common(),
225            b"combined" => kind_combined(),
226            b"error" => kind_error(),
227            _ => unreachable!(),
228        })
229        .fallible()
230    }
231}
232
233fn kind_common() -> BTreeMap<Field, Kind> {
234    BTreeMap::from([
235        (Field::from("host"), Kind::bytes() | Kind::null()),
236        (Field::from("identity"), Kind::bytes() | Kind::null()),
237        (Field::from("user"), Kind::bytes() | Kind::null()),
238        (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
239        (Field::from("message"), Kind::bytes() | Kind::null()),
240        (Field::from("method"), Kind::bytes() | Kind::null()),
241        (Field::from("path"), Kind::bytes() | Kind::null()),
242        (Field::from("protocol"), Kind::bytes() | Kind::null()),
243        (Field::from("status"), Kind::integer() | Kind::null()),
244        (Field::from("size"), Kind::integer() | Kind::null()),
245    ])
246}
247
248fn kind_combined() -> BTreeMap<Field, Kind> {
249    BTreeMap::from([
250        (Field::from("host"), Kind::bytes() | Kind::null()),
251        (Field::from("identity"), Kind::bytes() | Kind::null()),
252        (Field::from("user"), Kind::bytes() | Kind::null()),
253        (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
254        (Field::from("message"), Kind::bytes() | Kind::null()),
255        (Field::from("method"), Kind::bytes() | Kind::null()),
256        (Field::from("path"), Kind::bytes() | Kind::null()),
257        (Field::from("protocol"), Kind::bytes() | Kind::null()),
258        (Field::from("status"), Kind::integer() | Kind::null()),
259        (Field::from("size"), Kind::integer() | Kind::null()),
260        (Field::from("referrer"), Kind::bytes() | Kind::null()),
261        (Field::from("agent"), Kind::bytes() | Kind::null()),
262    ])
263}
264
265fn kind_error() -> BTreeMap<Field, Kind> {
266    BTreeMap::from([
267        (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
268        (Field::from("module"), Kind::bytes() | Kind::null()),
269        (Field::from("severity"), Kind::bytes() | Kind::null()),
270        (Field::from("thread"), Kind::bytes() | Kind::null()),
271        (Field::from("port"), Kind::bytes() | Kind::null()),
272        (Field::from("message"), Kind::bytes() | Kind::null()),
273    ])
274}
275
// Table-driven tests for `parse_apache_log`. Each case lists the call
// arguments, the expected result (or error message), the expected type
// definition, and the timezone the call is resolved with.
#[cfg(test)]
mod tests {
    use crate::compiler::TimeZone;
    use chrono::DateTime;
    use chrono::TimeZone as ChronoTimezone;
    use chrono::prelude::*;

    use super::*;
    use crate::btreemap;
    use chrono::Utc;

    test_function![
        parse_common_log => ParseApacheLog;

        // Fully populated `common` line; the -0700 offset is normalised to UTC.
        common_line_valid {
            args: func_args![value: r#"127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326"#,
                             format: "common"
            ],
            want: Ok(btreemap! {
                "host" => "127.0.0.1",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2000-10-10T20:55:36Z").unwrap().into()),
                "message" => "GET /apache_pb.gif HTTP/1.0",
                "method" => "GET",
                "path" => "/apache_pb.gif",
                "protocol" => "HTTP/1.0",
                "status" => 200,
                "size" => 2326,
            }),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // Fully populated `combined` line, including referrer and user agent.
        combined_line_valid {
            args: func_args![value: r#"224.92.49.50 bob frank [25/Feb/2021:12:44:08 +0000] "PATCH /one-to-one HTTP/1.1" 401 84170 "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth" "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0""#,
                             format: "combined"
                             ],
            want: Ok(btreemap! {
                "host" => "224.92.49.50",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-02-25T12:44:08Z").unwrap().into()),
                "message" => "PATCH /one-to-one HTTP/1.1",
                "method" => "PATCH",
                "path" => "/one-to-one",
                "protocol" => "HTTP/1.1",
                "status" => 401,
                "size" => 84170,
                "referrer" => "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth",
                "agent" => "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0",
            }),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }

        // Referrer and agent given as `-` are left out of the result.
        combined_line_missing_fields_valid {
            args: func_args![value: r#"224.92.49.50 bob frank [25/Feb/2021:12:44:08 +0000] "PATCH /one-to-one HTTP/1.1" 401 84170 - -"#,
                             format: "combined"
                             ],
            want: Ok(btreemap! {
                "host" => "224.92.49.50",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-02-25T12:44:08Z").unwrap().into()),
                "message" => "PATCH /one-to-one HTTP/1.1",
                "method" => "PATCH",
                "path" => "/one-to-one",
                "protocol" => "HTTP/1.1",
                "status" => 401,
                "size" => 84170,
            }),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }

        // Error line with pid, tid and an IPv4 client address with port.
        error_line_valid {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client 147.159.108.175:24259] I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message" => "I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                "module" => "ab",
                "severity" => "alert",
                "pid" => 4803,
                "thread" => "3814",
                "client" => "147.159.108.175",
                "port" => 24259
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // IPv6 client: the last colon-separated group is taken as the port.
        error_line_ip_v6 {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client eda7:35d:3ceb:ef1e:2133:e7bf:116e:24cc:24259] I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message" => "I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                "module" => "ab",
                "severity" => "alert",
                "pid" => 4803,
                "thread" => "3814",
                "client" => "eda7:35d:3ceb:ef1e:2133:e7bf:116e:24cc",
                "port" => 24259
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // Error line with a pid but no tid, parsed with a custom timestamp
        // format; no `thread` field is expected in the result.
        error_line_thread_id {
            args: func_args![
                value: r"[2021-06-04 15:40:27.138633] [php7:emerg] [pid 4803] [client 95.223.77.60:35106] PHP Parse error:  syntax error, unexpected \'->\' (T_OBJECT_OPERATOR) in /var/www/prod/releases/master-c7225365fd9faa26262cffeeb57b31bd7448c94a/source/index.php on line 14",
                timestamp_format: "%Y-%m-%d %H:%M:%S.%f",
                format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-06-04T15:40:27.000138633Z").unwrap().into()),
                "message" => "PHP Parse error:  syntax error, unexpected \\\'->\\\' (T_OBJECT_OPERATOR) in /var/www/prod/releases/master-c7225365fd9faa26262cffeeb57b31bd7448c94a/source/index.php on line 14",
                "module" => "php7",
                "severity" => "emerg",
                "pid" => 4803,
                "client" => "95.223.77.60",
                "port" => 35106

            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::Named(chrono_tz::Tz::UTC),
        }

        // Error line without a `[client ...]` section: the message is returned
        // split into `message1` and `message2`.
        error_line_threaded_mpms_valid {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [proxy:error] [pid 23964] (113)No route to host: AH00957: HTTP: attempt to connect to 10.1.0.244:9000 (hostname.domain.com) failed",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message1" => "(113)No route to host: AH00957: ",
                "message2" => "HTTP: attempt to connect to 10.1.0.244:9000 (hostname.domain.com) failed",
                "module" => "proxy",
                "severity" => "error",
                "pid" => 23964,
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // A line consisting only of `-` placeholders parses to an empty object.
        log_line_valid_empty {
            args: func_args![value: "- - - - - - -",
                             format: "common",
            ],
            want: Ok(BTreeMap::new()),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // Same as above, with the placeholders wrapped in `[...]` and quotes.
        log_line_valid_empty_variant {
            args: func_args![value: r#"- - - [-] "-" - -"#,
                             format: "common",
            ],
            want: Ok(BTreeMap::new()),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

       // Timestamp rendered in the machine's local zone without an offset;
       // with the default timezone it must round-trip to the same UTC instant.
       log_line_valid_with_local_timestamp_format {
            args: func_args![value: format!("[{}] - - - -",
                                            Utc.with_ymd_and_hms(2000, 10, 10, 20, 55, 36).unwrap()
                                              .with_timezone(&Local)
                                              .format("%a %b %d %H:%M:%S %Y")
                                            ),
                             timestamp_format: "%a %b %d %H:%M:%S %Y",
                             format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2000-10-10T20:55:36Z").unwrap().into()),
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }


        // Timestamp without an offset is interpreted in the supplied timezone:
        // 09:30:50 in Europe/Paris is expected as 07:30:50Z.
        log_line_valid_with_timezone {
            args: func_args![
                value: "[2021/06/03 09:30:50] - - - -",
                timestamp_format: "%Y/%m/%d %H:%M:%S",
                format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-06-03T07:30:50Z").unwrap().into()),
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::Named(chrono_tz::Europe::Paris),
        }

        // Input that does not match the selected format yields an error.
        log_line_invalid {
            args: func_args![value: "not a common log line",
                             format: "common",
            ],
            want: Err("failed parsing common log line"),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // A timestamp that does not match the default format yields an error
        // naming both the offending text and the format.
        log_line_invalid_timestamp {
            args: func_args![value: "- - - [1234] - - - - - ",
                             format: "combined",
            ],
            want: Err("failed parsing timestamp 1234 using format %d/%b/%Y:%T %z: input contains invalid characters"),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }
    ];
}