1use super::log_util;
2use crate::compiler::function::EnumVariant;
3use crate::compiler::prelude::*;
4use crate::value;
5use std::collections::BTreeMap;
6use std::sync::LazyLock;
7
8static DEFAULT_TIMESTAMP_FORMAT: LazyLock<Value> =
9 LazyLock::new(|| Value::Bytes(Bytes::from("%d/%b/%Y:%T %z")));
10
/// The three supported log format names, each paired with a short
/// human-readable description.
///
/// NOTE(review): presumably attached to the `format` parameter definition for
/// documentation purposes — confirm against `PARAMETERS`.
static FORMAT_ENUM: &[EnumVariant] = &[
    EnumVariant {
        value: "common",
        description: "Common format",
    },
    EnumVariant {
        value: "combined",
        description: "Apache combined format",
    },
    EnumVariant {
        value: "error",
        description: "Default Apache error format",
    },
];
25
26static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
27 vec to use for
32encoding the timestamp. The time is parsed in local time if the timestamp does not specify a timezone.")
33 .default(&DEFAULT_TIMESTAMP_FORMAT),
34 ]
35});
36
37fn parse_apache_log(
38 bytes: &Value,
39 timestamp_format: &Value,
40 format: &Bytes,
41 ctx: &Context,
42) -> Resolved {
43 let message = bytes.try_bytes_utf8_lossy()?;
44 let timestamp_format = timestamp_format.try_bytes_utf8_lossy()?.to_string();
45 let regexes = match format.as_ref() {
46 b"common" => &*log_util::REGEX_APACHE_COMMON_LOG,
47 b"combined" => &*log_util::REGEX_APACHE_COMBINED_LOG,
48 b"error" => &*log_util::REGEX_APACHE_ERROR_LOG,
49 _ => unreachable!(),
50 };
51
52 log_util::parse_message(
53 regexes,
54 &message,
55 ×tamp_format,
56 *ctx.timezone(),
57 std::str::from_utf8(format.as_ref()).unwrap(),
58 )
59 .map_err(Into::into)
60}
61
62fn variants() -> Vec<Value> {
63 vec![value!("common"), value!("combined"), value!("error")]
64}
65
/// The `parse_apache_log` function: parses Apache access and error log lines
/// in the `common`, `combined` or `error` format.
#[derive(Clone, Copy, Debug)]
pub struct ParseApacheLog;
68
69impl Function for ParseApacheLog {
70 fn identifier(&self) -> &'static str {
71 "parse_apache_log"
72 }
73
74 fn usage(&self) -> &'static str {
75 indoc! {"
76 Parses Apache access and error log lines. Lines can be in [`common`](https://httpd.apache.org/docs/current/logs.html#common),
77 [`combined`](https://httpd.apache.org/docs/current/logs.html#combined), or the default [`error`](https://httpd.apache.org/docs/current/logs.html#errorlog) format.
78 "}
79 }
80
81 fn category(&self) -> &'static str {
82 Category::Parse.as_ref()
83 }
84
85 fn internal_failure_reasons(&self) -> &'static [&'static str] {
86 &[
87 "`value` does not match the specified format.",
88 "`timestamp_format` is not a valid format string.",
89 "The timestamp in `value` fails to parse using the provided `timestamp_format`.",
90 ]
91 }
92
93 fn return_kind(&self) -> u16 {
94 kind::OBJECT
95 }
96
97 fn notices(&self) -> &'static [&'static str] {
98 &[
99 "Missing information in the log message may be indicated by `-`. These fields are omitted in the result.",
100 ]
101 }
102
103 fn parameters(&self) -> &'static [Parameter] {
104 PARAMETERS.as_slice()
105 }
106
107 fn compile(
108 &self,
109 state: &state::TypeState,
110 _ctx: &mut FunctionCompileContext,
111 arguments: ArgumentList,
112 ) -> Compiled {
113 let value = arguments.required("value");
114 let format = arguments
115 .required_enum("format", &variants(), state)?
116 .try_bytes()
117 .expect("format not bytes");
118
119 let timestamp_format = arguments.optional("timestamp_format");
120
121 Ok(ParseApacheLogFn {
122 value,
123 format,
124 timestamp_format,
125 }
126 .as_expr())
127 }
128
129 fn examples(&self) -> &'static [Example] {
130 &[
131 example! {
132 title: "Parse using Apache log format (common)",
133 source: r#"parse_apache_log!(s'127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326', format: "common")"#,
134 result: Ok(indoc!{
135 r#"
136 {
137 "host": "127.0.0.1",
138 "identity": "bob",
139 "message": "GET /apache_pb.gif HTTP/1.0",
140 "method": "GET",
141 "path": "/apache_pb.gif",
142 "protocol": "HTTP/1.0",
143 "size": 2326,
144 "status": 200,
145 "timestamp": "2000-10-10T20:55:36Z",
146 "user": "frank"
147 }
148 "#,
149 }),
150 },
151 example! {
152 title: "Parse using Apache log format (combined)",
153 source: indoc! {r#"
154 parse_apache_log!(
155 s'127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth" "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0"',
156 "combined",
157 )
158 "#},
159 result: Ok(indoc!{
160 r#"
161 {
162 "agent": "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0",
163 "host": "127.0.0.1",
164 "identity": "bob",
165 "message": "GET /apache_pb.gif HTTP/1.0",
166 "method": "GET",
167 "path": "/apache_pb.gif",
168 "protocol": "HTTP/1.0",
169 "referrer": "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth",
170 "size": 2326,
171 "status": 200,
172 "timestamp": "2000-10-10T20:55:36Z",
173 "user": "frank"
174 }
175 "#,
176 }),
177 },
178 example! {
179 title: "Parse using Apache log format (error)",
180 source: indoc! {r#"
181 parse_apache_log!(
182 s'[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client 147.159.108.175:24259] I will bypass the haptic COM bandwidth, that should matrix the CSS driver!',
183 "error"
184 )
185 "#},
186 result: Ok(indoc!{
187 r#"
188 {
189 "client": "147.159.108.175",
190 "message": "I will bypass the haptic COM bandwidth, that should matrix the CSS driver!",
191 "module": "ab",
192 "pid": 4803,
193 "port": 24259,
194 "severity": "alert",
195 "thread": "3814",
196 "timestamp": "2021-03-01T12:00:19Z"
197 }
198 "#,
199 }),
200 },
201 ]
202 }
203}
204
/// Compiled form of a `parse_apache_log` call, built by
/// `ParseApacheLog::compile`.
#[derive(Debug, Clone)]
struct ParseApacheLogFn {
    /// Expression producing the log line to parse.
    value: Box<dyn Expression>,
    /// Log format name, fixed when the call is compiled.
    format: Bytes,
    /// Optional expression producing the timestamp format; when absent,
    /// `resolve` falls back to `DEFAULT_TIMESTAMP_FORMAT`.
    timestamp_format: Option<Box<dyn Expression>>,
}
211
212impl FunctionExpression for ParseApacheLogFn {
213 fn resolve(&self, ctx: &mut Context) -> Resolved {
214 let bytes = self.value.resolve(ctx)?;
215 let timestamp_format = self
216 .timestamp_format
217 .map_resolve_with_default(ctx, || DEFAULT_TIMESTAMP_FORMAT.clone())?;
218
219 parse_apache_log(&bytes, ×tamp_format, &self.format, ctx)
220 }
221
222 fn type_def(&self, _: &state::TypeState) -> TypeDef {
223 TypeDef::object(match self.format.as_ref() {
224 b"common" => kind_common(),
225 b"combined" => kind_combined(),
226 b"error" => kind_error(),
227 _ => unreachable!(),
228 })
229 .fallible()
230 }
231}
232
233fn kind_common() -> BTreeMap<Field, Kind> {
234 BTreeMap::from([
235 (Field::from("host"), Kind::bytes() | Kind::null()),
236 (Field::from("identity"), Kind::bytes() | Kind::null()),
237 (Field::from("user"), Kind::bytes() | Kind::null()),
238 (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
239 (Field::from("message"), Kind::bytes() | Kind::null()),
240 (Field::from("method"), Kind::bytes() | Kind::null()),
241 (Field::from("path"), Kind::bytes() | Kind::null()),
242 (Field::from("protocol"), Kind::bytes() | Kind::null()),
243 (Field::from("status"), Kind::integer() | Kind::null()),
244 (Field::from("size"), Kind::integer() | Kind::null()),
245 ])
246}
247
248fn kind_combined() -> BTreeMap<Field, Kind> {
249 BTreeMap::from([
250 (Field::from("host"), Kind::bytes() | Kind::null()),
251 (Field::from("identity"), Kind::bytes() | Kind::null()),
252 (Field::from("user"), Kind::bytes() | Kind::null()),
253 (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
254 (Field::from("message"), Kind::bytes() | Kind::null()),
255 (Field::from("method"), Kind::bytes() | Kind::null()),
256 (Field::from("path"), Kind::bytes() | Kind::null()),
257 (Field::from("protocol"), Kind::bytes() | Kind::null()),
258 (Field::from("status"), Kind::integer() | Kind::null()),
259 (Field::from("size"), Kind::integer() | Kind::null()),
260 (Field::from("referrer"), Kind::bytes() | Kind::null()),
261 (Field::from("agent"), Kind::bytes() | Kind::null()),
262 ])
263}
264
265fn kind_error() -> BTreeMap<Field, Kind> {
266 BTreeMap::from([
267 (Field::from("timestamp"), Kind::timestamp() | Kind::null()),
268 (Field::from("module"), Kind::bytes() | Kind::null()),
269 (Field::from("severity"), Kind::bytes() | Kind::null()),
270 (Field::from("thread"), Kind::bytes() | Kind::null()),
271 (Field::from("port"), Kind::bytes() | Kind::null()),
272 (Field::from("message"), Kind::bytes() | Kind::null()),
273 ])
274}
275
// Table-driven tests for `ParseApacheLog`. Each case lists the call arguments
// (`args`), the expected outcome (`want`), the expected type definition
// (`tdef`) and the timezone the case runs with (`tz`).
#[cfg(test)]
mod tests {
    use crate::compiler::TimeZone;
    use chrono::DateTime;
    use chrono::TimeZone as ChronoTimezone;
    use chrono::prelude::*;

    use super::*;
    use crate::btreemap;
    use chrono::Utc;

    test_function![
        parse_common_log => ParseApacheLog;

        // Fully populated `common` line; the `-0700` timestamp is expected back in UTC.
        common_line_valid {
            args: func_args![value: r#"127.0.0.1 bob frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326"#,
                             format: "common"
            ],
            want: Ok(btreemap! {
                "host" => "127.0.0.1",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2000-10-10T20:55:36Z").unwrap().into()),
                "message" => "GET /apache_pb.gif HTTP/1.0",
                "method" => "GET",
                "path" => "/apache_pb.gif",
                "protocol" => "HTTP/1.0",
                "status" => 200,
                "size" => 2326,
            }),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // Fully populated `combined` line, including referrer and user agent.
        combined_line_valid {
            args: func_args![value: r#"224.92.49.50 bob frank [25/Feb/2021:12:44:08 +0000] "PATCH /one-to-one HTTP/1.1" 401 84170 "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth" "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0""#,
                             format: "combined"
            ],
            want: Ok(btreemap! {
                "host" => "224.92.49.50",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-02-25T12:44:08Z").unwrap().into()),
                "message" => "PATCH /one-to-one HTTP/1.1",
                "method" => "PATCH",
                "path" => "/one-to-one",
                "protocol" => "HTTP/1.1",
                "status" => 401,
                "size" => 84170,
                "referrer" => "http://www.seniorinfomediaries.com/vertical/channels/front-end/bandwidth",
                "agent" => "Mozilla/5.0 (X11; Linux i686; rv:5.0) Gecko/1945-10-12 Firefox/37.0",
            }),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }

        // Referrer and agent given as `-`: both keys are absent from the expected result.
        combined_line_missing_fields_valid {
            args: func_args![value: r#"224.92.49.50 bob frank [25/Feb/2021:12:44:08 +0000] "PATCH /one-to-one HTTP/1.1" 401 84170 - -"#,
                             format: "combined"
            ],
            want: Ok(btreemap! {
                "host" => "224.92.49.50",
                "identity" => "bob",
                "user" => "frank",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-02-25T12:44:08Z").unwrap().into()),
                "message" => "PATCH /one-to-one HTTP/1.1",
                "method" => "PATCH",
                "path" => "/one-to-one",
                "protocol" => "HTTP/1.1",
                "status" => 401,
                "size" => 84170,
            }),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }

        // Error line with pid, tid and an IPv4 client; `pid` and `port` are expected as integers.
        error_line_valid {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client 147.159.108.175:24259] I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message" => "I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                "module" => "ab",
                "severity" => "alert",
                "pid" => 4803,
                "thread" => "3814",
                "client" => "147.159.108.175",
                "port" => 24259
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // IPv6 client address: only the trailing `:24259` is expected as the port.
        error_line_ip_v6 {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [ab:alert] [pid 4803:tid 3814] [client eda7:35d:3ceb:ef1e:2133:e7bf:116e:24cc:24259] I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message" => "I'll bypass the haptic COM bandwidth, that should matrix the CSS driver!",
                "module" => "ab",
                "severity" => "alert",
                "pid" => 4803,
                "thread" => "3814",
                "client" => "eda7:35d:3ceb:ef1e:2133:e7bf:116e:24cc",
                "port" => 24259
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // The `[pid 4803]` group carries no `tid`, so no `thread` key is expected.
        // Uses a custom `timestamp_format`; the expected value shows the digits
        // matched by `%f` ending up as `.000138633` seconds.
        error_line_thread_id {
            args: func_args![
                value: r"[2021-06-04 15:40:27.138633] [php7:emerg] [pid 4803] [client 95.223.77.60:35106] PHP Parse error:  syntax error, unexpected \'->\' (T_OBJECT_OPERATOR) in /var/www/prod/releases/master-c7225365fd9faa26262cffeeb57b31bd7448c94a/source/index.php on line 14",
                timestamp_format: "%Y-%m-%d %H:%M:%S.%f",
                format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-06-04T15:40:27.000138633Z").unwrap().into()),
                "message" => "PHP Parse error:  syntax error, unexpected \\\'->\\\' (T_OBJECT_OPERATOR) in /var/www/prod/releases/master-c7225365fd9faa26262cffeeb57b31bd7448c94a/source/index.php on line 14",
                "module" => "php7",
                "severity" => "emerg",
                "pid" => 4803,
                "client" => "95.223.77.60",
                "port" => 35106

            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::Named(chrono_tz::Tz::UTC),
        }

        // No `[client ...]` group: the text is expected split into `message1` and `message2`.
        error_line_threaded_mpms_valid {
            args: func_args![value: "[01/Mar/2021:12:00:19 +0000] [proxy:error] [pid 23964] (113)No route to host: AH00957: HTTP: attempt to connect to 10.1.0.244:9000 (hostname.domain.com) failed",
                             format: "error"
                             ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-03-01T12:00:19Z").unwrap().into()),
                "message1" => "(113)No route to host: AH00957: ",
                "message2" => "HTTP: attempt to connect to 10.1.0.244:9000 (hostname.domain.com) failed",
                "module" => "proxy",
                "severity" => "error",
                "pid" => 23964,
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }

        // Every field is `-`: the expected result is an empty object.
        log_line_valid_empty {
            args: func_args![value: "- - - - - - -",
                             format: "common",
            ],
            want: Ok(BTreeMap::new()),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // Same as above, with the `-` placeholders wrapped in brackets and quotes.
        log_line_valid_empty_variant {
            args: func_args![value: r#"- - - [-] "-" - -"#,
                             format: "common",
            ],
            want: Ok(BTreeMap::new()),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // The input timestamp is rendered in the machine's local zone without an
        // offset and is expected to round-trip to the original UTC instant.
        // NOTE(review): relies on `TimeZone::default()` meaning local time — confirm.
        log_line_valid_with_local_timestamp_format {
            args: func_args![value: format!("[{}] - - - -",
                                            Utc.with_ymd_and_hms(2000, 10, 10, 20, 55, 36).unwrap()
                                            .with_timezone(&Local)
                                            .format("%a %b %d %H:%M:%S %Y")
                                            ),
                             timestamp_format: "%a %b %d %H:%M:%S %Y",
                             format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2000-10-10T20:55:36Z").unwrap().into()),
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::default(),
        }


        // Offset-less timestamp with the Europe/Paris zone: 09:30:50 is expected as 07:30:50 UTC.
        log_line_valid_with_timezone {
            args: func_args![
                value: "[2021/06/03 09:30:50] - - - -",
                timestamp_format: "%Y/%m/%d %H:%M:%S",
                format: "error",
            ],
            want: Ok(btreemap! {
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-06-03T07:30:50Z").unwrap().into()),
            }),
            tdef: TypeDef::object(kind_error()).fallible(),
            tz: TimeZone::Named(chrono_tz::Europe::Paris),
        }

        // Input that does not match the format at all yields an error.
        log_line_invalid {
            args: func_args![value: "not a common log line",
                             format: "common",
            ],
            want: Err("failed parsing common log line"),
            tdef: TypeDef::object(kind_common()).fallible(),
            tz: TimeZone::default(),
        }

        // The line matches, but the timestamp cannot be parsed with the default format.
        log_line_invalid_timestamp {
            args: func_args![value: "- - - [1234] - - - - - ",
                             format: "combined",
            ],
            want: Err("failed parsing timestamp 1234 using format %d/%b/%Y:%T %z: input contains invalid characters"),
            tdef: TypeDef::object(kind_combined()).fallible(),
            tz: TimeZone::default(),
        }
    ];
}