vrl/stdlib/
parse_user_agent.rs

1use crate::compiler::function::EnumVariant;
2use crate::compiler::prelude::*;
3use std::{
4    borrow::Cow,
5    collections::BTreeMap,
6    fmt,
7    str::FromStr,
8    sync::{Arc, LazyLock},
9};
10use woothee::parser::Parser as WootheeParser;
11
12static UA_EXTRACTOR: LazyLock<ua_parser::Extractor> = LazyLock::new(|| {
13    let regexes = include!(concat!(env!("OUT_DIR"), "/user_agent_regexes.rs"));
14    ua_parser::Extractor::try_from(regexes).expect("Regex file is not valid.")
15});
16
17static DEFAULT_MODE: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("fast")));
18
19static MODE_ENUM: &[EnumVariant] = &[
20    EnumVariant {
21        value: "fast",
22        description: "Fastest mode but most unreliable. Uses parser from project [Woothee](https://github.com/woothee/woothee).",
23    },
24    EnumVariant {
25        value: "reliable",
26        description: indoc! {"
27            Provides greater reliability than `fast` and retains it's speed in common cases.
28            Parses with [Woothee](https://github.com/woothee/woothee) parser and with parser from
29            [uap project](https://github.com/ua-parser/uap-core) if there are some missing fields
30            that the first parser wasn't able to parse out but the second one maybe can.
31        "},
32    },
33    EnumVariant {
34        value: "enriched",
35        description: indoc! {"
36            Parses with both parser from [Woothee](https://github.com/woothee/woothee) and parser from
37            [uap project](https://github.com/ua-parser/uap-core) and combines results. Result has the full schema.
38        "},
39    },
40];
41
42static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
43    vec![
44        Parameter::required("value", kind::BYTES, "The string to parse."),
45        Parameter::optional(
46            "mode",
47            kind::BYTES,
48            "Determines performance and reliability characteristics.",
49        )
50        .default(&DEFAULT_MODE)
51        .enum_variants(MODE_ENUM),
52    ]
53});
54
55#[derive(Clone, Copy, Debug)]
56pub struct ParseUserAgent;
57
58impl Function for ParseUserAgent {
59    fn identifier(&self) -> &'static str {
60        "parse_user_agent"
61    }
62
63    fn summary(&self) -> &'static str {
64        "parse a user agent string"
65    }
66
67    fn usage(&self) -> &'static str {
68        indoc! {"
69            Parses the provided `value` as a user agent, which has
70            [a loosely defined format](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent).
71
72            Parses on the basis of best effort. Returned schema depends only on the configured `mode`,
73            so if the function fails to parse a field it will set it to `null`.
74        "}
75    }
76
77    fn category(&self) -> &'static str {
78        Category::Parse.as_ref()
79    }
80
81    fn return_kind(&self) -> u16 {
82        kind::OBJECT
83    }
84
85    fn notices(&self) -> &'static [&'static str] {
86        &[
87            indoc! {"
88                All values are returned as strings or as null. We recommend manually coercing values
89                to desired types as you see fit.
90            "},
91            "Different modes return different schema.",
92            "Field which were not parsed out are set as `null`.",
93        ]
94    }
95
96    fn parameters(&self) -> &'static [Parameter] {
97        PARAMETERS.as_slice()
98    }
99
100    fn examples(&self) -> &'static [Example] {
101        &[
102            example! {
103                title: "Fast mode",
104                source: indoc! {r#"
105                    parse_user_agent(
106                        "Mozilla Firefox 1.0.1 Mozilla/5.0 (X11; U; Linux i686; de-DE; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
107                    )
108                "#},
109                result: Ok(indoc! {r#"
110                    {
111                        "browser": {
112                            "family": "Firefox",
113                            "version": "1.0.1"
114                        },
115                        "device": {
116                            "category": "pc"
117                        },
118                        "os": {
119                            "family": "Linux",
120                            "version": null
121                        }
122                    }
123                    "#}),
124            },
125            example! {
126                title: "Reliable mode",
127                source: indoc! {r#"
128                    parse_user_agent(
129                        "Mozilla/4.0 (compatible; MSIE 7.66; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
130                        mode: "reliable")
131                "#},
132                result: Ok(indoc! {r#"
133                    {
134                        "browser": {
135                            "family": "Internet Explorer",
136                            "version": "7.66"
137                        },
138                        "device": {
139                            "category": "pc"
140                        },
141                        "os": {
142                            "family": "Windows XP",
143                            "version": "NT 5.1"
144                        }
145                    }
146                    "#}),
147            },
148            example! {
149                title: "Enriched mode",
150                source: indoc! {r#"
151                    parse_user_agent(
152                        "Opera/9.80 (J2ME/MIDP; Opera Mini/4.3.24214; iPhone; CPU iPhone OS 4_2_1 like Mac OS X; AppleWebKit/24.783; U; en) Presto/2.5.25 Version/10.54",
153                        mode: "enriched"
154                    )
155                "#},
156                result: Ok(indoc! {r#"
157                    {
158                        "browser": {
159                            "family": "Opera Mini",
160                            "major": "4",
161                            "minor": "3",
162                            "patch": "24214",
163                            "version": "10.54"
164                        },
165                        "device": {
166                            "brand": "Apple",
167                            "category": "smartphone",
168                            "family": "iPhone",
169                            "model": "iPhone"
170                        },
171                        "os": {
172                            "family": "iOS",
173                            "major": "4",
174                            "minor": "2",
175                            "patch": "1",
176                            "patch_minor": null,
177                            "version": "4.2.1"
178                        }
179                    }
180                    "#}),
181            },
182        ]
183    }
184
185    fn compile(
186        &self,
187        state: &state::TypeState,
188        _ctx: &mut FunctionCompileContext,
189        arguments: ArgumentList,
190    ) -> Compiled {
191        let value = arguments.required("value");
192
193        let mode = arguments
194            .optional_enum("mode", &Mode::all_value(), state)?
195            .unwrap_or_else(|| DEFAULT_MODE.clone())
196            .try_bytes_utf8_lossy()
197            .map(|s| Mode::from_str(&s).expect("validated enum"))
198            .expect("mode not bytes");
199
200        let parser = match mode {
201            Mode::Fast => {
202                let parser = WootheeParser::new();
203
204                Arc::new(move |s: &str| parser.parse_user_agent(s).partial_schema()) as Arc<_>
205            }
206            Mode::Reliable => {
207                let fast = WootheeParser::new();
208                let slow = &UA_EXTRACTOR;
209
210                Arc::new(move |s: &str| {
211                    let ua = fast.parse_user_agent(s);
212                    let ua = if ua.browser.family.is_none() || ua.os.family.is_none() {
213                        let better_ua = slow.parse_user_agent(s);
214                        better_ua.or(ua)
215                    } else {
216                        ua
217                    };
218                    ua.partial_schema()
219                }) as Arc<_>
220            }
221            Mode::Enriched => {
222                let fast = WootheeParser::new();
223                let slow = &UA_EXTRACTOR;
224
225                Arc::new(move |s: &str| {
226                    slow.parse_user_agent(s)
227                        .or(fast.parse_user_agent(s))
228                        .full_schema()
229                }) as Arc<_>
230            }
231        };
232
233        Ok(ParseUserAgentFn {
234            value,
235            mode,
236            parser,
237        }
238        .as_expr())
239    }
240}
241
242#[derive(Clone)]
243struct ParseUserAgentFn {
244    value: Box<dyn Expression>,
245    mode: Mode,
246    parser: Arc<dyn Fn(&str) -> Value + Send + Sync>,
247}
248
249impl FunctionExpression for ParseUserAgentFn {
250    fn resolve(&self, ctx: &mut Context) -> Resolved {
251        let value = self.value.resolve(ctx)?;
252        let string = value.try_bytes_utf8_lossy()?;
253
254        Ok((self.parser)(&string))
255    }
256
257    fn type_def(&self, _: &state::TypeState) -> TypeDef {
258        self.mode.type_def()
259    }
260}
261
262impl fmt::Debug for ParseUserAgentFn {
263    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
264        write!(
265            f,
266            "ParseUserAgentFn{{ value: {:?}, mode: {:?}}}",
267            self.value, self.mode
268        )
269    }
270}
271
272#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
273pub(crate) enum Mode {
274    #[default]
275    Fast,
276    Reliable,
277    Enriched,
278}
279
280impl Mode {
281    fn all_value() -> Vec<Value> {
282        use Mode::{Enriched, Fast, Reliable};
283
284        vec![Fast, Reliable, Enriched]
285            .into_iter()
286            .map(|u| u.as_str().into())
287            .collect::<Vec<_>>()
288    }
289
290    const fn as_str(self) -> &'static str {
291        use Mode::{Enriched, Fast, Reliable};
292
293        match self {
294            Fast => "fast",
295            Reliable => "reliable",
296            Enriched => "enriched",
297        }
298    }
299
300    fn type_def(self) -> TypeDef {
301        match self {
302            Mode::Fast | Mode::Reliable => TypeDef::object(BTreeMap::from([
303                (
304                    "browser".into(),
305                    Kind::object(BTreeMap::from([
306                        ("family".into(), Kind::bytes().or_null()),
307                        ("version".into(), Kind::bytes().or_null()),
308                    ])),
309                ),
310                (
311                    "os".into(),
312                    Kind::object(BTreeMap::from([
313                        ("family".into(), Kind::bytes().or_null()),
314                        ("version".into(), Kind::bytes().or_null()),
315                    ])),
316                ),
317                (
318                    "device".into(),
319                    Kind::object(BTreeMap::from([(
320                        "category".into(),
321                        Kind::bytes().or_null(),
322                    )])),
323                ),
324            ])),
325            Mode::Enriched => TypeDef::object(BTreeMap::from([
326                (
327                    "browser".into(),
328                    Kind::object(BTreeMap::from([
329                        ("family".into(), Kind::bytes().or_null()),
330                        ("version".into(), Kind::bytes().or_null()),
331                        ("major".into(), Kind::bytes().or_null()),
332                        ("minor".into(), Kind::bytes().or_null()),
333                        ("patch".into(), Kind::bytes().or_null()),
334                    ])),
335                ),
336                (
337                    "os".into(),
338                    Kind::object(BTreeMap::from([
339                        ("family".into(), Kind::bytes().or_null()),
340                        ("version".into(), Kind::bytes().or_null()),
341                        ("major".into(), Kind::bytes().or_null()),
342                        ("minor".into(), Kind::bytes().or_null()),
343                        ("patch".into(), Kind::bytes().or_null()),
344                        ("patch_minor".into(), Kind::bytes().or_null()),
345                    ])),
346                ),
347                (
348                    "device".into(),
349                    Kind::object(BTreeMap::from([
350                        ("family".into(), Kind::bytes().or_null()),
351                        ("category".into(), Kind::bytes().or_null()),
352                        ("brand".into(), Kind::bytes().or_null()),
353                        ("model".into(), Kind::bytes().or_null()),
354                    ])),
355                ),
356            ])),
357        }
358    }
359}
360
361impl FromStr for Mode {
362    type Err = &'static str;
363
364    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
365        use Mode::{Enriched, Fast, Reliable};
366
367        match s {
368            "fast" => Ok(Fast),
369            "reliable" => Ok(Reliable),
370            "enriched" => Ok(Enriched),
371            _ => Err("unknown mode variant"),
372        }
373    }
374}
375
376#[derive(Default)]
377struct UserAgent {
378    browser: Browser,
379    os: Os,
380    device: Device,
381}
382
383impl UserAgent {
384    fn partial_schema(self) -> Value {
385        let Self {
386            browser,
387            os,
388            device,
389        } = self;
390
391        IntoIterator::into_iter([
392            ("browser", browser.partial_schema()),
393            ("os", os.partial_schema()),
394            ("device", device.partial_schema()),
395        ])
396        .map(|(name, value)| (name.to_string(), value))
397        .collect()
398    }
399
400    fn full_schema(self) -> Value {
401        let Self {
402            browser,
403            os,
404            device,
405        } = self;
406
407        IntoIterator::into_iter([
408            ("browser", browser.full_schema()),
409            ("os", os.full_schema()),
410            ("device", device.full_schema()),
411        ])
412        .map(|(name, value)| (name.to_string(), value))
413        .collect()
414    }
415
416    fn or(self, other: Self) -> Self {
417        Self {
418            browser: self.browser.or(other.browser),
419            os: self.os.or(other.os),
420            device: self.device.or(other.device),
421        }
422    }
423}
424
425#[derive(Default)]
426struct Browser {
427    family: Option<String>,
428    version: Option<String>,
429    major: Option<String>,
430    minor: Option<String>,
431    patch: Option<String>,
432}
433
434impl Browser {
435    fn partial_schema(self) -> Value {
436        let Self {
437            family, version, ..
438        } = self;
439
440        into_value([("family", family), ("version", version)])
441    }
442
443    fn full_schema(self) -> Value {
444        let Self {
445            family,
446            version,
447            major,
448            minor,
449            patch,
450        } = self;
451
452        into_value([
453            ("family", family),
454            ("version", version),
455            ("major", major),
456            ("minor", minor),
457            ("patch", patch),
458        ])
459    }
460
461    fn or(self, other: Self) -> Self {
462        Self {
463            family: self.family.or(other.family),
464            version: self.version.or(other.version),
465            major: self.major.or(other.major),
466            minor: self.minor.or(other.minor),
467            patch: self.patch.or(other.patch),
468        }
469    }
470}
471
472#[derive(Default)]
473struct Os {
474    family: Option<String>,
475    version: Option<String>,
476    major: Option<String>,
477    minor: Option<String>,
478    patch: Option<String>,
479    patch_minor: Option<String>,
480}
481
482impl Os {
483    fn partial_schema(self) -> Value {
484        let Self {
485            family, version, ..
486        } = self;
487
488        into_value([("family", family), ("version", version)])
489    }
490
491    fn full_schema(self) -> Value {
492        let Self {
493            family,
494            version,
495            major,
496            minor,
497            patch,
498            patch_minor,
499        } = self;
500
501        into_value([
502            ("family", family),
503            ("version", version),
504            ("major", major),
505            ("minor", minor),
506            ("patch", patch),
507            ("patch_minor", patch_minor),
508        ])
509    }
510
511    fn or(self, other: Self) -> Self {
512        Self {
513            family: self.family.or(other.family),
514            version: self.version.or(other.version),
515            major: self.major.or(other.major),
516            minor: self.minor.or(other.minor),
517            patch: self.patch.or(other.patch),
518            patch_minor: self.patch_minor.or(other.patch_minor),
519        }
520    }
521}
522
523#[derive(Default)]
524struct Device {
525    family: Option<String>,
526    category: Option<String>,
527    brand: Option<String>,
528    model: Option<String>,
529}
530
531impl Device {
532    fn partial_schema(self) -> Value {
533        let Self { category, .. } = self;
534
535        into_value([("category", category)])
536    }
537
538    fn full_schema(self) -> Value {
539        let Self {
540            category,
541            family,
542            brand,
543            model,
544        } = self;
545
546        into_value([
547            ("category", category),
548            ("family", family),
549            ("brand", brand),
550            ("model", model),
551        ])
552    }
553
554    fn or(self, other: Self) -> Self {
555        Self {
556            category: self.category.or(other.category),
557            family: self.family.or(other.family),
558            brand: self.brand.or(other.brand),
559            model: self.model.or(other.model),
560        }
561    }
562}
563
564fn into_value<'a>(iter: impl IntoIterator<Item = (&'a str, Option<String>)>) -> Value {
565    iter.into_iter()
566        .map(|(name, value)| {
567            (
568                name.to_string(),
569                value.map_or(Value::Null, std::convert::Into::into),
570            )
571        })
572        .collect()
573}
574
575trait Parser {
576    fn parse_user_agent(&self, user_agent: &str) -> UserAgent;
577}
578
579impl Parser for WootheeParser {
580    fn parse_user_agent(&self, user_agent: &str) -> UserAgent {
581        fn unknown_to_none<'a>(s: impl Into<Cow<'a, str>>) -> Option<String> {
582            let cow = s.into();
583            match cow.as_ref() {
584                "" | woothee::woothee::VALUE_UNKNOWN => None,
585                _ => Some(cow.into_owned()),
586            }
587        }
588
589        let ua = self.parse(user_agent).unwrap_or_default();
590
591        UserAgent {
592            browser: Browser {
593                family: unknown_to_none(ua.name),
594                version: unknown_to_none(ua.version),
595                ..Default::default()
596            },
597            os: Os {
598                family: unknown_to_none(ua.os),
599                version: unknown_to_none(ua.os_version),
600                ..Default::default()
601            },
602            device: Device {
603                category: unknown_to_none(ua.category),
604                ..Default::default()
605            },
606        }
607    }
608}
609
610impl Parser for ua_parser::Extractor<'_> {
611    fn parse_user_agent(&self, user_agent: &str) -> UserAgent {
612        let browser = self
613            .ua
614            .extract(user_agent)
615            .map(|ua| Browser {
616                family: Some(ua.family.into_owned()),
617                major: ua.major.map(Into::into),
618                minor: ua.minor.map(Into::into),
619                patch: ua.patch.map(Into::into),
620                ..Default::default()
621            })
622            .unwrap_or_default();
623
624        let os = self
625            .os
626            .extract(user_agent)
627            .map(|os| Os {
628                family: Some(os.os.into_owned()),
629                major: os.major.map(Cow::into_owned),
630                minor: os.minor.map(Cow::into_owned),
631                patch: os.patch.map(Cow::into_owned),
632                patch_minor: os.patch_minor.map(Cow::into_owned),
633                ..Default::default()
634            })
635            .unwrap_or_default();
636
637        let device = self
638            .dev
639            .extract(user_agent)
640            .map(|dev| Device {
641                family: Some(dev.device.into_owned()),
642                brand: dev.brand.map(Cow::into_owned),
643                model: dev.model.map(Cow::into_owned),
644                ..Default::default()
645            })
646            .unwrap_or_default();
647
648        UserAgent {
649            browser,
650            os,
651            device,
652        }
653    }
654}
655
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        parse_user_agent => ParseUserAgent;

        // No `mode` argument, so the default (`fast`) schema is expected.
        parses {
            args: func_args![
                value: "Mozilla/4.0 (compatible; MSIE 7.66; Windows NT 5.1; SV1)"
            ],
            want: Ok(value!({
                browser: { family: "Internet Explorer", version: "7.66" },
                device: { category: "pc" },
                os: { family: "Windows XP", version: "NT 5.1" }
            })),
            tdef: Mode::Fast.type_def(),
        }

        // Neither parser recognises this agent: the full schema is returned with every field null.
        unknown_user_agent {
            args: func_args![
                value: "w3m/0.3",
                mode: "enriched"
            ],
            want: Ok(value!({
                browser: { family: null, major: null, minor: null, patch: null, version: null },
                device: { brand: null, category: null, family: null, model: null },
                os: {
                    family: null,
                    major: null,
                    minor: null,
                    patch: null,
                    patch_minor: null,
                    version: null
                }
            })),
            tdef: Mode::Enriched.type_def(),
        }
    ];
}