vrl/stdlib/
parse_etld.rs

1use psl::Psl;
2use publicsuffix::List;
3
4use crate::compiler::prelude::*;
5use std::sync::LazyLock;
6use std::{collections::BTreeMap, path::Path};
7
/// Default for the optional `plus_parts` argument: the integer `0`, so that
/// only the eTLD itself is returned when the caller does not ask for more.
static DEFAULT_PLUS_PARTS: LazyLock<Value> = LazyLock::new(|| Value::Integer(0));
9
10static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
11    vec![
12        Parameter::required("value", kind::BYTES, "The domain string."),
13        Parameter::optional(
14            "plus_parts",
15            kind::INTEGER,
16            "Can be provided to get additional parts of the domain name. When 1 is passed,
17eTLD+1 will be returned, which represents a domain registrable by a single
18organization. Higher numbers will return subdomains.",
19        )
20        .default(&DEFAULT_PLUS_PARTS),
21        Parameter::optional(
22            "psl",
23            kind::BYTES,
24            "Can be provided to use a different public suffix list.
25
26By default, https://publicsuffix.org/list/public_suffix_list.dat is used.",
27        ),
28    ]
29});
30
/// Marker type implementing the `parse_etld` VRL function.
#[derive(Debug, Clone, Copy)]
pub struct ParseEtld;
33
34impl Function for ParseEtld {
35    fn identifier(&self) -> &'static str {
36        "parse_etld"
37    }
38
39    fn usage(&self) -> &'static str {
40        "Parses the [eTLD](https://developer.mozilla.org/en-US/docs/Glossary/eTLD) from `value` representing domain name."
41    }
42
43    fn category(&self) -> &'static str {
44        Category::Parse.as_ref()
45    }
46
47    fn internal_failure_reasons(&self) -> &'static [&'static str] {
48        &["unable to determine eTLD for `value`"]
49    }
50
51    fn return_kind(&self) -> u16 {
52        kind::OBJECT
53    }
54
55    fn parameters(&self) -> &'static [Parameter] {
56        PARAMETERS.as_slice()
57    }
58
59    fn examples(&self) -> &'static [Example] {
60        &[
61            example! {
62                title: "Parse eTLD",
63                source: r#"parse_etld!("sub.sussex.ac.uk")"#,
64                result: Ok(indoc! {r#"
65                {
66                    "etld": "ac.uk",
67                    "etld_plus": "ac.uk",
68                    "known_suffix": true
69                }
70            "#}),
71            },
72            example! {
73                title: "Parse eTLD+1",
74                source: r#"parse_etld!("sub.sussex.ac.uk", plus_parts: 1)"#,
75                result: Ok(indoc! {r#"
76                {
77                    "etld": "ac.uk",
78                    "etld_plus": "sussex.ac.uk",
79                    "known_suffix": true
80                }
81            "#}),
82            },
83            example! {
84                title: "Parse eTLD with unknown suffix",
85                source: r#"parse_etld!("vector.acmecorp")"#,
86                result: Ok(indoc! {r#"
87                {
88                    "etld": "acmecorp",
89                    "etld_plus": "acmecorp",
90                    "known_suffix": false
91                }
92            "#}),
93            },
94            example! {
95                title: "Parse eTLD with custom PSL",
96                source: r#"parse_etld!("vector.acmecorp", psl: "lib/tests/tests/functions/custom_public_suffix_list.dat")"#,
97                result: Ok(indoc! {r#"
98                {
99                    "etld": "acmecorp",
100                    "etld_plus": "acmecorp",
101                    "known_suffix": false
102                }
103            "#}),
104            },
105        ]
106    }
107
108    fn compile(
109        &self,
110        state: &state::TypeState,
111        _ctx: &mut FunctionCompileContext,
112        arguments: ArgumentList,
113    ) -> Compiled {
114        let value = arguments.required("value");
115        let plus_parts = arguments.optional("plus_parts");
116
117        let psl_expr = arguments.optional_expr("psl");
118        let mut psl: Option<List> = None;
119        if let Some(psl_expr) = psl_expr {
120            let psl_location = psl_expr
121                .clone()
122                .resolve_constant(state)
123                .ok_or(function::Error::ExpectedStaticExpression {
124                    keyword: "psl",
125                    expr: psl_expr.clone(),
126                })?
127                .try_bytes_utf8_lossy()
128                .map_err(|_| function::Error::InvalidArgument {
129                    keyword: "psl",
130                    value: format!("{psl_expr:?}").into(),
131                    error: "psl should be a string",
132                })?
133                .into_owned();
134
135            let path = Path::new(&psl_location);
136            psl = Some(
137                std::fs::read_to_string(path)
138                    .map_err(|_| function::Error::InvalidArgument {
139                        keyword: "psl",
140                        value: format!("{}", path.display()).into(),
141                        error: "Unable to read psl file",
142                    })?
143                    .parse()
144                    .map_err(|_| function::Error::InvalidArgument {
145                        keyword: "psl",
146                        value: format!("{}", path.display()).into(),
147                        error: "Unable to parse psl file",
148                    })?,
149            );
150        }
151
152        Ok(ParseEtldFn {
153            value,
154            plus_parts,
155            psl,
156        }
157        .as_expr())
158    }
159}
160
161#[derive(Debug, Clone)]
162struct ParseEtldFn {
163    value: Box<dyn Expression>,
164    plus_parts: Option<Box<dyn Expression>>,
165    psl: Option<List>,
166}
167
168impl FunctionExpression for ParseEtldFn {
169    fn resolve(&self, ctx: &mut Context) -> Resolved {
170        let value = self.value.resolve(ctx)?;
171        let string = value.try_bytes_utf8_lossy()?;
172
173        let plus_parts_value = self
174            .plus_parts
175            .map_resolve_with_default(ctx, || DEFAULT_PLUS_PARTS.clone())?;
176        let plus_parts = match plus_parts_value.try_integer()? {
177            x if x < 0 => 0,
178            // TODO consider removal options
179            #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
180            x => x as usize,
181        };
182
183        let suffix_result = if let Some(list) = &self.psl {
184            list.suffix(string.as_bytes())
185        } else {
186            psl::suffix(string.as_bytes())
187        };
188        let etld = suffix_result.ok_or(format!("unable to determine eTLD for {string}"))?;
189        let etld_string = core::str::from_utf8(etld.as_bytes())
190            .map_err(|err| format!("could not convert eTLD to UTF8 {err}"))?;
191
192        let etld_parts_count = etld_string.chars().filter(|c| *c == '.').count() + 1;
193        let etld_plus_parts: Vec<&str> = string
194            .rsplit('.')
195            .take(etld_parts_count + plus_parts)
196            .collect();
197
198        let etld_plus = etld_plus_parts
199            .into_iter()
200            .rev()
201            .collect::<Vec<_>>()
202            .join(".");
203
204        let mut map = BTreeMap::<&str, Value>::new();
205
206        map.insert("etld", etld_string.to_owned().into());
207        map.insert("etld_plus", etld_plus.into());
208        map.insert("known_suffix", etld.is_known().into());
209
210        Ok(map
211            .into_iter()
212            .map(|(k, v)| (k.to_owned(), v))
213            .collect::<Value>())
214    }
215
216    fn type_def(&self, _: &state::TypeState) -> TypeDef {
217        TypeDef::object(inner_kind()).fallible()
218    }
219}
220
221fn inner_kind() -> BTreeMap<Field, Kind> {
222    BTreeMap::from([
223        ("etld".into(), Kind::bytes()),
224        ("etld_plus".into(), Kind::bytes()),
225        ("known_suffix".into(), Kind::boolean()),
226    ])
227}
228
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        parse_etld => ParseEtld;

        // Single-label suffix, default `plus_parts`.
        naive {
            args: func_args![value: value!("vector.dev")],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        naive_plus_one {
            args: func_args![value: value!("vector.dev"), plus_parts: 1],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Multi-label suffix taken from the public suffix list.
        psl {
            args: func_args![value: value!("sussex.ac.uk")],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        psl_plus_one {
            args: func_args![value: value!("sussex.ac.uk"), plus_parts: 1],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "sussex.ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Asking for more labels than the domain has returns the whole domain.
        short_plus {
            args: func_args![value: value!("vector.dev"), plus_parts: 10],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        long_plus {
            args: func_args![value: value!("www.long.plus.test.vector.dev"), plus_parts: 4],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "long.plus.test.vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // A negative `plus_parts` is clamped to zero.
        negative_plus {
            args: func_args![value: value!("vector.dev"), plus_parts: -1],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        unknown_tld {
            args: func_args![value: value!("vector.unknowndev")],
            want: Ok(value!({
                etld: "unknowndev",
                etld_plus: "unknowndev",
                known_suffix: false,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Non-ASCII labels.
        utf8 {
            args: func_args![value: value!("www.食狮.中国")],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        utf8_plus_one {
            args: func_args![value: value!("www.食狮.中国"), plus_parts: 1],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "食狮.中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        empty_host {
            args: func_args![value: value!("")],
            want: Err("unable to determine eTLD for "),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // A `psl` path that does not exist is rejected.
        bad_psl_file {
            args: func_args![value: value!("vector.dev"), psl: value!("definitelynotafile")],
            want: Err("invalid argument"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }
    ];
}