1use psl::Psl;
2use publicsuffix::List;
3
4use crate::compiler::prelude::*;
5use std::sync::LazyLock;
6use std::{collections::BTreeMap, path::Path};
7
8static DEFAULT_PLUS_PARTS: LazyLock<Value> = LazyLock::new(|| Value::Integer(0));
9
10static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
11 vec![
12 Parameter::required("value", kind::BYTES, "The domain string."),
13 Parameter::optional(
14 "plus_parts",
15 kind::INTEGER,
16 "Can be provided to get additional parts of the domain name. When 1 is passed,
17eTLD+1 will be returned, which represents a domain registrable by a single
18organization. Higher numbers will return subdomains.",
19 )
20 .default(&DEFAULT_PLUS_PARTS),
21 Parameter::optional(
22 "psl",
23 kind::BYTES,
24 "Can be provided to use a different public suffix list.
25
26By default, https://publicsuffix.org/list/public_suffix_list.dat is used.",
27 ),
28 ]
29});
30
/// Marker type that registers the `parse_etld` function; all behaviour lives
/// in its `Function` implementation.
#[derive(Debug, Clone, Copy)]
pub struct ParseEtld;
33
34impl Function for ParseEtld {
35 fn identifier(&self) -> &'static str {
36 "parse_etld"
37 }
38
39 fn usage(&self) -> &'static str {
40 "Parses the [eTLD](https://developer.mozilla.org/en-US/docs/Glossary/eTLD) from `value` representing domain name."
41 }
42
43 fn category(&self) -> &'static str {
44 Category::Parse.as_ref()
45 }
46
47 fn internal_failure_reasons(&self) -> &'static [&'static str] {
48 &["unable to determine eTLD for `value`"]
49 }
50
51 fn return_kind(&self) -> u16 {
52 kind::OBJECT
53 }
54
55 fn parameters(&self) -> &'static [Parameter] {
56 PARAMETERS.as_slice()
57 }
58
59 fn examples(&self) -> &'static [Example] {
60 &[
61 example! {
62 title: "Parse eTLD",
63 source: r#"parse_etld!("sub.sussex.ac.uk")"#,
64 result: Ok(indoc! {r#"
65 {
66 "etld": "ac.uk",
67 "etld_plus": "ac.uk",
68 "known_suffix": true
69 }
70 "#}),
71 },
72 example! {
73 title: "Parse eTLD+1",
74 source: r#"parse_etld!("sub.sussex.ac.uk", plus_parts: 1)"#,
75 result: Ok(indoc! {r#"
76 {
77 "etld": "ac.uk",
78 "etld_plus": "sussex.ac.uk",
79 "known_suffix": true
80 }
81 "#}),
82 },
83 example! {
84 title: "Parse eTLD with unknown suffix",
85 source: r#"parse_etld!("vector.acmecorp")"#,
86 result: Ok(indoc! {r#"
87 {
88 "etld": "acmecorp",
89 "etld_plus": "acmecorp",
90 "known_suffix": false
91 }
92 "#}),
93 },
94 example! {
95 title: "Parse eTLD with custom PSL",
96 source: r#"parse_etld!("vector.acmecorp", psl: "lib/tests/tests/functions/custom_public_suffix_list.dat")"#,
97 result: Ok(indoc! {r#"
98 {
99 "etld": "acmecorp",
100 "etld_plus": "acmecorp",
101 "known_suffix": false
102 }
103 "#}),
104 },
105 ]
106 }
107
108 fn compile(
109 &self,
110 state: &state::TypeState,
111 _ctx: &mut FunctionCompileContext,
112 arguments: ArgumentList,
113 ) -> Compiled {
114 let value = arguments.required("value");
115 let plus_parts = arguments.optional("plus_parts");
116
117 let psl_expr = arguments.optional_expr("psl");
118 let mut psl: Option<List> = None;
119 if let Some(psl_expr) = psl_expr {
120 let psl_location = psl_expr
121 .clone()
122 .resolve_constant(state)
123 .ok_or(function::Error::ExpectedStaticExpression {
124 keyword: "psl",
125 expr: psl_expr.clone(),
126 })?
127 .try_bytes_utf8_lossy()
128 .map_err(|_| function::Error::InvalidArgument {
129 keyword: "psl",
130 value: format!("{psl_expr:?}").into(),
131 error: "psl should be a string",
132 })?
133 .into_owned();
134
135 let path = Path::new(&psl_location);
136 psl = Some(
137 std::fs::read_to_string(path)
138 .map_err(|_| function::Error::InvalidArgument {
139 keyword: "psl",
140 value: format!("{}", path.display()).into(),
141 error: "Unable to read psl file",
142 })?
143 .parse()
144 .map_err(|_| function::Error::InvalidArgument {
145 keyword: "psl",
146 value: format!("{}", path.display()).into(),
147 error: "Unable to parse psl file",
148 })?,
149 );
150 }
151
152 Ok(ParseEtldFn {
153 value,
154 plus_parts,
155 psl,
156 }
157 .as_expr())
158 }
159}
160
161#[derive(Debug, Clone)]
162struct ParseEtldFn {
163 value: Box<dyn Expression>,
164 plus_parts: Option<Box<dyn Expression>>,
165 psl: Option<List>,
166}
167
168impl FunctionExpression for ParseEtldFn {
169 fn resolve(&self, ctx: &mut Context) -> Resolved {
170 let value = self.value.resolve(ctx)?;
171 let string = value.try_bytes_utf8_lossy()?;
172
173 let plus_parts_value = self
174 .plus_parts
175 .map_resolve_with_default(ctx, || DEFAULT_PLUS_PARTS.clone())?;
176 let plus_parts = match plus_parts_value.try_integer()? {
177 x if x < 0 => 0,
178 #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
180 x => x as usize,
181 };
182
183 let suffix_result = if let Some(list) = &self.psl {
184 list.suffix(string.as_bytes())
185 } else {
186 psl::suffix(string.as_bytes())
187 };
188 let etld = suffix_result.ok_or(format!("unable to determine eTLD for {string}"))?;
189 let etld_string = core::str::from_utf8(etld.as_bytes())
190 .map_err(|err| format!("could not convert eTLD to UTF8 {err}"))?;
191
192 let etld_parts_count = etld_string.chars().filter(|c| *c == '.').count() + 1;
193 let etld_plus_parts: Vec<&str> = string
194 .rsplit('.')
195 .take(etld_parts_count + plus_parts)
196 .collect();
197
198 let etld_plus = etld_plus_parts
199 .into_iter()
200 .rev()
201 .collect::<Vec<_>>()
202 .join(".");
203
204 let mut map = BTreeMap::<&str, Value>::new();
205
206 map.insert("etld", etld_string.to_owned().into());
207 map.insert("etld_plus", etld_plus.into());
208 map.insert("known_suffix", etld.is_known().into());
209
210 Ok(map
211 .into_iter()
212 .map(|(k, v)| (k.to_owned(), v))
213 .collect::<Value>())
214 }
215
216 fn type_def(&self, _: &state::TypeState) -> TypeDef {
217 TypeDef::object(inner_kind()).fallible()
218 }
219}
220
221fn inner_kind() -> BTreeMap<Field, Kind> {
222 BTreeMap::from([
223 ("etld".into(), Kind::bytes()),
224 ("etld_plus".into(), Kind::bytes()),
225 ("known_suffix".into(), Kind::boolean()),
226 ])
227}
228
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    // Each case runs `parse_etld` with the given arguments and checks both the
    // resolved value and the reported type definition.
    test_function![
        parse_etld => ParseEtld;

        naive {
            args: func_args![value: value!("vector.dev")],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        naive_plus_one {
            args: func_args![value: value!("vector.dev"), plus_parts: 1],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // A negative `plus_parts` is clamped to zero, so only the eTLD is returned.
        negative_plus {
            args: func_args![value: value!("vector.dev"), plus_parts: -1],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        psl {
            args: func_args![value: value!("sussex.ac.uk")],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        psl_plus_one {
            args: func_args![value: value!("sussex.ac.uk"), plus_parts: 1],
            want: Ok(value!({
                etld: "ac.uk",
                etld_plus: "sussex.ac.uk",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        // Requesting more labels than the input has returns the whole input.
        short_plus {
            args: func_args![value: value!("vector.dev"), plus_parts: 10],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        long_plus {
            args: func_args![value: value!("www.long.plus.test.vector.dev"), plus_parts: 4],
            want: Ok(value!({
                etld: "dev",
                etld_plus: "long.plus.test.vector.dev",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        unknown_tld {
            args: func_args![value: value!("vector.unknowndev")],
            want: Ok(value!({
                etld: "unknowndev",
                etld_plus: "unknowndev",
                known_suffix: false,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        utf8 {
            args: func_args![value: value!("www.食狮.中国")],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        utf8_plus_one {
            args: func_args![value: value!("www.食狮.中国"), plus_parts: 1],
            want: Ok(value!({
                etld: "中国",
                etld_plus: "食狮.中国",
                known_suffix: true,
            })),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        empty_host {
            args: func_args![value: value!("")],
            want: Err("unable to determine eTLD for "),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        bad_psl_file {
            args: func_args![value: value!("vector.dev"), psl: value!("definitelynotafile")],
            want: Err("invalid argument"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }
    ];
}