vrl/stdlib/
parse_xml.rs

1use crate::compiler::prelude::*;
2use crate::parsing::xml::{
3    DEFAULT_ALWAYS_USE_TEXT_KEY, DEFAULT_ATTR_PREFIX, DEFAULT_INCLUDE_ATTR, DEFAULT_PARSE_BOOL,
4    DEFAULT_PARSE_NULL, DEFAULT_PARSE_NUMBER, DEFAULT_TEXT_KEY, DEFAULT_TRIM, ParseOptions,
5    parse_xml,
6};
7use std::sync::LazyLock;
8
9static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
10    vec![
11        Parameter::required(
12            "value",
13            kind::BYTES,
14            "The string representation of the XML document to parse.",
15        ),
16        Parameter::optional(
17            "trim",
18            kind::BOOLEAN,
19            "Remove excess whitespace between XML elements.",
20        )
21        .default(&DEFAULT_TRIM),
22        Parameter::optional(
23            "include_attr",
24            kind::BOOLEAN,
25            "Include XML tag attributes in the returned object.",
26        )
27        .default(&DEFAULT_INCLUDE_ATTR),
28        Parameter::optional(
29            "attr_prefix",
30            kind::BYTES,
31            "String prefix to use for XML tag attribute keys.",
32        )
33        .default(&DEFAULT_ATTR_PREFIX),
34        Parameter::optional(
35            "text_key",
36            kind::BYTES,
37            "Key name to use for expanded text nodes.",
38        )
39        .default(&DEFAULT_TEXT_KEY),
40        Parameter::optional(
41            "always_use_text_key",
42            kind::BOOLEAN,
43            "Always return text nodes as `{\"<text_key>\": \"value\"}.`",
44        )
45        .default(&DEFAULT_ALWAYS_USE_TEXT_KEY),
46        Parameter::optional(
47            "parse_bool",
48            kind::BOOLEAN,
49            "Parse \"true\" and \"false\" as boolean.",
50        )
51        .default(&DEFAULT_PARSE_BOOL),
52        Parameter::optional("parse_null", kind::BOOLEAN, "Parse \"null\" as null.")
53            .default(&DEFAULT_PARSE_NULL),
54        Parameter::optional(
55            "parse_number",
56            kind::BOOLEAN,
57            "Parse numbers as integers/floats.",
58        )
59        .default(&DEFAULT_PARSE_NUMBER),
60    ]
61});
62
/// Marker type for the `parse_xml` function.
///
/// It carries no data; its `Function` implementation supplies the metadata and the
/// compile step.
#[derive(Debug, Clone, Copy)]
pub struct ParseXml;
65
66impl Function for ParseXml {
67    fn identifier(&self) -> &'static str {
68        "parse_xml"
69    }
70
71    fn usage(&self) -> &'static str {
72        "Parses the `value` as XML."
73    }
74
75    fn category(&self) -> &'static str {
76        Category::Parse.as_ref()
77    }
78
79    fn internal_failure_reasons(&self) -> &'static [&'static str] {
80        &["`value` is not a valid XML document."]
81    }
82
83    fn return_kind(&self) -> u16 {
84        kind::OBJECT
85    }
86
87    fn notices(&self) -> &'static [&'static str] {
88        &["Valid XML must contain exactly one root node. Always returns an object."]
89    }
90
91    fn examples(&self) -> &'static [Example] {
92        &[example! {
93            title: "Parse XML",
94            source: indoc! {r#"
95                value = s'<book category="CHILDREN"><title lang="en">Harry Potter</title><author>J K. Rowling</author><year>2005</year></book>';
96
97                parse_xml!(value, text_key: "value", parse_number: false)
98            "#},
99            result: Ok(
100                r#"{ "book": { "@category": "CHILDREN", "author": "J K. Rowling", "title": { "@lang": "en", "value": "Harry Potter" }, "year": "2005" } }"#,
101            ),
102        }]
103    }
104
105    fn compile(
106        &self,
107        _state: &state::TypeState,
108        _ctx: &mut FunctionCompileContext,
109        arguments: ArgumentList,
110    ) -> Compiled {
111        let value = arguments.required("value");
112
113        let trim = arguments.optional("trim");
114        let include_attr = arguments.optional("include_attr");
115        let attr_prefix = arguments.optional("attr_prefix");
116        let text_key = arguments.optional("text_key");
117        let always_use_text_key = arguments.optional("always_use_text_key");
118        let parse_bool = arguments.optional("parse_bool");
119        let parse_null = arguments.optional("parse_null");
120        let parse_number = arguments.optional("parse_number");
121
122        Ok(ParseXmlFn {
123            value,
124            trim,
125            include_attr,
126            attr_prefix,
127            text_key,
128            always_use_text_key,
129            parse_bool,
130            parse_null,
131            parse_number,
132        }
133        .as_expr())
134    }
135
136    fn parameters(&self) -> &'static [Parameter] {
137        PARAMETERS.as_slice()
138    }
139}
140
141#[derive(Debug, Clone)]
142struct ParseXmlFn {
143    value: Box<dyn Expression>,
144
145    trim: Option<Box<dyn Expression>>,
146    include_attr: Option<Box<dyn Expression>>,
147    attr_prefix: Option<Box<dyn Expression>>,
148    text_key: Option<Box<dyn Expression>>,
149    always_use_text_key: Option<Box<dyn Expression>>,
150    parse_bool: Option<Box<dyn Expression>>,
151    parse_null: Option<Box<dyn Expression>>,
152    parse_number: Option<Box<dyn Expression>>,
153}
154
155impl FunctionExpression for ParseXmlFn {
156    fn resolve(&self, ctx: &mut Context) -> Resolved {
157        let value = self.value.resolve(ctx)?;
158
159        let options = ParseOptions {
160            trim: self
161                .trim
162                .as_ref()
163                .map(|expr| expr.resolve(ctx))
164                .transpose()?,
165
166            include_attr: self
167                .include_attr
168                .as_ref()
169                .map(|expr| expr.resolve(ctx))
170                .transpose()?,
171
172            attr_prefix: self
173                .attr_prefix
174                .as_ref()
175                .map(|expr| expr.resolve(ctx))
176                .transpose()?,
177
178            text_key: self
179                .text_key
180                .as_ref()
181                .map(|expr| expr.resolve(ctx))
182                .transpose()?,
183
184            always_use_text_key: self
185                .always_use_text_key
186                .as_ref()
187                .map(|expr| expr.resolve(ctx))
188                .transpose()?,
189
190            parse_bool: self
191                .parse_bool
192                .as_ref()
193                .map(|expr| expr.resolve(ctx))
194                .transpose()?,
195
196            parse_null: self
197                .parse_null
198                .as_ref()
199                .map(|expr| expr.resolve(ctx))
200                .transpose()?,
201
202            parse_number: self
203                .parse_number
204                .as_ref()
205                .map(|expr| expr.resolve(ctx))
206                .transpose()?,
207        };
208
209        parse_xml(value, options)
210    }
211
212    fn type_def(&self, _: &state::TypeState) -> TypeDef {
213        type_def()
214    }
215}
216
217fn type_def() -> TypeDef {
218    TypeDef::bytes()
219        .or_object(Collection::from_unknown(inner_kind()))
220        .fallible()
221}
222
223fn inner_kind() -> Kind {
224    Kind::object(Collection::any())
225}
226
// Tests for `parse_xml`: a table of input/expected-output cases driven by the
// `test_function!` macro, plus a direct check of the reported type definition.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        parse_xml => ParseXml;

        // An element holding only text maps directly to a string.
        simple_text {
            args: func_args![ value: "<a>test</a>" ],
            want: Ok(value!({ "a": "test" })),
            tdef: type_def(),
        }

        // With no options given, the attribute appears under an `@`-prefixed key and the
        // element text under the `text` key.
        include_attr {
            args: func_args![ value: r#"<a href="https://vector.dev">test</a>"# ],
            want: Ok(value!({ "a": { "@href": "https://vector.dev", "text": "test" } })),
            tdef: type_def(),
        }

        // With `include_attr: false` the attribute is dropped and the element is a plain string.
        exclude_attr {
            args: func_args![ value: r#"<a href="https://vector.dev">test</a>"#, include_attr: false ],
            want: Ok(value!({ "a": "test" })),
            tdef: type_def(),
        }

        // `always_use_text_key` wraps the text in an object, keyed by the custom `text_key`.
        custom_text_key {
            args: func_args![ value: "<b>test</b>", text_key: "node", always_use_text_key: true ],
            want: Ok(value!({ "b": { "node": "test" } })),
            tdef: type_def(),
        }

        // https://github.com/vectordotdev/vector/issues/11901
        // Attributes of a single child element that itself has children are kept.
        include_attributes_if_single_node {
            args: func_args![ value: r#"<root><node attr="value"><message>foo</message></node></root>"# ],
            want: Ok(value!({ "root": { "node": { "@attr": "value", "message": "foo" } } })),
            tdef: type_def(),
        }

        // https://github.com/vectordotdev/vector/issues/11901
        // Repeated sibling elements become an array; each entry keeps its attributes.
        include_attributes_multiple_children {
            args: func_args![ value: r#"<root><node attr="value"><message>bar</message></node><node attr="value"><message>baz</message></node></root>"#],
            want: Ok(value!({"root":{ "node":[ { "@attr": "value", "message": "bar" }, { "@attr": "value", "message": "baz" } ] } })),
            tdef: type_def(),
        }

        // Attributes and distinct child elements share one object.
        nested_object {
            args: func_args![ value: r#"<a attr="value"><b>one</b><c>two</c></a>"# ],
            want: Ok(value!({ "a": { "@attr": "value", "b": "one", "c": "two" } })),
            tdef: type_def(),
        }

        // Repeated child elements with text content become an array of strings.
        nested_object_array {
            args: func_args![ value: "<a><b>one</b><b>two</b></a>" ],
            want: Ok(value!({ "a": { "b": ["one", "two"] } })),
            tdef: type_def(),
        }

        // The XML declaration and comments (before, inside and after the root) do not appear
        // in the result.
        header_and_comments {
            args: func_args![ value: indoc!{r#"
                <?xml version="1.0" encoding="ISO-8859-1"?>
                <!-- Example found somewhere in the deep depths of the web -->
                <note>
                    <to>Tove</to>
                    <!-- Randomly inserted inner comment -->
                    <from>Jani</from>
                    <heading>Reminder</heading>
                    <body>Don't forget me this weekend!</body>
                </note>

                <!-- Could literally be placed anywhere -->
            "#}],
            want: Ok(value!(
                {
                    "note": {
                        "to": "Tove",
                        "from": "Jani",
                        "heading": "Reminder",
                        "body": "Don't forget me this weekend!"
                    }
                }
            )),
            tdef: type_def(),
        }

        // A `<?xml?>` token inside an element: the element's text is reported under the
        // `text` key.
        header_inside_element {
            args: func_args![ value: "<p><?xml?>text123</p>" ],
            want: Ok(value!(
                {
                    "p": {
                        "text": "text123"
                    }
                }
            )),
            tdef: type_def(),
        }

        // With no options given, "true"/"false", "null" and numeric text are converted to
        // typed values; an empty element becomes an empty object.
        mixed_types {
            args: func_args![ value: indoc!{r#"
                <?xml version="1.0" encoding="ISO-8859-1"?>
                <!-- Mixed types -->
                <data>
                    <!-- Booleans -->
                    <item>true</item>
                    <item>false</item>
                    <!-- String -->
                    <item>string!</item>
                    <!-- Empty object -->
                    <item />
                    <!-- Literal value "null" -->
                    <item>null</item>
                    <!-- Integer -->
                    <item>1</item>
                    <!-- Float -->
                    <item>1.0</item>
                </data>
            "#}],
            want: Ok(value!(
                {
                    "data": {
                        "item": [
                            true,
                            false,
                            "string!",
                            {},
                            null,
                            1,
                            1.0
                        ]
                    }
                }
            )),
            tdef: type_def(),
        }

        // Same document with `parse_null`, `parse_bool` and `parse_number` switched off:
        // every scalar stays a string.
        just_strings {
            args: func_args![ value: indoc!{r#"
                <?xml version="1.0" encoding="ISO-8859-1"?>
                <!-- All scalar types are just strings -->
                <data>
                    <item>true</item>
                    <item>false</item>
                    <item>string!</item>
                    <!-- Still an empty object -->
                    <item />
                    <item>null</item>
                    <item>1</item>
                    <item>1.0</item>
                </data>
            "#}, parse_null: false, parse_bool: false, parse_number: false],
            want: Ok(value!(
                {
                    "data": {
                        "item": [
                            "true",
                            "false",
                            "string!",
                            {},
                            "null",
                            "1",
                            "1.0"
                        ]
                    }
                }
            )),
            tdef: type_def(),
        }

        // With `trim: false`, whitespace-only text between elements is kept as `text` entries.
        untrimmed {
            args: func_args![ value: "<root>  <a>test</a>  </root>", trim: false ],
            want: Ok(value!(
                {
                    "root": {
                        "a": "test",
                        "text": ["  ", "  "],
                    }
                }
            )),
            tdef: type_def(),
        }

        // Input that is not XML yields a parse error.
        invalid_token {
            args: func_args![ value: "true" ],
            want: Err("unable to parse xml: unknown token at 1:1"),
            tdef: type_def(),
        }

        // A single nested child element stays an object (no array wrapping).
        flat_parent_property {
            args: func_args![ value: indoc!{r#"
                <?xml version="1.0" encoding="UTF-8"?>
                <MY_XML>
                  <property1>
                    <property1_a>a</property1_a>
                    <property1_b>b</property1_b>
                    <property1_c>c</property1_c>
                  </property1>
                  <property2>
                    <property2_object>
                      <property2a_a>a</property2a_a>
                      <property2a_b>b</property2a_b>
                      <property2a_c>c</property2a_c>
                    </property2_object>
                  </property2>
                </MY_XML>
            "#}],
            want: Ok(value!(
                {
                  "MY_XML": {
                    "property1": {
                      "property1_a": "a",
                      "property1_b": "b",
                      "property1_c": "c"
                    },
                    "property2": {
                      "property2_object": {
                        "property2a_a": "a",
                        "property2a_b": "b",
                        "property2a_c": "c"
                      }
                    }
                  }
                }
            )),
            tdef: type_def(),
        }

        // The same nested child element repeated twice becomes an array of objects.
        nested_parent_property {
            args: func_args![ value: indoc!{r#"
                <?xml version="1.0" encoding="UTF-8"?>
                <MY_XML>
                  <property1>
                    <property1_a>a</property1_a>
                    <property1_b>b</property1_b>
                    <property1_c>c</property1_c>
                  </property1>
                  <property2>
                    <property2_object>
                      <property2a_a>a</property2a_a>
                      <property2a_b>b</property2a_b>
                      <property2a_c>c</property2a_c>
                    </property2_object>
                    <property2_object>
                      <property2a_a>a</property2a_a>
                      <property2a_b>b</property2a_b>
                      <property2a_c>c</property2a_c>
                    </property2_object>
                  </property2>
                </MY_XML>
            "#}],
            want: Ok(value!(
                {
                  "MY_XML": {
                    "property1": {
                      "property1_a": "a",
                      "property1_b": "b",
                      "property1_c": "c"
                    },
                    "property2": {
                      "property2_object": [
                        {
                          "property2a_a": "a",
                          "property2a_b": "b",
                          "property2a_c": "c"
                        },
                        {
                          "property2a_a": "a",
                          "property2a_b": "b",
                          "property2a_c": "c"
                        }
                      ]
                    }
                  }
                }
            )),
            tdef: type_def(),
        }

        // An only child with text content.
        if_no_sibling {
            args: func_args![ value: "<root><a>test</a></root>"],
            want: Ok(value!({ "root": { "a": "test" } })),
            tdef: type_def(),
        }

        // A child with its own single child, next to a sibling holding text.
        if_no_sibling2 {
            args: func_args![ value: "<root><a><a1>test</a1></a><b>test2</b></root>"],
            want: Ok(value!({ "root": { "a": { "a1": "test" }, "b" : "test2" } })),
            tdef: type_def(),
        }
    ];

    // Checks the shape of the type definition reported by `ParseXmlFn::type_def`.
    #[test]
    fn test_kind() {
        let state = state::TypeState::default();

        // The argument expression is arbitrary here; `type_def` does not inspect it.
        let func = ParseXmlFn {
            value: value!(true).into_expression(),
            trim: None,
            include_attr: None,
            attr_prefix: None,
            text_key: None,
            always_use_text_key: None,
            parse_bool: None,
            parse_null: None,
            parse_number: None,
        };

        let type_def = func.type_def(&state);

        // Fallible, and a union of bytes and object rather than a single exact kind.
        assert!(type_def.is_fallible());
        assert!(!type_def.is_exact());
        assert!(type_def.contains_bytes());
        assert!(type_def.contains_object());

        // Top-level object: no known fields, unknown fields are objects.
        let object1 = type_def.as_object().unwrap();

        assert!(object1.known().is_empty());
        assert!(object1.unknown_kind().contains_object());

        // Second level: no known fields, unknown fields can be anything.
        let object2 = object1.unknown_kind().as_object().cloned().unwrap();

        assert!(object2.known().is_empty());
        assert!(object2.unknown_kind().is_any());
    }
}