vrl/parsing/
xml.rs

1//! XML parsing utilities. Exports functions and configuration structs
2//! that are sufficient to process a `roxmltree::Node`.
3
4use crate::compiler::prelude::*;
5use regex::{Regex, RegexBuilder};
6// Re-export `roxmltree` to match the public API of `process_node`.
7use roxmltree::NodeType;
8pub use roxmltree::{Document, Node};
9use rust_decimal::prelude::Zero;
10use std::sync::LazyLock;
11use std::{
12    borrow::Cow,
13    collections::{BTreeMap, btree_map::Entry},
14};
15
16/// A lazily initialized regular expression that matches excess whitespace between XML/HTML tags.
17///
18/// This regex helps in cleaning up formatted or pretty-printed XML/HTML by removing unnecessary
19/// spaces, newlines, or indentation between tags. It specifically looks for occurrences where
20/// a `>` (closing tag or self-closing tag) is immediately followed by whitespace (spaces, tabs,
21/// or newlines) and then a `<` (opening tag).
22///
23/// ## Notes
24/// - This regex is compiled once and reused, improving performance.
25/// - The `multi_line(true)` flag (if used with `RegexBuilder`) ensures it applies across multiple lines.
26/// - This is particularly useful for XML minification or normalization before processing.
27pub static XML_RE: LazyLock<Regex> = LazyLock::new(|| {
28    RegexBuilder::new(r">\s+?<")
29        .multi_line(true)
30        .build()
31        .expect("trim regex failed")
32});
33
34pub static DEFAULT_TRIM: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
35pub static DEFAULT_INCLUDE_ATTR: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
36pub static DEFAULT_ATTR_PREFIX: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("@")));
37pub static DEFAULT_TEXT_KEY: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("text")));
38pub static DEFAULT_ALWAYS_USE_TEXT_KEY: LazyLock<Value> = LazyLock::new(|| Value::Boolean(false));
39pub static DEFAULT_PARSE_BOOL: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
40pub static DEFAULT_PARSE_NULL: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
41pub static DEFAULT_PARSE_NUMBER: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
42
/// Settings that control how a roxmltree `Node` is converted into a VRL `Value`
/// by `process_node`.
///
/// The defaults mentioned below are the ones `parse_xml` applies when the
/// corresponding option is not supplied.
#[derive(Clone, Debug)]
pub struct ParseXmlConfig<'a> {
    /// Whether XML attributes are added to the resulting object. Default: true.
    pub include_attr: bool,
    /// Prefix prepended to attribute names, e.g. `<a href="test">` ->
    /// `{a: { "@href": "test" }}`. Default: "@".
    pub attr_prefix: Cow<'a, str>,
    /// Key under which a text node is stored when an element is expanded into an
    /// object. Default: "text".
    pub text_key: Cow<'a, str>,
    /// Always store text under the text key (instead of flattening the element to
    /// its text value). Default: false.
    pub always_use_text_key: bool,
    /// Convert the text "true" or "false" to booleans. Default: true.
    pub parse_bool: bool,
    /// Convert empty text or the text "null" to null. Default: true.
    pub parse_null: bool,
    /// Convert numeric text to integers/floats. Default: true.
    pub parse_number: bool,
}
62
63/// Used to keep Clippy's `too_many_argument` check happy.
64#[derive(Debug, Default)]
65pub struct ParseOptions {
66    pub trim: Option<Value>,
67    pub include_attr: Option<Value>,
68    pub attr_prefix: Option<Value>,
69    pub text_key: Option<Value>,
70    pub always_use_text_key: Option<Value>,
71    pub parse_bool: Option<Value>,
72    pub parse_null: Option<Value>,
73    pub parse_number: Option<Value>,
74}
75
76/// Parses an XML string into a structured `Resolved` format based on the provided `ParseOptions`.
77///
78/// This function processes an XML document, applying transformations and extracting elements
79/// according to the given parsing options.
80///
81/// # Parameters
82/// - `value`: A [`vrl::value::Value`](`crate::value::Value`) containing the XML string to be parsed.
83/// - `options`: A `ParseOptions` struct that defines parsing behaviors, including:
84///   - `trim`: Whether to remove excess whitespace between XML elements (default: `true`).
85///   - `include_attr`: Whether to include XML attributes in the output (default: `true`).
86///   - `attr_prefix`: The prefix used for attribute keys (default: `"@"`).
87///   - `text_key`: The key used for text content within an element (default: `"text"`).
88///   - `always_use_text_key`: Whether text values should always be wrapped in a text key (default: `false`).
89///   - `parse_bool`: Whether to attempt parsing boolean values (default: `true`).
90///   - `parse_null`: Whether to attempt parsing null values (default: `true`).
91///   - `parse_number`: Whether to attempt parsing numeric values (default: `true`).
92///
93/// # Returns
94/// - `Ok(Resolved)`: The structured representation of the parsed XML.
95/// - `Err(String)`: If XML parsing fails or an error occurs during processing.
96///
97/// # Errors
98/// - Returns an error if the input is not valid XML or if any step in processing fails.
99pub fn parse_xml(value: Value, options: ParseOptions) -> Resolved {
100    let string = value.try_bytes_utf8_lossy()?;
101    let trim = options
102        .trim
103        .unwrap_or_else(|| DEFAULT_TRIM.clone())
104        .try_boolean()?;
105    let include_attr = options
106        .include_attr
107        .unwrap_or_else(|| DEFAULT_INCLUDE_ATTR.clone())
108        .try_boolean()?;
109    let attr_prefix = Cow::from(
110        options
111            .attr_prefix
112            .unwrap_or_else(|| DEFAULT_ATTR_PREFIX.clone())
113            .try_bytes_utf8_lossy()?
114            .into_owned(),
115    );
116    let text_key = Cow::from(
117        options
118            .text_key
119            .unwrap_or_else(|| DEFAULT_TEXT_KEY.clone())
120            .try_bytes_utf8_lossy()?
121            .into_owned(),
122    );
123    let always_use_text_key = options
124        .always_use_text_key
125        .unwrap_or_else(|| DEFAULT_ALWAYS_USE_TEXT_KEY.clone())
126        .try_boolean()?;
127    let parse_bool = options
128        .parse_bool
129        .unwrap_or_else(|| DEFAULT_PARSE_BOOL.clone())
130        .try_boolean()?;
131    let parse_null = options
132        .parse_null
133        .unwrap_or_else(|| DEFAULT_PARSE_NULL.clone())
134        .try_boolean()?;
135    let parse_number = options
136        .parse_number
137        .unwrap_or_else(|| DEFAULT_PARSE_NUMBER.clone())
138        .try_boolean()?;
139    let config = ParseXmlConfig {
140        include_attr,
141        attr_prefix,
142        text_key,
143        always_use_text_key,
144        parse_bool,
145        parse_null,
146        parse_number,
147    };
148    // Trim whitespace around XML elements, if applicable.
149    let parse = if trim { trim_xml(&string) } else { string };
150    let doc = Document::parse(&parse).map_err(|e| format!("unable to parse xml: {e}"))?;
151    let value = process_node(doc.root(), &config);
152    Ok(value)
153}
154
155/// Process an XML `Node` and return a VRL `Value`.
156pub fn process_node(node: Node, config: &ParseXmlConfig) -> Value {
157    // Helper to recurse over a `Node`s children, and build an object.
158    let recurse = |node: Node| -> ObjectMap {
159        let mut map = BTreeMap::new();
160
161        // Expand attributes, if required.
162        if config.include_attr {
163            for attr in node.attributes() {
164                map.insert(
165                    format!("{}{}", config.attr_prefix, attr.name()).into(),
166                    attr.value().into(),
167                );
168            }
169        }
170
171        for n in node.children().filter(|n| n.is_element() || n.is_text()) {
172            let name = match n.node_type() {
173                NodeType::Element => n.tag_name().name().to_string().into(),
174                NodeType::Text => config.text_key.to_string().into(),
175                _ => unreachable!("shouldn't be other XML nodes"),
176            };
177
178            // Transform the node into a VRL `Value`.
179            let value = process_node(n, config);
180
181            // If the key already exists, add it. Otherwise, insert.
182            match map.entry(name) {
183                Entry::Occupied(mut entry) => {
184                    let v = entry.get_mut();
185
186                    // Push a value onto the existing array, or wrap in a `Value::Array`.
187                    match v {
188                        Value::Array(v) => v.push(value),
189                        v => {
190                            let prev = std::mem::replace(v, Value::Array(Vec::with_capacity(2)));
191                            if let Value::Array(v) = v {
192                                v.extend_from_slice(&[prev, value]);
193                            }
194                        }
195                    };
196                }
197                Entry::Vacant(entry) => {
198                    entry.insert(value);
199                }
200            }
201        }
202
203        map
204    };
205
206    match node.node_type() {
207        NodeType::Root => Value::Object(recurse(node)),
208
209        NodeType::Element => {
210            match (
211                config.always_use_text_key,
212                node.attributes().len().is_zero(),
213            ) {
214                // If the node has attributes, *always* recurse to expand default keys.
215                (_, false) if config.include_attr => Value::Object(recurse(node)),
216                // If a text key should be used, always recurse.
217                (true, true) => Value::Object(recurse(node)),
218                // Otherwise, check the node count to determine what to do.
219                _ => match node.children().count() {
220                    // For a single node, 'flatten' the object if necessary.
221                    1 => {
222                        // Expect a single element.
223                        let node = node.children().next().expect("expected 1 XML node");
224
225                        // If the node is an element, treat it as an object.
226                        if node.is_element() {
227                            let mut map = BTreeMap::new();
228
229                            map.insert(
230                                node.tag_name().name().to_string().into(),
231                                process_node(node, config),
232                            );
233
234                            Value::Object(map)
235                        } else {
236                            // Otherwise, 'flatten' the object by continuing processing.
237                            process_node(node, config)
238                        }
239                    }
240                    // For 2+ nodes, expand.
241                    _ => Value::Object(recurse(node)),
242                },
243            }
244        }
245        NodeType::Text => process_text(node.text().expect("expected XML text node"), config),
246        _ => unreachable!("shouldn't be other XML nodes"),
247    }
248}
249
250/// Process a text node, and return the correct `Value` type based on config.
251fn process_text<'a>(text: &'a str, config: &ParseXmlConfig<'a>) -> Value {
252    match text {
253        // Parse nulls.
254        "" | "null" if config.parse_null => Value::Null,
255        // Parse bools.
256        "true" if config.parse_bool => true.into(),
257        "false" if config.parse_bool => false.into(),
258        // String numbers.
259        _ if !config.parse_number => text.into(),
260        // Parse numbers, falling back to string.
261        _ => {
262            // Attempt an integer first (effectively a subset of float).
263            if let Ok(v) = text.parse::<i64>() {
264                return v.into();
265            }
266
267            // Then a float.
268            if let Ok(v) = text.parse::<f64>() {
269                return Value::from_f64_or_zero(v);
270            }
271
272            // Fall back to string.
273            text.into()
274        }
275    }
276}
277
278#[inline]
279fn trim_xml(xml: &str) -> Cow<'_, str> {
280    XML_RE.replace_all(xml, "><")
281}