vrl/parsing/xml.rs
1//! XML parsing utilities. Exports functions and configuration structs
2//! that are sufficient to process a `roxmltree::Node`.
3
4use crate::compiler::prelude::*;
5use regex::{Regex, RegexBuilder};
6// Re-export `roxmltree` to match the public API of `process_node`.
7use roxmltree::NodeType;
8pub use roxmltree::{Document, Node};
9use rust_decimal::prelude::Zero;
10use std::sync::LazyLock;
11use std::{
12 borrow::Cow,
13 collections::{BTreeMap, btree_map::Entry},
14};
15
16/// A lazily initialized regular expression that matches excess whitespace between XML/HTML tags.
17///
18/// This regex helps in cleaning up formatted or pretty-printed XML/HTML by removing unnecessary
19/// spaces, newlines, or indentation between tags. It specifically looks for occurrences where
20/// a `>` (closing tag or self-closing tag) is immediately followed by whitespace (spaces, tabs,
21/// or newlines) and then a `<` (opening tag).
22///
23/// ## Notes
24/// - This regex is compiled once and reused, improving performance.
25/// - The `multi_line(true)` flag (if used with `RegexBuilder`) ensures it applies across multiple lines.
26/// - This is particularly useful for XML minification or normalization before processing.
27pub static XML_RE: LazyLock<Regex> = LazyLock::new(|| {
28 RegexBuilder::new(r">\s+?<")
29 .multi_line(true)
30 .build()
31 .expect("trim regex failed")
32});
33
34pub static DEFAULT_TRIM: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
35pub static DEFAULT_INCLUDE_ATTR: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
36pub static DEFAULT_ATTR_PREFIX: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("@")));
37pub static DEFAULT_TEXT_KEY: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("text")));
38pub static DEFAULT_ALWAYS_USE_TEXT_KEY: LazyLock<Value> = LazyLock::new(|| Value::Boolean(false));
39pub static DEFAULT_PARSE_BOOL: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
40pub static DEFAULT_PARSE_NULL: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
41pub static DEFAULT_PARSE_NUMBER: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
42
/// Settings that control how a roxmltree `Node` is converted into a VRL value.
///
/// The "Default" noted on each field is the value `parse_xml` falls back to when the
/// corresponding option is not supplied; this struct itself does not implement `Default`.
#[derive(Clone, Debug)]
pub struct ParseXmlConfig<'a> {
    /// Whether XML attributes are emitted as keys of the element's object. Default: `true`.
    pub include_attr: bool,
    /// Prefix prepended to attribute names, e.g. `<a href="test">` -> `{a: { "@href": "test" }}`.
    /// Default: `"@"`.
    pub attr_prefix: Cow<'a, str>,
    /// Key under which an element's text is stored when the element is expanded into an
    /// object. Default: `"text"`.
    pub text_key: Cow<'a, str>,
    /// Always store text under the text key instead of flattening the element to its text
    /// value. Default: `false`.
    pub always_use_text_key: bool,
    /// Convert the texts `"true"` and `"false"` into booleans. Default: `true`.
    pub parse_bool: bool,
    /// Convert the text `"null"` (and empty text) into null. Default: `true`.
    pub parse_null: bool,
    /// Convert numeric text into integers/floats. Default: `true`.
    pub parse_number: bool,
}
62
63/// Used to keep Clippy's `too_many_argument` check happy.
64#[derive(Debug, Default)]
65pub struct ParseOptions {
66 pub trim: Option<Value>,
67 pub include_attr: Option<Value>,
68 pub attr_prefix: Option<Value>,
69 pub text_key: Option<Value>,
70 pub always_use_text_key: Option<Value>,
71 pub parse_bool: Option<Value>,
72 pub parse_null: Option<Value>,
73 pub parse_number: Option<Value>,
74}
75
76/// Parses an XML string into a structured `Resolved` format based on the provided `ParseOptions`.
77///
78/// This function processes an XML document, applying transformations and extracting elements
79/// according to the given parsing options.
80///
81/// # Parameters
82/// - `value`: A [`vrl::value::Value`](`crate::value::Value`) containing the XML string to be parsed.
83/// - `options`: A `ParseOptions` struct that defines parsing behaviors, including:
84/// - `trim`: Whether to remove excess whitespace between XML elements (default: `true`).
85/// - `include_attr`: Whether to include XML attributes in the output (default: `true`).
86/// - `attr_prefix`: The prefix used for attribute keys (default: `"@"`).
87/// - `text_key`: The key used for text content within an element (default: `"text"`).
88/// - `always_use_text_key`: Whether text values should always be wrapped in a text key (default: `false`).
89/// - `parse_bool`: Whether to attempt parsing boolean values (default: `true`).
90/// - `parse_null`: Whether to attempt parsing null values (default: `true`).
91/// - `parse_number`: Whether to attempt parsing numeric values (default: `true`).
92///
93/// # Returns
94/// - `Ok(Resolved)`: The structured representation of the parsed XML.
95/// - `Err(String)`: If XML parsing fails or an error occurs during processing.
96///
97/// # Errors
98/// - Returns an error if the input is not valid XML or if any step in processing fails.
99pub fn parse_xml(value: Value, options: ParseOptions) -> Resolved {
100 let string = value.try_bytes_utf8_lossy()?;
101 let trim = options
102 .trim
103 .unwrap_or_else(|| DEFAULT_TRIM.clone())
104 .try_boolean()?;
105 let include_attr = options
106 .include_attr
107 .unwrap_or_else(|| DEFAULT_INCLUDE_ATTR.clone())
108 .try_boolean()?;
109 let attr_prefix = Cow::from(
110 options
111 .attr_prefix
112 .unwrap_or_else(|| DEFAULT_ATTR_PREFIX.clone())
113 .try_bytes_utf8_lossy()?
114 .into_owned(),
115 );
116 let text_key = Cow::from(
117 options
118 .text_key
119 .unwrap_or_else(|| DEFAULT_TEXT_KEY.clone())
120 .try_bytes_utf8_lossy()?
121 .into_owned(),
122 );
123 let always_use_text_key = options
124 .always_use_text_key
125 .unwrap_or_else(|| DEFAULT_ALWAYS_USE_TEXT_KEY.clone())
126 .try_boolean()?;
127 let parse_bool = options
128 .parse_bool
129 .unwrap_or_else(|| DEFAULT_PARSE_BOOL.clone())
130 .try_boolean()?;
131 let parse_null = options
132 .parse_null
133 .unwrap_or_else(|| DEFAULT_PARSE_NULL.clone())
134 .try_boolean()?;
135 let parse_number = options
136 .parse_number
137 .unwrap_or_else(|| DEFAULT_PARSE_NUMBER.clone())
138 .try_boolean()?;
139 let config = ParseXmlConfig {
140 include_attr,
141 attr_prefix,
142 text_key,
143 always_use_text_key,
144 parse_bool,
145 parse_null,
146 parse_number,
147 };
148 // Trim whitespace around XML elements, if applicable.
149 let parse = if trim { trim_xml(&string) } else { string };
150 let doc = Document::parse(&parse).map_err(|e| format!("unable to parse xml: {e}"))?;
151 let value = process_node(doc.root(), &config);
152 Ok(value)
153}
154
155/// Process an XML `Node` and return a VRL `Value`.
156pub fn process_node(node: Node, config: &ParseXmlConfig) -> Value {
157 // Helper to recurse over a `Node`s children, and build an object.
158 let recurse = |node: Node| -> ObjectMap {
159 let mut map = BTreeMap::new();
160
161 // Expand attributes, if required.
162 if config.include_attr {
163 for attr in node.attributes() {
164 map.insert(
165 format!("{}{}", config.attr_prefix, attr.name()).into(),
166 attr.value().into(),
167 );
168 }
169 }
170
171 for n in node.children().filter(|n| n.is_element() || n.is_text()) {
172 let name = match n.node_type() {
173 NodeType::Element => n.tag_name().name().to_string().into(),
174 NodeType::Text => config.text_key.to_string().into(),
175 _ => unreachable!("shouldn't be other XML nodes"),
176 };
177
178 // Transform the node into a VRL `Value`.
179 let value = process_node(n, config);
180
181 // If the key already exists, add it. Otherwise, insert.
182 match map.entry(name) {
183 Entry::Occupied(mut entry) => {
184 let v = entry.get_mut();
185
186 // Push a value onto the existing array, or wrap in a `Value::Array`.
187 match v {
188 Value::Array(v) => v.push(value),
189 v => {
190 let prev = std::mem::replace(v, Value::Array(Vec::with_capacity(2)));
191 if let Value::Array(v) = v {
192 v.extend_from_slice(&[prev, value]);
193 }
194 }
195 };
196 }
197 Entry::Vacant(entry) => {
198 entry.insert(value);
199 }
200 }
201 }
202
203 map
204 };
205
206 match node.node_type() {
207 NodeType::Root => Value::Object(recurse(node)),
208
209 NodeType::Element => {
210 match (
211 config.always_use_text_key,
212 node.attributes().len().is_zero(),
213 ) {
214 // If the node has attributes, *always* recurse to expand default keys.
215 (_, false) if config.include_attr => Value::Object(recurse(node)),
216 // If a text key should be used, always recurse.
217 (true, true) => Value::Object(recurse(node)),
218 // Otherwise, check the node count to determine what to do.
219 _ => match node.children().count() {
220 // For a single node, 'flatten' the object if necessary.
221 1 => {
222 // Expect a single element.
223 let node = node.children().next().expect("expected 1 XML node");
224
225 // If the node is an element, treat it as an object.
226 if node.is_element() {
227 let mut map = BTreeMap::new();
228
229 map.insert(
230 node.tag_name().name().to_string().into(),
231 process_node(node, config),
232 );
233
234 Value::Object(map)
235 } else {
236 // Otherwise, 'flatten' the object by continuing processing.
237 process_node(node, config)
238 }
239 }
240 // For 2+ nodes, expand.
241 _ => Value::Object(recurse(node)),
242 },
243 }
244 }
245 NodeType::Text => process_text(node.text().expect("expected XML text node"), config),
246 _ => unreachable!("shouldn't be other XML nodes"),
247 }
248}
249
250/// Process a text node, and return the correct `Value` type based on config.
251fn process_text<'a>(text: &'a str, config: &ParseXmlConfig<'a>) -> Value {
252 match text {
253 // Parse nulls.
254 "" | "null" if config.parse_null => Value::Null,
255 // Parse bools.
256 "true" if config.parse_bool => true.into(),
257 "false" if config.parse_bool => false.into(),
258 // String numbers.
259 _ if !config.parse_number => text.into(),
260 // Parse numbers, falling back to string.
261 _ => {
262 // Attempt an integer first (effectively a subset of float).
263 if let Ok(v) = text.parse::<i64>() {
264 return v.into();
265 }
266
267 // Then a float.
268 if let Ok(v) = text.parse::<f64>() {
269 return Value::from_f64_or_zero(v);
270 }
271
272 // Fall back to string.
273 text.into()
274 }
275 }
276}
277
278#[inline]
279fn trim_xml(xml: &str) -> Cow<'_, str> {
280 XML_RE.replace_all(xml, "><")
281}