vrl/stdlib/
parse_json.rs

1use std::collections::HashMap;
2
3use serde_json::{
4    Error, Map,
5    value::{RawValue, Value as JsonValue},
6};
7
8use crate::compiler::prelude::*;
9use crate::stdlib::json_utils::bom::StripBomFromUTF8;
10use crate::stdlib::json_utils::json_type_def::json_type_def;
11use std::sync::LazyLock;
12
/// Default for the optional `lossy` argument: `true`. Used both as the documented parameter
/// default and as the fallback when the argument is omitted at resolve time.
static DEFAULT_LOSSY: LazyLock<Value> = LazyLock::new(|| Value::Boolean(true));
14
15static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
16    vec![
17        Parameter::required(
18            "value",
19            kind::BYTES,
20            "The string representation of the JSON to parse.",
21        ),
22        Parameter::optional(
23            "max_depth",
24            kind::INTEGER,
25            "Number of layers to parse for nested JSON-formatted documents.
26The value must be in the range of 1 to 128.",
27        ),
28        Parameter::optional(
29            "lossy",
30            kind::BOOLEAN,
31            "Whether to parse the JSON in a lossy manner. Replaces invalid UTF-8 characters
32with the Unicode character `�` (U+FFFD) if set to true, otherwise returns an error
33if there are any invalid UTF-8 characters present.",
34        )
35        .default(&DEFAULT_LOSSY),
36    ]
37});
38
39fn parse_json(value: Value, lossy: Value) -> Resolved {
40    let lossy = lossy.try_boolean()?;
41    Ok(if lossy {
42        serde_json::from_str(value.try_bytes_utf8_lossy()?.strip_bom())
43    } else {
44        serde_json::from_slice(value.try_bytes()?.strip_bom())
45    }
46    .map_err(|e| format!("unable to parse json: {e}"))?)
47}
48
49// parse_json_with_depth method recursively traverses the value and returns raw JSON-formatted bytes
50// after reaching provided depth.
51fn parse_json_with_depth(value: Value, max_depth: Value, lossy: Value) -> Resolved {
52    let parsed_depth = validate_depth(max_depth)?;
53    let lossy = lossy.try_boolean()?;
54    let bytes = if lossy {
55        value.try_bytes_utf8_lossy()?.into_owned().into()
56    } else {
57        value.try_bytes()?
58    };
59
60    let raw_value = serde_json::from_slice::<'_, &RawValue>(&bytes)
61        .map_err(|e| format!("unable to read json: {e}"))?;
62
63    let res = parse_layer(raw_value, parsed_depth)
64        .map_err(|e| format!("unable to parse json with max depth: {e}"))?;
65
66    Ok(Value::from(res))
67}
68
69fn parse_layer(value: &RawValue, remaining_depth: u8) -> std::result::Result<JsonValue, Error> {
70    let raw_value = value.get();
71
72    // RawValue is a JSON object.
73    if raw_value.starts_with('{') {
74        if remaining_depth == 0 {
75            // If max_depth is reached, return the raw representation of the JSON object,
76            // e.g., "{\"key\":\"value\"}"
77            serde_json::value::to_value(raw_value)
78        } else {
79            // Parse each value of the object as a raw JSON value recursively with the same method.
80            let map: HashMap<String, &RawValue> = serde_json::from_str(raw_value)?;
81
82            let mut res_map: Map<String, JsonValue> = Map::with_capacity(map.len());
83            for (k, v) in map {
84                res_map.insert(k, parse_layer(v, remaining_depth - 1)?);
85            }
86            Ok(serde_json::Value::from(res_map))
87        }
88    // RawValue is a JSON array.
89    } else if raw_value.starts_with('[') {
90        if remaining_depth == 0 {
91            // If max_depth is reached, return the raw representation of the JSON array,
92            // e.g., "[\"one\",\"two\",\"three\"]"
93            serde_json::value::to_value(raw_value)
94        } else {
95            // Parse all values of the array as a raw JSON value recursively with the same method.
96            let arr: Vec<&RawValue> = serde_json::from_str(raw_value)?;
97
98            let mut res_arr: Vec<JsonValue> = Vec::with_capacity(arr.len());
99            for v in arr {
100                res_arr.push(parse_layer(v, remaining_depth - 1)?);
101            }
102            Ok(serde_json::Value::from(res_arr))
103        }
104    // RawValue is not an object or array, do not need to traverse the doc further.
105    // Parse and return the value.
106    } else {
107        serde_json::from_str(raw_value)
108    }
109}
110
111fn validate_depth(value: Value) -> ExpressionResult<u8> {
112    let res = value.try_integer()?;
113    let res = u8::try_from(res).map_err(|e| e.to_string())?;
114
115    // The lower cap is 1 because it is pointless to use anything lower,
116    // because 'data = parse_json!(.message, max_depth: 0)' equals to 'data = .message'.
117    //
118    // The upper cap is 128 because serde_json has the same recursion limit by default.
119    // https://github.com/serde-rs/json/blob/4d57ebeea8d791b8a51c229552d2d480415d00e6/json/src/de.rs#L111
120    if (1..=128).contains(&res) {
121        Ok(res)
122    } else {
123        Err(ExpressionError::from(format!(
124            "max_depth value should be greater than 0 and less than 128, got {res}"
125        )))
126    }
127}
128
129#[derive(Clone, Copy, Debug)]
130pub struct ParseJson;
131
132impl Function for ParseJson {
133    fn identifier(&self) -> &'static str {
134        "parse_json"
135    }
136
137    fn summary(&self) -> &'static str {
138        "parse a string to a JSON type"
139    }
140
141    fn usage(&self) -> &'static str {
142        indoc! {"
143            Parses the provided `value` as JSON.
144
145            Only JSON types are returned. If you need to convert a `string` into a `timestamp`,
146            consider the `parse_timestamp` function.
147        "}
148    }
149
150    fn category(&self) -> &'static str {
151        Category::Parse.as_ref()
152    }
153
154    fn internal_failure_reasons(&self) -> &'static [&'static str] {
155        &["`value` is not a valid JSON-formatted payload."]
156    }
157
158    fn return_kind(&self) -> u16 {
159        kind::BOOLEAN
160            | kind::INTEGER
161            | kind::FLOAT
162            | kind::BYTES
163            | kind::OBJECT
164            | kind::ARRAY
165            | kind::NULL
166    }
167
168    fn notices(&self) -> &'static [&'static str] {
169        &[indoc! {"
170            Only JSON types are returned. If you need to convert a `string` into a `timestamp`,
171            consider the [`parse_timestamp`](#parse_timestamp) function.
172        "}]
173    }
174
175    fn parameters(&self) -> &'static [Parameter] {
176        PARAMETERS.as_slice()
177    }
178
179    fn examples(&self) -> &'static [Example] {
180        &[
181            example! {
182                title: "Parse JSON",
183                source: r#"parse_json!(s'{"key": "val"}')"#,
184                result: Ok(r#"{ "key": "val" }"#),
185            },
186            example! {
187                title: "Parse JSON array",
188                source: r#"parse_json!("[true, 0]")"#,
189                result: Ok("[true, 0]"),
190            },
191            example! {
192                title: "Parse JSON string",
193                source: r#"parse_json!(s'"hello"')"#,
194                result: Ok("hello"),
195            },
196            example! {
197                title: "Parse JSON integer",
198                source: r#"parse_json!("42")"#,
199                result: Ok("42"),
200            },
201            example! {
202                title: "Parse JSON float",
203                source: r#"parse_json!("42.13")"#,
204                result: Ok("42.13"),
205            },
206            example! {
207                title: "Parse JSON boolean",
208                source: r#"parse_json!("false")"#,
209                result: Ok("false"),
210            },
211            example! {
212                title: "Invalid JSON value",
213                source: r#"parse_json!("{ INVALID }")"#,
214                result: Err(
215                    r#"function call error for "parse_json" at (0:26): unable to parse json: key must be a string at line 1 column 3"#,
216                ),
217            },
218            example! {
219                title: "Parse JSON with max_depth",
220                source: r#"parse_json!(s'{"first_level":{"second_level":"finish"}}', max_depth: 1)"#,
221                result: Ok(r#"{"first_level":"{\"second_level\":\"finish\"}"}"#),
222            },
223        ]
224    }
225
226    fn compile(
227        &self,
228        _state: &state::TypeState,
229        _ctx: &mut FunctionCompileContext,
230        arguments: ArgumentList,
231    ) -> Compiled {
232        let value = arguments.required("value");
233        let max_depth = arguments.optional("max_depth");
234        let lossy = arguments.optional("lossy");
235
236        match max_depth {
237            Some(max_depth) => Ok(ParseJsonMaxDepthFn {
238                value,
239                max_depth,
240                lossy,
241            }
242            .as_expr()),
243            None => Ok(ParseJsonFn { value, lossy }.as_expr()),
244        }
245    }
246}
247
248#[derive(Debug, Clone)]
249struct ParseJsonFn {
250    value: Box<dyn Expression>,
251    lossy: Option<Box<dyn Expression>>,
252}
253
254impl FunctionExpression for ParseJsonFn {
255    fn resolve(&self, ctx: &mut Context) -> Resolved {
256        let value = self.value.resolve(ctx)?;
257        let lossy = self
258            .lossy
259            .map_resolve_with_default(ctx, || DEFAULT_LOSSY.clone())?;
260        parse_json(value, lossy)
261    }
262
263    fn type_def(&self, _: &state::TypeState) -> TypeDef {
264        json_type_def()
265    }
266}
267
268#[derive(Debug, Clone)]
269struct ParseJsonMaxDepthFn {
270    value: Box<dyn Expression>,
271    max_depth: Box<dyn Expression>,
272    lossy: Option<Box<dyn Expression>>,
273}
274
275impl FunctionExpression for ParseJsonMaxDepthFn {
276    fn resolve(&self, ctx: &mut Context) -> Resolved {
277        let value = self.value.resolve(ctx)?;
278        let max_depth = self.max_depth.resolve(ctx)?;
279        let lossy = self
280            .lossy
281            .map_resolve_with_default(ctx, || DEFAULT_LOSSY.clone())?;
282        parse_json_with_depth(value, max_depth, lossy)
283    }
284
285    fn type_def(&self, _: &state::TypeState) -> TypeDef {
286        json_type_def()
287    }
288}
289
// Tests for `parse_json`: plain parsing, `max_depth` handling, integer/float edge cases,
// lossy vs. strict UTF-8 handling, and BOM stripping.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;

    test_function![
        parse_json => ParseJson;

        // Basic object parsing.
        parses {
            args: func_args![ value: r#"{"field": "value"}"# ],
            want: Ok(value!({ field: "value" })),
            tdef: json_type_def(),
        }

        // Nested object containing every JSON type except null and float.
        complex_json {
            args: func_args![ value: r#"{"object": {"string":"value","number":42,"array":["hello","world"],"boolean":false}}"# ],
            want: Ok(value!({ object: {string: "value", number: 42, array: ["hello", "world"], boolean: false} })),
            tdef: json_type_def(),
        }

        // Truncated document: the parser error is surfaced with the "unable to parse json" prefix.
        invalid_json_errors {
            args: func_args![ value: r#"{"field": "value"# ],
            want: Err("unable to parse json: EOF while parsing a string at line 1 column 16"),
            tdef: json_type_def(),
        }

        // With max_depth 1 the nested object is returned as its raw JSON text.
        max_depth {
            args: func_args![ value: r#"{"top_layer": {"layer_one": "finish", "layer_two": 2}}"#, max_depth: 1],
            want: Ok(value!({ top_layer: r#"{"layer_one": "finish", "layer_two": 2}"# })),
            tdef: json_type_def(),
        }

        // Arrays count as a layer too: the array and its object are parsed, the rest stays raw.
        max_depth_array {
            args: func_args![ value: r#"[{"top_layer": {"next_layer": ["finish"]}}]"#, max_depth: 2],
            want: Ok(value!([{ top_layer: r#"{"next_layer": ["finish"]}"# }])),
            tdef: json_type_def(),
        }

        // A max_depth larger than the document's nesting parses everything.
        max_depth_exceeds_layers {
            args: func_args![ value: r#"{"top_layer": {"layer_one": "finish", "layer_two": 2}}"#, max_depth: 10],
            want: Ok(value!({ top_layer: {layer_one: "finish", layer_two: 2} })),
            tdef: json_type_def(),
        }

        // Invalid documents on the max_depth path use the "unable to read json" prefix.
        invalid_json_with_max_depth {
            args: func_args![ value: r#"{"field": "value"#, max_depth: 3 ],
            want: Err("unable to read json: EOF while parsing a string at line 1 column 16"),
            tdef: json_type_def(),
        }

        // 129 is just above the accepted upper bound of 128.
        invalid_input_max_depth {
            args: func_args![ value: r#"{"top_layer": "finish"}"#, max_depth: 129],
            want: Err("max_depth value should be greater than 0 and less than 128, got 129"),
            tdef: json_type_def(),
        }

        // TODO: provide a function version of the `test_function!` macro.
        max_int {
            args: func_args![ value: format!("{{\"num\": {}}}", i64::MAX - 1)],
            want: Ok(value!({"num": 9_223_372_036_854_775_806_i64})),
            tdef: json_type_def(),
        }

        // 9223372036854775808 is i64::MAX + 1; the expected result is a float.
        lossy_float_conversion {
            args: func_args![ value: r#"{"num": 9223372036854775808}"#],
            want: Ok(value!({"num": 9.223_372_036_854_776e18})),
            tdef: json_type_def(),
        }

        // Checks that the parsing uses the default lossy argument value
        parse_invalid_utf8_default_lossy_arg {
            // 0x22 is a quote character
            // 0xf5 is out of the range of valid UTF-8 bytes
            args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22])],
            want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())),
            tdef: json_type_def(),
        }

        parse_invalid_utf8_lossy_arg_true {
            // 0xf5 is out of the range of valid UTF-8 bytes
            args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: true],
            // U+FFFD is the replacement character for invalid UTF-8
            want: Ok(value!(std::char::REPLACEMENT_CHARACTER.to_string())),
            tdef: json_type_def(),
        }

        // With lossy disabled the same invalid byte is reported as an error.
        invalid_utf8_json_lossy_arg_false {
            args: func_args![ value: Bytes::from_static(&[0x22,0xf5,0x22]), lossy: false],
            want: Err("unable to parse json: invalid unicode code point at line 1 column 3"),
            tdef: json_type_def(),
        }

        json_bom {
            // 0xef,0xbb,0xbf are the UTF-8 BOM markers and 0x7b,0x7d are just {}
            args: func_args![ value: Bytes::from_static(&[0xef,0xbb,0xbf,0x7b,0x7d]), lossy: false],
            want: Ok(value!({})),
            tdef: json_type_def(),
        }

        // Same BOM-prefixed input as above, through the lossy path.
        json_bom_lossy {
            args: func_args![ value: Bytes::from_static(&[0xef,0xbb,0xbf,0x7b,0x7d]), lossy: true],
            want: Ok(value!({})),
            tdef: json_type_def(),
        }
    ];

    // Without the `float_roundtrip` feature the parsed float differs in its last digit.
    #[cfg(not(feature = "float_roundtrip"))]
    test_function![
        parse_json => ParseJson;

        no_roundtrip_float_conversion {
            args: func_args![ value: r#"{"num": 1626175065.5934923}"#],
            want: Ok(value!({"num": 1_626_175_065.593_492_5})),
            tdef: json_type_def(),
        }
    ];

    // With the `float_roundtrip` feature the parsed float matches the input digits.
    #[cfg(feature = "float_roundtrip")]
    test_function![
        parse_json => ParseJson;

        roundtrip_float_conversion {
            args: func_args![ value: r#"{"num": 1626175065.5934923}"#],
            want: Ok(value!({"num": 1_626_175_065.593_492_3})),
            tdef: json_type_def(),
        }
    ];
}