vrl/stdlib/
validate_json_schema.rs

1use super::util::example_path_or_basename;
2use crate::compiler::prelude::*;
3#[cfg(not(target_arch = "wasm32"))]
4use std::path::PathBuf;
5use std::sync::LazyLock;
6
7// This needs to be static because validate_json_schema needs to read a file
8// and the file path needs to be a literal.
9static EXAMPLE_JSON_SCHEMA_VALID_EMAIL: LazyLock<&str> = LazyLock::new(|| {
10    let path =
11        example_path_or_basename("jsonschema/validate_json_schema/schema_with_email_format.json");
12
13    Box::leak(
14        format!(
15            r#"validate_json_schema!(s'{{ "productUser": "valid@email.com" }}', "{path}", false)"#
16        )
17        .into_boxed_str(),
18    )
19});
20
21static EXAMPLE_JSON_SCHEMA_INVALID_EMAIL: LazyLock<&str> = LazyLock::new(|| {
22    let path =
23        example_path_or_basename("jsonschema/validate_json_schema/schema_with_email_format.json");
24
25    Box::leak(
26        format!(
27            r#"validate_json_schema!(s'{{ "productUser": "invalidEmail" }}', "{path}", false)"#
28        )
29        .into_boxed_str(),
30    )
31});
32
33static EXAMPLE_JSON_SCHEMA_CUSTOM_FORMAT_FALSE: LazyLock<&str> = LazyLock::new(|| {
34    let path =
35        example_path_or_basename("jsonschema/validate_json_schema/schema_with_custom_format.json");
36
37    Box::leak(
38        format!(r#"validate_json_schema!(s'{{ "productUser": "a-custom-formatted-string" }}', "{path}", false)"#)
39            .into_boxed_str(),
40    )
41});
42
43static EXAMPLE_JSON_SCHEMA_CUSTOM_FORMAT_TRUE: LazyLock<&str> = LazyLock::new(|| {
44    let path =
45        example_path_or_basename("jsonschema/validate_json_schema/schema_with_custom_format.json");
46
47    Box::leak(
48        format!(r#"validate_json_schema!(s'{{ "productUser": "a-custom-formatted-string" }}', "{path}", true)"#)
49            .into_boxed_str(),
50    )
51});
52
53static EXAMPLES: LazyLock<Vec<Example>> = LazyLock::new(|| {
54    vec![
55        example! {
56            title: "Payload contains a valid email",
57            source: &EXAMPLE_JSON_SCHEMA_VALID_EMAIL,
58            result: Ok("true"),
59        },
60        example! {
61            title: "Payload contains an invalid email",
62            source: &EXAMPLE_JSON_SCHEMA_INVALID_EMAIL,
63            result: Err(Box::leak(
64                format!(
65                    r#"function call error for "validate_json_schema" at (0:{}): JSON schema validation failed: "invalidEmail" is not a "email" at /productUser"#,
66                    EXAMPLE_JSON_SCHEMA_INVALID_EMAIL.len()
67                )
68                .into_boxed_str(),
69            )),
70        },
71        example! {
72            title: "Payload contains a custom format declaration",
73            source: &EXAMPLE_JSON_SCHEMA_CUSTOM_FORMAT_FALSE,
74            result: Err(Box::leak(
75                format!(
76                    r#"function call error for "validate_json_schema" at (0:{}): Failed to compile schema: Unknown format: 'my-custom-format'. Adjust configuration to ignore unrecognized formats"#,
77                    EXAMPLE_JSON_SCHEMA_CUSTOM_FORMAT_FALSE.len()
78                )
79                .into_boxed_str(),
80            )),
81        },
82        example! {
83            title: "Payload contains a custom format declaration, with ignore_unknown_formats set to true",
84            source: &EXAMPLE_JSON_SCHEMA_CUSTOM_FORMAT_TRUE,
85            result: Ok("true"),
86        },
87    ]
88});
89
90#[cfg(not(target_arch = "wasm32"))]
91use non_wasm::ValidateJsonSchemaFn;
/// Marker type whose `Function` impl provides the `validate_json_schema` function.
#[derive(Clone, Copy, Debug)]
pub struct ValidateJsonSchema;
94
95impl Function for ValidateJsonSchema {
96    fn identifier(&self) -> &'static str {
97        "validate_json_schema"
98    }
99
100    fn usage(&self) -> &'static str {
101        "Check if `value` conforms to a JSON Schema definition. This function validates a JSON payload against a JSON Schema definition. It can be used to ensure that the data structure and types in `value` match the expectations defined in `schema_definition`."
102    }
103
104    fn category(&self) -> &'static str {
105        Category::Type.as_ref()
106    }
107
108    fn internal_failure_reasons(&self) -> &'static [&'static str] {
109        &[
110            "`value` is not a valid JSON Schema payload.",
111            "`value` contains custom format declarations and `ignore_unknown_formats` has not been set to `true`.",
112            "`schema_definition` is not a valid JSON Schema definition.",
113            "`schema_definition` file does not exist.",
114        ]
115    }
116
117    fn return_kind(&self) -> u16 {
118        kind::BOOLEAN
119    }
120
121    fn return_rules(&self) -> &'static [&'static str] {
122        &[
123            "Returns `true` if `value` conforms to the JSON Schema definition.",
124            "Returns `false` if `value` does not conform to the JSON Schema definition.",
125        ]
126    }
127
128    fn notices(&self) -> &'static [&'static str] {
129        &[indoc! {"
130            This function uses a compiled schema cache. The first time it is called with a specific
131            `schema_definition`, it will compile the schema and cache it for subsequent calls. This
132            improves performance when validating multiple values against the same schema. The cache
133            implementation is fairly naive and does not support refreshing the schema if it changes.
134            If you update the schema definition file, you must restart Vector to clear the cache.
135        "}]
136    }
137
138    fn examples(&self) -> &'static [Example] {
139        EXAMPLES.as_slice()
140    }
141
142    fn parameters(&self) -> &'static [Parameter] {
143        const PARAMETERS: &[Parameter] = &[
144            Parameter::required(
145                "value",
146                kind::BYTES,
147                "The value to check if it conforms to the JSON schema definition.",
148            ),
149            Parameter::required(
150                "schema_definition",
151                kind::BYTES,
152                "The location (path) of the JSON Schema definition.",
153            ),
154            Parameter::optional(
155                "ignore_unknown_formats",
156                kind::BOOLEAN,
157                "Unknown formats can be silently ignored by setting this to `true` and validation continues without failing due to those fields.",
158            ),
159        ];
160        PARAMETERS
161    }
162
163    #[cfg(not(target_arch = "wasm32"))]
164    fn compile(
165        &self,
166        state: &state::TypeState,
167        _ctx: &mut FunctionCompileContext,
168        arguments: ArgumentList,
169    ) -> Compiled {
170        let value = arguments.required("value");
171        let schema_definition = arguments.required_literal("schema_definition", state)?;
172        let ignore_unknown_formats = arguments
173            .optional("ignore_unknown_formats")
174            .unwrap_or(expr!(false));
175
176        let schema_file_str = schema_definition
177            .try_bytes_utf8_lossy()
178            .expect("schema definition file must be a string");
179
180        let schema_file_path = std::path::Path::new(schema_file_str.as_ref());
181
182        Ok(ValidateJsonSchemaFn {
183            value,
184            schema_path: PathBuf::from(schema_file_path),
185            ignore_unknown_formats,
186        }
187        .as_expr())
188    }
189
190    #[cfg(target_arch = "wasm32")]
191    fn compile(
192        &self,
193        _state: &state::TypeState,
194        ctx: &mut FunctionCompileContext,
195        _arguments: ArgumentList,
196    ) -> Compiled {
197        Ok(super::WasmUnsupportedFunction::new(ctx.span(), TypeDef::bytes().fallible()).as_expr())
198    }
199}
200
201#[cfg(not(target_arch = "wasm32"))]
202mod non_wasm {
203    use super::{
204        Context, Expression, FunctionExpression, Resolved, TypeDef, VrlValueConvert, state,
205    };
206    use crate::prelude::ExpressionError;
207    use crate::stdlib::json_utils::bom::StripBomFromUTF8;
208    use crate::value;
209    use jsonschema;
210    use std::collections::HashMap;
211    use std::path::{Path, PathBuf};
212    use std::sync::{Arc, LazyLock, RwLock};
213
214    // Global cache for compiled schema validators, this allows us to reuse the compiled
215    // schema across multiple calls to the function, which is important for performance.
216    static SCHEMA_CACHE: LazyLock<RwLock<HashMap<PathBuf, Arc<jsonschema::Validator>>>> =
217        LazyLock::new(|| RwLock::new(HashMap::new()));
218
219    #[derive(Debug, Clone)]
220    pub(super) struct ValidateJsonSchemaFn {
221        pub(super) value: Box<dyn Expression>,
222        pub(super) schema_path: PathBuf, // Path to the schema file, also used as cache key
223        pub(super) ignore_unknown_formats: Box<dyn Expression>,
224    }
225
226    impl FunctionExpression for ValidateJsonSchemaFn {
227        fn resolve(&self, ctx: &mut Context) -> Resolved {
228            let value = self.value.resolve(ctx)?;
229            let ignore_unknown_formats = self.ignore_unknown_formats.resolve(ctx)?.try_boolean()?;
230
231            // Get bytes without extra allocation if possible
232            let bytes = value.try_bytes()?;
233            let stripped_bytes = bytes.strip_bom();
234
235            // Quick empty check
236            if bytes.is_empty() {
237                return Err(ExpressionError::from("Empty JSON value")); // Empty JSON is typically invalid
238            }
239
240            // Fast path: check if it's valid JSON first (cheaper than full parsing)
241            let json_value = if stripped_bytes.is_empty() {
242                serde_json::Value::Null
243            } else {
244                serde_json::from_slice(stripped_bytes).map_err(|e| format!("Invalid JSON: {e}"))?
245            };
246
247            let schema_validator =
248                get_or_compile_schema(&self.schema_path, ignore_unknown_formats)?;
249
250            let validation_errors = schema_validator
251                .iter_errors(&json_value)
252                .map(|e| {
253                    format!(
254                        "{} at {}",
255                        e,
256                        if e.instance_path().as_str().is_empty() {
257                            "/"
258                        } else {
259                            e.instance_path().as_str()
260                        }
261                    )
262                })
263                .collect::<Vec<String>>()
264                .join(", ");
265
266            if validation_errors.is_empty() {
267                Ok(value!(true))
268            } else {
269                Err(ExpressionError::from(format!(
270                    "JSON schema validation failed: {validation_errors}"
271                )))
272            }
273        }
274
275        fn type_def(&self, _: &state::TypeState) -> TypeDef {
276            TypeDef::boolean().fallible()
277        }
278    }
279
280    // Reads the JSON schema definition from a file and returns it as a serde_json::Value.
281    // Returns an error if the file cannot be read or parsed.
282    // The path must be a literal string.
283    // This function is used to load the schema definition for the validate_json_schema function.
284    // it will not fetch remote references, so the schema must be self-contained.
285    pub(super) fn get_json_schema_definition(path: &Path) -> Result<serde_json::Value, String> {
286        let b = std::fs::read(path).map_err(|e| {
287            format!(
288                "Failed to open schema definition file '{}': {e}",
289                path.display()
290            )
291        })?;
292        let schema: serde_json::Value = serde_json::from_slice(&b).map_err(|e| {
293            format!(
294                "Failed to parse schema definition file '{}': {e}",
295                path.display()
296            )
297        })?;
298        Ok(schema)
299    }
300
301    pub(super) fn get_or_compile_schema(
302        schema_path: &Path,
303        ignore_unknown_formats: bool,
304    ) -> Result<Arc<jsonschema::Validator>, String> {
305        // Try read lock first
306        {
307            let cache = SCHEMA_CACHE.read().unwrap();
308            if let Some(schema) = cache.get(schema_path) {
309                return Ok(schema.clone());
310            }
311        }
312
313        // Need to compile - get write lock
314        let mut cache = SCHEMA_CACHE.write().unwrap();
315
316        // Double-check pattern
317        if let Some(schema) = cache.get(schema_path) {
318            return Ok(schema.clone());
319        }
320
321        let schema_definition = get_json_schema_definition(schema_path)
322            .map_err(|e| format!("JSON schema not found: {e}"))?;
323
324        // Compile schema
325        let compiled_schema = jsonschema::options()
326            .should_validate_formats(true)
327            .should_ignore_unknown_formats(ignore_unknown_formats)
328            .build(&schema_definition)
329            .map_err(|e| format!("Failed to compile schema: {e}"))?;
330
331        let compiled_schema = Arc::new(compiled_schema);
332        cache.insert(schema_path.to_path_buf(), compiled_schema.clone());
333        Ok(compiled_schema)
334    }
335}
336
#[cfg(test)]
mod tests {
    use super::*;
    use crate::value;
    use std::env;

    /// Directory holding the JSON Schema fixtures, resolved against `CARGO_MANIFEST_DIR`.
    fn test_data_dir() -> PathBuf {
        let manifest_dir = env::var_os("CARGO_MANIFEST_DIR").unwrap();
        PathBuf::from(manifest_dir).join("tests/data/jsonschema/")
    }

    /// Path of a fixture inside `validate_json_schema/`, as an owned string.
    fn schema(file_name: &str) -> String {
        test_data_dir()
            .join(format!("validate_json_schema/{file_name}"))
            .to_str()
            .unwrap()
            .to_owned()
    }

    test_function![
        validate_json_schema => ValidateJsonSchema;

        // Payload whose `productUser` is a well-formed email address.
        valid_with_email_format_json {
            args: func_args![
                value: value!(r#"{"productUser":"email@domain.com"}"#),
                schema_definition: schema("schema_with_email_format.json"),
                ignore_unknown_formats: false],
            want: Ok(value!(true)),
            tdef: TypeDef::boolean().fallible(),
        }

        // Payload with an array of strings and an array of objects.
        valid_with_array_of_things_json {
            args: func_args![
                value: value!(r#"{"fruits":["apple","orange","pear"],"vegetables":[{"veggieName":"potato","veggieLike":true},{"veggieName":"broccoli","veggieLike":false}]}"#),
                schema_definition: schema("schema_arrays_of_things.json"),
                ignore_unknown_formats: false],
            want: Ok(value!(true)),
            tdef: TypeDef::boolean().fallible(),
        }

        // A string that is not an email address is reported with its instance path.
        invalid_email_json {
            args: func_args![
                value: value!(r#"{"productUser":"invalid-email"}"#),
                schema_definition: schema("schema_with_email_format.json"),
                ignore_unknown_formats: false],
            want: Err(r#"JSON schema validation failed: "invalid-email" is not a "email" at /productUser"#),
            tdef: TypeDef::boolean().fallible(),
        }

        // The custom-format schema is accepted when `ignore_unknown_formats` is `true`.
        custom_format_ignored_json {
            args: func_args![
                value: value!(r#"{"productUser":"just-a-string"}"#),
                schema_definition: schema("schema_with_custom_format.json"),
                ignore_unknown_formats: true],
            want: Ok(value!(true)),
            tdef: TypeDef::boolean().fallible(),
        }

        // Empty input is rejected.
        invalid_empty_json {
            args: func_args![
                value: value!(""),
                schema_definition: schema("schema_with_email_format.json"),
                ignore_unknown_formats: false],
            want: Err("Empty JSON value"),
            tdef: TypeDef::boolean().fallible(),
        }

    ];
}