vrl/stdlib/
decode_charset.rs

1use crate::diagnostic::Label;
2use crate::prelude::*;
3use encoding_rs::Encoding;
4use nom::AsBytes;
5use std::str::from_utf8;
6
/// Zero-sized handle for the `decode_charset` function; its behaviour is supplied by the
/// trait implementations written for this type.
#[derive(Debug, Clone, Copy)]
pub struct DecodeCharset;
9
10impl Function for DecodeCharset {
11    fn identifier(&self) -> &'static str {
12        "decode_charset"
13    }
14
15    fn examples(&self) -> &'static [Example] {
16        &[
17            example! {
18                title: "Decode EUC-KR string",
19                source: r#"decode_charset!(decode_base64!("vsiz58fPvLy/5A=="), "euc-kr")"#,
20                result: Ok("안녕하세요"),
21            },
22            example! {
23                title: "Decode EUC-JP string",
24                source: r#"decode_charset!(decode_base64!("pLOk86TLpMGkzw=="), "euc-jp")"#,
25                result: Ok("こんにちは"),
26            },
27            example! {
28                title: "Decode GB2312 string",
29                source: r#"decode_charset!(decode_base64!("xOO6ww=="), "gb2312")"#,
30                result: Ok("你好"),
31            },
32        ]
33    }
34
35    fn summary(&self) -> &'static str {
36        "Decode non UTF-8 charset to UTF-8"
37    }
38
39    fn usage(&self) -> &'static str {
40        indoc! {"
41            Decodes the `value` (a non-UTF8 string) to a UTF8 string using the specified
42            [character set](https://encoding.spec.whatwg.org/#names-and-labels).
43        "}
44    }
45
46    fn category(&self) -> &'static str {
47        Category::Codec.as_ref()
48    }
49
50    fn internal_failure_reasons(&self) -> &'static [&'static str] {
51        &[
52            "`from_charset` isn't a valid [character set](https://encoding.spec.whatwg.org/#names-and-labels).",
53        ]
54    }
55
56    fn return_kind(&self) -> u16 {
57        kind::BYTES
58    }
59
60    fn parameters(&self) -> &'static [Parameter] {
61        const PARAMETERS: &[Parameter] = &[
62            Parameter::required("value", kind::BYTES, "The non-UTF8 string to decode."),
63            Parameter::required(
64                "from_charset",
65                kind::BYTES,
66                "The [character set](https://encoding.spec.whatwg.org/#names-and-labels) to use when decoding the data.",
67            ),
68        ];
69        PARAMETERS
70    }
71
72    fn compile(
73        &self,
74        _state: &TypeState,
75        _ctx: &mut FunctionCompileContext,
76        arguments: ArgumentList,
77    ) -> Compiled {
78        let value = arguments.required("value");
79        let from_charset = arguments.required("from_charset");
80
81        Ok(DecodeCharsetFn {
82            value,
83            from_charset,
84        }
85        .as_expr())
86    }
87}
88
89fn decode_charset(value: &[u8], from_charset: &[u8]) -> Resolved {
90    let decoder = Encoding::for_label(from_charset).ok_or_else(|| create_error(from_charset))?;
91
92    let (output, _, _) = decoder.decode(value);
93    Ok(Value::Bytes(output.as_bytes().to_vec().into()))
94}
95
96fn create_error(from_charset: &[u8]) -> ExpressionError {
97    ExpressionError::Error {
98        message: format!(
99            "Unknown charset: {}",
100            from_utf8(from_charset).unwrap_or("unknown")
101        ),
102        labels: vec![Label::primary("Unknown charset", Span::default())],
103        notes: vec![Note::SeeDocs(
104            "Encoding Living Standard".to_string(),
105            "https://encoding.spec.whatwg.org/".to_string(),
106        )],
107    }
108}
109
110#[derive(Debug, Clone)]
111struct DecodeCharsetFn {
112    value: Box<dyn Expression>,
113    from_charset: Box<dyn Expression>,
114}
115
116impl FunctionExpression for DecodeCharsetFn {
117    fn resolve(&self, ctx: &mut Context) -> Resolved {
118        let value = self.value.resolve(ctx)?.try_bytes()?;
119        let from = self.from_charset.resolve(ctx)?.try_bytes()?;
120
121        decode_charset(value.as_bytes(), from.as_bytes())
122    }
123
124    fn type_def(&self, _state: &TypeState) -> TypeDef {
125        TypeDef::bytes().fallible()
126    }
127}
128
// Unit tests for `decode_charset`, written as a `test_function!` table: each entry lists
// the call arguments (`args`), the expected result (`want`) and the expected type
// definition (`tdef`).
#[cfg(test)]
mod test {
    use super::*;
    use crate::value;

    test_function![
        decode_charset => DecodeCharset;

        // EUC-KR encoded bytes decode to the Korean greeting.
        decode_from_euc_kr {
            args: func_args![value: b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4",
                             from_charset: value!("euc-kr")],
            want: Ok(value!("안녕하세요")),
            tdef: TypeDef::bytes().fallible(),
        }

        // EUC-JP encoded bytes decode to the Japanese greeting.
        decode_from_euc_jp {
            args: func_args![value: b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf",
                             from_charset: value!("euc-jp")],
            want: Ok(value!("こんにちは")),
            tdef: TypeDef::bytes().fallible(),
        }

        // GB2312 encoded bytes decode to the Chinese greeting.
        decode_from_gb2312 {
            args: func_args![value: b"\xc4\xe3\xba\xc3",
                             from_charset: value!("gb2312")],
            want: Ok(value!("你好")),
            tdef: TypeDef::bytes().fallible(),
        }

        // A label that is not a known charset ("euc--kr") yields the
        // "Unknown charset" error instead of a decoded value.
        unknown_charset {
            args: func_args![value: value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"),
                             from_charset: value!(b"euc--kr")],
            want: Err("Unknown charset: euc--kr"),
            tdef: TypeDef::bytes().fallible(),
        }
    ];
}