vrl/stdlib/
decode_charset.rs1use crate::diagnostic::Label;
2use crate::prelude::*;
3use encoding_rs::Encoding;
4use nom::AsBytes;
5use std::str::from_utf8;
6
/// Marker type for the `decode_charset` function; it carries no state.
#[derive(Debug, Clone, Copy)]
pub struct DecodeCharset;
9
10impl Function for DecodeCharset {
11 fn identifier(&self) -> &'static str {
12 "decode_charset"
13 }
14
15 fn examples(&self) -> &'static [Example] {
16 &[
17 example! {
18 title: "Decode EUC-KR string",
19 source: r#"decode_charset!(decode_base64!("vsiz58fPvLy/5A=="), "euc-kr")"#,
20 result: Ok("안녕하세요"),
21 },
22 example! {
23 title: "Decode EUC-JP string",
24 source: r#"decode_charset!(decode_base64!("pLOk86TLpMGkzw=="), "euc-jp")"#,
25 result: Ok("こんにちは"),
26 },
27 example! {
28 title: "Decode GB2312 string",
29 source: r#"decode_charset!(decode_base64!("xOO6ww=="), "gb2312")"#,
30 result: Ok("你好"),
31 },
32 ]
33 }
34
35 fn summary(&self) -> &'static str {
36 "Decode non UTF-8 charset to UTF-8"
37 }
38
39 fn usage(&self) -> &'static str {
40 indoc! {"
41 Decodes the `value` (a non-UTF8 string) to a UTF8 string using the specified
42 [character set](https://encoding.spec.whatwg.org/#names-and-labels).
43 "}
44 }
45
46 fn category(&self) -> &'static str {
47 Category::Codec.as_ref()
48 }
49
50 fn internal_failure_reasons(&self) -> &'static [&'static str] {
51 &[
52 "`from_charset` isn't a valid [character set](https://encoding.spec.whatwg.org/#names-and-labels).",
53 ]
54 }
55
56 fn return_kind(&self) -> u16 {
57 kind::BYTES
58 }
59
60 fn parameters(&self) -> &'static [Parameter] {
61 const PARAMETERS: &[Parameter] = &[
62 Parameter::required("value", kind::BYTES, "The non-UTF8 string to decode."),
63 Parameter::required(
64 "from_charset",
65 kind::BYTES,
66 "The [character set](https://encoding.spec.whatwg.org/#names-and-labels) to use when decoding the data.",
67 ),
68 ];
69 PARAMETERS
70 }
71
72 fn compile(
73 &self,
74 _state: &TypeState,
75 _ctx: &mut FunctionCompileContext,
76 arguments: ArgumentList,
77 ) -> Compiled {
78 let value = arguments.required("value");
79 let from_charset = arguments.required("from_charset");
80
81 Ok(DecodeCharsetFn {
82 value,
83 from_charset,
84 }
85 .as_expr())
86 }
87}
88
89fn decode_charset(value: &[u8], from_charset: &[u8]) -> Resolved {
90 let decoder = Encoding::for_label(from_charset).ok_or_else(|| create_error(from_charset))?;
91
92 let (output, _, _) = decoder.decode(value);
93 Ok(Value::Bytes(output.as_bytes().to_vec().into()))
94}
95
96fn create_error(from_charset: &[u8]) -> ExpressionError {
97 ExpressionError::Error {
98 message: format!(
99 "Unknown charset: {}",
100 from_utf8(from_charset).unwrap_or("unknown")
101 ),
102 labels: vec![Label::primary("Unknown charset", Span::default())],
103 notes: vec![Note::SeeDocs(
104 "Encoding Living Standard".to_string(),
105 "https://encoding.spec.whatwg.org/".to_string(),
106 )],
107 }
108}
109
110#[derive(Debug, Clone)]
111struct DecodeCharsetFn {
112 value: Box<dyn Expression>,
113 from_charset: Box<dyn Expression>,
114}
115
116impl FunctionExpression for DecodeCharsetFn {
117 fn resolve(&self, ctx: &mut Context) -> Resolved {
118 let value = self.value.resolve(ctx)?.try_bytes()?;
119 let from = self.from_charset.resolve(ctx)?.try_bytes()?;
120
121 decode_charset(value.as_bytes(), from.as_bytes())
122 }
123
124 fn type_def(&self, _state: &TypeState) -> TypeDef {
125 TypeDef::bytes().fallible()
126 }
127}
128
// Unit tests for `decode_charset`: three successful decodes (EUC-KR, EUC-JP,
// GB2312) and one failure for an unrecognised charset label.
#[cfg(test)]
mod test {
    use super::*;
    use crate::value;

    test_function![
        decode_charset => DecodeCharset;

        // EUC-KR input bytes are expected to decode to the Korean greeting.
        decode_from_euc_kr {
            args: func_args![value: b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4",
                             from_charset: value!("euc-kr")],
            want: Ok(value!("안녕하세요")),
            tdef: TypeDef::bytes().fallible(),
        }

        // EUC-JP input bytes are expected to decode to the Japanese greeting.
        decode_from_euc_jp {
            args: func_args![value: b"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf",
                             from_charset: value!("euc-jp")],
            want: Ok(value!("こんにちは")),
            tdef: TypeDef::bytes().fallible(),
        }

        // GB2312 input bytes are expected to decode to the Chinese greeting.
        decode_from_gb2312 {
            args: func_args![value: b"\xc4\xe3\xba\xc3",
                             from_charset: value!("gb2312")],
            want: Ok(value!("你好")),
            tdef: TypeDef::bytes().fallible(),
        }

        // A label with a doubled hyphen is not a known charset, so the call
        // must fail with the message that echoes the label back.
        unknown_charset {
            args: func_args![value: value!(b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"),
                             from_charset: value!(b"euc--kr")],
            want: Err("Unknown charset: euc--kr"),
            tdef: TypeDef::bytes().fallible(),
        }
    ];
}