vector/
encoding_transcode.rs

1#![allow(missing_docs)]
2use bytes::{Bytes, BytesMut};
3use encoding_rs::{CoderResult, Encoding};
4
5use crate::internal_events::{
6    DecoderBomRemoval, DecoderMalformedReplacement, EncoderUnmappableReplacement,
7};
8
/// Size, in bytes, of the scratch buffer that each transcoder passes to
/// encoding_rs for a single inner decode/encode call.
const BUFFER_SIZE: usize = 4 * 1024;

// The BOM character (U+FEFF) as it appears once encoded in utf-8:
// http://unicode.org/faq/utf_bom.html#bom4
const BOM_UTF8: &[u8] = &[0xEF, 0xBB, 0xBF];
const BOM_UTF8_LEN: usize = BOM_UTF8.len();
15
16/// Helps transcoding from the specified encoding to utf8
17pub struct Decoder {
18    buffer: [u8; BUFFER_SIZE],
19    output: BytesMut,
20    inner: encoding_rs::Decoder,
21}
22
23impl Decoder {
24    pub fn new(encoding: &'static Encoding) -> Self {
25        Self {
26            buffer: [0; BUFFER_SIZE],
27            output: BytesMut::new(),
28            // We explicitly choose not to remove BOM as part of encoding_rs's
29            // decoding capabilities: the library has support for it, but it does
30            // so only for the first input provided to the decoder (basically,
31            // start of the stream), and for our usecases, we may get BOM markers
32            // in later inputs too (eg: when reading multiple files):
33            // https://docs.rs/encoding_rs/0.8.26/encoding_rs/struct.Encoding.html#method.new_decoder_with_bom_removal
34            //
35            // We can try to maintain separate decoders for each unique stream
36            // (eg: by filepath when reading multiple files), but that mandates
37            // cleanup of the initialized decoder structs/buffers when they are
38            // no longer needed (eg: when files are closed), which can get
39            // complicated. So we opt for simplicity here.
40            //
41            // BOM markers are still removed if the input starts with it:
42            // see decode_to_utf8() for the rationale/logic.
43            inner: encoding.new_decoder_without_bom_handling(),
44        }
45    }
46
47    pub fn decode_to_utf8(&mut self, input: Bytes) -> Bytes {
48        let mut total_read_from_input = 0;
49        let mut total_had_errors = false;
50
51        loop {
52            let (result, read, written, had_errors) = self.inner.decode_to_utf8(
53                &input[total_read_from_input..],
54                &mut self.buffer,
55                false, // not last (since we are processing a continuous stream)
56            );
57
58            total_read_from_input += read;
59            total_had_errors |= had_errors;
60
61            self.output.extend_from_slice(&self.buffer[..written]);
62
63            match result {
64                CoderResult::InputEmpty => break, // we have consumed all of the given input so we are done!
65                CoderResult::OutputFull => (), // continue reading from the input in the next loop iteration
66            }
67        }
68
69        if total_had_errors {
70            emit!(DecoderMalformedReplacement {
71                from_encoding: self.inner.encoding().name()
72            });
73        }
74
75        let output = self.output.split().freeze();
76
77        // All of the input (including any BOM sequences present) has been decoded
78        // to utf-8 by now so we can check to see if the output starts with utf-8
79        // BOM marker bytes and if it does, remove it for the final output.
80        //
81        // We can choose not to strip the BOM marker and keep it as is, but the
82        // presence of these extra bytes can throw off any downstream processing
83        // we do on the output, and rather than handling it specially on each
84        // processing, we handle it centrally here. Also, the BOM does not serve
85        // any more use for us, since the source encoding is already pre-identified
86        // as part of decoder initialization.
87        if output.get(..BOM_UTF8_LEN) == Some(BOM_UTF8) {
88            emit!(DecoderBomRemoval {
89                from_encoding: self.inner.encoding().name()
90            });
91            output.slice(BOM_UTF8_LEN..)
92        } else {
93            output
94        }
95    }
96}
97
98/// Helps transcoding to the specified encoding from utf8
99pub struct Encoder {
100    buffer: [u8; BUFFER_SIZE],
101    output: BytesMut,
102    inner: encoding_rs::Encoder,
103    // Useful for tracking whether the encoder's encoding is utf-16 (and when it
104    // is, its variety). Since encoding_rs does not have encoders for utf-16,
105    // this is necessary:
106    // https://docs.rs/encoding_rs/0.8.26/encoding_rs/index.html#utf-16le-utf-16be-and-unicode-encoding-schemes
107    utf16_encoding: Option<Utf16Encoding>,
108}
109
/// Byte order of a utf-16 target encoding.
#[derive(Debug, Clone, Copy)]
enum Utf16Encoding {
    /// Little-endian.
    Le,
    /// Big-endian.
    Be,
}
115
116impl Encoder {
117    pub fn new(encoding: &'static Encoding) -> Self {
118        Self {
119            buffer: [0; BUFFER_SIZE],
120            output: BytesMut::new(),
121            inner: encoding.new_encoder(),
122            utf16_encoding: Self::get_utf16_encoding(encoding),
123        }
124    }
125
126    fn get_utf16_encoding(encoding: &'static Encoding) -> Option<Utf16Encoding> {
127        match encoding.name() {
128            "UTF-16LE" => Some(Utf16Encoding::Le),
129            "UTF-16BE" => Some(Utf16Encoding::Be),
130            _ => None,
131        }
132    }
133
134    fn encode_from_utf8_to_utf16(&mut self, input: &str, variant: Utf16Encoding) -> Bytes {
135        let to_bytes_func = match variant {
136            Utf16Encoding::Le => u16::to_le_bytes,
137            Utf16Encoding::Be => u16::to_be_bytes,
138        };
139
140        for utf16_value in input.encode_utf16() {
141            self.output.extend_from_slice(&to_bytes_func(utf16_value));
142        }
143
144        self.output.split().freeze()
145    }
146
147    pub fn encode_from_utf8(&mut self, input: &str) -> Bytes {
148        // alternate logic if the encoder is for a utf-16 encoding variant
149        if let Some(variant) = self.utf16_encoding {
150            return self.encode_from_utf8_to_utf16(input, variant);
151        }
152
153        let mut total_read_from_input = 0;
154        let mut total_had_errors = false;
155
156        loop {
157            let (result, read, written, had_errors) = self.inner.encode_from_utf8(
158                &input[total_read_from_input..],
159                &mut self.buffer,
160                false, // not last (since we are processing a continuous stream)
161            );
162
163            total_read_from_input += read;
164            total_had_errors |= had_errors;
165
166            self.output.extend_from_slice(&self.buffer[..written]);
167
168            match result {
169                CoderResult::InputEmpty => break, // we have consumed all of the given input so we are done!
170                CoderResult::OutputFull => (), // continue reading from the input in the next loop iteration
171            }
172        }
173
174        if total_had_errors {
175            emit!(EncoderUnmappableReplacement {
176                to_encoding: self.inner.encoding().name()
177            });
178        }
179
180        self.output.split().freeze()
181    }
182}
183
#[cfg(test)]
mod tests {
    use std::char::REPLACEMENT_CHARACTER;

    use bytes::Bytes;
    use encoding_rs::{SHIFT_JIS, UTF_16BE, UTF_16LE, UTF_8};

    use super::{Decoder, Encoder, BOM_UTF8};

    // The BOM character (U+FEFF) as encoded in utf-16 little-endian:
    // http://unicode.org/faq/utf_bom.html#bom4
    const BOM_UTF16LE: &[u8] = b"\xff\xfe";

    // Fixtures: utf-16 little-endian
    const UTF16LE_123: &[u8] = b"1\x002\x003\x00";
    const UTF16LE_CRLF: &[u8] = b"\r\x00\n\x00";
    const UTF16LE_VECTOR_DEVANAGARI: &[u8] = b"-\tG\t\x15\tM\t\x1f\t0\t";

    // Fixtures: utf-16 big-endian
    const UTF16BE_123: &[u8] = b"\x001\x002\x003";
    const UTF16BE_CRLF: &[u8] = b"\x00\r\x00\n";
    const UTF16BE_VECTOR_DEVANAGARI: &[u8] = b"\t-\tG\t\x15\tM\t\x1f\t0";

    // Fixtures: shift JIS
    const SHIFTJIS_HELLOWORLD_JAPANESE: &[u8] =
        b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";

    // Fixtures: the utf-8 side of the non-ASCII samples above
    const VECTOR_DEVANAGARI: &str = "भेक्टर";
    // ハロー・ワールド
    const HELLOWORLD_JAPANESE: &str =
        "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";

    /// Feeds `input` to `decoder` and returns the decoded utf-8 bytes.
    fn decode(decoder: &mut Decoder, input: impl Into<Bytes>) -> Bytes {
        decoder.decode_to_utf8(input.into())
    }

    #[test]
    fn test_decoder_various() {
        let mut utf8 = Decoder::new(UTF_8);
        for text in ["123", "\n", VECTOR_DEVANAGARI] {
            assert_eq!(decode(&mut utf8, text), Bytes::from(text));
        }

        let mut utf16le = Decoder::new(UTF_16LE);
        for (encoded, text) in [
            (UTF16LE_123, "123"),
            (UTF16LE_CRLF, "\r\n"),
            (UTF16LE_VECTOR_DEVANAGARI, VECTOR_DEVANAGARI),
        ] {
            assert_eq!(decode(&mut utf16le, encoded), Bytes::from(text));
        }

        let mut utf16be = Decoder::new(UTF_16BE);
        for (encoded, text) in [
            (UTF16BE_123, "123"),
            (UTF16BE_CRLF, "\r\n"),
            (UTF16BE_VECTOR_DEVANAGARI, VECTOR_DEVANAGARI),
        ] {
            assert_eq!(decode(&mut utf16be, encoded), Bytes::from(text));
        }

        let mut shift_jis = Decoder::new(SHIFT_JIS);
        assert_eq!(
            decode(&mut shift_jis, SHIFTJIS_HELLOWORLD_JAPANESE),
            Bytes::from(HELLOWORLD_JAPANESE)
        );
    }

    #[test]
    fn test_decoder_long_input() {
        let mut decoder = Decoder::new(UTF_8);

        // far larger than the decoder's internal buffer, so that several inner
        // decode calls are needed to get through it
        let long_input = "This line is super long and will take up more space than Decoder's internal buffer, just to make sure that everything works properly when multiple inner decode calls are involved".repeat(10000);

        let decoded = decode(&mut decoder, long_input.clone());

        assert_eq!(decoded, Bytes::from(long_input));
    }

    #[test]
    fn test_decoder_replacements() {
        let mut decoder = Decoder::new(UTF_8);

        // the bytes of the utf-16le BOM are not valid utf-8, so each of them
        // should come out as a replacement character
        let problematic_input = [BOM_UTF16LE, b"123"].concat();
        let expected = format!("{REPLACEMENT_CHARACTER}{REPLACEMENT_CHARACTER}123");

        assert_eq!(
            decode(&mut decoder, problematic_input),
            Bytes::from(expected)
        );
    }

    #[test]
    fn test_decoder_bom_removal() {
        let mut decoder = Decoder::new(UTF_16LE);

        let input_bom_start = [BOM_UTF16LE, UTF16LE_123].concat();
        let input_bom_middle = [UTF16LE_123, BOM_UTF16LE, UTF16LE_123].concat();

        // a starting BOM is removed from the first input...
        assert_eq!(
            decode(&mut decoder, input_bom_start.clone()),
            Bytes::from("123")
        );

        // ...and keeps being removed from subsequent inputs
        assert_eq!(decode(&mut decoder, input_bom_start), Bytes::from("123"));

        // a BOM that is not at the start is left untouched
        assert_eq!(
            decode(&mut decoder, input_bom_middle),
            Bytes::from([b"123", BOM_UTF8, b"123"].concat())
        );

        // inputs without any BOM continue to work
        for (encoded, text) in [(UTF16LE_123, "123"), (UTF16LE_CRLF, "\r\n")] {
            assert_eq!(decode(&mut decoder, encoded), Bytes::from(text));
        }
    }

    #[test]
    fn test_encoder_various() {
        let mut utf8 = Encoder::new(UTF_8);
        for text in ["123", "\n", VECTOR_DEVANAGARI] {
            assert_eq!(utf8.encode_from_utf8(text), Bytes::from(text));
        }

        let mut utf16le = Encoder::new(UTF_16LE);
        for (text, encoded) in [
            ("123", UTF16LE_123),
            ("\r\n", UTF16LE_CRLF),
            (VECTOR_DEVANAGARI, UTF16LE_VECTOR_DEVANAGARI),
        ] {
            assert_eq!(utf16le.encode_from_utf8(text), Bytes::from(encoded));
        }

        let mut utf16be = Encoder::new(UTF_16BE);
        for (text, encoded) in [
            ("123", UTF16BE_123),
            ("\r\n", UTF16BE_CRLF),
            (VECTOR_DEVANAGARI, UTF16BE_VECTOR_DEVANAGARI),
        ] {
            assert_eq!(utf16be.encode_from_utf8(text), Bytes::from(encoded));
        }

        let mut shift_jis = Encoder::new(SHIFT_JIS);
        assert_eq!(
            shift_jis.encode_from_utf8(HELLOWORLD_JAPANESE),
            Bytes::from(SHIFTJIS_HELLOWORLD_JAPANESE)
        );
    }

    #[test]
    fn test_encoder_long_input() {
        let mut encoder = Encoder::new(UTF_8);

        // far larger than the encoder's internal buffer, so that several inner
        // encode calls are needed to get through it
        let long_input = "This line is super long and will take up more space than Encoder's internal buffer, just to make sure that everything works properly when multiple inner encode calls are involved".repeat(10000);

        let encoded = encoder.encode_from_utf8(long_input.as_str());

        assert_eq!(encoded, Bytes::from(long_input));
    }

    #[test]
    fn test_encoder_replacements() {
        let mut encoder = Encoder::new(SHIFT_JIS);

        // the unicode characters surrounding "123" here [☸ & ☯︎] cannot be
        // mapped to shift JIS, so numeric character references should appear
        // in their place
        let problematic_input = "\u{2638}123\u{262F}";

        assert_eq!(
            encoder.encode_from_utf8(problematic_input),
            Bytes::from("&#9784;123&#9775;")
        );
    }

    #[test]
    fn test_transcode_symmetry() {
        let encoding = UTF_16LE;
        let mut encoder = Encoder::new(encoding);
        let mut decoder = Decoder::new(encoding);

        let input = "οὐροβόρος";

        // Encoding and then decoding should be an identity operation for this
        // input and this choice of encoding: the input has no BOM bytes, and
        // its characters can be represented fully in both utf8 and utf16.
        let round_tripped = decoder.decode_to_utf8(encoder.encode_from_utf8(input));

        assert_eq!(round_tripped, Bytes::from(input));
    }
}