1#![allow(missing_docs)]
2use bytes::{Bytes, BytesMut};
3use encoding_rs::{CoderResult, Encoding};
4
5use crate::internal_events::{
6 DecoderBomRemoval, DecoderMalformedReplacement, EncoderUnmappableReplacement,
7};
8
/// Size, in bytes, of the fixed scratch buffer that `Decoder` and `Encoder`
/// hand to the inner `encoding_rs` coder on every call.
const BUFFER_SIZE: usize = 4096;

/// Byte order mark (U+FEFF) as it appears once encoded in UTF-8.
const BOM_UTF8: &[u8] = b"\xef\xbb\xbf";
/// Length of [`BOM_UTF8`] in bytes.
const BOM_UTF8_LEN: usize = BOM_UTF8.len();
15
16pub struct Decoder {
18 buffer: [u8; BUFFER_SIZE],
19 output: BytesMut,
20 inner: encoding_rs::Decoder,
21}
22
23impl Decoder {
24 pub fn new(encoding: &'static Encoding) -> Self {
25 Self {
26 buffer: [0; BUFFER_SIZE],
27 output: BytesMut::new(),
28 inner: encoding.new_decoder_without_bom_handling(),
44 }
45 }
46
47 pub fn decode_to_utf8(&mut self, input: Bytes) -> Bytes {
48 let mut total_read_from_input = 0;
49 let mut total_had_errors = false;
50
51 loop {
52 let (result, read, written, had_errors) = self.inner.decode_to_utf8(
53 &input[total_read_from_input..],
54 &mut self.buffer,
55 false, );
57
58 total_read_from_input += read;
59 total_had_errors |= had_errors;
60
61 self.output.extend_from_slice(&self.buffer[..written]);
62
63 match result {
64 CoderResult::InputEmpty => break, CoderResult::OutputFull => (), }
67 }
68
69 if total_had_errors {
70 emit!(DecoderMalformedReplacement {
71 from_encoding: self.inner.encoding().name()
72 });
73 }
74
75 let output = self.output.split().freeze();
76
77 if output.get(..BOM_UTF8_LEN) == Some(BOM_UTF8) {
88 emit!(DecoderBomRemoval {
89 from_encoding: self.inner.encoding().name()
90 });
91 output.slice(BOM_UTF8_LEN..)
92 } else {
93 output
94 }
95 }
96}
97
98pub struct Encoder {
100 buffer: [u8; BUFFER_SIZE],
101 output: BytesMut,
102 inner: encoding_rs::Encoder,
103 utf16_encoding: Option<Utf16Encoding>,
108}
109
110#[derive(Debug, Clone, Copy)]
111enum Utf16Encoding {
112 Le, Be, }
115
116impl Encoder {
117 pub fn new(encoding: &'static Encoding) -> Self {
118 Self {
119 buffer: [0; BUFFER_SIZE],
120 output: BytesMut::new(),
121 inner: encoding.new_encoder(),
122 utf16_encoding: Self::get_utf16_encoding(encoding),
123 }
124 }
125
126 fn get_utf16_encoding(encoding: &'static Encoding) -> Option<Utf16Encoding> {
127 match encoding.name() {
128 "UTF-16LE" => Some(Utf16Encoding::Le),
129 "UTF-16BE" => Some(Utf16Encoding::Be),
130 _ => None,
131 }
132 }
133
134 fn encode_from_utf8_to_utf16(&mut self, input: &str, variant: Utf16Encoding) -> Bytes {
135 let to_bytes_func = match variant {
136 Utf16Encoding::Le => u16::to_le_bytes,
137 Utf16Encoding::Be => u16::to_be_bytes,
138 };
139
140 for utf16_value in input.encode_utf16() {
141 self.output.extend_from_slice(&to_bytes_func(utf16_value));
142 }
143
144 self.output.split().freeze()
145 }
146
147 pub fn encode_from_utf8(&mut self, input: &str) -> Bytes {
148 if let Some(variant) = self.utf16_encoding {
150 return self.encode_from_utf8_to_utf16(input, variant);
151 }
152
153 let mut total_read_from_input = 0;
154 let mut total_had_errors = false;
155
156 loop {
157 let (result, read, written, had_errors) = self.inner.encode_from_utf8(
158 &input[total_read_from_input..],
159 &mut self.buffer,
160 false, );
162
163 total_read_from_input += read;
164 total_had_errors |= had_errors;
165
166 self.output.extend_from_slice(&self.buffer[..written]);
167
168 match result {
169 CoderResult::InputEmpty => break, CoderResult::OutputFull => (), }
172 }
173
174 if total_had_errors {
175 emit!(EncoderUnmappableReplacement {
176 to_encoding: self.inner.encoding().name()
177 });
178 }
179
180 self.output.split().freeze()
181 }
182}
183
#[cfg(test)]
mod tests {
    use std::char::REPLACEMENT_CHARACTER;

    use bytes::Bytes;
    use encoding_rs::{SHIFT_JIS, UTF_16BE, UTF_16LE, UTF_8};

    use super::{Decoder, Encoder, BOM_UTF8};

    /// Byte order mark (U+FEFF) encoded as UTF-16LE.
    const BOM_UTF16LE: &[u8] = b"\xff\xfe";

    /// "123" encoded as UTF-16LE.
    const fn test_data_utf16le_123() -> &'static [u8] {
        b"1\x002\x003\x00"
    }

    /// "\r\n" encoded as UTF-16LE.
    const fn test_data_utf16le_crlf() -> &'static [u8] {
        b"\r\x00\n\x00"
    }

    /// "भेक्टर" encoded as UTF-16LE.
    const fn test_data_utf16le_vector_devanagari() -> &'static [u8] {
        b"-\tG\t\x15\tM\t\x1f\t0\t"
    }

    /// "123" encoded as UTF-16BE.
    const fn test_data_utf16be_123() -> &'static [u8] {
        b"\x001\x002\x003"
    }

    /// "\r\n" encoded as UTF-16BE.
    const fn test_data_utf16be_crlf() -> &'static [u8] {
        b"\x00\r\x00\n"
    }

    /// "भेक्टर" encoded as UTF-16BE.
    const fn test_data_utf16be_vector_devanagari() -> &'static [u8] {
        b"\t-\tG\t\x15\tM\t\x1f\t0"
    }

    /// "ハロー・ワールド" encoded as Shift_JIS.
    const fn test_data_shiftjis_helloworld_japanese() -> &'static [u8] {
        b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h"
    }

    /// Decoding well-formed input from several encodings yields the expected UTF-8.
    #[test]
    fn test_decoder_various() {
        let mut d = Decoder::new(UTF_8);
        assert_eq!(d.decode_to_utf8(Bytes::from("123")), Bytes::from("123"));
        assert_eq!(d.decode_to_utf8(Bytes::from("\n")), Bytes::from("\n"));
        assert_eq!(d.decode_to_utf8(Bytes::from("भेक्टर")), Bytes::from("भेक्टर"));

        let mut d = Decoder::new(UTF_16LE);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
            Bytes::from("\r\n")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_vector_devanagari())),
            Bytes::from("भेक्टर")
        );

        let mut d = Decoder::new(UTF_16BE);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_crlf())),
            Bytes::from("\r\n")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16be_vector_devanagari())),
            Bytes::from("भेक्टर")
        );

        let mut d = Decoder::new(SHIFT_JIS);
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_shiftjis_helloworld_japanese())),
            Bytes::from("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}")
        );
    }

    /// Input far larger than the internal scratch buffer is decoded intact,
    /// exercising the multi-iteration path of the decode loop.
    #[test]
    fn test_decoder_long_input() {
        let mut d = Decoder::new(UTF_8);

        let long_input = "This line is super long and will take up more space than Decoder's internal buffer, just to make sure that everything works properly when multiple inner decode calls are involved".repeat(10000);

        assert_eq!(
            d.decode_to_utf8(Bytes::from(long_input.clone())),
            Bytes::from(long_input)
        );
    }

    /// Bytes that are malformed in the source encoding come out as U+FFFD.
    #[test]
    fn test_decoder_replacements() {
        let mut d = Decoder::new(UTF_8);

        // A UTF-16LE BOM is two bytes that are each invalid as UTF-8.
        let problematic_input = [BOM_UTF16LE, b"123"].concat();

        assert_eq!(
            d.decode_to_utf8(Bytes::from(problematic_input)),
            Bytes::from(format!("{REPLACEMENT_CHARACTER}{REPLACEMENT_CHARACTER}123"))
        );
    }

    /// A BOM is stripped only when it is the very first thing in a decoded chunk.
    #[test]
    fn test_decoder_bom_removal() {
        let mut d = Decoder::new(UTF_16LE);

        let input_bom_start = [BOM_UTF16LE, test_data_utf16le_123()].concat();

        // A leading BOM is removed.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(input_bom_start.clone())),
            Bytes::from("123")
        );

        // ...and it is removed again when a later chunk also starts with one.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(input_bom_start)),
            Bytes::from("123")
        );

        // A BOM in the middle of a chunk is kept (now in its UTF-8 form).
        assert_eq!(
            d.decode_to_utf8(Bytes::from(
                [
                    test_data_utf16le_123(),
                    BOM_UTF16LE,
                    test_data_utf16le_123(),
                ]
                .concat()
            )),
            Bytes::from([b"123", BOM_UTF8, b"123"].concat())
        );

        // Chunks without any BOM pass through unchanged.
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_123())),
            Bytes::from("123")
        );
        assert_eq!(
            d.decode_to_utf8(Bytes::from(test_data_utf16le_crlf())),
            Bytes::from("\r\n")
        );
    }

    /// Encoding UTF-8 text into several encodings yields the expected bytes.
    #[test]
    fn test_encoder_various() {
        let mut e = Encoder::new(UTF_8);
        assert_eq!(e.encode_from_utf8("123"), Bytes::from("123"));
        assert_eq!(e.encode_from_utf8("\n"), Bytes::from("\n"));
        assert_eq!(e.encode_from_utf8("भेक्टर"), Bytes::from("भेक्टर"));

        let mut e = Encoder::new(UTF_16LE);
        assert_eq!(
            e.encode_from_utf8("123"),
            Bytes::from(test_data_utf16le_123())
        );
        assert_eq!(
            e.encode_from_utf8("\r\n"),
            Bytes::from(test_data_utf16le_crlf())
        );
        assert_eq!(
            e.encode_from_utf8("भेक्टर"),
            Bytes::from(test_data_utf16le_vector_devanagari())
        );

        let mut e = Encoder::new(UTF_16BE);
        assert_eq!(
            e.encode_from_utf8("123"),
            Bytes::from(test_data_utf16be_123())
        );
        assert_eq!(
            e.encode_from_utf8("\r\n"),
            Bytes::from(test_data_utf16be_crlf())
        );
        assert_eq!(
            e.encode_from_utf8("भेक्टर"),
            Bytes::from(test_data_utf16be_vector_devanagari())
        );

        let mut e = Encoder::new(SHIFT_JIS);
        assert_eq!(
            e.encode_from_utf8("\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}"),
            Bytes::from(test_data_shiftjis_helloworld_japanese())
        );
    }

    /// Input far larger than the internal scratch buffer is encoded intact,
    /// exercising the multi-iteration path of the encode loop.
    #[test]
    fn test_encoder_long_input() {
        let mut e = Encoder::new(UTF_8);

        let long_input = "This line is super long and will take up more space than Encoder's internal buffer, just to make sure that everything works properly when multiple inner encode calls are involved".repeat(10000);

        assert_eq!(
            e.encode_from_utf8(long_input.as_str()),
            Bytes::from(long_input)
        );
    }

    /// Characters the target encoding cannot represent are replaced by
    /// decimal HTML numeric character references.
    #[test]
    fn test_encoder_replacements() {
        let mut e = Encoder::new(SHIFT_JIS);

        // Neither U+2638 nor U+262F has a Shift_JIS mapping.
        let problematic_input = "\u{2638}123\u{262F}";

        // Build the expected references from the code points themselves so the
        // literal entity text never appears verbatim in the source.
        let expected = format!(
            "&#{};123&#{};",
            u32::from('\u{2638}'),
            u32::from('\u{262F}')
        );

        assert_eq!(e.encode_from_utf8(problematic_input), Bytes::from(expected));
    }

    /// Encoding and then decoding with the same encoding round-trips the text.
    #[test]
    fn test_transcode_symmetry() {
        let encoding = UTF_16LE;
        let mut encoder = Encoder::new(encoding);
        let mut decoder = Decoder::new(encoding);

        let input = "οὐροβόρος";

        assert_eq!(
            decoder.decode_to_utf8(encoder.encode_from_utf8(input)),
            Bytes::from(input),
        );
    }
}