| extern crate utf8; |
| |
| use std::borrow::Cow; |
| use std::collections::VecDeque; |
| use std::io; |
| use utf8::*; |
| |
| /// A re-implementation of std::str::from_utf8 |
| pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> { |
| match decode(input) { |
| Ok(s) => return Ok(s), |
| Err(DecodeError::Invalid { valid_prefix, .. }) | |
| Err(DecodeError::Incomplete { valid_prefix, .. }) => Err(valid_prefix.len()), |
| } |
| } |
| |
/// Checks that `str_from_utf8` returns valid input unchanged and rejects a
/// trailing `0xFF` byte.
#[test]
fn test_str_from_utf8() {
    // Plain ASCII.
    assert_eq!(str_from_utf8(b"hello"), Ok("hello"));

    // Multi-byte text: the decoded string must equal the original.
    let text = "ศไทย中华Việt Nam";
    assert_eq!(str_from_utf8(text.as_bytes()), Ok(text));

    // Valid ASCII followed by a byte that makes the input invalid.
    let broken = b"hello\xFF";
    assert!(str_from_utf8(broken).is_err());
}
| |
/// Exercises `str_from_utf8` purely as a validity predicate over a range of
/// well-formed and ill-formed byte sequences.
#[test]
fn test_is_utf8() {
    /// True when `bytes` is accepted as UTF-8.
    fn accepts(bytes: &[u8]) -> bool {
        str_from_utf8(bytes).is_ok()
    }

    /// True when `bytes` is rejected as UTF-8.
    fn rejects(bytes: &[u8]) -> bool {
        str_from_utf8(bytes).is_err()
    }

    // Chars of 1, 2, 3, and 4 bytes
    assert!(accepts("eé€\u{10000}".as_bytes()));

    // invalid prefix
    assert!(rejects(&[0x80]));
    // invalid 2 byte prefix
    assert!(rejects(&[0xc0]));
    assert!(rejects(&[0xc0, 0x10]));
    // invalid 3 byte prefix
    assert!(rejects(&[0xe0]));
    assert!(rejects(&[0xe0, 0x10]));
    assert!(rejects(&[0xe0, 0xff, 0x10]));
    // invalid 4 byte prefix
    assert!(rejects(&[0xf0]));
    assert!(rejects(&[0xf0, 0x10]));
    assert!(rejects(&[0xf0, 0xff, 0x10]));
    assert!(rejects(&[0xf0, 0xff, 0xff, 0x10]));

    // deny overlong encodings
    assert!(rejects(&[0xc0, 0x80]));
    assert!(rejects(&[0xc0, 0xae]));
    assert!(rejects(&[0xe0, 0x80, 0x80]));
    assert!(rejects(&[0xe0, 0x80, 0xaf]));
    assert!(rejects(&[0xe0, 0x81, 0x81]));
    assert!(rejects(&[0xf0, 0x82, 0x82, 0xac]));
    assert!(rejects(&[0xf4, 0x90, 0x80, 0x80]));

    // deny surrogates
    assert!(rejects(&[0xED, 0xA0, 0x80]));
    assert!(rejects(&[0xED, 0xBF, 0xBF]));

    // boundary sequences that must be accepted
    assert!(accepts(&[0xC2, 0x80]));
    assert!(accepts(&[0xDF, 0xBF]));
    assert!(accepts(&[0xE0, 0xA0, 0x80]));
    assert!(accepts(&[0xED, 0x9F, 0xBF]));
    assert!(accepts(&[0xEE, 0x80, 0x80]));
    assert!(accepts(&[0xEF, 0xBF, 0xBF]));
    assert!(accepts(&[0xF0, 0x90, 0x80, 0x80]));
    assert!(accepts(&[0xF4, 0x8F, 0xBF, 0xBF]));
}
| |
| /// A re-implementation of String::from_utf8_lossy |
| pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> { |
| let mut result = decode(input); |
| if let Ok(s) = result { |
| return s.into() |
| } |
| let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len()); |
| loop { |
| match result { |
| Ok(s) => { |
| string.push_str(s); |
| return string.into() |
| } |
| Err(DecodeError::Incomplete { valid_prefix, .. }) => { |
| string.push_str(valid_prefix); |
| string.push_str(REPLACEMENT_CHARACTER); |
| return string.into() |
| } |
| Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => { |
| string.push_str(valid_prefix); |
| string.push_str(REPLACEMENT_CHARACTER); |
| result = decode(remaining_input); |
| } |
| } |
| } |
| } |
| |
/// Shared fixtures for the lossy-decoding tests: each entry pairs raw input
/// bytes with the string the lossy decoders are expected to produce for them.
pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[
    // Well-formed input passes through unchanged.
    (
        b"hello",
        "hello",
    ),
    (
        b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e",
        "ศไทย中华",
    ),
    (
        b"Vi\xe1\xbb\x87t Nam",
        "Việt Nam",
    ),
    // A lead byte without its continuation, and a lone 0xFF.
    (
        b"Hello\xC2 There\xFF ",
        "Hello\u{FFFD} There\u{FFFD} ",
    ),
    // Overlong two-byte form: each byte is replaced separately.
    (
        b"Hello\xC0\x80 There",
        "Hello\u{FFFD}\u{FFFD} There",
    ),
    // Three-byte sequence cut short by an ASCII byte: one replacement.
    (
        b"\xE6\x83 Goodbye",
        "\u{FFFD} Goodbye",
    ),
    // 0xF5 bytes, once followed by a continuation byte: one replacement per byte.
    (
        b"\xF5foo\xF5\x80bar",
        "\u{FFFD}foo\u{FFFD}\u{FFFD}bar",
    ),
    // 0xF5 followed by a lead byte that ends the input.
    (
        b"\xF5foo\xF5\xC2",
        "\u{FFFD}foo\u{FFFD}\u{FFFD}",
    ),
    // Four-byte sequences truncated after 1, 2 and 3 bytes: one replacement each.
    (
        b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
        "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz",
    ),
    // 0xF4 alone, with one continuation byte, and followed by 0xBF.
    (
        b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
        "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz",
    ),
    // Overlong four-byte form (one replacement per byte), then a valid U+10000.
    (
        b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
        "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
    ),
    // Four-byte sequence missing its last byte.
    (
        b"\xF0\x90\x80foo",
        "\u{FFFD}foo",
    ),
    // surrogates
    (
        b"\xED\xA0\x80foo\xED\xBF\xBFbar",
        "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
    ),
];
| |
/// Runs `string_from_utf8_lossy` over every `DECODED_LOSSY` fixture and
/// compares the result with the expected string.
#[test]
fn test_string_from_utf8_lossy() {
    for case in DECODED_LOSSY.iter() {
        let (bytes, want) = *case;
        assert_eq!(string_from_utf8_lossy(bytes), want);
    }
}
| |
/// Calls `f` once for every way of cutting `input` into consecutive,
/// non-empty chunks, passing the chunks in order.
///
/// An empty `input` yields a single call with an empty chunk list.
/// The number of calls doubles with each extra input byte, so this is only
/// suitable for short inputs.
pub fn all_partitions<'a, F>(input: &'a [u8], f: F)
    where F: Fn(&[&[u8]])
{
    /// Extends the partition in `prefix` with every possible split of `rest`.
    fn extend_partitions<'a, F>(prefix: &mut Vec<&'a [u8]>, rest: &'a [u8], callback: &F)
        where F: Fn(&[&[u8]])
    {
        // Nothing left to split: `prefix` is one complete partition.
        if rest.is_empty() {
            callback(prefix);
            return;
        }
        // Try every non-empty head, then partition the tail recursively.
        for cut in 1..(rest.len() + 1) {
            let (head, tail) = rest.split_at(cut);
            prefix.push(head);
            extend_partitions(prefix, tail, callback);
            prefix.pop();
        }
    }

    let mut prefix = Vec::new();
    extend_partitions(&mut prefix, input, &f);
    // Every push is paired with a pop, so the stack must be empty again.
    assert_eq!(prefix.len(), 0);
}
| |
/// Feeds every `DECODED_LOSSY` input to a `LossyDecoder` in every possible
/// chunking and checks that the collected output matches the expected string.
#[test]
fn test_incremental_decoder() {
    for case in DECODED_LOSSY.iter() {
        let (bytes, want) = *case;
        all_partitions(bytes, |pieces| {
            let mut decoded = String::new();
            {
                // The inner scope ends the decoder's borrow of `decoded`.
                // NOTE(review): the fixtures with a truncated tail expect a
                // trailing U+FFFD, so the decoder presumably emits it when
                // dropped here — confirm against the utf8 crate.
                let mut decoder = LossyDecoder::new(|text| decoded.push_str(text));
                for piece in pieces.iter() {
                    decoder.feed(*piece);
                }
            }
            assert_eq!(decoded, want);
        });
    }
}
| |
/// Runs `BufReadDecoder::read_to_string_lossy` over every `DECODED_LOSSY`
/// input in every possible chunking, using `Chunks` as the reader.
#[test]
fn test_bufread_decoder() {
    for case in DECODED_LOSSY.iter() {
        let (bytes, want) = *case;
        all_partitions(bytes, |pieces| {
            // Copy the chunk list into a queue the reader can consume from.
            let queue: VecDeque<&[u8]> = pieces.iter().cloned().collect();
            let decoded = BufReadDecoder::read_to_string_lossy(Chunks(queue)).unwrap();
            assert_eq!(decoded, want);
        });
    }
}
| |
/// An in-memory `BufRead` source that serves a fixed queue of byte chunks.
///
/// Each `fill_buf` call exposes the unread part of the front chunk only, so a
/// reader sees the data in exactly the pieces the queue was built with.
struct Chunks<'a>(VecDeque<&'a [u8]>);

impl<'a> io::Read for Chunks<'a> {
    /// Intentionally left unimplemented; calling it panics.
    ///
    /// NOTE(review): presumably consumers of this type only go through the
    /// `BufRead` methods — confirm before relying on `read`.
    fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
        unimplemented!()
    }
}

impl<'a> io::BufRead for Chunks<'a> {
    /// Returns the unread remainder of the front chunk, or an empty slice
    /// once every chunk has been consumed.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // An empty buffer means end-of-stream to `BufRead` consumers, so
        // exhausted (or initially empty) chunks must never be handed out
        // while more data is still queued behind them.
        while self.0.front().map_or(false, |chunk| chunk.is_empty()) {
            self.0.pop_front();
        }
        match self.0.front() {
            Some(chunk) => Ok(*chunk),
            // Queue drained, or never populated: report end-of-stream
            // instead of panicking.
            None => Ok(&[]),
        }
    }

    /// Marks `bytes` bytes of the front chunk as read.
    ///
    /// Panics if `bytes` exceeds the unread length of the front chunk.
    /// Does nothing when the queue is empty.
    fn consume(&mut self, bytes: usize) {
        if let Some(front) = self.0.front_mut() {
            let current: &'a [u8] = *front;
            // A chunk emptied here is discarded by the next `fill_buf`.
            *front = &current[bytes..];
        }
    }
}