// String escaping for byte sequences that are not guaranteed to be valid UTF-8.
use std::str;
/// Escape invalid UTF-8, so that it can be un-escaped without being lossy.
///
/// We need this because path names in Tar archives are just bytes, meaning that they can contain
/// arbitrary (non-valid UTF-8) names. We need some way of encoding them as regular strings, so
/// that we can use them in the API (for example, JSON only supports UTF-8 keys in its maps).
///
/// # Encoding
///
/// * Valid UTF-8 is copied through unchanged, except that every backslash is doubled
///   (`\` becomes `\\`).
/// * Every byte that is not part of a valid UTF-8 sequence is written as `\x` followed by
///   exactly two lowercase hexadecimal digits (for example `\xff`).
///
/// Because literal backslashes are always doubled, a lone backslash followed by `x` can only
/// have been produced by an escaped byte, so the result can be decoded unambiguously.
///
/// # Examples
///
/// ```text
/// b"plain"    -> plain
/// b"a\\b"     -> a\\b      (one backslash in, two out)
/// b"a\xffb"   -> a\xffb    (the byte 0xff is spelled out as four characters)
/// ```
pub fn escape_invalid_utf8(bytes: &[u8]) -> String {
    /// Append a valid string, doubling each backslash so that it cannot be mistaken for the
    /// start of a `\xNN` escape.
    fn write_valid(output: &mut String, valid: &str) {
        for character in valid.chars() {
            if character == '\\' {
                output.push_str("\\\\");
            } else {
                output.push(character);
            }
        }
    }

    /// Append a single invalid byte as `\xNN` (two lowercase hex digits).
    fn write_invalid(output: &mut String, invalid: u8) {
        const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";
        output.push_str("\\x");
        output.push(char::from(HEX_DIGITS[usize::from(invalid >> 4)]));
        output.push(char::from(HEX_DIGITS[usize::from(invalid & 0x0f)]));
    }

    // The output is at least as long as the input whenever the input is valid UTF-8.
    let mut output = String::with_capacity(bytes.len());
    let mut remaining = bytes;

    while !remaining.is_empty() {
        match str::from_utf8(remaining) {
            // The entire rest of the input was valid UTF-8, we are done.
            Ok(valid) => {
                write_valid(&mut output, valid);
                break;
            }
            Err(error) => {
                let (good, bad) = remaining.split_at(error.valid_up_to());
                // SAFETY: `valid_up_to()` is the length of the longest prefix of `remaining`
                // that `from_utf8` has just verified to be valid UTF-8, and `good` is exactly
                // that prefix.
                let good = unsafe { str::from_utf8_unchecked(good) };
                write_valid(&mut output, good);

                // Escape only the first offending byte and re-validate from the byte after it:
                // the bytes that follow may well form valid UTF-8 again.
                remaining = match bad.split_first() {
                    Some((&first, tail)) => {
                        write_invalid(&mut output, first);
                        tail
                    }
                    None => bad,
                };
            }
        }
    }

    output
}