1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
use std::str;

/// Escape invalid UTF-8, so that it can be un-escaped without being lossy.
///
/// We need this because path names in Tar archives are just bytes, meaning that they can contain
/// arbitrary (non-valid UTF-8) names. We need some way of encoding them as regular strings, so
/// that we can use them in the API (for example, JSON only supports UTF-8 keys in its maps).
///
/// The encoding works as follows:
///
/// - Valid UTF-8 is copied through unchanged, except that every backslash is doubled (`\\`).
/// - Every byte that is not part of a valid UTF-8 sequence is written as `\xNN`, where `NN` is
///   the value of the byte as two lowercase hexadecimal digits.
///
/// Since literal backslashes are always doubled, a lone backslash followed by `x` can only have
/// been produced by an escaped byte, which keeps the encoding unambiguous and reversible.
///
/// An empty input yields an empty string.
pub fn escape_invalid_utf8(bytes: &[u8]) -> String {
    // Copies a valid chunk, doubling backslashes so they cannot be mistaken for an escape.
    fn write_valid(output: &mut String, valid: &str) {
        for character in valid.chars() {
            if character == '\\' {
                output.push_str("\\\\");
            } else {
                output.push(character);
            }
        }
    }

    // Writes a single invalid byte as `\xNN` (two lowercase hex digits).
    fn write_invalid(output: &mut String, invalid: u8) {
        const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

        output.push_str("\\x");
        output.push(char::from(HEX_DIGITS[usize::from(invalid >> 4)]));
        output.push(char::from(HEX_DIGITS[usize::from(invalid & 0x0f)]));
    }

    // The output is at least as long as the input; escapes only ever make it longer.
    let mut output = String::with_capacity(bytes.len());
    let mut remaining = bytes;

    while !remaining.is_empty() {
        match str::from_utf8(remaining) {
            // The entire rest of the input is valid UTF-8, so we are done after writing it.
            Ok(valid) => {
                write_valid(&mut output, valid);
                break;
            }
            Err(error) => {
                let (good, bad) = remaining.split_at(error.valid_up_to());
                // SAFETY: `Utf8Error::valid_up_to` guarantees that the bytes before that index
                // are valid UTF-8, and `good` is exactly that prefix.
                let good = unsafe { str::from_utf8_unchecked(good) };
                write_valid(&mut output, good);

                // Escape only the first offending byte and re-validate from the byte after it;
                // any following bytes that are still invalid are escaped on later iterations.
                match bad.split_first() {
                    Some((&first, rest)) => {
                        write_invalid(&mut output, first);
                        remaining = rest;
                    }
                    // Validation failed, so `bad` should never be empty; stop defensively.
                    None => break,
                }
            }
        }
    }

    output
}