//! Utilities for operating on strings in SWF files.

use super::tables::{LOWERCASE_TABLE, UPPERCASE_TABLE};
use super::Units;

/// Returns `true` if the two code units starting at `pos` form a UTF-16
/// surrogate pair, i.e. a high surrogate (`0xD800..=0xDBFF`) immediately
/// followed by a low surrogate (`0xDC00..=0xDFFF`).
///
/// Returns `false` when fewer than two code units are available at `pos`.
fn is_surrogate_pair_at(us: &[u16], pos: usize) -> bool {
    if let Some(pair) = us.get(pos..pos + 2) {
        let has_high = (0xD800..=0xDBFF).contains(&pair[0]);
        let has_low = (0xDC00..=0xDFFF).contains(&pair[1]);
        has_high && has_low
    } else {
        false
    }
}

/// Gets the position of the previous utf16 char;
/// `pos` must already lie on a char boundary.
///
/// Positions are expressed in code units of `slice`. Any `pos` of `0` or `1`
/// yields `0`.
pub fn prev_char_boundary(slice: &super::WStr, pos: usize) -> usize {
    if pos <= 1 {
        return 0;
    }

    match slice.units() {
        // LATIN1 strings only contain 1-byte chars.
        Units::Bytes(_) => pos - 1,
        // The previous char is a surrogate pair: step over both of its units.
        Units::Wide(us) if is_surrogate_pair_at(us, pos - 2) => pos - 2,
        Units::Wide(_) => pos - 1,
    }
}

/// Gets the position of the next utf16 char;
/// `pos` must already lie on a char boundary.
///
/// Positions are expressed in code units of `slice`. Any `pos` at or past the
/// end of `slice` yields `slice.len()`.
pub fn next_char_boundary(slice: &super::WStr, pos: usize) -> usize {
    if pos >= slice.len() {
        return slice.len();
    }

    match slice.units() {
        // LATIN1 strings only contain 1-byte chars.
        Units::Bytes(_) => pos + 1,
        // The current char is a surrogate pair: step over both of its units.
        Units::Wide(us) if is_surrogate_pair_at(us, pos) => pos + 2,
        Units::Wide(_) => pos + 1,
    }
}

/// Returns `true` if the given utf16 code unit is a whitespace character
/// according to the Flash Player.
///
/// Only space, tab, line feed and carriage return are accepted.
#[inline]
pub fn swf_is_whitespace(c: u16) -> bool {
    matches!(u8::try_from(c), Ok(b' ' | b'\t' | b'\n' | b'\r'))
}

/// Finds the longest prefix of `slice` that is entirely ASCII,
/// and returns it as an UTF8 string, together with the remaining tail.
///
/// If `slice` contains no non-ASCII byte, the whole slice is returned as the
/// prefix and the tail is empty.
pub fn split_ascii_prefix_bytes(slice: &[u8]) -> (&str, &[u8]) {
    // When no non-ASCII byte is found, everything is ASCII, so the split
    // point is the end of the slice (not its start).
    let first_non_ascii = slice
        .iter()
        .position(|c| !c.is_ascii())
        .unwrap_or(slice.len());
    let (head, tail) = slice.split_at(first_non_ascii);
    // SAFETY: `head` only contains ASCII bytes, which are always valid UTF-8.
    let head = unsafe { core::str::from_utf8_unchecked(head) };
    (head, tail)
}

/// Finds the longest prefix of `slice` that is entirely ASCII,
/// and returns it as a byte slice, together with the remaining tail.
pub fn split_ascii_prefix(slice: &str) -> (&[u8], &str) { let (head, tail) = split_ascii_prefix_bytes(slice.as_bytes()); // SAFETY: `split_ascii_prefix_bytes` always split on a char boundary. let tail = unsafe { core::str::from_utf8_unchecked(tail) }; (head.as_bytes(), tail) } /// Maps a UTF-16 code unit into a `char`. /// TODO: Surrogate characters will get replaced with the Unicode replacement character. pub fn utf16_code_unit_to_char(c: u16) -> char { char::decode_utf16(core::iter::once(c)) .next() .unwrap() .unwrap_or(char::REPLACEMENT_CHARACTER) } /// Maps a UCS2 code unit to its lowercase variant according to the Flash Player. /// Note that this mapping is different that Rust's `to_lowercase`. pub fn swf_to_lowercase(c: u16) -> u16 { if c < 0x80 { return (c as u8).to_ascii_lowercase().into(); } match LOWERCASE_TABLE.binary_search_by(|&(key, _)| key.cmp(&c)) { Ok(i) => LOWERCASE_TABLE[i].1, Err(_) => c, } } /// Maps a UCS2 code unit to its uppercase variant according to the Flash Player. /// Note that this mapping is different that Rust's `to_uppercase`. pub fn swf_to_uppercase(c: u16) -> u16 { if c < 0x80 { return (c as u8).to_ascii_uppercase().into(); } match UPPERCASE_TABLE.binary_search_by(|&(key, _)| key.cmp(&c)) { Ok(i) => UPPERCASE_TABLE[i].1, Err(_) => c, } }