ruffle/wstr/src/utils.rs

103 lines
3.5 KiB
Rust

//! Utilities for operating on strings in SWF files.
use super::tables::{LOWERCASE_TABLE, UPPERCASE_TABLE};
use super::Units;
/// Returns `true` when the two code units starting at index `pos` of `us`
/// are a high surrogate (`0xD800..=0xDBFF`) immediately followed by a low
/// surrogate (`0xDC00..=0xDFFF`).
///
/// Returns `false` if fewer than two units are available at `pos`.
fn is_surrogate_pair_at(us: &[u16], pos: usize) -> bool {
    match us.get(pos..pos + 2) {
        Some(&[high, low]) => {
            matches!(high, 0xD800..=0xDBFF) && matches!(low, 0xDC00..=0xDFFF)
        }
        _ => false,
    }
}
/// Gets the position of the char boundary preceding `pos`.
///
/// `pos` must already lie on a char boundary. Positions `0` and `1` both
/// map to `0`. Otherwise the result is `pos - 2` when the two wide code units
/// just before `pos` form a surrogate pair, and `pos - 1` in every other case.
pub fn prev_char_boundary(slice: &super::WStr, pos: usize) -> usize {
    if pos <= 1 {
        return 0;
    }

    let step = match slice.units() {
        // A surrogate pair occupies two code units; step over both of them.
        Units::Wide(units) if is_surrogate_pair_at(units, pos - 2) => 2,
        // LATIN1 strings and non-surrogate wide units are one unit per char.
        Units::Wide(_) | Units::Bytes(_) => 1,
    };
    pos - step
}
/// Gets the position (in code units) of the char boundary following `pos`.
///
/// `pos` must already lie on a char boundary. Any `pos` at or past the end
/// of `slice` maps to `slice.len()`. Otherwise the result is `pos + 2` when
/// the wide code units at `pos` form a surrogate pair, and `pos + 1` in every
/// other case.
pub fn next_char_boundary(slice: &super::WStr, pos: usize) -> usize {
    let end = slice.len();
    if pos >= end {
        return end;
    }

    let step = match slice.units() {
        // A surrogate pair occupies two code units; step over both of them.
        Units::Wide(units) if is_surrogate_pair_at(units, pos) => 2,
        // LATIN1 strings and non-surrogate wide units are one unit per char.
        Units::Wide(_) | Units::Bytes(_) => 1,
    };
    pos + step
}
/// Returns `true` if the given UTF-16 code unit is whitespace
/// according to the Flash Player.
///
/// Only tab (`0x09`), line feed (`0x0A`), carriage return (`0x0D`) and
/// space (`0x20`) qualify.
#[inline]
pub fn swf_is_whitespace(c: u16) -> bool {
    matches!(c, 0x0009 | 0x000A | 0x000D | 0x0020)
}
/// Finds the longest prefix of `slice` that is entirely ASCII,
/// and returns it as an UTF8 string, together with the remaining tail.
///
/// If `slice` contains no byte `>= 0x80`, the whole input is returned as the
/// prefix and the tail is empty.
pub fn split_ascii_prefix_bytes(slice: &[u8]) -> (&str, &[u8]) {
    // When no non-ASCII byte exists the prefix is the entire slice, so the
    // split point must fall back to `slice.len()` (not `0`, which would
    // wrongly report an empty prefix for all-ASCII input).
    let split_at = slice
        .iter()
        .position(|byte| !byte.is_ascii())
        .unwrap_or(slice.len());
    let (head, tail) = slice.split_at(split_at);
    // SAFETY: every byte of `head` is below 0x80, and pure ASCII is valid UTF-8.
    let head = unsafe { core::str::from_utf8_unchecked(head) };
    (head, tail)
}
/// Finds the longest prefix of `slice` that is entirely ASCII,
/// and returns it as a byte slice, together with the remaining tail.
///
/// This delegates to `split_ascii_prefix_bytes` on the UTF-8 bytes of
/// `slice` and converts the two halves back to the requested types.
pub fn split_ascii_prefix(slice: &str) -> (&[u8], &str) {
    let (ascii, rest) = split_ascii_prefix_bytes(slice.as_bytes());
    // SAFETY: `split_ascii_prefix_bytes` always splits on a char boundary of
    // the original `str`, so `rest` is itself valid UTF-8.
    let rest = unsafe { core::str::from_utf8_unchecked(rest) };
    (ascii.as_bytes(), rest)
}
/// Maps a UTF-16 code unit into a `char`.
///
/// Every non-surrogate code unit maps to the `char` with the same scalar value.
///
/// TODO: Surrogate characters will get replaced with the Unicode replacement character.
pub fn utf16_code_unit_to_char(c: u16) -> char {
    // For a 16-bit value, `from_u32` only fails on the surrogate range
    // 0xD800..=0xDFFF, which a lone code unit cannot represent as a `char`.
    char::from_u32(u32::from(c)).unwrap_or(char::REPLACEMENT_CHARACTER)
}
/// Maps a UCS2 code unit to its lowercase variant according to the Flash Player.
/// Note that this mapping is different from Rust's `to_lowercase`.
///
/// ASCII code units use the ASCII lowercase mapping; all others are looked up
/// in `LOWERCASE_TABLE` and returned unchanged when no entry exists.
pub fn swf_to_lowercase(c: u16) -> u16 {
    if c < 0x80 {
        // Lossless narrowing: `c` is known to fit in seven bits here.
        let ascii = c as u8;
        return u16::from(ascii.to_ascii_lowercase());
    }

    LOWERCASE_TABLE
        .binary_search_by_key(&c, |&(key, _)| key)
        .map_or(c, |idx| LOWERCASE_TABLE[idx].1)
}
/// Maps a UCS2 code unit to its uppercase variant according to the Flash Player.
/// Note that this mapping is different from Rust's `to_uppercase`.
///
/// ASCII code units use the ASCII uppercase mapping; all others are looked up
/// in `UPPERCASE_TABLE` and returned unchanged when no entry exists.
pub fn swf_to_uppercase(c: u16) -> u16 {
    if c < 0x80 {
        // Lossless narrowing: `c` is known to fit in seven bits here.
        let ascii = c as u8;
        return u16::from(ascii.to_ascii_uppercase());
    }

    UPPERCASE_TABLE
        .binary_search_by_key(&c, |&(key, _)| key)
        .map_or(c, |idx| UPPERCASE_TABLE[idx].1)
}