wstr: Implement custom UTF-8 decoding routine

This commit is contained in:
EmperorBale 2022-07-14 19:05:24 -07:00 committed by Mike Welsh
parent 6dab6ca557
commit d6604f538c
8 changed files with 124 additions and 29 deletions

View File

@ -172,7 +172,7 @@ impl<'gc> XmlObject<'gc> {
let is_whitespace_char = |c: &u8| matches!(*c, b'\t' | b'\n' | b'\r' | b' ');
let is_whitespace_text = text.iter().all(is_whitespace_char);
if !(text.is_empty() || ignore_white && is_whitespace_text) {
let text = AvmString::new_utf8_bytes(activation.context.gc_context, text)?;
let text = AvmString::new_utf8_bytes(activation.context.gc_context, &text);
let child =
XmlNode::new(activation.context.gc_context, TEXT_NODE, Some(text));
open_tags

View File

@ -223,10 +223,10 @@ impl ByteArrayStorage {
Ok(buffer)
}
pub fn read_utf(&self) -> Result<String, Error> {
pub fn read_utf(&self) -> Result<&[u8], Error> {
let len = self.read_unsigned_short()?;
let val = String::from_utf8_lossy(self.read_bytes(len.into())?);
Ok(val.into_owned())
let val = self.read_bytes(len.into())?;
Ok(val)
}
pub fn write_boolean(&mut self, val: bool) -> Result<(), Error> {

View File

@ -280,7 +280,7 @@ pub fn read_utf<'gc>(
if let Some(this) = this {
if let Some(bytearray) = this.as_bytearray() {
return Ok(
AvmString::new_utf8(activation.context.gc_context, bytearray.read_utf()?).into(),
AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.read_utf()?).into(),
);
}
}
@ -294,8 +294,7 @@ pub fn to_string<'gc>(
) -> Result<Value<'gc>, Error> {
if let Some(this) = this {
if let Some(bytearray) = this.as_bytearray() {
let (new_string, _, _) = UTF_8.decode(bytearray.bytes());
return Ok(AvmString::new_utf8(activation.context.gc_context, new_string).into());
return Ok(AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.bytes()).into());
}
}

View File

@ -721,10 +721,9 @@ impl<'gc> Loader<'gc> {
ByteArrayObject::from_storage(activation, storage).unwrap();
bytearray.into()
}
DataFormat::Text => Avm2Value::String(AvmString::new_utf8_bytes_lossy(
activation.context.gc_context,
body,
)),
DataFormat::Text => Avm2Value::String(
AvmString::new_utf8_bytes(activation.context.gc_context, &body),
),
DataFormat::Variables => {
log::warn!(
"Support for URLLoaderDataFormat.VARIABLES not yet implemented"

View File

@ -36,22 +36,14 @@ impl<'gc> AvmString<'gc> {
}
}
pub fn new_utf8_bytes<'b, B: Into<Cow<'b, [u8]>>>(
pub fn new_utf8_bytes(
gc_context: MutationContext<'gc, '_>,
bytes: B,
) -> Result<Self, std::str::Utf8Error> {
let utf8 = match bytes.into() {
Cow::Owned(b) => Cow::Owned(String::from_utf8(b).map_err(|e| e.utf8_error())?),
Cow::Borrowed(b) => Cow::Borrowed(std::str::from_utf8(b)?),
};
Ok(Self::new_utf8(gc_context, utf8))
}
pub fn new_utf8_bytes_lossy<'b, B: Into<Cow<'b, [u8]>>>(
gc_context: MutationContext<'gc, '_>,
bytes: B,
bytes: &[u8]
) -> Self {
Self::new_utf8(gc_context, String::from_utf8_lossy(&bytes.into()))
let buf = WString::from_utf8_bytes(bytes.to_vec());
Self {
source: Source::Owned(Gc::allocate(gc_context, OwnedWStr(buf))),
}
}
pub fn new<S: Into<WString>>(gc_context: MutationContext<'gc, '_>, string: S) -> Self {

View File

@ -80,16 +80,16 @@ impl<'gc> XmlNode<'gc> {
bs: BytesStart<'_>,
id_map: ScriptObject<'gc>,
) -> Result<Self, quick_xml::Error> {
let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name())?;
let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name());
let mut node = Self::new(activation.context.gc_context, ELEMENT_NODE, Some(name));
// Reverse attributes so they appear in the `PropertyMap` in their definition order.
let attributes: Result<Vec<_>, _> = bs.attributes().collect();
let attributes = attributes?;
for attribute in attributes.iter().rev() {
let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key)?;
let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key);
let value_bytes = attribute.unescaped_value()?;
let value = AvmString::new_utf8_bytes(activation.context.gc_context, value_bytes)?;
let value = AvmString::new_utf8_bytes(activation.context.gc_context, &value_bytes);
// Insert an attribute.
node.attributes().define_value(

View File

@ -6,7 +6,9 @@ use core::mem::{self, ManuallyDrop};
use core::ops::{Deref, DerefMut};
use core::ptr::{self, NonNull};
use super::utils::split_ascii_prefix;
use crate::utils::AvmUtf8Decoder;
use super::utils::{encode_raw_utf16, split_ascii_prefix, split_ascii_prefix_bytes};
use super::{Units, WStr, MAX_STRING_LEN};
/// An owned, extensible UCS2 string, analogous to `String`.
@ -102,6 +104,32 @@ impl WString {
buf
}
/// Creates a `WString` from raw bytes, decoding them with [`AvmUtf8Decoder`].
///
/// Decoding never fails: whatever code points the decoder yields are stored.
/// The result is backed by a `u8` buffer when every decoded code point fits
/// in a single byte, and by a `u16` buffer (UTF-16 encoded through
/// `encode_raw_utf16`) otherwise.
pub fn from_utf8_bytes(b: Vec<u8>) -> Self {
    let (ascii, tail) = split_ascii_prefix_bytes(&b);
    let ascii = ascii.as_bytes();
    if tail.is_empty() {
        // We can directly reinterpret ASCII bytes as LATIN1.
        return Self::from_buf(b);
    }

    // A single decoded code point above 0xFF forces the wide representation.
    let is_wide = AvmUtf8Decoder::new(tail).any(|ch| ch > u32::from(u8::MAX));
    if is_wide {
        let mut buf = Vec::new();
        buf.extend(ascii.iter().map(|c| u16::from(*c)));
        for ch in AvmUtf8Decoder::new(tail) {
            encode_raw_utf16(ch, &mut buf);
        }
        Self::from_buf(buf)
    } else {
        let mut buf = Vec::new();
        buf.extend_from_slice(ascii);
        // Store the *decoded* code points, not the raw UTF-8 bytes: a
        // sequence such as C3 A9 must become the single unit 0xE9.
        // Every decoded value is <= 0xFF here (checked by `is_wide` above),
        // so the narrowing cast is lossless.
        buf.extend(AvmUtf8Decoder::new(tail).map(|ch| ch as u8));
        Self::from_buf(buf)
    }
}
/// Creates a `WString` from a single UCS2 code unit.
#[inline]
pub fn from_unit(c: u16) -> Self {

View File

@ -1,6 +1,7 @@
//! Utilities for operating on strings in SWF files.
use super::tables::{LOWERCASE_TABLE, UPPERCASE_TABLE};
use super::Units;
use alloc::vec::Vec;
fn is_surrogate_pair_at(us: &[u16], pos: usize) -> bool {
if let Some(pair) = us.get(pos..pos + 2) {
@ -100,3 +101,79 @@ pub fn swf_to_uppercase(c: u16) -> u16 {
Err(_) => c,
}
}
/// This is the same idea as `std::str::Chars`, except it uses flash's weird UTF-8 decoding rules,
/// and works on raw bytes. It also does not return `char`, but raw u32's that may or may not be valid chars.
///
/// Decoding rules implemented by the iterator:
/// - A byte with zero or one leading `1` bits is yielded as-is.
/// - A byte with two or more leading `1` bits is a lead byte. It is followed
///   by `min(leading_ones - 1, 3)` continuation bytes, each of which must
///   have exactly one leading `1` bit and contributes its low 6 bits.
/// - If the sequence is truncated, a continuation byte is malformed, or the
///   decoded value exceeds `0x10FFFF`, only the lead byte is consumed and it
///   is yielded as-is.
///
/// Overlong encodings and surrogate code points are not rejected.
pub struct AvmUtf8Decoder<'a> {
    /// The bytes being decoded.
    src: &'a [u8],
    /// Offset into `src` of the next byte to decode.
    index: usize,
}

impl<'a> AvmUtf8Decoder<'a> {
    /// Creates a decoder positioned at the first byte of `src`.
    pub fn new(src: &'a [u8]) -> Self {
        Self { src, index: 0 }
    }
}

impl<'a> Iterator for AvmUtf8Decoder<'a> {
    type Item = u32;

    /// Decodes and returns the next code point, or `None` once all of `src`
    /// has been consumed. Never panics, whatever the input bytes are.
    fn next(&mut self) -> Option<Self::Item> {
        let first = *self.src.get(self.index)?;
        // Every path below consumes at least the lead byte.
        self.index += 1;

        let ones = first.leading_ones();
        if ones <= 1 {
            // ASCII, or a stray continuation byte: passed through unchanged.
            return Some(u32::from(first));
        }

        let mb_count = core::cmp::min(ones - 1, 3) as usize;
        // 0xFF has eight leading ones; a plain `>>` by 8 overflows a `u8`
        // shift (panicking in debug builds). Such a byte carries no payload.
        let payload_mask = u8::MAX.checked_shr(ones).unwrap_or(0);

        let decoded = self
            .src
            .get(self.index..)
            .and_then(|rest| rest.get(..mb_count))
            .and_then(|continuation| {
                continuation
                    .iter()
                    .try_fold(u32::from(first & payload_mask), |acc, &b| {
                        // continuation bytes should start with a single leading 1
                        (b.leading_ones() == 1).then(|| (acc << 6) | u32::from(b & 0x3F))
                    })
            })
            // Values past the last Unicode code point cannot be encoded as
            // UTF-16, so they are handled like any other malformed sequence.
            .filter(|&ch| ch <= 0x10FFFF);

        match decoded {
            Some(ch) => {
                self.index += mb_count;
                Some(ch)
            }
            // Malformed or truncated: yield just the lead byte and resume
            // decoding at the byte that follows it.
            None => Some(u32::from(first)),
        }
    }
}
/// Appends the UTF-16 encoding of the raw code point `ch` to `dst`.
///
/// Unlike `char::encode_utf16`, no validity check is performed: values below
/// `0x10000` (lone surrogates included) are pushed as a single unit, and
/// anything else is split into a high/low surrogate pair.
pub fn encode_raw_utf16(mut ch: u32, dst: &mut Vec<u16>) {
    if ch >= 0x10000 {
        // Supplementary plane: 10 high bits go into the lead surrogate,
        // the low 10 bits into the trail surrogate.
        ch -= 0x10000;
        let lead = 0xD800 | (ch >> 10) as u16;
        let trail = 0xDC00 | (ch & 0x3FF) as u16;
        dst.extend_from_slice(&[lead, trail]);
    } else {
        dst.push(ch as u16);
    }
}