wstr: Implement custom UTF-8 decoding routine

2022-07-14 19:05:24 -07:00 · 2022-07-14 19:05:24 -07:00 · d6604f538c
parent 6dab6ca557
commit d6604f538c
8 changed files with 124 additions and 29 deletions
--- a/core/src/avm1/object/xml_object.rs
+++ b/core/src/avm1/object/xml_object.rs
@ -172,7 +172,7 @@ impl<'gc> XmlObject<'gc> {
                    let is_whitespace_char = |c: &u8| matches!(*c, b'\t' | b'\n' | b'\r' | b' ');
                    let is_whitespace_text = text.iter().all(is_whitespace_char);
                    if !(text.is_empty() || ignore_white && is_whitespace_text) {
-                        let text = AvmString::new_utf8_bytes(activation.context.gc_context, text)?;
+                        let text = AvmString::new_utf8_bytes(activation.context.gc_context, &text);
                        let child =
                            XmlNode::new(activation.context.gc_context, TEXT_NODE, Some(text));
                        open_tags
--- a/core/src/avm2/bytearray.rs
+++ b/core/src/avm2/bytearray.rs
@ -223,10 +223,10 @@ impl ByteArrayStorage {
        Ok(buffer)
    }

-    pub fn read_utf(&self) -> Result<String, Error> {
+    pub fn read_utf(&self) -> Result<&[u8], Error> {
        let len = self.read_unsigned_short()?;
-        let val = String::from_utf8_lossy(self.read_bytes(len.into())?);
-        Ok(val.into_owned())
+        let val = self.read_bytes(len.into())?;
+        Ok(val)
    }

    pub fn write_boolean(&mut self, val: bool) -> Result<(), Error> {
--- a/core/src/avm2/globals/flash/utils/bytearray.rs
+++ b/core/src/avm2/globals/flash/utils/bytearray.rs
@ -280,7 +280,7 @@ pub fn read_utf<'gc>(
    if let Some(this) = this {
        if let Some(bytearray) = this.as_bytearray() {
            return Ok(
-                AvmString::new_utf8(activation.context.gc_context, bytearray.read_utf()?).into(),
+                AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.read_utf()?).into(),
            );
        }
    }
@ -294,8 +294,7 @@ pub fn to_string<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(bytearray) = this.as_bytearray() {
-            let (new_string, _, _) = UTF_8.decode(bytearray.bytes());
-            return Ok(AvmString::new_utf8(activation.context.gc_context, new_string).into());
+            return Ok(AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.bytes()).into());
        }
    }

--- a/core/src/loader.rs
+++ b/core/src/loader.rs
@ -721,10 +721,9 @@ impl<'gc> Loader<'gc> {
                                ByteArrayObject::from_storage(activation, storage).unwrap();
                            bytearray.into()
                        }
-                        DataFormat::Text => Avm2Value::String(AvmString::new_utf8_bytes_lossy(
-                            activation.context.gc_context,
-                            body,
-                        )),
+                        DataFormat::Text => Avm2Value::String(
+                            AvmString::new_utf8_bytes(activation.context.gc_context, &body),
+                        ),
                        DataFormat::Variables => {
                            log::warn!(
                                "Support for URLLoaderDataFormat.VARIABLES not yet implemented"
--- a/core/src/string.rs
+++ b/core/src/string.rs
@ -36,22 +36,14 @@ impl<'gc> AvmString<'gc> {
        }
    }

-    pub fn new_utf8_bytes<'b, B: Into<Cow<'b, [u8]>>>(
+    pub fn new_utf8_bytes(
        gc_context: MutationContext<'gc, '_>,
-        bytes: B,
-    ) -> Result<Self, std::str::Utf8Error> {
-        let utf8 = match bytes.into() {
-            Cow::Owned(b) => Cow::Owned(String::from_utf8(b).map_err(|e| e.utf8_error())?),
-            Cow::Borrowed(b) => Cow::Borrowed(std::str::from_utf8(b)?),
-        };
-        Ok(Self::new_utf8(gc_context, utf8))
-    }
-
-    pub fn new_utf8_bytes_lossy<'b, B: Into<Cow<'b, [u8]>>>(
-        gc_context: MutationContext<'gc, '_>,
-        bytes: B,
+        bytes: &[u8]
    ) -> Self {
-        Self::new_utf8(gc_context, String::from_utf8_lossy(&bytes.into()))
+        let buf = WString::from_utf8_bytes(bytes.to_vec());
+        Self {
+            source: Source::Owned(Gc::allocate(gc_context, OwnedWStr(buf))),
+        }
    }

    pub fn new<S: Into<WString>>(gc_context: MutationContext<'gc, '_>, string: S) -> Self {
--- a/core/src/xml/tree.rs
+++ b/core/src/xml/tree.rs
@ -80,16 +80,16 @@ impl<'gc> XmlNode<'gc> {
        bs: BytesStart<'_>,
        id_map: ScriptObject<'gc>,
    ) -> Result<Self, quick_xml::Error> {
-        let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name())?;
+        let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name());
        let mut node = Self::new(activation.context.gc_context, ELEMENT_NODE, Some(name));

        // Reverse attributes so they appear in the `PropertyMap` in their definition order.
        let attributes: Result<Vec<_>, _> = bs.attributes().collect();
        let attributes = attributes?;
        for attribute in attributes.iter().rev() {
-            let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key)?;
+            let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key);
            let value_bytes = attribute.unescaped_value()?;
-            let value = AvmString::new_utf8_bytes(activation.context.gc_context, value_bytes)?;
+            let value = AvmString::new_utf8_bytes(activation.context.gc_context, &value_bytes);

            // Insert an attribute.
            node.attributes().define_value(
--- a/wstr/src/buf.rs
+++ b/wstr/src/buf.rs
@ -6,7 +6,9 @@ use core::mem::{self, ManuallyDrop};
 use core::ops::{Deref, DerefMut};
 use core::ptr::{self, NonNull};

-use super::utils::split_ascii_prefix;
+use crate::utils::AvmUtf8Decoder;
+
+use super::utils::{encode_raw_utf16, split_ascii_prefix, split_ascii_prefix_bytes};
 use super::{Units, WStr, MAX_STRING_LEN};

 /// An owned, extensible UCS2 string, analoguous to `String`.
@ -102,6 +104,32 @@ impl WString {
        buf
    }

+    /// Creates a `WString` from a byte string, decoding it with [`AvmUtf8Decoder`].
+    ///
+    /// The leading ASCII run is copied verbatim; everything after it is decoded.
+    /// If every decoded code point fits in one byte the result is stored as
+    /// LATIN1 (one byte per unit), otherwise as UTF-16 code units.
+    ///
+    /// Decoding never fails: byte sequences the decoder cannot interpret are
+    /// yielded by it as raw byte values and stored as such.
+    pub fn from_utf8_bytes(b: Vec<u8>) -> Self {
+        let (ascii, tail) = split_ascii_prefix_bytes(&b);
+        let ascii = ascii.as_bytes();
+        if tail.is_empty() {
+            // We can directly reinterpret ASCII bytes as LATIN1, reusing the allocation.
+            return Self::from_buf(b);
+        }
+
+        // Each decoded code point consumes at least one input byte and produces at
+        // most as many output units as it consumed bytes, so `b.len()` bounds both
+        // output buffers below.
+        let is_wide = AvmUtf8Decoder::new(tail).any(|ch| ch > u32::from(u8::MAX));
+        if is_wide {
+            let mut buf = Vec::with_capacity(b.len());
+            buf.extend(ascii.iter().map(|c| u16::from(*c)));
+            for ch in AvmUtf8Decoder::new(tail) {
+                encode_raw_utf16(ch, &mut buf);
+            }
+            Self::from_buf(buf)
+        } else {
+            let mut buf = Vec::with_capacity(b.len());
+            buf.extend_from_slice(ascii);
+            // The tail must be *decoded*, not copied: a two-byte sequence such as
+            // `C3 A9` is the single LATIN1 unit `E9`. The `is_wide` check above
+            // guarantees every code point fits in a `u8`, so the cast is lossless.
+            buf.extend(AvmUtf8Decoder::new(tail).map(|ch| ch as u8));
+            Self::from_buf(buf)
+        }
+    }
+
    /// Creates a `WString` from a single UCS2 code unit.
    #[inline]
    pub fn from_unit(c: u16) -> Self {
--- a/wstr/src/utils.rs
+++ b/wstr/src/utils.rs
@ -1,6 +1,7 @@
 ///! Utilities for operating on strings in SWF files.
 use super::tables::{LOWERCASE_TABLE, UPPERCASE_TABLE};
 use super::Units;
+use alloc::vec::Vec;

 fn is_surrogate_pair_at(us: &[u16], pos: usize) -> bool {
    if let Some(pair) = us.get(pos..pos + 2) {
@ -100,3 +101,79 @@ pub fn swf_to_uppercase(c: u16) -> u16 {
        Err(_) => c,
    }
 }
+
+/// An iterator over the code points of a byte string, in the spirit of
+/// `std::str::Chars`, but operating on raw bytes with Flash's lenient UTF-8
+/// decoding rules.
+///
+/// Items are raw `u32` values rather than `char`s, because the result is not
+/// guaranteed to be a valid Unicode scalar value (e.g. surrogates and overlong
+/// encodings are not rejected).
+///
+/// Decoding rules, per lead byte:
+/// - zero or one leading `1` bits (ASCII, or a stray continuation byte): the
+///   byte is yielded unchanged;
+/// - `k >= 2` leading `1` bits: `min(k - 1, 3)` continuation bytes are expected,
+///   each contributing its low 6 bits.
+///
+/// Whenever a multi-byte sequence cannot be decoded (truncated input, a byte
+/// that is not a continuation byte, the lead byte `0xFF`, or a result above
+/// `0x10FFFF`), the lead byte alone is yielded as-is and decoding resumes at
+/// the byte right after it.
+pub struct AvmUtf8Decoder<'a> {
+    src: &'a [u8],
+    index: usize,
+}
+
+impl<'a> AvmUtf8Decoder<'a> {
+    /// Creates a decoder positioned at the start of `src`.
+    pub fn new(src: &'a [u8]) -> Self {
+        Self { src, index: 0 }
+    }
+}
+
+impl<'a> Iterator for AvmUtf8Decoder<'a> {
+    type Item = u32;
+
+    /// Decodes and returns the next code point, or `None` at the end of input.
+    ///
+    /// Every yielded value is at most `0x10FFFF`.
+    fn next(&mut self) -> Option<Self::Item> {
+        let first = *self.src.get(self.index)?;
+        // Every outcome below consumes at least the lead byte.
+        self.index += 1;
+        let raw = Some(u32::from(first));
+
+        let ones = first.leading_ones();
+        if ones <= 1 {
+            return raw;
+        }
+
+        // Mask selecting the payload bits of the lead byte. `0xFF` has eight
+        // leading ones, so the shift amount equals the bit width: a plain `>>`
+        // would panic in debug builds. Such a byte carries no payload and is
+        // passed through as an invalid lead byte.
+        let mask = match u8::MAX.checked_shr(ones) {
+            Some(mask) => mask,
+            None => return raw,
+        };
+
+        // Lead bytes announcing more than 3 continuation bytes are clamped to 3.
+        let mb_count = core::cmp::min(ones - 1, 3) as usize;
+        let mb = match self.src.get(self.index..self.index + mb_count) {
+            Some(mb) => mb,
+            // Truncated sequence at the end of the input.
+            None => return raw,
+        };
+
+        let mut ch = u32::from(first & mask);
+        for b in mb {
+            // Continuation bytes should start with a single leading 1.
+            if b.leading_ones() != 1 {
+                return raw;
+            }
+            ch = (ch << 6) | u32::from(*b & (u8::MAX >> 2));
+        }
+
+        // A lead byte in `0xF4..=0xF7` can encode values past the last code
+        // point; those cannot be represented as a surrogate pair, so the
+        // sequence is treated like any other invalid one.
+        if ch > 0x10FFFF {
+            return raw;
+        }
+
+        self.index += mb_count;
+        Some(ch)
+    }
+}
+
+/// Appends the UTF-16 encoding of the code point `ch` to `dst`.
+///
+/// In contrast to `char::encode_utf16`, `ch` is not required to be a valid
+/// character: values below `0x10000` (lone surrogates included) are pushed as a
+/// single unit, anything else is split into a high/low surrogate pair.
+pub fn encode_raw_utf16(ch: u32, dst: &mut Vec<u16>) {
+    if ch >= 0x10000 {
+        let offset = ch - 0x10000;
+        // Bits above the low ten go into the high surrogate, the low ten bits
+        // into the low surrogate.
+        let high = 0xD800 | (offset >> 10) as u16;
+        let low = 0xDC00 | (offset & 0x3FF) as u16;
+        dst.push(high);
+        dst.push(low);
+    } else {
+        dst.push(ch as u16);
+    }
+}