wstr: Implement custom UTF-8 decoding routine
This commit is contained in:
parent
6dab6ca557
commit
d6604f538c
|
@ -172,7 +172,7 @@ impl<'gc> XmlObject<'gc> {
|
|||
let is_whitespace_char = |c: &u8| matches!(*c, b'\t' | b'\n' | b'\r' | b' ');
|
||||
let is_whitespace_text = text.iter().all(is_whitespace_char);
|
||||
if !(text.is_empty() || ignore_white && is_whitespace_text) {
|
||||
let text = AvmString::new_utf8_bytes(activation.context.gc_context, text)?;
|
||||
let text = AvmString::new_utf8_bytes(activation.context.gc_context, &text);
|
||||
let child =
|
||||
XmlNode::new(activation.context.gc_context, TEXT_NODE, Some(text));
|
||||
open_tags
|
||||
|
|
|
@ -223,10 +223,10 @@ impl ByteArrayStorage {
|
|||
Ok(buffer)
|
||||
}
|
||||
|
||||
pub fn read_utf(&self) -> Result<String, Error> {
|
||||
pub fn read_utf(&self) -> Result<&[u8], Error> {
|
||||
let len = self.read_unsigned_short()?;
|
||||
let val = String::from_utf8_lossy(self.read_bytes(len.into())?);
|
||||
Ok(val.into_owned())
|
||||
let val = self.read_bytes(len.into())?;
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
pub fn write_boolean(&mut self, val: bool) -> Result<(), Error> {
|
||||
|
|
|
@ -280,7 +280,7 @@ pub fn read_utf<'gc>(
|
|||
if let Some(this) = this {
|
||||
if let Some(bytearray) = this.as_bytearray() {
|
||||
return Ok(
|
||||
AvmString::new_utf8(activation.context.gc_context, bytearray.read_utf()?).into(),
|
||||
AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.read_utf()?).into(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -294,8 +294,7 @@ pub fn to_string<'gc>(
|
|||
) -> Result<Value<'gc>, Error> {
|
||||
if let Some(this) = this {
|
||||
if let Some(bytearray) = this.as_bytearray() {
|
||||
let (new_string, _, _) = UTF_8.decode(bytearray.bytes());
|
||||
return Ok(AvmString::new_utf8(activation.context.gc_context, new_string).into());
|
||||
return Ok(AvmString::new_utf8_bytes(activation.context.gc_context, bytearray.bytes()).into());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -721,10 +721,9 @@ impl<'gc> Loader<'gc> {
|
|||
ByteArrayObject::from_storage(activation, storage).unwrap();
|
||||
bytearray.into()
|
||||
}
|
||||
DataFormat::Text => Avm2Value::String(AvmString::new_utf8_bytes_lossy(
|
||||
activation.context.gc_context,
|
||||
body,
|
||||
)),
|
||||
DataFormat::Text => Avm2Value::String(
|
||||
AvmString::new_utf8_bytes(activation.context.gc_context, &body),
|
||||
),
|
||||
DataFormat::Variables => {
|
||||
log::warn!(
|
||||
"Support for URLLoaderDataFormat.VARIABLES not yet implemented"
|
||||
|
|
|
@ -36,22 +36,14 @@ impl<'gc> AvmString<'gc> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn new_utf8_bytes<'b, B: Into<Cow<'b, [u8]>>>(
|
||||
pub fn new_utf8_bytes(
|
||||
gc_context: MutationContext<'gc, '_>,
|
||||
bytes: B,
|
||||
) -> Result<Self, std::str::Utf8Error> {
|
||||
let utf8 = match bytes.into() {
|
||||
Cow::Owned(b) => Cow::Owned(String::from_utf8(b).map_err(|e| e.utf8_error())?),
|
||||
Cow::Borrowed(b) => Cow::Borrowed(std::str::from_utf8(b)?),
|
||||
};
|
||||
Ok(Self::new_utf8(gc_context, utf8))
|
||||
}
|
||||
|
||||
pub fn new_utf8_bytes_lossy<'b, B: Into<Cow<'b, [u8]>>>(
|
||||
gc_context: MutationContext<'gc, '_>,
|
||||
bytes: B,
|
||||
bytes: &[u8]
|
||||
) -> Self {
|
||||
Self::new_utf8(gc_context, String::from_utf8_lossy(&bytes.into()))
|
||||
let buf = WString::from_utf8_bytes(bytes.to_vec());
|
||||
Self {
|
||||
source: Source::Owned(Gc::allocate(gc_context, OwnedWStr(buf))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new<S: Into<WString>>(gc_context: MutationContext<'gc, '_>, string: S) -> Self {
|
||||
|
|
|
@ -80,16 +80,16 @@ impl<'gc> XmlNode<'gc> {
|
|||
bs: BytesStart<'_>,
|
||||
id_map: ScriptObject<'gc>,
|
||||
) -> Result<Self, quick_xml::Error> {
|
||||
let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name())?;
|
||||
let name = AvmString::new_utf8_bytes(activation.context.gc_context, bs.name());
|
||||
let mut node = Self::new(activation.context.gc_context, ELEMENT_NODE, Some(name));
|
||||
|
||||
// Reverse attributes so they appear in the `PropertyMap` in their definition order.
|
||||
let attributes: Result<Vec<_>, _> = bs.attributes().collect();
|
||||
let attributes = attributes?;
|
||||
for attribute in attributes.iter().rev() {
|
||||
let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key)?;
|
||||
let key = AvmString::new_utf8_bytes(activation.context.gc_context, attribute.key);
|
||||
let value_bytes = attribute.unescaped_value()?;
|
||||
let value = AvmString::new_utf8_bytes(activation.context.gc_context, value_bytes)?;
|
||||
let value = AvmString::new_utf8_bytes(activation.context.gc_context, &value_bytes);
|
||||
|
||||
// Insert an attribute.
|
||||
node.attributes().define_value(
|
||||
|
|
|
@ -6,7 +6,9 @@ use core::mem::{self, ManuallyDrop};
|
|||
use core::ops::{Deref, DerefMut};
|
||||
use core::ptr::{self, NonNull};
|
||||
|
||||
use super::utils::split_ascii_prefix;
|
||||
use crate::utils::AvmUtf8Decoder;
|
||||
|
||||
use super::utils::{encode_raw_utf16, split_ascii_prefix, split_ascii_prefix_bytes};
|
||||
use super::{Units, WStr, MAX_STRING_LEN};
|
||||
|
||||
/// An owned, extensible UCS2 string, analogous to `String`.
|
||||
|
@ -102,6 +104,32 @@ impl WString {
|
|||
buf
|
||||
}
|
||||
|
||||
pub fn from_utf8_bytes(b: Vec<u8>) -> Self {
|
||||
let (ascii, tail) = split_ascii_prefix_bytes(&b);
|
||||
let ascii = ascii.as_bytes();
|
||||
if tail.is_empty() {
|
||||
// We can directly reinterpret ASCII bytes as LATIN1.
|
||||
return Self::from_buf(b);
|
||||
}
|
||||
|
||||
let is_wide = AvmUtf8Decoder::new(tail)
|
||||
.find(|ch| *ch > u8::MAX.into())
|
||||
.is_some();
|
||||
if is_wide {
|
||||
let mut buf = Vec::new();
|
||||
buf.extend(ascii.iter().map(|c| u16::from(*c)));
|
||||
for ch in AvmUtf8Decoder::new(tail) {
|
||||
encode_raw_utf16(ch, &mut buf);
|
||||
}
|
||||
Self::from_buf(buf)
|
||||
} else {
|
||||
let mut buf = Vec::new();
|
||||
buf.extend_from_slice(ascii);
|
||||
buf.extend(tail.iter());
|
||||
Self::from_buf(buf)
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `WString` from a single UCS2 code unit.
|
||||
#[inline]
|
||||
pub fn from_unit(c: u16) -> Self {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
//! Utilities for operating on strings in SWF files.
|
||||
use super::tables::{LOWERCASE_TABLE, UPPERCASE_TABLE};
|
||||
use super::Units;
|
||||
use alloc::vec::Vec;
|
||||
|
||||
fn is_surrogate_pair_at(us: &[u16], pos: usize) -> bool {
|
||||
if let Some(pair) = us.get(pos..pos + 2) {
|
||||
|
@ -100,3 +101,79 @@ pub fn swf_to_uppercase(c: u16) -> u16 {
|
|||
Err(_) => c,
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the code points encoded in a byte slice.
///
/// This is the same idea as `std::str::Chars`, except that it uses Flash's lenient
/// UTF-8 decoding rules and works on raw bytes. It does not yield `char`s but raw
/// `u32`s that may or may not be valid chars (surrogates are not rejected); the
/// yielded values are never greater than `0x10FFFF`.
///
/// Malformed input is never skipped and never causes an error. Whenever a lead byte
/// cannot be decoded as a multi-byte sequence (a continuation byte is missing or
/// malformed, or the decoded value would exceed `0x10FFFF`), the lead byte itself is
/// yielded and decoding resumes at the byte right after it.
pub struct AvmUtf8Decoder<'a> {
    src: &'a [u8],
    index: usize,
}

impl<'a> AvmUtf8Decoder<'a> {
    /// Creates a decoder positioned at the start of `src`.
    pub fn new(src: &'a [u8]) -> Self {
        Self { src, index: 0 }
    }
}

impl<'a> Iterator for AvmUtf8Decoder<'a> {
    type Item = u32;

    fn next(&mut self) -> Option<Self::Item> {
        let first = *self.src.get(self.index)?;
        // Whatever happens below, the lead byte is always consumed.
        self.index += 1;

        let ones = first.leading_ones();
        if ones <= 1 {
            // ASCII, or a stray continuation byte: passed through unchanged.
            return Some(u32::from(first));
        }

        // At most three continuation bytes are read, even for lead bytes with five
        // or more leading ones.
        let mb_count = core::cmp::min(ones - 1, 3) as usize;
        // `ones` is 8 for a lead byte of 0xFF, and a plain `>>` by the full bit
        // width panics in debug builds. `wrapping_shr` keeps the release-build
        // result (a mask of 0xFF), which the range check below then rejects.
        let payload_mask = u8::MAX.wrapping_shr(ones);

        let decoded = self
            .src
            .get(self.index..)
            .and_then(|rest| rest.get(..mb_count))
            .and_then(|continuation| {
                continuation
                    .iter()
                    .try_fold(u32::from(first & payload_mask), |ch, &b| {
                        // Continuation bytes must start with exactly one leading 1.
                        (b.leading_ones() == 1).then(|| (ch << 6) | u32::from(b & 0x3F))
                    })
            })
            // Values past the last code point cannot be encoded as a surrogate
            // pair; treat the sequence as malformed instead.
            .filter(|&ch| ch <= 0x10FFFF);

        match decoded {
            Some(ch) => {
                self.index += mb_count;
                Some(ch)
            }
            // Truncated, malformed or out-of-range sequence: yield the lead byte.
            None => Some(u32::from(first)),
        }
    }
}
|
||||
|
||||
/// Appends the UTF-16 encoding of the raw code point `ch` to `dst`.
///
/// Unlike `char::encode_utf16`, the code point is not required to be a valid `char`:
/// values below `0x10000` (lone surrogates included) are pushed as a single unit,
/// and everything else is split into a high/low surrogate pair.
pub fn encode_raw_utf16(mut ch: u32, dst: &mut Vec<u16>) {
    if ch >= 0x10000 {
        // Supplementary plane: 20 bits of offset, 10 in each surrogate.
        ch -= 0x10000;
        let high = 0xD800 | (ch >> 10) as u16;
        let low = 0xDC00 | (ch & 0x3FF) as u16;
        dst.push(high);
        dst.push(low);
    } else {
        dst.push(ch as u16);
    }
}
|
||||
|
|
Loading…
Reference in New Issue