From 4a09088d422336aef71f47868cfff26c23bf558b Mon Sep 17 00:00:00 2001
From: Moulins <arthur.heuillard@orange.fr>
Date: Mon, 20 Sep 2021 23:56:17 +0200
Subject: [PATCH] avm2: rewrite regexp module to not rely on AvmString::as_str

This is a little tricky, because we have to map the utf8 indices
returned by the regex engine to utf16 indices usable by Ruffle.

To limit the impact on performance, the regex, the string we're
currently matching on, and the last known (utf8, utf16) positions
are cached, avoiding extra utf8 conversions in common use cases
where a single string is repeatedly searched with increasing
`lastIndex`.
---
 core/src/avm2/globals/regexp.rs       |  50 +++--
 core/src/avm2/globals/string.rs       |  19 +-
 core/src/avm2/object/regexp_object.rs |  14 +-
 core/src/avm2/regexp.rs               | 289 ++++++++++++++++++++------
 core/src/string.rs                    |   2 +-
 core/src/string/avm.rs                |   9 +
 core/src/string/buf.rs                |  10 +-
 core/src/string/common.rs             |   8 +
 core/src/string/ops.rs                |  44 +++-
 core/src/string/pattern.rs            |   2 +-
 core/src/string/utils.rs              |  19 ++
 11 files changed, 348 insertions(+), 118 deletions(-)

diff --git a/core/src/avm2/globals/regexp.rs b/core/src/avm2/globals/regexp.rs
index 96c117f18..617d6c037 100644
--- a/core/src/avm2/globals/regexp.rs
+++ b/core/src/avm2/globals/regexp.rs
@@ -4,10 +4,11 @@ use crate::avm2::class::Class;
 use crate::avm2::method::{Method, NativeMethodImpl, ParamConfig};
 use crate::avm2::names::{Namespace, QName};
 use crate::avm2::object::{regexp_allocator, ArrayObject, Object, TObject};
+use crate::avm2::regexp::RegExpFlags;
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::avm2::{activation::Activation, array::ArrayStorage};
-use crate::string::AvmString;
+use crate::string::{AvmString, WString};
 use gc_arena::{GcCell, MutationContext};
 
 /// Implements `RegExp`'s instance initializer.
@@ -26,20 +27,24 @@ pub fn instance_init<'gc>(
                     .coerce_to_string(activation)?,
             );
 
-            let flags = args
+            let flag_chars = args
                 .get(1)
                 .unwrap_or(&Value::String("".into()))
                 .coerce_to_string(activation)?;
-            for flag in flags.chars() {
-                match flag {
-                    's' => regexp.set_dotall(true),
-                    'x' => regexp.set_extended(true),
-                    'g' => regexp.set_global(true),
-                    'i' => regexp.set_ignore_case(true),
-                    'm' => regexp.set_multiline(true),
-                    _ => {}
+
+            let mut flags = RegExpFlags::empty();
+            for c in &flag_chars {
+                flags |= match u8::try_from(c) {
+                    Ok(b's') => RegExpFlags::DOTALL,
+                    Ok(b'x') => RegExpFlags::EXTENDED,
+                    Ok(b'g') => RegExpFlags::GLOBAL,
+                    Ok(b'i') => RegExpFlags::IGNORE_CASE,
+                    Ok(b'm') => RegExpFlags::MULTILINE,
+                    _ => continue,
                 };
             }
+
+            regexp.set_flags(flags);
         }
     }
 
@@ -63,7 +68,7 @@ pub fn dotall<'gc>(
 ) -> Result<Value<'gc>, Error> {
     if let Some(this) = this {
         if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.dotall().into());
+            return Ok(regexp.flags().contains(RegExpFlags::DOTALL).into());
         }
     }
 
@@ -78,7 +83,7 @@ pub fn extended<'gc>(
 ) -> Result<Value<'gc>, Error> {
     if let Some(this) = this {
         if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.extended().into());
+            return Ok(regexp.flags().contains(RegExpFlags::EXTENDED).into());
         }
     }
 
@@ -93,7 +98,7 @@ pub fn global<'gc>(
 ) -> Result<Value<'gc>, Error> {
     if let Some(this) = this {
         if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.global().into());
+            return Ok(regexp.flags().contains(RegExpFlags::GLOBAL).into());
         }
     }
 
@@ -108,7 +113,7 @@ pub fn ignore_case<'gc>(
 ) -> Result<Value<'gc>, Error> {
     if let Some(this) = this {
         if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.ignore_case().into());
+            return Ok(regexp.flags().contains(RegExpFlags::IGNORE_CASE).into());
         }
     }
 
@@ -123,7 +128,7 @@ pub fn multiline<'gc>(
 ) -> Result<Value<'gc>, Error> {
     if let Some(this) = this {
         if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.multiline().into());
+            return Ok(regexp.flags().contains(RegExpFlags::MULTILINE).into());
         }
     }
 
@@ -192,17 +197,16 @@ pub fn exec<'gc>(
                 .unwrap_or(&Value::Undefined)
                 .coerce_to_string(activation)?;
 
-            let (storage, index) = match re.exec(&text) {
+            let (storage, index) = match re.exec(text) {
                 Some(matched) => {
                     let substrings = matched
                         .groups()
-                        .map(|range| text[range.unwrap_or(0..0)].to_string());
+                        .map(|range| range.map(|r| WString::from(text.slice(r))));
 
-                    let mut storage = ArrayStorage::new(0);
-                    for substring in substrings {
-                        storage
-                            .push(AvmString::new(activation.context.gc_context, substring).into());
-                    }
+                    let storage = ArrayStorage::from_iter(substrings.map(|s| match s {
+                        None => Value::Undefined,
+                        Some(s) => AvmString::new_ucs2(activation.context.gc_context, s).into(),
+                    }));
 
                     (storage, matched.start())
                 }
@@ -244,7 +248,7 @@ pub fn test<'gc>(
                 .get(0)
                 .unwrap_or(&Value::Undefined)
                 .coerce_to_string(activation)?;
-            return Ok(re.test(&text).into());
+            return Ok(re.test(text).into());
         }
     }
 
diff --git a/core/src/avm2/globals/string.rs b/core/src/avm2/globals/string.rs
index 72fb74feb..553237803 100644
--- a/core/src/avm2/globals/string.rs
+++ b/core/src/avm2/globals/string.rs
@@ -5,6 +5,7 @@ use crate::avm2::class::{Class, ClassAttributes};
 use crate::avm2::method::{Method, NativeMethodImpl};
 use crate::avm2::names::{Namespace, QName};
 use crate::avm2::object::{primitive_allocator, Object, TObject};
+use crate::avm2::regexp::RegExpFlags;
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::avm2::{ArrayObject, ArrayStorage};
@@ -230,18 +231,18 @@ fn match_s<'gc>(
 
         if let Some(mut regexp) = pattern.as_regexp_mut(activation.context.gc_context) {
             let mut storage = ArrayStorage::new(0);
-            if regexp.global() {
+            if regexp.flags().contains(RegExpFlags::GLOBAL) {
                 let mut last = regexp.last_index();
                 let old_last_index = regexp.last_index();
                 regexp.set_last_index(0);
-                while let Some(result) = regexp.exec(this.as_str()) {
+                while let Some(result) = regexp.exec(this) {
                     if regexp.last_index() == last {
                         break;
                     }
                     storage.push(
-                        AvmString::new(
+                        AvmString::new_ucs2(
                             activation.context.gc_context,
-                            this.as_str()[result.range()].to_string(),
+                            this.slice(result.range()).into(),
                         )
                         .into(),
                     );
@@ -257,15 +258,17 @@ fn match_s<'gc>(
             } else {
                 let old = regexp.last_index();
                 regexp.set_last_index(0);
-                if let Some(result) = regexp.exec(this.as_str()) {
+                if let Some(result) = regexp.exec(this) {
                     let substrings = result
                         .groups()
-                        .map(|range| this.as_str()[range.unwrap_or(0..0)].to_string());
+                        .map(|range| this.slice(range.unwrap_or(0..0)));
 
                     let mut storage = ArrayStorage::new(0);
                     for substring in substrings {
-                        storage
-                            .push(AvmString::new(activation.context.gc_context, substring).into());
+                        storage.push(
+                            AvmString::new_ucs2(activation.context.gc_context, substring.into())
+                                .into(),
+                        );
                     }
                     regexp.set_last_index(old);
                     return Ok(ArrayObject::from_storage(activation, storage)
diff --git a/core/src/avm2/object/regexp_object.rs b/core/src/avm2/object/regexp_object.rs
index 707f2e613..36ad81343 100644
--- a/core/src/avm2/object/regexp_object.rs
+++ b/core/src/avm2/object/regexp_object.rs
@@ -3,7 +3,7 @@
 use crate::avm2::activation::Activation;
 use crate::avm2::object::script_object::ScriptObjectData;
 use crate::avm2::object::{ClassObject, Object, ObjectPtr, TObject};
-use crate::avm2::regexp::RegExp;
+use crate::avm2::regexp::{RegExp, RegExpFlags};
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::string::AvmString;
@@ -97,19 +97,21 @@ impl<'gc> TObject<'gc> for RegExpObject<'gc> {
         let read = self.0.read();
         let mut s = format!("/{}/", read.regexp.source());
 
-        if read.regexp.global() {
+        let flags = read.regexp.flags();
+
+        if flags.contains(RegExpFlags::GLOBAL) {
             s.push('g');
         }
-        if read.regexp.ignore_case() {
+        if flags.contains(RegExpFlags::IGNORE_CASE) {
             s.push('i');
         }
-        if read.regexp.multiline() {
+        if flags.contains(RegExpFlags::MULTILINE) {
             s.push('m');
         }
-        if read.regexp.dotall() {
+        if flags.contains(RegExpFlags::DOTALL) {
             s.push('s');
         }
-        if read.regexp.extended() {
+        if flags.contains(RegExpFlags::EXTENDED) {
             s.push('x');
         }
 
diff --git a/core/src/avm2/regexp.rs b/core/src/avm2/regexp.rs
index 5aa343fb9..742e305ea 100644
--- a/core/src/avm2/regexp.rs
+++ b/core/src/avm2/regexp.rs
@@ -1,22 +1,39 @@
 //! RegExp Structure
 
-use crate::string::AvmString;
+use std::borrow::Cow;
+
+use crate::string::{AvmString, Units, WStrToUtf8};
 use bitflags::bitflags;
 use gc_arena::Collect;
-use regress::Regex;
 
-#[derive(Clone, Collect, Debug)]
+#[derive(Collect, Debug)]
 #[collect(no_drop)]
 pub struct RegExp<'gc> {
     source: AvmString<'gc>,
     flags: RegExpFlags,
     last_index: usize,
+
+    #[collect(require_static)]
+    cached_regex: Option<Result<regress::Regex, ()>>,
+    cached_text: Option<CachedText<'gc>>,
+}
+
+impl<'gc> Clone for RegExp<'gc> {
+    fn clone(&self) -> Self {
+        Self {
+            // Both caches are rebuilt on demand by the next match, so the copy starts
+            // without them; the remaining fields are `Copy` and come straight from `self`.
+            cached_regex: None,
+            cached_text: None,
+            ..*self
+        }
+    }
+}
 
 bitflags! {
     #[derive(Collect)]
     #[collect(require_static)]
-    struct RegExpFlags: u8 {
+    pub struct RegExpFlags: u8 {
         const GLOBAL       = 1 << 0;
         const IGNORE_CASE  = 1 << 1;
         const MULTILINE    = 1 << 2;
@@ -34,6 +51,8 @@ impl<'gc> RegExp<'gc> {
             source: source.into(),
             flags: RegExpFlags::empty(),
             last_index: 0,
+            cached_regex: None,
+            cached_text: None,
         }
     }
 
@@ -45,9 +64,19 @@ impl<'gc> RegExp<'gc> {
     where
         S: Into<AvmString<'gc>>,
     {
+        self.cached_regex = None;
         self.source = source.into();
     }
 
+    pub fn flags(&self) -> RegExpFlags {
+        self.flags
+    }
+
+    pub fn set_flags(&mut self, flags: RegExpFlags) {
+        self.cached_regex = None;
+        self.flags = flags;
+    }
+
     pub fn last_index(&self) -> usize {
         self.last_index
     }
@@ -56,70 +85,196 @@ impl<'gc> RegExp<'gc> {
         self.last_index = i;
     }
 
-    pub fn dotall(&self) -> bool {
-        self.flags.contains(RegExpFlags::DOTALL)
-    }
-
-    pub fn set_dotall(&mut self, value: bool) {
-        self.flags.set(RegExpFlags::DOTALL, value);
-    }
-
-    pub fn extended(&self) -> bool {
-        self.flags.contains(RegExpFlags::EXTENDED)
-    }
-
-    pub fn set_extended(&mut self, value: bool) {
-        self.flags.set(RegExpFlags::EXTENDED, value);
-    }
-
-    pub fn global(&self) -> bool {
-        self.flags.contains(RegExpFlags::GLOBAL)
-    }
-
-    pub fn set_global(&mut self, value: bool) {
-        self.flags.set(RegExpFlags::GLOBAL, value);
-    }
-
-    pub fn ignore_case(&self) -> bool {
-        self.flags.contains(RegExpFlags::IGNORE_CASE)
-    }
-
-    pub fn set_ignore_case(&mut self, value: bool) {
-        self.flags.set(RegExpFlags::IGNORE_CASE, value);
-    }
-
-    pub fn multiline(&self) -> bool {
-        self.flags.contains(RegExpFlags::MULTILINE)
-    }
-
-    pub fn set_multiline(&mut self, value: bool) {
-        self.flags.set(RegExpFlags::MULTILINE, value);
-    }
-
-    pub fn test(&mut self, text: &str) -> bool {
-        self.exec(text).is_some()
-    }
-
-    pub fn exec(&mut self, text: &str) -> Option<regress::Match> {
-        if let Ok(re) = Regex::with_flags(
-            &self.source,
-            regress::Flags {
-                icase: self.ignore_case(),
-                multiline: self.multiline(),
-                dot_all: self.dotall(),
-                no_opt: false,
-            },
-        ) {
-            let start = if self.global() { self.last_index } else { 0 };
-            if let Some(matched) = re.find_from(text, start).next() {
-                if self.global() {
-                    self.last_index = matched.end();
-                }
-
-                return Some(matched);
-            }
+    fn find_utf8_match_at<T, F>(&mut self, text: AvmString<'gc>, start: usize, f: F) -> Option<T>
+    where
+        F: FnOnce(&mut CachedText<'gc>, regress::Match) -> T,
+    {
+        if self.cached_regex.is_none() {
+            let re = regress::Regex::with_flags(
+                &self.source.to_utf8_lossy(),
+                regress::Flags {
+                    icase: self.flags.contains(RegExpFlags::IGNORE_CASE),
+                    multiline: self.flags.contains(RegExpFlags::MULTILINE),
+                    dot_all: self.flags.contains(RegExpFlags::DOTALL),
+                    no_opt: false,
+                },
+            );
+            self.cached_regex = Some(re.map_err(drop));
         }
 
-        None
+        let regex = match self.cached_regex.as_mut() {
+            Some(Ok(re)) => re,
+            Some(Err(_)) => return None,
+            None => unreachable!(),
+        };
+
+        let cached = self
+            .cached_text
+            .as_ref()
+            .filter(|cached| AvmString::ptr_eq(&cached.text, &text))
+            .is_some();
+        if !cached {
+            self.cached_text = Some(CachedText::new(text));
+        }
+        let text = self.cached_text.as_mut().unwrap();
+
+        let start = text.utf8_index(start)?;
+        let re_match = regex.find_from(text.utf8(), start).next()?;
+        Some(f(text, re_match))
+    }
+
+    /// Reports whether the pattern matches `text`.
+    ///
+    /// A global pattern starts searching at `last_index` and, when it matches,
+    /// moves `last_index` to the end of that match. Any other pattern searches
+    /// from the start of `text` and leaves `last_index` alone.
+    pub fn test(&mut self, text: AvmString<'gc>) -> bool {
+        let is_global = self.flags.contains(RegExpFlags::GLOBAL);
+        let from = if is_global { self.last_index } else { 0 };
+
+        // The outer `Option` says whether anything matched; the inner one holds
+        // the UTF16 end of the match and is only filled in for global patterns.
+        let outcome = self.find_utf8_match_at(text, from, |cached, found| {
+            is_global.then(|| cached.utf16_index(found.end())).flatten()
+        });
+
+        if let Some(Some(end)) = outcome {
+            self.last_index = end;
+        }
+        outcome.is_some()
+    }
+
+    /// Finds the next match in `text`, with every range converted to UTF16 indices.
+    ///
+    /// Global patterns search from `last_index` and advance it to the end of the match.
+    pub fn exec(&mut self, text: AvmString<'gc>) -> Option<regress::Match> {
+        let is_global = self.flags.contains(RegExpFlags::GLOBAL);
+        let from = if is_global { self.last_index } else { 0 };
+        let found = self.find_utf8_match_at(text, from, |cached, mut hit| {
+            // Gather every range endpoint: the whole match plus each capture that took part.
+            let mut endpoints: Vec<&mut usize> = hit
+                .captures
+                .iter_mut()
+                .flatten()
+                .chain(std::iter::once(&mut hit.range))
+                .flat_map(|range| [&mut range.start, &mut range.end])
+                .collect();
+            // Ascending order spares `CachedText::utf16_index` from restarting its scan.
+            endpoints.sort_by_key(|endpoint| **endpoint);
+            for endpoint in endpoints {
+                *endpoint = cached.utf16_index(*endpoint).unwrap();
+            }
+            hit
+        })?;
+
+        if is_global {
+            self.last_index = found.end();
+        }
+        Some(found)
+    }
+}
+
+#[derive(Collect, Debug)]
+#[collect(no_drop)]
+struct CachedText<'gc> {
+    text: AvmString<'gc>,
+    // UTF8 copy of `text`; None when `text` can be read as UTF8 in place (see `utf8()`).
+    utf8: Option<String>,
+    utf8_prefix_len: usize, // indices up to this one are equal in UTF8 and UTF16
+
+    // Cursor left by the last `{utf8, utf16}_index` call: a pair of matching
+    // positions, kept so that calls with increasing indices continue from
+    // there instead of rescanning the string.
+    cur_utf8_index: usize,
+    cur_utf16_index: usize,
+}
+
+impl<'gc> CachedText<'gc> {
+    fn new(text: AvmString<'gc>) -> Self {
+        let to_utf8 = WStrToUtf8::new(text.as_ucs2());
+        let utf8 = to_utf8.to_utf8_lossy();
+        let utf8_prefix_len = if utf8.len() == text.len() {
+            // Identical lengths mean every unit of `text` is ASCII, so indices map
+            // one-to-one over the whole string, even if the reported prefix is shorter.
+            text.len()
+        } else {
+            to_utf8.prefix().len()
+        };
+
+        Self {
+            text,
+            utf8: match utf8 {
+                Cow::Owned(s) => Some(s),
+                Cow::Borrowed(_) => None,
+            },
+            utf8_prefix_len,
+            cur_utf8_index: utf8_prefix_len,
+            cur_utf16_index: utf8_prefix_len,
+        }
+    }
+
+    fn utf8(&self) -> &str {
+        self.utf8
+            .as_deref()
+            .unwrap_or_else(|| match self.text.units() {
+                // SAFETY: `utf8` is None only if all of `text` was in the ASCII prefix.
+                Units::Bytes(s) => unsafe { std::str::from_utf8_unchecked(s) },
+                // A wide string has no ASCII prefix, so it only gets here when empty.
+                Units::Wide(_) => "",
+            })
+    }
+
+    fn reset(&mut self) {
+        self.cur_utf8_index = self.utf8_prefix_len;
+        self.cur_utf16_index = self.utf8_prefix_len;
+    }
+
+    fn advance(&mut self) -> Option<()> {
+        let c = self.utf8()[self.cur_utf8_index..].chars().next()?;
+        self.cur_utf8_index += c.len_utf8();
+        self.cur_utf16_index += c.len_utf16();
+        Some(())
+    }
+
+    /// Translates a UTF16 index into `text` to the matching UTF8 index into `utf8()`.
+    ///
+    /// Yields `None` if `utf16_index` lies past the end of the string.
+    /// An index that is not on a char boundary is rounded up to the start
+    /// of the next char.
+    fn utf8_index(&mut self, utf16_index: usize) -> Option<usize> {
+        if utf16_index > self.utf8_prefix_len {
+            // The cursor only walks forward: start over when asked to go back.
+            if self.cur_utf16_index > utf16_index {
+                self.reset();
+            }
+            while self.cur_utf16_index < utf16_index {
+                self.advance()?;
+            }
+            Some(self.cur_utf8_index)
+        } else {
+            // Inside the ASCII prefix, both encodings spend one unit per char.
+            Some(utf16_index)
+        }
+    }
+
+    /// Translates a UTF8 index into `utf8()` to the matching UTF16 index into `text`.
+    ///
+    /// Yields `None` if `utf8_index` lies past the end of the string.
+    /// An index that is not on a char boundary is rounded up to the start
+    /// of the next char.
+    fn utf16_index(&mut self, utf8_index: usize) -> Option<usize> {
+        if utf8_index > self.utf8_prefix_len {
+            // The cursor only walks forward: start over when asked to go back.
+            if self.cur_utf8_index > utf8_index {
+                self.reset();
+            }
+            while self.cur_utf8_index < utf8_index {
+                self.advance()?;
+            }
+            Some(self.cur_utf16_index)
+        } else {
+            // Inside the ASCII prefix, both encodings spend one unit per char.
+            Some(utf8_index)
+        }
     }
 }
diff --git a/core/src/string.rs b/core/src/string.rs
index 35577fb6d..b215d6a87 100644
--- a/core/src/string.rs
+++ b/core/src/string.rs
@@ -24,7 +24,7 @@ pub const MAX_STRING_LEN: usize = raw::MAX_STRING_LEN;
 pub use avm::AvmString;
 pub use buf::WString;
 pub use common::{BorrowWStr, BorrowWStrMut, Units};
-pub use ops::{Iter, Split};
+pub use ops::{Iter, Split, WStrToUtf8};
 pub use pattern::Pattern;
 pub use slice::{WStr, WStrMut};
 
diff --git a/core/src/string/avm.rs b/core/src/string/avm.rs
index bf13de21f..0c7a7ad74 100644
--- a/core/src/string/avm.rs
+++ b/core/src/string/avm.rs
@@ -53,6 +53,15 @@ impl<'gc> AvmString<'gc> {
         }
     }
 
+    #[inline]
+    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
+        match (this.source, other.source) {
+            (Source::Owned(this), Source::Owned(other)) => Gc::ptr_eq(this, other),
+            (Source::Static(this), Source::Static(other)) => std::ptr::eq(this, other),
+            _ => false,
+        }
+    }
+
     #[inline]
     pub fn as_str(&self) -> &str {
         self
diff --git a/core/src/string/buf.rs b/core/src/string/buf.rs
index ff326af5e..3745d69a0 100644
--- a/core/src/string/buf.rs
+++ b/core/src/string/buf.rs
@@ -1,6 +1,7 @@
 use gc_arena::Collect;
 
 use super::raw::WStrPtr;
+use super::utils::split_ascii_prefix;
 use super::{BorrowWStr, BorrowWStrMut, Units, WStr, WStrMut, MAX_STRING_LEN};
 
 /// An owned, extensible UCS2 string, analoguous to `String`.
@@ -273,15 +274,6 @@ impl WString {
     }
 }
 
-fn split_ascii_prefix(s: &str) -> (&[u8], &str) {
-    let first_non_ascii = s.as_bytes().iter().position(|c| *c >= 0x80);
-    let (head, tail) = match first_non_ascii {
-        Some(i) => s.split_at(i),
-        None => ("", s),
-    };
-    (head.as_bytes(), tail)
-}
-
 impl Drop for WString {
     fn drop(&mut self) {
         // SAFETY: `self` is gone after this line.
diff --git a/core/src/string/common.rs b/core/src/string/common.rs
index 7c6f0dadf..fb87f1d43 100644
--- a/core/src/string/common.rs
+++ b/core/src/string/common.rs
@@ -161,6 +161,14 @@ macro_rules! impl_str_methods {
             crate::string::ops::str_is_latin1($deref)
         }
 
+        /// Converts this string to UTF8, borrowing from it when no conversion is needed.
+        ///
+        /// Unpaired surrogates are replaced by the replacement character.
+        #[inline]
+        pub fn to_utf8_lossy($self: $receiver) -> std::borrow::Cow<$lt, str> {
+            crate::string::ops::WStrToUtf8::new($deref).to_utf8_lossy()
+        }
+
         /// Analogue of [`str::find`].
         #[inline]
         pub fn find<$($pat_gen)* P: crate::string::Pattern<$pat_lt>>($self: $pat_self, pattern: P) -> Option<usize> {
diff --git a/core/src/string/ops.rs b/core/src/string/ops.rs
index 85eaf134e..269f26507 100644
--- a/core/src/string/ops.rs
+++ b/core/src/string/ops.rs
@@ -1,9 +1,10 @@
+use std::borrow::Cow;
 use std::fmt::{self, Write};
 use std::hash::Hasher;
 use std::slice::Iter as SliceIter;
 
 use super::pattern::Searcher;
-use super::{utils, Pattern, WStr, Units};
+use super::{utils, Pattern, Units, WStr};
 
 pub struct Iter<'a> {
     inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>,
@@ -41,7 +42,9 @@ pub fn str_iter(s: WStr<'_>) -> Iter<'_> {
 }
 
 pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-    std::char::decode_utf16(s.iter())
+    let utf8 = WStrToUtf8::new(s);
+    f.write_str(utf8.head)?;
+    std::char::decode_utf16(utf8.tail.iter())
         .map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
         .try_for_each(|c| f.write_char(c))
 }
@@ -120,7 +123,6 @@ pub fn str_is_latin1(s: WStr<'_>) -> bool {
         Units::Wide(us) => us.iter().all(|c| *c <= u16::from(u8::MAX)),
     }
 }
-
 pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {
     pattern
         .into_searcher(haystack)
@@ -168,3 +170,39 @@ impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {
         }
     }
 }
+
+/// Converts a `WStr<'_>` to UTF8: `head` is already valid UTF8, `tail` still needs converting.
+pub struct WStrToUtf8<'a> {
+    head: &'a str,
+    tail: WStr<'a>,
+}
+
+impl<'a> WStrToUtf8<'a> {
+    /// Splits off the ASCII prefix of a byte string as `head`; a wide string
+    /// is kept whole in `tail`, with an empty `head`.
+    pub fn new(s: WStr<'a>) -> Self {
+        let (head, tail) = match s.units() {
+            Units::Bytes(b) => utils::split_ascii_prefix_bytes(b),
+            Units::Wide(_) => return Self { head: "", tail: s },
+        };
+        let tail = WStr::from_units(tail);
+        Self { head, tail }
+    }
+
+    /// Returns the whole string as UTF8, borrowing `head` when `tail` is empty.
+    pub fn to_utf8_lossy(&self) -> Cow<'a, str> {
+        if self.tail.is_empty() {
+            return Cow::Borrowed(self.head);
+        }
+        let mut out = String::with_capacity(self.head.len() + self.tail.len());
+        out.push_str(self.head);
+        write!(out, "{}", self.tail).expect("writing to a `String` cannot fail");
+        Cow::Owned(out)
+    }
+
+    /// Returns the part of the string that needed no conversion.
+    #[inline]
+    pub fn prefix(&self) -> &'a str {
+        self.head
+    }
+}
diff --git a/core/src/string/pattern.rs b/core/src/string/pattern.rs
index 34f74aeef..ffe816ac1 100644
--- a/core/src/string/pattern.rs
+++ b/core/src/string/pattern.rs
@@ -5,7 +5,7 @@
 //  - remove implicit bound checks?
 //  - use memchr crate?
 
-use super::{WStr, Units};
+use super::{Units, WStr};
 
 /// A pattern that can be searched in a [`WStr`].
 ///
diff --git a/core/src/string/utils.rs b/core/src/string/utils.rs
index 5eb9f12c9..1cbb430fb 100644
--- a/core/src/string/utils.rs
+++ b/core/src/string/utils.rs
@@ -25,6 +25,25 @@ pub fn next_char_boundary(slice: &str, pos: usize) -> usize {
     }
 }
 
+/// Finds the longest prefix of `slice` that is entirely ASCII (all of it, if no byte
+/// is >= 0x80), and returns it as a UTF8 string, together with the remaining tail.
+pub fn split_ascii_prefix_bytes(slice: &[u8]) -> (&str, &[u8]) {
+    let first_non_ascii = slice.iter().position(|c| *c >= 0x80);
+    let (head, tail) = slice.split_at(first_non_ascii.unwrap_or(slice.len()));
+    // SAFETY: `head` only contains ASCII.
+    let head = unsafe { std::str::from_utf8_unchecked(head) };
+    (head, tail)
+}
+
+/// Finds the longest prefix of `slice` that is entirely ASCII,
+/// and returns it as a byte slice, together with the remaining tail.
+pub fn split_ascii_prefix(slice: &str) -> (&[u8], &str) {
+    let (head, tail) = split_ascii_prefix_bytes(slice.as_bytes());
+    // SAFETY: the split lands on a char boundary of `slice`, so `tail` is still valid UTF8.
+    let tail = unsafe { std::str::from_utf8_unchecked(tail) };
+    (head.as_bytes(), tail)
+}
+
 /// Maps a UTF-16 code unit into a `char`.
 /// TODO: Surrogate characters will get replaced with the Unicode replacement character.
 pub fn utf16_code_unit_to_char(c: u16) -> char {