avm2: rewrite regexp module to not rely on AvmString::as_str

This is a little tricky, because we have to map the utf8 indices returned by the regex engine to utf16 indices usable by Ruffle. To limit the impact on performance, the regex, the string we're currently matching on, and the last known (utf8, utf16) positions are cached, avoiding extra utf8 conversions in common use cases where a single string is repeatedly searched with increasing `lastIndex`.
2021-09-20 23:56:17 +02:00 · 2021-09-20 23:56:17 +02:00 · 4a09088d42
parent 23cbe4c2fd
commit 4a09088d42
11 changed files with 348 additions and 118 deletions
--- a/core/src/avm2/globals/regexp.rs
+++ b/core/src/avm2/globals/regexp.rs
@ -4,10 +4,11 @@ use crate::avm2::class::Class;
 use crate::avm2::method::{Method, NativeMethodImpl, ParamConfig};
 use crate::avm2::names::{Namespace, QName};
 use crate::avm2::object::{regexp_allocator, ArrayObject, Object, TObject};
 use crate::avm2::regexp::RegExpFlags;
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::avm2::{activation::Activation, array::ArrayStorage};
-use crate::string::AvmString;
+use crate::string::{AvmString, WString};
 use gc_arena::{GcCell, MutationContext};
 /// Implements `RegExp`'s instance initializer.
@ -26,20 +27,24 @@ pub fn instance_init<'gc>(
                    .coerce_to_string(activation)?,
            );
-            let flags = args
+            let flag_chars = args
                .get(1)
                .unwrap_or(&Value::String("".into()))
                .coerce_to_string(activation)?;
-            for flag in flags.chars() {
+
-                match flag {
+            let mut flags = RegExpFlags::empty();
-                    's' => regexp.set_dotall(true),
+            for c in &flag_chars {
-                    'x' => regexp.set_extended(true),
+                flags |= match u8::try_from(c) {
-                    'g' => regexp.set_global(true),
+                    Ok(b's') => RegExpFlags::DOTALL,
-                    'i' => regexp.set_ignore_case(true),
+                    Ok(b'x') => RegExpFlags::EXTENDED,
-                    'm' => regexp.set_multiline(true),
+                    Ok(b'g') => RegExpFlags::GLOBAL,
-                    _ => {}
+                    Ok(b'i') => RegExpFlags::IGNORE_CASE,
                    Ok(b'm') => RegExpFlags::MULTILINE,
                    _ => continue,
                };
            }
            regexp.set_flags(flags);
        }
    }
@ -63,7 +68,7 @@ pub fn dotall<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.dotall().into());
+            return Ok(regexp.flags().contains(RegExpFlags::DOTALL).into());
        }
    }
@ -78,7 +83,7 @@ pub fn extended<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.extended().into());
+            return Ok(regexp.flags().contains(RegExpFlags::EXTENDED).into());
        }
    }
@ -93,7 +98,7 @@ pub fn global<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.global().into());
+            return Ok(regexp.flags().contains(RegExpFlags::GLOBAL).into());
        }
    }
@ -108,7 +113,7 @@ pub fn ignore_case<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.ignore_case().into());
+            return Ok(regexp.flags().contains(RegExpFlags::IGNORE_CASE).into());
        }
    }
@ -123,7 +128,7 @@ pub fn multiline<'gc>(
 ) -> Result<Value<'gc>, Error> {
    if let Some(this) = this {
        if let Some(regexp) = this.as_regexp() {
-            return Ok(regexp.multiline().into());
+            return Ok(regexp.flags().contains(RegExpFlags::MULTILINE).into());
        }
    }
@ -192,17 +197,16 @@ pub fn exec<'gc>(
                .unwrap_or(&Value::Undefined)
                .coerce_to_string(activation)?;
-            let (storage, index) = match re.exec(&text) {
+            let (storage, index) = match re.exec(text) {
                Some(matched) => {
                    let substrings = matched
                        .groups()
-                        .map(|range| text[range.unwrap_or(0..0)].to_string());
+                        .map(|range| range.map(|r| WString::from(text.slice(r))));
-                    let mut storage = ArrayStorage::new(0);
+                    let storage = ArrayStorage::from_iter(substrings.map(|s| match s {
-                    for substring in substrings {
+                        None => Value::Undefined,
-                        storage
+                        Some(s) => AvmString::new_ucs2(activation.context.gc_context, s).into(),
-                            .push(AvmString::new(activation.context.gc_context, substring).into());
+                    }));
                    }
                    (storage, matched.start())
                }
@ -244,7 +248,7 @@ pub fn test<'gc>(
                .get(0)
                .unwrap_or(&Value::Undefined)
                .coerce_to_string(activation)?;
-            return Ok(re.test(&text).into());
+            return Ok(re.test(text).into());
        }
    }
--- a/core/src/avm2/globals/string.rs
+++ b/core/src/avm2/globals/string.rs
@ -5,6 +5,7 @@ use crate::avm2::class::{Class, ClassAttributes};
 use crate::avm2::method::{Method, NativeMethodImpl};
 use crate::avm2::names::{Namespace, QName};
 use crate::avm2::object::{primitive_allocator, Object, TObject};
 use crate::avm2::regexp::RegExpFlags;
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::avm2::{ArrayObject, ArrayStorage};
@ -230,18 +231,18 @@ fn match_s<'gc>(
        if let Some(mut regexp) = pattern.as_regexp_mut(activation.context.gc_context) {
            let mut storage = ArrayStorage::new(0);
-            if regexp.global() {
+            if regexp.flags().contains(RegExpFlags::GLOBAL) {
                let mut last = regexp.last_index();
                let old_last_index = regexp.last_index();
                regexp.set_last_index(0);
-                while let Some(result) = regexp.exec(this.as_str()) {
+                while let Some(result) = regexp.exec(this) {
                    if regexp.last_index() == last {
                        break;
                    }
                    storage.push(
-                        AvmString::new(
+                        AvmString::new_ucs2(
                            activation.context.gc_context,
-                            this.as_str()[result.range()].to_string(),
+                            this.slice(result.range()).into(),
                        )
                        .into(),
                    );
@ -257,15 +258,17 @@ fn match_s<'gc>(
            } else {
                let old = regexp.last_index();
                regexp.set_last_index(0);
-                if let Some(result) = regexp.exec(this.as_str()) {
+                if let Some(result) = regexp.exec(this) {
                    let substrings = result
                        .groups()
-                        .map(|range| this.as_str()[range.unwrap_or(0..0)].to_string());
+                        .map(|range| this.slice(range.unwrap_or(0..0)));
                    let mut storage = ArrayStorage::new(0);
                    for substring in substrings {
-                        storage
+                        storage.push(
-                            .push(AvmString::new(activation.context.gc_context, substring).into());
+                            AvmString::new_ucs2(activation.context.gc_context, substring.into())
                                .into(),
                        );
                    }
                    regexp.set_last_index(old);
                    return Ok(ArrayObject::from_storage(activation, storage)
--- a/core/src/avm2/object/regexp_object.rs
+++ b/core/src/avm2/object/regexp_object.rs
@ -3,7 +3,7 @@
 use crate::avm2::activation::Activation;
 use crate::avm2::object::script_object::ScriptObjectData;
 use crate::avm2::object::{ClassObject, Object, ObjectPtr, TObject};
-use crate::avm2::regexp::RegExp;
+use crate::avm2::regexp::{RegExp, RegExpFlags};
 use crate::avm2::value::Value;
 use crate::avm2::Error;
 use crate::string::AvmString;
@ -97,19 +97,21 @@ impl<'gc> TObject<'gc> for RegExpObject<'gc> {
        let read = self.0.read();
        let mut s = format!("/{}/", read.regexp.source());
-        if read.regexp.global() {
+        let flags = read.regexp.flags();
        if flags.contains(RegExpFlags::GLOBAL) {
            s.push('g');
        }
-        if read.regexp.ignore_case() {
+        if flags.contains(RegExpFlags::IGNORE_CASE) {
            s.push('i');
        }
-        if read.regexp.multiline() {
+        if flags.contains(RegExpFlags::MULTILINE) {
            s.push('m');
        }
-        if read.regexp.dotall() {
+        if flags.contains(RegExpFlags::DOTALL) {
            s.push('s');
        }
-        if read.regexp.extended() {
+        if flags.contains(RegExpFlags::EXTENDED) {
            s.push('x');
        }
--- a/core/src/avm2/regexp.rs
+++ b/core/src/avm2/regexp.rs
@ -1,22 +1,39 @@
 //! RegExp Structure
-use crate::string::AvmString;
+use std::borrow::Cow;
 use crate::string::{AvmString, Units, WStrToUtf8};
 use bitflags::bitflags;
 use gc_arena::Collect;
 use regress::Regex;
-#[derive(Clone, Collect, Debug)]
+#[derive(Collect, Debug)]
 #[collect(no_drop)]
 pub struct RegExp<'gc> {
    source: AvmString<'gc>,
    flags: RegExpFlags,
    last_index: usize,
    #[collect(require_static)]
    cached_regex: Option<Result<regress::Regex, ()>>,
    cached_text: Option<CachedText<'gc>>,
 }
 /// Cloning a `RegExp` copies its source, flags and `last_index`.
 ///
 /// The clone starts with empty caches: both the compiled regex and the
 /// cached text are rebuilt on demand the next time the clone is matched.
 impl<'gc> Clone for RegExp<'gc> {
    fn clone(&self) -> Self {
        // Pull out only the fields that are carried over to the copy.
        let Self {
            source,
            flags,
            last_index,
            ..
        } = *self;

        Self {
            source,
            flags,
            last_index,
            cached_regex: None,
            cached_text: None,
        }
    }
 }
 bitflags! {
    #[derive(Collect)]
    #[collect(require_static)]
-    struct RegExpFlags: u8 {
+    pub struct RegExpFlags: u8 {
        const GLOBAL       = 1 << 0;
        const IGNORE_CASE  = 1 << 1;
        const MULTILINE    = 1 << 2;
@ -34,6 +51,8 @@ impl<'gc> RegExp<'gc> {
            source: source.into(),
            flags: RegExpFlags::empty(),
            last_index: 0,
            cached_regex: None,
            cached_text: None,
        }
    }
@ -45,9 +64,19 @@ impl<'gc> RegExp<'gc> {
    where
        S: Into<AvmString<'gc>>,
    {
        self.cached_regex = None;
        self.source = source.into();
    }
    /// Returns the flags currently set on this regular expression.
    pub fn flags(&self) -> RegExpFlags {
        self.flags
    }
    /// Replaces the flags of this regular expression.
    ///
    /// The cached compiled regex is discarded as well, so that it gets
    /// recompiled with the new flags the next time it is needed.
    pub fn set_flags(&mut self, flags: RegExpFlags) {
        self.cached_regex = None;
        self.flags = flags;
    }
    /// Returns the UTF-16 index at which `test` and `exec` start searching
    /// when the `GLOBAL` flag is set.
    pub fn last_index(&self) -> usize {
        self.last_index
    }
@ -56,70 +85,196 @@ impl<'gc> RegExp<'gc> {
        self.last_index = i;
    }
-    pub fn dotall(&self) -> bool {
+    fn find_utf8_match_at<T, F>(&mut self, text: AvmString<'gc>, start: usize, f: F) -> Option<T>
-        self.flags.contains(RegExpFlags::DOTALL)
+    where
-    }
+        F: FnOnce(&mut CachedText<'gc>, regress::Match) -> T,
-
+    {
-    pub fn set_dotall(&mut self, value: bool) {
+        if self.cached_regex.is_none() {
-        self.flags.set(RegExpFlags::DOTALL, value);
+            let re = regress::Regex::with_flags(
-    }
+                &self.source.to_utf8_lossy(),
-
+                regress::Flags {
-    pub fn extended(&self) -> bool {
+                    icase: self.flags.contains(RegExpFlags::IGNORE_CASE),
-        self.flags.contains(RegExpFlags::EXTENDED)
+                    multiline: self.flags.contains(RegExpFlags::MULTILINE),
-    }
+                    dot_all: self.flags.contains(RegExpFlags::DOTALL),
-
+                    no_opt: false,
-    pub fn set_extended(&mut self, value: bool) {
+                },
-        self.flags.set(RegExpFlags::EXTENDED, value);
+            );
-    }
+            self.cached_regex = Some(re.map_err(drop));
    pub fn global(&self) -> bool {
        self.flags.contains(RegExpFlags::GLOBAL)
    }
    pub fn set_global(&mut self, value: bool) {
        self.flags.set(RegExpFlags::GLOBAL, value);
    }
    pub fn ignore_case(&self) -> bool {
        self.flags.contains(RegExpFlags::IGNORE_CASE)
    }
    pub fn set_ignore_case(&mut self, value: bool) {
        self.flags.set(RegExpFlags::IGNORE_CASE, value);
    }
    pub fn multiline(&self) -> bool {
        self.flags.contains(RegExpFlags::MULTILINE)
    }
    pub fn set_multiline(&mut self, value: bool) {
        self.flags.set(RegExpFlags::MULTILINE, value);
    }
    pub fn test(&mut self, text: &str) -> bool {
        self.exec(text).is_some()
    }
    pub fn exec(&mut self, text: &str) -> Option<regress::Match> {
        if let Ok(re) = Regex::with_flags(
            &self.source,
            regress::Flags {
                icase: self.ignore_case(),
                multiline: self.multiline(),
                dot_all: self.dotall(),
                no_opt: false,
            },
        ) {
            let start = if self.global() { self.last_index } else { 0 };
            if let Some(matched) = re.find_from(text, start).next() {
                if self.global() {
                    self.last_index = matched.end();
                }
                return Some(matched);
            }
        }
-        None
+        let regex = match self.cached_regex.as_mut() {
            Some(Ok(re)) => re,
            Some(Err(_)) => return None,
            None => unreachable!(),
        };
        let cached = self
            .cached_text
            .as_ref()
            .filter(|cached| AvmString::ptr_eq(&cached.text, &text))
            .is_some();
        if !cached {
            self.cached_text = Some(CachedText::new(text));
        }
        let text = self.cached_text.as_mut().unwrap();
        let start = text.utf8_index(start)?;
        let re_match = regex.find_from(text.utf8(), start).next()?;
        Some(f(text, re_match))
    }
    /// Checks whether this regular expression matches somewhere in `text`.
    ///
    /// With the `GLOBAL` flag set, the search starts at `last_index`, and a
    /// successful match moves `last_index` to the end of the match (expressed
    /// as an UTF-16 index). Without it, the search starts at 0 and
    /// `last_index` is left untouched.
    pub fn test(&mut self, text: AvmString<'gc>) -> bool {
        let is_global = self.flags.contains(RegExpFlags::GLOBAL);
        let search_from = if is_global { self.last_index } else { 0 };

        // The outer `Option` tells whether a match was found; the inner one
        // carries the new `last_index`, when there is one to store.
        let outcome = self.find_utf8_match_at(text, search_from, |cached, found| {
            if !is_global {
                return None;
            }
            cached.utf16_index(found.end())
        });

        match outcome {
            None => false,
            Some(new_last_index) => {
                if let Some(idx) = new_last_index {
                    self.last_index = idx;
                }
                true
            }
        }
    }
    /// Runs this regular expression against `text` and returns the match, if any.
    ///
    /// The search starts at `last_index` when the `GLOBAL` flag is set, and at 0
    /// otherwise. The ranges of the returned match (the overall range and every
    /// participating capture) are rewritten from UTF-8 indices to UTF-16 indices
    /// into `text`. With the `GLOBAL` flag set, `last_index` is updated to the
    /// (UTF-16) end of the match.
    ///
    /// Returns `None` when `find_utf8_match_at` yields no match.
    pub fn exec(&mut self, text: AvmString<'gc>) -> Option<regress::Match> {
        let global = self.flags.contains(RegExpFlags::GLOBAL);
        let start = if global { self.last_index } else { 0 };
        let re_match = self.find_utf8_match_at(text, start, |text, mut re_match| {
            // Sort the capture endpoints by increasing index, so that CachedText::utf16_index is efficient.
            // Mutable references to every endpoint are collected, so the indices can be
            // rewritten in place once sorted.
            let mut utf8_indices = re_match
                .captures
                .iter_mut()
                .filter_map(Option::as_mut)
                .chain(std::iter::once(&mut re_match.range))
                .flat_map(|capture| [&mut capture.start, &mut capture.end])
                .collect::<Vec<_>>();
            utf8_indices.sort_by_key(|i| **i);
            // Map UTF8 indices back to UTF16.
            for i in utf8_indices {
                // `utf16_index` only returns `None` for an out-of-bounds index;
                // this panics if the regex engine ever reported one.
                *i = text.utf16_index(*i).unwrap();
            }
            re_match
        })?;
        if global {
            // `re_match.end()` is already an UTF-16 index at this point.
            self.last_index = re_match.end();
        }
        Some(re_match)
    }
 }
 /// A string being matched against, together with the state needed to
 /// translate between UTF-16 indices into `text` and UTF-8 indices into
 /// its UTF-8 form.
 #[derive(Collect, Debug)]
 #[collect(no_drop)]
 struct CachedText<'gc> {
    // The string this cache was built for.
    text: AvmString<'gc>,
    // None means that `text` is already a valid utf8 string.
    utf8: Option<String>,
    // Indices up to (and including) this value are the same in UTF-8 and
    // UTF-16, so they are returned unchanged by `{utf8, utf16}_index`.
    utf8_prefix_len: usize,
    // Cached values of the last `{utf8, utf16}_index` call,
    // to avoid unnecessary recomputation when calling these methods
    // with increasing indices.
    cur_utf8_index: usize,
    cur_utf16_index: usize,
 }
 impl<'gc> CachedText<'gc> {
    /// Builds the cache for `text`.
    ///
    /// The UTF-8 form of `text` is only stored when the conversion had to
    /// allocate; otherwise `utf8` is left as `None` and `text` itself is used.
    fn new(text: AvmString<'gc>) -> Self {
        let to_utf8 = WStrToUtf8::new(text.as_ucs2());
        let utf8 = to_utf8.to_utf8_lossy();
        let utf8_prefix_len = if utf8.len() == text.len() {
            // Identical len means the string is fully utf8,
            // even if `utf8_prefix` is empty.
            text.len()
        } else {
            to_utf8.prefix().len()
        };

        Self {
            text,
            utf8: match utf8 {
                Cow::Owned(s) => Some(s),
                Cow::Borrowed(_) => None,
            },
            utf8_prefix_len,
            cur_utf8_index: utf8_prefix_len,
            cur_utf16_index: utf8_prefix_len,
        }
    }

    /// Returns the UTF-8 form of the cached text.
    fn utf8(&self) -> &str {
        self.utf8
            .as_deref()
            .unwrap_or_else(|| match self.text.units() {
                // SAFETY: because `self.utf8` is None, we know `text` contains
                // a valid UTF8 string.
                Units::Bytes(s) => unsafe { std::str::from_utf8_unchecked(s) },
                // The conversion of a wide string is only borrowed (leaving
                // `self.utf8` as None) when that string is empty, so there
                // is nothing to return but the empty string.
                Units::Wide(_) => {
                    debug_assert_eq!(self.text.len(), 0);
                    ""
                }
            })
    }

    /// Moves the cached position back to the end of the shared prefix.
    fn reset(&mut self) {
        self.cur_utf8_index = self.utf8_prefix_len;
        self.cur_utf16_index = self.utf8_prefix_len;
    }

    /// Moves the cached position one `char` forward, updating both indices.
    ///
    /// Returns `None` when the end of the text has been reached.
    fn advance(&mut self) -> Option<()> {
        let c = self.utf8()[self.cur_utf8_index..].chars().next()?;
        self.cur_utf8_index += c.len_utf8();
        self.cur_utf16_index += c.len_utf16();
        Some(())
    }

    /// Returns the UTF8 index corresponding to the given UTF16 index.
    ///
    /// If `utf16_index` is out of bounds, return `None`.
    /// If `utf16_index` isn't on a char boundary, return the index
    /// of the next char.
    fn utf8_index(&mut self, utf16_index: usize) -> Option<usize> {
        if utf16_index <= self.utf8_prefix_len {
            return Some(utf16_index);
        }

        // The cached position can only move forward; start over if the
        // requested index lies before it.
        if utf16_index < self.cur_utf16_index {
            self.reset();
        }

        while self.cur_utf16_index < utf16_index {
            self.advance()?;
        }

        Some(self.cur_utf8_index)
    }

    /// Returns the UTF16 index corresponding to the given UTF8 index.
    ///
    /// If `utf8_index` is out of bounds, return `None`.
    /// If `utf8_index` isn't on a char boundary, return the index
    /// of the next char.
    fn utf16_index(&mut self, utf8_index: usize) -> Option<usize> {
        if utf8_index <= self.utf8_prefix_len {
            return Some(utf8_index);
        }

        // The cached position can only move forward; start over if the
        // requested index lies before it.
        if utf8_index < self.cur_utf8_index {
            self.reset();
        }

        while self.cur_utf8_index < utf8_index {
            self.advance()?;
        }

        Some(self.cur_utf16_index)
    }
 }
--- a/core/src/string.rs
+++ b/core/src/string.rs
@ -24,7 +24,7 @@ pub const MAX_STRING_LEN: usize = raw::MAX_STRING_LEN;
 pub use avm::AvmString;
 pub use buf::WString;
 pub use common::{BorrowWStr, BorrowWStrMut, Units};
-pub use ops::{Iter, Split};
+pub use ops::{Iter, Split, WStrToUtf8};
 pub use pattern::Pattern;
 pub use slice::{WStr, WStrMut};
--- a/core/src/string/avm.rs
+++ b/core/src/string/avm.rs
@ -53,6 +53,15 @@ impl<'gc> AvmString<'gc> {
        }
    }
    /// Tells whether two strings share the same underlying source.
    ///
    /// Two owned sources are compared with `Gc::ptr_eq`, two static sources
    /// with `std::ptr::eq`; any other combination is never equal.
    #[inline]
    pub fn ptr_eq(this: &Self, other: &Self) -> bool {
        let sources = (this.source, other.source);
        match sources {
            (Source::Owned(lhs), Source::Owned(rhs)) => Gc::ptr_eq(lhs, rhs),
            (Source::Static(lhs), Source::Static(rhs)) => std::ptr::eq(lhs, rhs),
            _ => false,
        }
    }
    #[inline]
    pub fn as_str(&self) -> &str {
        self
--- a/core/src/string/buf.rs
+++ b/core/src/string/buf.rs
@ -1,6 +1,7 @@
 use gc_arena::Collect;
 use super::raw::WStrPtr;
 use super::utils::split_ascii_prefix;
 use super::{BorrowWStr, BorrowWStrMut, Units, WStr, WStrMut, MAX_STRING_LEN};
 /// An owned, extensible UCS2 string, analogous to `String`.
@ -273,15 +274,6 @@ impl WString {
    }
 }
 /// Splits `s` into its longest ASCII-only prefix, returned as bytes,
 /// and the remaining tail.
 ///
 /// When `s` is entirely ASCII, the prefix is the whole string and the
 /// tail is empty.
 fn split_ascii_prefix(s: &str) -> (&[u8], &str) {
    // With no non-ASCII byte, the prefix spans the whole string.
    let ascii_len = s.bytes().position(|b| !b.is_ascii()).unwrap_or(s.len());
    // Everything before `ascii_len` is ASCII, so this is a char boundary.
    let (head, tail) = s.split_at(ascii_len);
    (head.as_bytes(), tail)
 }
 impl Drop for WString {
    fn drop(&mut self) {
        // SAFETY: `self` is gone after this line.
--- a/core/src/string/common.rs
+++ b/core/src/string/common.rs
@ -161,6 +161,14 @@ macro_rules! impl_str_methods {
            crate::string::ops::str_is_latin1($deref)
        }
        /// Converts this string to an UTF8 `String`.
        ///
        /// Unpaired surrogates are replaced by the replacement character.
        #[inline]
        pub fn to_utf8_lossy($self: $receiver) -> std::borrow::Cow<$lt, str> {
            crate::string::ops::WStrToUtf8::new($deref).to_utf8_lossy()
        }
        /// Analogue of [`str::find`].
        #[inline]
        pub fn find<$($pat_gen)* P: crate::string::Pattern<$pat_lt>>($self: $pat_self, pattern: P) -> Option<usize> {
--- a/core/src/string/ops.rs
+++ b/core/src/string/ops.rs
@ -1,9 +1,10 @@
 use std::borrow::Cow;
 use std::fmt::{self, Write};
 use std::hash::Hasher;
 use std::slice::Iter as SliceIter;
 use super::pattern::Searcher;
-use super::{utils, Pattern, WStr, Units};
+use super::{utils, Pattern, Units, WStr};
 pub struct Iter<'a> {
    inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>,
@ -41,7 +42,9 @@ pub fn str_iter(s: WStr<'_>) -> Iter<'_> {
 }
 pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-    std::char::decode_utf16(s.iter())
+    let utf8 = WStrToUtf8::new(s);
    f.write_str(utf8.head)?;
    std::char::decode_utf16(utf8.tail.iter())
        .map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
        .try_for_each(|c| f.write_char(c))
 }
@ -120,7 +123,6 @@ pub fn str_is_latin1(s: WStr<'_>) -> bool {
        Units::Wide(us) => us.iter().all(|c| *c <= u16::from(u8::MAX)),
    }
 }
 pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {
    pattern
        .into_searcher(haystack)
@ -168,3 +170,39 @@ impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {
        }
    }
 }
 /// A struct for converting a `WStr<'_>` to a UTF-8 `String`.
 pub struct WStrToUtf8<'a> {
    // Leading part of the string that is usable as UTF-8 as-is: the ASCII
    // prefix of a byte string, or the empty string for a wide string.
    head: &'a str,
    // The rest of the string, which still has to be converted.
    tail: WStr<'a>,
 }
 impl<'a> WStrToUtf8<'a> {
    /// Prepares the conversion of `s`.
    ///
    /// For a byte string, the ASCII prefix is split off so that it can be
    /// reused as-is; a wide string is kept whole as the part to convert.
    pub fn new(s: WStr<'a>) -> Self {
        match s.units() {
            Units::Bytes(bytes) => {
                let (head, rest) = utils::split_ascii_prefix_bytes(bytes);
                Self {
                    head,
                    tail: WStr::from_units(rest),
                }
            }
            Units::Wide(_) => Self { head: "", tail: s },
        }
    }

    /// Performs the conversion.
    ///
    /// The prefix is borrowed when nothing is left to convert; otherwise
    /// a new `String` is built from the prefix followed by the formatted tail.
    pub fn to_utf8_lossy(&self) -> Cow<'a, str> {
        if self.tail.is_empty() {
            return Cow::Borrowed(self.head);
        }

        let mut converted = String::with_capacity(self.head.len() + self.tail.len());
        converted.push_str(self.head);
        write!(converted, "{}", self.tail).unwrap();
        Cow::Owned(converted)
    }

    /// Returns the part of the string that needs no conversion.
    #[inline]
    pub fn prefix(&self) -> &str {
        self.head
    }
 }
--- a/core/src/string/pattern.rs
+++ b/core/src/string/pattern.rs
@ -5,7 +5,7 @@
 //  - remove implicit bound checks?
 //  - use memchr crate?
-use super::{WStr, Units};
+use super::{Units, WStr};
 /// A pattern that can be searched in a [`WStr`].
 ///
--- a/core/src/string/utils.rs
+++ b/core/src/string/utils.rs
@ -25,6 +25,25 @@ pub fn next_char_boundary(slice: &str, pos: usize) -> usize {
    }
 }
 /// Finds the longest prefix of `slice` that is entirely ASCII,
 /// and returns it as a UTF-8 string, together with the remaining tail.
 ///
 /// When `slice` contains no non-ASCII byte, the prefix is the whole
 /// slice and the tail is empty.
 pub fn split_ascii_prefix_bytes(slice: &[u8]) -> (&str, &[u8]) {
    // With no non-ASCII byte, the prefix spans the whole slice.
    let ascii_len = slice
        .iter()
        .position(|byte| !byte.is_ascii())
        .unwrap_or(slice.len());
    let (head, tail) = slice.split_at(ascii_len);
    // SAFETY: `head` only contains ASCII.
    let head = unsafe { std::str::from_utf8_unchecked(head) };
    (head, tail)
 }
 /// Splits `slice` into the prefix reported by `split_ascii_prefix_bytes`,
 /// returned as a byte slice, and the remaining tail as a string.
 pub fn split_ascii_prefix(slice: &str) -> (&[u8], &str) {
    let (ascii, rest) = split_ascii_prefix_bytes(slice.as_bytes());
    // SAFETY: `split_ascii_prefix_bytes` always splits on a char boundary.
    (ascii.as_bytes(), unsafe {
        std::str::from_utf8_unchecked(rest)
    })
 }
 /// Maps a UTF-16 code unit into a `char`.
 /// TODO: Surrogate characters will get replaced with the Unicode replacement character.
 pub fn utf16_code_unit_to_char(c: u16) -> char {