avm2: rewrite regexp module to not rely on AvmString::as_str

This is a little tricky, because we have to map the utf8 indices
returned by the regex engine to utf16 indices usable by Ruffle.

To limit the impact on performance, the regex, the string we're
currently matching on, and the last known (utf8, utf16) positions
are cached, avoiding extra utf8 conversions in common use cases
where a single string is repeatedly searched with increasing
`lastIndex`.
This commit is contained in:
Moulins 2021-09-20 23:56:17 +02:00 committed by kmeisthax
parent 23cbe4c2fd
commit 4a09088d42
11 changed files with 348 additions and 118 deletions

View File

@ -4,10 +4,11 @@ use crate::avm2::class::Class;
use crate::avm2::method::{Method, NativeMethodImpl, ParamConfig}; use crate::avm2::method::{Method, NativeMethodImpl, ParamConfig};
use crate::avm2::names::{Namespace, QName}; use crate::avm2::names::{Namespace, QName};
use crate::avm2::object::{regexp_allocator, ArrayObject, Object, TObject}; use crate::avm2::object::{regexp_allocator, ArrayObject, Object, TObject};
use crate::avm2::regexp::RegExpFlags;
use crate::avm2::value::Value; use crate::avm2::value::Value;
use crate::avm2::Error; use crate::avm2::Error;
use crate::avm2::{activation::Activation, array::ArrayStorage}; use crate::avm2::{activation::Activation, array::ArrayStorage};
use crate::string::AvmString; use crate::string::{AvmString, WString};
use gc_arena::{GcCell, MutationContext}; use gc_arena::{GcCell, MutationContext};
/// Implements `RegExp`'s instance initializer. /// Implements `RegExp`'s instance initializer.
@ -26,20 +27,24 @@ pub fn instance_init<'gc>(
.coerce_to_string(activation)?, .coerce_to_string(activation)?,
); );
let flags = args let flag_chars = args
.get(1) .get(1)
.unwrap_or(&Value::String("".into())) .unwrap_or(&Value::String("".into()))
.coerce_to_string(activation)?; .coerce_to_string(activation)?;
for flag in flags.chars() {
match flag { let mut flags = RegExpFlags::empty();
's' => regexp.set_dotall(true), for c in &flag_chars {
'x' => regexp.set_extended(true), flags |= match u8::try_from(c) {
'g' => regexp.set_global(true), Ok(b's') => RegExpFlags::DOTALL,
'i' => regexp.set_ignore_case(true), Ok(b'x') => RegExpFlags::EXTENDED,
'm' => regexp.set_multiline(true), Ok(b'g') => RegExpFlags::GLOBAL,
_ => {} Ok(b'i') => RegExpFlags::IGNORE_CASE,
Ok(b'm') => RegExpFlags::MULTILINE,
_ => continue,
}; };
} }
regexp.set_flags(flags);
} }
} }
@ -63,7 +68,7 @@ pub fn dotall<'gc>(
) -> Result<Value<'gc>, Error> { ) -> Result<Value<'gc>, Error> {
if let Some(this) = this { if let Some(this) = this {
if let Some(regexp) = this.as_regexp() { if let Some(regexp) = this.as_regexp() {
return Ok(regexp.dotall().into()); return Ok(regexp.flags().contains(RegExpFlags::DOTALL).into());
} }
} }
@ -78,7 +83,7 @@ pub fn extended<'gc>(
) -> Result<Value<'gc>, Error> { ) -> Result<Value<'gc>, Error> {
if let Some(this) = this { if let Some(this) = this {
if let Some(regexp) = this.as_regexp() { if let Some(regexp) = this.as_regexp() {
return Ok(regexp.extended().into()); return Ok(regexp.flags().contains(RegExpFlags::EXTENDED).into());
} }
} }
@ -93,7 +98,7 @@ pub fn global<'gc>(
) -> Result<Value<'gc>, Error> { ) -> Result<Value<'gc>, Error> {
if let Some(this) = this { if let Some(this) = this {
if let Some(regexp) = this.as_regexp() { if let Some(regexp) = this.as_regexp() {
return Ok(regexp.global().into()); return Ok(regexp.flags().contains(RegExpFlags::GLOBAL).into());
} }
} }
@ -108,7 +113,7 @@ pub fn ignore_case<'gc>(
) -> Result<Value<'gc>, Error> { ) -> Result<Value<'gc>, Error> {
if let Some(this) = this { if let Some(this) = this {
if let Some(regexp) = this.as_regexp() { if let Some(regexp) = this.as_regexp() {
return Ok(regexp.ignore_case().into()); return Ok(regexp.flags().contains(RegExpFlags::IGNORE_CASE).into());
} }
} }
@ -123,7 +128,7 @@ pub fn multiline<'gc>(
) -> Result<Value<'gc>, Error> { ) -> Result<Value<'gc>, Error> {
if let Some(this) = this { if let Some(this) = this {
if let Some(regexp) = this.as_regexp() { if let Some(regexp) = this.as_regexp() {
return Ok(regexp.multiline().into()); return Ok(regexp.flags().contains(RegExpFlags::MULTILINE).into());
} }
} }
@ -192,17 +197,16 @@ pub fn exec<'gc>(
.unwrap_or(&Value::Undefined) .unwrap_or(&Value::Undefined)
.coerce_to_string(activation)?; .coerce_to_string(activation)?;
let (storage, index) = match re.exec(&text) { let (storage, index) = match re.exec(text) {
Some(matched) => { Some(matched) => {
let substrings = matched let substrings = matched
.groups() .groups()
.map(|range| text[range.unwrap_or(0..0)].to_string()); .map(|range| range.map(|r| WString::from(text.slice(r))));
let mut storage = ArrayStorage::new(0); let storage = ArrayStorage::from_iter(substrings.map(|s| match s {
for substring in substrings { None => Value::Undefined,
storage Some(s) => AvmString::new_ucs2(activation.context.gc_context, s).into(),
.push(AvmString::new(activation.context.gc_context, substring).into()); }));
}
(storage, matched.start()) (storage, matched.start())
} }
@ -244,7 +248,7 @@ pub fn test<'gc>(
.get(0) .get(0)
.unwrap_or(&Value::Undefined) .unwrap_or(&Value::Undefined)
.coerce_to_string(activation)?; .coerce_to_string(activation)?;
return Ok(re.test(&text).into()); return Ok(re.test(text).into());
} }
} }

View File

@ -5,6 +5,7 @@ use crate::avm2::class::{Class, ClassAttributes};
use crate::avm2::method::{Method, NativeMethodImpl}; use crate::avm2::method::{Method, NativeMethodImpl};
use crate::avm2::names::{Namespace, QName}; use crate::avm2::names::{Namespace, QName};
use crate::avm2::object::{primitive_allocator, Object, TObject}; use crate::avm2::object::{primitive_allocator, Object, TObject};
use crate::avm2::regexp::RegExpFlags;
use crate::avm2::value::Value; use crate::avm2::value::Value;
use crate::avm2::Error; use crate::avm2::Error;
use crate::avm2::{ArrayObject, ArrayStorage}; use crate::avm2::{ArrayObject, ArrayStorage};
@ -230,18 +231,18 @@ fn match_s<'gc>(
if let Some(mut regexp) = pattern.as_regexp_mut(activation.context.gc_context) { if let Some(mut regexp) = pattern.as_regexp_mut(activation.context.gc_context) {
let mut storage = ArrayStorage::new(0); let mut storage = ArrayStorage::new(0);
if regexp.global() { if regexp.flags().contains(RegExpFlags::GLOBAL) {
let mut last = regexp.last_index(); let mut last = regexp.last_index();
let old_last_index = regexp.last_index(); let old_last_index = regexp.last_index();
regexp.set_last_index(0); regexp.set_last_index(0);
while let Some(result) = regexp.exec(this.as_str()) { while let Some(result) = regexp.exec(this) {
if regexp.last_index() == last { if regexp.last_index() == last {
break; break;
} }
storage.push( storage.push(
AvmString::new( AvmString::new_ucs2(
activation.context.gc_context, activation.context.gc_context,
this.as_str()[result.range()].to_string(), this.slice(result.range()).into(),
) )
.into(), .into(),
); );
@ -257,15 +258,17 @@ fn match_s<'gc>(
} else { } else {
let old = regexp.last_index(); let old = regexp.last_index();
regexp.set_last_index(0); regexp.set_last_index(0);
if let Some(result) = regexp.exec(this.as_str()) { if let Some(result) = regexp.exec(this) {
let substrings = result let substrings = result
.groups() .groups()
.map(|range| this.as_str()[range.unwrap_or(0..0)].to_string()); .map(|range| this.slice(range.unwrap_or(0..0)));
let mut storage = ArrayStorage::new(0); let mut storage = ArrayStorage::new(0);
for substring in substrings { for substring in substrings {
storage storage.push(
.push(AvmString::new(activation.context.gc_context, substring).into()); AvmString::new_ucs2(activation.context.gc_context, substring.into())
.into(),
);
} }
regexp.set_last_index(old); regexp.set_last_index(old);
return Ok(ArrayObject::from_storage(activation, storage) return Ok(ArrayObject::from_storage(activation, storage)

View File

@ -3,7 +3,7 @@
use crate::avm2::activation::Activation; use crate::avm2::activation::Activation;
use crate::avm2::object::script_object::ScriptObjectData; use crate::avm2::object::script_object::ScriptObjectData;
use crate::avm2::object::{ClassObject, Object, ObjectPtr, TObject}; use crate::avm2::object::{ClassObject, Object, ObjectPtr, TObject};
use crate::avm2::regexp::RegExp; use crate::avm2::regexp::{RegExp, RegExpFlags};
use crate::avm2::value::Value; use crate::avm2::value::Value;
use crate::avm2::Error; use crate::avm2::Error;
use crate::string::AvmString; use crate::string::AvmString;
@ -97,19 +97,21 @@ impl<'gc> TObject<'gc> for RegExpObject<'gc> {
let read = self.0.read(); let read = self.0.read();
let mut s = format!("/{}/", read.regexp.source()); let mut s = format!("/{}/", read.regexp.source());
if read.regexp.global() { let flags = read.regexp.flags();
if flags.contains(RegExpFlags::GLOBAL) {
s.push('g'); s.push('g');
} }
if read.regexp.ignore_case() { if flags.contains(RegExpFlags::IGNORE_CASE) {
s.push('i'); s.push('i');
} }
if read.regexp.multiline() { if flags.contains(RegExpFlags::MULTILINE) {
s.push('m'); s.push('m');
} }
if read.regexp.dotall() { if flags.contains(RegExpFlags::DOTALL) {
s.push('s'); s.push('s');
} }
if read.regexp.extended() { if flags.contains(RegExpFlags::EXTENDED) {
s.push('x'); s.push('x');
} }

View File

@ -1,22 +1,39 @@
//! RegExp Structure //! RegExp Structure
use crate::string::AvmString; use std::borrow::Cow;
use crate::string::{AvmString, Units, WStrToUtf8};
use bitflags::bitflags; use bitflags::bitflags;
use gc_arena::Collect; use gc_arena::Collect;
use regress::Regex;
#[derive(Clone, Collect, Debug)] #[derive(Collect, Debug)]
#[collect(no_drop)] #[collect(no_drop)]
pub struct RegExp<'gc> { pub struct RegExp<'gc> {
source: AvmString<'gc>, source: AvmString<'gc>,
flags: RegExpFlags, flags: RegExpFlags,
last_index: usize, last_index: usize,
#[collect(require_static)]
cached_regex: Option<Result<regress::Regex, ()>>,
cached_text: Option<CachedText<'gc>>,
}
// Manual `Clone` implementation: `source`, `flags` and `last_index` are
// copied over, while both caches are reset to `None` instead of being cloned.
impl<'gc> Clone for RegExp<'gc> {
fn clone(&self) -> Self {
Self {
source: self.source,
flags: self.flags,
last_index: self.last_index,
// The clone starts with empty caches; `find_utf8_match_at` fills them
// again the next time it runs.
cached_regex: None,
cached_text: None,
}
}
} }
bitflags! { bitflags! {
#[derive(Collect)] #[derive(Collect)]
#[collect(require_static)] #[collect(require_static)]
struct RegExpFlags: u8 { pub struct RegExpFlags: u8 {
const GLOBAL = 1 << 0; const GLOBAL = 1 << 0;
const IGNORE_CASE = 1 << 1; const IGNORE_CASE = 1 << 1;
const MULTILINE = 1 << 2; const MULTILINE = 1 << 2;
@ -34,6 +51,8 @@ impl<'gc> RegExp<'gc> {
source: source.into(), source: source.into(),
flags: RegExpFlags::empty(), flags: RegExpFlags::empty(),
last_index: 0, last_index: 0,
cached_regex: None,
cached_text: None,
} }
} }
@ -45,9 +64,19 @@ impl<'gc> RegExp<'gc> {
where where
S: Into<AvmString<'gc>>, S: Into<AvmString<'gc>>,
{ {
self.cached_regex = None;
self.source = source.into(); self.source = source.into();
} }
/// Returns the flags currently set on this regular expression.
pub fn flags(&self) -> RegExpFlags {
self.flags
}
/// Replaces the whole flag set of this regular expression.
///
/// The cached compiled regex is discarded as well, because it was built
/// from the previous flags.
pub fn set_flags(&mut self, flags: RegExpFlags) {
    self.flags = flags;
    // Force a recompilation on the next search.
    self.cached_regex = None;
}
pub fn last_index(&self) -> usize { pub fn last_index(&self) -> usize {
self.last_index self.last_index
} }
@ -56,70 +85,196 @@ impl<'gc> RegExp<'gc> {
self.last_index = i; self.last_index = i;
} }
pub fn dotall(&self) -> bool { fn find_utf8_match_at<T, F>(&mut self, text: AvmString<'gc>, start: usize, f: F) -> Option<T>
self.flags.contains(RegExpFlags::DOTALL) where
} F: FnOnce(&mut CachedText<'gc>, regress::Match) -> T,
{
pub fn set_dotall(&mut self, value: bool) { if self.cached_regex.is_none() {
self.flags.set(RegExpFlags::DOTALL, value); let re = regress::Regex::with_flags(
} &self.source.to_utf8_lossy(),
regress::Flags {
pub fn extended(&self) -> bool { icase: self.flags.contains(RegExpFlags::IGNORE_CASE),
self.flags.contains(RegExpFlags::EXTENDED) multiline: self.flags.contains(RegExpFlags::MULTILINE),
} dot_all: self.flags.contains(RegExpFlags::DOTALL),
no_opt: false,
pub fn set_extended(&mut self, value: bool) { },
self.flags.set(RegExpFlags::EXTENDED, value); );
} self.cached_regex = Some(re.map_err(drop));
pub fn global(&self) -> bool {
self.flags.contains(RegExpFlags::GLOBAL)
}
pub fn set_global(&mut self, value: bool) {
self.flags.set(RegExpFlags::GLOBAL, value);
}
pub fn ignore_case(&self) -> bool {
self.flags.contains(RegExpFlags::IGNORE_CASE)
}
pub fn set_ignore_case(&mut self, value: bool) {
self.flags.set(RegExpFlags::IGNORE_CASE, value);
}
pub fn multiline(&self) -> bool {
self.flags.contains(RegExpFlags::MULTILINE)
}
pub fn set_multiline(&mut self, value: bool) {
self.flags.set(RegExpFlags::MULTILINE, value);
}
pub fn test(&mut self, text: &str) -> bool {
self.exec(text).is_some()
}
pub fn exec(&mut self, text: &str) -> Option<regress::Match> {
if let Ok(re) = Regex::with_flags(
&self.source,
regress::Flags {
icase: self.ignore_case(),
multiline: self.multiline(),
dot_all: self.dotall(),
no_opt: false,
},
) {
let start = if self.global() { self.last_index } else { 0 };
if let Some(matched) = re.find_from(text, start).next() {
if self.global() {
self.last_index = matched.end();
}
return Some(matched);
}
} }
None let regex = match self.cached_regex.as_mut() {
Some(Ok(re)) => re,
Some(Err(_)) => return None,
None => unreachable!(),
};
let cached = self
.cached_text
.as_ref()
.filter(|cached| AvmString::ptr_eq(&cached.text, &text))
.is_some();
if !cached {
self.cached_text = Some(CachedText::new(text));
}
let text = self.cached_text.as_mut().unwrap();
let start = text.utf8_index(start)?;
let re_match = regex.find_from(text.utf8(), start).next()?;
Some(f(text, re_match))
}
/// Reports whether this regular expression matches `text`.
///
/// With the `GLOBAL` flag set, the search starts at `last_index`, and a
/// successful match moves `last_index` to the UTF16 index of the end of
/// the match (when that index can be computed). Without it, the search
/// starts at 0 and `last_index` is left untouched.
pub fn test(&mut self, text: AvmString<'gc>) -> bool {
    let is_global = self.flags.contains(RegExpFlags::GLOBAL);
    let search_from = if is_global { self.last_index } else { 0 };

    // The closure yields the new `last_index` for global regexes,
    // and nothing for the others.
    let outcome = self.find_utf8_match_at(text, search_from, |cached, found| {
        if !is_global {
            return None;
        }
        cached.utf16_index(found.end())
    });

    match outcome {
        None => false,
        Some(new_last_index) => {
            if let Some(idx) = new_last_index {
                self.last_index = idx;
            }
            true
        }
    }
}
/// Searches `text` for a match of this regular expression.
///
/// With the `GLOBAL` flag set, the search starts at `last_index` and a
/// successful match moves `last_index` to the end of that match; otherwise
/// the search starts at 0 and `last_index` is left untouched.
///
/// Every range of the returned `regress::Match` (the overall match and the
/// captures) is rewritten from UTF8 indices to UTF16 indices before it is
/// returned. Returns `None` when `find_utf8_match_at` yields nothing.
pub fn exec(&mut self, text: AvmString<'gc>) -> Option<regress::Match> {
let global = self.flags.contains(RegExpFlags::GLOBAL);
let start = if global { self.last_index } else { 0 };
let re_match = self.find_utf8_match_at(text, start, |text, mut re_match| {
// Sort the capture endpoints by increasing index, so that CachedText::utf16_index is efficient.
// The vector holds mutable references to every start/end of the present
// captures, plus those of the overall match range.
let mut utf8_indices = re_match
.captures
.iter_mut()
.filter_map(Option::as_mut)
.chain(std::iter::once(&mut re_match.range))
.flat_map(|capture| [&mut capture.start, &mut capture.end])
.collect::<Vec<_>>();
utf8_indices.sort_by_key(|i| **i);
// Map UTF8 indices back to UTF16.
for i in utf8_indices {
// NOTE(review): the `unwrap` assumes every index reported by the regex
// engine lies within the cached UTF8 text — confirm.
*i = text.utf16_index(*i).unwrap();
}
re_match
})?;
// At this point `re_match.end()` is already an UTF16 index.
if global {
self.last_index = re_match.end();
}
Some(re_match)
}
}
// The string most recently searched by a `RegExp`, together with its UTF8
// form and a cursor used to translate between UTF8 and UTF16 indices.
#[derive(Collect, Debug)]
#[collect(no_drop)]
struct CachedText<'gc> {
// The original string; `find_utf8_match_at` compares it by pointer to
// decide whether this cache can be reused.
text: AvmString<'gc>,
// None means that `text` is already a valid utf8 string.
utf8: Option<String>,
// Indices up to (and including) this value are returned unchanged by
// `utf8_index` and `utf16_index`; it is also where `reset` puts the cursor.
utf8_prefix_len: usize,
// Cached values of the last `{utf8, utf16}_index` call,
// to avoid unnecessary recomputation when calling these methods
// with increasing indices.
cur_utf8_index: usize,
cur_utf16_index: usize,
}
impl<'gc> CachedText<'gc> {
/// Builds the cache for `text`: converts it to UTF8 (keeping the owned
/// buffer only when the conversion had to allocate) and places the
/// index cursor at the end of the prefix that needs no index translation.
fn new(text: AvmString<'gc>) -> Self {
    let converter = WStrToUtf8::new(text.as_ucs2());
    let converted = converter.to_utf8_lossy();

    let utf8_prefix_len = if converted.len() != text.len() {
        converter.prefix().len()
    } else {
        // Identical len means the string is fully utf8,
        // even if the converter's prefix is empty.
        text.len()
    };

    // A borrowed result is not stored: `utf8()` reads it back from `text`.
    let utf8 = match converted {
        Cow::Borrowed(_) => None,
        Cow::Owned(owned) => Some(owned),
    };

    Self {
        text,
        utf8,
        utf8_prefix_len,
        cur_utf8_index: utf8_prefix_len,
        cur_utf16_index: utf8_prefix_len,
    }
}
/// Returns the UTF8 form of the cached text.
///
/// Uses the owned `utf8` buffer when there is one; otherwise the bytes of
/// `text` are reinterpreted as UTF8 without copying.
fn utf8(&self) -> &str {
self.utf8
.as_deref()
.unwrap_or_else(|| match self.text.units() {
// SAFETY: because `self.utf8` is None, we know `text` contains
// a valid UTF8 string.
Units::Bytes(s) => unsafe { std::str::from_utf8_unchecked(s) },
// NOTE(review): `WStrToUtf8::to_utf8_lossy` also borrows when given an
// empty wide string, which would leave `utf8` as `None` and reach this
// arm — confirm that empty wide strings cannot get here.
_ => unreachable!(),
})
}
/// Moves both cursors back to the end of the prefix, i.e. to the
/// position they had right after construction.
fn reset(&mut self) {
    let origin = self.utf8_prefix_len;
    self.cur_utf16_index = origin;
    self.cur_utf8_index = origin;
}
/// Steps both cursors over the next character of the UTF8 text.
///
/// Returns `None` (leaving the cursors untouched) when the UTF8 cursor
/// is already at the end of the text.
fn advance(&mut self) -> Option<()> {
    let remaining = &self.utf8()[self.cur_utf8_index..];
    let (utf8_width, utf16_width) = remaining
        .chars()
        .next()
        .map(|c| (c.len_utf8(), c.len_utf16()))?;

    self.cur_utf8_index += utf8_width;
    self.cur_utf16_index += utf16_width;
    Some(())
}
/// Translates an UTF16 index into the matching UTF8 index.
///
/// Returns `None` when `utf16_index` lies past the end of the text.
/// When `utf16_index` falls inside a character, the index of the
/// following character is returned.
fn utf8_index(&mut self, utf16_index: usize) -> Option<usize> {
    if utf16_index > self.utf8_prefix_len {
        // The cursor only moves forward: start over if it is already
        // past the requested position.
        if self.cur_utf16_index > utf16_index {
            self.reset();
        }

        while self.cur_utf16_index < utf16_index {
            self.advance()?;
        }

        Some(self.cur_utf8_index)
    } else {
        // Up to the end of the prefix the index is returned unchanged.
        Some(utf16_index)
    }
}
/// Returns the UTF16 index corresponding to the given UTF8 index.
///
/// If `utf8_index` is out of bounds, return `None`.
/// If `utf8_index` isn't on a char boundary, return the index
/// of the next char.
fn utf16_index(&mut self, utf8_index: usize) -> Option<usize> {
// Up to the end of the prefix the index is returned unchanged.
if utf8_index <= self.utf8_prefix_len {
return Some(utf8_index);
}
// The cursor only moves forward: start over if it is already past the
// requested position.
if utf8_index < self.cur_utf8_index {
self.reset();
}
// Walk character by character; `advance` fails once the text is exhausted.
while self.cur_utf8_index < utf8_index {
self.advance()?;
}
Some(self.cur_utf16_index)
} }
} }

View File

@ -24,7 +24,7 @@ pub const MAX_STRING_LEN: usize = raw::MAX_STRING_LEN;
pub use avm::AvmString; pub use avm::AvmString;
pub use buf::WString; pub use buf::WString;
pub use common::{BorrowWStr, BorrowWStrMut, Units}; pub use common::{BorrowWStr, BorrowWStrMut, Units};
pub use ops::{Iter, Split}; pub use ops::{Iter, Split, WStrToUtf8};
pub use pattern::Pattern; pub use pattern::Pattern;
pub use slice::{WStr, WStrMut}; pub use slice::{WStr, WStrMut};

View File

@ -53,6 +53,15 @@ impl<'gc> AvmString<'gc> {
} }
} }
/// Returns `true` when both strings share the very same backing storage.
///
/// Two owned strings are compared with `Gc::ptr_eq`, two static strings
/// with `std::ptr::eq`; any other combination yields `false`. The contents
/// of the strings are never compared.
#[inline]
pub fn ptr_eq(this: &Self, other: &Self) -> bool {
match (this.source, other.source) {
(Source::Owned(this), Source::Owned(other)) => Gc::ptr_eq(this, other),
(Source::Static(this), Source::Static(other)) => std::ptr::eq(this, other),
_ => false,
}
}
#[inline] #[inline]
pub fn as_str(&self) -> &str { pub fn as_str(&self) -> &str {
self self

View File

@ -1,6 +1,7 @@
use gc_arena::Collect; use gc_arena::Collect;
use super::raw::WStrPtr; use super::raw::WStrPtr;
use super::utils::split_ascii_prefix;
use super::{BorrowWStr, BorrowWStrMut, Units, WStr, WStrMut, MAX_STRING_LEN}; use super::{BorrowWStr, BorrowWStrMut, Units, WStr, WStrMut, MAX_STRING_LEN};
/// An owned, extensible UCS2 string, analogous to `String`.
@ -273,15 +274,6 @@ impl WString {
} }
} }
fn split_ascii_prefix(s: &str) -> (&[u8], &str) {
let first_non_ascii = s.as_bytes().iter().position(|c| *c >= 0x80);
let (head, tail) = match first_non_ascii {
Some(i) => s.split_at(i),
None => ("", s),
};
(head.as_bytes(), tail)
}
impl Drop for WString { impl Drop for WString {
fn drop(&mut self) { fn drop(&mut self) {
// SAFETY: `self` is gone after this line. // SAFETY: `self` is gone after this line.

View File

@ -161,6 +161,14 @@ macro_rules! impl_str_methods {
crate::string::ops::str_is_latin1($deref) crate::string::ops::str_is_latin1($deref)
} }
/// Converts this string to UTF8, returning a `Cow<str>`.
///
/// The conversion is delegated to `WStrToUtf8::to_utf8_lossy`.
/// Unpaired surrogates are replaced by the replacement character.
#[inline]
pub fn to_utf8_lossy($self: $receiver) -> std::borrow::Cow<$lt, str> {
crate::string::ops::WStrToUtf8::new($deref).to_utf8_lossy()
}
/// Analogue of [`str::find`]. /// Analogue of [`str::find`].
#[inline] #[inline]
pub fn find<$($pat_gen)* P: crate::string::Pattern<$pat_lt>>($self: $pat_self, pattern: P) -> Option<usize> { pub fn find<$($pat_gen)* P: crate::string::Pattern<$pat_lt>>($self: $pat_self, pattern: P) -> Option<usize> {

View File

@ -1,9 +1,10 @@
use std::borrow::Cow;
use std::fmt::{self, Write}; use std::fmt::{self, Write};
use std::hash::Hasher; use std::hash::Hasher;
use std::slice::Iter as SliceIter; use std::slice::Iter as SliceIter;
use super::pattern::Searcher; use super::pattern::Searcher;
use super::{utils, Pattern, WStr, Units}; use super::{utils, Pattern, Units, WStr};
pub struct Iter<'a> { pub struct Iter<'a> {
inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>, inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>,
@ -41,7 +42,9 @@ pub fn str_iter(s: WStr<'_>) -> Iter<'_> {
} }
pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result { pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {
std::char::decode_utf16(s.iter()) let utf8 = WStrToUtf8::new(s);
f.write_str(utf8.head)?;
std::char::decode_utf16(utf8.tail.iter())
.map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER)) .map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
.try_for_each(|c| f.write_char(c)) .try_for_each(|c| f.write_char(c))
} }
@ -120,7 +123,6 @@ pub fn str_is_latin1(s: WStr<'_>) -> bool {
Units::Wide(us) => us.iter().all(|c| *c <= u16::from(u8::MAX)), Units::Wide(us) => us.iter().all(|c| *c <= u16::from(u8::MAX)),
} }
} }
pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> { pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {
pattern pattern
.into_searcher(haystack) .into_searcher(haystack)
@ -168,3 +170,39 @@ impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {
} }
} }
} }
/// A helper for converting a `WStr<'_>` to UTF8.
///
/// `new` splits the input into a `head` that is borrowed directly as `&str`
/// and a `tail` that still has to be converted; `to_utf8_lossy` joins the
/// two into a `Cow<str>`.
pub struct WStrToUtf8<'a> {
// Leading part of the input, borrowed as-is (empty for wide strings).
head: &'a str,
// Remainder of the input, converted only when `to_utf8_lossy` needs it.
tail: WStr<'a>,
}
impl<'a> WStrToUtf8<'a> {
    /// Prepares the conversion of `s`.
    ///
    /// For byte strings, the part returned by
    /// `utils::split_ascii_prefix_bytes` as ASCII becomes the borrowed head
    /// and the rest becomes the tail. Wide strings get an empty head and are
    /// kept whole as the tail.
    pub fn new(s: WStr<'a>) -> Self {
        match s.units() {
            Units::Bytes(bytes) => {
                let (head, rest) = utils::split_ascii_prefix_bytes(bytes);
                Self {
                    head,
                    tail: WStr::from_units(rest),
                }
            }
            Units::Wide(_) => Self { head: "", tail: s },
        }
    }

    /// Produces the UTF8 form of the string.
    ///
    /// Borrows the head when there is no tail to convert; otherwise
    /// allocates a `String` holding the head followed by the formatted tail.
    pub fn to_utf8_lossy(&self) -> Cow<'a, str> {
        if !self.tail.is_empty() {
            let mut buf = String::with_capacity(self.head.len() + self.tail.len());
            buf.push_str(self.head);
            write!(buf, "{}", self.tail).unwrap();
            return Cow::Owned(buf);
        }

        Cow::Borrowed(self.head)
    }

    /// Returns the head of the string, i.e. the part that was borrowed
    /// as `&str` without any conversion.
    #[inline]
    pub fn prefix(&self) -> &str {
        self.head
    }
}

View File

@ -5,7 +5,7 @@
// - remove implicit bound checks? // - remove implicit bound checks?
// - use memchr crate? // - use memchr crate?
use super::{WStr, Units}; use super::{Units, WStr};
/// A pattern that can be searched in a [`WStr`]. /// A pattern that can be searched in a [`WStr`].
/// ///

View File

@ -25,6 +25,25 @@ pub fn next_char_boundary(slice: &str, pos: usize) -> usize {
} }
} }
/// Finds the longest prefix of `slice` that is entirely ASCII,
/// and returns it as an UTF8 string, together with the remaining tail.
///
/// When `slice` contains no byte `>= 0x80`, the whole slice is returned as
/// the prefix and the tail is empty. When the very first byte is not ASCII,
/// the prefix is empty and the tail is the whole slice.
pub fn split_ascii_prefix_bytes(slice: &[u8]) -> (&str, &[u8]) {
    // Without any non-ASCII byte the split point is the end of the slice:
    // falling back to 0 would hand back an empty prefix for pure-ASCII input.
    let split_point = slice
        .iter()
        .position(|byte| !byte.is_ascii())
        .unwrap_or(slice.len());
    let (head, tail) = slice.split_at(split_point);
    // SAFETY: `head` only contains ASCII, which is always valid UTF8.
    let head = unsafe { std::str::from_utf8_unchecked(head) };
    (head, tail)
}
/// `&str` counterpart of `split_ascii_prefix_bytes`: splits `slice` at the
/// same position and returns the ASCII head as a byte slice, together with
/// the remaining tail as a string.
pub fn split_ascii_prefix(slice: &str) -> (&[u8], &str) {
    let (ascii, rest) = split_ascii_prefix_bytes(slice.as_bytes());
    // SAFETY: `split_ascii_prefix_bytes` always splits on a char boundary.
    (ascii.as_bytes(), unsafe {
        std::str::from_utf8_unchecked(rest)
    })
}
/// Maps a UTF-16 code unit into a `char`. /// Maps a UTF-16 code unit into a `char`.
/// TODO: Surrogate characters will get replaced with the Unicode replacement character. /// TODO: Surrogate characters will get replaced with the Unicode replacement character.
pub fn utf16_code_unit_to_char(c: u16) -> char { pub fn utf16_code_unit_to_char(c: u16) -> char {