ruffle/core/src/string/ops.rs

use std::borrow::Cow;
use std::fmt::{self, Write};
use std::hash::Hasher;
use std::slice::Iter as SliceIter;

use super::pattern::{SearchStep, Searcher};
use super::{utils, BorrowWStr, Pattern, Units, WStr, WString};

/// Iterator over the code units of a string, always yielding them as `u16`.
///
/// The wrapped slice iterator is either byte-sized or `u16`-sized; byte units
/// are widened to `u16` as they are produced.
pub struct Iter<'a> {
    inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>,
}

impl<'a> Iterator for Iter<'a> {
    type Item = u16;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match &mut self.inner {
            // Widening a `u8` to `u16` is lossless, so `From` is used instead of `as`.
            Units::Bytes(it) => it.next().map(|c| u16::from(*c)),
            Units::Wide(it) => it.next().copied(),
        }
    }

    // Both inner slice iterators know their exact remaining length; forwarding
    // it lets consumers such as `collect` and `extend` preallocate instead of
    // falling back to the default `(0, None)` hint.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        match &self.inner {
            Units::Bytes(it) => it.size_hint(),
            Units::Wide(it) => it.size_hint(),
        }
    }
}

impl<'a> DoubleEndedIterator for Iter<'a> {
    /// Pops the last remaining code unit, widening byte units to `u16`.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        let unit = match &mut self.inner {
            Units::Bytes(bytes) => *bytes.next_back()? as u16,
            Units::Wide(wide) => *wide.next_back()?,
        };
        Some(unit)
    }
}

/// Builds an [`Iter`] over the code units of `s`, keeping the byte/wide
/// distinction of the underlying storage.
#[inline]
pub fn str_iter(s: WStr<'_>) -> Iter<'_> {
    match s.units() {
        Units::Bytes(bytes) => Iter {
            inner: Units::Bytes(bytes.iter()),
        },
        Units::Wide(wide) => Iter {
            inner: Units::Wide(wide.iter()),
        },
    }
}

pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    let utf8 = WStrToUtf8::new(s);
    f.write_str(utf8.head)?;
    std::char::decode_utf16(utf8.tail.iter())
        .map(|c| c.unwrap_or(char::REPLACEMENT_CHARACTER))
        .try_for_each(|c| f.write_char(c))
}

/// `Debug`-style formatting for a `WStr`.
///
/// The output is wrapped in double quotes. Decodable characters are written
/// through `char::escape_debug`; unpaired surrogates are written as a
/// `\u{...}` escape holding their lowercase hexadecimal value.
pub fn str_debug_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    f.write_char('"')?;

    for decoded in std::char::decode_utf16(s.iter()) {
        match decoded {
            Ok(ch) => {
                for escaped in ch.escape_debug() {
                    f.write_char(escaped)?;
                }
            }
            Err(err) => write!(f, "\\u{{{:x}}}", err.unpaired_surrogate())?,
        }
    }

    f.write_char('"')
}

/// Returns `true` when both strings hold the same sequence of code units,
/// regardless of whether each one is stored as bytes or as wide units.
pub fn str_eq(left: WStr<'_>, right: WStr<'_>) -> bool {
    // Same representation on both sides: plain slice equality is enough.
    let (bytes, wide) = match (left.units(), right.units()) {
        (Units::Bytes(a), Units::Bytes(b)) => return a == b,
        (Units::Wide(a), Units::Wide(b)) => return a == b,
        (Units::Bytes(a), Units::Wide(b)) => (a, b),
        (Units::Wide(a), Units::Bytes(b)) => (b, a),
    };

    // Mixed representation: compare lengths first, because `zip` stops at the
    // shorter slice, then compare unit by unit with each byte widened. This
    // replaces the previous `unsafe` unchecked indexing with a safe iterator.
    bytes.len() == wide.len()
        && bytes
            .iter()
            .zip(wide)
            .all(|(byte, unit)| u16::from(*byte) == *unit)
}

/// Compares two strings for equality after passing every code unit of both
/// through `utils::swf_to_lowercase`.
pub fn str_eq_ignore_case(left: WStr<'_>, right: WStr<'_>) -> bool {
    Iterator::eq(
        left.iter().map(utils::swf_to_lowercase),
        right.iter().map(utils::swf_to_lowercase),
    )
}

/// Lexicographically orders two strings by their code units, treating byte
/// units as their widened `u16` value when the representations differ.
pub fn str_cmp(left: WStr<'_>, right: WStr<'_>) -> std::cmp::Ordering {
    // When the representations differ, always compare "bytes vs wide" and
    // remember whether the operands were swapped to do so.
    let (narrow, wide, swapped) = match (left.units(), right.units()) {
        (Units::Bytes(a), Units::Bytes(b)) => return a.cmp(b),
        (Units::Wide(a), Units::Wide(b)) => return a.cmp(b),
        (Units::Wide(a), Units::Bytes(b)) => (b, a, true),
        (Units::Bytes(a), Units::Wide(b)) => (a, b, false),
    };

    let ordering = Iterator::cmp(narrow.iter().map(|&c| c as u16), wide.iter().copied());
    match swapped {
        true => ordering.reverse(),
        false => ordering,
    }
}

/// Lexicographically orders two strings after passing every code unit of
/// both through `utils::swf_to_lowercase`.
pub fn str_cmp_ignore_case(left: WStr<'_>, right: WStr<'_>) -> std::cmp::Ordering {
    Iterator::cmp(
        left.iter().map(utils::swf_to_lowercase),
        right.iter().map(utils::swf_to_lowercase),
    )
}

/// Feeds every code unit of `s` to `state` as a `u16`.
///
/// Byte units are widened first, so the hasher sees the same sequence of
/// writes whichever way the string is stored.
pub fn str_hash<H: Hasher>(s: WStr<'_>, state: &mut H) {
    match s.units() {
        Units::Bytes(bytes) => {
            for &byte in bytes {
                state.write_u16(byte.into());
            }
        }
        Units::Wide(wide) => {
            for &unit in wide {
                state.write_u16(unit);
            }
        }
    }
}

/// Builds a new string by applying `map` to every code unit that fits in a `u8`.
///
/// Byte strings are mapped unit by unit and stay byte strings. In wide
/// strings, units above `u8::MAX` are copied unchanged and the result stays
/// wide.
fn map_latin1_chars(s: WStr<'_>, mut map: impl FnMut(u8) -> u8) -> WString {
    match s.units() {
        Units::Bytes(bytes) => {
            WString::from_buf(bytes.iter().map(|&byte| map(byte)).collect::<Vec<u8>>())
        }
        Units::Wide(wide) => {
            let mapped = wide
                .iter()
                .map(|&unit| u8::try_from(unit).map_or(unit, |narrow| u16::from(map(narrow))))
                .collect::<Vec<u16>>();
            WString::from_buf(mapped)
        }
    }
}

/// Returns a copy of `s` in which every unit that fits in a `u8` has been
/// passed through `u8::to_ascii_lowercase`.
pub fn str_to_ascii_lowercase(s: WStr<'_>) -> WString {
    let lower = |unit: u8| -> u8 { unit.to_ascii_lowercase() };
    map_latin1_chars(s, lower)
}

/// Returns `true` when every code unit of `s` fits in a `u8`.
///
/// Byte-backed strings qualify trivially; wide strings are scanned.
pub fn str_is_latin1(s: WStr<'_>) -> bool {
    if let Units::Wide(wide) = s.units() {
        wide.iter().all(|&unit| u8::try_from(unit).is_ok())
    } else {
        true
    }
}

/// Concatenates `elems`, inserting `sep` between consecutive elements.
///
/// An empty `elems` yields the default `WString`. The output is byte-backed
/// only when the separator and every element pass `is_latin1`; otherwise it
/// is built from `u16` units.
pub fn str_join<E: BorrowWStr>(elems: &[E], sep: WStr<'_>) -> WString {
    /// Fills a buffer of capacity `total_len` with the elements and the
    /// separators between them, delegating the actual copy to `extend`.
    fn join_inner<T, E, F>(total_len: usize, elems: &[E], sep: WStr<'_>, mut extend: F) -> Vec<T>
    where
        E: BorrowWStr,
        F: FnMut(&mut Vec<T>, WStr<'_>),
    {
        let mut out = Vec::with_capacity(total_len);
        for (index, elem) in elems.iter().enumerate() {
            // The separator goes before every element except the first.
            if index != 0 {
                extend(&mut out, sep);
            }
            extend(&mut out, elem.borrow());
        }
        out
    }

    if elems.is_empty() {
        return WString::default();
    }

    // Single pass to size the output buffer and pick its representation.
    let mut total_len = sep.len() * elems.len().saturating_sub(1);
    let mut all_latin1 = sep.is_latin1();
    for elem in elems {
        let elem = elem.borrow();
        total_len += elem.len();
        all_latin1 = all_latin1 && elem.is_latin1();
    }

    if !all_latin1 {
        let wide = join_inner(total_len, elems, sep, |out: &mut Vec<u16>, piece| {
            match piece.units() {
                Units::Wide(us) => out.extend_from_slice(us),
                Units::Bytes(us) => out.extend(us.iter().map(|&c| c as u16)),
            }
        });
        return WString::from_buf(wide);
    }

    let narrow = join_inner(total_len, elems, sep, |out: &mut Vec<u8>, piece| {
        match piece.units() {
            // Every unit was checked with `is_latin1` above, so narrowing
            // loses nothing.
            Units::Wide(us) => out.extend(us.iter().map(|&c| c as u8)),
            Units::Bytes(us) => out.extend_from_slice(us),
        }
    });
    WString::from_buf(narrow)
}

/// Returns the start index of the first match the pattern's searcher reports
/// in `haystack`, or `None` when it reports no match.
pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {
    let (match_start, _match_end) = pattern.into_searcher(haystack).next_match()?;
    Some(match_start)
}

/// Returns the start index of the first match the pattern's searcher reports
/// when searching `haystack` from the back, or `None` when there is none.
pub fn str_rfind<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {
    let (match_start, _match_end) = pattern.into_searcher(haystack).next_match_back()?;
    Some(match_start)
}

/// Creates a [`Split`] iterator over the pieces of `string` separated by
/// matches of `pattern`.
#[inline]
pub fn str_split<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Split<'a, P> {
    let searcher = pattern.into_searcher(string);
    Split {
        string: Some(string),
        searcher,
        prev_end: 0,
    }
}

/// Splits `string` around the first match found when searching from the back.
///
/// Returns the text before the match and the text after it, or `None` when
/// the pattern does not match.
pub fn str_rsplit_once<'a, P: Pattern<'a>>(
    string: WStr<'a>,
    pattern: P,
) -> Option<(WStr<'a>, WStr<'a>)> {
    let last_match = pattern.into_searcher(string).next_match_back();
    last_match.map(|(start, end)| (string.slice(..start), string.slice(end..)))
}

/// Returns `true` when the very first step of the pattern's searcher over
/// `string` is a match.
pub fn starts_with<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> bool {
    let first_step = pattern.into_searcher(string).next();
    matches!(first_step, SearchStep::Match(..))
}

/// Returns `true` when the very first step of the pattern's searcher, taken
/// from the back of `string`, is a match.
pub fn ends_with<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> bool {
    let last_step = pattern.into_searcher(string).next_back();
    matches!(last_step, SearchStep::Match(..))
}

/// If the first searcher step over `string` is a match, returns the text
/// after that match; otherwise returns `None`.
pub fn strip_prefix<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Option<WStr<'a>> {
    let first_step = pattern.into_searcher(string).next();
    if let SearchStep::Match(_, prefix_end) = first_step {
        Some(string.slice(prefix_end..))
    } else {
        None
    }
}

/// If the first searcher step from the back of `string` is a match, returns
/// the text before that match; otherwise returns `None`.
pub fn strip_suffix<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Option<WStr<'a>> {
    let last_step = pattern.into_searcher(string).next_back();
    if let SearchStep::Match(suffix_start, _) = last_step {
        Some(string.slice(..suffix_start))
    } else {
        None
    }
}

/// Trims `string` down to the span between the first rejected region found
/// from the front and the last rejected region found from the back.
///
/// When the searcher rejects nothing, the result is the empty slice `0..0`.
pub fn str_trim_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {
    let mut searcher = pattern.into_searcher(string);

    // The end of the first rejected region is the fallback end, used when the
    // backward search finds nothing further.
    let (start, fallback_end) = searcher.next_reject().unwrap_or((0, 0));
    let end = searcher
        .next_reject_back()
        .map_or(fallback_end, |(_, reject_end)| reject_end);

    string.slice(start..end)
}

/// Drops everything before the first region the searcher rejects.
///
/// When nothing is rejected, the result is the empty slice at the end of
/// `string`.
pub fn str_trim_start_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {
    let whole_len = string.len();
    let mut searcher = pattern.into_searcher(string);
    let start = searcher
        .next_reject()
        .map_or(whole_len, |(reject_start, _)| reject_start);

    string.slice(start..)
}

/// Drops everything after the last region the searcher rejects when scanning
/// from the back.
///
/// When nothing is rejected, the result is the empty slice at the start of
/// `string`.
pub fn str_trim_end_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {
    let mut searcher = pattern.into_searcher(string);
    let end = searcher
        .next_reject_back()
        .map_or(0, |(_, reject_end)| reject_end);

    string.slice(..end)
}

/// Iterator over the pieces of a string separated by matches of a pattern.
pub struct Split<'a, P: Pattern<'a>> {
    // The string being split; set to `None` once the final piece is yielded.
    string: Option<WStr<'a>>,
    searcher: P::Searcher,
    // End index of the most recent match, i.e. where the next piece begins.
    prev_end: usize,
}

impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {
    type Item = WStr<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        let string = self.string?;

        if let Some((match_start, match_end)) = self.searcher.next_match() {
            // Yield the text between the previous match and this one, and
            // record where this match ends for the next call.
            let piece_start = std::mem::replace(&mut self.prev_end, match_end);
            return Some(string.slice(piece_start..match_start));
        }

        // No more matches: yield the remainder once, then stay exhausted.
        self.string = None;
        Some(string.slice(self.prev_end..))
    }
}

/// Helper for converting a `WStr<'_>` to a UTF-8 `String`.
///
/// The input is split into a `head` that is already available as a `&str`
/// and a `tail` that still needs to be converted.
pub struct WStrToUtf8<'a> {
    head: &'a str,
    tail: WStr<'a>,
}

impl<'a> WStrToUtf8<'a> {
    /// Splits `s` into its borrowed `head` and remaining `tail`.
    ///
    /// For byte strings the split is delegated to
    /// `utils::split_ascii_prefix_bytes`; for wide strings the head is empty
    /// and the whole string becomes the tail.
    pub fn new(s: WStr<'a>) -> Self {
        match s.units() {
            Units::Wide(_) => Self { head: "", tail: s },
            Units::Bytes(bytes) => {
                let (head, rest) = utils::split_ascii_prefix_bytes(bytes);
                Self {
                    head,
                    tail: WStr::from_units(rest),
                }
            }
        }
    }

    /// Returns the whole string as UTF-8.
    ///
    /// Borrows the head when the tail is empty; otherwise allocates a
    /// `String` holding the head followed by the formatted tail.
    pub fn to_utf8_lossy(&self) -> Cow<'a, str> {
        if self.tail.is_empty() {
            return Cow::Borrowed(self.head);
        }

        let capacity = self.head.len() + self.tail.len();
        let mut utf8 = String::with_capacity(capacity);
        utf8.push_str(self.head);
        write!(utf8, "{}", self.tail).unwrap();
        Cow::Owned(utf8)
    }

    /// Returns the part of the string that is directly usable as a `&str`.
    #[inline]
    pub fn prefix(&self) -> &str {
        self.head
    }
}
avm2: rewrite regexp module to not rely on AvmString::as_str This is a little tricky, because we have to map the utf8 indices returned by the regex engine to utf16 indices usable by Ruffle. To limit the impact on performance, the regex, the string we're currently matching on, and the last known (utf8, utf16) positions are cached, avoiding extra utf8 conversions in common use cases where a single string is repeatedly searched with increasing `lastIndex`. 2021-09-20 21:56:17 +00:00			`use std::borrow::Cow;`
core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00			`use std::fmt::{self, Write};`
			`use std::hash::Hasher;`
			`use std::slice::Iter as SliceIter;`

avm1: partially rewrite Value::primitive_as_number 2021-09-21 16:28:52 +00:00			`use super::pattern::{SearchStep, Searcher};`
core: implement WStr::join 2021-10-02 22:28:32 +00:00			`use super::{utils, BorrowWStr, Pattern, Units, WStr, WString};`
core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00
			`pub struct Iter<'a> {`
			`inner: Units<SliceIter<'a, u8>, SliceIter<'a, u16>>,`
			`}`

			`impl<'a> Iterator for Iter<'a> {`
			`type Item = u16;`

			`#[inline]`
			`fn next(&mut self) -> Option<Self::Item> {`
			`match &mut self.inner {`
			`Units::Bytes(it) => it.next().map(\|c\| *c as u16),`
			`Units::Wide(it) => it.next().copied(),`
			`}`
			`}`
			`}`

core: Replace string::utils *_ignore_case functions by methods on WStr 2021-09-16 23:53:17 +00:00			`impl<'a> DoubleEndedIterator for Iter<'a> {`
			`#[inline]`
			`fn next_back(&mut self) -> Option<Self::Item> {`
			`match &mut self.inner {`
			`Units::Bytes(it) => it.next_back().map(\|c\| *c as u16),`
			`Units::Wide(it) => it.next_back().copied(),`
			`}`
			`}`
			`}`

core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00			`#[inline]`
			`pub fn str_iter(s: WStr<'_>) -> Iter<'_> {`
			`let inner = match s.units() {`
			`Units::Bytes(us) => Units::Bytes(us.iter()),`
			`Units::Wide(us) => Units::Wide(us.iter()),`
			`};`
			`Iter { inner }`
			`}`

			`pub fn str_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {`
avm2: rewrite regexp module to not rely on AvmString::as_str This is a little tricky, because we have to map the utf8 indices returned by the regex engine to utf16 indices usable by Ruffle. To limit the impact on performance, the regex, the string we're currently matching on, and the last known (utf8, utf16) positions are cached, avoiding extra utf8 conversions in common use cases where a single string is repeatedly searched with increasing `lastIndex`. 2021-09-20 21:56:17 +00:00			`let utf8 = WStrToUtf8::new(s);`
			`f.write_str(utf8.head)?;`
			`std::char::decode_utf16(utf8.tail.iter())`
core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00			`.map(\|c\| c.unwrap_or(char::REPLACEMENT_CHARACTER))`
			`.try_for_each(\|c\| f.write_char(c))`
			`}`

			`pub fn str_debug_fmt(s: WStr<'_>, f: &mut fmt::Formatter<'_>) -> fmt::Result {`
			`f.write_char('"')?;`

			`for c in std::char::decode_utf16(s.iter()) {`
			`match c {`
			`Ok(c) => c.escape_debug().try_for_each(\|c\| f.write_char(c))?,`
			`Err(err) => write!(f, "\\u{{{:x}}}", err.unpaired_surrogate())?,`
			`}`
			`}`

			`f.write_char('"')`
			`}`

			`pub fn str_eq(left: WStr<'_>, right: WStr<'_>) -> bool {`
			`let (bytes, wide) = match (left.units(), right.units()) {`
			`(Units::Bytes(a), Units::Bytes(b)) => return a == b,`
			`(Units::Wide(a), Units::Wide(b)) => return a == b,`
			`(Units::Bytes(a), Units::Wide(b)) => (a, b),`
			`(Units::Wide(a), Units::Bytes(b)) => (b, a),`
			`};`

			`if bytes.len() != wide.len() {`
			`return false;`
			`}`

			`(0..bytes.len()).all(\|i\| {`
			`// SAFETY: Both slices have the same length.`
			`unsafe { bytes.get_unchecked(i) as u16 == wide.get_unchecked(i) }`
			`})`
			`}`

avm1: Requires a WStr<'_> to access PropertyMap 2021-09-16 19:10:51 +00:00			`pub fn str_eq_ignore_case(left: WStr<'_>, right: WStr<'_>) -> bool {`
			`let left = left.iter().map(utils::swf_to_lowercase);`
			`let right = right.iter().map(utils::swf_to_lowercase);`
			`left.eq(right)`
			`}`

core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00			`pub fn str_cmp(left: WStr<'_>, right: WStr<'_>) -> std::cmp::Ordering {`
			`let (bytes, wide, rev) = match (left.units(), right.units()) {`
			`(Units::Bytes(a), Units::Bytes(b)) => return a.cmp(b),`
			`(Units::Wide(a), Units::Wide(b)) => return a.cmp(b),`
			`(Units::Bytes(a), Units::Wide(b)) => (a, b, false),`
			`(Units::Wide(a), Units::Bytes(b)) => (b, a, true),`
			`};`

			`let bytes = bytes.iter().map(\|c\| *c as u16);`
			`let wide = wide.iter().copied();`
			`let cmp = bytes.cmp(wide);`
			`if rev {`
			`cmp.reverse()`
			`} else {`
			`cmp`
			`}`
			`}`

core: Replace string::utils *_ignore_case functions by methods on WStr 2021-09-16 23:53:17 +00:00			`pub fn str_cmp_ignore_case(left: WStr<'_>, right: WStr<'_>) -> std::cmp::Ordering {`
			`let left = left.iter().map(utils::swf_to_lowercase);`
			`let right = right.iter().map(utils::swf_to_lowercase);`
			`left.cmp(right)`
			`}`

core: Introduce `WStr` and `WStrMut` types, for representing Flash strings These types represent an UCS2 string (UTF-16 with unpaired surrogates). The string is stored either as a sequence of u8s (Bytes) or u16s (Wide); the type of string is tracked by setting the high bit of the length of its fat pointer. 2021-08-28 14:40:14 +00:00			`pub fn str_hash<H: Hasher>(s: WStr<'_>, state: &mut H) {`
			`match s.units() {`
			`Units::Bytes(us) => us.iter().for_each(\|u\| state.write_u16(u16::from(*u))),`
			`Units::Wide(us) => us.iter().for_each(\|u\| state.write_u16(*u)),`
			`}`
			`}`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00
core: use WStrings for display object's frame and scene labels 2021-10-02 01:11:10 +00:00			`fn map_latin1_chars(s: WStr<'_>, mut map: impl FnMut(u8) -> u8) -> WString {`
			`match s.units() {`
			`Units::Bytes(us) => {`
			`let us: Vec<u8> = us.iter().map(\|c\| map(*c)).collect();`
			`WString::from_buf(us)`
			`}`
			`Units::Wide(us) => {`
			`let us: Vec<u16> = us`
			`.iter()`
			`.map(\|c\| match u8::try_from(*c) {`
			`Ok(c) => map(c).into(),`
			`Err(_) => *c,`
			`})`
			`.collect();`
			`WString::from_buf(us)`
			`}`
			`}`
			`}`

			`pub fn str_to_ascii_lowercase(s: WStr<'_>) -> WString {`
			`map_latin1_chars(s, \|c\| c.to_ascii_lowercase())`
			`}`

core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`pub fn str_is_latin1(s: WStr<'_>) -> bool {`
			`match s.units() {`
			`Units::Bytes(_) => true,`
			`Units::Wide(us) => us.iter().all(\|c\| *c <= u16::from(u8::MAX)),`
			`}`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`
core: implement WStr::join 2021-10-02 22:28:32 +00:00
			`pub fn str_join<E: BorrowWStr>(elems: &[E], sep: WStr<'_>) -> WString {`
			`fn join_inner<T, E, F>(total_len: usize, elems: &[E], sep: WStr<'_>, mut extend: F) -> Vec<T>`
			`where`
			`E: BorrowWStr,`
			`F: FnMut(&mut Vec<T>, WStr<'_>),`
			`{`
			`let mut buf = Vec::with_capacity(total_len);`
			`extend(&mut buf, elems[0].borrow());`
			`for e in &elems[1..] {`
			`extend(&mut buf, sep);`
			`extend(&mut buf, e.borrow());`
			`}`
			`buf`
			`}`

			`if elems.is_empty() {`
			`return WString::default();`
			`}`

			`let (len, is_latin1) = elems.iter().fold(`
			`(sep.len() * elems.len().saturating_sub(1), sep.is_latin1()),`
			`\|(len, is_latin1), e\| {`
			`let e = e.borrow();`
			`(len + e.len(), is_latin1 && e.is_latin1())`
			`},`
			`);`

			`if is_latin1 {`
			`let buf = join_inner(len, elems, sep, \|buf: &mut Vec<u8>, e\| match e.units() {`
			`Units::Bytes(us) => buf.extend_from_slice(us),`
			`Units::Wide(us) => buf.extend(us.iter().map(\|c\| *c as u8)),`
			`});`
			`WString::from_buf(buf)`
			`} else {`
			`let buf = join_inner(len, elems, sep, \|buf: &mut Vec<u16>, e\| match e.units() {`
			`Units::Bytes(us) => buf.extend(us.iter().map(\|c\| *c as u16)),`
			`Units::Wide(us) => buf.extend_from_slice(us),`
			`});`
			`WString::from_buf(buf)`
			`}`
			`}`

core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`pub fn str_find<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {`
			`pattern`
			`.into_searcher(haystack)`
			`.next_match()`
			`.map(\|(start, _)\| start)`
			`}`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`pub fn str_rfind<'a, P: Pattern<'a>>(haystack: WStr<'a>, pattern: P) -> Option<usize> {`
			`pattern`
			`.into_searcher(haystack)`
			`.next_match_back()`
			`.map(\|(start, _)\| start)`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`

			`#[inline]`
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`pub fn str_split<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Split<'a, P> {`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`Split {`
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`string: Some(string),`
			`searcher: pattern.into_searcher(string),`
			`prev_end: 0,`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`
			`}`

avm2: use WString in QName methods 2021-10-03 00:33:19 +00:00			`pub fn str_rsplit_once<'a, P: Pattern<'a>>(`
			`string: WStr<'a>,`
			`pattern: P,`
			`) -> Option<(WStr<'a>, WStr<'a>)> {`
			`let (start, end) = pattern.into_searcher(string).next_match_back()?;`
			`Some((string.slice(..start), string.slice(end..)))`
			`}`

avm1: partially rewrite Value::primitive_as_number 2021-09-21 16:28:52 +00:00			`pub fn starts_with<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> bool {`
			`matches!(`
			`pattern.into_searcher(string).next(),`
			`SearchStep::Match(_, _)`
			`)`
			`}`

			`pub fn ends_with<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> bool {`
			`matches!(`
			`pattern.into_searcher(string).next_back(),`
			`SearchStep::Match(_, _)`
			`)`
			`}`

			`pub fn strip_prefix<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Option<WStr<'a>> {`
			`match pattern.into_searcher(string).next() {`
			`SearchStep::Match(_, end) => Some(string.slice(end..)),`
			`_ => None,`
			`}`
			`}`

			`pub fn strip_suffix<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> Option<WStr<'a>> {`
			`match pattern.into_searcher(string).next_back() {`
			`SearchStep::Match(start, _) => Some(string.slice(..start)),`
			`_ => None,`
			`}`
			`}`

			`pub fn str_trim_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {`
			`let mut i = 0;`
			`let mut j = 0;`
			`let mut searcher = pattern.into_searcher(string);`
			`if let Some((start, end)) = searcher.next_reject() {`
			`i = start;`
			`j = end;`
			`}`

			`if let Some((_, end)) = searcher.next_reject_back() {`
			`j = end;`
			`}`

			`string.slice(i..j)`
			`}`

			`pub fn str_trim_start_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {`
			`let mut i = string.len();`
			`let mut searcher = pattern.into_searcher(string);`
			`if let Some((start, _)) = searcher.next_reject() {`
			`i = start;`
			`}`

			`string.slice(i..)`
			`}`

			`pub fn str_trim_end_matches<'a, P: Pattern<'a>>(string: WStr<'a>, pattern: P) -> WStr<'a> {`
			`let mut i = 0;`
			`let mut searcher = pattern.into_searcher(string);`
			`if let Some((_, end)) = searcher.next_reject_back() {`
			`i = end;`
			`}`

			`string.slice(..i)`
			`}`

core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`pub struct Split<'a, P: Pattern<'a>> {`
			`string: Option<WStr<'a>>,`
			`searcher: P::Searcher,`
			`prev_end: usize,`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`

core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`type Item = WStr<'a>;`

			`fn next(&mut self) -> Option<Self::Item> {`
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`let string = self.string?;`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`match self.searcher.next_match() {`
			`Some((start, end)) => {`
			`let end = std::mem::replace(&mut self.prev_end, end);`
			`Some(string.slice(end..start))`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`
			`None => {`
core: Add `string::Pattern` trait, based on `std::str::Pattern` This allows `Str::{find, rfind, split}` to accept multiple types 2021-09-17 19:12:11 +00:00			`self.string = None;`
			`Some(string.slice(self.prev_end..))`
core: Reimplement AVM string methods using our UCS2 API This generally means that methods are more efficient, as we don't need to encode to UTF16 on-the-fly to have correct indices. This also fix some bugs: - charCode now properly handle surrogate pairs - calling lastIndexOf with the empty pattern and an OoB index now properly returns the string length Still missing is AVM2's String.match 2021-09-15 09:59:43 +00:00			`}`
			`}`
			`}`
			`}`
avm2: rewrite regexp module to not rely on AvmString::as_str This is a little tricky, because we have to map the utf8 indices returned by the regex engine to utf16 indices usable by Ruffle. To limit the impact on performance, the regex, the string we're currently matching on, and the last known (utf8, utf16) positions are cached, avoiding extra utf8 conversions in common use cases where a single string is repeatedly searched with increasing `lastIndex`. 2021-09-20 21:56:17 +00:00
			/// A struct for converting a `WStr<'_>` to an UTF8 `String`.
			`pub struct WStrToUtf8<'a> {`
			`head: &'a str,`
			`tail: WStr<'a>,`
			`}`

			`impl<'a> WStrToUtf8<'a> {`
			`pub fn new(s: WStr<'a>) -> Self {`
			`let (head, tail) = match s.units() {`
			`Units::Bytes(b) => {`
			`let (head, tail) = utils::split_ascii_prefix_bytes(b);`
			`(head, WStr::from_units(tail))`
			`}`
			`Units::Wide(_) => ("", s),`
			`};`

			`Self { head, tail }`
			`}`

			`pub fn to_utf8_lossy(&self) -> Cow<'a, str> {`
			`if self.tail.is_empty() {`
			`Cow::Borrowed(self.head)`
			`} else {`
			`let mut out = String::with_capacity(self.head.len() + self.tail.len());`
			`out.push_str(self.head);`
			`write!(out, "{}", self.tail).unwrap();`
			`Cow::Owned(out)`
			`}`
			`}`

			`#[inline]`
			`pub fn prefix(&self) -> &str {`
			`self.head`
			`}`
			`}`