avm2: implement string.split for regex (#7363)

* avm2: implement string.split for regex

* Compressed the testing for regexp and unwrapping thereof

* * Moved the split logic into the regex object

  * Factored out a method for utf-16 matching

  * Added tests

* formatting

* * replaced manual counting with storage.length()

* clippy cleanup

* Address review comments

  * fix import path for WString
  * remove redundant variable in return statement
  * error passing via '?' instead of unwrap()
This commit is contained in:
Rafał Dowgird 2022-07-11 18:47:05 +02:00 committed by GitHub
parent ded77ab46a
commit 80d1a8449a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 148 additions and 43 deletions

View File

@ -408,20 +408,23 @@ fn split<'gc>(
.into(), .into(),
); );
} }
if delimiter
.as_object()
.map(|o| o.as_regexp().is_some())
.unwrap_or(false)
{
log::warn!("string.split(regex) - not implemented");
}
let this = Value::from(this).coerce_to_string(activation)?; let this = Value::from(this).coerce_to_string(activation)?;
let delimiter = delimiter.coerce_to_string(activation)?;
let limit = match args.get(1).unwrap_or(&Value::Undefined) { let limit = match args.get(1).unwrap_or(&Value::Undefined) {
Value::Undefined => usize::MAX, Value::Undefined => usize::MAX,
limit => limit.coerce_to_i32(activation)?.max(0) as usize, limit => limit.coerce_to_i32(activation)?.max(0) as usize,
}; };
if let Some(mut regexp) = delimiter
.as_object()
.as_ref()
.and_then(|o| o.as_regexp_mut(activation.context.gc_context))
{
return Ok(regexp.split(activation, this, limit)?.into());
}
let delimiter = delimiter.coerce_to_string(activation)?;
let storage = if delimiter.is_empty() { let storage = if delimiter.is_empty() {
// When using an empty delimiter, Str::split adds an extra beginning and trailing item, but Flash does not. // When using an empty delimiter, Str::split adds an extra beginning and trailing item, but Flash does not.
// e.g., split("foo", "") returns ["", "f", "o", "o", ""] in Rust but ["f", "o", "o"] in Flash. // e.g., split("foo", "") returns ["", "f", "o", "o", ""] in Rust but ["f", "o", "o"] in Flash.

View File

@ -2,6 +2,10 @@
use std::borrow::Cow; use std::borrow::Cow;
use crate::avm2::activation::Activation;
use crate::avm2::Error;
use crate::avm2::{ArrayObject, ArrayStorage, Object};
use crate::string::WString;
use crate::string::{AvmString, Units, WStrToUtf8}; use crate::string::{AvmString, Units, WStrToUtf8};
use bitflags::bitflags; use bitflags::bitflags;
use gc_arena::Collect; use gc_arena::Collect;
@ -144,10 +148,55 @@ impl<'gc> RegExp<'gc> {
} }
} }
pub fn exec(&mut self, text: AvmString<'gc>) -> Option<regress::Match> { pub fn split(
let global = self.flags.contains(RegExpFlags::GLOBAL); &mut self,
let start = if global { self.last_index } else { 0 }; activation: &mut Activation<'_, 'gc, '_>,
let re_match = self.find_utf8_match_at(text, start, |text, mut re_match| { text: AvmString<'gc>,
limit: usize,
) -> Result<Object<'gc>, Error> {
let mut storage = ArrayStorage::new(0);
// The empty regex is a special case which splits into characters.
if self.source.is_empty() {
let mut it = text.chars().take(limit);
while let Some(Ok(c)) = it.next() {
storage.push(
AvmString::new(activation.context.gc_context, WString::from_char(c)).into(),
);
}
return ArrayObject::from_storage(activation, storage);
}
let mut start = 0;
while let Some(m) = self.find_utf16_match(text, start) {
if m.range.end == start {
break;
}
storage.push(
AvmString::new(activation.context.gc_context, &text[start..m.range.start]).into(),
);
if storage.length() >= limit {
break;
}
for c in m.captures.iter().filter_map(Option::as_ref) {
storage.push(
AvmString::new(activation.context.gc_context, &text[c.start..c.end]).into(),
);
if storage.length() >= limit {
break; // Intentional bug to match Flash.
// Causes adding parts past limit.
}
}
start = m.range.end;
}
if storage.length() < limit {
storage.push(AvmString::new(activation.context.gc_context, &text[start..]).into());
}
ArrayObject::from_storage(activation, storage)
}
fn find_utf16_match(&mut self, text: AvmString<'gc>, start: usize) -> Option<regress::Match> {
self.find_utf8_match_at(text, start, |text, mut re_match| {
// Sort the capture endpoints by increasing index, so that CachedText::utf16_index is efficient. // Sort the capture endpoints by increasing index, so that CachedText::utf16_index is efficient.
let mut utf8_indices = re_match let mut utf8_indices = re_match
.captures .captures
@ -162,10 +211,13 @@ impl<'gc> RegExp<'gc> {
for i in utf8_indices { for i in utf8_indices {
*i = text.utf16_index(*i).unwrap(); *i = text.utf16_index(*i).unwrap();
} }
re_match re_match
})?; })
}
pub fn exec(&mut self, text: AvmString<'gc>) -> Option<regress::Match> {
let global = self.flags.contains(RegExpFlags::GLOBAL);
let start = if global { self.last_index } else { 0 };
let re_match = self.find_utf16_match(text, start)?;
if global { if global {
self.last_index = re_match.end(); self.last_index = re_match.end();
} }

View File

@ -0,0 +1,58 @@
package {
    import flash.display.MovieClip;

    /**
     * Regression test for String.split with string and RegExp delimiters.
     *
     * Every case traces a "// label" line followed by the result of the split,
     * so the traced lines can be compared one-to-one with the expected output file.
     * Do not reorder or reword the traces without regenerating that output.
     */
    public class Test extends MovieClip {
        public function Test() {
            // note: compiled manually with AIR SDK

            // --- String delimiters ---
            trace('// var text = "a.b.c";');
            var text = "a.b.c";

            trace('// text.split("a.b.c")');
            trace(text.split("a.b.c"));

            trace('// text.split(".")');
            trace(text.split("."));

            trace('// text.split("")');
            trace(text.split(""));

            trace('// text.split()');
            trace(text.split());

            // --- RegExp delimiters ---
            trace('// text.split(regex)');
            var regex = /b+/;
            trace("abbabc".split(regex));

            trace('// no match');
            trace("ccccc".split(/b/));

            trace('// match all');
            // Reuse the variable declared above instead of declaring `regex` a second time
            // in the same function scope.
            regex = /.*/;
            trace("cccc".split(regex));

            trace('// empty string, match all');
            trace("".split(/.*/));

            trace('// multibyte chars');
            trace("ąąbąą".split(/b/));

            // Captured groups are inserted into the result after each piece.
            trace('// Group expansion');
            trace("abba".split(/(b(b))/));

            trace('// Split on empty regex');
            trace("aął".split(/(?:)/));

            trace('// Split on non-empty regex with zero-length match');
            trace("aąbcde".split(/f*/));

            trace('// Limit');
            trace("aąbaababa".split(/b/, 3));

            trace('// Limit on group captures - flash returns 6 parts instead of 5');
            trace("aąbbaabbabbabbabbabba".split(/(b(b))/, 5));
        }
    }
}

View File

@ -7,4 +7,23 @@ a,b,c
a,.,b,.,c a,.,b,.,c
// text.split() // text.split()
a.b.c a.b.c
// text.split(regex) - unimplemented // text.split(regex)
a,a,c
// no match
ccccc
// match all
,
// empty string, match all
// multibyte chars
ąą,ąą
// Group expansion
a,bb,b,a
// Split on empty regex
aął
// Split on non-empty regex with zero-length match
aąbcde
// Limit
aą,aa,a
// Limit on group captures - flash returns 6 parts instead of 5
aą,bb,b,aa,bb,a

View File

@ -1,27 +0,0 @@
package {
import flash.display.MovieClip;
// Earlier version of the String.split test: it covers string delimiters only.
// Every case traces a "// label" line followed by the result of the split.
public class Test extends MovieClip {
public function Test() {
// note: compiled manually with AIR SDK
trace('// var text = "a.b.c";');
var text = "a.b.c";
// Delimiter equal to the whole string.
trace('// text.split("a.b.c")');
trace(text.split("a.b.c"));
// Single-character delimiter.
trace('// text.split(".")');
trace(text.split("."));
// Empty delimiter.
trace('// text.split("")');
trace(text.split(""));
// No delimiter argument at all.
trace('// text.split()');
trace(text.split());
// Placeholder label only: this version performs no regex split.
trace('// text.split(regex) - unimplemented');
}
}
}