wstr: Add UTF-8/UTF-16 index mapping

Methods `utf8_index` and `utf16_index` from `WStrToUtf8` may be used to map code unit indices between UTF-8 (str) and UTF-16 (WStr) strings.
2024-01-30 02:43:44 +01:00 · 2024-01-30 02:43:44 +01:00 · b39919951b
parent 7121cd4095
commit b39919951b
4 changed files with 180 additions and 1 deletions
--- a/core/src/avm2/regexp.rs
+++ b/core/src/avm2/regexp.rs
@ -373,6 +373,7 @@ struct CachedText<'gc> {
    // Cached values of the last `{utf8, utf16}_index` call,
    // to avoid unnecessary recomputation when calling these methods
    // with increasing indices.
+    // TODO WStrToUtf8 implements UTF-8/UTF-16 index mapping, merge it if possible
    cur_utf8_index: usize,
    cur_utf16_index: usize,
 }
--- a/wstr/src/common.rs
+++ b/wstr/src/common.rs
@ -320,7 +320,7 @@ impl WStr {

    /// Returns `true` if the string contains only LATIN1 characters.
    ///
-    /// Note that this doesn't necessarily means that `self.is_wide()` is `false`.
+    /// Note that this doesn't necessarily mean that `self.is_wide()` is `false`.
    #[inline]
    pub fn is_latin1(&self) -> bool {
        super::ops::str_is_latin1(self)
--- a/wstr/src/ops.rs
+++ b/wstr/src/ops.rs
@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> {
        }
    }

+    /// Translates a UTF-16 code unit index of the source string into the
+    /// corresponding UTF-8 code unit (byte) index of its UTF-8 form.
+    ///
+    /// An index that falls in the middle of a character is moved forward to the
+    /// start of the next character. Returns `None` when the index cannot be
+    /// translated, e.g. because `utf16_index` lies past the end of the string.
+    pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> {
+        let (utf8_index, _) = self.translate_index(utf16_index, false)?;
+        Some(utf8_index)
+    }
+
+    /// Translates a UTF-8 code unit (byte) index of the converted string into the
+    /// corresponding UTF-16 code unit index of the source string.
+    ///
+    /// An index that falls in the middle of a character is moved forward to the
+    /// start of the next character. Returns `None` when the index cannot be
+    /// translated, e.g. because `utf8_index` lies past the end of the string.
+    pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> {
+        let (_, utf16_index) = self.translate_index(utf8_index, true)?;
+        Some(utf16_index)
+    }
+
+    /// Returns the `(utf8_index, utf16_index)` pair of the first character
+    /// boundary located at or after `index`.
+    ///
+    /// `index` is interpreted as a UTF-8 code unit index when `is_utf8` is
+    /// `true`, and as a UTF-16 code unit index otherwise. Returns `None` when
+    /// `index` lies past the end of the string.
+    ///
+    /// An unpaired surrogate is counted as U+FFFD (one UTF-16 code unit, three
+    /// UTF-8 code units) instead of aborting the translation, so that indices
+    /// located after it can still be mapped.
+    /// NOTE(review): this assumes the lossy UTF-8 conversion substitutes U+FFFD
+    /// for unpaired surrogates; confirm against `to_utf8_lossy`.
+    fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> {
+        // Inside the ASCII prefix every character takes exactly one code unit
+        // in both encodings, so both indices are the same.
+        let ascii_prefix_len = self.head.len();
+        if index <= ascii_prefix_len {
+            return Some((index, index));
+        }
+
+        // Requested position, relative to the start of the tail.
+        let target = index - ascii_prefix_len;
+        let mut utf8_tail_pos = 0;
+        let mut utf16_tail_pos = 0;
+        let mut chars = self.tail.chars();
+
+        while (if is_utf8 { utf8_tail_pos } else { utf16_tail_pos }) < target {
+            // Running out of characters before reaching the target means that
+            // `index` is out of bounds (this also covers an empty tail).
+            let c = chars.next()?.unwrap_or(char::REPLACEMENT_CHARACTER);
+            utf8_tail_pos += c.len_utf8();
+            utf16_tail_pos += c.len_utf16();
+        }
+
+        Some((
+            ascii_prefix_len + utf8_tail_pos,
+            ascii_prefix_len + utf16_tail_pos,
+        ))
+    }
+
    #[inline]
    pub fn prefix(&self) -> &str {
        self.head
--- a/wstr/src/tests.rs
+++ b/wstr/src/tests.rs
@ -214,3 +214,140 @@ fn split_ascii_prefix() {
    assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
    assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
 }
+
+#[test]
+fn char_boundary() {
+    // Every table row below is `(index, expected boundary)`.
+
+    // Byte string: the boundaries asserted here are always one unit apart.
+    let bytes = bstr!(b"abcdefgh");
+    for &(index, boundary) in &[(8, 8), (7, 8), (3, 4), (0, 1)] {
+        assert_eq!(utils::next_char_boundary(bytes, index), boundary);
+    }
+    for &(index, boundary) in &[(8, 7), (4, 3), (1, 0), (0, 0)] {
+        assert_eq!(utils::prev_char_boundary(bytes, index), boundary);
+    }
+
+    // Wide string without surrogate pairs.
+    let wide = wstr!('↓''↑''a''b''c');
+    for &(index, boundary) in &[(5, 5), (4, 5), (2, 3), (0, 1)] {
+        assert_eq!(utils::next_char_boundary(wide, index), boundary);
+    }
+    for &(index, boundary) in &[(5, 4), (3, 2), (1, 0), (0, 0)] {
+        assert_eq!(utils::prev_char_boundary(wide, index), boundary);
+    }
+
+    // Wide string with surrogate pairs: a pair must be stepped over as a whole.
+    #[rustfmt::skip]
+    let sp = WStr::from_units(&[
+        '↓' as u16,
+        0xd83d, 0xdf01, // 🜁
+        'a' as u16,
+        0xd83d, 0xdf03, // 🜃
+        '↓' as u16,
+    ]);
+    for &(index, boundary) in &[(7, 7), (6, 7), (4, 6), (3, 4), (1, 3), (0, 1)] {
+        assert_eq!(utils::next_char_boundary(sp, index), boundary);
+    }
+    for &(index, boundary) in &[(7, 6), (6, 4), (4, 3), (3, 1), (1, 0), (0, 0)] {
+        assert_eq!(utils::prev_char_boundary(sp, index), boundary);
+    }
+}
+
+#[test]
+fn utf8_index_mapping() {
+    #[rustfmt::skip]
+    let utf16 = WStr::from_units(&[
+        'a' as u16,
+        'b' as u16,
+        'c' as u16,
+        '↓' as u16,
+        'a' as u16,
+        'b' as u16,
+        0xd83d, 0xdf01, // 🜁
+        'a' as u16,
+        'ł' as u16,
+        0xd83d, 0xdf03, // 🜃
+        '↓' as u16,
+        'a' as u16,
+        'b' as u16,
+        'c' as u16,
+    ]);
+
+    // Layout of the string in UTF-16 code units:
+    // a    | b    | c    | ↓    | a    | b    | 🜁         | a    | ł    | 🜃         | ↓    | a    | b    | c
+    // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063
+    // 0    | 1    | 2    | 3    | 4    | 5    | 6    7    | 8    | 9    | 10   11   | 12   | 13   | 14   | 15
+
+    // Layout of the same string in UTF-8 code units:
+    // a  | b  | c  | ↓        | a  | b  | 🜁           | a  | ł     | 🜃           | ↓        | a  | b  | c
+    // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63
+    // 0  | 1  | 2  | 3  4  5  | 6  | 7  | 8  9  10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24
+
+    let mapper = WStrToUtf8::new(utf16);
+    let utf8 = mapper.to_utf8_lossy();
+
+    assert_eq!(utf8, "abc↓ab🜁ał🜃↓abc");
+    assert_eq!(utf8.len(), 25);
+    assert_eq!(utf16.len(), 16);
+
+    // `(utf8 index, utf16 index)` pairs located on character boundaries; the
+    // final pair is the one-past-the-end position, which is still valid.
+    let boundaries = [
+        (0, 0),
+        (2, 2),
+        (3, 3),
+        (6, 4),
+        (7, 5),
+        (8, 6),
+        (13, 9),
+        (15, 10),
+        (22, 13),
+        (24, 15),
+        (25, 16),
+    ];
+    for &(utf8_pos, utf16_pos) in &boundaries {
+        assert_eq!(mapper.utf16_index(utf8_pos), Some(utf16_pos));
+    }
+    for &(utf8_pos, utf16_pos) in &boundaries {
+        assert_eq!(mapper.utf8_index(utf16_pos), Some(utf8_pos));
+    }
+
+    // Anything beyond the one-past-the-end position cannot be mapped.
+    assert_eq!(mapper.utf16_index(26), None);
+    assert_eq!(mapper.utf8_index(17), None);
+
+    // Indices pointing inside a character resolve to the next boundary.
+    for &(utf8_pos, utf16_pos) in &[(4, 4), (5, 4), (9, 8), (10, 8)] {
+        assert_eq!(mapper.utf16_index(utf8_pos), Some(utf16_pos));
+    }
+    for &(utf16_pos, utf8_pos) in &[(7, 12), (11, 19)] {
+        assert_eq!(mapper.utf8_index(utf16_pos), Some(utf8_pos));
+    }
+}
+
+#[test]
+fn utf8_index_mapping_empty() {
+    // An empty string has exactly one mappable position: index 0.
+    let empty = WStr::empty();
+    let mapper = WStrToUtf8::new(empty);
+
+    assert_eq!(mapper.to_utf8_lossy().len(), 0);
+    assert_eq!(empty.len(), 0);
+
+    assert_eq!(mapper.utf16_index(0), Some(0));
+    assert_eq!(mapper.utf16_index(1), None);
+}