wstr: Add UTF-8/UTF-16 index mapping

Methods `utf8_index` and `utf16_index` from `WStrToUtf8` may be used to map code unit indices between UTF-8 (str) and UTF-16 (WStr) strings.
2024-01-30 02:43:44 +01:00 · 2024-01-30 02:43:44 +01:00 · b39919951b
parent 7121cd4095
commit b39919951b
4 changed files with 180 additions and 1 deletions
--- a/core/src/avm2/regexp.rs
+++ b/core/src/avm2/regexp.rs
@ -373,6 +373,7 @@ struct CachedText<'gc> {
    // Cached values of the last `{utf8, utf16}_index` call,
    // to avoid unnecessary recomputation when calling these methods
    // with increasing indices.
    // TODO WStrToUtf8 implements UTF-8/UTF-16 index mapping, merge it if possible
    cur_utf8_index: usize,
    cur_utf16_index: usize,
 }
--- a/wstr/src/common.rs
+++ b/wstr/src/common.rs
@ -320,7 +320,7 @@ impl WStr {
    /// Returns `true` if the string contains only LATIN1 characters.
    ///
-    /// Note that this doesn't necessarily means that `self.is_wide()` is `false`.
+    /// Note that this doesn't necessarily mean that `self.is_wide()` is `false`.
    #[inline]
    pub fn is_latin1(&self) -> bool {
        super::ops::str_is_latin1(self)
--- a/wstr/src/ops.rs
+++ b/wstr/src/ops.rs
@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> {
        }
    }
    /// Translates a UTF-16 code unit index into the matching UTF-8 code unit index.
    ///
    /// An index that falls inside a multi-unit character is mapped to the
    /// start of the following character. Returns `None` when the index cannot
    /// be mapped, e.g. when it points past the end of the string.
    pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> {
        // `false`: the index passed in counts UTF-16 code units.
        let (utf8_pos, _utf16_pos) = self.translate_index(utf16_index, false)?;
        Some(utf8_pos)
    }
    /// Translates a UTF-8 code unit index into the matching UTF-16 code unit index.
    ///
    /// An index that falls inside a multi-byte character is mapped to the
    /// start of the following character. Returns `None` when the index cannot
    /// be mapped, e.g. when it points past the end of the string.
    pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> {
        // `true`: the index passed in counts UTF-8 code units (bytes).
        let (_utf8_pos, utf16_pos) = self.translate_index(utf8_index, true)?;
        Some(utf16_pos)
    }
    /// Translates `index` into a `(utf8_index, utf16_index)` pair of code unit positions.
    ///
    /// `index` is interpreted as a UTF-8 code unit index when `is_utf8` is
    /// `true`, and as a UTF-16 code unit index otherwise.
    ///
    /// The tail is walked one character at a time, so an `index` that falls
    /// inside a multi-unit character yields the position of the following
    /// character boundary.
    ///
    /// Unpaired surrogates are counted as U+FFFD (one UTF-16 code unit, three
    /// UTF-8 bytes) instead of aborting the mapping.
    /// NOTE(review): this assumes `to_utf8_lossy` substitutes U+FFFD for
    /// unpaired surrogates, so that the returned UTF-8 indices line up with
    /// the lossy string -- confirm.
    ///
    /// Returns `None` when `index` lies past the end of the string.
    fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> {
        let ascii_prefix_len = self.head.len();
        if index <= ascii_prefix_len {
            // Within the prefix both encodings are treated as using exactly
            // one code unit per character, so the index maps to itself.
            return Some((index, index));
        }
        if self.tail.is_empty() {
            // Nothing follows the prefix, so the index is out of bounds.
            return None;
        }

        // Offset to reach inside the tail; cannot underflow because
        // `index > ascii_prefix_len` at this point.
        let target_tail_pos = index - ascii_prefix_len;
        let mut utf8_tail_pos = 0;
        let mut utf16_tail_pos = 0;
        while (if is_utf8 {
            utf8_tail_pos
        } else {
            utf16_tail_pos
        }) < target_tail_pos
        {
            // `next()?` bails out with `None` once the tail is exhausted,
            // i.e. when `index` is out of bounds.
            let c = self.tail[utf16_tail_pos..]
                .chars()
                .next()?
                .unwrap_or(char::REPLACEMENT_CHARACTER);
            utf8_tail_pos += c.len_utf8();
            utf16_tail_pos += c.len_utf16();
        }
        Some((
            ascii_prefix_len + utf8_tail_pos,
            ascii_prefix_len + utf16_tail_pos,
        ))
    }
    #[inline]
    pub fn prefix(&self) -> &str {
        self.head
--- a/wstr/src/tests.rs
+++ b/wstr/src/tests.rs
@ -214,3 +214,140 @@ fn split_ascii_prefix() {
    assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
    assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
 }
 #[test]
 fn char_boundary() {
    // Expands every `function(position) => expected` entry into one
    // `assert_eq!` on the `utils` function of that name, in the listed order.
    macro_rules! check_boundaries {
        ($text:expr; $($func:ident($pos:expr) => $expected:expr),+ $(,)?) => {
            $(assert_eq!(utils::$func($text, $pos), $expected);)+
        };
    }

    // Byte string: every code unit is a character of its own.
    let bytes = bstr!(b"abcdefgh");
    check_boundaries!(bytes;
        next_char_boundary(8) => 8,
        prev_char_boundary(8) => 7,
        next_char_boundary(7) => 8,
        prev_char_boundary(4) => 3,
        next_char_boundary(3) => 4,
        prev_char_boundary(1) => 0,
        next_char_boundary(0) => 1,
        prev_char_boundary(0) => 0,
    );

    // Wide string without surrogate pairs: still one code unit per character.
    let wide = wstr!('↓''↑''a''b''c');
    check_boundaries!(wide;
        next_char_boundary(5) => 5,
        prev_char_boundary(5) => 4,
        next_char_boundary(4) => 5,
        prev_char_boundary(3) => 2,
        next_char_boundary(2) => 3,
        prev_char_boundary(1) => 0,
        next_char_boundary(0) => 1,
        prev_char_boundary(0) => 0,
    );

    // Wide string with surrogate pairs: boundaries must step over both
    // halves of a pair at once.
    #[rustfmt::skip]
    let sp = WStr::from_units(&[
        '↓' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
    ]);
    check_boundaries!(sp;
        next_char_boundary(7) => 7,
        prev_char_boundary(7) => 6,
        next_char_boundary(6) => 7,
        prev_char_boundary(6) => 4,
        next_char_boundary(4) => 6,
        prev_char_boundary(4) => 3,
        next_char_boundary(3) => 4,
        prev_char_boundary(3) => 1,
        next_char_boundary(1) => 3,
        prev_char_boundary(1) => 0,
        next_char_boundary(0) => 1,
        prev_char_boundary(0) => 0,
    );
 }
 #[test]
 fn utf8_index_mapping() {
    // Mixes ASCII, 2- and 3-byte BMP characters and surrogate pairs so that
    // UTF-8 and UTF-16 positions drift apart after the ASCII prefix.
    #[rustfmt::skip]
    let wide = WStr::from_units(&[
        'a' as u16,
        'b' as u16,
        'c' as u16,
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        'ł' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        'c' as u16,
    ]);
    // utf16 indices
    // a    | b    | c    | ↓    | a    | b    | 🜁         | a    | ł    | 🜃         | ↓    | a    | b    | c
    // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063
    // 0    | 1    | 2    | 3    | 4    | 5    | 6    7    | 8    | 9    | 10   11   | 12   | 13   | 14   | 15
    // utf8 indices
    // a  | b  | c  | ↓        | a  | b  | 🜁           | a  | ł     | 🜃           | ↓        | a  | b  | c
    // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63
    // 0  | 1  | 2  | 3  4  5  | 6  | 7  | 8  9  10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24
    let mapper = WStrToUtf8::new(wide);
    let narrow = mapper.to_utf8_lossy();
    assert_eq!(narrow, "abc↓ab🜁ał🜃↓abc");
    assert_eq!(narrow.len(), 25);
    assert_eq!(wide.len(), 16);

    // `(utf16 index, utf8 index)` of a selection of character starts.
    let char_starts: [(usize, usize); 10] = [
        (0, 0),
        (2, 2),
        (3, 3),
        (4, 6),
        (5, 7),
        (6, 8),
        (9, 13),
        (10, 15),
        (13, 22),
        (15, 24),
    ];
    for &(utf16_pos, utf8_pos) in &char_starts {
        assert_eq!(mapper.utf16_index(utf8_pos), Some(utf16_pos));
    }
    for &(utf16_pos, utf8_pos) in &char_starts {
        assert_eq!(mapper.utf8_index(utf16_pos), Some(utf8_pos));
    }

    // One past the last code unit is still a valid position in both encodings.
    assert_eq!(mapper.utf16_index(25), Some(16));
    assert_eq!(mapper.utf8_index(16), Some(25));

    // Anything beyond that is out of bounds.
    assert_eq!(mapper.utf16_index(26), None);
    assert_eq!(mapper.utf8_index(17), None);

    // Indices inside a character map to the start of the next character.
    // `(utf8 index inside a character, expected utf16 index)`
    let inside_utf8: [(usize, usize); 4] = [(4, 4), (5, 4), (9, 8), (10, 8)];
    for &(utf8_pos, utf16_pos) in &inside_utf8 {
        assert_eq!(mapper.utf16_index(utf8_pos), Some(utf16_pos));
    }
    // `(utf16 index inside a surrogate pair, expected utf8 index)`
    let inside_utf16: [(usize, usize); 2] = [(7, 12), (11, 19)];
    for &(utf16_pos, utf8_pos) in &inside_utf16 {
        assert_eq!(mapper.utf8_index(utf16_pos), Some(utf8_pos));
    }
 }
 #[test]
 fn utf8_index_mapping_empty() {
    // An empty string has a single mappable position: index 0.
    let wide = WStr::empty();
    let mapper = WStrToUtf8::new(wide);
    let narrow = mapper.to_utf8_lossy();
    assert_eq!(narrow.len(), 0);
    assert_eq!(wide.len(), 0);

    // `(utf8 index, expected utf16 index)`
    let cases: [(usize, Option<usize>); 2] = [(0, Some(0)), (1, None)];
    for &(utf8_pos, expected) in &cases {
        assert_eq!(mapper.utf16_index(utf8_pos), expected);
    }
 }