From b39919951bf08af16812e078d902abc6cebca5f8 Mon Sep 17 00:00:00 2001 From: Kamil Jarosz Date: Tue, 30 Jan 2024 02:43:44 +0100 Subject: [PATCH] wstr: Add UTF-8/UTF-16 index mapping Methods `utf8_index` and `utf16_index` from `WStrToUtf8` may be used to map code unit indices between UTF-8 (str) and UTF-16 (WStr) strings. --- core/src/avm2/regexp.rs | 1 + wstr/src/common.rs | 2 +- wstr/src/ops.rs | 41 ++++++++++++ wstr/src/tests.rs | 137 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 1 deletion(-) diff --git a/core/src/avm2/regexp.rs b/core/src/avm2/regexp.rs index 75f8f7d33..4e08a19b6 100644 --- a/core/src/avm2/regexp.rs +++ b/core/src/avm2/regexp.rs @@ -373,6 +373,7 @@ struct CachedText<'gc> { // Cached values of the last `{utf8, utf16}_index` call, // to avoid unnecessary recomputation when calling these methods // with increasing indices. + // TODO WStrToUtf8 implements UTF-8/UTF-16 index mapping, merge it if possible cur_utf8_index: usize, cur_utf16_index: usize, } diff --git a/wstr/src/common.rs b/wstr/src/common.rs index f2c9e9aab..f3385ca6e 100644 --- a/wstr/src/common.rs +++ b/wstr/src/common.rs @@ -320,7 +320,7 @@ impl WStr { /// Returns `true` is the string contains only LATIN1 characters. /// - /// Note that this doesn't necessarily means that `self.is_wide()` is `false`. + /// Note that this doesn't necessarily mean that `self.is_wide()` is `false`. #[inline] pub fn is_latin1(&self) -> bool { super::ops::str_is_latin1(self) diff --git a/wstr/src/ops.rs b/wstr/src/ops.rs index 9facb19d9..7c36e1a3c 100644 --- a/wstr/src/ops.rs +++ b/wstr/src/ops.rs @@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> { } } + /// Map the given UTF-16 code unit index to its corresponding UTF-8 code unit index. + pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> { + self.translate_index(utf16_index, false) + .map(|(utf8_index, _)| utf8_index) + } + + /// Map the given UTF-8 code unit index to its corresponding UTF-16 code unit index.
+ pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> { + self.translate_index(utf8_index, true) + .map(|(_, utf16_index)| utf16_index) + } + + fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> { + let ascii_prefix_len = self.head.len(); + if index <= ascii_prefix_len { + return Some((index, index)); + } + + if self.tail.is_empty() { + return None; + } + + let mut utf8_tail_pos = 0; + let mut utf16_tail_pos = 0; + + while if is_utf8 { + utf8_tail_pos + ascii_prefix_len < index + } else { + utf16_tail_pos + ascii_prefix_len < index + } { + let c = self.tail[utf16_tail_pos..].chars().next()?.ok()?; + utf8_tail_pos += c.len_utf8(); + utf16_tail_pos += c.len_utf16(); + } + + Some(( + ascii_prefix_len + utf8_tail_pos, + ascii_prefix_len + utf16_tail_pos, + )) + } + #[inline] pub fn prefix(&self) -> &str { self.head diff --git a/wstr/src/tests.rs b/wstr/src/tests.rs index 213b48077..78e99d877 100644 --- a/wstr/src/tests.rs +++ b/wstr/src/tests.rs @@ -214,3 +214,140 @@ fn split_ascii_prefix() { assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], "")); assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg")); } + +#[test] +fn char_boundary() { + // bytes + let bytes = bstr!(b"abcdefgh"); + assert_eq!(utils::next_char_boundary(bytes, 8), 8); + assert_eq!(utils::prev_char_boundary(bytes, 8), 7); + assert_eq!(utils::next_char_boundary(bytes, 7), 8); + assert_eq!(utils::prev_char_boundary(bytes, 4), 3); + assert_eq!(utils::next_char_boundary(bytes, 3), 4); + assert_eq!(utils::prev_char_boundary(bytes, 1), 0); + assert_eq!(utils::next_char_boundary(bytes, 0), 1); + assert_eq!(utils::prev_char_boundary(bytes, 0), 0); + + // wide + let wide = wstr!('↓''↑''a''b''c'); + assert_eq!(utils::next_char_boundary(wide, 5), 5); + assert_eq!(utils::prev_char_boundary(wide, 5), 4); + assert_eq!(utils::next_char_boundary(wide, 4), 5); + assert_eq!(utils::prev_char_boundary(wide, 3), 2); + assert_eq!(utils::next_char_boundary(wide, 2), 
3); + assert_eq!(utils::prev_char_boundary(wide, 1), 0); + assert_eq!(utils::next_char_boundary(wide, 0), 1); + assert_eq!(utils::prev_char_boundary(wide, 0), 0); + + // surrogate pairs + #[rustfmt::skip] + let sp = WStr::from_units(&[ + '↓' as u16, + 0xd83d, 0xdf01, // 🜁 + 'a' as u16, + 0xd83d, 0xdf03, // 🜃 + '↓' as u16, + ]); + assert_eq!(utils::next_char_boundary(sp, 7), 7); + assert_eq!(utils::prev_char_boundary(sp, 7), 6); + assert_eq!(utils::next_char_boundary(sp, 6), 7); + assert_eq!(utils::prev_char_boundary(sp, 6), 4); + assert_eq!(utils::next_char_boundary(sp, 4), 6); + assert_eq!(utils::prev_char_boundary(sp, 4), 3); + assert_eq!(utils::next_char_boundary(sp, 3), 4); + assert_eq!(utils::prev_char_boundary(sp, 3), 1); + assert_eq!(utils::next_char_boundary(sp, 1), 3); + assert_eq!(utils::prev_char_boundary(sp, 1), 0); + assert_eq!(utils::next_char_boundary(sp, 0), 1); + assert_eq!(utils::prev_char_boundary(sp, 0), 0); +} + +#[test] +fn utf8_index_mapping() { + #[rustfmt::skip] + let utf16 = WStr::from_units(&[ + 'a' as u16, + 'b' as u16, + 'c' as u16, + '↓' as u16, + 'a' as u16, + 'b' as u16, + 0xd83d, 0xdf01, // 🜁 + 'a' as u16, + 'ł' as u16, + 0xd83d, 0xdf03, // 🜃 + '↓' as u16, + 'a' as u16, + 'b' as u16, + 'c' as u16, + ]); + + // utf16 indices + // a | b | c | ↓ | a | b | 🜁 | a | ł | 🜃 | ↓ | a | b | c + // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063 + // 0 | 1 | 2 | 3 | 4 | 5 | 6 7 | 8 | 9 | 10 11 | 12 | 13 | 14 | 15 + + // utf8 indices + // a | b | c | ↓ | a | b | 🜁 | a | ł | 🜃 | ↓ | a | b | c + // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63 + // 0 | 1 | 2 | 3 4 5 | 6 | 7 | 8 9 10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24 + + let to_utf8 = WStrToUtf8::new(utf16); + let utf8 = to_utf8.to_utf8_lossy(); + + assert_eq!(utf8, "abc↓ab🜁ał🜃↓abc"); + assert_eq!(utf8.len(), 25); + assert_eq!(utf16.len(), 16); + + 
assert_eq!(to_utf8.utf16_index(0), Some(0)); + assert_eq!(to_utf8.utf16_index(2), Some(2)); + assert_eq!(to_utf8.utf16_index(3), Some(3)); + assert_eq!(to_utf8.utf16_index(6), Some(4)); + assert_eq!(to_utf8.utf16_index(7), Some(5)); + assert_eq!(to_utf8.utf16_index(8), Some(6)); + assert_eq!(to_utf8.utf16_index(13), Some(9)); + assert_eq!(to_utf8.utf16_index(15), Some(10)); + assert_eq!(to_utf8.utf16_index(22), Some(13)); + assert_eq!(to_utf8.utf16_index(24), Some(15)); + + assert_eq!(to_utf8.utf8_index(0), Some(0)); + assert_eq!(to_utf8.utf8_index(2), Some(2)); + assert_eq!(to_utf8.utf8_index(3), Some(3)); + assert_eq!(to_utf8.utf8_index(4), Some(6)); + assert_eq!(to_utf8.utf8_index(5), Some(7)); + assert_eq!(to_utf8.utf8_index(6), Some(8)); + assert_eq!(to_utf8.utf8_index(9), Some(13)); + assert_eq!(to_utf8.utf8_index(10), Some(15)); + assert_eq!(to_utf8.utf8_index(13), Some(22)); + assert_eq!(to_utf8.utf8_index(15), Some(24)); + + // last (potential) position + assert_eq!(to_utf8.utf16_index(25), Some(16)); + assert_eq!(to_utf8.utf8_index(16), Some(25)); + + // out of bounds + assert_eq!(to_utf8.utf16_index(26), None); + assert_eq!(to_utf8.utf8_index(17), None); + + // indices outside of character boundary + assert_eq!(to_utf8.utf16_index(4), Some(4)); + assert_eq!(to_utf8.utf16_index(5), Some(4)); + assert_eq!(to_utf8.utf16_index(9), Some(8)); + assert_eq!(to_utf8.utf16_index(10), Some(8)); + assert_eq!(to_utf8.utf8_index(7), Some(12)); + assert_eq!(to_utf8.utf8_index(11), Some(19)); +} + +#[test] +fn utf8_index_mapping_empty() { + let utf16 = WStr::empty(); + + let to_utf8 = WStrToUtf8::new(utf16); + let utf8 = to_utf8.to_utf8_lossy(); + + assert_eq!(utf8.len(), 0); + assert_eq!(utf16.len(), 0); + + assert_eq!(to_utf8.utf16_index(0), Some(0)); + assert_eq!(to_utf8.utf16_index(1), None); +}