wstr: Add UTF-8/UTF-16 index mapping

Methods `utf8_index` and `utf16_index` from `WStrToUtf8`
may be used to map code unit indices between UTF-8 (str)
and UTF-16 (WStr) strings.
This commit is contained in:
Kamil Jarosz 2024-01-30 02:43:44 +01:00 committed by Nathan Adams
parent 7121cd4095
commit b39919951b
4 changed files with 180 additions and 1 deletions

View File

@ -373,6 +373,7 @@ struct CachedText<'gc> {
// Cached values of the last `{utf8, utf16}_index` call,
// to avoid unnecessary recomputation when calling these methods
// with increasing indices.
// TODO WStrToUtf8 implements UTF-8/UTF-16 index mapping, merge it if possible
cur_utf8_index: usize,
cur_utf16_index: usize,
}

View File

@ -320,7 +320,7 @@ impl WStr {
/// Returns `true` if the string contains only LATIN1 characters.
///
/// Note that this doesn't necessarily means that `self.is_wide()` is `false`.
/// Note that this doesn't necessarily mean that `self.is_wide()` is `false`.
#[inline]
pub fn is_latin1(&self) -> bool {
super::ops::str_is_latin1(self)

View File

@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> {
}
}
/// Maps a UTF-16 code unit index to the corresponding UTF-8 code unit index.
///
/// An index pointing into the middle of a character resolves to the
/// position right after that character. Returns `None` when the index lies
/// past the end of the string, or when a character that has to be walked
/// over on the way to the index cannot be decoded.
pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> {
    let (utf8_index, _) = self.translate_index(utf16_index, false)?;
    Some(utf8_index)
}
/// Maps a UTF-8 code unit index to the corresponding UTF-16 code unit index.
///
/// An index pointing into the middle of a character resolves to the
/// position right after that character. Returns `None` when the index lies
/// past the end of the string, or when a character that has to be walked
/// over on the way to the index cannot be decoded.
pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> {
    let (_, utf16_index) = self.translate_index(utf8_index, true)?;
    Some(utf16_index)
}
/// Shared implementation of `utf8_index` and `utf16_index`.
///
/// `index` is interpreted as a UTF-8 code unit index when `is_utf8` is
/// `true`, and as a UTF-16 code unit index otherwise. On success the result
/// is the pair `(utf8_index, utf16_index)` describing the same position in
/// both encodings.
///
/// The tail is walked one whole character at a time, so an index that is not
/// on a character boundary ends up on the boundary that follows it. `None` is
/// returned when the index is past the end of the string or when decoding a
/// character fails before the index is reached.
fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> {
    let head_len = self.head.len();

    // Every character of the head takes exactly one code unit in both
    // encodings, so positions up to (and including) its end map to themselves.
    if index <= head_len {
        return Some((index, index));
    }

    // Nothing follows the head, hence the index is out of bounds.
    if self.tail.is_empty() {
        return None;
    }

    // Absolute positions (head included) of the same spot in both encodings.
    let (mut utf8_pos, mut utf16_pos) = (head_len, head_len);
    loop {
        // Track whichever encoding the caller's index refers to.
        let reached = if is_utf8 { utf8_pos } else { utf16_pos };
        if reached >= index {
            return Some((utf8_pos, utf16_pos));
        }

        // Running out of characters means the index is out of bounds;
        // a character that fails to decode aborts the translation as well.
        let c = self.tail[utf16_pos - head_len..].chars().next()?.ok()?;
        utf8_pos += c.len_utf8();
        utf16_pos += c.len_utf16();
    }
}
#[inline]
pub fn prefix(&self) -> &str {
self.head

View File

@ -214,3 +214,140 @@ fn split_ascii_prefix() {
assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
}
/// Checks `next_char_boundary` / `prev_char_boundary` on byte strings, wide
/// strings, and wide strings containing surrogate pairs.
///
/// Every table row is `(index, expected boundary)`.
#[test]
fn char_boundary() {
    // bytes
    let bytes = bstr!(b"abcdefgh");
    for (index, expected) in [(8, 8), (7, 8), (3, 4), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(bytes, index),
            expected,
            "next_char_boundary(bytes, {})",
            index
        );
    }
    for (index, expected) in [(8, 7), (4, 3), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(bytes, index),
            expected,
            "prev_char_boundary(bytes, {})",
            index
        );
    }

    // wide
    let wide = wstr!('↓''↑''a''b''c');
    for (index, expected) in [(5, 5), (4, 5), (2, 3), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(wide, index),
            expected,
            "next_char_boundary(wide, {})",
            index
        );
    }
    for (index, expected) in [(5, 4), (3, 2), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(wide, index),
            expected,
            "prev_char_boundary(wide, {})",
            index
        );
    }

    // surrogate pairs: a pair spans two code units and must never be split
    #[rustfmt::skip]
    let sp = WStr::from_units(&[
        '↓' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
    ]);
    for (index, expected) in [(7, 7), (6, 7), (4, 6), (3, 4), (1, 3), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(sp, index),
            expected,
            "next_char_boundary(sp, {})",
            index
        );
    }
    for (index, expected) in [(7, 6), (6, 4), (4, 3), (3, 1), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(sp, index),
            expected,
            "prev_char_boundary(sp, {})",
            index
        );
    }
}
/// Checks the UTF-8 <-> UTF-16 index mapping of `WStrToUtf8` on a string that
/// mixes 1-, 2-, 3- and 4-byte characters (the latter being surrogate pairs
/// in UTF-16).
#[test]
fn utf8_index_mapping() {
    #[rustfmt::skip]
    let utf16 = WStr::from_units(&[
        'a' as u16,
        'b' as u16,
        'c' as u16,
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        'ł' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        'c' as u16,
    ]);

    // utf16 indices
    // a    | b    | c    | ↓    | a    | b    | 🜁        | a    | ł    | 🜃        | ↓    | a    | b    | c
    // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063
    // 0    | 1    | 2    | 3    | 4    | 5    | 6    7    | 8    | 9    | 10   11   | 12   | 13   | 14   | 15
    //
    // utf8 indices
    // a  | b  | c  | ↓        | a  | b  | 🜁          | a  | ł     | 🜃          | ↓        | a  | b  | c
    // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63
    // 0  | 1  | 2  | 3  4  5  | 6  | 7  | 8  9  10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24

    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8, "abc↓ab🜁ał🜃↓abc");
    assert_eq!(utf8.len(), 25);
    assert_eq!(utf16.len(), 16);

    // Positions on character boundaries, as `(utf8 index, utf16 index)`.
    // On a boundary the mapping has to be the same in both directions.
    // The final row is the last (potential) position, one past the end.
    let boundaries = [
        (0, 0),
        (2, 2),
        (3, 3),
        (6, 4),
        (7, 5),
        (8, 6),
        (13, 9),
        (15, 10),
        (22, 13),
        (24, 15),
        (25, 16),
    ];
    for (utf8_pos, utf16_pos) in boundaries {
        assert_eq!(
            to_utf8.utf16_index(utf8_pos),
            Some(utf16_pos),
            "utf16_index({})",
            utf8_pos
        );
        assert_eq!(
            to_utf8.utf8_index(utf16_pos),
            Some(utf8_pos),
            "utf8_index({})",
            utf16_pos
        );
    }

    // out of bounds
    assert_eq!(to_utf8.utf16_index(26), None);
    assert_eq!(to_utf8.utf8_index(17), None);

    // Indices that are not on a character boundary resolve to the boundary
    // that follows them.
    for (utf8_pos, utf16_pos) in [(4, 4), (5, 4), (9, 8), (10, 8)] {
        assert_eq!(
            to_utf8.utf16_index(utf8_pos),
            Some(utf16_pos),
            "utf16_index({})",
            utf8_pos
        );
    }
    for (utf16_pos, utf8_pos) in [(7, 12), (11, 19)] {
        assert_eq!(
            to_utf8.utf8_index(utf16_pos),
            Some(utf8_pos),
            "utf8_index({})",
            utf16_pos
        );
    }
}
/// Checks the index mapping of `WStrToUtf8` for an empty string, in both
/// directions: position 0 is the only valid one, anything else is out of
/// bounds.
#[test]
fn utf8_index_mapping_empty() {
    let utf16 = WStr::empty();
    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8.len(), 0);
    assert_eq!(utf16.len(), 0);

    // UTF-8 -> UTF-16
    assert_eq!(to_utf8.utf16_index(0), Some(0));
    assert_eq!(to_utf8.utf16_index(1), None);

    // UTF-16 -> UTF-8
    assert_eq!(to_utf8.utf8_index(0), Some(0));
    assert_eq!(to_utf8.utf8_index(1), None);
}