wstr: Add UTF-8/UTF-16 index mapping
Methods `utf8_index` and `utf16_index` from `WStrToUtf8` may be used to map code unit indices between UTF-8 (str) and UTF-16 (WStr) strings.
This commit is contained in:
parent
7121cd4095
commit
b39919951b
|
@ -373,6 +373,7 @@ struct CachedText<'gc> {
|
||||||
// Cached values of the last `{utf8, utf16}_index` call,
|
// Cached values of the last `{utf8, utf16}_index` call,
|
||||||
// to avoid unnecessary recomputation when calling these methods
|
// to avoid unnecessary recomputation when calling these methods
|
||||||
// with increasing indices.
|
// with increasing indices.
|
||||||
|
// TODO: `WStrToUtf8` also implements UTF-8/UTF-16 index mapping; merge the two implementations if possible.
|
||||||
cur_utf8_index: usize,
|
cur_utf8_index: usize,
|
||||||
cur_utf16_index: usize,
|
cur_utf16_index: usize,
|
||||||
}
|
}
|
||||||
|
|
|
@ -320,7 +320,7 @@ impl WStr {
|
||||||
|
|
||||||
/// Returns `true` if the string contains only LATIN1 characters.
|
/// Returns `true` if the string contains only LATIN1 characters.
|
||||||
///
|
///
|
||||||
/// Note that this doesn't necessarily means that `self.is_wide()` is `false`.
|
/// Note that this doesn't necessarily mean that `self.is_wide()` is `false`.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn is_latin1(&self) -> bool {
|
pub fn is_latin1(&self) -> bool {
|
||||||
super::ops::str_is_latin1(self)
|
super::ops::str_is_latin1(self)
|
||||||
|
|
|
@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Map the given UTF-16 code unit index to its corresponding UTF-8 code unit index.
|
||||||
|
pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> {
|
||||||
|
self.translate_index(utf16_index, false)
|
||||||
|
.map(|(utf8_index, _)| utf8_index)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map the given UTF-8 code unit index to its corresponding UTF-16 code unit index.
|
||||||
|
pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> {
|
||||||
|
self.translate_index(utf8_index, true)
|
||||||
|
.map(|(_, utf16_index)| utf16_index)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> {
|
||||||
|
let ascii_prefix_len = self.head.len();
|
||||||
|
if index <= ascii_prefix_len {
|
||||||
|
return Some((index, index));
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.tail.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut utf8_tail_pos = 0;
|
||||||
|
let mut utf16_tail_pos = 0;
|
||||||
|
|
||||||
|
while if is_utf8 {
|
||||||
|
utf8_tail_pos + ascii_prefix_len < index
|
||||||
|
} else {
|
||||||
|
utf16_tail_pos + ascii_prefix_len < index
|
||||||
|
} {
|
||||||
|
let c = self.tail[utf16_tail_pos..].chars().next()?.ok()?;
|
||||||
|
utf8_tail_pos += c.len_utf8();
|
||||||
|
utf16_tail_pos += c.len_utf16();
|
||||||
|
}
|
||||||
|
|
||||||
|
Some((
|
||||||
|
ascii_prefix_len + utf8_tail_pos,
|
||||||
|
ascii_prefix_len + utf16_tail_pos,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn prefix(&self) -> &str {
|
pub fn prefix(&self) -> &str {
|
||||||
self.head
|
self.head
|
||||||
|
|
|
@ -214,3 +214,140 @@ fn split_ascii_prefix() {
|
||||||
assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
|
assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
|
||||||
assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
|
assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks `next_char_boundary` and `prev_char_boundary` on a byte string,
/// a wide string, and a wide string containing surrogate pairs.
#[test]
fn char_boundary() {
    // Every table below lists `(index, expected boundary)` pairs.

    // bytes
    let bytes = bstr!(b"abcdefgh");
    for &(index, expected) in &[(8, 8), (7, 8), (3, 4), (0, 1)] {
        assert_eq!(utils::next_char_boundary(bytes, index), expected);
    }
    for &(index, expected) in &[(8, 7), (4, 3), (1, 0), (0, 0)] {
        assert_eq!(utils::prev_char_boundary(bytes, index), expected);
    }

    // wide
    let wide = wstr!('↓''↑''a''b''c');
    for &(index, expected) in &[(5, 5), (4, 5), (2, 3), (0, 1)] {
        assert_eq!(utils::next_char_boundary(wide, index), expected);
    }
    for &(index, expected) in &[(5, 4), (3, 2), (1, 0), (0, 0)] {
        assert_eq!(utils::prev_char_boundary(wide, index), expected);
    }

    // surrogate pairs: a boundary never falls between the two halves of a pair
    #[rustfmt::skip]
    let sp = WStr::from_units(&[
        '↓' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
    ]);
    for &(index, expected) in &[(7, 7), (6, 7), (4, 6), (3, 4), (1, 3), (0, 1)] {
        assert_eq!(utils::next_char_boundary(sp, index), expected);
    }
    for &(index, expected) in &[(7, 6), (6, 4), (4, 3), (3, 1), (1, 0), (0, 0)] {
        assert_eq!(utils::prev_char_boundary(sp, index), expected);
    }
}
|
||||||
|
|
||||||
|
/// Checks the UTF-8 <-> UTF-16 index mapping of `WStrToUtf8` on a string
/// mixing 1-, 2-, 3- and 4-byte characters.
#[test]
fn utf8_index_mapping() {
    #[rustfmt::skip]
    let utf16 = WStr::from_units(&[
        'a' as u16,
        'b' as u16,
        'c' as u16,
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        'ł' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
        'a' as u16,
        'b' as u16,
        'c' as u16,
    ]);

    // utf16 indices
    // a    | b    | c    | ↓    | a    | b    | 🜁        | a    | ł    | 🜃        | ↓    | a    | b    | c
    // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063
    // 0    | 1    | 2    | 3    | 4    | 5    | 6    7    | 8    | 9    | 10   11   | 12   | 13   | 14   | 15

    // utf8 indices
    // a  | b  | c  | ↓        | a  | b  | 🜁          | a  | ł     | 🜃          | ↓        | a  | b  | c
    // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63
    // 0  | 1  | 2  | 3  4  5  | 6  | 7  | 8  9  10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24

    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8, "abc↓ab🜁ał🜃↓abc");
    assert_eq!(utf8.len(), 25);
    assert_eq!(utf16.len(), 16);

    // `(utf8 index, utf16 index)` pairs sitting on character boundaries;
    // each pair must map in both directions. The final entry is the last
    // (potential) position, one past the end of the string.
    let on_boundary = [
        (0, 0),
        (2, 2),
        (3, 3),
        (6, 4),
        (7, 5),
        (8, 6),
        (13, 9),
        (15, 10),
        (22, 13),
        (24, 15),
        (25, 16),
    ];
    for &(utf8_pos, utf16_pos) in &on_boundary {
        assert_eq!(to_utf8.utf16_index(utf8_pos), Some(utf16_pos));
        assert_eq!(to_utf8.utf8_index(utf16_pos), Some(utf8_pos));
    }

    // out of bounds
    assert_eq!(to_utf8.utf16_index(26), None);
    assert_eq!(to_utf8.utf8_index(17), None);

    // indices outside of character boundary resolve to the next character,
    // listed as `(input index, expected output index)`
    for &(utf8_pos, expected) in &[(4, 4), (5, 4), (9, 8), (10, 8)] {
        assert_eq!(to_utf8.utf16_index(utf8_pos), Some(expected));
    }
    for &(utf16_pos, expected) in &[(7, 12), (11, 19)] {
        assert_eq!(to_utf8.utf8_index(utf16_pos), Some(expected));
    }
}
|
||||||
|
|
||||||
|
/// Checks the index mapping of `WStrToUtf8` on an empty string, in both
/// directions: index 0 is the only valid position, anything larger is out
/// of bounds.
#[test]
fn utf8_index_mapping_empty() {
    let utf16 = WStr::empty();

    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8.len(), 0);
    assert_eq!(utf16.len(), 0);

    // UTF-8 -> UTF-16
    assert_eq!(to_utf8.utf16_index(0), Some(0));
    assert_eq!(to_utf8.utf16_index(1), None);

    // UTF-16 -> UTF-8
    assert_eq!(to_utf8.utf8_index(0), Some(0));
    assert_eq!(to_utf8.utf8_index(1), None);
}
|
||||||
|
|
Loading…
Reference in New Issue