wstr: Add UTF-8/UTF-16 index mapping
Methods `utf8_index` and `utf16_index` from `WStrToUtf8` may be used to map code unit indices between UTF-8 (str) and UTF-16 (WStr) strings.
This commit is contained in:
parent
7121cd4095
commit
b39919951b
|
@ -373,6 +373,7 @@ struct CachedText<'gc> {
|
|||
// Cached values of the last `{utf8, utf16}_index` call,
|
||||
// to avoid unnecessary recomputation when calling these methods
|
||||
// with increasing indices.
|
||||
// TODO WStrToUtf8 implements UTF-8/UTF-16 index mapping, merge it if possible
|
||||
cur_utf8_index: usize,
|
||||
cur_utf16_index: usize,
|
||||
}
|
||||
|
|
|
@ -320,7 +320,7 @@ impl WStr {
|
|||
|
||||
/// Returns `true` if the string contains only LATIN1 characters.
|
||||
///
|
||||
/// Note that this doesn't necessarily means that `self.is_wide()` is `false`.
|
||||
/// Note that this doesn't necessarily mean that `self.is_wide()` is `false`.
|
||||
#[inline]
|
||||
pub fn is_latin1(&self) -> bool {
|
||||
super::ops::str_is_latin1(self)
|
||||
|
|
|
@ -462,6 +462,47 @@ impl<'a> WStrToUtf8<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Map the given UTF-16 code unit index to its corresponding UTF-8 code unit index.
|
||||
pub fn utf8_index(&self, utf16_index: usize) -> Option<usize> {
|
||||
self.translate_index(utf16_index, false)
|
||||
.map(|(utf8_index, _)| utf8_index)
|
||||
}
|
||||
|
||||
/// Map the given UTF-8 code unit index to its corresponding UTF-16 code unit index.
|
||||
pub fn utf16_index(&self, utf8_index: usize) -> Option<usize> {
|
||||
self.translate_index(utf8_index, true)
|
||||
.map(|(_, utf16_index)| utf16_index)
|
||||
}
|
||||
|
||||
fn translate_index(&self, index: usize, is_utf8: bool) -> Option<(usize, usize)> {
|
||||
let ascii_prefix_len = self.head.len();
|
||||
if index <= ascii_prefix_len {
|
||||
return Some((index, index));
|
||||
}
|
||||
|
||||
if self.tail.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut utf8_tail_pos = 0;
|
||||
let mut utf16_tail_pos = 0;
|
||||
|
||||
while if is_utf8 {
|
||||
utf8_tail_pos + ascii_prefix_len < index
|
||||
} else {
|
||||
utf16_tail_pos + ascii_prefix_len < index
|
||||
} {
|
||||
let c = self.tail[utf16_tail_pos..].chars().next()?.ok()?;
|
||||
utf8_tail_pos += c.len_utf8();
|
||||
utf16_tail_pos += c.len_utf16();
|
||||
}
|
||||
|
||||
Some((
|
||||
ascii_prefix_len + utf8_tail_pos,
|
||||
ascii_prefix_len + utf16_tail_pos,
|
||||
))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn prefix(&self) -> &str {
|
||||
self.head
|
||||
|
|
|
@ -214,3 +214,140 @@ fn split_ascii_prefix() {
|
|||
assert_eq!(utils::split_ascii_prefix("abc"), (&b"abc"[..], ""));
|
||||
assert_eq!(utils::split_ascii_prefix("abcd€fg"), (&b"abcd"[..], "€fg"));
|
||||
}
|
||||
|
||||
/// Exercises `utils::next_char_boundary` and `utils::prev_char_boundary` on a
/// byte string, a wide string, and a wide string containing surrogate pairs.
///
/// Every table lists `(index, expected boundary)` pairs.
#[test]
fn char_boundary() {
    // bytes
    let bytes = bstr!(b"abcdefgh");
    for &(index, expected) in &[(8, 8), (7, 8), (3, 4), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(bytes, index),
            expected,
            "next_char_boundary(bytes, {})",
            index
        );
    }
    for &(index, expected) in &[(8, 7), (4, 3), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(bytes, index),
            expected,
            "prev_char_boundary(bytes, {})",
            index
        );
    }

    // wide
    let wide = wstr!('↓''↑''a''b''c');
    for &(index, expected) in &[(5, 5), (4, 5), (2, 3), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(wide, index),
            expected,
            "next_char_boundary(wide, {})",
            index
        );
    }
    for &(index, expected) in &[(5, 4), (3, 2), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(wide, index),
            expected,
            "prev_char_boundary(wide, {})",
            index
        );
    }

    // surrogate pairs: a boundary never falls between the two halves of a pair
    #[rustfmt::skip]
    let sp = WStr::from_units(&[
        '↓' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
    ]);
    for &(index, expected) in &[(7, 7), (6, 7), (4, 6), (3, 4), (1, 3), (0, 1)] {
        assert_eq!(
            utils::next_char_boundary(sp, index),
            expected,
            "next_char_boundary(sp, {})",
            index
        );
    }
    for &(index, expected) in &[(7, 6), (6, 4), (4, 3), (3, 1), (1, 0), (0, 0)] {
        assert_eq!(
            utils::prev_char_boundary(sp, index),
            expected,
            "prev_char_boundary(sp, {})",
            index
        );
    }
}
|
||||
|
||||
/// Checks `WStrToUtf8::utf8_index` / `WStrToUtf8::utf16_index` on a string that
/// mixes 1-, 2-, 3- and 4-byte UTF-8 characters (the last kind being surrogate
/// pairs in UTF-16).
#[test]
fn utf8_index_mapping() {
    #[rustfmt::skip]
    let utf16 = WStr::from_units(&[
        'a' as u16, 'b' as u16, 'c' as u16,
        '↓' as u16,
        'a' as u16, 'b' as u16,
        0xd83d, 0xdf01, // 🜁
        'a' as u16,
        'ł' as u16,
        0xd83d, 0xdf03, // 🜃
        '↓' as u16,
        'a' as u16, 'b' as u16, 'c' as u16,
    ]);

    // utf16 indices
    //    a |    b |    c |    ↓ |    a |    b |         🜁 |    a |    ł |         🜃 |    ↓ |    a |    b |    c
    // 0061 | 0062 | 0063 | 2193 | 0061 | 0062 | d83d df01 | 0061 | 0142 | d83d df03 | 2193 | 0061 | 0062 | 0063
    //    0 |    1 |    2 |    3 |    4 |    5 |    6    7 |    8 |    9 |   10   11 |   12 |   13 |   14 |   15

    // utf8 indices
    //  a |  b |  c |        ↓ |  a |  b |           🜁 |  a |     ł |           🜃 |        ↓ |  a |  b |  c
    // 61 | 62 | 63 | e2 86 93 | 61 | 62 | f0 9f 9c 81 | 61 | c5 82 | f0 9f 9c 83 | e2 86 93 | 61 | 62 | 63
    //  0 |  1 |  2 |  3  4  5 |  6 |  7 |  8  9 10 11 | 12 | 13 14 | 15 16 17 18 | 19 20 21 | 22 | 23 | 24

    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8, "abc↓ab🜁ał🜃↓abc");
    assert_eq!(utf8.len(), 25);
    assert_eq!(utf16.len(), 16);

    // `(utf16 index, utf8 index)` pairs that sit on character boundaries; each
    // pair must map in both directions. The final pair is the last (potential)
    // position, i.e. one past the final code unit.
    let on_boundary = [
        (0, 0),
        (2, 2),
        (3, 3),
        (4, 6),
        (5, 7),
        (6, 8),
        (9, 13),
        (10, 15),
        (13, 22),
        (15, 24),
        (16, 25),
    ];
    for &(utf16_pos, utf8_pos) in &on_boundary {
        assert_eq!(
            to_utf8.utf16_index(utf8_pos),
            Some(utf16_pos),
            "utf16_index({})",
            utf8_pos
        );
        assert_eq!(
            to_utf8.utf8_index(utf16_pos),
            Some(utf8_pos),
            "utf8_index({})",
            utf16_pos
        );
    }

    // out of bounds
    assert_eq!(to_utf8.utf16_index(26), None);
    assert_eq!(to_utf8.utf8_index(17), None);

    // indices outside of character boundary, as `(input index, expected output)`:
    // the lookup resolves to the position right after the straddled character
    for &(utf8_pos, expected) in &[(4, 4), (5, 4), (9, 8), (10, 8)] {
        assert_eq!(
            to_utf8.utf16_index(utf8_pos),
            Some(expected),
            "utf16_index({})",
            utf8_pos
        );
    }
    for &(utf16_pos, expected) in &[(7, 12), (11, 19)] {
        assert_eq!(
            to_utf8.utf8_index(utf16_pos),
            Some(expected),
            "utf8_index({})",
            utf16_pos
        );
    }
}
|
||||
|
||||
/// Checks the index mapping of an empty string: index 0 is the only valid
/// position in either encoding, and anything larger is out of bounds.
#[test]
fn utf8_index_mapping_empty() {
    let utf16 = WStr::empty();

    let to_utf8 = WStrToUtf8::new(utf16);
    let utf8 = to_utf8.to_utf8_lossy();

    assert_eq!(utf8.len(), 0);
    assert_eq!(utf16.len(), 0);

    // UTF-8 -> UTF-16
    assert_eq!(to_utf8.utf16_index(0), Some(0));
    assert_eq!(to_utf8.utf16_index(1), None);

    // UTF-16 -> UTF-8: the reverse direction must behave the same way
    assert_eq!(to_utf8.utf8_index(0), Some(0));
    assert_eq!(to_utf8.utf8_index(1), None);
}
|
||||
|
|
Loading…
Reference in New Issue