str_indices/
utf16.rs

1//! Index by utf16 code units.
2
3use crate::byte_chunk::{ByteChunk, Chunk};
4
5/// Counts the utf16 code units that would be in a string slice if it
6/// were encoded as utf16.
7///
8/// Runs in O(N) time.
9#[inline]
10pub fn count(text: &str) -> usize {
11    crate::chars::count_impl::<Chunk>(text.as_bytes())
12        + count_surrogates_impl::<Chunk>(text.as_bytes())
13}
14
15/// Counts the utf16 surrogate pairs that would be in a string slice if
16/// it were encoded as utf16.
17///
18/// Runs in O(N) time.
19#[inline]
20pub fn count_surrogates(text: &str) -> usize {
21    count_surrogates_impl::<Chunk>(text.as_bytes())
22}
23
24/// Converts from byte-index to utf16-code-unit-index in a string slice.
25///
26/// If the byte is in the middle of a multi-byte char, returns the utf16
27/// index of the char that the byte belongs to.
28///
29/// Any past-the-end index will return the one-past-the-end utf16 index.
30///
31/// Runs in O(N) time.
32#[inline]
33pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
34    let mut i = byte_idx.min(text.len());
35    while !text.is_char_boundary(i) {
36        i -= 1;
37    }
38    let slice = &text.as_bytes()[..i];
39    crate::chars::count_impl::<Chunk>(slice) + count_surrogates_impl::<Chunk>(slice)
40}
41
42/// Converts from utf16-code-unit-index to byte-index in a string slice.
43///
44/// If the utf16 index is in the middle of a char, returns the bytes
45/// index of the char that utf16 code unit belongs to.
46///
47/// Any past-the-end index will return the one-past-the-end byte index.
48///
49/// Runs in O(N) time.
50#[inline]
51pub fn to_byte_idx(text: &str, utf16_idx: usize) -> usize {
52    to_byte_idx_impl::<Chunk>(text, utf16_idx)
53}
54
55//-------------------------------------------------------------
56
57#[inline(always)]
58fn to_byte_idx_impl<T: ByteChunk>(text: &str, utf16_idx: usize) -> usize {
59    // Get `middle` so we can do more efficient chunk-based counting.
60    // We can't use this to get `end`, however, because the start index of
61    // `end` actually depends on the accumulating char counts during the
62    // counting process.
63    let (start, middle, _) = unsafe { text.as_bytes().align_to::<T>() };
64
65    let mut byte_count = 0;
66    let mut utf16_count = 0;
67
68    // Take care of any unaligned bytes at the beginning.
69    for byte in start.iter() {
70        utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize;
71        if utf16_count > utf16_idx {
72            break;
73        }
74        byte_count += 1;
75    }
76
77    // Process chunks in the fast path.
78    let mut chunks = middle;
79    let mut max_round_len = utf16_idx.saturating_sub(utf16_count) / T::MAX_ACC;
80    while max_round_len > 0 && !chunks.is_empty() {
81        // Choose the largest number of chunks we can do this round
82        // that will neither overflow `max_acc` nor blast past the
83        // utf16 code unit we're looking for.
84        let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len());
85        max_round_len -= round_len;
86        let round = &chunks[..round_len];
87        chunks = &chunks[round_len..];
88
89        // Process the chunks in this round.
90        let mut acc_inv_chars = T::zero();
91        let mut acc_surrogates = T::zero();
92        for chunk in round.iter() {
93            acc_inv_chars = acc_inv_chars.add(chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80));
94            acc_surrogates = acc_surrogates.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0));
95        }
96        utf16_count +=
97            ((T::SIZE * round_len) - acc_inv_chars.sum_bytes()) + acc_surrogates.sum_bytes();
98        byte_count += T::SIZE * round_len;
99    }
100
101    // Process chunks in the slow path.
102    for chunk in chunks.iter() {
103        let inv_chars = chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80).sum_bytes();
104        let surrogates = chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0).sum_bytes();
105        let new_utf16_count = utf16_count + (T::SIZE - inv_chars) + surrogates;
106        if new_utf16_count >= utf16_idx {
107            break;
108        }
109        utf16_count = new_utf16_count;
110        byte_count += T::SIZE;
111    }
112
113    // Take care of any unaligned bytes at the end.
114    let end = &text.as_bytes()[byte_count..];
115    for byte in end.iter() {
116        utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize;
117        if utf16_count > utf16_idx {
118            break;
119        }
120        byte_count += 1;
121    }
122
123    byte_count
124}
125
126#[inline(always)]
127fn count_surrogates_impl<T: ByteChunk>(text: &[u8]) -> usize {
128    // We chop off the last three bytes, because all surrogate pairs are
129    // four bytes in utf8, and so it prevents counting partial
130    // characters.
131    if text.len() <= 3 {
132        return 0;
133    }
134    let text = &text[..(text.len() - 3)];
135
136    // Get `middle` for more efficient chunk-based counting.
137    let (start, middle, end) = unsafe { text.align_to::<T>() };
138
139    let mut utf16_surrogate_count = 0;
140
141    // Take care of unaligned bytes at the beginning.
142    for byte in start.iter() {
143        utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
144    }
145
146    // Take care of the middle bytes in big chunks.
147    for chunks in middle.chunks(T::MAX_ACC) {
148        let mut acc = T::zero();
149        for chunk in chunks.iter() {
150            acc = acc.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0));
151        }
152        utf16_surrogate_count += acc.sum_bytes();
153    }
154
155    // Take care of unaligned bytes at the end.
156    for byte in end.iter() {
157        utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
158    }
159
160    utf16_surrogate_count
161}
162
163//=============================================================
164
165#[cfg(test)]
166mod tests {
167    use super::*;
168
169    // 45 bytes, 27 utf16 code units.
170    const TEXT: &str = "Hel🐸lo world! こん🐸にち🐸🐸は!";
171
172    #[test]
173    fn count_01() {
174        assert_eq!(27, count(TEXT));
175    }
176
177    #[test]
178    fn count_surrogates_01() {
179        assert_eq!(4, count_surrogates(TEXT));
180    }
181
182    #[test]
183    fn from_byte_idx_01() {
184        assert_eq!(0, from_byte_idx(TEXT, 0));
185
186        assert_eq!(3, from_byte_idx(TEXT, 3));
187        assert_eq!(3, from_byte_idx(TEXT, 4));
188        assert_eq!(3, from_byte_idx(TEXT, 5));
189        assert_eq!(3, from_byte_idx(TEXT, 6));
190        assert_eq!(5, from_byte_idx(TEXT, 7));
191
192        assert_eq!(7, from_byte_idx(TEXT, 9));
193
194        assert_eq!(17, from_byte_idx(TEXT, 23));
195        assert_eq!(17, from_byte_idx(TEXT, 24));
196        assert_eq!(17, from_byte_idx(TEXT, 25));
197        assert_eq!(17, from_byte_idx(TEXT, 26));
198        assert_eq!(19, from_byte_idx(TEXT, 27));
199
200        assert_eq!(21, from_byte_idx(TEXT, 33));
201        assert_eq!(21, from_byte_idx(TEXT, 34));
202        assert_eq!(21, from_byte_idx(TEXT, 35));
203        assert_eq!(21, from_byte_idx(TEXT, 36));
204        assert_eq!(23, from_byte_idx(TEXT, 37));
205        assert_eq!(23, from_byte_idx(TEXT, 38));
206        assert_eq!(23, from_byte_idx(TEXT, 39));
207        assert_eq!(23, from_byte_idx(TEXT, 40));
208        assert_eq!(25, from_byte_idx(TEXT, 41));
209
210        assert_eq!(27, from_byte_idx(TEXT, 45));
211        assert_eq!(27, from_byte_idx(TEXT, 46)); // Index 1 past the end.
212    }
213
214    #[test]
215    fn to_byte_idx_01() {
216        assert_eq!(to_byte_idx(TEXT, 0), 0);
217
218        assert_eq!(3, to_byte_idx(TEXT, 3));
219        assert_eq!(3, to_byte_idx(TEXT, 4));
220        assert_eq!(7, to_byte_idx(TEXT, 5));
221
222        assert_eq!(9, to_byte_idx(TEXT, 7));
223
224        assert_eq!(23, to_byte_idx(TEXT, 17));
225        assert_eq!(23, to_byte_idx(TEXT, 18));
226        assert_eq!(27, to_byte_idx(TEXT, 19));
227
228        assert_eq!(33, to_byte_idx(TEXT, 21));
229        assert_eq!(33, to_byte_idx(TEXT, 22));
230        assert_eq!(37, to_byte_idx(TEXT, 23));
231        assert_eq!(37, to_byte_idx(TEXT, 24));
232        assert_eq!(41, to_byte_idx(TEXT, 25));
233
234        assert_eq!(45, to_byte_idx(TEXT, 27));
235        assert_eq!(45, to_byte_idx(TEXT, 27)); // Index 1 past the end.
236    }
237}