1use crate::byte_chunk::{ByteChunk, Chunk};
4
5#[inline]
10pub fn count(text: &str) -> usize {
11 crate::chars::count_impl::<Chunk>(text.as_bytes())
12 + count_surrogates_impl::<Chunk>(text.as_bytes())
13}
14
15#[inline]
20pub fn count_surrogates(text: &str) -> usize {
21 count_surrogates_impl::<Chunk>(text.as_bytes())
22}
23
24#[inline]
33pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
34 let mut i = byte_idx.min(text.len());
35 while !text.is_char_boundary(i) {
36 i -= 1;
37 }
38 let slice = &text.as_bytes()[..i];
39 crate::chars::count_impl::<Chunk>(slice) + count_surrogates_impl::<Chunk>(slice)
40}
41
42#[inline]
51pub fn to_byte_idx(text: &str, utf16_idx: usize) -> usize {
52 to_byte_idx_impl::<Chunk>(text, utf16_idx)
53}
54
55#[inline(always)]
58fn to_byte_idx_impl<T: ByteChunk>(text: &str, utf16_idx: usize) -> usize {
59 let (start, middle, _) = unsafe { text.as_bytes().align_to::<T>() };
64
65 let mut byte_count = 0;
66 let mut utf16_count = 0;
67
68 for byte in start.iter() {
70 utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize;
71 if utf16_count > utf16_idx {
72 break;
73 }
74 byte_count += 1;
75 }
76
77 let mut chunks = middle;
79 let mut max_round_len = utf16_idx.saturating_sub(utf16_count) / T::MAX_ACC;
80 while max_round_len > 0 && !chunks.is_empty() {
81 let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len());
85 max_round_len -= round_len;
86 let round = &chunks[..round_len];
87 chunks = &chunks[round_len..];
88
89 let mut acc_inv_chars = T::zero();
91 let mut acc_surrogates = T::zero();
92 for chunk in round.iter() {
93 acc_inv_chars = acc_inv_chars.add(chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80));
94 acc_surrogates = acc_surrogates.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0));
95 }
96 utf16_count +=
97 ((T::SIZE * round_len) - acc_inv_chars.sum_bytes()) + acc_surrogates.sum_bytes();
98 byte_count += T::SIZE * round_len;
99 }
100
101 for chunk in chunks.iter() {
103 let inv_chars = chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80).sum_bytes();
104 let surrogates = chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0).sum_bytes();
105 let new_utf16_count = utf16_count + (T::SIZE - inv_chars) + surrogates;
106 if new_utf16_count >= utf16_idx {
107 break;
108 }
109 utf16_count = new_utf16_count;
110 byte_count += T::SIZE;
111 }
112
113 let end = &text.as_bytes()[byte_count..];
115 for byte in end.iter() {
116 utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize;
117 if utf16_count > utf16_idx {
118 break;
119 }
120 byte_count += 1;
121 }
122
123 byte_count
124}
125
126#[inline(always)]
127fn count_surrogates_impl<T: ByteChunk>(text: &[u8]) -> usize {
128 if text.len() <= 3 {
132 return 0;
133 }
134 let text = &text[..(text.len() - 3)];
135
136 let (start, middle, end) = unsafe { text.align_to::<T>() };
138
139 let mut utf16_surrogate_count = 0;
140
141 for byte in start.iter() {
143 utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
144 }
145
146 for chunks in middle.chunks(T::MAX_ACC) {
148 let mut acc = T::zero();
149 for chunk in chunks.iter() {
150 acc = acc.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0));
151 }
152 utf16_surrogate_count += acc.sum_bytes();
153 }
154
155 for byte in end.iter() {
157 utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize;
158 }
159
160 utf16_surrogate_count
161}
162
163#[cfg(test)]
166mod tests {
167 use super::*;
168
169 const TEXT: &str = "Hel🐸lo world! こん🐸にち🐸🐸は!";
171
172 #[test]
173 fn count_01() {
174 assert_eq!(27, count(TEXT));
175 }
176
177 #[test]
178 fn count_surrogates_01() {
179 assert_eq!(4, count_surrogates(TEXT));
180 }
181
182 #[test]
183 fn from_byte_idx_01() {
184 assert_eq!(0, from_byte_idx(TEXT, 0));
185
186 assert_eq!(3, from_byte_idx(TEXT, 3));
187 assert_eq!(3, from_byte_idx(TEXT, 4));
188 assert_eq!(3, from_byte_idx(TEXT, 5));
189 assert_eq!(3, from_byte_idx(TEXT, 6));
190 assert_eq!(5, from_byte_idx(TEXT, 7));
191
192 assert_eq!(7, from_byte_idx(TEXT, 9));
193
194 assert_eq!(17, from_byte_idx(TEXT, 23));
195 assert_eq!(17, from_byte_idx(TEXT, 24));
196 assert_eq!(17, from_byte_idx(TEXT, 25));
197 assert_eq!(17, from_byte_idx(TEXT, 26));
198 assert_eq!(19, from_byte_idx(TEXT, 27));
199
200 assert_eq!(21, from_byte_idx(TEXT, 33));
201 assert_eq!(21, from_byte_idx(TEXT, 34));
202 assert_eq!(21, from_byte_idx(TEXT, 35));
203 assert_eq!(21, from_byte_idx(TEXT, 36));
204 assert_eq!(23, from_byte_idx(TEXT, 37));
205 assert_eq!(23, from_byte_idx(TEXT, 38));
206 assert_eq!(23, from_byte_idx(TEXT, 39));
207 assert_eq!(23, from_byte_idx(TEXT, 40));
208 assert_eq!(25, from_byte_idx(TEXT, 41));
209
210 assert_eq!(27, from_byte_idx(TEXT, 45));
211 assert_eq!(27, from_byte_idx(TEXT, 46)); }
213
214 #[test]
215 fn to_byte_idx_01() {
216 assert_eq!(to_byte_idx(TEXT, 0), 0);
217
218 assert_eq!(3, to_byte_idx(TEXT, 3));
219 assert_eq!(3, to_byte_idx(TEXT, 4));
220 assert_eq!(7, to_byte_idx(TEXT, 5));
221
222 assert_eq!(9, to_byte_idx(TEXT, 7));
223
224 assert_eq!(23, to_byte_idx(TEXT, 17));
225 assert_eq!(23, to_byte_idx(TEXT, 18));
226 assert_eq!(27, to_byte_idx(TEXT, 19));
227
228 assert_eq!(33, to_byte_idx(TEXT, 21));
229 assert_eq!(33, to_byte_idx(TEXT, 22));
230 assert_eq!(37, to_byte_idx(TEXT, 23));
231 assert_eq!(37, to_byte_idx(TEXT, 24));
232 assert_eq!(41, to_byte_idx(TEXT, 25));
233
234 assert_eq!(45, to_byte_idx(TEXT, 27));
235 assert_eq!(45, to_byte_idx(TEXT, 27)); }
237}