str_indices/
chars.rs

1//! Index by chars.
2
3use crate::byte_chunk::{ByteChunk, Chunk};
4
5/// Counts the chars in a string slice.
6///
7/// Runs in O(N) time.
8#[inline]
9pub fn count(text: &str) -> usize {
10    count_impl::<Chunk>(text.as_bytes())
11}
12
13/// Converts from byte-index to char-index in a string slice.
14///
15/// If the byte is in the middle of a multi-byte char, returns the index of
16/// the char that the byte belongs to.
17///
18/// Any past-the-end index will return the one-past-the-end char index.
19///
20/// Runs in O(N) time.
21#[inline]
22pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
23    let bytes = text.as_bytes();
24
25    // Ensure the index is either a char boundary or is off the end of
26    // the text.
27    let mut i = byte_idx;
28    while Some(true) == bytes.get(i).map(is_trailing_byte) {
29        i -= 1;
30    }
31
32    count_impl::<Chunk>(&bytes[0..i.min(bytes.len())])
33}
34
35/// Converts from char-index to byte-index in a string slice.
36///
37/// Any past-the-end index will return the one-past-the-end byte index.
38///
39/// Runs in O(N) time.
40#[inline]
41pub fn to_byte_idx(text: &str, char_idx: usize) -> usize {
42    to_byte_idx_impl::<Chunk>(text.as_bytes(), char_idx)
43}
44
45//-------------------------------------------------------------
46
47#[inline(always)]
48fn to_byte_idx_impl<T: ByteChunk>(text: &[u8], char_idx: usize) -> usize {
49    if text.len() <= T::SIZE {
50        // Bypass the more complex routine for short strings, where the
51        // complexity hurts performance.
52        let mut char_count = 0;
53        for (i, byte) in text.iter().enumerate() {
54            char_count += is_leading_byte(byte) as usize;
55            if char_count > char_idx {
56                return i;
57            }
58        }
59        return text.len();
60    }
61    // Get `middle` so we can do more efficient chunk-based counting.
62    // We can't use this to get `end`, however, because the start index of
63    // `end` actually depends on the accumulating char counts during the
64    // counting process.
65    let (start, middle, _) = unsafe { text.align_to::<T>() };
66
67    let mut byte_count = 0;
68    let mut char_count = 0;
69
70    // Take care of any unaligned bytes at the beginning.
71    for byte in start.iter() {
72        char_count += is_leading_byte(byte) as usize;
73        if char_count > char_idx {
74            return byte_count;
75        }
76        byte_count += 1;
77    }
78
79    // Process chunks in the fast path. Ensure that we don't go past the number
80    // of chars we are counting towards
81    let fast_path_chunks = middle.len().min((char_idx - char_count) / T::SIZE);
82    let bytes = T::SIZE * 4;
83    for chunks in middle[..fast_path_chunks].chunks_exact(4) {
84        let val1 = count_trailing_chunk(chunks[0]);
85        let val2 = count_trailing_chunk(chunks[1]);
86        let val3 = count_trailing_chunk(chunks[2]);
87        let val4 = count_trailing_chunk(chunks[3]);
88        char_count += bytes - val1.add(val2).add(val3.add(val4)).sum_bytes();
89        byte_count += bytes;
90    }
91
92    // Process the rest of chunks in the slow path.
93    for chunk in middle[(fast_path_chunks - fast_path_chunks % 4)..].iter() {
94        let new_char_count = char_count + T::SIZE - count_trailing_chunk(*chunk).sum_bytes();
95        if new_char_count >= char_idx {
96            break;
97        }
98        char_count = new_char_count;
99        byte_count += T::SIZE;
100    }
101
102    // Take care of any unaligned bytes at the end.
103    let end = &text[byte_count..];
104    for byte in end.iter() {
105        char_count += is_leading_byte(byte) as usize;
106        if char_count > char_idx {
107            break;
108        }
109        byte_count += 1;
110    }
111
112    byte_count
113}
114
115#[inline(always)]
116pub(crate) fn count_impl<T: ByteChunk>(text: &[u8]) -> usize {
117    if text.len() < T::SIZE {
118        // Bypass the more complex routine for short strings, where the
119        // complexity hurts performance.
120        return text.iter().map(|x| is_leading_byte(x) as usize).sum();
121    }
122    // Get `middle` for more efficient chunk-based counting.
123    let (start, middle, end) = unsafe { text.align_to::<T>() };
124
125    let mut inv_count = 0;
126
127    // Take care of unaligned bytes at the beginning.
128    inv_count += start.iter().filter(|x| is_trailing_byte(x)).count();
129
130    // Take care of the middle bytes in big chunks. Loop unrolled.
131    for chunks in middle.chunks_exact(4) {
132        let val1 = count_trailing_chunk(chunks[0]);
133        let val2 = count_trailing_chunk(chunks[1]);
134        let val3 = count_trailing_chunk(chunks[2]);
135        let val4 = count_trailing_chunk(chunks[3]);
136        inv_count += val1.add(val2).add(val3.add(val4)).sum_bytes();
137    }
138    let mut acc = T::zero();
139    for chunk in middle.chunks_exact(4).remainder() {
140        acc = acc.add(count_trailing_chunk(*chunk));
141    }
142    inv_count += acc.sum_bytes();
143
144    // Take care of unaligned bytes at the end.
145    inv_count += end.iter().filter(|x| is_trailing_byte(x)).count();
146
147    text.len() - inv_count
148}
149
150#[inline(always)]
151fn is_leading_byte(byte: &u8) -> bool {
152    (byte & 0xC0) != 0x80
153}
154
155#[inline(always)]
156fn is_trailing_byte(byte: &u8) -> bool {
157    (byte & 0xC0) == 0x80
158}
159
160#[inline(always)]
161fn count_trailing_chunk<T: ByteChunk>(val: T) -> T {
162    val.bitand(T::splat(0xc0)).cmp_eq_byte(0x80)
163}
164
165//=============================================================
166
167#[cfg(test)]
168mod tests {
169    use super::*;
170
171    // 124 bytes, 100 chars, 4 lines
172    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
173                              a fine day, isn't it?\nAren't you glad \
174                              we're alive?\nこんにちは、みんなさん!";
175
176    #[test]
177    fn count_01() {
178        let text = "Hello せかい! Hello せかい! Hello せかい! Hello せかい! Hello せかい!";
179
180        assert_eq!(54, count(text));
181    }
182
183    #[test]
184    fn count_02() {
185        assert_eq!(100, count(TEXT_LINES));
186    }
187
188    #[test]
189    fn from_byte_idx_01() {
190        let text = "Hello せかい!";
191        assert_eq!(0, from_byte_idx(text, 0));
192        assert_eq!(1, from_byte_idx(text, 1));
193        assert_eq!(6, from_byte_idx(text, 6));
194        assert_eq!(6, from_byte_idx(text, 7));
195        assert_eq!(6, from_byte_idx(text, 8));
196        assert_eq!(7, from_byte_idx(text, 9));
197        assert_eq!(7, from_byte_idx(text, 10));
198        assert_eq!(7, from_byte_idx(text, 11));
199        assert_eq!(8, from_byte_idx(text, 12));
200        assert_eq!(8, from_byte_idx(text, 13));
201        assert_eq!(8, from_byte_idx(text, 14));
202        assert_eq!(9, from_byte_idx(text, 15));
203        assert_eq!(10, from_byte_idx(text, 16));
204        assert_eq!(10, from_byte_idx(text, 17));
205        assert_eq!(10, from_byte_idx(text, 18));
206        assert_eq!(10, from_byte_idx(text, 19));
207    }
208
209    #[test]
210    fn from_byte_idx_02() {
211        let text = "";
212        assert_eq!(0, from_byte_idx(text, 0));
213        assert_eq!(0, from_byte_idx(text, 1));
214
215        let text = "h";
216        assert_eq!(0, from_byte_idx(text, 0));
217        assert_eq!(1, from_byte_idx(text, 1));
218        assert_eq!(1, from_byte_idx(text, 2));
219
220        let text = "hi";
221        assert_eq!(0, from_byte_idx(text, 0));
222        assert_eq!(1, from_byte_idx(text, 1));
223        assert_eq!(2, from_byte_idx(text, 2));
224        assert_eq!(2, from_byte_idx(text, 3));
225    }
226
227    #[test]
228    fn from_byte_idx_03() {
229        let text = "せかい";
230        assert_eq!(0, from_byte_idx(text, 0));
231        assert_eq!(0, from_byte_idx(text, 1));
232        assert_eq!(0, from_byte_idx(text, 2));
233        assert_eq!(1, from_byte_idx(text, 3));
234        assert_eq!(1, from_byte_idx(text, 4));
235        assert_eq!(1, from_byte_idx(text, 5));
236        assert_eq!(2, from_byte_idx(text, 6));
237        assert_eq!(2, from_byte_idx(text, 7));
238        assert_eq!(2, from_byte_idx(text, 8));
239        assert_eq!(3, from_byte_idx(text, 9));
240        assert_eq!(3, from_byte_idx(text, 10));
241        assert_eq!(3, from_byte_idx(text, 11));
242        assert_eq!(3, from_byte_idx(text, 12));
243    }
244
245    #[test]
246    fn from_byte_idx_04() {
247        // Ascii range
248        for i in 0..88 {
249            assert_eq!(i, from_byte_idx(TEXT_LINES, i));
250        }
251
252        // Hiragana characters
253        for i in 88..125 {
254            assert_eq!(88 + ((i - 88) / 3), from_byte_idx(TEXT_LINES, i));
255        }
256
257        // Past the end
258        for i in 125..130 {
259            assert_eq!(100, from_byte_idx(TEXT_LINES, i));
260        }
261    }
262
263    #[test]
264    fn to_byte_idx_01() {
265        let text = "Hello せかい!";
266        assert_eq!(0, to_byte_idx(text, 0));
267        assert_eq!(1, to_byte_idx(text, 1));
268        assert_eq!(2, to_byte_idx(text, 2));
269        assert_eq!(5, to_byte_idx(text, 5));
270        assert_eq!(6, to_byte_idx(text, 6));
271        assert_eq!(12, to_byte_idx(text, 8));
272        assert_eq!(15, to_byte_idx(text, 9));
273        assert_eq!(16, to_byte_idx(text, 10));
274    }
275
276    #[test]
277    fn to_byte_idx_02() {
278        let text = "せかい";
279        assert_eq!(0, to_byte_idx(text, 0));
280        assert_eq!(3, to_byte_idx(text, 1));
281        assert_eq!(6, to_byte_idx(text, 2));
282        assert_eq!(9, to_byte_idx(text, 3));
283    }
284
285    #[test]
286    fn to_byte_idx_03() {
287        let text = "Hello world!";
288        assert_eq!(0, to_byte_idx(text, 0));
289        assert_eq!(1, to_byte_idx(text, 1));
290        assert_eq!(8, to_byte_idx(text, 8));
291        assert_eq!(11, to_byte_idx(text, 11));
292        assert_eq!(12, to_byte_idx(text, 12));
293    }
294
295    #[test]
296    fn to_byte_idx_04() {
297        let text = "Hello world! Hello せかい! Hello world! Hello せかい! \
298                    Hello world! Hello せかい! Hello world! Hello せかい! \
299                    Hello world! Hello せかい! Hello world! Hello せかい! \
300                    Hello world! Hello せかい! Hello world! Hello せかい!";
301        assert_eq!(0, to_byte_idx(text, 0));
302        assert_eq!(30, to_byte_idx(text, 24));
303        assert_eq!(60, to_byte_idx(text, 48));
304        assert_eq!(90, to_byte_idx(text, 72));
305        assert_eq!(115, to_byte_idx(text, 93));
306        assert_eq!(120, to_byte_idx(text, 96));
307        assert_eq!(150, to_byte_idx(text, 120));
308        assert_eq!(180, to_byte_idx(text, 144));
309        assert_eq!(210, to_byte_idx(text, 168));
310        assert_eq!(239, to_byte_idx(text, 191));
311    }
312
313    #[test]
314    fn to_byte_idx_05() {
315        // Ascii range
316        for i in 0..88 {
317            assert_eq!(i, to_byte_idx(TEXT_LINES, i));
318        }
319
320        // Hiragana characters
321        for i in 88..100 {
322            assert_eq!(88 + ((i - 88) * 3), to_byte_idx(TEXT_LINES, i));
323        }
324
325        // Past the end
326        for i in 100..110 {
327            assert_eq!(124, to_byte_idx(TEXT_LINES, i));
328        }
329    }
330}