str_indices/
lines.rs

Help
1//! Index by lines (all Unicode line breaks).
2//!
3//! This module recognizes all line breaks defined in
4//! [Unicode Annex #14](https://www.unicode.org/reports/tr14/):
5//!
6//! - `U+000A`          &mdash; LF (Line Feed)
7//! - `U+000B`          &mdash; VT (Vertical Tab)
8//! - `U+000C`          &mdash; FF (Form Feed)
9//! - `U+000D`          &mdash; CR (Carriage Return)
10//! - `U+0085`          &mdash; NEL (Next Line)
11//! - `U+2028`          &mdash; Line Separator
12//! - `U+2029`          &mdash; Paragraph Separator
13//! - `U+000D` `U+000A` &mdash; CRLF (Carriage Return + Line Feed)
14
15use crate::alignment_diff;
16use crate::byte_chunk::{ByteChunk, Chunk};
17
18/// Counts the line breaks in a string slice.
19///
20/// Runs in O(N) time.
21#[inline]
22pub fn count_breaks(text: &str) -> usize {
23    count_breaks_impl::<Chunk>(text.as_bytes())
24}
25
26/// Converts from byte-index to line-index in a string slice.
27///
28/// Line break characters are considered to be a part of the line they
29/// end.  And a string that ends with a line break is considered to have
30/// a final empty line.  So this function is equivalent to counting the
31/// line breaks before the specified byte.
32///
33/// Any past-the-end index will return the last line index.
34///
35/// Runs in O(N) time.
36#[inline]
37pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
38    let mut i = byte_idx.min(text.len());
39    while !text.is_char_boundary(i) {
40        i -= 1;
41    }
42    let nl_count = count_breaks_impl::<Chunk>(&text.as_bytes()[..i]);
43    if crate::is_not_crlf_middle(i, text.as_bytes()) {
44        nl_count
45    } else {
46        nl_count - 1
47    }
48}
49
50/// Converts from line-index to byte-index in a string slice.
51///
52/// Returns the byte index of the start of the specified line.  Line 0 is
53/// the start of the string, and subsequent lines start immediately
54/// *after* each line break character.
55///
56/// Any past-the-end index will return the one-past-the-end byte index.
57///
58/// Runs in O(N) time.
59#[inline]
60pub fn to_byte_idx(text: &str, line_idx: usize) -> usize {
61    to_byte_idx_impl::<Chunk>(text, line_idx)
62}
63
64//-------------------------------------------------------------
65
66#[inline(always)]
67fn to_byte_idx_impl<T: ByteChunk>(text: &str, line_idx: usize) -> usize {
68    let mut bytes = text.as_bytes();
69    let mut line_break_count = 0;
70
71    // Handle unaligned bytes at the start.
72    let aligned_idx = alignment_diff::<T>(bytes);
73    if aligned_idx > 0 {
74        let result = count_breaks_up_to(bytes, aligned_idx, line_idx);
75        line_break_count += result.0;
76        bytes = &bytes[result.1..];
77    }
78
79    // Count line breaks in big chunks.
80    if alignment_diff::<T>(bytes) == 0 {
81        while bytes.len() >= T::SIZE {
82            // Unsafe because the called function depends on correct alignment.
83            let tmp = unsafe { count_breaks_in_chunk_from_ptr::<T>(bytes) }.sum_bytes();
84            if tmp + line_break_count >= line_idx {
85                break;
86            }
87            line_break_count += tmp;
88
89            bytes = &bytes[T::SIZE..];
90        }
91    }
92
93    // Handle unaligned bytes at the end.
94    let result = count_breaks_up_to(bytes, bytes.len(), line_idx - line_break_count);
95    bytes = &bytes[result.1..];
96
97    // Finish up
98    let mut byte_idx = text.len() - bytes.len();
99    while !text.is_char_boundary(byte_idx) {
100        byte_idx += 1;
101    }
102    byte_idx
103}
104
105/// Counts the line breaks in a utf8 encoded string.
106///
107/// The following unicode sequences are considered newlines by this function:
108/// - u{000A}        (Line Feed)
109/// - u{000B}        (Vertical Tab)
110/// - u{000C}        (Form Feed)
111/// - u{000D}        (Carriage Return)
112/// - u{000D}u{000A} (Carriage Return + Line Feed)
113/// - u{0085}        (Next Line)
114/// - u{2028}        (Line Separator)
115/// - u{2029}        (Paragraph Separator)
116#[inline(always)]
117fn count_breaks_impl<T: ByteChunk>(text: &[u8]) -> usize {
118    let mut bytes = text;
119    let mut count = 0;
120
121    // Handle unaligned bytes at the start.
122    let aligned_idx = alignment_diff::<T>(bytes);
123    if aligned_idx > 0 {
124        let result = count_breaks_up_to(bytes, aligned_idx, bytes.len());
125        count += result.0;
126        bytes = &bytes[result.1..];
127    }
128
129    // Count line breaks in big chunks.
130    let mut i = 0;
131    let mut acc = T::zero();
132    while bytes.len() >= T::SIZE {
133        // Unsafe because the called function depends on correct alignment.
134        acc = acc.add(unsafe { count_breaks_in_chunk_from_ptr::<T>(bytes) });
135        i += 1;
136        if i == T::MAX_ACC {
137            i = 0;
138            count += acc.sum_bytes();
139            acc = T::zero();
140        }
141        bytes = &bytes[T::SIZE..];
142    }
143    count += acc.sum_bytes();
144
145    // Handle unaligned bytes at the end.
146    count += count_breaks_up_to(bytes, bytes.len(), bytes.len()).0;
147
148    count
149}
150
151/// Used internally in the line-break counting functions.
152///
153/// Counts line breaks a byte at a time up to a maximum number of bytes and
154/// line breaks, and returns the counted lines and how many bytes were processed.
155#[inline(always)]
156#[allow(clippy::if_same_then_else)]
157fn count_breaks_up_to(bytes: &[u8], max_bytes: usize, max_breaks: usize) -> (usize, usize) {
158    let mut ptr = 0;
159    let mut count = 0;
160    while ptr < max_bytes && count < max_breaks {
161        let byte = bytes[ptr];
162
163        // Handle u{000A}, u{000B}, u{000C}, and u{000D}
164        if (0x0A..=0x0D).contains(&byte) {
165            count += 1;
166
167            // Check for CRLF and and subtract 1 if it is,
168            // since it will be caught in the next iteration
169            // with the LF.
170            if byte == 0x0D && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x0A {
171                count -= 1;
172            }
173        }
174        // Handle u{0085}
175        else if byte == 0xC2 && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x85 {
176            count += 1;
177        }
178        // Handle u{2028} and u{2029}
179        else if byte == 0xE2
180            && (ptr + 2) < bytes.len()
181            && bytes[ptr + 1] == 0x80
182            && (bytes[ptr + 2] >> 1) == 0x54
183        {
184            count += 1;
185        }
186
187        ptr += 1;
188    }
189
190    (count, ptr)
191}
192
193/// Used internally in the line-break counting functions.
194///
195/// The start of `bytes` MUST be aligned as type T, and `bytes` MUST be at
196/// least as large (in bytes) as T.  If these invariants are not met, bad
197/// things could potentially happen.  Hence why this function is unsafe.
198#[inline(always)]
199unsafe fn count_breaks_in_chunk_from_ptr<T: ByteChunk>(bytes: &[u8]) -> T {
200    let c = {
201        // The only unsafe bits of the function are in this block.
202        debug_assert_eq!(bytes.align_to::<T>().0.len(), 0);
203        debug_assert!(bytes.len() >= T::SIZE);
204        // This unsafe cast is for performance reasons: going through e.g.
205        // `align_to()` results in a significant drop in performance.
206        *(bytes.as_ptr() as *const T)
207    };
208    let end_i = T::SIZE;
209
210    let mut acc = T::zero();
211
212    // Calculate the flags we're going to be working with.
213    let nl_1_flags = c.cmp_eq_byte(0xC2);
214    let sp_1_flags = c.cmp_eq_byte(0xE2);
215    let all_flags = c.bytes_between_127(0x09, 0x0E);
216    let cr_flags = c.cmp_eq_byte(0x0D);
217
218    // Next Line: u{0085}
219    if !nl_1_flags.is_zero() {
220        let nl_2_flags = c.cmp_eq_byte(0x85).shift_back_lex(1);
221        let flags = nl_1_flags.bitand(nl_2_flags);
222        acc = acc.add(flags);
223
224        // Handle ending boundary
225        if bytes.len() > end_i && bytes[end_i - 1] == 0xC2 && bytes[end_i] == 0x85 {
226            acc = acc.inc_nth_from_end_lex_byte(0);
227        }
228    }
229
230    // Line Separator:      u{2028}
231    // Paragraph Separator: u{2029}
232    if !sp_1_flags.is_zero() {
233        let sp_2_flags = c.cmp_eq_byte(0x80).shift_back_lex(1).bitand(sp_1_flags);
234        if !sp_2_flags.is_zero() {
235            let sp_3_flags = c
236                .shr(1)
237                .bitand(T::splat(!0x80))
238                .cmp_eq_byte(0x54)
239                .shift_back_lex(2);
240            let sp_flags = sp_2_flags.bitand(sp_3_flags);
241            acc = acc.add(sp_flags);
242        }
243
244        // Handle ending boundary
245        if bytes.len() > end_i
246            && bytes[end_i - 2] == 0xE2
247            && bytes[end_i - 1] == 0x80
248            && (bytes[end_i] >> 1) == 0x54
249        {
250            acc = acc.inc_nth_from_end_lex_byte(1);
251        } else if bytes.len() > (end_i + 1)
252            && bytes[end_i - 1] == 0xE2
253            && bytes[end_i] == 0x80
254            && (bytes[end_i + 1] >> 1) == 0x54
255        {
256            acc = acc.inc_nth_from_end_lex_byte(0);
257        }
258    }
259
260    // Line Feed:                   u{000A}
261    // Vertical Tab:                u{000B}
262    // Form Feed:                   u{000C}
263    // Carriage Return:             u{000D}
264    // Carriage Return + Line Feed: u{000D}u{000A}
265    acc = acc.add(all_flags);
266    if !cr_flags.is_zero() {
267        // Handle CRLF
268        let lf_flags = c.cmp_eq_byte(0x0A);
269        let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1));
270        acc = acc.sub(crlf_flags);
271        if bytes.len() > end_i && bytes[end_i - 1] == 0x0D && bytes[end_i] == 0x0A {
272            acc = acc.dec_last_lex_byte();
273        }
274    }
275
276    acc
277}
278
279//=============================================================
280
281#[cfg(test)]
282mod tests {
283    use super::*;
284
285    // 124 bytes, 100 chars, 4 lines
286    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
287                              a fine day, isn't it?\nAren't you glad \
288                              we're alive?\nこんにちは、みんなさん！";
289
290    #[test]
291    fn count_breaks_01() {
292        let text = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \
293                    There\u{2028}is something.\u{2029}";
294        assert_eq!(48, text.len());
295        assert_eq!(8, count_breaks(text));
296    }
297
298    #[test]
299    fn from_byte_idx_01() {
300        let text = "Here\nare\nsome\nwords";
301        assert_eq!(0, from_byte_idx(text, 0));
302        assert_eq!(0, from_byte_idx(text, 4));
303        assert_eq!(1, from_byte_idx(text, 5));
304        assert_eq!(1, from_byte_idx(text, 8));
305        assert_eq!(2, from_byte_idx(text, 9));
306        assert_eq!(2, from_byte_idx(text, 13));
307        assert_eq!(3, from_byte_idx(text, 14));
308        assert_eq!(3, from_byte_idx(text, 19));
309    }
310
311    #[test]
312    fn from_byte_idx_02() {
313        let text = "\nHere\nare\nsome\nwords\n";
314        assert_eq!(0, from_byte_idx(text, 0));
315        assert_eq!(1, from_byte_idx(text, 1));
316        assert_eq!(1, from_byte_idx(text, 5));
317        assert_eq!(2, from_byte_idx(text, 6));
318        assert_eq!(2, from_byte_idx(text, 9));
319        assert_eq!(3, from_byte_idx(text, 10));
320        assert_eq!(3, from_byte_idx(text, 14));
321        assert_eq!(4, from_byte_idx(text, 15));
322        assert_eq!(4, from_byte_idx(text, 20));
323        assert_eq!(5, from_byte_idx(text, 21));
324    }
325
326    #[test]
327    fn from_byte_idx_03() {
328        let text = "Here\r\nare\r\nsome\r\nwords";
329        assert_eq!(0, from_byte_idx(text, 0));
330        assert_eq!(0, from_byte_idx(text, 4));
331        assert_eq!(0, from_byte_idx(text, 5));
332        assert_eq!(1, from_byte_idx(text, 6));
333        assert_eq!(1, from_byte_idx(text, 9));
334        assert_eq!(1, from_byte_idx(text, 10));
335        assert_eq!(2, from_byte_idx(text, 11));
336        assert_eq!(2, from_byte_idx(text, 15));
337        assert_eq!(2, from_byte_idx(text, 16));
338        assert_eq!(3, from_byte_idx(text, 17));
339    }
340
341    #[test]
342    fn from_byte_idx_04() {
343        // Line 0
344        for i in 0..32 {
345            assert_eq!(0, from_byte_idx(TEXT_LINES, i));
346        }
347
348        // Line 1
349        for i in 32..59 {
350            assert_eq!(1, from_byte_idx(TEXT_LINES, i));
351        }
352
353        // Line 2
354        for i in 59..88 {
355            assert_eq!(2, from_byte_idx(TEXT_LINES, i));
356        }
357
358        // Line 3
359        for i in 88..125 {
360            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
361        }
362
363        // Past the end
364        for i in 125..130 {
365            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
366        }
367    }
368
369    #[test]
370    fn to_byte_idx_01() {
371        let text = "Here\r\nare\r\nsome\r\nwords";
372        assert_eq!(0, to_byte_idx(text, 0));
373        assert_eq!(6, to_byte_idx(text, 1));
374        assert_eq!(11, to_byte_idx(text, 2));
375        assert_eq!(17, to_byte_idx(text, 3));
376    }
377
378    #[test]
379    fn to_byte_idx_02() {
380        let text = "\nHere\nare\nsome\nwords\n";
381        assert_eq!(0, to_byte_idx(text, 0));
382        assert_eq!(1, to_byte_idx(text, 1));
383        assert_eq!(6, to_byte_idx(text, 2));
384        assert_eq!(10, to_byte_idx(text, 3));
385        assert_eq!(15, to_byte_idx(text, 4));
386        assert_eq!(21, to_byte_idx(text, 5));
387    }
388
389    #[test]
390    fn to_byte_idx_03() {
391        assert_eq!(0, to_byte_idx(TEXT_LINES, 0));
392        assert_eq!(32, to_byte_idx(TEXT_LINES, 1));
393        assert_eq!(59, to_byte_idx(TEXT_LINES, 2));
394        assert_eq!(88, to_byte_idx(TEXT_LINES, 3));
395
396        // Past end
397        assert_eq!(124, to_byte_idx(TEXT_LINES, 4));
398        assert_eq!(124, to_byte_idx(TEXT_LINES, 5));
399        assert_eq!(124, to_byte_idx(TEXT_LINES, 6));
400    }
401
402    #[test]
403    fn line_byte_round_trip() {
404        let text = "\nHere\nare\nsome\nwords\n";
405        assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6)));
406        assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2)));
407
408        assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0)));
409        assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0)));
410
411        assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21)));
412        assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5)));
413    }
414}
str_indices/lines.rs

str_indices/
lines.rs