str_indices/
lines_crlf.rs

1//! Index by lines (carriage return and line feed).
2//!
3//! This module recognizes the following as line breaks:
4//!
5//! - `U+000A`          — LF (Line Feed)
6//! - `U+000D`          — CR (Carriage Return)
7//! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed)
8//!
9//! (Note: if you only want to recognize LF and CRLF, without
10//! recognizing CR individually, see the [`lines_lf`](crate::lines_lf) module.)
11
12use crate::byte_chunk::{ByteChunk, Chunk};
13
14/// Counts the line breaks in a string slice.
15///
16/// Runs in O(N) time.
17#[inline]
18pub fn count_breaks(text: &str) -> usize {
19    count_breaks_impl::<Chunk>(text.as_bytes())
20}
21
22/// Converts from byte-index to line-index in a string slice.
23///
24/// Line break characters are considered to be a part of the line they
25/// end.  And a string that ends with a line break is considered to have
26/// a final empty line.  So this function is equivalent to counting the
27/// line breaks before the specified byte.
28///
29/// Any past-the-end index will return the last line index.
30///
31/// Runs in O(N) time.
32#[inline]
33pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
34    let i = byte_idx.min(text.len());
35    let nl_count = count_breaks_impl::<Chunk>(&text.as_bytes()[..i]);
36    if crate::is_not_crlf_middle(i, text.as_bytes()) {
37        nl_count
38    } else {
39        nl_count - 1
40    }
41}
42
43/// Converts from line-index to byte-index in a string slice.
44///
45/// Returns the byte index of the start of the specified line.  Line 0 is
46/// the start of the string, and subsequent lines start immediately
47/// *after* each line break character.
48///
49/// Any past-the-end index will return the one-past-the-end byte index.
50///
51/// Runs in O(N) time.
52#[inline]
53pub fn to_byte_idx(text: &str, line_idx: usize) -> usize {
54    to_byte_idx_impl::<Chunk>(text.as_bytes(), line_idx)
55}
56
57//-------------------------------------------------------------
58const LF: u8 = b'\n';
59const CR: u8 = b'\r';
60
61#[inline(always)]
62fn to_byte_idx_impl<T: ByteChunk>(text: &[u8], line_idx: usize) -> usize {
63    // Get `middle` so we can do more efficient chunk-based counting.
64    // We can't use this to get `end`, however, because the start index of
65    // `end` actually depends on the accumulating line counts during the
66    // counting process.
67    let (start, middle, _) = unsafe { text.align_to::<T>() };
68
69    let mut byte_count = 0;
70    let mut break_count = 0;
71
72    // Take care of any unaligned bytes at the beginning.
73    let mut last_was_cr = false;
74    for byte in start.iter().copied() {
75        let is_lf = byte == LF;
76        let is_cr = byte == CR;
77        if break_count == line_idx {
78            if last_was_cr && is_lf {
79                byte_count += 1;
80            }
81            return byte_count;
82        }
83        if is_cr || (is_lf && !last_was_cr) {
84            break_count += 1;
85        }
86        last_was_cr = is_cr;
87        byte_count += 1;
88    }
89
90    // Process the chunks 2 at a time.
91    let mut chunk_count = 0;
92    let mut prev = T::splat(last_was_cr as u8);
93    for chunks in middle.chunks_exact(2) {
94        let lf_flags0 = chunks[0].cmp_eq_byte(LF);
95        let cr_flags0 = chunks[0].cmp_eq_byte(CR);
96        let crlf_flags0 = prev.shift_across(cr_flags0).bitand(lf_flags0);
97
98        let lf_flags1 = chunks[1].cmp_eq_byte(LF);
99        let cr_flags1 = chunks[1].cmp_eq_byte(CR);
100        let crlf_flags1 = cr_flags0.shift_across(cr_flags1).bitand(lf_flags1);
101        let new_break_count = break_count
102            + lf_flags0
103                .add(cr_flags0)
104                .add(lf_flags1)
105                .add(cr_flags1)
106                .sub(crlf_flags0)
107                .sub(crlf_flags1)
108                .sum_bytes();
109        if new_break_count >= line_idx {
110            break;
111        }
112        break_count = new_break_count;
113        byte_count += T::SIZE * 2;
114        chunk_count += 2;
115        prev = cr_flags1;
116    }
117
118    // Process the rest of the chunks.
119    for chunk in middle[chunk_count..].iter() {
120        let lf_flags = chunk.cmp_eq_byte(LF);
121        let cr_flags = chunk.cmp_eq_byte(CR);
122        let crlf_flags = prev.shift_across(cr_flags).bitand(lf_flags);
123        let new_break_count = break_count + lf_flags.add(cr_flags).sub(crlf_flags).sum_bytes();
124        if new_break_count >= line_idx {
125            break;
126        }
127        break_count = new_break_count;
128        byte_count += T::SIZE;
129        prev = cr_flags;
130    }
131
132    // Take care of any unaligned bytes at the end.
133    last_was_cr = text.get(byte_count.saturating_sub(1)) == Some(&CR);
134    for byte in text[byte_count..].iter().copied() {
135        let is_lf = byte == LF;
136        let is_cr = byte == CR;
137        if break_count == line_idx {
138            if last_was_cr && is_lf {
139                byte_count += 1;
140            }
141            break;
142        }
143        if is_cr || (is_lf && !last_was_cr) {
144            break_count += 1;
145        }
146        last_was_cr = is_cr;
147        byte_count += 1;
148    }
149
150    // Finish up
151    byte_count
152}
153
154/// Counts the line breaks in a utf8 encoded string.
155///
156/// The following unicode sequences are considered newlines by this function:
157/// - u{000A}        (Line Feed)
158/// - u{000D}        (Carriage Return)
159#[inline(always)]
160fn count_breaks_impl<T: ByteChunk>(text: &[u8]) -> usize {
161    // Get `middle` so we can do more efficient chunk-based counting.
162    let (start, middle, end) = unsafe { text.align_to::<T>() };
163
164    let mut count = 0;
165
166    // Take care of unaligned bytes at the beginning.
167    let mut last_was_cr = false;
168    for byte in start.iter().copied() {
169        let is_lf = byte == LF;
170        let is_cr = byte == CR;
171        count += (is_cr | (is_lf & !last_was_cr)) as usize;
172        last_was_cr = is_cr;
173    }
174
175    // Take care of the middle bytes in big chunks.
176    let mut prev = T::splat(last_was_cr as u8);
177    for chunks in middle.chunks_exact(2) {
178        let lf_flags0 = chunks[0].cmp_eq_byte(LF);
179        let cr_flags0 = chunks[0].cmp_eq_byte(CR);
180        let crlf_flags0 = prev.shift_across(cr_flags0).bitand(lf_flags0);
181
182        let lf_flags1 = chunks[1].cmp_eq_byte(LF);
183        let cr_flags1 = chunks[1].cmp_eq_byte(CR);
184        let crlf_flags1 = cr_flags0.shift_across(cr_flags1).bitand(lf_flags1);
185        count += lf_flags0
186            .add(cr_flags0)
187            .sub(crlf_flags0)
188            .add(lf_flags1)
189            .add(cr_flags1)
190            .sub(crlf_flags1)
191            .sum_bytes();
192        prev = cr_flags1;
193    }
194
195    if let Some(chunk) = middle.chunks_exact(2).remainder().iter().next() {
196        let lf_flags = chunk.cmp_eq_byte(LF);
197        let cr_flags = chunk.cmp_eq_byte(CR);
198        let crlf_flags = prev.shift_across(cr_flags).bitand(lf_flags);
199        count += lf_flags.add(cr_flags).sub(crlf_flags).sum_bytes();
200    }
201
202    // Take care of unaligned bytes at the end.
203    last_was_cr = text.get((text.len() - end.len()).saturating_sub(1)) == Some(&CR);
204    for byte in end.iter().copied() {
205        let is_lf = byte == LF;
206        let is_cr = byte == CR;
207        count += (is_cr | (is_lf & !last_was_cr)) as usize;
208        last_was_cr = is_cr;
209    }
210
211    count
212}
213
214//=============================================================
215
216#[cfg(test)]
217mod tests {
218    use super::*;
219
220    // 124 bytes, 100 chars, 4 lines
221    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
222                              a fine day, isn't it?\nAren't you glad \
223                              we're alive?\nこんにちは、みんなさん!";
224
225    #[test]
226    fn count_breaks_01() {
227        let text = "\u{000A}Hello\u{000D}\u{000A}せ\u{000B}か\u{000C}い\u{0085}. \
228                    There\u{000A}is something.\u{2029}";
229        assert_eq!(45, text.len());
230        assert_eq!(3, count_breaks(text));
231    }
232
233    #[test]
234    fn from_byte_idx_01() {
235        let text = "Here\nare\nsome\nwords";
236        assert_eq!(0, from_byte_idx(text, 0));
237        assert_eq!(0, from_byte_idx(text, 4));
238        assert_eq!(1, from_byte_idx(text, 5));
239        assert_eq!(1, from_byte_idx(text, 8));
240        assert_eq!(2, from_byte_idx(text, 9));
241        assert_eq!(2, from_byte_idx(text, 13));
242        assert_eq!(3, from_byte_idx(text, 14));
243        assert_eq!(3, from_byte_idx(text, 19));
244    }
245
246    #[test]
247    fn from_byte_idx_02() {
248        let text = "\nHere\nare\nsome\nwords\n";
249        assert_eq!(0, from_byte_idx(text, 0));
250        assert_eq!(1, from_byte_idx(text, 1));
251        assert_eq!(1, from_byte_idx(text, 5));
252        assert_eq!(2, from_byte_idx(text, 6));
253        assert_eq!(2, from_byte_idx(text, 9));
254        assert_eq!(3, from_byte_idx(text, 10));
255        assert_eq!(3, from_byte_idx(text, 14));
256        assert_eq!(4, from_byte_idx(text, 15));
257        assert_eq!(4, from_byte_idx(text, 20));
258        assert_eq!(5, from_byte_idx(text, 21));
259    }
260
261    #[test]
262    fn from_byte_idx_03() {
263        let text = "Here\r\nare\r\nsome\r\nwords";
264        assert_eq!(0, from_byte_idx(text, 0));
265        assert_eq!(0, from_byte_idx(text, 4));
266        assert_eq!(0, from_byte_idx(text, 5));
267        assert_eq!(1, from_byte_idx(text, 6));
268        assert_eq!(1, from_byte_idx(text, 9));
269        assert_eq!(1, from_byte_idx(text, 10));
270        assert_eq!(2, from_byte_idx(text, 11));
271        assert_eq!(2, from_byte_idx(text, 15));
272        assert_eq!(2, from_byte_idx(text, 16));
273        assert_eq!(3, from_byte_idx(text, 17));
274    }
275
276    #[test]
277    fn from_byte_idx_04() {
278        // Line 0
279        for i in 0..32 {
280            assert_eq!(0, from_byte_idx(TEXT_LINES, i));
281        }
282
283        // Line 1
284        for i in 32..59 {
285            assert_eq!(1, from_byte_idx(TEXT_LINES, i));
286        }
287
288        // Line 2
289        for i in 59..88 {
290            assert_eq!(2, from_byte_idx(TEXT_LINES, i));
291        }
292
293        // Line 3
294        for i in 88..125 {
295            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
296        }
297
298        // Past the end
299        for i in 125..130 {
300            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
301        }
302    }
303
304    #[test]
305    fn to_byte_idx_01() {
306        let text = "Here\r\nare\r\nsome\r\nwords";
307        assert_eq!(0, to_byte_idx(text, 0));
308        assert_eq!(6, to_byte_idx(text, 1));
309        assert_eq!(11, to_byte_idx(text, 2));
310        assert_eq!(17, to_byte_idx(text, 3));
311    }
312
313    #[test]
314    fn to_byte_idx_02() {
315        let text = "\nHere\nare\nsome\nwords\n";
316        assert_eq!(0, to_byte_idx(text, 0));
317        assert_eq!(1, to_byte_idx(text, 1));
318        assert_eq!(6, to_byte_idx(text, 2));
319        assert_eq!(10, to_byte_idx(text, 3));
320        assert_eq!(15, to_byte_idx(text, 4));
321        assert_eq!(21, to_byte_idx(text, 5));
322    }
323
324    #[test]
325    fn to_byte_idx_03() {
326        assert_eq!(0, to_byte_idx(TEXT_LINES, 0));
327        assert_eq!(32, to_byte_idx(TEXT_LINES, 1));
328        assert_eq!(59, to_byte_idx(TEXT_LINES, 2));
329        assert_eq!(88, to_byte_idx(TEXT_LINES, 3));
330
331        // Past end
332        assert_eq!(124, to_byte_idx(TEXT_LINES, 4));
333        assert_eq!(124, to_byte_idx(TEXT_LINES, 5));
334        assert_eq!(124, to_byte_idx(TEXT_LINES, 6));
335    }
336
337    #[test]
338    fn line_byte_round_trip() {
339        let text = "\nHere\nare\nsome\nwords\n";
340        assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6)));
341        assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2)));
342
343        assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0)));
344        assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0)));
345
346        assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21)));
347        assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5)));
348    }
349}