str_indices/
lines_lf.rs

1//! Index by lines (line feed only).
2//!
3//! This module recognizes the following as line breaks:
4//!
5//! - `U+000A`          — LF (Line Feed)
6//! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed)
7//!   — by coincidence due to ignoring CR.
8
9use crate::byte_chunk::{ByteChunk, Chunk};
10
11/// Counts the line breaks in a string slice.
12///
13/// Runs in O(N) time.
14#[inline]
15pub fn count_breaks(text: &str) -> usize {
16    count_breaks_impl::<Chunk>(text.as_bytes())
17}
18
19/// Converts from byte-index to line-index in a string slice.
20///
21/// Line break characters are considered to be a part of the line they
22/// end.  And a string that ends with a line break is considered to have
23/// a final empty line.  So this function is equivalent to counting the
24/// line breaks before the specified byte.
25///
26/// Any past-the-end index will return the last line index.
27///
28/// Runs in O(N) time.
29#[inline]
30pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
31    let i = byte_idx.min(text.len());
32    count_breaks_impl::<Chunk>(&text.as_bytes()[..i])
33}
34
35/// Converts from line-index to byte-index in a string slice.
36///
37/// Returns the byte index of the start of the specified line.  Line 0 is
38/// the start of the string, and subsequent lines start immediately
39/// *after* each line break character.
40///
41/// Any past-the-end index will return the one-past-the-end byte index.
42///
43/// Runs in O(N) time.
44#[inline]
45pub fn to_byte_idx(text: &str, line_idx: usize) -> usize {
46    to_byte_idx_impl::<Chunk>(text.as_bytes(), line_idx)
47}
48
49//-------------------------------------------------------------
50
51#[inline(always)]
52fn to_byte_idx_impl<T: ByteChunk>(text: &[u8], line_idx: usize) -> usize {
53    let mut byte_count = 0;
54    let mut lf_count = 0;
55
56    // Get `middle` so we can do more efficient chunk-based counting.
57    // We can't use this to get `end`, however, because the start index of
58    // `end` actually depends on the accumulating line counts during the
59    // counting process.
60    let (start, middle, _) = unsafe { text.align_to::<T>() };
61
62    // Take care of any unaligned bytes at the beginning.
63    for byte in start.iter() {
64        if lf_count == line_idx {
65            return byte_count;
66        }
67        if *byte == 0x0A {
68            lf_count += 1;
69        }
70        byte_count += 1;
71    }
72
73    // Process the chunks 4 at a time
74    let mut chunk_count = 0;
75    for chunks in middle.chunks_exact(4) {
76        let val1 = chunks[0].cmp_eq_byte(0x0A);
77        let val2 = chunks[1].cmp_eq_byte(0x0A);
78        let val3 = chunks[2].cmp_eq_byte(0x0A);
79        let val4 = chunks[3].cmp_eq_byte(0x0A);
80        let new_lf_count = lf_count + val1.add(val2).add(val3.add(val4)).sum_bytes();
81        if new_lf_count >= line_idx {
82            break;
83        }
84        lf_count = new_lf_count;
85        byte_count += T::SIZE * 4;
86        chunk_count += 4;
87    }
88
89    // Process the rest of the chunks
90    for chunk in middle[chunk_count..].iter() {
91        let new_lf_count = lf_count + chunk.cmp_eq_byte(0x0A).sum_bytes();
92        if new_lf_count >= line_idx {
93            break;
94        }
95        lf_count = new_lf_count;
96        byte_count += T::SIZE;
97    }
98
99    // Take care of any unaligned bytes at the end.
100    for byte in &text[byte_count..] {
101        if lf_count == line_idx {
102            break;
103        }
104        lf_count += (*byte == 0x0A) as usize;
105        byte_count += 1;
106    }
107
108    byte_count
109}
110
111/// Counts the line breaks in a utf8 encoded string.
112///
113/// The following unicode sequences are considered newlines by this function:
114/// - u{000A}        (Line Feed)
115#[inline(always)]
116fn count_breaks_impl<T: ByteChunk>(text: &[u8]) -> usize {
117    if text.len() < T::SIZE {
118        // Bypass the more complex routine for short strings, where the
119        // complexity hurts performance.
120        text.iter().map(|byte| (*byte == 0x0A) as usize).sum()
121    } else {
122        // Get `middle` so we can do more efficient chunk-based counting.
123        let (start, middle, end) = unsafe { text.align_to::<T>() };
124
125        let mut count = 0;
126
127        // Take care of unaligned bytes at the beginning.
128        count += start
129            .iter()
130            .map(|byte| (*byte == 0x0A) as usize)
131            .sum::<usize>();
132
133        // Take care of the middle bytes in big chunks. Loop unrolled.
134        for chunks in middle.chunks_exact(4) {
135            let val1 = chunks[0].cmp_eq_byte(0x0A);
136            let val2 = chunks[1].cmp_eq_byte(0x0A);
137            let val3 = chunks[2].cmp_eq_byte(0x0A);
138            let val4 = chunks[3].cmp_eq_byte(0x0A);
139            count += val1.add(val2).add(val3.add(val4)).sum_bytes();
140        }
141
142        // Chunk remainder
143        let mut acc = T::zero();
144        for chunk in middle.chunks_exact(4).remainder() {
145            acc = acc.add(chunk.cmp_eq_byte(0x0A));
146        }
147        count += acc.sum_bytes();
148
149        // Take care of unaligned bytes at the end.
150        count
151            + end
152                .iter()
153                .map(|byte| (*byte == 0x0A) as usize)
154                .sum::<usize>()
155    }
156}
157
158//=============================================================
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163
164    // 124 bytes, 100 chars, 4 lines
165    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
166                              a fine day, isn't it?\nAren't you glad \
167                              we're alive?\nこんにちは、みんなさん!";
168
169    #[test]
170    fn count_breaks_01() {
171        let text = "\nHello\u{000D}\nせ\u{000B}か\u{000C}い\u{0085}. \
172                    There\nis something.\u{2029}";
173        assert_eq!(45, text.len());
174        assert_eq!(3, count_breaks(text));
175    }
176
177    #[test]
178    fn from_byte_idx_01() {
179        let text = "Here\nare\nsome\nwords";
180        assert_eq!(0, from_byte_idx(text, 0));
181        assert_eq!(0, from_byte_idx(text, 4));
182        assert_eq!(1, from_byte_idx(text, 5));
183        assert_eq!(1, from_byte_idx(text, 8));
184        assert_eq!(2, from_byte_idx(text, 9));
185        assert_eq!(2, from_byte_idx(text, 13));
186        assert_eq!(3, from_byte_idx(text, 14));
187        assert_eq!(3, from_byte_idx(text, 19));
188    }
189
190    #[test]
191    fn from_byte_idx_02() {
192        let text = "\nHere\nare\nsome\nwords\n";
193        assert_eq!(0, from_byte_idx(text, 0));
194        assert_eq!(1, from_byte_idx(text, 1));
195        assert_eq!(1, from_byte_idx(text, 5));
196        assert_eq!(2, from_byte_idx(text, 6));
197        assert_eq!(2, from_byte_idx(text, 9));
198        assert_eq!(3, from_byte_idx(text, 10));
199        assert_eq!(3, from_byte_idx(text, 14));
200        assert_eq!(4, from_byte_idx(text, 15));
201        assert_eq!(4, from_byte_idx(text, 20));
202        assert_eq!(5, from_byte_idx(text, 21));
203    }
204
205    #[test]
206    fn from_byte_idx_03() {
207        let text = "Here\r\nare\r\nsome\r\nwords";
208        assert_eq!(0, from_byte_idx(text, 0));
209        assert_eq!(0, from_byte_idx(text, 4));
210        assert_eq!(0, from_byte_idx(text, 5));
211        assert_eq!(1, from_byte_idx(text, 6));
212        assert_eq!(1, from_byte_idx(text, 9));
213        assert_eq!(1, from_byte_idx(text, 10));
214        assert_eq!(2, from_byte_idx(text, 11));
215        assert_eq!(2, from_byte_idx(text, 15));
216        assert_eq!(2, from_byte_idx(text, 16));
217        assert_eq!(3, from_byte_idx(text, 17));
218    }
219
220    #[test]
221    fn from_byte_idx_04() {
222        // Line 0
223        for i in 0..32 {
224            assert_eq!(0, from_byte_idx(TEXT_LINES, i));
225        }
226
227        // Line 1
228        for i in 32..59 {
229            assert_eq!(1, from_byte_idx(TEXT_LINES, i));
230        }
231
232        // Line 2
233        for i in 59..88 {
234            assert_eq!(2, from_byte_idx(TEXT_LINES, i));
235        }
236
237        // Line 3
238        for i in 88..125 {
239            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
240        }
241
242        // Past the end
243        for i in 125..130 {
244            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
245        }
246    }
247
248    #[test]
249    fn to_byte_idx_01() {
250        let text = "Here\r\nare\r\nsome\r\nwords";
251        assert_eq!(0, to_byte_idx(text, 0));
252        assert_eq!(6, to_byte_idx(text, 1));
253        assert_eq!(11, to_byte_idx(text, 2));
254        assert_eq!(17, to_byte_idx(text, 3));
255    }
256
257    #[test]
258    fn to_byte_idx_02() {
259        let text = "\nHere\nare\nsome\nwords\n";
260        assert_eq!(0, to_byte_idx(text, 0));
261        assert_eq!(1, to_byte_idx(text, 1));
262        assert_eq!(6, to_byte_idx(text, 2));
263        assert_eq!(10, to_byte_idx(text, 3));
264        assert_eq!(15, to_byte_idx(text, 4));
265        assert_eq!(21, to_byte_idx(text, 5));
266    }
267
268    #[test]
269    fn to_byte_idx_03() {
270        assert_eq!(0, to_byte_idx(TEXT_LINES, 0));
271        assert_eq!(32, to_byte_idx(TEXT_LINES, 1));
272        assert_eq!(59, to_byte_idx(TEXT_LINES, 2));
273        assert_eq!(88, to_byte_idx(TEXT_LINES, 3));
274
275        // Past end
276        assert_eq!(124, to_byte_idx(TEXT_LINES, 4));
277        assert_eq!(124, to_byte_idx(TEXT_LINES, 5));
278        assert_eq!(124, to_byte_idx(TEXT_LINES, 6));
279    }
280
281    #[test]
282    fn line_byte_round_trip() {
283        let text = "\nHere\nare\nsome\nwords\n";
284        assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6)));
285        assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2)));
286
287        assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0)));
288        assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0)));
289
290        assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21)));
291        assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5)));
292    }
293}