ropey/
str_utils.rs

1//! Utility functions for utf8 string slices.
2//!
3//! This module provides various utility functions that operate on string
4//! slices in ways compatible with Ropey.  They may be useful when building
5//! additional functionality on top of Ropey.
6
7pub(crate) use str_indices::chars::count as count_chars;
8pub use str_indices::chars::from_byte_idx as byte_to_char_idx;
9pub use str_indices::chars::to_byte_idx as char_to_byte_idx;
10pub(crate) use str_indices::utf16::count_surrogates as count_utf16_surrogates;
11
12// Determine which line implementation to use.
13#[cfg(feature = "unicode_lines")]
14use str_indices::lines;
15#[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
16use str_indices::lines_crlf as lines;
17#[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
18use str_indices::lines_lf as lines;
19
20pub(crate) use self::lines::count_breaks as count_line_breaks;
21pub use self::lines::from_byte_idx as byte_to_line_idx;
22pub use self::lines::to_byte_idx as line_to_byte_idx;
23
24/// Converts from char-index to line-index in a string slice.
25///
26/// This is equivalent to counting the line endings before the given char.
27///
28/// Any past-the-end index will return the last line index.
29///
30/// Runs in O(N) time.
31#[inline]
32pub fn char_to_line_idx(text: &str, char_idx: usize) -> usize {
33    lines::from_byte_idx(text, str_indices::chars::to_byte_idx(text, char_idx))
34}
35
36/// Converts from line-index to char-index in a string slice.
37///
38/// More specifically, this returns the index of the first char of the given line.
39///
40/// Any past-the-end index will return the one-past-the-end char index.
41///
42/// Runs in O(N) time.
43#[inline]
44pub fn line_to_char_idx(text: &str, line_idx: usize) -> usize {
45    str_indices::chars::from_byte_idx(text, lines::to_byte_idx(text, line_idx))
46}
47
48//-------------------------------------------------------------
49
50pub(crate) fn byte_to_utf16_surrogate_idx(text: &str, byte_idx: usize) -> usize {
51    let mut i = byte_idx;
52    while !text.is_char_boundary(i) {
53        i -= 1;
54    }
55    str_indices::utf16::count_surrogates(&text[..i])
56}
57
58pub(crate) fn utf16_code_unit_to_char_idx(text: &str, utf16_idx: usize) -> usize {
59    str_indices::chars::from_byte_idx(text, str_indices::utf16::to_byte_idx(text, utf16_idx))
60}
61
/// Finds the byte index at which the last line of `text` begins.
///
/// The last line begins right after the final line break in `text`, or at
/// index 0 when `text` contains no line break.  Note that text ending in a
/// line break therefore has an empty last line that starts at the end of the
/// text.
///
/// Which byte sequences count as line breaks depends on the `cr_lines` and
/// `unicode_lines` features; a line feed is always recognized.
pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
    /// Reports whether `prefix` ends with a complete line break sequence.
    fn ends_in_line_break(prefix: &[u8]) -> bool {
        // Line feed.
        if prefix.ends_with(b"\n") {
            return true;
        }

        // Carriage return.
        #[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
        if prefix.ends_with(b"\r") {
            return true;
        }

        // Vertical tab, form feed, and the UTF-8 encodings of U+0085,
        // U+2028 and U+2029.
        #[cfg(feature = "unicode_lines")]
        if prefix.ends_with(&[0x0B])
            || prefix.ends_with(&[0x0C])
            || prefix.ends_with(&[0xC2, 0x85])
            || prefix.ends_with(&[0xE2, 0x80, 0xA8])
            || prefix.ends_with(&[0xE2, 0x80, 0xA9])
        {
            return true;
        }

        false
    }

    let bytes = text.as_bytes();

    // Try every possible end position from the back.  The first prefix that
    // ends in a line break ends exactly where the last line starts.
    (1..=bytes.len())
        .rev()
        .find(|&end| ends_in_line_break(&bytes[..end]))
        .unwrap_or(0)
}
104
/// Trims a single trailing line break (if any) off the end of the passed string.
///
/// If the string doesn't end in a line break, returns the string unchanged.
///
/// A line feed is always treated as a line break.  When carriage returns are
/// recognized (either the `cr_lines` or the `unicode_lines` feature is
/// enabled), a lone CR is trimmed as well and a trailing CRLF pair is trimmed
/// as a single line break.  With `unicode_lines`, the remaining Unicode line
/// breaks (U+000B, U+000C, U+0085, U+2028, U+2029) are trimmed too.
#[inline]
pub(crate) fn trim_line_break(text: &str) -> &str {
    // CRLF is one line break, so it has to be checked before a lone LF.
    //
    // The CR handling is enabled by `unicode_lines` as well as `cr_lines`,
    // matching how the other line break helpers in this module are gated.
    #[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
    if let Some(trimmed) = text.strip_suffix("\r\n") {
        return trimmed;
    }

    if let Some(trimmed) = text.strip_suffix('\n') {
        return trimmed;
    }

    #[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
    if let Some(trimmed) = text.strip_suffix('\r') {
        return trimmed;
    }

    // The fancy unicode line breaks.
    #[cfg(feature = "unicode_lines")]
    if let Some(trimmed) = text.strip_suffix(|c: char| {
        matches!(
            c,
            '\u{000B}' | '\u{000C}' | '\u{0085}' | '\u{2028}' | '\u{2029}'
        )
    }) {
        return trimmed;
    }

    text
}
147
/// Reports whether `text` ends in a line break.
///
/// Only the last codepoint of `text` is examined, and empty text never ends
/// in a line break.  The set of codepoints treated as line breaks depends on
/// the `cr_lines` and `unicode_lines` features.
#[inline]
pub(crate) fn ends_with_line_break(text: &str) -> bool {
    // Exactly one of the `Some` arms below is compiled in.
    match text.chars().next_back() {
        None => false,

        #[cfg(feature = "unicode_lines")]
        Some(last) => matches!(
            last,
            '\u{000A}'
                | '\u{000B}'
                | '\u{000C}'
                | '\u{000D}'
                | '\u{0085}'
                | '\u{2028}'
                | '\u{2029}'
        ),

        #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
        Some(last) => matches!(last, '\u{000A}' | '\u{000D}'),

        #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
        Some(last) => last == '\u{000A}',
    }
}
174
175//======================================================================
176
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample text containing one of every line break that any feature
    /// configuration recognizes.  It is 48 bytes (37 chars) long.
    const MIXED_BREAKS: &str = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \
                                There\u{2028}is something.\u{2029}";

    /// Repeatedly trims the trailing line break off `text` and then cuts it
    /// down to the start of its last line, `steps` times.  Returns the byte
    /// length of the remaining text after each step.
    fn peeled_lengths(text: &str, steps: usize) -> Vec<usize> {
        let mut remaining = text;
        let mut lengths = Vec::with_capacity(steps);
        for _ in 0..steps {
            remaining = &remaining[..last_line_start_byte_idx(trim_line_break(remaining))];
            lengths.push(remaining.len());
        }
        lengths
    }

    /// The line breaks that `ends_with_line_break()` is expected to recognize
    /// under the active feature configuration.
    fn recognized_breaks() -> Vec<&'static str> {
        // `mut` is only needed when one of the features below is enabled.
        #[allow(unused_mut)]
        let mut breaks = vec!["\n"];

        #[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
        breaks.push("\r");

        #[cfg(feature = "unicode_lines")]
        breaks.extend_from_slice(&[
            "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0085}", "\u{2028}", "\u{2029}",
        ]);

        breaks
    }

    #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
    #[test]
    fn last_line_start_byte_idx_lf_01() {
        // (input, expected start of last line)
        let cases = [
            ("", 0),
            ("Hi", 0),
            ("Hi\u{000A}there.", 3),
            ("Hi\u{000B}there.", 0),
            ("Hi\u{000C}there.", 0),
            ("Hi\u{000D}there.", 0),
            ("Hi\u{0085}there.", 0),
            ("Hi\u{2028}there.", 0),
            ("Hi\u{2029}there.", 0),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, last_line_start_byte_idx(input), "input: {:?}", input);
        }
    }

    #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
    #[test]
    fn last_line_start_byte_idx_lf_02() {
        assert_eq!(48, MIXED_BREAKS.len());
        assert_eq!(vec![8, 1, 0], peeled_lengths(MIXED_BREAKS, 3));
    }

    #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
    #[test]
    fn last_line_start_byte_idx_crlf_01() {
        // (input, expected start of last line)
        let cases = [
            ("", 0),
            ("Hi", 0),
            ("Hi\u{000A}there.", 3),
            ("Hi\u{000B}there.", 0),
            ("Hi\u{000C}there.", 0),
            ("Hi\u{000D}there.", 3),
            ("Hi\u{0085}there.", 0),
            ("Hi\u{2028}there.", 0),
            ("Hi\u{2029}there.", 0),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, last_line_start_byte_idx(input), "input: {:?}", input);
        }
    }

    #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
    #[test]
    fn last_line_start_byte_idx_crlf_02() {
        assert_eq!(48, MIXED_BREAKS.len());
        assert_eq!(vec![9, 8, 1, 0], peeled_lengths(MIXED_BREAKS, 4));
    }

    #[cfg(feature = "unicode_lines")]
    #[test]
    fn last_line_start_byte_idx_unicode_01() {
        // (input, expected start of last line)
        let cases = [
            ("", 0),
            ("Hi", 0),
            ("Hi\u{000A}there.", 3),
            ("Hi\u{000B}there.", 3),
            ("Hi\u{000C}there.", 3),
            ("Hi\u{000D}there.", 3),
            ("Hi\u{0085}there.", 4),
            ("Hi\u{2028}there.", 5),
            ("Hi\u{2029}there.", 5),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, last_line_start_byte_idx(input), "input: {:?}", input);
        }
    }

    #[cfg(feature = "unicode_lines")]
    #[test]
    fn last_line_start_byte_idx_unicode_02() {
        assert_eq!(48, MIXED_BREAKS.len());
        assert_eq!(
            vec![32, 22, 17, 13, 9, 8, 1, 0],
            peeled_lengths(MIXED_BREAKS, 8)
        );
    }

    #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
    #[test]
    fn trim_line_break_lf_01() {
        // (input, expected result of trimming)
        let cases = [
            ("", ""),
            ("Hi", "Hi"),
            ("Hi\u{000A}", "Hi"),
            ("Hi\u{000B}", "Hi\u{000B}"),
            ("Hi\u{000C}", "Hi\u{000C}"),
            ("Hi\u{000D}", "Hi\u{000D}"),
            ("Hi\u{0085}", "Hi\u{0085}"),
            ("Hi\u{2028}", "Hi\u{2028}"),
            ("Hi\u{2029}", "Hi\u{2029}"),
            ("\r\n", "\r"),
            ("Hi\r\n", "Hi\r"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, trim_line_break(input), "input: {:?}", input);
        }
    }

    #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
    #[test]
    fn trim_line_break_crlf_01() {
        // (input, expected result of trimming)
        let cases = [
            ("", ""),
            ("Hi", "Hi"),
            ("Hi\u{000A}", "Hi"),
            ("Hi\u{000B}", "Hi\u{000B}"),
            ("Hi\u{000C}", "Hi\u{000C}"),
            ("Hi\u{000D}", "Hi"),
            ("Hi\u{0085}", "Hi\u{0085}"),
            ("Hi\u{2028}", "Hi\u{2028}"),
            ("Hi\u{2029}", "Hi\u{2029}"),
            ("\r\n", ""),
            ("Hi\r\n", "Hi"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, trim_line_break(input), "input: {:?}", input);
        }
    }

    #[cfg(feature = "unicode_lines")]
    #[test]
    fn trim_line_break_unicode_01() {
        // (input, expected result of trimming)
        let cases = [
            ("", ""),
            ("Hi", "Hi"),
            ("Hi\u{000A}", "Hi"),
            ("Hi\u{000B}", "Hi"),
            ("Hi\u{000C}", "Hi"),
            ("Hi\u{000D}", "Hi"),
            ("Hi\u{0085}", "Hi"),
            ("Hi\u{2028}", "Hi"),
            ("Hi\u{2029}", "Hi"),
            ("\r\n", ""),
            ("Hi\r\n", "Hi"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, trim_line_break(input), "input: {:?}", input);
        }
    }

    #[test]
    fn ends_with_line_break_01() {
        // A text consisting of nothing but a line break.
        for line_break in recognized_breaks() {
            assert!(ends_with_line_break(line_break), "input: {:?}", line_break);
        }
    }

    #[test]
    fn ends_with_line_break_02() {
        // A line break preceded by other text.
        for line_break in recognized_breaks() {
            let input = format!("Hi there!{}", line_break);
            assert!(ends_with_line_break(&input), "input: {:?}", input);
        }
    }

    #[test]
    fn ends_with_line_break_03() {
        // No line break anywhere.
        for &input in ["", "a", "Hi there!"].iter() {
            assert!(!ends_with_line_break(input), "input: {:?}", input);
        }
    }

    #[test]
    fn ends_with_line_break_04() {
        // A line break that is not at the end must not count.
        let inputs = [
            "\na",
            "\ra",
            "\u{000A}a",
            "\u{000B}a",
            "\u{000C}a",
            "\u{000D}a",
            "\u{0085}a",
            "\u{2028}a",
            "\u{2029}a",
        ];

        for &input in inputs.iter() {
            assert!(!ends_with_line_break(input), "input: {:?}", input);
        }
    }

    #[test]
    fn char_to_line_idx_01() {
        // (char index, expected line index)
        #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
        let cases: &[(usize, usize)] = &[(0, 0), (1, 1), (8, 2), (38, 2)];

        #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
        let cases: &[(usize, usize)] = &[(0, 0), (1, 1), (8, 2), (9, 3), (38, 3)];

        #[cfg(feature = "unicode_lines")]
        let cases: &[(usize, usize)] = &[
            (0, 0),
            (1, 1),
            (8, 2),
            (9, 3),
            (11, 4),
            (13, 5),
            (15, 6),
            (23, 7),
            (37, 8),
            (38, 8),
        ];

        for &(char_idx, expected_line) in cases.iter() {
            assert_eq!(
                expected_line,
                char_to_line_idx(MIXED_BREAKS, char_idx),
                "char_idx: {}",
                char_idx
            );
        }
    }

    #[test]
    fn line_to_char_idx_01() {
        // (line index, expected char index of the line start)
        #[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
        let cases: &[(usize, usize)] = &[(0, 0), (1, 1), (2, 8), (3, 37)];

        #[cfg(all(feature = "cr_lines", not(feature = "unicode_lines")))]
        let cases: &[(usize, usize)] = &[(0, 0), (1, 1), (2, 8), (3, 9), (4, 37)];

        #[cfg(feature = "unicode_lines")]
        let cases: &[(usize, usize)] = &[
            (0, 0),
            (1, 1),
            (2, 8),
            (3, 9),
            (4, 11),
            (5, 13),
            (6, 15),
            (7, 23),
            (8, 37),
            (9, 37),
        ];

        for &(line_idx, expected_char) in cases.iter() {
            assert_eq!(
                expected_char,
                line_to_char_idx(MIXED_BREAKS, line_idx),
                "line_idx: {}",
                line_idx
            );
        }
    }
}