ropey/
crlf.rs

/// Reports whether `text` may be split at byte offset `byte_idx`.
///
/// A position is splittable when the byte at `byte_idx` is not a UTF-8
/// continuation byte (so the position is a code point boundary) and
/// the position does not fall between the `\r` and `\n` of a CRLF
/// pair.  The start (`0`) and end (`text.len()`) of `text` are always
/// splittable.
///
/// `byte_idx` must not exceed `text.len()`; this is checked with a
/// `debug_assert!`.
#[inline]
pub fn is_break(byte_idx: usize, text: &[u8]) -> bool {
    debug_assert!(byte_idx <= text.len());

    // Both edges of the text are unconditionally valid.
    if byte_idx == 0 || byte_idx == text.len() {
        return true;
    }

    // UTF-8 continuation bytes have the bit pattern `10xxxxxx`; splitting
    // in front of one would cut a code point in half.
    let here = text[byte_idx];
    if (here & 0b1100_0000) == 0b1000_0000 {
        return false;
    }

    // The only remaining invalid position is between a CR and its LF.
    !(text[byte_idx - 1] == b'\r' && here == b'\n')
}
15
/// Reports whether two adjacent chunks may be kept apart at the point
/// where the end of `left` meets the start of `right`.
///
/// The seam is a valid splitting point when the first byte of `right`
/// is not a UTF-8 continuation byte and the seam does not separate a
/// `\r` (last byte of `left`) from a `\n` (first byte of `right`).
///
/// Both slices must be non-empty; this is checked with a
/// `debug_assert!`.
#[inline]
pub fn seam_is_break(left: &[u8], right: &[u8]) -> bool {
    debug_assert!(!left.is_empty() && !right.is_empty());

    // A `10xxxxxx` byte at the head of `right` means the seam would cut a
    // multi-byte code point in half.
    let head = right[0];
    if (head & 0b1100_0000) == 0b1000_0000 {
        return false;
    }

    // Otherwise the seam is only invalid when it sits inside a CRLF pair.
    let tail = left[left.len() - 1];
    !(tail == b'\r' && head == b'\n')
}
25
26/// Returns the segment break before (but not including) the given byte
27/// boundary.
28///
29/// This will return back the passed byte boundary if it is at the start
30/// of the string.
31#[inline]
32pub fn prev_break(byte_idx: usize, text: &[u8]) -> usize {
33    // Bounds check
34    debug_assert!(byte_idx <= text.len());
35
36    if byte_idx == 0 {
37        0
38    } else {
39        let mut boundary_idx = byte_idx - 1;
40        while !is_break(boundary_idx, text) {
41            boundary_idx -= 1;
42        }
43        boundary_idx
44    }
45}
46
47/// Returns the segment break after (but not including) the given byte
48/// boundary.
49///
50/// This will return back the passed byte boundary if it is at the end of
51/// the string.
52#[inline]
53pub fn next_break(byte_idx: usize, text: &[u8]) -> usize {
54    // Bounds check
55    debug_assert!(byte_idx <= text.len());
56
57    if byte_idx == text.len() {
58        text.len()
59    } else {
60        let mut boundary_idx = byte_idx + 1;
61        while !is_break(boundary_idx, text) {
62            boundary_idx += 1;
63        }
64        boundary_idx
65    }
66}
67
68/// Finds the segment break nearest to the given byte that is not the
69/// left or right edge of the text.
70///
71/// There is only one circumstance where the left or right edge will be
72/// returned: if the entire text is a single unbroken segment, then the
73/// right edge of the text is returned.
74#[inline]
75pub fn nearest_internal_break(byte_idx: usize, text: &[u8]) -> usize {
76    // Bounds check
77    debug_assert!(byte_idx <= text.len());
78
79    // Find the two nearest segment boundaries
80    let left = if is_break(byte_idx, text) && byte_idx != text.len() {
81        byte_idx
82    } else {
83        prev_break(byte_idx, text)
84    };
85    let right = next_break(byte_idx, text);
86
87    // Otherwise, return the closest of left and right that isn't the
88    // start or end of the string
89    if left == 0 || (right != text.len() && (byte_idx - left) >= (right - byte_idx)) {
90        return right;
91    } else {
92        return left;
93    }
94}
95
96#[inline]
97pub fn find_good_split(byte_idx: usize, text: &[u8], bias_left: bool) -> usize {
98    // Bounds check
99    debug_assert!(byte_idx <= text.len());
100
101    if is_break(byte_idx, text) {
102        byte_idx
103    } else {
104        let prev = prev_break(byte_idx, text);
105        let next = next_break(byte_idx, text);
106        if bias_left {
107            if prev > 0 {
108                prev
109            } else {
110                next
111            }
112        } else {
113            #[allow(clippy::collapsible_if)] // More readable this way
114            if next < text.len() {
115                next
116            } else {
117                prev
118            }
119        }
120    }
121}
122
123//===========================================================================
124
#[cfg(test)]
mod tests {
    use super::*;

    // `is_break` around a single CRLF pair embedded in ASCII text.
    #[test]
    fn crlf_segmenter_01() {
        let text = b"Hello world!\r\nHow's it going?";

        assert!(is_break(0, b""));
        assert!(is_break(0, text));
        assert!(is_break(12, text));
        assert!(!is_break(13, text));
        assert!(is_break(14, text));
        assert!(is_break(19, text));
    }

    // `seam_is_break` when a CRLF pair straddles (or does not straddle)
    // the seam between two chunks.
    #[test]
    fn crlf_segmenter_02() {
        let l = b"Hello world!\r";
        let r = b"\nHow's it going?";

        assert!(!seam_is_break(l, r));
        assert!(!seam_is_break(l, b"\n"));
        assert!(!seam_is_break(b"\r", r));
        assert!(!seam_is_break(b"\r", b"\n"));
        assert!(seam_is_break(r, l));
        assert!(seam_is_break(b"\n", b"\r"));
    }

    // Plain ASCII: the edges are never returned when an interior break
    // exists.
    #[test]
    fn nearest_internal_break_01() {
        let text = b"Hello world!";
        assert_eq!(1, nearest_internal_break(0, text));
        assert_eq!(6, nearest_internal_break(6, text));
        assert_eq!(11, nearest_internal_break(12, text));
    }

    // A CRLF pair in the middle: a position inside the pair resolves to
    // the break on its right.
    #[test]
    fn nearest_internal_break_02() {
        let text = b"Hello\r\n world!";
        assert_eq!(5, nearest_internal_break(5, text));
        assert_eq!(7, nearest_internal_break(6, text));
        assert_eq!(7, nearest_internal_break(7, text));
    }

    // CRLF pairs at both ends of the text.
    #[test]
    fn nearest_internal_break_03() {
        let text = b"\r\nHello world!\r\n";
        assert_eq!(2, nearest_internal_break(0, text));
        assert_eq!(2, nearest_internal_break(1, text));
        assert_eq!(2, nearest_internal_break(2, text));
        assert_eq!(14, nearest_internal_break(14, text));
        assert_eq!(14, nearest_internal_break(15, text));
        assert_eq!(14, nearest_internal_break(16, text));
    }

    // The whole text is one unbroken segment: the right edge is returned.
    #[test]
    fn nearest_internal_break_04() {
        let text = b"\r\n";
        assert_eq!(2, nearest_internal_break(0, text));
        assert_eq!(2, nearest_internal_break(1, text));
        assert_eq!(2, nearest_internal_break(2, text));
    }

    #[test]
    fn is_break_01() {
        let text = b"\n\r\n\r\n\r\n\r\n\r\n\r";

        assert!(is_break(0, text));
        assert!(is_break(12, text));
        assert!(is_break(3, text));
        assert!(!is_break(6, text));
    }

    // Positions in front of UTF-8 continuation bytes are not breaks.
    #[test]
    fn is_break_utf8_01() {
        // 'a', a two-byte code point (0xC3 0xA9), CR, LF, 'b'
        let text = b"a\xC3\xA9\r\nb";

        assert!(is_break(0, text));
        assert!(is_break(1, text));
        assert!(!is_break(2, text));
        assert!(is_break(3, text));
        assert!(!is_break(4, text));
        assert!(is_break(5, text));
        assert!(is_break(6, text));
    }

    #[test]
    fn seam_is_break_01() {
        let text1 = b"\r\n\r\n\r\n";
        let text2 = b"\r\n\r\n";

        assert!(seam_is_break(text1, text2));
    }

    #[test]
    fn seam_is_break_02() {
        let text1 = b"\r\n\r\n\r";
        let text2 = b"\n\r\n\r\n";

        assert!(!seam_is_break(text1, text2));
    }

    // A seam that cuts a multi-byte code point in half is not a break.
    #[test]
    fn seam_is_break_utf8_01() {
        let text = b"\xC3\xA9";

        assert!(!seam_is_break(&text[..1], &text[1..]));
        assert!(seam_is_break(text, text));
    }

    #[test]
    fn prev_break_01() {
        let text = b"Hello\r\n world!";
        assert_eq!(0, prev_break(0, text));
        assert_eq!(0, prev_break(1, text));
        assert_eq!(4, prev_break(5, text));
        assert_eq!(5, prev_break(6, text));
        assert_eq!(5, prev_break(7, text));
        assert_eq!(7, prev_break(8, text));
        assert_eq!(13, prev_break(14, text));
    }

    #[test]
    fn prev_break_02() {
        let text = b"a\xC3\xA9\r\nb";
        assert_eq!(1, prev_break(2, text));
        assert_eq!(1, prev_break(3, text));
        assert_eq!(3, prev_break(5, text));

        assert_eq!(0, prev_break(0, b""));
        assert_eq!(0, prev_break(2, b"\r\n"));
    }

    #[test]
    fn next_break_01() {
        let text = b"Hello\r\n world!";
        assert_eq!(1, next_break(0, text));
        assert_eq!(5, next_break(4, text));
        assert_eq!(7, next_break(5, text));
        assert_eq!(7, next_break(6, text));
        assert_eq!(8, next_break(7, text));
        assert_eq!(14, next_break(13, text));
        assert_eq!(14, next_break(14, text));
    }

    #[test]
    fn next_break_02() {
        let text = b"a\xC3\xA9\r\nb";
        assert_eq!(3, next_break(1, text));
        assert_eq!(5, next_break(3, text));
        assert_eq!(6, next_break(5, text));

        assert_eq!(0, next_break(0, b""));
        assert_eq!(2, next_break(0, b"\r\n"));
    }

    // A position that already is a break is returned unchanged; a position
    // inside a CRLF pair moves according to the bias.
    #[test]
    fn find_good_split_01() {
        let text = b"Hello\r\n world!";
        assert_eq!(5, find_good_split(5, text, true));
        assert_eq!(5, find_good_split(5, text, false));
        assert_eq!(5, find_good_split(6, text, true));
        assert_eq!(7, find_good_split(6, text, false));
    }

    // When the preferred side is an edge of the text, the other side is
    // used instead.
    #[test]
    fn find_good_split_02() {
        assert_eq!(2, find_good_split(1, b"\r\n", true));
        assert_eq!(0, find_good_split(1, b"\r\n", false));

        assert_eq!(2, find_good_split(1, b"\r\nabc", true));
        assert_eq!(2, find_good_split(1, b"\r\nabc", false));
        assert_eq!(3, find_good_split(4, b"abc\r\n", true));
        assert_eq!(3, find_good_split(4, b"abc\r\n", false));
    }

    // Positions inside a multi-byte code point are moved to a code point
    // boundary.
    #[test]
    fn find_good_split_03() {
        let text = b"a\xC3\xA9\r\nb";
        assert_eq!(1, find_good_split(2, text, true));
        assert_eq!(3, find_good_split(2, text, false));
        assert_eq!(3, find_good_split(4, text, true));
        assert_eq!(5, find_good_split(4, text, false));
    }
}