// similar/text/abstraction.rs

1use std::borrow::Cow;
2use std::hash::Hash;
3use std::ops::Range;
4
/// Reference to a [`DiffableStr`].
///
/// This type exists because while the library only really provides ways to
/// work with `&str` and `&[u8]` there are types that deref into those string
/// slices such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
///
/// Requires the `text` feature.
pub trait DiffableStrRef {
    /// The type of the resolved [`DiffableStr`].
    type Output: DiffableStr + ?Sized;

    /// Resolves the reference.
    ///
    /// Returns a borrow of the underlying unsized string type so owned and
    /// borrowed strings can be diffed through a single interface.
    fn as_diffable_str(&self) -> &Self::Output;
}
22
// Blanket impl: anything that already is a `DiffableStr` resolves to
// itself, so plain `&str` / `&[u8]` values can be passed in directly.
impl<T: DiffableStr + ?Sized> DiffableStrRef for T {
    type Output = T;

    fn as_diffable_str(&self) -> &T {
        self
    }
}
30
31impl DiffableStrRef for String {
32    type Output = str;
33
34    fn as_diffable_str(&self) -> &str {
35        self.as_str()
36    }
37}
38
39impl<T: DiffableStr + ?Sized> DiffableStrRef for Cow<'_, T> {
40    type Output = T;
41
42    fn as_diffable_str(&self) -> &T {
43        self
44    }
45}
46
/// All supported diffable strings.
///
/// The text module can work with different types of strings depending
/// on how the crate is compiled.  Out of the box `&str` is always supported
/// but with the `bytes` feature one can also work with `[u8]` slices for
/// as long as they are ASCII compatible.
///
/// Requires the `text` feature.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits the value into lines with the line terminators attached.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits the value into alternating line and newline tokens, with the
    /// newline sequences emitted as separate tokens.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Tokenizes into words.
    ///
    /// Runs of whitespace are kept as tokens of their own.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Tokenizes the input into characters.
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Tokenizes into unicode words.
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Tokenizes into unicode graphemes.
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as a `&str` if (and only if) it is valid UTF-8.
    fn as_str(&self) -> Option<&str>;

    /// Decodes the string (potentially) lossy.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Checks if the string ends in a newline (`\r` or `\n`).
    fn ends_with_newline(&self) -> bool;

    /// The length of the string in bytes.
    fn len(&self) -> usize;

    /// Slices the string by byte range.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the string as slice of raw bytes.
    fn as_bytes(&self) -> &[u8];

    /// Checks if the string is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
99
100impl DiffableStr for str {
101    fn tokenize_lines(&self) -> Vec<&Self> {
102        let mut iter = self.char_indices().peekable();
103        let mut last_pos = 0;
104        let mut lines = vec![];
105
106        while let Some((idx, c)) = iter.next() {
107            if c == '\r' {
108                if iter.peek().map_or(false, |x| x.1 == '\n') {
109                    lines.push(&self[last_pos..=idx + 1]);
110                    iter.next();
111                    last_pos = idx + 2;
112                } else {
113                    lines.push(&self[last_pos..=idx]);
114                    last_pos = idx + 1;
115                }
116            } else if c == '\n' {
117                lines.push(&self[last_pos..=idx]);
118                last_pos = idx + 1;
119            }
120        }
121
122        if last_pos < self.len() {
123            lines.push(&self[last_pos..]);
124        }
125
126        lines
127    }
128
129    fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
130        let mut rv = vec![];
131        let mut iter = self.char_indices().peekable();
132
133        while let Some((idx, c)) = iter.next() {
134            let is_newline = c == '\r' || c == '\n';
135            let start = idx;
136            let mut end = idx + c.len_utf8();
137            while let Some(&(_, next_char)) = iter.peek() {
138                if (next_char == '\r' || next_char == '\n') != is_newline {
139                    break;
140                }
141                iter.next();
142                end += next_char.len_utf8();
143            }
144            rv.push(&self[start..end]);
145        }
146
147        rv
148    }
149
150    fn tokenize_words(&self) -> Vec<&Self> {
151        let mut iter = self.char_indices().peekable();
152        let mut rv = vec![];
153
154        while let Some((idx, c)) = iter.next() {
155            let is_whitespace = c.is_whitespace();
156            let start = idx;
157            let mut end = idx + c.len_utf8();
158            while let Some(&(_, next_char)) = iter.peek() {
159                if next_char.is_whitespace() != is_whitespace {
160                    break;
161                }
162                iter.next();
163                end += next_char.len_utf8();
164            }
165            rv.push(&self[start..end]);
166        }
167
168        rv
169    }
170
171    fn tokenize_chars(&self) -> Vec<&Self> {
172        self.char_indices()
173            .map(move |(i, c)| &self[i..i + c.len_utf8()])
174            .collect()
175    }
176
177    #[cfg(feature = "unicode")]
178    fn tokenize_unicode_words(&self) -> Vec<&Self> {
179        unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect()
180    }
181
182    #[cfg(feature = "unicode")]
183    fn tokenize_graphemes(&self) -> Vec<&Self> {
184        unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect()
185    }
186
187    fn as_str(&self) -> Option<&str> {
188        Some(self)
189    }
190
191    fn to_string_lossy(&self) -> Cow<'_, str> {
192        Cow::Borrowed(self)
193    }
194
195    fn ends_with_newline(&self) -> bool {
196        self.ends_with(&['\r', '\n'][..])
197    }
198
199    fn len(&self) -> usize {
200        str::len(self)
201    }
202
203    fn slice(&self, rng: Range<usize>) -> &Self {
204        &self[rng]
205    }
206
207    fn as_bytes(&self) -> &[u8] {
208        str::as_bytes(self)
209    }
210}
211
#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        /// Resolves the owned byte vector into a borrowed byte slice.
        fn as_diffable_str(&self) -> &[u8] {
            self.as_slice()
        }
    }

    /// Allows viewing ASCII compatible byte slices as strings.
    ///
    /// Requires the `bytes` feature.
    impl DiffableStr for [u8] {
        fn tokenize_lines(&self) -> Vec<&Self> {
            // bstr's `char_indices` yields `(start, end, char)` triples;
            // `end` is the byte offset just past the current char, so it is
            // used directly as an exclusive slice bound below.
            let mut iter = self.char_indices().peekable();
            let mut last_pos = 0;
            let mut lines = vec![];

            while let Some((_, end, c)) = iter.next() {
                if c == '\r' {
                    // Keep a `\r\n` pair together as a single terminator;
                    // the `+ 1` extends the slice over the peeked `\n`.
                    if iter.peek().map_or(false, |x| x.2 == '\n') {
                        lines.push(&self[last_pos..end + 1]);
                        iter.next();
                        last_pos = end + 1;
                    } else {
                        lines.push(&self[last_pos..end]);
                        last_pos = end;
                    }
                } else if c == '\n' {
                    lines.push(&self[last_pos..end]);
                    last_pos = end;
                }
            }

            // Trailing bytes without a final newline still form a line.
            if last_pos < self.len() {
                lines.push(&self[last_pos..]);
            }

            lines
        }

        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            // Group consecutive chars by whether they are newline chars,
            // emitting each run (line content or newline sequence) as a token.
            let mut rv = vec![];
            let mut iter = self.char_indices().peekable();

            while let Some((start, mut end, c)) = iter.next() {
                let is_newline = c == '\r' || c == '\n';
                // Extend the token while the category stays the same.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if (next_char == '\r' || next_char == '\n') != is_newline {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        fn tokenize_words(&self) -> Vec<&Self> {
            // Group consecutive chars by whitespace-ness, mirroring the
            // `str` implementation of `tokenize_words`.
            let mut iter = self.char_indices().peekable();
            let mut rv = vec![];

            while let Some((start, mut end, c)) = iter.next() {
                let is_whitespace = c.is_whitespace();
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if next_char.is_whitespace() != is_whitespace {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            // bstr segments the bytes; map each segment back to its
            // underlying byte slice.
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        fn tokenize_chars(&self) -> Vec<&Self> {
            self.char_indices()
                .map(move |(start, end, _)| &self[start..end])
                .collect()
        }

        fn as_str(&self) -> Option<&str> {
            // Non-lossy: only succeeds when the bytes are valid UTF-8.
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            self
        }
    }
}
337
#[test]
fn test_split_lines() {
    // Lines keep their terminators; `\r\n`, lone `\r` and `\n` all end a line.
    assert_eq!(
        "first\nsecond\rthird\r\nfourth\nlast".tokenize_lines(),
        vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
    );
    assert_eq!("\n\n".tokenize_lines(), vec!["\n", "\n"]);
    assert_eq!("\n".tokenize_lines(), vec!["\n"]);
    assert!("".tokenize_lines().is_empty());
}
348
#[test]
fn test_split_words() {
    // Whitespace runs and non-whitespace runs alternate as separate tokens.
    assert_eq!(
        "foo    bar baz\n\n  aha".tokenize_words(),
        ["foo", "    ", "bar", " ", "baz", "\n\n  ", "aha"]
    );
}
356
#[test]
fn test_split_chars() {
    // Char tokenization splits the emoji into its base char and the
    // variation selector.
    assert_eq!(
        "abcfö❄️".tokenize_chars(),
        vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
    );
}
364
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // Grapheme tokenization keeps the emoji and its variation selector
    // together as one cluster.
    assert_eq!(
        "abcfö❄️".tokenize_graphemes(),
        vec!["a", "b", "c", "f", "ö", "❄️"]
    );
}
373
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // Byte slices follow the same line splitting rules as `str`.
    assert_eq!(
        "first\nsecond\rthird\r\nfourth\nlast"
            .as_bytes()
            .tokenize_lines(),
        vec![
            &b"first\n"[..],
            &b"second\r"[..],
            &b"third\r\n"[..],
            &b"fourth\n"[..],
            &b"last"[..]
        ]
    );
    assert_eq!(
        "\n\n".as_bytes().tokenize_lines(),
        vec![&b"\n"[..], &b"\n"[..]]
    );
    assert_eq!("\n".as_bytes().tokenize_lines(), vec![&b"\n"[..]]);
    assert!("".as_bytes().tokenize_lines().is_empty());
}
397
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    // Whitespace grouping works on bytes exactly like the `str` version.
    assert_eq!(
        "foo    bar baz\n\n  aha".as_bytes().tokenize_words(),
        [
            &b"foo"[..],
            &b"    "[..],
            &b"bar"[..],
            &b" "[..],
            &b"baz"[..],
            &b"\n\n  "[..],
            &b"aha"[..]
        ]
    );
}
414
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    // Multi-byte chars stay intact; the emoji splits from its variation
    // selector just like in the `str` test.
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_chars(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄".as_bytes(),
            "\u{fe0f}".as_bytes()
        ]
    );
}
431
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    // Grapheme segmentation on bytes keeps the full emoji cluster together.
    assert_eq!(
        "abcfö❄️".as_bytes().tokenize_graphemes(),
        vec![
            &b"a"[..],
            &b"b"[..],
            &b"c"[..],
            &b"f"[..],
            "ö".as_bytes(),
            "❄️".as_bytes()
        ]
    );
}