1use std::borrow::Cow;
2use std::hash::Hash;
3use std::ops::Range;
4
/// Abstraction over values that can be borrowed as a [`DiffableStr`].
///
/// Lets diffing APIs accept both owned containers (`String`, and with the
/// `bytes` feature `Vec<u8>`) and unsized values (`str`, `[u8]`, `Cow<_>`)
/// by normalizing them to a borrowed [`DiffableStr`].
pub trait DiffableStrRef {
    /// The unsized string-like type this value resolves to.
    type Output: DiffableStr + ?Sized;

    /// Resolves the value into a borrowed [`DiffableStr`].
    fn as_diffable_str(&self) -> &Self::Output;
}
22
// Blanket impl: any `DiffableStr` (e.g. `str`, `[u8]`) is trivially a
// reference to itself.
impl<T: DiffableStr + ?Sized> DiffableStrRef for T {
    type Output = T;

    fn as_diffable_str(&self) -> &T {
        self
    }
}
30
// An owned `String` resolves to its borrowed `str` form.
impl DiffableStrRef for String {
    type Output = str;

    fn as_diffable_str(&self) -> &str {
        self.as_str()
    }
}
38
// A `Cow` resolves to its borrowed form; the bare `self` relies on
// `Cow`'s `Deref<Target = T>` coercion.
impl<T: DiffableStr + ?Sized> DiffableStrRef for Cow<'_, T> {
    type Output = T;

    fn as_diffable_str(&self) -> &T {
        self
    }
}
46
/// String-like types that can be tokenized for diffing.
///
/// Implemented in this file for `str` and, behind the `bytes` feature,
/// for `[u8]`. All offsets used by implementations are byte offsets.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
    /// Splits into lines; each `\r`, `\n` or `\r\n` terminator stays
    /// attached to the line it ends.
    fn tokenize_lines(&self) -> Vec<&Self>;

    /// Splits into alternating runs of newline and non-newline
    /// characters, so newline runs become tokens of their own.
    fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;

    /// Splits into alternating runs of whitespace and non-whitespace.
    fn tokenize_words(&self) -> Vec<&Self>;

    /// Splits into individual characters (one token per decoded char).
    fn tokenize_chars(&self) -> Vec<&Self>;

    /// Splits at Unicode word boundaries (requires the `unicode` feature).
    #[cfg(feature = "unicode")]
    fn tokenize_unicode_words(&self) -> Vec<&Self>;

    /// Splits into extended grapheme clusters (requires the `unicode` feature).
    #[cfg(feature = "unicode")]
    fn tokenize_graphemes(&self) -> Vec<&Self>;

    /// Returns the value as `&str` when it is valid UTF-8, else `None`.
    fn as_str(&self) -> Option<&str>;

    /// Converts to a string, lossily where the content is not UTF-8.
    fn to_string_lossy(&self) -> Cow<'_, str>;

    /// Whether the value ends in a `\r` or `\n`.
    fn ends_with_newline(&self) -> bool;

    /// Length in bytes.
    fn len(&self) -> usize;

    /// Returns the subslice covered by the byte range `rng`.
    fn slice(&self, rng: Range<usize>) -> &Self;

    /// Returns the underlying bytes.
    fn as_bytes(&self) -> &[u8];

    /// Whether the value is empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
99
100impl DiffableStr for str {
101 fn tokenize_lines(&self) -> Vec<&Self> {
102 let mut iter = self.char_indices().peekable();
103 let mut last_pos = 0;
104 let mut lines = vec![];
105
106 while let Some((idx, c)) = iter.next() {
107 if c == '\r' {
108 if iter.peek().map_or(false, |x| x.1 == '\n') {
109 lines.push(&self[last_pos..=idx + 1]);
110 iter.next();
111 last_pos = idx + 2;
112 } else {
113 lines.push(&self[last_pos..=idx]);
114 last_pos = idx + 1;
115 }
116 } else if c == '\n' {
117 lines.push(&self[last_pos..=idx]);
118 last_pos = idx + 1;
119 }
120 }
121
122 if last_pos < self.len() {
123 lines.push(&self[last_pos..]);
124 }
125
126 lines
127 }
128
129 fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
130 let mut rv = vec![];
131 let mut iter = self.char_indices().peekable();
132
133 while let Some((idx, c)) = iter.next() {
134 let is_newline = c == '\r' || c == '\n';
135 let start = idx;
136 let mut end = idx + c.len_utf8();
137 while let Some(&(_, next_char)) = iter.peek() {
138 if (next_char == '\r' || next_char == '\n') != is_newline {
139 break;
140 }
141 iter.next();
142 end += next_char.len_utf8();
143 }
144 rv.push(&self[start..end]);
145 }
146
147 rv
148 }
149
150 fn tokenize_words(&self) -> Vec<&Self> {
151 let mut iter = self.char_indices().peekable();
152 let mut rv = vec![];
153
154 while let Some((idx, c)) = iter.next() {
155 let is_whitespace = c.is_whitespace();
156 let start = idx;
157 let mut end = idx + c.len_utf8();
158 while let Some(&(_, next_char)) = iter.peek() {
159 if next_char.is_whitespace() != is_whitespace {
160 break;
161 }
162 iter.next();
163 end += next_char.len_utf8();
164 }
165 rv.push(&self[start..end]);
166 }
167
168 rv
169 }
170
171 fn tokenize_chars(&self) -> Vec<&Self> {
172 self.char_indices()
173 .map(move |(i, c)| &self[i..i + c.len_utf8()])
174 .collect()
175 }
176
177 #[cfg(feature = "unicode")]
178 fn tokenize_unicode_words(&self) -> Vec<&Self> {
179 unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect()
180 }
181
182 #[cfg(feature = "unicode")]
183 fn tokenize_graphemes(&self) -> Vec<&Self> {
184 unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect()
185 }
186
187 fn as_str(&self) -> Option<&str> {
188 Some(self)
189 }
190
191 fn to_string_lossy(&self) -> Cow<'_, str> {
192 Cow::Borrowed(self)
193 }
194
195 fn ends_with_newline(&self) -> bool {
196 self.ends_with(&['\r', '\n'][..])
197 }
198
199 fn len(&self) -> usize {
200 str::len(self)
201 }
202
203 fn slice(&self, rng: Range<usize>) -> &Self {
204 &self[rng]
205 }
206
207 fn as_bytes(&self) -> &[u8] {
208 str::as_bytes(self)
209 }
210}
211
#[cfg(feature = "bytes")]
mod bytes_support {
    use super::*;

    use bstr::ByteSlice;

    // Owned byte buffers resolve to the unsized `[u8]` implementation.
    impl DiffableStrRef for Vec<u8> {
        type Output = [u8];

        fn as_diffable_str(&self) -> &[u8] {
            self.as_slice()
        }
    }

    // Byte-slice counterpart of the `str` implementation. Tokenization
    // goes through bstr's `char_indices`, which yields `(start, end, char)`
    // byte-offset triples (invalid UTF-8 is decoded lossily — see bstr's
    // documentation), so token boundaries follow decoded characters.
    impl DiffableStr for [u8] {
        // Splits into lines, keeping each `\r`, `\n` or `\r\n`
        // terminator attached to its line.
        fn tokenize_lines(&self) -> Vec<&Self> {
            let mut iter = self.char_indices().peekable();
            let mut last_pos = 0;
            let mut lines = vec![];

            while let Some((_, end, c)) = iter.next() {
                if c == '\r' {
                    if iter.peek().map_or(false, |x| x.2 == '\n') {
                        // `end` is the exclusive end of the `\r`; `end + 1`
                        // also covers the one-byte `\n` that follows.
                        lines.push(&self[last_pos..end + 1]);
                        iter.next();
                        last_pos = end + 1;
                    } else {
                        lines.push(&self[last_pos..end]);
                        last_pos = end;
                    }
                } else if c == '\n' {
                    lines.push(&self[last_pos..end]);
                    last_pos = end;
                }
            }

            // Trailing bytes without a final newline still form a line.
            if last_pos < self.len() {
                lines.push(&self[last_pos..]);
            }

            lines
        }

        // Alternating runs of newline and non-newline characters.
        fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
            let mut rv = vec![];
            let mut iter = self.char_indices().peekable();

            while let Some((start, mut end, c)) = iter.next() {
                let is_newline = c == '\r' || c == '\n';
                // Extend the run while the next char has the same class.
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if (next_char == '\r' || next_char == '\n') != is_newline {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        // Alternating runs of whitespace and non-whitespace characters.
        fn tokenize_words(&self) -> Vec<&Self> {
            let mut iter = self.char_indices().peekable();
            let mut rv = vec![];

            while let Some((start, mut end, c)) = iter.next() {
                let is_whitespace = c.is_whitespace();
                while let Some(&(_, new_end, next_char)) = iter.peek() {
                    if next_char.is_whitespace() != is_whitespace {
                        break;
                    }
                    iter.next();
                    end = new_end;
                }
                rv.push(&self[start..end]);
            }

            rv
        }

        // Unicode word-bound segmentation, delegated to bstr.
        #[cfg(feature = "unicode")]
        fn tokenize_unicode_words(&self) -> Vec<&Self> {
            self.words_with_breaks().map(|x| x.as_bytes()).collect()
        }

        // Extended grapheme clusters, delegated to bstr.
        #[cfg(feature = "unicode")]
        fn tokenize_graphemes(&self) -> Vec<&Self> {
            self.graphemes().map(|x| x.as_bytes()).collect()
        }

        fn as_str(&self) -> Option<&str> {
            // `Some` only when the bytes are valid UTF-8.
            std::str::from_utf8(self).ok()
        }

        fn to_string_lossy(&self) -> Cow<'_, str> {
            // Invalid sequences are replaced with U+FFFD.
            String::from_utf8_lossy(self)
        }

        fn ends_with_newline(&self) -> bool {
            matches!(self.last_byte(), Some(b'\r') | Some(b'\n'))
        }

        fn len(&self) -> usize {
            <[u8]>::len(self)
        }

        fn slice(&self, rng: Range<usize>) -> &Self {
            &self[rng]
        }

        fn as_bytes(&self) -> &[u8] {
            // Identity: a byte slice is already its own byte view.
            self
        }
    }
}
337
#[test]
fn test_split_lines() {
    // Every newline flavor terminates a line and stays attached to it.
    let split = DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast");
    assert_eq!(split, vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]);
    // Newline-only input yields one token per terminator.
    assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]);
    assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]);
    // The empty string has no lines at all.
    assert!(DiffableStr::tokenize_lines("").is_empty());
}
348
#[test]
fn test_split_words() {
    // Whitespace runs are kept as tokens of their own between words.
    let tokens = DiffableStr::tokenize_words("foo bar baz\n\n aha");
    assert_eq!(tokens, ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"]);
}
356
#[test]
fn test_split_chars() {
    // Splits at scalar-value boundaries, so the variation selector
    // U+FE0F becomes its own token.
    let tokens = DiffableStr::tokenize_chars("abcfö❄️");
    assert_eq!(tokens, vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]);
}
364
#[test]
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
    // "❄" plus U+FE0F form a single extended grapheme cluster.
    let clusters = DiffableStr::tokenize_graphemes("abcfö❄️");
    assert_eq!(clusters, vec!["a", "b", "c", "f", "ö", "❄️"]);
}
373
#[test]
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
    // Same line semantics as the `str` version, expressed over bytes.
    let lines = DiffableStr::tokenize_lines(&b"first\nsecond\rthird\r\nfourth\nlast"[..]);
    let expected: Vec<&[u8]> = vec![
        &b"first\n"[..],
        &b"second\r"[..],
        &b"third\r\n"[..],
        &b"fourth\n"[..],
        &b"last"[..],
    ];
    assert_eq!(lines, expected);
    // Newline-only and empty inputs.
    assert_eq!(
        DiffableStr::tokenize_lines(&b"\n\n"[..]),
        vec![&b"\n"[..], &b"\n"[..]]
    );
    assert_eq!(DiffableStr::tokenize_lines(&b"\n"[..]), vec![&b"\n"[..]]);
    assert!(DiffableStr::tokenize_lines(&b""[..]).is_empty());
}
397
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
    // Whitespace runs are separate tokens, just as for `str`.
    let tokens = DiffableStr::tokenize_words(&b"foo bar baz\n\n aha"[..]);
    let expected: Vec<&[u8]> = vec![
        &b"foo"[..],
        &b" "[..],
        &b"bar"[..],
        &b" "[..],
        &b"baz"[..],
        &b"\n\n "[..],
        &b"aha"[..],
    ];
    assert_eq!(tokens, expected);
}
414
#[test]
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
    // Byte tokenization splits at decoded character boundaries, so the
    // variation selector U+FE0F comes out as its own token.
    let tokens = DiffableStr::tokenize_chars("abcfö❄️".as_bytes());
    let expected: Vec<&[u8]> = vec![
        &b"a"[..],
        &b"b"[..],
        &b"c"[..],
        &b"f"[..],
        "ö".as_bytes(),
        "❄".as_bytes(),
        "\u{fe0f}".as_bytes(),
    ];
    assert_eq!(tokens, expected);
}
431
#[test]
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
    // The snowflake plus U+FE0F stay together as one grapheme cluster.
    let clusters = DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes());
    let expected: Vec<&[u8]> = vec![
        &b"a"[..],
        &b"b"[..],
        &b"c"[..],
        &b"f"[..],
        "ö".as_bytes(),
        "❄️".as_bytes(),
    ];
    assert_eq!(clusters, expected);
}