similar/algorithms/
myers.rs

1//! Myers' diff algorithm.
2//!
3//! * time: `O((N+M)D)`
4//! * space `O(N+M)`
5//!
6//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
7//! describing it.
8//!
9//! The implementation of this algorithm is based on the implementation by
10//! Brandon Williams.
11//!
12//! # Heuristics
13//!
14//! At present this implementation of Myers' does not implement any more advanced
15//! heuristics that would solve some pathological cases.  For instance passing two
16//! large and completely distinct sequences to the algorithm will make it spin
17//! without making reasonable progress.  Currently the only protection in the
18//! library against this is to pass a deadline to the diffing algorithm.
19//!
20//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
21
22use std::ops::{Index, IndexMut, Range};
23
24use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range};
25use crate::algorithms::DiffHook;
26use crate::deadline_support::{deadline_exceeded, Instant};
27
28/// Myers' diff algorithm.
29///
30/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
31pub fn diff<Old, New, D>(
32    d: &mut D,
33    old: &Old,
34    old_range: Range<usize>,
35    new: &New,
36    new_range: Range<usize>,
37) -> Result<(), D::Error>
38where
39    Old: Index<usize> + ?Sized,
40    New: Index<usize> + ?Sized,
41    D: DiffHook,
42    New::Output: PartialEq<Old::Output>,
43{
44    diff_deadline(d, old, old_range, new, new_range, None)
45}
46
47/// Myers' diff algorithm with deadline.
48///
49/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
50///
51/// This diff is done with an optional deadline that defines the maximal
52/// execution time permitted before it bails and falls back to an approximation.
53pub fn diff_deadline<Old, New, D>(
54    d: &mut D,
55    old: &Old,
56    old_range: Range<usize>,
57    new: &New,
58    new_range: Range<usize>,
59    deadline: Option<Instant>,
60) -> Result<(), D::Error>
61where
62    Old: Index<usize> + ?Sized,
63    New: Index<usize> + ?Sized,
64    D: DiffHook,
65    New::Output: PartialEq<Old::Output>,
66{
67    let max_d = max_d(old_range.len(), new_range.len());
68    let mut vb = V::new(max_d);
69    let mut vf = V::new(max_d);
70    conquer(
71        d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
72    )?;
73    d.finish()
74}
75
76// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
77// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
78// and then a possibly empty sequence of diagonal edges called a snake.
79
80/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
81/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
82/// `y` can be computed from `x - k`. In other words, `V` is an array of integers
83/// where `V[k]` contains the row index of the endpoint of the furthest reaching
84/// path in diagonal `k`.
85///
86/// We can't use a traditional Vec to represent `V` since we use `k` as an index
87/// and it can take on negative values. So instead `V` is represented as a
88/// light-weight wrapper around a Vec plus an `offset` which is the maximum value
89/// `k` can take on in order to map negative `k`'s back to a value >= 0.
90#[derive(Debug)]
91struct V {
92    offset: isize,
93    v: Vec<usize>, // Look into initializing this to -1 and storing isize
94}
95
96impl V {
97    fn new(max_d: usize) -> Self {
98        Self {
99            offset: max_d as isize,
100            v: vec![0; 2 * max_d],
101        }
102    }
103
104    fn len(&self) -> usize {
105        self.v.len()
106    }
107}
108
109impl Index<isize> for V {
110    type Output = usize;
111
112    fn index(&self, index: isize) -> &Self::Output {
113        &self.v[(index + self.offset) as usize]
114    }
115}
116
117impl IndexMut<isize> for V {
118    fn index_mut(&mut self, index: isize) -> &mut Self::Output {
119        &mut self.v[(index + self.offset) as usize]
120    }
121}
122
123fn max_d(len1: usize, len2: usize) -> usize {
124    // XXX look into reducing the need to have the additional '+ 1'
125    (len1 + len2 + 1) / 2 + 1
126}
127
128#[inline(always)]
129fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
130    (range.start..at, at..range.end)
131}
132
133/// A `Snake` is a sequence of diagonal edges in the edit graph.  Normally
134/// a snake has a start end end point (and it is possible for a snake to have
135/// a length of zero, meaning the start and end points are the same) however
136/// we do not need the end point which is why it's not implemented here.
137///
138/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
139/// some of which may be empty. The divide step requires finding the ceil(D/2) +
140/// 1 or middle snake of an optimal D-path. The idea for doing so is to
141/// simultaneously run the basic algorithm in both the forward and reverse
142/// directions until furthest reaching forward and reverse paths starting at
143/// opposing corners 'overlap'.
144fn find_middle_snake<Old, New>(
145    old: &Old,
146    old_range: Range<usize>,
147    new: &New,
148    new_range: Range<usize>,
149    vf: &mut V,
150    vb: &mut V,
151    deadline: Option<Instant>,
152) -> Option<(usize, usize)>
153where
154    Old: Index<usize> + ?Sized,
155    New: Index<usize> + ?Sized,
156    New::Output: PartialEq<Old::Output>,
157{
158    let n = old_range.len();
159    let m = new_range.len();
160
161    // By Lemma 1 in the paper, the optimal edit script length is odd or even as
162    // `delta` is odd or even.
163    let delta = n as isize - m as isize;
164    let odd = delta & 1 == 1;
165
166    // The initial point at (0, -1)
167    vf[1] = 0;
168    // The initial point at (N, M+1)
169    vb[1] = 0;
170
171    // We only need to explore ceil(D/2) + 1
172    let d_max = max_d(n, m);
173    assert!(vf.len() >= d_max);
174    assert!(vb.len() >= d_max);
175
176    for d in 0..d_max as isize {
177        // are we running for too long?
178        if deadline_exceeded(deadline) {
179            break;
180        }
181
182        // Forward path
183        for k in (-d..=d).rev().step_by(2) {
184            let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
185                vf[k + 1]
186            } else {
187                vf[k - 1] + 1
188            };
189            let y = (x as isize - k) as usize;
190
191            // The coordinate of the start of a snake
192            let (x0, y0) = (x, y);
193            //  While these sequences are identical, keep moving through the
194            //  graph with no cost
195            if x < old_range.len() && y < new_range.len() {
196                let advance = common_prefix_len(
197                    old,
198                    old_range.start + x..old_range.end,
199                    new,
200                    new_range.start + y..new_range.end,
201                );
202                x += advance;
203            }
204
205            // This is the new best x value
206            vf[k] = x;
207
208            // Only check for connections from the forward search when N - M is
209            // odd and when there is a reciprocal k line coming from the other
210            // direction.
211            if odd && (k - delta).abs() <= (d - 1) {
212                // TODO optimize this so we don't have to compare against n
213                if vf[k] + vb[-(k - delta)] >= n {
214                    // Return the snake
215                    return Some((x0 + old_range.start, y0 + new_range.start));
216                }
217            }
218        }
219
220        // Backward path
221        for k in (-d..=d).rev().step_by(2) {
222            let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
223                vb[k + 1]
224            } else {
225                vb[k - 1] + 1
226            };
227            let mut y = (x as isize - k) as usize;
228
229            // The coordinate of the start of a snake
230            if x < n && y < m {
231                let advance = common_suffix_len(
232                    old,
233                    old_range.start..old_range.start + n - x,
234                    new,
235                    new_range.start..new_range.start + m - y,
236                );
237                x += advance;
238                y += advance;
239            }
240
241            // This is the new best x value
242            vb[k] = x;
243
244            if !odd && (k - delta).abs() <= d {
245                // TODO optimize this so we don't have to compare against n
246                if vb[k] + vf[-(k - delta)] >= n {
247                    // Return the snake
248                    return Some((n - x + old_range.start, m - y + new_range.start));
249                }
250            }
251        }
252
253        // TODO: Maybe there's an opportunity to optimize and bail early?
254    }
255
256    // deadline reached
257    None
258}
259
260#[allow(clippy::too_many_arguments)]
261fn conquer<Old, New, D>(
262    d: &mut D,
263    old: &Old,
264    mut old_range: Range<usize>,
265    new: &New,
266    mut new_range: Range<usize>,
267    vf: &mut V,
268    vb: &mut V,
269    deadline: Option<Instant>,
270) -> Result<(), D::Error>
271where
272    Old: Index<usize> + ?Sized,
273    New: Index<usize> + ?Sized,
274    D: DiffHook,
275    New::Output: PartialEq<Old::Output>,
276{
277    // Check for common prefix
278    let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
279    if common_prefix_len > 0 {
280        d.equal(old_range.start, new_range.start, common_prefix_len)?;
281    }
282    old_range.start += common_prefix_len;
283    new_range.start += common_prefix_len;
284
285    // Check for common suffix
286    let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
287    let common_suffix = (
288        old_range.end - common_suffix_len,
289        new_range.end - common_suffix_len,
290    );
291    old_range.end -= common_suffix_len;
292    new_range.end -= common_suffix_len;
293
294    if is_empty_range(&old_range) && is_empty_range(&new_range) {
295        // Do nothing
296    } else if is_empty_range(&new_range) {
297        d.delete(old_range.start, old_range.len(), new_range.start)?;
298    } else if is_empty_range(&old_range) {
299        d.insert(old_range.start, new_range.start, new_range.len())?;
300    } else if let Some((x_start, y_start)) = find_middle_snake(
301        old,
302        old_range.clone(),
303        new,
304        new_range.clone(),
305        vf,
306        vb,
307        deadline,
308    ) {
309        let (old_a, old_b) = split_at(old_range, x_start);
310        let (new_a, new_b) = split_at(new_range, y_start);
311        conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
312        conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
313    } else {
314        d.delete(
315            old_range.start,
316            old_range.end - old_range.start,
317            new_range.start,
318        )?;
319        d.insert(
320            old_range.start,
321            new_range.start,
322            new_range.end - new_range.start,
323        )?;
324    }
325
326    if common_suffix_len > 0 {
327        d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
328    }
329
330    Ok(())
331}
332
333#[test]
334fn test_find_middle_snake() {
335    let a = &b"ABCABBA"[..];
336    let b = &b"CBABAC"[..];
337    let max_d = max_d(a.len(), b.len());
338    let mut vf = V::new(max_d);
339    let mut vb = V::new(max_d);
340    let (x_start, y_start) =
341        find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap();
342    assert_eq!(x_start, 4);
343    assert_eq!(y_start, 1);
344}
345
346#[test]
347fn test_diff() {
348    let a: &[usize] = &[0, 1, 2, 3, 4];
349    let b: &[usize] = &[0, 1, 2, 9, 4];
350
351    let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
352    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
353    insta::assert_debug_snapshot!(d.into_inner().ops());
354}
355
356#[test]
357fn test_contiguous() {
358    let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
359    let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
360
361    let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
362    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
363    insta::assert_debug_snapshot!(d.into_inner().ops());
364}
365
366#[test]
367fn test_pat() {
368    let a: &[usize] = &[0, 1, 3, 4, 5];
369    let b: &[usize] = &[0, 1, 4, 5, 8, 9];
370
371    let mut d = crate::algorithms::Capture::new();
372    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
373    insta::assert_debug_snapshot!(d.ops());
374}
375
376#[test]
377fn test_deadline_reached() {
378    use std::ops::Index;
379    use std::time::Duration;
380
381    let a = (0..100).collect::<Vec<_>>();
382    let mut b = (0..100).collect::<Vec<_>>();
383    b[10] = 99;
384    b[50] = 99;
385    b[25] = 99;
386
387    struct SlowIndex<'a>(&'a [usize]);
388
389    impl Index<usize> for SlowIndex<'_> {
390        type Output = usize;
391
392        fn index(&self, index: usize) -> &Self::Output {
393            std::thread::sleep(Duration::from_millis(1));
394            &self.0[index]
395        }
396    }
397
398    let slow_a = SlowIndex(&a);
399    let slow_b = SlowIndex(&b);
400
401    // don't give it enough time to do anything interesting
402    let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
403    diff_deadline(
404        &mut d,
405        &slow_a,
406        0..a.len(),
407        &slow_b,
408        0..b.len(),
409        Some(Instant::now() + Duration::from_millis(50)),
410    )
411    .unwrap();
412    insta::assert_debug_snapshot!(d.into_inner().ops());
413}
414
415#[test]
416fn test_finish_called() {
417    struct HasRunFinish(bool);
418
419    impl DiffHook for HasRunFinish {
420        type Error = ();
421        fn finish(&mut self) -> Result<(), Self::Error> {
422            self.0 = true;
423            Ok(())
424        }
425    }
426
427    let mut d = HasRunFinish(false);
428    let slice = &[1, 2];
429    let slice2 = &[1, 2, 3];
430    diff(&mut d, slice, 0..slice.len(), slice2, 0..slice2.len()).unwrap();
431    assert!(d.0);
432
433    let mut d = HasRunFinish(false);
434    let slice = &[1, 2];
435    diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
436    assert!(d.0);
437
438    let mut d = HasRunFinish(false);
439    let slice: &[u8] = &[];
440    diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
441    assert!(d.0);
442}