similar/algorithms/myers.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444
//! Myers' diff algorithm.
//!
//! * time: `O((N+M)D)`
//! * space `O(N+M)`
//!
//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
//! describing it.
//!
//! The implementation of this algorithm is based on the implementation by
//! Brandon Williams.
//!
//! # Heuristics
//!
//! At present this implementation of Myers' does not implement any more advanced
//! heuristics that would solve some pathological cases. For instance passing two
//! large and completely distinct sequences to the algorithm will make it spin
//! without making reasonable progress. Currently the only protection in the
//! library against this is to pass a deadline to the diffing algorithm.
//!
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
use std::ops::{Index, IndexMut, Range};
use std::time::Instant;
use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range};
use crate::algorithms::DiffHook;
/// Myers' diff algorithm.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
pub fn diff<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
diff_deadline(d, old, old_range, new, new_range, None)
}
/// Myers' diff algorithm with deadline.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff_deadline<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
let max_d = max_d(old_range.len(), new_range.len());
let mut vb = V::new(max_d);
let mut vf = V::new(max_d);
conquer(
d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
)?;
d.finish()
}
// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
// and then a possibly empty sequence of diagonal edges called a snake.
/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
/// `y` can be computed from `x - k`. In other words, `V` is an array of integers
/// where `V[k]` contains the row index of the endpoint of the furthest reaching
/// path in diagonal `k`.
///
/// We can't use a traditional Vec to represent `V` since we use `k` as an index
/// and it can take on negative values. So instead `V` is represented as a
/// light-weight wrapper around a Vec plus an `offset` which is the maximum value
/// `k` can take on in order to map negative `k`'s back to a value >= 0.
#[derive(Debug)]
struct V {
offset: isize,
v: Vec<usize>, // Look into initializing this to -1 and storing isize
}
impl V {
fn new(max_d: usize) -> Self {
Self {
offset: max_d as isize,
v: vec![0; 2 * max_d],
}
}
fn len(&self) -> usize {
self.v.len()
}
}
impl Index<isize> for V {
type Output = usize;
fn index(&self, index: isize) -> &Self::Output {
&self.v[(index + self.offset) as usize]
}
}
impl IndexMut<isize> for V {
fn index_mut(&mut self, index: isize) -> &mut Self::Output {
&mut self.v[(index + self.offset) as usize]
}
}
fn max_d(len1: usize, len2: usize) -> usize {
// XXX look into reducing the need to have the additional '+ 1'
(len1 + len2 + 1) / 2 + 1
}
#[inline(always)]
fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
(range.start..at, at..range.end)
}
/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
/// a snake has a start end end point (and it is possible for a snake to have
/// a length of zero, meaning the start and end points are the same) however
/// we do not need the end point which is why it's not implemented here.
///
/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
/// some of which may be empty. The divide step requires finding the ceil(D/2) +
/// 1 or middle snake of an optimal D-path. The idea for doing so is to
/// simultaneously run the basic algorithm in both the forward and reverse
/// directions until furthest reaching forward and reverse paths starting at
/// opposing corners 'overlap'.
fn find_middle_snake<Old, New>(
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
vf: &mut V,
vb: &mut V,
deadline: Option<Instant>,
) -> Option<(usize, usize)>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
New::Output: PartialEq<Old::Output>,
{
let n = old_range.len();
let m = new_range.len();
// By Lemma 1 in the paper, the optimal edit script length is odd or even as
// `delta` is odd or even.
let delta = n as isize - m as isize;
let odd = delta & 1 == 1;
// The initial point at (0, -1)
vf[1] = 0;
// The initial point at (N, M+1)
vb[1] = 0;
// We only need to explore ceil(D/2) + 1
let d_max = max_d(n, m);
assert!(vf.len() >= d_max);
assert!(vb.len() >= d_max);
for d in 0..d_max as isize {
// are we running for too long?
if let Some(deadline) = deadline {
if Instant::now() > deadline {
break;
}
}
// Forward path
for k in (-d..=d).rev().step_by(2) {
let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
vf[k + 1]
} else {
vf[k - 1] + 1
};
let y = (x as isize - k) as usize;
// The coordinate of the start of a snake
let (x0, y0) = (x, y);
// While these sequences are identical, keep moving through the
// graph with no cost
if x < old_range.len() && y < new_range.len() {
let advance = common_prefix_len(
old,
old_range.start + x..old_range.end,
new,
new_range.start + y..new_range.end,
);
x += advance;
}
// This is the new best x value
vf[k] = x;
// Only check for connections from the forward search when N - M is
// odd and when there is a reciprocal k line coming from the other
// direction.
if odd && (k - delta).abs() <= (d - 1) {
// TODO optimize this so we don't have to compare against n
if vf[k] + vb[-(k - delta)] >= n {
// Return the snake
return Some((x0 + old_range.start, y0 + new_range.start));
}
}
}
// Backward path
for k in (-d..=d).rev().step_by(2) {
let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
vb[k + 1]
} else {
vb[k - 1] + 1
};
let mut y = (x as isize - k) as usize;
// The coordinate of the start of a snake
if x < n && y < m {
let advance = common_suffix_len(
old,
old_range.start..old_range.start + n - x,
new,
new_range.start..new_range.start + m - y,
);
x += advance;
y += advance;
}
// This is the new best x value
vb[k] = x;
if !odd && (k - delta).abs() <= d {
// TODO optimize this so we don't have to compare against n
if vb[k] + vf[-(k - delta)] >= n {
// Return the snake
return Some((n - x + old_range.start, m - y + new_range.start));
}
}
}
// TODO: Maybe there's an opportunity to optimize and bail early?
}
// deadline reached
None
}
#[allow(clippy::too_many_arguments)]
fn conquer<Old, New, D>(
d: &mut D,
old: &Old,
mut old_range: Range<usize>,
new: &New,
mut new_range: Range<usize>,
vf: &mut V,
vb: &mut V,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
// Check for common prefix
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
if common_prefix_len > 0 {
d.equal(old_range.start, new_range.start, common_prefix_len)?;
}
old_range.start += common_prefix_len;
new_range.start += common_prefix_len;
// Check for common suffix
let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
let common_suffix = (
old_range.end - common_suffix_len,
new_range.end - common_suffix_len,
);
old_range.end -= common_suffix_len;
new_range.end -= common_suffix_len;
if is_empty_range(&old_range) && is_empty_range(&new_range) {
// Do nothing
} else if is_empty_range(&new_range) {
d.delete(old_range.start, old_range.len(), new_range.start)?;
} else if is_empty_range(&old_range) {
d.insert(old_range.start, new_range.start, new_range.len())?;
} else if let Some((x_start, y_start)) = find_middle_snake(
old,
old_range.clone(),
new,
new_range.clone(),
vf,
vb,
deadline,
) {
let (old_a, old_b) = split_at(old_range, x_start);
let (new_a, new_b) = split_at(new_range, y_start);
conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
} else {
d.delete(
old_range.start,
old_range.end - old_range.start,
new_range.start,
)?;
d.insert(
old_range.start,
new_range.start,
new_range.end - new_range.start,
)?;
}
if common_suffix_len > 0 {
d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
}
Ok(())
}
#[test]
fn test_find_middle_snake() {
let a = &b"ABCABBA"[..];
let b = &b"CBABAC"[..];
let max_d = max_d(a.len(), b.len());
let mut vf = V::new(max_d);
let mut vb = V::new(max_d);
let (x_start, y_start) =
find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap();
assert_eq!(x_start, 4);
assert_eq!(y_start, 1);
}
#[test]
fn test_diff() {
let a: &[usize] = &[0, 1, 2, 3, 4];
let b: &[usize] = &[0, 1, 2, 9, 4];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_contiguous() {
let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_pat() {
let a: &[usize] = &[0, 1, 3, 4, 5];
let b: &[usize] = &[0, 1, 4, 5, 8, 9];
let mut d = crate::algorithms::Capture::new();
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.ops());
}
#[test]
fn test_deadline_reached() {
use std::ops::Index;
use std::time::Duration;
let a = (0..100).collect::<Vec<_>>();
let mut b = (0..100).collect::<Vec<_>>();
b[10] = 99;
b[50] = 99;
b[25] = 99;
struct SlowIndex<'a>(&'a [usize]);
impl<'a> Index<usize> for SlowIndex<'a> {
type Output = usize;
fn index(&self, index: usize) -> &Self::Output {
std::thread::sleep(Duration::from_millis(1));
&self.0[index]
}
}
let slow_a = SlowIndex(&a);
let slow_b = SlowIndex(&b);
// don't give it enough time to do anything interesting
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_deadline(
&mut d,
&slow_a,
0..a.len(),
&slow_b,
0..b.len(),
Some(Instant::now() + Duration::from_millis(50)),
)
.unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_finish_called() {
struct HasRunFinish(bool);
impl DiffHook for HasRunFinish {
type Error = ();
fn finish(&mut self) -> Result<(), Self::Error> {
self.0 = true;
Ok(())
}
}
let mut d = HasRunFinish(false);
let slice = &[1, 2];
let slice2 = &[1, 2, 3];
diff(&mut d, slice, 0..slice.len(), slice2, 0..slice2.len()).unwrap();
assert!(d.0);
let mut d = HasRunFinish(false);
let slice = &[1, 2];
diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
assert!(d.0);
let mut d = HasRunFinish(false);
let slice: &[u8] = &[];
diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
assert!(d.0);
}