similar/algorithms/
utils.rs

1use std::collections::hash_map::Entry;
2use std::collections::HashMap;
3use std::fmt::Debug;
4use std::hash::{Hash, Hasher};
5use std::ops::{Add, Index, Range};
6
7/// Utility function to check if a range is empty that works on older rust versions
8#[inline(always)]
9#[allow(clippy::neg_cmp_op_on_partial_ord)]
10pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool {
11    !(range.start < range.end)
12}
13
14/// Represents an item in the vector returned by [`unique`].
15///
16/// It compares like the underlying item does it was created from but
17/// carries the index it was originally created from.
18pub struct UniqueItem<'a, Idx: ?Sized> {
19    lookup: &'a Idx,
20    index: usize,
21}
22
23impl<Idx: ?Sized> UniqueItem<'_, Idx>
24where
25    Idx: Index<usize>,
26{
27    /// Returns the value.
28    #[inline(always)]
29    pub fn value(&self) -> &Idx::Output {
30        &self.lookup[self.index]
31    }
32
33    /// Returns the original index.
34    #[inline(always)]
35    pub fn original_index(&self) -> usize {
36        self.index
37    }
38}
39
40impl<'a, Idx: Index<usize> + 'a> Debug for UniqueItem<'a, Idx>
41where
42    Idx::Output: Debug,
43{
44    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
45        f.debug_struct("UniqueItem")
46            .field("value", &self.value())
47            .field("original_index", &self.original_index())
48            .finish()
49    }
50}
51
52impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B>
53where
54    A: Index<usize> + 'b + ?Sized,
55    B: Index<usize> + 'b + ?Sized,
56    B::Output: PartialEq<A::Output>,
57{
58    #[inline(always)]
59    fn eq(&self, other: &UniqueItem<'a, A>) -> bool {
60        self.value() == other.value()
61    }
62}
63
64/// Returns only unique items in the sequence as vector.
65///
66/// Each item is wrapped in a [`UniqueItem`] so that both the value and the
67/// index can be extracted.
68pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>>
69where
70    Idx: Index<usize> + ?Sized,
71    Idx::Output: Hash + Eq,
72{
73    let mut by_item = HashMap::new();
74    for index in range {
75        match by_item.entry(&lookup[index]) {
76            Entry::Vacant(entry) => {
77                entry.insert(Some(index));
78            }
79            Entry::Occupied(mut entry) => {
80                let entry = entry.get_mut();
81                if entry.is_some() {
82                    *entry = None
83                }
84            }
85        }
86    }
87    let mut rv = by_item
88        .into_iter()
89        .filter_map(|(_, x)| x)
90        .map(|index| UniqueItem { lookup, index })
91        .collect::<Vec<_>>();
92    rv.sort_by_key(|a| a.original_index());
93    rv
94}
95
96/// Given two lookups and ranges calculates the length of the common prefix.
97pub fn common_prefix_len<Old, New>(
98    old: &Old,
99    old_range: Range<usize>,
100    new: &New,
101    new_range: Range<usize>,
102) -> usize
103where
104    Old: Index<usize> + ?Sized,
105    New: Index<usize> + ?Sized,
106    New::Output: PartialEq<Old::Output>,
107{
108    if is_empty_range(&old_range) || is_empty_range(&new_range) {
109        return 0;
110    }
111    new_range
112        .zip(old_range)
113        .take_while(
114            #[inline(always)]
115            |x| new[x.0] == old[x.1],
116        )
117        .count()
118}
119
120/// Given two lookups and ranges calculates the length of common suffix.
121pub fn common_suffix_len<Old, New>(
122    old: &Old,
123    old_range: Range<usize>,
124    new: &New,
125    new_range: Range<usize>,
126) -> usize
127where
128    Old: Index<usize> + ?Sized,
129    New: Index<usize> + ?Sized,
130    New::Output: PartialEq<Old::Output>,
131{
132    if is_empty_range(&old_range) || is_empty_range(&new_range) {
133        return 0;
134    }
135    new_range
136        .rev()
137        .zip(old_range.rev())
138        .take_while(
139            #[inline(always)]
140            |x| new[x.0] == old[x.1],
141        )
142        .count()
143}
144
145struct OffsetLookup<Int> {
146    offset: usize,
147    vec: Vec<Int>,
148}
149
150impl<Int> Index<usize> for OffsetLookup<Int> {
151    type Output = Int;
152
153    #[inline(always)]
154    fn index(&self, index: usize) -> &Self::Output {
155        &self.vec[index - self.offset]
156    }
157}
158
159/// A utility struct to convert distinct items to unique integers.
160///
161/// This can be helpful on larger inputs to speed up the comparisons
162/// performed by doing a first pass where the data set gets reduced
163/// to (small) integers.
164///
165/// The idea is that instead of passing two sequences to a diffling algorithm
166/// you first pass it via [`IdentifyDistinct`]:
167///
168/// ```rust
169/// use similar::capture_diff;
170/// use similar::algorithms::{Algorithm, IdentifyDistinct};
171///
172/// let old = &["foo", "bar", "baz"][..];
173/// let new = &["foo", "blah", "baz"][..];
174/// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
175/// let ops = capture_diff(
176///     Algorithm::Myers,
177///     h.old_lookup(),
178///     h.old_range(),
179///     h.new_lookup(),
180///     h.new_range(),
181/// );
182/// ```
183///
184/// The indexes are the same as with the passed source ranges.
185pub struct IdentifyDistinct<Int> {
186    old: OffsetLookup<Int>,
187    new: OffsetLookup<Int>,
188}
189
190impl<Int> IdentifyDistinct<Int>
191where
192    Int: Add<Output = Int> + From<u8> + Default + Copy,
193{
194    /// Creates an int hasher for two sequences.
195    pub fn new<Old, New>(
196        old: &Old,
197        old_range: Range<usize>,
198        new: &New,
199        new_range: Range<usize>,
200    ) -> Self
201    where
202        Old: Index<usize> + ?Sized,
203        Old::Output: Eq + Hash,
204        New: Index<usize> + ?Sized,
205        New::Output: Eq + Hash + PartialEq<Old::Output>,
206    {
207        enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
208            Old(&'old Old),
209            New(&'new New),
210        }
211
212        impl<Old, New> Hash for Key<'_, '_, Old, New>
213        where
214            Old: Hash + ?Sized,
215            New: Hash + ?Sized,
216        {
217            fn hash<H: Hasher>(&self, state: &mut H) {
218                match *self {
219                    Key::Old(val) => val.hash(state),
220                    Key::New(val) => val.hash(state),
221                }
222            }
223        }
224
225        impl<Old, New> PartialEq for Key<'_, '_, Old, New>
226        where
227            Old: Eq + ?Sized,
228            New: Eq + PartialEq<Old> + ?Sized,
229        {
230            #[inline(always)]
231            fn eq(&self, other: &Self) -> bool {
232                match (self, other) {
233                    (Key::Old(a), Key::Old(b)) => a == b,
234                    (Key::New(a), Key::New(b)) => a == b,
235                    (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
236                }
237            }
238        }
239
240        impl<Old, New> Eq for Key<'_, '_, Old, New>
241        where
242            Old: Eq + ?Sized,
243            New: Eq + PartialEq<Old> + ?Sized,
244        {
245        }
246
247        let mut map = HashMap::new();
248        let mut old_seq = Vec::new();
249        let mut new_seq = Vec::new();
250        let mut next_id = Int::default();
251        let step = Int::from(1);
252        let old_start = old_range.start;
253        let new_start = new_range.start;
254
255        for idx in old_range {
256            let item = Key::Old(&old[idx]);
257            let id = match map.entry(item) {
258                Entry::Occupied(o) => *o.get(),
259                Entry::Vacant(v) => {
260                    let id = next_id;
261                    next_id = next_id + step;
262                    *v.insert(id)
263                }
264            };
265            old_seq.push(id);
266        }
267
268        for idx in new_range {
269            let item = Key::New(&new[idx]);
270            let id = match map.entry(item) {
271                Entry::Occupied(o) => *o.get(),
272                Entry::Vacant(v) => {
273                    let id = next_id;
274                    next_id = next_id + step;
275                    *v.insert(id)
276                }
277            };
278            new_seq.push(id);
279        }
280
281        IdentifyDistinct {
282            old: OffsetLookup {
283                offset: old_start,
284                vec: old_seq,
285            },
286            new: OffsetLookup {
287                offset: new_start,
288                vec: new_seq,
289            },
290        }
291    }
292
293    /// Returns a lookup for the old side.
294    pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
295        &self.old
296    }
297
298    /// Returns a lookup for the new side.
299    pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
300        &self.new
301    }
302
303    /// Convenience method to get back the old range.
304    pub fn old_range(&self) -> Range<usize> {
305        self.old.offset..self.old.offset + self.old.vec.len()
306    }
307
308    /// Convenience method to get back the new range.
309    pub fn new_range(&self) -> Range<usize> {
310        self.new.offset..self.new.offset + self.new.vec.len()
311    }
312}
313
314#[test]
315fn test_unique() {
316    let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
317        .into_iter()
318        .map(|x| (*x.value(), x.original_index()))
319        .collect::<Vec<_>>();
320    assert_eq!(u, vec![('a', 0), ('c', 2)]);
321}
322
323#[test]
324fn test_int_hasher() {
325    let ih = IdentifyDistinct::<u8>::new(
326        &["", "foo", "bar", "baz"][..],
327        1..4,
328        &["", "foo", "blah", "baz"][..],
329        1..4,
330    );
331    assert_eq!(ih.old_lookup()[1], 0);
332    assert_eq!(ih.old_lookup()[2], 1);
333    assert_eq!(ih.old_lookup()[3], 2);
334    assert_eq!(ih.new_lookup()[1], 0);
335    assert_eq!(ih.new_lookup()[2], 3);
336    assert_eq!(ih.new_lookup()[3], 2);
337    assert_eq!(ih.old_range(), 1..4);
338    assert_eq!(ih.new_range(), 1..4);
339}
340
341#[test]
342fn test_common_prefix_len() {
343    assert_eq!(
344        common_prefix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
345        0
346    );
347    assert_eq!(
348        common_prefix_len("foobarbaz".as_bytes(), 0..9, "foobarblah".as_bytes(), 0..10),
349        7
350    );
351    assert_eq!(
352        common_prefix_len("foobarbaz".as_bytes(), 0..9, "blablabla".as_bytes(), 0..9),
353        0
354    );
355    assert_eq!(
356        common_prefix_len("foobarbaz".as_bytes(), 3..9, "foobarblah".as_bytes(), 3..10),
357        4
358    );
359}
360
361#[test]
362fn test_common_suffix_len() {
363    assert_eq!(
364        common_suffix_len("".as_bytes(), 0..0, "".as_bytes(), 0..0),
365        0
366    );
367    assert_eq!(
368        common_suffix_len("1234".as_bytes(), 0..4, "X0001234".as_bytes(), 0..8),
369        4
370    );
371    assert_eq!(
372        common_suffix_len("1234".as_bytes(), 0..4, "Xxxx".as_bytes(), 0..4),
373        0
374    );
375    assert_eq!(
376        common_suffix_len("1234".as_bytes(), 2..4, "01234".as_bytes(), 2..5),
377        2
378    );
379}