// just playing with tangled
1// Copyright 2021 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#![allow(missing_docs)]
16
17use std::cmp::{max, min, Ordering};
18use std::collections::{BTreeMap, HashMap};
19use std::fmt::{Debug, Formatter};
20use std::ops::Range;
21use std::slice;
22
23use itertools::Itertools;
24
/// Returns the byte ranges of the lines in `text`. Each range includes the
/// trailing `\n`, except possibly the last line, which may end at the end of
/// the input without a newline. Empty input yields no ranges.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut ranges = vec![];
    let mut start = 0;
    // `split_inclusive` keeps the `\n` terminator on each chunk and yields
    // nothing for empty input, matching the manual scan it replaces.
    for line in text.split_inclusive(|b| *b == b'\n') {
        let end = start + line.len();
        ranges.push(start..end);
        start = end;
    }
    ranges
}
44
/// Decides whether `b` counts as part of a "word" for the word-level
/// tokenizer.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    match b {
        // ASCII alphanumerics and underscore are word bytes.
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
        // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
        // treated as a single unit.
        0x80..=0xff => true,
        _ => false,
    }
}
54
55pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
56 let mut word_ranges = vec![];
57 let mut word_start_pos = 0;
58 let mut in_word = false;
59 for (i, b) in text.iter().enumerate() {
60 if in_word && !is_word_byte(*b) {
61 in_word = false;
62 word_ranges.push(word_start_pos..i);
63 word_start_pos = i;
64 } else if !in_word && is_word_byte(*b) {
65 in_word = true;
66 word_start_pos = i;
67 }
68 }
69 if in_word && word_start_pos < text.len() {
70 word_ranges.push(word_start_pos..text.len());
71 }
72 word_ranges
73}
74
75pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
76 let mut ranges = vec![];
77 for (i, b) in text.iter().enumerate() {
78 if !is_word_byte(*b) {
79 ranges.push(i..i + 1);
80 }
81 }
82 ranges
83}
84
/// Occurrence histogram of the tokens (words) of one input.
struct Histogram<'a> {
    // Maps each distinct word to the indexes (into the tokenizer's range
    // list) where it occurs, capped at `max_occurrences + 1` entries.
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    // Groups words by their (capped) occurrence count, lowest count first.
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}

impl Histogram<'_> {
    /// Builds the histogram for `text` tokenized into `ranges`.
    fn calculate<'a>(
        text: &'a [u8],
        ranges: &[Range<usize>],
        max_occurrences: usize,
    ) -> Histogram<'a> {
        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
        for (index, range) in ranges.iter().enumerate() {
            let word = &text[range.clone()];
            let positions = word_to_positions.entry(word).or_default();
            // Record one more position than `max_occurrences`, so words that
            // occur too often can be recognized (and skipped) later.
            if positions.len() <= max_occurrences {
                positions.push(index);
            }
        }
        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
        for (word, positions) in &word_to_positions {
            count_to_words.entry(positions.len()).or_default().push(word);
        }
        Histogram {
            word_to_positions,
            count_to_words,
        }
    }
}
115
/// Finds the LCS given an array where the value of `input[i]` indicates that
/// the position of element `i` in the right array is at position `input[i]`
/// in the left array.
///
/// For example (some have multiple valid outputs):
///
/// [0,1,2] => [(0,0),(1,1),(2,2)]
/// [2,1,0] => [(0,2)]
/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)]
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // chain[r] = (length of the longest chain ending at right position r,
    //             left position of element r,
    //             right position of the previous element in that chain,
    //             or usize::MAX if r starts the chain)
    let mut chain = vec![(0, 0, 0); input.len()];
    let mut best_len = 0;
    let mut best_right_end = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut len_here = 1;
        let mut prev_right = usize::MAX;
        for candidate in (0..right_pos).rev() {
            let (cand_len, cand_left_pos, _) = chain[candidate];
            if cand_left_pos < left_pos {
                let extended = cand_len + 1;
                if extended > len_here {
                    len_here = extended;
                    prev_right = candidate;
                    if extended > best_len {
                        best_len = extended;
                        best_right_end = right_pos;
                        // If this is the longest chain globally so far, we cannot find a
                        // longer one by using a previous value, so break early.
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (len_here, left_pos, prev_right);
    }

    // Walk the back-pointers from the end of the globally longest chain.
    let mut result = vec![];
    let mut right_pos = best_right_end;
    loop {
        let (_, left_pos, prev_right) = chain[right_pos];
        result.push((left_pos, right_pos));
        if prev_right == usize::MAX {
            break;
        }
        right_pos = prev_right;
    }
    result.reverse();

    result
}
171
/// Finds unchanged ranges among the ones given as arguments. The data between
/// those ranges is ignored.
///
/// Returns pairs of (byte range in `left`, matching byte range in `right`),
/// each taken verbatim from `left_ranges`/`right_ranges`.
pub(crate) fn unchanged_ranges(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    // Nothing can match if either side has no tokens.
    if left_ranges.is_empty() || right_ranges.is_empty() {
        return vec![];
    }

    // Arbitrary cap on how often a word may occur and still serve as an
    // anchor for matching.
    let max_occurrences = 100;
    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
    let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
    let mut uncommon_shared_words = vec![];
    while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
        // Consume the least common count bucket first; stop as soon as one
        // bucket yields any shared words.
        let left_words = left_histogram
            .count_to_words
            .first_entry()
            .map(|x| x.remove())
            .unwrap();
        for left_word in left_words {
            if right_histogram.word_to_positions.contains_key(left_word) {
                uncommon_shared_words.push(left_word);
            }
        }
    }
    if uncommon_shared_words.is_empty() {
        // No usable anchors; report everything as changed.
        return vec![];
    }

    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
    // the least common words to be "a" and "b". We now assume that each
    // occurrence of each word lines up in the left and right input. We do that
    // by numbering the shared occurrences, effectively instead comparing "a1 b1
    // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
    // right input in order (["a1", "b1", "b2", "a2"]), and record the index of
    // that word in the left input ([0,1,3,2]). We then find the LCS and split
    // points based on that ([0,1,3] or [0,1,2] are both valid).

    // [(index into left_ranges, word, occurrence #)]
    let mut left_positions = vec![];
    let mut right_positions = vec![];
    for uncommon_shared_word in uncommon_shared_words {
        let left_occurrences = left_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        let right_occurrences = right_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        // Pair occurrences by ordinal; surplus occurrences on the longer side
        // are ignored.
        let shared_count = min(left_occurrences.len(), right_occurrences.len());
        for occurrence in 0..shared_count {
            left_positions.push((
                left_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
            right_positions.push((
                right_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
        }
    }
    // Sort by token position so both lists follow input order.
    left_positions.sort();
    right_positions.sort();
    let mut left_position_map = HashMap::new();
    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
        left_position_map.insert((*word, *occurrence), i);
    }
    // For each right-side occurrence, record the index of the same
    // (word, occurrence) on the left side — this is find_lcs()'s input.
    let mut left_index_by_right_index = vec![];
    for (_pos, word, occurrence) in &right_positions {
        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
    }

    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output ranges, recursing into the modified areas between the elements
    // in the LCS.
    let mut result = vec![];
    let mut previous_left_position = 0;
    let mut previous_right_position = 0;
    for (left_index, right_index) in lcs {
        let left_position = left_positions[left_index].0;
        let right_position = right_positions[right_index].0;
        let skipped_left_positions = previous_left_position..left_position;
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            for unchanged_nested_range in unchanged_ranges(
                left,
                right,
                &left_ranges[skipped_left_positions.clone()],
                &right_ranges[skipped_right_positions.clone()],
            ) {
                result.push(unchanged_nested_range);
            }
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into range at end (after common ranges).
    let skipped_left_positions = previous_left_position..left_ranges.len();
    let skipped_right_positions = previous_right_position..right_ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        for unchanged_nested_range in unchanged_ranges(
            left,
            right,
            &left_ranges[skipped_left_positions],
            &right_ranges[skipped_right_positions],
        ) {
            result.push(unchanged_nested_range);
        }
    }

    result
}
302
/// A region that is the same in the base input and every other input.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    // The range in the base input.
    base_range: Range<usize>,
    // One entry per non-base input: the start of the matching region relative
    // to `base_range.start`. Stored as `isize` because the region may start
    // before the base region does.
    offsets: Vec<isize>,
}
308
impl UnchangedRange {
    /// The start of this range in the non-base input with index `side`.
    /// The offset may be negative; casting it to `usize` and using
    /// `wrapping_add` implements signed addition on the unsigned position.
    fn start(&self, side: usize) -> usize {
        self.base_range
            .start
            .wrapping_add(self.offsets[side] as usize)
    }

    /// The end of this range in the non-base input with index `side`.
    /// Same wrapping-arithmetic trick as `start()`.
    fn end(&self, side: usize) -> usize {
        self.base_range
            .end
            .wrapping_add(self.offsets[side] as usize)
    }
}
322
impl PartialOrd for UnchangedRange {
    // Delegates to `Ord` so the two orderings can never disagree.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
328
329impl Ord for UnchangedRange {
330 fn cmp(&self, other: &Self) -> Ordering {
331 self.base_range
332 .start
333 .cmp(&other.base_range.start)
334 .then_with(|| self.base_range.end.cmp(&other.base_range.end))
335 }
336}
337
/// Takes any number of inputs and finds regions that are the same between all
/// of them.
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    // The first input; unchanged regions are expressed as ranges in it.
    base_input: &'input [u8],
    // The remaining inputs, each diffed against `base_input`.
    other_inputs: Vec<&'input [u8]>,
    // The key is a range in the base input. The value is the start of each non-base region
    // relative to the base region's start. By making them relative, they don't need to change
    // when the base range changes.
    unchanged_regions: Vec<UnchangedRange>,
}
349
/// Takes the current regions and intersects it with the new unchanged ranges
/// from a 2-way diff. The result is a map of unchanged regions with one more
/// offset in the map's values.
///
/// NOTE(review): the single forward pass appears to assume both lists are
/// sorted by base position — confirm at call sites.
fn intersect_regions(
    current_ranges: Vec<UnchangedRange>,
    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
) -> Vec<UnchangedRange> {
    let mut result = vec![];
    let mut current_ranges_iter = current_ranges.into_iter().peekable();
    for (new_base_range, other_range) in new_unchanged_ranges.iter() {
        // An unchanged range must be equally long on both sides.
        assert_eq!(new_base_range.len(), other_range.len());
        while let Some(UnchangedRange {
            base_range,
            offsets,
        }) = current_ranges_iter.peek()
        {
            // No need to look further if we're past the new range.
            if base_range.start >= new_base_range.end {
                break;
            }
            // Discard any current unchanged regions that don't match between the base and
            // the new input.
            if base_range.end <= new_base_range.start {
                current_ranges_iter.next();
                continue;
            }
            // Emit the overlap, extended with the new input's offset. The
            // wrapping_sub + cast encodes a possibly negative offset as isize.
            let new_start = max(base_range.start, new_base_range.start);
            let new_end = min(base_range.end, new_base_range.end);
            let mut new_offsets = offsets.clone();
            new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
            result.push(UnchangedRange {
                base_range: new_start..new_end,
                offsets: new_offsets,
            });
            if base_range.end >= new_base_range.end {
                // Break without consuming the item; there may be other new ranges that overlap
                // with it.
                break;
            }
            current_ranges_iter.next();
        }
    }
    result
}
394
impl<'input> Diff<'input> {
    /// Diffs `inputs` after splitting each of them into tokens with
    /// `tokenizer`. The first input is the "base"; every other input is
    /// diffed against it and the resulting unchanged regions are
    /// intersected.
    ///
    /// Panics if `inputs` is empty.
    pub fn for_tokenizer(
        inputs: &[&'input [u8]],
        tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>,
    ) -> Self {
        assert!(!inputs.is_empty());
        let base_input = inputs[0];
        let other_inputs = inputs.iter().skip(1).copied().collect_vec();
        // First tokenize each input
        let base_token_ranges: Vec<Range<usize>> = tokenizer(base_input);
        let other_token_ranges: Vec<Vec<Range<usize>>> = other_inputs
            .iter()
            .map(|other_input| tokenizer(other_input))
            .collect_vec();

        // Look for unchanged regions. Initially consider the whole range of the base
        // input as unchanged (compared to itself). Then diff each other input
        // against the base. Intersect the previously found ranges with the
        // unchanged ranges in the diff.
        let mut unchanged_regions = vec![UnchangedRange {
            base_range: 0..base_input.len(),
            offsets: vec![],
        }];
        for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
            let unchanged_diff_ranges = unchanged_ranges(
                base_input,
                other_inputs[i],
                &base_token_ranges,
                other_token_ranges,
            );
            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
        }
        // Add an empty range at the end to make life easier for hunks().
        // Each offset is the input's length difference vs the base (possibly
        // negative, hence wrapping_sub + cast).
        let offsets = other_inputs
            .iter()
            .map(|input| input.len().wrapping_sub(base_input.len()) as isize)
            .collect_vec();
        unchanged_regions.push(UnchangedRange {
            base_range: base_input.len()..base_input.len(),
            offsets,
        });

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

    /// Diffs without tokenizing, i.e. without looking for any unchanged
    /// regions (only the empty sentinel region at the end is kept).
    pub fn unrefined(inputs: &[&'input [u8]]) -> Self {
        Diff::for_tokenizer(inputs, &|_| vec![])
    }

    // TODO: At least when merging, it's wasteful to refine the diff if e.g. if 2
    // out of 3 inputs match in the differing regions. Perhaps the refine()
    // method should be on the hunk instead (probably returning a new Diff)?
    // That would let each user decide which hunks to refine. However, it would
    // probably mean that many callers repeat the same code. Perhaps it
    // should be possible to refine a whole diff *or* individual hunks.
    /// Diffs at line granularity, then refines the changed regions first at
    /// word granularity and then at single non-word-byte granularity.
    pub fn default_refinement(inputs: &[&'input [u8]]) -> Self {
        let mut diff = Diff::for_tokenizer(inputs, &find_line_ranges);
        diff.refine_changed_regions(&find_word_ranges);
        diff.refine_changed_regions(&find_nonword_ranges);
        diff
    }

    /// Returns an iterator over the matching and differing hunks, in input
    /// order.
    pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
        let previous_offsets = vec![0; self.other_inputs.len()];
        DiffHunkIterator {
            diff: self,
            previous: UnchangedRange {
                base_range: 0..0,
                offsets: previous_offsets,
            },
            // The initial (empty) region counts as already emitted so the
            // iterator starts with the content after it.
            unchanged_emitted: true,
            unchanged_iter: self.unchanged_regions.iter(),
        }
    }

    /// Uses the given tokenizer to split the changed regions into smaller
    /// regions. Then tries to finds unchanged regions among them.
    pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) {
        let mut previous = UnchangedRange {
            base_range: 0..0,
            offsets: vec![0; self.other_inputs.len()],
        };
        let mut new_unchanged_ranges = vec![];
        for current in self.unchanged_regions.iter() {
            // For the changed region between the previous region and the current one,
            // create a new Diff instance. Then adjust the start positions and
            // offsets to be valid in the context of the larger Diff instance
            // (`self`).
            let mut slices =
                vec![&self.base_input[previous.base_range.end..current.base_range.start]];
            for i in 0..current.offsets.len() {
                let changed_range = previous.end(i)..current.start(i);
                slices.push(&self.other_inputs[i][changed_range]);
            }

            let refined_diff = Diff::for_tokenizer(&slices, tokenizer);

            for UnchangedRange {
                base_range,
                offsets,
            } in refined_diff.unchanged_regions
            {
                // Translate the nested diff's coordinates back into `self`'s.
                let new_base_start = base_range.start + previous.base_range.end;
                let new_base_end = base_range.end + previous.base_range.end;
                let offsets = offsets
                    .into_iter()
                    .enumerate()
                    .map(|(i, offset)| offset + previous.offsets[i])
                    .collect_vec();
                new_unchanged_ranges.push(UnchangedRange {
                    base_range: new_base_start..new_base_end,
                    offsets,
                });
            }
            previous = current.clone();
        }
        // `merge` interleaves the two lists using the `Ord` impl; both are
        // sorted by base position, so the result stays sorted.
        self.unchanged_regions = self
            .unchanged_regions
            .iter()
            .cloned()
            .merge(new_unchanged_ranges)
            .collect_vec();
        self.compact_unchanged_regions();
    }

    /// Joins adjacent unchanged regions that are contiguous in the base input
    /// and have identical offsets (i.e. are contiguous in every other input
    /// too).
    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in self.unchanged_regions.iter() {
            if let Some(previous) = maybe_previous {
                if previous.base_range.end == current.base_range.start
                    && previous.offsets == *current.offsets
                {
                    // Extend the pending region instead of emitting it.
                    maybe_previous = Some(UnchangedRange {
                        base_range: previous.base_range.start..current.base_range.end,
                        offsets: current.offsets.clone(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}
550
/// One hunk of a diff: either content all inputs agree on, or one slice per
/// input for a region where they differ.
#[derive(PartialEq, Eq, Clone)]
pub enum DiffHunk<'input> {
    // The same in all inputs; the slice is taken from the base input.
    Matching(&'input [u8]),
    // Differs between inputs; one slice per input, in input order.
    Different(Vec<&'input [u8]>),
}
556
557impl Debug for DiffHunk<'_> {
558 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
559 match self {
560 DiffHunk::Matching(slice) => f
561 .debug_tuple("DiffHunk::Matching")
562 .field(&String::from_utf8_lossy(slice))
563 .finish(),
564 DiffHunk::Different(slices) => f
565 .debug_tuple("DiffHunk::Different")
566 .field(
567 &slices
568 .iter()
569 .map(|slice| String::from_utf8_lossy(slice))
570 .collect_vec(),
571 )
572 .finish(),
573 }
574 }
575}
576
/// Iterator over the hunks of a `Diff`, alternating between `Different` and
/// `Matching` hunks (empty hunks are skipped).
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff Diff<'input>,
    // The most recent unchanged region consumed from `unchanged_iter`.
    previous: UnchangedRange,
    // Whether the `Matching` hunk for `previous` has been yielded yet.
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
583
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
    type Item = DiffHunk<'input>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // First emit the pending `Matching` hunk for the previously
            // consumed unchanged region, if it is non-empty.
            if !self.unchanged_emitted {
                self.unchanged_emitted = true;
                if !self.previous.base_range.is_empty() {
                    return Some(DiffHunk::Matching(
                        &self.diff.base_input[self.previous.base_range.clone()],
                    ));
                }
            }
            if let Some(current) = self.unchanged_iter.next() {
                // Collect the changed slice between the previous unchanged
                // region and the current one, for the base input and for each
                // other input (via the region's per-side start/end).
                let mut slices = vec![
                    &self.diff.base_input[self.previous.base_range.end..current.base_range.start],
                ];
                for (i, input) in self.diff.other_inputs.iter().enumerate() {
                    slices.push(&input[self.previous.end(i)..current.start(i)]);
                }
                self.previous = current.clone();
                self.unchanged_emitted = false;
                // Skip the hunk entirely if every input's slice is empty.
                if slices.iter().any(|slice| !slice.is_empty()) {
                    return Some(DiffHunk::Different(slices));
                }
            } else {
                // No more unchanged regions; the trailing sentinel region
                // added by for_tokenizer() guarantees we've covered the ends.
                break;
            }
        }
        None
    }
}
616
617/// Diffs two slices of bytes. The returned diff hunks may be any length (may
618/// span many lines or may be only part of a line). This currently uses
619/// Histogram diff (or maybe something similar; I'm not sure I understood the
620/// algorithm correctly). It first diffs lines in the input and then refines
621/// the changed ranges at the word level.
622pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
623 if left == right {
624 return vec![DiffHunk::Matching(left)];
625 }
626 if left.is_empty() {
627 return vec![DiffHunk::Different(vec![b"", right])];
628 }
629 if right.is_empty() {
630 return vec![DiffHunk::Different(vec![left, b""])];
631 }
632
633 Diff::default_refinement(&[left, right])
634 .hunks()
635 .collect_vec()
636}
637
638#[cfg(test)]
639mod tests {
640 use super::*;
641
642 // Extracted to a function because type inference is ambiguous due to
643 // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>`
644 fn no_ranges() -> Vec<Range<usize>> {
645 vec![]
646 }
647
    // Tests for the line/word tokenizers.

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
    }

    #[test]
    fn test_find_word_ranges_multibyte() {
        // "⊢" is three bytes in UTF-8 and all of them count as word bytes.
        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3])
    }

    // Tests for find_lcs(); see its doc comment for the input encoding.

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    // Tests for unchanged_ranges(); token ranges are passed in explicitly.

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (2..3, 4..5)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (4..5, 2..3)]
        );
    }
797
    // Tests for intersect_regions().

    #[test]
    fn test_intersect_regions_existing_empty() {
        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
        let expected = vec![];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_ranges_within_existing() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..70,
                offsets: vec![3],
            }],
            &[(25..30, 35..40), (40..50, 40..50)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 25..30,
                offsets: vec![3, 10],
            },
            UnchangedRange {
                base_range: 40..50,
                offsets: vec![3, 0],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_partial_overlap() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..50,
                offsets: vec![-3],
            }],
            &[(15..25, 5..15), (45..60, 55..70)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..25,
                offsets: vec![-3, -10],
            },
            UnchangedRange {
                base_range: 45..50,
                offsets: vec![-3, 10],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
        let actual = intersect_regions(
            vec![
                UnchangedRange {
                    base_range: 20..50,
                    offsets: vec![3, -8],
                },
                UnchangedRange {
                    base_range: 70..80,
                    offsets: vec![7, 1],
                },
            ],
            &[(10..100, 5..95)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..50,
                offsets: vec![3, -8, -5],
            },
            UnchangedRange {
                base_range: 70..80,
                offsets: vec![7, 1, -5],
            },
        ];
        assert_eq!(actual, expected);
    }
876
    // End-to-end tests for Diff and the diff() convenience function.

    #[test]
    fn test_diff_single_input() {
        let diff = Diff::default_refinement(&[b"abc"]);
        assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]);
    }

    #[test]
    fn test_diff_single_empty_input() {
        let diff = Diff::default_refinement(&[b""]);
        assert_eq!(diff.hunks().collect_vec(), vec![]);
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X", b"b"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b ", b"X ", b""]),
                DiffHunk::Matching(b"c"),
                DiffHunk::Different(vec![b"", b"", b" X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        // Tests that unchanged regions are compacted when using for_tokenizer()
        let diff = Diff::for_tokenizer(
            &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"],
            &find_line_ranges,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a\nb\nc\n"),
                DiffHunk::Different(vec![b"d\n", b"X\n"]),
                DiffHunk::Matching(b"e\nf\ng"),
            ]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![DiffHunk::Different(vec![b"aaa", b"bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(b"a z", b"a S z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"", b"S "]),
                DiffHunk::Matching(b"z"),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(b"a R R S S z", b"a S S R R z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"R R ", b""]),
                DiffHunk::Matching(b"S S "),
                DiffHunk::Different(vec![b"", b"R R "]),
                DiffHunk::Matching(b"z")
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff(
                b"a q x q y q z q b q y q x q c",
                b"a r r x q y z q b y q x r r c",
            ),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"x q y "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"z q b "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"y q x "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"c"),
            ]
        );
    }
1004
    #[test]
    fn test_diff_real_case_write_fmt() {
        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
        // need for recursion into the range at the end: after splitting at "Arguments"
        // and "formatter", the region at the end has the unique words "write_fmt"
        // and "fmt", but we forgot to recurse into that region, so we ended up
        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
        assert_eq!(diff(
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
        ),
        vec![
            DiffHunk::Matching(b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
            DiffHunk::Different(vec![b"", b"-> io::Result<()> "]),
            DiffHunk::Matching(b"{\n        self.styler().write_fmt(fmt)"),
            DiffHunk::Different(vec![b".unwrap()", b""]),
            DiffHunk::Matching(b"\n")
        ]
        );
    }
1025
    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        // This is the diff from commit e497ea2a9b in the git.git repo
        //
        // The two raw-string inputs below are the old and new versions of
        // read-tree.c from that commit (tab-indented C source, reproduced
        // byte-for-byte). The expected value checks the exact hunk sequence
        // the diff produces, down to sub-line granularity — e.g. the
        // `printf(...)` -> `data = read_sha1_file(...)` rewrite is split into
        // several small Matching/Different hunks sharing "(", "sha1", ")".
        //
        // In each Different hunk, element 0 is the old side's content and
        // element 1 is the new side's; an empty slice on one side marks a
        // pure insertion or deletion (e.g. the new create_directories()/
        // create_file() helpers appear only as element 1).
        assert_eq!(
            diff(
                // Old version of read-tree.c:
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                // New version of read-tree.c:
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ),
            // Expected hunk sequence. Note the tab (\t) escapes: they must
            // match the tab indentation of the raw strings above exactly.
            vec![
                DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
                // Pure insertion: the two new helper functions.
                DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
                DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
                DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
                DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
                // Sub-line split of the printf -> read_sha1_file rewrite:
                // the shared "(", "sha1", and ")" tokens stay Matching.
                DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
                DiffHunk::Matching(b"("),
                DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
                DiffHunk::Matching(b"sha1"),
                DiffHunk::Different(vec![b"", b", type, &filesize"]),
                DiffHunk::Matching(b")"),
                DiffHunk::Different(vec![b")", b""]),
                DiffHunk::Matching(b";\n"),
                DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
                DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
            ]
        );
    }
1191}