just playing with tangled
at tmp-tutorial 1191 lines 41 kB view raw
1// Copyright 2021 The Jujutsu Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15#![allow(missing_docs)] 16 17use std::cmp::{max, min, Ordering}; 18use std::collections::{BTreeMap, HashMap}; 19use std::fmt::{Debug, Formatter}; 20use std::ops::Range; 21use std::slice; 22 23use itertools::Itertools; 24 25pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> { 26 let mut ranges = vec![]; 27 let mut start = 0; 28 loop { 29 match text[start..].iter().position(|b| *b == b'\n') { 30 None => { 31 break; 32 } 33 Some(i) => { 34 ranges.push(start..start + i + 1); 35 start += i + 1; 36 } 37 } 38 } 39 if start < text.len() { 40 ranges.push(start..text.len()); 41 } 42 ranges 43} 44 45fn is_word_byte(b: u8) -> bool { 46 // TODO: Make this configurable (probably higher up in the call stack) 47 matches!( 48 b, 49 // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are 50 // treated as a single unit. 51 b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff' 52 ) 53} 54 55pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> { 56 let mut word_ranges = vec![]; 57 let mut word_start_pos = 0; 58 let mut in_word = false; 59 for (i, b) in text.iter().enumerate() { 60 if in_word && !is_word_byte(*b) { 61 in_word = false; 62 word_ranges.push(word_start_pos..i); 63 word_start_pos = i; 64 } else if !in_word && is_word_byte(*b) { 65 in_word = true; 66 word_start_pos = i; 67 } 68 } 69 if in_word && word_start_pos < text.len() { 70 word_ranges.push(word_start_pos..text.len()); 71 } 72 word_ranges 73} 74 75pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> { 76 let mut ranges = vec![]; 77 for (i, b) in text.iter().enumerate() { 78 if !is_word_byte(*b) { 79 ranges.push(i..i + 1); 80 } 81 } 82 ranges 83} 84 85struct Histogram<'a> { 86 word_to_positions: HashMap<&'a [u8], Vec<usize>>, 87 count_to_words: BTreeMap<usize, Vec<&'a [u8]>>, 88} 89 90impl Histogram<'_> { 91 fn calculate<'a>( 92 text: &'a [u8], 93 ranges: &[Range<usize>], 94 max_occurrences: usize, 95 ) -> Histogram<'a> { 96 let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new(); 97 for (i, range) in ranges.iter().enumerate() { 98 let positions = word_to_positions.entry(&text[range.clone()]).or_default(); 99 // Allow one more than max_occurrences, so we can later skip those with more 100 // than max_occurrences 101 if positions.len() <= max_occurrences { 102 positions.push(i); 103 } 104 } 105 let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new(); 106 for (word, ranges) in &word_to_positions { 107 count_to_words.entry(ranges.len()).or_default().push(word); 108 } 109 Histogram { 110 word_to_positions, 111 count_to_words, 112 } 113 } 114} 115 116/// Finds the LCS given a array where the value of `input[i]` indicates that 117/// the position of element `i` in the right array is at position `input[i]` in 118/// the left array. 119/// 120/// For example (some have multiple valid outputs): 121/// 122/// [0,1,2] => [(0,0),(1,1),(2,2)] 123/// [2,1,0] => [(0,2)] 124/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)] 125/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)] 126fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { 127 if input.is_empty() { 128 return vec![]; 129 } 130 131 let mut chain = vec![(0, 0, 0); input.len()]; 132 let mut global_longest = 0; 133 let mut global_longest_right_pos = 0; 134 for (right_pos, &left_pos) in input.iter().enumerate() { 135 let mut longest_from_here = 1; 136 let mut previous_right_pos = usize::MAX; 137 for i in (0..right_pos).rev() { 138 let (previous_len, previous_left_pos, _) = chain[i]; 139 if previous_left_pos < left_pos { 140 let len = previous_len + 1; 141 if len > longest_from_here { 142 longest_from_here = len; 143 previous_right_pos = i; 144 if len > global_longest { 145 global_longest = len; 146 global_longest_right_pos = right_pos; 147 // If this is the longest chain globally so far, we cannot find a 148 // longer one by using a previous value, so break early. 149 break; 150 } 151 } 152 } 153 } 154 chain[right_pos] = (longest_from_here, left_pos, previous_right_pos); 155 } 156 157 let mut result = vec![]; 158 let mut right_pos = global_longest_right_pos; 159 loop { 160 let (_, left_pos, previous_right_pos) = chain[right_pos]; 161 result.push((left_pos, right_pos)); 162 if previous_right_pos == usize::MAX { 163 break; 164 } 165 right_pos = previous_right_pos; 166 } 167 result.reverse(); 168 169 result 170} 171 172/// Finds unchanged ranges among the ones given as arguments. The data between 173/// those ranges is ignored. 174pub(crate) fn unchanged_ranges( 175 left: &[u8], 176 right: &[u8], 177 left_ranges: &[Range<usize>], 178 right_ranges: &[Range<usize>], 179) -> Vec<(Range<usize>, Range<usize>)> { 180 if left_ranges.is_empty() || right_ranges.is_empty() { 181 return vec![]; 182 } 183 184 let max_occurrences = 100; 185 let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences); 186 if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences { 187 // If there are very many occurrences of all words, then we just give up. 188 return vec![]; 189 } 190 let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences); 191 // Look for words with few occurrences in `left` (could equally well have picked 192 // `right`?). If any of them also occur in `right`, then we add the words to 193 // the LCS. 194 let mut uncommon_shared_words = vec![]; 195 while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() { 196 let left_words = left_histogram 197 .count_to_words 198 .first_entry() 199 .map(|x| x.remove()) 200 .unwrap(); 201 for left_word in left_words { 202 if right_histogram.word_to_positions.contains_key(left_word) { 203 uncommon_shared_words.push(left_word); 204 } 205 } 206 } 207 if uncommon_shared_words.is_empty() { 208 return vec![]; 209 } 210 211 // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found 212 // the least common words to be "a" and "b". We now assume that each 213 // occurrence of each word lines up in the left and right input. We do that 214 // by numbering the shared occurrences, effectively instead comparing "a1 b1 215 // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the 216 // right input in order (["a1", "b1", "b2", "a2"]), and record the index of 217 // that word in the left input ([0,1,3,2]). We then find the LCS and split 218 // points based on that ([0,1,3] or [0,1,2] are both valid). 219 220 // [(index into left_ranges, word, occurrence #)] 221 let mut left_positions = vec![]; 222 let mut right_positions = vec![]; 223 for uncommon_shared_word in uncommon_shared_words { 224 let left_occurrences = left_histogram 225 .word_to_positions 226 .get_mut(uncommon_shared_word) 227 .unwrap(); 228 let right_occurrences = right_histogram 229 .word_to_positions 230 .get_mut(uncommon_shared_word) 231 .unwrap(); 232 let shared_count = min(left_occurrences.len(), right_occurrences.len()); 233 for occurrence in 0..shared_count { 234 left_positions.push(( 235 left_occurrences[occurrence], 236 uncommon_shared_word, 237 occurrence, 238 )); 239 right_positions.push(( 240 right_occurrences[occurrence], 241 uncommon_shared_word, 242 occurrence, 243 )); 244 } 245 } 246 left_positions.sort(); 247 right_positions.sort(); 248 let mut left_position_map = HashMap::new(); 249 for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() { 250 left_position_map.insert((*word, *occurrence), i); 251 } 252 let mut left_index_by_right_index = vec![]; 253 for (_pos, word, occurrence) in &right_positions { 254 left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap()); 255 } 256 257 let lcs = find_lcs(&left_index_by_right_index); 258 259 // Produce output ranges, recursing into the modified areas between the elements 260 // in the LCS. 261 let mut result = vec![]; 262 let mut previous_left_position = 0; 263 let mut previous_right_position = 0; 264 for (left_index, right_index) in lcs { 265 let left_position = left_positions[left_index].0; 266 let right_position = right_positions[right_index].0; 267 let skipped_left_positions = previous_left_position..left_position; 268 let skipped_right_positions = previous_right_position..right_position; 269 if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() { 270 for unchanged_nested_range in unchanged_ranges( 271 left, 272 right, 273 &left_ranges[skipped_left_positions.clone()], 274 &right_ranges[skipped_right_positions.clone()], 275 ) { 276 result.push(unchanged_nested_range); 277 } 278 } 279 result.push(( 280 left_ranges[left_position].clone(), 281 right_ranges[right_position].clone(), 282 )); 283 previous_left_position = left_position + 1; 284 previous_right_position = right_position + 1; 285 } 286 // Also recurse into range at end (after common ranges). 287 let skipped_left_positions = previous_left_position..left_ranges.len(); 288 let skipped_right_positions = previous_right_position..right_ranges.len(); 289 if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() { 290 for unchanged_nested_range in unchanged_ranges( 291 left, 292 right, 293 &left_ranges[skipped_left_positions], 294 &right_ranges[skipped_right_positions], 295 ) { 296 result.push(unchanged_nested_range); 297 } 298 } 299 300 result 301} 302 303#[derive(Clone, PartialEq, Eq, Debug)] 304struct UnchangedRange { 305 base_range: Range<usize>, 306 offsets: Vec<isize>, 307} 308 309impl UnchangedRange { 310 fn start(&self, side: usize) -> usize { 311 self.base_range 312 .start 313 .wrapping_add(self.offsets[side] as usize) 314 } 315 316 fn end(&self, side: usize) -> usize { 317 self.base_range 318 .end 319 .wrapping_add(self.offsets[side] as usize) 320 } 321} 322 323impl PartialOrd for UnchangedRange { 324 fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 325 Some(self.cmp(other)) 326 } 327} 328 329impl Ord for UnchangedRange { 330 fn cmp(&self, other: &Self) -> Ordering { 331 self.base_range 332 .start 333 .cmp(&other.base_range.start) 334 .then_with(|| self.base_range.end.cmp(&other.base_range.end)) 335 } 336} 337 338/// Takes any number of inputs and finds regions that are them same between all 339/// of them. 340#[derive(Clone, Debug)] 341pub struct Diff<'input> { 342 base_input: &'input [u8], 343 other_inputs: Vec<&'input [u8]>, 344 // The key is a range in the base input. The value is the start of each non-base region 345 // relative to the base region's start. By making them relative, they don't need to change 346 // when the base range changes. 347 unchanged_regions: Vec<UnchangedRange>, 348} 349 350/// Takes the current regions and intersects it with the new unchanged ranges 351/// from a 2-way diff. The result is a map of unchanged regions with one more 352/// offset in the map's values. 353fn intersect_regions( 354 current_ranges: Vec<UnchangedRange>, 355 new_unchanged_ranges: &[(Range<usize>, Range<usize>)], 356) -> Vec<UnchangedRange> { 357 let mut result = vec![]; 358 let mut current_ranges_iter = current_ranges.into_iter().peekable(); 359 for (new_base_range, other_range) in new_unchanged_ranges.iter() { 360 assert_eq!(new_base_range.len(), other_range.len()); 361 while let Some(UnchangedRange { 362 base_range, 363 offsets, 364 }) = current_ranges_iter.peek() 365 { 366 // No need to look further if we're past the new range. 367 if base_range.start >= new_base_range.end { 368 break; 369 } 370 // Discard any current unchanged regions that don't match between the base and 371 // the new input. 372 if base_range.end <= new_base_range.start { 373 current_ranges_iter.next(); 374 continue; 375 } 376 let new_start = max(base_range.start, new_base_range.start); 377 let new_end = min(base_range.end, new_base_range.end); 378 let mut new_offsets = offsets.clone(); 379 new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize); 380 result.push(UnchangedRange { 381 base_range: new_start..new_end, 382 offsets: new_offsets, 383 }); 384 if base_range.end >= new_base_range.end { 385 // Break without consuming the item; there may be other new ranges that overlap 386 // with it. 387 break; 388 } 389 current_ranges_iter.next(); 390 } 391 } 392 result 393} 394 395impl<'input> Diff<'input> { 396 pub fn for_tokenizer( 397 inputs: &[&'input [u8]], 398 tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>, 399 ) -> Self { 400 assert!(!inputs.is_empty()); 401 let base_input = inputs[0]; 402 let other_inputs = inputs.iter().skip(1).copied().collect_vec(); 403 // First tokenize each input 404 let base_token_ranges: Vec<Range<usize>> = tokenizer(base_input); 405 let other_token_ranges: Vec<Vec<Range<usize>>> = other_inputs 406 .iter() 407 .map(|other_input| tokenizer(other_input)) 408 .collect_vec(); 409 410 // Look for unchanged regions. Initially consider the whole range of the base 411 // input as unchanged (compared to itself). Then diff each other input 412 // against the base. Intersect the previously found ranges with the 413 // unchanged ranges in the diff. 414 let mut unchanged_regions = vec![UnchangedRange { 415 base_range: 0..base_input.len(), 416 offsets: vec![], 417 }]; 418 for (i, other_token_ranges) in other_token_ranges.iter().enumerate() { 419 let unchanged_diff_ranges = unchanged_ranges( 420 base_input, 421 other_inputs[i], 422 &base_token_ranges, 423 other_token_ranges, 424 ); 425 unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges); 426 } 427 // Add an empty range at the end to make life easier for hunks(). 428 let offsets = other_inputs 429 .iter() 430 .map(|input| input.len().wrapping_sub(base_input.len()) as isize) 431 .collect_vec(); 432 unchanged_regions.push(UnchangedRange { 433 base_range: base_input.len()..base_input.len(), 434 offsets, 435 }); 436 437 let mut diff = Self { 438 base_input, 439 other_inputs, 440 unchanged_regions, 441 }; 442 diff.compact_unchanged_regions(); 443 diff 444 } 445 446 pub fn unrefined(inputs: &[&'input [u8]]) -> Self { 447 Diff::for_tokenizer(inputs, &|_| vec![]) 448 } 449 450 // TODO: At least when merging, it's wasteful to refine the diff if e.g. if 2 451 // out of 3 inputs match in the differing regions. Perhaps the refine() 452 // method should be on the hunk instead (probably returning a new Diff)? 453 // That would let each user decide which hunks to refine. However, it would 454 // probably mean that many callers repeat the same code. Perhaps it 455 // should be possible to refine a whole diff *or* individual hunks. 456 pub fn default_refinement(inputs: &[&'input [u8]]) -> Self { 457 let mut diff = Diff::for_tokenizer(inputs, &find_line_ranges); 458 diff.refine_changed_regions(&find_word_ranges); 459 diff.refine_changed_regions(&find_nonword_ranges); 460 diff 461 } 462 463 pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> { 464 let previous_offsets = vec![0; self.other_inputs.len()]; 465 DiffHunkIterator { 466 diff: self, 467 previous: UnchangedRange { 468 base_range: 0..0, 469 offsets: previous_offsets, 470 }, 471 unchanged_emitted: true, 472 unchanged_iter: self.unchanged_regions.iter(), 473 } 474 } 475 476 /// Uses the given tokenizer to split the changed regions into smaller 477 /// regions. Then tries to finds unchanged regions among them. 478 pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) { 479 let mut previous = UnchangedRange { 480 base_range: 0..0, 481 offsets: vec![0; self.other_inputs.len()], 482 }; 483 let mut new_unchanged_ranges = vec![]; 484 for current in self.unchanged_regions.iter() { 485 // For the changed region between the previous region and the current one, 486 // create a new Diff instance. Then adjust the start positions and 487 // offsets to be valid in the context of the larger Diff instance 488 // (`self`). 489 let mut slices = 490 vec![&self.base_input[previous.base_range.end..current.base_range.start]]; 491 for i in 0..current.offsets.len() { 492 let changed_range = previous.end(i)..current.start(i); 493 slices.push(&self.other_inputs[i][changed_range]); 494 } 495 496 let refined_diff = Diff::for_tokenizer(&slices, tokenizer); 497 498 for UnchangedRange { 499 base_range, 500 offsets, 501 } in refined_diff.unchanged_regions 502 { 503 let new_base_start = base_range.start + previous.base_range.end; 504 let new_base_end = base_range.end + previous.base_range.end; 505 let offsets = offsets 506 .into_iter() 507 .enumerate() 508 .map(|(i, offset)| offset + previous.offsets[i]) 509 .collect_vec(); 510 new_unchanged_ranges.push(UnchangedRange { 511 base_range: new_base_start..new_base_end, 512 offsets, 513 }); 514 } 515 previous = current.clone(); 516 } 517 self.unchanged_regions = self 518 .unchanged_regions 519 .iter() 520 .cloned() 521 .merge(new_unchanged_ranges) 522 .collect_vec(); 523 self.compact_unchanged_regions(); 524 } 525 526 fn compact_unchanged_regions(&mut self) { 527 let mut compacted = vec![]; 528 let mut maybe_previous: Option<UnchangedRange> = None; 529 for current in self.unchanged_regions.iter() { 530 if let Some(previous) = maybe_previous { 531 if previous.base_range.end == current.base_range.start 532 && previous.offsets == *current.offsets 533 { 534 maybe_previous = Some(UnchangedRange { 535 base_range: previous.base_range.start..current.base_range.end, 536 offsets: current.offsets.clone(), 537 }); 538 continue; 539 } 540 compacted.push(previous); 541 } 542 maybe_previous = Some(current.clone()); 543 } 544 if let Some(previous) = maybe_previous { 545 compacted.push(previous); 546 } 547 self.unchanged_regions = compacted; 548 } 549} 550 551#[derive(PartialEq, Eq, Clone)] 552pub enum DiffHunk<'input> { 553 Matching(&'input [u8]), 554 Different(Vec<&'input [u8]>), 555} 556 557impl Debug for DiffHunk<'_> { 558 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { 559 match self { 560 DiffHunk::Matching(slice) => f 561 .debug_tuple("DiffHunk::Matching") 562 .field(&String::from_utf8_lossy(slice)) 563 .finish(), 564 DiffHunk::Different(slices) => f 565 .debug_tuple("DiffHunk::Different") 566 .field( 567 &slices 568 .iter() 569 .map(|slice| String::from_utf8_lossy(slice)) 570 .collect_vec(), 571 ) 572 .finish(), 573 } 574 } 575} 576 577pub struct DiffHunkIterator<'diff, 'input> { 578 diff: &'diff Diff<'input>, 579 previous: UnchangedRange, 580 unchanged_emitted: bool, 581 unchanged_iter: slice::Iter<'diff, UnchangedRange>, 582} 583 584impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { 585 type Item = DiffHunk<'input>; 586 587 fn next(&mut self) -> Option<Self::Item> { 588 loop { 589 if !self.unchanged_emitted { 590 self.unchanged_emitted = true; 591 if !self.previous.base_range.is_empty() { 592 return Some(DiffHunk::Matching( 593 &self.diff.base_input[self.previous.base_range.clone()], 594 )); 595 } 596 } 597 if let Some(current) = self.unchanged_iter.next() { 598 let mut slices = vec![ 599 &self.diff.base_input[self.previous.base_range.end..current.base_range.start], 600 ]; 601 for (i, input) in self.diff.other_inputs.iter().enumerate() { 602 slices.push(&input[self.previous.end(i)..current.start(i)]); 603 } 604 self.previous = current.clone(); 605 self.unchanged_emitted = false; 606 if slices.iter().any(|slice| !slice.is_empty()) { 607 return Some(DiffHunk::Different(slices)); 608 } 609 } else { 610 break; 611 } 612 } 613 None 614 } 615} 616 617/// Diffs two slices of bytes. The returned diff hunks may be any length (may 618/// span many lines or may be only part of a line). This currently uses 619/// Histogram diff (or maybe something similar; I'm not sure I understood the 620/// algorithm correctly). It first diffs lines in the input and then refines 621/// the changed ranges at the word level. 622pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> { 623 if left == right { 624 return vec![DiffHunk::Matching(left)]; 625 } 626 if left.is_empty() { 627 return vec![DiffHunk::Different(vec![b"", right])]; 628 } 629 if right.is_empty() { 630 return vec![DiffHunk::Different(vec![left, b""])]; 631 } 632 633 Diff::default_refinement(&[left, right]) 634 .hunks() 635 .collect_vec() 636} 637 638#[cfg(test)] 639mod tests { 640 use super::*; 641 642 // Extracted to a function because type inference is ambiguous due to 643 // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>` 644 fn no_ranges() -> Vec<Range<usize>> { 645 vec![] 646 } 647 648 #[test] 649 fn test_find_line_ranges_empty() { 650 assert_eq!(find_line_ranges(b""), no_ranges()); 651 } 652 653 #[test] 654 fn test_find_line_ranges_blank_line() { 655 assert_eq!(find_line_ranges(b"\n"), vec![0..1]); 656 } 657 658 #[test] 659 fn test_find_line_ranges_missing_newline_at_eof() { 660 assert_eq!(find_line_ranges(b"foo"), vec![0..3]); 661 } 662 663 #[test] 664 fn test_find_line_ranges_multiple_lines() { 665 assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]); 666 } 667 668 #[test] 669 fn test_find_word_ranges_empty() { 670 assert_eq!(find_word_ranges(b""), no_ranges()); 671 } 672 673 #[test] 674 fn test_find_word_ranges_single_word() { 675 assert_eq!(find_word_ranges(b"Abc"), vec![0..3]); 676 } 677 678 #[test] 679 fn test_find_word_ranges_no_word() { 680 assert_eq!(find_word_ranges(b"+-*/"), no_ranges()); 681 } 682 683 #[test] 684 fn test_find_word_ranges_word_then_non_word() { 685 assert_eq!(find_word_ranges(b"Abc "), vec![0..3]); 686 } 687 688 #[test] 689 fn test_find_word_ranges_non_word_then_word() { 690 assert_eq!(find_word_ranges(b" Abc"), vec![3..6]); 691 } 692 693 #[test] 694 fn test_find_word_ranges_multibyte() { 695 assert_eq!(find_word_ranges("".as_bytes()), vec![0..3]) 696 } 697 698 #[test] 699 fn test_find_lcs_empty() { 700 let empty: Vec<(usize, usize)> = vec![]; 701 assert_eq!(find_lcs(&[]), empty); 702 } 703 704 #[test] 705 fn test_find_lcs_single_element() { 706 assert_eq!(find_lcs(&[0]), vec![(0, 0)]); 707 } 708 709 #[test] 710 fn test_find_lcs_in_order() { 711 assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]); 712 } 713 714 #[test] 715 fn test_find_lcs_reverse_order() { 716 assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]); 717 } 718 719 #[test] 720 fn test_find_lcs_two_swapped() { 721 assert_eq!( 722 find_lcs(&[0, 1, 4, 3, 2, 5, 6]), 723 vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)] 724 ); 725 } 726 727 #[test] 728 fn test_find_lcs_element_moved_earlier() { 729 assert_eq!( 730 find_lcs(&[0, 1, 4, 2, 3, 5, 6]), 731 vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)] 732 ); 733 } 734 735 #[test] 736 fn test_find_lcs_element_moved_later() { 737 assert_eq!( 738 find_lcs(&[0, 1, 3, 4, 2, 5, 6]), 739 vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)] 740 ); 741 } 742 743 #[test] 744 fn test_find_lcs_interleaved_longest_chains() { 745 assert_eq!( 746 find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]), 747 vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)] 748 ); 749 } 750 751 #[test] 752 fn test_find_word_ranges_many_words() { 753 assert_eq!( 754 find_word_ranges(b"fn find_words(text: &[u8])"), 755 vec![0..2, 3..13, 14..18, 22..24] 756 ); 757 } 758 759 #[test] 760 fn test_unchanged_ranges_insert_in_middle() { 761 assert_eq!( 762 unchanged_ranges( 763 b"a b b c", 764 b"a b X b c", 765 &[0..1, 2..3, 4..5, 6..7], 766 &[0..1, 2..3, 4..5, 6..7, 8..9], 767 ), 768 vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] 769 ); 770 } 771 772 #[test] 773 fn test_unchanged_ranges_non_unique_removed() { 774 assert_eq!( 775 unchanged_ranges( 776 b"a a a a", 777 b"a b a c", 778 &[0..1, 2..3, 4..5, 6..7], 779 &[0..1, 2..3, 4..5, 6..7], 780 ), 781 vec![(0..1, 0..1), (2..3, 4..5)] 782 ); 783 } 784 785 #[test] 786 fn test_unchanged_ranges_non_unique_added() { 787 assert_eq!( 788 unchanged_ranges( 789 b"a b a c", 790 b"a a a a", 791 &[0..1, 2..3, 4..5, 6..7], 792 &[0..1, 2..3, 4..5, 6..7], 793 ), 794 vec![(0..1, 0..1), (4..5, 2..3)] 795 ); 796 } 797 798 #[test] 799 fn test_intersect_regions_existing_empty() { 800 let actual = intersect_regions(vec![], &[(20..25, 55..60)]); 801 let expected = vec![]; 802 assert_eq!(actual, expected); 803 } 804 805 #[test] 806 fn test_intersect_regions_new_ranges_within_existing() { 807 let actual = intersect_regions( 808 vec![UnchangedRange { 809 base_range: 20..70, 810 offsets: vec![3], 811 }], 812 &[(25..30, 35..40), (40..50, 40..50)], 813 ); 814 let expected = vec![ 815 UnchangedRange { 816 base_range: 25..30, 817 offsets: vec![3, 10], 818 }, 819 UnchangedRange { 820 base_range: 40..50, 821 offsets: vec![3, 0], 822 }, 823 ]; 824 assert_eq!(actual, expected); 825 } 826 827 #[test] 828 fn test_intersect_regions_partial_overlap() { 829 let actual = intersect_regions( 830 vec![UnchangedRange { 831 base_range: 20..50, 832 offsets: vec![-3], 833 }], 834 &[(15..25, 5..15), (45..60, 55..70)], 835 ); 836 let expected = vec![ 837 UnchangedRange { 838 base_range: 20..25, 839 offsets: vec![-3, -10], 840 }, 841 UnchangedRange { 842 base_range: 45..50, 843 offsets: vec![-3, 10], 844 }, 845 ]; 846 assert_eq!(actual, expected); 847 } 848 849 #[test] 850 fn test_intersect_regions_new_range_overlaps_multiple_existing() { 851 let actual = intersect_regions( 852 vec![ 853 UnchangedRange { 854 base_range: 20..50, 855 offsets: vec![3, -8], 856 }, 857 UnchangedRange { 858 base_range: 70..80, 859 offsets: vec![7, 1], 860 }, 861 ], 862 &[(10..100, 5..95)], 863 ); 864 let expected = vec![ 865 UnchangedRange { 866 base_range: 20..50, 867 offsets: vec![3, -8, -5], 868 }, 869 UnchangedRange { 870 base_range: 70..80, 871 offsets: vec![7, 1, -5], 872 }, 873 ]; 874 assert_eq!(actual, expected); 875 } 876 877 #[test] 878 fn test_diff_single_input() { 879 let diff = Diff::default_refinement(&[b"abc"]); 880 assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]); 881 } 882 883 #[test] 884 fn test_diff_single_empty_input() { 885 let diff = Diff::default_refinement(&[b""]); 886 assert_eq!(diff.hunks().collect_vec(), vec![]); 887 } 888 889 #[test] 890 fn test_diff_two_inputs_one_different() { 891 let diff = Diff::default_refinement(&[b"a b c", b"a X c"]); 892 assert_eq!( 893 diff.hunks().collect_vec(), 894 vec![ 895 DiffHunk::Matching(b"a "), 896 DiffHunk::Different(vec![b"b", b"X"]), 897 DiffHunk::Matching(b" c"), 898 ] 899 ); 900 } 901 902 #[test] 903 fn test_diff_multiple_inputs_one_different() { 904 let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]); 905 assert_eq!( 906 diff.hunks().collect_vec(), 907 vec![ 908 DiffHunk::Matching(b"a "), 909 DiffHunk::Different(vec![b"b", b"X", b"b"]), 910 DiffHunk::Matching(b" c"), 911 ] 912 ); 913 } 914 915 #[test] 916 fn test_diff_multiple_inputs_all_different() { 917 let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]); 918 assert_eq!( 919 diff.hunks().collect_vec(), 920 vec![ 921 DiffHunk::Matching(b"a "), 922 DiffHunk::Different(vec![b"b ", b"X ", b""]), 923 DiffHunk::Matching(b"c"), 924 DiffHunk::Different(vec![b"", b"", b" X"]), 925 ] 926 ); 927 } 928 929 #[test] 930 fn test_diff_for_tokenizer_compacted() { 931 // Tests that unchanged regions are compacted when using for_tokenizer() 932 let diff = Diff::for_tokenizer( 933 &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"], 934 &find_line_ranges, 935 ); 936 assert_eq!( 937 diff.hunks().collect_vec(), 938 vec![ 939 DiffHunk::Matching(b"a\nb\nc\n"), 940 DiffHunk::Different(vec![b"d\n", b"X\n"]), 941 DiffHunk::Matching(b"e\nf\ng"), 942 ] 943 ); 944 } 945 946 #[test] 947 fn test_diff_nothing_in_common() { 948 assert_eq!( 949 diff(b"aaa", b"bb"), 950 vec![DiffHunk::Different(vec![b"aaa", b"bb"])] 951 ); 952 } 953 954 #[test] 955 fn test_diff_insert_in_middle() { 956 assert_eq!( 957 diff(b"a z", b"a S z"), 958 vec![ 959 DiffHunk::Matching(b"a "), 960 DiffHunk::Different(vec![b"", b"S "]), 961 DiffHunk::Matching(b"z"), 962 ] 963 ); 964 } 965 966 #[test] 967 fn test_diff_no_unique_middle_flips() { 968 assert_eq!( 969 diff(b"a R R S S z", b"a S S R R z"), 970 vec![ 971 DiffHunk::Matching(b"a "), 972 DiffHunk::Different(vec![b"R R ", b""]), 973 DiffHunk::Matching(b"S S "), 974 DiffHunk::Different(vec![b"", b"R R "]), 975 DiffHunk::Matching(b"z") 976 ], 977 ); 978 } 979 980 #[test] 981 fn test_diff_recursion_needed() { 982 assert_eq!( 983 diff( 984 b"a q x q y q z q b q y q x q c", 985 b"a r r x q y z q b y q x r r c", 986 ), 987 vec![ 988 DiffHunk::Matching(b"a "), 989 DiffHunk::Different(vec![b"q", b"r"]), 990 DiffHunk::Matching(b" "), 991 DiffHunk::Different(vec![b"", b"r "]), 992 DiffHunk::Matching(b"x q y "), 993 DiffHunk::Different(vec![b"q ", b""]), 994 DiffHunk::Matching(b"z q b "), 995 DiffHunk::Different(vec![b"q ", b""]), 996 DiffHunk::Matching(b"y q x "), 997 DiffHunk::Different(vec![b"q", b"r"]), 998 DiffHunk::Matching(b" "), 999 DiffHunk::Different(vec![b"", b"r "]), 1000 DiffHunk::Matching(b"c"), 1001 ] 1002 ); 1003 } 1004 1005 #[test] 1006 fn test_diff_real_case_write_fmt() { 1007 // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the 1008 // need for recursion into the range at the end: after splitting at "Arguments" 1009 // and "formatter", the region at the end has the unique words "write_fmt" 1010 // and "fmt", but we forgot to recurse into that region, so we ended up 1011 // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)". 1012 assert_eq!(diff( 1013 b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n", 1014 b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n" 1015 ), 1016 vec![ 1017 DiffHunk::Matching(b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "), 1018 DiffHunk::Different(vec![b"", b"-> io::Result<()> "]), 1019 DiffHunk::Matching(b"{\n self.styler().write_fmt(fmt)"), 1020 DiffHunk::Different(vec![b".unwrap()", b""]), 1021 DiffHunk::Matching(b"\n") 1022 ] 1023 ); 1024 } 1025 1026 #[test] 1027 fn test_diff_real_case_gitgit_read_tree_c() { 1028 // This is the diff from commit e497ea2a9b in the git.git repo 1029 assert_eq!( 1030 diff( 1031 br##"/* 1032 * GIT - The information manager from hell 1033 * 1034 * Copyright (C) Linus Torvalds, 2005 1035 */ 1036#include "#cache.h" 1037 1038static int unpack(unsigned char *sha1) 1039{ 1040 void *buffer; 1041 unsigned long size; 1042 char type[20]; 1043 1044 buffer = read_sha1_file(sha1, type, &size); 1045 if (!buffer) 1046 usage("unable to read sha1 file"); 1047 if (strcmp(type, "tree")) 1048 usage("expected a 'tree' node"); 1049 while (size) { 1050 int len = strlen(buffer)+1; 1051 unsigned char *sha1 = buffer + len; 1052 char *path = strchr(buffer, ' ')+1; 1053 unsigned int mode; 1054 if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) 1055 usage("corrupt 'tree' file"); 1056 buffer = sha1 + 20; 1057 size -= len + 20; 1058 printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1)); 1059 } 1060 return 0; 1061} 1062 1063int main(int argc, char **argv) 1064{ 1065 int fd; 1066 unsigned char sha1[20]; 1067 1068 if (argc != 2) 1069 usage("read-tree <key>"); 1070 if (get_sha1_hex(argv[1], sha1) < 0) 1071 usage("read-tree <key>"); 1072 sha1_file_directory = getenv(DB_ENVIRONMENT); 1073 if (!sha1_file_directory) 1074 sha1_file_directory = DEFAULT_DB_ENVIRONMENT; 1075 if (unpack(sha1) < 0) 1076 usage("unpack failed"); 1077 return 0; 1078} 1079"##, 1080 br##"/* 1081 * GIT - The information manager from hell 1082 * 1083 * Copyright (C) Linus Torvalds, 2005 1084 */ 1085#include "#cache.h" 1086 1087static void create_directories(const char *path) 1088{ 1089 int len = strlen(path); 1090 char *buf = malloc(len + 1); 1091 const char *slash = path; 1092 1093 while ((slash = strchr(slash+1, '/')) != NULL) { 1094 len = slash - path; 1095 memcpy(buf, path, len); 1096 buf[len] = 0; 1097 mkdir(buf, 0700); 1098 } 1099} 1100 1101static int create_file(const char *path) 1102{ 1103 int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); 1104 if (fd < 0) { 1105 if (errno == ENOENT) { 1106 create_directories(path); 1107 fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); 1108 } 1109 } 1110 return fd; 1111} 1112 1113static int unpack(unsigned char *sha1) 1114{ 1115 void *buffer; 1116 unsigned long size; 1117 char type[20]; 1118 1119 buffer = read_sha1_file(sha1, type, &size); 1120 if (!buffer) 1121 usage("unable to read sha1 file"); 1122 if (strcmp(type, "tree")) 1123 usage("expected a 'tree' node"); 1124 while (size) { 1125 int len = strlen(buffer)+1; 1126 unsigned char *sha1 = buffer + len; 1127 char *path = strchr(buffer, ' ')+1; 1128 char *data; 1129 unsigned long filesize; 1130 unsigned int mode; 1131 int fd; 1132 1133 if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) 1134 usage("corrupt 'tree' file"); 1135 buffer = sha1 + 20; 1136 size -= len + 20; 1137 data = read_sha1_file(sha1, type, &filesize); 1138 if (!data || strcmp(type, "blob")) 1139 usage("tree file refers to bad file data"); 1140 fd = create_file(path); 1141 if (fd < 0) 1142 usage("unable to create file"); 1143 if (write(fd, data, filesize) != filesize) 1144 usage("unable to write file"); 1145 fchmod(fd, mode); 1146 close(fd); 1147 free(data); 1148 } 1149 return 0; 1150} 1151 1152int main(int argc, char **argv) 1153{ 1154 int fd; 1155 unsigned char sha1[20]; 1156 1157 if (argc != 2) 1158 usage("read-tree <key>"); 1159 if (get_sha1_hex(argv[1], sha1) < 0) 1160 usage("read-tree <key>"); 1161 sha1_file_directory = getenv(DB_ENVIRONMENT); 1162 if (!sha1_file_directory) 1163 sha1_file_directory = DEFAULT_DB_ENVIRONMENT; 1164 if (unpack(sha1) < 0) 1165 usage("unpack failed"); 1166 return 0; 1167} 1168"##, 1169 ), 1170 vec![ 1171 DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"), 1172 DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]), 1173 DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"), 1174 DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]), 1175 DiffHunk::Matching(b"\t\tunsigned int mode;\n"), 1176 DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]), 1177 DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"), 1178 DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]), 1179 DiffHunk::Matching(b"("), 1180 DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]), 1181 DiffHunk::Matching(b"sha1"), 1182 DiffHunk::Different(vec![b"", b", type, &filesize"]), 1183 DiffHunk::Matching(b")"), 1184 DiffHunk::Different(vec![b")", b""]), 1185 DiffHunk::Matching(b";\n"), 1186 DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]), 1187 DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n") 1188 ] 1189 ); 1190 } 1191}