// just playing with tangled
1// Copyright 2021 The Jujutsu Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#![allow(missing_docs)]
16
17use std::cmp::{max, min, Ordering};
18use std::collections::{BTreeMap, HashMap};
19use std::fmt::{Debug, Formatter};
20use std::ops::Range;
21use std::slice;
22
23use itertools::Itertools;
24
/// Returns the byte ranges of the lines in `text`. Each range includes the
/// trailing `\n`, except possibly the last line, which may end at the end of
/// the input without a newline. Empty input yields no ranges.
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
    let mut ranges = vec![];
    let mut start = 0;
    // `split_inclusive` keeps the `\n` terminator on each chunk and yields
    // nothing for empty input, matching the manual scan it replaces.
    for line in text.split_inclusive(|b| *b == b'\n') {
        let end = start + line.len();
        ranges.push(start..end);
        start = end;
    }
    ranges
}
44
/// Decides whether `b` counts as part of a "word" for the word-level
/// tokenizer.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    match b {
        // ASCII alphanumerics and underscore are word bytes.
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
        // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
        // treated as a single unit.
        0x80..=0xff => true,
        _ => false,
    }
}
54
55pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
56 let mut word_ranges = vec![];
57 let mut word_start_pos = 0;
58 let mut in_word = false;
59 for (i, b) in text.iter().enumerate() {
60 if in_word && !is_word_byte(*b) {
61 in_word = false;
62 word_ranges.push(word_start_pos..i);
63 word_start_pos = i;
64 } else if !in_word && is_word_byte(*b) {
65 in_word = true;
66 word_start_pos = i;
67 }
68 }
69 if in_word && word_start_pos < text.len() {
70 word_ranges.push(word_start_pos..text.len());
71 }
72 word_ranges
73}
74
75pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
76 let mut ranges = vec![];
77 for (i, b) in text.iter().enumerate() {
78 if !is_word_byte(*b) {
79 ranges.push(i..i + 1);
80 }
81 }
82 ranges
83}
84
/// Occurrence histogram of the tokens (words) of one input.
struct Histogram<'a> {
    // Maps each distinct word to the indexes (into the tokenizer's range
    // list) where it occurs, capped at `max_occurrences + 1` entries.
    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
    // Groups words by their (capped) occurrence count, lowest count first.
    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
}

impl Histogram<'_> {
    /// Builds the histogram for `text` tokenized into `ranges`.
    fn calculate<'a>(
        text: &'a [u8],
        ranges: &[Range<usize>],
        max_occurrences: usize,
    ) -> Histogram<'a> {
        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
        for (index, range) in ranges.iter().enumerate() {
            let word = &text[range.clone()];
            let positions = word_to_positions.entry(word).or_default();
            // Record one more position than `max_occurrences`, so words that
            // occur too often can be recognized (and skipped) later.
            if positions.len() <= max_occurrences {
                positions.push(index);
            }
        }
        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
        for (word, positions) in &word_to_positions {
            count_to_words.entry(positions.len()).or_default().push(word);
        }
        Histogram {
            word_to_positions,
            count_to_words,
        }
    }
}
115
/// Finds the LCS given an array where the value of `input[i]` indicates that
/// the position of element `i` in the right array is at position `input[i]`
/// in the left array.
///
/// For example (some have multiple valid outputs):
///
/// [0,1,2] => [(0,0),(1,1),(2,2)]
/// [2,1,0] => [(0,2)]
/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)]
fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
    if input.is_empty() {
        return vec![];
    }

    // chain[r] = (length of the longest chain ending at right position r,
    //             left position of element r,
    //             right position of the previous element in that chain,
    //             or usize::MAX if r starts the chain)
    let mut chain = vec![(0, 0, 0); input.len()];
    let mut best_len = 0;
    let mut best_right_end = 0;
    for (right_pos, &left_pos) in input.iter().enumerate() {
        let mut len_here = 1;
        let mut prev_right = usize::MAX;
        for candidate in (0..right_pos).rev() {
            let (cand_len, cand_left_pos, _) = chain[candidate];
            if cand_left_pos < left_pos {
                let extended = cand_len + 1;
                if extended > len_here {
                    len_here = extended;
                    prev_right = candidate;
                    if extended > best_len {
                        best_len = extended;
                        best_right_end = right_pos;
                        // If this is the longest chain globally so far, we cannot find a
                        // longer one by using a previous value, so break early.
                        break;
                    }
                }
            }
        }
        chain[right_pos] = (len_here, left_pos, prev_right);
    }

    // Walk the back-pointers from the end of the globally longest chain.
    let mut result = vec![];
    let mut right_pos = best_right_end;
    loop {
        let (_, left_pos, prev_right) = chain[right_pos];
        result.push((left_pos, right_pos));
        if prev_right == usize::MAX {
            break;
        }
        right_pos = prev_right;
    }
    result.reverse();

    result
}
171
/// Finds unchanged ranges among the ones given as arguments. The data between
/// those ranges is ignored.
///
/// Returns pairs of (byte range in `left`, matching byte range in `right`),
/// each taken verbatim from `left_ranges`/`right_ranges`.
pub(crate) fn unchanged_ranges(
    left: &[u8],
    right: &[u8],
    left_ranges: &[Range<usize>],
    right_ranges: &[Range<usize>],
) -> Vec<(Range<usize>, Range<usize>)> {
    // Nothing can match if either side has no tokens.
    if left_ranges.is_empty() || right_ranges.is_empty() {
        return vec![];
    }

    // Arbitrary cap on how often a word may occur and still serve as an
    // anchor for matching.
    let max_occurrences = 100;
    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
        // If there are very many occurrences of all words, then we just give up.
        return vec![];
    }
    let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
    // Look for words with few occurrences in `left` (could equally well have picked
    // `right`?). If any of them also occur in `right`, then we add the words to
    // the LCS.
    let mut uncommon_shared_words = vec![];
    while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
        // Consume the least common count bucket first; stop as soon as one
        // bucket yields any shared words.
        let left_words = left_histogram
            .count_to_words
            .first_entry()
            .map(|x| x.remove())
            .unwrap();
        for left_word in left_words {
            if right_histogram.word_to_positions.contains_key(left_word) {
                uncommon_shared_words.push(left_word);
            }
        }
    }
    if uncommon_shared_words.is_empty() {
        // No usable anchors; report everything as changed.
        return vec![];
    }

    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
    // the least common words to be "a" and "b". We now assume that each
    // occurrence of each word lines up in the left and right input. We do that
    // by numbering the shared occurrences, effectively instead comparing "a1 b1
    // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
    // right input in order (["a1", "b1", "b2", "a2"]), and record the index of
    // that word in the left input ([0,1,3,2]). We then find the LCS and split
    // points based on that ([0,1,3] or [0,1,2] are both valid).

    // [(index into left_ranges, word, occurrence #)]
    let mut left_positions = vec![];
    let mut right_positions = vec![];
    for uncommon_shared_word in uncommon_shared_words {
        let left_occurrences = left_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        let right_occurrences = right_histogram
            .word_to_positions
            .get_mut(uncommon_shared_word)
            .unwrap();
        // Pair occurrences by ordinal; surplus occurrences on the longer side
        // are ignored.
        let shared_count = min(left_occurrences.len(), right_occurrences.len());
        for occurrence in 0..shared_count {
            left_positions.push((
                left_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
            right_positions.push((
                right_occurrences[occurrence],
                uncommon_shared_word,
                occurrence,
            ));
        }
    }
    // Sort by token position so both lists follow input order.
    left_positions.sort();
    right_positions.sort();
    let mut left_position_map = HashMap::new();
    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
        left_position_map.insert((*word, *occurrence), i);
    }
    // For each right-side occurrence, record the index of the same
    // (word, occurrence) on the left side — this is find_lcs()'s input.
    let mut left_index_by_right_index = vec![];
    for (_pos, word, occurrence) in &right_positions {
        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
    }

    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output ranges, recursing into the modified areas between the elements
    // in the LCS.
    let mut result = vec![];
    let mut previous_left_position = 0;
    let mut previous_right_position = 0;
    for (left_index, right_index) in lcs {
        let left_position = left_positions[left_index].0;
        let right_position = right_positions[right_index].0;
        let skipped_left_positions = previous_left_position..left_position;
        let skipped_right_positions = previous_right_position..right_position;
        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
            for unchanged_nested_range in unchanged_ranges(
                left,
                right,
                &left_ranges[skipped_left_positions.clone()],
                &right_ranges[skipped_right_positions.clone()],
            ) {
                result.push(unchanged_nested_range);
            }
        }
        result.push((
            left_ranges[left_position].clone(),
            right_ranges[right_position].clone(),
        ));
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
    // Also recurse into range at end (after common ranges).
    let skipped_left_positions = previous_left_position..left_ranges.len();
    let skipped_right_positions = previous_right_position..right_ranges.len();
    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
        for unchanged_nested_range in unchanged_ranges(
            left,
            right,
            &left_ranges[skipped_left_positions],
            &right_ranges[skipped_right_positions],
        ) {
            result.push(unchanged_nested_range);
        }
    }

    result
}
302
/// A region that is the same in the base input and every other input.
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
    // The range in the base input.
    base_range: Range<usize>,
    // One entry per non-base input: the start of the matching region relative
    // to `base_range.start`. Stored as `isize` because the region may start
    // before the base region does.
    offsets: Vec<isize>,
}
308
impl UnchangedRange {
    /// The start of this range in the non-base input with index `side`.
    /// The offset may be negative; casting it to `usize` and using
    /// `wrapping_add` implements signed addition on the unsigned position.
    fn start(&self, side: usize) -> usize {
        self.base_range
            .start
            .wrapping_add(self.offsets[side] as usize)
    }

    /// The end of this range in the non-base input with index `side`.
    /// Same wrapping-arithmetic trick as `start()`.
    fn end(&self, side: usize) -> usize {
        self.base_range
            .end
            .wrapping_add(self.offsets[side] as usize)
    }
}
322
impl PartialOrd for UnchangedRange {
    // Delegates to `Ord` so the two orderings can never disagree.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
328
329impl Ord for UnchangedRange {
330 fn cmp(&self, other: &Self) -> Ordering {
331 self.base_range
332 .start
333 .cmp(&other.base_range.start)
334 .then_with(|| self.base_range.end.cmp(&other.base_range.end))
335 }
336}
337
/// Takes any number of inputs and finds regions that are the same between all
/// of them.
#[derive(Clone, Debug)]
pub struct Diff<'input> {
    // The first input; unchanged regions are expressed as ranges in it.
    base_input: &'input [u8],
    // The remaining inputs, each diffed against `base_input`.
    other_inputs: Vec<&'input [u8]>,
    // The key is a range in the base input. The value is the start of each non-base region
    // relative to the base region's start. By making them relative, they don't need to change
    // when the base range changes.
    unchanged_regions: Vec<UnchangedRange>,
}
349
/// Takes the current regions and intersects it with the new unchanged ranges
/// from a 2-way diff. The result is a map of unchanged regions with one more
/// offset in the map's values.
///
/// NOTE(review): the single forward pass appears to assume both lists are
/// sorted by base position — confirm at call sites.
fn intersect_regions(
    current_ranges: Vec<UnchangedRange>,
    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
) -> Vec<UnchangedRange> {
    let mut result = vec![];
    let mut current_ranges_iter = current_ranges.into_iter().peekable();
    for (new_base_range, other_range) in new_unchanged_ranges.iter() {
        // An unchanged range must be equally long on both sides.
        assert_eq!(new_base_range.len(), other_range.len());
        while let Some(UnchangedRange {
            base_range,
            offsets,
        }) = current_ranges_iter.peek()
        {
            // No need to look further if we're past the new range.
            if base_range.start >= new_base_range.end {
                break;
            }
            // Discard any current unchanged regions that don't match between the base and
            // the new input.
            if base_range.end <= new_base_range.start {
                current_ranges_iter.next();
                continue;
            }
            // Emit the overlap, extended with the new input's offset. The
            // wrapping_sub + cast encodes a possibly negative offset as isize.
            let new_start = max(base_range.start, new_base_range.start);
            let new_end = min(base_range.end, new_base_range.end);
            let mut new_offsets = offsets.clone();
            new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
            result.push(UnchangedRange {
                base_range: new_start..new_end,
                offsets: new_offsets,
            });
            if base_range.end >= new_base_range.end {
                // Break without consuming the item; there may be other new ranges that overlap
                // with it.
                break;
            }
            current_ranges_iter.next();
        }
    }
    result
}
394
impl<'input> Diff<'input> {
    /// Diffs `inputs` after splitting each of them into tokens with
    /// `tokenizer`. The first input is the "base"; every other input is
    /// diffed against it and the resulting unchanged regions are
    /// intersected.
    ///
    /// Panics if `inputs` is empty.
    pub fn for_tokenizer(
        inputs: &[&'input [u8]],
        tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>,
    ) -> Self {
        assert!(!inputs.is_empty());
        let base_input = inputs[0];
        let other_inputs = inputs.iter().skip(1).copied().collect_vec();
        // First tokenize each input
        let base_token_ranges: Vec<Range<usize>> = tokenizer(base_input);
        let other_token_ranges: Vec<Vec<Range<usize>>> = other_inputs
            .iter()
            .map(|other_input| tokenizer(other_input))
            .collect_vec();

        // Look for unchanged regions. Initially consider the whole range of the base
        // input as unchanged (compared to itself). Then diff each other input
        // against the base. Intersect the previously found ranges with the
        // unchanged ranges in the diff.
        let mut unchanged_regions = vec![UnchangedRange {
            base_range: 0..base_input.len(),
            offsets: vec![],
        }];
        for (i, other_token_ranges) in other_token_ranges.iter().enumerate() {
            let unchanged_diff_ranges = unchanged_ranges(
                base_input,
                other_inputs[i],
                &base_token_ranges,
                other_token_ranges,
            );
            unchanged_regions = intersect_regions(unchanged_regions, &unchanged_diff_ranges);
        }
        // Add an empty range at the end to make life easier for hunks().
        // Each offset is the input's length difference vs the base (possibly
        // negative, hence wrapping_sub + cast).
        let offsets = other_inputs
            .iter()
            .map(|input| input.len().wrapping_sub(base_input.len()) as isize)
            .collect_vec();
        unchanged_regions.push(UnchangedRange {
            base_range: base_input.len()..base_input.len(),
            offsets,
        });

        let mut diff = Self {
            base_input,
            other_inputs,
            unchanged_regions,
        };
        diff.compact_unchanged_regions();
        diff
    }

    /// Diffs without tokenizing, i.e. without looking for any unchanged
    /// regions (only the empty sentinel region at the end is kept).
    pub fn unrefined(inputs: &[&'input [u8]]) -> Self {
        Diff::for_tokenizer(inputs, &|_| vec![])
    }

    // TODO: At least when merging, it's wasteful to refine the diff if e.g. if 2
    // out of 3 inputs match in the differing regions. Perhaps the refine()
    // method should be on the hunk instead (probably returning a new Diff)?
    // That would let each user decide which hunks to refine. However, it would
    // probably mean that many callers repeat the same code. Perhaps it
    // should be possible to refine a whole diff *or* individual hunks.
    /// Diffs at line granularity, then refines the changed regions first at
    /// word granularity and then at single non-word-byte granularity.
    pub fn default_refinement(inputs: &[&'input [u8]]) -> Self {
        let mut diff = Diff::for_tokenizer(inputs, &find_line_ranges);
        diff.refine_changed_regions(&find_word_ranges);
        diff.refine_changed_regions(&find_nonword_ranges);
        diff
    }

    /// Returns an iterator over the matching and differing hunks, in input
    /// order.
    pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
        let previous_offsets = vec![0; self.other_inputs.len()];
        DiffHunkIterator {
            diff: self,
            previous: UnchangedRange {
                base_range: 0..0,
                offsets: previous_offsets,
            },
            // The initial (empty) region counts as already emitted so the
            // iterator starts with the content after it.
            unchanged_emitted: true,
            unchanged_iter: self.unchanged_regions.iter(),
        }
    }

    /// Uses the given tokenizer to split the changed regions into smaller
    /// regions. Then tries to finds unchanged regions among them.
    pub fn refine_changed_regions(&mut self, tokenizer: &impl Fn(&[u8]) -> Vec<Range<usize>>) {
        let mut previous = UnchangedRange {
            base_range: 0..0,
            offsets: vec![0; self.other_inputs.len()],
        };
        let mut new_unchanged_ranges = vec![];
        for current in self.unchanged_regions.iter() {
            // For the changed region between the previous region and the current one,
            // create a new Diff instance. Then adjust the start positions and
            // offsets to be valid in the context of the larger Diff instance
            // (`self`).
            let mut slices =
                vec![&self.base_input[previous.base_range.end..current.base_range.start]];
            for i in 0..current.offsets.len() {
                let changed_range = previous.end(i)..current.start(i);
                slices.push(&self.other_inputs[i][changed_range]);
            }

            let refined_diff = Diff::for_tokenizer(&slices, tokenizer);

            for UnchangedRange {
                base_range,
                offsets,
            } in refined_diff.unchanged_regions
            {
                // Translate the nested diff's coordinates back into `self`'s.
                let new_base_start = base_range.start + previous.base_range.end;
                let new_base_end = base_range.end + previous.base_range.end;
                let offsets = offsets
                    .into_iter()
                    .enumerate()
                    .map(|(i, offset)| offset + previous.offsets[i])
                    .collect_vec();
                new_unchanged_ranges.push(UnchangedRange {
                    base_range: new_base_start..new_base_end,
                    offsets,
                });
            }
            previous = current.clone();
        }
        // `merge` interleaves the two lists using the `Ord` impl; both are
        // sorted by base position, so the result stays sorted.
        self.unchanged_regions = self
            .unchanged_regions
            .iter()
            .cloned()
            .merge(new_unchanged_ranges)
            .collect_vec();
        self.compact_unchanged_regions();
    }

    /// Joins adjacent unchanged regions that are contiguous in the base input
    /// and have identical offsets (i.e. are contiguous in every other input
    /// too).
    fn compact_unchanged_regions(&mut self) {
        let mut compacted = vec![];
        let mut maybe_previous: Option<UnchangedRange> = None;
        for current in self.unchanged_regions.iter() {
            if let Some(previous) = maybe_previous {
                if previous.base_range.end == current.base_range.start
                    && previous.offsets == *current.offsets
                {
                    // Extend the pending region instead of emitting it.
                    maybe_previous = Some(UnchangedRange {
                        base_range: previous.base_range.start..current.base_range.end,
                        offsets: current.offsets.clone(),
                    });
                    continue;
                }
                compacted.push(previous);
            }
            maybe_previous = Some(current.clone());
        }
        if let Some(previous) = maybe_previous {
            compacted.push(previous);
        }
        self.unchanged_regions = compacted;
    }
}
550
/// One hunk of a diff: either content all inputs agree on, or one slice per
/// input for a region where they differ.
#[derive(PartialEq, Eq, Clone)]
pub enum DiffHunk<'input> {
    // The same in all inputs; the slice is taken from the base input.
    Matching(&'input [u8]),
    // Differs between inputs; one slice per input, in input order.
    Different(Vec<&'input [u8]>),
}
556
557impl Debug for DiffHunk<'_> {
558 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
559 match self {
560 DiffHunk::Matching(slice) => f
561 .debug_tuple("DiffHunk::Matching")
562 .field(&String::from_utf8_lossy(slice))
563 .finish(),
564 DiffHunk::Different(slices) => f
565 .debug_tuple("DiffHunk::Different")
566 .field(
567 &slices
568 .iter()
569 .map(|slice| String::from_utf8_lossy(slice))
570 .collect_vec(),
571 )
572 .finish(),
573 }
574 }
575}
576
/// Iterator over the hunks of a `Diff`, alternating between `Different` and
/// `Matching` hunks (empty hunks are skipped).
pub struct DiffHunkIterator<'diff, 'input> {
    diff: &'diff Diff<'input>,
    // The most recent unchanged region consumed from `unchanged_iter`.
    previous: UnchangedRange,
    // Whether the `Matching` hunk for `previous` has been yielded yet.
    unchanged_emitted: bool,
    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
}
583
impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
    type Item = DiffHunk<'input>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // First emit the pending `Matching` hunk for the previously
            // consumed unchanged region, if it is non-empty.
            if !self.unchanged_emitted {
                self.unchanged_emitted = true;
                if !self.previous.base_range.is_empty() {
                    return Some(DiffHunk::Matching(
                        &self.diff.base_input[self.previous.base_range.clone()],
                    ));
                }
            }
            if let Some(current) = self.unchanged_iter.next() {
                // Collect the changed slice between the previous unchanged
                // region and the current one, for the base input and for each
                // other input (via the region's per-side start/end).
                let mut slices = vec![
                    &self.diff.base_input[self.previous.base_range.end..current.base_range.start],
                ];
                for (i, input) in self.diff.other_inputs.iter().enumerate() {
                    slices.push(&input[self.previous.end(i)..current.start(i)]);
                }
                self.previous = current.clone();
                self.unchanged_emitted = false;
                // Skip the hunk entirely if every input's slice is empty.
                if slices.iter().any(|slice| !slice.is_empty()) {
                    return Some(DiffHunk::Different(slices));
                }
            } else {
                // No more unchanged regions; the trailing sentinel region
                // added by for_tokenizer() guarantees we've covered the ends.
                break;
            }
        }
        None
    }
}
616
617/// Diffs two slices of bytes. The returned diff hunks may be any length (may
618/// span many lines or may be only part of a line). This currently uses
619/// Histogram diff (or maybe something similar; I'm not sure I understood the
620/// algorithm correctly). It first diffs lines in the input and then refines
621/// the changed ranges at the word level.
622pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
623 if left == right {
624 return vec![DiffHunk::Matching(left)];
625 }
626 if left.is_empty() {
627 return vec![DiffHunk::Different(vec![b"", right])];
628 }
629 if right.is_empty() {
630 return vec![DiffHunk::Different(vec![left, b""])];
631 }
632
633 Diff::default_refinement(&[left, right])
634 .hunks()
635 .collect_vec()
636}
637
638#[cfg(test)]
639mod tests {
640 use super::*;
641
642 // Extracted to a function because type inference is ambiguous due to
643 // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>`
644 fn no_ranges() -> Vec<Range<usize>> {
645 vec![]
646 }
647
    // Tests for the line/word tokenizers.

    #[test]
    fn test_find_line_ranges_empty() {
        assert_eq!(find_line_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_line_ranges_blank_line() {
        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
    }

    #[test]
    fn test_find_line_ranges_missing_newline_at_eof() {
        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
    }

    #[test]
    fn test_find_line_ranges_multiple_lines() {
        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
    }

    #[test]
    fn test_find_word_ranges_empty() {
        assert_eq!(find_word_ranges(b""), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_single_word() {
        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_no_word() {
        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
    }

    #[test]
    fn test_find_word_ranges_word_then_non_word() {
        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
    }

    #[test]
    fn test_find_word_ranges_non_word_then_word() {
        assert_eq!(find_word_ranges(b" Abc"), vec![3..6]);
    }

    #[test]
    fn test_find_word_ranges_multibyte() {
        // "⊢" is three bytes in UTF-8 and all of them count as word bytes.
        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3])
    }

    // Tests for find_lcs(); see its doc comment for the input encoding.

    #[test]
    fn test_find_lcs_empty() {
        let empty: Vec<(usize, usize)> = vec![];
        assert_eq!(find_lcs(&[]), empty);
    }

    #[test]
    fn test_find_lcs_single_element() {
        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
    }

    #[test]
    fn test_find_lcs_in_order() {
        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
    }

    #[test]
    fn test_find_lcs_reverse_order() {
        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
    }

    #[test]
    fn test_find_lcs_two_swapped() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_earlier() {
        assert_eq!(
            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_element_moved_later() {
        assert_eq!(
            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
        );
    }

    #[test]
    fn test_find_lcs_interleaved_longest_chains() {
        assert_eq!(
            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
        );
    }

    #[test]
    fn test_find_word_ranges_many_words() {
        assert_eq!(
            find_word_ranges(b"fn find_words(text: &[u8])"),
            vec![0..2, 3..13, 14..18, 22..24]
        );
    }

    // Tests for unchanged_ranges(); token ranges are passed in explicitly.

    #[test]
    fn test_unchanged_ranges_insert_in_middle() {
        assert_eq!(
            unchanged_ranges(
                b"a b b c",
                b"a b X b c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7, 8..9],
            ),
            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_removed() {
        assert_eq!(
            unchanged_ranges(
                b"a a a a",
                b"a b a c",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (2..3, 4..5)]
        );
    }

    #[test]
    fn test_unchanged_ranges_non_unique_added() {
        assert_eq!(
            unchanged_ranges(
                b"a b a c",
                b"a a a a",
                &[0..1, 2..3, 4..5, 6..7],
                &[0..1, 2..3, 4..5, 6..7],
            ),
            vec![(0..1, 0..1), (4..5, 2..3)]
        );
    }
797
    // Tests for intersect_regions().

    #[test]
    fn test_intersect_regions_existing_empty() {
        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
        let expected = vec![];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_ranges_within_existing() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..70,
                offsets: vec![3],
            }],
            &[(25..30, 35..40), (40..50, 40..50)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 25..30,
                offsets: vec![3, 10],
            },
            UnchangedRange {
                base_range: 40..50,
                offsets: vec![3, 0],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_partial_overlap() {
        let actual = intersect_regions(
            vec![UnchangedRange {
                base_range: 20..50,
                offsets: vec![-3],
            }],
            &[(15..25, 5..15), (45..60, 55..70)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..25,
                offsets: vec![-3, -10],
            },
            UnchangedRange {
                base_range: 45..50,
                offsets: vec![-3, 10],
            },
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
        let actual = intersect_regions(
            vec![
                UnchangedRange {
                    base_range: 20..50,
                    offsets: vec![3, -8],
                },
                UnchangedRange {
                    base_range: 70..80,
                    offsets: vec![7, 1],
                },
            ],
            &[(10..100, 5..95)],
        );
        let expected = vec![
            UnchangedRange {
                base_range: 20..50,
                offsets: vec![3, -8, -5],
            },
            UnchangedRange {
                base_range: 70..80,
                offsets: vec![7, 1, -5],
            },
        ];
        assert_eq!(actual, expected);
    }
876
    // End-to-end tests for Diff and the diff() convenience function.

    #[test]
    fn test_diff_single_input() {
        let diff = Diff::default_refinement(&[b"abc"]);
        assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]);
    }

    #[test]
    fn test_diff_single_empty_input() {
        let diff = Diff::default_refinement(&[b""]);
        assert_eq!(diff.hunks().collect_vec(), vec![]);
    }

    #[test]
    fn test_diff_two_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_one_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b", b"X", b"b"]),
                DiffHunk::Matching(b" c"),
            ]
        );
    }

    #[test]
    fn test_diff_multiple_inputs_all_different() {
        let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]);
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"b ", b"X ", b""]),
                DiffHunk::Matching(b"c"),
                DiffHunk::Different(vec![b"", b"", b" X"]),
            ]
        );
    }

    #[test]
    fn test_diff_for_tokenizer_compacted() {
        // Tests that unchanged regions are compacted when using for_tokenizer()
        let diff = Diff::for_tokenizer(
            &[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"],
            &find_line_ranges,
        );
        assert_eq!(
            diff.hunks().collect_vec(),
            vec![
                DiffHunk::Matching(b"a\nb\nc\n"),
                DiffHunk::Different(vec![b"d\n", b"X\n"]),
                DiffHunk::Matching(b"e\nf\ng"),
            ]
        );
    }

    #[test]
    fn test_diff_nothing_in_common() {
        assert_eq!(
            diff(b"aaa", b"bb"),
            vec![DiffHunk::Different(vec![b"aaa", b"bb"])]
        );
    }

    #[test]
    fn test_diff_insert_in_middle() {
        assert_eq!(
            diff(b"a z", b"a S z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"", b"S "]),
                DiffHunk::Matching(b"z"),
            ]
        );
    }

    #[test]
    fn test_diff_no_unique_middle_flips() {
        assert_eq!(
            diff(b"a R R S S z", b"a S S R R z"),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"R R ", b""]),
                DiffHunk::Matching(b"S S "),
                DiffHunk::Different(vec![b"", b"R R "]),
                DiffHunk::Matching(b"z")
            ],
        );
    }

    #[test]
    fn test_diff_recursion_needed() {
        assert_eq!(
            diff(
                b"a q x q y q z q b q y q x q c",
                b"a r r x q y z q b y q x r r c",
            ),
            vec![
                DiffHunk::Matching(b"a "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"x q y "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"z q b "),
                DiffHunk::Different(vec![b"q ", b""]),
                DiffHunk::Matching(b"y q x "),
                DiffHunk::Different(vec![b"q", b"r"]),
                DiffHunk::Matching(b" "),
                DiffHunk::Different(vec![b"", b"r "]),
                DiffHunk::Matching(b"c"),
            ]
        );
    }
1004
    #[test]
    fn test_diff_real_case_write_fmt() {
        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
        // need for recursion into the range at the end: after splitting at "Arguments"
        // and "formatter", the region at the end has the unique words "write_fmt"
        // and "fmt", but we forgot to recurse into that region, so we ended up
        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
        assert_eq!(diff(
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
            b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
        ),
        vec![
            DiffHunk::Matching(b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "),
            DiffHunk::Different(vec![b"", b"-> io::Result<()> "]),
            DiffHunk::Matching(b"{\n        self.styler().write_fmt(fmt)"),
            DiffHunk::Different(vec![b".unwrap()", b""]),
            DiffHunk::Matching(b"\n")
        ]
        );
    }
1025
    #[test]
    fn test_diff_real_case_gitgit_read_tree_c() {
        // This is the diff from commit e497ea2a9b in the git.git repo
        //
        // The two raw-string inputs below are the old and new versions of
        // read-tree.c from that commit (tab-indented C source, reproduced
        // byte-for-byte). The expected value checks the exact hunk sequence
        // the diff produces, down to sub-line granularity — e.g. the
        // `printf(...)` -> `data = read_sha1_file(...)` rewrite is split into
        // several small Matching/Different hunks sharing "(", "sha1", ")".
        //
        // In each Different hunk, element 0 is the old side's content and
        // element 1 is the new side's; an empty slice on one side marks a
        // pure insertion or deletion (e.g. the new create_directories()/
        // create_file() helpers appear only as element 1).
        assert_eq!(
            diff(
                // Old version of read-tree.c:
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		unsigned int mode;
		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
                // New version of read-tree.c:
                br##"/*
 * GIT - The information manager from hell
 *
 * Copyright (C) Linus Torvalds, 2005
 */
#include "#cache.h"

static void create_directories(const char *path)
{
	int len = strlen(path);
	char *buf = malloc(len + 1);
	const char *slash = path;

	while ((slash = strchr(slash+1, '/')) != NULL) {
		len = slash - path;
		memcpy(buf, path, len);
		buf[len] = 0;
		mkdir(buf, 0700);
	}
}

static int create_file(const char *path)
{
	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
	if (fd < 0) {
		if (errno == ENOENT) {
			create_directories(path);
			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
		}
	}
	return fd;
}

static int unpack(unsigned char *sha1)
{
	void *buffer;
	unsigned long size;
	char type[20];

	buffer = read_sha1_file(sha1, type, &size);
	if (!buffer)
		usage("unable to read sha1 file");
	if (strcmp(type, "tree"))
		usage("expected a 'tree' node");
	while (size) {
		int len = strlen(buffer)+1;
		unsigned char *sha1 = buffer + len;
		char *path = strchr(buffer, ' ')+1;
		char *data;
		unsigned long filesize;
		unsigned int mode;
		int fd;

		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
			usage("corrupt 'tree' file");
		buffer = sha1 + 20;
		size -= len + 20;
		data = read_sha1_file(sha1, type, &filesize);
		if (!data || strcmp(type, "blob"))
			usage("tree file refers to bad file data");
		fd = create_file(path);
		if (fd < 0)
			usage("unable to create file");
		if (write(fd, data, filesize) != filesize)
			usage("unable to write file");
		fchmod(fd, mode);
		close(fd);
		free(data);
	}
	return 0;
}

int main(int argc, char **argv)
{
	int fd;
	unsigned char sha1[20];

	if (argc != 2)
		usage("read-tree <key>");
	if (get_sha1_hex(argv[1], sha1) < 0)
		usage("read-tree <key>");
	sha1_file_directory = getenv(DB_ENVIRONMENT);
	if (!sha1_file_directory)
		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
	if (unpack(sha1) < 0)
		usage("unpack failed");
	return 0;
}
"##,
            ),
            // Expected hunk sequence. Note the tab (\t) escapes: they must
            // match the tab indentation of the raw strings above exactly.
            vec![
                DiffHunk::Matching(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"),
                // Pure insertion: the two new helper functions.
                DiffHunk::Different(vec![b"", b"static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
                DiffHunk::Matching(b"static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"),
                DiffHunk::Different(vec![b"", b"\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
                DiffHunk::Matching(b"\t\tunsigned int mode;\n"),
                DiffHunk::Different(vec![b"", b"\t\tint fd;\n\n"]),
                DiffHunk::Matching(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"),
                // Sub-line split of the printf -> read_sha1_file rewrite:
                // the shared "(", "sha1", and ")" tokens stay Matching.
                DiffHunk::Different(vec![b"printf", b"data = read_sha1_file"]),
                DiffHunk::Matching(b"("),
                DiffHunk::Different(vec![b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""]),
                DiffHunk::Matching(b"sha1"),
                DiffHunk::Different(vec![b"", b", type, &filesize"]),
                DiffHunk::Matching(b")"),
                DiffHunk::Different(vec![b")", b""]),
                DiffHunk::Matching(b";\n"),
                DiffHunk::Different(vec![b"", b"\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data);\n"]),
                DiffHunk::Matching(b"\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n")
            ]
        );
    }
1191}