internal/diff/diff.go at main · pdewey.com/shutter

pdewey.com / shutter
fork atom
Approval-based snapshot testing library for Go (mirror)
fork atom
shutter / internal / diff / diff.go
at main 443 lines 12 kB view raw
wrap content
Patrick Dewey refactor: cleanup 3mo ago
a62d7719
  1package diff
  2
  3/*
  4This file was sourced from github.com/gkampitakis/go-snaps, available with the following License:
  5
  6MIT License
  7
  8Copyright (c) 2021 Georgios Kampitakis
  9
 10Permission is hereby granted, free of charge, to any person obtaining a copy
 11of this software and associated documentation files (the "Software"), to deal
 12in the Software without restriction, including without limitation the rights
 13to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14copies of the Software, and to permit persons to whom the Software is
 15furnished to do so, subject to the following conditions:
 16
 17The above copyright notice and this permission notice shall be included in all
 18copies or substantial portions of the Software.
 19
 20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 26SOFTWARE.
 27
 28=======================
 29
 30This package is a partial port of Python difflib.
 31
 32This library is vendored and modified to address the `go-snaps` needs for a more
 33readable difference report.
 34
 35
 36Original source: https://github.com/pmezard/go-difflib
 37
 38Copyright (c) 2013, Patrick Mezard
 39All rights reserved.
 40
 41Redistribution and use in source and binary forms, with or without
 42modification, are permitted provided that the following conditions are
 43met:
 44
 45    Redistributions of source code must retain the above copyright
 46notice, this list of conditions and the following disclaimer.
 47    Redistributions in binary form must reproduce the above copyright
 48notice, this list of conditions and the following disclaimer in the
 49documentation and/or other materials provided with the distribution.
 50    The names of its contributors may not be used to endorse or promote
 51products derived from this software without specific prior written
 52permission.
 53
 54THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 55IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 56TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 57PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 58HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 59SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 60TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 61PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 62LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 63NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 64SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 65*/
 66
 67import "strconv"
 68
 69type DiffKind int
 70
 71const (
 72	DiffShared DiffKind = iota
 73	DiffOld
 74	DiffNew
 75)
 76
 77type DiffLine struct {
 78	OldNumber int
 79	NewNumber int
 80	Line      string
 81	Kind      DiffKind
 82}
 83
 84const (
 85	// Tag Codes for internal opcodes
 86	opEqual int8 = iota
 87	opInsert
 88	opDelete
 89	opReplace
 90)
 91
 92type match struct {
 93	A    int
 94	B    int
 95	Size int
 96}
 97
 98type opCode struct {
 99	Tag int8
100	I1  int
101	I2  int
102	J1  int
103	J2  int
104}
105
106// sequenceMatcher compares sequence of strings. The basic
107// algorithm predates, and is a little fancier than, an algorithm
108// published in the late 1980's by Ratcliff and Obershelp under the
109// hyperbolic name "gestalt pattern matching".  The basic idea is to find
110// the longest contiguous matching subsequence that contains no "junk"
111// elements (R-O doesn't address junk).  The same idea is then applied
112// recursively to the pieces of the sequences to the left and to the right
113// of the matching subsequence.  This does not yield minimal edit
114// sequences, but does tend to yield matches that "look right" to people.
115//
116// sequenceMatcher tries to compute a "human-friendly diff" between two
117// sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
118// longest *contiguous* & junk-free matching subsequence.  That's what
119// catches peoples' eyes.  The Windows(tm) windiff has another interesting
120// notion, pairing up elements that appear uniquely in each sequence.
121// That, and the method here, appear to yield more intuitive difference
122// reports than does diff.  This method appears to be the least vulnerable
123// to synching up on blocks of "junk lines", though (like blank lines in
124// ordinary text files, or maybe "<P>" lines in HTML files).  That may be
125// because this is the only method of the 3 that has a *concept* of
126// "junk" <wink>.
127//
128// Timing:  Basic R-O is cubic time worst case and quadratic time expected
129// case.  sequenceMatcher is quadratic time for the worst case and has
130// expected-case behavior dependent in a complicated way on how many
131// elements the sequences have in common; best case time is linear.
132type sequenceMatcher struct {
133	a              []string
134	b              []string
135	b2j            map[string][]int
136	IsJunk         func(string) bool
137	autoJunk       bool
138	bJunk          map[string]struct{}
139	matchingBlocks []match
140	fullBCount     map[string]int
141	bPopular       map[string]struct{}
142	opCodes        []opCode
143}
144
145func newMatcher(a, b []string) *sequenceMatcher {
146	m := sequenceMatcher{autoJunk: true}
147	m.setSeqs(a, b)
148	return &m
149}
150
151// Set two sequences to be compared.
152func (m *sequenceMatcher) setSeqs(a, b []string) {
153	m.setSeq1(a)
154	m.setSeq2(b)
155}
156
157// Set the first sequence to be compared.
158func (m *sequenceMatcher) setSeq1(a []string) {
159	m.a = a
160	m.matchingBlocks, m.opCodes = nil, nil
161}
162
163// Set the second sequence to be compared.
164func (m *sequenceMatcher) setSeq2(b []string) {
165	m.b = b
166	m.matchingBlocks, m.opCodes, m.fullBCount = nil, nil, nil
167	m.chainB()
168}
169
170func (m *sequenceMatcher) chainB() {
171	// Populate line -> index mapping
172	b2j := map[string][]int{}
173	for i, elt := range m.b {
174		indices := b2j[elt]
175		indices = append(indices, i)
176		b2j[elt] = indices
177	}
178
179	// Purge junk elements
180	m.bJunk = map[string]struct{}{}
181	if m.IsJunk != nil {
182		junk := m.bJunk
183		for elt := range b2j {
184			if m.IsJunk(elt) {
185				junk[elt] = struct{}{}
186			}
187		}
188		for elt := range junk { // separate loop avoids separate list of keys
189			delete(b2j, elt)
190		}
191	}
192
193	// Purge popular elements that are not junk
194	popular := map[string]struct{}{}
195	n := len(m.b)
196	if m.autoJunk && n >= 200 {
197		ntest := n/100 + 1
198		for s, indices := range b2j {
199			if len(indices) > ntest {
200				popular[s] = struct{}{}
201			}
202		}
203		for s := range popular {
204			delete(b2j, s)
205		}
206	}
207	m.bPopular = popular
208	m.b2j = b2j
209}
210
211func (m *sequenceMatcher) isBJunk(s string) bool {
212	_, ok := m.bJunk[s]
213	return ok
214}
215
216// Find longest matching block in a[alo:ahi] and b[blo:bhi].
217func (m *sequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) match {
218	besti, bestj, bestsize := alo, blo, 0
219
220	// find longest junk-free match
221	j2len := map[int]int{}
222	for i := alo; i != ahi; i++ {
223		newj2len := map[int]int{}
224		for _, j := range m.b2j[m.a[i]] {
225			if j < blo {
226				continue
227			}
228			if j >= bhi {
229				break
230			}
231			k := j2len[j-1] + 1
232			newj2len[j] = k
233			if k > bestsize {
234				besti, bestj, bestsize = i-k+1, j-k+1, k
235			}
236		}
237		j2len = newj2len
238	}
239
240	// Extend the best by non-junk elements on each end
241	for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) &&
242		m.a[besti-1] == m.b[bestj-1] {
243		besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
244	}
245	for besti+bestsize < ahi && bestj+bestsize < bhi &&
246		!m.isBJunk(m.b[bestj+bestsize]) &&
247		m.a[besti+bestsize] == m.b[bestj+bestsize] {
248		bestsize++
249	}
250
251	// Suck up junk on each side
252	for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) &&
253		m.a[besti-1] == m.b[bestj-1] {
254		besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
255	}
256	for besti+bestsize < ahi && bestj+bestsize < bhi &&
257		m.isBJunk(m.b[bestj+bestsize]) &&
258		m.a[besti+bestsize] == m.b[bestj+bestsize] {
259		bestsize++
260	}
261
262	return match{A: besti, B: bestj, Size: bestsize}
263}
264
265// Return list of triples describing matching subsequences.
266func (m *sequenceMatcher) getMatchingBlocks() []match {
267	if m.matchingBlocks != nil {
268		return m.matchingBlocks
269	}
270
271	var matchBlocks func(alo, ahi, blo, bhi int, matched []match) []match
272	matchBlocks = func(alo, ahi, blo, bhi int, matched []match) []match {
273		match := m.findLongestMatch(alo, ahi, blo, bhi)
274		i, j, k := match.A, match.B, match.Size
275		if match.Size > 0 {
276			if alo < i && blo < j {
277				matched = matchBlocks(alo, i, blo, j, matched)
278			}
279			matched = append(matched, match)
280			if i+k < ahi && j+k < bhi {
281				matched = matchBlocks(i+k, ahi, j+k, bhi, matched)
282			}
283		}
284		return matched
285	}
286	matched := matchBlocks(0, len(m.a), 0, len(m.b), nil)
287
288	// Collapse adjacent equal blocks
289	nonAdjacent := []match{}
290	i1, j1, k1 := 0, 0, 0
291	for _, b := range matched {
292		i2, j2, k2 := b.A, b.B, b.Size
293		if i1+k1 == i2 && j1+k1 == j2 {
294			k1 += k2
295		} else {
296			if k1 > 0 {
297				nonAdjacent = append(nonAdjacent, match{i1, j1, k1})
298			}
299			i1, j1, k1 = i2, j2, k2
300		}
301	}
302	if k1 > 0 {
303		nonAdjacent = append(nonAdjacent, match{i1, j1, k1})
304	}
305
306	nonAdjacent = append(nonAdjacent, match{len(m.a), len(m.b), 0})
307	m.matchingBlocks = nonAdjacent
308	return m.matchingBlocks
309}
310
311// Return list of opcodes describing how to turn a into b.
312func (m *sequenceMatcher) getOpCodes() []opCode {
313	if m.opCodes != nil {
314		return m.opCodes
315	}
316	i, j := 0, 0
317	matching := m.getMatchingBlocks()
318	opCodes := make([]opCode, 0, len(matching))
319	for _, m := range matching {
320		ai, bj, size := m.A, m.B, m.Size
321		var tag int8 = 0
322		if i < ai && j < bj {
323			tag = opReplace
324		} else if i < ai {
325			tag = opDelete
326		} else if j < bj {
327			tag = opInsert
328		}
329		if tag > 0 {
330			opCodes = append(opCodes, opCode{tag, i, ai, j, bj})
331		}
332		i, j = ai+size, bj+size
333		if size > 0 {
334			opCodes = append(opCodes, opCode{opEqual, ai, i, bj, j})
335		}
336	}
337	m.opCodes = opCodes
338	return m.opCodes
339}
340
341// Histogram computes a diff between two strings using the Ratcliff-Obershelp algorithm
342func Histogram(old, new string) []DiffLine {
343	oldLines := splitLines(old)
344	newLines := splitLines(new)
345
346	matcher := newMatcher(oldLines, newLines)
347	opcodes := matcher.getOpCodes()
348
349	var result []DiffLine
350
351	for _, op := range opcodes {
352		switch op.Tag {
353		case opEqual:
354			for i := op.I1; i < op.I2; i++ {
355				newIdx := i + (op.J1 - op.I1)
356				result = append(result, DiffLine{
357					Line:      oldLines[i],
358					Kind:      DiffShared,
359					OldNumber: i + 1,
360					NewNumber: newIdx + 1,
361				})
362			}
363		case opDelete:
364			for i := op.I1; i < op.I2; i++ {
365				result = append(result, DiffLine{
366					Line:      oldLines[i],
367					Kind:      DiffOld,
368					OldNumber: i + 1,
369				})
370			}
371		case opInsert:
372			for j := op.J1; j < op.J2; j++ {
373				result = append(result, DiffLine{
374					Line:      newLines[j],
375					Kind:      DiffNew,
376					NewNumber: j + 1,
377				})
378			}
379		case opReplace:
380			for i := op.I1; i < op.I2; i++ {
381				result = append(result, DiffLine{
382					Line:      oldLines[i],
383					Kind:      DiffOld,
384					OldNumber: i + 1,
385				})
386			}
387			for j := op.J1; j < op.J2; j++ {
388				result = append(result, DiffLine{
389					Line:      newLines[j],
390					Kind:      DiffNew,
391					NewNumber: j + 1,
392				})
393			}
394		}
395	}
396
397	return result
398}
399
400// splitLines splits a string by newlines, handling empty strings
401func splitLines(s string) []string {
402	if s == "" {
403		return []string{}
404	}
405	lines := splitLinesKeepNewline(s)
406	return lines
407}
408
409// splitLinesKeepNewline splits on newlines but removes the newlines themselves
410func splitLinesKeepNewline(s string) []string {
411	var lines []string
412	var current string
413
414	for _, char := range s {
415		if char == '\n' {
416			lines = append(lines, current)
417			current = ""
418		} else {
419			current += string(char)
420		}
421	}
422
423	if current != "" {
424		lines = append(lines, current)
425	}
426
427	return lines
428}
429
430// FormatRangeUnified converts range to the "ed" format for unified diffs
431func FormatRangeUnified(start, stop int) string {
432	beginning := start + 1 // lines start numbering with one
433	length := stop - start
434
435	if length == 1 {
436		return strconv.Itoa(beginning)
437	}
438	if length == 0 {
439		beginning--
440	}
441
442	return strconv.Itoa(beginning) + "," + strconv.Itoa(length)
443}