services/gitdiff/highlightdiff.go at forgejo · oppi.li/perf-test

oppi.li / perf-test
loading up the forgejo repo on tangled to test page performance
perf-test / services / gitdiff / highlightdiff.go
at forgejo 8.2 kB view raw
  1// Copyright 2022 The Gitea Authors. All rights reserved.
  2// SPDX-License-Identifier: MIT
  3
  4package gitdiff
  5
  6import (
  7	"strings"
  8
  9	"forgejo.org/modules/highlight"
 10
 11	"github.com/sergi/go-diff/diffmatchpatch"
 12)
 13
 14// token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
 15func extractHTMLToken(s string) (before, token, after string, valid bool) {
 16	for pos1 := 0; pos1 < len(s); pos1++ {
 17		switch s[pos1] {
 18		case '<':
 19			pos2 := strings.IndexByte(s[pos1:], '>')
 20			if pos2 == -1 {
 21				return "", "", s, false
 22			}
 23			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
 24		case '&':
 25			pos2 := strings.IndexByte(s[pos1:], ';')
 26			if pos2 == -1 {
 27				return "", "", s, false
 28			}
 29			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
 30		}
 31	}
 32	return "", "", s, true
 33}
 34
 35// HighlightCodeDiff is used to do diff with highlighted HTML code.
 36// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
 37// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
 38// These Unicode placeholders are friendly to the diff.
 39// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
 40// It's guaranteed that the tags in final diff result are paired correctly.
 41type HighlightCodeDiff struct {
 42	placeholderBegin    rune
 43	placeholderMaxCount int
 44	placeholderIndex    int
 45	PlaceholderTokenMap map[rune]string
 46	tokenPlaceholderMap map[string]rune
 47
 48	placeholderOverflowCount int
 49
 50	lineWrapperTags []string
 51}
 52
 53func NewHighlightCodeDiff() *HighlightCodeDiff {
 54	return &HighlightCodeDiff{
 55		placeholderBegin:    rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
 56		placeholderMaxCount: 64000,
 57		PlaceholderTokenMap: map[rune]string{},
 58		tokenPlaceholderMap: map[string]rune{},
 59	}
 60}
 61
 62// NextPlaceholder returns 0 if no more placeholder can be used
 63// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
 64// so the placeholderMaxCount is impossible to be exhausted in real cases.
 65func (hcd *HighlightCodeDiff) NextPlaceholder() rune {
 66	for hcd.placeholderIndex < hcd.placeholderMaxCount {
 67		r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
 68		hcd.placeholderIndex++
 69		// only use non-existing (not used by code) rune as placeholders
 70		if _, ok := hcd.PlaceholderTokenMap[r]; !ok {
 71			return r
 72		}
 73	}
 74	return 0 // no more available placeholder
 75}
 76
 77func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool {
 78	return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
 79}
 80
 81func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) {
 82	for _, r := range code {
 83		if hcd.isInPlaceholderRange(r) {
 84			// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
 85			hcd.PlaceholderTokenMap[r] = ""
 86		}
 87	}
 88}
 89
 90func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
 91	hcd.CollectUsedRunes(codeA)
 92	hcd.CollectUsedRunes(codeB)
 93
 94	highlightCodeA, _ := highlight.Code(filename, language, codeA)
 95	highlightCodeB, _ := highlight.Code(filename, language, codeB)
 96
 97	convertedCodeA := hcd.ConvertToPlaceholders(string(highlightCodeA))
 98	convertedCodeB := hcd.ConvertToPlaceholders(string(highlightCodeB))
 99
100	diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true)
101	diffs = diffMatchPatch.DiffCleanupSemantic(diffs)
102	diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
103
104	for i := range diffs {
105		hcd.recoverOneDiff(&diffs[i])
106	}
107	return diffs
108}
109
110// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
111func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string {
112	var tagStack []string
113	res := strings.Builder{}
114
115	firstRunForLineTags := hcd.lineWrapperTags == nil
116
117	var beforeToken, token string
118	var valid bool
119
120	// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
121	for {
122		beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
123		if !valid || token == "" {
124			break
125		}
126		// write the content before the token into result string, and consume the token in the string
127		res.WriteString(beforeToken)
128
129		// the line wrapper tags should be removed before diff
130		if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
131			if firstRunForLineTags {
132				// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
133				hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
134			}
135			htmlCode = strings.TrimSuffix(htmlCode, "</span>")
136			continue
137		}
138
139		var tokenInMap string
140		if strings.HasSuffix(token, "</") { // for closing tag
141			if len(tagStack) == 0 {
142				break // invalid diff result, no opening tag but see closing tag
143			}
144			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
145			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
146			tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
147			tagStack = tagStack[:len(tagStack)-1]
148		} else if token[0] == '<' { // for opening tag
149			tokenInMap = token
150			tagStack = append(tagStack, token)
151		} else if token[0] == '&' { // for html entity
152			tokenInMap = token
153		} // else: impossible
154
155		// remember the placeholder and token in the map
156		placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
157		if !ok {
158			placeholder = hcd.NextPlaceholder()
159			if placeholder != 0 {
160				hcd.tokenPlaceholderMap[tokenInMap] = placeholder
161				hcd.PlaceholderTokenMap[placeholder] = tokenInMap
162			}
163		}
164
165		if placeholder != 0 {
166			res.WriteRune(placeholder) // use the placeholder to replace the token
167		} else {
168			// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
169			// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
170			hcd.placeholderOverflowCount++
171			if strings.HasPrefix(token, "&") {
172				// when the token is a html entity, something must be outputted even if there is no placeholder.
173				res.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
174				res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
175			}
176		}
177	}
178
179	// write the remaining string
180	res.WriteString(htmlCode)
181	return res.String()
182}
183
184func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
185	diff.Text = hcd.Recover(diff.Text)
186}
187
188func (hcd *HighlightCodeDiff) Recover(src string) string {
189	sb := strings.Builder{}
190	var tagStack []string
191
192	for _, r := range src {
193		token, ok := hcd.PlaceholderTokenMap[r]
194		if !ok || token == "" {
195			sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
196			continue
197		}
198		var tokenToRecover string
199		if strings.HasPrefix(token, "</") { // for closing tag
200			// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
201			tokenToRecover = token[:strings.IndexByte(token, '>')+1]
202			if len(tagStack) == 0 {
203				continue // if no opening tag in stack yet, skip the closing tag
204			}
205			tagStack = tagStack[:len(tagStack)-1]
206		} else if token[0] == '<' { // for opening tag
207			tokenToRecover = token
208			tagStack = append(tagStack, token)
209		} else if token[0] == '&' { // for html entity
210			tokenToRecover = token
211		} // else: impossible
212		sb.WriteString(tokenToRecover)
213	}
214
215	if len(tagStack) > 0 {
216		// close all opening tags
217		for i := len(tagStack) - 1; i >= 0; i-- {
218			tagToClose := tagStack[i]
219			// get the closing tag "</span>" from "<span class=...>" or "<span>"
220			pos := strings.IndexAny(tagToClose, " >")
221			if pos != -1 {
222				sb.WriteString("</" + tagToClose[1:pos] + ">")
223			} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
224		}
225	}
226
227	return sb.String()
228}