1// Copyright 2022 The Gitea Authors. All rights reserved.
2// SPDX-License-Identifier: MIT
3
4package gitdiff
5
6import (
7 "strings"
8
9 "forgejo.org/modules/highlight"
10
11 "github.com/sergi/go-diff/diffmatchpatch"
12)
13
14// token is a html tag or entity, eg: "<span ...>", "</span>", "<"
15func extractHTMLToken(s string) (before, token, after string, valid bool) {
16 for pos1 := 0; pos1 < len(s); pos1++ {
17 switch s[pos1] {
18 case '<':
19 pos2 := strings.IndexByte(s[pos1:], '>')
20 if pos2 == -1 {
21 return "", "", s, false
22 }
23 return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
24 case '&':
25 pos2 := strings.IndexByte(s[pos1:], ';')
26 if pos2 == -1 {
27 return "", "", s, false
28 }
29 return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
30 }
31 }
32 return "", "", s, true
33}
34
35// HighlightCodeDiff is used to do diff with highlighted HTML code.
36// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
37// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
38// These Unicode placeholders are friendly to the diff.
39// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
40// It's guaranteed that the tags in final diff result are paired correctly.
41type HighlightCodeDiff struct {
42 placeholderBegin rune
43 placeholderMaxCount int
44 placeholderIndex int
45 PlaceholderTokenMap map[rune]string
46 tokenPlaceholderMap map[string]rune
47
48 placeholderOverflowCount int
49
50 lineWrapperTags []string
51}
52
53func NewHighlightCodeDiff() *HighlightCodeDiff {
54 return &HighlightCodeDiff{
55 placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
56 placeholderMaxCount: 64000,
57 PlaceholderTokenMap: map[rune]string{},
58 tokenPlaceholderMap: map[string]rune{},
59 }
60}
61
62// NextPlaceholder returns 0 if no more placeholder can be used
63// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
64// so the placeholderMaxCount is impossible to be exhausted in real cases.
65func (hcd *HighlightCodeDiff) NextPlaceholder() rune {
66 for hcd.placeholderIndex < hcd.placeholderMaxCount {
67 r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
68 hcd.placeholderIndex++
69 // only use non-existing (not used by code) rune as placeholders
70 if _, ok := hcd.PlaceholderTokenMap[r]; !ok {
71 return r
72 }
73 }
74 return 0 // no more available placeholder
75}
76
77func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool {
78 return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
79}
80
81func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) {
82 for _, r := range code {
83 if hcd.isInPlaceholderRange(r) {
84 // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
85 hcd.PlaceholderTokenMap[r] = ""
86 }
87 }
88}
89
90func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
91 hcd.CollectUsedRunes(codeA)
92 hcd.CollectUsedRunes(codeB)
93
94 highlightCodeA, _ := highlight.Code(filename, language, codeA)
95 highlightCodeB, _ := highlight.Code(filename, language, codeB)
96
97 convertedCodeA := hcd.ConvertToPlaceholders(string(highlightCodeA))
98 convertedCodeB := hcd.ConvertToPlaceholders(string(highlightCodeB))
99
100 diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true)
101 diffs = diffMatchPatch.DiffCleanupSemantic(diffs)
102 diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
103
104 for i := range diffs {
105 hcd.recoverOneDiff(&diffs[i])
106 }
107 return diffs
108}
109
110// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
111func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string {
112 var tagStack []string
113 res := strings.Builder{}
114
115 firstRunForLineTags := hcd.lineWrapperTags == nil
116
117 var beforeToken, token string
118 var valid bool
119
120 // the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
121 for {
122 beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
123 if !valid || token == "" {
124 break
125 }
126 // write the content before the token into result string, and consume the token in the string
127 res.WriteString(beforeToken)
128
129 // the line wrapper tags should be removed before diff
130 if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
131 if firstRunForLineTags {
132 // if this is the first run for converting, save the line wrapper tags for later use, they should be added back
133 hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
134 }
135 htmlCode = strings.TrimSuffix(htmlCode, "</span>")
136 continue
137 }
138
139 var tokenInMap string
140 if strings.HasSuffix(token, "</") { // for closing tag
141 if len(tagStack) == 0 {
142 break // invalid diff result, no opening tag but see closing tag
143 }
144 // make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
145 // the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
146 tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
147 tagStack = tagStack[:len(tagStack)-1]
148 } else if token[0] == '<' { // for opening tag
149 tokenInMap = token
150 tagStack = append(tagStack, token)
151 } else if token[0] == '&' { // for html entity
152 tokenInMap = token
153 } // else: impossible
154
155 // remember the placeholder and token in the map
156 placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
157 if !ok {
158 placeholder = hcd.NextPlaceholder()
159 if placeholder != 0 {
160 hcd.tokenPlaceholderMap[tokenInMap] = placeholder
161 hcd.PlaceholderTokenMap[placeholder] = tokenInMap
162 }
163 }
164
165 if placeholder != 0 {
166 res.WriteRune(placeholder) // use the placeholder to replace the token
167 } else {
168 // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
169 // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
170 hcd.placeholderOverflowCount++
171 if strings.HasPrefix(token, "&") {
172 // when the token is a html entity, something must be outputted even if there is no placeholder.
173 res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully?
174 res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
175 }
176 }
177 }
178
179 // write the remaining string
180 res.WriteString(htmlCode)
181 return res.String()
182}
183
184func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
185 diff.Text = hcd.Recover(diff.Text)
186}
187
188func (hcd *HighlightCodeDiff) Recover(src string) string {
189 sb := strings.Builder{}
190 var tagStack []string
191
192 for _, r := range src {
193 token, ok := hcd.PlaceholderTokenMap[r]
194 if !ok || token == "" {
195 sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
196 continue
197 }
198 var tokenToRecover string
199 if strings.HasPrefix(token, "</") { // for closing tag
200 // only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
201 tokenToRecover = token[:strings.IndexByte(token, '>')+1]
202 if len(tagStack) == 0 {
203 continue // if no opening tag in stack yet, skip the closing tag
204 }
205 tagStack = tagStack[:len(tagStack)-1]
206 } else if token[0] == '<' { // for opening tag
207 tokenToRecover = token
208 tagStack = append(tagStack, token)
209 } else if token[0] == '&' { // for html entity
210 tokenToRecover = token
211 } // else: impossible
212 sb.WriteString(tokenToRecover)
213 }
214
215 if len(tagStack) > 0 {
216 // close all opening tags
217 for i := len(tagStack) - 1; i >= 0; i-- {
218 tagToClose := tagStack[i]
219 // get the closing tag "</span>" from "<span class=...>" or "<span>"
220 pos := strings.IndexAny(tagToClose, " >")
221 if pos != -1 {
222 sb.WriteString("</" + tagToClose[1:pos] + ">")
223 } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
224 }
225 }
226
227 return sb.String()
228}