loading up the forgejo repo on tangled to test page performance
at forgejo 8.2 kB view raw
1// Copyright 2022 The Gitea Authors. All rights reserved. 2// SPDX-License-Identifier: MIT 3 4package gitdiff 5 6import ( 7 "strings" 8 9 "forgejo.org/modules/highlight" 10 11 "github.com/sergi/go-diff/diffmatchpatch" 12) 13 14// token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;" 15func extractHTMLToken(s string) (before, token, after string, valid bool) { 16 for pos1 := 0; pos1 < len(s); pos1++ { 17 switch s[pos1] { 18 case '<': 19 pos2 := strings.IndexByte(s[pos1:], '>') 20 if pos2 == -1 { 21 return "", "", s, false 22 } 23 return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true 24 case '&': 25 pos2 := strings.IndexByte(s[pos1:], ';') 26 if pos2 == -1 { 27 return "", "", s, false 28 } 29 return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true 30 } 31 } 32 return "", "", s, true 33} 34 35// HighlightCodeDiff is used to do diff with highlighted HTML code. 36// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. 37// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001" 38// These Unicode placeholders are friendly to the diff. 39// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities. 40// It's guaranteed that the tags in final diff result are paired correctly. 41type HighlightCodeDiff struct { 42 placeholderBegin rune 43 placeholderMaxCount int 44 placeholderIndex int 45 PlaceholderTokenMap map[rune]string 46 tokenPlaceholderMap map[string]rune 47 48 placeholderOverflowCount int 49 50 lineWrapperTags []string 51} 52 53func NewHighlightCodeDiff() *HighlightCodeDiff { 54 return &HighlightCodeDiff{ 55 placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD) 56 placeholderMaxCount: 64000, 57 PlaceholderTokenMap: map[rune]string{}, 58 tokenPlaceholderMap: map[string]rune{}, 59 } 60} 61 62// NextPlaceholder returns 0 if no more placeholder can be used 63// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line 64// so the placeholderMaxCount is impossible to be exhausted in real cases. 65func (hcd *HighlightCodeDiff) NextPlaceholder() rune { 66 for hcd.placeholderIndex < hcd.placeholderMaxCount { 67 r := hcd.placeholderBegin + rune(hcd.placeholderIndex) 68 hcd.placeholderIndex++ 69 // only use non-existing (not used by code) rune as placeholders 70 if _, ok := hcd.PlaceholderTokenMap[r]; !ok { 71 return r 72 } 73 } 74 return 0 // no more available placeholder 75} 76 77func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool { 78 return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount) 79} 80 81func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) { 82 for _, r := range code { 83 if hcd.isInPlaceholderRange(r) { 84 // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore. 85 hcd.PlaceholderTokenMap[r] = "" 86 } 87 } 88} 89 90func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff { 91 hcd.CollectUsedRunes(codeA) 92 hcd.CollectUsedRunes(codeB) 93 94 highlightCodeA, _ := highlight.Code(filename, language, codeA) 95 highlightCodeB, _ := highlight.Code(filename, language, codeB) 96 97 convertedCodeA := hcd.ConvertToPlaceholders(string(highlightCodeA)) 98 convertedCodeB := hcd.ConvertToPlaceholders(string(highlightCodeB)) 99 100 diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true) 101 diffs = diffMatchPatch.DiffCleanupSemantic(diffs) 102 diffs = diffMatchPatch.DiffCleanupEfficiency(diffs) 103 104 for i := range diffs { 105 hcd.recoverOneDiff(&diffs[i]) 106 } 107 return diffs 108} 109 110// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. 111func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string { 112 var tagStack []string 113 res := strings.Builder{} 114 115 firstRunForLineTags := hcd.lineWrapperTags == nil 116 117 var beforeToken, token string 118 var valid bool 119 120 // the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>" 121 for { 122 beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode) 123 if !valid || token == "" { 124 break 125 } 126 // write the content before the token into result string, and consume the token in the string 127 res.WriteString(beforeToken) 128 129 // the line wrapper tags should be removed before diff 130 if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) { 131 if firstRunForLineTags { 132 // if this is the first run for converting, save the line wrapper tags for later use, they should be added back 133 hcd.lineWrapperTags = append(hcd.lineWrapperTags, token) 134 } 135 htmlCode = strings.TrimSuffix(htmlCode, "</span>") 136 continue 137 } 138 139 var tokenInMap string 140 if strings.HasSuffix(token, "</") { // for closing tag 141 if len(tagStack) == 0 { 142 break // invalid diff result, no opening tag but see closing tag 143 } 144 // make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags 145 // the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>" 146 tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->" 147 tagStack = tagStack[:len(tagStack)-1] 148 } else if token[0] == '<' { // for opening tag 149 tokenInMap = token 150 tagStack = append(tagStack, token) 151 } else if token[0] == '&' { // for html entity 152 tokenInMap = token 153 } // else: impossible 154 155 // remember the placeholder and token in the map 156 placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap] 157 if !ok { 158 placeholder = hcd.NextPlaceholder() 159 if placeholder != 0 { 160 hcd.tokenPlaceholderMap[tokenInMap] = placeholder 161 hcd.PlaceholderTokenMap[placeholder] = tokenInMap 162 } 163 } 164 165 if placeholder != 0 { 166 res.WriteRune(placeholder) // use the placeholder to replace the token 167 } else { 168 // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting 169 // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma. 170 hcd.placeholderOverflowCount++ 171 if strings.HasPrefix(token, "&") { 172 // when the token is a html entity, something must be outputted even if there is no placeholder. 173 res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully? 174 res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result. 175 } 176 } 177 } 178 179 // write the remaining string 180 res.WriteString(htmlCode) 181 return res.String() 182} 183 184func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) { 185 diff.Text = hcd.Recover(diff.Text) 186} 187 188func (hcd *HighlightCodeDiff) Recover(src string) string { 189 sb := strings.Builder{} 190 var tagStack []string 191 192 for _, r := range src { 193 token, ok := hcd.PlaceholderTokenMap[r] 194 if !ok || token == "" { 195 sb.WriteRune(r) // if the rune is not a placeholder, write it as it is 196 continue 197 } 198 var tokenToRecover string 199 if strings.HasPrefix(token, "</") { // for closing tag 200 // only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function) 201 tokenToRecover = token[:strings.IndexByte(token, '>')+1] 202 if len(tagStack) == 0 { 203 continue // if no opening tag in stack yet, skip the closing tag 204 } 205 tagStack = tagStack[:len(tagStack)-1] 206 } else if token[0] == '<' { // for opening tag 207 tokenToRecover = token 208 tagStack = append(tagStack, token) 209 } else if token[0] == '&' { // for html entity 210 tokenToRecover = token 211 } // else: impossible 212 sb.WriteString(tokenToRecover) 213 } 214 215 if len(tagStack) > 0 { 216 // close all opening tags 217 for i := len(tagStack) - 1; i >= 0; i-- { 218 tagToClose := tagStack[i] 219 // get the closing tag "</span>" from "<span class=...>" or "<span>" 220 pos := strings.IndexAny(tagToClose, " >") 221 if pos != -1 { 222 sb.WriteString("</" + tagToClose[1:pos] + ">") 223 } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag 224 } 225 } 226 227 return sb.String() 228}