// Diffdown is a real-time collaborative Markdown editor/previewer built on the AT Protocol (diffdown.com).
// at 874486dcfe5d8c0bc1e04546e56bc4e4f052ccab · 199 lines · 7.1 kB · view raw
/* Hand-written tokenizers for HTML. */

import {ExternalTokenizer, ContextTracker} from "@lezer/lr"
import {StartTag, StartCloseTag, NoMatchStartCloseTag, MismatchedStartCloseTag, missingCloseTag,
        StartSelfClosingTag, IncompleteCloseTag, Element, OpenTag, IncompleteTag,
        StartScriptTag, scriptText, StartCloseScriptTag,
        StartStyleTag, styleText, StartCloseStyleTag,
        StartTextareaTag, textareaText, StartCloseTextareaTag,
        Dialect_noMatch, Dialect_selfClosing, EndTag, SelfClosingEndTag,
        commentContent as cmntContent} from "./parser.terms.js"

// Tag names that get a StartSelfClosingTag token instead of StartTag.
const selfClosers = {
  area: true, base: true, br: true, col: true, command: true,
  embed: true, frame: true, hr: true, img: true, input: true,
  keygen: true, link: true, meta: true, param: true, source: true,
  track: true, wbr: true, menuitem: true
}

// Open elements that a non-matching close tag closes implicitly.
const implicitlyClosed = {
  dd: true, li: true, optgroup: true, option: true, p: true,
  rp: true, rt: true, tbody: true, td: true, tfoot: true,
  th: true, tr: true
}

// closeOnOpen[parent][child]: opening `child` while `parent` is the
// innermost open element implicitly closes `parent` first.
const closeOnOpen = {
  dd: {dd: true, dt: true},
  dt: {dd: true, dt: true},
  li: {li: true},
  option: {option: true, optgroup: true},
  optgroup: {optgroup: true},
  p: {
    address: true, article: true, aside: true, blockquote: true, dir: true,
    div: true, dl: true, fieldset: true, footer: true, form: true,
    h1: true, h2: true, h3: true, h4: true, h5: true, h6: true,
    header: true, hgroup: true, hr: true, menu: true, nav: true, ol: true,
    p: true, pre: true, section: true, table: true, ul: true
  },
  rp: {rp: true, rt: true},
  rt: {rp: true, rt: true},
  tbody: {tbody: true, tfoot: true},
  td: {td: true, th: true},
  tfoot: {tbody: true},
  th: {td: true, th: true},
  thead: {tbody: true, tfoot: true},
  tr: {tr: true}
}

const lessThan = 60, greaterThan = 62, slash = 47, question = 63, bang = 33, dash = 45

// Own-property test for the lookup tables above. Tag names come from the
// document, so a plain `table[name]` would also find inherited members
// such as `constructor` or `toString` and report a false match.
function hasOwn(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key)
}

// Character codes accepted anywhere in a tag name:
// '-', '.', ':', 'A'-'Z', '_', 'a'-'z' and everything from U+00A1 up.
function nameChar(ch) {
  return ch == 45 || ch == 46 || ch == 58 || ch >= 65 && ch <= 90 || ch == 95 || ch >= 97 && ch <= 122 || ch >= 161
}

// '0'-'9'. Only accepted after the first character of a tag name.
function isDigit(ch) {
  return ch >= 48 && ch <= 57
}

// Tab, line feed, carriage return or space.
function isSpace(ch) {
  return ch == 9 || ch == 10 || ch == 13 || ch == 32
}

// Maps 'A'-'Z' to 'a'-'z' and leaves every other code untouched.
function lowerCase(ch) {
  return ch >= 65 && ch <= 90 ? ch + 32 : ch
}

// One-entry cache: the same position is queried first by `tagStart` and
// then again by the context tracker.
let cachedName = null, cachedInput = null, cachedPos = 0

// Reads the tag name that starts `offset` characters after `input.pos`,
// without advancing the stream.
//
// Returns the lower-cased name, `undefined` when no name is present and
// the next character is `?` or `!`, or `null` when the name is simply
// missing.
function tagNameAfter(input, offset) {
  let pos = input.pos + offset
  if (cachedPos == pos && cachedInput == input) return cachedName
  let next = input.peek(offset), name = ""
  for (;;) {
    // Digits may follow the first character, so that `h1`..`h6` keep their
    // number (the `p` entry of `closeOnOpen` depends on it) and `<script1>`
    // is not mistaken for `<script>`.
    if (!(nameChar(next) || name && isDigit(next))) break
    name += String.fromCharCode(next)
    next = input.peek(++offset)
  }
  // Undefined to signal there's a <? or <!, null for just missing
  cachedInput = input; cachedPos = pos
  return cachedName = name ? name.toLowerCase() : next == question || next == bang ? undefined : null
}

// Linked-list node describing one open element: its lower-cased tag name
// and the context of the enclosing element (or null at top level).
function ElementContext(name, parent) {
  this.name = name
  this.parent = parent
}

const startTagTerms = [StartTag, StartSelfClosingTag, StartScriptTag, StartStyleTag, StartTextareaTag]

// Tracks the stack of open elements while parsing: a context is pushed when
// one of the start-tag tokens is shifted (or a StartTag/OpenTag node is
// reused) and popped when an Element is reduced.
export const elementContext = new ContextTracker({
  start: null,
  shift(context, term, stack, input) {
    return startTagTerms.indexOf(term) > -1 ? new ElementContext(tagNameAfter(input, 1) || "", context) : context
  },
  reduce(context, term) {
    return term == Element && context ? context.parent : context
  },
  reuse(context, node, stack, input) {
    let type = node.type.id
    return type == StartTag || type == OpenTag
      ? new ElementContext(tagNameAfter(input, 1) || "", context) : context
  },
  strict: false
})

// Tokenizes the start of a tag (`<` or `</`) and picks the token type from
// the tag name and the currently open element. It also emits the
// zero-length `missingCloseTag` token where an element is closed implicitly.
export const tagStart = new ExternalTokenizer((input, stack) => {
  if (input.next != lessThan) {
    // End of file, close any open tags
    if (input.next < 0 && stack.context) input.acceptToken(missingCloseTag)
    return
  }
  input.advance()
  let close = input.next == slash
  if (close) input.advance()
  let name = tagNameAfter(input, 0)
  // `<?` / `<!` with no name: not a tag, produce no token here
  if (name === undefined) return
  if (!name) return input.acceptToken(close ? IncompleteCloseTag : IncompleteTag)

  let parent = stack.context ? stack.context.name : null
  if (close) {
    if (name == parent) return input.acceptToken(StartCloseTag)
    // `</` has been consumed, so -2 ends the synthetic token before it
    if (parent && hasOwn(implicitlyClosed, parent)) return input.acceptToken(missingCloseTag, -2)
    if (stack.dialectEnabled(Dialect_noMatch)) return input.acceptToken(NoMatchStartCloseTag)
    // If some outer element has this name, emit no token at this point
    for (let cx = stack.context; cx; cx = cx.parent) if (cx.name == name) return
    input.acceptToken(MismatchedStartCloseTag)
  } else {
    if (name == "script") return input.acceptToken(StartScriptTag)
    if (name == "style") return input.acceptToken(StartStyleTag)
    if (name == "textarea") return input.acceptToken(StartTextareaTag)
    if (hasOwn(selfClosers, name)) return input.acceptToken(StartSelfClosingTag)
    // `<` has been consumed, so -1 ends the synthetic token before it
    if (parent && hasOwn(closeOnOpen, parent) && hasOwn(closeOnOpen[parent], name)) input.acceptToken(missingCloseTag, -1)
    else input.acceptToken(StartTag)
  }
}, {contextual: true})

// Tokenizes the body of a comment: everything up to, but not including,
// the `--` of a closing `-->`, or up to the end of the input.
export const commentContent = new ExternalTokenizer(input => {
  // `dashes` counts the '-' characters immediately before the current one
  for (let dashes = 0, i = 0;; i++) {
    if (input.next < 0) {
      if (i) input.acceptToken(cmntContent)
      break
    }
    if (input.next == dash) {
      dashes++
    } else if (input.next == greaterThan && dashes >= 2) {
      // Only produce a token when something precedes the closing `--`
      if (i >= 3) input.acceptToken(cmntContent, -2)
      break
    } else {
      dashes = 0
    }
    input.advance()
  }
})

// True when `context` or one of its ancestors is an `svg` or `math` element.
function inForeignElement(context) {
  for (; context; context = context.parent)
    if (context.name == "svg" || context.name == "math") return true
  return false
}

// Tokenizes the end of a tag. `/>` yields SelfClosingEndTag only when the
// self-closing dialect is enabled or the tag sits inside svg/math content;
// otherwise both `/>` and `>` yield EndTag.
export const endTag = new ExternalTokenizer((input, stack) => {
  if (input.next == slash && input.peek(1) == greaterThan) {
    let selfClosing = stack.dialectEnabled(Dialect_selfClosing) || inForeignElement(stack.context)
    input.acceptToken(selfClosing ? SelfClosingEndTag : EndTag, 2)
  } else if (input.next == greaterThan) {
    input.acceptToken(EndTag, 1)
  }
})

// Builds the tokenizer for the raw text inside a `<tag>` element (script,
// style, textarea). It emits `textToken` for content and `endToken` for the
// `</` of the matching close tag.
//
// tag:       lower-case ASCII tag name to look for in the close tag
// textToken: token id for content
// endToken:  token id for the `</` that opens the close tag
function contentTokenizer(tag, textToken, endToken) {
  let lastState = 2 + tag.length
  return new ExternalTokenizer(input => {
    // state means:
    // - 0 nothing matched
    // - 1 '<' matched
    // - 2 '</'
    // - 3-(1+tag.length) part of the tag matched
    // - lastState whole tag + possibly whitespace matched
    //
    // matchedLen is the number of characters consumed by the current
    // close-tag candidate, i the number consumed in total.
    for (let state = 0, matchedLen = 0, i = 0;; i++) {
      if (input.next < 0) {
        if (i) input.acceptToken(textToken)
        break
      }
      if (state == 0 && input.next == lessThan ||
          state == 1 && input.next == slash ||
          // Tag names are matched case-insensitively, in line with
          // tagNameAfter(), which lower-cases the opening tag's name
          state >= 2 && state < lastState && lowerCase(input.next) == tag.charCodeAt(state - 2)) {
        state++
        matchedLen++
      } else if (state == lastState && isSpace(input.next)) {
        // Whitespace between the tag name and '>' (`</script >`)
        matchedLen++
      } else if (state == lastState && input.next == greaterThan) {
        if (i > matchedLen)
          // Text precedes the close tag: emit it, ending before the `</`
          input.acceptToken(textToken, -matchedLen)
        else
          // Nothing precedes it: emit just the `</`
          input.acceptToken(endToken, -(matchedLen - 2))
        break
      } else if ((input.next == 10 /* '\n' */ || input.next == 13 /* '\r' */) && i) {
        // Cut the text token after the line break
        input.acceptToken(textToken, 1)
        break
      } else {
        // Not a close tag after all. A '<' that broke the match may itself
        // start the real one (`<</script>`), so restart from it.
        state = matchedLen = input.next == lessThan ? 1 : 0
      }
      input.advance()
    }
  })
}

export const scriptTokens = contentTokenizer("script", scriptText, StartCloseScriptTag)

export const styleTokens = contentTokenizer("style", styleText, StartCloseStyleTag)

export const textareaTokens = contentTokenizer("textarea", textareaText, StartCloseTextareaTag)