Diffdown is a real-time collaborative Markdown editor/previewer built on the AT Protocol
diffdown.com
/* Hand-written tokenizers for HTML. */
2
3import {ExternalTokenizer, ContextTracker} from "@lezer/lr"
4import {StartTag, StartCloseTag, NoMatchStartCloseTag, MismatchedStartCloseTag, missingCloseTag,
5 StartSelfClosingTag, IncompleteCloseTag, Element, OpenTag, IncompleteTag,
6 StartScriptTag, scriptText, StartCloseScriptTag,
7 StartStyleTag, styleText, StartCloseStyleTag,
8 StartTextareaTag, textareaText, StartCloseTextareaTag,
9 Dialect_noMatch, Dialect_selfClosing, EndTag, SelfClosingEndTag,
10 commentContent as cmntContent} from "./parser.terms.js"
11
// Tag names for which `tagStart` emits StartSelfClosingTag instead of StartTag.
// Kept as a plain object because `tagStart` queries it with `hasOwnProperty`.
const selfClosers = [
  "area", "base", "br", "col", "command",
  "embed", "frame", "hr", "img", "input",
  "keygen", "link", "meta", "param", "source",
  "track", "wbr", "menuitem"
].reduce((table, tagName) => {
  table[tagName] = true
  return table
}, {})
18
// Elements that `tagStart` closes on its own (by emitting missingCloseTag) when a
// close tag for some other element shows up while one of these is the current element.
const implicitlyClosed = [
  "dd", "li", "optgroup", "option", "p",
  "rp", "rt", "tbody", "td", "tfoot",
  "th", "tr"
].reduce((table, tagName) => {
  table[tagName] = true
  return table
}, {})
24
// closeOnOpen[parent][child] is true when opening <child> while <parent> is the
// current element should first close <parent>: `tagStart` then emits
// missingCloseTag instead of a start-tag token.
const closeOnOpen = (() => {
  // Build a fresh {name: true} lookup from the given tag names.
  const closedBy = (...tagNames) => tagNames.reduce((table, tagName) => {
    table[tagName] = true
    return table
  }, {})

  return {
    dd: closedBy("dd", "dt"),
    dt: closedBy("dd", "dt"),
    li: closedBy("li"),
    option: closedBy("option", "optgroup"),
    optgroup: closedBy("optgroup"),
    p: closedBy(
      "address", "article", "aside", "blockquote", "dir",
      "div", "dl", "fieldset", "footer", "form",
      "h1", "h2", "h3", "h4", "h5", "h6",
      "header", "hgroup", "hr", "menu", "nav", "ol",
      "p", "pre", "section", "table", "ul"
    ),
    rp: closedBy("rp", "rt"),
    rt: closedBy("rp", "rt"),
    tbody: closedBy("tbody", "tfoot"),
    td: closedBy("td", "th"),
    tfoot: closedBy("tbody"),
    th: closedBy("td", "th"),
    thead: closedBy("tbody", "tfoot"),
    tr: closedBy("tr")
  }
})()
47
// Whether the character code `ch` counts as part of a tag name for `tagNameAfter`:
// ASCII letters, "-", ".", ":", "_", and every code from 161 upwards.
// Digits are not included.
function nameChar(ch) {
  if (ch >= 161) return true
  if (ch >= 97 && ch <= 122) return true // a-z
  if (ch >= 65 && ch <= 90) return true // A-Z
  return ch == 45 || ch == 46 || ch == 58 || ch == 95 // - . : _
}
51
// Whether the character code `ch` is a tab, line feed, carriage return or space.
function isSpace(ch) {
  const tab = 9, lineFeed = 10, carriageReturn = 13, space = 32
  return ch == space || ch == tab || ch == lineFeed || ch == carriageReturn
}
55
// Result of the most recent `tagNameAfter` lookup, keyed by the input object and
// the absolute position the name starts at, so that asking again for the same
// spot does not rescan the text.
let cachedName = null, cachedInput = null, cachedPos = 0

// Read the tag name that starts `offset` characters after `input.pos`, using only
// `input.peek` (nothing is consumed).
//
// Returns the lower-cased name when there is one. When there is no name, returns
// `undefined` if the character at that spot is "?" or "!", and `null` otherwise.
//
// Digits are accepted after the first character (so "h1" is read as "h1", not
// "h"), but a digit cannot start a name: "<3" still yields "no name".
function tagNameAfter(input, offset) {
  let pos = input.pos + offset
  if (cachedPos == pos && cachedInput == input) return cachedName
  let next = input.peek(offset), name = ""
  for (;;) {
    let digit = next >= 48 && next <= 57 // "0"-"9"
    if (!nameChar(next) && !(digit && name)) break
    name += String.fromCharCode(next)
    next = input.peek(++offset)
  }
  // Undefined to signal there's a <? or <!, null for just missing
  cachedInput = input; cachedPos = pos
  return cachedName = name ? name.toLowerCase() : next == question || next == bang ? undefined : null
}
70
// Character codes the tokenizers compare `input.next` against.
const lessThan = 60 // "<"
const greaterThan = 62 // ">"
const slash = 47 // "/"
const question = 63 // "?"
const bang = 33 // "!"
const dash = 45 // "-"
72
// One link in the chain of currently open elements that the context tracker
// maintains: `name` is the element's tag name ("" when none could be read) and
// `parent` is the enclosing context, or null at the top level.
class ElementContext {
  constructor(name, parent) {
    this.name = name
    this.parent = parent
  }
}
77
// Token types whose shift opens a new element context.
const startTagTerms = [StartTag, StartSelfClosingTag, StartScriptTag, StartStyleTag, StartTextareaTag]

// Context tracker that keeps a linked list of ElementContext objects for the
// elements that are currently open. The context starts out as null.
export const elementContext = new ContextTracker({
  start: null,
  // Shifting a start-tag token pushes a context named after the tag that follows
  // the "<" (offset 1), or "" when no name can be read there.
  shift(context, term, stack, input) {
    if (startTagTerms.indexOf(term) < 0) return context
    return new ElementContext(tagNameAfter(input, 1) || "", context)
  },
  // Reducing an Element pops back to the enclosing context, if there is one.
  reduce(context, term) {
    if (term != Element || !context) return context
    return context.parent
  },
  // A reused StartTag or OpenTag node pushes a context the same way a shift does.
  reuse(context, node, stack, input) {
    const typeId = node.type.id
    if (typeId != StartTag && typeId != OpenTag) return context
    return new ElementContext(tagNameAfter(input, 1) || "", context)
  },
  strict: false
})
95
// True when `key` is an own property of `table`. Tag names come straight from the
// document being parsed, so a plain `table[key]` lookup would also find inherited
// members such as "constructor" or "__proto__".
function hasOwnEntry(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key)
}

// Tokenizer for the start of a tag ("<" or "</").
//
// At the end of the input, emits missingCloseTag while an element is still open.
// At "<" or "</" it reads the tag name that follows and emits:
// - nothing when the name is replaced by "?" or "!" (`tagNameAfter` gave undefined)
// - IncompleteTag / IncompleteCloseTag when there is no name at all
// - for close tags: StartCloseTag when the name matches the current element;
//   missingCloseTag when the current element is listed in `implicitlyClosed`;
//   NoMatchStartCloseTag when the noMatch dialect is enabled; nothing when some
//   outer open element has that name; MismatchedStartCloseTag otherwise
// - for open tags: the script/style/textarea/self-closing start token for those
//   names; missingCloseTag when `closeOnOpen` says this tag closes the current
//   element; StartTag otherwise
export const tagStart = new ExternalTokenizer((input, stack) => {
  if (input.next != lessThan) {
    // End of file, close any open tags
    if (input.next < 0 && stack.context) input.acceptToken(missingCloseTag)
    return
  }
  input.advance()
  let close = input.next == slash
  if (close) input.advance()
  let name = tagNameAfter(input, 0)
  if (name === undefined) return
  if (!name) return input.acceptToken(close ? IncompleteCloseTag : IncompleteTag)

  let parent = stack.context ? stack.context.name : null
  if (close) {
    if (name == parent) return input.acceptToken(StartCloseTag)
    // "</" has been consumed at this point; the -2 end offset steps back over it
    if (parent && hasOwnEntry(implicitlyClosed, parent)) return input.acceptToken(missingCloseTag, -2)
    if (stack.dialectEnabled(Dialect_noMatch)) return input.acceptToken(NoMatchStartCloseTag)
    // Some outer element has this name: emit nothing here
    for (let cx = stack.context; cx; cx = cx.parent) if (cx.name == name) return
    input.acceptToken(MismatchedStartCloseTag)
  } else {
    if (name == "script") return input.acceptToken(StartScriptTag)
    if (name == "style") return input.acceptToken(StartStyleTag)
    if (name == "textarea") return input.acceptToken(StartTextareaTag)
    if (hasOwnEntry(selfClosers, name)) return input.acceptToken(StartSelfClosingTag)
    // Only "<" has been consumed; the -1 end offset steps back over it
    if (parent && hasOwnEntry(closeOnOpen, parent) && hasOwnEntry(closeOnOpen[parent], name))
      input.acceptToken(missingCloseTag, -1)
    else
      input.acceptToken(StartTag)
  }
}, {contextual: true})
125
// Tokenizer for the body of a comment. Scans forward until a ">" that directly
// follows two or more "-" characters, or until the end of the input.
export const commentContent = new ExternalTokenizer(input => {
  let trailingDashes = 0
  let scanned = 0
  while (input.next >= 0) {
    const ch = input.next
    if (ch == greaterThan && trailingDashes >= 2) {
      // The token ends two characters before the ">", leaving out the "--".
      // Nothing is emitted when fewer than three characters precede the ">".
      if (scanned >= 3) input.acceptToken(cmntContent, -2)
      return
    }
    trailingDashes = ch == dash ? trailingDashes + 1 : 0
    input.advance()
    scanned++
  }
  // End of input: whatever was scanned is comment content
  if (scanned) input.acceptToken(cmntContent)
})
143
// Whether `context` or any context reachable through its `parent` links is
// named "svg" or "math".
function inForeignElement(context) {
  let current = context
  while (current) {
    const {name} = current
    if (name == "svg" || name == "math") return true
    current = current.parent
  }
  return false
}
149
// Tokenizer for the end of a tag. ">" gives EndTag. "/>" gives SelfClosingEndTag
// when the selfClosing dialect is enabled or the current context is inside an
// svg or math element, and EndTag otherwise.
export const endTag = new ExternalTokenizer((input, stack) => {
  if (input.next == greaterThan) {
    input.acceptToken(EndTag, 1)
    return
  }
  if (input.next != slash || input.peek(1) != greaterThan) return
  const selfClosing = stack.dialectEnabled(Dialect_selfClosing) || inForeignElement(stack.context)
  input.acceptToken(selfClosing ? SelfClosingEndTag : EndTag, 2)
})
158
// Build the tokenizer for the raw text inside a <tag> element, scanning for the
// closing "</tag>" (matched case-sensitively against `tag`; whitespace is allowed
// between the tag name and the ">").
//
// The tokenizer emits one token per call:
// - `endToken`, covering just the "</", when the close tag starts right at the
//   current position
// - `textToken` for the text before the close tag, when text precedes it
// - `textToken` up to and including a line break that comes after at least one
//   other character
// - `textToken` for everything up to the end of the input, if anything was scanned
function contentTokenizer(tag, textToken, endToken) {
  let lastState = 2 + tag.length
  return new ExternalTokenizer(input => {
    // state means:
    // - 0 nothing matched
    // - 1 '<' matched
    // - 2 '</'
    // - 3-(1+tag.length) part of the tag matched
    // - lastState whole tag + possibly whitespace matched
    // matchedLen counts the characters of the (partial) close tag seen so far,
    // i counts every character scanned in this call.
    for (let state = 0, matchedLen = 0, i = 0;; i++) {
      if (input.next < 0) {
        if (i) input.acceptToken(textToken)
        break
      }
      if (state == 0 && input.next == lessThan ||
          state == 1 && input.next == slash ||
          state >= 2 && state < lastState && input.next == tag.charCodeAt(state - 2)) {
        state++
        matchedLen++
      } else if (state == lastState && isSpace(input.next)) {
        // Whitespace after the tag name is still part of the close tag
        matchedLen++
      } else if (state == lastState && input.next == greaterThan) {
        if (i > matchedLen)
          // Text came first: end the text token right before the '<'
          input.acceptToken(textToken, -matchedLen)
        else
          // Close tag is at the start: the token covers only the '</'
          input.acceptToken(endToken, -(matchedLen - 2))
        break
      } else if ((input.next == 10 /* '\n' */ || input.next == 13 /* '\r' */) && i) {
        input.acceptToken(textToken, 1)
        break
      } else {
        state = matchedLen = 0
      }
      input.advance()
    }
  })
}
194
// Raw-text tokenizers for the contents of <script>, <style> and <textarea>
// elements, each built by `contentTokenizer` from the tag name, the token for
// the text, and the token for the start of the matching close tag.
const scriptTokens = contentTokenizer("script", scriptText, StartCloseScriptTag)
const styleTokens = contentTokenizer("style", styleText, StartCloseStyleTag)
const textareaTokens = contentTokenizer("textarea", textareaText, StartCloseTextareaTag)

export {scriptTokens, styleTokens, textareaTokens}