.gitignore (+1)
+ _build

LICENSE.md (+22)
+ MIT License
+
+ Copyright (c) 2025 Emil Stenström
+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.

data/entities.json (+2233)
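Each entry in data/entities.json (listed below) maps one named character reference to the code points it expands to and to the equivalent decoded "characters" string. As a minimal sketch of consuming the table (Python purely for illustration; the path and field names are taken from the entries below), it can be loaded with the standard json module and the two fields cross-checked:

    import json

    # Load the entity table added in this diff (path as shown in the file header).
    with open("data/entities.json", encoding="utf-8") as f:
        entities = json.load(f)

    # Every value carries "codepoints" plus the pre-decoded "characters" string;
    # the two representations should describe the same text.
    for name, entry in entities.items():
        assert entry["characters"] == "".join(chr(cp) for cp in entry["codepoints"])

    print(f"{len(entities)} named character references loaded")
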
1
+
{
2
+
"Æ": { "codepoints": [198], "characters": "\u00C6" },
3
+
"Æ": { "codepoints": [198], "characters": "\u00C6" },
4
+
"&": { "codepoints": [38], "characters": "\u0026" },
5
+
"&": { "codepoints": [38], "characters": "\u0026" },
6
+
"Á": { "codepoints": [193], "characters": "\u00C1" },
7
+
"Á": { "codepoints": [193], "characters": "\u00C1" },
8
+
"Ă": { "codepoints": [258], "characters": "\u0102" },
9
+
"Â": { "codepoints": [194], "characters": "\u00C2" },
10
+
"Â": { "codepoints": [194], "characters": "\u00C2" },
11
+
"А": { "codepoints": [1040], "characters": "\u0410" },
12
+
"𝔄": { "codepoints": [120068], "characters": "\uD835\uDD04" },
13
+
"À": { "codepoints": [192], "characters": "\u00C0" },
14
+
"À": { "codepoints": [192], "characters": "\u00C0" },
15
+
"Α": { "codepoints": [913], "characters": "\u0391" },
16
+
"Ā": { "codepoints": [256], "characters": "\u0100" },
17
+
"⩓": { "codepoints": [10835], "characters": "\u2A53" },
18
+
"Ą": { "codepoints": [260], "characters": "\u0104" },
19
+
"𝔸": { "codepoints": [120120], "characters": "\uD835\uDD38" },
20
+
"⁡": { "codepoints": [8289], "characters": "\u2061" },
21
+
"Å": { "codepoints": [197], "characters": "\u00C5" },
22
+
"Å": { "codepoints": [197], "characters": "\u00C5" },
23
+
"𝒜": { "codepoints": [119964], "characters": "\uD835\uDC9C" },
24
+
"≔": { "codepoints": [8788], "characters": "\u2254" },
25
+
"Ã": { "codepoints": [195], "characters": "\u00C3" },
26
+
"Ã": { "codepoints": [195], "characters": "\u00C3" },
27
+
"Ä": { "codepoints": [196], "characters": "\u00C4" },
28
+
"Ä": { "codepoints": [196], "characters": "\u00C4" },
29
+
"∖": { "codepoints": [8726], "characters": "\u2216" },
30
+
"⫧": { "codepoints": [10983], "characters": "\u2AE7" },
31
+
"⌆": { "codepoints": [8966], "characters": "\u2306" },
32
+
"Б": { "codepoints": [1041], "characters": "\u0411" },
33
+
"∵": { "codepoints": [8757], "characters": "\u2235" },
34
+
"ℬ": { "codepoints": [8492], "characters": "\u212C" },
35
+
"Β": { "codepoints": [914], "characters": "\u0392" },
36
+
"𝔅": { "codepoints": [120069], "characters": "\uD835\uDD05" },
37
+
"𝔹": { "codepoints": [120121], "characters": "\uD835\uDD39" },
38
+
"˘": { "codepoints": [728], "characters": "\u02D8" },
39
+
"ℬ": { "codepoints": [8492], "characters": "\u212C" },
40
+
"≎": { "codepoints": [8782], "characters": "\u224E" },
41
+
"Ч": { "codepoints": [1063], "characters": "\u0427" },
42
+
"©": { "codepoints": [169], "characters": "\u00A9" },
43
+
"©": { "codepoints": [169], "characters": "\u00A9" },
44
+
"Ć": { "codepoints": [262], "characters": "\u0106" },
45
+
"⋒": { "codepoints": [8914], "characters": "\u22D2" },
46
+
"ⅅ": { "codepoints": [8517], "characters": "\u2145" },
47
+
"ℭ": { "codepoints": [8493], "characters": "\u212D" },
48
+
"Č": { "codepoints": [268], "characters": "\u010C" },
49
+
"Ç": { "codepoints": [199], "characters": "\u00C7" },
50
+
"Ç": { "codepoints": [199], "characters": "\u00C7" },
51
+
"Ĉ": { "codepoints": [264], "characters": "\u0108" },
52
+
"∰": { "codepoints": [8752], "characters": "\u2230" },
53
+
"Ċ": { "codepoints": [266], "characters": "\u010A" },
54
+
"¸": { "codepoints": [184], "characters": "\u00B8" },
55
+
"·": { "codepoints": [183], "characters": "\u00B7" },
56
+
"ℭ": { "codepoints": [8493], "characters": "\u212D" },
57
+
"Χ": { "codepoints": [935], "characters": "\u03A7" },
58
+
"⊙": { "codepoints": [8857], "characters": "\u2299" },
59
+
"⊖": { "codepoints": [8854], "characters": "\u2296" },
60
+
"⊕": { "codepoints": [8853], "characters": "\u2295" },
61
+
"⊗": { "codepoints": [8855], "characters": "\u2297" },
62
+
"∲": { "codepoints": [8754], "characters": "\u2232" },
63
+
"”": { "codepoints": [8221], "characters": "\u201D" },
64
+
"’": { "codepoints": [8217], "characters": "\u2019" },
65
+
"∷": { "codepoints": [8759], "characters": "\u2237" },
66
+
"⩴": { "codepoints": [10868], "characters": "\u2A74" },
67
+
"≡": { "codepoints": [8801], "characters": "\u2261" },
68
+
"∯": { "codepoints": [8751], "characters": "\u222F" },
69
+
"∮": { "codepoints": [8750], "characters": "\u222E" },
70
+
"ℂ": { "codepoints": [8450], "characters": "\u2102" },
71
+
"∐": { "codepoints": [8720], "characters": "\u2210" },
72
+
"∳": { "codepoints": [8755], "characters": "\u2233" },
73
+
"⨯": { "codepoints": [10799], "characters": "\u2A2F" },
74
+
"𝒞": { "codepoints": [119966], "characters": "\uD835\uDC9E" },
75
+
"⋓": { "codepoints": [8915], "characters": "\u22D3" },
76
+
"≍": { "codepoints": [8781], "characters": "\u224D" },
77
+
"ⅅ": { "codepoints": [8517], "characters": "\u2145" },
78
+
"⤑": { "codepoints": [10513], "characters": "\u2911" },
79
+
"Ђ": { "codepoints": [1026], "characters": "\u0402" },
80
+
"Ѕ": { "codepoints": [1029], "characters": "\u0405" },
81
+
"Џ": { "codepoints": [1039], "characters": "\u040F" },
82
+
"‡": { "codepoints": [8225], "characters": "\u2021" },
83
+
"↡": { "codepoints": [8609], "characters": "\u21A1" },
84
+
"⫤": { "codepoints": [10980], "characters": "\u2AE4" },
85
+
"Ď": { "codepoints": [270], "characters": "\u010E" },
86
+
"Д": { "codepoints": [1044], "characters": "\u0414" },
87
+
"∇": { "codepoints": [8711], "characters": "\u2207" },
88
+
"Δ": { "codepoints": [916], "characters": "\u0394" },
89
+
"𝔇": { "codepoints": [120071], "characters": "\uD835\uDD07" },
90
+
"´": { "codepoints": [180], "characters": "\u00B4" },
91
+
"˙": { "codepoints": [729], "characters": "\u02D9" },
92
+
"˝": { "codepoints": [733], "characters": "\u02DD" },
93
+
"`": { "codepoints": [96], "characters": "\u0060" },
94
+
"˜": { "codepoints": [732], "characters": "\u02DC" },
95
+
"⋄": { "codepoints": [8900], "characters": "\u22C4" },
96
+
"ⅆ": { "codepoints": [8518], "characters": "\u2146" },
97
+
"𝔻": { "codepoints": [120123], "characters": "\uD835\uDD3B" },
98
+
"¨": { "codepoints": [168], "characters": "\u00A8" },
99
+
"⃜": { "codepoints": [8412], "characters": "\u20DC" },
100
+
"≐": { "codepoints": [8784], "characters": "\u2250" },
101
+
"∯": { "codepoints": [8751], "characters": "\u222F" },
102
+
"¨": { "codepoints": [168], "characters": "\u00A8" },
103
+
"⇓": { "codepoints": [8659], "characters": "\u21D3" },
104
+
"⇐": { "codepoints": [8656], "characters": "\u21D0" },
105
+
"⇔": { "codepoints": [8660], "characters": "\u21D4" },
106
+
"⫤": { "codepoints": [10980], "characters": "\u2AE4" },
107
+
"⟸": { "codepoints": [10232], "characters": "\u27F8" },
108
+
"⟺": { "codepoints": [10234], "characters": "\u27FA" },
109
+
"⟹": { "codepoints": [10233], "characters": "\u27F9" },
110
+
"⇒": { "codepoints": [8658], "characters": "\u21D2" },
111
+
"⊨": { "codepoints": [8872], "characters": "\u22A8" },
112
+
"⇑": { "codepoints": [8657], "characters": "\u21D1" },
113
+
"⇕": { "codepoints": [8661], "characters": "\u21D5" },
114
+
"∥": { "codepoints": [8741], "characters": "\u2225" },
115
+
"↓": { "codepoints": [8595], "characters": "\u2193" },
116
+
"⤓": { "codepoints": [10515], "characters": "\u2913" },
117
+
"⇵": { "codepoints": [8693], "characters": "\u21F5" },
118
+
"̑": { "codepoints": [785], "characters": "\u0311" },
119
+
"⥐": { "codepoints": [10576], "characters": "\u2950" },
120
+
"⥞": { "codepoints": [10590], "characters": "\u295E" },
121
+
"↽": { "codepoints": [8637], "characters": "\u21BD" },
122
+
"⥖": { "codepoints": [10582], "characters": "\u2956" },
123
+
"⥟": { "codepoints": [10591], "characters": "\u295F" },
124
+
"⇁": { "codepoints": [8641], "characters": "\u21C1" },
125
+
"⥗": { "codepoints": [10583], "characters": "\u2957" },
126
+
"⊤": { "codepoints": [8868], "characters": "\u22A4" },
127
+
"↧": { "codepoints": [8615], "characters": "\u21A7" },
128
+
"⇓": { "codepoints": [8659], "characters": "\u21D3" },
129
+
"𝒟": { "codepoints": [119967], "characters": "\uD835\uDC9F" },
130
+
"Đ": { "codepoints": [272], "characters": "\u0110" },
131
+
"Ŋ": { "codepoints": [330], "characters": "\u014A" },
132
+
"Ð": { "codepoints": [208], "characters": "\u00D0" },
133
+
"Ð": { "codepoints": [208], "characters": "\u00D0" },
134
+
"É": { "codepoints": [201], "characters": "\u00C9" },
135
+
"É": { "codepoints": [201], "characters": "\u00C9" },
136
+
"Ě": { "codepoints": [282], "characters": "\u011A" },
137
+
"Ê": { "codepoints": [202], "characters": "\u00CA" },
138
+
"Ê": { "codepoints": [202], "characters": "\u00CA" },
139
+
"Э": { "codepoints": [1069], "characters": "\u042D" },
140
+
"Ė": { "codepoints": [278], "characters": "\u0116" },
141
+
"𝔈": { "codepoints": [120072], "characters": "\uD835\uDD08" },
142
+
"È": { "codepoints": [200], "characters": "\u00C8" },
143
+
"È": { "codepoints": [200], "characters": "\u00C8" },
144
+
"∈": { "codepoints": [8712], "characters": "\u2208" },
145
+
"Ē": { "codepoints": [274], "characters": "\u0112" },
146
+
"◻": { "codepoints": [9723], "characters": "\u25FB" },
147
+
"▫": { "codepoints": [9643], "characters": "\u25AB" },
148
+
"Ę": { "codepoints": [280], "characters": "\u0118" },
149
+
"𝔼": { "codepoints": [120124], "characters": "\uD835\uDD3C" },
150
+
"Ε": { "codepoints": [917], "characters": "\u0395" },
151
+
"⩵": { "codepoints": [10869], "characters": "\u2A75" },
152
+
"≂": { "codepoints": [8770], "characters": "\u2242" },
153
+
"⇌": { "codepoints": [8652], "characters": "\u21CC" },
154
+
"ℰ": { "codepoints": [8496], "characters": "\u2130" },
155
+
"⩳": { "codepoints": [10867], "characters": "\u2A73" },
156
+
"Η": { "codepoints": [919], "characters": "\u0397" },
157
+
"Ë": { "codepoints": [203], "characters": "\u00CB" },
158
+
"Ë": { "codepoints": [203], "characters": "\u00CB" },
159
+
"∃": { "codepoints": [8707], "characters": "\u2203" },
160
+
"ⅇ": { "codepoints": [8519], "characters": "\u2147" },
161
+
"Ф": { "codepoints": [1060], "characters": "\u0424" },
162
+
"𝔉": { "codepoints": [120073], "characters": "\uD835\uDD09" },
163
+
"◼": { "codepoints": [9724], "characters": "\u25FC" },
164
+
"▪": { "codepoints": [9642], "characters": "\u25AA" },
165
+
"𝔽": { "codepoints": [120125], "characters": "\uD835\uDD3D" },
166
+
"∀": { "codepoints": [8704], "characters": "\u2200" },
167
+
"ℱ": { "codepoints": [8497], "characters": "\u2131" },
168
+
"ℱ": { "codepoints": [8497], "characters": "\u2131" },
169
+
"Ѓ": { "codepoints": [1027], "characters": "\u0403" },
170
+
">": { "codepoints": [62], "characters": "\u003E" },
171
+
">": { "codepoints": [62], "characters": "\u003E" },
172
+
"Γ": { "codepoints": [915], "characters": "\u0393" },
173
+
"Ϝ": { "codepoints": [988], "characters": "\u03DC" },
174
+
"Ğ": { "codepoints": [286], "characters": "\u011E" },
175
+
"Ģ": { "codepoints": [290], "characters": "\u0122" },
176
+
"Ĝ": { "codepoints": [284], "characters": "\u011C" },
177
+
"Г": { "codepoints": [1043], "characters": "\u0413" },
178
+
"Ġ": { "codepoints": [288], "characters": "\u0120" },
179
+
"𝔊": { "codepoints": [120074], "characters": "\uD835\uDD0A" },
180
+
"⋙": { "codepoints": [8921], "characters": "\u22D9" },
181
+
"𝔾": { "codepoints": [120126], "characters": "\uD835\uDD3E" },
182
+
"≥": { "codepoints": [8805], "characters": "\u2265" },
183
+
"⋛": { "codepoints": [8923], "characters": "\u22DB" },
184
+
"≧": { "codepoints": [8807], "characters": "\u2267" },
185
+
"⪢": { "codepoints": [10914], "characters": "\u2AA2" },
186
+
"≷": { "codepoints": [8823], "characters": "\u2277" },
187
+
"⩾": { "codepoints": [10878], "characters": "\u2A7E" },
188
+
"≳": { "codepoints": [8819], "characters": "\u2273" },
189
+
"𝒢": { "codepoints": [119970], "characters": "\uD835\uDCA2" },
190
+
"≫": { "codepoints": [8811], "characters": "\u226B" },
191
+
"Ъ": { "codepoints": [1066], "characters": "\u042A" },
192
+
"ˇ": { "codepoints": [711], "characters": "\u02C7" },
193
+
"^": { "codepoints": [94], "characters": "\u005E" },
194
+
"Ĥ": { "codepoints": [292], "characters": "\u0124" },
195
+
"ℌ": { "codepoints": [8460], "characters": "\u210C" },
196
+
"ℋ": { "codepoints": [8459], "characters": "\u210B" },
197
+
"ℍ": { "codepoints": [8461], "characters": "\u210D" },
198
+
"─": { "codepoints": [9472], "characters": "\u2500" },
199
+
"ℋ": { "codepoints": [8459], "characters": "\u210B" },
200
+
"Ħ": { "codepoints": [294], "characters": "\u0126" },
201
+
"≎": { "codepoints": [8782], "characters": "\u224E" },
202
+
"≏": { "codepoints": [8783], "characters": "\u224F" },
203
+
"Е": { "codepoints": [1045], "characters": "\u0415" },
204
+
"IJ": { "codepoints": [306], "characters": "\u0132" },
205
+
"Ё": { "codepoints": [1025], "characters": "\u0401" },
206
+
"Í": { "codepoints": [205], "characters": "\u00CD" },
207
+
"Í": { "codepoints": [205], "characters": "\u00CD" },
208
+
"Î": { "codepoints": [206], "characters": "\u00CE" },
209
+
"Î": { "codepoints": [206], "characters": "\u00CE" },
210
+
"И": { "codepoints": [1048], "characters": "\u0418" },
211
+
"İ": { "codepoints": [304], "characters": "\u0130" },
212
+
"ℑ": { "codepoints": [8465], "characters": "\u2111" },
213
+
"Ì": { "codepoints": [204], "characters": "\u00CC" },
214
+
"Ì": { "codepoints": [204], "characters": "\u00CC" },
215
+
"ℑ": { "codepoints": [8465], "characters": "\u2111" },
216
+
"Ī": { "codepoints": [298], "characters": "\u012A" },
217
+
"ⅈ": { "codepoints": [8520], "characters": "\u2148" },
218
+
"⇒": { "codepoints": [8658], "characters": "\u21D2" },
219
+
"∬": { "codepoints": [8748], "characters": "\u222C" },
220
+
"∫": { "codepoints": [8747], "characters": "\u222B" },
221
+
"⋂": { "codepoints": [8898], "characters": "\u22C2" },
222
+
"⁣": { "codepoints": [8291], "characters": "\u2063" },
223
+
"⁢": { "codepoints": [8290], "characters": "\u2062" },
224
+
"Į": { "codepoints": [302], "characters": "\u012E" },
225
+
"𝕀": { "codepoints": [120128], "characters": "\uD835\uDD40" },
226
+
"Ι": { "codepoints": [921], "characters": "\u0399" },
227
+
"ℐ": { "codepoints": [8464], "characters": "\u2110" },
228
+
"Ĩ": { "codepoints": [296], "characters": "\u0128" },
229
+
"І": { "codepoints": [1030], "characters": "\u0406" },
230
+
"Ï": { "codepoints": [207], "characters": "\u00CF" },
231
+
"Ï": { "codepoints": [207], "characters": "\u00CF" },
232
+
"Ĵ": { "codepoints": [308], "characters": "\u0134" },
233
+
"Й": { "codepoints": [1049], "characters": "\u0419" },
234
+
"𝔍": { "codepoints": [120077], "characters": "\uD835\uDD0D" },
235
+
"𝕁": { "codepoints": [120129], "characters": "\uD835\uDD41" },
236
+
"𝒥": { "codepoints": [119973], "characters": "\uD835\uDCA5" },
237
+
"Ј": { "codepoints": [1032], "characters": "\u0408" },
238
+
"Є": { "codepoints": [1028], "characters": "\u0404" },
239
+
"Х": { "codepoints": [1061], "characters": "\u0425" },
240
+
"Ќ": { "codepoints": [1036], "characters": "\u040C" },
241
+
"Κ": { "codepoints": [922], "characters": "\u039A" },
242
+
"Ķ": { "codepoints": [310], "characters": "\u0136" },
243
+
"К": { "codepoints": [1050], "characters": "\u041A" },
244
+
"𝔎": { "codepoints": [120078], "characters": "\uD835\uDD0E" },
245
+
"𝕂": { "codepoints": [120130], "characters": "\uD835\uDD42" },
246
+
"𝒦": { "codepoints": [119974], "characters": "\uD835\uDCA6" },
247
+
"Љ": { "codepoints": [1033], "characters": "\u0409" },
248
+
"<": { "codepoints": [60], "characters": "\u003C" },
249
+
"<": { "codepoints": [60], "characters": "\u003C" },
250
+
"Ĺ": { "codepoints": [313], "characters": "\u0139" },
251
+
"Λ": { "codepoints": [923], "characters": "\u039B" },
252
+
"⟪": { "codepoints": [10218], "characters": "\u27EA" },
253
+
"ℒ": { "codepoints": [8466], "characters": "\u2112" },
254
+
"↞": { "codepoints": [8606], "characters": "\u219E" },
255
+
"Ľ": { "codepoints": [317], "characters": "\u013D" },
256
+
"Ļ": { "codepoints": [315], "characters": "\u013B" },
257
+
"Л": { "codepoints": [1051], "characters": "\u041B" },
258
+
"⟨": { "codepoints": [10216], "characters": "\u27E8" },
259
+
"←": { "codepoints": [8592], "characters": "\u2190" },
260
+
"⇤": { "codepoints": [8676], "characters": "\u21E4" },
261
+
"⇆": { "codepoints": [8646], "characters": "\u21C6" },
262
+
"⌈": { "codepoints": [8968], "characters": "\u2308" },
263
+
"⟦": { "codepoints": [10214], "characters": "\u27E6" },
264
+
"⥡": { "codepoints": [10593], "characters": "\u2961" },
265
+
"⇃": { "codepoints": [8643], "characters": "\u21C3" },
266
+
"⥙": { "codepoints": [10585], "characters": "\u2959" },
267
+
"⌊": { "codepoints": [8970], "characters": "\u230A" },
268
+
"↔": { "codepoints": [8596], "characters": "\u2194" },
269
+
"⥎": { "codepoints": [10574], "characters": "\u294E" },
270
+
"⊣": { "codepoints": [8867], "characters": "\u22A3" },
271
+
"↤": { "codepoints": [8612], "characters": "\u21A4" },
272
+
"⥚": { "codepoints": [10586], "characters": "\u295A" },
273
+
"⊲": { "codepoints": [8882], "characters": "\u22B2" },
274
+
"⧏": { "codepoints": [10703], "characters": "\u29CF" },
275
+
"⊴": { "codepoints": [8884], "characters": "\u22B4" },
276
+
"⥑": { "codepoints": [10577], "characters": "\u2951" },
277
+
"⥠": { "codepoints": [10592], "characters": "\u2960" },
278
+
"↿": { "codepoints": [8639], "characters": "\u21BF" },
279
+
"⥘": { "codepoints": [10584], "characters": "\u2958" },
280
+
"↼": { "codepoints": [8636], "characters": "\u21BC" },
281
+
"⥒": { "codepoints": [10578], "characters": "\u2952" },
282
+
"⇐": { "codepoints": [8656], "characters": "\u21D0" },
283
+
"⇔": { "codepoints": [8660], "characters": "\u21D4" },
284
+
"⋚": { "codepoints": [8922], "characters": "\u22DA" },
285
+
"≦": { "codepoints": [8806], "characters": "\u2266" },
286
+
"≶": { "codepoints": [8822], "characters": "\u2276" },
287
+
"⪡": { "codepoints": [10913], "characters": "\u2AA1" },
288
+
"⩽": { "codepoints": [10877], "characters": "\u2A7D" },
289
+
"≲": { "codepoints": [8818], "characters": "\u2272" },
290
+
"𝔏": { "codepoints": [120079], "characters": "\uD835\uDD0F" },
291
+
"⋘": { "codepoints": [8920], "characters": "\u22D8" },
292
+
"⇚": { "codepoints": [8666], "characters": "\u21DA" },
293
+
"Ŀ": { "codepoints": [319], "characters": "\u013F" },
294
+
"⟵": { "codepoints": [10229], "characters": "\u27F5" },
295
+
"⟷": { "codepoints": [10231], "characters": "\u27F7" },
296
+
"⟶": { "codepoints": [10230], "characters": "\u27F6" },
297
+
"⟸": { "codepoints": [10232], "characters": "\u27F8" },
298
+
"⟺": { "codepoints": [10234], "characters": "\u27FA" },
299
+
"⟹": { "codepoints": [10233], "characters": "\u27F9" },
300
+
"𝕃": { "codepoints": [120131], "characters": "\uD835\uDD43" },
301
+
"↙": { "codepoints": [8601], "characters": "\u2199" },
302
+
"↘": { "codepoints": [8600], "characters": "\u2198" },
303
+
"ℒ": { "codepoints": [8466], "characters": "\u2112" },
304
+
"↰": { "codepoints": [8624], "characters": "\u21B0" },
305
+
"Ł": { "codepoints": [321], "characters": "\u0141" },
306
+
"≪": { "codepoints": [8810], "characters": "\u226A" },
307
+
"⤅": { "codepoints": [10501], "characters": "\u2905" },
308
+
"М": { "codepoints": [1052], "characters": "\u041C" },
309
+
" ": { "codepoints": [8287], "characters": "\u205F" },
310
+
"ℳ": { "codepoints": [8499], "characters": "\u2133" },
311
+
"𝔐": { "codepoints": [120080], "characters": "\uD835\uDD10" },
312
+
"∓": { "codepoints": [8723], "characters": "\u2213" },
313
+
"𝕄": { "codepoints": [120132], "characters": "\uD835\uDD44" },
314
+
"ℳ": { "codepoints": [8499], "characters": "\u2133" },
315
+
"Μ": { "codepoints": [924], "characters": "\u039C" },
316
+
"Њ": { "codepoints": [1034], "characters": "\u040A" },
317
+
"Ń": { "codepoints": [323], "characters": "\u0143" },
318
+
"Ň": { "codepoints": [327], "characters": "\u0147" },
319
+
"Ņ": { "codepoints": [325], "characters": "\u0145" },
320
+
"Н": { "codepoints": [1053], "characters": "\u041D" },
321
+
"​": { "codepoints": [8203], "characters": "\u200B" },
322
+
"​": { "codepoints": [8203], "characters": "\u200B" },
323
+
"​": { "codepoints": [8203], "characters": "\u200B" },
324
+
"​": { "codepoints": [8203], "characters": "\u200B" },
325
+
"≫": { "codepoints": [8811], "characters": "\u226B" },
326
+
"≪": { "codepoints": [8810], "characters": "\u226A" },
327
+
"\u000A": { "codepoints": [10], "characters": "\u000A" },
328
+
"𝔑": { "codepoints": [120081], "characters": "\uD835\uDD11" },
329
+
"⁠": { "codepoints": [8288], "characters": "\u2060" },
330
+
" ": { "codepoints": [160], "characters": "\u00A0" },
331
+
"ℕ": { "codepoints": [8469], "characters": "\u2115" },
332
+
"⫬": { "codepoints": [10988], "characters": "\u2AEC" },
333
+
"≢": { "codepoints": [8802], "characters": "\u2262" },
334
+
"≭": { "codepoints": [8813], "characters": "\u226D" },
335
+
"∦": { "codepoints": [8742], "characters": "\u2226" },
336
+
"∉": { "codepoints": [8713], "characters": "\u2209" },
337
+
"≠": { "codepoints": [8800], "characters": "\u2260" },
338
+
"≂̸": { "codepoints": [8770, 824], "characters": "\u2242\u0338" },
339
+
"∄": { "codepoints": [8708], "characters": "\u2204" },
340
+
"≯": { "codepoints": [8815], "characters": "\u226F" },
341
+
"≱": { "codepoints": [8817], "characters": "\u2271" },
342
+
"≧̸": { "codepoints": [8807, 824], "characters": "\u2267\u0338" },
343
+
"≫̸": { "codepoints": [8811, 824], "characters": "\u226B\u0338" },
344
+
"≹": { "codepoints": [8825], "characters": "\u2279" },
345
+
"⩾̸": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" },
346
+
"≵": { "codepoints": [8821], "characters": "\u2275" },
347
+
"≎̸": { "codepoints": [8782, 824], "characters": "\u224E\u0338" },
348
+
"≏̸": { "codepoints": [8783, 824], "characters": "\u224F\u0338" },
349
+
"⋪": { "codepoints": [8938], "characters": "\u22EA" },
350
+
"⧏̸": { "codepoints": [10703, 824], "characters": "\u29CF\u0338" },
351
+
"⋬": { "codepoints": [8940], "characters": "\u22EC" },
352
+
"≮": { "codepoints": [8814], "characters": "\u226E" },
353
+
"≰": { "codepoints": [8816], "characters": "\u2270" },
354
+
"≸": { "codepoints": [8824], "characters": "\u2278" },
355
+
"≪̸": { "codepoints": [8810, 824], "characters": "\u226A\u0338" },
356
+
"⩽̸": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" },
357
+
"≴": { "codepoints": [8820], "characters": "\u2274" },
358
+
"⪢̸": { "codepoints": [10914, 824], "characters": "\u2AA2\u0338" },
359
+
"⪡̸": { "codepoints": [10913, 824], "characters": "\u2AA1\u0338" },
360
+
"⊀": { "codepoints": [8832], "characters": "\u2280" },
361
+
"⪯̸": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" },
362
+
"⋠": { "codepoints": [8928], "characters": "\u22E0" },
363
+
"∌": { "codepoints": [8716], "characters": "\u220C" },
364
+
"⋫": { "codepoints": [8939], "characters": "\u22EB" },
365
+
"⧐̸": { "codepoints": [10704, 824], "characters": "\u29D0\u0338" },
366
+
"⋭": { "codepoints": [8941], "characters": "\u22ED" },
367
+
"⊏̸": { "codepoints": [8847, 824], "characters": "\u228F\u0338" },
368
+
"⋢": { "codepoints": [8930], "characters": "\u22E2" },
369
+
"⊐̸": { "codepoints": [8848, 824], "characters": "\u2290\u0338" },
370
+
"⋣": { "codepoints": [8931], "characters": "\u22E3" },
371
+
"⊂⃒": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" },
372
+
"⊈": { "codepoints": [8840], "characters": "\u2288" },
373
+
"⊁": { "codepoints": [8833], "characters": "\u2281" },
374
+
"⪰̸": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" },
375
+
"⋡": { "codepoints": [8929], "characters": "\u22E1" },
376
+
"≿̸": { "codepoints": [8831, 824], "characters": "\u227F\u0338" },
377
+
"⊃⃒": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" },
378
+
"⊉": { "codepoints": [8841], "characters": "\u2289" },
379
+
"≁": { "codepoints": [8769], "characters": "\u2241" },
380
+
"≄": { "codepoints": [8772], "characters": "\u2244" },
381
+
"≇": { "codepoints": [8775], "characters": "\u2247" },
382
+
"≉": { "codepoints": [8777], "characters": "\u2249" },
383
+
"∤": { "codepoints": [8740], "characters": "\u2224" },
384
+
"𝒩": { "codepoints": [119977], "characters": "\uD835\uDCA9" },
385
+
"Ñ": { "codepoints": [209], "characters": "\u00D1" },
386
+
"Ñ": { "codepoints": [209], "characters": "\u00D1" },
387
+
"Ν": { "codepoints": [925], "characters": "\u039D" },
388
+
"Œ": { "codepoints": [338], "characters": "\u0152" },
389
+
"Ó": { "codepoints": [211], "characters": "\u00D3" },
390
+
"Ó": { "codepoints": [211], "characters": "\u00D3" },
391
+
"Ô": { "codepoints": [212], "characters": "\u00D4" },
392
+
"Ô": { "codepoints": [212], "characters": "\u00D4" },
393
+
"О": { "codepoints": [1054], "characters": "\u041E" },
394
+
"Ő": { "codepoints": [336], "characters": "\u0150" },
395
+
"𝔒": { "codepoints": [120082], "characters": "\uD835\uDD12" },
396
+
"Ò": { "codepoints": [210], "characters": "\u00D2" },
397
+
"Ò": { "codepoints": [210], "characters": "\u00D2" },
398
+
"Ō": { "codepoints": [332], "characters": "\u014C" },
399
+
"Ω": { "codepoints": [937], "characters": "\u03A9" },
400
+
"Ο": { "codepoints": [927], "characters": "\u039F" },
401
+
"𝕆": { "codepoints": [120134], "characters": "\uD835\uDD46" },
402
+
"“": { "codepoints": [8220], "characters": "\u201C" },
403
+
"‘": { "codepoints": [8216], "characters": "\u2018" },
404
+
"⩔": { "codepoints": [10836], "characters": "\u2A54" },
405
+
"𝒪": { "codepoints": [119978], "characters": "\uD835\uDCAA" },
406
+
"Ø": { "codepoints": [216], "characters": "\u00D8" },
407
+
"Ø": { "codepoints": [216], "characters": "\u00D8" },
408
+
"Õ": { "codepoints": [213], "characters": "\u00D5" },
409
+
"Õ": { "codepoints": [213], "characters": "\u00D5" },
410
+
"⨷": { "codepoints": [10807], "characters": "\u2A37" },
411
+
"Ö": { "codepoints": [214], "characters": "\u00D6" },
412
+
"Ö": { "codepoints": [214], "characters": "\u00D6" },
413
+
"‾": { "codepoints": [8254], "characters": "\u203E" },
414
+
"⏞": { "codepoints": [9182], "characters": "\u23DE" },
415
+
"⎴": { "codepoints": [9140], "characters": "\u23B4" },
416
+
"⏜": { "codepoints": [9180], "characters": "\u23DC" },
417
+
"∂": { "codepoints": [8706], "characters": "\u2202" },
418
+
"П": { "codepoints": [1055], "characters": "\u041F" },
419
+
"𝔓": { "codepoints": [120083], "characters": "\uD835\uDD13" },
420
+
"Φ": { "codepoints": [934], "characters": "\u03A6" },
421
+
"Π": { "codepoints": [928], "characters": "\u03A0" },
422
+
"±": { "codepoints": [177], "characters": "\u00B1" },
423
+
"ℌ": { "codepoints": [8460], "characters": "\u210C" },
424
+
"ℙ": { "codepoints": [8473], "characters": "\u2119" },
425
+
"⪻": { "codepoints": [10939], "characters": "\u2ABB" },
426
+
"≺": { "codepoints": [8826], "characters": "\u227A" },
427
+
"⪯": { "codepoints": [10927], "characters": "\u2AAF" },
428
+
"≼": { "codepoints": [8828], "characters": "\u227C" },
429
+
"≾": { "codepoints": [8830], "characters": "\u227E" },
430
+
"″": { "codepoints": [8243], "characters": "\u2033" },
431
+
"∏": { "codepoints": [8719], "characters": "\u220F" },
432
+
"∷": { "codepoints": [8759], "characters": "\u2237" },
433
+
"∝": { "codepoints": [8733], "characters": "\u221D" },
434
+
"𝒫": { "codepoints": [119979], "characters": "\uD835\uDCAB" },
435
+
"Ψ": { "codepoints": [936], "characters": "\u03A8" },
436
+
""": { "codepoints": [34], "characters": "\u0022" },
437
+
""": { "codepoints": [34], "characters": "\u0022" },
438
+
"𝔔": { "codepoints": [120084], "characters": "\uD835\uDD14" },
439
+
"ℚ": { "codepoints": [8474], "characters": "\u211A" },
440
+
"𝒬": { "codepoints": [119980], "characters": "\uD835\uDCAC" },
441
+
"⤐": { "codepoints": [10512], "characters": "\u2910" },
442
+
"®": { "codepoints": [174], "characters": "\u00AE" },
443
+
"®": { "codepoints": [174], "characters": "\u00AE" },
444
+
"Ŕ": { "codepoints": [340], "characters": "\u0154" },
445
+
"⟫": { "codepoints": [10219], "characters": "\u27EB" },
446
+
"↠": { "codepoints": [8608], "characters": "\u21A0" },
447
+
"⤖": { "codepoints": [10518], "characters": "\u2916" },
448
+
"Ř": { "codepoints": [344], "characters": "\u0158" },
449
+
"Ŗ": { "codepoints": [342], "characters": "\u0156" },
450
+
"Р": { "codepoints": [1056], "characters": "\u0420" },
451
+
"ℜ": { "codepoints": [8476], "characters": "\u211C" },
452
+
"∋": { "codepoints": [8715], "characters": "\u220B" },
453
+
"⇋": { "codepoints": [8651], "characters": "\u21CB" },
454
+
"⥯": { "codepoints": [10607], "characters": "\u296F" },
455
+
"ℜ": { "codepoints": [8476], "characters": "\u211C" },
456
+
"Ρ": { "codepoints": [929], "characters": "\u03A1" },
457
+
"⟩": { "codepoints": [10217], "characters": "\u27E9" },
458
+
"→": { "codepoints": [8594], "characters": "\u2192" },
459
+
"⇥": { "codepoints": [8677], "characters": "\u21E5" },
460
+
"⇄": { "codepoints": [8644], "characters": "\u21C4" },
461
+
"⌉": { "codepoints": [8969], "characters": "\u2309" },
462
+
"⟧": { "codepoints": [10215], "characters": "\u27E7" },
463
+
"⥝": { "codepoints": [10589], "characters": "\u295D" },
464
+
"⇂": { "codepoints": [8642], "characters": "\u21C2" },
465
+
"⥕": { "codepoints": [10581], "characters": "\u2955" },
466
+
"⌋": { "codepoints": [8971], "characters": "\u230B" },
467
+
"⊢": { "codepoints": [8866], "characters": "\u22A2" },
468
+
"↦": { "codepoints": [8614], "characters": "\u21A6" },
469
+
"⥛": { "codepoints": [10587], "characters": "\u295B" },
470
+
"⊳": { "codepoints": [8883], "characters": "\u22B3" },
471
+
"⧐": { "codepoints": [10704], "characters": "\u29D0" },
472
+
"⊵": { "codepoints": [8885], "characters": "\u22B5" },
473
+
"⥏": { "codepoints": [10575], "characters": "\u294F" },
474
+
"⥜": { "codepoints": [10588], "characters": "\u295C" },
475
+
"↾": { "codepoints": [8638], "characters": "\u21BE" },
476
+
"⥔": { "codepoints": [10580], "characters": "\u2954" },
477
+
"⇀": { "codepoints": [8640], "characters": "\u21C0" },
478
+
"⥓": { "codepoints": [10579], "characters": "\u2953" },
479
+
"⇒": { "codepoints": [8658], "characters": "\u21D2" },
480
+
"ℝ": { "codepoints": [8477], "characters": "\u211D" },
481
+
"⥰": { "codepoints": [10608], "characters": "\u2970" },
482
+
"⇛": { "codepoints": [8667], "characters": "\u21DB" },
483
+
"ℛ": { "codepoints": [8475], "characters": "\u211B" },
484
+
"↱": { "codepoints": [8625], "characters": "\u21B1" },
485
+
"⧴": { "codepoints": [10740], "characters": "\u29F4" },
486
+
"Щ": { "codepoints": [1065], "characters": "\u0429" },
487
+
"Ш": { "codepoints": [1064], "characters": "\u0428" },
488
+
"Ь": { "codepoints": [1068], "characters": "\u042C" },
489
+
"Ś": { "codepoints": [346], "characters": "\u015A" },
490
+
"⪼": { "codepoints": [10940], "characters": "\u2ABC" },
491
+
"Š": { "codepoints": [352], "characters": "\u0160" },
492
+
"Ş": { "codepoints": [350], "characters": "\u015E" },
493
+
"Ŝ": { "codepoints": [348], "characters": "\u015C" },
494
+
"С": { "codepoints": [1057], "characters": "\u0421" },
495
+
"𝔖": { "codepoints": [120086], "characters": "\uD835\uDD16" },
496
+
"↓": { "codepoints": [8595], "characters": "\u2193" },
497
+
"←": { "codepoints": [8592], "characters": "\u2190" },
498
+
"→": { "codepoints": [8594], "characters": "\u2192" },
499
+
"↑": { "codepoints": [8593], "characters": "\u2191" },
500
+
"Σ": { "codepoints": [931], "characters": "\u03A3" },
501
+
"∘": { "codepoints": [8728], "characters": "\u2218" },
502
+
"𝕊": { "codepoints": [120138], "characters": "\uD835\uDD4A" },
503
+
"√": { "codepoints": [8730], "characters": "\u221A" },
504
+
"□": { "codepoints": [9633], "characters": "\u25A1" },
505
+
"⊓": { "codepoints": [8851], "characters": "\u2293" },
506
+
"⊏": { "codepoints": [8847], "characters": "\u228F" },
507
+
"⊑": { "codepoints": [8849], "characters": "\u2291" },
508
+
"⊐": { "codepoints": [8848], "characters": "\u2290" },
509
+
"⊒": { "codepoints": [8850], "characters": "\u2292" },
510
+
"⊔": { "codepoints": [8852], "characters": "\u2294" },
511
+
"𝒮": { "codepoints": [119982], "characters": "\uD835\uDCAE" },
512
+
"⋆": { "codepoints": [8902], "characters": "\u22C6" },
513
+
"⋐": { "codepoints": [8912], "characters": "\u22D0" },
514
+
"⋐": { "codepoints": [8912], "characters": "\u22D0" },
515
+
"⊆": { "codepoints": [8838], "characters": "\u2286" },
516
+
"≻": { "codepoints": [8827], "characters": "\u227B" },
517
+
"⪰": { "codepoints": [10928], "characters": "\u2AB0" },
518
+
"≽": { "codepoints": [8829], "characters": "\u227D" },
519
+
"≿": { "codepoints": [8831], "characters": "\u227F" },
520
+
"∋": { "codepoints": [8715], "characters": "\u220B" },
521
+
"∑": { "codepoints": [8721], "characters": "\u2211" },
522
+
"⋑": { "codepoints": [8913], "characters": "\u22D1" },
523
+
"⊃": { "codepoints": [8835], "characters": "\u2283" },
524
+
"⊇": { "codepoints": [8839], "characters": "\u2287" },
525
+
"⋑": { "codepoints": [8913], "characters": "\u22D1" },
526
+
"Þ": { "codepoints": [222], "characters": "\u00DE" },
527
+
"Þ": { "codepoints": [222], "characters": "\u00DE" },
528
+
"™": { "codepoints": [8482], "characters": "\u2122" },
529
+
"Ћ": { "codepoints": [1035], "characters": "\u040B" },
530
+
"Ц": { "codepoints": [1062], "characters": "\u0426" },
531
+
"	": { "codepoints": [9], "characters": "\u0009" },
532
+
"Τ": { "codepoints": [932], "characters": "\u03A4" },
533
+
"Ť": { "codepoints": [356], "characters": "\u0164" },
534
+
"Ţ": { "codepoints": [354], "characters": "\u0162" },
535
+
"Т": { "codepoints": [1058], "characters": "\u0422" },
536
+
"𝔗": { "codepoints": [120087], "characters": "\uD835\uDD17" },
537
+
"∴": { "codepoints": [8756], "characters": "\u2234" },
538
+
"Θ": { "codepoints": [920], "characters": "\u0398" },
539
+
"  ": { "codepoints": [8287, 8202], "characters": "\u205F\u200A" },
540
+
" ": { "codepoints": [8201], "characters": "\u2009" },
541
+
"∼": { "codepoints": [8764], "characters": "\u223C" },
542
+
"≃": { "codepoints": [8771], "characters": "\u2243" },
543
+
"≅": { "codepoints": [8773], "characters": "\u2245" },
544
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
545
+
"𝕋": { "codepoints": [120139], "characters": "\uD835\uDD4B" },
546
+
"⃛": { "codepoints": [8411], "characters": "\u20DB" },
547
+
"𝒯": { "codepoints": [119983], "characters": "\uD835\uDCAF" },
548
+
"Ŧ": { "codepoints": [358], "characters": "\u0166" },
549
+
"Ú": { "codepoints": [218], "characters": "\u00DA" },
550
+
"Ú": { "codepoints": [218], "characters": "\u00DA" },
551
+
"↟": { "codepoints": [8607], "characters": "\u219F" },
552
+
"⥉": { "codepoints": [10569], "characters": "\u2949" },
553
+
"Ў": { "codepoints": [1038], "characters": "\u040E" },
554
+
"Ŭ": { "codepoints": [364], "characters": "\u016C" },
555
+
"Û": { "codepoints": [219], "characters": "\u00DB" },
556
+
"Û": { "codepoints": [219], "characters": "\u00DB" },
557
+
"У": { "codepoints": [1059], "characters": "\u0423" },
558
+
"Ű": { "codepoints": [368], "characters": "\u0170" },
559
+
"𝔘": { "codepoints": [120088], "characters": "\uD835\uDD18" },
560
+
"Ù": { "codepoints": [217], "characters": "\u00D9" },
561
+
"Ù": { "codepoints": [217], "characters": "\u00D9" },
562
+
"Ū": { "codepoints": [362], "characters": "\u016A" },
563
+
"_": { "codepoints": [95], "characters": "\u005F" },
564
+
"⏟": { "codepoints": [9183], "characters": "\u23DF" },
565
+
"⎵": { "codepoints": [9141], "characters": "\u23B5" },
566
+
"⏝": { "codepoints": [9181], "characters": "\u23DD" },
567
+
"⋃": { "codepoints": [8899], "characters": "\u22C3" },
568
+
"⊎": { "codepoints": [8846], "characters": "\u228E" },
569
+
"Ų": { "codepoints": [370], "characters": "\u0172" },
570
+
"𝕌": { "codepoints": [120140], "characters": "\uD835\uDD4C" },
571
+
"↑": { "codepoints": [8593], "characters": "\u2191" },
572
+
"⤒": { "codepoints": [10514], "characters": "\u2912" },
573
+
"⇅": { "codepoints": [8645], "characters": "\u21C5" },
574
+
"↕": { "codepoints": [8597], "characters": "\u2195" },
575
+
"⥮": { "codepoints": [10606], "characters": "\u296E" },
576
+
"⊥": { "codepoints": [8869], "characters": "\u22A5" },
577
+
"↥": { "codepoints": [8613], "characters": "\u21A5" },
578
+
"⇑": { "codepoints": [8657], "characters": "\u21D1" },
579
+
"⇕": { "codepoints": [8661], "characters": "\u21D5" },
580
+
"↖": { "codepoints": [8598], "characters": "\u2196" },
581
+
"↗": { "codepoints": [8599], "characters": "\u2197" },
582
+
"ϒ": { "codepoints": [978], "characters": "\u03D2" },
583
+
"Υ": { "codepoints": [933], "characters": "\u03A5" },
584
+
"Ů": { "codepoints": [366], "characters": "\u016E" },
585
+
"𝒰": { "codepoints": [119984], "characters": "\uD835\uDCB0" },
586
+
"Ũ": { "codepoints": [360], "characters": "\u0168" },
587
+
"Ü": { "codepoints": [220], "characters": "\u00DC" },
588
+
"Ü": { "codepoints": [220], "characters": "\u00DC" },
589
+
"⊫": { "codepoints": [8875], "characters": "\u22AB" },
590
+
"⫫": { "codepoints": [10987], "characters": "\u2AEB" },
591
+
"В": { "codepoints": [1042], "characters": "\u0412" },
592
+
"⊩": { "codepoints": [8873], "characters": "\u22A9" },
593
+
"⫦": { "codepoints": [10982], "characters": "\u2AE6" },
594
+
"⋁": { "codepoints": [8897], "characters": "\u22C1" },
595
+
"‖": { "codepoints": [8214], "characters": "\u2016" },
596
+
"‖": { "codepoints": [8214], "characters": "\u2016" },
597
+
"∣": { "codepoints": [8739], "characters": "\u2223" },
598
+
"|": { "codepoints": [124], "characters": "\u007C" },
599
+
"❘": { "codepoints": [10072], "characters": "\u2758" },
600
+
"≀": { "codepoints": [8768], "characters": "\u2240" },
601
+
" ": { "codepoints": [8202], "characters": "\u200A" },
602
+
"𝔙": { "codepoints": [120089], "characters": "\uD835\uDD19" },
603
+
"𝕍": { "codepoints": [120141], "characters": "\uD835\uDD4D" },
604
+
"𝒱": { "codepoints": [119985], "characters": "\uD835\uDCB1" },
605
+
"⊪": { "codepoints": [8874], "characters": "\u22AA" },
606
+
"Ŵ": { "codepoints": [372], "characters": "\u0174" },
607
+
"⋀": { "codepoints": [8896], "characters": "\u22C0" },
608
+
"𝔚": { "codepoints": [120090], "characters": "\uD835\uDD1A" },
609
+
"𝕎": { "codepoints": [120142], "characters": "\uD835\uDD4E" },
610
+
"𝒲": { "codepoints": [119986], "characters": "\uD835\uDCB2" },
611
+
"𝔛": { "codepoints": [120091], "characters": "\uD835\uDD1B" },
612
+
"Ξ": { "codepoints": [926], "characters": "\u039E" },
613
+
"𝕏": { "codepoints": [120143], "characters": "\uD835\uDD4F" },
614
+
"𝒳": { "codepoints": [119987], "characters": "\uD835\uDCB3" },
615
+
"Я": { "codepoints": [1071], "characters": "\u042F" },
616
+
"Ї": { "codepoints": [1031], "characters": "\u0407" },
617
+
"Ю": { "codepoints": [1070], "characters": "\u042E" },
618
+
"Ý": { "codepoints": [221], "characters": "\u00DD" },
619
+
"Ý": { "codepoints": [221], "characters": "\u00DD" },
620
+
"Ŷ": { "codepoints": [374], "characters": "\u0176" },
621
+
"Ы": { "codepoints": [1067], "characters": "\u042B" },
622
+
"𝔜": { "codepoints": [120092], "characters": "\uD835\uDD1C" },
623
+
"𝕐": { "codepoints": [120144], "characters": "\uD835\uDD50" },
624
+
"𝒴": { "codepoints": [119988], "characters": "\uD835\uDCB4" },
625
+
"Ÿ": { "codepoints": [376], "characters": "\u0178" },
626
+
"Ж": { "codepoints": [1046], "characters": "\u0416" },
627
+
"Ź": { "codepoints": [377], "characters": "\u0179" },
628
+
"Ž": { "codepoints": [381], "characters": "\u017D" },
629
+
"З": { "codepoints": [1047], "characters": "\u0417" },
630
+
"Ż": { "codepoints": [379], "characters": "\u017B" },
631
+
"​": { "codepoints": [8203], "characters": "\u200B" },
632
+
"Ζ": { "codepoints": [918], "characters": "\u0396" },
633
+
"ℨ": { "codepoints": [8488], "characters": "\u2128" },
634
+
"ℤ": { "codepoints": [8484], "characters": "\u2124" },
635
+
"𝒵": { "codepoints": [119989], "characters": "\uD835\uDCB5" },
636
+
"á": { "codepoints": [225], "characters": "\u00E1" },
637
+
"á": { "codepoints": [225], "characters": "\u00E1" },
638
+
"ă": { "codepoints": [259], "characters": "\u0103" },
639
+
"∾": { "codepoints": [8766], "characters": "\u223E" },
640
+
"∾̳": { "codepoints": [8766, 819], "characters": "\u223E\u0333" },
641
+
"∿": { "codepoints": [8767], "characters": "\u223F" },
642
+
"â": { "codepoints": [226], "characters": "\u00E2" },
643
+
"â": { "codepoints": [226], "characters": "\u00E2" },
644
+
"´": { "codepoints": [180], "characters": "\u00B4" },
645
+
"´": { "codepoints": [180], "characters": "\u00B4" },
646
+
"а": { "codepoints": [1072], "characters": "\u0430" },
647
+
"æ": { "codepoints": [230], "characters": "\u00E6" },
648
+
"æ": { "codepoints": [230], "characters": "\u00E6" },
649
+
"⁡": { "codepoints": [8289], "characters": "\u2061" },
650
+
"𝔞": { "codepoints": [120094], "characters": "\uD835\uDD1E" },
651
+
"à": { "codepoints": [224], "characters": "\u00E0" },
652
+
"à": { "codepoints": [224], "characters": "\u00E0" },
653
+
"ℵ": { "codepoints": [8501], "characters": "\u2135" },
654
+
"ℵ": { "codepoints": [8501], "characters": "\u2135" },
655
+
"α": { "codepoints": [945], "characters": "\u03B1" },
656
+
"ā": { "codepoints": [257], "characters": "\u0101" },
657
+
"⨿": { "codepoints": [10815], "characters": "\u2A3F" },
658
+
"&": { "codepoints": [38], "characters": "\u0026" },
659
+
"&": { "codepoints": [38], "characters": "\u0026" },
660
+
"∧": { "codepoints": [8743], "characters": "\u2227" },
661
+
"⩕": { "codepoints": [10837], "characters": "\u2A55" },
662
+
"⩜": { "codepoints": [10844], "characters": "\u2A5C" },
663
+
"⩘": { "codepoints": [10840], "characters": "\u2A58" },
664
+
"⩚": { "codepoints": [10842], "characters": "\u2A5A" },
665
+
"∠": { "codepoints": [8736], "characters": "\u2220" },
666
+
"⦤": { "codepoints": [10660], "characters": "\u29A4" },
667
+
"∠": { "codepoints": [8736], "characters": "\u2220" },
668
+
"∡": { "codepoints": [8737], "characters": "\u2221" },
669
+
"⦨": { "codepoints": [10664], "characters": "\u29A8" },
670
+
"⦩": { "codepoints": [10665], "characters": "\u29A9" },
671
+
"⦪": { "codepoints": [10666], "characters": "\u29AA" },
672
+
"⦫": { "codepoints": [10667], "characters": "\u29AB" },
673
+
"⦬": { "codepoints": [10668], "characters": "\u29AC" },
674
+
"⦭": { "codepoints": [10669], "characters": "\u29AD" },
675
+
"⦮": { "codepoints": [10670], "characters": "\u29AE" },
676
+
"⦯": { "codepoints": [10671], "characters": "\u29AF" },
677
+
"∟": { "codepoints": [8735], "characters": "\u221F" },
678
+
"⊾": { "codepoints": [8894], "characters": "\u22BE" },
679
+
"⦝": { "codepoints": [10653], "characters": "\u299D" },
680
+
"∢": { "codepoints": [8738], "characters": "\u2222" },
681
+
"Å": { "codepoints": [197], "characters": "\u00C5" },
682
+
"⍼": { "codepoints": [9084], "characters": "\u237C" },
683
+
"ą": { "codepoints": [261], "characters": "\u0105" },
684
+
"𝕒": { "codepoints": [120146], "characters": "\uD835\uDD52" },
685
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
686
+
"⩰": { "codepoints": [10864], "characters": "\u2A70" },
687
+
"⩯": { "codepoints": [10863], "characters": "\u2A6F" },
688
+
"≊": { "codepoints": [8778], "characters": "\u224A" },
689
+
"≋": { "codepoints": [8779], "characters": "\u224B" },
690
+
"'": { "codepoints": [39], "characters": "\u0027" },
691
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
692
+
"≊": { "codepoints": [8778], "characters": "\u224A" },
693
+
"å": { "codepoints": [229], "characters": "\u00E5" },
694
+
"å": { "codepoints": [229], "characters": "\u00E5" },
695
+
"𝒶": { "codepoints": [119990], "characters": "\uD835\uDCB6" },
696
+
"*": { "codepoints": [42], "characters": "\u002A" },
697
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
698
+
"≍": { "codepoints": [8781], "characters": "\u224D" },
699
+
"ã": { "codepoints": [227], "characters": "\u00E3" },
700
+
"ã": { "codepoints": [227], "characters": "\u00E3" },
701
+
"ä": { "codepoints": [228], "characters": "\u00E4" },
702
+
"ä": { "codepoints": [228], "characters": "\u00E4" },
703
+
"∳": { "codepoints": [8755], "characters": "\u2233" },
704
+
"⨑": { "codepoints": [10769], "characters": "\u2A11" },
705
+
"⫭": { "codepoints": [10989], "characters": "\u2AED" },
706
+
"≌": { "codepoints": [8780], "characters": "\u224C" },
707
+
"϶": { "codepoints": [1014], "characters": "\u03F6" },
708
+
"‵": { "codepoints": [8245], "characters": "\u2035" },
709
+
"∽": { "codepoints": [8765], "characters": "\u223D" },
710
+
"⋍": { "codepoints": [8909], "characters": "\u22CD" },
711
+
"⊽": { "codepoints": [8893], "characters": "\u22BD" },
712
+
"⌅": { "codepoints": [8965], "characters": "\u2305" },
713
+
"⌅": { "codepoints": [8965], "characters": "\u2305" },
714
+
"⎵": { "codepoints": [9141], "characters": "\u23B5" },
715
+
"⎶": { "codepoints": [9142], "characters": "\u23B6" },
716
+
"≌": { "codepoints": [8780], "characters": "\u224C" },
717
+
"б": { "codepoints": [1073], "characters": "\u0431" },
718
+
"„": { "codepoints": [8222], "characters": "\u201E" },
719
+
"∵": { "codepoints": [8757], "characters": "\u2235" },
720
+
"∵": { "codepoints": [8757], "characters": "\u2235" },
721
+
"⦰": { "codepoints": [10672], "characters": "\u29B0" },
722
+
"϶": { "codepoints": [1014], "characters": "\u03F6" },
723
+
"ℬ": { "codepoints": [8492], "characters": "\u212C" },
724
+
"β": { "codepoints": [946], "characters": "\u03B2" },
725
+
"ℶ": { "codepoints": [8502], "characters": "\u2136" },
726
+
"≬": { "codepoints": [8812], "characters": "\u226C" },
727
+
"𝔟": { "codepoints": [120095], "characters": "\uD835\uDD1F" },
728
+
"⋂": { "codepoints": [8898], "characters": "\u22C2" },
729
+
"◯": { "codepoints": [9711], "characters": "\u25EF" },
730
+
"⋃": { "codepoints": [8899], "characters": "\u22C3" },
731
+
"⨀": { "codepoints": [10752], "characters": "\u2A00" },
732
+
"⨁": { "codepoints": [10753], "characters": "\u2A01" },
733
+
"⨂": { "codepoints": [10754], "characters": "\u2A02" },
734
+
"⨆": { "codepoints": [10758], "characters": "\u2A06" },
735
+
"★": { "codepoints": [9733], "characters": "\u2605" },
736
+
"▽": { "codepoints": [9661], "characters": "\u25BD" },
737
+
"△": { "codepoints": [9651], "characters": "\u25B3" },
738
+
"⨄": { "codepoints": [10756], "characters": "\u2A04" },
739
+
"⋁": { "codepoints": [8897], "characters": "\u22C1" },
740
+
"⋀": { "codepoints": [8896], "characters": "\u22C0" },
741
+
"⤍": { "codepoints": [10509], "characters": "\u290D" },
742
+
"⧫": { "codepoints": [10731], "characters": "\u29EB" },
743
+
"▪": { "codepoints": [9642], "characters": "\u25AA" },
744
+
"▴": { "codepoints": [9652], "characters": "\u25B4" },
745
+
"▾": { "codepoints": [9662], "characters": "\u25BE" },
746
+
"◂": { "codepoints": [9666], "characters": "\u25C2" },
747
+
"▸": { "codepoints": [9656], "characters": "\u25B8" },
748
+
"␣": { "codepoints": [9251], "characters": "\u2423" },
749
+
"▒": { "codepoints": [9618], "characters": "\u2592" },
750
+
"░": { "codepoints": [9617], "characters": "\u2591" },
751
+
"▓": { "codepoints": [9619], "characters": "\u2593" },
752
+
"█": { "codepoints": [9608], "characters": "\u2588" },
753
+
"=⃥": { "codepoints": [61, 8421], "characters": "\u003D\u20E5" },
754
+
"≡⃥": { "codepoints": [8801, 8421], "characters": "\u2261\u20E5" },
755
+
"⌐": { "codepoints": [8976], "characters": "\u2310" },
756
+
"𝕓": { "codepoints": [120147], "characters": "\uD835\uDD53" },
757
+
"⊥": { "codepoints": [8869], "characters": "\u22A5" },
758
+
"⊥": { "codepoints": [8869], "characters": "\u22A5" },
759
+
"⋈": { "codepoints": [8904], "characters": "\u22C8" },
760
+
"╗": { "codepoints": [9559], "characters": "\u2557" },
761
+
"╔": { "codepoints": [9556], "characters": "\u2554" },
762
+
"╖": { "codepoints": [9558], "characters": "\u2556" },
763
+
"╓": { "codepoints": [9555], "characters": "\u2553" },
764
+
"═": { "codepoints": [9552], "characters": "\u2550" },
765
+
"╦": { "codepoints": [9574], "characters": "\u2566" },
766
+
"╩": { "codepoints": [9577], "characters": "\u2569" },
767
+
"╤": { "codepoints": [9572], "characters": "\u2564" },
768
+
"╧": { "codepoints": [9575], "characters": "\u2567" },
769
+
"╝": { "codepoints": [9565], "characters": "\u255D" },
770
+
"╚": { "codepoints": [9562], "characters": "\u255A" },
771
+
"╜": { "codepoints": [9564], "characters": "\u255C" },
772
+
"╙": { "codepoints": [9561], "characters": "\u2559" },
773
+
"║": { "codepoints": [9553], "characters": "\u2551" },
774
+
"╬": { "codepoints": [9580], "characters": "\u256C" },
775
+
"╣": { "codepoints": [9571], "characters": "\u2563" },
776
+
"╠": { "codepoints": [9568], "characters": "\u2560" },
777
+
"╫": { "codepoints": [9579], "characters": "\u256B" },
778
+
"╢": { "codepoints": [9570], "characters": "\u2562" },
779
+
"╟": { "codepoints": [9567], "characters": "\u255F" },
780
+
"⧉": { "codepoints": [10697], "characters": "\u29C9" },
781
+
"╕": { "codepoints": [9557], "characters": "\u2555" },
782
+
"╒": { "codepoints": [9554], "characters": "\u2552" },
783
+
"┐": { "codepoints": [9488], "characters": "\u2510" },
784
+
"┌": { "codepoints": [9484], "characters": "\u250C" },
785
+
"─": { "codepoints": [9472], "characters": "\u2500" },
786
+
"╥": { "codepoints": [9573], "characters": "\u2565" },
787
+
"╨": { "codepoints": [9576], "characters": "\u2568" },
788
+
"┬": { "codepoints": [9516], "characters": "\u252C" },
789
+
"┴": { "codepoints": [9524], "characters": "\u2534" },
790
+
"⊟": { "codepoints": [8863], "characters": "\u229F" },
791
+
"⊞": { "codepoints": [8862], "characters": "\u229E" },
792
+
"⊠": { "codepoints": [8864], "characters": "\u22A0" },
793
+
"╛": { "codepoints": [9563], "characters": "\u255B" },
794
+
"╘": { "codepoints": [9560], "characters": "\u2558" },
795
+
"┘": { "codepoints": [9496], "characters": "\u2518" },
796
+
"└": { "codepoints": [9492], "characters": "\u2514" },
797
+
"│": { "codepoints": [9474], "characters": "\u2502" },
798
+
"╪": { "codepoints": [9578], "characters": "\u256A" },
799
+
"╡": { "codepoints": [9569], "characters": "\u2561" },
800
+
"╞": { "codepoints": [9566], "characters": "\u255E" },
801
+
"┼": { "codepoints": [9532], "characters": "\u253C" },
802
+
"┤": { "codepoints": [9508], "characters": "\u2524" },
803
+
"├": { "codepoints": [9500], "characters": "\u251C" },
804
+
"‵": { "codepoints": [8245], "characters": "\u2035" },
805
+
"˘": { "codepoints": [728], "characters": "\u02D8" },
806
+
"¦": { "codepoints": [166], "characters": "\u00A6" },
807
+
"¦": { "codepoints": [166], "characters": "\u00A6" },
808
+
"𝒷": { "codepoints": [119991], "characters": "\uD835\uDCB7" },
809
+
"⁏": { "codepoints": [8271], "characters": "\u204F" },
810
+
"∽": { "codepoints": [8765], "characters": "\u223D" },
811
+
"⋍": { "codepoints": [8909], "characters": "\u22CD" },
812
+
"\": { "codepoints": [92], "characters": "\u005C" },
813
+
"⧅": { "codepoints": [10693], "characters": "\u29C5" },
814
+
"⟈": { "codepoints": [10184], "characters": "\u27C8" },
815
+
"•": { "codepoints": [8226], "characters": "\u2022" },
816
+
"•": { "codepoints": [8226], "characters": "\u2022" },
817
+
"≎": { "codepoints": [8782], "characters": "\u224E" },
818
+
"⪮": { "codepoints": [10926], "characters": "\u2AAE" },
819
+
"≏": { "codepoints": [8783], "characters": "\u224F" },
820
+
"≏": { "codepoints": [8783], "characters": "\u224F" },
821
+
"ć": { "codepoints": [263], "characters": "\u0107" },
822
+
"∩": { "codepoints": [8745], "characters": "\u2229" },
823
+
"⩄": { "codepoints": [10820], "characters": "\u2A44" },
824
+
"⩉": { "codepoints": [10825], "characters": "\u2A49" },
825
+
"⩋": { "codepoints": [10827], "characters": "\u2A4B" },
826
+
"⩇": { "codepoints": [10823], "characters": "\u2A47" },
827
+
"⩀": { "codepoints": [10816], "characters": "\u2A40" },
828
+
"∩︀": { "codepoints": [8745, 65024], "characters": "\u2229\uFE00" },
829
+
"⁁": { "codepoints": [8257], "characters": "\u2041" },
830
+
"ˇ": { "codepoints": [711], "characters": "\u02C7" },
831
+
"⩍": { "codepoints": [10829], "characters": "\u2A4D" },
832
+
"č": { "codepoints": [269], "characters": "\u010D" },
833
+
"ç": { "codepoints": [231], "characters": "\u00E7" },
834
+
"ç": { "codepoints": [231], "characters": "\u00E7" },
835
+
"ĉ": { "codepoints": [265], "characters": "\u0109" },
836
+
"⩌": { "codepoints": [10828], "characters": "\u2A4C" },
837
+
"⩐": { "codepoints": [10832], "characters": "\u2A50" },
838
+
"ċ": { "codepoints": [267], "characters": "\u010B" },
839
+
"¸": { "codepoints": [184], "characters": "\u00B8" },
840
+
"¸": { "codepoints": [184], "characters": "\u00B8" },
841
+
"⦲": { "codepoints": [10674], "characters": "\u29B2" },
842
+
"¢": { "codepoints": [162], "characters": "\u00A2" },
843
+
"¢": { "codepoints": [162], "characters": "\u00A2" },
844
+
"·": { "codepoints": [183], "characters": "\u00B7" },
845
+
"𝔠": { "codepoints": [120096], "characters": "\uD835\uDD20" },
846
+
"ч": { "codepoints": [1095], "characters": "\u0447" },
847
+
"✓": { "codepoints": [10003], "characters": "\u2713" },
848
+
"✓": { "codepoints": [10003], "characters": "\u2713" },
849
+
"χ": { "codepoints": [967], "characters": "\u03C7" },
850
+
"○": { "codepoints": [9675], "characters": "\u25CB" },
851
+
"⧃": { "codepoints": [10691], "characters": "\u29C3" },
852
+
"ˆ": { "codepoints": [710], "characters": "\u02C6" },
853
+
"≗": { "codepoints": [8791], "characters": "\u2257" },
854
+
"↺": { "codepoints": [8634], "characters": "\u21BA" },
855
+
"↻": { "codepoints": [8635], "characters": "\u21BB" },
856
+
"®": { "codepoints": [174], "characters": "\u00AE" },
857
+
"Ⓢ": { "codepoints": [9416], "characters": "\u24C8" },
858
+
"⊛": { "codepoints": [8859], "characters": "\u229B" },
859
+
"⊚": { "codepoints": [8858], "characters": "\u229A" },
860
+
"⊝": { "codepoints": [8861], "characters": "\u229D" },
861
+
"≗": { "codepoints": [8791], "characters": "\u2257" },
862
+
"⨐": { "codepoints": [10768], "characters": "\u2A10" },
863
+
"⫯": { "codepoints": [10991], "characters": "\u2AEF" },
864
+
"⧂": { "codepoints": [10690], "characters": "\u29C2" },
865
+
"♣": { "codepoints": [9827], "characters": "\u2663" },
866
+
"♣": { "codepoints": [9827], "characters": "\u2663" },
867
+
":": { "codepoints": [58], "characters": "\u003A" },
868
+
"≔": { "codepoints": [8788], "characters": "\u2254" },
869
+
"≔": { "codepoints": [8788], "characters": "\u2254" },
870
+
",": { "codepoints": [44], "characters": "\u002C" },
871
+
"@": { "codepoints": [64], "characters": "\u0040" },
872
+
"∁": { "codepoints": [8705], "characters": "\u2201" },
873
+
"∘": { "codepoints": [8728], "characters": "\u2218" },
874
+
"∁": { "codepoints": [8705], "characters": "\u2201" },
875
+
"ℂ": { "codepoints": [8450], "characters": "\u2102" },
876
+
"≅": { "codepoints": [8773], "characters": "\u2245" },
877
+
"⩭": { "codepoints": [10861], "characters": "\u2A6D" },
878
+
"∮": { "codepoints": [8750], "characters": "\u222E" },
879
+
"𝕔": { "codepoints": [120148], "characters": "\uD835\uDD54" },
880
+
"∐": { "codepoints": [8720], "characters": "\u2210" },
881
+
"©": { "codepoints": [169], "characters": "\u00A9" },
882
+
"©": { "codepoints": [169], "characters": "\u00A9" },
883
+
"℗": { "codepoints": [8471], "characters": "\u2117" },
884
+
"↵": { "codepoints": [8629], "characters": "\u21B5" },
885
+
"✗": { "codepoints": [10007], "characters": "\u2717" },
886
+
"𝒸": { "codepoints": [119992], "characters": "\uD835\uDCB8" },
887
+
"⫏": { "codepoints": [10959], "characters": "\u2ACF" },
888
+
"⫑": { "codepoints": [10961], "characters": "\u2AD1" },
889
+
"⫐": { "codepoints": [10960], "characters": "\u2AD0" },
890
+
"⫒": { "codepoints": [10962], "characters": "\u2AD2" },
891
+
"⋯": { "codepoints": [8943], "characters": "\u22EF" },
892
+
"⤸": { "codepoints": [10552], "characters": "\u2938" },
893
+
"⤵": { "codepoints": [10549], "characters": "\u2935" },
894
+
"⋞": { "codepoints": [8926], "characters": "\u22DE" },
895
+
"⋟": { "codepoints": [8927], "characters": "\u22DF" },
896
+
"↶": { "codepoints": [8630], "characters": "\u21B6" },
897
+
"⤽": { "codepoints": [10557], "characters": "\u293D" },
898
+
"∪": { "codepoints": [8746], "characters": "\u222A" },
899
+
"⩈": { "codepoints": [10824], "characters": "\u2A48" },
900
+
"⩆": { "codepoints": [10822], "characters": "\u2A46" },
901
+
"⩊": { "codepoints": [10826], "characters": "\u2A4A" },
902
+
"⊍": { "codepoints": [8845], "characters": "\u228D" },
903
+
"⩅": { "codepoints": [10821], "characters": "\u2A45" },
904
+
"∪︀": { "codepoints": [8746, 65024], "characters": "\u222A\uFE00" },
905
+
"↷": { "codepoints": [8631], "characters": "\u21B7" },
906
+
"⤼": { "codepoints": [10556], "characters": "\u293C" },
907
+
"⋞": { "codepoints": [8926], "characters": "\u22DE" },
908
+
"⋟": { "codepoints": [8927], "characters": "\u22DF" },
909
+
"⋎": { "codepoints": [8910], "characters": "\u22CE" },
910
+
"⋏": { "codepoints": [8911], "characters": "\u22CF" },
911
+
"¤": { "codepoints": [164], "characters": "\u00A4" },
912
+
"¤": { "codepoints": [164], "characters": "\u00A4" },
913
+
"↶": { "codepoints": [8630], "characters": "\u21B6" },
914
+
"↷": { "codepoints": [8631], "characters": "\u21B7" },
915
+
"⋎": { "codepoints": [8910], "characters": "\u22CE" },
916
+
"⋏": { "codepoints": [8911], "characters": "\u22CF" },
917
+
"∲": { "codepoints": [8754], "characters": "\u2232" },
918
+
"∱": { "codepoints": [8753], "characters": "\u2231" },
919
+
"⌭": { "codepoints": [9005], "characters": "\u232D" },
920
+
"⇓": { "codepoints": [8659], "characters": "\u21D3" },
921
+
"⥥": { "codepoints": [10597], "characters": "\u2965" },
922
+
"†": { "codepoints": [8224], "characters": "\u2020" },
923
+
"ℸ": { "codepoints": [8504], "characters": "\u2138" },
924
+
"↓": { "codepoints": [8595], "characters": "\u2193" },
925
+
"‐": { "codepoints": [8208], "characters": "\u2010" },
926
+
"⊣": { "codepoints": [8867], "characters": "\u22A3" },
927
+
"⤏": { "codepoints": [10511], "characters": "\u290F" },
928
+
"˝": { "codepoints": [733], "characters": "\u02DD" },
929
+
"ď": { "codepoints": [271], "characters": "\u010F" },
930
+
"д": { "codepoints": [1076], "characters": "\u0434" },
931
+
"ⅆ": { "codepoints": [8518], "characters": "\u2146" },
932
+
"‡": { "codepoints": [8225], "characters": "\u2021" },
933
+
"⇊": { "codepoints": [8650], "characters": "\u21CA" },
934
+
"⩷": { "codepoints": [10871], "characters": "\u2A77" },
935
+
"°": { "codepoints": [176], "characters": "\u00B0" },
936
+
"°": { "codepoints": [176], "characters": "\u00B0" },
937
+
"δ": { "codepoints": [948], "characters": "\u03B4" },
938
+
"⦱": { "codepoints": [10673], "characters": "\u29B1" },
939
+
"⥿": { "codepoints": [10623], "characters": "\u297F" },
940
+
"𝔡": { "codepoints": [120097], "characters": "\uD835\uDD21" },
941
+
"⇃": { "codepoints": [8643], "characters": "\u21C3" },
942
+
"⇂": { "codepoints": [8642], "characters": "\u21C2" },
943
+
"⋄": { "codepoints": [8900], "characters": "\u22C4" },
944
+
"⋄": { "codepoints": [8900], "characters": "\u22C4" },
945
+
"♦": { "codepoints": [9830], "characters": "\u2666" },
946
+
"♦": { "codepoints": [9830], "characters": "\u2666" },
947
+
"¨": { "codepoints": [168], "characters": "\u00A8" },
948
+
"ϝ": { "codepoints": [989], "characters": "\u03DD" },
949
+
"⋲": { "codepoints": [8946], "characters": "\u22F2" },
950
+
"÷": { "codepoints": [247], "characters": "\u00F7" },
951
+
"÷": { "codepoints": [247], "characters": "\u00F7" },
952
+
"÷": { "codepoints": [247], "characters": "\u00F7" },
953
+
"⋇": { "codepoints": [8903], "characters": "\u22C7" },
954
+
"⋇": { "codepoints": [8903], "characters": "\u22C7" },
955
+
"ђ": { "codepoints": [1106], "characters": "\u0452" },
956
+
"⌞": { "codepoints": [8990], "characters": "\u231E" },
957
+
"⌍": { "codepoints": [8973], "characters": "\u230D" },
958
+
"$": { "codepoints": [36], "characters": "\u0024" },
959
+
"𝕕": { "codepoints": [120149], "characters": "\uD835\uDD55" },
960
+
"˙": { "codepoints": [729], "characters": "\u02D9" },
961
+
"≐": { "codepoints": [8784], "characters": "\u2250" },
962
+
"≑": { "codepoints": [8785], "characters": "\u2251" },
963
+
"∸": { "codepoints": [8760], "characters": "\u2238" },
964
+
"∔": { "codepoints": [8724], "characters": "\u2214" },
965
+
"⊡": { "codepoints": [8865], "characters": "\u22A1" },
966
+
"⌆": { "codepoints": [8966], "characters": "\u2306" },
967
+
"↓": { "codepoints": [8595], "characters": "\u2193" },
968
+
"⇊": { "codepoints": [8650], "characters": "\u21CA" },
969
+
"⇃": { "codepoints": [8643], "characters": "\u21C3" },
970
+
"⇂": { "codepoints": [8642], "characters": "\u21C2" },
971
+
"⤐": { "codepoints": [10512], "characters": "\u2910" },
972
+
"⌟": { "codepoints": [8991], "characters": "\u231F" },
973
+
"⌌": { "codepoints": [8972], "characters": "\u230C" },
974
+
"𝒹": { "codepoints": [119993], "characters": "\uD835\uDCB9" },
975
+
"ѕ": { "codepoints": [1109], "characters": "\u0455" },
976
+
"⧶": { "codepoints": [10742], "characters": "\u29F6" },
977
+
"đ": { "codepoints": [273], "characters": "\u0111" },
978
+
"⋱": { "codepoints": [8945], "characters": "\u22F1" },
979
+
"▿": { "codepoints": [9663], "characters": "\u25BF" },
980
+
"▾": { "codepoints": [9662], "characters": "\u25BE" },
981
+
"⇵": { "codepoints": [8693], "characters": "\u21F5" },
982
+
"⥯": { "codepoints": [10607], "characters": "\u296F" },
983
+
"⦦": { "codepoints": [10662], "characters": "\u29A6" },
984
+
"џ": { "codepoints": [1119], "characters": "\u045F" },
985
+
"⟿": { "codepoints": [10239], "characters": "\u27FF" },
986
+
"⩷": { "codepoints": [10871], "characters": "\u2A77" },
987
+
"≑": { "codepoints": [8785], "characters": "\u2251" },
988
+
"é": { "codepoints": [233], "characters": "\u00E9" },
989
+
"é": { "codepoints": [233], "characters": "\u00E9" },
990
+
"⩮": { "codepoints": [10862], "characters": "\u2A6E" },
991
+
"ě": { "codepoints": [283], "characters": "\u011B" },
992
+
"≖": { "codepoints": [8790], "characters": "\u2256" },
993
+
"ê": { "codepoints": [234], "characters": "\u00EA" },
994
+
"ê": { "codepoints": [234], "characters": "\u00EA" },
995
+
"≕": { "codepoints": [8789], "characters": "\u2255" },
996
+
"э": { "codepoints": [1101], "characters": "\u044D" },
997
+
"ė": { "codepoints": [279], "characters": "\u0117" },
998
+
"ⅇ": { "codepoints": [8519], "characters": "\u2147" },
999
+
"≒": { "codepoints": [8786], "characters": "\u2252" },
1000
+
"𝔢": { "codepoints": [120098], "characters": "\uD835\uDD22" },
1001
+
"⪚": { "codepoints": [10906], "characters": "\u2A9A" },
1002
+
"è": { "codepoints": [232], "characters": "\u00E8" },
1003
+
"è": { "codepoints": [232], "characters": "\u00E8" },
1004
+
"⪖": { "codepoints": [10902], "characters": "\u2A96" },
1005
+
"⪘": { "codepoints": [10904], "characters": "\u2A98" },
1006
+
"⪙": { "codepoints": [10905], "characters": "\u2A99" },
1007
+
"⏧": { "codepoints": [9191], "characters": "\u23E7" },
1008
+
"ℓ": { "codepoints": [8467], "characters": "\u2113" },
1009
+
"⪕": { "codepoints": [10901], "characters": "\u2A95" },
1010
+
"⪗": { "codepoints": [10903], "characters": "\u2A97" },
1011
+
"ē": { "codepoints": [275], "characters": "\u0113" },
1012
+
"∅": { "codepoints": [8709], "characters": "\u2205" },
1013
+
"∅": { "codepoints": [8709], "characters": "\u2205" },
1014
+
"∅": { "codepoints": [8709], "characters": "\u2205" },
1015
+
" ": { "codepoints": [8196], "characters": "\u2004" },
1016
+
" ": { "codepoints": [8197], "characters": "\u2005" },
1017
+
" ": { "codepoints": [8195], "characters": "\u2003" },
1018
+
"ŋ": { "codepoints": [331], "characters": "\u014B" },
1019
+
" ": { "codepoints": [8194], "characters": "\u2002" },
1020
+
"ę": { "codepoints": [281], "characters": "\u0119" },
1021
+
"𝕖": { "codepoints": [120150], "characters": "\uD835\uDD56" },
1022
+
"⋕": { "codepoints": [8917], "characters": "\u22D5" },
1023
+
"⧣": { "codepoints": [10723], "characters": "\u29E3" },
1024
+
"⩱": { "codepoints": [10865], "characters": "\u2A71" },
1025
+
"ε": { "codepoints": [949], "characters": "\u03B5" },
1026
+
"ε": { "codepoints": [949], "characters": "\u03B5" },
1027
+
"ϵ": { "codepoints": [1013], "characters": "\u03F5" },
1028
+
"≖": { "codepoints": [8790], "characters": "\u2256" },
1029
+
"≕": { "codepoints": [8789], "characters": "\u2255" },
1030
+
"≂": { "codepoints": [8770], "characters": "\u2242" },
1031
+
"⪖": { "codepoints": [10902], "characters": "\u2A96" },
1032
+
"⪕": { "codepoints": [10901], "characters": "\u2A95" },
1033
+
"=": { "codepoints": [61], "characters": "\u003D" },
1034
+
"≟": { "codepoints": [8799], "characters": "\u225F" },
1035
+
"≡": { "codepoints": [8801], "characters": "\u2261" },
1036
+
"⩸": { "codepoints": [10872], "characters": "\u2A78" },
1037
+
"⧥": { "codepoints": [10725], "characters": "\u29E5" },
1038
+
"≓": { "codepoints": [8787], "characters": "\u2253" },
1039
+
"⥱": { "codepoints": [10609], "characters": "\u2971" },
1040
+
"ℯ": { "codepoints": [8495], "characters": "\u212F" },
1041
+
"≐": { "codepoints": [8784], "characters": "\u2250" },
1042
+
"≂": { "codepoints": [8770], "characters": "\u2242" },
1043
+
"η": { "codepoints": [951], "characters": "\u03B7" },
1044
+
"ð": { "codepoints": [240], "characters": "\u00F0" },
1045
+
"ð": { "codepoints": [240], "characters": "\u00F0" },
1046
+
"ë": { "codepoints": [235], "characters": "\u00EB" },
1047
+
"ë": { "codepoints": [235], "characters": "\u00EB" },
1048
+
"€": { "codepoints": [8364], "characters": "\u20AC" },
1049
+
"!": { "codepoints": [33], "characters": "\u0021" },
1050
+
"∃": { "codepoints": [8707], "characters": "\u2203" },
1051
+
"ℰ": { "codepoints": [8496], "characters": "\u2130" },
1052
+
"ⅇ": { "codepoints": [8519], "characters": "\u2147" },
1053
+
"≒": { "codepoints": [8786], "characters": "\u2252" },
1054
+
"ф": { "codepoints": [1092], "characters": "\u0444" },
1055
+
"♀": { "codepoints": [9792], "characters": "\u2640" },
1056
+
"ffi": { "codepoints": [64259], "characters": "\uFB03" },
1057
+
"ff": { "codepoints": [64256], "characters": "\uFB00" },
1058
+
"ffl": { "codepoints": [64260], "characters": "\uFB04" },
1059
+
"𝔣": { "codepoints": [120099], "characters": "\uD835\uDD23" },
1060
+
"fi": { "codepoints": [64257], "characters": "\uFB01" },
1061
+
"fj": { "codepoints": [102, 106], "characters": "\u0066\u006A" },
1062
+
"♭": { "codepoints": [9837], "characters": "\u266D" },
1063
+
"fl": { "codepoints": [64258], "characters": "\uFB02" },
1064
+
"▱": { "codepoints": [9649], "characters": "\u25B1" },
1065
+
"ƒ": { "codepoints": [402], "characters": "\u0192" },
1066
+
"𝕗": { "codepoints": [120151], "characters": "\uD835\uDD57" },
1067
+
"∀": { "codepoints": [8704], "characters": "\u2200" },
1068
+
"⋔": { "codepoints": [8916], "characters": "\u22D4" },
1069
+
"⫙": { "codepoints": [10969], "characters": "\u2AD9" },
1070
+
"⨍": { "codepoints": [10765], "characters": "\u2A0D" },
1071
+
"½": { "codepoints": [189], "characters": "\u00BD" },
1072
+
"½": { "codepoints": [189], "characters": "\u00BD" },
1073
+
"⅓": { "codepoints": [8531], "characters": "\u2153" },
1074
+
"¼": { "codepoints": [188], "characters": "\u00BC" },
1075
+
"¼": { "codepoints": [188], "characters": "\u00BC" },
1076
+
"⅕": { "codepoints": [8533], "characters": "\u2155" },
1077
+
"⅙": { "codepoints": [8537], "characters": "\u2159" },
1078
+
"⅛": { "codepoints": [8539], "characters": "\u215B" },
1079
+
"⅔": { "codepoints": [8532], "characters": "\u2154" },
1080
+
"⅖": { "codepoints": [8534], "characters": "\u2156" },
1081
+
"¾": { "codepoints": [190], "characters": "\u00BE" },
1082
+
"¾": { "codepoints": [190], "characters": "\u00BE" },
1083
+
"⅗": { "codepoints": [8535], "characters": "\u2157" },
1084
+
"⅜": { "codepoints": [8540], "characters": "\u215C" },
1085
+
"⅘": { "codepoints": [8536], "characters": "\u2158" },
1086
+
"⅚": { "codepoints": [8538], "characters": "\u215A" },
1087
+
"⅝": { "codepoints": [8541], "characters": "\u215D" },
1088
+
"⅞": { "codepoints": [8542], "characters": "\u215E" },
1089
+
"⁄": { "codepoints": [8260], "characters": "\u2044" },
1090
+
"⌢": { "codepoints": [8994], "characters": "\u2322" },
1091
+
"𝒻": { "codepoints": [119995], "characters": "\uD835\uDCBB" },
1092
+
"≧": { "codepoints": [8807], "characters": "\u2267" },
1093
+
"⪌": { "codepoints": [10892], "characters": "\u2A8C" },
1094
+
"ǵ": { "codepoints": [501], "characters": "\u01F5" },
1095
+
"γ": { "codepoints": [947], "characters": "\u03B3" },
1096
+
"ϝ": { "codepoints": [989], "characters": "\u03DD" },
1097
+
"⪆": { "codepoints": [10886], "characters": "\u2A86" },
1098
+
"ğ": { "codepoints": [287], "characters": "\u011F" },
1099
+
"ĝ": { "codepoints": [285], "characters": "\u011D" },
1100
+
"г": { "codepoints": [1075], "characters": "\u0433" },
1101
+
"ġ": { "codepoints": [289], "characters": "\u0121" },
1102
+
"≥": { "codepoints": [8805], "characters": "\u2265" },
1103
+
"⋛": { "codepoints": [8923], "characters": "\u22DB" },
1104
+
"≥": { "codepoints": [8805], "characters": "\u2265" },
1105
+
"≧": { "codepoints": [8807], "characters": "\u2267" },
1106
+
"⩾": { "codepoints": [10878], "characters": "\u2A7E" },
1107
+
"⩾": { "codepoints": [10878], "characters": "\u2A7E" },
1108
+
"⪩": { "codepoints": [10921], "characters": "\u2AA9" },
1109
+
"⪀": { "codepoints": [10880], "characters": "\u2A80" },
1110
+
"⪂": { "codepoints": [10882], "characters": "\u2A82" },
1111
+
"⪄": { "codepoints": [10884], "characters": "\u2A84" },
1112
+
"⋛︀": { "codepoints": [8923, 65024], "characters": "\u22DB\uFE00" },
1113
+
"⪔": { "codepoints": [10900], "characters": "\u2A94" },
1114
+
"𝔤": { "codepoints": [120100], "characters": "\uD835\uDD24" },
1115
+
"≫": { "codepoints": [8811], "characters": "\u226B" },
1116
+
"⋙": { "codepoints": [8921], "characters": "\u22D9" },
1117
+
"ℷ": { "codepoints": [8503], "characters": "\u2137" },
1118
+
"ѓ": { "codepoints": [1107], "characters": "\u0453" },
1119
+
"≷": { "codepoints": [8823], "characters": "\u2277" },
1120
+
"⪒": { "codepoints": [10898], "characters": "\u2A92" },
1121
+
"⪥": { "codepoints": [10917], "characters": "\u2AA5" },
1122
+
"⪤": { "codepoints": [10916], "characters": "\u2AA4" },
1123
+
"≩": { "codepoints": [8809], "characters": "\u2269" },
1124
+
"⪊": { "codepoints": [10890], "characters": "\u2A8A" },
1125
+
"⪊": { "codepoints": [10890], "characters": "\u2A8A" },
1126
+
"⪈": { "codepoints": [10888], "characters": "\u2A88" },
1127
+
"⪈": { "codepoints": [10888], "characters": "\u2A88" },
1128
+
"≩": { "codepoints": [8809], "characters": "\u2269" },
1129
+
"⋧": { "codepoints": [8935], "characters": "\u22E7" },
1130
+
"𝕘": { "codepoints": [120152], "characters": "\uD835\uDD58" },
1131
+
"`": { "codepoints": [96], "characters": "\u0060" },
1132
+
"ℊ": { "codepoints": [8458], "characters": "\u210A" },
1133
+
"≳": { "codepoints": [8819], "characters": "\u2273" },
1134
+
"⪎": { "codepoints": [10894], "characters": "\u2A8E" },
1135
+
"⪐": { "codepoints": [10896], "characters": "\u2A90" },
1136
+
">": { "codepoints": [62], "characters": "\u003E" },
1137
+
">": { "codepoints": [62], "characters": "\u003E" },
1138
+
"⪧": { "codepoints": [10919], "characters": "\u2AA7" },
1139
+
"⩺": { "codepoints": [10874], "characters": "\u2A7A" },
1140
+
"⋗": { "codepoints": [8919], "characters": "\u22D7" },
1141
+
"⦕": { "codepoints": [10645], "characters": "\u2995" },
1142
+
"⩼": { "codepoints": [10876], "characters": "\u2A7C" },
1143
+
"⪆": { "codepoints": [10886], "characters": "\u2A86" },
1144
+
"⥸": { "codepoints": [10616], "characters": "\u2978" },
1145
+
"⋗": { "codepoints": [8919], "characters": "\u22D7" },
1146
+
"⋛": { "codepoints": [8923], "characters": "\u22DB" },
1147
+
"⪌": { "codepoints": [10892], "characters": "\u2A8C" },
1148
+
"≷": { "codepoints": [8823], "characters": "\u2277" },
1149
+
"≳": { "codepoints": [8819], "characters": "\u2273" },
1150
+
"≩︀": { "codepoints": [8809, 65024], "characters": "\u2269\uFE00" },
1151
+
"≩︀": { "codepoints": [8809, 65024], "characters": "\u2269\uFE00" },
1152
+
"⇔": { "codepoints": [8660], "characters": "\u21D4" },
1153
+
" ": { "codepoints": [8202], "characters": "\u200A" },
1154
+
"½": { "codepoints": [189], "characters": "\u00BD" },
1155
+
"ℋ": { "codepoints": [8459], "characters": "\u210B" },
1156
+
"ъ": { "codepoints": [1098], "characters": "\u044A" },
1157
+
"↔": { "codepoints": [8596], "characters": "\u2194" },
1158
+
"⥈": { "codepoints": [10568], "characters": "\u2948" },
1159
+
"↭": { "codepoints": [8621], "characters": "\u21AD" },
1160
+
"ℏ": { "codepoints": [8463], "characters": "\u210F" },
1161
+
"ĥ": { "codepoints": [293], "characters": "\u0125" },
1162
+
"♥": { "codepoints": [9829], "characters": "\u2665" },
1163
+
"♥": { "codepoints": [9829], "characters": "\u2665" },
1164
+
"…": { "codepoints": [8230], "characters": "\u2026" },
1165
+
"⊹": { "codepoints": [8889], "characters": "\u22B9" },
1166
+
"𝔥": { "codepoints": [120101], "characters": "\uD835\uDD25" },
1167
+
"⤥": { "codepoints": [10533], "characters": "\u2925" },
1168
+
"⤦": { "codepoints": [10534], "characters": "\u2926" },
1169
+
"⇿": { "codepoints": [8703], "characters": "\u21FF" },
1170
+
"∻": { "codepoints": [8763], "characters": "\u223B" },
1171
+
"↩": { "codepoints": [8617], "characters": "\u21A9" },
1172
+
"↪": { "codepoints": [8618], "characters": "\u21AA" },
1173
+
"𝕙": { "codepoints": [120153], "characters": "\uD835\uDD59" },
1174
+
"―": { "codepoints": [8213], "characters": "\u2015" },
1175
+
"𝒽": { "codepoints": [119997], "characters": "\uD835\uDCBD" },
1176
+
"ℏ": { "codepoints": [8463], "characters": "\u210F" },
1177
+
"ħ": { "codepoints": [295], "characters": "\u0127" },
1178
+
"⁃": { "codepoints": [8259], "characters": "\u2043" },
1179
+
"‐": { "codepoints": [8208], "characters": "\u2010" },
1180
+
"í": { "codepoints": [237], "characters": "\u00ED" },
1181
+
"í": { "codepoints": [237], "characters": "\u00ED" },
1182
+
"⁣": { "codepoints": [8291], "characters": "\u2063" },
1183
+
"î": { "codepoints": [238], "characters": "\u00EE" },
1184
+
"î": { "codepoints": [238], "characters": "\u00EE" },
1185
+
"и": { "codepoints": [1080], "characters": "\u0438" },
1186
+
"е": { "codepoints": [1077], "characters": "\u0435" },
1187
+
"¡": { "codepoints": [161], "characters": "\u00A1" },
1188
+
"¡": { "codepoints": [161], "characters": "\u00A1" },
1189
+
"⇔": { "codepoints": [8660], "characters": "\u21D4" },
1190
+
"𝔦": { "codepoints": [120102], "characters": "\uD835\uDD26" },
1191
+
"ì": { "codepoints": [236], "characters": "\u00EC" },
1192
+
"ì": { "codepoints": [236], "characters": "\u00EC" },
1193
+
"ⅈ": { "codepoints": [8520], "characters": "\u2148" },
1194
+
"⨌": { "codepoints": [10764], "characters": "\u2A0C" },
1195
+
"∭": { "codepoints": [8749], "characters": "\u222D" },
1196
+
"⧜": { "codepoints": [10716], "characters": "\u29DC" },
1197
+
"℩": { "codepoints": [8489], "characters": "\u2129" },
1198
+
"ij": { "codepoints": [307], "characters": "\u0133" },
1199
+
"ī": { "codepoints": [299], "characters": "\u012B" },
1200
+
"ℑ": { "codepoints": [8465], "characters": "\u2111" },
1201
+
"ℐ": { "codepoints": [8464], "characters": "\u2110" },
1202
+
"ℑ": { "codepoints": [8465], "characters": "\u2111" },
1203
+
"ı": { "codepoints": [305], "characters": "\u0131" },
1204
+
"⊷": { "codepoints": [8887], "characters": "\u22B7" },
1205
+
"Ƶ": { "codepoints": [437], "characters": "\u01B5" },
1206
+
"∈": { "codepoints": [8712], "characters": "\u2208" },
1207
+
"℅": { "codepoints": [8453], "characters": "\u2105" },
1208
+
"∞": { "codepoints": [8734], "characters": "\u221E" },
1209
+
"⧝": { "codepoints": [10717], "characters": "\u29DD" },
1210
+
"ı": { "codepoints": [305], "characters": "\u0131" },
1211
+
"∫": { "codepoints": [8747], "characters": "\u222B" },
1212
+
"⊺": { "codepoints": [8890], "characters": "\u22BA" },
1213
+
"ℤ": { "codepoints": [8484], "characters": "\u2124" },
1214
+
"⊺": { "codepoints": [8890], "characters": "\u22BA" },
1215
+
"⨗": { "codepoints": [10775], "characters": "\u2A17" },
1216
+
"⨼": { "codepoints": [10812], "characters": "\u2A3C" },
1217
+
"ё": { "codepoints": [1105], "characters": "\u0451" },
1218
+
"į": { "codepoints": [303], "characters": "\u012F" },
1219
+
"𝕚": { "codepoints": [120154], "characters": "\uD835\uDD5A" },
1220
+
"ι": { "codepoints": [953], "characters": "\u03B9" },
1221
+
"⨼": { "codepoints": [10812], "characters": "\u2A3C" },
1222
+
"¿": { "codepoints": [191], "characters": "\u00BF" },
1223
+
"¿": { "codepoints": [191], "characters": "\u00BF" },
1224
+
"𝒾": { "codepoints": [119998], "characters": "\uD835\uDCBE" },
1225
+
"∈": { "codepoints": [8712], "characters": "\u2208" },
1226
+
"⋹": { "codepoints": [8953], "characters": "\u22F9" },
1227
+
"⋵": { "codepoints": [8949], "characters": "\u22F5" },
1228
+
"⋴": { "codepoints": [8948], "characters": "\u22F4" },
1229
+
"⋳": { "codepoints": [8947], "characters": "\u22F3" },
1230
+
"∈": { "codepoints": [8712], "characters": "\u2208" },
1231
+
"⁢": { "codepoints": [8290], "characters": "\u2062" },
1232
+
"ĩ": { "codepoints": [297], "characters": "\u0129" },
1233
+
"і": { "codepoints": [1110], "characters": "\u0456" },
1234
+
"ï": { "codepoints": [239], "characters": "\u00EF" },
1235
+
"ï": { "codepoints": [239], "characters": "\u00EF" },
1236
+
"ĵ": { "codepoints": [309], "characters": "\u0135" },
1237
+
"й": { "codepoints": [1081], "characters": "\u0439" },
1238
+
"𝔧": { "codepoints": [120103], "characters": "\uD835\uDD27" },
1239
+
"ȷ": { "codepoints": [567], "characters": "\u0237" },
1240
+
"𝕛": { "codepoints": [120155], "characters": "\uD835\uDD5B" },
1241
+
"𝒿": { "codepoints": [119999], "characters": "\uD835\uDCBF" },
1242
+
"ј": { "codepoints": [1112], "characters": "\u0458" },
1243
+
"є": { "codepoints": [1108], "characters": "\u0454" },
1244
+
"κ": { "codepoints": [954], "characters": "\u03BA" },
1245
+
"ϰ": { "codepoints": [1008], "characters": "\u03F0" },
1246
+
"ķ": { "codepoints": [311], "characters": "\u0137" },
1247
+
"к": { "codepoints": [1082], "characters": "\u043A" },
1248
+
"𝔨": { "codepoints": [120104], "characters": "\uD835\uDD28" },
1249
+
"ĸ": { "codepoints": [312], "characters": "\u0138" },
1250
+
"х": { "codepoints": [1093], "characters": "\u0445" },
1251
+
"ќ": { "codepoints": [1116], "characters": "\u045C" },
1252
+
"𝕜": { "codepoints": [120156], "characters": "\uD835\uDD5C" },
1253
+
"𝓀": { "codepoints": [120000], "characters": "\uD835\uDCC0" },
1254
+
"⇚": { "codepoints": [8666], "characters": "\u21DA" },
1255
+
"⇐": { "codepoints": [8656], "characters": "\u21D0" },
1256
+
"⤛": { "codepoints": [10523], "characters": "\u291B" },
1257
+
"⤎": { "codepoints": [10510], "characters": "\u290E" },
1258
+
"≦": { "codepoints": [8806], "characters": "\u2266" },
1259
+
"⪋": { "codepoints": [10891], "characters": "\u2A8B" },
1260
+
"⥢": { "codepoints": [10594], "characters": "\u2962" },
1261
+
"ĺ": { "codepoints": [314], "characters": "\u013A" },
1262
+
"⦴": { "codepoints": [10676], "characters": "\u29B4" },
1263
+
"ℒ": { "codepoints": [8466], "characters": "\u2112" },
1264
+
"λ": { "codepoints": [955], "characters": "\u03BB" },
1265
+
"⟨": { "codepoints": [10216], "characters": "\u27E8" },
1266
+
"⦑": { "codepoints": [10641], "characters": "\u2991" },
1267
+
"⟨": { "codepoints": [10216], "characters": "\u27E8" },
1268
+
"⪅": { "codepoints": [10885], "characters": "\u2A85" },
1269
+
"«": { "codepoints": [171], "characters": "\u00AB" },
1270
+
"«": { "codepoints": [171], "characters": "\u00AB" },
1271
+
"←": { "codepoints": [8592], "characters": "\u2190" },
1272
+
"⇤": { "codepoints": [8676], "characters": "\u21E4" },
1273
+
"⤟": { "codepoints": [10527], "characters": "\u291F" },
1274
+
"⤝": { "codepoints": [10525], "characters": "\u291D" },
1275
+
"↩": { "codepoints": [8617], "characters": "\u21A9" },
1276
+
"↫": { "codepoints": [8619], "characters": "\u21AB" },
1277
+
"⤹": { "codepoints": [10553], "characters": "\u2939" },
1278
+
"⥳": { "codepoints": [10611], "characters": "\u2973" },
1279
+
"↢": { "codepoints": [8610], "characters": "\u21A2" },
1280
+
"⪫": { "codepoints": [10923], "characters": "\u2AAB" },
1281
+
"⤙": { "codepoints": [10521], "characters": "\u2919" },
1282
+
"⪭": { "codepoints": [10925], "characters": "\u2AAD" },
1283
+
"⪭︀": { "codepoints": [10925, 65024], "characters": "\u2AAD\uFE00" },
1284
+
"⤌": { "codepoints": [10508], "characters": "\u290C" },
1285
+
"❲": { "codepoints": [10098], "characters": "\u2772" },
1286
+
"{": { "codepoints": [123], "characters": "\u007B" },
1287
+
"[": { "codepoints": [91], "characters": "\u005B" },
1288
+
"⦋": { "codepoints": [10635], "characters": "\u298B" },
1289
+
"⦏": { "codepoints": [10639], "characters": "\u298F" },
1290
+
"⦍": { "codepoints": [10637], "characters": "\u298D" },
1291
+
"ľ": { "codepoints": [318], "characters": "\u013E" },
1292
+
"ļ": { "codepoints": [316], "characters": "\u013C" },
1293
+
"⌈": { "codepoints": [8968], "characters": "\u2308" },
1294
+
"{": { "codepoints": [123], "characters": "\u007B" },
1295
+
"л": { "codepoints": [1083], "characters": "\u043B" },
1296
+
"⤶": { "codepoints": [10550], "characters": "\u2936" },
1297
+
"“": { "codepoints": [8220], "characters": "\u201C" },
1298
+
"„": { "codepoints": [8222], "characters": "\u201E" },
1299
+
"⥧": { "codepoints": [10599], "characters": "\u2967" },
1300
+
"⥋": { "codepoints": [10571], "characters": "\u294B" },
1301
+
"↲": { "codepoints": [8626], "characters": "\u21B2" },
1302
+
"≤": { "codepoints": [8804], "characters": "\u2264" },
1303
+
"←": { "codepoints": [8592], "characters": "\u2190" },
1304
+
"↢": { "codepoints": [8610], "characters": "\u21A2" },
1305
+
"↽": { "codepoints": [8637], "characters": "\u21BD" },
1306
+
"↼": { "codepoints": [8636], "characters": "\u21BC" },
1307
+
"⇇": { "codepoints": [8647], "characters": "\u21C7" },
1308
+
"↔": { "codepoints": [8596], "characters": "\u2194" },
1309
+
"⇆": { "codepoints": [8646], "characters": "\u21C6" },
1310
+
"⇋": { "codepoints": [8651], "characters": "\u21CB" },
1311
+
"↭": { "codepoints": [8621], "characters": "\u21AD" },
1312
+
"⋋": { "codepoints": [8907], "characters": "\u22CB" },
1313
+
"⋚": { "codepoints": [8922], "characters": "\u22DA" },
1314
+
"≤": { "codepoints": [8804], "characters": "\u2264" },
1315
+
"≦": { "codepoints": [8806], "characters": "\u2266" },
1316
+
"⩽": { "codepoints": [10877], "characters": "\u2A7D" },
1317
+
"⩽": { "codepoints": [10877], "characters": "\u2A7D" },
1318
+
"⪨": { "codepoints": [10920], "characters": "\u2AA8" },
1319
+
"⩿": { "codepoints": [10879], "characters": "\u2A7F" },
1320
+
"⪁": { "codepoints": [10881], "characters": "\u2A81" },
1321
+
"⪃": { "codepoints": [10883], "characters": "\u2A83" },
1322
+
"⋚︀": { "codepoints": [8922, 65024], "characters": "\u22DA\uFE00" },
1323
+
"⪓": { "codepoints": [10899], "characters": "\u2A93" },
1324
+
"⪅": { "codepoints": [10885], "characters": "\u2A85" },
1325
+
"⋖": { "codepoints": [8918], "characters": "\u22D6" },
1326
+
"⋚": { "codepoints": [8922], "characters": "\u22DA" },
1327
+
"⪋": { "codepoints": [10891], "characters": "\u2A8B" },
1328
+
"≶": { "codepoints": [8822], "characters": "\u2276" },
1329
+
"≲": { "codepoints": [8818], "characters": "\u2272" },
1330
+
"⥼": { "codepoints": [10620], "characters": "\u297C" },
1331
+
"⌊": { "codepoints": [8970], "characters": "\u230A" },
1332
+
"𝔩": { "codepoints": [120105], "characters": "\uD835\uDD29" },
1333
+
"≶": { "codepoints": [8822], "characters": "\u2276" },
1334
+
"⪑": { "codepoints": [10897], "characters": "\u2A91" },
1335
+
"↽": { "codepoints": [8637], "characters": "\u21BD" },
1336
+
"↼": { "codepoints": [8636], "characters": "\u21BC" },
1337
+
"⥪": { "codepoints": [10602], "characters": "\u296A" },
1338
+
"▄": { "codepoints": [9604], "characters": "\u2584" },
1339
+
"љ": { "codepoints": [1113], "characters": "\u0459" },
1340
+
"≪": { "codepoints": [8810], "characters": "\u226A" },
1341
+
"⇇": { "codepoints": [8647], "characters": "\u21C7" },
1342
+
"⌞": { "codepoints": [8990], "characters": "\u231E" },
1343
+
"⥫": { "codepoints": [10603], "characters": "\u296B" },
1344
+
"◺": { "codepoints": [9722], "characters": "\u25FA" },
1345
+
"ŀ": { "codepoints": [320], "characters": "\u0140" },
1346
+
"⎰": { "codepoints": [9136], "characters": "\u23B0" },
1347
+
"⎰": { "codepoints": [9136], "characters": "\u23B0" },
1348
+
"≨": { "codepoints": [8808], "characters": "\u2268" },
1349
+
"⪉": { "codepoints": [10889], "characters": "\u2A89" },
1350
+
"⪉": { "codepoints": [10889], "characters": "\u2A89" },
1351
+
"⪇": { "codepoints": [10887], "characters": "\u2A87" },
1352
+
"⪇": { "codepoints": [10887], "characters": "\u2A87" },
1353
+
"≨": { "codepoints": [8808], "characters": "\u2268" },
1354
+
"⋦": { "codepoints": [8934], "characters": "\u22E6" },
1355
+
"⟬": { "codepoints": [10220], "characters": "\u27EC" },
1356
+
"⇽": { "codepoints": [8701], "characters": "\u21FD" },
1357
+
"⟦": { "codepoints": [10214], "characters": "\u27E6" },
1358
+
"⟵": { "codepoints": [10229], "characters": "\u27F5" },
1359
+
"⟷": { "codepoints": [10231], "characters": "\u27F7" },
1360
+
"⟼": { "codepoints": [10236], "characters": "\u27FC" },
1361
+
"⟶": { "codepoints": [10230], "characters": "\u27F6" },
1362
+
"↫": { "codepoints": [8619], "characters": "\u21AB" },
1363
+
"↬": { "codepoints": [8620], "characters": "\u21AC" },
1364
+
"⦅": { "codepoints": [10629], "characters": "\u2985" },
1365
+
"𝕝": { "codepoints": [120157], "characters": "\uD835\uDD5D" },
1366
+
"⨭": { "codepoints": [10797], "characters": "\u2A2D" },
1367
+
"⨴": { "codepoints": [10804], "characters": "\u2A34" },
1368
+
"∗": { "codepoints": [8727], "characters": "\u2217" },
1369
+
"_": { "codepoints": [95], "characters": "\u005F" },
1370
+
"◊": { "codepoints": [9674], "characters": "\u25CA" },
1371
+
"◊": { "codepoints": [9674], "characters": "\u25CA" },
1372
+
"⧫": { "codepoints": [10731], "characters": "\u29EB" },
1373
+
"(": { "codepoints": [40], "characters": "\u0028" },
1374
+
"⦓": { "codepoints": [10643], "characters": "\u2993" },
1375
+
"⇆": { "codepoints": [8646], "characters": "\u21C6" },
1376
+
"⌟": { "codepoints": [8991], "characters": "\u231F" },
1377
+
"⇋": { "codepoints": [8651], "characters": "\u21CB" },
1378
+
"⥭": { "codepoints": [10605], "characters": "\u296D" },
1379
+
"‎": { "codepoints": [8206], "characters": "\u200E" },
1380
+
"⊿": { "codepoints": [8895], "characters": "\u22BF" },
1381
+
"‹": { "codepoints": [8249], "characters": "\u2039" },
1382
+
"𝓁": { "codepoints": [120001], "characters": "\uD835\uDCC1" },
1383
+
"↰": { "codepoints": [8624], "characters": "\u21B0" },
1384
+
"≲": { "codepoints": [8818], "characters": "\u2272" },
1385
+
"⪍": { "codepoints": [10893], "characters": "\u2A8D" },
1386
+
"⪏": { "codepoints": [10895], "characters": "\u2A8F" },
1387
+
"[": { "codepoints": [91], "characters": "\u005B" },
1388
+
"‘": { "codepoints": [8216], "characters": "\u2018" },
1389
+
"‚": { "codepoints": [8218], "characters": "\u201A" },
1390
+
"ł": { "codepoints": [322], "characters": "\u0142" },
1391
+
"<": { "codepoints": [60], "characters": "\u003C" },
1392
+
"<": { "codepoints": [60], "characters": "\u003C" },
1393
+
"⪦": { "codepoints": [10918], "characters": "\u2AA6" },
1394
+
"⩹": { "codepoints": [10873], "characters": "\u2A79" },
1395
+
"⋖": { "codepoints": [8918], "characters": "\u22D6" },
1396
+
"⋋": { "codepoints": [8907], "characters": "\u22CB" },
1397
+
"⋉": { "codepoints": [8905], "characters": "\u22C9" },
1398
+
"⥶": { "codepoints": [10614], "characters": "\u2976" },
1399
+
"⩻": { "codepoints": [10875], "characters": "\u2A7B" },
1400
+
"⦖": { "codepoints": [10646], "characters": "\u2996" },
1401
+
"◃": { "codepoints": [9667], "characters": "\u25C3" },
1402
+
"⊴": { "codepoints": [8884], "characters": "\u22B4" },
1403
+
"◂": { "codepoints": [9666], "characters": "\u25C2" },
1404
+
"⥊": { "codepoints": [10570], "characters": "\u294A" },
1405
+
"⥦": { "codepoints": [10598], "characters": "\u2966" },
1406
+
"≨︀": { "codepoints": [8808, 65024], "characters": "\u2268\uFE00" },
1407
+
"≨︀": { "codepoints": [8808, 65024], "characters": "\u2268\uFE00" },
1408
+
"∺": { "codepoints": [8762], "characters": "\u223A" },
1409
+
"¯": { "codepoints": [175], "characters": "\u00AF" },
1410
+
"¯": { "codepoints": [175], "characters": "\u00AF" },
1411
+
"♂": { "codepoints": [9794], "characters": "\u2642" },
1412
+
"✠": { "codepoints": [10016], "characters": "\u2720" },
1413
+
"✠": { "codepoints": [10016], "characters": "\u2720" },
1414
+
"↦": { "codepoints": [8614], "characters": "\u21A6" },
1415
+
"↦": { "codepoints": [8614], "characters": "\u21A6" },
1416
+
"↧": { "codepoints": [8615], "characters": "\u21A7" },
1417
+
"↤": { "codepoints": [8612], "characters": "\u21A4" },
1418
+
"↥": { "codepoints": [8613], "characters": "\u21A5" },
1419
+
"▮": { "codepoints": [9646], "characters": "\u25AE" },
1420
+
"⨩": { "codepoints": [10793], "characters": "\u2A29" },
1421
+
"м": { "codepoints": [1084], "characters": "\u043C" },
1422
+
"—": { "codepoints": [8212], "characters": "\u2014" },
1423
+
"∡": { "codepoints": [8737], "characters": "\u2221" },
1424
+
"𝔪": { "codepoints": [120106], "characters": "\uD835\uDD2A" },
1425
+
"℧": { "codepoints": [8487], "characters": "\u2127" },
1426
+
"µ": { "codepoints": [181], "characters": "\u00B5" },
1427
+
"µ": { "codepoints": [181], "characters": "\u00B5" },
1428
+
"∣": { "codepoints": [8739], "characters": "\u2223" },
1429
+
"*": { "codepoints": [42], "characters": "\u002A" },
1430
+
"⫰": { "codepoints": [10992], "characters": "\u2AF0" },
1431
+
"·": { "codepoints": [183], "characters": "\u00B7" },
1432
+
"·": { "codepoints": [183], "characters": "\u00B7" },
1433
+
"−": { "codepoints": [8722], "characters": "\u2212" },
1434
+
"⊟": { "codepoints": [8863], "characters": "\u229F" },
1435
+
"∸": { "codepoints": [8760], "characters": "\u2238" },
1436
+
"⨪": { "codepoints": [10794], "characters": "\u2A2A" },
1437
+
"⫛": { "codepoints": [10971], "characters": "\u2ADB" },
1438
+
"…": { "codepoints": [8230], "characters": "\u2026" },
1439
+
"∓": { "codepoints": [8723], "characters": "\u2213" },
1440
+
"⊧": { "codepoints": [8871], "characters": "\u22A7" },
1441
+
"𝕞": { "codepoints": [120158], "characters": "\uD835\uDD5E" },
1442
+
"∓": { "codepoints": [8723], "characters": "\u2213" },
1443
+
"𝓂": { "codepoints": [120002], "characters": "\uD835\uDCC2" },
1444
+
"∾": { "codepoints": [8766], "characters": "\u223E" },
1445
+
"μ": { "codepoints": [956], "characters": "\u03BC" },
1446
+
"⊸": { "codepoints": [8888], "characters": "\u22B8" },
1447
+
"⊸": { "codepoints": [8888], "characters": "\u22B8" },
1448
+
"⋙̸": { "codepoints": [8921, 824], "characters": "\u22D9\u0338" },
1449
+
"≫⃒": { "codepoints": [8811, 8402], "characters": "\u226B\u20D2" },
1450
+
"≫̸": { "codepoints": [8811, 824], "characters": "\u226B\u0338" },
1451
+
"⇍": { "codepoints": [8653], "characters": "\u21CD" },
1452
+
"⇎": { "codepoints": [8654], "characters": "\u21CE" },
1453
+
"⋘̸": { "codepoints": [8920, 824], "characters": "\u22D8\u0338" },
1454
+
"≪⃒": { "codepoints": [8810, 8402], "characters": "\u226A\u20D2" },
1455
+
"≪̸": { "codepoints": [8810, 824], "characters": "\u226A\u0338" },
1456
+
"⇏": { "codepoints": [8655], "characters": "\u21CF" },
1457
+
"⊯": { "codepoints": [8879], "characters": "\u22AF" },
1458
+
"⊮": { "codepoints": [8878], "characters": "\u22AE" },
1459
+
"∇": { "codepoints": [8711], "characters": "\u2207" },
1460
+
"ń": { "codepoints": [324], "characters": "\u0144" },
1461
+
"∠⃒": { "codepoints": [8736, 8402], "characters": "\u2220\u20D2" },
1462
+
"≉": { "codepoints": [8777], "characters": "\u2249" },
1463
+
"⩰̸": { "codepoints": [10864, 824], "characters": "\u2A70\u0338" },
1464
+
"≋̸": { "codepoints": [8779, 824], "characters": "\u224B\u0338" },
1465
+
"ʼn": { "codepoints": [329], "characters": "\u0149" },
1466
+
"≉": { "codepoints": [8777], "characters": "\u2249" },
1467
+
"♮": { "codepoints": [9838], "characters": "\u266E" },
1468
+
"♮": { "codepoints": [9838], "characters": "\u266E" },
1469
+
"ℕ": { "codepoints": [8469], "characters": "\u2115" },
1470
+
" ": { "codepoints": [160], "characters": "\u00A0" },
1471
+
" ": { "codepoints": [160], "characters": "\u00A0" },
1472
+
"≎̸": { "codepoints": [8782, 824], "characters": "\u224E\u0338" },
1473
+
"≏̸": { "codepoints": [8783, 824], "characters": "\u224F\u0338" },
1474
+
"⩃": { "codepoints": [10819], "characters": "\u2A43" },
1475
+
"ň": { "codepoints": [328], "characters": "\u0148" },
1476
+
"ņ": { "codepoints": [326], "characters": "\u0146" },
1477
+
"≇": { "codepoints": [8775], "characters": "\u2247" },
1478
+
"⩭̸": { "codepoints": [10861, 824], "characters": "\u2A6D\u0338" },
1479
+
"⩂": { "codepoints": [10818], "characters": "\u2A42" },
1480
+
"н": { "codepoints": [1085], "characters": "\u043D" },
1481
+
"–": { "codepoints": [8211], "characters": "\u2013" },
1482
+
"≠": { "codepoints": [8800], "characters": "\u2260" },
1483
+
"⇗": { "codepoints": [8663], "characters": "\u21D7" },
1484
+
"⤤": { "codepoints": [10532], "characters": "\u2924" },
1485
+
"↗": { "codepoints": [8599], "characters": "\u2197" },
1486
+
"↗": { "codepoints": [8599], "characters": "\u2197" },
1487
+
"≐̸": { "codepoints": [8784, 824], "characters": "\u2250\u0338" },
1488
+
"≢": { "codepoints": [8802], "characters": "\u2262" },
1489
+
"⤨": { "codepoints": [10536], "characters": "\u2928" },
1490
+
"≂̸": { "codepoints": [8770, 824], "characters": "\u2242\u0338" },
1491
+
"∄": { "codepoints": [8708], "characters": "\u2204" },
1492
+
"∄": { "codepoints": [8708], "characters": "\u2204" },
1493
+
"𝔫": { "codepoints": [120107], "characters": "\uD835\uDD2B" },
1494
+
"≧̸": { "codepoints": [8807, 824], "characters": "\u2267\u0338" },
1495
+
"≱": { "codepoints": [8817], "characters": "\u2271" },
1496
+
"≱": { "codepoints": [8817], "characters": "\u2271" },
1497
+
"≧̸": { "codepoints": [8807, 824], "characters": "\u2267\u0338" },
1498
+
"⩾̸": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" },
1499
+
"⩾̸": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" },
1500
+
"≵": { "codepoints": [8821], "characters": "\u2275" },
1501
+
"≯": { "codepoints": [8815], "characters": "\u226F" },
1502
+
"≯": { "codepoints": [8815], "characters": "\u226F" },
1503
+
"⇎": { "codepoints": [8654], "characters": "\u21CE" },
1504
+
"↮": { "codepoints": [8622], "characters": "\u21AE" },
1505
+
"⫲": { "codepoints": [10994], "characters": "\u2AF2" },
1506
+
"∋": { "codepoints": [8715], "characters": "\u220B" },
1507
+
"⋼": { "codepoints": [8956], "characters": "\u22FC" },
1508
+
"⋺": { "codepoints": [8954], "characters": "\u22FA" },
1509
+
"∋": { "codepoints": [8715], "characters": "\u220B" },
1510
+
"њ": { "codepoints": [1114], "characters": "\u045A" },
1511
+
"⇍": { "codepoints": [8653], "characters": "\u21CD" },
1512
+
"≦̸": { "codepoints": [8806, 824], "characters": "\u2266\u0338" },
1513
+
"↚": { "codepoints": [8602], "characters": "\u219A" },
1514
+
"‥": { "codepoints": [8229], "characters": "\u2025" },
1515
+
"≰": { "codepoints": [8816], "characters": "\u2270" },
1516
+
"↚": { "codepoints": [8602], "characters": "\u219A" },
1517
+
"↮": { "codepoints": [8622], "characters": "\u21AE" },
1518
+
"≰": { "codepoints": [8816], "characters": "\u2270" },
1519
+
"≦̸": { "codepoints": [8806, 824], "characters": "\u2266\u0338" },
1520
+
"⩽̸": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" },
1521
+
"⩽̸": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" },
1522
+
"≮": { "codepoints": [8814], "characters": "\u226E" },
1523
+
"≴": { "codepoints": [8820], "characters": "\u2274" },
1524
+
"≮": { "codepoints": [8814], "characters": "\u226E" },
1525
+
"⋪": { "codepoints": [8938], "characters": "\u22EA" },
1526
+
"⋬": { "codepoints": [8940], "characters": "\u22EC" },
1527
+
"∤": { "codepoints": [8740], "characters": "\u2224" },
1528
+
"𝕟": { "codepoints": [120159], "characters": "\uD835\uDD5F" },
1529
+
"¬": { "codepoints": [172], "characters": "\u00AC" },
1530
+
"¬": { "codepoints": [172], "characters": "\u00AC" },
1531
+
"∉": { "codepoints": [8713], "characters": "\u2209" },
1532
+
"⋹̸": { "codepoints": [8953, 824], "characters": "\u22F9\u0338" },
1533
+
"⋵̸": { "codepoints": [8949, 824], "characters": "\u22F5\u0338" },
1534
+
"∉": { "codepoints": [8713], "characters": "\u2209" },
1535
+
"⋷": { "codepoints": [8951], "characters": "\u22F7" },
1536
+
"⋶": { "codepoints": [8950], "characters": "\u22F6" },
1537
+
"∌": { "codepoints": [8716], "characters": "\u220C" },
1538
+
"∌": { "codepoints": [8716], "characters": "\u220C" },
1539
+
"⋾": { "codepoints": [8958], "characters": "\u22FE" },
1540
+
"⋽": { "codepoints": [8957], "characters": "\u22FD" },
1541
+
"∦": { "codepoints": [8742], "characters": "\u2226" },
1542
+
"∦": { "codepoints": [8742], "characters": "\u2226" },
1543
+
"⫽⃥": { "codepoints": [11005, 8421], "characters": "\u2AFD\u20E5" },
1544
+
"∂̸": { "codepoints": [8706, 824], "characters": "\u2202\u0338" },
1545
+
"⨔": { "codepoints": [10772], "characters": "\u2A14" },
1546
+
"⊀": { "codepoints": [8832], "characters": "\u2280" },
1547
+
"⋠": { "codepoints": [8928], "characters": "\u22E0" },
1548
+
"⪯̸": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" },
1549
+
"⊀": { "codepoints": [8832], "characters": "\u2280" },
1550
+
"⪯̸": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" },
1551
+
"⇏": { "codepoints": [8655], "characters": "\u21CF" },
1552
+
"↛": { "codepoints": [8603], "characters": "\u219B" },
1553
+
"⤳̸": { "codepoints": [10547, 824], "characters": "\u2933\u0338" },
1554
+
"↝̸": { "codepoints": [8605, 824], "characters": "\u219D\u0338" },
1555
+
"↛": { "codepoints": [8603], "characters": "\u219B" },
1556
+
"⋫": { "codepoints": [8939], "characters": "\u22EB" },
1557
+
"⋭": { "codepoints": [8941], "characters": "\u22ED" },
1558
+
"⊁": { "codepoints": [8833], "characters": "\u2281" },
1559
+
"⋡": { "codepoints": [8929], "characters": "\u22E1" },
1560
+
"⪰̸": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" },
1561
+
"𝓃": { "codepoints": [120003], "characters": "\uD835\uDCC3" },
1562
+
"∤": { "codepoints": [8740], "characters": "\u2224" },
1563
+
"∦": { "codepoints": [8742], "characters": "\u2226" },
1564
+
"≁": { "codepoints": [8769], "characters": "\u2241" },
1565
+
"≄": { "codepoints": [8772], "characters": "\u2244" },
1566
+
"≄": { "codepoints": [8772], "characters": "\u2244" },
1567
+
"∤": { "codepoints": [8740], "characters": "\u2224" },
1568
+
"∦": { "codepoints": [8742], "characters": "\u2226" },
1569
+
"⋢": { "codepoints": [8930], "characters": "\u22E2" },
1570
+
"⋣": { "codepoints": [8931], "characters": "\u22E3" },
1571
+
"⊄": { "codepoints": [8836], "characters": "\u2284" },
1572
+
"⫅̸": { "codepoints": [10949, 824], "characters": "\u2AC5\u0338" },
1573
+
"⊈": { "codepoints": [8840], "characters": "\u2288" },
1574
+
"⊂⃒": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" },
1575
+
"⊈": { "codepoints": [8840], "characters": "\u2288" },
1576
+
"⫅̸": { "codepoints": [10949, 824], "characters": "\u2AC5\u0338" },
1577
+
"⊁": { "codepoints": [8833], "characters": "\u2281" },
1578
+
"⪰̸": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" },
1579
+
"⊅": { "codepoints": [8837], "characters": "\u2285" },
1580
+
"⫆̸": { "codepoints": [10950, 824], "characters": "\u2AC6\u0338" },
1581
+
"⊉": { "codepoints": [8841], "characters": "\u2289" },
1582
+
"⊃⃒": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" },
1583
+
"⊉": { "codepoints": [8841], "characters": "\u2289" },
1584
+
"⫆̸": { "codepoints": [10950, 824], "characters": "\u2AC6\u0338" },
1585
+
"≹": { "codepoints": [8825], "characters": "\u2279" },
1586
+
"ñ": { "codepoints": [241], "characters": "\u00F1" },
1587
+
"ñ": { "codepoints": [241], "characters": "\u00F1" },
1588
+
"≸": { "codepoints": [8824], "characters": "\u2278" },
1589
+
"⋪": { "codepoints": [8938], "characters": "\u22EA" },
1590
+
"⋬": { "codepoints": [8940], "characters": "\u22EC" },
1591
+
"⋫": { "codepoints": [8939], "characters": "\u22EB" },
1592
+
"⋭": { "codepoints": [8941], "characters": "\u22ED" },
1593
+
"ν": { "codepoints": [957], "characters": "\u03BD" },
1594
+
"#": { "codepoints": [35], "characters": "\u0023" },
1595
+
"№": { "codepoints": [8470], "characters": "\u2116" },
1596
+
" ": { "codepoints": [8199], "characters": "\u2007" },
1597
+
"⊭": { "codepoints": [8877], "characters": "\u22AD" },
1598
+
"⤄": { "codepoints": [10500], "characters": "\u2904" },
1599
+
"≍⃒": { "codepoints": [8781, 8402], "characters": "\u224D\u20D2" },
1600
+
"⊬": { "codepoints": [8876], "characters": "\u22AC" },
1601
+
"≥⃒": { "codepoints": [8805, 8402], "characters": "\u2265\u20D2" },
1602
+
">⃒": { "codepoints": [62, 8402], "characters": "\u003E\u20D2" },
1603
+
"⧞": { "codepoints": [10718], "characters": "\u29DE" },
1604
+
"⤂": { "codepoints": [10498], "characters": "\u2902" },
1605
+
"≤⃒": { "codepoints": [8804, 8402], "characters": "\u2264\u20D2" },
1606
+
"<⃒": { "codepoints": [60, 8402], "characters": "\u003C\u20D2" },
1607
+
"⊴⃒": { "codepoints": [8884, 8402], "characters": "\u22B4\u20D2" },
1608
+
"⤃": { "codepoints": [10499], "characters": "\u2903" },
1609
+
"⊵⃒": { "codepoints": [8885, 8402], "characters": "\u22B5\u20D2" },
1610
+
"∼⃒": { "codepoints": [8764, 8402], "characters": "\u223C\u20D2" },
1611
+
"⇖": { "codepoints": [8662], "characters": "\u21D6" },
1612
+
"⤣": { "codepoints": [10531], "characters": "\u2923" },
1613
+
"↖": { "codepoints": [8598], "characters": "\u2196" },
1614
+
"↖": { "codepoints": [8598], "characters": "\u2196" },
1615
+
"⤧": { "codepoints": [10535], "characters": "\u2927" },
1616
+
"Ⓢ": { "codepoints": [9416], "characters": "\u24C8" },
1617
+
"ó": { "codepoints": [243], "characters": "\u00F3" },
1618
+
"ó": { "codepoints": [243], "characters": "\u00F3" },
1619
+
"⊛": { "codepoints": [8859], "characters": "\u229B" },
1620
+
"⊚": { "codepoints": [8858], "characters": "\u229A" },
1621
+
"ô": { "codepoints": [244], "characters": "\u00F4" },
1622
+
"ô": { "codepoints": [244], "characters": "\u00F4" },
1623
+
"о": { "codepoints": [1086], "characters": "\u043E" },
1624
+
"⊝": { "codepoints": [8861], "characters": "\u229D" },
1625
+
"ő": { "codepoints": [337], "characters": "\u0151" },
1626
+
"⨸": { "codepoints": [10808], "characters": "\u2A38" },
1627
+
"⊙": { "codepoints": [8857], "characters": "\u2299" },
1628
+
"⦼": { "codepoints": [10684], "characters": "\u29BC" },
1629
+
"œ": { "codepoints": [339], "characters": "\u0153" },
1630
+
"⦿": { "codepoints": [10687], "characters": "\u29BF" },
1631
+
"𝔬": { "codepoints": [120108], "characters": "\uD835\uDD2C" },
1632
+
"˛": { "codepoints": [731], "characters": "\u02DB" },
1633
+
"ò": { "codepoints": [242], "characters": "\u00F2" },
1634
+
"ò": { "codepoints": [242], "characters": "\u00F2" },
1635
+
"⧁": { "codepoints": [10689], "characters": "\u29C1" },
1636
+
"⦵": { "codepoints": [10677], "characters": "\u29B5" },
1637
+
"Ω": { "codepoints": [937], "characters": "\u03A9" },
1638
+
"∮": { "codepoints": [8750], "characters": "\u222E" },
1639
+
"↺": { "codepoints": [8634], "characters": "\u21BA" },
1640
+
"⦾": { "codepoints": [10686], "characters": "\u29BE" },
1641
+
"⦻": { "codepoints": [10683], "characters": "\u29BB" },
1642
+
"‾": { "codepoints": [8254], "characters": "\u203E" },
1643
+
"⧀": { "codepoints": [10688], "characters": "\u29C0" },
1644
+
"ō": { "codepoints": [333], "characters": "\u014D" },
1645
+
"ω": { "codepoints": [969], "characters": "\u03C9" },
1646
+
"ο": { "codepoints": [959], "characters": "\u03BF" },
1647
+
"⦶": { "codepoints": [10678], "characters": "\u29B6" },
1648
+
"⊖": { "codepoints": [8854], "characters": "\u2296" },
1649
+
"𝕠": { "codepoints": [120160], "characters": "\uD835\uDD60" },
1650
+
"⦷": { "codepoints": [10679], "characters": "\u29B7" },
1651
+
"⦹": { "codepoints": [10681], "characters": "\u29B9" },
1652
+
"⊕": { "codepoints": [8853], "characters": "\u2295" },
1653
+
"∨": { "codepoints": [8744], "characters": "\u2228" },
1654
+
"↻": { "codepoints": [8635], "characters": "\u21BB" },
1655
+
"⩝": { "codepoints": [10845], "characters": "\u2A5D" },
1656
+
"ℴ": { "codepoints": [8500], "characters": "\u2134" },
1657
+
"ℴ": { "codepoints": [8500], "characters": "\u2134" },
1658
+
"ª": { "codepoints": [170], "characters": "\u00AA" },
1659
+
"ª": { "codepoints": [170], "characters": "\u00AA" },
1660
+
"º": { "codepoints": [186], "characters": "\u00BA" },
1661
+
"º": { "codepoints": [186], "characters": "\u00BA" },
1662
+
"⊶": { "codepoints": [8886], "characters": "\u22B6" },
1663
+
"⩖": { "codepoints": [10838], "characters": "\u2A56" },
1664
+
"⩗": { "codepoints": [10839], "characters": "\u2A57" },
1665
+
"⩛": { "codepoints": [10843], "characters": "\u2A5B" },
1666
+
"ℴ": { "codepoints": [8500], "characters": "\u2134" },
1667
+
"ø": { "codepoints": [248], "characters": "\u00F8" },
1668
+
"ø": { "codepoints": [248], "characters": "\u00F8" },
1669
+
"⊘": { "codepoints": [8856], "characters": "\u2298" },
1670
+
"õ": { "codepoints": [245], "characters": "\u00F5" },
1671
+
"õ": { "codepoints": [245], "characters": "\u00F5" },
1672
+
"⊗": { "codepoints": [8855], "characters": "\u2297" },
1673
+
"⨶": { "codepoints": [10806], "characters": "\u2A36" },
1674
+
"ö": { "codepoints": [246], "characters": "\u00F6" },
1675
+
"ö": { "codepoints": [246], "characters": "\u00F6" },
1676
+
"⌽": { "codepoints": [9021], "characters": "\u233D" },
1677
+
"∥": { "codepoints": [8741], "characters": "\u2225" },
1678
+
"¶": { "codepoints": [182], "characters": "\u00B6" },
1679
+
"¶": { "codepoints": [182], "characters": "\u00B6" },
1680
+
"∥": { "codepoints": [8741], "characters": "\u2225" },
1681
+
"⫳": { "codepoints": [10995], "characters": "\u2AF3" },
1682
+
"⫽": { "codepoints": [11005], "characters": "\u2AFD" },
1683
+
"∂": { "codepoints": [8706], "characters": "\u2202" },
1684
+
"п": { "codepoints": [1087], "characters": "\u043F" },
1685
+
"%": { "codepoints": [37], "characters": "\u0025" },
1686
+
".": { "codepoints": [46], "characters": "\u002E" },
1687
+
"‰": { "codepoints": [8240], "characters": "\u2030" },
1688
+
"⊥": { "codepoints": [8869], "characters": "\u22A5" },
1689
+
"‱": { "codepoints": [8241], "characters": "\u2031" },
1690
+
"𝔭": { "codepoints": [120109], "characters": "\uD835\uDD2D" },
1691
+
"φ": { "codepoints": [966], "characters": "\u03C6" },
1692
+
"ϕ": { "codepoints": [981], "characters": "\u03D5" },
1693
+
"ℳ": { "codepoints": [8499], "characters": "\u2133" },
1694
+
"☎": { "codepoints": [9742], "characters": "\u260E" },
1695
+
"π": { "codepoints": [960], "characters": "\u03C0" },
1696
+
"⋔": { "codepoints": [8916], "characters": "\u22D4" },
1697
+
"ϖ": { "codepoints": [982], "characters": "\u03D6" },
1698
+
"ℏ": { "codepoints": [8463], "characters": "\u210F" },
1699
+
"ℎ": { "codepoints": [8462], "characters": "\u210E" },
1700
+
"ℏ": { "codepoints": [8463], "characters": "\u210F" },
1701
+
"+": { "codepoints": [43], "characters": "\u002B" },
1702
+
"⨣": { "codepoints": [10787], "characters": "\u2A23" },
1703
+
"⊞": { "codepoints": [8862], "characters": "\u229E" },
1704
+
"⨢": { "codepoints": [10786], "characters": "\u2A22" },
1705
+
"∔": { "codepoints": [8724], "characters": "\u2214" },
1706
+
"⨥": { "codepoints": [10789], "characters": "\u2A25" },
1707
+
"⩲": { "codepoints": [10866], "characters": "\u2A72" },
1708
+
"±": { "codepoints": [177], "characters": "\u00B1" },
1709
+
"±": { "codepoints": [177], "characters": "\u00B1" },
1710
+
"⨦": { "codepoints": [10790], "characters": "\u2A26" },
1711
+
"⨧": { "codepoints": [10791], "characters": "\u2A27" },
1712
+
"±": { "codepoints": [177], "characters": "\u00B1" },
1713
+
"⨕": { "codepoints": [10773], "characters": "\u2A15" },
1714
+
"𝕡": { "codepoints": [120161], "characters": "\uD835\uDD61" },
1715
+
"£": { "codepoints": [163], "characters": "\u00A3" },
1716
+
"£": { "codepoints": [163], "characters": "\u00A3" },
1717
+
"≺": { "codepoints": [8826], "characters": "\u227A" },
1718
+
"⪳": { "codepoints": [10931], "characters": "\u2AB3" },
1719
+
"⪷": { "codepoints": [10935], "characters": "\u2AB7" },
1720
+
"≼": { "codepoints": [8828], "characters": "\u227C" },
1721
+
"⪯": { "codepoints": [10927], "characters": "\u2AAF" },
1722
+
"≺": { "codepoints": [8826], "characters": "\u227A" },
1723
+
"⪷": { "codepoints": [10935], "characters": "\u2AB7" },
1724
+
"≼": { "codepoints": [8828], "characters": "\u227C" },
1725
+
"⪯": { "codepoints": [10927], "characters": "\u2AAF" },
1726
+
"⪹": { "codepoints": [10937], "characters": "\u2AB9" },
1727
+
"⪵": { "codepoints": [10933], "characters": "\u2AB5" },
1728
+
"⋨": { "codepoints": [8936], "characters": "\u22E8" },
1729
+
"≾": { "codepoints": [8830], "characters": "\u227E" },
1730
+
"′": { "codepoints": [8242], "characters": "\u2032" },
1731
+
"ℙ": { "codepoints": [8473], "characters": "\u2119" },
1732
+
"⪵": { "codepoints": [10933], "characters": "\u2AB5" },
1733
+
"⪹": { "codepoints": [10937], "characters": "\u2AB9" },
1734
+
"⋨": { "codepoints": [8936], "characters": "\u22E8" },
1735
+
"∏": { "codepoints": [8719], "characters": "\u220F" },
1736
+
"⌮": { "codepoints": [9006], "characters": "\u232E" },
1737
+
"⌒": { "codepoints": [8978], "characters": "\u2312" },
1738
+
"⌓": { "codepoints": [8979], "characters": "\u2313" },
1739
+
"∝": { "codepoints": [8733], "characters": "\u221D" },
1740
+
"∝": { "codepoints": [8733], "characters": "\u221D" },
1741
+
"≾": { "codepoints": [8830], "characters": "\u227E" },
1742
+
"⊰": { "codepoints": [8880], "characters": "\u22B0" },
1743
+
"𝓅": { "codepoints": [120005], "characters": "\uD835\uDCC5" },
1744
+
"ψ": { "codepoints": [968], "characters": "\u03C8" },
1745
+
" ": { "codepoints": [8200], "characters": "\u2008" },
1746
+
"𝔮": { "codepoints": [120110], "characters": "\uD835\uDD2E" },
1747
+
"⨌": { "codepoints": [10764], "characters": "\u2A0C" },
1748
+
"𝕢": { "codepoints": [120162], "characters": "\uD835\uDD62" },
1749
+
"⁗": { "codepoints": [8279], "characters": "\u2057" },
1750
+
"𝓆": { "codepoints": [120006], "characters": "\uD835\uDCC6" },
1751
+
"ℍ": { "codepoints": [8461], "characters": "\u210D" },
1752
+
"⨖": { "codepoints": [10774], "characters": "\u2A16" },
1753
+
"?": { "codepoints": [63], "characters": "\u003F" },
1754
+
"≟": { "codepoints": [8799], "characters": "\u225F" },
1755
+
""": { "codepoints": [34], "characters": "\u0022" },
1756
+
""": { "codepoints": [34], "characters": "\u0022" },
1757
+
"⇛": { "codepoints": [8667], "characters": "\u21DB" },
1758
+
"⇒": { "codepoints": [8658], "characters": "\u21D2" },
1759
+
"⤜": { "codepoints": [10524], "characters": "\u291C" },
1760
+
"⤏": { "codepoints": [10511], "characters": "\u290F" },
1761
+
"⥤": { "codepoints": [10596], "characters": "\u2964" },
1762
+
"∽̱": { "codepoints": [8765, 817], "characters": "\u223D\u0331" },
1763
+
"ŕ": { "codepoints": [341], "characters": "\u0155" },
1764
+
"√": { "codepoints": [8730], "characters": "\u221A" },
1765
+
"⦳": { "codepoints": [10675], "characters": "\u29B3" },
1766
+
"⟩": { "codepoints": [10217], "characters": "\u27E9" },
1767
+
"⦒": { "codepoints": [10642], "characters": "\u2992" },
1768
+
"⦥": { "codepoints": [10661], "characters": "\u29A5" },
1769
+
"⟩": { "codepoints": [10217], "characters": "\u27E9" },
1770
+
"»": { "codepoints": [187], "characters": "\u00BB" },
1771
+
"»": { "codepoints": [187], "characters": "\u00BB" },
1772
+
"→": { "codepoints": [8594], "characters": "\u2192" },
1773
+
"⥵": { "codepoints": [10613], "characters": "\u2975" },
1774
+
"⇥": { "codepoints": [8677], "characters": "\u21E5" },
1775
+
"⤠": { "codepoints": [10528], "characters": "\u2920" },
1776
+
"⤳": { "codepoints": [10547], "characters": "\u2933" },
1777
+
"⤞": { "codepoints": [10526], "characters": "\u291E" },
1778
+
"↪": { "codepoints": [8618], "characters": "\u21AA" },
1779
+
"↬": { "codepoints": [8620], "characters": "\u21AC" },
1780
+
"⥅": { "codepoints": [10565], "characters": "\u2945" },
1781
+
"⥴": { "codepoints": [10612], "characters": "\u2974" },
1782
+
"↣": { "codepoints": [8611], "characters": "\u21A3" },
1783
+
"↝": { "codepoints": [8605], "characters": "\u219D" },
1784
+
"⤚": { "codepoints": [10522], "characters": "\u291A" },
1785
+
"∶": { "codepoints": [8758], "characters": "\u2236" },
1786
+
"ℚ": { "codepoints": [8474], "characters": "\u211A" },
1787
+
"⤍": { "codepoints": [10509], "characters": "\u290D" },
1788
+
"❳": { "codepoints": [10099], "characters": "\u2773" },
1789
+
"}": { "codepoints": [125], "characters": "\u007D" },
1790
+
"]": { "codepoints": [93], "characters": "\u005D" },
1791
+
"⦌": { "codepoints": [10636], "characters": "\u298C" },
1792
+
"⦎": { "codepoints": [10638], "characters": "\u298E" },
1793
+
"⦐": { "codepoints": [10640], "characters": "\u2990" },
1794
+
"ř": { "codepoints": [345], "characters": "\u0159" },
1795
+
"ŗ": { "codepoints": [343], "characters": "\u0157" },
1796
+
"⌉": { "codepoints": [8969], "characters": "\u2309" },
1797
+
"}": { "codepoints": [125], "characters": "\u007D" },
1798
+
"р": { "codepoints": [1088], "characters": "\u0440" },
1799
+
"⤷": { "codepoints": [10551], "characters": "\u2937" },
1800
+
"⥩": { "codepoints": [10601], "characters": "\u2969" },
1801
+
"”": { "codepoints": [8221], "characters": "\u201D" },
1802
+
"”": { "codepoints": [8221], "characters": "\u201D" },
1803
+
"↳": { "codepoints": [8627], "characters": "\u21B3" },
1804
+
"ℜ": { "codepoints": [8476], "characters": "\u211C" },
1805
+
"ℛ": { "codepoints": [8475], "characters": "\u211B" },
1806
+
"ℜ": { "codepoints": [8476], "characters": "\u211C" },
1807
+
"ℝ": { "codepoints": [8477], "characters": "\u211D" },
1808
+
"▭": { "codepoints": [9645], "characters": "\u25AD" },
1809
+
"®": { "codepoints": [174], "characters": "\u00AE" },
1810
+
"®": { "codepoints": [174], "characters": "\u00AE" },
1811
+
"⥽": { "codepoints": [10621], "characters": "\u297D" },
1812
+
"⌋": { "codepoints": [8971], "characters": "\u230B" },
1813
+
"𝔯": { "codepoints": [120111], "characters": "\uD835\uDD2F" },
1814
+
"⇁": { "codepoints": [8641], "characters": "\u21C1" },
1815
+
"⇀": { "codepoints": [8640], "characters": "\u21C0" },
1816
+
"⥬": { "codepoints": [10604], "characters": "\u296C" },
1817
+
"ρ": { "codepoints": [961], "characters": "\u03C1" },
1818
+
"ϱ": { "codepoints": [1009], "characters": "\u03F1" },
1819
+
"→": { "codepoints": [8594], "characters": "\u2192" },
1820
+
"↣": { "codepoints": [8611], "characters": "\u21A3" },
1821
+
"⇁": { "codepoints": [8641], "characters": "\u21C1" },
1822
+
"⇀": { "codepoints": [8640], "characters": "\u21C0" },
1823
+
"⇄": { "codepoints": [8644], "characters": "\u21C4" },
1824
+
"⇌": { "codepoints": [8652], "characters": "\u21CC" },
1825
+
"⇉": { "codepoints": [8649], "characters": "\u21C9" },
1826
+
"↝": { "codepoints": [8605], "characters": "\u219D" },
1827
+
"⋌": { "codepoints": [8908], "characters": "\u22CC" },
1828
+
"˚": { "codepoints": [730], "characters": "\u02DA" },
1829
+
"≓": { "codepoints": [8787], "characters": "\u2253" },
1830
+
"⇄": { "codepoints": [8644], "characters": "\u21C4" },
1831
+
"⇌": { "codepoints": [8652], "characters": "\u21CC" },
1832
+
"‏": { "codepoints": [8207], "characters": "\u200F" },
1833
+
"⎱": { "codepoints": [9137], "characters": "\u23B1" },
1834
+
"⎱": { "codepoints": [9137], "characters": "\u23B1" },
1835
+
"⫮": { "codepoints": [10990], "characters": "\u2AEE" },
1836
+
"⟭": { "codepoints": [10221], "characters": "\u27ED" },
1837
+
"⇾": { "codepoints": [8702], "characters": "\u21FE" },
1838
+
"⟧": { "codepoints": [10215], "characters": "\u27E7" },
1839
+
"⦆": { "codepoints": [10630], "characters": "\u2986" },
1840
+
"𝕣": { "codepoints": [120163], "characters": "\uD835\uDD63" },
1841
+
"⨮": { "codepoints": [10798], "characters": "\u2A2E" },
1842
+
"⨵": { "codepoints": [10805], "characters": "\u2A35" },
1843
+
")": { "codepoints": [41], "characters": "\u0029" },
1844
+
"⦔": { "codepoints": [10644], "characters": "\u2994" },
1845
+
"⨒": { "codepoints": [10770], "characters": "\u2A12" },
1846
+
"⇉": { "codepoints": [8649], "characters": "\u21C9" },
1847
+
"›": { "codepoints": [8250], "characters": "\u203A" },
1848
+
"𝓇": { "codepoints": [120007], "characters": "\uD835\uDCC7" },
1849
+
"↱": { "codepoints": [8625], "characters": "\u21B1" },
1850
+
"]": { "codepoints": [93], "characters": "\u005D" },
1851
+
"’": { "codepoints": [8217], "characters": "\u2019" },
1852
+
"’": { "codepoints": [8217], "characters": "\u2019" },
1853
+
"⋌": { "codepoints": [8908], "characters": "\u22CC" },
1854
+
"⋊": { "codepoints": [8906], "characters": "\u22CA" },
1855
+
"▹": { "codepoints": [9657], "characters": "\u25B9" },
1856
+
"⊵": { "codepoints": [8885], "characters": "\u22B5" },
1857
+
"▸": { "codepoints": [9656], "characters": "\u25B8" },
1858
+
"⧎": { "codepoints": [10702], "characters": "\u29CE" },
1859
+
"⥨": { "codepoints": [10600], "characters": "\u2968" },
1860
+
"℞": { "codepoints": [8478], "characters": "\u211E" },
1861
+
"ś": { "codepoints": [347], "characters": "\u015B" },
1862
+
"‚": { "codepoints": [8218], "characters": "\u201A" },
1863
+
"≻": { "codepoints": [8827], "characters": "\u227B" },
1864
+
"⪴": { "codepoints": [10932], "characters": "\u2AB4" },
1865
+
"⪸": { "codepoints": [10936], "characters": "\u2AB8" },
1866
+
"š": { "codepoints": [353], "characters": "\u0161" },
1867
+
"≽": { "codepoints": [8829], "characters": "\u227D" },
1868
+
"⪰": { "codepoints": [10928], "characters": "\u2AB0" },
1869
+
"ş": { "codepoints": [351], "characters": "\u015F" },
1870
+
"ŝ": { "codepoints": [349], "characters": "\u015D" },
1871
+
"⪶": { "codepoints": [10934], "characters": "\u2AB6" },
1872
+
"⪺": { "codepoints": [10938], "characters": "\u2ABA" },
1873
+
"⋩": { "codepoints": [8937], "characters": "\u22E9" },
1874
+
"⨓": { "codepoints": [10771], "characters": "\u2A13" },
1875
+
"≿": { "codepoints": [8831], "characters": "\u227F" },
1876
+
"с": { "codepoints": [1089], "characters": "\u0441" },
1877
+
"⋅": { "codepoints": [8901], "characters": "\u22C5" },
1878
+
"⊡": { "codepoints": [8865], "characters": "\u22A1" },
1879
+
"⩦": { "codepoints": [10854], "characters": "\u2A66" },
1880
+
"⇘": { "codepoints": [8664], "characters": "\u21D8" },
1881
+
"⤥": { "codepoints": [10533], "characters": "\u2925" },
1882
+
"↘": { "codepoints": [8600], "characters": "\u2198" },
1883
+
"↘": { "codepoints": [8600], "characters": "\u2198" },
1884
+
"§": { "codepoints": [167], "characters": "\u00A7" },
1885
+
"§": { "codepoints": [167], "characters": "\u00A7" },
1886
+
";": { "codepoints": [59], "characters": "\u003B" },
1887
+
"⤩": { "codepoints": [10537], "characters": "\u2929" },
1888
+
"∖": { "codepoints": [8726], "characters": "\u2216" },
1889
+
"∖": { "codepoints": [8726], "characters": "\u2216" },
1890
+
"✶": { "codepoints": [10038], "characters": "\u2736" },
1891
+
"𝔰": { "codepoints": [120112], "characters": "\uD835\uDD30" },
1892
+
"⌢": { "codepoints": [8994], "characters": "\u2322" },
1893
+
"♯": { "codepoints": [9839], "characters": "\u266F" },
1894
+
"щ": { "codepoints": [1097], "characters": "\u0449" },
1895
+
"ш": { "codepoints": [1096], "characters": "\u0448" },
1896
+
"∣": { "codepoints": [8739], "characters": "\u2223" },
1897
+
"∥": { "codepoints": [8741], "characters": "\u2225" },
1898
+
"­": { "codepoints": [173], "characters": "\u00AD" },
1899
+
"­": { "codepoints": [173], "characters": "\u00AD" },
1900
+
"σ": { "codepoints": [963], "characters": "\u03C3" },
1901
+
"ς": { "codepoints": [962], "characters": "\u03C2" },
1902
+
"ς": { "codepoints": [962], "characters": "\u03C2" },
1903
+
"∼": { "codepoints": [8764], "characters": "\u223C" },
1904
+
"⩪": { "codepoints": [10858], "characters": "\u2A6A" },
1905
+
"≃": { "codepoints": [8771], "characters": "\u2243" },
1906
+
"≃": { "codepoints": [8771], "characters": "\u2243" },
1907
+
"⪞": { "codepoints": [10910], "characters": "\u2A9E" },
1908
+
"⪠": { "codepoints": [10912], "characters": "\u2AA0" },
1909
+
"⪝": { "codepoints": [10909], "characters": "\u2A9D" },
1910
+
"⪟": { "codepoints": [10911], "characters": "\u2A9F" },
1911
+
"≆": { "codepoints": [8774], "characters": "\u2246" },
1912
+
"⨤": { "codepoints": [10788], "characters": "\u2A24" },
1913
+
"⥲": { "codepoints": [10610], "characters": "\u2972" },
1914
+
"←": { "codepoints": [8592], "characters": "\u2190" },
1915
+
"∖": { "codepoints": [8726], "characters": "\u2216" },
1916
+
"⨳": { "codepoints": [10803], "characters": "\u2A33" },
1917
+
"⧤": { "codepoints": [10724], "characters": "\u29E4" },
1918
+
"∣": { "codepoints": [8739], "characters": "\u2223" },
1919
+
"⌣": { "codepoints": [8995], "characters": "\u2323" },
1920
+
"⪪": { "codepoints": [10922], "characters": "\u2AAA" },
1921
+
"⪬": { "codepoints": [10924], "characters": "\u2AAC" },
1922
+
"⪬︀": { "codepoints": [10924, 65024], "characters": "\u2AAC\uFE00" },
1923
+
"ь": { "codepoints": [1100], "characters": "\u044C" },
1924
+
"/": { "codepoints": [47], "characters": "\u002F" },
1925
+
"⧄": { "codepoints": [10692], "characters": "\u29C4" },
1926
+
"⌿": { "codepoints": [9023], "characters": "\u233F" },
1927
+
"𝕤": { "codepoints": [120164], "characters": "\uD835\uDD64" },
1928
+
"♠": { "codepoints": [9824], "characters": "\u2660" },
1929
+
"♠": { "codepoints": [9824], "characters": "\u2660" },
1930
+
"∥": { "codepoints": [8741], "characters": "\u2225" },
1931
+
"⊓": { "codepoints": [8851], "characters": "\u2293" },
1932
+
"⊓︀": { "codepoints": [8851, 65024], "characters": "\u2293\uFE00" },
1933
+
"⊔": { "codepoints": [8852], "characters": "\u2294" },
1934
+
"⊔︀": { "codepoints": [8852, 65024], "characters": "\u2294\uFE00" },
1935
+
"⊏": { "codepoints": [8847], "characters": "\u228F" },
1936
+
"⊑": { "codepoints": [8849], "characters": "\u2291" },
1937
+
"⊏": { "codepoints": [8847], "characters": "\u228F" },
1938
+
"⊑": { "codepoints": [8849], "characters": "\u2291" },
1939
+
"⊐": { "codepoints": [8848], "characters": "\u2290" },
1940
+
"⊒": { "codepoints": [8850], "characters": "\u2292" },
1941
+
"⊐": { "codepoints": [8848], "characters": "\u2290" },
1942
+
"⊒": { "codepoints": [8850], "characters": "\u2292" },
1943
+
"□": { "codepoints": [9633], "characters": "\u25A1" },
1944
+
"□": { "codepoints": [9633], "characters": "\u25A1" },
1945
+
"▪": { "codepoints": [9642], "characters": "\u25AA" },
1946
+
"▪": { "codepoints": [9642], "characters": "\u25AA" },
1947
+
"→": { "codepoints": [8594], "characters": "\u2192" },
1948
+
"𝓈": { "codepoints": [120008], "characters": "\uD835\uDCC8" },
1949
+
"∖": { "codepoints": [8726], "characters": "\u2216" },
1950
+
"⌣": { "codepoints": [8995], "characters": "\u2323" },
1951
+
"⋆": { "codepoints": [8902], "characters": "\u22C6" },
1952
+
"☆": { "codepoints": [9734], "characters": "\u2606" },
1953
+
"★": { "codepoints": [9733], "characters": "\u2605" },
1954
+
"ϵ": { "codepoints": [1013], "characters": "\u03F5" },
1955
+
"ϕ": { "codepoints": [981], "characters": "\u03D5" },
1956
+
"¯": { "codepoints": [175], "characters": "\u00AF" },
1957
+
"⊂": { "codepoints": [8834], "characters": "\u2282" },
1958
+
"⫅": { "codepoints": [10949], "characters": "\u2AC5" },
1959
+
"⪽": { "codepoints": [10941], "characters": "\u2ABD" },
1960
+
"⊆": { "codepoints": [8838], "characters": "\u2286" },
1961
+
"⫃": { "codepoints": [10947], "characters": "\u2AC3" },
1962
+
"⫁": { "codepoints": [10945], "characters": "\u2AC1" },
1963
+
"⫋": { "codepoints": [10955], "characters": "\u2ACB" },
1964
+
"⊊": { "codepoints": [8842], "characters": "\u228A" },
1965
+
"⪿": { "codepoints": [10943], "characters": "\u2ABF" },
1966
+
"⥹": { "codepoints": [10617], "characters": "\u2979" },
1967
+
"⊂": { "codepoints": [8834], "characters": "\u2282" },
1968
+
"⊆": { "codepoints": [8838], "characters": "\u2286" },
1969
+
"⫅": { "codepoints": [10949], "characters": "\u2AC5" },
1970
+
"⊊": { "codepoints": [8842], "characters": "\u228A" },
1971
+
"⫋": { "codepoints": [10955], "characters": "\u2ACB" },
1972
+
"⫇": { "codepoints": [10951], "characters": "\u2AC7" },
1973
+
"⫕": { "codepoints": [10965], "characters": "\u2AD5" },
1974
+
"⫓": { "codepoints": [10963], "characters": "\u2AD3" },
1975
+
"≻": { "codepoints": [8827], "characters": "\u227B" },
1976
+
"⪸": { "codepoints": [10936], "characters": "\u2AB8" },
1977
+
"≽": { "codepoints": [8829], "characters": "\u227D" },
1978
+
"⪰": { "codepoints": [10928], "characters": "\u2AB0" },
1979
+
"⪺": { "codepoints": [10938], "characters": "\u2ABA" },
1980
+
"⪶": { "codepoints": [10934], "characters": "\u2AB6" },
1981
+
"⋩": { "codepoints": [8937], "characters": "\u22E9" },
1982
+
"≿": { "codepoints": [8831], "characters": "\u227F" },
1983
+
"∑": { "codepoints": [8721], "characters": "\u2211" },
1984
+
"♪": { "codepoints": [9834], "characters": "\u266A" },
1985
+
"¹": { "codepoints": [185], "characters": "\u00B9" },
1986
+
"¹": { "codepoints": [185], "characters": "\u00B9" },
1987
+
"²": { "codepoints": [178], "characters": "\u00B2" },
1988
+
"²": { "codepoints": [178], "characters": "\u00B2" },
1989
+
"³": { "codepoints": [179], "characters": "\u00B3" },
1990
+
"³": { "codepoints": [179], "characters": "\u00B3" },
1991
+
"⊃": { "codepoints": [8835], "characters": "\u2283" },
1992
+
"⫆": { "codepoints": [10950], "characters": "\u2AC6" },
1993
+
"⪾": { "codepoints": [10942], "characters": "\u2ABE" },
1994
+
"⫘": { "codepoints": [10968], "characters": "\u2AD8" },
1995
+
"⊇": { "codepoints": [8839], "characters": "\u2287" },
1996
+
"⫄": { "codepoints": [10948], "characters": "\u2AC4" },
1997
+
"⟉": { "codepoints": [10185], "characters": "\u27C9" },
1998
+
"⫗": { "codepoints": [10967], "characters": "\u2AD7" },
1999
+
"⥻": { "codepoints": [10619], "characters": "\u297B" },
2000
+
"⫂": { "codepoints": [10946], "characters": "\u2AC2" },
2001
+
"⫌": { "codepoints": [10956], "characters": "\u2ACC" },
2002
+
"⊋": { "codepoints": [8843], "characters": "\u228B" },
2003
+
"⫀": { "codepoints": [10944], "characters": "\u2AC0" },
2004
+
"⊃": { "codepoints": [8835], "characters": "\u2283" },
2005
+
"⊇": { "codepoints": [8839], "characters": "\u2287" },
2006
+
"⫆": { "codepoints": [10950], "characters": "\u2AC6" },
2007
+
"⊋": { "codepoints": [8843], "characters": "\u228B" },
2008
+
"⫌": { "codepoints": [10956], "characters": "\u2ACC" },
2009
+
"⫈": { "codepoints": [10952], "characters": "\u2AC8" },
2010
+
"⫔": { "codepoints": [10964], "characters": "\u2AD4" },
2011
+
"⫖": { "codepoints": [10966], "characters": "\u2AD6" },
2012
+
"⇙": { "codepoints": [8665], "characters": "\u21D9" },
2013
+
"⤦": { "codepoints": [10534], "characters": "\u2926" },
2014
+
"↙": { "codepoints": [8601], "characters": "\u2199" },
2015
+
"↙": { "codepoints": [8601], "characters": "\u2199" },
2016
+
"⤪": { "codepoints": [10538], "characters": "\u292A" },
2017
+
"ß": { "codepoints": [223], "characters": "\u00DF" },
2018
+
"ß": { "codepoints": [223], "characters": "\u00DF" },
2019
+
"⌖": { "codepoints": [8982], "characters": "\u2316" },
2020
+
"τ": { "codepoints": [964], "characters": "\u03C4" },
2021
+
"⎴": { "codepoints": [9140], "characters": "\u23B4" },
2022
+
"ť": { "codepoints": [357], "characters": "\u0165" },
2023
+
"ţ": { "codepoints": [355], "characters": "\u0163" },
2024
+
"т": { "codepoints": [1090], "characters": "\u0442" },
2025
+
"⃛": { "codepoints": [8411], "characters": "\u20DB" },
2026
+
"⌕": { "codepoints": [8981], "characters": "\u2315" },
2027
+
"𝔱": { "codepoints": [120113], "characters": "\uD835\uDD31" },
2028
+
"∴": { "codepoints": [8756], "characters": "\u2234" },
2029
+
"∴": { "codepoints": [8756], "characters": "\u2234" },
2030
+
"θ": { "codepoints": [952], "characters": "\u03B8" },
2031
+
"ϑ": { "codepoints": [977], "characters": "\u03D1" },
2032
+
"ϑ": { "codepoints": [977], "characters": "\u03D1" },
2033
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
2034
+
"∼": { "codepoints": [8764], "characters": "\u223C" },
2035
+
" ": { "codepoints": [8201], "characters": "\u2009" },
2036
+
"≈": { "codepoints": [8776], "characters": "\u2248" },
2037
+
"∼": { "codepoints": [8764], "characters": "\u223C" },
2038
+
"þ": { "codepoints": [254], "characters": "\u00FE" },
2039
+
"þ": { "codepoints": [254], "characters": "\u00FE" },
2040
+
"˜": { "codepoints": [732], "characters": "\u02DC" },
2041
+
"×": { "codepoints": [215], "characters": "\u00D7" },
2042
+
"×": { "codepoints": [215], "characters": "\u00D7" },
2043
+
"⊠": { "codepoints": [8864], "characters": "\u22A0" },
2044
+
"⨱": { "codepoints": [10801], "characters": "\u2A31" },
2045
+
"⨰": { "codepoints": [10800], "characters": "\u2A30" },
2046
+
"∭": { "codepoints": [8749], "characters": "\u222D" },
2047
+
"⤨": { "codepoints": [10536], "characters": "\u2928" },
2048
+
"⊤": { "codepoints": [8868], "characters": "\u22A4" },
2049
+
"⌶": { "codepoints": [9014], "characters": "\u2336" },
2050
+
"⫱": { "codepoints": [10993], "characters": "\u2AF1" },
2051
+
"𝕥": { "codepoints": [120165], "characters": "\uD835\uDD65" },
2052
+
"⫚": { "codepoints": [10970], "characters": "\u2ADA" },
2053
+
"⤩": { "codepoints": [10537], "characters": "\u2929" },
2054
+
"‴": { "codepoints": [8244], "characters": "\u2034" },
2055
+
"™": { "codepoints": [8482], "characters": "\u2122" },
2056
+
"▵": { "codepoints": [9653], "characters": "\u25B5" },
2057
+
"▿": { "codepoints": [9663], "characters": "\u25BF" },
2058
+
"◃": { "codepoints": [9667], "characters": "\u25C3" },
2059
+
"⊴": { "codepoints": [8884], "characters": "\u22B4" },
2060
+
"≜": { "codepoints": [8796], "characters": "\u225C" },
2061
+
"▹": { "codepoints": [9657], "characters": "\u25B9" },
2062
+
"⊵": { "codepoints": [8885], "characters": "\u22B5" },
2063
+
"◬": { "codepoints": [9708], "characters": "\u25EC" },
2064
+
"≜": { "codepoints": [8796], "characters": "\u225C" },
2065
+
"⨺": { "codepoints": [10810], "characters": "\u2A3A" },
2066
+
"⨹": { "codepoints": [10809], "characters": "\u2A39" },
2067
+
"⧍": { "codepoints": [10701], "characters": "\u29CD" },
2068
+
"⨻": { "codepoints": [10811], "characters": "\u2A3B" },
2069
+
"⏢": { "codepoints": [9186], "characters": "\u23E2" },
2070
+
"𝓉": { "codepoints": [120009], "characters": "\uD835\uDCC9" },
2071
+
"ц": { "codepoints": [1094], "characters": "\u0446" },
2072
+
"ћ": { "codepoints": [1115], "characters": "\u045B" },
2073
+
"ŧ": { "codepoints": [359], "characters": "\u0167" },
2074
+
"≬": { "codepoints": [8812], "characters": "\u226C" },
2075
+
"↞": { "codepoints": [8606], "characters": "\u219E" },
2076
+
"↠": { "codepoints": [8608], "characters": "\u21A0" },
2077
+
"⇑": { "codepoints": [8657], "characters": "\u21D1" },
2078
+
"⥣": { "codepoints": [10595], "characters": "\u2963" },
2079
+
"ú": { "codepoints": [250], "characters": "\u00FA" },
2080
+
"ú": { "codepoints": [250], "characters": "\u00FA" },
2081
+
"↑": { "codepoints": [8593], "characters": "\u2191" },
2082
+
"ў": { "codepoints": [1118], "characters": "\u045E" },
2083
+
"ŭ": { "codepoints": [365], "characters": "\u016D" },
2084
+
"û": { "codepoints": [251], "characters": "\u00FB" },
2085
+
"û": { "codepoints": [251], "characters": "\u00FB" },
2086
+
"у": { "codepoints": [1091], "characters": "\u0443" },
2087
+
"⇅": { "codepoints": [8645], "characters": "\u21C5" },
2088
+
"ű": { "codepoints": [369], "characters": "\u0171" },
2089
+
"⥮": { "codepoints": [10606], "characters": "\u296E" },
2090
+
"⥾": { "codepoints": [10622], "characters": "\u297E" },
2091
+
"𝔲": { "codepoints": [120114], "characters": "\uD835\uDD32" },
2092
+
"ù": { "codepoints": [249], "characters": "\u00F9" },
2093
+
"ù": { "codepoints": [249], "characters": "\u00F9" },
2094
+
"↿": { "codepoints": [8639], "characters": "\u21BF" },
2095
+
"↾": { "codepoints": [8638], "characters": "\u21BE" },
2096
+
"▀": { "codepoints": [9600], "characters": "\u2580" },
2097
+
"⌜": { "codepoints": [8988], "characters": "\u231C" },
2098
+
"⌜": { "codepoints": [8988], "characters": "\u231C" },
2099
+
"⌏": { "codepoints": [8975], "characters": "\u230F" },
2100
+
"◸": { "codepoints": [9720], "characters": "\u25F8" },
2101
+
"ū": { "codepoints": [363], "characters": "\u016B" },
2102
+
"¨": { "codepoints": [168], "characters": "\u00A8" },
2103
+
"¨": { "codepoints": [168], "characters": "\u00A8" },
2104
+
"ų": { "codepoints": [371], "characters": "\u0173" },
2105
+
"𝕦": { "codepoints": [120166], "characters": "\uD835\uDD66" },
2106
+
"↑": { "codepoints": [8593], "characters": "\u2191" },
2107
+
"↕": { "codepoints": [8597], "characters": "\u2195" },
2108
+
"↿": { "codepoints": [8639], "characters": "\u21BF" },
2109
+
"↾": { "codepoints": [8638], "characters": "\u21BE" },
2110
+
"⊎": { "codepoints": [8846], "characters": "\u228E" },
2111
+
"υ": { "codepoints": [965], "characters": "\u03C5" },
2112
+
"ϒ": { "codepoints": [978], "characters": "\u03D2" },
2113
+
"υ": { "codepoints": [965], "characters": "\u03C5" },
2114
+
"⇈": { "codepoints": [8648], "characters": "\u21C8" },
2115
+
"⌝": { "codepoints": [8989], "characters": "\u231D" },
2116
+
"⌝": { "codepoints": [8989], "characters": "\u231D" },
2117
+
"⌎": { "codepoints": [8974], "characters": "\u230E" },
2118
+
"ů": { "codepoints": [367], "characters": "\u016F" },
2119
+
"◹": { "codepoints": [9721], "characters": "\u25F9" },
2120
+
"𝓊": { "codepoints": [120010], "characters": "\uD835\uDCCA" },
2121
+
"⋰": { "codepoints": [8944], "characters": "\u22F0" },
2122
+
"ũ": { "codepoints": [361], "characters": "\u0169" },
2123
+
"▵": { "codepoints": [9653], "characters": "\u25B5" },
2124
+
"▴": { "codepoints": [9652], "characters": "\u25B4" },
2125
+
"⇈": { "codepoints": [8648], "characters": "\u21C8" },
2126
+
"ü": { "codepoints": [252], "characters": "\u00FC" },
2127
+
"ü": { "codepoints": [252], "characters": "\u00FC" },
2128
+
"⦧": { "codepoints": [10663], "characters": "\u29A7" },
2129
+
"⇕": { "codepoints": [8661], "characters": "\u21D5" },
2130
+
"⫨": { "codepoints": [10984], "characters": "\u2AE8" },
2131
+
"⫩": { "codepoints": [10985], "characters": "\u2AE9" },
2132
+
"⊨": { "codepoints": [8872], "characters": "\u22A8" },
2133
+
"⦜": { "codepoints": [10652], "characters": "\u299C" },
2134
+
"ϵ": { "codepoints": [1013], "characters": "\u03F5" },
2135
+
"ϰ": { "codepoints": [1008], "characters": "\u03F0" },
2136
+
"∅": { "codepoints": [8709], "characters": "\u2205" },
2137
+
"ϕ": { "codepoints": [981], "characters": "\u03D5" },
2138
+
"ϖ": { "codepoints": [982], "characters": "\u03D6" },
2139
+
"∝": { "codepoints": [8733], "characters": "\u221D" },
2140
+
"↕": { "codepoints": [8597], "characters": "\u2195" },
2141
+
"ϱ": { "codepoints": [1009], "characters": "\u03F1" },
2142
+
"ς": { "codepoints": [962], "characters": "\u03C2" },
2143
+
"⊊︀": { "codepoints": [8842, 65024], "characters": "\u228A\uFE00" },
2144
+
"⫋︀": { "codepoints": [10955, 65024], "characters": "\u2ACB\uFE00" },
2145
+
"⊋︀": { "codepoints": [8843, 65024], "characters": "\u228B\uFE00" },
2146
+
"⫌︀": { "codepoints": [10956, 65024], "characters": "\u2ACC\uFE00" },
2147
+
"ϑ": { "codepoints": [977], "characters": "\u03D1" },
2148
+
"⊲": { "codepoints": [8882], "characters": "\u22B2" },
2149
+
"⊳": { "codepoints": [8883], "characters": "\u22B3" },
2150
+
"в": { "codepoints": [1074], "characters": "\u0432" },
2151
+
"⊢": { "codepoints": [8866], "characters": "\u22A2" },
2152
+
"∨": { "codepoints": [8744], "characters": "\u2228" },
2153
+
"⊻": { "codepoints": [8891], "characters": "\u22BB" },
2154
+
"≚": { "codepoints": [8794], "characters": "\u225A" },
2155
+
"⋮": { "codepoints": [8942], "characters": "\u22EE" },
2156
+
"|": { "codepoints": [124], "characters": "\u007C" },
2157
+
"|": { "codepoints": [124], "characters": "\u007C" },
2158
+
"𝔳": { "codepoints": [120115], "characters": "\uD835\uDD33" },
2159
+
"⊲": { "codepoints": [8882], "characters": "\u22B2" },
2160
+
"⊂⃒": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" },
2161
+
"⊃⃒": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" },
2162
+
"𝕧": { "codepoints": [120167], "characters": "\uD835\uDD67" },
2163
+
"∝": { "codepoints": [8733], "characters": "\u221D" },
2164
+
"⊳": { "codepoints": [8883], "characters": "\u22B3" },
2165
+
"𝓋": { "codepoints": [120011], "characters": "\uD835\uDCCB" },
2166
+
"⫋︀": { "codepoints": [10955, 65024], "characters": "\u2ACB\uFE00" },
2167
+
"⊊︀": { "codepoints": [8842, 65024], "characters": "\u228A\uFE00" },
2168
+
"⫌︀": { "codepoints": [10956, 65024], "characters": "\u2ACC\uFE00" },
2169
+
"⊋︀": { "codepoints": [8843, 65024], "characters": "\u228B\uFE00" },
2170
+
"⦚": { "codepoints": [10650], "characters": "\u299A" },
2171
+
"ŵ": { "codepoints": [373], "characters": "\u0175" },
2172
+
"⩟": { "codepoints": [10847], "characters": "\u2A5F" },
2173
+
"∧": { "codepoints": [8743], "characters": "\u2227" },
2174
+
"≙": { "codepoints": [8793], "characters": "\u2259" },
2175
+
"℘": { "codepoints": [8472], "characters": "\u2118" },
2176
+
"𝔴": { "codepoints": [120116], "characters": "\uD835\uDD34" },
2177
+
"𝕨": { "codepoints": [120168], "characters": "\uD835\uDD68" },
2178
+
"℘": { "codepoints": [8472], "characters": "\u2118" },
2179
+
"≀": { "codepoints": [8768], "characters": "\u2240" },
2180
+
"≀": { "codepoints": [8768], "characters": "\u2240" },
2181
+
"𝓌": { "codepoints": [120012], "characters": "\uD835\uDCCC" },
2182
+
"⋂": { "codepoints": [8898], "characters": "\u22C2" },
2183
+
"◯": { "codepoints": [9711], "characters": "\u25EF" },
2184
+
"⋃": { "codepoints": [8899], "characters": "\u22C3" },
2185
+
"▽": { "codepoints": [9661], "characters": "\u25BD" },
2186
+
"𝔵": { "codepoints": [120117], "characters": "\uD835\uDD35" },
2187
+
"⟺": { "codepoints": [10234], "characters": "\u27FA" },
2188
+
"⟷": { "codepoints": [10231], "characters": "\u27F7" },
2189
+
"ξ": { "codepoints": [958], "characters": "\u03BE" },
2190
+
"⟸": { "codepoints": [10232], "characters": "\u27F8" },
2191
+
"⟵": { "codepoints": [10229], "characters": "\u27F5" },
2192
+
"⟼": { "codepoints": [10236], "characters": "\u27FC" },
2193
+
"⋻": { "codepoints": [8955], "characters": "\u22FB" },
2194
+
"⨀": { "codepoints": [10752], "characters": "\u2A00" },
2195
+
"𝕩": { "codepoints": [120169], "characters": "\uD835\uDD69" },
2196
+
"⨁": { "codepoints": [10753], "characters": "\u2A01" },
2197
+
"⨂": { "codepoints": [10754], "characters": "\u2A02" },
2198
+
"⟹": { "codepoints": [10233], "characters": "\u27F9" },
2199
+
"⟶": { "codepoints": [10230], "characters": "\u27F6" },
2200
+
"𝓍": { "codepoints": [120013], "characters": "\uD835\uDCCD" },
2201
+
"⨆": { "codepoints": [10758], "characters": "\u2A06" },
2202
+
"⨄": { "codepoints": [10756], "characters": "\u2A04" },
2203
+
"△": { "codepoints": [9651], "characters": "\u25B3" },
2204
+
"⋁": { "codepoints": [8897], "characters": "\u22C1" },
2205
+
"⋀": { "codepoints": [8896], "characters": "\u22C0" },
2206
+
"ý": { "codepoints": [253], "characters": "\u00FD" },
2207
+
"ý": { "codepoints": [253], "characters": "\u00FD" },
2208
+
"я": { "codepoints": [1103], "characters": "\u044F" },
2209
+
"ŷ": { "codepoints": [375], "characters": "\u0177" },
2210
+
"ы": { "codepoints": [1099], "characters": "\u044B" },
2211
+
"¥": { "codepoints": [165], "characters": "\u00A5" },
2212
+
"¥": { "codepoints": [165], "characters": "\u00A5" },
2213
+
"𝔶": { "codepoints": [120118], "characters": "\uD835\uDD36" },
2214
+
"ї": { "codepoints": [1111], "characters": "\u0457" },
2215
+
"𝕪": { "codepoints": [120170], "characters": "\uD835\uDD6A" },
2216
+
"𝓎": { "codepoints": [120014], "characters": "\uD835\uDCCE" },
2217
+
"ю": { "codepoints": [1102], "characters": "\u044E" },
2218
+
"ÿ": { "codepoints": [255], "characters": "\u00FF" },
2219
+
"ÿ": { "codepoints": [255], "characters": "\u00FF" },
2220
+
"ź": { "codepoints": [378], "characters": "\u017A" },
2221
+
"ž": { "codepoints": [382], "characters": "\u017E" },
2222
+
"з": { "codepoints": [1079], "characters": "\u0437" },
2223
+
"ż": { "codepoints": [380], "characters": "\u017C" },
2224
+
"ℨ": { "codepoints": [8488], "characters": "\u2128" },
2225
+
"ζ": { "codepoints": [950], "characters": "\u03B6" },
2226
+
"𝔷": { "codepoints": [120119], "characters": "\uD835\uDD37" },
2227
+
"ж": { "codepoints": [1078], "characters": "\u0436" },
2228
+
"⇝": { "codepoints": [8669], "characters": "\u21DD" },
2229
+
"𝕫": { "codepoints": [120171], "characters": "\uD835\uDD6B" },
2230
+
"𝓏": { "codepoints": [120015], "characters": "\uD835\uDCCF" },
2231
+
"‍": { "codepoints": [8205], "characters": "\u200D" },
2232
+
"‌": { "codepoints": [8204], "characters": "\u200C" }
2233
+
}
+21
dune-project
···
1
+
(lang dune 3.0)
2
+
(name html5rw)
3
+
(version 0.1.0)
4
+
5
+
(generate_opam_files true)
6
+
7
+
(source (github username/html5rw))
8
+
(license MIT)
9
+
(authors "Author")
10
+
(maintainers "author@example.com")
11
+
12
+
(package
13
+
(name html5rw)
14
+
(synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification")
15
+
(description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.")
16
+
(depends
17
+
(ocaml (>= 4.14.0))
18
+
(bytesrw (>= 0.3.0))
19
+
(uutf (>= 1.0.0))
20
+
(re (>= 1.10.0))
21
+
(yojson (and :build (>= 2.0.0)))))
+32
examples/basic_parsing.ml
···
1
+
open Bytesrw
2
+
3
+
(* Basic HTML parsing example *)
4
+
5
+
let html = {|
6
+
<!DOCTYPE html>
7
+
<html>
8
+
<head>
9
+
<title>Hello World</title>
10
+
</head>
11
+
<body>
12
+
<h1>Welcome</h1>
13
+
<p>This is a <strong>simple</strong> example.</p>
14
+
</body>
15
+
</html>
16
+
|}
17
+
18
+
let () =
19
+
(* Parse HTML string *)
20
+
let result = Html5rw.parse (Bytes.Reader.of_string html) in
21
+
22
+
(* Access the root document node *)
23
+
let doc = Html5rw.root result in
24
+
Printf.printf "Root node: %s\n" doc.Html5rw.Dom.name;
25
+
26
+
(* Convert back to HTML *)
27
+
let output = Html5rw.to_string result in
28
+
Printf.printf "\nParsed and serialized:\n%s\n" output;
29
+
30
+
(* Extract plain text *)
31
+
let text = Html5rw.to_text result in
32
+
Printf.printf "\nText content: %s\n" text
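To run the same pipeline over a file on disk, a minimal sketch (it reuses only the Html5rw.parse, Html5rw.to_text, and Bytes.Reader.of_string calls shown above; the "page.html" path is a placeholder):

open Bytesrw

(* Slurp a file with Stdlib.In_channel and feed it to the parser as a string reader. *)
let () =
  let path = "page.html" in                      (* placeholder input file *)
  let contents = In_channel.with_open_bin path In_channel.input_all in
  let result = Html5rw.parse (Bytes.Reader.of_string contents) in
  print_endline (Html5rw.to_text result)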
+122
examples/css_selectors.ml
···
1
+
open Bytesrw
2
+
3
+
(* CSS selector query example *)
4
+
5
+
let html = {|
6
+
<!DOCTYPE html>
7
+
<html>
8
+
<head><title>Products</title></head>
9
+
<body>
10
+
<div class="container">
11
+
<h1 id="title">Product List</h1>
12
+
<ul class="products">
13
+
<li class="product" data-id="1">
14
+
<span class="name">Widget A</span>
15
+
<span class="price">$10.00</span>
16
+
</li>
17
+
<li class="product" data-id="2">
18
+
<span class="name">Widget B</span>
19
+
<span class="price">$15.00</span>
20
+
</li>
21
+
<li class="product featured" data-id="3">
22
+
<span class="name">Widget C</span>
23
+
<span class="price">$20.00</span>
24
+
</li>
25
+
</ul>
26
+
</div>
27
+
</body>
28
+
</html>
29
+
|}
30
+
31
+
let () =
32
+
let result = Html5rw.parse (Bytes.Reader.of_string html) in
33
+
34
+
(* Find element by ID *)
35
+
Printf.printf "=== ID Selector (#title) ===\n";
36
+
let titles = Html5rw.query result "#title" in
37
+
List.iter (fun node ->
38
+
Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
39
+
) titles;
40
+
41
+
(* Find elements by class *)
42
+
Printf.printf "\n=== Class Selector (.product) ===\n";
43
+
let products = Html5rw.query result ".product" in
44
+
Printf.printf "Found %d products\n" (List.length products);
45
+
46
+
(* Find elements by tag *)
47
+
Printf.printf "\n=== Tag Selector (span) ===\n";
48
+
let spans = Html5rw.query result "span" in
49
+
Printf.printf "Found %d span elements\n" (List.length spans);
50
+
51
+
(* Find with attribute presence *)
52
+
Printf.printf "\n=== Attribute Presence ([data-id]) ===\n";
53
+
let with_data_id = Html5rw.query result "[data-id]" in
54
+
List.iter (fun node ->
55
+
match Html5rw.get_attr node "data-id" with
56
+
| Some id -> Printf.printf "Found element with data-id=%s\n" id
57
+
| None -> ()
58
+
) with_data_id;
59
+
60
+
(* Find with attribute value *)
61
+
Printf.printf "\n=== Attribute Value ([data-id=\"3\"]) ===\n";
62
+
let featured = Html5rw.query result "[data-id=\"3\"]" in
63
+
List.iter (fun node ->
64
+
Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
65
+
) featured;
66
+
67
+
(* Find with multiple classes *)
68
+
Printf.printf "\n=== Multiple Classes (.product.featured) ===\n";
69
+
let featured_products = Html5rw.query result ".product.featured" in
70
+
List.iter (fun node ->
71
+
Printf.printf "Featured: %s\n" (Html5rw.get_text_content node)
72
+
) featured_products;
73
+
74
+
(* Check if a node matches a selector *)
75
+
Printf.printf "\n=== Match Check (.featured) ===\n";
76
+
List.iter (fun node ->
77
+
if Html5rw.matches node ".featured" then
78
+
Printf.printf "This product is featured!\n"
79
+
) products;
80
+
81
+
(* Pseudo-class: first-child *)
82
+
Printf.printf "\n=== Pseudo-class (:first-child) ===\n";
83
+
let first = Html5rw.query result "li:first-child" in
84
+
List.iter (fun node ->
85
+
Printf.printf "First li: %s\n" (String.trim (Html5rw.get_text_content node))
86
+
) first;
87
+
88
+
(* Pseudo-class: last-child *)
89
+
Printf.printf "\n=== Pseudo-class (:last-child) ===\n";
90
+
let last = Html5rw.query result "li:last-child" in
91
+
List.iter (fun node ->
92
+
Printf.printf "Last li: %s\n" (String.trim (Html5rw.get_text_content node))
93
+
) last;
94
+
95
+
(* Universal selector *)
96
+
Printf.printf "\n=== Universal Selector (*) ===\n";
97
+
let all = Html5rw.query result "*" in
98
+
Printf.printf "Total elements: %d\n" (List.length all);
99
+
100
+
(* Combining queries: find products then filter *)
101
+
Printf.printf "\n=== Combined: Products with price > $15 ===\n";
102
+
List.iter (fun product ->
103
+
(* Find price span within this product *)
104
+
let price_spans = List.filter (fun node ->
105
+
Html5rw.matches node ".price"
106
+
) (Html5rw.descendants product) in
107
+
List.iter (fun price_span ->
108
+
let price_text = Html5rw.get_text_content price_span in
109
+
(* Parse price - remove $ and convert *)
110
+
let price_str = String.sub price_text 1 (String.length price_text - 1) in
111
+
let price = float_of_string price_str in
112
+
if price > 15.0 then begin
113
+
let name_spans = List.filter (fun node ->
114
+
Html5rw.matches node ".name"
115
+
) (Html5rw.descendants product) in
116
+
match name_spans with
117
+
| name :: _ ->
118
+
Printf.printf " %s: %s\n" (Html5rw.get_text_content name) price_text
119
+
| [] -> ()
120
+
end
121
+
) price_spans
122
+
) products
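The price handling above assumes every .price span starts with "$" and parses cleanly; a more defensive variant of that step is sketched below (parse_price is a local helper for illustration, not part of the library):

(* Strip an optional leading '$', then parse; None signals an unparseable price. *)
let parse_price (s : string) : float option =
  let s = String.trim s in
  let s =
    if String.length s > 0 && s.[0] = '$'
    then String.sub s 1 (String.length s - 1)
    else s
  in
  float_of_string_opt s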
+57
examples/dom_manipulation.ml
···
1
+
open Bytesrw
2
+
3
+
(* DOM manipulation example *)
4
+
5
+
let html = {|
6
+
<!DOCTYPE html>
7
+
<html>
8
+
<head><title>DOM Example</title></head>
9
+
<body>
10
+
<div id="content">
11
+
<p>Original content</p>
12
+
</div>
13
+
</body>
14
+
</html>
15
+
|}
16
+
17
+
let () =
18
+
let result = Html5rw.parse (Bytes.Reader.of_string html) in
19
+
20
+
(* Find the content div *)
21
+
match Html5rw.query result "#content" with
22
+
| content_div :: _ ->
23
+
Printf.printf "Original:\n%s\n\n" (Html5rw.Dom.to_html content_div);
24
+
25
+
(* Create and append a new element *)
26
+
let new_para = Html5rw.create_element "p" () in
27
+
let text_node = Html5rw.create_text "This paragraph was added programmatically!" in
28
+
Html5rw.append_child new_para text_node;
29
+
Html5rw.set_attr new_para "class" "dynamic";
30
+
Html5rw.append_child content_div new_para;
31
+
32
+
Printf.printf "After adding element:\n%s\n\n" (Html5rw.Dom.to_html content_div);
33
+
34
+
(* Create an element with attributes *)
35
+
let link = Html5rw.create_element "a"
36
+
~attrs:[("href", "https://example.com"); ("target", "_blank")] () in
37
+
Html5rw.append_child link (Html5rw.create_text "Click here");
38
+
Html5rw.append_child content_div link;
39
+
40
+
Printf.printf "After adding link:\n%s\n\n" (Html5rw.Dom.to_html content_div);
41
+
42
+
(* Check attributes *)
43
+
Printf.printf "Link has href: %b\n" (Html5rw.has_attr link "href");
44
+
Printf.printf "Link href value: %s\n"
45
+
(Option.value ~default:"(none)" (Html5rw.get_attr link "href"));
46
+
47
+
(* Clone a node *)
48
+
let cloned = Html5rw.clone ~deep:true content_div in
49
+
Printf.printf "\nCloned node children: %d\n"
50
+
(List.length cloned.Html5rw.Dom.children);
51
+
52
+
(* Get descendants *)
53
+
let all_descendants = Html5rw.descendants content_div in
54
+
Printf.printf "Total descendants: %d\n" (List.length all_descendants)
55
+
56
+
| [] ->
57
+
Printf.printf "Content div not found\n"
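Building a small tree from scratch uses the same constructors; a short sketch, using only the create_element, create_text, append_child, and Dom.to_html calls that appear above:

let () =
  (* Assemble <ul class="menu"><li>One</li><li>Two</li></ul> programmatically. *)
  let ul = Html5rw.create_element "ul" ~attrs:[("class", "menu")] () in
  List.iter (fun label ->
    let li = Html5rw.create_element "li" () in
    Html5rw.append_child li (Html5rw.create_text label);
    Html5rw.append_child ul li
  ) ["One"; "Two"];
  print_endline (Html5rw.Dom.to_html ul)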
+31
examples/dune
···
1
+
(executable
2
+
(name basic_parsing)
3
+
(libraries bytesrw html5rw))
4
+
5
+
(executable
6
+
(name css_selectors)
7
+
(libraries bytesrw html5rw))
8
+
9
+
(executable
10
+
(name dom_manipulation)
11
+
(libraries bytesrw html5rw))
12
+
13
+
(executable
14
+
(name text_extraction)
15
+
(libraries bytesrw html5rw))
16
+
17
+
(executable
18
+
(name error_handling)
19
+
(libraries bytesrw html5rw))
20
+
21
+
(executable
22
+
(name fragment_parsing)
23
+
(libraries bytesrw html5rw))
24
+
25
+
(executable
26
+
(name encoding_detection)
27
+
(libraries bytesrw html5rw))
28
+
29
+
(executable
30
+
(name web_scraper)
31
+
(libraries bytesrw html5rw))
+43
examples/encoding_detection.ml
···
1
+
open Bytesrw
2
+
3
+
(* Encoding detection example *)
4
+
5
+
let () =
6
+
Printf.printf "=== Encoding Detection ===\n\n";
7
+
8
+
(* Parse UTF-8 bytes with BOM *)
9
+
let utf8_bom = Bytes.of_string "\xEF\xBB\xBF<html><body>UTF-8 with BOM</body></html>" in
10
+
let result = Html5rw.parse_bytes utf8_bom in
11
+
(match Html5rw.encoding result with
12
+
| Some enc -> Printf.printf "Detected encoding: %s\n" (Html5rw.Encoding.encoding_to_string enc)
13
+
| None -> Printf.printf "No encoding detected\n");
14
+
Printf.printf "Text: %s\n\n" (Html5rw.to_text result);
15
+
16
+
(* Parse with meta charset *)
17
+
let meta_charset = Bytes.of_string {|
18
+
<html>
19
+
<head><meta charset="utf-8"></head>
20
+
<body>Encoding from meta tag</body>
21
+
</html>
22
+
|} in
23
+
let result2 = Html5rw.parse_bytes meta_charset in
24
+
(match Html5rw.encoding result2 with
25
+
| Some enc -> Printf.printf "Detected encoding: %s\n" (Html5rw.Encoding.encoding_to_string enc)
26
+
| None -> Printf.printf "No encoding detected\n");
27
+
Printf.printf "Text: %s\n\n" (Html5rw.to_text result2);
28
+
29
+
(* Using low-level encoding functions *)
30
+
Printf.printf "=== Low-level Encoding API ===\n\n";
31
+
32
+
let bytes = Bytes.of_string "\xEF\xBB\xBFHello" in
33
+
(match Html5rw.Encoding.sniff_bom bytes with
34
+
| Some (enc, offset) ->
35
+
Printf.printf "BOM sniffing result: %s (skip %d bytes)\n"
36
+
(Html5rw.Encoding.encoding_to_string enc) offset
37
+
| None ->
38
+
Printf.printf "No BOM detected\n");
39
+
40
+
let html_bytes = Bytes.of_string {|<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">|} in
41
+
(match Html5rw.Encoding.prescan_for_meta_charset html_bytes with
42
+
| Some enc -> Printf.printf "Prescan found: %s\n" (Html5rw.Encoding.encoding_to_string enc)
43
+
| None -> Printf.printf "No charset in prescan\n")
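The two low-level calls compose into a simple detection chain; a sketch (the order BOM-then-prescan follows the example above, and returning None when neither finds anything is an assumption about how a caller would handle the fallback):

(* BOM wins; otherwise fall back to the <meta> prescan; None means "no information". *)
let choose_encoding bytes =
  match Html5rw.Encoding.sniff_bom bytes with
  | Some (enc, _offset) -> Some enc
  | None -> Html5rw.Encoding.prescan_for_meta_charset bytes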
+52
examples/error_handling.ml
···
1
+
open Bytesrw
2
+
3
+
(* Error handling and malformed HTML example *)
4
+
5
+
let malformed_html = {|
6
+
<html>
7
+
<head>
8
+
<title>Unclosed title
9
+
<meta charset="utf-8">
10
+
</head>
11
+
<body>
12
+
<div>
13
+
<p>Unclosed paragraph
14
+
<p>Another paragraph (implicitly closes the previous one)
15
+
<span><div>Misnested tags</span></div>
16
+
</div>
17
+
<table>
18
+
<tr><td>Cell 1<td>Cell 2</td>
19
+
</table>
20
+
<!-- Unclosed comment
21
+
</body>
22
+
</html>
23
+
|}
24
+
25
+
let () =
26
+
Printf.printf "=== Parsing Malformed HTML ===\n\n";
27
+
28
+
(* Parse with error collection enabled *)
29
+
let result = Html5rw.parse ~collect_errors:true (Bytes.Reader.of_string malformed_html) in
30
+
31
+
(* Get parse errors *)
32
+
let errs = Html5rw.errors result in
33
+
Printf.printf "Parse errors: %d\n\n" (List.length errs);
34
+
List.iter (fun err ->
35
+
Printf.printf " Line %d, Col %d: %s\n"
36
+
(Html5rw.error_line err)
37
+
(Html5rw.error_column err)
38
+
(Html5rw.error_code err)
39
+
) errs;
40
+
41
+
(* The parser still produces a valid DOM tree *)
42
+
Printf.printf "\n=== Recovered DOM Tree ===\n";
43
+
let html = Html5rw.to_string ~pretty:true ~indent_size:2 result in
44
+
Printf.printf "%s\n" html;
45
+
46
+
(* Query the recovered tree *)
47
+
Printf.printf "\n=== Query Results ===\n";
48
+
let paragraphs = Html5rw.query result "p" in
49
+
Printf.printf "Found %d paragraphs\n" (List.length paragraphs);
50
+
51
+
let cells = Html5rw.query result "td" in
52
+
Printf.printf "Found %d table cells\n" (List.length cells)
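The same error list can drive a lint-style exit status; a sketch reusing only the calls above (the zero-tolerance policy and the inline test string are assumptions):

open Bytesrw

(* Exit non-zero if the parser reported any errors. *)
let lint html =
  let result = Html5rw.parse ~collect_errors:true (Bytes.Reader.of_string html) in
  match Html5rw.errors result with
  | [] -> print_endline "clean"
  | errs ->
    List.iter (fun e ->
      Printf.printf "%d:%d %s\n"
        (Html5rw.error_line e) (Html5rw.error_column e) (Html5rw.error_code e)
    ) errs;
    exit 1

let () = lint "<p>unclosed"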
+11
examples/fragment_parsing.ml
···
1
+
open Bytesrw
2
+
3
+
(* HTML fragment parsing example *)
4
+
5
+
let () =
6
+
let fragment = "<li>Item 1</li><li>Item 2</li>" in
7
+
let context = Html5rw.make_fragment_context ~tag_name:"ul" () in
8
+
let reader = Bytes.Reader.of_string fragment in
9
+
let result = Html5rw.parse ~fragment_context:context reader in
10
+
11
+
Printf.printf "Fragment parsing result:\n%s\n" (Html5rw.to_string result)
+69
examples/text_extraction.ml
···
1
+
open Bytesrw
2
+
3
+
(* Text extraction example *)
4
+
5
+
let html = {|
6
+
<!DOCTYPE html>
7
+
<html>
8
+
<head>
9
+
<title>Article</title>
10
+
<style>body { font-family: sans-serif; }</style>
11
+
<script>console.log("Hello");</script>
12
+
</head>
13
+
<body>
14
+
<article>
15
+
<h1>The Great HTML5 Parser</h1>
16
+
<p class="intro">
17
+
This is the <em>introduction</em> to an article about
18
+
<strong>HTML parsing</strong> in OCaml.
19
+
</p>
20
+
<p class="content">
21
+
The parser follows the WHATWG specification and handles
22
+
all kinds of malformed HTML gracefully.
23
+
</p>
24
+
<ul>
25
+
<li>Feature 1: Fast parsing</li>
26
+
<li>Feature 2: CSS selectors</li>
27
+
<li>Feature 3: Encoding detection</li>
28
+
</ul>
29
+
</article>
30
+
<footer>
31
+
<p>Copyright 2024</p>
32
+
</footer>
33
+
</body>
34
+
</html>
35
+
|}
36
+
37
+
let () =
38
+
let result = Html5rw.parse (Bytes.Reader.of_string html) in
39
+
40
+
(* Extract all text *)
41
+
Printf.printf "=== All Text (default) ===\n";
42
+
let text = Html5rw.to_text result in
43
+
Printf.printf "%s\n\n" text;
44
+
45
+
(* Extract text with custom separator *)
46
+
Printf.printf "=== Text with Newline Separator ===\n";
47
+
let text = Html5rw.to_text ~separator:"\n" result in
48
+
Printf.printf "%s\n\n" text;
49
+
50
+
(* Extract text from specific element *)
51
+
Printf.printf "=== Article Text Only ===\n";
52
+
let articles = Html5rw.query result "article" in
53
+
List.iter (fun article ->
54
+
let text = Html5rw.get_text_content article in
55
+
Printf.printf "%s\n" text
56
+
) articles;
57
+
58
+
(* Extract structured data *)
59
+
Printf.printf "\n=== Structured Extraction ===\n";
60
+
let headings = Html5rw.query result "h1" in
61
+
List.iter (fun h ->
62
+
Printf.printf "Title: %s\n" (Html5rw.get_text_content h)
63
+
) headings;
64
+
65
+
let items = Html5rw.query result "li" in
66
+
Printf.printf "Features:\n";
67
+
List.iter (fun li ->
68
+
Printf.printf " - %s\n" (Html5rw.get_text_content li)
69
+
) items
+170
examples/web_scraper.ml
···
1
+
open Bytesrw
2
+
3
+
(* Practical web scraping example *)
4
+
5
+
let sample_page = {|
6
+
<!DOCTYPE html>
7
+
<html lang="en">
8
+
<head>
9
+
<meta charset="UTF-8">
10
+
<title>Tech News - Latest Stories</title>
11
+
</head>
12
+
<body>
13
+
<header>
14
+
<nav>
15
+
<a href="/">Home</a>
16
+
<a href="/news">News</a>
17
+
<a href="/about">About</a>
18
+
</nav>
19
+
</header>
20
+
21
+
<main>
22
+
<article class="story featured">
23
+
<h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
24
+
<p class="summary">Scientists announce major advancement in machine learning...</p>
25
+
<span class="author">By Jane Smith</span>
26
+
<time datetime="2024-01-15">January 15, 2024</time>
27
+
</article>
28
+
29
+
<article class="story">
30
+
<h2><a href="/story/2">New Programming Language Released</a></h2>
31
+
<p class="summary">The language promises 10x developer productivity...</p>
32
+
<span class="author">By John Doe</span>
33
+
<time datetime="2024-01-14">January 14, 2024</time>
34
+
</article>
35
+
36
+
<article class="story">
37
+
<h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
38
+
<p class="summary">Community celebrates 1 million downloads...</p>
39
+
<span class="author">By Alice Chen</span>
40
+
<time datetime="2024-01-13">January 13, 2024</time>
41
+
</article>
42
+
</main>
43
+
44
+
<aside>
45
+
<h3>Popular Tags</h3>
46
+
<ul class="tags">
47
+
<li><a href="/tag/ai">AI</a></li>
48
+
<li><a href="/tag/programming">Programming</a></li>
49
+
<li><a href="/tag/opensource">Open Source</a></li>
50
+
</ul>
51
+
</aside>
52
+
</body>
53
+
</html>
54
+
|}
55
+
56
+
type story = {
57
+
title: string;
58
+
url: string;
59
+
summary: string;
60
+
author: string;
61
+
date: string;
62
+
featured: bool;
63
+
}
64
+
65
+
(* Helper to find first child element with given tag name *)
66
+
let find_child_by_tag parent tag =
67
+
List.find_opt (fun n ->
68
+
Html5rw.is_element n && String.lowercase_ascii n.Html5rw.Dom.name = tag
69
+
) parent.Html5rw.Dom.children
70
+
71
+
(* Helper to find first descendant element with given tag name *)
72
+
let rec find_descendant_by_tag node tag =
73
+
let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
74
+
match List.find_opt (fun n -> String.lowercase_ascii n.Html5rw.Dom.name = tag) children with
75
+
| Some found -> Some found
76
+
| None ->
77
+
List.find_map (fun child -> find_descendant_by_tag child tag) children
78
+
79
+
(* Helper to find first descendant with given class *)
80
+
let rec find_by_class node cls =
81
+
let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
82
+
let has_class n =
83
+
match Html5rw.get_attr n "class" with
84
+
| Some classes -> List.mem cls (String.split_on_char ' ' classes)
85
+
| None -> false
86
+
in
87
+
match List.find_opt has_class children with
88
+
| Some found -> Some found
89
+
| None ->
90
+
List.find_map (fun child -> find_by_class child cls) children
91
+
92
+
let extract_story article =
93
+
(* Find h2 > a for title and URL *)
94
+
let title, url =
95
+
match find_descendant_by_tag article "h2" with
96
+
| Some h2 ->
97
+
(match find_child_by_tag h2 "a" with
98
+
| Some a ->
99
+
(Html5rw.get_text_content a,
100
+
Option.value ~default:"#" (Html5rw.get_attr a "href"))
101
+
| None -> (Html5rw.get_text_content h2, "#"))
102
+
| None -> ("(no title)", "#")
103
+
in
104
+
let summary =
105
+
match find_by_class article "summary" with
106
+
| Some p -> Html5rw.get_text_content p
107
+
| None -> ""
108
+
in
109
+
let author =
110
+
match find_by_class article "author" with
111
+
| Some s -> Html5rw.get_text_content s
112
+
| None -> "Unknown"
113
+
in
114
+
let date =
115
+
match find_descendant_by_tag article "time" with
116
+
| Some t -> Option.value ~default:"" (Html5rw.get_attr t "datetime")
117
+
| None -> ""
118
+
in
119
+
let featured = Html5rw.matches article ".featured" in
120
+
{ title; url; summary; author; date; featured }
121
+
122
+
let () =
123
+
Printf.printf "=== Web Scraping Example ===\n\n";
124
+
125
+
let result = Html5rw.parse (Bytes.Reader.of_string sample_page) in
126
+
127
+
(* Extract page title *)
128
+
let titles = Html5rw.query result "title" in
129
+
(match titles with
130
+
| t :: _ -> Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content t)
131
+
| [] -> ());
132
+
133
+
(* Extract navigation links using descendant query *)
134
+
Printf.printf "Navigation:\n";
135
+
let nav_links = Html5rw.query result "a" in
136
+
let nav = List.filter (fun a ->
137
+
(* Check if this link is in nav by looking at ancestors *)
138
+
List.exists (fun n -> n.Html5rw.Dom.name = "nav") (Html5rw.ancestors a)
139
+
) nav_links in
140
+
List.iter (fun a ->
141
+
let text = Html5rw.get_text_content a in
142
+
let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
143
+
Printf.printf " %s -> %s\n" text href
144
+
) nav;
145
+
146
+
(* Extract stories *)
147
+
Printf.printf "\nStories:\n";
148
+
let articles = Html5rw.query result "article" in
149
+
List.iter (fun article ->
150
+
let story = extract_story article in
151
+
Printf.printf "\n %s%s\n"
152
+
(if story.featured then "[FEATURED] " else "")
153
+
story.title;
154
+
Printf.printf " URL: %s\n" story.url;
155
+
Printf.printf " Summary: %s\n" story.summary;
156
+
Printf.printf " %s | %s\n" story.author story.date
157
+
) articles;
158
+
159
+
(* Extract tags *)
160
+
Printf.printf "\nPopular Tags:\n";
161
+
let all_links = Html5rw.query result "a" in
162
+
let tag_links = List.filter (fun a ->
163
+
let href = Option.value ~default:"" (Html5rw.get_attr a "href") in
164
+
String.length href > 5 && String.sub href 0 5 = "/tag/"
165
+
) all_links in
166
+
List.iter (fun a ->
167
+
let tag = Html5rw.get_text_content a in
168
+
let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
169
+
Printf.printf " #%s (%s)\n" tag href
170
+
) tag_links
+152
gen/gen_entities.ml
···
1
+
(* Entity table generator for html5rw.
2
+
Reads WHATWG entities.json and generates OCaml code. *)
3
+
4
+
let () =
5
+
let json_file = Sys.argv.(1) in
6
+
let out_file = Sys.argv.(2) in
7
+
8
+
let ic = open_in json_file in
9
+
let n = in_channel_length ic in
10
+
let s = really_input_string ic n in
11
+
close_in ic;
12
+
13
+
let json = Yojson.Basic.from_string s in
14
+
15
+
let oc = open_out out_file in
16
+
17
+
(* Header *)
18
+
output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n";
19
+
20
+
(* We need two tables:
21
+
1. Full entity table (name without & -> codepoints)
22
+
2. Legacy entities set (entities that can be used without semicolon) *)
23
+
24
+
let entities = ref [] in
25
+
let legacy = ref [] in
26
+
27
+
(match json with
28
+
| `Assoc entries ->
29
+
List.iter (fun (name, value) ->
30
+
(* name is like "&amp;" or "&amp" *)
31
+
let name_without_amp =
32
+
if String.length name > 0 && name.[0] = '&' then
33
+
String.sub name 1 (String.length name - 1)
34
+
else name
35
+
in
36
+
let has_semicolon =
37
+
String.length name_without_amp > 0 &&
38
+
name_without_amp.[String.length name_without_amp - 1] = ';'
39
+
in
40
+
let key =
41
+
if has_semicolon then
42
+
String.sub name_without_amp 0 (String.length name_without_amp - 1)
43
+
else
44
+
name_without_amp
45
+
in
46
+
(match value with
47
+
| `Assoc fields ->
48
+
let codepoints =
49
+
match List.assoc_opt "codepoints" fields with
50
+
| Some (`List cps) ->
51
+
List.map (function `Int i -> i | _ -> 0) cps
52
+
| _ -> []
53
+
in
54
+
if codepoints <> [] then begin
55
+
entities := (key, codepoints, has_semicolon) :: !entities;
56
+
(* Legacy entities are those that appear without semicolon in the JSON *)
57
+
if not has_semicolon then
58
+
legacy := key :: !legacy
59
+
end
60
+
| _ -> ())
61
+
) entries
62
+
| _ -> failwith "Expected JSON object");
63
+
64
+
(* Remove duplicates - prefer semicolon version *)
65
+
let seen = Hashtbl.create 2500 in
66
+
let unique_entities =
67
+
List.filter (fun (key, _, has_semi) ->
68
+
if Hashtbl.mem seen key then begin
69
+
(* If we already have this key without semicolon, and this one has semicolon, replace *)
70
+
if has_semi then begin
71
+
Hashtbl.replace seen key true;
72
+
true
73
+
end else false
74
+
end else begin
75
+
Hashtbl.add seen key has_semi;
76
+
true
77
+
end
78
+
) (List.rev !entities)
79
+
in
80
+
81
+
(* Sort for binary search *)
82
+
let sorted = List.sort (fun (a, _, _) (b, _, _) -> String.compare a b) unique_entities in
83
+
84
+
(* Generate codepoints to string function *)
85
+
output_string oc "let codepoints_to_string cps =\n";
86
+
output_string oc " let buf = Buffer.create 8 in\n";
87
+
output_string oc " List.iter (fun cp ->\n";
88
+
output_string oc " if cp <= 0x7F then\n";
89
+
output_string oc " Buffer.add_char buf (Char.chr cp)\n";
90
+
output_string oc " else if cp <= 0x7FF then begin\n";
91
+
output_string oc " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n";
92
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
93
+
output_string oc " end else if cp <= 0xFFFF then begin\n";
94
+
output_string oc " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n";
95
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
96
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
97
+
output_string oc " end else begin\n";
98
+
output_string oc " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n";
99
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n";
100
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
101
+
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
102
+
output_string oc " end\n";
103
+
output_string oc " ) cps;\n";
104
+
output_string oc " Buffer.contents buf\n\n";
105
+
106
+
(* Generate the entity array for binary search *)
107
+
output_string oc "let entities = [|\n";
108
+
List.iter (fun (name, codepoints, _) ->
109
+
let cps_str = String.concat "; " (List.map string_of_int codepoints) in
110
+
Printf.fprintf oc " (%S, [%s]);\n" name cps_str
111
+
) sorted;
112
+
output_string oc "|]\n\n";
113
+
114
+
(* Binary search lookup *)
115
+
output_string oc "let lookup name =\n";
116
+
output_string oc " let rec search lo hi =\n";
117
+
output_string oc " if lo > hi then None\n";
118
+
output_string oc " else begin\n";
119
+
output_string oc " let mid = (lo + hi) / 2 in\n";
120
+
output_string oc " let (key, cps) = entities.(mid) in\n";
121
+
output_string oc " let cmp = String.compare name key in\n";
122
+
output_string oc " if cmp = 0 then Some (codepoints_to_string cps)\n";
123
+
output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
124
+
output_string oc " else search (mid + 1) hi\n";
125
+
output_string oc " end\n";
126
+
output_string oc " in\n";
127
+
output_string oc " search 0 (Array.length entities - 1)\n\n";
128
+
129
+
(* Generate legacy entities set *)
130
+
let legacy_sorted = List.sort_uniq String.compare !legacy in
131
+
output_string oc "let legacy_entities = [|\n";
132
+
List.iter (fun name ->
133
+
Printf.fprintf oc " %S;\n" name
134
+
) legacy_sorted;
135
+
output_string oc "|]\n\n";
136
+
137
+
output_string oc "let is_legacy name =\n";
138
+
output_string oc " let rec search lo hi =\n";
139
+
output_string oc " if lo > hi then false\n";
140
+
output_string oc " else begin\n";
141
+
output_string oc " let mid = (lo + hi) / 2 in\n";
142
+
output_string oc " let cmp = String.compare name legacy_entities.(mid) in\n";
143
+
output_string oc " if cmp = 0 then true\n";
144
+
output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
145
+
output_string oc " else search (mid + 1) hi\n";
146
+
output_string oc " end\n";
147
+
output_string oc " in\n";
148
+
output_string oc " search 0 (Array.length legacy_entities - 1)\n";
149
+
150
+
close_out oc;
151
+
Printf.printf "Generated %s with %d entities (%d legacy)\n"
152
+
out_file (List.length sorted) (List.length legacy_sorted)
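For orientation, the generated module exposes roughly the following interface, inferred from the output_string calls above (the module itself takes its name from whatever out_file is called, so treat the exact module path as an assumption):

val codepoints_to_string : int list -> string   (* UTF-8-encode a list of codepoints *)
val entities : (string * int list) array        (* sorted names (no '&' / ';') with their codepoints *)
val lookup : string -> string option            (* binary search; returns the UTF-8 expansion *)
val legacy_entities : string array              (* names valid without a trailing ';' *)
val is_legacy : string -> bool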
+35
html5rw.opam
···
1
+
# This file is generated by dune, edit dune-project instead
2
+
opam-version: "2.0"
3
+
version: "0.1.0"
4
+
synopsis: "Pure OCaml HTML5 parser implementing the WHATWG specification"
5
+
description:
6
+
"A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries."
7
+
maintainer: ["author@example.com"]
8
+
authors: ["Author"]
9
+
license: "MIT"
10
+
homepage: "https://github.com/username/html5rw"
11
+
bug-reports: "https://github.com/username/html5rw/issues"
12
+
depends: [
13
+
"dune" {>= "3.0"}
14
+
"ocaml" {>= "4.14.0"}
15
+
"bytesrw" {>= "0.3.0"}
16
+
"uutf" {>= "1.0.0"}
17
+
"re" {>= "1.10.0"}
18
+
"yojson" {build & >= "2.0.0"}
19
+
"odoc" {with-doc}
20
+
]
21
+
build: [
22
+
["dune" "subst"] {dev}
23
+
[
24
+
"dune"
25
+
"build"
26
+
"-p"
27
+
name
28
+
"-j"
29
+
jobs
30
+
"@install"
31
+
"@runtest" {with-test}
32
+
"@doc" {with-doc}
33
+
]
34
+
]
35
+
dev-repo: "git+https://github.com/username/html5rw.git"
+4
lib/dom/dune
+8
lib/dom/html5rw_dom.ml
+159
lib/dom/node.ml
···
1
+
(* HTML5 DOM node types *)
2
+
3
+
type doctype_data = {
4
+
name : string option;
5
+
public_id : string option;
6
+
system_id : string option;
7
+
}
8
+
9
+
type quirks_mode = No_quirks | Quirks | Limited_quirks
10
+
11
+
type node = {
12
+
mutable name : string;
13
+
mutable namespace : string option; (* None = html, Some "svg", Some "mathml" *)
14
+
mutable attrs : (string * string) list;
15
+
mutable children : node list;
16
+
mutable parent : node option;
17
+
mutable data : string; (* For text, comment nodes *)
18
+
mutable template_content : node option; (* For <template> elements *)
19
+
mutable doctype : doctype_data option; (* For doctype nodes *)
20
+
}
21
+
22
+
(* Node name constants *)
23
+
let document_name = "#document"
24
+
let document_fragment_name = "#document-fragment"
25
+
let text_name = "#text"
26
+
let comment_name = "#comment"
27
+
let doctype_name = "!doctype"
28
+
29
+
(* Base node constructor - all nodes share this structure *)
30
+
let make_node ~name ?(namespace=None) ?(attrs=[]) ?(data="") ?template_content ?doctype () = {
31
+
name;
32
+
namespace;
33
+
attrs;
34
+
children = [];
35
+
parent = None;
36
+
data;
37
+
template_content;
38
+
doctype;
39
+
}
40
+
41
+
(* Constructors *)
42
+
let create_element name ?(namespace=None) ?(attrs=[]) () =
43
+
make_node ~name ~namespace ~attrs ()
44
+
45
+
let create_text data =
46
+
make_node ~name:text_name ~data ()
47
+
48
+
let create_comment data =
49
+
make_node ~name:comment_name ~data ()
50
+
51
+
let create_document () =
52
+
make_node ~name:document_name ()
53
+
54
+
let create_document_fragment () =
55
+
make_node ~name:document_fragment_name ()
56
+
57
+
let create_doctype ?name ?public_id ?system_id () =
58
+
make_node ~name:doctype_name ~doctype:{ name; public_id; system_id } ()
59
+
60
+
let create_template ?(namespace=None) ?(attrs=[]) () =
61
+
let node = create_element "template" ~namespace ~attrs () in
62
+
node.template_content <- Some (create_document_fragment ());
63
+
node
64
+
65
+
(* Predicates *)
66
+
let is_element node =
67
+
not (List.mem node.name [text_name; comment_name; document_name; document_fragment_name; doctype_name])
68
+
69
+
let is_text node = node.name = text_name
70
+
let is_comment node = node.name = comment_name
71
+
let is_document node = node.name = document_name
72
+
let is_document_fragment node = node.name = document_fragment_name
73
+
let is_doctype node = node.name = doctype_name
74
+
let has_children node = node.children <> []
75
+
76
+
(* DOM manipulation *)
77
+
let append_child parent child =
78
+
child.parent <- Some parent;
79
+
parent.children <- parent.children @ [child]
80
+
81
+
let insert_before parent new_child ref_child =
82
+
new_child.parent <- Some parent;
83
+
let rec insert acc = function
84
+
| [] -> List.rev acc @ [new_child]
85
+
| x :: xs when x == ref_child -> List.rev acc @ [new_child; x] @ xs
86
+
| x :: xs -> insert (x :: acc) xs
87
+
in
88
+
parent.children <- insert [] parent.children
89
+
90
+
let remove_child parent child =
91
+
child.parent <- None;
92
+
parent.children <- List.filter (fun c -> c != child) parent.children
93
+
94
+
(* Return the last child if it is a text node *)
95
+
let last_child_text parent =
96
+
match List.rev parent.children with
97
+
| last :: _ when is_text last -> Some last
98
+
| _ -> None
99
+
100
+
let insert_text_at parent text before_node =
101
+
match before_node with
102
+
| None ->
103
+
(* Append - merge with last child if it's text *)
104
+
(match last_child_text parent with
105
+
| Some txt -> txt.data <- txt.data ^ text
106
+
| None -> append_child parent (create_text text))
107
+
| Some ref ->
108
+
(* Find last text node before ref_child *)
109
+
let rec find_prev_text = function
110
+
| [] | [_] -> None
111
+
| prev :: curr :: _ when curr == ref && is_text prev -> Some prev
112
+
| _ :: rest -> find_prev_text rest
113
+
in
114
+
match find_prev_text parent.children with
115
+
| Some txt -> txt.data <- txt.data ^ text
116
+
| None -> insert_before parent (create_text text) ref
117
+
118
+
(* Attribute helpers *)
119
+
let get_attr node name = List.assoc_opt name node.attrs
120
+
121
+
let set_attr node name value =
122
+
node.attrs <- List.filter (fun (n, _) -> n <> name) node.attrs @ [(name, value)]
123
+
124
+
let has_attr node name = List.mem_assoc name node.attrs
125
+
126
+
(* Tree traversal *)
127
+
let rec descendants node =
128
+
List.concat_map (fun n -> n :: descendants n) node.children
129
+
130
+
let ancestors node =
131
+
let rec collect acc n =
132
+
match n.parent with
133
+
| None -> List.rev acc
134
+
| Some p -> collect (p :: acc) p
135
+
in
136
+
collect [] node
137
+
138
+
let rec get_text_content node =
139
+
if is_text node then node.data
140
+
else String.concat "" (List.map get_text_content node.children)
141
+
142
+
(* Clone *)
143
+
let rec clone ?(deep=false) node =
144
+
let new_node = make_node
145
+
~name:node.name
146
+
~namespace:node.namespace
147
+
~attrs:node.attrs
148
+
~data:node.data
149
+
?doctype:node.doctype
150
+
()
151
+
in
152
+
if deep then begin
153
+
new_node.children <- List.map (clone ~deep:true) node.children;
154
+
List.iter (fun c -> c.parent <- Some new_node) new_node.children;
155
+
Option.iter (fun tc ->
156
+
new_node.template_content <- Some (clone ~deep:true tc)
157
+
) node.template_content
158
+
end;
159
+
new_node
+333
lib/dom/node.mli
+333
lib/dom/node.mli
···
1
+
(** HTML5 DOM Node Types and Operations
2
+
3
+
This module provides the DOM node representation used by the HTML5 parser.
4
+
Nodes form a tree structure representing HTML documents. The type follows
5
+
the WHATWG HTML5 specification for document structure.
6
+
7
+
{2 Node Types}
8
+
9
+
The HTML5 DOM includes several node types, all represented by the same
10
+
record type with different field usage:
11
+
12
+
- {b Element nodes}: Regular HTML elements like [<div>], [<p>], [<span>]
13
+
- {b Text nodes}: Text content within elements
14
+
- {b Comment nodes}: HTML comments [<!-- comment -->]
15
+
- {b Document nodes}: The root node representing the entire document
16
+
- {b Document fragment nodes}: A lightweight container (used for templates)
17
+
- {b Doctype nodes}: The [<!DOCTYPE html>] declaration
18
+
19
+
{2 Namespaces}
20
+
21
+
Elements can belong to different namespaces:
22
+
- [None] or [Some "html"]: HTML namespace (default)
23
+
- [Some "svg"]: SVG namespace for embedded SVG content
24
+
- [Some "mathml"]: MathML namespace for mathematical notation
25
+
26
+
The parser automatically switches namespaces when encountering [<svg>]
27
+
or [<math>] elements, as specified by the HTML5 algorithm.
28
+
29
+
{2 Tree Structure}
30
+
31
+
Nodes form a bidirectional tree: each node has a list of children and
32
+
an optional parent reference. Modification functions maintain these
33
+
references automatically.
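For illustration, a small usage sketch built from the constructors and
manipulation functions declared below:

{[
let doc = create_document () in
let p = create_element "p" () in
append_child doc p;
(* append_child sets the back-reference: p.parent is Some doc *)
append_child p (create_text "Hello");
assert (List.length doc.children = 1)
]}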
34
+
*)
35
+
36
+
(** {1 Types} *)
37
+
38
+
(** Information associated with a DOCTYPE node.
39
+
40
+
In HTML5, the DOCTYPE is primarily used for quirks mode detection.
41
+
Most modern HTML5 documents use [<!DOCTYPE html>] which results in
42
+
all fields being [None] or the name being [Some "html"].
43
+
44
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode>
45
+
The WHATWG specification for DOCTYPE handling
46
+
*)
47
+
type doctype_data = {
48
+
name : string option; (** The DOCTYPE name, e.g., "html" *)
49
+
public_id : string option; (** Public identifier (legacy, rarely used) *)
50
+
system_id : string option; (** System identifier (legacy, rarely used) *)
51
+
}
52
+
53
+
(** Quirks mode setting for the document.
54
+
55
+
Quirks mode affects CSS layout behavior for backwards compatibility with
56
+
old web content. The HTML5 parser determines quirks mode based on the
57
+
DOCTYPE declaration.
58
+
59
+
- [No_quirks]: Standards mode - full HTML5/CSS3 behavior
60
+
- [Quirks]: Full quirks mode - emulates legacy browser behavior
61
+
- [Limited_quirks]: Almost standards mode - limited quirks for specific cases
62
+
63
+
@see <https://quirks.spec.whatwg.org/> The Quirks Mode specification
64
+
*)
65
+
type quirks_mode = No_quirks | Quirks | Limited_quirks
66
+
67
+
(** A DOM node in the parsed document tree.
68
+
69
+
All node types use the same record structure. The [name] field determines
70
+
the node type:
71
+
- Element: the tag name (e.g., "div", "p")
72
+
- Text: "#text"
73
+
- Comment: "#comment"
74
+
- Document: "#document"
75
+
- Document fragment: "#document-fragment"
76
+
- Doctype: "!doctype"
77
+
78
+
{3 Field Usage by Node Type}
79
+
80
+
{v
81
+
Node Type | name | namespace | attrs | data | template_content | doctype
82
+
------------------|------------------|-----------|-------|------|------------------|--------
83
+
Element | tag name | Yes | Yes | No | If <template> | No
84
+
Text | "#text" | No | No | Yes | No | No
85
+
Comment | "#comment" | No | No | Yes | No | No
86
+
Document | "#document" | No | No | No | No | No
87
+
Document Fragment | "#document-frag" | No | No | No | No | No
88
+
Doctype | "!doctype" | No | No | No | No | Yes
89
+
v}
90
+
*)
91
+
type node = {
92
+
mutable name : string;
93
+
(** Tag name for elements, or special name for other node types *)
94
+
95
+
mutable namespace : string option;
96
+
(** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"] *)
97
+
98
+
mutable attrs : (string * string) list;
99
+
(** Element attributes as (name, value) pairs *)
100
+
101
+
mutable children : node list;
102
+
(** Child nodes in document order *)
103
+
104
+
mutable parent : node option;
105
+
(** Parent node, [None] for root nodes *)
106
+
107
+
mutable data : string;
108
+
(** Text content for text and comment nodes *)
109
+
110
+
mutable template_content : node option;
111
+
(** Document fragment for [<template>] element contents *)
112
+
113
+
mutable doctype : doctype_data option;
114
+
(** DOCTYPE information for doctype nodes *)
115
+
}
116
+
117
+
(** {1 Node Name Constants}
118
+
119
+
These constants identify special node types. Compare with [node.name]
120
+
to determine the node type.
121
+
*)
122
+
123
+
val document_name : string
124
+
(** ["#document"] - name for document nodes *)
125
+
126
+
val document_fragment_name : string
127
+
(** ["#document-fragment"] - name for document fragment nodes *)
128
+
129
+
val text_name : string
130
+
(** ["#text"] - name for text nodes *)
131
+
132
+
val comment_name : string
133
+
(** ["#comment"] - name for comment nodes *)
134
+
135
+
val doctype_name : string
136
+
(** ["!doctype"] - name for doctype nodes *)
137
+
138
+
(** {1 Constructors}
139
+
140
+
Functions to create new DOM nodes. All nodes start with no parent and
141
+
no children.
142
+
*)
143
+
144
+
val create_element : string -> ?namespace:string option ->
145
+
?attrs:(string * string) list -> unit -> node
146
+
(** Create an element node.
147
+
148
+
@param name The tag name (e.g., "div", "p", "span")
149
+
@param namespace Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
150
+
@param attrs Initial attributes as (name, value) pairs
151
+
152
+
{[
153
+
let div = create_element "div" ()
154
+
let svg = create_element "rect" ~namespace:(Some "svg") ()
155
+
let link = create_element "a" ~attrs:[("href", "/")] ()
156
+
]}
157
+
*)
158
+
159
+
val create_text : string -> node
160
+
(** Create a text node with the given content.
161
+
162
+
{[
163
+
let text = create_text "Hello, world!"
164
+
]}
165
+
*)
166
+
167
+
val create_comment : string -> node
168
+
(** Create a comment node with the given content.
169
+
170
+
The content should not include the comment delimiters.
171
+
172
+
{[
173
+
let comment = create_comment " This is a comment "
174
+
(* Represents: <!-- This is a comment --> *)
175
+
]}
176
+
*)
177
+
178
+
val create_document : unit -> node
179
+
(** Create an empty document node.
180
+
181
+
Document nodes are the root of a complete HTML document tree.
182
+
*)
183
+
184
+
val create_document_fragment : unit -> node
185
+
(** Create an empty document fragment.
186
+
187
+
Document fragments are lightweight containers used for:
188
+
- Template contents
189
+
- Fragment parsing results
190
+
- Efficient batch DOM operations
191
+
*)
192
+
193
+
val create_doctype : ?name:string -> ?public_id:string ->
194
+
?system_id:string -> unit -> node
195
+
(** Create a DOCTYPE node.
196
+
197
+
For HTML5, use [create_doctype ~name:"html" ()] which produces
198
+
[<!DOCTYPE html>].
199
+
200
+
@param name DOCTYPE name (usually "html")
201
+
@param public_id Public identifier (legacy)
202
+
@param system_id System identifier (legacy)
203
+
*)
204
+
205
+
val create_template : ?namespace:string option ->
206
+
?attrs:(string * string) list -> unit -> node
207
+
(** Create a [<template>] element with its content document fragment.
208
+
209
+
Template elements have special semantics: their children are not rendered
210
+
directly but stored in a separate document fragment accessible via
211
+
[template_content].
212
+
213
+
@see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element>
214
+
The HTML5 template element specification
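A small sketch of populating the content fragment:

{[
let tmpl = create_template () in
(match tmpl.template_content with
 | Some fragment -> append_child fragment (create_element "li" ())
 | None -> ())
]}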
215
+
*)
216
+
217
+
(** {1 Node Type Predicates}
218
+
219
+
Functions to test what type of node you have.
220
+
*)
221
+
222
+
val is_element : node -> bool
223
+
(** [is_element node] returns [true] if the node is an element node.
224
+
225
+
Elements are nodes with HTML tags like [<div>], [<p>], etc.
226
+
*)
227
+
228
+
val is_text : node -> bool
229
+
(** [is_text node] returns [true] if the node is a text node. *)
230
+
231
+
val is_comment : node -> bool
232
+
(** [is_comment node] returns [true] if the node is a comment node. *)
233
+
234
+
val is_document : node -> bool
235
+
(** [is_document node] returns [true] if the node is a document node. *)
236
+
237
+
val is_document_fragment : node -> bool
238
+
(** [is_document_fragment node] returns [true] if the node is a document fragment. *)
239
+
240
+
val is_doctype : node -> bool
241
+
(** [is_doctype node] returns [true] if the node is a DOCTYPE node. *)
242
+
243
+
val has_children : node -> bool
244
+
(** [has_children node] returns [true] if the node has any children. *)
245
+
246
+
(** {1 Tree Manipulation}
247
+
248
+
Functions to modify the DOM tree structure. These functions automatically
249
+
maintain parent/child references.
250
+
*)
251
+
252
+
val append_child : node -> node -> unit
253
+
(** [append_child parent child] adds [child] as the last child of [parent].
254
+
255
+
The child's parent reference is updated to point to [parent].
256
+
*)
257
+
258
+
val insert_before : node -> node -> node -> unit
259
+
(** [insert_before parent new_child ref_child] inserts [new_child] before
260
+
[ref_child] in [parent]'s children.
261
+
262
+
If [ref_child] is not a child of [parent], [new_child] is appended at the end.
263
+
*)
264
+
265
+
val remove_child : node -> node -> unit
266
+
(** [remove_child parent child] removes [child] from [parent]'s children.
267
+
268
+
The child's parent reference is set to [None].
269
+
*)
270
+
271
+
val insert_text_at : node -> string -> node option -> unit
272
+
(** [insert_text_at parent text before_node] inserts text content.
273
+
274
+
If [before_node] is [None], appends at the end. If the previous sibling
275
+
is a text node, the text is merged into it. Otherwise, a new text node
276
+
is created.
277
+
278
+
This implements the HTML5 parser's text insertion algorithm which
279
+
coalesces adjacent text nodes.
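A sketch of the coalescing behaviour:

{[
let p = create_element "p" () in
insert_text_at p "Hello, " None;
insert_text_at p "world!" None;
(* p now has a single text child whose data is "Hello, world!" *)
assert (List.length p.children = 1)
]}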
280
+
*)
281
+
282
+
(** {1 Attribute Operations}
283
+
284
+
Functions to read and modify element attributes.
285
+
*)
286
+
287
+
val get_attr : node -> string -> string option
288
+
(** [get_attr node name] returns the value of attribute [name], or [None]. *)
289
+
290
+
val set_attr : node -> string -> string -> unit
291
+
(** [set_attr node name value] sets attribute [name] to [value].
292
+
293
+
If the attribute already exists, it is replaced.
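For example:

{[
let a = create_element "a" ~attrs:[("href", "/old")] () in
set_attr a "href" "/new";
assert (get_attr a "href" = Some "/new")
]}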
294
+
*)
295
+
296
+
val has_attr : node -> string -> bool
297
+
(** [has_attr node name] returns [true] if the node has attribute [name]. *)
298
+
299
+
(** {1 Tree Traversal}
300
+
301
+
Functions to navigate the DOM tree.
302
+
*)
303
+
304
+
val descendants : node -> node list
305
+
(** [descendants node] returns all descendant nodes in document order.
306
+
307
+
This performs a pre-order depth-first traversal: each node is followed
308
+
by its own descendants, then by its next sibling.
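A small sketch of the ordering:

{[
let div = create_element "div" () in
append_child div (create_element "p" ());
append_child div (create_element "span" ());
List.map (fun n -> n.name) (descendants div)
(* = ["p"; "span"] *)
]}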
309
+
*)
310
+
311
+
val ancestors : node -> node list
312
+
(** [ancestors node] returns all ancestor nodes from parent to root.
313
+
314
+
The first element is the immediate parent, the last is the root.
315
+
*)
316
+
317
+
val get_text_content : node -> string
318
+
(** [get_text_content node] returns the concatenated text content.
319
+
320
+
For text nodes, returns the text data. For elements, recursively
321
+
concatenates all descendant text content.
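For example:

{[
let p = create_element "p" () in
append_child p (create_text "a");
append_child p (create_element "b" ());
append_child p (create_text "c");
assert (get_text_content p = "ac")
]}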
322
+
*)
323
+
324
+
(** {1 Cloning} *)
325
+
326
+
val clone : ?deep:bool -> node -> node
327
+
(** [clone ?deep node] creates a copy of the node.
328
+
329
+
@param deep If [true], recursively clone all descendants (default: [false])
330
+
331
+
The cloned node has no parent. The attribute list is shared with the original
332
+
node (lists are immutable in OCaml, so the sharing is safe).
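A sketch ([original] standing in for any existing node):

{[
let copy = clone ~deep:true original in
assert (copy.parent = None)
]}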
333
+
*)
+301
lib/dom/serialize.ml
+301
lib/dom/serialize.ml
···
1
+
(* HTML5 DOM serialization *)
2
+
3
+
open Bytesrw
4
+
open Node
5
+
6
+
(* Void elements that don't have end tags *)
7
+
let void_elements = [
8
+
"area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
9
+
"link"; "meta"; "source"; "track"; "wbr"
10
+
]
11
+
12
+
let is_void name = List.mem name void_elements
13
+
14
+
(* Foreign attribute adjustments for test output *)
15
+
let foreign_attr_adjustments = [
16
+
"xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
17
+
"xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space";
18
+
"xmlns:xlink"
19
+
]
20
+
21
+
(* Escape text content *)
22
+
let escape_text text =
23
+
let buf = Buffer.create (String.length text) in
24
+
String.iter (fun c ->
25
+
match c with
26
+
| '&' -> Buffer.add_string buf "&amp;"
27
+
| '<' -> Buffer.add_string buf "&lt;"
28
+
| '>' -> Buffer.add_string buf "&gt;"
29
+
| c -> Buffer.add_char buf c
30
+
) text;
31
+
Buffer.contents buf
32
+
33
+
(* Choose quote character for attribute value *)
34
+
let choose_attr_quote value =
35
+
if String.contains value '"' && not (String.contains value '\'') then '\''
36
+
else '"'
37
+
38
+
(* Escape attribute value *)
39
+
let escape_attr_value value quote_char =
40
+
let buf = Buffer.create (String.length value) in
41
+
String.iter (fun c ->
42
+
match c with
43
+
| '&' -> Buffer.add_string buf "&amp;"
44
+
| '"' when quote_char = '"' -> Buffer.add_string buf """
45
+
| '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;"
46
+
| c -> Buffer.add_char buf c
47
+
) value;
48
+
Buffer.contents buf
49
+
50
+
(* Check if attribute value can be unquoted *)
51
+
let can_unquote_attr_value value =
52
+
if String.length value = 0 then false
53
+
else
54
+
let invalid = ref false in
55
+
String.iter (fun c ->
56
+
if c = '>' || c = '"' || c = '\'' || c = '=' || c = '`' ||
57
+
c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' then
58
+
invalid := true
59
+
) value;
60
+
not !invalid
61
+
62
+
(* Serialize start tag *)
63
+
let serialize_start_tag name attrs =
64
+
let buf = Buffer.create 64 in
65
+
Buffer.add_char buf '<';
66
+
Buffer.add_string buf name;
67
+
List.iter (fun (key, value) ->
68
+
Buffer.add_char buf ' ';
69
+
Buffer.add_string buf key;
70
+
if value <> "" then begin
71
+
if can_unquote_attr_value value then begin
72
+
Buffer.add_char buf '=';
73
+
Buffer.add_string buf (escape_attr_value value '"')
74
+
end else begin
75
+
let quote = choose_attr_quote value in
76
+
Buffer.add_char buf '=';
77
+
Buffer.add_char buf quote;
78
+
Buffer.add_string buf (escape_attr_value value quote);
79
+
Buffer.add_char buf quote
80
+
end
81
+
end
82
+
) attrs;
83
+
Buffer.add_char buf '>';
84
+
Buffer.contents buf
85
+
86
+
(* Serialize end tag *)
87
+
let serialize_end_tag name =
88
+
"</" ^ name ^ ">"
89
+
90
+
(* Convert node to HTML string *)
91
+
let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
92
+
let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
93
+
let newline = if pretty then "\n" else "" in
94
+
95
+
match node.name with
96
+
| "#document" ->
97
+
let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
98
+
String.concat newline (List.filter (fun s -> s <> "") parts)
99
+
100
+
| "#document-fragment" ->
101
+
let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
102
+
String.concat newline (List.filter (fun s -> s <> "") parts)
103
+
104
+
| "#text" ->
105
+
let text = node.data in
106
+
if pretty then
107
+
let trimmed = String.trim text in
108
+
if trimmed = "" then ""
109
+
else prefix ^ escape_text trimmed
110
+
else escape_text text
111
+
112
+
| "#comment" ->
113
+
prefix ^ "<!--" ^ node.data ^ "-->"
114
+
115
+
| "!doctype" ->
116
+
prefix ^ "<!DOCTYPE html>"
117
+
118
+
| name ->
119
+
let open_tag = serialize_start_tag name node.attrs in
120
+
121
+
if is_void name then
122
+
prefix ^ open_tag
123
+
else if node.children = [] then
124
+
prefix ^ open_tag ^ serialize_end_tag name
125
+
else begin
126
+
(* Check if all children are text *)
127
+
let all_text = List.for_all is_text node.children in
128
+
if all_text && pretty then
129
+
let text = String.concat "" (List.map (fun c -> c.data) node.children) in
130
+
prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
131
+
else begin
132
+
let parts = [prefix ^ open_tag] in
133
+
let child_parts = List.filter_map (fun child ->
134
+
let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
135
+
if html = "" then None else Some html
136
+
) node.children in
137
+
let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
138
+
String.concat newline parts
139
+
end
140
+
end
141
+
142
+
(* Get qualified name for test format *)
143
+
let qualified_name node =
144
+
match node.namespace with
145
+
| Some "svg" -> "svg " ^ node.name
146
+
| Some "mathml" -> "math " ^ node.name
147
+
| Some ns when ns <> "html" -> ns ^ " " ^ node.name
148
+
| _ -> node.name
149
+
150
+
(* Format attributes for test output *)
151
+
let attrs_to_test_format node indent =
152
+
if node.attrs = [] then []
153
+
else begin
154
+
let padding = String.make (indent + 2) ' ' in
155
+
(* Compute display names first, then sort by display name for canonical output *)
156
+
let with_display_names = List.map (fun (name, value) ->
157
+
let display_name =
158
+
match node.namespace with
159
+
| Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments ->
160
+
String.map (fun c -> if c = ':' then ' ' else c) name
161
+
| _ -> name
162
+
in
163
+
(display_name, value)
164
+
) node.attrs in
165
+
let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in
166
+
List.map (fun (display_name, value) ->
167
+
Printf.sprintf "| %s%s=\"%s\"" padding display_name value
168
+
) sorted
169
+
end
170
+
171
+
(* Convert node to html5lib test format *)
172
+
let rec to_test_format ?(indent=0) node =
173
+
match node.name with
174
+
| "#document" | "#document-fragment" ->
175
+
let parts = List.map (to_test_format ~indent:0) node.children in
176
+
String.concat "\n" parts
177
+
178
+
| "#comment" ->
179
+
Printf.sprintf "| %s<!-- %s -->" (String.make indent ' ') node.data
180
+
181
+
| "!doctype" ->
182
+
let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in
183
+
let name_str = match dt.name with Some n -> " " ^ n | None -> " " in
184
+
let ids_str =
185
+
match dt.public_id, dt.system_id with
186
+
| None, None -> ""
187
+
| pub, sys ->
188
+
let pub_str = match pub with Some p -> p | None -> "" in
189
+
let sys_str = match sys with Some s -> s | None -> "" in
190
+
Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str
191
+
in
192
+
Printf.sprintf "| <!DOCTYPE%s%s>" name_str ids_str
193
+
194
+
| "#text" ->
195
+
Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data
196
+
197
+
| "template" when node.namespace = None || node.namespace = Some "html" ->
198
+
let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
199
+
let attr_lines = attrs_to_test_format node indent in
200
+
let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in
201
+
let content_children =
202
+
match node.template_content with
203
+
| Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children
204
+
| None -> []
205
+
in
206
+
String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children)
207
+
208
+
| _ ->
209
+
let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
210
+
let attr_lines = attrs_to_test_format node indent in
211
+
let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in
212
+
String.concat "\n" ([line] @ attr_lines @ child_lines)
213
+
214
+
(* Extract text content *)
215
+
let to_text ?(separator=" ") ?(strip=true) node =
216
+
let rec collect_text n =
217
+
if is_text n then [n.data]
218
+
else List.concat_map collect_text n.children
219
+
in
220
+
let texts = collect_text node in
221
+
let combined = String.concat separator texts in
222
+
if strip then String.trim combined else combined
223
+
224
+
(* Streaming serialization to a Bytes.Writer.t
225
+
Writes HTML directly to the writer without building intermediate strings *)
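(* Usage sketch ([doc] stands for any node; Bytes.Writer.of_buffer is the
   bytesrw constructor used in the Html5rw.to_writer documentation):
     let buf = Buffer.create 1024 in
     let w = Bytes.Writer.of_buffer buf in
     to_writer ~pretty:false w doc;
     Buffer.contents buf *)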
226
+
let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
227
+
let write s = Bytes.Writer.write_string w s in
228
+
let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
229
+
let write_newline () = if pretty then write "\n" in
230
+
231
+
match node.name with
232
+
| "#document" ->
233
+
let rec write_children first = function
234
+
| [] -> ()
235
+
| child :: rest ->
236
+
if not first && pretty then write_newline ();
237
+
to_writer ~pretty ~indent_size ~indent:0 w child;
238
+
write_children false rest
239
+
in
240
+
write_children true node.children
241
+
242
+
| "#document-fragment" ->
243
+
let rec write_children first = function
244
+
| [] -> ()
245
+
| child :: rest ->
246
+
if not first && pretty then write_newline ();
247
+
to_writer ~pretty ~indent_size ~indent w child;
248
+
write_children false rest
249
+
in
250
+
write_children true node.children
251
+
252
+
| "#text" ->
253
+
let text = node.data in
254
+
if pretty then begin
255
+
let trimmed = String.trim text in
256
+
if trimmed <> "" then begin
257
+
write_prefix ();
258
+
write (escape_text trimmed)
259
+
end
260
+
end else
261
+
write (escape_text text)
262
+
263
+
| "#comment" ->
264
+
write_prefix ();
265
+
write "<!--";
266
+
write node.data;
267
+
write "-->"
268
+
269
+
| "!doctype" ->
270
+
write_prefix ();
271
+
write "<!DOCTYPE html>"
272
+
273
+
| name ->
274
+
write_prefix ();
275
+
write (serialize_start_tag name node.attrs);
276
+
277
+
if not (is_void name) then begin
278
+
if node.children = [] then
279
+
write (serialize_end_tag name)
280
+
else begin
281
+
(* Check if all children are text *)
282
+
let all_text = List.for_all is_text node.children in
283
+
if all_text && pretty then begin
284
+
let text = String.concat "" (List.map (fun c -> c.data) node.children) in
285
+
write (escape_text text);
286
+
write (serialize_end_tag name)
287
+
end else begin
288
+
let rec write_children = function
289
+
| [] -> ()
290
+
| child :: rest ->
291
+
write_newline ();
292
+
to_writer ~pretty ~indent_size ~indent:(indent + 1) w child;
293
+
write_children rest
294
+
in
295
+
write_children node.children;
296
+
write_newline ();
297
+
write_prefix ();
298
+
write (serialize_end_tag name)
299
+
end
300
+
end
301
+
end
+19
lib/encoding/bom.ml
+19
lib/encoding/bom.ml
···
1
+
(* BOM (Byte Order Mark) sniffing *)
2
+
3
+
let sniff data =
4
+
let len = Bytes.length data in
5
+
if len >= 3 &&
6
+
Bytes.get data 0 = '\xEF' &&
7
+
Bytes.get data 1 = '\xBB' &&
8
+
Bytes.get data 2 = '\xBF' then
9
+
Some (Encoding.Utf8, 3)
10
+
else if len >= 2 &&
11
+
Bytes.get data 0 = '\xFF' &&
12
+
Bytes.get data 1 = '\xFE' then
13
+
Some (Encoding.Utf16le, 2)
14
+
else if len >= 2 &&
15
+
Bytes.get data 0 = '\xFE' &&
16
+
Bytes.get data 1 = '\xFF' then
17
+
Some (Encoding.Utf16be, 2)
18
+
else
19
+
None
+190
lib/encoding/decode.ml
+190
lib/encoding/decode.ml
···
1
+
(* HTML5 encoding detection and decoding *)
2
+
3
+
let decode_utf16 data ~is_le ~bom_len =
4
+
let len = Bytes.length data in
5
+
let buf = Buffer.create len in
6
+
let i = ref bom_len in
7
+
8
+
while !i + 1 < len do
9
+
let b0 = Char.code (Bytes.get data !i) in
10
+
let b1 = Char.code (Bytes.get data (!i + 1)) in
11
+
let code_unit =
12
+
if is_le then b0 lor (b1 lsl 8)
13
+
else (b0 lsl 8) lor b1
14
+
in
15
+
i := !i + 2;
16
+
17
+
(* Handle surrogate pairs *)
18
+
if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
19
+
(* High surrogate, look for low surrogate *)
20
+
let b2 = Char.code (Bytes.get data !i) in
21
+
let b3 = Char.code (Bytes.get data (!i + 1)) in
22
+
let code_unit2 =
23
+
if is_le then b2 lor (b3 lsl 8)
24
+
else (b2 lsl 8) lor b3
25
+
in
26
+
if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
27
+
i := !i + 2;
28
+
let high = code_unit - 0xD800 in
29
+
let low = code_unit2 - 0xDC00 in
30
+
let cp = 0x10000 + (high lsl 10) lor low in
31
+
Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
32
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
33
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
34
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
35
+
end else begin
36
+
(* Invalid surrogate, output replacement *)
37
+
Buffer.add_string buf "\xEF\xBF\xBD"
38
+
end
39
+
end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
40
+
(* Lone surrogate *)
41
+
Buffer.add_string buf "\xEF\xBF\xBD"
42
+
end else if code_unit <= 0x7F then begin
43
+
Buffer.add_char buf (Char.chr code_unit)
44
+
end else if code_unit <= 0x7FF then begin
45
+
Buffer.add_char buf (Char.chr (0xC0 lor (code_unit lsr 6)));
46
+
Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
47
+
end else begin
48
+
Buffer.add_char buf (Char.chr (0xE0 lor (code_unit lsr 12)));
49
+
Buffer.add_char buf (Char.chr (0x80 lor ((code_unit lsr 6) land 0x3F)));
50
+
Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
51
+
end
52
+
done;
53
+
54
+
(* Odd trailing byte *)
55
+
if !i < len then Buffer.add_string buf "\xEF\xBF\xBD";
56
+
57
+
Buffer.contents buf
58
+
59
+
let decode_with_encoding data enc ~bom_len =
60
+
match enc with
61
+
| Encoding.Utf8 ->
62
+
(* UTF-8: Just validate and replace errors with replacement character *)
63
+
let len = Bytes.length data in
64
+
let buf = Buffer.create len in
65
+
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String (Bytes.to_string data)) in
66
+
(* Skip BOM if present *)
67
+
let _ =
68
+
if bom_len > 0 then begin
69
+
for _ = 1 to bom_len do
70
+
ignore (Uutf.decode decoder)
71
+
done
72
+
end
73
+
in
74
+
let rec loop () =
75
+
match Uutf.decode decoder with
76
+
| `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
77
+
| `Malformed _ -> Buffer.add_string buf "\xEF\xBF\xBD"; loop ()
78
+
| `End -> ()
79
+
| `Await -> assert false
80
+
in
81
+
loop ();
82
+
Buffer.contents buf
83
+
84
+
| Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
85
+
| Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
86
+
87
+
| Encoding.Windows_1252 ->
88
+
let len = Bytes.length data in
89
+
let buf = Buffer.create len in
90
+
let table = [|
91
+
(* 0x80-0x9F *)
92
+
0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
93
+
0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
94
+
0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
95
+
0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
96
+
|] in
97
+
for i = bom_len to len - 1 do
98
+
let b = Char.code (Bytes.get data i) in
99
+
let cp =
100
+
if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
101
+
else b
102
+
in
103
+
if cp <= 0x7F then
104
+
Buffer.add_char buf (Char.chr cp)
105
+
else if cp <= 0x7FF then begin
106
+
Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
107
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
108
+
end else begin
109
+
Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
110
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
111
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
112
+
end
113
+
done;
114
+
Buffer.contents buf
115
+
116
+
| Encoding.Iso_8859_2 ->
117
+
let len = Bytes.length data in
118
+
let buf = Buffer.create len in
119
+
let table = [|
120
+
(* 0xA0-0xBF *)
121
+
0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7;
122
+
0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B;
123
+
0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7;
124
+
0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C;
125
+
(* 0xC0-0xFF *)
126
+
0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7;
127
+
0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E;
128
+
0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7;
129
+
0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF;
130
+
0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7;
131
+
0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F;
132
+
0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7;
133
+
0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9;
134
+
|] in
135
+
for i = bom_len to len - 1 do
136
+
let b = Char.code (Bytes.get data i) in
137
+
let cp =
138
+
if b >= 0xA0 then table.(b - 0xA0)
139
+
else b
140
+
in
141
+
if cp <= 0x7F then
142
+
Buffer.add_char buf (Char.chr cp)
143
+
else if cp <= 0x7FF then begin
144
+
Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
145
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
146
+
end else begin
147
+
Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
148
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
149
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
150
+
end
151
+
done;
152
+
Buffer.contents buf
153
+
154
+
| Encoding.Euc_jp ->
155
+
(* For EUC-JP, use uutf with best effort *)
156
+
let len = Bytes.length data in
157
+
let buf = Buffer.create len in
158
+
let s = Bytes.sub_string data bom_len (len - bom_len) in
159
+
(* EUC-JP not directly supported by uutf, fall back to treating high bytes as replacement *)
160
+
(* This is a simplification - full EUC-JP would need a separate decoder *)
161
+
String.iter (fun c ->
162
+
if Char.code c <= 0x7F then
163
+
Buffer.add_char buf c
164
+
else
165
+
Buffer.add_string buf "\xEF\xBF\xBD"
166
+
) s;
167
+
Buffer.contents buf
168
+
169
+
let decode data ?transport_encoding () =
170
+
(* Step 1: Check for BOM *)
171
+
let bom_result = Bom.sniff data in
172
+
match bom_result with
173
+
| Some (enc, bom_len) ->
174
+
(decode_with_encoding data enc ~bom_len, enc)
175
+
| None ->
176
+
(* Step 2: Check transport encoding (e.g., HTTP Content-Type) *)
177
+
let enc_from_transport =
178
+
match transport_encoding with
179
+
| Some te -> Labels.normalize_label te
180
+
| None -> None
181
+
in
182
+
match enc_from_transport with
183
+
| Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
184
+
| None ->
185
+
(* Step 3: Prescan for meta charset *)
186
+
match Prescan.prescan_for_meta_charset data with
187
+
| Some enc -> (decode_with_encoding data enc ~bom_len:0, enc)
188
+
| None ->
189
+
(* Default to UTF-8 *)
190
+
(decode_with_encoding data Encoding.Utf8 ~bom_len:0, Encoding.Utf8)
+4
lib/encoding/dune
+4
lib/encoding/dune
+17
lib/encoding/encoding.ml
+17
lib/encoding/encoding.ml
···
1
+
(* HTML5 encoding types *)
2
+
3
+
type t =
4
+
| Utf8
5
+
| Utf16le
6
+
| Utf16be
7
+
| Windows_1252
8
+
| Iso_8859_2
9
+
| Euc_jp
10
+
11
+
let to_string = function
12
+
| Utf8 -> "utf-8"
13
+
| Utf16le -> "utf-16le"
14
+
| Utf16be -> "utf-16be"
15
+
| Windows_1252 -> "windows-1252"
16
+
| Iso_8859_2 -> "iso-8859-2"
17
+
| Euc_jp -> "euc-jp"
+19
lib/encoding/html5rw_encoding.ml
+19
lib/encoding/html5rw_encoding.ml
···
1
+
(* html5rw.encoding - HTML5 encoding detection and decoding *)
2
+
3
+
type encoding = Encoding.t =
4
+
| Utf8
5
+
| Utf16le
6
+
| Utf16be
7
+
| Windows_1252
8
+
| Iso_8859_2
9
+
| Euc_jp
10
+
11
+
let encoding_to_string = Encoding.to_string
12
+
13
+
let sniff_bom = Bom.sniff
14
+
15
+
let normalize_label = Labels.normalize_label
16
+
17
+
let prescan_for_meta_charset = Prescan.prescan_for_meta_charset
18
+
19
+
let decode = Decode.decode
+41
lib/encoding/labels.ml
+41
lib/encoding/labels.ml
···
1
+
(* Encoding label normalization per WHATWG Encoding Standard *)
2
+
3
+
let normalize_label label =
4
+
if String.length label = 0 then None
5
+
else
6
+
let s = String.lowercase_ascii (String.trim label) in
7
+
if String.length s = 0 then None
8
+
else
9
+
(* Security: never allow utf-7 *)
10
+
if s = "utf-7" || s = "utf7" || s = "x-utf-7" then
11
+
Some Encoding.Windows_1252
12
+
else if s = "utf-8" || s = "utf8" then
13
+
Some Encoding.Utf8
14
+
(* HTML treats latin-1 labels as windows-1252 *)
15
+
else if s = "iso-8859-1" || s = "iso8859-1" || s = "latin1" ||
16
+
s = "latin-1" || s = "l1" || s = "cp819" || s = "ibm819" then
17
+
Some Encoding.Windows_1252
18
+
else if s = "windows-1252" || s = "windows1252" || s = "cp1252" || s = "x-cp1252" then
19
+
Some Encoding.Windows_1252
20
+
else if s = "iso-8859-2" || s = "iso8859-2" || s = "latin2" || s = "latin-2" then
21
+
Some Encoding.Iso_8859_2
22
+
else if s = "euc-jp" || s = "eucjp" then
23
+
Some Encoding.Euc_jp
24
+
else if s = "utf-16" || s = "utf16" then
25
+
Some Encoding.Utf16le (* Default to LE for ambiguous utf-16 *)
26
+
else if s = "utf-16le" || s = "utf16le" then
27
+
Some Encoding.Utf16le
28
+
else if s = "utf-16be" || s = "utf16be" then
29
+
Some Encoding.Utf16be
30
+
else
31
+
None
32
+
33
+
let normalize_meta_declared label =
34
+
match normalize_label label with
35
+
| None -> None
36
+
| Some enc ->
37
+
(* Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
38
+
treat them as UTF-8 *)
39
+
match enc with
40
+
| Encoding.Utf16le | Encoding.Utf16be -> Some Encoding.Utf8
41
+
| other -> Some other
+268
lib/encoding/prescan.ml
+268
lib/encoding/prescan.ml
···
1
+
(* HTML meta charset prescan per WHATWG spec *)
2
+
3
+
let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20']
4
+
5
+
let is_ascii_whitespace c = List.mem c ascii_whitespace
6
+
7
+
let is_ascii_alpha c =
8
+
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
9
+
10
+
let ascii_lower c =
11
+
if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32)
12
+
else c
13
+
14
+
let skip_whitespace data i len =
15
+
let j = ref i in
16
+
while !j < len && is_ascii_whitespace (Bytes.get data !j) do
17
+
incr j
18
+
done;
19
+
!j
20
+
21
+
let strip_whitespace data start len =
22
+
let s = ref start in
23
+
let e = ref (start + len) in
24
+
while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done;
25
+
while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done;
26
+
Bytes.sub_string data !s (!e - !s)
27
+
28
+
let extract_charset_from_content content =
29
+
let len = String.length content in
30
+
(* Find "charset" *)
31
+
let rec find_charset i =
32
+
if i + 7 > len then None
33
+
else
34
+
let sub = String.lowercase_ascii (String.sub content i 7) in
35
+
if sub = "charset" then
36
+
let j = ref (i + 7) in
37
+
(* Skip whitespace *)
38
+
while !j < len && is_ascii_whitespace content.[!j] do incr j done;
39
+
if !j >= len || content.[!j] <> '=' then find_charset (i + 1)
40
+
else begin
41
+
incr j;
42
+
(* Skip whitespace after = *)
43
+
while !j < len && is_ascii_whitespace content.[!j] do incr j done;
44
+
if !j >= len then None
45
+
else
46
+
let quote =
47
+
if content.[!j] = '"' || content.[!j] = '\'' then begin
48
+
let q = content.[!j] in
49
+
incr j;
50
+
Some q
51
+
end else None
52
+
in
53
+
let start = !j in
54
+
(match quote with
55
+
| Some q ->
56
+
while !j < len && content.[!j] <> q do incr j done;
57
+
if !j >= len then None
58
+
else Some (String.sub content start (!j - start))
59
+
| None ->
60
+
while !j < len &&
61
+
not (is_ascii_whitespace content.[!j]) &&
62
+
content.[!j] <> ';' do
63
+
incr j
64
+
done;
65
+
Some (String.sub content start (!j - start)))
66
+
end
67
+
else find_charset (i + 1)
68
+
in
69
+
find_charset 0
70
+
71
+
let prescan_for_meta_charset data =
72
+
let len = Bytes.length data in
73
+
let max_non_comment = 1024 in
74
+
let max_total = 65536 in
75
+
let i = ref 0 in
76
+
let non_comment = ref 0 in
77
+
78
+
let result = ref None in
79
+
80
+
while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do
81
+
if Bytes.get data !i <> '<' then begin
82
+
incr i;
83
+
incr non_comment
84
+
end else begin
85
+
(* Check for comment *)
86
+
if !i + 3 < len &&
87
+
Bytes.get data (!i + 1) = '!' &&
88
+
Bytes.get data (!i + 2) = '-' &&
89
+
Bytes.get data (!i + 3) = '-' then begin
90
+
(* Skip comment *)
91
+
let j = ref (!i + 4) in
92
+
while !j + 2 < len && not (
93
+
Bytes.get data !j = '-' &&
94
+
Bytes.get data (!j + 1) = '-' &&
95
+
Bytes.get data (!j + 2) = '>'
96
+
) do incr j done;
97
+
if !j + 2 < len then
98
+
i := !j + 3
99
+
else
100
+
i := len (* Unclosed comment, stop scanning *)
101
+
end
102
+
(* Check for end tag - skip it *)
103
+
else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then begin
104
+
let j = ref (!i + 2) in
105
+
let in_quote = ref None in
106
+
while !j < len && !j < max_total && !non_comment < max_non_comment do
107
+
let c = Bytes.get data !j in
108
+
match !in_quote with
109
+
| None ->
110
+
if c = '"' || c = '\'' then begin
111
+
in_quote := Some c;
112
+
incr j;
113
+
incr non_comment
114
+
end else if c = '>' then begin
115
+
incr j;
116
+
incr non_comment;
117
+
j := len (* Exit loop *)
118
+
end else begin
119
+
incr j;
120
+
incr non_comment
121
+
end
122
+
| Some q ->
123
+
if c = q then in_quote := None;
124
+
incr j;
125
+
incr non_comment
126
+
done;
127
+
i := !j
128
+
end
129
+
(* Check for tag *)
130
+
else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin
131
+
let j = ref (!i + 1) in
132
+
while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done;
133
+
let tag_name =
134
+
let name_bytes = Bytes.sub data (!i + 1) (!j - !i - 1) in
135
+
String.lowercase_ascii (Bytes.to_string name_bytes)
136
+
in
137
+
138
+
if tag_name <> "meta" then begin
139
+
(* Skip non-meta tag *)
140
+
let in_quote = ref None in
141
+
while !j < len && !j < max_total && !non_comment < max_non_comment do
142
+
let c = Bytes.get data !j in
143
+
match !in_quote with
144
+
| None ->
145
+
if c = '"' || c = '\'' then begin
146
+
in_quote := Some c;
147
+
incr j;
148
+
incr non_comment
149
+
end else if c = '>' then begin
150
+
incr j;
151
+
incr non_comment;
152
+
j := len
153
+
end else begin
154
+
incr j;
155
+
incr non_comment
156
+
end
157
+
| Some q ->
158
+
if c = q then in_quote := None;
159
+
incr j;
160
+
incr non_comment
161
+
done;
162
+
i := !j
163
+
end else begin
164
+
(* Parse meta tag attributes *)
165
+
let charset = ref None in
166
+
let http_equiv = ref None in
167
+
let content = ref None in
168
+
let k = ref !j in
169
+
let saw_gt = ref false in
170
+
171
+
while not !saw_gt && !k < len && !k < max_total do
172
+
let c = Bytes.get data !k in
173
+
if c = '>' then begin
174
+
saw_gt := true;
175
+
incr k
176
+
end else if c = '<' then begin
177
+
(* Restart scanning from here *)
178
+
k := len
179
+
end else if is_ascii_whitespace c || c = '/' then begin
180
+
incr k
181
+
end else begin
182
+
(* Attribute name *)
183
+
let attr_start = !k in
184
+
while !k < len &&
185
+
not (is_ascii_whitespace (Bytes.get data !k)) &&
186
+
Bytes.get data !k <> '=' &&
187
+
Bytes.get data !k <> '>' &&
188
+
Bytes.get data !k <> '/' &&
189
+
Bytes.get data !k <> '<' do
190
+
incr k
191
+
done;
192
+
let attr_name =
193
+
String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start))
194
+
in
195
+
k := skip_whitespace data !k len;
196
+
197
+
let value = ref None in
198
+
if !k < len && Bytes.get data !k = '=' then begin
199
+
incr k;
200
+
k := skip_whitespace data !k len;
201
+
if !k < len then begin
202
+
let qc = Bytes.get data !k in
203
+
if qc = '"' || qc = '\'' then begin
204
+
incr k;
205
+
let val_start = !k in
206
+
while !k < len && Bytes.get data !k <> qc do incr k done;
207
+
if !k < len then begin
208
+
value := Some (Bytes.sub_string data val_start (!k - val_start));
209
+
incr k
210
+
end
211
+
end else begin
212
+
let val_start = !k in
213
+
while !k < len &&
214
+
not (is_ascii_whitespace (Bytes.get data !k)) &&
215
+
Bytes.get data !k <> '>' &&
216
+
Bytes.get data !k <> '<' do
217
+
incr k
218
+
done;
219
+
value := Some (Bytes.sub_string data val_start (!k - val_start))
220
+
end
221
+
end
222
+
end;
223
+
224
+
if attr_name = "charset" then
225
+
charset := !value
226
+
else if attr_name = "http-equiv" then
227
+
http_equiv := !value
228
+
else if attr_name = "content" then
229
+
content := !value
230
+
end
231
+
done;
232
+
233
+
if !saw_gt then begin
234
+
(* Check for charset *)
235
+
(match !charset with
236
+
| Some cs ->
237
+
(match Labels.normalize_meta_declared cs with
238
+
| Some enc -> result := Some enc
239
+
| None -> ())
240
+
| None -> ());
241
+
242
+
(* Check for http-equiv="content-type" with content *)
243
+
if !result = None then
244
+
(match !http_equiv, !content with
245
+
| Some he, Some ct when String.lowercase_ascii he = "content-type" ->
246
+
(match extract_charset_from_content ct with
247
+
| Some extracted ->
248
+
(match Labels.normalize_meta_declared extracted with
249
+
| Some enc -> result := Some enc
250
+
| None -> ())
251
+
| None -> ())
252
+
| _ -> ());
253
+
254
+
i := !k;
255
+
non_comment := !non_comment + (!k - !j)
256
+
end else begin
257
+
incr i;
258
+
incr non_comment
259
+
end
260
+
end
261
+
end else begin
262
+
incr i;
263
+
incr non_comment
264
+
end
265
+
end
266
+
done;
267
+
268
+
!result
+192
lib/entities/decode.ml
+192
lib/entities/decode.ml
···
1
+
(* HTML5 entity decoding *)
2
+
3
+
let is_alpha c =
4
+
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
5
+
6
+
let is_alnum c =
7
+
is_alpha c || (c >= '0' && c <= '9')
8
+
9
+
let is_hex_digit c =
10
+
(c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
11
+
12
+
let is_digit c =
13
+
c >= '0' && c <= '9'
14
+
15
+
let decode_entities_in_text text ~in_attribute =
16
+
let len = String.length text in
17
+
let buf = Buffer.create len in
18
+
let i = ref 0 in
19
+
20
+
while !i < len do
21
+
(* Find next ampersand *)
22
+
let amp_pos =
23
+
try Some (String.index_from text !i '&')
24
+
with Not_found -> None
25
+
in
26
+
27
+
match amp_pos with
28
+
| None ->
29
+
(* No more ampersands, append rest *)
30
+
Buffer.add_substring buf text !i (len - !i);
31
+
i := len
32
+
| Some amp ->
33
+
(* Append text before ampersand *)
34
+
if amp > !i then
35
+
Buffer.add_substring buf text !i (amp - !i);
36
+
37
+
i := amp;
38
+
let j = ref (amp + 1) in
39
+
40
+
if !j >= len then begin
41
+
(* Ampersand at end *)
42
+
Buffer.add_char buf '&';
43
+
i := len
44
+
end else if text.[!j] = '#' then begin
45
+
(* Numeric entity *)
46
+
incr j;
47
+
let is_hex =
48
+
if !j < len && (text.[!j] = 'x' || text.[!j] = 'X') then begin
49
+
incr j;
50
+
true
51
+
end else false
52
+
in
53
+
54
+
let digit_start = !j in
55
+
if is_hex then
56
+
while !j < len && is_hex_digit text.[!j] do incr j done
57
+
else
58
+
while !j < len && is_digit text.[!j] do incr j done;
59
+
60
+
let has_semicolon = !j < len && text.[!j] = ';' in
61
+
let digit_text = String.sub text digit_start (!j - digit_start) in
62
+
63
+
if String.length digit_text > 0 then begin
64
+
match Numeric_ref.decode digit_text ~is_hex with
65
+
| Some decoded ->
66
+
Buffer.add_string buf decoded;
67
+
i := if has_semicolon then !j + 1 else !j
68
+
| None ->
69
+
(* Invalid numeric entity, keep as-is *)
70
+
let end_pos = if has_semicolon then !j + 1 else !j in
71
+
Buffer.add_substring buf text amp (end_pos - amp);
72
+
i := end_pos
73
+
end else begin
74
+
(* No digits, keep as-is *)
75
+
let end_pos = if has_semicolon then !j + 1 else !j in
76
+
Buffer.add_substring buf text amp (end_pos - amp);
77
+
i := end_pos
78
+
end
79
+
end else begin
80
+
(* Named entity *)
81
+
(* Collect alphanumeric characters *)
82
+
while !j < len && is_alnum text.[!j] do incr j done;
83
+
84
+
let entity_name = String.sub text (amp + 1) (!j - amp - 1) in
85
+
let has_semicolon = !j < len && text.[!j] = ';' in
86
+
87
+
if String.length entity_name = 0 then begin
88
+
Buffer.add_char buf '&';
89
+
i := amp + 1
90
+
end else begin
91
+
(* Try exact match first (with semicolon expected) *)
92
+
let decoded =
93
+
if has_semicolon then
94
+
Entity_table.lookup entity_name
95
+
else
96
+
None
97
+
in
98
+
99
+
match decoded with
100
+
| Some value ->
101
+
Buffer.add_string buf value;
102
+
i := !j + 1
103
+
| None ->
104
+
(* If semicolon present but no exact match, try legacy prefix match in text *)
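(* For example, "&notit;" names no entity, but the legacy entity "not"
   matches as a prefix, so the text decodes to U+00AC followed by "it;". *)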
105
+
if has_semicolon && not in_attribute then begin
106
+
(* Try progressively shorter prefixes *)
107
+
let rec try_prefix k =
108
+
if k <= 0 then None
109
+
else
110
+
let prefix = String.sub entity_name 0 k in
111
+
if Entity_table.is_legacy prefix then
112
+
match Entity_table.lookup prefix with
113
+
| Some value -> Some (value, k)
114
+
| None -> try_prefix (k - 1)
115
+
else
116
+
try_prefix (k - 1)
117
+
in
118
+
match try_prefix (String.length entity_name) with
119
+
| Some (value, matched_len) ->
120
+
Buffer.add_string buf value;
121
+
i := amp + 1 + matched_len
122
+
| None ->
123
+
(* No match, keep as-is including semicolon *)
124
+
Buffer.add_substring buf text amp (!j + 1 - amp);
125
+
i := !j + 1
126
+
end else if not has_semicolon then begin
127
+
(* Try without semicolon for legacy compatibility *)
128
+
if Entity_table.is_legacy entity_name then
129
+
match Entity_table.lookup entity_name with
130
+
| Some value ->
131
+
(* Legacy entities without semicolon have strict rules in attributes *)
132
+
let next_char = if !j < len then Some text.[!j] else None in
133
+
let blocked =
134
+
in_attribute &&
135
+
match next_char with
136
+
| Some c -> is_alnum c || c = '='
137
+
| None -> false
138
+
in
139
+
if blocked then begin
140
+
Buffer.add_char buf '&';
141
+
i := amp + 1
142
+
end else begin
143
+
Buffer.add_string buf value;
144
+
i := !j
145
+
end
146
+
| None ->
147
+
Buffer.add_char buf '&';
148
+
i := amp + 1
149
+
else begin
150
+
(* Try longest prefix match for legacy entities *)
151
+
let rec try_prefix k =
152
+
if k <= 0 then None
153
+
else
154
+
let prefix = String.sub entity_name 0 k in
155
+
if Entity_table.is_legacy prefix then
156
+
match Entity_table.lookup prefix with
157
+
| Some value -> Some (value, k)
158
+
| None -> try_prefix (k - 1)
159
+
else
160
+
try_prefix (k - 1)
161
+
in
162
+
match try_prefix (String.length entity_name) with
163
+
| Some (value, matched_len) ->
164
+
let end_pos = amp + 1 + matched_len in
165
+
let next_char = if end_pos < len then Some text.[end_pos] else None in
166
+
let blocked =
167
+
in_attribute &&
168
+
match next_char with
169
+
| Some c -> is_alnum c || c = '='
170
+
| None -> false
171
+
in
172
+
if blocked then begin
173
+
Buffer.add_char buf '&';
174
+
i := amp + 1
175
+
end else begin
176
+
Buffer.add_string buf value;
177
+
i := end_pos
178
+
end
179
+
| None ->
180
+
Buffer.add_char buf '&';
181
+
i := amp + 1
182
+
end
183
+
end else begin
184
+
(* Has semicolon but no match *)
185
+
Buffer.add_substring buf text amp (!j + 1 - amp);
186
+
i := !j + 1
187
+
end
188
+
end
189
+
end
190
+
done;
191
+
192
+
Buffer.contents buf
+8
lib/entities/dune
+8
lib/entities/dune
+13
lib/entities/html5rw_entities.ml
+13
lib/entities/html5rw_entities.ml
···
1
+
(* html5rw.entities - HTML5 entity decoding *)
2
+
3
+
let decode = Decode.decode_entities_in_text
4
+
5
+
let decode_numeric = Numeric_ref.decode
6
+
7
+
let lookup = Entity_table.lookup
8
+
9
+
let is_legacy = Entity_table.is_legacy
10
+
11
+
let codepoint_to_utf8 = Numeric_ref.codepoint_to_utf8
12
+
13
+
module Numeric_ref = Numeric_ref
+85
lib/entities/numeric_ref.ml
+85
lib/entities/numeric_ref.ml
···
1
+
(* HTML5 numeric character reference decoding *)
2
+
3
+
(* HTML5 spec: numeric character reference replacements (§13.2.5.73) *)
4
+
let numeric_replacements = [|
5
+
(0x00, 0xFFFD); (* NULL -> REPLACEMENT CHARACTER *)
6
+
(0x80, 0x20AC); (* -> EURO SIGN *)
7
+
(0x82, 0x201A); (* -> SINGLE LOW-9 QUOTATION MARK *)
8
+
(0x83, 0x0192); (* -> LATIN SMALL LETTER F WITH HOOK *)
9
+
(0x84, 0x201E); (* -> DOUBLE LOW-9 QUOTATION MARK *)
10
+
(0x85, 0x2026); (* -> HORIZONTAL ELLIPSIS *)
11
+
(0x86, 0x2020); (* -> DAGGER *)
12
+
(0x87, 0x2021); (* -> DOUBLE DAGGER *)
13
+
(0x88, 0x02C6); (* -> MODIFIER LETTER CIRCUMFLEX ACCENT *)
14
+
(0x89, 0x2030); (* -> PER MILLE SIGN *)
15
+
(0x8A, 0x0160); (* -> LATIN CAPITAL LETTER S WITH CARON *)
16
+
(0x8B, 0x2039); (* -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK *)
17
+
(0x8C, 0x0152); (* -> LATIN CAPITAL LIGATURE OE *)
18
+
(0x8E, 0x017D); (* -> LATIN CAPITAL LETTER Z WITH CARON *)
19
+
(0x91, 0x2018); (* -> LEFT SINGLE QUOTATION MARK *)
20
+
(0x92, 0x2019); (* -> RIGHT SINGLE QUOTATION MARK *)
21
+
(0x93, 0x201C); (* -> LEFT DOUBLE QUOTATION MARK *)
22
+
(0x94, 0x201D); (* -> RIGHT DOUBLE QUOTATION MARK *)
23
+
(0x95, 0x2022); (* -> BULLET *)
24
+
(0x96, 0x2013); (* -> EN DASH *)
25
+
(0x97, 0x2014); (* -> EM DASH *)
26
+
(0x98, 0x02DC); (* -> SMALL TILDE *)
27
+
(0x99, 0x2122); (* -> TRADE MARK SIGN *)
28
+
(0x9A, 0x0161); (* -> LATIN SMALL LETTER S WITH CARON *)
29
+
(0x9B, 0x203A); (* -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK *)
30
+
(0x9C, 0x0153); (* -> LATIN SMALL LIGATURE OE *)
31
+
(0x9E, 0x017E); (* -> LATIN SMALL LETTER Z WITH CARON *)
32
+
(0x9F, 0x0178); (* -> LATIN CAPITAL LETTER Y WITH DIAERESIS *)
33
+
|]
34
+
35
+
let find_replacement cp =
36
+
let rec search i =
37
+
if i >= Array.length numeric_replacements then None
38
+
else
39
+
let (k, v) = numeric_replacements.(i) in
40
+
if k = cp then Some v
41
+
else if k > cp then None
42
+
else search (i + 1)
43
+
in
44
+
search 0
45
+
46
+
let codepoint_to_utf8 cp =
47
+
let buf = Buffer.create 4 in
48
+
if cp <= 0x7F then
49
+
Buffer.add_char buf (Char.chr cp)
50
+
else if cp <= 0x7FF then begin
51
+
Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
52
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
53
+
end else if cp <= 0xFFFF then begin
54
+
Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
55
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
56
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
57
+
end else begin
58
+
Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
59
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
60
+
Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
61
+
Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
62
+
end;
63
+
Buffer.contents buf
64
+
65
+
let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
66
+
67
+
let decode text ~is_hex =
68
+
match int_of_string_opt ((if is_hex then "0x" else "") ^ text) with
69
+
| None -> None
70
+
| Some cp ->
71
+
(* Apply HTML5 replacements *)
72
+
let cp = match find_replacement cp with
73
+
| Some replacement -> replacement
74
+
| None -> cp
75
+
in
76
+
(* Invalid ranges per HTML5 spec *)
77
+
if cp > 0x10FFFF then
78
+
Some replacement_char
79
+
else if cp >= 0xD800 && cp <= 0xDFFF then
80
+
(* Surrogate range *)
81
+
Some replacement_char
82
+
else if cp = 0 then
83
+
Some replacement_char
84
+
else
85
+
Some (codepoint_to_utf8 cp)
+11
lib/html5rw/dune
+11
lib/html5rw/dune
+302
lib/html5rw/html5rw.ml
+302
lib/html5rw/html5rw.ml
···
1
+
(** Html5rw - Pure OCaml HTML5 Parser
2
+
3
+
This module provides a complete HTML5 parsing solution following the
4
+
WHATWG specification. It uses bytesrw for streaming input/output.
5
+
6
+
{2 Quick Start}
7
+
8
+
Parse HTML from a reader:
9
+
{[
10
+
open Bytesrw
11
+
let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
12
+
let result = Html5rw.parse reader in
13
+
let html = Html5rw.to_string result
14
+
]}
15
+
16
+
Parse from a file:
17
+
{[
18
+
open Bytesrw
19
+
let ic = open_in "page.html" in
20
+
let reader = Bytes.Reader.of_in_channel ic in
21
+
let result = Html5rw.parse reader in
22
+
close_in ic
23
+
]}
24
+
25
+
Query with CSS selectors:
26
+
{[
27
+
let result = Html5rw.parse reader in
28
+
let divs = Html5rw.query result "div.content"
29
+
]}
30
+
*)
31
+
32
+
(** {1 Sub-modules} *)
33
+
34
+
(** DOM types and manipulation functions *)
35
+
module Dom = Html5rw_dom
36
+
37
+
(** HTML5 tokenizer *)
38
+
module Tokenizer = Html5rw_tokenizer
39
+
40
+
(** Encoding detection and decoding *)
41
+
module Encoding = Html5rw_encoding
42
+
43
+
(** CSS selector engine *)
44
+
module Selector = Html5rw_selector
45
+
46
+
(** HTML entity decoding *)
47
+
module Entities = Html5rw_entities
48
+
49
+
(** Low-level parser access *)
50
+
module Parser = Html5rw_parser
51
+
52
+
(** {1 Core Types} *)
53
+
54
+
(** DOM node type. See {!Dom} for manipulation functions. *)
55
+
type node = Dom.node
56
+
57
+
(** Doctype information *)
58
+
type doctype_data = Dom.doctype_data = {
59
+
name : string option;
60
+
public_id : string option;
61
+
system_id : string option;
62
+
}
63
+
64
+
(** Quirks mode as determined during parsing *)
65
+
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
66
+
67
+
(** Character encoding detected or specified *)
68
+
type encoding = Encoding.encoding =
69
+
| Utf8
70
+
| Utf16le
71
+
| Utf16be
72
+
| Windows_1252
73
+
| Iso_8859_2
74
+
| Euc_jp
75
+
76
+
(** Parse error record *)
77
+
type parse_error = Parser.parse_error
78
+
79
+
(** Fragment parsing context *)
80
+
type fragment_context = Parser.fragment_context
81
+
82
+
(** Create a fragment parsing context.
83
+
@param tag_name Tag name of the context element
84
+
@param namespace Namespace (None for HTML, Some "svg", Some "mathml")
85
+
*)
86
+
let make_fragment_context = Parser.make_fragment_context
87
+
88
+
(** Get the tag name from a fragment context *)
89
+
let fragment_context_tag = Parser.fragment_context_tag
90
+
91
+
(** Get the namespace from a fragment context *)
92
+
let fragment_context_namespace = Parser.fragment_context_namespace
93
+
94
+
(** Get the error code string *)
95
+
let error_code = Parser.error_code
96
+
97
+
(** Get the line number of an error (1-indexed) *)
98
+
let error_line = Parser.error_line
99
+
100
+
(** Get the column number of an error (1-indexed) *)
101
+
let error_column = Parser.error_column
102
+
103
+
(** Result of parsing an HTML document *)
104
+
type t = {
105
+
root : node;
106
+
errors : parse_error list;
107
+
encoding : encoding option;
108
+
}
109
+
110
+
(* Internal: convert Parser.t to our t *)
111
+
let of_parser_result (p : Parser.t) : t =
112
+
{ root = Parser.root p; errors = Parser.errors p; encoding = Parser.encoding p }
113
+
114
+
(** {1 Parsing Functions} *)
115
+
116
+
(** Parse HTML from a [Bytes.Reader.t].
117
+
118
+
This is the primary parsing function. Create a reader from any source:
119
+
- [Bytes.Reader.of_string s] for strings
120
+
- [Bytes.Reader.of_in_channel ic] for files
121
+
- [Bytes.Reader.of_bytes b] for byte buffers
122
+
123
+
{[
124
+
open Bytesrw
125
+
let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
126
+
let result = Html5rw.parse reader
127
+
]}
128
+
129
+
@param collect_errors If true, collect parse errors (default: false)
130
+
@param fragment_context Context element for fragment parsing
131
+
*)
132
+
let parse ?collect_errors ?fragment_context reader =
133
+
of_parser_result (Parser.parse ?collect_errors ?fragment_context reader)
134
+
135
+
(** Parse raw bytes with automatic encoding detection.
136
+
137
+
This function implements the WHATWG encoding sniffing algorithm:
138
+
1. Check for BOM (Byte Order Mark)
139
+
2. Prescan for <meta charset>
140
+
3. Use the transport encoding hint if provided
4. Fall back to UTF-8
141
+
142
+
@param collect_errors If true, collect parse errors (default: false)
143
+
@param transport_encoding Encoding from HTTP Content-Type header
144
+
@param fragment_context Context element for fragment parsing
145
+
*)
146
+
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
147
+
of_parser_result (Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes)
148
+
149
+
(** {1 Querying} *)
150
+
151
+
(** Query the DOM tree with a CSS selector.
152
+
153
+
Supported selectors:
154
+
- Tag: [div], [p], [span]
155
+
- ID: [#myid]
156
+
- Class: [.myclass]
157
+
- Universal: [*]
158
+
- Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
159
+
- Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
160
+
- Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)
161
+
162
+
{[
163
+
let divs = Html5rw.query result "div.content > p"
164
+
]}
165
+
166
+
@raise Selector.Selector_error if the selector is invalid
167
+
*)
168
+
let query t selector = Selector.query t.root selector
169
+
170
+
(** Check if a node matches a CSS selector. *)
171
+
let matches node selector = Selector.matches node selector
172
+
173
+
(** {1 Serialization} *)
174
+
175
+
(** Write the DOM tree to a [Bytes.Writer.t].
176
+
177
+
{[
178
+
open Bytesrw
179
+
let buf = Buffer.create 1024 in
180
+
let writer = Bytes.Writer.of_buffer buf in
181
+
Html5rw.to_writer result writer;
182
+
Bytes.Writer.write_eod writer;
183
+
let html = Buffer.contents buf
184
+
]}
185
+
186
+
@param pretty If true, format with indentation (default: true)
187
+
@param indent_size Number of spaces per indent level (default: 2)
188
+
*)
189
+
let to_writer ?pretty ?indent_size t writer =
190
+
Dom.to_writer ?pretty ?indent_size writer t.root
191
+
192
+
(** Serialize the DOM tree to a string.
193
+
194
+
Convenience function when the output fits in memory.
195
+
196
+
@param pretty If true, format with indentation (default: true)
197
+
@param indent_size Number of spaces per indent level (default: 2)
198
+
*)
199
+
let to_string ?pretty ?indent_size t = Dom.to_html ?pretty ?indent_size t.root
200
+
201
+
(** Extract text content from the DOM tree.
202
+
203
+
@param separator String to insert between text nodes (default: " ")
204
+
@param strip If true, trim whitespace (default: true)
205
+
*)
206
+
let to_text ?separator ?strip t = Dom.to_text ?separator ?strip t.root
207
+
208
+
(** Serialize to html5lib test format (for testing). *)
209
+
let to_test_format t = Dom.to_test_format t.root
210
+
211
+
(** {1 Result Accessors} *)
212
+
213
+
(** Get the root node of the parsed document. *)
214
+
let root t = t.root
215
+
216
+
(** Get parse errors (if error collection was enabled). *)
217
+
let errors t = t.errors
218
+
219
+
(** Get the detected encoding (if parsed from bytes). *)
220
+
let encoding t = t.encoding
221
+
222
+
(** {1 DOM Utilities}
223
+
224
+
Common DOM operations are available directly. For the full API,
225
+
see the {!Dom} module.
226
+
*)
227
+
228
+
(** Create an element node.
229
+
@param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
230
+
@param attrs List of (name, value) attribute pairs
231
+
*)
232
+
let create_element = Dom.create_element
233
+
234
+
(** Create a text node. *)
235
+
let create_text = Dom.create_text
236
+
237
+
(** Create a comment node. *)
238
+
let create_comment = Dom.create_comment
239
+
240
+
(** Create an empty document node. *)
241
+
let create_document = Dom.create_document
242
+
243
+
(** Create a document fragment node. *)
244
+
let create_document_fragment = Dom.create_document_fragment
245
+
246
+
(** Create a doctype node. *)
247
+
let create_doctype = Dom.create_doctype
248
+
249
+
(** Append a child node to a parent. *)
250
+
let append_child = Dom.append_child
251
+
252
+
(** Insert a node before a reference node. *)
253
+
let insert_before = Dom.insert_before
254
+
255
+
(** Remove a child node from its parent. *)
256
+
let remove_child = Dom.remove_child
257
+
258
+
(** Get an attribute value. *)
259
+
let get_attr = Dom.get_attr
260
+
261
+
(** Set an attribute value. *)
262
+
let set_attr = Dom.set_attr
263
+
264
+
(** Check if a node has an attribute. *)
265
+
let has_attr = Dom.has_attr
266
+
267
+
(** Get all descendant nodes. *)
268
+
let descendants = Dom.descendants
269
+
270
+
(** Get all ancestor nodes (from parent to root). *)
271
+
let ancestors = Dom.ancestors
272
+
273
+
(** Get text content of a node and its descendants. *)
274
+
let get_text_content = Dom.get_text_content
275
+
276
+
(** Clone a node.
277
+
@param deep If true, also clone descendants (default: false)
278
+
*)
279
+
let clone = Dom.clone
280
+
281
+
(** {1 Node Predicates} *)
282
+
283
+
(** Test if a node is an element. *)
284
+
let is_element = Dom.is_element
285
+
286
+
(** Test if a node is a text node. *)
287
+
let is_text = Dom.is_text
288
+
289
+
(** Test if a node is a comment node. *)
290
+
let is_comment = Dom.is_comment
291
+
292
+
(** Test if a node is a document node. *)
293
+
let is_document = Dom.is_document
294
+
295
+
(** Test if a node is a document fragment. *)
296
+
let is_document_fragment = Dom.is_document_fragment
297
+
298
+
(** Test if a node is a doctype node. *)
299
+
let is_doctype = Dom.is_doctype
300
+
301
+
(** Test if a node has children. *)
302
+
let has_children = Dom.has_children
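(* A minimal DOM-construction sketch using the wrappers above; the element
   names and attribute values are made up for illustration.

     let ul = create_element "ul" ~attrs:[ ("class", "menu") ] () in
     let li = create_element "li" () in
     append_child ul li;
     append_child li (create_text "Home");
     set_attr ul "id" "nav";
     assert (get_attr ul "id" = Some "nav");
     assert (has_children ul)
*)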
+324
lib/html5rw/html5rw.mli
···
1
+
(** Html5rw - Pure OCaml HTML5 Parser
2
+
3
+
This module provides a complete HTML5 parsing solution following the
4
+
WHATWG specification. It uses bytesrw for streaming input/output.
5
+
6
+
{2 Quick Start}
7
+
8
+
Parse HTML from a reader:
9
+
{[
10
+
open Bytesrw
11
+
let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
12
+
let result = Html5rw.parse reader in
13
+
let html = Html5rw.to_string result
14
+
]}
15
+
16
+
Parse from a file:
17
+
{[
18
+
open Bytesrw
19
+
let ic = open_in "page.html" in
20
+
let reader = Bytes.Reader.of_in_channel ic in
21
+
let result = Html5rw.parse reader in
22
+
close_in ic
23
+
]}
24
+
25
+
Query with CSS selectors:
26
+
{[
27
+
let result = Html5rw.parse reader in
28
+
let divs = Html5rw.query result "div.content"
29
+
]}
30
+
*)
31
+
32
+
(** {1 Sub-modules} *)
33
+
34
+
(** DOM types and manipulation functions *)
35
+
module Dom = Html5rw_dom
36
+
37
+
(** HTML5 tokenizer *)
38
+
module Tokenizer = Html5rw_tokenizer
39
+
40
+
(** Encoding detection and decoding *)
41
+
module Encoding = Html5rw_encoding
42
+
43
+
(** CSS selector engine *)
44
+
module Selector = Html5rw_selector
45
+
46
+
(** HTML entity decoding *)
47
+
module Entities = Html5rw_entities
48
+
49
+
(** Low-level parser access *)
50
+
module Parser = Html5rw_parser
51
+
52
+
(** {1 Core Types} *)
53
+
54
+
(** DOM node type. See {!Dom} for manipulation functions. *)
55
+
type node = Dom.node
56
+
57
+
(** Doctype information *)
58
+
type doctype_data = Dom.doctype_data = {
59
+
name : string option;
60
+
public_id : string option;
61
+
system_id : string option;
62
+
}
63
+
64
+
(** Quirks mode as determined during parsing *)
65
+
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks
66
+
67
+
(** Character encoding detected or specified *)
68
+
type encoding = Encoding.encoding =
69
+
| Utf8
70
+
| Utf16le
71
+
| Utf16be
72
+
| Windows_1252
73
+
| Iso_8859_2
74
+
| Euc_jp
75
+
76
+
(** A parse error encountered during HTML5 parsing.
77
+
78
+
HTML5 parsing never fails - the specification defines error recovery
79
+
for all malformed input. However, conformance checkers can report
80
+
these errors. Enable error collection with [~collect_errors:true].
81
+
82
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
83
+
WHATWG parse error definitions
84
+
*)
85
+
type parse_error = Parser.parse_error
86
+
87
+
(** Get the error code (e.g., "unexpected-null-character"). *)
88
+
val error_code : parse_error -> string
89
+
90
+
(** Get the line number where the error occurred (1-indexed). *)
91
+
val error_line : parse_error -> int
92
+
93
+
(** Get the column number where the error occurred (1-indexed). *)
94
+
val error_column : parse_error -> int
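(* A short error-inspection sketch using the accessors above, assuming a
   [reader] built elsewhere with Bytesrw:

     let result = Html5rw.parse ~collect_errors:true reader in
     List.iter
       (fun e ->
          Printf.printf "%s at %d:%d\n"
            (Html5rw.error_code e) (Html5rw.error_line e) (Html5rw.error_column e))
       (Html5rw.errors result)
*)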
95
+
96
+
(** Context element for HTML fragment parsing (innerHTML).
97
+
98
+
When parsing HTML fragments, you must specify what element would
99
+
contain the fragment. This affects how certain elements are handled.
100
+
101
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
102
+
The fragment parsing algorithm
103
+
*)
104
+
type fragment_context = Parser.fragment_context
105
+
106
+
(** Create a fragment parsing context.
107
+
108
+
@param tag_name Tag name of the context element (e.g., "div", "tr")
109
+
@param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
110
+
111
+
{[
112
+
(* Parse as innerHTML of a <ul> *)
113
+
let ctx = Html5rw.make_fragment_context ~tag_name:"ul" ()
114
+
115
+
(* Parse as innerHTML of an SVG <g> element *)
116
+
let ctx = Html5rw.make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
117
+
]}
118
+
*)
119
+
val make_fragment_context : tag_name:string -> ?namespace:string option ->
120
+
unit -> fragment_context
121
+
122
+
(** Get the tag name of a fragment context. *)
123
+
val fragment_context_tag : fragment_context -> string
124
+
125
+
(** Get the namespace of a fragment context. *)
126
+
val fragment_context_namespace : fragment_context -> string option
127
+
128
+
(** Result of parsing an HTML document.
129
+
130
+
Contains the parsed DOM tree, any errors encountered, and the
131
+
detected encoding (when parsing from bytes).
132
+
*)
133
+
type t = {
134
+
root : node;
135
+
errors : parse_error list;
136
+
encoding : encoding option;
137
+
}
138
+
139
+
(** {1 Parsing Functions} *)
140
+
141
+
(** Parse HTML from a [Bytes.Reader.t].
142
+
143
+
This is the primary parsing function. Create a reader from any source:
144
+
- [Bytes.Reader.of_string s] for strings
145
+
- [Bytes.Reader.of_in_channel ic] for files
146
+
- [Bytes.Reader.of_bytes b] for byte buffers
147
+
148
+
{[
149
+
open Bytesrw
150
+
let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
151
+
let result = Html5rw.parse reader
152
+
]}
153
+
154
+
@param collect_errors If true, collect parse errors (default: false)
155
+
@param fragment_context Context element for fragment parsing
156
+
*)
157
+
val parse : ?collect_errors:bool -> ?fragment_context:fragment_context -> Bytesrw.Bytes.Reader.t -> t
158
+
159
+
(** Parse raw bytes with automatic encoding detection.
160
+
161
+
This function implements the WHATWG encoding sniffing algorithm:
162
+
1. Check for BOM (Byte Order Mark)
163
+
2. Prescan for <meta charset>
164
+
3. Use the transport encoding hint if provided
4. Fall back to UTF-8
165
+
166
+
@param collect_errors If true, collect parse errors (default: false)
167
+
@param transport_encoding Encoding from HTTP Content-Type header
168
+
@param fragment_context Context element for fragment parsing
169
+
*)
170
+
val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string -> ?fragment_context:fragment_context -> bytes -> t
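(* A hedged sketch of byte-level parsing; the input bytes and the
   "windows-1252" header value are hypothetical:

     let data = Bytes.of_string "<p>caf\xE9</p>" in
     let result = Html5rw.parse_bytes ~transport_encoding:"windows-1252" data in
     (match Html5rw.encoding result with
      | Some Windows_1252 -> print_endline "decoded as windows-1252"
      | _ -> print_endline "fell back to another encoding")
*)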
171
+
172
+
(** {1 Querying} *)
173
+
174
+
(** Query the DOM tree with a CSS selector.
175
+
176
+
Supported selectors:
177
+
- Tag: [div], [p], [span]
178
+
- ID: [#myid]
179
+
- Class: [.myclass]
180
+
- Universal: [*]
181
+
- Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
182
+
- Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
183
+
- Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)
184
+
185
+
{[
186
+
let divs = Html5rw.query result "div.content > p"
187
+
]}
188
+
189
+
@raise Selector.Selector_error if the selector is invalid
190
+
*)
191
+
val query : t -> string -> node list
192
+
193
+
(** Check if a node matches a CSS selector. *)
194
+
val matches : node -> string -> bool
195
+
196
+
(** {1 Serialization} *)
197
+
198
+
(** Write the DOM tree to a [Bytes.Writer.t].
199
+
200
+
{[
201
+
open Bytesrw
202
+
let buf = Buffer.create 1024 in
203
+
let writer = Bytes.Writer.of_buffer buf in
204
+
Html5rw.to_writer result writer;
205
+
Bytes.Writer.write_eod writer;
206
+
let html = Buffer.contents buf
207
+
]}
208
+
209
+
@param pretty If true, format with indentation (default: true)
210
+
@param indent_size Number of spaces per indent level (default: 2)
211
+
*)
212
+
val to_writer : ?pretty:bool -> ?indent_size:int -> t -> Bytesrw.Bytes.Writer.t -> unit
213
+
214
+
(** Serialize the DOM tree to a string.
215
+
216
+
Convenience function when the output fits in memory.
217
+
218
+
@param pretty If true, format with indentation (default: true)
219
+
@param indent_size Number of spaces per indent level (default: 2)
220
+
*)
221
+
val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
222
+
223
+
(** Extract text content from the DOM tree.
224
+
225
+
@param separator String to insert between text nodes (default: " ")
226
+
@param strip If true, trim whitespace (default: true)
227
+
*)
228
+
val to_text : ?separator:string -> ?strip:bool -> t -> string
229
+
230
+
(** Serialize to html5lib test format (for testing). *)
231
+
val to_test_format : t -> string
232
+
233
+
(** {1 Result Accessors} *)
234
+
235
+
(** Get the root node of the parsed document. *)
236
+
val root : t -> node
237
+
238
+
(** Get parse errors (if error collection was enabled). *)
239
+
val errors : t -> parse_error list
240
+
241
+
(** Get the detected encoding (if parsed from bytes). *)
242
+
val encoding : t -> encoding option
243
+
244
+
(** {1 DOM Utilities}
245
+
246
+
Common DOM operations are available directly. For the full API,
247
+
see the {!Dom} module.
248
+
*)
249
+
250
+
(** Create an element node.
251
+
@param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
252
+
@param attrs List of (name, value) attribute pairs
253
+
*)
254
+
val create_element : string -> ?namespace:string option -> ?attrs:(string * string) list -> unit -> node
255
+
256
+
(** Create a text node. *)
257
+
val create_text : string -> node
258
+
259
+
(** Create a comment node. *)
260
+
val create_comment : string -> node
261
+
262
+
(** Create an empty document node. *)
263
+
val create_document : unit -> node
264
+
265
+
(** Create a document fragment node. *)
266
+
val create_document_fragment : unit -> node
267
+
268
+
(** Create a doctype node. *)
269
+
val create_doctype : ?name:string -> ?public_id:string -> ?system_id:string -> unit -> node
270
+
271
+
(** Append a child node to a parent. *)
272
+
val append_child : node -> node -> unit
273
+
274
+
(** Insert a node before a reference node. *)
275
+
val insert_before : node -> node -> node -> unit
276
+
277
+
(** Remove a child node from its parent. *)
278
+
val remove_child : node -> node -> unit
279
+
280
+
(** Get an attribute value. *)
281
+
val get_attr : node -> string -> string option
282
+
283
+
(** Set an attribute value. *)
284
+
val set_attr : node -> string -> string -> unit
285
+
286
+
(** Check if a node has an attribute. *)
287
+
val has_attr : node -> string -> bool
288
+
289
+
(** Get all descendant nodes. *)
290
+
val descendants : node -> node list
291
+
292
+
(** Get all ancestor nodes (from parent to root). *)
293
+
val ancestors : node -> node list
294
+
295
+
(** Get text content of a node and its descendants. *)
296
+
val get_text_content : node -> string
297
+
298
+
(** Clone a node.
299
+
@param deep If true, also clone descendants (default: false)
300
+
*)
301
+
val clone : ?deep:bool -> node -> node
302
+
303
+
(** {1 Node Predicates} *)
304
+
305
+
(** Test if a node is an element. *)
306
+
val is_element : node -> bool
307
+
308
+
(** Test if a node is a text node. *)
309
+
val is_text : node -> bool
310
+
311
+
(** Test if a node is a comment node. *)
312
+
val is_comment : node -> bool
313
+
314
+
(** Test if a node is a document node. *)
315
+
val is_document : node -> bool
316
+
317
+
(** Test if a node is a document fragment. *)
318
+
val is_document_fragment : node -> bool
319
+
320
+
(** Test if a node is a doctype node. *)
321
+
val is_doctype : node -> bool
322
+
323
+
(** Test if a node has children. *)
324
+
val has_children : node -> bool
+306
lib/parser/constants.ml
···
1
+
(* HTML5 spec constants *)
2
+
3
+
(* Void elements - no end tag allowed *)
4
+
let void_elements = [
5
+
"area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
6
+
"link"; "meta"; "source"; "track"; "wbr"
7
+
]
8
+
9
+
(* Raw text elements - content is raw text *)
10
+
let raw_text_elements = ["script"; "style"]
11
+
12
+
(* Escapable raw text elements *)
13
+
let escapable_raw_text_elements = ["textarea"; "title"]
14
+
15
+
(* Formatting elements for adoption agency *)
16
+
let formatting_elements = [
17
+
"a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small";
18
+
"strike"; "strong"; "tt"; "u"
19
+
]
20
+
21
+
(* Special elements *)
22
+
let special_elements = [
23
+
"address"; "applet"; "area"; "article"; "aside"; "base"; "basefont";
24
+
"bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center";
25
+
"col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed";
26
+
"fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset";
27
+
"h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr";
28
+
"html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing";
29
+
"main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes";
30
+
"noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script";
31
+
"search"; "section"; "select"; "source"; "style"; "summary"; "table";
32
+
"tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title";
33
+
"tr"; "track"; "ul"; "wbr"; "xmp"
34
+
]
35
+
36
+
(* Heading elements *)
37
+
let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"]
38
+
39
+
(* Implied end tag elements *)
40
+
let implied_end_tags = [
41
+
"dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"
42
+
]
43
+
44
+
(* Thoroughly implied end tags *)
45
+
let thoroughly_implied_end_tags = [
46
+
"caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p";
47
+
"rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"
48
+
]
49
+
50
+
(* Scope elements for various scope checks *)
51
+
let default_scope = [
52
+
"applet"; "caption"; "html"; "table"; "td"; "th"; "marquee"; "object"; "template"
53
+
]
54
+
55
+
let list_item_scope = default_scope @ ["ol"; "ul"]
56
+
57
+
let button_scope = default_scope @ ["button"]
58
+
59
+
let table_scope = ["html"; "table"; "template"]
60
+
61
+
let select_scope_exclude = ["optgroup"; "option"]
62
+
63
+
(* MathML text integration points *)
64
+
let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
65
+
66
+
(* MathML attribute adjustments *)
67
+
let mathml_attr_adjustments = [
68
+
("definitionurl", "definitionURL")
69
+
]
70
+
71
+
let adjust_mathml_attrs attrs =
72
+
List.map (fun (k, v) ->
73
+
match List.assoc_opt (String.lowercase_ascii k) mathml_attr_adjustments with
74
+
| Some adjusted_k -> (adjusted_k, v)
75
+
| None -> (k, v)
76
+
) attrs
77
+
78
+
(* SVG HTML integration points *)
79
+
let svg_html_integration = ["foreignObject"; "desc"; "title"]
80
+
81
+
(* SVG tag name adjustments *)
82
+
let svg_tag_adjustments = [
83
+
("altglyph", "altGlyph");
84
+
("altglyphdef", "altGlyphDef");
85
+
("altglyphitem", "altGlyphItem");
86
+
("animatecolor", "animateColor");
87
+
("animatemotion", "animateMotion");
88
+
("animatetransform", "animateTransform");
89
+
("clippath", "clipPath");
90
+
("feblend", "feBlend");
91
+
("fecolormatrix", "feColorMatrix");
92
+
("fecomponenttransfer", "feComponentTransfer");
93
+
("fecomposite", "feComposite");
94
+
("feconvolvematrix", "feConvolveMatrix");
95
+
("fediffuselighting", "feDiffuseLighting");
96
+
("fedisplacementmap", "feDisplacementMap");
97
+
("fedistantlight", "feDistantLight");
98
+
("fedropshadow", "feDropShadow");
99
+
("feflood", "feFlood");
100
+
("fefunca", "feFuncA");
101
+
("fefuncb", "feFuncB");
102
+
("fefuncg", "feFuncG");
103
+
("fefuncr", "feFuncR");
104
+
("fegaussianblur", "feGaussianBlur");
105
+
("feimage", "feImage");
106
+
("femerge", "feMerge");
107
+
("femergenode", "feMergeNode");
108
+
("femorphology", "feMorphology");
109
+
("feoffset", "feOffset");
110
+
("fepointlight", "fePointLight");
111
+
("fespecularlighting", "feSpecularLighting");
112
+
("fespotlight", "feSpotLight");
113
+
("fetile", "feTile");
114
+
("feturbulence", "feTurbulence");
115
+
("foreignobject", "foreignObject");
116
+
("glyphref", "glyphRef");
117
+
("lineargradient", "linearGradient");
118
+
("radialgradient", "radialGradient");
119
+
("textpath", "textPath");
120
+
]
121
+
122
+
(* SVG attribute adjustments *)
123
+
let svg_attr_adjustments = [
124
+
("attributename", "attributeName");
125
+
("attributetype", "attributeType");
126
+
("basefrequency", "baseFrequency");
127
+
("baseprofile", "baseProfile");
128
+
("calcmode", "calcMode");
129
+
("clippathunits", "clipPathUnits");
130
+
("diffuseconstant", "diffuseConstant");
131
+
("edgemode", "edgeMode");
132
+
("filterunits", "filterUnits");
133
+
("glyphref", "glyphRef");
134
+
("gradienttransform", "gradientTransform");
135
+
("gradientunits", "gradientUnits");
136
+
("kernelmatrix", "kernelMatrix");
137
+
("kernelunitlength", "kernelUnitLength");
138
+
("keypoints", "keyPoints");
139
+
("keysplines", "keySplines");
140
+
("keytimes", "keyTimes");
141
+
("lengthadjust", "lengthAdjust");
142
+
("limitingconeangle", "limitingConeAngle");
143
+
("markerheight", "markerHeight");
144
+
("markerunits", "markerUnits");
145
+
("markerwidth", "markerWidth");
146
+
("maskcontentunits", "maskContentUnits");
147
+
("maskunits", "maskUnits");
148
+
("numoctaves", "numOctaves");
149
+
("pathlength", "pathLength");
150
+
("patterncontentunits", "patternContentUnits");
151
+
("patterntransform", "patternTransform");
152
+
("patternunits", "patternUnits");
153
+
("pointsatx", "pointsAtX");
154
+
("pointsaty", "pointsAtY");
155
+
("pointsatz", "pointsAtZ");
156
+
("preservealpha", "preserveAlpha");
157
+
("preserveaspectratio", "preserveAspectRatio");
158
+
("primitiveunits", "primitiveUnits");
159
+
("refx", "refX");
160
+
("refy", "refY");
161
+
("repeatcount", "repeatCount");
162
+
("repeatdur", "repeatDur");
163
+
("requiredextensions", "requiredExtensions");
164
+
("requiredfeatures", "requiredFeatures");
165
+
("specularconstant", "specularConstant");
166
+
("specularexponent", "specularExponent");
167
+
("spreadmethod", "spreadMethod");
168
+
("startoffset", "startOffset");
169
+
("stddeviation", "stdDeviation");
170
+
("stitchtiles", "stitchTiles");
171
+
("surfacescale", "surfaceScale");
172
+
("systemlanguage", "systemLanguage");
173
+
("tablevalues", "tableValues");
174
+
("targetx", "targetX");
175
+
("targety", "targetY");
176
+
("textlength", "textLength");
177
+
("viewbox", "viewBox");
178
+
("viewtarget", "viewTarget");
179
+
("xchannelselector", "xChannelSelector");
180
+
("ychannelselector", "yChannelSelector");
181
+
("zoomandpan", "zoomAndPan");
182
+
]
183
+
184
+
(* Foreign attribute adjustments *)
185
+
let foreign_attr_adjustments = [
186
+
("xlink:actuate", ("xlink", "actuate", "http://www.w3.org/1999/xlink"));
187
+
("xlink:arcrole", ("xlink", "arcrole", "http://www.w3.org/1999/xlink"));
188
+
("xlink:href", ("xlink", "href", "http://www.w3.org/1999/xlink"));
189
+
("xlink:role", ("xlink", "role", "http://www.w3.org/1999/xlink"));
190
+
("xlink:show", ("xlink", "show", "http://www.w3.org/1999/xlink"));
191
+
("xlink:title", ("xlink", "title", "http://www.w3.org/1999/xlink"));
192
+
("xlink:type", ("xlink", "type", "http://www.w3.org/1999/xlink"));
193
+
("xml:lang", ("xml", "lang", "http://www.w3.org/XML/1998/namespace"));
194
+
("xml:space", ("xml", "space", "http://www.w3.org/XML/1998/namespace"));
195
+
("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
196
+
("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
197
+
]
198
+
199
+
(* Quirks mode detection *)
200
+
let quirky_public_matches = [
201
+
"-//w3o//dtd w3 html strict 3.0//en//";
202
+
"-/w3c/dtd html 4.0 transitional/en";
203
+
"html"
204
+
]
205
+
206
+
let quirky_public_prefixes = [
207
+
"+//silmaril//dtd html pro v0r11 19970101//";
208
+
"-//as//dtd html 3.0 aswedit + extensions//";
209
+
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//";
210
+
"-//ietf//dtd html 2.0 level 1//";
211
+
"-//ietf//dtd html 2.0 level 2//";
212
+
"-//ietf//dtd html 2.0 strict level 1//";
213
+
"-//ietf//dtd html 2.0 strict level 2//";
214
+
"-//ietf//dtd html 2.0 strict//";
215
+
"-//ietf//dtd html 2.0//";
216
+
"-//ietf//dtd html 2.1e//";
217
+
"-//ietf//dtd html 3.0//";
218
+
"-//ietf//dtd html 3.2 final//";
219
+
"-//ietf//dtd html 3.2//";
220
+
"-//ietf//dtd html 3//";
221
+
"-//ietf//dtd html level 0//";
222
+
"-//ietf//dtd html level 1//";
223
+
"-//ietf//dtd html level 2//";
224
+
"-//ietf//dtd html level 3//";
225
+
"-//ietf//dtd html strict level 0//";
226
+
"-//ietf//dtd html strict level 1//";
227
+
"-//ietf//dtd html strict level 2//";
228
+
"-//ietf//dtd html strict level 3//";
229
+
"-//ietf//dtd html strict//";
230
+
"-//ietf//dtd html//";
231
+
"-//metrius//dtd metrius presentational//";
232
+
"-//microsoft//dtd internet explorer 2.0 html strict//";
233
+
"-//microsoft//dtd internet explorer 2.0 html//";
234
+
"-//microsoft//dtd internet explorer 2.0 tables//";
235
+
"-//microsoft//dtd internet explorer 3.0 html strict//";
236
+
"-//microsoft//dtd internet explorer 3.0 html//";
237
+
"-//microsoft//dtd internet explorer 3.0 tables//";
238
+
"-//netscape comm. corp.//dtd html//";
239
+
"-//netscape comm. corp.//dtd strict html//";
240
+
"-//o'reilly and associates//dtd html 2.0//";
241
+
"-//o'reilly and associates//dtd html extended 1.0//";
242
+
"-//o'reilly and associates//dtd html extended relaxed 1.0//";
243
+
"-//sq//dtd html 2.0 hotmetal + extensions//";
244
+
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//";
245
+
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//";
246
+
"-//spyglass//dtd html 2.0 extended//";
247
+
"-//sun microsystems corp.//dtd hotjava html//";
248
+
"-//sun microsystems corp.//dtd hotjava strict html//";
249
+
"-//w3c//dtd html 3 1995-03-24//";
250
+
"-//w3c//dtd html 3.2 draft//";
251
+
"-//w3c//dtd html 3.2 final//";
252
+
"-//w3c//dtd html 3.2//";
253
+
"-//w3c//dtd html 3.2s draft//";
254
+
"-//w3c//dtd html 4.0 frameset//";
255
+
"-//w3c//dtd html 4.0 transitional//";
256
+
"-//w3c//dtd html experimental 19960712//";
257
+
"-//w3c//dtd html experimental 970421//";
258
+
"-//w3c//dtd w3 html//";
259
+
"-//w3o//dtd w3 html 3.0//";
260
+
"-//webtechs//dtd mozilla html 2.0//";
261
+
"-//webtechs//dtd mozilla html//";
262
+
]
263
+
264
+
let limited_quirky_public_prefixes = [
265
+
"-//w3c//dtd xhtml 1.0 frameset//";
266
+
"-//w3c//dtd xhtml 1.0 transitional//";
267
+
]
268
+
269
+
let html4_public_prefixes = [
270
+
"-//w3c//dtd html 4.01 frameset//";
271
+
"-//w3c//dtd html 4.01 transitional//";
272
+
]
273
+
274
+
let quirky_system_matches = [
275
+
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
276
+
]
277
+
278
+
(* Helper functions *)
279
+
let is_void name = List.mem name void_elements
280
+
let is_formatting name = List.mem name formatting_elements
281
+
let is_special name = List.mem name special_elements
282
+
let is_heading name = List.mem name heading_elements
283
+
284
+
let adjust_svg_tag_name name =
285
+
match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with
286
+
| Some adjusted -> adjusted
287
+
| None -> name
288
+
289
+
let adjust_svg_attrs attrs =
290
+
List.map (fun (name, value) ->
291
+
let adjusted_name =
292
+
match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with
293
+
| Some n -> n
294
+
| None -> name
295
+
in
296
+
(adjusted_name, value)
297
+
) attrs
298
+
299
+
let adjust_foreign_attrs attrs =
300
+
List.map (fun (name, value) ->
301
+
match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with
302
+
| Some (prefix, local, _ns) ->
303
+
if prefix = "" then (local, value)
304
+
else (prefix ^ ":" ^ local, value)
305
+
| None -> (name, value)
306
+
) attrs
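(* Illustrative behaviour of the adjustment helpers above, with made-up
   attribute values:

     adjust_svg_tag_name  "clippath"                     (* "clipPath" *)
     adjust_svg_attrs     [ ("viewbox", "0 0 30 10") ]   (* [("viewBox", "0 0 30 10")] *)
     adjust_mathml_attrs  [ ("definitionurl", "#fn") ]   (* [("definitionURL", "#fn")] *)
     adjust_foreign_attrs [ ("xml:lang", "en") ]         (* [("xml:lang", "en")], xml namespace *)
*)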
+4
lib/parser/dune
+36
lib/parser/html5rw_parser.ml
···
1
+
(* html5rw.parser - HTML5 parser with bytesrw-only API *)
2
+
3
+
module Dom = Html5rw_dom
4
+
module Tokenizer = Html5rw_tokenizer
5
+
module Encoding = Html5rw_encoding
6
+
module Constants = Constants
7
+
module Insertion_mode = Insertion_mode
8
+
module Tree_builder = Tree_builder
9
+
10
+
type parse_error = Parser.parse_error
11
+
type fragment_context = Parser.fragment_context
12
+
type t = Parser.t
13
+
14
+
(* parse_error accessors *)
15
+
let error_code (e : parse_error) = e.Tree_builder.code
16
+
let error_line (e : parse_error) = e.Tree_builder.line
17
+
let error_column (e : parse_error) = e.Tree_builder.column
18
+
19
+
(* fragment_context constructor and accessors *)
20
+
let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context =
21
+
{ Tree_builder.tag_name; namespace }
22
+
23
+
let fragment_context_tag (ctx : fragment_context) = ctx.Tree_builder.tag_name
24
+
let fragment_context_namespace (ctx : fragment_context) = ctx.Tree_builder.namespace
25
+
26
+
let parse = Parser.parse
27
+
let parse_bytes = Parser.parse_bytes
28
+
let query = Parser.query
29
+
let to_writer = Parser.to_writer
30
+
let to_string = Parser.to_string
31
+
let to_text = Parser.to_text
32
+
let to_test_format = Parser.to_test_format
33
+
34
+
let root t = t.Parser.root
35
+
let errors t = t.Parser.errors
36
+
let encoding t = t.Parser.encoding
+207
lib/parser/html5rw_parser.mli
···
1
+
(** HTML5 Parser
2
+
3
+
This module provides the core HTML5 parsing functionality implementing
4
+
the WHATWG parsing specification. It handles tokenization, tree construction,
5
+
error recovery, and produces a DOM tree.
6
+
7
+
For most uses, prefer the top-level {!Html5rw} module which re-exports
8
+
these functions with a simpler interface.
9
+
10
+
{2 Parsing Algorithm}
11
+
12
+
The HTML5 parsing algorithm is defined by the WHATWG specification and
13
+
consists of several phases:
14
+
15
+
1. {b Encoding sniffing}: Detect character encoding from BOM, meta tags,
16
+
or transport layer hints
17
+
2. {b Tokenization}: Convert the input stream into a sequence of tokens
18
+
(start tags, end tags, character data, comments, etc.)
19
+
3. {b Tree construction}: Build the DOM tree using a state machine with
20
+
multiple insertion modes
21
+
22
+
The algorithm includes extensive error recovery to handle malformed HTML
23
+
in a consistent way across browsers.
24
+
25
+
@see <https://html.spec.whatwg.org/multipage/parsing.html>
26
+
The WHATWG HTML Parsing specification
27
+
*)
28
+
29
+
(** {1 Sub-modules} *)
30
+
31
+
module Dom = Html5rw_dom
32
+
module Tokenizer = Html5rw_tokenizer
33
+
module Encoding = Html5rw_encoding
34
+
module Constants : sig
35
+
val void_elements : string list
36
+
val formatting_elements : string list
37
+
val special_elements : string list
38
+
end
39
+
module Insertion_mode : sig
40
+
type t
41
+
end
42
+
module Tree_builder : sig
43
+
type t
44
+
end
45
+
46
+
(** {1 Types} *)
47
+
48
+
(** A parse error encountered during parsing.
49
+
50
+
HTML5 parsing never fails - it always produces a DOM tree. However,
51
+
the specification defines many error conditions that conformance
52
+
checkers should report. Error collection is optional and disabled
53
+
by default for performance.
54
+
55
+
Error codes follow the WHATWG specification naming convention,
56
+
e.g., "unexpected-null-character", "eof-in-tag".
57
+
58
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
59
+
The list of HTML5 parse errors
60
+
*)
61
+
type parse_error
62
+
63
+
(** Get the error code string.
64
+
65
+
Error codes are lowercase with hyphens, matching the WHATWG spec names
66
+
like "unexpected-null-character" or "eof-before-tag-name".
67
+
*)
68
+
val error_code : parse_error -> string
69
+
70
+
(** Get the line number where the error occurred (1-indexed). *)
71
+
val error_line : parse_error -> int
72
+
73
+
(** Get the column number where the error occurred (1-indexed). *)
74
+
val error_column : parse_error -> int
75
+
76
+
(** Context element for HTML fragment parsing.
77
+
78
+
When parsing an HTML fragment (innerHTML), you need to specify the
79
+
context element that would contain the fragment. This affects how
80
+
the parser handles certain elements.
81
+
82
+
For example, parsing [<td>] as a fragment of a [<tr>] works differently
83
+
than parsing it as a fragment of a [<div>].
84
+
85
+
@see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
86
+
The HTML fragment parsing algorithm
87
+
*)
88
+
type fragment_context
89
+
90
+
(** Create a fragment parsing context.
91
+
92
+
@param tag_name The tag name of the context element (e.g., "div", "tr")
93
+
@param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]
94
+
95
+
{[
96
+
(* Parse as innerHTML of a table row *)
97
+
let ctx = make_fragment_context ~tag_name:"tr" ()
98
+
99
+
(* Parse as innerHTML of an SVG element *)
100
+
let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
101
+
]}
102
+
*)
103
+
val make_fragment_context : tag_name:string -> ?namespace:string option ->
104
+
unit -> fragment_context
105
+
106
+
(** Get the tag name of a fragment context. *)
107
+
val fragment_context_tag : fragment_context -> string
108
+
109
+
(** Get the namespace of a fragment context. *)
110
+
val fragment_context_namespace : fragment_context -> string option
111
+
112
+
(** Result of parsing an HTML document or fragment.
113
+
114
+
Contains the parsed DOM tree, any errors encountered (if error
115
+
collection was enabled), and the detected encoding (for byte input).
116
+
*)
117
+
type t
118
+
119
+
(** {1 Parsing Functions} *)
120
+
121
+
val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
122
+
Bytesrw.Bytes.Reader.t -> t
123
+
(** Parse HTML from a byte stream reader.
124
+
125
+
This is the primary parsing function. The input must be valid UTF-8
126
+
(or it will be converted from the detected encoding when using {!parse_bytes}).
127
+
128
+
@param collect_errors If [true], collect parse errors (default: [false])
129
+
@param fragment_context Context for fragment parsing (innerHTML)
130
+
131
+
{[
132
+
open Bytesrw
133
+
let reader = Bytes.Reader.of_string "<p>Hello</p>" in
134
+
let result = parse reader
135
+
]}
136
+
*)
137
+
138
+
val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
139
+
?fragment_context:fragment_context -> bytes -> t
140
+
(** Parse HTML bytes with automatic encoding detection.
141
+
142
+
Implements the WHATWG encoding sniffing algorithm:
143
+
1. Check for BOM (UTF-8, UTF-16LE, UTF-16BE)
144
+
2. Prescan for [<meta charset>] declaration
145
+
3. Use transport encoding hint if provided
146
+
4. Fall back to UTF-8
147
+
148
+
@param collect_errors If [true], collect parse errors (default: [false])
149
+
@param transport_encoding Encoding from HTTP Content-Type header
150
+
@param fragment_context Context for fragment parsing (innerHTML)
151
+
*)
152
+
153
+
(** {1 Result Accessors} *)
154
+
155
+
val root : t -> Dom.node
156
+
(** Get the root node of the parsed document.
157
+
158
+
For full document parsing, this is a document node.
159
+
For fragment parsing, this is a document fragment node.
160
+
*)
161
+
162
+
val errors : t -> parse_error list
163
+
(** Get parse errors (empty if error collection was disabled). *)
164
+
165
+
val encoding : t -> Encoding.encoding option
166
+
(** Get the detected encoding (only set when using {!parse_bytes}). *)
167
+
168
+
(** {1 Querying} *)
169
+
170
+
val query : t -> string -> Dom.node list
171
+
(** Query the DOM with a CSS selector.
172
+
173
+
@raise Html5rw_selector.Selector_error if the selector is invalid
174
+
175
+
See {!Html5rw_selector} for the supported selector syntax.
176
+
*)
177
+
178
+
(** {1 Serialization} *)
179
+
180
+
val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
181
+
Bytesrw.Bytes.Writer.t -> unit
182
+
(** Serialize the DOM tree to a byte stream writer.
183
+
184
+
@param pretty If [true], format with indentation (default: [true])
185
+
@param indent_size Spaces per indent level (default: [2])
186
+
*)
187
+
188
+
val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
189
+
(** Serialize the DOM tree to a string.
190
+
191
+
@param pretty If [true], format with indentation (default: [true])
192
+
@param indent_size Spaces per indent level (default: [2])
193
+
*)
194
+
195
+
val to_text : ?separator:string -> ?strip:bool -> t -> string
196
+
(** Extract text content from the DOM tree.
197
+
198
+
@param separator String between text nodes (default: [" "])
199
+
@param strip If [true], trim whitespace (default: [true])
200
+
*)
201
+
202
+
val to_test_format : t -> string
203
+
(** Serialize to html5lib test format.
204
+
205
+
This format is used by the html5lib test suite and shows the tree
206
+
structure with indentation and node type prefixes.
207
+
*)
+51
lib/parser/insertion_mode.ml
···
1
+
(* HTML5 tree builder insertion modes *)
2
+
3
+
type t =
4
+
| Initial
5
+
| Before_html
6
+
| Before_head
7
+
| In_head
8
+
| In_head_noscript
9
+
| After_head
10
+
| In_body
11
+
| Text
12
+
| In_table
13
+
| In_table_text
14
+
| In_caption
15
+
| In_column_group
16
+
| In_table_body
17
+
| In_row
18
+
| In_cell
19
+
| In_select
20
+
| In_select_in_table
21
+
| In_template
22
+
| After_body
23
+
| In_frameset
24
+
| After_frameset
25
+
| After_after_body
26
+
| After_after_frameset
27
+
28
+
let to_string = function
29
+
| Initial -> "initial"
30
+
| Before_html -> "before html"
31
+
| Before_head -> "before head"
32
+
| In_head -> "in head"
33
+
| In_head_noscript -> "in head noscript"
34
+
| After_head -> "after head"
35
+
| In_body -> "in body"
36
+
| Text -> "text"
37
+
| In_table -> "in table"
38
+
| In_table_text -> "in table text"
39
+
| In_caption -> "in caption"
40
+
| In_column_group -> "in column group"
41
+
| In_table_body -> "in table body"
42
+
| In_row -> "in row"
43
+
| In_cell -> "in cell"
44
+
| In_select -> "in select"
45
+
| In_select_in_table -> "in select in table"
46
+
| In_template -> "in template"
47
+
| After_body -> "after body"
48
+
| In_frameset -> "in frameset"
49
+
| After_frameset -> "after frameset"
50
+
| After_after_body -> "after after body"
51
+
| After_after_frameset -> "after after frameset"
+107
lib/parser/parser.ml
···
1
+
(* Main parser entry point - bytesrw-only API *)
2
+
3
+
open Bytesrw
4
+
5
+
module Dom = Html5rw_dom
6
+
module Tokenizer = Html5rw_tokenizer
7
+
module Encoding = Html5rw_encoding
8
+
9
+
type parse_error = Tree_builder.parse_error
10
+
11
+
type fragment_context = Tree_builder.fragment_context
12
+
13
+
type t = {
14
+
root : Dom.node;
15
+
errors : parse_error list;
16
+
encoding : Encoding.encoding option;
17
+
}
18
+
19
+
(* Token sink that feeds tokens to tree builder *)
20
+
module TreeBuilderSink = struct
21
+
type t = Tree_builder.t
22
+
23
+
let process tb token =
24
+
Tree_builder.process_token tb token;
25
+
(* Check if we need to switch tokenizer state based on current element *)
26
+
(* Only switch for HTML namespace elements - SVG/MathML use different rules *)
27
+
match Tree_builder.current_node tb with
28
+
| Some node when node.Dom.namespace = None || node.Dom.namespace = Some "html" ->
29
+
let name = node.Dom.name in
30
+
if List.mem name ["textarea"; "title"] then
31
+
`SwitchTo Tokenizer.State.Rcdata
32
+
else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
33
+
`SwitchTo Tokenizer.State.Rawtext
34
+
else if name = "script" then
35
+
`SwitchTo Tokenizer.State.Script_data
36
+
else if name = "plaintext" then
37
+
`SwitchTo Tokenizer.State.Plaintext
38
+
else
39
+
`Continue
40
+
| _ -> `Continue
41
+
42
+
let adjusted_current_node_in_html_namespace tb =
43
+
Tree_builder.adjusted_current_node_in_html_namespace tb
44
+
end
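(* Illustrative consequence of the switching above (hypothetical markup): once
   a <script> start tag has been processed, the sink returns
   [`SwitchTo Tokenizer.State.Script_data], so in
   "<script>if (a < b) f()</script>" the "< b" stays script text instead of
   opening a new tag. *)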
45
+
46
+
(* Core parsing function that takes a Bytes.Reader.t *)
47
+
let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
48
+
let tb = Tree_builder.create ~collect_errors ?fragment_context () in
49
+
let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in
50
+
51
+
(* Set tokenizer state for fragment parsing *)
52
+
(* Note: We do NOT set last_start_tag because in fragment parsing, no start tag has been
53
+
emitted. This means end tags won't match as "appropriate end tags" and will be treated
54
+
as raw text in RCDATA/RAWTEXT/Script modes. *)
55
+
(* Only change tokenizer state for HTML namespace contexts - foreign contexts use Data state *)
56
+
(match fragment_context with
57
+
| Some ctx when ctx.namespace = None || ctx.namespace = Some "html" ->
58
+
let name = String.lowercase_ascii ctx.tag_name in
59
+
if List.mem name ["title"; "textarea"] then
60
+
Tokenizer.set_state tokenizer Tokenizer.State.Rcdata
61
+
else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
62
+
Tokenizer.set_state tokenizer Tokenizer.State.Rawtext
63
+
else if name = "script" then
64
+
Tokenizer.set_state tokenizer Tokenizer.State.Script_data
65
+
else if name = "plaintext" then
66
+
Tokenizer.set_state tokenizer Tokenizer.State.Plaintext
67
+
| _ -> ());
68
+
69
+
Tokenizer.run tokenizer (module TreeBuilderSink) reader;
70
+
71
+
let root = Tree_builder.finish tb in
72
+
let tokenizer_errors = Tokenizer.get_errors tokenizer in
73
+
let tree_errors = Tree_builder.get_errors tb in
74
+
let all_errors = List.map (fun e ->
75
+
{ Tree_builder.code = e.Tokenizer.Errors.code;
76
+
line = e.Tokenizer.Errors.line;
77
+
column = e.Tokenizer.Errors.column }
78
+
) tokenizer_errors @ tree_errors in
79
+
80
+
{ root; errors = all_errors; encoding = None }
81
+
82
+
(* Parse raw bytes with automatic encoding detection *)
83
+
let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data =
84
+
let (html, enc) = Encoding.decode data ?transport_encoding () in
85
+
let reader = Bytes.Reader.of_string html in
86
+
let result = parse ~collect_errors ?fragment_context reader in
87
+
{ result with encoding = Some enc }
88
+
89
+
let query t selector =
90
+
Html5rw_selector.query t.root selector
91
+
92
+
(* Serialize to a Bytes.Writer.t *)
93
+
let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
94
+
let html = Dom.to_html ~pretty ~indent_size t.root in
95
+
Bytes.Writer.write_string writer html
96
+
97
+
(* Serialize to string (convenience for when result fits in memory) *)
98
+
let to_string ?(pretty=true) ?(indent_size=2) t =
99
+
Dom.to_html ~pretty ~indent_size t.root
100
+
101
+
(* Extract text content *)
102
+
let to_text ?(separator=" ") ?(strip=true) t =
103
+
Dom.to_text ~separator ~strip t.root
104
+
105
+
(* For testing *)
106
+
let to_test_format t =
107
+
Dom.to_test_format t.root
+2520
lib/parser/tree_builder.ml
···
1
+
(* HTML5 Tree Builder *)
2
+
3
+
module Dom = Html5rw_dom
4
+
module Token = Html5rw_tokenizer.Token
5
+
module State = Html5rw_tokenizer.State
6
+
7
+
type fragment_context = {
8
+
tag_name : string;
9
+
namespace : string option;
10
+
}
11
+
12
+
type formatting_entry =
13
+
| Marker
14
+
| Entry of {
15
+
name : string;
16
+
attrs : (string * string) list;
17
+
node : Dom.node;
18
+
}
19
+
20
+
type parse_error = {
21
+
code : string;
22
+
line : int;
23
+
column : int;
24
+
}
25
+
26
+
type t = {
27
+
mutable document : Dom.node;
28
+
mutable mode : Insertion_mode.t;
29
+
mutable original_mode : Insertion_mode.t option;
30
+
mutable open_elements : Dom.node list;
31
+
mutable active_formatting : formatting_entry list;
32
+
mutable head_element : Dom.node option;
33
+
mutable form_element : Dom.node option;
34
+
mutable frameset_ok : bool;
35
+
mutable ignore_lf : bool;
36
+
mutable foster_parenting : bool;
37
+
mutable pending_table_chars : string list;
38
+
mutable template_modes : Insertion_mode.t list;
39
+
mutable quirks_mode : Dom.quirks_mode;
40
+
mutable errors : parse_error list;
41
+
collect_errors : bool;
42
+
fragment_context : fragment_context option;
43
+
mutable fragment_context_element : Dom.node option;
44
+
iframe_srcdoc : bool;
45
+
}
46
+
47
+
let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
48
+
let is_fragment = fragment_context <> None in
49
+
let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in
50
+
let t = {
51
+
document = doc;
52
+
mode = Insertion_mode.Initial;
53
+
original_mode = None;
54
+
open_elements = [];
55
+
active_formatting = [];
56
+
head_element = None;
57
+
form_element = None;
58
+
frameset_ok = true;
59
+
ignore_lf = false;
60
+
foster_parenting = false;
61
+
pending_table_chars = [];
62
+
template_modes = [];
63
+
quirks_mode = Dom.No_quirks;
64
+
errors = [];
65
+
collect_errors;
66
+
fragment_context;
67
+
fragment_context_element = None;
68
+
iframe_srcdoc;
69
+
} in
70
+
(* Initialize fragment parsing *)
71
+
(match fragment_context with
72
+
| Some ctx ->
73
+
let name = String.lowercase_ascii ctx.tag_name in
74
+
let ns = ctx.namespace in
75
+
(* Create html root *)
76
+
let root = Dom.create_element "html" () in
77
+
Dom.append_child doc root;
78
+
t.open_elements <- [root];
79
+
(* For foreign content contexts, create context element *)
80
+
(match ns with
81
+
| Some namespace when namespace <> "html" ->
82
+
let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in
83
+
Dom.append_child root context_elem;
84
+
t.open_elements <- [context_elem; root];
85
+
t.fragment_context_element <- Some context_elem
86
+
| _ -> ());
87
+
(* Set initial mode based on context *)
88
+
t.mode <- (
89
+
if name = "html" then Insertion_mode.Before_head
90
+
else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then
91
+
Insertion_mode.In_table_body
92
+
else if name = "tr" && (ns = None || ns = Some "html") then
93
+
Insertion_mode.In_row
94
+
else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then
95
+
Insertion_mode.In_cell
96
+
else if name = "caption" && (ns = None || ns = Some "html") then
97
+
Insertion_mode.In_caption
98
+
else if name = "colgroup" && (ns = None || ns = Some "html") then
99
+
Insertion_mode.In_column_group
100
+
else if name = "table" && (ns = None || ns = Some "html") then
101
+
Insertion_mode.In_table
102
+
else if name = "template" && (ns = None || ns = Some "html") then begin
103
+
t.template_modes <- [Insertion_mode.In_template];
104
+
Insertion_mode.In_template
105
+
end
106
+
else
107
+
Insertion_mode.In_body
108
+
);
109
+
t.frameset_ok <- false
110
+
| None -> ());
111
+
t
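(* Illustrative: create ~fragment_context:{ tag_name = "tr"; namespace = None } ()
   starts in Insertion_mode.In_row, so a hypothetical fragment "<td>x</td>" is
   parsed into table cells instead of being dropped. *)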
112
+
113
+
(* Error handling *)
114
+
let parse_error t code =
115
+
if t.collect_errors then
116
+
t.errors <- { code; line = 0; column = 0 } :: t.errors
117
+
118
+
(* Stack helpers *)
119
+
let current_node t =
120
+
match t.open_elements with
121
+
| [] -> None
122
+
| x :: _ -> Some x
123
+
124
+
let adjusted_current_node t =
125
+
match t.fragment_context, t.open_elements with
126
+
| Some ctx, [_] ->
127
+
(* Fragment case: use context element info *)
128
+
Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ())
129
+
| _, x :: _ -> Some x
130
+
| _, [] -> None
131
+
132
+
let is_in_html_namespace node =
133
+
node.Dom.namespace = None || node.Dom.namespace = Some "html"
134
+
135
+
(* Namespace-aware check for "special" elements per WHATWG spec *)
136
+
let is_special_element node =
137
+
let name = String.lowercase_ascii node.Dom.name in
138
+
match node.Dom.namespace with
139
+
| None | Some "html" -> Constants.is_special name
140
+
| Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"]
141
+
| Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"]
142
+
| _ -> false
143
+
144
+
let adjusted_current_node_in_html_namespace t =
145
+
match adjusted_current_node t with
146
+
| Some node -> is_in_html_namespace node
147
+
| None -> true
148
+
149
+
(* Insertion helpers *)
150
+
let appropriate_insertion_place t =
151
+
match current_node t with
152
+
| None -> (t.document, None)
153
+
| Some target ->
154
+
if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
155
+
(* Foster parenting per WHATWG spec *)
156
+
(* Step 1: Find last (most recent) template and table in stack *)
157
+
(* Note: index 0 = top of stack = most recently added *)
158
+
let last_template_idx = ref None in
159
+
let last_table_idx = ref None in
160
+
List.iteri (fun i n ->
161
+
(* Take first match (most recent = lowest index) *)
162
+
if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i;
163
+
if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i
164
+
) t.open_elements;
165
+
166
+
(* Step 2-3: If last template is more recent than last table (lower index = more recent) *)
167
+
match !last_template_idx, !last_table_idx with
168
+
| Some ti, None ->
169
+
(* No table, use template content *)
170
+
let template = List.nth t.open_elements ti in
171
+
(match template.Dom.template_content with
172
+
| Some tc -> (tc, None)
173
+
| None -> (template, None))
174
+
| Some ti, Some tbi when ti < tbi ->
175
+
(* Template is more recent than table, use template content *)
176
+
let template = List.nth t.open_elements ti in
177
+
(match template.Dom.template_content with
178
+
| Some tc -> (tc, None)
179
+
| None -> (template, None))
180
+
| _, Some tbi ->
181
+
(* Use table's parent as foster parent *)
182
+
let table = List.nth t.open_elements tbi in
183
+
(match table.Dom.parent with
184
+
| Some parent -> (parent, Some table)
185
+
| None ->
186
+
(* Step 6: element above table in stack (index + 1 since 0 is top) *)
187
+
if tbi + 1 < List.length t.open_elements then
188
+
(List.nth t.open_elements (tbi + 1), None)
189
+
else
190
+
(t.document, None))
191
+
| None, None ->
192
+
(* No table or template, use document *)
193
+
(t.document, None)
194
+
end else begin
195
+
(* If target is a template, insert into its content document fragment *)
196
+
match target.Dom.template_content with
197
+
| Some tc -> (tc, None)
198
+
| None -> (target, None)
199
+
end
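(* Illustrative: while parsing the hypothetical input "<table>x</table>" with
   foster parenting enabled, the current node is the <table>, so the insertion
   place resolves to the table's parent with the table as the reference node,
   and "x" ends up inserted before the <table> in the tree. *)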
200
+
201
+
let insert_element t name ?(namespace=None) ?(push=false) attrs =
202
+
let node = Dom.create_element name ~namespace ~attrs () in
203
+
let (parent, before) = appropriate_insertion_place t in
204
+
(match before with
205
+
| None -> Dom.append_child parent node
206
+
| Some ref -> Dom.insert_before parent node ref);
207
+
if push then t.open_elements <- node :: t.open_elements;
208
+
node
209
+
210
+
let insert_element_for_token t (tag : Token.tag) =
211
+
insert_element t tag.name ~push:true tag.attrs
212
+
213
+
let insert_foreign_element t (tag : Token.tag) namespace =
214
+
let attrs =
215
+
if namespace = Some "svg" then
216
+
Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs tag.attrs)
217
+
else
218
+
Constants.adjust_foreign_attrs tag.attrs
219
+
in
220
+
let name =
221
+
if namespace = Some "svg" then Constants.adjust_svg_tag_name tag.name
222
+
else tag.name
223
+
in
224
+
let node = insert_element t name ~namespace attrs in
225
+
t.open_elements <- node :: t.open_elements;
226
+
node
227
+
228
+
let insert_character t data =
229
+
if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin
230
+
t.ignore_lf <- false;
231
+
if String.length data > 1 then begin
232
+
let rest = String.sub data 1 (String.length data - 1) in
233
+
let (parent, before) = appropriate_insertion_place t in
234
+
Dom.insert_text_at parent rest before
235
+
end
236
+
end else begin
237
+
t.ignore_lf <- false;
238
+
let (parent, before) = appropriate_insertion_place t in
239
+
Dom.insert_text_at parent data before
240
+
end
241
+
242
+
let insert_comment t data =
243
+
let node = Dom.create_comment data in
244
+
let (parent, _) = appropriate_insertion_place t in
245
+
Dom.append_child parent node
246
+
247
+
let insert_comment_to_document t data =
248
+
let node = Dom.create_comment data in
249
+
Dom.append_child t.document node
250
+
251
+
(* Stack manipulation *)
252
+
let pop_current t =
253
+
match t.open_elements with
254
+
| [] -> ()
255
+
| _ :: rest -> t.open_elements <- rest
256
+
257
+
let pop_until t pred =
258
+
let rec loop () =
259
+
match t.open_elements with
260
+
| [] -> ()
261
+
| x :: rest ->
262
+
t.open_elements <- rest;
263
+
if not (pred x) then loop ()
264
+
in
265
+
loop ()
266
+
267
+
let pop_until_tag t name =
268
+
pop_until t (fun n -> n.Dom.name = name)
269
+
270
+
(* Pop until HTML namespace element with given name *)
271
+
let pop_until_html_tag t name =
272
+
pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n)
273
+
274
+
let pop_until_one_of t names =
275
+
pop_until t (fun n -> List.mem n.Dom.name names)
276
+
277
+
(* Pop until HTML namespace element with one of given names *)
278
+
let pop_until_html_one_of t names =
279
+
pop_until t (fun n -> List.mem n.Dom.name names && is_in_html_namespace n)
280
+
281
+
(* Check if element is an HTML integration point *)
282
+
let is_html_integration_point node =
283
+
(* SVG foreignObject, desc, and title are always HTML integration points *)
284
+
if node.Dom.namespace = Some "svg" &&
285
+
List.mem node.Dom.name Constants.svg_html_integration then true
286
+
(* annotation-xml is an HTML integration point only with specific encoding values *)
287
+
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
288
+
match List.assoc_opt "encoding" node.Dom.attrs with
289
+
| Some enc ->
290
+
let enc_lower = String.lowercase_ascii enc in
291
+
enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
292
+
| None -> false
293
+
else false
294
+
295
+
(* Check if element is a MathML text integration point *)
296
+
let is_mathml_text_integration_point node =
297
+
node.Dom.namespace = Some "mathml" &&
298
+
List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
299
+
300
+
(* Scope checks - integration points also terminate scope (except for table scope) *)
301
+
(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
302
+
let has_element_in_scope_impl t names exclude_list ~check_integration_points =
303
+
let rec check = function
304
+
| [] -> false
305
+
| n :: rest ->
306
+
(* Target elements must be in HTML namespace *)
307
+
if is_in_html_namespace n && List.mem n.Dom.name names then true
308
+
else if is_in_html_namespace n && List.mem n.Dom.name exclude_list then false
309
+
(* Integration points terminate scope (unless we're checking table scope) *)
310
+
else if check_integration_points && (is_html_integration_point n || is_mathml_text_integration_point n) then false
311
+
else check rest
312
+
in
313
+
check t.open_elements
314
+
315
+
let has_element_in_scope t name =
316
+
has_element_in_scope_impl t [name] Constants.default_scope ~check_integration_points:true
317
+
318
+
let has_element_in_button_scope t name =
319
+
has_element_in_scope_impl t [name] Constants.button_scope ~check_integration_points:true
320
+
321
+
let has_element_in_list_item_scope t name =
322
+
has_element_in_scope_impl t [name] Constants.list_item_scope ~check_integration_points:true
323
+
324
+
let has_element_in_table_scope t name =
325
+
has_element_in_scope_impl t [name] Constants.table_scope ~check_integration_points:false
326
+
327
+
let has_element_in_select_scope t name =
328
+
let rec check = function
329
+
| [] -> false
330
+
| n :: rest ->
331
+
if n.Dom.name = name then true
332
+
else if not (List.mem n.Dom.name Constants.select_scope_exclude) then false
333
+
else check rest
334
+
in
335
+
check t.open_elements
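(* Example: with "<b><table><td>" open, [has_element_in_scope t "b"] is false
   because "td" is one of the scope-terminating names in
   [Constants.default_scope], so a stray </b> inside the cell cannot close the
   outer <b>. Button scope additionally stops at "button", list-item scope at
   "ol"/"ul", table scope only at html/table/template, and select scope is
   inverted: any name other than optgroup/option ends the search. *)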
336
+
337
+
(* Implied end tags *)
338
+
let generate_implied_end_tags t ?except () =
339
+
let rec loop () =
340
+
match current_node t with
341
+
| Some n when List.mem n.Dom.name Constants.implied_end_tags ->
342
+
(match except with
343
+
| Some ex when n.Dom.name = ex -> ()
344
+
| _ -> pop_current t; loop ())
345
+
| _ -> ()
346
+
in
347
+
loop ()
348
+
349
+
let generate_all_implied_end_tags t =
350
+
let rec loop () =
351
+
match current_node t with
352
+
| Some n when List.mem n.Dom.name Constants.thoroughly_implied_end_tags ->
353
+
pop_current t; loop ()
354
+
| _ -> ()
355
+
in
356
+
loop ()
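(* Example: for "<ul><li><p>x</ul>", the </ul> end tag first pops the open <p>
   and <li> here (both are in [Constants.implied_end_tags]) before the <ul>
   itself is popped; the thorough variant just above also pops table-related
   elements and is what the </template> handler uses. *)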
357
+
358
+
(* Active formatting elements *)
359
+
let push_formatting_marker t =
360
+
t.active_formatting <- Marker :: t.active_formatting
361
+
362
+
let push_formatting_element t node name attrs =
363
+
(* Noah's Ark clause: keep at most three identical entries (same name and attributes); pushing a fourth removes the earliest match *)
364
+
let rec count_and_remove same acc = function
365
+
| [] -> List.rev acc
366
+
| Marker :: rest -> List.rev acc @ (Marker :: rest)
367
+
| Entry e :: rest when e.name = name && e.attrs = attrs ->
368
+
if same >= 2 then
369
+
count_and_remove same acc rest (* Remove this one *)
370
+
else
371
+
count_and_remove (same + 1) (Entry e :: acc) rest
372
+
| x :: rest -> count_and_remove same (x :: acc) rest
373
+
in
374
+
t.active_formatting <- count_and_remove 0 [] t.active_formatting;
375
+
t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting
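(* Example: pushing a fourth <b> with identical attributes removes the oldest
   of the three matching entries, so between markers the list never holds more
   than three identical formatting entries, even though all four <b> nodes
   still exist in the DOM and on the open-elements stack. *)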
376
+
377
+
let clear_active_formatting_to_marker t =
378
+
let rec loop = function
379
+
| [] -> []
380
+
| Marker :: rest -> rest
381
+
| _ :: rest -> loop rest
382
+
in
383
+
t.active_formatting <- loop t.active_formatting
384
+
385
+
let reconstruct_active_formatting t =
386
+
let rec find_to_reconstruct acc = function
387
+
| [] -> acc
388
+
| Marker :: _ -> acc
389
+
| Entry e :: rest ->
390
+
if List.exists (fun n -> n == e.node) t.open_elements then acc
391
+
else find_to_reconstruct (Entry e :: acc) rest
392
+
in
393
+
let to_reconstruct = find_to_reconstruct [] t.active_formatting in
394
+
List.iter (fun entry ->
395
+
match entry with
396
+
| Entry e ->
397
+
let node = insert_element t e.name e.attrs in
398
+
t.open_elements <- node :: t.open_elements;
399
+
(* Update the entry to point to new node *)
400
+
t.active_formatting <- List.map (fun x ->
401
+
if x == entry then Entry { e with node }
402
+
else x
403
+
) t.active_formatting
404
+
| Marker -> ()
405
+
) to_reconstruct
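(* Example: for "<p><b>one</p><p>two", closing the first <p> also pops the <b>
   from the open-elements stack while its entry stays in [active_formatting];
   when the text "two" is inserted, this function creates a fresh <b> inside
   the second <p>, so the text is still rendered bold. *)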
406
+
407
+
(* Adoption agency algorithm - follows WHATWG spec *)
408
+
let adoption_agency t tag_name =
409
+
(* Step 1: If current node is subject and not in active formatting list, just pop *)
410
+
(match current_node t with
411
+
| Some n when n.Dom.name = tag_name ->
412
+
let in_active = List.exists (function
413
+
| Entry e -> e.name = tag_name
414
+
| Marker -> false
415
+
) t.active_formatting in
416
+
if not in_active then begin
417
+
pop_current t;
418
+
() (* No explicit early return is needed: with no matching entry in the active formatting list, the outer loop below finds nothing on its first iteration and stops. *)
419
+
end
420
+
| _ -> ());
421
+
422
+
(* Step 2: Outer loop *)
423
+
let outer_loop_counter = ref 0 in
424
+
let done_flag = ref false in
425
+
426
+
while !outer_loop_counter < 8 && not !done_flag do
427
+
incr outer_loop_counter;
428
+
429
+
(* Step 3: Find formatting element in active formatting list *)
430
+
let rec find_formatting_index idx = function
431
+
| [] -> None
432
+
| Marker :: _ -> None
433
+
| Entry e :: rest ->
434
+
if e.name = tag_name then Some (idx, e.node, e.attrs)
435
+
else find_formatting_index (idx + 1) rest
436
+
in
437
+
let formatting_entry = find_formatting_index 0 t.active_formatting in
438
+
439
+
match formatting_entry with
440
+
| None ->
441
+
(* No formatting element found - done *)
442
+
done_flag := true
443
+
| Some (fmt_idx, fmt_node, fmt_attrs) ->
444
+
445
+
(* Step 4: Check if formatting element is in open elements *)
446
+
if not (List.exists (fun n -> n == fmt_node) t.open_elements) then begin
447
+
parse_error t "adoption-agency-1.2";
448
+
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
449
+
done_flag := true
450
+
end
451
+
(* Step 5: Check if formatting element is in scope *)
452
+
else if not (has_element_in_scope t tag_name) then begin
453
+
parse_error t "adoption-agency-1.3";
454
+
done_flag := true
455
+
end else begin
456
+
(* Step 6: Parse error if not current node *)
457
+
(match current_node t with
458
+
| Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
459
+
| _ -> ());
460
+
461
+
(* Step 7: Find furthest block - first special element BELOW formatting element *)
462
+
(* open_elements is [current(top)...html(bottom)], formatting element is somewhere in the middle *)
463
+
(* We need the first special element going from formatting element toward current *)
464
+
(* This is the "topmost" (closest to formatting element) special element that is "lower" (closer to current) *)
465
+
let fmt_stack_idx = ref (-1) in
466
+
List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
467
+
let furthest_block =
468
+
if !fmt_stack_idx <= 0 then None
469
+
else begin
470
+
(* Look from fmt_stack_idx-1 down to 0, find first special element *)
471
+
let rec find_from_idx idx =
472
+
if idx < 0 then None
473
+
else
474
+
let n = List.nth t.open_elements idx in
475
+
if is_special_element n then Some n
476
+
else find_from_idx (idx - 1)
477
+
in
478
+
find_from_idx (!fmt_stack_idx - 1)
479
+
end
480
+
in
481
+
482
+
match furthest_block with
483
+
| None ->
484
+
(* Step 8: No furthest block - pop elements including formatting element *)
485
+
pop_until t (fun n -> n == fmt_node);
486
+
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
487
+
done_flag := true
488
+
489
+
| Some fb ->
490
+
(* Step 9: Let common ancestor be element immediately above formatting element *)
491
+
let rec find_common_ancestor = function
492
+
| [] -> None
493
+
| n :: rest when n == fmt_node ->
494
+
(match rest with x :: _ -> Some x | [] -> None)
495
+
| _ :: rest -> find_common_ancestor rest
496
+
in
497
+
let common_ancestor = find_common_ancestor t.open_elements in
498
+
499
+
(* Step 10: Bookmark starts after formatting element *)
500
+
let bookmark = ref (fmt_idx + 1) in
501
+
502
+
(* Step 11: Let last_node = furthest block *)
503
+
let last_node = ref fb in
504
+
505
+
(* Step 12: Inner loop *)
506
+
(* The inner loop processes elements between furthest_block and formatting_element,
507
+
removing non-formatting elements and reparenting formatting elements *)
508
+
let inner_loop_counter = ref 0 in
509
+
510
+
(* Get index of furthest block in open elements *)
511
+
let fb_idx = ref 0 in
512
+
List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;
513
+
514
+
(* Start from element after furthest block (toward formatting element) *)
515
+
let node_idx = ref (!fb_idx + 1) in
516
+
517
+
while !node_idx < List.length t.open_elements &&
518
+
(List.nth t.open_elements !node_idx) != fmt_node do
519
+
incr inner_loop_counter;
520
+
let current_node = List.nth t.open_elements !node_idx in
521
+
522
+
(* Step 12.3: Find node in active formatting list *)
523
+
let rec find_node_in_formatting idx = function
524
+
| [] -> None
525
+
| Entry e :: _rest when e.node == current_node -> Some idx
526
+
| _ :: rest -> find_node_in_formatting (idx + 1) rest
527
+
in
528
+
let node_fmt_idx = find_node_in_formatting 0 t.active_formatting in
529
+
530
+
(* Step 12.4: If inner loop counter > 3 and node in active formatting, remove it *)
531
+
let node_fmt_idx =
532
+
match node_fmt_idx with
533
+
| Some idx when !inner_loop_counter > 3 ->
534
+
t.active_formatting <- List.filteri (fun i _ -> i <> idx) t.active_formatting;
535
+
if idx < !bookmark then decr bookmark;
536
+
None
537
+
| x -> x
538
+
in
539
+
540
+
(* Step 12.5: If node not in active formatting, remove from stack and continue *)
541
+
match node_fmt_idx with
542
+
| None ->
543
+
(* Remove from stack - this shifts indices *)
544
+
t.open_elements <- List.filteri (fun i _ -> i <> !node_idx) t.open_elements
545
+
(* Don't increment node_idx since we removed an element *)
546
+
547
+
| Some af_idx ->
548
+
(* Step 12.6: Create new element for node *)
549
+
let (node_name, node_attrs) = match List.nth t.active_formatting af_idx with
550
+
| Entry e -> (e.name, e.attrs)
551
+
| Marker -> failwith "unexpected marker"
552
+
in
553
+
let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in
554
+
555
+
(* Update active formatting with new node *)
556
+
t.active_formatting <- List.mapi (fun i entry ->
557
+
if i = af_idx then Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
558
+
else entry
559
+
) t.active_formatting;
560
+
561
+
(* Replace node in open elements *)
562
+
t.open_elements <- List.mapi (fun i n ->
563
+
if i = !node_idx then new_node_elem else n
564
+
) t.open_elements;
565
+
566
+
(* Step 12.7: If last_node is furthest block, update bookmark *)
567
+
if !last_node == fb then
568
+
bookmark := af_idx + 1;
569
+
570
+
(* Step 12.8: Reparent last_node to new node *)
571
+
(match !last_node.Dom.parent with
572
+
| Some p -> Dom.remove_child p !last_node
573
+
| None -> ());
574
+
Dom.append_child new_node_elem !last_node;
575
+
576
+
(* Step 12.9: Let last_node = new node *)
577
+
last_node := new_node_elem;
578
+
579
+
(* Move to next element *)
580
+
incr node_idx
581
+
done;
582
+
583
+
(* Step 13: Insert last_node into common ancestor *)
584
+
(match common_ancestor with
585
+
| Some ca ->
586
+
(match !last_node.Dom.parent with
587
+
| Some p -> Dom.remove_child p !last_node
588
+
| None -> ());
589
+
(* Check if we need foster parenting *)
590
+
if t.foster_parenting && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
591
+
(* Find table and insert before it *)
592
+
let rec find_table = function
593
+
| [] -> None
594
+
| n :: rest when n.Dom.name = "table" -> Some (n, rest)
595
+
| _ :: rest -> find_table rest
596
+
in
597
+
match find_table t.open_elements with
598
+
| Some (table, _) ->
599
+
(match table.Dom.parent with
600
+
| Some parent -> Dom.insert_before parent !last_node table
601
+
| None -> Dom.append_child ca !last_node)
602
+
| None -> Dom.append_child ca !last_node
603
+
end else begin
604
+
(* If common ancestor is template, insert into its content *)
605
+
match ca.Dom.template_content with
606
+
| Some tc -> Dom.append_child tc !last_node
607
+
| None -> Dom.append_child ca !last_node
608
+
end
609
+
| None -> ());
610
+
611
+
(* Step 14: Create new formatting element *)
612
+
let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in
613
+
614
+
(* Step 15: Move children of furthest block to new formatting element *)
615
+
let fb_children = fb.Dom.children in
616
+
List.iter (fun child ->
617
+
Dom.remove_child fb child;
618
+
Dom.append_child new_formatting child
619
+
) fb_children;
620
+
621
+
(* Step 16: Append new formatting element to furthest block *)
622
+
Dom.append_child fb new_formatting;
623
+
624
+
(* Step 17: Remove old from active formatting, insert new at bookmark *)
625
+
let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
626
+
t.active_formatting <- List.filteri (fun i _ -> i <> fmt_idx) t.active_formatting;
627
+
(* Adjust bookmark since we removed an element *)
628
+
let adjusted_bookmark = if fmt_idx < !bookmark then !bookmark - 1 else !bookmark in
629
+
let rec insert_at_bookmark idx acc = function
630
+
| [] -> List.rev (new_entry :: acc)
631
+
| x :: rest when idx = adjusted_bookmark ->
632
+
List.rev_append acc (new_entry :: x :: rest)
633
+
| x :: rest -> insert_at_bookmark (idx + 1) (x :: acc) rest
634
+
in
635
+
t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;
636
+
637
+
(* Step 18: Remove formatting element from open elements, insert new after furthest block *)
638
+
(* "After" in stack terms means new_formatting should be between fb and current node *)
639
+
(* In our list orientation (current at index 0), this means new_formatting at lower index than fb *)
640
+
t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
641
+
(* Find fb and insert new_formatting before it (lower index = closer to current) *)
642
+
let rec insert_before acc = function
643
+
| [] -> List.rev (new_formatting :: acc)
644
+
| n :: rest when n == fb ->
645
+
(* Insert new_formatting before fb: acc reversed, then new_formatting, then fb, then rest *)
646
+
List.rev_append acc (new_formatting :: n :: rest)
647
+
| n :: rest -> insert_before (n :: acc) rest
648
+
in
649
+
t.open_elements <- insert_before [] t.open_elements
650
+
(* Continue outer loop *)
651
+
end
652
+
done
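(* Example: for the misnested input "<b>1<p>2</b>3", processing </b> finds <b>
   as the formatting element and <p> as the furthest block, and the resulting
   tree is equivalent to "<b>1</b><p><b>2</b>3</p>". The 8-iteration outer
   loop and the inner-loop removal after 3 passes bound the work done on
   pathological input. *)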
653
+
654
+
(* Close p element *)
655
+
let close_p_element t =
656
+
generate_implied_end_tags t ~except:"p" ();
657
+
(match current_node t with
658
+
| Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
659
+
| _ -> ());
660
+
pop_until_tag t "p"
661
+
662
+
(* Reset insertion mode *)
663
+
let reset_insertion_mode t =
664
+
let rec check_node last = function
665
+
| [] -> t.mode <- Insertion_mode.In_body
666
+
| node :: rest ->
667
+
let is_last = rest = [] in
668
+
let node_to_check =
669
+
if is_last then
670
+
match t.fragment_context with
671
+
| Some ctx -> Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
672
+
| None -> node
673
+
else node
674
+
in
675
+
let name = node_to_check.Dom.name in
676
+
if name = "select" then begin
677
+
if not is_last then begin
678
+
let rec find_table_or_template = function
679
+
| [] -> ()
680
+
| n :: rest ->
681
+
if n.Dom.name = "template" then t.mode <- Insertion_mode.In_select
682
+
else if n.Dom.name = "table" then t.mode <- Insertion_mode.In_select_in_table
683
+
else find_table_or_template rest
684
+
in
685
+
find_table_or_template rest
686
+
end;
687
+
if t.mode <> Insertion_mode.In_select_in_table then
688
+
t.mode <- Insertion_mode.In_select
689
+
end else if List.mem name ["td"; "th"] && not is_last then
690
+
t.mode <- Insertion_mode.In_cell
691
+
else if name = "tr" then
692
+
t.mode <- Insertion_mode.In_row
693
+
else if List.mem name ["tbody"; "thead"; "tfoot"] then
694
+
t.mode <- Insertion_mode.In_table_body
695
+
else if name = "caption" then
696
+
t.mode <- Insertion_mode.In_caption
697
+
else if name = "colgroup" then
698
+
t.mode <- Insertion_mode.In_column_group
699
+
else if name = "table" then
700
+
t.mode <- Insertion_mode.In_table
701
+
else if name = "template" then
702
+
t.mode <- (match t.template_modes with m :: _ -> m | [] -> Insertion_mode.In_template)
703
+
else if name = "head" && not is_last then
704
+
t.mode <- Insertion_mode.In_head
705
+
else if name = "body" then
706
+
t.mode <- Insertion_mode.In_body
707
+
else if name = "frameset" then
708
+
t.mode <- Insertion_mode.In_frameset
709
+
else if name = "html" then
710
+
t.mode <- (if t.head_element = None then Insertion_mode.Before_head else Insertion_mode.After_head)
711
+
else if is_last then
712
+
t.mode <- Insertion_mode.In_body
713
+
else
714
+
check_node last rest
715
+
in
716
+
check_node false t.open_elements
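(* Example: after </table> pops the table (see process_in_table below), this
   walks the remaining stack and picks the mode of the new current node:
   "td"/"th" -> In_cell, "tr" -> In_row, "body" -> In_body, and so on; in
   fragment parsing the last (bottom-most) node is replaced by the fragment
   context element before being examined. *)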
717
+
718
+
let is_whitespace s =
719
+
let ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
720
+
String.for_all (fun c -> List.mem c ws) s
721
+
722
+
(* Mode handlers *)
723
+
let rec process_initial t token =
724
+
match token with
725
+
| Token.Character data when is_whitespace data -> ()
726
+
| Token.Comment data -> insert_comment_to_document t data
727
+
| Token.Doctype dt ->
728
+
let node = Dom.create_doctype ?name:dt.name ?public_id:dt.public_id ?system_id:dt.system_id () in
729
+
Dom.append_child t.document node;
730
+
(* Quirks mode detection *)
731
+
if dt.force_quirks then
732
+
t.quirks_mode <- Dom.Quirks
733
+
else if dt.name <> Some "html" then
734
+
t.quirks_mode <- Dom.Quirks
735
+
else begin
736
+
let pub = Option.map String.lowercase_ascii dt.public_id in
737
+
let sys = Option.map String.lowercase_ascii dt.system_id in
738
+
let is_quirky =
739
+
(match pub with
740
+
| Some p -> List.mem p Constants.quirky_public_matches ||
741
+
List.exists (fun prefix -> String.length p >= String.length prefix &&
742
+
String.sub p 0 (String.length prefix) = prefix) Constants.quirky_public_prefixes
743
+
| None -> false) ||
744
+
(match sys with
745
+
| Some s -> List.mem s Constants.quirky_system_matches
746
+
| None -> false)
747
+
in
748
+
if is_quirky then t.quirks_mode <- Dom.Quirks
749
+
else begin
750
+
let is_limited_quirky =
751
+
match pub with
752
+
| Some p -> List.exists (fun prefix -> String.length p >= String.length prefix &&
753
+
String.sub p 0 (String.length prefix) = prefix)
754
+
Constants.limited_quirky_public_prefixes
755
+
| None -> false
756
+
in
757
+
if is_limited_quirky then t.quirks_mode <- Dom.Limited_quirks
758
+
end
759
+
end;
760
+
t.mode <- Insertion_mode.Before_html
761
+
| _ ->
762
+
parse_error t "expected-doctype-but-got-other";
763
+
t.quirks_mode <- Dom.Quirks;
764
+
t.mode <- Insertion_mode.Before_html;
765
+
process_token t token
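(* Example: "<!DOCTYPE html>" (name "html", no identifiers) leaves the document
   in no-quirks mode; a missing or non-"html" doctype forces quirks mode; the
   legacy public-identifier prefixes in [Constants.quirky_public_prefixes] and
   [Constants.limited_quirky_public_prefixes] select quirks or limited-quirks
   mode as in the WHATWG tables. *)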
766
+
767
+
and process_before_html t token =
768
+
match token with
769
+
| Token.Doctype _ -> parse_error t "unexpected-doctype"
770
+
| Token.Comment data -> insert_comment_to_document t data
771
+
| Token.Character data when is_whitespace data -> ()
772
+
| Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
773
+
let html = insert_element t "html" attrs in
774
+
t.open_elements <- [html];
775
+
t.mode <- Insertion_mode.Before_head
776
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
777
+
let html = insert_element t "html" [] in
778
+
t.open_elements <- [html];
779
+
t.mode <- Insertion_mode.Before_head;
780
+
process_token t token
781
+
| Token.Tag { kind = Token.End; _ } ->
782
+
parse_error t "unexpected-end-tag"
783
+
| _ ->
784
+
let html = insert_element t "html" [] in
785
+
t.open_elements <- [html];
786
+
t.mode <- Insertion_mode.Before_head;
787
+
process_token t token
788
+
789
+
and process_before_head t token =
790
+
match token with
791
+
| Token.Character data when is_whitespace data -> ()
792
+
| Token.Comment data -> insert_comment t data
793
+
| Token.Doctype _ -> parse_error t "unexpected-doctype"
794
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
795
+
process_in_body t token
796
+
| Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
797
+
let head = insert_element t "head" attrs in
798
+
t.open_elements <- head :: t.open_elements;
799
+
t.head_element <- Some head;
800
+
t.mode <- Insertion_mode.In_head
801
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["head"; "body"; "html"; "br"] ->
802
+
let head = insert_element t "head" [] in
803
+
t.open_elements <- head :: t.open_elements;
804
+
t.head_element <- Some head;
805
+
t.mode <- Insertion_mode.In_head;
806
+
process_token t token
807
+
| Token.Tag { kind = Token.End; _ } ->
808
+
parse_error t "unexpected-end-tag"
809
+
| _ ->
810
+
let head = insert_element t "head" [] in
811
+
t.open_elements <- head :: t.open_elements;
812
+
t.head_element <- Some head;
813
+
t.mode <- Insertion_mode.In_head;
814
+
process_token t token
815
+
816
+
and process_in_head t token =
817
+
match token with
818
+
| Token.Character data when is_whitespace data ->
819
+
insert_character t data
820
+
| Token.Character data ->
821
+
(* Extract leading whitespace *)
822
+
let rec count_leading_ws i =
823
+
if i >= String.length data then i
824
+
else match data.[i] with
825
+
| '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
826
+
| _ -> i
827
+
in
828
+
let ws_count = count_leading_ws 0 in
829
+
let leading_ws = String.sub data 0 ws_count in
830
+
let remaining = String.sub data ws_count (String.length data - ws_count) in
831
+
(* If there's leading whitespace and current element has children, insert it *)
832
+
if leading_ws <> "" then
833
+
(match current_node t with
834
+
| Some n when n.Dom.children <> [] -> insert_character t leading_ws
835
+
| _ -> ());
836
+
pop_current t;
837
+
t.mode <- Insertion_mode.After_head;
838
+
process_token t (Token.Character remaining)
839
+
| Token.Comment data ->
840
+
insert_comment t data
841
+
| Token.Doctype _ ->
842
+
parse_error t "unexpected-doctype"
843
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
844
+
process_in_body t token
845
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
846
+
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
847
+
ignore (insert_element t name attrs)
848
+
| Token.Tag { kind = Token.Start; name = "title"; _ } ->
849
+
ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false });
850
+
t.original_mode <- Some t.mode;
851
+
t.mode <- Insertion_mode.Text
852
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
853
+
when List.mem name ["noframes"; "style"] ->
854
+
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
855
+
t.original_mode <- Some t.mode;
856
+
t.mode <- Insertion_mode.Text
857
+
| Token.Tag { kind = Token.Start; name = "noscript"; _ } ->
858
+
(* Scripting is disabled: parse noscript content as HTML *)
859
+
ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false });
860
+
t.mode <- Insertion_mode.In_head_noscript
861
+
| Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
862
+
ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
863
+
t.original_mode <- Some t.mode;
864
+
t.mode <- Insertion_mode.Text
865
+
| Token.Tag { kind = Token.End; name = "head"; _ } ->
866
+
pop_current t;
867
+
t.mode <- Insertion_mode.After_head
868
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
869
+
pop_current t;
870
+
t.mode <- Insertion_mode.After_head;
871
+
process_token t token
872
+
| Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
873
+
let node = Dom.create_template ~attrs () in
874
+
let (parent, _) = appropriate_insertion_place t in
875
+
Dom.append_child parent node;
876
+
t.open_elements <- node :: t.open_elements;
877
+
push_formatting_marker t;
878
+
t.frameset_ok <- false;
879
+
t.mode <- Insertion_mode.In_template;
880
+
t.template_modes <- Insertion_mode.In_template :: t.template_modes
881
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
882
+
if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
883
+
parse_error t "unexpected-end-tag"
884
+
else begin
885
+
generate_all_implied_end_tags t;
886
+
(match current_node t with
887
+
| Some n when not (n.Dom.name = "template" && is_in_html_namespace n) -> parse_error t "unexpected-end-tag"
888
+
| _ -> ());
889
+
pop_until_html_tag t "template";
890
+
clear_active_formatting_to_marker t;
891
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
892
+
reset_insertion_mode t
893
+
end
894
+
| Token.Tag { kind = Token.Start; name = "head"; _ } ->
895
+
parse_error t "unexpected-start-tag"
896
+
| Token.Tag { kind = Token.End; _ } ->
897
+
parse_error t "unexpected-end-tag"
898
+
| _ ->
899
+
pop_current t;
900
+
t.mode <- Insertion_mode.After_head;
901
+
process_token t token
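(* Note: for "title" (RCDATA) and "noframes"/"style"/"script" (RAWTEXT /
   script data), this builder only records [original_mode] and switches to the
   Text insertion mode; the matching tokenizer state change is assumed to be
   driven by the tokenizer itself, as with the PLAINTEXT note in
   process_in_body. The end tag is then handled by process_text, which pops
   the element and restores [original_mode]. *)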
902
+
903
+
and process_in_head_noscript t token =
904
+
match token with
905
+
| Token.Character data when is_whitespace data ->
906
+
process_in_head t token
907
+
| Token.Character _ ->
908
+
parse_error t "unexpected-char-in-noscript";
909
+
pop_current t; (* Pop noscript *)
910
+
t.mode <- Insertion_mode.In_head;
911
+
process_token t token
912
+
| Token.Comment _ ->
913
+
process_in_head t token
914
+
| Token.Doctype _ ->
915
+
parse_error t "unexpected-doctype"
916
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
917
+
process_in_body t token
918
+
| Token.Tag { kind = Token.Start; name; _ }
919
+
when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
920
+
process_in_head t token
921
+
| Token.Tag { kind = Token.Start; name; _ }
922
+
when List.mem name ["head"; "noscript"] ->
923
+
parse_error t "unexpected-start-tag"
924
+
| Token.Tag { kind = Token.Start; _ } ->
925
+
parse_error t "unexpected-start-tag";
926
+
pop_current t; (* Pop noscript *)
927
+
t.mode <- Insertion_mode.In_head;
928
+
process_token t token
929
+
| Token.Tag { kind = Token.End; name = "noscript"; _ } ->
930
+
pop_current t; (* Pop noscript *)
931
+
t.mode <- Insertion_mode.In_head
932
+
| Token.Tag { kind = Token.End; name = "br"; _ } ->
933
+
parse_error t "unexpected-end-tag";
934
+
pop_current t; (* Pop noscript *)
935
+
t.mode <- Insertion_mode.In_head;
936
+
process_token t token
937
+
| Token.Tag { kind = Token.End; _ } ->
938
+
parse_error t "unexpected-end-tag"
939
+
| Token.EOF ->
940
+
parse_error t "expected-closing-tag-but-got-eof";
941
+
pop_current t; (* Pop noscript *)
942
+
t.mode <- Insertion_mode.In_head;
943
+
process_token t token
944
+
945
+
and process_after_head t token =
946
+
match token with
947
+
| Token.Character data when is_whitespace data ->
948
+
insert_character t data
949
+
| Token.Comment data ->
950
+
insert_comment t data
951
+
| Token.Doctype _ ->
952
+
parse_error t "unexpected-doctype"
953
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
954
+
process_in_body t token
955
+
| Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
956
+
ignore (insert_element t "body" ~push:true attrs);
957
+
t.frameset_ok <- false;
958
+
t.mode <- Insertion_mode.In_body
959
+
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
960
+
ignore (insert_element t "frameset" ~push:true attrs);
961
+
t.mode <- Insertion_mode.In_frameset
962
+
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
963
+
(* Special handling for input type="hidden" - parse error, ignore *)
964
+
let input_type = List.assoc_opt "type" attrs in
965
+
(match input_type with
966
+
| Some typ when String.lowercase_ascii typ = "hidden" ->
967
+
parse_error t "unexpected-hidden-input-after-head"
968
+
| _ ->
969
+
(* Non-hidden input creates body *)
970
+
let body = insert_element t "body" [] in
971
+
t.open_elements <- body :: t.open_elements;
972
+
t.mode <- Insertion_mode.In_body;
973
+
process_token t token)
974
+
| Token.Tag { kind = Token.Start; name; _ }
975
+
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
976
+
parse_error t "unexpected-start-tag";
977
+
(match t.head_element with
978
+
| Some head ->
979
+
t.open_elements <- head :: t.open_elements;
980
+
process_in_head t token;
981
+
t.open_elements <- List.filter (fun n -> n != head) t.open_elements
982
+
| None -> ())
983
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
984
+
process_in_head t token
985
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
986
+
let body = insert_element t "body" [] in
987
+
t.open_elements <- body :: t.open_elements;
988
+
t.mode <- Insertion_mode.In_body;
989
+
process_token t token
990
+
| Token.Tag { kind = Token.Start; name = "head"; _ } ->
991
+
parse_error t "unexpected-start-tag"
992
+
| Token.Tag { kind = Token.End; _ } ->
993
+
parse_error t "unexpected-end-tag"
994
+
| _ ->
995
+
let body = insert_element t "body" [] in
996
+
t.open_elements <- body :: t.open_elements;
997
+
t.mode <- Insertion_mode.In_body;
998
+
process_token t token
999
+
1000
+
and process_in_body t token =
1001
+
match token with
1002
+
| Token.Character "\x00" ->
1003
+
parse_error t "unexpected-null-character"
1004
+
| Token.Character data ->
1005
+
reconstruct_active_formatting t;
1006
+
insert_character t data;
1007
+
if not (is_whitespace data) then t.frameset_ok <- false
1008
+
| Token.Comment data ->
1009
+
insert_comment t data
1010
+
| Token.Doctype _ ->
1011
+
parse_error t "unexpected-doctype"
1012
+
| Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
1013
+
parse_error t "unexpected-start-tag";
1014
+
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1015
+
(* Find the html element (at the bottom of the stack) *)
1016
+
let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
1017
+
(match html_elem with
1018
+
| Some html ->
1019
+
List.iter (fun (k, v) ->
1020
+
if not (Dom.has_attr html k) then Dom.set_attr html k v
1021
+
) attrs
1022
+
| None -> ())
1023
+
| Token.Tag { kind = Token.Start; name; _ }
1024
+
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
1025
+
process_in_head t token
1026
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
1027
+
process_in_head t token
1028
+
| Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
1029
+
parse_error t "unexpected-start-tag";
1030
+
(* Find body element on stack - it should be near the end (html is last) *)
1031
+
let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
1032
+
(match body with
1033
+
| Some body when not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) ->
1034
+
t.frameset_ok <- false;
1035
+
List.iter (fun (k, v) ->
1036
+
if not (Dom.has_attr body k) then Dom.set_attr body k v
1037
+
) attrs
1038
+
| _ -> ())
1039
+
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
1040
+
if not t.frameset_ok then
1041
+
parse_error t "unexpected-start-tag-ignored"
1042
+
else begin
1043
+
(* Find body element on the stack *)
1044
+
let rec find_body_index idx = function
1045
+
| [] -> None
1046
+
| n :: rest ->
1047
+
if n.Dom.name = "body" then Some (idx, n)
1048
+
else find_body_index (idx + 1) rest
1049
+
in
1050
+
match find_body_index 0 t.open_elements with
1051
+
| None ->
1052
+
parse_error t "unexpected-start-tag-ignored"
1053
+
| Some (idx, body_elem) ->
1054
+
(* Remove body from its parent (the html element) *)
1055
+
(match body_elem.Dom.parent with
1056
+
| Some parent -> Dom.remove_child parent body_elem
1057
+
| None -> ());
1058
+
(* Pop all elements up to and including body - keep only elements after body_idx *)
1059
+
let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in
1060
+
t.open_elements <- drop (idx + 1) t.open_elements;
1061
+
(* Insert frameset element *)
1062
+
ignore (insert_element t "frameset" ~push:true attrs);
1063
+
t.mode <- Insertion_mode.In_frameset
1064
+
end
1065
+
| Token.EOF ->
1066
+
if t.template_modes <> [] then
1067
+
process_in_template t token
1068
+
else begin
1069
+
let has_unclosed = List.exists (fun n ->
1070
+
not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1071
+
) t.open_elements in
1072
+
if has_unclosed then parse_error t "expected-closing-tag-but-got-eof"
1073
+
end
1074
+
| Token.Tag { kind = Token.End; name = "body"; _ } ->
1075
+
if not (has_element_in_scope t "body") then
1076
+
parse_error t "unexpected-end-tag"
1077
+
else begin
1078
+
let has_unclosed = List.exists (fun n ->
1079
+
not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"])
1080
+
) t.open_elements in
1081
+
if has_unclosed then parse_error t "end-tag-too-early";
1082
+
t.mode <- Insertion_mode.After_body
1083
+
end
1084
+
| Token.Tag { kind = Token.End; name = "html"; _ } ->
1085
+
if not (has_element_in_scope t "body") then
1086
+
parse_error t "unexpected-end-tag"
1087
+
else begin
1088
+
t.mode <- Insertion_mode.After_body;
1089
+
process_token t token
1090
+
end
1091
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1092
+
when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
1093
+
if has_element_in_button_scope t "p" then close_p_element t;
1094
+
ignore (insert_element t name ~push:true attrs)
1095
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.heading_elements ->
1096
+
if has_element_in_button_scope t "p" then close_p_element t;
1097
+
(match current_node t with
1098
+
| Some n when List.mem n.Dom.name Constants.heading_elements ->
1099
+
parse_error t "unexpected-start-tag";
1100
+
pop_current t
1101
+
| _ -> ());
1102
+
ignore (insert_element t name ~push:true attrs)
1103
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] ->
1104
+
if has_element_in_button_scope t "p" then close_p_element t;
1105
+
ignore (insert_element t name ~push:true attrs);
1106
+
t.ignore_lf <- true;
1107
+
t.frameset_ok <- false
1108
+
| Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1109
+
if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1110
+
parse_error t "unexpected-start-tag"
1111
+
else begin
1112
+
if has_element_in_button_scope t "p" then close_p_element t;
1113
+
let form = insert_element t "form" attrs in
1114
+
t.open_elements <- form :: t.open_elements;
1115
+
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
1116
+
t.form_element <- Some form
1117
+
end
1118
+
| Token.Tag { kind = Token.Start; name = "li"; attrs; _ } ->
1119
+
t.frameset_ok <- false;
1120
+
let rec check = function
1121
+
| [] -> ()
1122
+
| n :: rest ->
1123
+
if n.Dom.name = "li" then begin
1124
+
generate_implied_end_tags t ~except:"li" ();
1125
+
(match current_node t with
1126
+
| Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag"
1127
+
| _ -> ());
1128
+
pop_until_tag t "li"
1129
+
end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1130
+
()
1131
+
else
1132
+
check rest
1133
+
in
1134
+
check t.open_elements;
1135
+
if has_element_in_button_scope t "p" then close_p_element t;
1136
+
ignore (insert_element t "li" ~push:true attrs)
1137
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] ->
1138
+
t.frameset_ok <- false;
1139
+
let rec check = function
1140
+
| [] -> ()
1141
+
| n :: rest ->
1142
+
if List.mem n.Dom.name ["dd"; "dt"] then begin
1143
+
generate_implied_end_tags t ~except:n.Dom.name ();
1144
+
(match current_node t with
1145
+
| Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag"
1146
+
| _ -> ());
1147
+
pop_until_one_of t ["dd"; "dt"]
1148
+
end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then
1149
+
()
1150
+
else
1151
+
check rest
1152
+
in
1153
+
check t.open_elements;
1154
+
if has_element_in_button_scope t "p" then close_p_element t;
1155
+
ignore (insert_element t name ~push:true attrs)
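(* Example: "<ul><li>one<li>two</ul>" yields two sibling <li> elements and
   "<dl><dt>a<dd>b</dl>" two sibling definition items: the new start tag first
   closes any open li / dd / dt, unless a special element other than address,
   div or p is found first on the stack. *)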
1156
+
| Token.Tag { kind = Token.Start; name = "plaintext"; _ } ->
1157
+
if has_element_in_button_scope t "p" then close_p_element t;
1158
+
ignore (insert_element t "plaintext" ~push:true [])
1159
+
(* Tokenizer should switch to PLAINTEXT state *)
1160
+
| Token.Tag { kind = Token.Start; name = "button"; attrs; _ } ->
1161
+
if has_element_in_scope t "button" then begin
1162
+
parse_error t "unexpected-start-tag";
1163
+
generate_implied_end_tags t ();
1164
+
pop_until_tag t "button"
1165
+
end;
1166
+
reconstruct_active_formatting t;
1167
+
ignore (insert_element t "button" ~push:true attrs);
1168
+
t.frameset_ok <- false
1169
+
| Token.Tag { kind = Token.End; name; _ }
1170
+
when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] ->
1171
+
if not (has_element_in_scope t name) then
1172
+
parse_error t "unexpected-end-tag"
1173
+
else begin
1174
+
generate_implied_end_tags t ();
1175
+
(match current_node t with
1176
+
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1177
+
| _ -> ());
1178
+
pop_until_tag t name
1179
+
end
1180
+
| Token.Tag { kind = Token.End; name = "form"; _ } ->
1181
+
if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1182
+
let node = t.form_element in
1183
+
t.form_element <- None;
1184
+
match node with
1185
+
| None -> parse_error t "unexpected-end-tag"
1186
+
| Some form_node ->
1187
+
if not (has_element_in_scope t "form") then
1188
+
parse_error t "unexpected-end-tag"
1189
+
else begin
1190
+
generate_implied_end_tags t ();
1191
+
(match current_node t with
1192
+
| Some n when n != form_node -> parse_error t "end-tag-too-early"
1193
+
| _ -> ());
1194
+
t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements
1195
+
end
1196
+
end else begin
1197
+
if not (has_element_in_scope t "form") then
1198
+
parse_error t "unexpected-end-tag"
1199
+
else begin
1200
+
generate_implied_end_tags t ();
1201
+
(match current_node t with
1202
+
| Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early"
1203
+
| _ -> ());
1204
+
pop_until_tag t "form"
1205
+
end
1206
+
end
1207
+
| Token.Tag { kind = Token.End; name = "p"; _ } ->
1208
+
if not (has_element_in_button_scope t "p") then begin
1209
+
parse_error t "unexpected-end-tag";
1210
+
ignore (insert_element t "p" ~push:true [])
1211
+
end;
1212
+
close_p_element t
1213
+
| Token.Tag { kind = Token.End; name = "li"; _ } ->
1214
+
if not (has_element_in_list_item_scope t "li") then
1215
+
parse_error t "unexpected-end-tag"
1216
+
else begin
1217
+
generate_implied_end_tags t ~except:"li" ();
1218
+
(match current_node t with
1219
+
| Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early"
1220
+
| _ -> ());
1221
+
pop_until_tag t "li"
1222
+
end
1223
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] ->
1224
+
if not (has_element_in_scope t name) then
1225
+
parse_error t "unexpected-end-tag"
1226
+
else begin
1227
+
generate_implied_end_tags t ~except:name ();
1228
+
(match current_node t with
1229
+
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1230
+
| _ -> ());
1231
+
pop_until_tag t name
1232
+
end
1233
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.heading_elements ->
1234
+
if not (has_element_in_scope_impl t Constants.heading_elements Constants.default_scope ~check_integration_points:true) then
1235
+
parse_error t "unexpected-end-tag"
1236
+
else begin
1237
+
generate_implied_end_tags t ();
1238
+
(match current_node t with
1239
+
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1240
+
| _ -> ());
1241
+
pop_until_one_of t Constants.heading_elements
1242
+
end
1243
+
| Token.Tag { kind = Token.Start; name = "a"; attrs; _ } ->
1244
+
(* Check for existing <a> in active formatting *)
1245
+
let rec find_a = function
1246
+
| [] -> None
1247
+
| Marker :: _ -> None
1248
+
| Entry e :: _ when e.name = "a" -> Some e.node
1249
+
| _ :: rest -> find_a rest
1250
+
in
1251
+
(match find_a t.active_formatting with
1252
+
| Some existing ->
1253
+
parse_error t "unexpected-start-tag";
1254
+
adoption_agency t "a";
1255
+
t.active_formatting <- List.filter (function
1256
+
| Entry e -> e.node != existing
1257
+
| _ -> true
1258
+
) t.active_formatting;
1259
+
t.open_elements <- List.filter (fun n -> n != existing) t.open_elements
1260
+
| None -> ());
1261
+
reconstruct_active_formatting t;
1262
+
let node = insert_element t "a" attrs in
1263
+
t.open_elements <- node :: t.open_elements;
1264
+
push_formatting_element t node "a" attrs
1265
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1266
+
when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1267
+
reconstruct_active_formatting t;
1268
+
let node = insert_element t name attrs in
1269
+
t.open_elements <- node :: t.open_elements;
1270
+
push_formatting_element t node name attrs
1271
+
| Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } ->
1272
+
if has_element_in_scope t "nobr" then begin
1273
+
parse_error t "unexpected-start-tag";
1274
+
adoption_agency t "nobr";
1275
+
(* Remove nobr from active formatting *)
1276
+
t.active_formatting <- List.filter (function
1277
+
| Entry e -> e.name <> "nobr"
1278
+
| Marker -> true
1279
+
) t.active_formatting;
1280
+
(* Remove nobr from open elements *)
1281
+
t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements
1282
+
end;
1283
+
reconstruct_active_formatting t;
1284
+
let node = insert_element t "nobr" attrs in
1285
+
t.open_elements <- node :: t.open_elements;
1286
+
push_formatting_element t node "nobr" attrs
1287
+
| Token.Tag { kind = Token.End; name; _ }
1288
+
when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] ->
1289
+
adoption_agency t name
1290
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1291
+
when List.mem name ["applet"; "marquee"; "object"] ->
1292
+
reconstruct_active_formatting t;
1293
+
ignore (insert_element t name ~push:true attrs);
1294
+
push_formatting_marker t;
1295
+
t.frameset_ok <- false
1296
+
| Token.Tag { kind = Token.End; name; _ }
1297
+
when List.mem name ["applet"; "marquee"; "object"] ->
1298
+
if not (has_element_in_scope t name) then
1299
+
parse_error t "unexpected-end-tag"
1300
+
else begin
1301
+
generate_implied_end_tags t ();
1302
+
(match current_node t with
1303
+
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1304
+
| _ -> ());
1305
+
pop_until_tag t name;
1306
+
clear_active_formatting_to_marker t
1307
+
end
1308
+
| Token.Tag { kind = Token.Start; name = "table"; attrs; _ } ->
1309
+
if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then
1310
+
close_p_element t;
1311
+
ignore (insert_element t "table" ~push:true attrs);
1312
+
t.frameset_ok <- false;
1313
+
t.mode <- Insertion_mode.In_table
1314
+
| Token.Tag { kind = Token.End; name = "br"; _ } ->
1315
+
parse_error t "unexpected-end-tag";
1316
+
reconstruct_active_formatting t;
1317
+
ignore (insert_element t "br" ~push:true []);
1318
+
pop_current t;
1319
+
t.frameset_ok <- false
1320
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1321
+
when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] ->
1322
+
reconstruct_active_formatting t;
1323
+
ignore (insert_element t name ~push:true attrs);
1324
+
pop_current t;
1325
+
t.frameset_ok <- false
1326
+
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1327
+
reconstruct_active_formatting t;
1328
+
ignore (insert_element t "input" ~push:true attrs);
1329
+
pop_current t;
1330
+
let is_hidden = List.exists (fun (k, v) ->
1331
+
String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1332
+
) attrs in
1333
+
if not is_hidden then t.frameset_ok <- false
1334
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1335
+
when List.mem name ["param"; "source"; "track"] ->
1336
+
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
1337
+
pop_current t
1338
+
| Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1339
+
if has_element_in_button_scope t "p" then close_p_element t;
1340
+
ignore (insert_element t "hr" ~push:true []);
1341
+
pop_current t;
1342
+
t.frameset_ok <- false
1343
+
| Token.Tag { kind = Token.Start; name = "image"; attrs; _ } ->
1344
+
parse_error t "unexpected-start-tag";
1345
+
(* Treat <image> as <img> *)
1346
+
reconstruct_active_formatting t;
1347
+
ignore (insert_element t "img" ~push:true attrs);
1348
+
pop_current t;
1349
+
t.frameset_ok <- false
1350
+
| Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } ->
1351
+
ignore (insert_element t "textarea" ~push:true attrs);
1352
+
t.ignore_lf <- true;
1353
+
t.original_mode <- Some t.mode;
1354
+
t.frameset_ok <- false;
1355
+
t.mode <- Insertion_mode.Text
1356
+
| Token.Tag { kind = Token.Start; name = "xmp"; _ } ->
1357
+
if has_element_in_button_scope t "p" then close_p_element t;
1358
+
reconstruct_active_formatting t;
1359
+
t.frameset_ok <- false;
1360
+
ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false });
1361
+
t.original_mode <- Some t.mode;
1362
+
t.mode <- Insertion_mode.Text
1363
+
| Token.Tag { kind = Token.Start; name = "iframe"; _ } ->
1364
+
t.frameset_ok <- false;
1365
+
ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false });
1366
+
t.original_mode <- Some t.mode;
1367
+
t.mode <- Insertion_mode.Text
1368
+
| Token.Tag { kind = Token.Start; name = "noembed"; _ } ->
1369
+
ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false });
1370
+
t.original_mode <- Some t.mode;
1371
+
t.mode <- Insertion_mode.Text
1372
+
| Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
1373
+
reconstruct_active_formatting t;
1374
+
ignore (insert_element t "select" ~push:true attrs);
1375
+
t.frameset_ok <- false;
1376
+
if List.mem t.mode [Insertion_mode.In_table; Insertion_mode.In_caption; Insertion_mode.In_table_body; Insertion_mode.In_row; Insertion_mode.In_cell] then
1377
+
t.mode <- Insertion_mode.In_select_in_table
1378
+
else
1379
+
t.mode <- Insertion_mode.In_select
1380
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] ->
1381
+
(match current_node t with
1382
+
| Some n when n.Dom.name = "option" -> pop_current t
1383
+
| _ -> ());
1384
+
reconstruct_active_formatting t;
1385
+
ignore (insert_element t name ~push:true attrs)
1386
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] ->
1387
+
if has_element_in_scope t "ruby" then begin
1388
+
generate_implied_end_tags t ()
1389
+
end;
1390
+
(match current_node t with
1391
+
| Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1392
+
| _ -> ());
1393
+
ignore (insert_element t name ~push:true attrs)
1394
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] ->
1395
+
if has_element_in_scope t "ruby" then begin
1396
+
generate_implied_end_tags t ~except:"rtc" ()
1397
+
end;
1398
+
(match current_node t with
1399
+
| Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag"
1400
+
| _ -> ());
1401
+
ignore (insert_element t name ~push:true attrs)
1402
+
| Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1403
+
reconstruct_active_formatting t;
1404
+
let adjusted_attrs = Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) in
1405
+
ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml"));
1406
+
if self_closing then pop_current t
1407
+
| Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1408
+
reconstruct_active_formatting t;
1409
+
let adjusted_attrs = Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) in
1410
+
ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
1411
+
if self_closing then pop_current t
1412
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1413
+
when List.mem name ["col"; "frame"] ->
1414
+
(* In fragment context, insert these; otherwise ignore *)
1415
+
if t.fragment_context = None then
1416
+
parse_error t "unexpected-start-tag-ignored"
1417
+
else
1418
+
ignore (insert_element t name attrs)
1419
+
| Token.Tag { kind = Token.Start; name; _ }
1420
+
when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1421
+
parse_error t "unexpected-start-tag"
1422
+
| Token.Tag { kind = Token.Start; name; attrs; _ } ->
1423
+
(* Any other start tag *)
1424
+
reconstruct_active_formatting t;
1425
+
ignore (insert_element t name ~push:true attrs)
1426
+
| Token.Tag { kind = Token.End; name; _ } ->
1427
+
(* Any other end tag *)
1428
+
let rec check = function
1429
+
| [] -> ()
1430
+
| node :: rest ->
1431
+
if node.Dom.name = name then begin
1432
+
generate_implied_end_tags t ~except:name ();
1433
+
(match current_node t with
1434
+
| Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
1435
+
| _ -> ());
1436
+
pop_until t (fun n -> n == node)
1437
+
end else if is_special_element node then
1438
+
parse_error t "unexpected-end-tag"
1439
+
else
1440
+
check rest
1441
+
in
1442
+
check t.open_elements
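(* Example ("any other end tag"): for "<span><div>x</span>", the </span> end
   tag walks the stack, meets the special <div> before any <span>, reports a
   parse error and is ignored, so the <span> stays open; when a match is found
   first, implied end tags are generated and the stack is popped through the
   matching element. *)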
1443
+
1444
+
and process_text t token =
1445
+
match token with
1446
+
| Token.Character data ->
1447
+
insert_character t data
1448
+
| Token.EOF ->
1449
+
parse_error t "expected-closing-tag-but-got-eof";
1450
+
pop_current t;
1451
+
t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body;
1452
+
process_token t token
1453
+
| Token.Tag { kind = Token.End; _ } ->
1454
+
pop_current t;
1455
+
t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body
1456
+
| _ -> ()
1457
+
1458
+
and process_in_table t token =
1459
+
match token with
1460
+
| Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) ->
1461
+
t.pending_table_chars <- [];
1462
+
t.original_mode <- Some t.mode;
1463
+
t.mode <- Insertion_mode.In_table_text;
1464
+
process_token t token
1465
+
| Token.Comment data ->
1466
+
insert_comment t data
1467
+
| Token.Doctype _ ->
1468
+
parse_error t "unexpected-doctype"
1469
+
| Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
1470
+
clear_stack_back_to_table_context t;
1471
+
push_formatting_marker t;
1472
+
ignore (insert_element t "caption" ~push:true attrs);
1473
+
t.mode <- Insertion_mode.In_caption
1474
+
| Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
1475
+
clear_stack_back_to_table_context t;
1476
+
ignore (insert_element t "colgroup" ~push:true attrs);
1477
+
t.mode <- Insertion_mode.In_column_group
1478
+
| Token.Tag { kind = Token.Start; name = "col"; _ } ->
1479
+
clear_stack_back_to_table_context t;
1480
+
ignore (insert_element t "colgroup" ~push:true []);
1481
+
t.mode <- Insertion_mode.In_column_group;
1482
+
process_token t token
1483
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1484
+
clear_stack_back_to_table_context t;
1485
+
ignore (insert_element t name ~push:true attrs);
1486
+
t.mode <- Insertion_mode.In_table_body
1487
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
1488
+
clear_stack_back_to_table_context t;
1489
+
ignore (insert_element t "tbody" ~push:true []);
1490
+
t.mode <- Insertion_mode.In_table_body;
1491
+
process_token t token
1492
+
| Token.Tag { kind = Token.Start; name = "table"; _ } ->
1493
+
parse_error t "unexpected-start-tag";
1494
+
if has_element_in_table_scope t "table" then begin
1495
+
pop_until_tag t "table";
1496
+
reset_insertion_mode t;
1497
+
process_token t token
1498
+
end
1499
+
| Token.Tag { kind = Token.End; name = "table"; _ } ->
1500
+
if not (has_element_in_table_scope t "table") then
1501
+
parse_error t "unexpected-end-tag"
1502
+
else begin
1503
+
pop_until_tag t "table";
1504
+
reset_insertion_mode t
1505
+
end
1506
+
| Token.Tag { kind = Token.End; name; _ }
1507
+
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1508
+
parse_error t "unexpected-end-tag"
1509
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
1510
+
process_in_head t token
1511
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
1512
+
process_in_head t token
1513
+
| Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
1514
+
let is_hidden = List.exists (fun (k, v) ->
1515
+
String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1516
+
) attrs in
1517
+
if not is_hidden then begin
1518
+
parse_error t "unexpected-start-tag";
1519
+
t.foster_parenting <- true;
1520
+
process_in_body t token;
1521
+
t.foster_parenting <- false
1522
+
end else begin
1523
+
parse_error t "unexpected-start-tag";
1524
+
ignore (insert_element t "input" ~push:true attrs);
1525
+
pop_current t
1526
+
end
1527
+
| Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
1528
+
parse_error t "unexpected-start-tag";
1529
+
if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
1530
+
let form = insert_element t "form" attrs in
1531
+
t.open_elements <- form :: t.open_elements;
1532
+
t.form_element <- Some form;
1533
+
pop_current t
1534
+
end
1535
+
| Token.EOF ->
1536
+
process_in_body t token
1537
+
| _ ->
1538
+
parse_error t "unexpected-token-in-table";
1539
+
t.foster_parenting <- true;
1540
+
process_in_body t token;
1541
+
t.foster_parenting <- false
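(* Example (foster parenting): for "<table>x<tr>", the "x" is buffered by the
   In_table_text mode below, found to be non-whitespace, reported as a parse
   error and re-inserted with [t.foster_parenting] set; assuming
   appropriate_insertion_place honours that flag as in the spec's
   foster-parenting rule, the text node lands before the <table>, giving
   "x<table><tbody><tr></tr></tbody></table>". *)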
1542
+
1543
+
and clear_stack_back_to_table_context t =
1544
+
let rec loop () =
1545
+
match current_node t with
1546
+
| Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
1547
+
pop_current t;
1548
+
loop ()
1549
+
| _ -> ()
1550
+
in
1551
+
loop ()
1552
+
1553
+
and process_in_table_text t token =
1554
+
match token with
1555
+
| Token.Character data ->
1556
+
if String.contains data '\x00' then
1557
+
parse_error t "unexpected-null-character"
1558
+
else
1559
+
t.pending_table_chars <- data :: t.pending_table_chars
1560
+
| _ ->
1561
+
let pending = String.concat "" (List.rev t.pending_table_chars) in
1562
+
t.pending_table_chars <- [];
1563
+
if not (is_whitespace pending) then begin
1564
+
parse_error t "unexpected-character-in-table";
1565
+
t.foster_parenting <- true;
1566
+
reconstruct_active_formatting t;
1567
+
insert_character t pending;
1568
+
t.foster_parenting <- false
1569
+
end else
1570
+
insert_character t pending;
1571
+
t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_table;
1572
+
process_token t token
1573
+
1574
+
and process_in_caption t token =
1575
+
match token with
1576
+
| Token.Tag { kind = Token.End; name = "caption"; _ } ->
1577
+
if not (has_element_in_table_scope t "caption") then
1578
+
parse_error t "unexpected-end-tag"
1579
+
else begin
1580
+
generate_implied_end_tags t ();
1581
+
(match current_node t with
1582
+
| Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
1583
+
| _ -> ());
1584
+
pop_until_tag t "caption";
1585
+
clear_active_formatting_to_marker t;
1586
+
t.mode <- Insertion_mode.In_table
1587
+
end
1588
+
| Token.Tag { kind = Token.Start; name; _ }
1589
+
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1590
+
if not (has_element_in_table_scope t "caption") then
1591
+
parse_error t "unexpected-start-tag"
1592
+
else begin
1593
+
generate_implied_end_tags t ();
1594
+
pop_until_tag t "caption";
1595
+
clear_active_formatting_to_marker t;
1596
+
t.mode <- Insertion_mode.In_table;
1597
+
process_token t token
1598
+
end
1599
+
| Token.Tag { kind = Token.End; name = "table"; _ } ->
1600
+
if not (has_element_in_table_scope t "caption") then
1601
+
parse_error t "unexpected-end-tag"
1602
+
else begin
1603
+
generate_implied_end_tags t ();
1604
+
pop_until_tag t "caption";
1605
+
clear_active_formatting_to_marker t;
1606
+
t.mode <- Insertion_mode.In_table;
1607
+
process_token t token
1608
+
end
1609
+
| Token.Tag { kind = Token.End; name; _ }
1610
+
when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1611
+
parse_error t "unexpected-end-tag"
1612
+
| _ ->
1613
+
process_in_body t token
1614
+
1615
+
and process_in_column_group t token =
1616
+
match token with
1617
+
| Token.Character data when is_whitespace data ->
1618
+
insert_character t data
1619
+
| Token.Character data ->
1620
+
(* Split leading whitespace from non-whitespace *)
1621
+
let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
1622
+
let len = String.length data in
1623
+
let ws_end = ref 0 in
1624
+
while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
1625
+
if !ws_end > 0 then
1626
+
insert_character t (String.sub data 0 !ws_end);
1627
+
if !ws_end < len then begin
1628
+
let remaining = String.sub data !ws_end (len - !ws_end) in
1629
+
(match current_node t with
1630
+
| Some n when n.Dom.name = "colgroup" ->
1631
+
pop_current t;
1632
+
t.mode <- Insertion_mode.In_table;
1633
+
process_token t (Token.Character remaining)
1634
+
| _ ->
1635
+
parse_error t "unexpected-token")
1636
+
end
1637
+
| Token.Comment data ->
1638
+
insert_comment t data
1639
+
| Token.Doctype _ ->
1640
+
parse_error t "unexpected-doctype"
1641
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
1642
+
process_in_body t token
1643
+
| Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
1644
+
ignore (insert_element t "col" ~push:true attrs);
1645
+
pop_current t
1646
+
| Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
1647
+
(match current_node t with
1648
+
| Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
1649
+
| Some _ -> pop_current t; t.mode <- Insertion_mode.In_table
1650
+
| None -> parse_error t "unexpected-end-tag")
1651
+
| Token.Tag { kind = Token.End; name = "col"; _ } ->
1652
+
parse_error t "unexpected-end-tag"
1653
+
| Token.Tag { kind = Token.Start; name = "template"; _ }
1654
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
1655
+
process_in_head t token
1656
+
| Token.EOF ->
1657
+
process_in_body t token
1658
+
| _ ->
1659
+
(match current_node t with
1660
+
| Some n when n.Dom.name = "colgroup" ->
1661
+
pop_current t;
1662
+
t.mode <- Insertion_mode.In_table;
1663
+
process_token t token
1664
+
| _ ->
1665
+
parse_error t "unexpected-token")
1666
+
1667
+
and process_in_table_body t token =
1668
+
match token with
1669
+
| Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
1670
+
clear_stack_back_to_table_body_context t;
1671
+
ignore (insert_element t "tr" ~push:true attrs);
1672
+
t.mode <- Insertion_mode.In_row
1673
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
1674
+
parse_error t "unexpected-start-tag";
1675
+
clear_stack_back_to_table_body_context t;
1676
+
ignore (insert_element t "tr" ~push:true []);
1677
+
t.mode <- Insertion_mode.In_row;
1678
+
process_token t token
1679
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1680
+
if not (has_element_in_table_scope t name) then
1681
+
parse_error t "unexpected-end-tag"
1682
+
else begin
1683
+
clear_stack_back_to_table_body_context t;
1684
+
pop_current t;
1685
+
t.mode <- Insertion_mode.In_table
1686
+
end
1687
+
| Token.Tag { kind = Token.Start; name; _ }
1688
+
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
1689
+
if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
1690
+
parse_error t "unexpected-start-tag"
1691
+
else begin
1692
+
clear_stack_back_to_table_body_context t;
1693
+
pop_current t;
1694
+
t.mode <- Insertion_mode.In_table;
1695
+
process_token t token
1696
+
end
1697
+
| Token.Tag { kind = Token.End; name = "table"; _ } ->
1698
+
if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
1699
+
parse_error t "unexpected-end-tag"
1700
+
else begin
1701
+
clear_stack_back_to_table_body_context t;
1702
+
pop_current t;
1703
+
t.mode <- Insertion_mode.In_table;
1704
+
process_token t token
1705
+
end
1706
+
| Token.Tag { kind = Token.End; name; _ }
1707
+
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
1708
+
parse_error t "unexpected-end-tag"
1709
+
| _ ->
1710
+
process_in_table t token
1711
+
1712
+
and clear_stack_back_to_table_body_context t =
1713
+
let rec loop () =
1714
+
match current_node t with
1715
+
| Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
1716
+
pop_current t;
1717
+
loop ()
1718
+
| _ -> ()
1719
+
in
1720
+
loop ()
1721
+
1722
+
and process_in_row t token =
1723
+
match token with
1724
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
1725
+
clear_stack_back_to_table_row_context t;
1726
+
ignore (insert_element t name ~push:true attrs);
1727
+
t.mode <- Insertion_mode.In_cell;
1728
+
push_formatting_marker t
1729
+
| Token.Tag { kind = Token.End; name = "tr"; _ } ->
1730
+
if not (has_element_in_table_scope t "tr") then
1731
+
parse_error t "unexpected-end-tag"
1732
+
else begin
1733
+
clear_stack_back_to_table_row_context t;
1734
+
pop_current t;
1735
+
t.mode <- Insertion_mode.In_table_body
1736
+
end
1737
+
| Token.Tag { kind = Token.Start; name; _ }
1738
+
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
1739
+
if not (has_element_in_table_scope t "tr") then
1740
+
parse_error t "unexpected-start-tag"
1741
+
else begin
1742
+
clear_stack_back_to_table_row_context t;
1743
+
pop_current t;
1744
+
t.mode <- Insertion_mode.In_table_body;
1745
+
process_token t token
1746
+
end
1747
+
| Token.Tag { kind = Token.End; name = "table"; _ } ->
1748
+
if not (has_element_in_table_scope t "tr") then
1749
+
parse_error t "unexpected-end-tag"
1750
+
else begin
1751
+
clear_stack_back_to_table_row_context t;
1752
+
pop_current t;
1753
+
t.mode <- Insertion_mode.In_table_body;
1754
+
process_token t token
1755
+
end
1756
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
1757
+
if not (has_element_in_table_scope t name) then
1758
+
parse_error t "unexpected-end-tag"
1759
+
else if not (has_element_in_table_scope t "tr") then
1760
+
parse_error t "unexpected-end-tag"
1761
+
else begin
1762
+
clear_stack_back_to_table_row_context t;
1763
+
pop_current t;
1764
+
t.mode <- Insertion_mode.In_table_body;
1765
+
process_token t token
1766
+
end
1767
+
| Token.Tag { kind = Token.End; name; _ }
1768
+
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
1769
+
parse_error t "unexpected-end-tag"
1770
+
| _ ->
1771
+
process_in_table t token
1772
+
1773
+
and clear_stack_back_to_table_row_context t =
1774
+
let rec loop () =
1775
+
match current_node t with
1776
+
| Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
1777
+
pop_current t;
1778
+
loop ()
1779
+
| _ -> ()
1780
+
in
1781
+
loop ()
1782
+
1783
+
and process_in_cell t token =
1784
+
match token with
1785
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] ->
1786
+
if not (has_element_in_table_scope t name) then
1787
+
parse_error t "unexpected-end-tag"
1788
+
else begin
1789
+
generate_implied_end_tags t ();
1790
+
(match current_node t with
1791
+
| Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1792
+
| _ -> ());
1793
+
pop_until_html_tag t name;
1794
+
clear_active_formatting_to_marker t;
1795
+
t.mode <- Insertion_mode.In_row
1796
+
end
1797
+
| Token.Tag { kind = Token.Start; name; _ }
1798
+
when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
1799
+
if not (has_element_in_scope_impl t ["td"; "th"] Constants.table_scope ~check_integration_points:false) then
1800
+
parse_error t "unexpected-start-tag"
1801
+
else begin
1802
+
close_cell t;
1803
+
process_token t token
1804
+
end
1805
+
| Token.Tag { kind = Token.End; name; _ }
1806
+
when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1807
+
parse_error t "unexpected-end-tag"
1808
+
| Token.Tag { kind = Token.End; name; _ }
1809
+
when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] ->
1810
+
if not (has_element_in_table_scope t name) then
1811
+
parse_error t "unexpected-end-tag"
1812
+
else begin
1813
+
close_cell t;
1814
+
process_token t token
1815
+
end
1816
+
| _ ->
1817
+
process_in_body t token
1818
+
1819
+
and close_cell t =
1820
+
generate_implied_end_tags t ();
1821
+
(match current_node t with
1822
+
| Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1823
+
| _ -> ());
1824
+
pop_until_html_one_of t ["td"; "th"];
1825
+
clear_active_formatting_to_marker t;
1826
+
t.mode <- Insertion_mode.In_row
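(* Illustrative example: for "<table><tr><td>A<td>B", the second <td> start tag
   arrives in the "in cell" mode; close_cell generates implied end tags, pops
   back through the open <td>, clears formatting entries down to the last
   marker and returns to "in row", after which the new cell is inserted. *)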
1827
+
1828
+
and process_in_select t token =
1829
+
match token with
1830
+
| Token.Character "\x00" ->
1831
+
parse_error t "unexpected-null-character"
1832
+
| Token.Character data ->
1833
+
reconstruct_active_formatting t;
1834
+
insert_character t data
1835
+
| Token.Comment data ->
1836
+
insert_comment t data
1837
+
| Token.Doctype _ ->
1838
+
parse_error t "unexpected-doctype"
1839
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
1840
+
process_in_body t token
1841
+
| Token.Tag { kind = Token.Start; name = "option"; attrs; _ } ->
1842
+
(match current_node t with
1843
+
| Some n when n.Dom.name = "option" -> pop_current t
1844
+
| _ -> ());
1845
+
reconstruct_active_formatting t;
1846
+
ignore (insert_element t "option" ~push:true attrs)
1847
+
| Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } ->
1848
+
(match current_node t with
1849
+
| Some n when n.Dom.name = "option" -> pop_current t
1850
+
| _ -> ());
1851
+
(match current_node t with
1852
+
| Some n when n.Dom.name = "optgroup" -> pop_current t
1853
+
| _ -> ());
1854
+
reconstruct_active_formatting t;
1855
+
ignore (insert_element t "optgroup" ~push:true attrs)
1856
+
| Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1857
+
(match current_node t with
1858
+
| Some n when n.Dom.name = "option" -> pop_current t
1859
+
| _ -> ());
1860
+
(match current_node t with
1861
+
| Some n when n.Dom.name = "optgroup" -> pop_current t
1862
+
| _ -> ());
1863
+
ignore (insert_element t "hr" ~push:true []);
1864
+
pop_current t
1865
+
| Token.Tag { kind = Token.End; name = "optgroup"; _ } ->
1866
+
(match t.open_elements with
1867
+
| opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" ->
1868
+
pop_current t
1869
+
| _ -> ());
1870
+
(match current_node t with
1871
+
| Some n when n.Dom.name = "optgroup" -> pop_current t
1872
+
| _ -> parse_error t "unexpected-end-tag")
1873
+
| Token.Tag { kind = Token.End; name = "option"; _ } ->
1874
+
(match current_node t with
1875
+
| Some n when n.Dom.name = "option" -> pop_current t
1876
+
| _ -> parse_error t "unexpected-end-tag")
1877
+
| Token.Tag { kind = Token.End; name = "select"; _ } ->
1878
+
if not (has_element_in_select_scope t "select") then
1879
+
parse_error t "unexpected-end-tag"
1880
+
else begin
1881
+
pop_until_tag t "select";
1882
+
reset_insertion_mode t
1883
+
end
1884
+
| Token.Tag { kind = Token.Start; name = "select"; _ } ->
1885
+
parse_error t "unexpected-start-tag";
1886
+
(* A select element is on the stack except in the fragment case; pop back to it and reset the mode *)
1887
+
pop_until_tag t "select";
1888
+
reset_insertion_mode t
1889
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] ->
1890
+
parse_error t "unexpected-start-tag";
1891
+
(* As above: pop back to the select element, reset the mode, then reprocess the offending token *)
1892
+
pop_until_tag t "select";
1893
+
reset_insertion_mode t;
1894
+
process_token t token
1895
+
| Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } ->
1896
+
(* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *)
1897
+
reconstruct_active_formatting t;
1898
+
ignore (insert_element t "plaintext" ~push:true attrs)
1899
+
| Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } ->
1900
+
(* menuitem is allowed in select *)
1901
+
reconstruct_active_formatting t;
1902
+
ignore (insert_element t "menuitem" ~push:true attrs)
1903
+
| Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } ->
1904
+
(* keygen is handled specially in select - inserted directly *)
1905
+
reconstruct_active_formatting t;
1906
+
ignore (insert_element t "keygen" attrs)
1907
+
(* Void element, don't push to stack *)
1908
+
| Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } ->
1909
+
reconstruct_active_formatting t;
1910
+
let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in
1911
+
if not self_closing then t.open_elements <- node :: t.open_elements
1912
+
| Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } ->
1913
+
reconstruct_active_formatting t;
1914
+
let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in
1915
+
if not self_closing then t.open_elements <- node :: t.open_elements
1916
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] ->
1917
+
process_in_head t token
1918
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
1919
+
process_in_head t token
1920
+
(* Allow certain HTML elements in select - newer spec behavior *)
1921
+
| Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1922
+
reconstruct_active_formatting t;
1923
+
let node = insert_element t name attrs in
1924
+
if not self_closing then t.open_elements <- node :: t.open_elements
1925
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] ->
1926
+
reconstruct_active_formatting t;
1927
+
ignore (insert_element t name attrs)
1928
+
(* Don't push to stack - void elements *)
1929
+
(* Handle formatting elements in select *)
1930
+
| Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.formatting_elements ->
1931
+
reconstruct_active_formatting t;
1932
+
let node = insert_element t name ~push:true attrs in
1933
+
push_formatting_element t node name attrs
1934
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.formatting_elements ->
1935
+
(* Find select element and check if formatting element is inside select *)
1936
+
let select_idx = ref None in
1937
+
let fmt_idx = ref None in
1938
+
List.iteri (fun i n ->
1939
+
if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1940
+
if n.Dom.name = name then fmt_idx := Some i
1941
+
) t.open_elements;
1942
+
(match !fmt_idx, !select_idx with
1943
+
| Some fi, Some si when fi < si ->
1944
+
(* Formatting element is inside select, run adoption agency *)
1945
+
adoption_agency t name
1946
+
| Some _, Some _ ->
1947
+
(* Formatting element is outside select boundary - parse error, ignore *)
1948
+
parse_error t "unexpected-end-tag"
1949
+
| Some _, None ->
1950
+
adoption_agency t name
1951
+
| None, _ ->
1952
+
parse_error t "unexpected-end-tag")
1953
+
(* End tags for HTML elements allowed in select *)
1954
+
| Token.Tag { kind = Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] ->
1955
+
(* Find select and target indices *)
1956
+
let select_idx = ref None in
1957
+
let target_idx = ref None in
1958
+
List.iteri (fun i n ->
1959
+
if n.Dom.name = "select" && !select_idx = None then select_idx := Some i;
1960
+
if n.Dom.name = name then target_idx := Some i
1961
+
) t.open_elements;
1962
+
(* Only pop if target exists and is inside select (lower index = closer to current) *)
1963
+
(match !target_idx, !select_idx with
1964
+
| Some ti, Some si when ti < si ->
1965
+
(* Pop until we reach the target *)
1966
+
let rec pop_to_target () =
1967
+
match t.open_elements with
1968
+
| [] -> ()
1969
+
| n :: rest ->
1970
+
t.open_elements <- rest;
1971
+
if n.Dom.name <> name then pop_to_target ()
1972
+
in
1973
+
pop_to_target ()
1974
+
| Some _, Some _ ->
1975
+
parse_error t "unexpected-end-tag"
1976
+
| Some _, None ->
1977
+
(* No select on stack, just pop to target *)
1978
+
let rec pop_to_target () =
1979
+
match t.open_elements with
1980
+
| [] -> ()
1981
+
| n :: rest ->
1982
+
t.open_elements <- rest;
1983
+
if n.Dom.name <> name then pop_to_target ()
1984
+
in
1985
+
pop_to_target ()
1986
+
| None, _ ->
1987
+
parse_error t "unexpected-end-tag")
1988
+
| Token.EOF ->
1989
+
process_in_body t token
1990
+
| _ ->
1991
+
parse_error t "unexpected-token-in-select"
1992
+
1993
+
and process_in_select_in_table t token =
1994
+
match token with
1995
+
| Token.Tag { kind = Token.Start; name; _ }
1996
+
when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
1997
+
parse_error t "unexpected-start-tag";
1998
+
pop_until_tag t "select";
1999
+
reset_insertion_mode t;
2000
+
process_token t token
2001
+
| Token.Tag { kind = Token.End; name; _ }
2002
+
when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] ->
2003
+
parse_error t "unexpected-end-tag";
2004
+
if has_element_in_table_scope t name then begin
2005
+
pop_until_tag t "select";
2006
+
reset_insertion_mode t;
2007
+
process_token t token
2008
+
end
2009
+
| _ ->
2010
+
process_in_select t token
2011
+
2012
+
and process_in_template t token =
2013
+
match token with
2014
+
| Token.Character _ | Token.Comment _ | Token.Doctype _ ->
2015
+
process_in_body t token
2016
+
| Token.Tag { kind = Token.Start; name; _ }
2017
+
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
2018
+
process_in_head t token
2019
+
| Token.Tag { kind = Token.End; name = "template"; _ } ->
2020
+
process_in_head t token
2021
+
| Token.Tag { kind = Token.Start; name; _ }
2022
+
when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
2023
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2024
+
t.template_modes <- Insertion_mode.In_table :: t.template_modes;
2025
+
t.mode <- Insertion_mode.In_table;
2026
+
process_token t token
2027
+
| Token.Tag { kind = Token.Start; name = "col"; _ } ->
2028
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2029
+
t.template_modes <- Insertion_mode.In_column_group :: t.template_modes;
2030
+
t.mode <- Insertion_mode.In_column_group;
2031
+
process_token t token
2032
+
| Token.Tag { kind = Token.Start; name = "tr"; _ } ->
2033
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2034
+
t.template_modes <- Insertion_mode.In_table_body :: t.template_modes;
2035
+
t.mode <- Insertion_mode.In_table_body;
2036
+
process_token t token
2037
+
| Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] ->
2038
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2039
+
t.template_modes <- Insertion_mode.In_row :: t.template_modes;
2040
+
t.mode <- Insertion_mode.In_row;
2041
+
process_token t token
2042
+
| Token.Tag { kind = Token.Start; _ } ->
2043
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2044
+
t.template_modes <- Insertion_mode.In_body :: t.template_modes;
2045
+
t.mode <- Insertion_mode.In_body;
2046
+
process_token t token
2047
+
| Token.Tag { kind = Token.End; _ } ->
2048
+
parse_error t "unexpected-end-tag"
2049
+
| Token.EOF ->
2050
+
if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then
2051
+
() (* Stop parsing *)
2052
+
else begin
2053
+
parse_error t "expected-closing-tag-but-got-eof";
2054
+
pop_until_html_tag t "template";
2055
+
clear_active_formatting_to_marker t;
2056
+
t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2057
+
reset_insertion_mode t;
2058
+
process_token t token
2059
+
end
2060
+
2061
+
and process_after_body t token =
2062
+
match token with
2063
+
| Token.Character data when is_whitespace data ->
2064
+
process_in_body t token
2065
+
| Token.Comment data ->
2066
+
(* Insert as last child of html element - html is at bottom of stack *)
2067
+
let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
2068
+
(match html_opt with
2069
+
| Some html -> Dom.append_child html (Dom.create_comment data)
2070
+
| None -> ())
2071
+
| Token.Doctype _ ->
2072
+
parse_error t "unexpected-doctype"
2073
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
2074
+
process_in_body t token
2075
+
| Token.Tag { kind = Token.End; name = "html"; _ } ->
2076
+
if t.fragment_context <> None then
2077
+
parse_error t "unexpected-end-tag"
2078
+
else
2079
+
t.mode <- Insertion_mode.After_after_body
2080
+
| Token.EOF ->
2081
+
() (* Stop parsing *)
2082
+
| _ ->
2083
+
parse_error t "unexpected-token-after-body";
2084
+
t.mode <- Insertion_mode.In_body;
2085
+
process_token t token
2086
+
2087
+
and process_in_frameset t token =
2088
+
match token with
2089
+
| Token.Character data ->
2090
+
(* Extract only whitespace characters and insert them *)
2091
+
let whitespace = String.to_seq data
2092
+
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2093
+
|> String.of_seq in
2094
+
if whitespace <> "" then insert_character t whitespace;
2095
+
if not (is_whitespace data) then
2096
+
parse_error t "unexpected-char-in-frameset"
2097
+
| Token.Comment data ->
2098
+
insert_comment t data
2099
+
| Token.Doctype _ ->
2100
+
parse_error t "unexpected-doctype"
2101
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
2102
+
process_in_body t token
2103
+
| Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
2104
+
ignore (insert_element t "frameset" ~push:true attrs)
2105
+
| Token.Tag { kind = Token.End; name = "frameset"; _ } ->
2106
+
(match current_node t with
2107
+
| Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag"
2108
+
| _ ->
2109
+
pop_current t;
2110
+
if t.fragment_context = None then
2111
+
(match current_node t with
2112
+
| Some n when n.Dom.name <> "frameset" -> t.mode <- Insertion_mode.After_frameset
2113
+
| _ -> ()))
2114
+
| Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } ->
2115
+
ignore (insert_element t "frame" ~push:true attrs);
2116
+
pop_current t
2117
+
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2118
+
process_in_head t token
2119
+
| Token.EOF ->
2120
+
(match current_node t with
2121
+
| Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof"
2122
+
| _ -> ())
2123
+
| _ ->
2124
+
parse_error t "unexpected-token-in-frameset"
2125
+
2126
+
and process_after_frameset t token =
2127
+
match token with
2128
+
| Token.Character data ->
2129
+
(* Extract only whitespace characters and insert them *)
2130
+
let whitespace = String.to_seq data
2131
+
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2132
+
|> String.of_seq in
2133
+
if whitespace <> "" then insert_character t whitespace;
2134
+
if not (is_whitespace data) then
2135
+
parse_error t "unexpected-char-after-frameset"
2136
+
| Token.Comment data ->
2137
+
insert_comment t data
2138
+
| Token.Doctype _ ->
2139
+
parse_error t "unexpected-doctype"
2140
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
2141
+
process_in_body t token
2142
+
| Token.Tag { kind = Token.End; name = "html"; _ } ->
2143
+
t.mode <- Insertion_mode.After_after_frameset
2144
+
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2145
+
process_in_head t token
2146
+
| Token.EOF ->
2147
+
() (* Stop parsing *)
2148
+
| _ ->
2149
+
parse_error t "unexpected-token-after-frameset"
2150
+
2151
+
and process_after_after_body t token =
2152
+
match token with
2153
+
| Token.Comment data ->
2154
+
insert_comment_to_document t data
2155
+
| Token.Doctype _ ->
2156
+
process_in_body t token
2157
+
| Token.Character data when is_whitespace data ->
2158
+
process_in_body t token
2159
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
2160
+
process_in_body t token
2161
+
| Token.EOF ->
2162
+
() (* Stop parsing *)
2163
+
| _ ->
2164
+
parse_error t "unexpected-token-after-after-body";
2165
+
t.mode <- Insertion_mode.In_body;
2166
+
process_token t token
2167
+
2168
+
and process_after_after_frameset t token =
2169
+
match token with
2170
+
| Token.Comment data ->
2171
+
insert_comment_to_document t data
2172
+
| Token.Doctype _ ->
2173
+
process_in_body t token
2174
+
| Token.Character data ->
2175
+
(* Extract only whitespace characters and process using in_body rules *)
2176
+
let whitespace = String.to_seq data
2177
+
|> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' '])
2178
+
|> String.of_seq in
2179
+
if whitespace <> "" then process_in_body t (Token.Character whitespace);
2180
+
if not (is_whitespace data) then
2181
+
parse_error t "unexpected-char-after-after-frameset"
2182
+
| Token.Tag { kind = Token.Start; name = "html"; _ } ->
2183
+
process_in_body t token
2184
+
| Token.EOF ->
2185
+
() (* Stop parsing *)
2186
+
| Token.Tag { kind = Token.Start; name = "noframes"; _ } ->
2187
+
process_in_head t token
2188
+
| _ ->
2189
+
parse_error t "unexpected-token-after-after-frameset"
2190
+
2191
+
and process_token t token =
2192
+
(* Check for HTML integration points (SVG foreignObject, desc, title) *)
2193
+
let is_html_integration_point node =
2194
+
(* SVG foreignObject, desc, and title are always HTML integration points *)
2195
+
if node.Dom.namespace = Some "svg" &&
2196
+
List.mem node.Dom.name Constants.svg_html_integration then true
2197
+
(* annotation-xml is an HTML integration point only with specific encoding values *)
2198
+
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2199
+
match List.assoc_opt "encoding" node.Dom.attrs with
2200
+
| Some enc ->
2201
+
let enc_lower = String.lowercase_ascii enc in
2202
+
enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2203
+
| None -> false
2204
+
else false
2205
+
in
2206
+
(* Check for MathML text integration points *)
2207
+
let is_mathml_text_integration_point node =
2208
+
node.Dom.namespace = Some "mathml" &&
2209
+
List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
2210
+
in
2211
+
(* Foreign content handling *)
2212
+
let in_foreign =
2213
+
match adjusted_current_node t with
2214
+
| None -> false
2215
+
| Some node ->
2216
+
if is_in_html_namespace node then false
2217
+
else begin
2218
+
(* At HTML integration points, character tokens and all start tags are handled with the regular HTML rules *)
2219
+
if is_html_integration_point node then begin
2220
+
match token with
2221
+
| Token.Character _ -> false
2222
+
| Token.Tag { kind = Token.Start; _ } -> false
2223
+
| _ -> true
2224
+
end
2225
+
(* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *)
2226
+
else if is_mathml_text_integration_point node then begin
2227
+
match token with
2228
+
| Token.Character _ -> false
2229
+
| Token.Tag { kind = Token.Start; name; _ } ->
2230
+
name = "mglyph" || name = "malignmark"
2231
+
| _ -> true
2232
+
end
2233
+
(* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *)
2234
+
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin
2235
+
match token with
2236
+
| Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false
2237
+
| _ -> true
2238
+
end
2239
+
(* Not at integration point - use foreign content rules *)
2240
+
(* Breakout handling is done inside process_foreign_content *)
2241
+
else true
2242
+
end
2243
+
in
2244
+
2245
+
(* Check if at HTML integration point for special table mode handling *)
2246
+
let at_integration_point =
2247
+
match adjusted_current_node t with
2248
+
| Some node ->
2249
+
is_html_integration_point node || is_mathml_text_integration_point node
2250
+
| None -> false
2251
+
in
2252
+
2253
+
if in_foreign then
2254
+
process_foreign_content t token
2255
+
else if at_integration_point then begin
2256
+
(* At integration points, check if in table mode without table in scope *)
2257
+
let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in
2258
+
let has_table = has_element_in_table_scope t "table" in
2259
+
if is_table_mode && not has_table then begin
2260
+
match token with
2261
+
| Token.Tag { kind = Token.Start; _ } ->
2262
+
(* Temporarily use IN_BODY for start tags in table mode without table *)
2263
+
let saved_mode = t.mode in
2264
+
t.mode <- In_body;
2265
+
process_by_mode t token;
2266
+
if t.mode = In_body then t.mode <- saved_mode
2267
+
| _ -> process_by_mode t token
2268
+
end else
2269
+
process_by_mode t token
2270
+
end else
2271
+
process_by_mode t token
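(* Dispatch examples for the checks above (illustrative only):
   - "<math><mi><b>x</b></mi></math>": when <b> arrives the adjusted current
     node is the MathML <mi>, a text integration point, so in_foreign is false
     and the tag is handled by the current insertion mode as ordinary HTML;
   - "<svg><rect/>": <svg> is not an integration point and "rect" is not a
     breakout name, so the token goes to process_foreign_content and the
     element is created in the SVG namespace;
   - "<math><annotation-xml encoding="text/html"><p>": that annotation-xml is
     an HTML integration point, so the <p> start tag uses the HTML rules. *)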
2272
+
2273
+
(* Pop foreign elements until HTML or integration point *)
2274
+
and pop_until_html_or_integration_point t =
2275
+
let is_html_integration_point node =
2276
+
(* SVG foreignObject, desc, and title are always HTML integration points *)
2277
+
if node.Dom.namespace = Some "svg" &&
2278
+
List.mem node.Dom.name Constants.svg_html_integration then true
2279
+
(* annotation-xml is an HTML integration point only with specific encoding values *)
2280
+
else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
2281
+
match List.assoc_opt "encoding" node.Dom.attrs with
2282
+
| Some enc ->
2283
+
let enc_lower = String.lowercase_ascii enc in
2284
+
enc_lower = "text/html" || enc_lower = "application/xhtml+xml"
2285
+
| None -> false
2286
+
else false
2287
+
in
2288
+
(* Get fragment context element - only for foreign namespace fragment contexts *)
2289
+
let fragment_context_elem = t.fragment_context_element in
2290
+
let rec pop () =
2291
+
match current_node t with
2292
+
| None -> ()
2293
+
| Some node ->
2294
+
if is_in_html_namespace node then ()
2295
+
else if is_html_integration_point node then ()
2296
+
(* Don't pop past fragment context element *)
2297
+
else (match fragment_context_elem with
2298
+
| Some ctx when node == ctx -> ()
2299
+
| _ ->
2300
+
pop_current t;
2301
+
pop ())
2302
+
in
2303
+
pop ()
2304
+
2305
+
(* Foreign breakout elements - these break out of foreign content *)
2306
+
and is_foreign_breakout_element name =
2307
+
List.mem (String.lowercase_ascii name)
2308
+
["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt";
2309
+
"em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li";
2310
+
"listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span";
2311
+
"strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"]
2312
+
2313
+
and process_foreign_content t token =
2314
+
match token with
2315
+
| Token.Character "\x00" ->
2316
+
parse_error t "unexpected-null-character";
2317
+
insert_character t "\xEF\xBF\xBD"
2318
+
| Token.Character data when is_whitespace data ->
2319
+
insert_character t data
2320
+
| Token.Character data ->
2321
+
insert_character t data;
2322
+
t.frameset_ok <- false
2323
+
| Token.Comment data ->
2324
+
insert_comment t data
2325
+
| Token.Doctype _ ->
2326
+
parse_error t "unexpected-doctype"
2327
+
| Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name ->
2328
+
(* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *)
2329
+
parse_error t "unexpected-html-element-in-foreign-content";
2330
+
pop_until_html_or_integration_point t;
2331
+
reset_insertion_mode t;
2332
+
(* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2333
+
process_by_mode t token
2334
+
| Token.Tag { kind = Token.Start; name = "font"; attrs; _ }
2335
+
when List.exists (fun (n, _) ->
2336
+
let n = String.lowercase_ascii n in
2337
+
n = "color" || n = "face" || n = "size") attrs ->
2338
+
(* font with color/face/size breaks out of foreign content *)
2339
+
parse_error t "unexpected-html-element-in-foreign-content";
2340
+
pop_until_html_or_integration_point t;
2341
+
reset_insertion_mode t;
2342
+
process_by_mode t token
2343
+
| Token.Tag { kind = Token.Start; name; attrs; self_closing } ->
2344
+
let name =
2345
+
match adjusted_current_node t with
2346
+
| Some n when n.Dom.namespace = Some "svg" -> Constants.adjust_svg_tag_name name
2347
+
| _ -> name
2348
+
in
2349
+
let attrs =
2350
+
match adjusted_current_node t with
2351
+
| Some n when n.Dom.namespace = Some "svg" ->
2352
+
Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs)
2353
+
| Some n when n.Dom.namespace = Some "mathml" ->
2354
+
Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs)
2355
+
| _ -> Constants.adjust_foreign_attrs attrs
2356
+
in
2357
+
let namespace =
2358
+
match adjusted_current_node t with
2359
+
| Some n -> n.Dom.namespace
2360
+
| None -> None
2361
+
in
2362
+
let node = insert_element t name ~namespace attrs in
2363
+
t.open_elements <- node :: t.open_elements;
2364
+
if self_closing then pop_current t
2365
+
| Token.Tag { kind = Token.End; name; _ } when List.mem (String.lowercase_ascii name) ["br"; "p"] ->
2366
+
(* Special case: </br> and </p> end tags trigger breakout from foreign content *)
2367
+
parse_error t "unexpected-html-element-in-foreign-content";
2368
+
pop_until_html_or_integration_point t;
2369
+
reset_insertion_mode t;
2370
+
(* Use process_by_mode to force HTML mode processing and avoid infinite loop *)
2371
+
process_by_mode t token
2372
+
| Token.Tag { kind = Token.End; name; _ } ->
2373
+
(* Find matching element per WHATWG spec for foreign content *)
2374
+
let is_fragment_context n =
2375
+
match t.fragment_context_element with
2376
+
| Some ctx -> n == ctx
2377
+
| None -> false
2378
+
in
2379
+
let name_lower = String.lowercase_ascii name in
2380
+
(* Walk through stack looking for matching element *)
2381
+
let rec find_and_process first_node idx = function
2382
+
| [] -> () (* Stack exhausted - ignore tag *)
2383
+
| n :: rest ->
2384
+
let node_name_lower = String.lowercase_ascii n.Dom.name in
2385
+
let is_html = is_in_html_namespace n in
2386
+
let name_matches = node_name_lower = name_lower in
2387
+
2388
+
(* If first node doesn't match tag name, it's a parse error *)
2389
+
if first_node && not name_matches then
2390
+
parse_error t "unexpected-end-tag-in-foreign-content";
2391
+
2392
+
(* Check if this node matches the end tag *)
2393
+
if name_matches then begin
2394
+
(* Fragment context check *)
2395
+
if is_fragment_context n then
2396
+
parse_error t "unexpected-end-tag-in-fragment-context"
2397
+
(* If matched element is in HTML namespace, reprocess via HTML mode *)
2398
+
else if is_html then
2399
+
process_by_mode t token
2400
+
(* Otherwise it's a foreign element - pop everything from this point up *)
2401
+
else begin
2402
+
(* Pop all elements from current down to and including the matched element *)
2403
+
let rec pop_to_idx current_idx =
2404
+
if current_idx >= idx then begin
2405
+
pop_current t;
2406
+
pop_to_idx (current_idx - 1)
2407
+
end
2408
+
in
2409
+
pop_to_idx (List.length t.open_elements - 1)
2410
+
end
2411
+
end
2412
+
(* If we hit an HTML element that doesn't match, process via HTML mode *)
2413
+
else if is_html then
2414
+
process_by_mode t token
2415
+
(* Continue searching in the stack *)
2416
+
else
2417
+
find_and_process false (idx - 1) rest
2418
+
in
2419
+
find_and_process true (List.length t.open_elements - 1) t.open_elements
2420
+
| Token.EOF ->
2421
+
process_by_mode t token
2422
+
2423
+
and process_by_mode t token =
2424
+
match t.mode with
2425
+
| Insertion_mode.Initial -> process_initial t token
2426
+
| Insertion_mode.Before_html -> process_before_html t token
2427
+
| Insertion_mode.Before_head -> process_before_head t token
2428
+
| Insertion_mode.In_head -> process_in_head t token
2429
+
| Insertion_mode.In_head_noscript -> process_in_head_noscript t token
2430
+
| Insertion_mode.After_head -> process_after_head t token
2431
+
| Insertion_mode.In_body -> process_in_body t token
2432
+
| Insertion_mode.Text -> process_text t token
2433
+
| Insertion_mode.In_table -> process_in_table t token
2434
+
| Insertion_mode.In_table_text -> process_in_table_text t token
2435
+
| Insertion_mode.In_caption -> process_in_caption t token
2436
+
| Insertion_mode.In_column_group -> process_in_column_group t token
2437
+
| Insertion_mode.In_table_body -> process_in_table_body t token
2438
+
| Insertion_mode.In_row -> process_in_row t token
2439
+
| Insertion_mode.In_cell -> process_in_cell t token
2440
+
| Insertion_mode.In_select -> process_in_select t token
2441
+
| Insertion_mode.In_select_in_table -> process_in_select_in_table t token
2442
+
| Insertion_mode.In_template -> process_in_template t token
2443
+
| Insertion_mode.After_body -> process_after_body t token
2444
+
| Insertion_mode.In_frameset -> process_in_frameset t token
2445
+
| Insertion_mode.After_frameset -> process_after_frameset t token
2446
+
| Insertion_mode.After_after_body -> process_after_after_body t token
2447
+
| Insertion_mode.After_after_frameset -> process_after_after_frameset t token
2448
+
2449
+
(* Populate selectedcontent elements with content from selected option *)
2450
+
let find_elements name node =
2451
+
let result = ref [] in
2452
+
let rec find n =
2453
+
if n.Dom.name = name then result := n :: !result;
2454
+
List.iter find n.Dom.children
2455
+
in
2456
+
find node;
2457
+
List.rev !result (* Reverse to maintain document order *)
2458
+
2459
+
let find_element name node =
2460
+
let rec find n =
2461
+
if n.Dom.name = name then Some n
2462
+
else
2463
+
List.find_map find n.Dom.children
2464
+
in
2465
+
find node
2466
+
2467
+
let populate_selectedcontent document =
2468
+
let selects = find_elements "select" document in
2469
+
List.iter (fun select ->
2470
+
match find_element "selectedcontent" select with
2471
+
| None -> ()
2472
+
| Some selectedcontent ->
2473
+
let options = find_elements "option" select in
2474
+
if options <> [] then begin
2475
+
(* Find selected option or use first *)
2476
+
let selected_option =
2477
+
match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with
2478
+
| Some opt -> opt
2479
+
| None -> List.hd options
2480
+
in
2481
+
(* Clone children from selected option to selectedcontent *)
2482
+
List.iter (fun child ->
2483
+
let cloned = Dom.clone ~deep:true child in
2484
+
Dom.append_child selectedcontent cloned
2485
+
) selected_option.Dom.children
2486
+
end
2487
+
) selects
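(* Illustrative example of the mirroring above: for
   "<select><button><selectedcontent></selectedcontent></button><option selected>Two</option></select>",
   the children of the selected option are deep-cloned into the selectedcontent
   element after parsing, so it ends up holding a copy of the text node "Two". *)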
2488
+
2489
+
let finish t =
2490
+
(* Populate selectedcontent elements *)
2491
+
populate_selectedcontent t.document;
2492
+
(* For fragment parsing, remove the html wrapper and promote children *)
2493
+
if t.fragment_context <> None then begin
2494
+
match t.document.Dom.children with
2495
+
| [root] when root.Dom.name = "html" ->
2496
+
(* Move context element's children to root if applicable *)
2497
+
(match t.fragment_context_element with
2498
+
| Some ctx_elem ->
2499
+
(match ctx_elem.Dom.parent with
2500
+
| Some p when p == root ->
2501
+
let ctx_children = ctx_elem.Dom.children in
2502
+
List.iter (fun child ->
2503
+
Dom.remove_child ctx_elem child;
2504
+
Dom.append_child root child
2505
+
) ctx_children;
2506
+
Dom.remove_child root ctx_elem
2507
+
| _ -> ())
2508
+
| None -> ());
2509
+
(* Promote root's children to document - preserve order *)
2510
+
let children_copy = root.Dom.children in
2511
+
List.iter (fun child ->
2512
+
Dom.remove_child root child;
2513
+
Dom.append_child t.document child
2514
+
) children_copy;
2515
+
Dom.remove_child t.document root
2516
+
| _ -> ()
2517
+
end;
2518
+
t.document
2519
+
2520
+
let get_errors t = List.rev t.errors
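Taken together, process_token pushes each token through the current insertion mode, finish runs the selectedcontent pass and unwraps fragment results, and get_errors returns the recorded parse errors in source order. The sketch below shows how a caller might drive this stage; the Treebuilder module path, its create function and the tokens value are assumptions made for illustration, while process_token, finish and get_errors are the functions defined above.

(* Hypothetical driver; [Treebuilder.create] and [tokens] are assumed names. *)
let document, errors =
  let t = Treebuilder.create () in
  List.iter (Treebuilder.process_token t) tokens;
  (Treebuilder.finish t, Treebuilder.get_errors t)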
+4
lib/selector/dune
+12
lib/selector/html5rw_selector.ml
···
1
+
(* html5rw.selector - CSS selector engine *)
2
+
3
+
exception Selector_error = Selector_lexer.Selector_error
4
+
5
+
module Ast = Selector_ast
6
+
module Token = Selector_token
7
+
8
+
let parse = Selector_parser.parse_selector
9
+
10
+
let query = Selector_match.query
11
+
12
+
let matches = Selector_match.matches
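This facade re-exports the selector pipeline: parse builds an AST from a selector string, query returns every matching element under a node, and matches tests a single node. A small usage sketch, assuming doc is a node produced by the html5rw parser and that the library exposes this file as the Html5rw_selector module:

(* [doc] and the Html5rw_selector module path are assumptions for illustration. *)
let () =
  let links = Html5rw_selector.query doc "a[href^='https://']" in
  Printf.printf "%d https links\n" (List.length links);
  List.iter (fun a -> assert (Html5rw_selector.matches a "a")) links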
+47
lib/selector/selector_ast.ml
···
1
+
(* CSS selector AST types *)
2
+
3
+
type simple_selector_type =
4
+
| Type_tag
5
+
| Type_id
6
+
| Type_class
7
+
| Type_universal
8
+
| Type_attr
9
+
| Type_pseudo
10
+
11
+
type simple_selector = {
12
+
selector_type : simple_selector_type;
13
+
name : string option;
14
+
operator : string option;
15
+
value : string option;
16
+
arg : string option;
17
+
}
18
+
19
+
type compound_selector = {
20
+
selectors : simple_selector list;
21
+
}
22
+
23
+
type complex_selector = {
24
+
parts : (string option * compound_selector) list;
25
+
(* List of (combinator, compound_selector) pairs.
26
+
First element has combinator = None *)
27
+
}
28
+
29
+
type selector_list = {
30
+
selectors : complex_selector list;
31
+
}
32
+
33
+
type selector =
34
+
| Simple of simple_selector
35
+
| Compound of compound_selector
36
+
| Complex of complex_selector
37
+
| List of selector_list
38
+
39
+
(* Constructors *)
40
+
let make_simple selector_type ?name ?operator ?value ?arg () =
41
+
{ selector_type; name; operator; value; arg }
42
+
43
+
let make_compound (selectors : simple_selector list) : compound_selector = { selectors }
44
+
45
+
let make_complex parts : complex_selector = { parts }
46
+
47
+
let make_list (selectors : complex_selector list) : selector_list = { selectors }
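For orientation, the value below spells out how a two-step selector is encoded with these constructors; it is written by hand purely to illustrate the parts field, whose first compound carries no combinator.

(* "div.card > a" expressed with the constructors above (illustration only). *)
let _example : complex_selector =
  make_complex [
    (None, make_compound [
       make_simple Type_tag ~name:"div" ();
       make_simple Type_class ~name:"card" () ]);
    (Some ">", make_compound [ make_simple Type_tag ~name:"a" () ]);
  ]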
+195
lib/selector/selector_lexer.ml
···
1
+
(* CSS selector lexer *)
2
+
3
+
exception Selector_error of string
4
+
5
+
type t = {
6
+
input : string;
7
+
len : int;
8
+
mutable pos : int;
9
+
}
10
+
11
+
let create input = { input; len = String.length input; pos = 0 }
12
+
13
+
let peek t =
14
+
if t.pos < t.len then Some t.input.[t.pos]
15
+
else None
16
+
17
+
let advance t =
18
+
if t.pos < t.len then t.pos <- t.pos + 1
19
+
20
+
let consume t =
21
+
let c = peek t in
22
+
advance t;
23
+
c
24
+
25
+
let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C'
26
+
27
+
let is_name_start c =
28
+
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127
29
+
30
+
let is_name_char c =
31
+
is_name_start c || (c >= '0' && c <= '9')
32
+
33
+
let skip_whitespace t =
34
+
while t.pos < t.len && is_whitespace t.input.[t.pos] do
35
+
advance t
36
+
done
37
+
38
+
let read_name t =
39
+
let start = t.pos in
40
+
while t.pos < t.len && is_name_char t.input.[t.pos] do
41
+
advance t
42
+
done;
43
+
String.sub t.input start (t.pos - start)
44
+
45
+
let read_string t quote =
46
+
advance t; (* Skip opening quote *)
47
+
let buf = Buffer.create 32 in
48
+
let rec loop () =
49
+
match peek t with
50
+
| None -> raise (Selector_error "Unterminated string")
51
+
| Some c when c = quote -> advance t
52
+
| Some '\\' ->
53
+
advance t;
54
+
(match peek t with
55
+
| Some c -> Buffer.add_char buf c; advance t; loop ()
56
+
| None -> raise (Selector_error "Unterminated escape"))
57
+
| Some c ->
58
+
Buffer.add_char buf c;
59
+
advance t;
60
+
loop ()
61
+
in
62
+
loop ();
63
+
Buffer.contents buf
64
+
65
+
let read_unquoted_attr_value t =
66
+
let start = t.pos in
67
+
while t.pos < t.len &&
68
+
let c = t.input.[t.pos] in
69
+
not (is_whitespace c) && c <> ']' do
70
+
advance t
71
+
done;
72
+
String.sub t.input start (t.pos - start)
73
+
74
+
let tokenize input =
75
+
let t = create input in
76
+
let tokens = ref [] in
77
+
let pending_ws = ref false in
78
+
79
+
while t.pos < t.len do
80
+
let c = t.input.[t.pos] in
81
+
82
+
if is_whitespace c then begin
83
+
pending_ws := true;
84
+
skip_whitespace t
85
+
end else if c = '>' || c = '+' || c = '~' then begin
86
+
pending_ws := false;
87
+
advance t;
88
+
skip_whitespace t;
89
+
tokens := Selector_token.Combinator (String.make 1 c) :: !tokens
90
+
end else begin
91
+
if !pending_ws && !tokens <> [] && c <> ',' then
92
+
tokens := Selector_token.Combinator " " :: !tokens;
93
+
pending_ws := false;
94
+
95
+
match c with
96
+
| '*' ->
97
+
advance t;
98
+
tokens := Selector_token.Universal :: !tokens
99
+
| '#' ->
100
+
advance t;
101
+
let name = read_name t in
102
+
if name = "" then raise (Selector_error "Expected identifier after #");
103
+
tokens := Selector_token.Id name :: !tokens
104
+
| '.' ->
105
+
advance t;
106
+
let name = read_name t in
107
+
if name = "" then raise (Selector_error "Expected identifier after .");
108
+
tokens := Selector_token.Class name :: !tokens
109
+
| '[' ->
110
+
advance t;
111
+
tokens := Selector_token.Attr_start :: !tokens;
112
+
skip_whitespace t;
113
+
let attr_name = read_name t in
114
+
if attr_name = "" then raise (Selector_error "Expected attribute name");
115
+
tokens := Selector_token.Tag attr_name :: !tokens;
116
+
skip_whitespace t;
117
+
118
+
(match peek t with
119
+
| Some ']' ->
120
+
advance t;
121
+
tokens := Selector_token.Attr_end :: !tokens
122
+
| Some '=' ->
123
+
advance t;
124
+
tokens := Selector_token.Attr_op "=" :: !tokens;
125
+
skip_whitespace t;
126
+
let value = match peek t with
127
+
| Some '"' -> read_string t '"'
128
+
| Some '\'' -> read_string t '\''
129
+
| _ -> read_unquoted_attr_value t
130
+
in
131
+
tokens := Selector_token.String value :: !tokens;
132
+
skip_whitespace t;
133
+
if peek t <> Some ']' then raise (Selector_error "Expected ]");
134
+
advance t;
135
+
tokens := Selector_token.Attr_end :: !tokens
136
+
| Some ('~' | '|' | '^' | '$' | '*') as op_char ->
137
+
let op_c = Option.get op_char in
138
+
advance t;
139
+
if peek t <> Some '=' then
140
+
raise (Selector_error ("Expected = after " ^ String.make 1 op_c));
141
+
advance t;
142
+
tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens;
143
+
skip_whitespace t;
144
+
let value = match peek t with
145
+
| Some '"' -> read_string t '"'
146
+
| Some '\'' -> read_string t '\''
147
+
| _ -> read_unquoted_attr_value t
148
+
in
149
+
tokens := Selector_token.String value :: !tokens;
150
+
skip_whitespace t;
151
+
if peek t <> Some ']' then raise (Selector_error "Expected ]");
152
+
advance t;
153
+
tokens := Selector_token.Attr_end :: !tokens
154
+
| _ -> raise (Selector_error "Unexpected character in attribute selector"))
155
+
156
+
| ',' ->
157
+
advance t;
158
+
skip_whitespace t;
159
+
tokens := Selector_token.Comma :: !tokens
160
+
| ':' ->
161
+
advance t;
162
+
tokens := Selector_token.Colon :: !tokens;
163
+
let name = read_name t in
164
+
if name = "" then raise (Selector_error "Expected pseudo-class name");
165
+
tokens := Selector_token.Tag name :: !tokens;
166
+
167
+
if peek t = Some '(' then begin
168
+
advance t;
169
+
tokens := Selector_token.Paren_open :: !tokens;
170
+
skip_whitespace t;
171
+
(* Read argument until closing paren *)
172
+
let depth = ref 1 in
173
+
let start = t.pos in
174
+
while !depth > 0 && t.pos < t.len do
175
+
match t.input.[t.pos] with
176
+
| '(' -> incr depth; advance t
177
+
| ')' -> decr depth; if !depth > 0 then advance t
178
+
| _ -> advance t
179
+
done;
180
+
let arg = String.trim (String.sub t.input start (t.pos - start)) in
181
+
if arg <> "" then tokens := Selector_token.String arg :: !tokens;
182
+
if peek t <> Some ')' then raise (Selector_error "Expected )");
183
+
advance t;
184
+
tokens := Selector_token.Paren_close :: !tokens
185
+
end
186
+
| _ when is_name_start c ->
187
+
let name = read_name t in
188
+
tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens
189
+
| _ ->
190
+
raise (Selector_error ("Unexpected character: " ^ String.make 1 c))
191
+
end
192
+
done;
193
+
194
+
tokens := Selector_token.EOF :: !tokens;
195
+
List.rev !tokens
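As a quick illustration of the tokenizer above, a selector such as "ul > li.item" lexes to a tag, an explicit child combinator, another tag and a class, followed by the EOF marker; the constructors come from Selector_token exactly as used in this file.

(* Illustrative check only, not part of the module. *)
let () =
  let open Selector_token in
  match tokenize "ul > li.item" with
  | [ Tag "ul"; Combinator ">"; Tag "li"; Class "item"; EOF ] -> ()
  | _ -> failwith "unexpected token stream"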
+308
lib/selector/selector_match.ml
···
1
+
(* CSS selector matching *)
2
+
3
+
module Dom = Html5rw_dom
4
+
open Selector_ast
5
+
6
+
let is_element node =
7
+
let name = node.Dom.name in
8
+
name <> "#text" && name <> "#comment" && name <> "#document" &&
9
+
name <> "#document-fragment" && name <> "!doctype"
10
+
11
+
let get_element_children node =
12
+
List.filter is_element node.Dom.children
13
+
14
+
let get_previous_sibling node =
15
+
match node.Dom.parent with
16
+
| None -> None
17
+
| Some parent ->
18
+
let rec find_prev prev = function
19
+
| [] -> None
20
+
| n :: rest ->
21
+
if n == node then prev
22
+
else if is_element n then find_prev (Some n) rest
23
+
else find_prev prev rest
24
+
in
25
+
find_prev None parent.Dom.children
26
+
27
+
let is_first_child node =
28
+
match node.Dom.parent with
29
+
| None -> false
30
+
| Some parent ->
31
+
match get_element_children parent with
32
+
| first :: _ -> first == node
33
+
| [] -> false
34
+
35
+
let is_last_child node =
36
+
match node.Dom.parent with
37
+
| None -> false
38
+
| Some parent ->
39
+
match List.rev (get_element_children parent) with
40
+
| last :: _ -> last == node
41
+
| [] -> false
42
+
43
+
let is_first_of_type node =
44
+
match node.Dom.parent with
45
+
| None -> false
46
+
| Some parent ->
47
+
let name = String.lowercase_ascii node.Dom.name in
48
+
let rec find = function
49
+
| [] -> false
50
+
| n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node
51
+
| _ :: rest -> find rest
52
+
in
53
+
find (get_element_children parent)
54
+
55
+
let is_last_of_type node =
56
+
match node.Dom.parent with
57
+
| None -> false
58
+
| Some parent ->
59
+
let name = String.lowercase_ascii node.Dom.name in
60
+
let rec find last = function
61
+
| [] -> (match last with Some l -> l == node | None -> false)
62
+
| n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest
63
+
| _ :: rest -> find last rest
64
+
in
65
+
find None (get_element_children parent)
66
+
67
+
let get_index node =
68
+
match node.Dom.parent with
69
+
| None -> 0
70
+
| Some parent ->
71
+
let children = get_element_children parent in
72
+
let rec find idx = function
73
+
| [] -> 0
74
+
| n :: _ when n == node -> idx
75
+
| _ :: rest -> find (idx + 1) rest
76
+
in
77
+
find 1 children
78
+
79
+
let get_type_index node =
80
+
match node.Dom.parent with
81
+
| None -> 0
82
+
| Some parent ->
83
+
let name = String.lowercase_ascii node.Dom.name in
84
+
let children = get_element_children parent in
85
+
let rec find idx = function
86
+
| [] -> 0
87
+
| n :: _ when n == node -> idx
88
+
| n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest
89
+
| _ :: rest -> find idx rest
90
+
in
91
+
find 1 children
92
+
93
+
(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
94
+
let parse_nth expr =
95
+
let expr = String.lowercase_ascii (String.trim expr) in
96
+
if expr = "odd" then Some (2, 1)
97
+
else if expr = "even" then Some (2, 0)
98
+
else
99
+
let expr = String.concat "" (String.split_on_char ' ' expr) in
100
+
if String.contains expr 'n' then
101
+
let parts = String.split_on_char 'n' expr in
102
+
match parts with
103
+
| [a_part; b_part] ->
104
+
let a =
105
+
if a_part = "" || a_part = "+" then 1
106
+
else if a_part = "-" then -1
107
+
else int_of_string_opt a_part |> Option.value ~default:0
108
+
in
109
+
let b =
110
+
if b_part = "" then 0
111
+
else int_of_string_opt b_part |> Option.value ~default:0
112
+
in
113
+
Some (a, b)
114
+
| _ -> None
115
+
else
116
+
match int_of_string_opt expr with
117
+
| Some n -> Some (0, n)
118
+
| None -> None
119
+
120
+
let matches_nth index a b =
121
+
if a = 0 then index = b
122
+
else
123
+
let diff = index - b in
124
+
if a > 0 then diff >= 0 && diff mod a = 0
125
+
else diff <= 0 && diff mod a = 0
126
+
127
+
let rec matches_simple node selector =
128
+
if not (is_element node) then false
129
+
else
130
+
match selector.selector_type with
131
+
| Type_universal -> true
132
+
| Type_tag ->
133
+
(match selector.name with
134
+
| Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name
135
+
| None -> false)
136
+
| Type_id ->
137
+
(match selector.name with
138
+
| Some id ->
139
+
(match Dom.get_attr node "id" with
140
+
| Some node_id -> node_id = id
141
+
| None -> false)
142
+
| None -> false)
143
+
| Type_class ->
144
+
(match selector.name with
145
+
| Some cls ->
146
+
(match Dom.get_attr node "class" with
147
+
| Some class_attr ->
148
+
let classes = String.split_on_char ' ' class_attr in
149
+
List.mem cls classes
150
+
| None -> false)
151
+
| None -> false)
152
+
| Type_attr ->
153
+
(match selector.name with
154
+
| Some attr_name ->
155
+
let attr_name_lower = String.lowercase_ascii attr_name in
156
+
let node_value =
157
+
List.find_map (fun (k, v) ->
158
+
if String.lowercase_ascii k = attr_name_lower then Some v
159
+
else None
160
+
) node.Dom.attrs
161
+
in
162
+
(match node_value with
163
+
| None -> false
164
+
| Some _ when selector.operator = None -> true
165
+
| Some attr_value ->
166
+
let value = Option.value selector.value ~default:"" in
167
+
(match selector.operator with
168
+
| Some "=" -> attr_value = value
169
+
| Some "~=" ->
170
+
let words = String.split_on_char ' ' attr_value in
171
+
List.mem value words
172
+
| Some "|=" ->
173
+
attr_value = value || String.length attr_value > String.length value &&
174
+
String.sub attr_value 0 (String.length value) = value &&
175
+
attr_value.[String.length value] = '-'
176
+
| Some "^=" -> value <> "" && String.length attr_value >= String.length value &&
177
+
String.sub attr_value 0 (String.length value) = value
178
+
| Some "$=" -> value <> "" && String.length attr_value >= String.length value &&
179
+
String.sub attr_value (String.length attr_value - String.length value) (String.length value) = value
180
+
| Some "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value
181
+
| Some _ | None -> false))
182
+
| None -> false)
183
+
| Type_pseudo ->
184
+
(match selector.name with
185
+
| Some "first-child" -> is_first_child node
186
+
| Some "last-child" -> is_last_child node
187
+
| Some "first-of-type" -> is_first_of_type node
188
+
| Some "last-of-type" -> is_last_of_type node
189
+
| Some "only-child" -> is_first_child node && is_last_child node
190
+
| Some "only-of-type" -> is_first_of_type node && is_last_of_type node
191
+
| Some "empty" ->
192
+
not (List.exists (fun c ->
193
+
is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "")
194
+
) node.Dom.children)
195
+
| Some "root" ->
196
+
(match node.Dom.parent with
197
+
| Some p -> p.Dom.name = "#document" || p.Dom.name = "#document-fragment"
198
+
| None -> false)
199
+
| Some "nth-child" ->
200
+
(match selector.arg with
201
+
| Some arg ->
202
+
(match parse_nth arg with
203
+
| Some (a, b) -> matches_nth (get_index node) a b
204
+
| None -> false)
205
+
| None -> false)
206
+
| Some "nth-of-type" ->
207
+
(match selector.arg with
208
+
| Some arg ->
209
+
(match parse_nth arg with
210
+
| Some (a, b) -> matches_nth (get_type_index node) a b
211
+
| None -> false)
212
+
| None -> false)
213
+
| Some "not" ->
214
+
(match selector.arg with
215
+
| Some arg ->
216
+
(try
217
+
let inner = Selector_parser.parse_selector arg in
218
+
not (matches_selector node inner)
219
+
with _ -> true)
220
+
| None -> true)
221
+
| _ -> false)
222
+
223
+
and matches_compound node (compound : Selector_ast.compound_selector) =
224
+
List.for_all (matches_simple node) compound.selectors
225
+
226
+
and matches_complex node complex =
227
+
(* Match from right to left *)
228
+
let parts = List.rev complex.parts in
229
+
match parts with
230
+
| [] -> false
231
+
| (_, rightmost) :: rest ->
232
+
if not (matches_compound node rightmost) then false
233
+
else
234
+
let rec check current remaining =
235
+
match remaining with
236
+
| [] -> true
237
+
| (Some " ", compound) :: rest ->
238
+
(* Descendant combinator *)
239
+
let rec find_ancestor n =
240
+
match n.Dom.parent with
241
+
| None -> false
242
+
| Some p ->
243
+
if matches_compound p compound then check p rest
244
+
else find_ancestor p
245
+
in
246
+
find_ancestor current
247
+
| (Some ">", compound) :: rest ->
248
+
(* Child combinator *)
249
+
(match current.Dom.parent with
250
+
| None -> false
251
+
| Some p ->
252
+
if matches_compound p compound then check p rest
253
+
else false)
254
+
| (Some "+", compound) :: rest ->
255
+
(* Adjacent sibling *)
256
+
(match get_previous_sibling current with
257
+
| None -> false
258
+
| Some sib ->
259
+
if matches_compound sib compound then check sib rest
260
+
else false)
261
+
| (Some "~", compound) :: rest ->
262
+
(* General sibling *)
263
+
let rec find_sibling n =
264
+
match get_previous_sibling n with
265
+
| None -> false
266
+
| Some sib ->
267
+
if matches_compound sib compound then check sib rest
268
+
else find_sibling sib
269
+
in
270
+
find_sibling current
271
+
| (None, compound) :: rest ->
272
+
if matches_compound current compound then check current rest
273
+
else false
274
+
| _ -> false
275
+
in
276
+
check node rest
277
+
278
+
and matches_selector node selector =
279
+
match selector with
280
+
| Simple s -> matches_simple node s
281
+
| Compound c -> matches_compound node c
282
+
| Complex c -> matches_complex node c
283
+
| List l -> List.exists (fun c -> matches_complex node c) l.selectors
284
+
285
+
let matches node selector_string =
286
+
try
287
+
let selector = Selector_parser.parse_selector selector_string in
288
+
matches_selector node selector
289
+
with _ -> false
290
+
291
+
let rec query_descendants node selector results =
292
+
List.iter (fun child ->
293
+
if is_element child && matches_selector child selector then
294
+
results := child :: !results;
295
+
query_descendants child selector results;
296
+
(* Also search template content *)
297
+
(match child.Dom.template_content with
298
+
| Some tc -> query_descendants tc selector results
299
+
| None -> ())
300
+
) node.Dom.children
301
+
302
+
let query root selector_string =
303
+
try
304
+
let selector = Selector_parser.parse_selector selector_string in
305
+
let results = ref [] in
306
+
query_descendants root selector results;
307
+
List.rev !results
308
+
with _ -> []
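Taken together, matches and query form the public surface of the selector engine: matches tests a single node, query collects every matching descendant (including nodes inside template content). A minimal usage sketch, assuming the file is exposed as a Selector module and that doc is a Dom node produced by the parser elsewhere in this change (both of those names are assumptions here):

(* Illustrative driver; the Selector / Dom module paths are assumptions. *)
let report doc =
  (* query walks the subtree and returns matching nodes in document order *)
  let hits = Selector.query doc "ul.menu > li:first-child" in
  Printf.printf "%d node(s) matched\n" (List.length hits);
  List.iter (fun n -> print_endline n.Dom.name) hits;
  (* matches tests one node; it returns false instead of raising when the
     selector string fails to parse *)
  if Selector.matches doc ":root" then print_endline "doc is the root element"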
+149
lib/selector/selector_parser.ml
···
1
+
(* CSS selector parser *)
2
+
3
+
open Selector_ast
4
+
open Selector_token
5
+
6
+
exception Parse_error of string
7
+
8
+
type t = {
9
+
tokens : Selector_token.t list;
10
+
mutable pos : int;
11
+
}
12
+
13
+
let create tokens = { tokens; pos = 0 }
14
+
15
+
let peek t =
16
+
if t.pos < List.length t.tokens then
17
+
List.nth t.tokens t.pos
18
+
else EOF
19
+
20
+
let advance t =
21
+
if t.pos < List.length t.tokens then
22
+
t.pos <- t.pos + 1
23
+
24
+
let consume t =
25
+
let tok = peek t in
26
+
advance t;
27
+
tok
28
+
29
+
let expect t expected =
30
+
let tok = peek t in
31
+
if tok <> expected then
32
+
raise (Parse_error ("Expected " ^ (match expected with EOF -> "EOF" | _ -> "token")))
33
+
else
34
+
advance t
35
+
36
+
let parse_simple_selector t =
37
+
match peek t with
38
+
| Tag name ->
39
+
advance t;
40
+
Some (make_simple Type_tag ~name ())
41
+
| Universal ->
42
+
advance t;
43
+
Some (make_simple Type_universal ())
44
+
| Id name ->
45
+
advance t;
46
+
Some (make_simple Type_id ~name ())
47
+
| Class name ->
48
+
advance t;
49
+
Some (make_simple Type_class ~name ())
50
+
| Attr_start ->
51
+
advance t;
52
+
let attr_name = match peek t with
53
+
| Tag name -> advance t; name
54
+
| _ -> raise (Parse_error "Expected attribute name")
55
+
in
56
+
(match peek t with
57
+
| Attr_end ->
58
+
advance t;
59
+
Some (make_simple Type_attr ~name:attr_name ())
60
+
| Attr_op op ->
61
+
advance t;
62
+
let value = match peek t with
63
+
| String v -> advance t; v
64
+
| _ -> raise (Parse_error "Expected attribute value")
65
+
in
66
+
(match peek t with
67
+
| Attr_end -> advance t
68
+
| _ -> raise (Parse_error "Expected ]"));
69
+
Some (make_simple Type_attr ~name:attr_name ~operator:op ~value ())
70
+
| _ -> raise (Parse_error "Expected ] or attribute operator"))
71
+
| Colon ->
72
+
advance t;
73
+
let name = match peek t with
74
+
| Tag n -> advance t; n
75
+
| _ -> raise (Parse_error "Expected pseudo-class name")
76
+
in
77
+
let arg = match peek t with
78
+
| Paren_open ->
79
+
advance t;
80
+
let a = match peek t with
81
+
| String s -> advance t; Some s
82
+
| Paren_close -> None
83
+
| _ -> None
84
+
in
85
+
(match peek t with
86
+
| Paren_close -> advance t
87
+
| _ -> raise (Parse_error "Expected )"));
88
+
a
89
+
| _ -> None
90
+
in
91
+
Some (make_simple Type_pseudo ~name ?arg ())
92
+
| _ -> None
93
+
94
+
let parse_compound_selector t =
95
+
let rec loop acc =
96
+
match parse_simple_selector t with
97
+
| Some s -> loop (s :: acc)
98
+
| None -> acc
99
+
in
100
+
let selectors = List.rev (loop []) in
101
+
if selectors = [] then None
102
+
else Some (make_compound selectors)
103
+
104
+
let parse_complex_selector t =
105
+
match parse_compound_selector t with
106
+
| None -> None
107
+
| Some first ->
108
+
let parts = ref [(None, first)] in
109
+
let rec loop () =
110
+
match peek t with
111
+
| Combinator comb ->
112
+
advance t;
113
+
(match parse_compound_selector t with
114
+
| None -> raise (Parse_error "Expected selector after combinator")
115
+
| Some compound ->
116
+
parts := (Some comb, compound) :: !parts;
117
+
loop ())
118
+
| _ -> ()
119
+
in
120
+
loop ();
121
+
Some (make_complex (List.rev !parts))
122
+
123
+
let parse tokens =
124
+
let t = create tokens in
125
+
let rec loop acc =
126
+
match parse_complex_selector t with
127
+
| None -> acc
128
+
| Some sel ->
129
+
(match peek t with
130
+
| Comma ->
131
+
advance t;
132
+
loop (sel :: acc)
133
+
| EOF -> sel :: acc
134
+
| _ -> raise (Parse_error "Unexpected token"))
135
+
in
136
+
let selectors = List.rev (loop []) in
137
+
(match peek t with
138
+
| EOF -> ()
139
+
| _ -> raise (Parse_error "Expected end of selector"));
140
+
match selectors with
141
+
| [] -> raise (Parse_error "Empty selector")
142
+
| [sel] -> Complex sel
143
+
| sels -> List (make_list sels)
144
+
145
+
let parse_selector input =
146
+
if String.trim input = "" then
147
+
raise (Selector_lexer.Selector_error "Empty selector");
148
+
let tokens = Selector_lexer.tokenize input in
149
+
parse tokens
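As a reference point, parse_selector can be driven directly (for example from a unit test). The sketch below only inspects the top-level shape of the result; the printing is illustrative, while Selector_parser and Selector_ast are the module paths used in this diff:

let describe input =
  match Selector_parser.parse_selector input with
  | Selector_ast.List l ->
      Printf.printf "%S: selector list with %d alternatives\n"
        input (List.length l.Selector_ast.selectors)
  | Selector_ast.Complex c ->
      Printf.printf "%S: one complex selector with %d compound part(s)\n"
        input (List.length c.Selector_ast.parts)
  | Selector_ast.Simple _ | Selector_ast.Compound _ ->
      (* parse as written only ever returns Complex or List *)
      Printf.printf "%S: simple or compound selector\n" input

let () =
  describe "div.card > a";   (* Complex *)
  describe "h1, h2, h3";     (* List with three alternatives *)
  (try describe "" with
   | Selector_lexer.Selector_error msg
   | Selector_parser.Parse_error msg -> prerr_endline msg)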
+17
lib/selector/selector_token.ml
···
1
+
(* CSS selector token types *)
2
+
3
+
type t =
4
+
| Tag of string
5
+
| Id of string
6
+
| Class of string
7
+
| Universal
8
+
| Attr_start
9
+
| Attr_end
10
+
| Attr_op of string
11
+
| String of string
12
+
| Combinator of string
13
+
| Comma
14
+
| Colon
15
+
| Paren_open
16
+
| Paren_close
17
+
| EOF
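To make the token shapes concrete, here is roughly the sequence the parser above expects for a[href^="https"]:not(.external). This is inferred from how selector_parser.ml consumes tokens, not taken from the lexer itself, so treat it as illustrative:

let _expected_tokens : Selector_token.t list =
  let open Selector_token in
  [ Tag "a";            (* element name *)
    Attr_start;         (* '[' *)
    Tag "href";         (* attribute names arrive as Tag *)
    Attr_op "^=";       (* prefix-match operator *)
    String "https";     (* quoted value *)
    Attr_end;           (* ']' *)
    Colon;
    Tag "not";          (* pseudo-class names also arrive as Tag *)
    Paren_open;
    String ".external"; (* the argument is passed through as one string *)
    Paren_close ]
  (* the parser's peek synthesizes EOF once the list is exhausted *)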
+4
lib/tokenizer/dune
+12
lib/tokenizer/errors.ml
+16
lib/tokenizer/html5rw_tokenizer.ml
···
1
+
(* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API *)
2
+
3
+
module Token = Token
4
+
module State = State
5
+
module Errors = Errors
6
+
module Stream = Stream
7
+
8
+
module type SINK = Tokenizer.SINK
9
+
10
+
type 'a t = 'a Tokenizer.t
11
+
12
+
let create = Tokenizer.create
13
+
let run = Tokenizer.run
14
+
let get_errors = Tokenizer.get_errors
15
+
let set_state = Tokenizer.set_state
16
+
let set_last_start_tag = Tokenizer.set_last_start_tag
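An end-to-end sketch of this re-exported API, assuming the library is consumed as Html5rw_tokenizer and fed through the bytesrw reader used elsewhere in this change. The collecting sink is illustrative and not part of the library; a real consumer (the tree builder) would also return `SwitchTo from process to drive RCDATA/RAWTEXT switching:

open Bytesrw

module Collect = struct
  type t = { mutable tokens : Html5rw_tokenizer.Token.t list }
  let process sink tok
      : [ `Continue | `SwitchTo of Html5rw_tokenizer.State.t ] =
    sink.tokens <- tok :: sink.tokens;
    `Continue
  (* always "in HTML namespace": acceptable for a sink with no tree builder *)
  let adjusted_current_node_in_html_namespace _ = true
end

let tokenize_string html =
  let sink = { Collect.tokens = [] } in
  let tz =
    Html5rw_tokenizer.create (module Collect) sink ~collect_errors:true ()
  in
  Html5rw_tokenizer.run tz (module Collect) (Bytes.Reader.of_string html);
  (List.rev sink.Collect.tokens, Html5rw_tokenizer.get_errors tz)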
+83
lib/tokenizer/state.ml
···
1
+
(* HTML5 tokenizer states *)
2
+
3
+
type t =
4
+
| Data
5
+
| Rcdata
6
+
| Rawtext
7
+
| Script_data
8
+
| Plaintext
9
+
| Tag_open
10
+
| End_tag_open
11
+
| Tag_name
12
+
| Rcdata_less_than_sign
13
+
| Rcdata_end_tag_open
14
+
| Rcdata_end_tag_name
15
+
| Rawtext_less_than_sign
16
+
| Rawtext_end_tag_open
17
+
| Rawtext_end_tag_name
18
+
| Script_data_less_than_sign
19
+
| Script_data_end_tag_open
20
+
| Script_data_end_tag_name
21
+
| Script_data_escape_start
22
+
| Script_data_escape_start_dash
23
+
| Script_data_escaped
24
+
| Script_data_escaped_dash
25
+
| Script_data_escaped_dash_dash
26
+
| Script_data_escaped_less_than_sign
27
+
| Script_data_escaped_end_tag_open
28
+
| Script_data_escaped_end_tag_name
29
+
| Script_data_double_escape_start
30
+
| Script_data_double_escaped
31
+
| Script_data_double_escaped_dash
32
+
| Script_data_double_escaped_dash_dash
33
+
| Script_data_double_escaped_less_than_sign
34
+
| Script_data_double_escape_end
35
+
| Before_attribute_name
36
+
| Attribute_name
37
+
| After_attribute_name
38
+
| Before_attribute_value
39
+
| Attribute_value_double_quoted
40
+
| Attribute_value_single_quoted
41
+
| Attribute_value_unquoted
42
+
| After_attribute_value_quoted
43
+
| Self_closing_start_tag
44
+
| Bogus_comment
45
+
| Markup_declaration_open
46
+
| Comment_start
47
+
| Comment_start_dash
48
+
| Comment
49
+
| Comment_less_than_sign
50
+
| Comment_less_than_sign_bang
51
+
| Comment_less_than_sign_bang_dash
52
+
| Comment_less_than_sign_bang_dash_dash
53
+
| Comment_end_dash
54
+
| Comment_end
55
+
| Comment_end_bang
56
+
| Doctype
57
+
| Before_doctype_name
58
+
| Doctype_name
59
+
| After_doctype_name
60
+
| After_doctype_public_keyword
61
+
| Before_doctype_public_identifier
62
+
| Doctype_public_identifier_double_quoted
63
+
| Doctype_public_identifier_single_quoted
64
+
| After_doctype_public_identifier
65
+
| Between_doctype_public_and_system_identifiers
66
+
| After_doctype_system_keyword
67
+
| Before_doctype_system_identifier
68
+
| Doctype_system_identifier_double_quoted
69
+
| Doctype_system_identifier_single_quoted
70
+
| After_doctype_system_identifier
71
+
| Bogus_doctype
72
+
| Cdata_section
73
+
| Cdata_section_bracket
74
+
| Cdata_section_end
75
+
| Character_reference
76
+
| Named_character_reference
77
+
| Ambiguous_ampersand
78
+
| Numeric_character_reference
79
+
| Hexadecimal_character_reference_start
80
+
| Decimal_character_reference_start
81
+
| Hexadecimal_character_reference
82
+
| Decimal_character_reference
83
+
| Numeric_character_reference_end
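The constructors mirror the WHATWG tokenizer state names one-for-one. A consumer never steps these states directly; the sink picks one when the content model changes, either via set_state or by returning `SwitchTo from process. A hedged sketch of that choice, based on the standard HTML5 content-model switching rules (the helper itself is not part of this diff):

(* Hypothetical helper a tree-builder sink might use after a start tag. *)
let state_after_start_tag name : State.t option =
  match name with
  | "title" | "textarea" -> Some State.Rcdata
  | "style" | "xmp" | "iframe" | "noembed" | "noframes" -> Some State.Rawtext
  | "script" -> Some State.Script_data
  | "plaintext" -> Some State.Plaintext
  | _ -> None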
+203
lib/tokenizer/stream.ml
···
1
+
(* Input stream for tokenizer with position tracking using bytesrw
2
+
3
+
This implementation is designed to be as streaming as possible:
4
+
- Reads slices on-demand from the Bytes.Reader.t
5
+
- Only buffers what's needed for lookahead (typically 1-2 chars)
6
+
- Avoids string allocations in hot paths like matches_ci
7
+
*)
8
+
9
+
open Bytesrw
10
+
11
+
type t = {
12
+
reader : Bytes.Reader.t;
13
+
(* Current slice and position within it *)
14
+
mutable current_slice : Bytes.Slice.t;
15
+
mutable slice_pos : int;
16
+
(* Lookahead buffer for reconsume and peek_n - small, typically 0-7 chars *)
17
+
mutable lookahead : char list;
18
+
(* Position tracking *)
19
+
mutable line : int;
20
+
mutable column : int;
21
+
(* Track if we just saw CR (for CR/LF normalization) *)
22
+
mutable last_was_cr : bool;
23
+
}
24
+
25
+
(* Create a stream from a Bytes.Reader.t *)
26
+
let create_from_reader reader =
27
+
let slice = Bytes.Reader.read reader in
28
+
{
29
+
reader;
30
+
current_slice = slice;
31
+
slice_pos = 0;
32
+
lookahead = [];
33
+
line = 1;
34
+
column = 0;
35
+
last_was_cr = false;
36
+
}
37
+
38
+
(* Create a stream from a string - discouraged, prefer create_from_reader *)
39
+
let create input =
40
+
create_from_reader (Bytes.Reader.of_string input)
41
+
42
+
let position t = (t.line, t.column)
43
+
44
+
(* Read next raw byte from the stream (before CR/LF normalization) *)
45
+
let read_raw_char t =
46
+
(* First check lookahead *)
47
+
match t.lookahead with
48
+
| c :: rest ->
49
+
t.lookahead <- rest;
50
+
Some c
51
+
| [] ->
52
+
(* Check if current slice is exhausted *)
53
+
if Bytes.Slice.is_eod t.current_slice then
54
+
None
55
+
else if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
56
+
(* Get next slice *)
57
+
t.current_slice <- Bytes.Reader.read t.reader;
58
+
t.slice_pos <- 0;
59
+
if Bytes.Slice.is_eod t.current_slice then
60
+
None
61
+
else begin
62
+
let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
63
+
(Bytes.Slice.first t.current_slice + t.slice_pos) in
64
+
t.slice_pos <- t.slice_pos + 1;
65
+
Some c
66
+
end
67
+
end else begin
68
+
let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
69
+
(Bytes.Slice.first t.current_slice + t.slice_pos) in
70
+
t.slice_pos <- t.slice_pos + 1;
71
+
Some c
72
+
end
73
+
74
+
(* Push a char back to lookahead *)
75
+
let push_back_char t c =
76
+
t.lookahead <- c :: t.lookahead
77
+
78
+
(* Read next char with CR/LF normalization *)
79
+
let rec read_normalized_char t =
80
+
match read_raw_char t with
81
+
| None ->
82
+
t.last_was_cr <- false;
83
+
None
84
+
| Some '\r' ->
85
+
t.last_was_cr <- true;
86
+
Some '\n' (* CR becomes LF *)
87
+
| Some '\n' when t.last_was_cr ->
88
+
(* Skip LF after CR - it was already converted *)
89
+
t.last_was_cr <- false;
90
+
read_normalized_char t
91
+
| Some c ->
92
+
t.last_was_cr <- false;
93
+
Some c
94
+
95
+
let is_eof t =
96
+
t.lookahead = [] &&
97
+
(Bytes.Slice.is_eod t.current_slice ||
98
+
(t.slice_pos >= Bytes.Slice.length t.current_slice &&
99
+
(let next = Bytes.Reader.read t.reader in
100
+
t.current_slice <- next;
101
+
t.slice_pos <- 0;
102
+
Bytes.Slice.is_eod next)))
103
+
104
+
let peek t =
105
+
match read_normalized_char t with
106
+
| None -> None
107
+
| Some c ->
108
+
push_back_char t c;
109
+
(* Undo last_was_cr if we pushed back a CR-converted LF *)
110
+
if c = '\n' then t.last_was_cr <- false;
111
+
Some c
112
+
113
+
(* Read n characters into a list, returns (chars_read, all_read_successfully) *)
114
+
let peek_chars t n =
115
+
let rec collect acc remaining =
116
+
if remaining <= 0 then (List.rev acc, true)
117
+
else match read_normalized_char t with
118
+
| None -> (List.rev acc, false) (* Not enough chars available *)
119
+
| Some c -> collect (c :: acc) (remaining - 1)
120
+
in
121
+
let (chars, success) = collect [] n in
122
+
(* Always push back characters we read, in reverse order *)
123
+
List.iter (push_back_char t) (List.rev chars);
124
+
t.last_was_cr <- false;
125
+
(chars, success)
126
+
127
+
(* peek_n returns Some string only when exactly n chars are available
128
+
Avoid using this in hot paths - prefer peek_chars + direct comparison *)
129
+
let peek_n t n =
130
+
let (chars, success) = peek_chars t n in
131
+
if success then
132
+
Some (String.init n (fun i -> List.nth chars i))
133
+
else
134
+
None
135
+
136
+
let advance t =
137
+
match read_normalized_char t with
138
+
| None -> ()
139
+
| Some c ->
140
+
(* Update position tracking *)
141
+
if c = '\n' then begin
142
+
t.line <- t.line + 1;
143
+
t.column <- 0
144
+
end else
145
+
t.column <- t.column + 1
146
+
147
+
let consume t =
148
+
let c = peek t in
149
+
advance t;
150
+
c
151
+
152
+
let consume_if t pred =
153
+
match peek t with
154
+
| Some c when pred c -> advance t; Some c
155
+
| _ -> None
156
+
157
+
let consume_while t pred =
158
+
let buf = Buffer.create 16 in
159
+
let rec loop () =
160
+
match peek t with
161
+
| Some c when pred c ->
162
+
Buffer.add_char buf c;
163
+
advance t;
164
+
loop ()
165
+
| _ -> ()
166
+
in
167
+
loop ();
168
+
Buffer.contents buf
169
+
170
+
(* Case-insensitive match without allocating a string
171
+
Compares directly with the char list from peek_chars *)
172
+
let matches_ci t s =
173
+
let slen = String.length s in
174
+
let (chars, success) = peek_chars t slen in
175
+
if not success then false
176
+
else begin
177
+
let rec check chars_remaining i =
178
+
match chars_remaining with
179
+
| [] -> i >= slen (* Matched all *)
180
+
| c :: rest ->
181
+
if i >= slen then true
182
+
else
183
+
let c1 = Char.lowercase_ascii c in
184
+
let c2 = Char.lowercase_ascii (String.unsafe_get s i) in
185
+
if c1 = c2 then check rest (i + 1)
186
+
else false
187
+
in
188
+
check chars 0
189
+
end
190
+
191
+
let consume_exact_ci t s =
192
+
if matches_ci t s then begin
193
+
for _ = 1 to String.length s do advance t done;
194
+
true
195
+
end else false
196
+
197
+
let reconsume t =
198
+
(* Move back one position - simplified; CR/LF normalization is not handled here. *)
199
+
(* This is called right after advance, but no character is pushed back onto the *)
200
+
(* lookahead, so the caller's next peek does not actually see the old character. *)
201
+
(* A full implementation would remember the last consumed character and restore it; *)
202
+
(* for now only the column counter is adjusted. *)
203
+
if t.column > 0 then t.column <- t.column - 1
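A small sketch exercising the Stream API above via the string entry point (convenient for tests, even though the comment marks it as discouraged relative to create_from_reader):

let () =
  let s = Stream.create "<!DOCTYPE html><p>hi" in
  assert (Stream.consume_if s (fun c -> c = '<') = Some '<');
  (* matches_ci / consume_exact_ci compare case-insensitively without
     building an intermediate string *)
  assert (Stream.consume_exact_ci s "!doctype");
  let rest = Stream.consume_while s (fun c -> c <> '>') in
  ignore (Stream.consume s);   (* the closing '>' *)
  let line, col = Stream.position s in
  Printf.printf "after doctype: %S (line %d, column %d)\n" rest line col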
+39
lib/tokenizer/token.ml
···
1
+
(* HTML5 token types *)
2
+
3
+
type tag_kind = Start | End
4
+
5
+
type doctype = {
6
+
name : string option;
7
+
public_id : string option;
8
+
system_id : string option;
9
+
force_quirks : bool;
10
+
}
11
+
12
+
type tag = {
13
+
kind : tag_kind;
14
+
name : string;
15
+
attrs : (string * string) list;
16
+
self_closing : bool;
17
+
}
18
+
19
+
type t =
20
+
| Tag of tag
21
+
| Character of string
22
+
| Comment of string
23
+
| Doctype of doctype
24
+
| EOF
25
+
26
+
let make_start_tag name attrs self_closing =
27
+
Tag { kind = Start; name; attrs; self_closing }
28
+
29
+
let make_end_tag name =
30
+
Tag { kind = End; name; attrs = []; self_closing = false }
31
+
32
+
let make_doctype ?name ?public_id ?system_id ?(force_quirks=false) () =
33
+
Doctype { name; public_id; system_id; force_quirks }
34
+
35
+
let make_comment data = Comment data
36
+
37
+
let make_character data = Character data
38
+
39
+
let eof = EOF
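For completeness, a tiny sketch building tokens with the constructors above and rendering them for debugging; the printer is illustrative only and not part of the module:

let to_debug_string : Token.t -> string = function
  | Token.Tag { Token.kind; name; attrs; self_closing } ->
      Printf.sprintf "%s<%s> (%d attr(s)%s)"
        (match kind with Token.Start -> "" | Token.End -> "/")
        name (List.length attrs)
        (if self_closing then ", self-closing" else "")
  | Token.Character s -> Printf.sprintf "chars %S" s
  | Token.Comment s -> Printf.sprintf "comment %S" s
  | Token.Doctype d ->
      Printf.sprintf "doctype %s" (Option.value d.Token.name ~default:"")
  | Token.EOF -> "eof"

let () =
  [ Token.make_doctype ~name:"html" ();
    Token.make_start_tag "img" [ ("src", "x.png") ] true;
    Token.make_character "hello";
    Token.make_end_tag "body";
    Token.eof ]
  |> List.iter (fun tok -> print_endline (to_debug_string tok))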
+1842
lib/tokenizer/tokenizer.ml
···
1
+
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
2
+
3
+
let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
4
+
let is_ascii_upper c = c >= 'A' && c <= 'Z'
5
+
let is_ascii_digit c = c >= '0' && c <= '9'
6
+
let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
7
+
let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c
8
+
let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
9
+
10
+
let ascii_lower c =
11
+
if is_ascii_upper c then Char.chr (Char.code c + 32) else c
12
+
13
+
(* Token sink interface *)
14
+
module type SINK = sig
15
+
type t
16
+
val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]
17
+
val adjusted_current_node_in_html_namespace : t -> bool
18
+
end
19
+
20
+
type 'sink t = {
21
+
mutable stream : Stream.t;
22
+
sink : 'sink;
23
+
mutable state : State.t;
24
+
mutable return_state : State.t;
25
+
mutable char_ref_code : int;
26
+
mutable temp_buffer : Buffer.t;
27
+
mutable last_start_tag : string;
28
+
mutable current_tag_name : Buffer.t;
29
+
mutable current_tag_kind : Token.tag_kind;
30
+
mutable current_tag_self_closing : bool;
31
+
mutable current_attr_name : Buffer.t;
32
+
mutable current_attr_value : Buffer.t;
33
+
mutable current_attrs : (string * string) list;
34
+
mutable current_doctype_name : Buffer.t option;
35
+
mutable current_doctype_public : Buffer.t option;
36
+
mutable current_doctype_system : Buffer.t option;
37
+
mutable current_doctype_force_quirks : bool;
38
+
mutable current_comment : Buffer.t;
39
+
mutable pending_chars : Buffer.t;
40
+
mutable errors : Errors.t list;
41
+
collect_errors : bool;
42
+
}
43
+
44
+
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () = {
45
+
stream = Stream.create "";
46
+
sink;
47
+
state = State.Data;
48
+
return_state = State.Data;
49
+
char_ref_code = 0;
50
+
temp_buffer = Buffer.create 64;
51
+
last_start_tag = "";
52
+
current_tag_name = Buffer.create 32;
53
+
current_tag_kind = Token.Start;
54
+
current_tag_self_closing = false;
55
+
current_attr_name = Buffer.create 32;
56
+
current_attr_value = Buffer.create 64;
57
+
current_attrs = [];
58
+
current_doctype_name = None;
59
+
current_doctype_public = None;
60
+
current_doctype_system = None;
61
+
current_doctype_force_quirks = false;
62
+
current_comment = Buffer.create 64;
63
+
pending_chars = Buffer.create 256;
64
+
errors = [];
65
+
collect_errors;
66
+
}
67
+
68
+
let error t code =
69
+
if t.collect_errors then begin
70
+
let (line, column) = Stream.position t.stream in
71
+
t.errors <- Errors.make ~code ~line ~column :: t.errors
72
+
end
73
+
74
+
(* emit functions are defined locally inside run *)
75
+
76
+
let emit_char t c =
77
+
Buffer.add_char t.pending_chars c
78
+
79
+
let emit_str t s =
80
+
Buffer.add_string t.pending_chars s
81
+
82
+
let start_new_tag t kind =
83
+
Buffer.clear t.current_tag_name;
84
+
t.current_tag_kind <- kind;
85
+
t.current_tag_self_closing <- false;
86
+
t.current_attrs <- []
87
+
88
+
let start_new_attribute t =
89
+
(* Save previous attribute if any *)
90
+
let name = Buffer.contents t.current_attr_name in
91
+
if String.length name > 0 then begin
92
+
let value = Buffer.contents t.current_attr_value in
93
+
(* Check for duplicates - only add if not already present *)
94
+
if not (List.exists (fun (n, _) -> n = name) t.current_attrs) then
95
+
t.current_attrs <- (name, value) :: t.current_attrs
96
+
else
97
+
error t "duplicate-attribute"
98
+
end;
99
+
Buffer.clear t.current_attr_name;
100
+
Buffer.clear t.current_attr_value
101
+
102
+
let finish_attribute t =
103
+
start_new_attribute t
104
+
105
+
let start_new_doctype t =
106
+
t.current_doctype_name <- None;
107
+
t.current_doctype_public <- None;
108
+
t.current_doctype_system <- None;
109
+
t.current_doctype_force_quirks <- false
110
+
111
+
(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally inside run *)
112
+
113
+
let is_appropriate_end_tag t =
114
+
let name = Buffer.contents t.current_tag_name in
115
+
String.length t.last_start_tag > 0 && name = t.last_start_tag
116
+
117
+
let flush_code_points_consumed_as_char_ref t =
118
+
let s = Buffer.contents t.temp_buffer in
119
+
match t.return_state with
120
+
| State.Attribute_value_double_quoted
121
+
| State.Attribute_value_single_quoted
122
+
| State.Attribute_value_unquoted ->
123
+
Buffer.add_string t.current_attr_value s
124
+
| _ ->
125
+
emit_str t s
126
+
127
+
open Bytesrw
128
+
129
+
(* Main tokenization loop *)
130
+
let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) =
131
+
t.stream <- Stream.create_from_reader reader;
132
+
t.errors <- [];
133
+
134
+
(* Local emit functions with access to S *)
135
+
let emit_pending_chars () =
136
+
if Buffer.length t.pending_chars > 0 then begin
137
+
let data = Buffer.contents t.pending_chars in
138
+
Buffer.clear t.pending_chars;
139
+
ignore (S.process t.sink (Token.Character data))
140
+
end
141
+
in
142
+
143
+
let emit token =
144
+
emit_pending_chars ();
145
+
match S.process t.sink token with
146
+
| `Continue -> ()
147
+
| `SwitchTo new_state -> t.state <- new_state
148
+
in
149
+
150
+
let emit_current_tag () =
151
+
finish_attribute t;
152
+
let name = Buffer.contents t.current_tag_name in
153
+
let tag = {
154
+
Token.kind = t.current_tag_kind;
155
+
name;
156
+
attrs = List.rev t.current_attrs;
157
+
self_closing = t.current_tag_self_closing;
158
+
} in
159
+
if t.current_tag_kind = Token.Start then
160
+
t.last_start_tag <- name;
161
+
emit (Token.Tag tag)
162
+
in
163
+
164
+
let emit_current_doctype () =
165
+
let doctype = {
166
+
Token.name = Option.map Buffer.contents t.current_doctype_name;
167
+
public_id = Option.map Buffer.contents t.current_doctype_public;
168
+
system_id = Option.map Buffer.contents t.current_doctype_system;
169
+
force_quirks = t.current_doctype_force_quirks;
170
+
} in
171
+
emit (Token.Doctype doctype)
172
+
in
173
+
174
+
let emit_current_comment () =
175
+
emit (Token.Comment (Buffer.contents t.current_comment))
176
+
in
177
+
178
+
let rec process_state () =
179
+
if Stream.is_eof t.stream && t.state <> State.Data then begin
180
+
(* Handle EOF in various states *)
181
+
handle_eof ()
182
+
end else if Stream.is_eof t.stream then begin
183
+
emit_pending_chars ();
184
+
ignore (S.process t.sink Token.EOF)
185
+
end else begin
186
+
step ();
187
+
process_state ()
188
+
end
189
+
190
+
and handle_eof () =
191
+
match t.state with
192
+
| State.Data ->
193
+
emit_pending_chars ();
194
+
ignore (S.process t.sink Token.EOF)
195
+
| State.Tag_open ->
196
+
error t "eof-before-tag-name";
197
+
emit_char t '<';
198
+
emit_pending_chars ();
199
+
ignore (S.process t.sink Token.EOF)
200
+
| State.End_tag_open ->
201
+
error t "eof-before-tag-name";
202
+
emit_str t "</";
203
+
emit_pending_chars ();
204
+
ignore (S.process t.sink Token.EOF)
205
+
| State.Tag_name
206
+
| State.Before_attribute_name
207
+
| State.Attribute_name
208
+
| State.After_attribute_name
209
+
| State.Before_attribute_value
210
+
| State.Attribute_value_double_quoted
211
+
| State.Attribute_value_single_quoted
212
+
| State.Attribute_value_unquoted
213
+
| State.After_attribute_value_quoted
214
+
| State.Self_closing_start_tag ->
215
+
error t "eof-in-tag";
216
+
emit_pending_chars ();
217
+
ignore (S.process t.sink Token.EOF)
218
+
| State.Rawtext ->
219
+
emit_pending_chars ();
220
+
ignore (S.process t.sink Token.EOF)
221
+
| State.Rawtext_less_than_sign ->
222
+
emit_char t '<';
223
+
emit_pending_chars ();
224
+
ignore (S.process t.sink Token.EOF)
225
+
| State.Rawtext_end_tag_open ->
226
+
emit_str t "</";
227
+
emit_pending_chars ();
228
+
ignore (S.process t.sink Token.EOF)
229
+
| State.Rawtext_end_tag_name ->
230
+
emit_str t "</";
231
+
emit_str t (Buffer.contents t.temp_buffer);
232
+
emit_pending_chars ();
233
+
ignore (S.process t.sink Token.EOF)
234
+
| State.Rcdata ->
235
+
emit_pending_chars ();
236
+
ignore (S.process t.sink Token.EOF)
237
+
| State.Rcdata_less_than_sign ->
238
+
emit_char t '<';
239
+
emit_pending_chars ();
240
+
ignore (S.process t.sink Token.EOF)
241
+
| State.Rcdata_end_tag_open ->
242
+
emit_str t "</";
243
+
emit_pending_chars ();
244
+
ignore (S.process t.sink Token.EOF)
245
+
| State.Rcdata_end_tag_name ->
246
+
emit_str t "</";
247
+
emit_str t (Buffer.contents t.temp_buffer);
248
+
emit_pending_chars ();
249
+
ignore (S.process t.sink Token.EOF)
250
+
| State.Script_data ->
251
+
emit_pending_chars ();
252
+
ignore (S.process t.sink Token.EOF)
253
+
| State.Script_data_less_than_sign ->
254
+
emit_char t '<';
255
+
emit_pending_chars ();
256
+
ignore (S.process t.sink Token.EOF)
257
+
| State.Script_data_end_tag_open ->
258
+
emit_str t "</";
259
+
emit_pending_chars ();
260
+
ignore (S.process t.sink Token.EOF)
261
+
| State.Script_data_end_tag_name ->
262
+
emit_str t "</";
263
+
emit_str t (Buffer.contents t.temp_buffer);
264
+
emit_pending_chars ();
265
+
ignore (S.process t.sink Token.EOF)
266
+
| State.Script_data_escape_start
267
+
| State.Script_data_escape_start_dash
268
+
| State.Script_data_escaped
269
+
| State.Script_data_escaped_dash
270
+
| State.Script_data_escaped_dash_dash ->
271
+
emit_pending_chars ();
272
+
ignore (S.process t.sink Token.EOF)
273
+
| State.Script_data_escaped_less_than_sign ->
274
+
emit_char t '<';
275
+
emit_pending_chars ();
276
+
ignore (S.process t.sink Token.EOF)
277
+
| State.Script_data_escaped_end_tag_open ->
278
+
emit_str t "</";
279
+
emit_pending_chars ();
280
+
ignore (S.process t.sink Token.EOF)
281
+
| State.Script_data_escaped_end_tag_name ->
282
+
emit_str t "</";
283
+
emit_str t (Buffer.contents t.temp_buffer);
284
+
emit_pending_chars ();
285
+
ignore (S.process t.sink Token.EOF)
286
+
| State.Script_data_double_escape_start
287
+
| State.Script_data_double_escaped
288
+
| State.Script_data_double_escaped_dash
289
+
| State.Script_data_double_escaped_dash_dash ->
290
+
emit_pending_chars ();
291
+
ignore (S.process t.sink Token.EOF)
292
+
| State.Script_data_double_escaped_less_than_sign ->
293
+
(* '<' was already emitted when entering this state from Script_data_double_escaped *)
294
+
emit_pending_chars ();
295
+
ignore (S.process t.sink Token.EOF)
296
+
| State.Script_data_double_escape_end ->
297
+
emit_pending_chars ();
298
+
ignore (S.process t.sink Token.EOF)
299
+
| State.Plaintext ->
300
+
emit_pending_chars ();
301
+
ignore (S.process t.sink Token.EOF)
302
+
| State.Comment_start
303
+
| State.Comment_start_dash
304
+
| State.Comment
305
+
| State.Comment_less_than_sign
306
+
| State.Comment_less_than_sign_bang
307
+
| State.Comment_less_than_sign_bang_dash
308
+
| State.Comment_less_than_sign_bang_dash_dash
309
+
| State.Comment_end_dash
310
+
| State.Comment_end
311
+
| State.Comment_end_bang ->
312
+
error t "eof-in-comment";
313
+
emit_current_comment ();
314
+
emit_pending_chars ();
315
+
ignore (S.process t.sink Token.EOF)
316
+
| State.Bogus_comment ->
317
+
emit_current_comment ();
318
+
emit_pending_chars ();
319
+
ignore (S.process t.sink Token.EOF)
320
+
| State.Markup_declaration_open ->
321
+
error t "incorrectly-opened-comment";
322
+
Buffer.clear t.current_comment;
323
+
emit_current_comment ();
324
+
emit_pending_chars ();
325
+
ignore (S.process t.sink Token.EOF)
326
+
| State.Doctype
327
+
| State.Before_doctype_name ->
328
+
error t "eof-in-doctype";
329
+
start_new_doctype t;
330
+
t.current_doctype_force_quirks <- true;
331
+
emit_current_doctype ();
332
+
emit_pending_chars ();
333
+
ignore (S.process t.sink Token.EOF)
334
+
| State.Doctype_name
335
+
| State.After_doctype_name
336
+
| State.After_doctype_public_keyword
337
+
| State.Before_doctype_public_identifier
338
+
| State.Doctype_public_identifier_double_quoted
339
+
| State.Doctype_public_identifier_single_quoted
340
+
| State.After_doctype_public_identifier
341
+
| State.Between_doctype_public_and_system_identifiers
342
+
| State.After_doctype_system_keyword
343
+
| State.Before_doctype_system_identifier
344
+
| State.Doctype_system_identifier_double_quoted
345
+
| State.Doctype_system_identifier_single_quoted
346
+
| State.After_doctype_system_identifier ->
347
+
error t "eof-in-doctype";
348
+
t.current_doctype_force_quirks <- true;
349
+
emit_current_doctype ();
350
+
emit_pending_chars ();
351
+
ignore (S.process t.sink Token.EOF)
352
+
| State.Bogus_doctype ->
353
+
emit_current_doctype ();
354
+
emit_pending_chars ();
355
+
ignore (S.process t.sink Token.EOF)
356
+
| State.Cdata_section ->
357
+
error t "eof-in-cdata";
358
+
emit_pending_chars ();
359
+
ignore (S.process t.sink Token.EOF)
360
+
| State.Cdata_section_bracket ->
361
+
error t "eof-in-cdata";
362
+
emit_char t ']';
363
+
emit_pending_chars ();
364
+
ignore (S.process t.sink Token.EOF)
365
+
| State.Cdata_section_end ->
366
+
error t "eof-in-cdata";
367
+
emit_str t "]]";
368
+
emit_pending_chars ();
369
+
ignore (S.process t.sink Token.EOF)
370
+
| State.Character_reference ->
371
+
(* state_character_reference never ran, so initialize temp_buffer with & *)
372
+
Buffer.clear t.temp_buffer;
373
+
Buffer.add_char t.temp_buffer '&';
374
+
flush_code_points_consumed_as_char_ref t;
375
+
t.state <- t.return_state;
376
+
handle_eof ()
377
+
| State.Named_character_reference
378
+
| State.Numeric_character_reference
379
+
| State.Hexadecimal_character_reference_start
380
+
| State.Decimal_character_reference_start
381
+
| State.Numeric_character_reference_end ->
382
+
flush_code_points_consumed_as_char_ref t;
383
+
t.state <- t.return_state;
384
+
handle_eof ()
385
+
| State.Ambiguous_ampersand ->
386
+
(* Buffer was already flushed when entering this state, just transition *)
387
+
t.state <- t.return_state;
388
+
handle_eof ()
389
+
| State.Hexadecimal_character_reference
390
+
| State.Decimal_character_reference ->
391
+
(* At EOF with collected digits - convert the numeric reference *)
392
+
error t "missing-semicolon-after-character-reference";
393
+
let code = t.char_ref_code in
394
+
let replacement_char = "\xEF\xBF\xBD" in
395
+
let result =
396
+
if code = 0 then begin
397
+
error t "null-character-reference";
398
+
replacement_char
399
+
end else if code > 0x10FFFF then begin
400
+
error t "character-reference-outside-unicode-range";
401
+
replacement_char
402
+
end else if code >= 0xD800 && code <= 0xDFFF then begin
403
+
error t "surrogate-character-reference";
404
+
replacement_char
405
+
end else
406
+
Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
407
+
in
408
+
Buffer.clear t.temp_buffer;
409
+
Buffer.add_string t.temp_buffer result;
410
+
flush_code_points_consumed_as_char_ref t;
411
+
t.state <- t.return_state;
412
+
handle_eof ()
413
+
414
+
and step () =
415
+
match t.state with
416
+
| State.Data -> state_data ()
417
+
| State.Rcdata -> state_rcdata ()
418
+
| State.Rawtext -> state_rawtext ()
419
+
| State.Script_data -> state_script_data ()
420
+
| State.Plaintext -> state_plaintext ()
421
+
| State.Tag_open -> state_tag_open ()
422
+
| State.End_tag_open -> state_end_tag_open ()
423
+
| State.Tag_name -> state_tag_name ()
424
+
| State.Rcdata_less_than_sign -> state_rcdata_less_than_sign ()
425
+
| State.Rcdata_end_tag_open -> state_rcdata_end_tag_open ()
426
+
| State.Rcdata_end_tag_name -> state_rcdata_end_tag_name ()
427
+
| State.Rawtext_less_than_sign -> state_rawtext_less_than_sign ()
428
+
| State.Rawtext_end_tag_open -> state_rawtext_end_tag_open ()
429
+
| State.Rawtext_end_tag_name -> state_rawtext_end_tag_name ()
430
+
| State.Script_data_less_than_sign -> state_script_data_less_than_sign ()
431
+
| State.Script_data_end_tag_open -> state_script_data_end_tag_open ()
432
+
| State.Script_data_end_tag_name -> state_script_data_end_tag_name ()
433
+
| State.Script_data_escape_start -> state_script_data_escape_start ()
434
+
| State.Script_data_escape_start_dash -> state_script_data_escape_start_dash ()
435
+
| State.Script_data_escaped -> state_script_data_escaped ()
436
+
| State.Script_data_escaped_dash -> state_script_data_escaped_dash ()
437
+
| State.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash ()
438
+
| State.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign ()
439
+
| State.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open ()
440
+
| State.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name ()
441
+
| State.Script_data_double_escape_start -> state_script_data_double_escape_start ()
442
+
| State.Script_data_double_escaped -> state_script_data_double_escaped ()
443
+
| State.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash ()
444
+
| State.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash ()
445
+
| State.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign ()
446
+
| State.Script_data_double_escape_end -> state_script_data_double_escape_end ()
447
+
| State.Before_attribute_name -> state_before_attribute_name ()
448
+
| State.Attribute_name -> state_attribute_name ()
449
+
| State.After_attribute_name -> state_after_attribute_name ()
450
+
| State.Before_attribute_value -> state_before_attribute_value ()
451
+
| State.Attribute_value_double_quoted -> state_attribute_value_double_quoted ()
452
+
| State.Attribute_value_single_quoted -> state_attribute_value_single_quoted ()
453
+
| State.Attribute_value_unquoted -> state_attribute_value_unquoted ()
454
+
| State.After_attribute_value_quoted -> state_after_attribute_value_quoted ()
455
+
| State.Self_closing_start_tag -> state_self_closing_start_tag ()
456
+
| State.Bogus_comment -> state_bogus_comment ()
457
+
| State.Markup_declaration_open -> state_markup_declaration_open ()
458
+
| State.Comment_start -> state_comment_start ()
459
+
| State.Comment_start_dash -> state_comment_start_dash ()
460
+
| State.Comment -> state_comment ()
461
+
| State.Comment_less_than_sign -> state_comment_less_than_sign ()
462
+
| State.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang ()
463
+
| State.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash ()
464
+
| State.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash ()
465
+
| State.Comment_end_dash -> state_comment_end_dash ()
466
+
| State.Comment_end -> state_comment_end ()
467
+
| State.Comment_end_bang -> state_comment_end_bang ()
468
+
| State.Doctype -> state_doctype ()
469
+
| State.Before_doctype_name -> state_before_doctype_name ()
470
+
| State.Doctype_name -> state_doctype_name ()
471
+
| State.After_doctype_name -> state_after_doctype_name ()
472
+
| State.After_doctype_public_keyword -> state_after_doctype_public_keyword ()
473
+
| State.Before_doctype_public_identifier -> state_before_doctype_public_identifier ()
474
+
| State.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted ()
475
+
| State.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted ()
476
+
| State.After_doctype_public_identifier -> state_after_doctype_public_identifier ()
477
+
| State.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers ()
478
+
| State.After_doctype_system_keyword -> state_after_doctype_system_keyword ()
479
+
| State.Before_doctype_system_identifier -> state_before_doctype_system_identifier ()
480
+
| State.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted ()
481
+
| State.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted ()
482
+
| State.After_doctype_system_identifier -> state_after_doctype_system_identifier ()
483
+
| State.Bogus_doctype -> state_bogus_doctype ()
484
+
| State.Cdata_section -> state_cdata_section ()
485
+
| State.Cdata_section_bracket -> state_cdata_section_bracket ()
486
+
| State.Cdata_section_end -> state_cdata_section_end ()
487
+
| State.Character_reference -> state_character_reference ()
488
+
| State.Named_character_reference -> state_named_character_reference ()
489
+
| State.Ambiguous_ampersand -> state_ambiguous_ampersand ()
490
+
| State.Numeric_character_reference -> state_numeric_character_reference ()
491
+
| State.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start ()
492
+
| State.Decimal_character_reference_start -> state_decimal_character_reference_start ()
493
+
| State.Hexadecimal_character_reference -> state_hexadecimal_character_reference ()
494
+
| State.Decimal_character_reference -> state_decimal_character_reference ()
495
+
| State.Numeric_character_reference_end -> state_numeric_character_reference_end ()
496
+
497
+
(* State implementations *)
498
+
and state_data () =
499
+
match Stream.consume t.stream with
500
+
| Some '&' ->
501
+
t.return_state <- State.Data;
502
+
t.state <- State.Character_reference
503
+
| Some '<' ->
504
+
t.state <- State.Tag_open
505
+
| Some '\x00' ->
506
+
(* Emit pending chars first, then emit null separately for proper tree builder handling *)
507
+
emit_pending_chars ();
508
+
error t "unexpected-null-character";
509
+
ignore (S.process t.sink (Token.Character "\x00"))
510
+
| Some c ->
511
+
emit_char t c
512
+
| None -> ()
513
+
514
+
and state_rcdata () =
515
+
match Stream.consume t.stream with
516
+
| Some '&' ->
517
+
t.return_state <- State.Rcdata;
518
+
t.state <- State.Character_reference
519
+
| Some '<' ->
520
+
t.state <- State.Rcdata_less_than_sign
521
+
| Some '\x00' ->
522
+
error t "unexpected-null-character";
523
+
emit_str t "\xEF\xBF\xBD"
524
+
| Some c ->
525
+
emit_char t c
526
+
| None -> ()
527
+
528
+
and state_rawtext () =
529
+
match Stream.consume t.stream with
530
+
| Some '<' ->
531
+
t.state <- State.Rawtext_less_than_sign
532
+
| Some '\x00' ->
533
+
error t "unexpected-null-character";
534
+
emit_str t "\xEF\xBF\xBD"
535
+
| Some c ->
536
+
emit_char t c
537
+
| None -> ()
538
+
539
+
and state_script_data () =
540
+
match Stream.consume t.stream with
541
+
| Some '<' ->
542
+
t.state <- State.Script_data_less_than_sign
543
+
| Some '\x00' ->
544
+
error t "unexpected-null-character";
545
+
emit_str t "\xEF\xBF\xBD"
546
+
| Some c ->
547
+
emit_char t c
548
+
| None -> ()
549
+
550
+
and state_plaintext () =
551
+
match Stream.consume t.stream with
552
+
| Some '\x00' ->
553
+
error t "unexpected-null-character";
554
+
emit_str t "\xEF\xBF\xBD"
555
+
| Some c ->
556
+
emit_char t c
557
+
| None -> ()
558
+
559
+
and state_tag_open () =
560
+
match Stream.peek t.stream with
561
+
| Some '!' ->
562
+
Stream.advance t.stream;
563
+
t.state <- State.Markup_declaration_open
564
+
| Some '/' ->
565
+
Stream.advance t.stream;
566
+
t.state <- State.End_tag_open
567
+
| Some c when is_ascii_alpha c ->
568
+
start_new_tag t Token.Start;
569
+
t.state <- State.Tag_name
570
+
| Some '?' ->
571
+
error t "unexpected-question-mark-instead-of-tag-name";
572
+
Buffer.clear t.current_comment;
573
+
t.state <- State.Bogus_comment
574
+
| None ->
575
+
error t "eof-before-tag-name";
576
+
emit_char t '<'
577
+
| Some _ ->
578
+
error t "invalid-first-character-of-tag-name";
579
+
emit_char t '<';
580
+
t.state <- State.Data
581
+
582
+
and state_end_tag_open () =
583
+
match Stream.peek t.stream with
584
+
| Some c when is_ascii_alpha c ->
585
+
start_new_tag t Token.End;
586
+
t.state <- State.Tag_name
587
+
| Some '>' ->
588
+
Stream.advance t.stream;
589
+
error t "missing-end-tag-name";
590
+
t.state <- State.Data
591
+
| None ->
592
+
error t "eof-before-tag-name";
593
+
emit_str t "</"
594
+
| Some _ ->
595
+
error t "invalid-first-character-of-tag-name";
596
+
Buffer.clear t.current_comment;
597
+
t.state <- State.Bogus_comment
598
+
599
+
and state_tag_name () =
600
+
match Stream.consume t.stream with
601
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
602
+
t.state <- State.Before_attribute_name
603
+
| Some '/' ->
604
+
t.state <- State.Self_closing_start_tag
605
+
| Some '>' ->
606
+
t.state <- State.Data;
607
+
emit_current_tag ()
608
+
| Some '\x00' ->
609
+
error t "unexpected-null-character";
610
+
Buffer.add_string t.current_tag_name "\xEF\xBF\xBD"
611
+
| Some c ->
612
+
Buffer.add_char t.current_tag_name (ascii_lower c)
613
+
| None -> ()
614
+
615
+
and state_rcdata_less_than_sign () =
616
+
match Stream.peek t.stream with
617
+
| Some '/' ->
618
+
Stream.advance t.stream;
619
+
Buffer.clear t.temp_buffer;
620
+
t.state <- State.Rcdata_end_tag_open
621
+
| _ ->
622
+
emit_char t '<';
623
+
t.state <- State.Rcdata
624
+
625
+
and state_rcdata_end_tag_open () =
626
+
match Stream.peek t.stream with
627
+
| Some c when is_ascii_alpha c ->
628
+
start_new_tag t Token.End;
629
+
t.state <- State.Rcdata_end_tag_name
630
+
| _ ->
631
+
emit_str t "</";
632
+
t.state <- State.Rcdata
633
+
634
+
and state_rcdata_end_tag_name () =
635
+
match Stream.peek t.stream with
636
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
637
+
Stream.advance t.stream;
638
+
t.state <- State.Before_attribute_name
639
+
| Some '/' when is_appropriate_end_tag t ->
640
+
Stream.advance t.stream;
641
+
t.state <- State.Self_closing_start_tag
642
+
| Some '>' when is_appropriate_end_tag t ->
643
+
Stream.advance t.stream;
644
+
t.state <- State.Data;
645
+
emit_current_tag ()
646
+
| Some c when is_ascii_alpha c ->
647
+
Stream.advance t.stream;
648
+
Buffer.add_char t.current_tag_name (ascii_lower c);
649
+
Buffer.add_char t.temp_buffer c
650
+
| _ ->
651
+
emit_str t "</";
652
+
emit_str t (Buffer.contents t.temp_buffer);
653
+
t.state <- State.Rcdata
654
+
655
+
and state_rawtext_less_than_sign () =
656
+
match Stream.peek t.stream with
657
+
| Some '/' ->
658
+
Stream.advance t.stream;
659
+
Buffer.clear t.temp_buffer;
660
+
t.state <- State.Rawtext_end_tag_open
661
+
| _ ->
662
+
emit_char t '<';
663
+
t.state <- State.Rawtext
664
+
665
+
and state_rawtext_end_tag_open () =
666
+
match Stream.peek t.stream with
667
+
| Some c when is_ascii_alpha c ->
668
+
start_new_tag t Token.End;
669
+
t.state <- State.Rawtext_end_tag_name
670
+
| _ ->
671
+
emit_str t "</";
672
+
t.state <- State.Rawtext
673
+
674
+
and state_rawtext_end_tag_name () =
675
+
match Stream.peek t.stream with
676
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
677
+
Stream.advance t.stream;
678
+
t.state <- State.Before_attribute_name
679
+
| Some '/' when is_appropriate_end_tag t ->
680
+
Stream.advance t.stream;
681
+
t.state <- State.Self_closing_start_tag
682
+
| Some '>' when is_appropriate_end_tag t ->
683
+
Stream.advance t.stream;
684
+
t.state <- State.Data;
685
+
emit_current_tag ()
686
+
| Some c when is_ascii_alpha c ->
687
+
Stream.advance t.stream;
688
+
Buffer.add_char t.current_tag_name (ascii_lower c);
689
+
Buffer.add_char t.temp_buffer c
690
+
| _ ->
691
+
emit_str t "</";
692
+
emit_str t (Buffer.contents t.temp_buffer);
693
+
t.state <- State.Rawtext
694
+
695
+
and state_script_data_less_than_sign () =
696
+
match Stream.peek t.stream with
697
+
| Some '/' ->
698
+
Stream.advance t.stream;
699
+
Buffer.clear t.temp_buffer;
700
+
t.state <- State.Script_data_end_tag_open
701
+
| Some '!' ->
702
+
Stream.advance t.stream;
703
+
t.state <- State.Script_data_escape_start;
704
+
emit_str t "<!"
705
+
| _ ->
706
+
emit_char t '<';
707
+
t.state <- State.Script_data
708
+
709
+
and state_script_data_end_tag_open () =
710
+
match Stream.peek t.stream with
711
+
| Some c when is_ascii_alpha c ->
712
+
start_new_tag t Token.End;
713
+
t.state <- State.Script_data_end_tag_name
714
+
| _ ->
715
+
emit_str t "</";
716
+
t.state <- State.Script_data
717
+
718
+
and state_script_data_end_tag_name () =
719
+
match Stream.peek t.stream with
720
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
721
+
Stream.advance t.stream;
722
+
t.state <- State.Before_attribute_name
723
+
| Some '/' when is_appropriate_end_tag t ->
724
+
Stream.advance t.stream;
725
+
t.state <- State.Self_closing_start_tag
726
+
| Some '>' when is_appropriate_end_tag t ->
727
+
Stream.advance t.stream;
728
+
t.state <- State.Data;
729
+
emit_current_tag ()
730
+
| Some c when is_ascii_alpha c ->
731
+
Stream.advance t.stream;
732
+
Buffer.add_char t.current_tag_name (ascii_lower c);
733
+
Buffer.add_char t.temp_buffer c
734
+
| _ ->
735
+
emit_str t "</";
736
+
emit_str t (Buffer.contents t.temp_buffer);
737
+
t.state <- State.Script_data
738
+
739
+
and state_script_data_escape_start () =
740
+
match Stream.peek t.stream with
741
+
| Some '-' ->
742
+
Stream.advance t.stream;
743
+
t.state <- State.Script_data_escape_start_dash;
744
+
emit_char t '-'
745
+
| _ ->
746
+
t.state <- State.Script_data
747
+
748
+
and state_script_data_escape_start_dash () =
749
+
match Stream.peek t.stream with
750
+
| Some '-' ->
751
+
Stream.advance t.stream;
752
+
t.state <- State.Script_data_escaped_dash_dash;
753
+
emit_char t '-'
754
+
| _ ->
755
+
t.state <- State.Script_data
756
+
757
+
and state_script_data_escaped () =
758
+
match Stream.consume t.stream with
759
+
| Some '-' ->
760
+
t.state <- State.Script_data_escaped_dash;
761
+
emit_char t '-'
762
+
| Some '<' ->
763
+
t.state <- State.Script_data_escaped_less_than_sign
764
+
| Some '\x00' ->
765
+
error t "unexpected-null-character";
766
+
emit_str t "\xEF\xBF\xBD"
767
+
| Some c ->
768
+
emit_char t c
769
+
| None -> ()
770
+
771
+
and state_script_data_escaped_dash () =
772
+
match Stream.consume t.stream with
773
+
| Some '-' ->
774
+
t.state <- State.Script_data_escaped_dash_dash;
775
+
emit_char t '-'
776
+
| Some '<' ->
777
+
t.state <- State.Script_data_escaped_less_than_sign
778
+
| Some '\x00' ->
779
+
error t "unexpected-null-character";
780
+
t.state <- State.Script_data_escaped;
781
+
emit_str t "\xEF\xBF\xBD"
782
+
| Some c ->
783
+
t.state <- State.Script_data_escaped;
784
+
emit_char t c
785
+
| None -> ()
786
+
787
+
and state_script_data_escaped_dash_dash () =
788
+
match Stream.consume t.stream with
789
+
| Some '-' ->
790
+
emit_char t '-'
791
+
| Some '<' ->
792
+
t.state <- State.Script_data_escaped_less_than_sign
793
+
| Some '>' ->
794
+
t.state <- State.Script_data;
795
+
emit_char t '>'
796
+
| Some '\x00' ->
797
+
error t "unexpected-null-character";
798
+
t.state <- State.Script_data_escaped;
799
+
emit_str t "\xEF\xBF\xBD"
800
+
| Some c ->
801
+
t.state <- State.Script_data_escaped;
802
+
emit_char t c
803
+
| None -> ()
804
+
805
+
and state_script_data_escaped_less_than_sign () =
806
+
match Stream.peek t.stream with
807
+
| Some '/' ->
808
+
Stream.advance t.stream;
809
+
Buffer.clear t.temp_buffer;
810
+
t.state <- State.Script_data_escaped_end_tag_open
811
+
| Some c when is_ascii_alpha c ->
812
+
Buffer.clear t.temp_buffer;
813
+
emit_char t '<';
814
+
t.state <- State.Script_data_double_escape_start
815
+
| _ ->
816
+
emit_char t '<';
817
+
t.state <- State.Script_data_escaped
818
+
819
+
and state_script_data_escaped_end_tag_open () =
820
+
match Stream.peek t.stream with
821
+
| Some c when is_ascii_alpha c ->
822
+
start_new_tag t Token.End;
823
+
t.state <- State.Script_data_escaped_end_tag_name
824
+
| _ ->
825
+
emit_str t "</";
826
+
t.state <- State.Script_data_escaped
827
+
828
+
and state_script_data_escaped_end_tag_name () =
829
+
match Stream.peek t.stream with
830
+
| Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t ->
831
+
Stream.advance t.stream;
832
+
t.state <- State.Before_attribute_name
833
+
| Some '/' when is_appropriate_end_tag t ->
834
+
Stream.advance t.stream;
835
+
t.state <- State.Self_closing_start_tag
836
+
| Some '>' when is_appropriate_end_tag t ->
837
+
Stream.advance t.stream;
838
+
t.state <- State.Data;
839
+
emit_current_tag ()
840
+
| Some c when is_ascii_alpha c ->
841
+
Stream.advance t.stream;
842
+
Buffer.add_char t.current_tag_name (ascii_lower c);
843
+
Buffer.add_char t.temp_buffer c
844
+
| _ ->
845
+
emit_str t "</";
846
+
emit_str t (Buffer.contents t.temp_buffer);
847
+
t.state <- State.Script_data_escaped
848
+
849
+
and state_script_data_double_escape_start () =
850
+
match Stream.peek t.stream with
851
+
| Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
852
+
Stream.advance t.stream;
853
+
let c = Option.get c_opt in
854
+
if Buffer.contents t.temp_buffer = "script" then
855
+
t.state <- State.Script_data_double_escaped
856
+
else
857
+
t.state <- State.Script_data_escaped;
858
+
emit_char t c
859
+
| Some c when is_ascii_alpha c ->
860
+
Stream.advance t.stream;
861
+
Buffer.add_char t.temp_buffer (ascii_lower c);
862
+
emit_char t c
863
+
| _ ->
864
+
t.state <- State.Script_data_escaped
865
+
866
+
and state_script_data_double_escaped () =
867
+
match Stream.consume t.stream with
868
+
| Some '-' ->
869
+
t.state <- State.Script_data_double_escaped_dash;
870
+
emit_char t '-'
871
+
| Some '<' ->
872
+
t.state <- State.Script_data_double_escaped_less_than_sign;
873
+
emit_char t '<'
874
+
| Some '\x00' ->
875
+
error t "unexpected-null-character";
876
+
emit_str t "\xEF\xBF\xBD"
877
+
| Some c ->
878
+
emit_char t c
879
+
| None -> ()
880
+
881
+
and state_script_data_double_escaped_dash () =
882
+
match Stream.consume t.stream with
883
+
| Some '-' ->
884
+
t.state <- State.Script_data_double_escaped_dash_dash;
885
+
emit_char t '-'
886
+
| Some '<' ->
887
+
t.state <- State.Script_data_double_escaped_less_than_sign;
888
+
emit_char t '<'
889
+
| Some '\x00' ->
890
+
error t "unexpected-null-character";
891
+
t.state <- State.Script_data_double_escaped;
892
+
emit_str t "\xEF\xBF\xBD"
893
+
| Some c ->
894
+
t.state <- State.Script_data_double_escaped;
895
+
emit_char t c
896
+
| None -> ()
897
+
898
+
and state_script_data_double_escaped_dash_dash () =
899
+
match Stream.consume t.stream with
900
+
| Some '-' ->
901
+
emit_char t '-'
902
+
| Some '<' ->
903
+
t.state <- State.Script_data_double_escaped_less_than_sign;
904
+
emit_char t '<'
905
+
| Some '>' ->
906
+
t.state <- State.Script_data;
907
+
emit_char t '>'
908
+
| Some '\x00' ->
909
+
error t "unexpected-null-character";
910
+
t.state <- State.Script_data_double_escaped;
911
+
emit_str t "\xEF\xBF\xBD"
912
+
| Some c ->
913
+
t.state <- State.Script_data_double_escaped;
914
+
emit_char t c
915
+
| None -> ()
916
+
917
+
and state_script_data_double_escaped_less_than_sign () =
918
+
match Stream.peek t.stream with
919
+
| Some '/' ->
920
+
Stream.advance t.stream;
921
+
Buffer.clear t.temp_buffer;
922
+
t.state <- State.Script_data_double_escape_end;
923
+
emit_char t '/'
924
+
| _ ->
925
+
t.state <- State.Script_data_double_escaped
926
+
927
+
and state_script_data_double_escape_end () =
928
+
match Stream.peek t.stream with
929
+
| Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt ->
930
+
Stream.advance t.stream;
931
+
let c = Option.get c_opt in
932
+
if Buffer.contents t.temp_buffer = "script" then
933
+
t.state <- State.Script_data_escaped
934
+
else
935
+
t.state <- State.Script_data_double_escaped;
936
+
emit_char t c
937
+
| Some c when is_ascii_alpha c ->
938
+
Stream.advance t.stream;
939
+
Buffer.add_char t.temp_buffer (ascii_lower c);
940
+
emit_char t c
941
+
| _ ->
942
+
t.state <- State.Script_data_double_escaped
943
+
944
+
and state_before_attribute_name () =
945
+
match Stream.peek t.stream with
946
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
947
+
Stream.advance t.stream
948
+
| Some '/' | Some '>' | None ->
949
+
t.state <- State.After_attribute_name
950
+
| Some '=' ->
951
+
Stream.advance t.stream;
952
+
error t "unexpected-equals-sign-before-attribute-name";
953
+
start_new_attribute t;
954
+
Buffer.add_char t.current_attr_name '=';
955
+
t.state <- State.Attribute_name
956
+
| Some _ ->
957
+
start_new_attribute t;
958
+
t.state <- State.Attribute_name
959
+
960
+
and state_attribute_name () =
961
+
match Stream.peek t.stream with
962
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
963
+
Stream.advance t.stream;
964
+
t.state <- State.After_attribute_name
965
+
| Some '/' | Some '>' | None ->
966
+
t.state <- State.After_attribute_name
967
+
| Some '=' ->
968
+
Stream.advance t.stream;
969
+
t.state <- State.Before_attribute_value
970
+
| Some '\x00' ->
971
+
Stream.advance t.stream;
972
+
error t "unexpected-null-character";
973
+
Buffer.add_string t.current_attr_name "\xEF\xBF\xBD"
974
+
| Some ('"' | '\'' | '<') as c_opt ->
975
+
Stream.advance t.stream;
976
+
error t "unexpected-character-in-attribute-name";
977
+
Buffer.add_char t.current_attr_name (Option.get c_opt)
978
+
| Some c ->
979
+
Stream.advance t.stream;
980
+
Buffer.add_char t.current_attr_name (ascii_lower c)
981
+
982
+
and state_after_attribute_name () =
983
+
match Stream.peek t.stream with
984
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
985
+
Stream.advance t.stream
986
+
| Some '/' ->
987
+
Stream.advance t.stream;
988
+
t.state <- State.Self_closing_start_tag
989
+
| Some '=' ->
990
+
Stream.advance t.stream;
991
+
t.state <- State.Before_attribute_value
992
+
| Some '>' ->
993
+
Stream.advance t.stream;
994
+
t.state <- State.Data;
995
+
emit_current_tag ()
996
+
| None -> ()
997
+
| Some _ ->
998
+
start_new_attribute t;
999
+
t.state <- State.Attribute_name
1000
+
1001
+
and state_before_attribute_value () =
1002
+
match Stream.peek t.stream with
1003
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1004
+
Stream.advance t.stream
1005
+
| Some '"' ->
1006
+
Stream.advance t.stream;
1007
+
t.state <- State.Attribute_value_double_quoted
1008
+
| Some '\'' ->
1009
+
Stream.advance t.stream;
1010
+
t.state <- State.Attribute_value_single_quoted
1011
+
| Some '>' ->
1012
+
Stream.advance t.stream;
1013
+
error t "missing-attribute-value";
1014
+
t.state <- State.Data;
1015
+
emit_current_tag ()
1016
+
| _ ->
1017
+
t.state <- State.Attribute_value_unquoted
1018
+
1019
+
and state_attribute_value_double_quoted () =
1020
+
match Stream.consume t.stream with
1021
+
| Some '"' ->
1022
+
t.state <- State.After_attribute_value_quoted
1023
+
| Some '&' ->
1024
+
t.return_state <- State.Attribute_value_double_quoted;
1025
+
t.state <- State.Character_reference
1026
+
| Some '\x00' ->
1027
+
error t "unexpected-null-character";
1028
+
Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
1029
+
| Some c ->
1030
+
Buffer.add_char t.current_attr_value c
1031
+
| None -> ()
1032
+
1033
+
and state_attribute_value_single_quoted () =
1034
+
match Stream.consume t.stream with
1035
+
| Some '\'' ->
1036
+
t.state <- State.After_attribute_value_quoted
1037
+
| Some '&' ->
1038
+
t.return_state <- State.Attribute_value_single_quoted;
1039
+
t.state <- State.Character_reference
1040
+
| Some '\x00' ->
1041
+
error t "unexpected-null-character";
1042
+
Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
1043
+
| Some c ->
1044
+
Buffer.add_char t.current_attr_value c
1045
+
| None -> ()
1046
+
1047
+
and state_attribute_value_unquoted () =
1048
+
match Stream.peek t.stream with
1049
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1050
+
Stream.advance t.stream;
1051
+
t.state <- State.Before_attribute_name
1052
+
| Some '&' ->
1053
+
Stream.advance t.stream;
1054
+
t.return_state <- State.Attribute_value_unquoted;
1055
+
t.state <- State.Character_reference
1056
+
| Some '>' ->
1057
+
Stream.advance t.stream;
1058
+
t.state <- State.Data;
1059
+
emit_current_tag ()
1060
+
| Some '\x00' ->
1061
+
Stream.advance t.stream;
1062
+
error t "unexpected-null-character";
1063
+
Buffer.add_string t.current_attr_value "\xEF\xBF\xBD"
1064
+
| Some ('"' | '\'' | '<' | '=' | '`') as c_opt ->
1065
+
Stream.advance t.stream;
1066
+
error t "unexpected-character-in-unquoted-attribute-value";
1067
+
Buffer.add_char t.current_attr_value (Option.get c_opt)
1068
+
| Some c ->
1069
+
Stream.advance t.stream;
1070
+
Buffer.add_char t.current_attr_value c
1071
+
| None -> ()
1072
+
1073
+
and state_after_attribute_value_quoted () =
1074
+
match Stream.peek t.stream with
1075
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1076
+
Stream.advance t.stream;
1077
+
t.state <- State.Before_attribute_name
1078
+
| Some '/' ->
1079
+
Stream.advance t.stream;
1080
+
t.state <- State.Self_closing_start_tag
1081
+
| Some '>' ->
1082
+
Stream.advance t.stream;
1083
+
t.state <- State.Data;
1084
+
emit_current_tag ()
1085
+
| None -> ()
1086
+
| Some _ ->
1087
+
error t "missing-whitespace-between-attributes";
1088
+
t.state <- State.Before_attribute_name
1089
+
1090
+
and state_self_closing_start_tag () =
1091
+
match Stream.peek t.stream with
1092
+
| Some '>' ->
1093
+
Stream.advance t.stream;
1094
+
t.current_tag_self_closing <- true;
1095
+
t.state <- State.Data;
1096
+
emit_current_tag ()
1097
+
| None -> ()
1098
+
| Some _ ->
1099
+
error t "unexpected-solidus-in-tag";
1100
+
t.state <- State.Before_attribute_name
1101
+
1102
+
and state_bogus_comment () =
1103
+
match Stream.consume t.stream with
1104
+
| Some '>' ->
1105
+
t.state <- State.Data;
1106
+
emit_current_comment ()
1107
+
| Some '\x00' ->
1108
+
error t "unexpected-null-character";
1109
+
Buffer.add_string t.current_comment "\xEF\xBF\xBD"
1110
+
| Some c ->
1111
+
Buffer.add_char t.current_comment c
1112
+
| None -> ()
1113
+
1114
+
and state_markup_declaration_open () =
1115
+
if Stream.matches_ci t.stream "--" then begin
1116
+
ignore (Stream.consume_exact_ci t.stream "--");
1117
+
Buffer.clear t.current_comment;
1118
+
t.state <- State.Comment_start
1119
+
end else if Stream.matches_ci t.stream "DOCTYPE" then begin
1120
+
ignore (Stream.consume_exact_ci t.stream "DOCTYPE");
1121
+
t.state <- State.Doctype
1122
+
end else if Stream.matches_ci t.stream "[CDATA[" then begin
1123
+
ignore (Stream.consume_exact_ci t.stream "[CDATA[");
1124
+
(* CDATA only allowed in foreign content *)
1125
+
if S.adjusted_current_node_in_html_namespace t.sink then begin
1126
+
error t "cdata-in-html-content";
1127
+
Buffer.clear t.current_comment;
1128
+
Buffer.add_string t.current_comment "[CDATA[";
1129
+
t.state <- State.Bogus_comment
1130
+
end else
1131
+
t.state <- State.Cdata_section
1132
+
end else begin
1133
+
error t "incorrectly-opened-comment";
1134
+
Buffer.clear t.current_comment;
1135
+
t.state <- State.Bogus_comment
1136
+
end
1137
+
1138
+
and state_comment_start () =
1139
+
match Stream.peek t.stream with
1140
+
| Some '-' ->
1141
+
Stream.advance t.stream;
1142
+
t.state <- State.Comment_start_dash
1143
+
| Some '>' ->
1144
+
Stream.advance t.stream;
1145
+
error t "abrupt-closing-of-empty-comment";
1146
+
t.state <- State.Data;
1147
+
emit_current_comment ()
1148
+
| _ ->
1149
+
t.state <- State.Comment
1150
+
1151
+
and state_comment_start_dash () =
1152
+
match Stream.peek t.stream with
1153
+
| Some '-' ->
1154
+
Stream.advance t.stream;
1155
+
t.state <- State.Comment_end
1156
+
| Some '>' ->
1157
+
Stream.advance t.stream;
1158
+
error t "abrupt-closing-of-empty-comment";
1159
+
t.state <- State.Data;
1160
+
emit_current_comment ()
1161
+
| None -> ()
1162
+
| Some _ ->
1163
+
Buffer.add_char t.current_comment '-';
1164
+
t.state <- State.Comment
1165
+
1166
+
and state_comment () =
1167
+
match Stream.consume t.stream with
1168
+
| Some '<' ->
1169
+
Buffer.add_char t.current_comment '<';
1170
+
t.state <- State.Comment_less_than_sign
1171
+
| Some '-' ->
1172
+
t.state <- State.Comment_end_dash
1173
+
| Some '\x00' ->
1174
+
error t "unexpected-null-character";
1175
+
Buffer.add_string t.current_comment "\xEF\xBF\xBD"
1176
+
| Some c ->
1177
+
Buffer.add_char t.current_comment c
1178
+
| None -> ()
1179
+
1180
+
and state_comment_less_than_sign () =
1181
+
match Stream.peek t.stream with
1182
+
| Some '!' ->
1183
+
Stream.advance t.stream;
1184
+
Buffer.add_char t.current_comment '!';
1185
+
t.state <- State.Comment_less_than_sign_bang
1186
+
| Some '<' ->
1187
+
Stream.advance t.stream;
1188
+
Buffer.add_char t.current_comment '<'
1189
+
| _ ->
1190
+
t.state <- State.Comment
1191
+
1192
+
and state_comment_less_than_sign_bang () =
1193
+
match Stream.peek t.stream with
1194
+
| Some '-' ->
1195
+
Stream.advance t.stream;
1196
+
t.state <- State.Comment_less_than_sign_bang_dash
1197
+
| _ ->
1198
+
t.state <- State.Comment
1199
+
1200
+
and state_comment_less_than_sign_bang_dash () =
1201
+
match Stream.peek t.stream with
1202
+
| Some '-' ->
1203
+
Stream.advance t.stream;
1204
+
t.state <- State.Comment_less_than_sign_bang_dash_dash
1205
+
| _ ->
1206
+
t.state <- State.Comment_end_dash
1207
+
1208
+
and state_comment_less_than_sign_bang_dash_dash () =
1209
+
match Stream.peek t.stream with
1210
+
| Some '>' | None ->
1211
+
t.state <- State.Comment_end
1212
+
| Some _ ->
1213
+
error t "nested-comment";
1214
+
t.state <- State.Comment_end
1215
+
1216
+
and state_comment_end_dash () =
1217
+
match Stream.peek t.stream with
1218
+
| Some '-' ->
1219
+
Stream.advance t.stream;
1220
+
t.state <- State.Comment_end
1221
+
| None -> ()
1222
+
| Some _ ->
1223
+
Buffer.add_char t.current_comment '-';
1224
+
t.state <- State.Comment
1225
+
1226
+
and state_comment_end () =
1227
+
match Stream.peek t.stream with
1228
+
| Some '>' ->
1229
+
Stream.advance t.stream;
1230
+
t.state <- State.Data;
1231
+
emit_current_comment ()
1232
+
| Some '!' ->
1233
+
Stream.advance t.stream;
1234
+
t.state <- State.Comment_end_bang
1235
+
| Some '-' ->
1236
+
Stream.advance t.stream;
1237
+
Buffer.add_char t.current_comment '-'
1238
+
| None -> ()
1239
+
| Some _ ->
1240
+
Buffer.add_string t.current_comment "--";
1241
+
t.state <- State.Comment
1242
+
1243
+
and state_comment_end_bang () =
1244
+
match Stream.peek t.stream with
1245
+
| Some '-' ->
1246
+
Stream.advance t.stream;
1247
+
Buffer.add_string t.current_comment "--!";
1248
+
t.state <- State.Comment_end_dash
1249
+
| Some '>' ->
1250
+
Stream.advance t.stream;
1251
+
error t "incorrectly-closed-comment";
1252
+
t.state <- State.Data;
1253
+
emit_current_comment ()
1254
+
| None -> ()
1255
+
| Some _ ->
1256
+
Buffer.add_string t.current_comment "--!";
1257
+
t.state <- State.Comment
1258
+
1259
+
and state_doctype () =
1260
+
match Stream.peek t.stream with
1261
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1262
+
Stream.advance t.stream;
1263
+
t.state <- State.Before_doctype_name
1264
+
| Some '>' ->
1265
+
t.state <- State.Before_doctype_name
1266
+
| None -> ()
1267
+
| Some _ ->
1268
+
error t "missing-whitespace-before-doctype-name";
1269
+
t.state <- State.Before_doctype_name
1270
+
1271
+
and state_before_doctype_name () =
1272
+
match Stream.peek t.stream with
1273
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1274
+
Stream.advance t.stream
1275
+
| Some '\x00' ->
1276
+
Stream.advance t.stream;
1277
+
error t "unexpected-null-character";
1278
+
start_new_doctype t;
1279
+
t.current_doctype_name <- Some (Buffer.create 8);
1280
+
Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD";
1281
+
t.state <- State.Doctype_name
1282
+
| Some '>' ->
1283
+
Stream.advance t.stream;
1284
+
error t "missing-doctype-name";
1285
+
start_new_doctype t;
1286
+
t.current_doctype_force_quirks <- true;
1287
+
t.state <- State.Data;
1288
+
emit_current_doctype ()
1289
+
| None -> ()
1290
+
| Some c ->
1291
+
Stream.advance t.stream;
1292
+
start_new_doctype t;
1293
+
t.current_doctype_name <- Some (Buffer.create 8);
1294
+
Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c);
1295
+
t.state <- State.Doctype_name
1296
+
1297
+
and state_doctype_name () =
1298
+
match Stream.consume t.stream with
1299
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1300
+
t.state <- State.After_doctype_name
1301
+
| Some '>' ->
1302
+
t.state <- State.Data;
1303
+
emit_current_doctype ()
1304
+
| Some '\x00' ->
1305
+
error t "unexpected-null-character";
1306
+
Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"
1307
+
| Some c ->
1308
+
Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c)
1309
+
| None -> ()
1310
+
1311
+
and state_after_doctype_name () =
1312
+
match Stream.peek t.stream with
1313
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1314
+
Stream.advance t.stream
1315
+
| Some '>' ->
1316
+
Stream.advance t.stream;
1317
+
t.state <- State.Data;
1318
+
emit_current_doctype ()
1319
+
| None -> ()
1320
+
| Some _ ->
1321
+
if Stream.matches_ci t.stream "PUBLIC" then begin
1322
+
ignore (Stream.consume_exact_ci t.stream "PUBLIC");
1323
+
t.state <- State.After_doctype_public_keyword
1324
+
end else if Stream.matches_ci t.stream "SYSTEM" then begin
1325
+
ignore (Stream.consume_exact_ci t.stream "SYSTEM");
1326
+
t.state <- State.After_doctype_system_keyword
1327
+
end else begin
1328
+
error t "invalid-character-sequence-after-doctype-name";
1329
+
t.current_doctype_force_quirks <- true;
1330
+
t.state <- State.Bogus_doctype
1331
+
end
1332
+
1333
+
and state_after_doctype_public_keyword () =
1334
+
match Stream.peek t.stream with
1335
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1336
+
Stream.advance t.stream;
1337
+
t.state <- State.Before_doctype_public_identifier
1338
+
| Some '"' ->
1339
+
Stream.advance t.stream;
1340
+
error t "missing-whitespace-after-doctype-public-keyword";
1341
+
t.current_doctype_public <- Some (Buffer.create 32);
1342
+
t.state <- State.Doctype_public_identifier_double_quoted
1343
+
| Some '\'' ->
1344
+
Stream.advance t.stream;
1345
+
error t "missing-whitespace-after-doctype-public-keyword";
1346
+
t.current_doctype_public <- Some (Buffer.create 32);
1347
+
t.state <- State.Doctype_public_identifier_single_quoted
1348
+
| Some '>' ->
1349
+
Stream.advance t.stream;
1350
+
error t "missing-doctype-public-identifier";
1351
+
t.current_doctype_force_quirks <- true;
1352
+
t.state <- State.Data;
1353
+
emit_current_doctype ()
1354
+
| None -> ()
1355
+
| Some _ ->
1356
+
error t "missing-quote-before-doctype-public-identifier";
1357
+
t.current_doctype_force_quirks <- true;
1358
+
t.state <- State.Bogus_doctype
1359
+
1360
+
and state_before_doctype_public_identifier () =
1361
+
match Stream.peek t.stream with
1362
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1363
+
Stream.advance t.stream
1364
+
| Some '"' ->
1365
+
Stream.advance t.stream;
1366
+
t.current_doctype_public <- Some (Buffer.create 32);
1367
+
t.state <- State.Doctype_public_identifier_double_quoted
1368
+
| Some '\'' ->
1369
+
Stream.advance t.stream;
1370
+
t.current_doctype_public <- Some (Buffer.create 32);
1371
+
t.state <- State.Doctype_public_identifier_single_quoted
1372
+
| Some '>' ->
1373
+
Stream.advance t.stream;
1374
+
error t "missing-doctype-public-identifier";
1375
+
t.current_doctype_force_quirks <- true;
1376
+
t.state <- State.Data;
1377
+
emit_current_doctype ()
1378
+
| None -> ()
1379
+
| Some _ ->
1380
+
error t "missing-quote-before-doctype-public-identifier";
1381
+
t.current_doctype_force_quirks <- true;
1382
+
t.state <- State.Bogus_doctype
1383
+
1384
+
and state_doctype_public_identifier_double_quoted () =
1385
+
match Stream.consume t.stream with
1386
+
| Some '"' ->
1387
+
t.state <- State.After_doctype_public_identifier
1388
+
| Some '\x00' ->
1389
+
error t "unexpected-null-character";
1390
+
Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
1391
+
| Some '>' ->
1392
+
error t "abrupt-doctype-public-identifier";
1393
+
t.current_doctype_force_quirks <- true;
1394
+
t.state <- State.Data;
1395
+
emit_current_doctype ()
1396
+
| Some c ->
1397
+
Buffer.add_char (Option.get t.current_doctype_public) c
1398
+
| None -> ()
1399
+
1400
+
and state_doctype_public_identifier_single_quoted () =
1401
+
match Stream.consume t.stream with
1402
+
| Some '\'' ->
1403
+
t.state <- State.After_doctype_public_identifier
1404
+
| Some '\x00' ->
1405
+
error t "unexpected-null-character";
1406
+
Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD"
1407
+
| Some '>' ->
1408
+
error t "abrupt-doctype-public-identifier";
1409
+
t.current_doctype_force_quirks <- true;
1410
+
t.state <- State.Data;
1411
+
emit_current_doctype ()
1412
+
| Some c ->
1413
+
Buffer.add_char (Option.get t.current_doctype_public) c
1414
+
| None -> ()
1415
+
1416
+
and state_after_doctype_public_identifier () =
1417
+
match Stream.peek t.stream with
1418
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1419
+
Stream.advance t.stream;
1420
+
t.state <- State.Between_doctype_public_and_system_identifiers
1421
+
| Some '>' ->
1422
+
Stream.advance t.stream;
1423
+
t.state <- State.Data;
1424
+
emit_current_doctype ()
1425
+
| Some '"' ->
1426
+
Stream.advance t.stream;
1427
+
error t "missing-whitespace-between-doctype-public-and-system-identifiers";
1428
+
t.current_doctype_system <- Some (Buffer.create 32);
1429
+
t.state <- State.Doctype_system_identifier_double_quoted
1430
+
| Some '\'' ->
1431
+
Stream.advance t.stream;
1432
+
error t "missing-whitespace-between-doctype-public-and-system-identifiers";
1433
+
t.current_doctype_system <- Some (Buffer.create 32);
1434
+
t.state <- State.Doctype_system_identifier_single_quoted
1435
+
| None -> ()
1436
+
| Some _ ->
1437
+
error t "missing-quote-before-doctype-system-identifier";
1438
+
t.current_doctype_force_quirks <- true;
1439
+
t.state <- State.Bogus_doctype
1440
+
1441
+
and state_between_doctype_public_and_system_identifiers () =
1442
+
match Stream.peek t.stream with
1443
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1444
+
Stream.advance t.stream
1445
+
| Some '>' ->
1446
+
Stream.advance t.stream;
1447
+
t.state <- State.Data;
1448
+
emit_current_doctype ()
1449
+
| Some '"' ->
1450
+
Stream.advance t.stream;
1451
+
t.current_doctype_system <- Some (Buffer.create 32);
1452
+
t.state <- State.Doctype_system_identifier_double_quoted
1453
+
| Some '\'' ->
1454
+
Stream.advance t.stream;
1455
+
t.current_doctype_system <- Some (Buffer.create 32);
1456
+
t.state <- State.Doctype_system_identifier_single_quoted
1457
+
| None -> ()
1458
+
| Some _ ->
1459
+
error t "missing-quote-before-doctype-system-identifier";
1460
+
t.current_doctype_force_quirks <- true;
1461
+
t.state <- State.Bogus_doctype
1462
+
1463
+
and state_after_doctype_system_keyword () =
1464
+
match Stream.peek t.stream with
1465
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1466
+
Stream.advance t.stream;
1467
+
t.state <- State.Before_doctype_system_identifier
1468
+
| Some '"' ->
1469
+
Stream.advance t.stream;
1470
+
error t "missing-whitespace-after-doctype-system-keyword";
1471
+
t.current_doctype_system <- Some (Buffer.create 32);
1472
+
t.state <- State.Doctype_system_identifier_double_quoted
1473
+
| Some '\'' ->
1474
+
Stream.advance t.stream;
1475
+
error t "missing-whitespace-after-doctype-system-keyword";
1476
+
t.current_doctype_system <- Some (Buffer.create 32);
1477
+
t.state <- State.Doctype_system_identifier_single_quoted
1478
+
| Some '>' ->
1479
+
Stream.advance t.stream;
1480
+
error t "missing-doctype-system-identifier";
1481
+
t.current_doctype_force_quirks <- true;
1482
+
t.state <- State.Data;
1483
+
emit_current_doctype ()
1484
+
| None -> ()
1485
+
| Some _ ->
1486
+
error t "missing-quote-before-doctype-system-identifier";
1487
+
t.current_doctype_force_quirks <- true;
1488
+
t.state <- State.Bogus_doctype
1489
+
1490
+
and state_before_doctype_system_identifier () =
1491
+
match Stream.peek t.stream with
1492
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1493
+
Stream.advance t.stream
1494
+
| Some '"' ->
1495
+
Stream.advance t.stream;
1496
+
t.current_doctype_system <- Some (Buffer.create 32);
1497
+
t.state <- State.Doctype_system_identifier_double_quoted
1498
+
| Some '\'' ->
1499
+
Stream.advance t.stream;
1500
+
t.current_doctype_system <- Some (Buffer.create 32);
1501
+
t.state <- State.Doctype_system_identifier_single_quoted
1502
+
| Some '>' ->
1503
+
Stream.advance t.stream;
1504
+
error t "missing-doctype-system-identifier";
1505
+
t.current_doctype_force_quirks <- true;
1506
+
t.state <- State.Data;
1507
+
emit_current_doctype ()
1508
+
| None -> ()
1509
+
| Some _ ->
1510
+
error t "missing-quote-before-doctype-system-identifier";
1511
+
t.current_doctype_force_quirks <- true;
1512
+
t.state <- State.Bogus_doctype
1513
+
1514
+
and state_doctype_system_identifier_double_quoted () =
1515
+
match Stream.consume t.stream with
1516
+
| Some '"' ->
1517
+
t.state <- State.After_doctype_system_identifier
1518
+
| Some '\x00' ->
1519
+
error t "unexpected-null-character";
1520
+
Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
1521
+
| Some '>' ->
1522
+
error t "abrupt-doctype-system-identifier";
1523
+
t.current_doctype_force_quirks <- true;
1524
+
t.state <- State.Data;
1525
+
emit_current_doctype ()
1526
+
| Some c ->
1527
+
Buffer.add_char (Option.get t.current_doctype_system) c
1528
+
| None -> ()
1529
+
1530
+
and state_doctype_system_identifier_single_quoted () =
1531
+
match Stream.consume t.stream with
1532
+
| Some '\'' ->
1533
+
t.state <- State.After_doctype_system_identifier
1534
+
| Some '\x00' ->
1535
+
error t "unexpected-null-character";
1536
+
Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD"
1537
+
| Some '>' ->
1538
+
error t "abrupt-doctype-system-identifier";
1539
+
t.current_doctype_force_quirks <- true;
1540
+
t.state <- State.Data;
1541
+
emit_current_doctype ()
1542
+
| Some c ->
1543
+
Buffer.add_char (Option.get t.current_doctype_system) c
1544
+
| None -> ()
1545
+
1546
+
and state_after_doctype_system_identifier () =
1547
+
match Stream.peek t.stream with
1548
+
| Some ('\t' | '\n' | '\x0C' | ' ') ->
1549
+
Stream.advance t.stream
1550
+
| Some '>' ->
1551
+
Stream.advance t.stream;
1552
+
t.state <- State.Data;
1553
+
emit_current_doctype ()
1554
+
| None -> ()
1555
+
| Some _ ->
1556
+
error t "unexpected-character-after-doctype-system-identifier";
1557
+
t.state <- State.Bogus_doctype
1558
+
1559
+
and state_bogus_doctype () =
1560
+
match Stream.consume t.stream with
1561
+
| Some '>' ->
1562
+
t.state <- State.Data;
1563
+
emit_current_doctype ()
1564
+
| Some '\x00' ->
1565
+
error t "unexpected-null-character"
1566
+
| Some _ -> ()
1567
+
| None -> ()
1568
+
1569
+
and state_cdata_section () =
1570
+
match Stream.consume t.stream with
1571
+
| Some ']' ->
1572
+
t.state <- State.Cdata_section_bracket
1573
+
| Some '\x00' ->
1574
+
error t "unexpected-null-character";
1575
+
emit_str t "\xEF\xBF\xBD"
1576
+
| Some c ->
1577
+
emit_char t c
1578
+
| None -> ()
1579
+
1580
+
and state_cdata_section_bracket () =
1581
+
match Stream.peek t.stream with
1582
+
| Some ']' ->
1583
+
Stream.advance t.stream;
1584
+
t.state <- State.Cdata_section_end
1585
+
| _ ->
1586
+
emit_char t ']';
1587
+
t.state <- State.Cdata_section
1588
+
1589
+
and state_cdata_section_end () =
1590
+
match Stream.peek t.stream with
1591
+
| Some ']' ->
1592
+
Stream.advance t.stream;
1593
+
emit_char t ']'
1594
+
| Some '>' ->
1595
+
Stream.advance t.stream;
1596
+
t.state <- State.Data
1597
+
| _ ->
1598
+
emit_str t "]]";
1599
+
t.state <- State.Cdata_section
1600
+
1601
+
and state_character_reference () =
1602
+
Buffer.clear t.temp_buffer;
1603
+
Buffer.add_char t.temp_buffer '&';
1604
+
match Stream.peek t.stream with
1605
+
| Some c when is_ascii_alnum c ->
1606
+
t.state <- State.Named_character_reference
1607
+
| Some '#' ->
1608
+
Stream.advance t.stream;
1609
+
Buffer.add_char t.temp_buffer '#';
1610
+
t.state <- State.Numeric_character_reference
1611
+
| _ ->
1612
+
flush_code_points_consumed_as_char_ref t;
1613
+
t.state <- t.return_state
1614
+
1615
+
and state_named_character_reference () =
1616
+
(* Collect alphanumeric characters *)
1617
+
let rec collect () =
1618
+
match Stream.peek t.stream with
1619
+
| Some c when is_ascii_alnum c ->
1620
+
Stream.advance t.stream;
1621
+
Buffer.add_char t.temp_buffer c;
1622
+
collect ()
1623
+
| _ -> ()
1624
+
in
1625
+
collect ();
1626
+
1627
+
let has_semicolon =
1628
+
match Stream.peek t.stream with
1629
+
| Some ';' -> Stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true
1630
+
| _ -> false
1631
+
in
1632
+
1633
+
(* Try to match entity - buffer contains "&name" or "&name;" *)
1634
+
let buf_contents = Buffer.contents t.temp_buffer in
1635
+
let name_start = 1 in (* Skip '&' *)
1636
+
let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in
1637
+
let entity_name = String.sub buf_contents name_start (name_end - name_start) in
1638
+
1639
+
(* Try progressively shorter matches *)
1640
+
(* Only match if:
1641
+
1. Full match with semicolon, OR
1642
+
2. Legacy entity (can be used without semicolon) *)
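(* Illustrative walkthroughs of this rule (the attribute-value blocking is
   applied further below, where [blocked] is computed):
   - "&amp;"    full match with semicolon: always decoded to "&"
   - "&ampx"    "amp" is a legacy entity, so it decodes in data, but it is
                kept verbatim in an attribute value because 'x' follows
   - "&ampere;" no full match: the longest legacy prefix "amp" decodes and
                the remaining "ere;" is passed through unchanged *)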
1643
+
let rec try_match len =
1644
+
if len <= 0 then None
1645
+
else
1646
+
let prefix = String.sub entity_name 0 len in
1647
+
let is_full = len = String.length entity_name in
1648
+
let would_have_semi = has_semicolon && is_full in
1649
+
(* Only use this match if it has semicolon or is a legacy entity *)
1650
+
if would_have_semi || Html5rw_entities.is_legacy prefix then
1651
+
match Html5rw_entities.lookup prefix with
1652
+
| Some decoded -> Some (decoded, len)
1653
+
| None -> try_match (len - 1)
1654
+
else
1655
+
try_match (len - 1)
1656
+
in
1657
+
1658
+
match try_match (String.length entity_name) with
1659
+
| Some (decoded, matched_len) ->
1660
+
let full_match = matched_len = String.length entity_name in
1661
+
let ends_with_semi = has_semicolon && full_match in
1662
+
1663
+
(* Check attribute context restrictions *)
1664
+
let in_attribute = match t.return_state with
1665
+
| State.Attribute_value_double_quoted
1666
+
| State.Attribute_value_single_quoted
1667
+
| State.Attribute_value_unquoted -> true
1668
+
| _ -> false
1669
+
in
1670
+
1671
+
let next_char =
1672
+
if full_match && not has_semicolon then
1673
+
Stream.peek t.stream
1674
+
else if not full_match then
1675
+
Some entity_name.[matched_len]
1676
+
else None
1677
+
in
1678
+
1679
+
let blocked = in_attribute && not ends_with_semi &&
1680
+
match next_char with
1681
+
| Some '=' -> true
1682
+
| Some c when is_ascii_alnum c -> true
1683
+
| _ -> false
1684
+
in
1685
+
1686
+
if blocked then begin
1687
+
flush_code_points_consumed_as_char_ref t;
1688
+
t.state <- t.return_state
1689
+
end else begin
1690
+
if not ends_with_semi then
1691
+
error t "missing-semicolon-after-character-reference";
1692
+
Buffer.clear t.temp_buffer;
1693
+
Buffer.add_string t.temp_buffer decoded;
1694
+
flush_code_points_consumed_as_char_ref t;
1695
+
(* Emit unconsumed chars after partial match *)
1696
+
if not full_match then begin
1697
+
let unconsumed = String.sub entity_name matched_len (String.length entity_name - matched_len) in
1698
+
emit_str t unconsumed;
1699
+
(* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *)
1700
+
if has_semicolon then
1701
+
emit_char t ';'
1702
+
end;
1703
+
t.state <- t.return_state
1704
+
end
1705
+
| None ->
1706
+
(* No match - check if we should report ambiguous ampersand *)
1707
+
if String.length entity_name > 0 then begin
1708
+
t.state <- State.Ambiguous_ampersand;
1709
+
(* Flush the consumed "&" and name unchanged; the ambiguous-ampersand state deals with what follows *)
1710
+
flush_code_points_consumed_as_char_ref t
1711
+
end else begin
1712
+
flush_code_points_consumed_as_char_ref t;
1713
+
t.state <- t.return_state
1714
+
end
1715
+
1716
+
and state_ambiguous_ampersand () =
1717
+
match Stream.peek t.stream with
1718
+
| Some c when is_ascii_alnum c ->
1719
+
Stream.advance t.stream;
1720
+
(match t.return_state with
1721
+
| State.Attribute_value_double_quoted
1722
+
| State.Attribute_value_single_quoted
1723
+
| State.Attribute_value_unquoted ->
1724
+
Buffer.add_char t.current_attr_value c
1725
+
| _ ->
1726
+
emit_char t c)
1727
+
| Some ';' ->
1728
+
error t "unknown-named-character-reference";
1729
+
t.state <- t.return_state
1730
+
| _ ->
1731
+
t.state <- t.return_state
1732
+
1733
+
and state_numeric_character_reference () =
1734
+
t.char_ref_code <- 0;
1735
+
match Stream.peek t.stream with
1736
+
| Some (('x' | 'X') as c) ->
1737
+
Stream.advance t.stream;
1738
+
Buffer.add_char t.temp_buffer c;
1739
+
t.state <- State.Hexadecimal_character_reference_start
1740
+
| _ ->
1741
+
t.state <- State.Decimal_character_reference_start
1742
+
1743
+
and state_hexadecimal_character_reference_start () =
1744
+
match Stream.peek t.stream with
1745
+
| Some c when is_ascii_hex c ->
1746
+
t.state <- State.Hexadecimal_character_reference
1747
+
| _ ->
1748
+
error t "absence-of-digits-in-numeric-character-reference";
1749
+
flush_code_points_consumed_as_char_ref t;
1750
+
t.state <- t.return_state
1751
+
1752
+
and state_decimal_character_reference_start () =
1753
+
match Stream.peek t.stream with
1754
+
| Some c when is_ascii_digit c ->
1755
+
t.state <- State.Decimal_character_reference
1756
+
| _ ->
1757
+
error t "absence-of-digits-in-numeric-character-reference";
1758
+
flush_code_points_consumed_as_char_ref t;
1759
+
t.state <- t.return_state
1760
+
1761
+
and state_hexadecimal_character_reference () =
1762
+
match Stream.peek t.stream with
1763
+
| Some c when is_ascii_digit c ->
1764
+
Stream.advance t.stream;
1765
+
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0');
1766
+
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1767
+
| Some c when c >= 'A' && c <= 'F' ->
1768
+
Stream.advance t.stream;
1769
+
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10);
1770
+
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1771
+
| Some c when c >= 'a' && c <= 'f' ->
1772
+
Stream.advance t.stream;
1773
+
t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10);
1774
+
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1775
+
| Some ';' ->
1776
+
Stream.advance t.stream;
1777
+
t.state <- State.Numeric_character_reference_end
1778
+
| _ ->
1779
+
error t "missing-semicolon-after-character-reference";
1780
+
t.state <- State.Numeric_character_reference_end
1781
+
1782
+
and state_decimal_character_reference () =
1783
+
match Stream.peek t.stream with
1784
+
| Some c when is_ascii_digit c ->
1785
+
Stream.advance t.stream;
1786
+
t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0');
1787
+
if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1
1788
+
| Some ';' ->
1789
+
Stream.advance t.stream;
1790
+
t.state <- State.Numeric_character_reference_end
1791
+
| _ ->
1792
+
error t "missing-semicolon-after-character-reference";
1793
+
t.state <- State.Numeric_character_reference_end
1794
+
1795
+
and state_numeric_character_reference_end () =
1796
+
let code = t.char_ref_code in
1797
+
let replacement_char = "\xEF\xBF\xBD" in
1798
+
1799
+
let result =
1800
+
if code = 0 then begin
1801
+
error t "null-character-reference";
1802
+
replacement_char
1803
+
end else if code > 0x10FFFF then begin
1804
+
error t "character-reference-outside-unicode-range";
1805
+
replacement_char
1806
+
end else if code >= 0xD800 && code <= 0xDFFF then begin
1807
+
error t "surrogate-character-reference";
1808
+
replacement_char
1809
+
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1810
+
List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
1811
+
0x3FFFE; 0x3FFFF; 0x4FFFE; 0x4FFFF; 0x5FFFE; 0x5FFFF;
1812
+
0x6FFFE; 0x6FFFF; 0x7FFFE; 0x7FFFF; 0x8FFFE; 0x8FFFF;
1813
+
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1814
+
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1815
+
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1816
+
error t "noncharacter-character-reference";
1817
+
Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1818
+
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1819
+
(code >= 0x0D && code <= 0x1F) ||
1820
+
(code >= 0x7F && code <= 0x9F) then begin
1821
+
error t "control-character-reference";
1822
+
(* Apply Windows-1252 replacement table for 0x80-0x9F *)
1823
+
match Html5rw_entities.Numeric_ref.find_replacement code with
1824
+
| Some replacement -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 replacement
1825
+
| None -> Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1826
+
end else
1827
+
Html5rw_entities.Numeric_ref.codepoint_to_utf8 code
1828
+
in
1829
+
1830
+
Buffer.clear t.temp_buffer;
1831
+
Buffer.add_string t.temp_buffer result;
1832
+
flush_code_points_consumed_as_char_ref t;
1833
+
t.state <- t.return_state
1834
+
1835
+
in
1836
+
process_state ()
1837
+
1838
+
let get_errors t = List.rev t.errors
1839
+
1840
+
let set_state t state = t.state <- state
1841
+
1842
+
let set_last_start_tag t name = t.last_start_tag <- name
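
A standalone sketch of the classification that state_numeric_character_reference_end performs before encoding the decoded code point. This is plain OCaml written for illustration, not the library's API: the Windows-1252 remapping table is abbreviated to two entries and the parse-error reporting is omitted.

(* Sketch only: mirrors the checks above with an abbreviated C1-control
   remapping table; noncharacter and control-character errors are omitted. *)
let classify_numeric_ref code =
  (* Two sample Windows-1252 remappings; the full table covers 0x80-0x9F. *)
  let windows_1252 = [ (0x80, 0x20AC); (0x99, 0x2122) ] in
  if code = 0 || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF) then
    `Emit_replacement_char                      (* becomes U+FFFD *)
  else if code >= 0x80 && code <= 0x9F then
    (match List.assoc_opt code windows_1252 with
     | Some cp -> `Emit_codepoint cp            (* e.g. &#x80; -> U+20AC *)
     | None -> `Emit_codepoint code)
  else
    `Emit_codepoint code

let () =
  assert (classify_numeric_ref 0 = `Emit_replacement_char);
  assert (classify_numeric_ref 0xD800 = `Emit_replacement_char);
  assert (classify_numeric_ref 0x99 = `Emit_codepoint 0x2122);
  assert (classify_numeric_ref 0x41 = `Emit_codepoint 0x41)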
+9
test/adoption_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
print_endline "=== Test: <a><svg><tr><input></a> ===";
7
+
let result = Parser.parse (Bytes.Reader.of_string "<a><svg><tr><input></a>") in
8
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
9
+
print_newline ()
+12
test/debug2.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Test adoption agency *)
8
+
let input = "<p><b>One<p>Two" in
9
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
10
+
print_endline ("Input: " ^ input);
11
+
print_endline "Result:";
12
+
print_endline (Dom.to_test_format (Parser.root result))
+12
test/debug_fragment.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Simple table test *)
8
+
let input = "<table><th>" in
9
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
10
+
print_endline ("Input: " ^ input);
11
+
print_endline "Result:";
12
+
print_endline (Dom.to_test_format (Parser.root result))
+28
test/debug_svg.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let rec print_tree depth node =
7
+
let indent = String.make (depth * 2) ' ' in
8
+
Printf.printf "%s%s (ns=%s)\n" indent node.Dom.name
9
+
(match node.Dom.namespace with Some ns -> ns | None -> "html");
10
+
List.iter (print_tree (depth + 1)) node.Dom.children
11
+
12
+
let () =
13
+
(* SVG fragment test *)
14
+
let input = "<svg><tr><td><title><tr>" in
15
+
let context = Parser.make_fragment_context ~tag_name:"td" () in
16
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
17
+
print_endline ("Input: " ^ input);
18
+
print_endline "Tree structure:";
19
+
print_tree 0 (Parser.root result);
20
+
print_endline "";
21
+
print_endline "Result:";
22
+
print_endline (Dom.to_test_format (Parser.root result));
23
+
print_endline "";
24
+
print_endline "Expected:";
25
+
print_endline "| <svg svg>";
26
+
print_endline "| <svg tr>";
27
+
print_endline "| <svg td>";
28
+
print_endline "| <svg title>"
+20
test/debug_title.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "<!doctype html><title> <!-- </title>--> x" in
8
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
9
+
print_endline ("Input: " ^ input);
10
+
print_endline "Result:";
11
+
print_endline (Dom.to_test_format (Parser.root result));
12
+
print_endline "";
13
+
print_endline "Expected:";
14
+
print_endline "| <!DOCTYPE html>";
15
+
print_endline "| <html>";
16
+
print_endline "| <head>";
17
+
print_endline "| <title>";
18
+
print_endline "| \" <!-- \"";
19
+
print_endline "| <body>";
20
+
print_endline "| \"--> x\""
+90
test/dune
···
1
+
(executable
2
+
(name test_html5lib)
3
+
(libraries bytesrw html5rw.parser html5rw.dom))
4
+
5
+
(executable
6
+
(name debug_fragment)
7
+
(libraries bytesrw html5rw.parser html5rw.dom))
8
+
9
+
(executable
10
+
(name debug_svg)
11
+
(libraries bytesrw html5rw.parser html5rw.dom))
12
+
13
+
(executable
14
+
(name test_table)
15
+
(libraries bytesrw html5rw.parser html5rw.dom))
16
+
17
+
(executable
18
+
(name test_debug)
19
+
(libraries bytesrw html5rw.parser html5rw.dom))
20
+
21
+
(executable
22
+
(name test_frameset)
23
+
(libraries bytesrw html5rw.parser html5rw.dom))
24
+
25
+
(executable
26
+
(name test_whitespace)
27
+
(libraries bytesrw html5rw.parser html5rw.dom))
28
+
29
+
(executable
30
+
(name test_mi)
31
+
(libraries bytesrw html5rw.parser html5rw.dom))
32
+
33
+
(executable
34
+
(name test_table_svg)
35
+
(libraries bytesrw html5rw.parser html5rw.dom))
36
+
(executable
37
+
(name quick_test)
38
+
(libraries bytesrw html5rw.parser html5rw.dom))
39
+
(executable
40
+
(name simple_test)
41
+
(libraries bytesrw html5rw.parser html5rw.dom))
42
+
(executable
43
+
(name html_frag_test)
44
+
(libraries bytesrw html5rw.parser html5rw.dom))
45
+
(executable
46
+
(name svg_frag_test)
47
+
(libraries bytesrw html5rw.parser html5rw.dom))
48
+
(executable
49
+
(name nobr_test)
50
+
(libraries bytesrw html5rw.parser html5rw.dom))
51
+
(executable
52
+
(name nobr_debug)
53
+
(libraries bytesrw html5rw.parser html5rw.dom))
54
+
(executable
55
+
(name select_debug)
56
+
(libraries bytesrw html5rw.parser html5rw.dom))
57
+
(executable
58
+
(name template_debug)
59
+
(libraries bytesrw html5rw.parser html5rw.dom))
60
+
(executable
61
+
(name template_debug2)
62
+
(libraries bytesrw html5rw.parser html5rw.dom))
63
+
(executable
64
+
(name script_eof_test)
65
+
(libraries bytesrw html5rw.parser html5rw.dom))
66
+
(executable
67
+
(name entity_test)
68
+
(libraries bytesrw html5rw.parser html5rw.dom))
69
+
(executable
70
+
(name entity_dup_test)
71
+
(libraries bytesrw html5rw.parser html5rw.dom))
72
+
(executable
73
+
(name script_attr_test)
74
+
(libraries bytesrw html5rw.parser html5rw.dom))
75
+
76
+
(executable
77
+
(name frag_debug)
78
+
(libraries bytesrw html5rw.parser html5rw.dom))
79
+
80
+
(executable
81
+
(name frag_debug2)
82
+
(libraries bytesrw html5rw.parser html5rw.dom))
83
+
84
+
(executable
85
+
(name frag_debug3)
86
+
(libraries bytesrw html5rw.parser html5rw.dom))
87
+
(executable (name adoption_test) (libraries bytesrw html5rw.parser html5rw.dom))
88
+
(executable (name template_debug3) (libraries bytesrw html5rw.parser html5rw.dom))
89
+
(executable (name template_debug4) (libraries bytesrw html5rw.parser html5rw.dom))
90
+
(executable (name ns_sens_test) (libraries bytesrw html5rw.parser html5rw.dom))
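
The stanzas above differ only in the executable name; dune's plural executables stanza could express the same set more compactly. A sketch (not applied in this diff), listing a few of the names:

(executables
 (names test_html5lib debug_fragment debug_svg adoption_test)
 (libraries bytesrw html5rw.parser html5rw.dom))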
+20
test/entity_dup_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
print_endline "=== Test: &AMp; ===";
8
+
let input = "&AMp;" in
9
+
print_endline ("Input: " ^ input);
10
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
11
+
print_endline "Result:";
12
+
print_endline (Dom.to_test_format (Parser.root result));
13
+
print_endline "";
14
+
15
+
print_endline "=== Test: & ===";
16
+
let input = "&" in
17
+
print_endline ("Input: " ^ input);
18
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
19
+
print_endline "Result:";
20
+
print_endline (Dom.to_test_format (Parser.root result))
+28
test/entity_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
print_endline "=== Test 1: Single & ===";
8
+
let input = "&" in
9
+
print_endline ("Input: " ^ input);
10
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
11
+
print_endline "Result:";
12
+
print_endline (Dom.to_test_format (Parser.root result));
13
+
print_endline "";
14
+
15
+
print_endline "=== Test 2: - (decimal ref) ===";
16
+
let input = "-" in
17
+
print_endline ("Input: " ^ input);
18
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
19
+
print_endline "Result:";
20
+
print_endline (Dom.to_test_format (Parser.root result));
21
+
print_endline "";
22
+
23
+
print_endline "=== Test 3: &#X (hex ref incomplete) ===";
24
+
let input = "&#X" in
25
+
print_endline ("Input: " ^ input);
26
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
27
+
print_endline "Result:";
28
+
print_endline (Dom.to_test_format (Parser.root result))
+39
test/frag_debug.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
(* Test 77 - template with adoption agency *)
7
+
print_endline "=== Template test 77 ===";
8
+
print_endline "Input: <body><template><i><menu>Foo</i>";
9
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><template><i><menu>Foo</i>") in
10
+
print_endline "Actual:";
11
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
12
+
print_newline ();
13
+
14
+
(* Simpler test - just template with content *)
15
+
print_endline "=== Simpler template test ===";
16
+
print_endline "Input: <template><i>X</i></template>";
17
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><i>X</i></template>") in
18
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
19
+
print_newline ();
20
+
21
+
(* Test without template *)
22
+
print_endline "=== Without template ===";
23
+
print_endline "Input: <i><menu>Foo</i>";
24
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<i><menu>Foo</i>") in
25
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
26
+
print_newline ();
27
+
28
+
(* Test 31 - foreignObject/math *)
29
+
print_endline "=== Test 31 - foreignObject ===";
30
+
print_endline "Input: <div><svg><path><foreignObject><math></div>a";
31
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><path><foreignObject><math></div>a") in
32
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
33
+
print_newline ();
34
+
35
+
(* namespace-sensitivity test *)
36
+
print_endline "=== Namespace sensitivity ===";
37
+
print_endline "Input: <body><table><tr><td><svg><td><foreignObject><span></td>Foo";
38
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><table><tr><td><svg><td><foreignObject><span></td>Foo") in
39
+
print_endline (Html5rw_dom.to_test_format (Parser.root result))
+40
test/frag_debug2.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
(* Test: svg end tag handling *)
7
+
print_endline "=== Test: <div><svg></div> ===";
8
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg></div>") in
9
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
10
+
print_newline ();
11
+
12
+
(* Test: foreignObject text integration *)
13
+
print_endline "=== Test: <div><svg><foreignObject></div> ===";
14
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject></div>") in
15
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
16
+
print_newline ();
17
+
18
+
(* Test: math inside foreignObject with end tag *)
19
+
print_endline "=== Test: <div><svg><foreignObject><math></div>a ===";
20
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject><math></div>a") in
21
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
22
+
print_newline ();
23
+
24
+
(* Without path element *)
25
+
print_endline "=== Test: <div><svg><foreignObject><b></div>text ===";
26
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><foreignObject><b></div>text") in
27
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
28
+
print_newline ();
29
+
30
+
(* Template adoption agency test *)
31
+
print_endline "=== Test: <template><b><menu>text</b> ===";
32
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><b><menu>text</b>") in
33
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
34
+
print_newline ();
35
+
36
+
(* Without template for comparison *)
37
+
print_endline "=== Test: <b><menu>text</b> (no template) ===";
38
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<b><menu>text</b>") in
39
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
40
+
print_newline ()
+34
test/frag_debug3.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
(* Simple svg with child *)
7
+
print_endline "=== Test: <svg><path></path></svg>text ===";
8
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><path></path></svg>text") in
9
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
10
+
print_newline ();
11
+
12
+
(* The failing test - foreignObject inside svg *)
13
+
print_endline "=== Test: <div><svg><path><foreignObject><math></div>a ===";
14
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<div><svg><path><foreignObject><math></div>a") in
15
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
16
+
print_newline ();
17
+
18
+
(* Expected output for test 31:
19
+
<html>
20
+
<head>
21
+
<body>
22
+
<div>
23
+
<svg svg>
24
+
<svg path>
25
+
<svg foreignObject>
26
+
<math math>
27
+
"a"
28
+
*)
29
+
30
+
(* Simple svg structure *)
31
+
print_endline "=== Test: <svg><rect/><circle/></svg> ===";
32
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><rect/><circle/></svg>") in
33
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
34
+
print_newline ()
+10
test/html_frag_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "Hello" in
8
+
let context = Parser.make_fragment_context ~tag_name:"div" () in
9
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
10
+
print_endline (Dom.to_test_format (Parser.root result))
+22
test/nobr_debug.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let rec print_tree indent node =
7
+
Printf.printf "%s%s (ns=%s, %d children)\n"
8
+
indent
9
+
node.Dom.name
10
+
(match node.Dom.namespace with Some s -> s | None -> "html")
11
+
(List.length node.Dom.children);
12
+
List.iter (print_tree (indent ^ " ")) node.Dom.children
13
+
14
+
let () =
15
+
let input = "<nobr>X" in
16
+
print_endline "Starting...";
17
+
let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
18
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
19
+
print_endline "\nFinal tree structure:";
20
+
print_tree "" (Parser.root result);
21
+
print_endline "\nTest format:";
22
+
print_endline (Dom.to_test_format (Parser.root result))
+23
test/nobr_debug2.ml
···
1
+
module Parser = Html5rw_parser
2
+
module Dom = Html5rw_dom
3
+
4
+
let rec print_tree indent node =
5
+
Printf.printf "%s%s (ns=%s, %d children)\n"
6
+
indent
7
+
node.Dom.name
8
+
(match node.Dom.namespace with Some s -> s | None -> "html")
9
+
(List.length node.Dom.children);
10
+
List.iter (print_tree (indent ^ " ")) node.Dom.children
11
+
12
+
let () =
13
+
let input = "<nobr>X" in
14
+
print_endline "Starting...";
15
+
let context = { Parser.Tree_builder.tag_name = "path"; namespace = Some "svg" } in
16
+
17
+
(* Create parser state directly for inspection *)
18
+
let t = Parser.Tree_builder.create ~collect_errors:true ~fragment_context:context input in
19
+
print_endline "\nInitial tree structure:";
20
+
print_tree "" t.Parser.Tree_builder.document;
21
+
print_endline "\nInitial stack size:";
22
+
Printf.printf "%d elements\n" (List.length t.Parser.Tree_builder.open_elements);
23
+
List.iter (fun n -> Printf.printf " - %s\n" n.Dom.name) t.Parser.Tree_builder.open_elements
+13
test/nobr_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "<nobr>X" in
8
+
print_endline "Starting...";
9
+
let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
10
+
print_endline "Created context";
11
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
12
+
print_endline "Parsed";
13
+
print_endline (Dom.to_test_format (Parser.root result))
+35
test/ns_sens_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
print_endline "=== Test: <body><table><tr><td><svg><td><foreignObject><span></td>Foo ===";
7
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><table><tr><td><svg><td><foreignObject><span></td>Foo") in
8
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
9
+
print_newline ();
10
+
11
+
(* Expected:
12
+
<html>
13
+
<head>
14
+
<body>
15
+
"Foo"
16
+
<table>
17
+
<tbody>
18
+
<tr>
19
+
<td>
20
+
<svg svg>
21
+
<svg td>
22
+
<svg foreignObject>
23
+
<span>
24
+
*)
25
+
26
+
(* Let's also test a simpler case *)
27
+
print_endline "=== Test: <table><td><svg><foreignObject></td>text ===";
28
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<table><td><svg><foreignObject></td>text") in
29
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
30
+
print_newline ();
31
+
32
+
print_endline "=== Test: <table><td></td>text ===";
33
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<table><td></td>text") in
34
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
35
+
print_newline ()
+10
test/quick_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "<nobr>X" in
8
+
let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
9
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
10
+
print_endline (Dom.to_test_format (Parser.root result))
+22
test/script_attr_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Test incomplete script tag with attribute *)
8
+
let input = "<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt" in
9
+
print_endline "=== Test: script tag with attribute at incomplete end ===";
10
+
print_endline ("Input: " ^ input);
11
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
12
+
print_endline "Result:";
13
+
print_endline (Dom.to_test_format (Parser.root result));
14
+
print_endline "";
15
+
16
+
(* Test simpler case *)
17
+
let input = "<script type=text>X</script>" in
18
+
print_endline "=== Test: Complete script tag with attribute ===";
19
+
print_endline ("Input: " ^ input);
20
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
21
+
print_endline "Result:";
22
+
print_endline (Dom.to_test_format (Parser.root result))
+12
test/script_eof_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Test incomplete script tag *)
8
+
let input = "<!doctype html><script><" in
9
+
print_endline ("Input: " ^ input);
10
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
11
+
print_endline "Result:";
12
+
print_endline (Dom.to_test_format (Parser.root result))
+13
test/select_debug.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "<select><b><option><select><option></b></select>X" in
8
+
print_endline "Input:";
9
+
print_endline input;
10
+
print_endline "";
11
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
12
+
print_endline "Result:";
13
+
print_endline (Dom.to_test_format (Parser.root result))
+9
test/simple_test.ml
+13
test/svg_frag_test.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "Hello" in
8
+
print_endline "Starting...";
9
+
let context = Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") () in
10
+
print_endline "Created context";
11
+
let result = Parser.parse ~collect_errors:true ~fragment_context:context (Bytes.Reader.of_string input) in
12
+
print_endline "Parsed";
13
+
print_endline (Dom.to_test_format (Parser.root result))
+21
test/template_debug.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Template test 45: div inside tr inside template *)
8
+
let input1 = "<body><template><tr><div></div></tr></template>" in
9
+
print_endline "=== Test 1 ===";
10
+
print_endline ("Input: " ^ input1);
11
+
let result1 = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input1) in
12
+
print_endline "Result:";
13
+
print_endline (Dom.to_test_format (Parser.root result1));
14
+
15
+
(* Template test 91: select inside tbody inside nested template *)
16
+
let input2 = "<template><template><tbody><select>" in
17
+
print_endline "\n=== Test 2 ===";
18
+
print_endline ("Input: " ^ input2);
19
+
let result2 = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input2) in
20
+
print_endline "Result:";
21
+
print_endline (Dom.to_test_format (Parser.root result2))
+13
test/template_debug2.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
(* Test i then menu in template *)
8
+
let input = "<template><i><menu>Foo" in
9
+
print_endline "=== Test: i then menu in template ===";
10
+
print_endline ("Input: " ^ input);
11
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
12
+
print_endline "Result:";
13
+
print_endline (Dom.to_test_format (Parser.root result))
+26
test/template_debug3.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
print_endline "=== Test: <body><template><i><menu>Foo</i> ===";
7
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<body><template><i><menu>Foo</i>") in
8
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
9
+
print_newline ();
10
+
11
+
(* Expected:
12
+
<html>
13
+
<head>
14
+
<body>
15
+
<template>
16
+
content
17
+
<i>
18
+
<menu>
19
+
<i>
20
+
"Foo"
21
+
*)
22
+
23
+
print_endline "=== Test: <i><menu>Foo</i> (without template) ===";
24
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<i><menu>Foo</i>") in
25
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
26
+
print_newline ()
+29
test/template_debug4.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
5
+
let () =
6
+
print_endline "=== Test: <template><svg><foo><template><foreignObject><div></template><div> ===";
7
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<template><svg><foo><template><foreignObject><div></template><div>") in
8
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
9
+
print_newline ();
10
+
11
+
(* Expected:
12
+
<html>
13
+
<head>
14
+
<template>
15
+
content
16
+
<svg svg>
17
+
<svg foo>
18
+
<svg template>
19
+
<svg foreignObject>
20
+
<div>
21
+
<body>
22
+
<div>
23
+
*)
24
+
25
+
(* Let's also test what happens with just the SVG template *)
26
+
print_endline "=== Test: <svg><template><foreignObject><div></template>text ===";
27
+
let result = Html5rw_parser.parse (Bytes.Reader.of_string "<svg><template><foreignObject><div></template>text") in
28
+
print_endline (Html5rw_dom.to_test_format (Parser.root result));
29
+
print_newline ()
+14
test/test_debug.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let test input =
7
+
print_endline ("Input: " ^ input);
8
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
9
+
print_endline (Dom.to_test_format (Parser.root result));
10
+
print_endline ""
11
+
12
+
let () =
13
+
(* Frameset tests - exact test input *)
14
+
test "<frameset></frameset>\nfoo"
+15
test/test_frameset.ml
···
1
+
open Bytesrw
2
+
3
+
module Parser = Html5rw_parser
4
+
module Dom = Html5rw_dom
5
+
6
+
let () =
7
+
let input = "<param><frameset></frameset>" in
8
+
print_endline ("Input: " ^ input);
9
+
try
10
+
let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
11
+
print_endline "Tree:";
12
+
print_endline (Dom.to_test_format (Parser.root result))
13
+
with e ->
14
+
print_endline ("Exception: " ^ Printexc.to_string e);
15
+
Printexc.print_backtrace stdout
+201
test/test_html5lib.ml
···
1
+
(* Test runner for html5lib-tests tree construction tests *)
2
+
3
+
open Bytesrw
4
+
5
+
module Parser = Html5rw_parser
6
+
module Dom = Html5rw_dom
7
+
8
+
type test_case = {
9
+
input : string;
10
+
expected_tree : string;
11
+
expected_errors : string list;
12
+
script_on : bool;
13
+
fragment_context : string option;
14
+
}
15
+
16
+
let _is_blank s = String.trim s = ""
17
+
18
+
(* Parse a single test case from lines *)
19
+
let parse_test_case lines =
20
+
let rec parse acc = function
21
+
| [] -> acc
22
+
| line :: rest when String.length line > 0 && line.[0] = '#' ->
23
+
let section = String.trim line in
24
+
let content, remaining = collect_section rest in
25
+
parse ((section, content) :: acc) remaining
26
+
| _ :: rest -> parse acc rest
27
+
and collect_section lines =
28
+
let rec loop acc = function
29
+
| [] -> (List.rev acc, [])
30
+
| line :: rest when String.length line > 0 && line.[0] = '#' ->
31
+
(List.rev acc, line :: rest)
32
+
| line :: rest -> loop (line :: acc) rest
33
+
in
34
+
loop [] lines
35
+
in
36
+
let sections = parse [] lines in
37
+
38
+
let get_section name =
39
+
match List.assoc_opt name sections with
40
+
    | Some lines -> String.concat "\n" lines
    | None -> ""
  in

  let data = get_section "#data" in
  let document = get_section "#document" in
  let errors_text = get_section "#errors" in
  let errors =
    String.split_on_char '\n' errors_text
    |> List.filter (fun s -> String.trim s <> "")
  in
  let script_on = List.mem_assoc "#script-on" sections in
  let fragment =
    if List.mem_assoc "#document-fragment" sections then
      Some (get_section "#document-fragment" |> String.trim)
    else None
  in

  {
    input = data;
    expected_tree = document;
    expected_errors = errors;
    script_on;
    fragment_context = fragment;
  }

(* Parse a .dat file into test cases *)
let parse_dat_file content =
  let lines = String.split_on_char '\n' content in
  (* Split on empty lines followed by #data *)
  let rec split_tests current acc = function
    | [] ->
      if current = [] then List.rev acc
      else List.rev (List.rev current :: acc)
    | "" :: "#data" :: rest ->
      (* End of current test, start new one *)
      let new_acc = if current = [] then acc else (List.rev current :: acc) in
      split_tests ["#data"] new_acc rest
    | line :: rest ->
      split_tests (line :: current) acc rest
  in
  let test_groups = split_tests [] [] lines in
  List.filter_map (fun lines ->
    if List.exists (fun l -> l = "#data") lines then
      Some (parse_test_case lines)
    else None
  ) test_groups

(* Strip "| " prefix from each line *)
let strip_tree_prefix s =
  let lines = String.split_on_char '\n' s in
  let stripped = List.filter_map (fun line ->
    if String.length line >= 2 && String.sub line 0 2 = "| " then
      Some (String.sub line 2 (String.length line - 2))
    else if String.trim line = "" then None
    else Some line
  ) lines in
  String.concat "\n" stripped

(* Normalize tree output for comparison *)
let normalize_tree s =
  let lines = String.split_on_char '\n' s in
  let non_empty = List.filter (fun l -> String.trim l <> "") lines in
  String.concat "\n" non_empty

let run_test test =
  try
    let result =
      match test.fragment_context with
      | Some ctx_str ->
        (* Parse "namespace element" or just "element" *)
        let (namespace, tag_name) =
          match String.split_on_char ' ' ctx_str with
          | [ns; tag] when ns = "svg" -> (Some "svg", tag)
          | [ns; tag] when ns = "math" -> (Some "mathml", tag)
          | [tag] -> (None, tag)
          | _ -> (None, ctx_str)
        in
        let context = Parser.make_fragment_context ~tag_name ~namespace () in
        let reader = Bytes.Reader.of_string test.input in
        Parser.parse ~collect_errors:true ~fragment_context:context reader
      | None ->
        let reader = Bytes.Reader.of_string test.input in
        Parser.parse ~collect_errors:true reader
    in
    let actual_tree = Dom.to_test_format (Parser.root result) in
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    let actual = normalize_tree (strip_tree_prefix actual_tree) in
    (expected = actual, expected, actual, List.length (Parser.errors result), List.length test.expected_errors)
  with e ->
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)

let run_file path =
  let ic = open_in path in
  let content = really_input_string ic (in_channel_length ic) in
  close_in ic;

  let tests = parse_dat_file content in
  let filename = Filename.basename path in

  let passed = ref 0 in
  let failed = ref 0 in
  let errors = ref [] in

  List.iteri (fun i test ->
    (* Skip script-on tests since we don't support scripting *)
    if test.script_on then
      () (* Skip this test *)
    else begin
      let (success, expected, actual, _actual_error_count, _expected_error_count) = run_test test in
      if success then
        incr passed
      else begin
        incr failed;
        errors := (i + 1, test.input, expected, actual) :: !errors
      end
    end
  ) tests;

  (!passed, !failed, List.rev !errors, filename)

let () =
  let test_dir = Sys.argv.(1) in
  let files = Sys.readdir test_dir |> Array.to_list in
  let dat_files = List.filter (fun f ->
    Filename.check_suffix f ".dat" &&
    not (String.contains f '/') (* Skip subdirectories *)
  ) files in

  let total_passed = ref 0 in
  let total_failed = ref 0 in
  let all_errors = ref [] in

  List.iter (fun file ->
    let path = Filename.concat test_dir file in
    if Sys.is_directory path then () else begin
      let (passed, failed, errors, filename) = run_file path in
      total_passed := !total_passed + passed;
      total_failed := !total_failed + failed;
      if errors <> [] then
        all_errors := (filename, errors) :: !all_errors;
      Printf.printf "%s: %d passed, %d failed\n" filename passed failed
    end
  ) (List.sort String.compare dat_files);

  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;

  if !all_errors <> [] then begin
    Printf.printf "\n=== First failures ===\n";
    List.iter (fun (filename, errors) ->
      List.iter (fun (test_num, input, expected, actual) ->
        Printf.printf "\n--- %s test %d ---\n" filename test_num;
        Printf.printf "Input: %s\n" (String.escaped input);
        Printf.printf "Expected:\n%s\n" expected;
        Printf.printf "Actual:\n%s\n" actual
      ) (List.filteri (fun i _ -> i < 3) errors)
    ) (List.filteri (fun i _ -> i < 10) !all_errors)
  end;

  exit (if !total_failed > 0 then 1 else 0)
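The harness above consumes html5lib-tests-style .dat files: each test begins at a #data section, expected parse errors are listed under #errors, and the expected tree under #document uses "| "-prefixed lines. As an illustrative sketch only (not part of the patch), the snippet below feeds an inline sample in that format to parse_dat_file and prints the fields that parse_test_case builds; the sample test content is invented for the example.

(* Illustrative sketch, not part of the harness: parse an inline
   html5lib-style sample and inspect the resulting test record.
   The field names (input, expected_errors, expected_tree) are the
   ones constructed in parse_test_case above; the sample data itself
   is made up for the example. *)
let () =
  let sample = String.concat "\n" [
    "#data";
    "<p>One<p>Two";
    "#errors";
    "(1,3): expected-doctype-but-got-start-tag";
    "#document";
    "| <html>";
    "|   <head>";
    "|   <body>";
    "|     <p>";
    "|       \"One\"";
    "|     <p>";
    "|       \"Two\"";
  ] in
  match parse_dat_file sample with
  | [ t ] ->
    Printf.printf "input: %s\n" t.input;
    Printf.printf "expected errors: %d\n" (List.length t.expected_errors);
    Printf.printf "expected tree:\n%s\n" (strip_tree_prefix t.expected_tree)
  | tests ->
    Printf.printf "parsed %d tests\n" (List.length tests)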
+11
test/test_mi.ml
···
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let input = "<!doctype html><p><math><mi><p><h1>" in
  print_endline ("Input: " ^ input);
  let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root result))
+9
test/test_table.ml
···
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom
let () =
  let input = "<b><em><foo><foo><aside></b>" in
  print_endline ("Input: " ^ input);
  let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline (Dom.to_test_format (Parser.root result))
+11
test/test_table_svg.ml
···
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let input = "<table><tr><td><svg><desc><td></desc><circle>" in
  print_endline ("Input: " ^ input);
  let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root result))
+11
test/test_whitespace.ml
···
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let input = "<style> <!-- </style> --> </style>x" in
  print_endline ("Input: " ^ input);
  let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root result))