OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 spec constants *)
2
3(* Use Astring for string operations *)
4let lowercase = Astring.String.Ascii.lowercase
5
6(* Void elements - no end tag allowed *)
7let void_elements = [
8 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
9 "link"; "meta"; "source"; "track"; "wbr"
10]
11
12(* Raw text elements - content is raw text *)
13let raw_text_elements = ["script"; "style"]
14
15(* Escapable raw text elements *)
16let escapable_raw_text_elements = ["textarea"; "title"]
17
18(* Formatting elements for adoption agency *)
19let formatting_elements = [
20 "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small";
21 "strike"; "strong"; "tt"; "u"
22]
23
24(* Special elements *)
25let special_elements = [
26 "address"; "applet"; "area"; "article"; "aside"; "base"; "basefont";
27 "bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center";
28 "col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed";
29 "fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset";
30 "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr";
31 "html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing";
32 "main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes";
33 "noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script";
34 "search"; "section"; "select"; "source"; "style"; "summary"; "table";
35 "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title";
36 "tr"; "track"; "ul"; "wbr"; "xmp"
37]
38
39(* Heading elements *)
40let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"]
41
42(* Implied end tag elements *)
43let implied_end_tags = [
44 "dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"
45]
46
47(* Thoroughly implied end tags *)
48let thoroughly_implied_end_tags = [
49 "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p";
50 "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"
51]
52
53(* Scope elements for various scope checks *)
54let default_scope = [
55 "applet"; "caption"; "html"; "table"; "td"; "th"; "marquee"; "object"; "template"
56]
57
58let list_item_scope = default_scope @ ["ol"; "ul"]
59
60let button_scope = default_scope @ ["button"]
61
62let table_scope = ["html"; "table"; "template"]
63
64let select_scope_exclude = ["optgroup"; "option"]
65
66(* MathML text integration points *)
67let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
68
69(* MathML attribute adjustments *)
70let mathml_attr_adjustments = [
71 ("definitionurl", "definitionURL")
72]
73
74let adjust_mathml_attrs attrs =
75 List.map (fun (k, v) ->
76 match List.assoc_opt (lowercase k) mathml_attr_adjustments with
77 | Some adjusted_k -> (adjusted_k, v)
78 | None -> (k, v)
79 ) attrs
80
81(* SVG HTML integration points *)
82let svg_html_integration = ["foreignObject"; "desc"; "title"]
83
84(* SVG tag name adjustments *)
85let svg_tag_adjustments = [
86 ("altglyph", "altGlyph");
87 ("altglyphdef", "altGlyphDef");
88 ("altglyphitem", "altGlyphItem");
89 ("animatecolor", "animateColor");
90 ("animatemotion", "animateMotion");
91 ("animatetransform", "animateTransform");
92 ("clippath", "clipPath");
93 ("feblend", "feBlend");
94 ("fecolormatrix", "feColorMatrix");
95 ("fecomponenttransfer", "feComponentTransfer");
96 ("fecomposite", "feComposite");
97 ("feconvolvematrix", "feConvolveMatrix");
98 ("fediffuselighting", "feDiffuseLighting");
99 ("fedisplacementmap", "feDisplacementMap");
100 ("fedistantlight", "feDistantLight");
101 ("fedropshadow", "feDropShadow");
102 ("feflood", "feFlood");
103 ("fefunca", "feFuncA");
104 ("fefuncb", "feFuncB");
105 ("fefuncg", "feFuncG");
106 ("fefuncr", "feFuncR");
107 ("fegaussianblur", "feGaussianBlur");
108 ("feimage", "feImage");
109 ("femerge", "feMerge");
110 ("femergenode", "feMergeNode");
111 ("femorphology", "feMorphology");
112 ("feoffset", "feOffset");
113 ("fepointlight", "fePointLight");
114 ("fespecularlighting", "feSpecularLighting");
115 ("fespotlight", "feSpotLight");
116 ("fetile", "feTile");
117 ("feturbulence", "feTurbulence");
118 ("foreignobject", "foreignObject");
119 ("glyphref", "glyphRef");
120 ("lineargradient", "linearGradient");
121 ("radialgradient", "radialGradient");
122 ("textpath", "textPath");
123]
124
125(* SVG attribute adjustments *)
126let svg_attr_adjustments = [
127 ("attributename", "attributeName");
128 ("attributetype", "attributeType");
129 ("basefrequency", "baseFrequency");
130 ("baseprofile", "baseProfile");
131 ("calcmode", "calcMode");
132 ("clippathunits", "clipPathUnits");
133 ("diffuseconstant", "diffuseConstant");
134 ("edgemode", "edgeMode");
135 ("filterunits", "filterUnits");
136 ("glyphref", "glyphRef");
137 ("gradienttransform", "gradientTransform");
138 ("gradientunits", "gradientUnits");
139 ("kernelmatrix", "kernelMatrix");
140 ("kernelunitlength", "kernelUnitLength");
141 ("keypoints", "keyPoints");
142 ("keysplines", "keySplines");
143 ("keytimes", "keyTimes");
144 ("lengthadjust", "lengthAdjust");
145 ("limitingconeangle", "limitingConeAngle");
146 ("markerheight", "markerHeight");
147 ("markerunits", "markerUnits");
148 ("markerwidth", "markerWidth");
149 ("maskcontentunits", "maskContentUnits");
150 ("maskunits", "maskUnits");
151 ("numoctaves", "numOctaves");
152 ("pathlength", "pathLength");
153 ("patterncontentunits", "patternContentUnits");
154 ("patterntransform", "patternTransform");
155 ("patternunits", "patternUnits");
156 ("pointsatx", "pointsAtX");
157 ("pointsaty", "pointsAtY");
158 ("pointsatz", "pointsAtZ");
159 ("preservealpha", "preserveAlpha");
160 ("preserveaspectratio", "preserveAspectRatio");
161 ("primitiveunits", "primitiveUnits");
162 ("refx", "refX");
163 ("refy", "refY");
164 ("repeatcount", "repeatCount");
165 ("repeatdur", "repeatDur");
166 ("requiredextensions", "requiredExtensions");
167 ("requiredfeatures", "requiredFeatures");
168 ("specularconstant", "specularConstant");
169 ("specularexponent", "specularExponent");
170 ("spreadmethod", "spreadMethod");
171 ("startoffset", "startOffset");
172 ("stddeviation", "stdDeviation");
173 ("stitchtiles", "stitchTiles");
174 ("surfacescale", "surfaceScale");
175 ("systemlanguage", "systemLanguage");
176 ("tablevalues", "tableValues");
177 ("targetx", "targetX");
178 ("targety", "targetY");
179 ("textlength", "textLength");
180 ("viewbox", "viewBox");
181 ("viewtarget", "viewTarget");
182 ("xchannelselector", "xChannelSelector");
183 ("ychannelselector", "yChannelSelector");
184 ("zoomandpan", "zoomAndPan");
185]
186
187(* Foreign attribute adjustments *)
188let foreign_attr_adjustments = [
189 ("xlink:actuate", ("xlink", "actuate", "http://www.w3.org/1999/xlink"));
190 ("xlink:arcrole", ("xlink", "arcrole", "http://www.w3.org/1999/xlink"));
191 ("xlink:href", ("xlink", "href", "http://www.w3.org/1999/xlink"));
192 ("xlink:role", ("xlink", "role", "http://www.w3.org/1999/xlink"));
193 ("xlink:show", ("xlink", "show", "http://www.w3.org/1999/xlink"));
194 ("xlink:title", ("xlink", "title", "http://www.w3.org/1999/xlink"));
195 ("xlink:type", ("xlink", "type", "http://www.w3.org/1999/xlink"));
196 ("xml:lang", ("xml", "lang", "http://www.w3.org/XML/1998/namespace"));
197 ("xml:space", ("xml", "space", "http://www.w3.org/XML/1998/namespace"));
198 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
199 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
200]
201
202(* Quirks mode detection *)
203let quirky_public_matches = [
204 "-//w3o//dtd w3 html strict 3.0//en//";
205 "-/w3c/dtd html 4.0 transitional/en";
206 "html"
207]
208
209let quirky_public_prefixes = [
210 "+//silmaril//dtd html pro v0r11 19970101//";
211 "-//as//dtd html 3.0 aswedit + extensions//";
212 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//";
213 "-//ietf//dtd html 2.0 level 1//";
214 "-//ietf//dtd html 2.0 level 2//";
215 "-//ietf//dtd html 2.0 strict level 1//";
216 "-//ietf//dtd html 2.0 strict level 2//";
217 "-//ietf//dtd html 2.0 strict//";
218 "-//ietf//dtd html 2.0//";
219 "-//ietf//dtd html 2.1e//";
220 "-//ietf//dtd html 3.0//";
221 "-//ietf//dtd html 3.2 final//";
222 "-//ietf//dtd html 3.2//";
223 "-//ietf//dtd html 3//";
224 "-//ietf//dtd html level 0//";
225 "-//ietf//dtd html level 1//";
226 "-//ietf//dtd html level 2//";
227 "-//ietf//dtd html level 3//";
228 "-//ietf//dtd html strict level 0//";
229 "-//ietf//dtd html strict level 1//";
230 "-//ietf//dtd html strict level 2//";
231 "-//ietf//dtd html strict level 3//";
232 "-//ietf//dtd html strict//";
233 "-//ietf//dtd html//";
234 "-//metrius//dtd metrius presentational//";
235 "-//microsoft//dtd internet explorer 2.0 html strict//";
236 "-//microsoft//dtd internet explorer 2.0 html//";
237 "-//microsoft//dtd internet explorer 2.0 tables//";
238 "-//microsoft//dtd internet explorer 3.0 html strict//";
239 "-//microsoft//dtd internet explorer 3.0 html//";
240 "-//microsoft//dtd internet explorer 3.0 tables//";
241 "-//netscape comm. corp.//dtd html//";
242 "-//netscape comm. corp.//dtd strict html//";
243 "-//o'reilly and associates//dtd html 2.0//";
244 "-//o'reilly and associates//dtd html extended 1.0//";
245 "-//o'reilly and associates//dtd html extended relaxed 1.0//";
246 "-//sq//dtd html 2.0 hotmetal + extensions//";
247 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//";
248 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//";
249 "-//spyglass//dtd html 2.0 extended//";
250 "-//sun microsystems corp.//dtd hotjava html//";
251 "-//sun microsystems corp.//dtd hotjava strict html//";
252 "-//w3c//dtd html 3 1995-03-24//";
253 "-//w3c//dtd html 3.2 draft//";
254 "-//w3c//dtd html 3.2 final//";
255 "-//w3c//dtd html 3.2//";
256 "-//w3c//dtd html 3.2s draft//";
257 "-//w3c//dtd html 4.0 frameset//";
258 "-//w3c//dtd html 4.0 transitional//";
259 "-//w3c//dtd html experimental 19960712//";
260 "-//w3c//dtd html experimental 970421//";
261 "-//w3c//dtd w3 html//";
262 "-//w3o//dtd w3 html 3.0//";
263 "-//webtechs//dtd mozilla html 2.0//";
264 "-//webtechs//dtd mozilla html//";
265]
266
267let limited_quirky_public_prefixes = [
268 "-//w3c//dtd xhtml 1.0 frameset//";
269 "-//w3c//dtd xhtml 1.0 transitional//";
270]
271
272let html4_public_prefixes = [
273 "-//w3c//dtd html 4.01 frameset//";
274 "-//w3c//dtd html 4.01 transitional//";
275]
276
277let quirky_system_matches = [
278 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
279]
280
281(* Helper functions *)
282let is_void = List.mem
283let is_formatting = List.mem
284let is_special name = List.mem name special_elements
285let is_heading = List.mem
286
287let adjust_svg_tag_name name =
288 match List.assoc_opt (lowercase name) svg_tag_adjustments with
289 | Some adjusted -> adjusted
290 | None -> name
291
292let adjust_svg_attrs attrs =
293 List.map (fun (name, value) ->
294 let adjusted_name =
295 match List.assoc_opt (lowercase name) svg_attr_adjustments with
296 | Some n -> n
297 | None -> name
298 in
299 (adjusted_name, value)
300 ) attrs
301
302let adjust_foreign_attrs attrs =
303 List.map (fun (name, value) ->
304 match List.assoc_opt (lowercase name) foreign_attr_adjustments with
305 | Some (prefix, local, _ns) ->
306 if prefix = "" then (local, value)
307 else (prefix ^ ":" ^ local, value)
308 | None -> (name, value)
309 ) attrs