OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 6387bcc507f92984dc0154360b8bc442d5332615 309 lines 11 kB view raw
1(* HTML5 spec constants *) 2 3(* Use Astring for string operations *) 4let lowercase = Astring.String.Ascii.lowercase 5 6(* Void elements - no end tag allowed *) 7let void_elements = [ 8 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; 9 "link"; "meta"; "source"; "track"; "wbr" 10] 11 12(* Raw text elements - content is raw text *) 13let raw_text_elements = ["script"; "style"] 14 15(* Escapable raw text elements *) 16let escapable_raw_text_elements = ["textarea"; "title"] 17 18(* Formatting elements for adoption agency *) 19let formatting_elements = [ 20 "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; 21 "strike"; "strong"; "tt"; "u" 22] 23 24(* Special elements *) 25let special_elements = [ 26 "address"; "applet"; "area"; "article"; "aside"; "base"; "basefont"; 27 "bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center"; 28 "col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed"; 29 "fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset"; 30 "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr"; 31 "html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing"; 32 "main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes"; 33 "noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script"; 34 "search"; "section"; "select"; "source"; "style"; "summary"; "table"; 35 "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title"; 36 "tr"; "track"; "ul"; "wbr"; "xmp" 37] 38 39(* Heading elements *) 40let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"] 41 42(* Implied end tag elements *) 43let implied_end_tags = [ 44 "dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc" 45] 46 47(* Thoroughly implied end tags *) 48let thoroughly_implied_end_tags = [ 49 "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p"; 50 "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr" 51] 52 53(* Scope elements for various scope checks *) 54let default_scope = [ 55 "applet"; "caption"; "html"; "table"; "td"; "th"; "marquee"; "object"; "template" 56] 57 58let list_item_scope = default_scope @ ["ol"; "ul"] 59 60let button_scope = default_scope @ ["button"] 61 62let table_scope = ["html"; "table"; "template"] 63 64let select_scope_exclude = ["optgroup"; "option"] 65 66(* MathML text integration points *) 67let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"] 68 69(* MathML attribute adjustments *) 70let mathml_attr_adjustments = [ 71 ("definitionurl", "definitionURL") 72] 73 74let adjust_mathml_attrs attrs = 75 List.map (fun (k, v) -> 76 match List.assoc_opt (lowercase k) mathml_attr_adjustments with 77 | Some adjusted_k -> (adjusted_k, v) 78 | None -> (k, v) 79 ) attrs 80 81(* SVG HTML integration points *) 82let svg_html_integration = ["foreignObject"; "desc"; "title"] 83 84(* SVG tag name adjustments *) 85let svg_tag_adjustments = [ 86 ("altglyph", "altGlyph"); 87 ("altglyphdef", "altGlyphDef"); 88 ("altglyphitem", "altGlyphItem"); 89 ("animatecolor", "animateColor"); 90 ("animatemotion", "animateMotion"); 91 ("animatetransform", "animateTransform"); 92 ("clippath", "clipPath"); 93 ("feblend", "feBlend"); 94 ("fecolormatrix", "feColorMatrix"); 95 ("fecomponenttransfer", "feComponentTransfer"); 96 ("fecomposite", "feComposite"); 97 ("feconvolvematrix", "feConvolveMatrix"); 98 ("fediffuselighting", "feDiffuseLighting"); 99 ("fedisplacementmap", "feDisplacementMap"); 100 ("fedistantlight", "feDistantLight"); 101 ("fedropshadow", "feDropShadow"); 102 ("feflood", "feFlood"); 103 ("fefunca", "feFuncA"); 104 ("fefuncb", "feFuncB"); 105 ("fefuncg", "feFuncG"); 106 ("fefuncr", "feFuncR"); 107 ("fegaussianblur", "feGaussianBlur"); 108 ("feimage", "feImage"); 109 ("femerge", "feMerge"); 110 ("femergenode", "feMergeNode"); 111 ("femorphology", "feMorphology"); 112 ("feoffset", "feOffset"); 113 ("fepointlight", "fePointLight"); 114 ("fespecularlighting", "feSpecularLighting"); 115 ("fespotlight", "feSpotLight"); 116 ("fetile", "feTile"); 117 ("feturbulence", "feTurbulence"); 118 ("foreignobject", "foreignObject"); 119 ("glyphref", "glyphRef"); 120 ("lineargradient", "linearGradient"); 121 ("radialgradient", "radialGradient"); 122 ("textpath", "textPath"); 123] 124 125(* SVG attribute adjustments *) 126let svg_attr_adjustments = [ 127 ("attributename", "attributeName"); 128 ("attributetype", "attributeType"); 129 ("basefrequency", "baseFrequency"); 130 ("baseprofile", "baseProfile"); 131 ("calcmode", "calcMode"); 132 ("clippathunits", "clipPathUnits"); 133 ("diffuseconstant", "diffuseConstant"); 134 ("edgemode", "edgeMode"); 135 ("filterunits", "filterUnits"); 136 ("glyphref", "glyphRef"); 137 ("gradienttransform", "gradientTransform"); 138 ("gradientunits", "gradientUnits"); 139 ("kernelmatrix", "kernelMatrix"); 140 ("kernelunitlength", "kernelUnitLength"); 141 ("keypoints", "keyPoints"); 142 ("keysplines", "keySplines"); 143 ("keytimes", "keyTimes"); 144 ("lengthadjust", "lengthAdjust"); 145 ("limitingconeangle", "limitingConeAngle"); 146 ("markerheight", "markerHeight"); 147 ("markerunits", "markerUnits"); 148 ("markerwidth", "markerWidth"); 149 ("maskcontentunits", "maskContentUnits"); 150 ("maskunits", "maskUnits"); 151 ("numoctaves", "numOctaves"); 152 ("pathlength", "pathLength"); 153 ("patterncontentunits", "patternContentUnits"); 154 ("patterntransform", "patternTransform"); 155 ("patternunits", "patternUnits"); 156 ("pointsatx", "pointsAtX"); 157 ("pointsaty", "pointsAtY"); 158 ("pointsatz", "pointsAtZ"); 159 ("preservealpha", "preserveAlpha"); 160 ("preserveaspectratio", "preserveAspectRatio"); 161 ("primitiveunits", "primitiveUnits"); 162 ("refx", "refX"); 163 ("refy", "refY"); 164 ("repeatcount", "repeatCount"); 165 ("repeatdur", "repeatDur"); 166 ("requiredextensions", "requiredExtensions"); 167 ("requiredfeatures", "requiredFeatures"); 168 ("specularconstant", "specularConstant"); 169 ("specularexponent", "specularExponent"); 170 ("spreadmethod", "spreadMethod"); 171 ("startoffset", "startOffset"); 172 ("stddeviation", "stdDeviation"); 173 ("stitchtiles", "stitchTiles"); 174 ("surfacescale", "surfaceScale"); 175 ("systemlanguage", "systemLanguage"); 176 ("tablevalues", "tableValues"); 177 ("targetx", "targetX"); 178 ("targety", "targetY"); 179 ("textlength", "textLength"); 180 ("viewbox", "viewBox"); 181 ("viewtarget", "viewTarget"); 182 ("xchannelselector", "xChannelSelector"); 183 ("ychannelselector", "yChannelSelector"); 184 ("zoomandpan", "zoomAndPan"); 185] 186 187(* Foreign attribute adjustments *) 188let foreign_attr_adjustments = [ 189 ("xlink:actuate", ("xlink", "actuate", "http://www.w3.org/1999/xlink")); 190 ("xlink:arcrole", ("xlink", "arcrole", "http://www.w3.org/1999/xlink")); 191 ("xlink:href", ("xlink", "href", "http://www.w3.org/1999/xlink")); 192 ("xlink:role", ("xlink", "role", "http://www.w3.org/1999/xlink")); 193 ("xlink:show", ("xlink", "show", "http://www.w3.org/1999/xlink")); 194 ("xlink:title", ("xlink", "title", "http://www.w3.org/1999/xlink")); 195 ("xlink:type", ("xlink", "type", "http://www.w3.org/1999/xlink")); 196 ("xml:lang", ("xml", "lang", "http://www.w3.org/XML/1998/namespace")); 197 ("xml:space", ("xml", "space", "http://www.w3.org/XML/1998/namespace")); 198 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/")); 199 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/")); 200] 201 202(* Quirks mode detection *) 203let quirky_public_matches = [ 204 "-//w3o//dtd w3 html strict 3.0//en//"; 205 "-/w3c/dtd html 4.0 transitional/en"; 206 "html" 207] 208 209let quirky_public_prefixes = [ 210 "+//silmaril//dtd html pro v0r11 19970101//"; 211 "-//as//dtd html 3.0 aswedit + extensions//"; 212 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"; 213 "-//ietf//dtd html 2.0 level 1//"; 214 "-//ietf//dtd html 2.0 level 2//"; 215 "-//ietf//dtd html 2.0 strict level 1//"; 216 "-//ietf//dtd html 2.0 strict level 2//"; 217 "-//ietf//dtd html 2.0 strict//"; 218 "-//ietf//dtd html 2.0//"; 219 "-//ietf//dtd html 2.1e//"; 220 "-//ietf//dtd html 3.0//"; 221 "-//ietf//dtd html 3.2 final//"; 222 "-//ietf//dtd html 3.2//"; 223 "-//ietf//dtd html 3//"; 224 "-//ietf//dtd html level 0//"; 225 "-//ietf//dtd html level 1//"; 226 "-//ietf//dtd html level 2//"; 227 "-//ietf//dtd html level 3//"; 228 "-//ietf//dtd html strict level 0//"; 229 "-//ietf//dtd html strict level 1//"; 230 "-//ietf//dtd html strict level 2//"; 231 "-//ietf//dtd html strict level 3//"; 232 "-//ietf//dtd html strict//"; 233 "-//ietf//dtd html//"; 234 "-//metrius//dtd metrius presentational//"; 235 "-//microsoft//dtd internet explorer 2.0 html strict//"; 236 "-//microsoft//dtd internet explorer 2.0 html//"; 237 "-//microsoft//dtd internet explorer 2.0 tables//"; 238 "-//microsoft//dtd internet explorer 3.0 html strict//"; 239 "-//microsoft//dtd internet explorer 3.0 html//"; 240 "-//microsoft//dtd internet explorer 3.0 tables//"; 241 "-//netscape comm. corp.//dtd html//"; 242 "-//netscape comm. corp.//dtd strict html//"; 243 "-//o'reilly and associates//dtd html 2.0//"; 244 "-//o'reilly and associates//dtd html extended 1.0//"; 245 "-//o'reilly and associates//dtd html extended relaxed 1.0//"; 246 "-//sq//dtd html 2.0 hotmetal + extensions//"; 247 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"; 248 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"; 249 "-//spyglass//dtd html 2.0 extended//"; 250 "-//sun microsystems corp.//dtd hotjava html//"; 251 "-//sun microsystems corp.//dtd hotjava strict html//"; 252 "-//w3c//dtd html 3 1995-03-24//"; 253 "-//w3c//dtd html 3.2 draft//"; 254 "-//w3c//dtd html 3.2 final//"; 255 "-//w3c//dtd html 3.2//"; 256 "-//w3c//dtd html 3.2s draft//"; 257 "-//w3c//dtd html 4.0 frameset//"; 258 "-//w3c//dtd html 4.0 transitional//"; 259 "-//w3c//dtd html experimental 19960712//"; 260 "-//w3c//dtd html experimental 970421//"; 261 "-//w3c//dtd w3 html//"; 262 "-//w3o//dtd w3 html 3.0//"; 263 "-//webtechs//dtd mozilla html 2.0//"; 264 "-//webtechs//dtd mozilla html//"; 265] 266 267let limited_quirky_public_prefixes = [ 268 "-//w3c//dtd xhtml 1.0 frameset//"; 269 "-//w3c//dtd xhtml 1.0 transitional//"; 270] 271 272let html4_public_prefixes = [ 273 "-//w3c//dtd html 4.01 frameset//"; 274 "-//w3c//dtd html 4.01 transitional//"; 275] 276 277let quirky_system_matches = [ 278 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" 279] 280 281(* Helper functions *) 282let is_void = List.mem 283let is_formatting = List.mem 284let is_special name = List.mem name special_elements 285let is_heading = List.mem 286 287let adjust_svg_tag_name name = 288 match List.assoc_opt (lowercase name) svg_tag_adjustments with 289 | Some adjusted -> adjusted 290 | None -> name 291 292let adjust_svg_attrs attrs = 293 List.map (fun (name, value) -> 294 let adjusted_name = 295 match List.assoc_opt (lowercase name) svg_attr_adjustments with 296 | Some n -> n 297 | None -> name 298 in 299 (adjusted_name, value) 300 ) attrs 301 302let adjust_foreign_attrs attrs = 303 List.map (fun (name, value) -> 304 match List.assoc_opt (lowercase name) foreign_attr_adjustments with 305 | Some (prefix, local, _ns) -> 306 if prefix = "" then (local, value) 307 else (prefix ^ ":" ^ local, value) 308 | None -> (name, value) 309 ) attrs