OCaml HTML5 parser/serialiser based on Python's JustHTML

init

+1
.gitignore
···
··· 1 + _build
+22
LICENSE.md
···
··· 1 + MIT License 2 + 3 + Copyright (c) 2025 Emil Stenström 4 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 5 + 6 + Permission is hereby granted, free of charge, to any person obtaining a copy 7 + of this software and associated documentation files (the "Software"), to deal 8 + in the Software without restriction, including without limitation the rights 9 + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 + copies of the Software, and to permit persons to whom the Software is 11 + furnished to do so, subject to the following conditions: 12 + 13 + The above copyright notice and this permission notice shall be included in all 14 + copies or substantial portions of the Software. 15 + 16 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 + SOFTWARE.
+2233
data/entities.json
···
··· 1 + { 2 + "&AElig": { "codepoints": [198], "characters": "\u00C6" }, 3 + "&AElig;": { "codepoints": [198], "characters": "\u00C6" }, 4 + "&AMP": { "codepoints": [38], "characters": "\u0026" }, 5 + "&AMP;": { "codepoints": [38], "characters": "\u0026" }, 6 + "&Aacute": { "codepoints": [193], "characters": "\u00C1" }, 7 + "&Aacute;": { "codepoints": [193], "characters": "\u00C1" }, 8 + "&Abreve;": { "codepoints": [258], "characters": "\u0102" }, 9 + "&Acirc": { "codepoints": [194], "characters": "\u00C2" }, 10 + "&Acirc;": { "codepoints": [194], "characters": "\u00C2" }, 11 + "&Acy;": { "codepoints": [1040], "characters": "\u0410" }, 12 + "&Afr;": { "codepoints": [120068], "characters": "\uD835\uDD04" }, 13 + "&Agrave": { "codepoints": [192], "characters": "\u00C0" }, 14 + "&Agrave;": { "codepoints": [192], "characters": "\u00C0" }, 15 + "&Alpha;": { "codepoints": [913], "characters": "\u0391" }, 16 + "&Amacr;": { "codepoints": [256], "characters": "\u0100" }, 17 + "&And;": { "codepoints": [10835], "characters": "\u2A53" }, 18 + "&Aogon;": { "codepoints": [260], "characters": "\u0104" }, 19 + "&Aopf;": { "codepoints": [120120], "characters": "\uD835\uDD38" }, 20 + "&ApplyFunction;": { "codepoints": [8289], "characters": "\u2061" }, 21 + "&Aring": { "codepoints": [197], "characters": "\u00C5" }, 22 + "&Aring;": { "codepoints": [197], "characters": "\u00C5" }, 23 + "&Ascr;": { "codepoints": [119964], "characters": "\uD835\uDC9C" }, 24 + "&Assign;": { "codepoints": [8788], "characters": "\u2254" }, 25 + "&Atilde": { "codepoints": [195], "characters": "\u00C3" }, 26 + "&Atilde;": { "codepoints": [195], "characters": "\u00C3" }, 27 + "&Auml": { "codepoints": [196], "characters": "\u00C4" }, 28 + "&Auml;": { "codepoints": [196], "characters": "\u00C4" }, 29 + "&Backslash;": { "codepoints": [8726], "characters": "\u2216" }, 30 + "&Barv;": { "codepoints": [10983], "characters": "\u2AE7" }, 31 + "&Barwed;": { "codepoints": [8966], "characters": "\u2306" }, 32 + "&Bcy;": { 
"codepoints": [1041], "characters": "\u0411" }, 33 + "&Because;": { "codepoints": [8757], "characters": "\u2235" }, 34 + "&Bernoullis;": { "codepoints": [8492], "characters": "\u212C" }, 35 + "&Beta;": { "codepoints": [914], "characters": "\u0392" }, 36 + "&Bfr;": { "codepoints": [120069], "characters": "\uD835\uDD05" }, 37 + "&Bopf;": { "codepoints": [120121], "characters": "\uD835\uDD39" }, 38 + "&Breve;": { "codepoints": [728], "characters": "\u02D8" }, 39 + "&Bscr;": { "codepoints": [8492], "characters": "\u212C" }, 40 + "&Bumpeq;": { "codepoints": [8782], "characters": "\u224E" }, 41 + "&CHcy;": { "codepoints": [1063], "characters": "\u0427" }, 42 + "&COPY": { "codepoints": [169], "characters": "\u00A9" }, 43 + "&COPY;": { "codepoints": [169], "characters": "\u00A9" }, 44 + "&Cacute;": { "codepoints": [262], "characters": "\u0106" }, 45 + "&Cap;": { "codepoints": [8914], "characters": "\u22D2" }, 46 + "&CapitalDifferentialD;": { "codepoints": [8517], "characters": "\u2145" }, 47 + "&Cayleys;": { "codepoints": [8493], "characters": "\u212D" }, 48 + "&Ccaron;": { "codepoints": [268], "characters": "\u010C" }, 49 + "&Ccedil": { "codepoints": [199], "characters": "\u00C7" }, 50 + "&Ccedil;": { "codepoints": [199], "characters": "\u00C7" }, 51 + "&Ccirc;": { "codepoints": [264], "characters": "\u0108" }, 52 + "&Cconint;": { "codepoints": [8752], "characters": "\u2230" }, 53 + "&Cdot;": { "codepoints": [266], "characters": "\u010A" }, 54 + "&Cedilla;": { "codepoints": [184], "characters": "\u00B8" }, 55 + "&CenterDot;": { "codepoints": [183], "characters": "\u00B7" }, 56 + "&Cfr;": { "codepoints": [8493], "characters": "\u212D" }, 57 + "&Chi;": { "codepoints": [935], "characters": "\u03A7" }, 58 + "&CircleDot;": { "codepoints": [8857], "characters": "\u2299" }, 59 + "&CircleMinus;": { "codepoints": [8854], "characters": "\u2296" }, 60 + "&CirclePlus;": { "codepoints": [8853], "characters": "\u2295" }, 61 + "&CircleTimes;": { "codepoints": [8855], "characters": 
"\u2297" }, 62 + "&ClockwiseContourIntegral;": { "codepoints": [8754], "characters": "\u2232" }, 63 + "&CloseCurlyDoubleQuote;": { "codepoints": [8221], "characters": "\u201D" }, 64 + "&CloseCurlyQuote;": { "codepoints": [8217], "characters": "\u2019" }, 65 + "&Colon;": { "codepoints": [8759], "characters": "\u2237" }, 66 + "&Colone;": { "codepoints": [10868], "characters": "\u2A74" }, 67 + "&Congruent;": { "codepoints": [8801], "characters": "\u2261" }, 68 + "&Conint;": { "codepoints": [8751], "characters": "\u222F" }, 69 + "&ContourIntegral;": { "codepoints": [8750], "characters": "\u222E" }, 70 + "&Copf;": { "codepoints": [8450], "characters": "\u2102" }, 71 + "&Coproduct;": { "codepoints": [8720], "characters": "\u2210" }, 72 + "&CounterClockwiseContourIntegral;": { "codepoints": [8755], "characters": "\u2233" }, 73 + "&Cross;": { "codepoints": [10799], "characters": "\u2A2F" }, 74 + "&Cscr;": { "codepoints": [119966], "characters": "\uD835\uDC9E" }, 75 + "&Cup;": { "codepoints": [8915], "characters": "\u22D3" }, 76 + "&CupCap;": { "codepoints": [8781], "characters": "\u224D" }, 77 + "&DD;": { "codepoints": [8517], "characters": "\u2145" }, 78 + "&DDotrahd;": { "codepoints": [10513], "characters": "\u2911" }, 79 + "&DJcy;": { "codepoints": [1026], "characters": "\u0402" }, 80 + "&DScy;": { "codepoints": [1029], "characters": "\u0405" }, 81 + "&DZcy;": { "codepoints": [1039], "characters": "\u040F" }, 82 + "&Dagger;": { "codepoints": [8225], "characters": "\u2021" }, 83 + "&Darr;": { "codepoints": [8609], "characters": "\u21A1" }, 84 + "&Dashv;": { "codepoints": [10980], "characters": "\u2AE4" }, 85 + "&Dcaron;": { "codepoints": [270], "characters": "\u010E" }, 86 + "&Dcy;": { "codepoints": [1044], "characters": "\u0414" }, 87 + "&Del;": { "codepoints": [8711], "characters": "\u2207" }, 88 + "&Delta;": { "codepoints": [916], "characters": "\u0394" }, 89 + "&Dfr;": { "codepoints": [120071], "characters": "\uD835\uDD07" }, 90 + "&DiacriticalAcute;": { 
"codepoints": [180], "characters": "\u00B4" }, 91 + "&DiacriticalDot;": { "codepoints": [729], "characters": "\u02D9" }, 92 + "&DiacriticalDoubleAcute;": { "codepoints": [733], "characters": "\u02DD" }, 93 + "&DiacriticalGrave;": { "codepoints": [96], "characters": "\u0060" }, 94 + "&DiacriticalTilde;": { "codepoints": [732], "characters": "\u02DC" }, 95 + "&Diamond;": { "codepoints": [8900], "characters": "\u22C4" }, 96 + "&DifferentialD;": { "codepoints": [8518], "characters": "\u2146" }, 97 + "&Dopf;": { "codepoints": [120123], "characters": "\uD835\uDD3B" }, 98 + "&Dot;": { "codepoints": [168], "characters": "\u00A8" }, 99 + "&DotDot;": { "codepoints": [8412], "characters": "\u20DC" }, 100 + "&DotEqual;": { "codepoints": [8784], "characters": "\u2250" }, 101 + "&DoubleContourIntegral;": { "codepoints": [8751], "characters": "\u222F" }, 102 + "&DoubleDot;": { "codepoints": [168], "characters": "\u00A8" }, 103 + "&DoubleDownArrow;": { "codepoints": [8659], "characters": "\u21D3" }, 104 + "&DoubleLeftArrow;": { "codepoints": [8656], "characters": "\u21D0" }, 105 + "&DoubleLeftRightArrow;": { "codepoints": [8660], "characters": "\u21D4" }, 106 + "&DoubleLeftTee;": { "codepoints": [10980], "characters": "\u2AE4" }, 107 + "&DoubleLongLeftArrow;": { "codepoints": [10232], "characters": "\u27F8" }, 108 + "&DoubleLongLeftRightArrow;": { "codepoints": [10234], "characters": "\u27FA" }, 109 + "&DoubleLongRightArrow;": { "codepoints": [10233], "characters": "\u27F9" }, 110 + "&DoubleRightArrow;": { "codepoints": [8658], "characters": "\u21D2" }, 111 + "&DoubleRightTee;": { "codepoints": [8872], "characters": "\u22A8" }, 112 + "&DoubleUpArrow;": { "codepoints": [8657], "characters": "\u21D1" }, 113 + "&DoubleUpDownArrow;": { "codepoints": [8661], "characters": "\u21D5" }, 114 + "&DoubleVerticalBar;": { "codepoints": [8741], "characters": "\u2225" }, 115 + "&DownArrow;": { "codepoints": [8595], "characters": "\u2193" }, 116 + "&DownArrowBar;": { "codepoints": [10515], 
"characters": "\u2913" }, 117 + "&DownArrowUpArrow;": { "codepoints": [8693], "characters": "\u21F5" }, 118 + "&DownBreve;": { "codepoints": [785], "characters": "\u0311" }, 119 + "&DownLeftRightVector;": { "codepoints": [10576], "characters": "\u2950" }, 120 + "&DownLeftTeeVector;": { "codepoints": [10590], "characters": "\u295E" }, 121 + "&DownLeftVector;": { "codepoints": [8637], "characters": "\u21BD" }, 122 + "&DownLeftVectorBar;": { "codepoints": [10582], "characters": "\u2956" }, 123 + "&DownRightTeeVector;": { "codepoints": [10591], "characters": "\u295F" }, 124 + "&DownRightVector;": { "codepoints": [8641], "characters": "\u21C1" }, 125 + "&DownRightVectorBar;": { "codepoints": [10583], "characters": "\u2957" }, 126 + "&DownTee;": { "codepoints": [8868], "characters": "\u22A4" }, 127 + "&DownTeeArrow;": { "codepoints": [8615], "characters": "\u21A7" }, 128 + "&Downarrow;": { "codepoints": [8659], "characters": "\u21D3" }, 129 + "&Dscr;": { "codepoints": [119967], "characters": "\uD835\uDC9F" }, 130 + "&Dstrok;": { "codepoints": [272], "characters": "\u0110" }, 131 + "&ENG;": { "codepoints": [330], "characters": "\u014A" }, 132 + "&ETH": { "codepoints": [208], "characters": "\u00D0" }, 133 + "&ETH;": { "codepoints": [208], "characters": "\u00D0" }, 134 + "&Eacute": { "codepoints": [201], "characters": "\u00C9" }, 135 + "&Eacute;": { "codepoints": [201], "characters": "\u00C9" }, 136 + "&Ecaron;": { "codepoints": [282], "characters": "\u011A" }, 137 + "&Ecirc": { "codepoints": [202], "characters": "\u00CA" }, 138 + "&Ecirc;": { "codepoints": [202], "characters": "\u00CA" }, 139 + "&Ecy;": { "codepoints": [1069], "characters": "\u042D" }, 140 + "&Edot;": { "codepoints": [278], "characters": "\u0116" }, 141 + "&Efr;": { "codepoints": [120072], "characters": "\uD835\uDD08" }, 142 + "&Egrave": { "codepoints": [200], "characters": "\u00C8" }, 143 + "&Egrave;": { "codepoints": [200], "characters": "\u00C8" }, 144 + "&Element;": { "codepoints": [8712], 
"characters": "\u2208" }, 145 + "&Emacr;": { "codepoints": [274], "characters": "\u0112" }, 146 + "&EmptySmallSquare;": { "codepoints": [9723], "characters": "\u25FB" }, 147 + "&EmptyVerySmallSquare;": { "codepoints": [9643], "characters": "\u25AB" }, 148 + "&Eogon;": { "codepoints": [280], "characters": "\u0118" }, 149 + "&Eopf;": { "codepoints": [120124], "characters": "\uD835\uDD3C" }, 150 + "&Epsilon;": { "codepoints": [917], "characters": "\u0395" }, 151 + "&Equal;": { "codepoints": [10869], "characters": "\u2A75" }, 152 + "&EqualTilde;": { "codepoints": [8770], "characters": "\u2242" }, 153 + "&Equilibrium;": { "codepoints": [8652], "characters": "\u21CC" }, 154 + "&Escr;": { "codepoints": [8496], "characters": "\u2130" }, 155 + "&Esim;": { "codepoints": [10867], "characters": "\u2A73" }, 156 + "&Eta;": { "codepoints": [919], "characters": "\u0397" }, 157 + "&Euml": { "codepoints": [203], "characters": "\u00CB" }, 158 + "&Euml;": { "codepoints": [203], "characters": "\u00CB" }, 159 + "&Exists;": { "codepoints": [8707], "characters": "\u2203" }, 160 + "&ExponentialE;": { "codepoints": [8519], "characters": "\u2147" }, 161 + "&Fcy;": { "codepoints": [1060], "characters": "\u0424" }, 162 + "&Ffr;": { "codepoints": [120073], "characters": "\uD835\uDD09" }, 163 + "&FilledSmallSquare;": { "codepoints": [9724], "characters": "\u25FC" }, 164 + "&FilledVerySmallSquare;": { "codepoints": [9642], "characters": "\u25AA" }, 165 + "&Fopf;": { "codepoints": [120125], "characters": "\uD835\uDD3D" }, 166 + "&ForAll;": { "codepoints": [8704], "characters": "\u2200" }, 167 + "&Fouriertrf;": { "codepoints": [8497], "characters": "\u2131" }, 168 + "&Fscr;": { "codepoints": [8497], "characters": "\u2131" }, 169 + "&GJcy;": { "codepoints": [1027], "characters": "\u0403" }, 170 + "&GT": { "codepoints": [62], "characters": "\u003E" }, 171 + "&GT;": { "codepoints": [62], "characters": "\u003E" }, 172 + "&Gamma;": { "codepoints": [915], "characters": "\u0393" }, 173 + "&Gammad;": { 
"codepoints": [988], "characters": "\u03DC" }, 174 + "&Gbreve;": { "codepoints": [286], "characters": "\u011E" }, 175 + "&Gcedil;": { "codepoints": [290], "characters": "\u0122" }, 176 + "&Gcirc;": { "codepoints": [284], "characters": "\u011C" }, 177 + "&Gcy;": { "codepoints": [1043], "characters": "\u0413" }, 178 + "&Gdot;": { "codepoints": [288], "characters": "\u0120" }, 179 + "&Gfr;": { "codepoints": [120074], "characters": "\uD835\uDD0A" }, 180 + "&Gg;": { "codepoints": [8921], "characters": "\u22D9" }, 181 + "&Gopf;": { "codepoints": [120126], "characters": "\uD835\uDD3E" }, 182 + "&GreaterEqual;": { "codepoints": [8805], "characters": "\u2265" }, 183 + "&GreaterEqualLess;": { "codepoints": [8923], "characters": "\u22DB" }, 184 + "&GreaterFullEqual;": { "codepoints": [8807], "characters": "\u2267" }, 185 + "&GreaterGreater;": { "codepoints": [10914], "characters": "\u2AA2" }, 186 + "&GreaterLess;": { "codepoints": [8823], "characters": "\u2277" }, 187 + "&GreaterSlantEqual;": { "codepoints": [10878], "characters": "\u2A7E" }, 188 + "&GreaterTilde;": { "codepoints": [8819], "characters": "\u2273" }, 189 + "&Gscr;": { "codepoints": [119970], "characters": "\uD835\uDCA2" }, 190 + "&Gt;": { "codepoints": [8811], "characters": "\u226B" }, 191 + "&HARDcy;": { "codepoints": [1066], "characters": "\u042A" }, 192 + "&Hacek;": { "codepoints": [711], "characters": "\u02C7" }, 193 + "&Hat;": { "codepoints": [94], "characters": "\u005E" }, 194 + "&Hcirc;": { "codepoints": [292], "characters": "\u0124" }, 195 + "&Hfr;": { "codepoints": [8460], "characters": "\u210C" }, 196 + "&HilbertSpace;": { "codepoints": [8459], "characters": "\u210B" }, 197 + "&Hopf;": { "codepoints": [8461], "characters": "\u210D" }, 198 + "&HorizontalLine;": { "codepoints": [9472], "characters": "\u2500" }, 199 + "&Hscr;": { "codepoints": [8459], "characters": "\u210B" }, 200 + "&Hstrok;": { "codepoints": [294], "characters": "\u0126" }, 201 + "&HumpDownHump;": { "codepoints": [8782], "characters": 
"\u224E" }, 202 + "&HumpEqual;": { "codepoints": [8783], "characters": "\u224F" }, 203 + "&IEcy;": { "codepoints": [1045], "characters": "\u0415" }, 204 + "&IJlig;": { "codepoints": [306], "characters": "\u0132" }, 205 + "&IOcy;": { "codepoints": [1025], "characters": "\u0401" }, 206 + "&Iacute": { "codepoints": [205], "characters": "\u00CD" }, 207 + "&Iacute;": { "codepoints": [205], "characters": "\u00CD" }, 208 + "&Icirc": { "codepoints": [206], "characters": "\u00CE" }, 209 + "&Icirc;": { "codepoints": [206], "characters": "\u00CE" }, 210 + "&Icy;": { "codepoints": [1048], "characters": "\u0418" }, 211 + "&Idot;": { "codepoints": [304], "characters": "\u0130" }, 212 + "&Ifr;": { "codepoints": [8465], "characters": "\u2111" }, 213 + "&Igrave": { "codepoints": [204], "characters": "\u00CC" }, 214 + "&Igrave;": { "codepoints": [204], "characters": "\u00CC" }, 215 + "&Im;": { "codepoints": [8465], "characters": "\u2111" }, 216 + "&Imacr;": { "codepoints": [298], "characters": "\u012A" }, 217 + "&ImaginaryI;": { "codepoints": [8520], "characters": "\u2148" }, 218 + "&Implies;": { "codepoints": [8658], "characters": "\u21D2" }, 219 + "&Int;": { "codepoints": [8748], "characters": "\u222C" }, 220 + "&Integral;": { "codepoints": [8747], "characters": "\u222B" }, 221 + "&Intersection;": { "codepoints": [8898], "characters": "\u22C2" }, 222 + "&InvisibleComma;": { "codepoints": [8291], "characters": "\u2063" }, 223 + "&InvisibleTimes;": { "codepoints": [8290], "characters": "\u2062" }, 224 + "&Iogon;": { "codepoints": [302], "characters": "\u012E" }, 225 + "&Iopf;": { "codepoints": [120128], "characters": "\uD835\uDD40" }, 226 + "&Iota;": { "codepoints": [921], "characters": "\u0399" }, 227 + "&Iscr;": { "codepoints": [8464], "characters": "\u2110" }, 228 + "&Itilde;": { "codepoints": [296], "characters": "\u0128" }, 229 + "&Iukcy;": { "codepoints": [1030], "characters": "\u0406" }, 230 + "&Iuml": { "codepoints": [207], "characters": "\u00CF" }, 231 + "&Iuml;": { 
"codepoints": [207], "characters": "\u00CF" }, 232 + "&Jcirc;": { "codepoints": [308], "characters": "\u0134" }, 233 + "&Jcy;": { "codepoints": [1049], "characters": "\u0419" }, 234 + "&Jfr;": { "codepoints": [120077], "characters": "\uD835\uDD0D" }, 235 + "&Jopf;": { "codepoints": [120129], "characters": "\uD835\uDD41" }, 236 + "&Jscr;": { "codepoints": [119973], "characters": "\uD835\uDCA5" }, 237 + "&Jsercy;": { "codepoints": [1032], "characters": "\u0408" }, 238 + "&Jukcy;": { "codepoints": [1028], "characters": "\u0404" }, 239 + "&KHcy;": { "codepoints": [1061], "characters": "\u0425" }, 240 + "&KJcy;": { "codepoints": [1036], "characters": "\u040C" }, 241 + "&Kappa;": { "codepoints": [922], "characters": "\u039A" }, 242 + "&Kcedil;": { "codepoints": [310], "characters": "\u0136" }, 243 + "&Kcy;": { "codepoints": [1050], "characters": "\u041A" }, 244 + "&Kfr;": { "codepoints": [120078], "characters": "\uD835\uDD0E" }, 245 + "&Kopf;": { "codepoints": [120130], "characters": "\uD835\uDD42" }, 246 + "&Kscr;": { "codepoints": [119974], "characters": "\uD835\uDCA6" }, 247 + "&LJcy;": { "codepoints": [1033], "characters": "\u0409" }, 248 + "&LT": { "codepoints": [60], "characters": "\u003C" }, 249 + "&LT;": { "codepoints": [60], "characters": "\u003C" }, 250 + "&Lacute;": { "codepoints": [313], "characters": "\u0139" }, 251 + "&Lambda;": { "codepoints": [923], "characters": "\u039B" }, 252 + "&Lang;": { "codepoints": [10218], "characters": "\u27EA" }, 253 + "&Laplacetrf;": { "codepoints": [8466], "characters": "\u2112" }, 254 + "&Larr;": { "codepoints": [8606], "characters": "\u219E" }, 255 + "&Lcaron;": { "codepoints": [317], "characters": "\u013D" }, 256 + "&Lcedil;": { "codepoints": [315], "characters": "\u013B" }, 257 + "&Lcy;": { "codepoints": [1051], "characters": "\u041B" }, 258 + "&LeftAngleBracket;": { "codepoints": [10216], "characters": "\u27E8" }, 259 + "&LeftArrow;": { "codepoints": [8592], "characters": "\u2190" }, 260 + "&LeftArrowBar;": { 
"codepoints": [8676], "characters": "\u21E4" }, 261 + "&LeftArrowRightArrow;": { "codepoints": [8646], "characters": "\u21C6" }, 262 + "&LeftCeiling;": { "codepoints": [8968], "characters": "\u2308" }, 263 + "&LeftDoubleBracket;": { "codepoints": [10214], "characters": "\u27E6" }, 264 + "&LeftDownTeeVector;": { "codepoints": [10593], "characters": "\u2961" }, 265 + "&LeftDownVector;": { "codepoints": [8643], "characters": "\u21C3" }, 266 + "&LeftDownVectorBar;": { "codepoints": [10585], "characters": "\u2959" }, 267 + "&LeftFloor;": { "codepoints": [8970], "characters": "\u230A" }, 268 + "&LeftRightArrow;": { "codepoints": [8596], "characters": "\u2194" }, 269 + "&LeftRightVector;": { "codepoints": [10574], "characters": "\u294E" }, 270 + "&LeftTee;": { "codepoints": [8867], "characters": "\u22A3" }, 271 + "&LeftTeeArrow;": { "codepoints": [8612], "characters": "\u21A4" }, 272 + "&LeftTeeVector;": { "codepoints": [10586], "characters": "\u295A" }, 273 + "&LeftTriangle;": { "codepoints": [8882], "characters": "\u22B2" }, 274 + "&LeftTriangleBar;": { "codepoints": [10703], "characters": "\u29CF" }, 275 + "&LeftTriangleEqual;": { "codepoints": [8884], "characters": "\u22B4" }, 276 + "&LeftUpDownVector;": { "codepoints": [10577], "characters": "\u2951" }, 277 + "&LeftUpTeeVector;": { "codepoints": [10592], "characters": "\u2960" }, 278 + "&LeftUpVector;": { "codepoints": [8639], "characters": "\u21BF" }, 279 + "&LeftUpVectorBar;": { "codepoints": [10584], "characters": "\u2958" }, 280 + "&LeftVector;": { "codepoints": [8636], "characters": "\u21BC" }, 281 + "&LeftVectorBar;": { "codepoints": [10578], "characters": "\u2952" }, 282 + "&Leftarrow;": { "codepoints": [8656], "characters": "\u21D0" }, 283 + "&Leftrightarrow;": { "codepoints": [8660], "characters": "\u21D4" }, 284 + "&LessEqualGreater;": { "codepoints": [8922], "characters": "\u22DA" }, 285 + "&LessFullEqual;": { "codepoints": [8806], "characters": "\u2266" }, 286 + "&LessGreater;": { "codepoints": [8822], 
"characters": "\u2276" }, 287 + "&LessLess;": { "codepoints": [10913], "characters": "\u2AA1" }, 288 + "&LessSlantEqual;": { "codepoints": [10877], "characters": "\u2A7D" }, 289 + "&LessTilde;": { "codepoints": [8818], "characters": "\u2272" }, 290 + "&Lfr;": { "codepoints": [120079], "characters": "\uD835\uDD0F" }, 291 + "&Ll;": { "codepoints": [8920], "characters": "\u22D8" }, 292 + "&Lleftarrow;": { "codepoints": [8666], "characters": "\u21DA" }, 293 + "&Lmidot;": { "codepoints": [319], "characters": "\u013F" }, 294 + "&LongLeftArrow;": { "codepoints": [10229], "characters": "\u27F5" }, 295 + "&LongLeftRightArrow;": { "codepoints": [10231], "characters": "\u27F7" }, 296 + "&LongRightArrow;": { "codepoints": [10230], "characters": "\u27F6" }, 297 + "&Longleftarrow;": { "codepoints": [10232], "characters": "\u27F8" }, 298 + "&Longleftrightarrow;": { "codepoints": [10234], "characters": "\u27FA" }, 299 + "&Longrightarrow;": { "codepoints": [10233], "characters": "\u27F9" }, 300 + "&Lopf;": { "codepoints": [120131], "characters": "\uD835\uDD43" }, 301 + "&LowerLeftArrow;": { "codepoints": [8601], "characters": "\u2199" }, 302 + "&LowerRightArrow;": { "codepoints": [8600], "characters": "\u2198" }, 303 + "&Lscr;": { "codepoints": [8466], "characters": "\u2112" }, 304 + "&Lsh;": { "codepoints": [8624], "characters": "\u21B0" }, 305 + "&Lstrok;": { "codepoints": [321], "characters": "\u0141" }, 306 + "&Lt;": { "codepoints": [8810], "characters": "\u226A" }, 307 + "&Map;": { "codepoints": [10501], "characters": "\u2905" }, 308 + "&Mcy;": { "codepoints": [1052], "characters": "\u041C" }, 309 + "&MediumSpace;": { "codepoints": [8287], "characters": "\u205F" }, 310 + "&Mellintrf;": { "codepoints": [8499], "characters": "\u2133" }, 311 + "&Mfr;": { "codepoints": [120080], "characters": "\uD835\uDD10" }, 312 + "&MinusPlus;": { "codepoints": [8723], "characters": "\u2213" }, 313 + "&Mopf;": { "codepoints": [120132], "characters": "\uD835\uDD44" }, 314 + "&Mscr;": { 
"codepoints": [8499], "characters": "\u2133" }, 315 + "&Mu;": { "codepoints": [924], "characters": "\u039C" }, 316 + "&NJcy;": { "codepoints": [1034], "characters": "\u040A" }, 317 + "&Nacute;": { "codepoints": [323], "characters": "\u0143" }, 318 + "&Ncaron;": { "codepoints": [327], "characters": "\u0147" }, 319 + "&Ncedil;": { "codepoints": [325], "characters": "\u0145" }, 320 + "&Ncy;": { "codepoints": [1053], "characters": "\u041D" }, 321 + "&NegativeMediumSpace;": { "codepoints": [8203], "characters": "\u200B" }, 322 + "&NegativeThickSpace;": { "codepoints": [8203], "characters": "\u200B" }, 323 + "&NegativeThinSpace;": { "codepoints": [8203], "characters": "\u200B" }, 324 + "&NegativeVeryThinSpace;": { "codepoints": [8203], "characters": "\u200B" }, 325 + "&NestedGreaterGreater;": { "codepoints": [8811], "characters": "\u226B" }, 326 + "&NestedLessLess;": { "codepoints": [8810], "characters": "\u226A" }, 327 + "&NewLine;": { "codepoints": [10], "characters": "\u000A" }, 328 + "&Nfr;": { "codepoints": [120081], "characters": "\uD835\uDD11" }, 329 + "&NoBreak;": { "codepoints": [8288], "characters": "\u2060" }, 330 + "&NonBreakingSpace;": { "codepoints": [160], "characters": "\u00A0" }, 331 + "&Nopf;": { "codepoints": [8469], "characters": "\u2115" }, 332 + "&Not;": { "codepoints": [10988], "characters": "\u2AEC" }, 333 + "&NotCongruent;": { "codepoints": [8802], "characters": "\u2262" }, 334 + "&NotCupCap;": { "codepoints": [8813], "characters": "\u226D" }, 335 + "&NotDoubleVerticalBar;": { "codepoints": [8742], "characters": "\u2226" }, 336 + "&NotElement;": { "codepoints": [8713], "characters": "\u2209" }, 337 + "&NotEqual;": { "codepoints": [8800], "characters": "\u2260" }, 338 + "&NotEqualTilde;": { "codepoints": [8770, 824], "characters": "\u2242\u0338" }, 339 + "&NotExists;": { "codepoints": [8708], "characters": "\u2204" }, 340 + "&NotGreater;": { "codepoints": [8815], "characters": "\u226F" }, 341 + "&NotGreaterEqual;": { "codepoints": [8817], 
"characters": "\u2271" }, 342 + "&NotGreaterFullEqual;": { "codepoints": [8807, 824], "characters": "\u2267\u0338" }, 343 + "&NotGreaterGreater;": { "codepoints": [8811, 824], "characters": "\u226B\u0338" }, 344 + "&NotGreaterLess;": { "codepoints": [8825], "characters": "\u2279" }, 345 + "&NotGreaterSlantEqual;": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" }, 346 + "&NotGreaterTilde;": { "codepoints": [8821], "characters": "\u2275" }, 347 + "&NotHumpDownHump;": { "codepoints": [8782, 824], "characters": "\u224E\u0338" }, 348 + "&NotHumpEqual;": { "codepoints": [8783, 824], "characters": "\u224F\u0338" }, 349 + "&NotLeftTriangle;": { "codepoints": [8938], "characters": "\u22EA" }, 350 + "&NotLeftTriangleBar;": { "codepoints": [10703, 824], "characters": "\u29CF\u0338" }, 351 + "&NotLeftTriangleEqual;": { "codepoints": [8940], "characters": "\u22EC" }, 352 + "&NotLess;": { "codepoints": [8814], "characters": "\u226E" }, 353 + "&NotLessEqual;": { "codepoints": [8816], "characters": "\u2270" }, 354 + "&NotLessGreater;": { "codepoints": [8824], "characters": "\u2278" }, 355 + "&NotLessLess;": { "codepoints": [8810, 824], "characters": "\u226A\u0338" }, 356 + "&NotLessSlantEqual;": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" }, 357 + "&NotLessTilde;": { "codepoints": [8820], "characters": "\u2274" }, 358 + "&NotNestedGreaterGreater;": { "codepoints": [10914, 824], "characters": "\u2AA2\u0338" }, 359 + "&NotNestedLessLess;": { "codepoints": [10913, 824], "characters": "\u2AA1\u0338" }, 360 + "&NotPrecedes;": { "codepoints": [8832], "characters": "\u2280" }, 361 + "&NotPrecedesEqual;": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" }, 362 + "&NotPrecedesSlantEqual;": { "codepoints": [8928], "characters": "\u22E0" }, 363 + "&NotReverseElement;": { "codepoints": [8716], "characters": "\u220C" }, 364 + "&NotRightTriangle;": { "codepoints": [8939], "characters": "\u22EB" }, 365 + "&NotRightTriangleBar;": { "codepoints": [10704, 824], 
"characters": "\u29D0\u0338" }, 366 + "&NotRightTriangleEqual;": { "codepoints": [8941], "characters": "\u22ED" }, 367 + "&NotSquareSubset;": { "codepoints": [8847, 824], "characters": "\u228F\u0338" }, 368 + "&NotSquareSubsetEqual;": { "codepoints": [8930], "characters": "\u22E2" }, 369 + "&NotSquareSuperset;": { "codepoints": [8848, 824], "characters": "\u2290\u0338" }, 370 + "&NotSquareSupersetEqual;": { "codepoints": [8931], "characters": "\u22E3" }, 371 + "&NotSubset;": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" }, 372 + "&NotSubsetEqual;": { "codepoints": [8840], "characters": "\u2288" }, 373 + "&NotSucceeds;": { "codepoints": [8833], "characters": "\u2281" }, 374 + "&NotSucceedsEqual;": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" }, 375 + "&NotSucceedsSlantEqual;": { "codepoints": [8929], "characters": "\u22E1" }, 376 + "&NotSucceedsTilde;": { "codepoints": [8831, 824], "characters": "\u227F\u0338" }, 377 + "&NotSuperset;": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" }, 378 + "&NotSupersetEqual;": { "codepoints": [8841], "characters": "\u2289" }, 379 + "&NotTilde;": { "codepoints": [8769], "characters": "\u2241" }, 380 + "&NotTildeEqual;": { "codepoints": [8772], "characters": "\u2244" }, 381 + "&NotTildeFullEqual;": { "codepoints": [8775], "characters": "\u2247" }, 382 + "&NotTildeTilde;": { "codepoints": [8777], "characters": "\u2249" }, 383 + "&NotVerticalBar;": { "codepoints": [8740], "characters": "\u2224" }, 384 + "&Nscr;": { "codepoints": [119977], "characters": "\uD835\uDCA9" }, 385 + "&Ntilde": { "codepoints": [209], "characters": "\u00D1" }, 386 + "&Ntilde;": { "codepoints": [209], "characters": "\u00D1" }, 387 + "&Nu;": { "codepoints": [925], "characters": "\u039D" }, 388 + "&OElig;": { "codepoints": [338], "characters": "\u0152" }, 389 + "&Oacute": { "codepoints": [211], "characters": "\u00D3" }, 390 + "&Oacute;": { "codepoints": [211], "characters": "\u00D3" }, 391 + "&Ocirc": { "codepoints": [212], 
"characters": "\u00D4" }, 392 + "&Ocirc;": { "codepoints": [212], "characters": "\u00D4" }, 393 + "&Ocy;": { "codepoints": [1054], "characters": "\u041E" }, 394 + "&Odblac;": { "codepoints": [336], "characters": "\u0150" }, 395 + "&Ofr;": { "codepoints": [120082], "characters": "\uD835\uDD12" }, 396 + "&Ograve": { "codepoints": [210], "characters": "\u00D2" }, 397 + "&Ograve;": { "codepoints": [210], "characters": "\u00D2" }, 398 + "&Omacr;": { "codepoints": [332], "characters": "\u014C" }, 399 + "&Omega;": { "codepoints": [937], "characters": "\u03A9" }, 400 + "&Omicron;": { "codepoints": [927], "characters": "\u039F" }, 401 + "&Oopf;": { "codepoints": [120134], "characters": "\uD835\uDD46" }, 402 + "&OpenCurlyDoubleQuote;": { "codepoints": [8220], "characters": "\u201C" }, 403 + "&OpenCurlyQuote;": { "codepoints": [8216], "characters": "\u2018" }, 404 + "&Or;": { "codepoints": [10836], "characters": "\u2A54" }, 405 + "&Oscr;": { "codepoints": [119978], "characters": "\uD835\uDCAA" }, 406 + "&Oslash": { "codepoints": [216], "characters": "\u00D8" }, 407 + "&Oslash;": { "codepoints": [216], "characters": "\u00D8" }, 408 + "&Otilde": { "codepoints": [213], "characters": "\u00D5" }, 409 + "&Otilde;": { "codepoints": [213], "characters": "\u00D5" }, 410 + "&Otimes;": { "codepoints": [10807], "characters": "\u2A37" }, 411 + "&Ouml": { "codepoints": [214], "characters": "\u00D6" }, 412 + "&Ouml;": { "codepoints": [214], "characters": "\u00D6" }, 413 + "&OverBar;": { "codepoints": [8254], "characters": "\u203E" }, 414 + "&OverBrace;": { "codepoints": [9182], "characters": "\u23DE" }, 415 + "&OverBracket;": { "codepoints": [9140], "characters": "\u23B4" }, 416 + "&OverParenthesis;": { "codepoints": [9180], "characters": "\u23DC" }, 417 + "&PartialD;": { "codepoints": [8706], "characters": "\u2202" }, 418 + "&Pcy;": { "codepoints": [1055], "characters": "\u041F" }, 419 + "&Pfr;": { "codepoints": [120083], "characters": "\uD835\uDD13" }, 420 + "&Phi;": { "codepoints": 
[934], "characters": "\u03A6" }, 421 + "&Pi;": { "codepoints": [928], "characters": "\u03A0" }, 422 + "&PlusMinus;": { "codepoints": [177], "characters": "\u00B1" }, 423 + "&Poincareplane;": { "codepoints": [8460], "characters": "\u210C" }, 424 + "&Popf;": { "codepoints": [8473], "characters": "\u2119" }, 425 + "&Pr;": { "codepoints": [10939], "characters": "\u2ABB" }, 426 + "&Precedes;": { "codepoints": [8826], "characters": "\u227A" }, 427 + "&PrecedesEqual;": { "codepoints": [10927], "characters": "\u2AAF" }, 428 + "&PrecedesSlantEqual;": { "codepoints": [8828], "characters": "\u227C" }, 429 + "&PrecedesTilde;": { "codepoints": [8830], "characters": "\u227E" }, 430 + "&Prime;": { "codepoints": [8243], "characters": "\u2033" }, 431 + "&Product;": { "codepoints": [8719], "characters": "\u220F" }, 432 + "&Proportion;": { "codepoints": [8759], "characters": "\u2237" }, 433 + "&Proportional;": { "codepoints": [8733], "characters": "\u221D" }, 434 + "&Pscr;": { "codepoints": [119979], "characters": "\uD835\uDCAB" }, 435 + "&Psi;": { "codepoints": [936], "characters": "\u03A8" }, 436 + "&QUOT": { "codepoints": [34], "characters": "\u0022" }, 437 + "&QUOT;": { "codepoints": [34], "characters": "\u0022" }, 438 + "&Qfr;": { "codepoints": [120084], "characters": "\uD835\uDD14" }, 439 + "&Qopf;": { "codepoints": [8474], "characters": "\u211A" }, 440 + "&Qscr;": { "codepoints": [119980], "characters": "\uD835\uDCAC" }, 441 + "&RBarr;": { "codepoints": [10512], "characters": "\u2910" }, 442 + "&REG": { "codepoints": [174], "characters": "\u00AE" }, 443 + "&REG;": { "codepoints": [174], "characters": "\u00AE" }, 444 + "&Racute;": { "codepoints": [340], "characters": "\u0154" }, 445 + "&Rang;": { "codepoints": [10219], "characters": "\u27EB" }, 446 + "&Rarr;": { "codepoints": [8608], "characters": "\u21A0" }, 447 + "&Rarrtl;": { "codepoints": [10518], "characters": "\u2916" }, 448 + "&Rcaron;": { "codepoints": [344], "characters": "\u0158" }, 449 + "&Rcedil;": { "codepoints": 
[342], "characters": "\u0156" }, 450 + "&Rcy;": { "codepoints": [1056], "characters": "\u0420" }, 451 + "&Re;": { "codepoints": [8476], "characters": "\u211C" }, 452 + "&ReverseElement;": { "codepoints": [8715], "characters": "\u220B" }, 453 + "&ReverseEquilibrium;": { "codepoints": [8651], "characters": "\u21CB" }, 454 + "&ReverseUpEquilibrium;": { "codepoints": [10607], "characters": "\u296F" }, 455 + "&Rfr;": { "codepoints": [8476], "characters": "\u211C" }, 456 + "&Rho;": { "codepoints": [929], "characters": "\u03A1" }, 457 + "&RightAngleBracket;": { "codepoints": [10217], "characters": "\u27E9" }, 458 + "&RightArrow;": { "codepoints": [8594], "characters": "\u2192" }, 459 + "&RightArrowBar;": { "codepoints": [8677], "characters": "\u21E5" }, 460 + "&RightArrowLeftArrow;": { "codepoints": [8644], "characters": "\u21C4" }, 461 + "&RightCeiling;": { "codepoints": [8969], "characters": "\u2309" }, 462 + "&RightDoubleBracket;": { "codepoints": [10215], "characters": "\u27E7" }, 463 + "&RightDownTeeVector;": { "codepoints": [10589], "characters": "\u295D" }, 464 + "&RightDownVector;": { "codepoints": [8642], "characters": "\u21C2" }, 465 + "&RightDownVectorBar;": { "codepoints": [10581], "characters": "\u2955" }, 466 + "&RightFloor;": { "codepoints": [8971], "characters": "\u230B" }, 467 + "&RightTee;": { "codepoints": [8866], "characters": "\u22A2" }, 468 + "&RightTeeArrow;": { "codepoints": [8614], "characters": "\u21A6" }, 469 + "&RightTeeVector;": { "codepoints": [10587], "characters": "\u295B" }, 470 + "&RightTriangle;": { "codepoints": [8883], "characters": "\u22B3" }, 471 + "&RightTriangleBar;": { "codepoints": [10704], "characters": "\u29D0" }, 472 + "&RightTriangleEqual;": { "codepoints": [8885], "characters": "\u22B5" }, 473 + "&RightUpDownVector;": { "codepoints": [10575], "characters": "\u294F" }, 474 + "&RightUpTeeVector;": { "codepoints": [10588], "characters": "\u295C" }, 475 + "&RightUpVector;": { "codepoints": [8638], "characters": "\u21BE" }, 476 + 
"&RightUpVectorBar;": { "codepoints": [10580], "characters": "\u2954" }, 477 + "&RightVector;": { "codepoints": [8640], "characters": "\u21C0" }, 478 + "&RightVectorBar;": { "codepoints": [10579], "characters": "\u2953" }, 479 + "&Rightarrow;": { "codepoints": [8658], "characters": "\u21D2" }, 480 + "&Ropf;": { "codepoints": [8477], "characters": "\u211D" }, 481 + "&RoundImplies;": { "codepoints": [10608], "characters": "\u2970" }, 482 + "&Rrightarrow;": { "codepoints": [8667], "characters": "\u21DB" }, 483 + "&Rscr;": { "codepoints": [8475], "characters": "\u211B" }, 484 + "&Rsh;": { "codepoints": [8625], "characters": "\u21B1" }, 485 + "&RuleDelayed;": { "codepoints": [10740], "characters": "\u29F4" }, 486 + "&SHCHcy;": { "codepoints": [1065], "characters": "\u0429" }, 487 + "&SHcy;": { "codepoints": [1064], "characters": "\u0428" }, 488 + "&SOFTcy;": { "codepoints": [1068], "characters": "\u042C" }, 489 + "&Sacute;": { "codepoints": [346], "characters": "\u015A" }, 490 + "&Sc;": { "codepoints": [10940], "characters": "\u2ABC" }, 491 + "&Scaron;": { "codepoints": [352], "characters": "\u0160" }, 492 + "&Scedil;": { "codepoints": [350], "characters": "\u015E" }, 493 + "&Scirc;": { "codepoints": [348], "characters": "\u015C" }, 494 + "&Scy;": { "codepoints": [1057], "characters": "\u0421" }, 495 + "&Sfr;": { "codepoints": [120086], "characters": "\uD835\uDD16" }, 496 + "&ShortDownArrow;": { "codepoints": [8595], "characters": "\u2193" }, 497 + "&ShortLeftArrow;": { "codepoints": [8592], "characters": "\u2190" }, 498 + "&ShortRightArrow;": { "codepoints": [8594], "characters": "\u2192" }, 499 + "&ShortUpArrow;": { "codepoints": [8593], "characters": "\u2191" }, 500 + "&Sigma;": { "codepoints": [931], "characters": "\u03A3" }, 501 + "&SmallCircle;": { "codepoints": [8728], "characters": "\u2218" }, 502 + "&Sopf;": { "codepoints": [120138], "characters": "\uD835\uDD4A" }, 503 + "&Sqrt;": { "codepoints": [8730], "characters": "\u221A" }, 504 + "&Square;": { 
"codepoints": [9633], "characters": "\u25A1" }, 505 + "&SquareIntersection;": { "codepoints": [8851], "characters": "\u2293" }, 506 + "&SquareSubset;": { "codepoints": [8847], "characters": "\u228F" }, 507 + "&SquareSubsetEqual;": { "codepoints": [8849], "characters": "\u2291" }, 508 + "&SquareSuperset;": { "codepoints": [8848], "characters": "\u2290" }, 509 + "&SquareSupersetEqual;": { "codepoints": [8850], "characters": "\u2292" }, 510 + "&SquareUnion;": { "codepoints": [8852], "characters": "\u2294" }, 511 + "&Sscr;": { "codepoints": [119982], "characters": "\uD835\uDCAE" }, 512 + "&Star;": { "codepoints": [8902], "characters": "\u22C6" }, 513 + "&Sub;": { "codepoints": [8912], "characters": "\u22D0" }, 514 + "&Subset;": { "codepoints": [8912], "characters": "\u22D0" }, 515 + "&SubsetEqual;": { "codepoints": [8838], "characters": "\u2286" }, 516 + "&Succeeds;": { "codepoints": [8827], "characters": "\u227B" }, 517 + "&SucceedsEqual;": { "codepoints": [10928], "characters": "\u2AB0" }, 518 + "&SucceedsSlantEqual;": { "codepoints": [8829], "characters": "\u227D" }, 519 + "&SucceedsTilde;": { "codepoints": [8831], "characters": "\u227F" }, 520 + "&SuchThat;": { "codepoints": [8715], "characters": "\u220B" }, 521 + "&Sum;": { "codepoints": [8721], "characters": "\u2211" }, 522 + "&Sup;": { "codepoints": [8913], "characters": "\u22D1" }, 523 + "&Superset;": { "codepoints": [8835], "characters": "\u2283" }, 524 + "&SupersetEqual;": { "codepoints": [8839], "characters": "\u2287" }, 525 + "&Supset;": { "codepoints": [8913], "characters": "\u22D1" }, 526 + "&THORN": { "codepoints": [222], "characters": "\u00DE" }, 527 + "&THORN;": { "codepoints": [222], "characters": "\u00DE" }, 528 + "&TRADE;": { "codepoints": [8482], "characters": "\u2122" }, 529 + "&TSHcy;": { "codepoints": [1035], "characters": "\u040B" }, 530 + "&TScy;": { "codepoints": [1062], "characters": "\u0426" }, 531 + "&Tab;": { "codepoints": [9], "characters": "\u0009" }, 532 + "&Tau;": { "codepoints": 
[932], "characters": "\u03A4" }, 533 + "&Tcaron;": { "codepoints": [356], "characters": "\u0164" }, 534 + "&Tcedil;": { "codepoints": [354], "characters": "\u0162" }, 535 + "&Tcy;": { "codepoints": [1058], "characters": "\u0422" }, 536 + "&Tfr;": { "codepoints": [120087], "characters": "\uD835\uDD17" }, 537 + "&Therefore;": { "codepoints": [8756], "characters": "\u2234" }, 538 + "&Theta;": { "codepoints": [920], "characters": "\u0398" }, 539 + "&ThickSpace;": { "codepoints": [8287, 8202], "characters": "\u205F\u200A" }, 540 + "&ThinSpace;": { "codepoints": [8201], "characters": "\u2009" }, 541 + "&Tilde;": { "codepoints": [8764], "characters": "\u223C" }, 542 + "&TildeEqual;": { "codepoints": [8771], "characters": "\u2243" }, 543 + "&TildeFullEqual;": { "codepoints": [8773], "characters": "\u2245" }, 544 + "&TildeTilde;": { "codepoints": [8776], "characters": "\u2248" }, 545 + "&Topf;": { "codepoints": [120139], "characters": "\uD835\uDD4B" }, 546 + "&TripleDot;": { "codepoints": [8411], "characters": "\u20DB" }, 547 + "&Tscr;": { "codepoints": [119983], "characters": "\uD835\uDCAF" }, 548 + "&Tstrok;": { "codepoints": [358], "characters": "\u0166" }, 549 + "&Uacute": { "codepoints": [218], "characters": "\u00DA" }, 550 + "&Uacute;": { "codepoints": [218], "characters": "\u00DA" }, 551 + "&Uarr;": { "codepoints": [8607], "characters": "\u219F" }, 552 + "&Uarrocir;": { "codepoints": [10569], "characters": "\u2949" }, 553 + "&Ubrcy;": { "codepoints": [1038], "characters": "\u040E" }, 554 + "&Ubreve;": { "codepoints": [364], "characters": "\u016C" }, 555 + "&Ucirc": { "codepoints": [219], "characters": "\u00DB" }, 556 + "&Ucirc;": { "codepoints": [219], "characters": "\u00DB" }, 557 + "&Ucy;": { "codepoints": [1059], "characters": "\u0423" }, 558 + "&Udblac;": { "codepoints": [368], "characters": "\u0170" }, 559 + "&Ufr;": { "codepoints": [120088], "characters": "\uD835\uDD18" }, 560 + "&Ugrave": { "codepoints": [217], "characters": "\u00D9" }, 561 + "&Ugrave;": { 
"codepoints": [217], "characters": "\u00D9" }, 562 + "&Umacr;": { "codepoints": [362], "characters": "\u016A" }, 563 + "&UnderBar;": { "codepoints": [95], "characters": "\u005F" }, 564 + "&UnderBrace;": { "codepoints": [9183], "characters": "\u23DF" }, 565 + "&UnderBracket;": { "codepoints": [9141], "characters": "\u23B5" }, 566 + "&UnderParenthesis;": { "codepoints": [9181], "characters": "\u23DD" }, 567 + "&Union;": { "codepoints": [8899], "characters": "\u22C3" }, 568 + "&UnionPlus;": { "codepoints": [8846], "characters": "\u228E" }, 569 + "&Uogon;": { "codepoints": [370], "characters": "\u0172" }, 570 + "&Uopf;": { "codepoints": [120140], "characters": "\uD835\uDD4C" }, 571 + "&UpArrow;": { "codepoints": [8593], "characters": "\u2191" }, 572 + "&UpArrowBar;": { "codepoints": [10514], "characters": "\u2912" }, 573 + "&UpArrowDownArrow;": { "codepoints": [8645], "characters": "\u21C5" }, 574 + "&UpDownArrow;": { "codepoints": [8597], "characters": "\u2195" }, 575 + "&UpEquilibrium;": { "codepoints": [10606], "characters": "\u296E" }, 576 + "&UpTee;": { "codepoints": [8869], "characters": "\u22A5" }, 577 + "&UpTeeArrow;": { "codepoints": [8613], "characters": "\u21A5" }, 578 + "&Uparrow;": { "codepoints": [8657], "characters": "\u21D1" }, 579 + "&Updownarrow;": { "codepoints": [8661], "characters": "\u21D5" }, 580 + "&UpperLeftArrow;": { "codepoints": [8598], "characters": "\u2196" }, 581 + "&UpperRightArrow;": { "codepoints": [8599], "characters": "\u2197" }, 582 + "&Upsi;": { "codepoints": [978], "characters": "\u03D2" }, 583 + "&Upsilon;": { "codepoints": [933], "characters": "\u03A5" }, 584 + "&Uring;": { "codepoints": [366], "characters": "\u016E" }, 585 + "&Uscr;": { "codepoints": [119984], "characters": "\uD835\uDCB0" }, 586 + "&Utilde;": { "codepoints": [360], "characters": "\u0168" }, 587 + "&Uuml": { "codepoints": [220], "characters": "\u00DC" }, 588 + "&Uuml;": { "codepoints": [220], "characters": "\u00DC" }, 589 + "&VDash;": { "codepoints": [8875], 
"characters": "\u22AB" }, 590 + "&Vbar;": { "codepoints": [10987], "characters": "\u2AEB" }, 591 + "&Vcy;": { "codepoints": [1042], "characters": "\u0412" }, 592 + "&Vdash;": { "codepoints": [8873], "characters": "\u22A9" }, 593 + "&Vdashl;": { "codepoints": [10982], "characters": "\u2AE6" }, 594 + "&Vee;": { "codepoints": [8897], "characters": "\u22C1" }, 595 + "&Verbar;": { "codepoints": [8214], "characters": "\u2016" }, 596 + "&Vert;": { "codepoints": [8214], "characters": "\u2016" }, 597 + "&VerticalBar;": { "codepoints": [8739], "characters": "\u2223" }, 598 + "&VerticalLine;": { "codepoints": [124], "characters": "\u007C" }, 599 + "&VerticalSeparator;": { "codepoints": [10072], "characters": "\u2758" }, 600 + "&VerticalTilde;": { "codepoints": [8768], "characters": "\u2240" }, 601 + "&VeryThinSpace;": { "codepoints": [8202], "characters": "\u200A" }, 602 + "&Vfr;": { "codepoints": [120089], "characters": "\uD835\uDD19" }, 603 + "&Vopf;": { "codepoints": [120141], "characters": "\uD835\uDD4D" }, 604 + "&Vscr;": { "codepoints": [119985], "characters": "\uD835\uDCB1" }, 605 + "&Vvdash;": { "codepoints": [8874], "characters": "\u22AA" }, 606 + "&Wcirc;": { "codepoints": [372], "characters": "\u0174" }, 607 + "&Wedge;": { "codepoints": [8896], "characters": "\u22C0" }, 608 + "&Wfr;": { "codepoints": [120090], "characters": "\uD835\uDD1A" }, 609 + "&Wopf;": { "codepoints": [120142], "characters": "\uD835\uDD4E" }, 610 + "&Wscr;": { "codepoints": [119986], "characters": "\uD835\uDCB2" }, 611 + "&Xfr;": { "codepoints": [120091], "characters": "\uD835\uDD1B" }, 612 + "&Xi;": { "codepoints": [926], "characters": "\u039E" }, 613 + "&Xopf;": { "codepoints": [120143], "characters": "\uD835\uDD4F" }, 614 + "&Xscr;": { "codepoints": [119987], "characters": "\uD835\uDCB3" }, 615 + "&YAcy;": { "codepoints": [1071], "characters": "\u042F" }, 616 + "&YIcy;": { "codepoints": [1031], "characters": "\u0407" }, 617 + "&YUcy;": { "codepoints": [1070], "characters": "\u042E" }, 618 + 
"&Yacute": { "codepoints": [221], "characters": "\u00DD" }, 619 + "&Yacute;": { "codepoints": [221], "characters": "\u00DD" }, 620 + "&Ycirc;": { "codepoints": [374], "characters": "\u0176" }, 621 + "&Ycy;": { "codepoints": [1067], "characters": "\u042B" }, 622 + "&Yfr;": { "codepoints": [120092], "characters": "\uD835\uDD1C" }, 623 + "&Yopf;": { "codepoints": [120144], "characters": "\uD835\uDD50" }, 624 + "&Yscr;": { "codepoints": [119988], "characters": "\uD835\uDCB4" }, 625 + "&Yuml;": { "codepoints": [376], "characters": "\u0178" }, 626 + "&ZHcy;": { "codepoints": [1046], "characters": "\u0416" }, 627 + "&Zacute;": { "codepoints": [377], "characters": "\u0179" }, 628 + "&Zcaron;": { "codepoints": [381], "characters": "\u017D" }, 629 + "&Zcy;": { "codepoints": [1047], "characters": "\u0417" }, 630 + "&Zdot;": { "codepoints": [379], "characters": "\u017B" }, 631 + "&ZeroWidthSpace;": { "codepoints": [8203], "characters": "\u200B" }, 632 + "&Zeta;": { "codepoints": [918], "characters": "\u0396" }, 633 + "&Zfr;": { "codepoints": [8488], "characters": "\u2128" }, 634 + "&Zopf;": { "codepoints": [8484], "characters": "\u2124" }, 635 + "&Zscr;": { "codepoints": [119989], "characters": "\uD835\uDCB5" }, 636 + "&aacute": { "codepoints": [225], "characters": "\u00E1" }, 637 + "&aacute;": { "codepoints": [225], "characters": "\u00E1" }, 638 + "&abreve;": { "codepoints": [259], "characters": "\u0103" }, 639 + "&ac;": { "codepoints": [8766], "characters": "\u223E" }, 640 + "&acE;": { "codepoints": [8766, 819], "characters": "\u223E\u0333" }, 641 + "&acd;": { "codepoints": [8767], "characters": "\u223F" }, 642 + "&acirc": { "codepoints": [226], "characters": "\u00E2" }, 643 + "&acirc;": { "codepoints": [226], "characters": "\u00E2" }, 644 + "&acute": { "codepoints": [180], "characters": "\u00B4" }, 645 + "&acute;": { "codepoints": [180], "characters": "\u00B4" }, 646 + "&acy;": { "codepoints": [1072], "characters": "\u0430" }, 647 + "&aelig": { "codepoints": [230], 
"characters": "\u00E6" }, 648 + "&aelig;": { "codepoints": [230], "characters": "\u00E6" }, 649 + "&af;": { "codepoints": [8289], "characters": "\u2061" }, 650 + "&afr;": { "codepoints": [120094], "characters": "\uD835\uDD1E" }, 651 + "&agrave": { "codepoints": [224], "characters": "\u00E0" }, 652 + "&agrave;": { "codepoints": [224], "characters": "\u00E0" }, 653 + "&alefsym;": { "codepoints": [8501], "characters": "\u2135" }, 654 + "&aleph;": { "codepoints": [8501], "characters": "\u2135" }, 655 + "&alpha;": { "codepoints": [945], "characters": "\u03B1" }, 656 + "&amacr;": { "codepoints": [257], "characters": "\u0101" }, 657 + "&amalg;": { "codepoints": [10815], "characters": "\u2A3F" }, 658 + "&amp": { "codepoints": [38], "characters": "\u0026" }, 659 + "&amp;": { "codepoints": [38], "characters": "\u0026" }, 660 + "&and;": { "codepoints": [8743], "characters": "\u2227" }, 661 + "&andand;": { "codepoints": [10837], "characters": "\u2A55" }, 662 + "&andd;": { "codepoints": [10844], "characters": "\u2A5C" }, 663 + "&andslope;": { "codepoints": [10840], "characters": "\u2A58" }, 664 + "&andv;": { "codepoints": [10842], "characters": "\u2A5A" }, 665 + "&ang;": { "codepoints": [8736], "characters": "\u2220" }, 666 + "&ange;": { "codepoints": [10660], "characters": "\u29A4" }, 667 + "&angle;": { "codepoints": [8736], "characters": "\u2220" }, 668 + "&angmsd;": { "codepoints": [8737], "characters": "\u2221" }, 669 + "&angmsdaa;": { "codepoints": [10664], "characters": "\u29A8" }, 670 + "&angmsdab;": { "codepoints": [10665], "characters": "\u29A9" }, 671 + "&angmsdac;": { "codepoints": [10666], "characters": "\u29AA" }, 672 + "&angmsdad;": { "codepoints": [10667], "characters": "\u29AB" }, 673 + "&angmsdae;": { "codepoints": [10668], "characters": "\u29AC" }, 674 + "&angmsdaf;": { "codepoints": [10669], "characters": "\u29AD" }, 675 + "&angmsdag;": { "codepoints": [10670], "characters": "\u29AE" }, 676 + "&angmsdah;": { "codepoints": [10671], "characters": "\u29AF" }, 
677 + "&angrt;": { "codepoints": [8735], "characters": "\u221F" }, 678 + "&angrtvb;": { "codepoints": [8894], "characters": "\u22BE" }, 679 + "&angrtvbd;": { "codepoints": [10653], "characters": "\u299D" }, 680 + "&angsph;": { "codepoints": [8738], "characters": "\u2222" }, 681 + "&angst;": { "codepoints": [197], "characters": "\u00C5" }, 682 + "&angzarr;": { "codepoints": [9084], "characters": "\u237C" }, 683 + "&aogon;": { "codepoints": [261], "characters": "\u0105" }, 684 + "&aopf;": { "codepoints": [120146], "characters": "\uD835\uDD52" }, 685 + "&ap;": { "codepoints": [8776], "characters": "\u2248" }, 686 + "&apE;": { "codepoints": [10864], "characters": "\u2A70" }, 687 + "&apacir;": { "codepoints": [10863], "characters": "\u2A6F" }, 688 + "&ape;": { "codepoints": [8778], "characters": "\u224A" }, 689 + "&apid;": { "codepoints": [8779], "characters": "\u224B" }, 690 + "&apos;": { "codepoints": [39], "characters": "\u0027" }, 691 + "&approx;": { "codepoints": [8776], "characters": "\u2248" }, 692 + "&approxeq;": { "codepoints": [8778], "characters": "\u224A" }, 693 + "&aring": { "codepoints": [229], "characters": "\u00E5" }, 694 + "&aring;": { "codepoints": [229], "characters": "\u00E5" }, 695 + "&ascr;": { "codepoints": [119990], "characters": "\uD835\uDCB6" }, 696 + "&ast;": { "codepoints": [42], "characters": "\u002A" }, 697 + "&asymp;": { "codepoints": [8776], "characters": "\u2248" }, 698 + "&asympeq;": { "codepoints": [8781], "characters": "\u224D" }, 699 + "&atilde": { "codepoints": [227], "characters": "\u00E3" }, 700 + "&atilde;": { "codepoints": [227], "characters": "\u00E3" }, 701 + "&auml": { "codepoints": [228], "characters": "\u00E4" }, 702 + "&auml;": { "codepoints": [228], "characters": "\u00E4" }, 703 + "&awconint;": { "codepoints": [8755], "characters": "\u2233" }, 704 + "&awint;": { "codepoints": [10769], "characters": "\u2A11" }, 705 + "&bNot;": { "codepoints": [10989], "characters": "\u2AED" }, 706 + "&backcong;": { "codepoints": [8780], 
"characters": "\u224C" }, 707 + "&backepsilon;": { "codepoints": [1014], "characters": "\u03F6" }, 708 + "&backprime;": { "codepoints": [8245], "characters": "\u2035" }, 709 + "&backsim;": { "codepoints": [8765], "characters": "\u223D" }, 710 + "&backsimeq;": { "codepoints": [8909], "characters": "\u22CD" }, 711 + "&barvee;": { "codepoints": [8893], "characters": "\u22BD" }, 712 + "&barwed;": { "codepoints": [8965], "characters": "\u2305" }, 713 + "&barwedge;": { "codepoints": [8965], "characters": "\u2305" }, 714 + "&bbrk;": { "codepoints": [9141], "characters": "\u23B5" }, 715 + "&bbrktbrk;": { "codepoints": [9142], "characters": "\u23B6" }, 716 + "&bcong;": { "codepoints": [8780], "characters": "\u224C" }, 717 + "&bcy;": { "codepoints": [1073], "characters": "\u0431" }, 718 + "&bdquo;": { "codepoints": [8222], "characters": "\u201E" }, 719 + "&becaus;": { "codepoints": [8757], "characters": "\u2235" }, 720 + "&because;": { "codepoints": [8757], "characters": "\u2235" }, 721 + "&bemptyv;": { "codepoints": [10672], "characters": "\u29B0" }, 722 + "&bepsi;": { "codepoints": [1014], "characters": "\u03F6" }, 723 + "&bernou;": { "codepoints": [8492], "characters": "\u212C" }, 724 + "&beta;": { "codepoints": [946], "characters": "\u03B2" }, 725 + "&beth;": { "codepoints": [8502], "characters": "\u2136" }, 726 + "&between;": { "codepoints": [8812], "characters": "\u226C" }, 727 + "&bfr;": { "codepoints": [120095], "characters": "\uD835\uDD1F" }, 728 + "&bigcap;": { "codepoints": [8898], "characters": "\u22C2" }, 729 + "&bigcirc;": { "codepoints": [9711], "characters": "\u25EF" }, 730 + "&bigcup;": { "codepoints": [8899], "characters": "\u22C3" }, 731 + "&bigodot;": { "codepoints": [10752], "characters": "\u2A00" }, 732 + "&bigoplus;": { "codepoints": [10753], "characters": "\u2A01" }, 733 + "&bigotimes;": { "codepoints": [10754], "characters": "\u2A02" }, 734 + "&bigsqcup;": { "codepoints": [10758], "characters": "\u2A06" }, 735 + "&bigstar;": { "codepoints": [9733], 
"characters": "\u2605" }, 736 + "&bigtriangledown;": { "codepoints": [9661], "characters": "\u25BD" }, 737 + "&bigtriangleup;": { "codepoints": [9651], "characters": "\u25B3" }, 738 + "&biguplus;": { "codepoints": [10756], "characters": "\u2A04" }, 739 + "&bigvee;": { "codepoints": [8897], "characters": "\u22C1" }, 740 + "&bigwedge;": { "codepoints": [8896], "characters": "\u22C0" }, 741 + "&bkarow;": { "codepoints": [10509], "characters": "\u290D" }, 742 + "&blacklozenge;": { "codepoints": [10731], "characters": "\u29EB" }, 743 + "&blacksquare;": { "codepoints": [9642], "characters": "\u25AA" }, 744 + "&blacktriangle;": { "codepoints": [9652], "characters": "\u25B4" }, 745 + "&blacktriangledown;": { "codepoints": [9662], "characters": "\u25BE" }, 746 + "&blacktriangleleft;": { "codepoints": [9666], "characters": "\u25C2" }, 747 + "&blacktriangleright;": { "codepoints": [9656], "characters": "\u25B8" }, 748 + "&blank;": { "codepoints": [9251], "characters": "\u2423" }, 749 + "&blk12;": { "codepoints": [9618], "characters": "\u2592" }, 750 + "&blk14;": { "codepoints": [9617], "characters": "\u2591" }, 751 + "&blk34;": { "codepoints": [9619], "characters": "\u2593" }, 752 + "&block;": { "codepoints": [9608], "characters": "\u2588" }, 753 + "&bne;": { "codepoints": [61, 8421], "characters": "\u003D\u20E5" }, 754 + "&bnequiv;": { "codepoints": [8801, 8421], "characters": "\u2261\u20E5" }, 755 + "&bnot;": { "codepoints": [8976], "characters": "\u2310" }, 756 + "&bopf;": { "codepoints": [120147], "characters": "\uD835\uDD53" }, 757 + "&bot;": { "codepoints": [8869], "characters": "\u22A5" }, 758 + "&bottom;": { "codepoints": [8869], "characters": "\u22A5" }, 759 + "&bowtie;": { "codepoints": [8904], "characters": "\u22C8" }, 760 + "&boxDL;": { "codepoints": [9559], "characters": "\u2557" }, 761 + "&boxDR;": { "codepoints": [9556], "characters": "\u2554" }, 762 + "&boxDl;": { "codepoints": [9558], "characters": "\u2556" }, 763 + "&boxDr;": { "codepoints": [9555], 
"characters": "\u2553" }, 764 + "&boxH;": { "codepoints": [9552], "characters": "\u2550" }, 765 + "&boxHD;": { "codepoints": [9574], "characters": "\u2566" }, 766 + "&boxHU;": { "codepoints": [9577], "characters": "\u2569" }, 767 + "&boxHd;": { "codepoints": [9572], "characters": "\u2564" }, 768 + "&boxHu;": { "codepoints": [9575], "characters": "\u2567" }, 769 + "&boxUL;": { "codepoints": [9565], "characters": "\u255D" }, 770 + "&boxUR;": { "codepoints": [9562], "characters": "\u255A" }, 771 + "&boxUl;": { "codepoints": [9564], "characters": "\u255C" }, 772 + "&boxUr;": { "codepoints": [9561], "characters": "\u2559" }, 773 + "&boxV;": { "codepoints": [9553], "characters": "\u2551" }, 774 + "&boxVH;": { "codepoints": [9580], "characters": "\u256C" }, 775 + "&boxVL;": { "codepoints": [9571], "characters": "\u2563" }, 776 + "&boxVR;": { "codepoints": [9568], "characters": "\u2560" }, 777 + "&boxVh;": { "codepoints": [9579], "characters": "\u256B" }, 778 + "&boxVl;": { "codepoints": [9570], "characters": "\u2562" }, 779 + "&boxVr;": { "codepoints": [9567], "characters": "\u255F" }, 780 + "&boxbox;": { "codepoints": [10697], "characters": "\u29C9" }, 781 + "&boxdL;": { "codepoints": [9557], "characters": "\u2555" }, 782 + "&boxdR;": { "codepoints": [9554], "characters": "\u2552" }, 783 + "&boxdl;": { "codepoints": [9488], "characters": "\u2510" }, 784 + "&boxdr;": { "codepoints": [9484], "characters": "\u250C" }, 785 + "&boxh;": { "codepoints": [9472], "characters": "\u2500" }, 786 + "&boxhD;": { "codepoints": [9573], "characters": "\u2565" }, 787 + "&boxhU;": { "codepoints": [9576], "characters": "\u2568" }, 788 + "&boxhd;": { "codepoints": [9516], "characters": "\u252C" }, 789 + "&boxhu;": { "codepoints": [9524], "characters": "\u2534" }, 790 + "&boxminus;": { "codepoints": [8863], "characters": "\u229F" }, 791 + "&boxplus;": { "codepoints": [8862], "characters": "\u229E" }, 792 + "&boxtimes;": { "codepoints": [8864], "characters": "\u22A0" }, 793 + "&boxuL;": { 
"codepoints": [9563], "characters": "\u255B" }, 794 + "&boxuR;": { "codepoints": [9560], "characters": "\u2558" }, 795 + "&boxul;": { "codepoints": [9496], "characters": "\u2518" }, 796 + "&boxur;": { "codepoints": [9492], "characters": "\u2514" }, 797 + "&boxv;": { "codepoints": [9474], "characters": "\u2502" }, 798 + "&boxvH;": { "codepoints": [9578], "characters": "\u256A" }, 799 + "&boxvL;": { "codepoints": [9569], "characters": "\u2561" }, 800 + "&boxvR;": { "codepoints": [9566], "characters": "\u255E" }, 801 + "&boxvh;": { "codepoints": [9532], "characters": "\u253C" }, 802 + "&boxvl;": { "codepoints": [9508], "characters": "\u2524" }, 803 + "&boxvr;": { "codepoints": [9500], "characters": "\u251C" }, 804 + "&bprime;": { "codepoints": [8245], "characters": "\u2035" }, 805 + "&breve;": { "codepoints": [728], "characters": "\u02D8" }, 806 + "&brvbar": { "codepoints": [166], "characters": "\u00A6" }, 807 + "&brvbar;": { "codepoints": [166], "characters": "\u00A6" }, 808 + "&bscr;": { "codepoints": [119991], "characters": "\uD835\uDCB7" }, 809 + "&bsemi;": { "codepoints": [8271], "characters": "\u204F" }, 810 + "&bsim;": { "codepoints": [8765], "characters": "\u223D" }, 811 + "&bsime;": { "codepoints": [8909], "characters": "\u22CD" }, 812 + "&bsol;": { "codepoints": [92], "characters": "\u005C" }, 813 + "&bsolb;": { "codepoints": [10693], "characters": "\u29C5" }, 814 + "&bsolhsub;": { "codepoints": [10184], "characters": "\u27C8" }, 815 + "&bull;": { "codepoints": [8226], "characters": "\u2022" }, 816 + "&bullet;": { "codepoints": [8226], "characters": "\u2022" }, 817 + "&bump;": { "codepoints": [8782], "characters": "\u224E" }, 818 + "&bumpE;": { "codepoints": [10926], "characters": "\u2AAE" }, 819 + "&bumpe;": { "codepoints": [8783], "characters": "\u224F" }, 820 + "&bumpeq;": { "codepoints": [8783], "characters": "\u224F" }, 821 + "&cacute;": { "codepoints": [263], "characters": "\u0107" }, 822 + "&cap;": { "codepoints": [8745], "characters": "\u2229" }, 823 
+ "&capand;": { "codepoints": [10820], "characters": "\u2A44" }, 824 + "&capbrcup;": { "codepoints": [10825], "characters": "\u2A49" }, 825 + "&capcap;": { "codepoints": [10827], "characters": "\u2A4B" }, 826 + "&capcup;": { "codepoints": [10823], "characters": "\u2A47" }, 827 + "&capdot;": { "codepoints": [10816], "characters": "\u2A40" }, 828 + "&caps;": { "codepoints": [8745, 65024], "characters": "\u2229\uFE00" }, 829 + "&caret;": { "codepoints": [8257], "characters": "\u2041" }, 830 + "&caron;": { "codepoints": [711], "characters": "\u02C7" }, 831 + "&ccaps;": { "codepoints": [10829], "characters": "\u2A4D" }, 832 + "&ccaron;": { "codepoints": [269], "characters": "\u010D" }, 833 + "&ccedil": { "codepoints": [231], "characters": "\u00E7" }, 834 + "&ccedil;": { "codepoints": [231], "characters": "\u00E7" }, 835 + "&ccirc;": { "codepoints": [265], "characters": "\u0109" }, 836 + "&ccups;": { "codepoints": [10828], "characters": "\u2A4C" }, 837 + "&ccupssm;": { "codepoints": [10832], "characters": "\u2A50" }, 838 + "&cdot;": { "codepoints": [267], "characters": "\u010B" }, 839 + "&cedil": { "codepoints": [184], "characters": "\u00B8" }, 840 + "&cedil;": { "codepoints": [184], "characters": "\u00B8" }, 841 + "&cemptyv;": { "codepoints": [10674], "characters": "\u29B2" }, 842 + "&cent": { "codepoints": [162], "characters": "\u00A2" }, 843 + "&cent;": { "codepoints": [162], "characters": "\u00A2" }, 844 + "&centerdot;": { "codepoints": [183], "characters": "\u00B7" }, 845 + "&cfr;": { "codepoints": [120096], "characters": "\uD835\uDD20" }, 846 + "&chcy;": { "codepoints": [1095], "characters": "\u0447" }, 847 + "&check;": { "codepoints": [10003], "characters": "\u2713" }, 848 + "&checkmark;": { "codepoints": [10003], "characters": "\u2713" }, 849 + "&chi;": { "codepoints": [967], "characters": "\u03C7" }, 850 + "&cir;": { "codepoints": [9675], "characters": "\u25CB" }, 851 + "&cirE;": { "codepoints": [10691], "characters": "\u29C3" }, 852 + "&circ;": { "codepoints": 
[710], "characters": "\u02C6" }, 853 + "&circeq;": { "codepoints": [8791], "characters": "\u2257" }, 854 + "&circlearrowleft;": { "codepoints": [8634], "characters": "\u21BA" }, 855 + "&circlearrowright;": { "codepoints": [8635], "characters": "\u21BB" }, 856 + "&circledR;": { "codepoints": [174], "characters": "\u00AE" }, 857 + "&circledS;": { "codepoints": [9416], "characters": "\u24C8" }, 858 + "&circledast;": { "codepoints": [8859], "characters": "\u229B" }, 859 + "&circledcirc;": { "codepoints": [8858], "characters": "\u229A" }, 860 + "&circleddash;": { "codepoints": [8861], "characters": "\u229D" }, 861 + "&cire;": { "codepoints": [8791], "characters": "\u2257" }, 862 + "&cirfnint;": { "codepoints": [10768], "characters": "\u2A10" }, 863 + "&cirmid;": { "codepoints": [10991], "characters": "\u2AEF" }, 864 + "&cirscir;": { "codepoints": [10690], "characters": "\u29C2" }, 865 + "&clubs;": { "codepoints": [9827], "characters": "\u2663" }, 866 + "&clubsuit;": { "codepoints": [9827], "characters": "\u2663" }, 867 + "&colon;": { "codepoints": [58], "characters": "\u003A" }, 868 + "&colone;": { "codepoints": [8788], "characters": "\u2254" }, 869 + "&coloneq;": { "codepoints": [8788], "characters": "\u2254" }, 870 + "&comma;": { "codepoints": [44], "characters": "\u002C" }, 871 + "&commat;": { "codepoints": [64], "characters": "\u0040" }, 872 + "&comp;": { "codepoints": [8705], "characters": "\u2201" }, 873 + "&compfn;": { "codepoints": [8728], "characters": "\u2218" }, 874 + "&complement;": { "codepoints": [8705], "characters": "\u2201" }, 875 + "&complexes;": { "codepoints": [8450], "characters": "\u2102" }, 876 + "&cong;": { "codepoints": [8773], "characters": "\u2245" }, 877 + "&congdot;": { "codepoints": [10861], "characters": "\u2A6D" }, 878 + "&conint;": { "codepoints": [8750], "characters": "\u222E" }, 879 + "&copf;": { "codepoints": [120148], "characters": "\uD835\uDD54" }, 880 + "&coprod;": { "codepoints": [8720], "characters": "\u2210" }, 881 + "&copy": { 
"codepoints": [169], "characters": "\u00A9" }, 882 + "&copy;": { "codepoints": [169], "characters": "\u00A9" }, 883 + "&copysr;": { "codepoints": [8471], "characters": "\u2117" }, 884 + "&crarr;": { "codepoints": [8629], "characters": "\u21B5" }, 885 + "&cross;": { "codepoints": [10007], "characters": "\u2717" }, 886 + "&cscr;": { "codepoints": [119992], "characters": "\uD835\uDCB8" }, 887 + "&csub;": { "codepoints": [10959], "characters": "\u2ACF" }, 888 + "&csube;": { "codepoints": [10961], "characters": "\u2AD1" }, 889 + "&csup;": { "codepoints": [10960], "characters": "\u2AD0" }, 890 + "&csupe;": { "codepoints": [10962], "characters": "\u2AD2" }, 891 + "&ctdot;": { "codepoints": [8943], "characters": "\u22EF" }, 892 + "&cudarrl;": { "codepoints": [10552], "characters": "\u2938" }, 893 + "&cudarrr;": { "codepoints": [10549], "characters": "\u2935" }, 894 + "&cuepr;": { "codepoints": [8926], "characters": "\u22DE" }, 895 + "&cuesc;": { "codepoints": [8927], "characters": "\u22DF" }, 896 + "&cularr;": { "codepoints": [8630], "characters": "\u21B6" }, 897 + "&cularrp;": { "codepoints": [10557], "characters": "\u293D" }, 898 + "&cup;": { "codepoints": [8746], "characters": "\u222A" }, 899 + "&cupbrcap;": { "codepoints": [10824], "characters": "\u2A48" }, 900 + "&cupcap;": { "codepoints": [10822], "characters": "\u2A46" }, 901 + "&cupcup;": { "codepoints": [10826], "characters": "\u2A4A" }, 902 + "&cupdot;": { "codepoints": [8845], "characters": "\u228D" }, 903 + "&cupor;": { "codepoints": [10821], "characters": "\u2A45" }, 904 + "&cups;": { "codepoints": [8746, 65024], "characters": "\u222A\uFE00" }, 905 + "&curarr;": { "codepoints": [8631], "characters": "\u21B7" }, 906 + "&curarrm;": { "codepoints": [10556], "characters": "\u293C" }, 907 + "&curlyeqprec;": { "codepoints": [8926], "characters": "\u22DE" }, 908 + "&curlyeqsucc;": { "codepoints": [8927], "characters": "\u22DF" }, 909 + "&curlyvee;": { "codepoints": [8910], "characters": "\u22CE" }, 910 + 
"&curlywedge;": { "codepoints": [8911], "characters": "\u22CF" }, 911 + "&curren": { "codepoints": [164], "characters": "\u00A4" }, 912 + "&curren;": { "codepoints": [164], "characters": "\u00A4" }, 913 + "&curvearrowleft;": { "codepoints": [8630], "characters": "\u21B6" }, 914 + "&curvearrowright;": { "codepoints": [8631], "characters": "\u21B7" }, 915 + "&cuvee;": { "codepoints": [8910], "characters": "\u22CE" }, 916 + "&cuwed;": { "codepoints": [8911], "characters": "\u22CF" }, 917 + "&cwconint;": { "codepoints": [8754], "characters": "\u2232" }, 918 + "&cwint;": { "codepoints": [8753], "characters": "\u2231" }, 919 + "&cylcty;": { "codepoints": [9005], "characters": "\u232D" }, 920 + "&dArr;": { "codepoints": [8659], "characters": "\u21D3" }, 921 + "&dHar;": { "codepoints": [10597], "characters": "\u2965" }, 922 + "&dagger;": { "codepoints": [8224], "characters": "\u2020" }, 923 + "&daleth;": { "codepoints": [8504], "characters": "\u2138" }, 924 + "&darr;": { "codepoints": [8595], "characters": "\u2193" }, 925 + "&dash;": { "codepoints": [8208], "characters": "\u2010" }, 926 + "&dashv;": { "codepoints": [8867], "characters": "\u22A3" }, 927 + "&dbkarow;": { "codepoints": [10511], "characters": "\u290F" }, 928 + "&dblac;": { "codepoints": [733], "characters": "\u02DD" }, 929 + "&dcaron;": { "codepoints": [271], "characters": "\u010F" }, 930 + "&dcy;": { "codepoints": [1076], "characters": "\u0434" }, 931 + "&dd;": { "codepoints": [8518], "characters": "\u2146" }, 932 + "&ddagger;": { "codepoints": [8225], "characters": "\u2021" }, 933 + "&ddarr;": { "codepoints": [8650], "characters": "\u21CA" }, 934 + "&ddotseq;": { "codepoints": [10871], "characters": "\u2A77" }, 935 + "&deg": { "codepoints": [176], "characters": "\u00B0" }, 936 + "&deg;": { "codepoints": [176], "characters": "\u00B0" }, 937 + "&delta;": { "codepoints": [948], "characters": "\u03B4" }, 938 + "&demptyv;": { "codepoints": [10673], "characters": "\u29B1" }, 939 + "&dfisht;": { "codepoints": 
[10623], "characters": "\u297F" }, 940 + "&dfr;": { "codepoints": [120097], "characters": "\uD835\uDD21" }, 941 + "&dharl;": { "codepoints": [8643], "characters": "\u21C3" }, 942 + "&dharr;": { "codepoints": [8642], "characters": "\u21C2" }, 943 + "&diam;": { "codepoints": [8900], "characters": "\u22C4" }, 944 + "&diamond;": { "codepoints": [8900], "characters": "\u22C4" }, 945 + "&diamondsuit;": { "codepoints": [9830], "characters": "\u2666" }, 946 + "&diams;": { "codepoints": [9830], "characters": "\u2666" }, 947 + "&die;": { "codepoints": [168], "characters": "\u00A8" }, 948 + "&digamma;": { "codepoints": [989], "characters": "\u03DD" }, 949 + "&disin;": { "codepoints": [8946], "characters": "\u22F2" }, 950 + "&div;": { "codepoints": [247], "characters": "\u00F7" }, 951 + "&divide": { "codepoints": [247], "characters": "\u00F7" }, 952 + "&divide;": { "codepoints": [247], "characters": "\u00F7" }, 953 + "&divideontimes;": { "codepoints": [8903], "characters": "\u22C7" }, 954 + "&divonx;": { "codepoints": [8903], "characters": "\u22C7" }, 955 + "&djcy;": { "codepoints": [1106], "characters": "\u0452" }, 956 + "&dlcorn;": { "codepoints": [8990], "characters": "\u231E" }, 957 + "&dlcrop;": { "codepoints": [8973], "characters": "\u230D" }, 958 + "&dollar;": { "codepoints": [36], "characters": "\u0024" }, 959 + "&dopf;": { "codepoints": [120149], "characters": "\uD835\uDD55" }, 960 + "&dot;": { "codepoints": [729], "characters": "\u02D9" }, 961 + "&doteq;": { "codepoints": [8784], "characters": "\u2250" }, 962 + "&doteqdot;": { "codepoints": [8785], "characters": "\u2251" }, 963 + "&dotminus;": { "codepoints": [8760], "characters": "\u2238" }, 964 + "&dotplus;": { "codepoints": [8724], "characters": "\u2214" }, 965 + "&dotsquare;": { "codepoints": [8865], "characters": "\u22A1" }, 966 + "&doublebarwedge;": { "codepoints": [8966], "characters": "\u2306" }, 967 + "&downarrow;": { "codepoints": [8595], "characters": "\u2193" }, 968 + "&downdownarrows;": { "codepoints": 
[8650], "characters": "\u21CA" }, 969 + "&downharpoonleft;": { "codepoints": [8643], "characters": "\u21C3" }, 970 + "&downharpoonright;": { "codepoints": [8642], "characters": "\u21C2" }, 971 + "&drbkarow;": { "codepoints": [10512], "characters": "\u2910" }, 972 + "&drcorn;": { "codepoints": [8991], "characters": "\u231F" }, 973 + "&drcrop;": { "codepoints": [8972], "characters": "\u230C" }, 974 + "&dscr;": { "codepoints": [119993], "characters": "\uD835\uDCB9" }, 975 + "&dscy;": { "codepoints": [1109], "characters": "\u0455" }, 976 + "&dsol;": { "codepoints": [10742], "characters": "\u29F6" }, 977 + "&dstrok;": { "codepoints": [273], "characters": "\u0111" }, 978 + "&dtdot;": { "codepoints": [8945], "characters": "\u22F1" }, 979 + "&dtri;": { "codepoints": [9663], "characters": "\u25BF" }, 980 + "&dtrif;": { "codepoints": [9662], "characters": "\u25BE" }, 981 + "&duarr;": { "codepoints": [8693], "characters": "\u21F5" }, 982 + "&duhar;": { "codepoints": [10607], "characters": "\u296F" }, 983 + "&dwangle;": { "codepoints": [10662], "characters": "\u29A6" }, 984 + "&dzcy;": { "codepoints": [1119], "characters": "\u045F" }, 985 + "&dzigrarr;": { "codepoints": [10239], "characters": "\u27FF" }, 986 + "&eDDot;": { "codepoints": [10871], "characters": "\u2A77" }, 987 + "&eDot;": { "codepoints": [8785], "characters": "\u2251" }, 988 + "&eacute": { "codepoints": [233], "characters": "\u00E9" }, 989 + "&eacute;": { "codepoints": [233], "characters": "\u00E9" }, 990 + "&easter;": { "codepoints": [10862], "characters": "\u2A6E" }, 991 + "&ecaron;": { "codepoints": [283], "characters": "\u011B" }, 992 + "&ecir;": { "codepoints": [8790], "characters": "\u2256" }, 993 + "&ecirc": { "codepoints": [234], "characters": "\u00EA" }, 994 + "&ecirc;": { "codepoints": [234], "characters": "\u00EA" }, 995 + "&ecolon;": { "codepoints": [8789], "characters": "\u2255" }, 996 + "&ecy;": { "codepoints": [1101], "characters": "\u044D" }, 997 + "&edot;": { "codepoints": [279], "characters": 
"\u0117" }, 998 + "&ee;": { "codepoints": [8519], "characters": "\u2147" }, 999 + "&efDot;": { "codepoints": [8786], "characters": "\u2252" }, 1000 + "&efr;": { "codepoints": [120098], "characters": "\uD835\uDD22" }, 1001 + "&eg;": { "codepoints": [10906], "characters": "\u2A9A" }, 1002 + "&egrave": { "codepoints": [232], "characters": "\u00E8" }, 1003 + "&egrave;": { "codepoints": [232], "characters": "\u00E8" }, 1004 + "&egs;": { "codepoints": [10902], "characters": "\u2A96" }, 1005 + "&egsdot;": { "codepoints": [10904], "characters": "\u2A98" }, 1006 + "&el;": { "codepoints": [10905], "characters": "\u2A99" }, 1007 + "&elinters;": { "codepoints": [9191], "characters": "\u23E7" }, 1008 + "&ell;": { "codepoints": [8467], "characters": "\u2113" }, 1009 + "&els;": { "codepoints": [10901], "characters": "\u2A95" }, 1010 + "&elsdot;": { "codepoints": [10903], "characters": "\u2A97" }, 1011 + "&emacr;": { "codepoints": [275], "characters": "\u0113" }, 1012 + "&empty;": { "codepoints": [8709], "characters": "\u2205" }, 1013 + "&emptyset;": { "codepoints": [8709], "characters": "\u2205" }, 1014 + "&emptyv;": { "codepoints": [8709], "characters": "\u2205" }, 1015 + "&emsp13;": { "codepoints": [8196], "characters": "\u2004" }, 1016 + "&emsp14;": { "codepoints": [8197], "characters": "\u2005" }, 1017 + "&emsp;": { "codepoints": [8195], "characters": "\u2003" }, 1018 + "&eng;": { "codepoints": [331], "characters": "\u014B" }, 1019 + "&ensp;": { "codepoints": [8194], "characters": "\u2002" }, 1020 + "&eogon;": { "codepoints": [281], "characters": "\u0119" }, 1021 + "&eopf;": { "codepoints": [120150], "characters": "\uD835\uDD56" }, 1022 + "&epar;": { "codepoints": [8917], "characters": "\u22D5" }, 1023 + "&eparsl;": { "codepoints": [10723], "characters": "\u29E3" }, 1024 + "&eplus;": { "codepoints": [10865], "characters": "\u2A71" }, 1025 + "&epsi;": { "codepoints": [949], "characters": "\u03B5" }, 1026 + "&epsilon;": { "codepoints": [949], "characters": "\u03B5" }, 1027 + 
"&epsiv;": { "codepoints": [1013], "characters": "\u03F5" }, 1028 + "&eqcirc;": { "codepoints": [8790], "characters": "\u2256" }, 1029 + "&eqcolon;": { "codepoints": [8789], "characters": "\u2255" }, 1030 + "&eqsim;": { "codepoints": [8770], "characters": "\u2242" }, 1031 + "&eqslantgtr;": { "codepoints": [10902], "characters": "\u2A96" }, 1032 + "&eqslantless;": { "codepoints": [10901], "characters": "\u2A95" }, 1033 + "&equals;": { "codepoints": [61], "characters": "\u003D" }, 1034 + "&equest;": { "codepoints": [8799], "characters": "\u225F" }, 1035 + "&equiv;": { "codepoints": [8801], "characters": "\u2261" }, 1036 + "&equivDD;": { "codepoints": [10872], "characters": "\u2A78" }, 1037 + "&eqvparsl;": { "codepoints": [10725], "characters": "\u29E5" }, 1038 + "&erDot;": { "codepoints": [8787], "characters": "\u2253" }, 1039 + "&erarr;": { "codepoints": [10609], "characters": "\u2971" }, 1040 + "&escr;": { "codepoints": [8495], "characters": "\u212F" }, 1041 + "&esdot;": { "codepoints": [8784], "characters": "\u2250" }, 1042 + "&esim;": { "codepoints": [8770], "characters": "\u2242" }, 1043 + "&eta;": { "codepoints": [951], "characters": "\u03B7" }, 1044 + "&eth": { "codepoints": [240], "characters": "\u00F0" }, 1045 + "&eth;": { "codepoints": [240], "characters": "\u00F0" }, 1046 + "&euml": { "codepoints": [235], "characters": "\u00EB" }, 1047 + "&euml;": { "codepoints": [235], "characters": "\u00EB" }, 1048 + "&euro;": { "codepoints": [8364], "characters": "\u20AC" }, 1049 + "&excl;": { "codepoints": [33], "characters": "\u0021" }, 1050 + "&exist;": { "codepoints": [8707], "characters": "\u2203" }, 1051 + "&expectation;": { "codepoints": [8496], "characters": "\u2130" }, 1052 + "&exponentiale;": { "codepoints": [8519], "characters": "\u2147" }, 1053 + "&fallingdotseq;": { "codepoints": [8786], "characters": "\u2252" }, 1054 + "&fcy;": { "codepoints": [1092], "characters": "\u0444" }, 1055 + "&female;": { "codepoints": [9792], "characters": "\u2640" }, 1056 + 
"&ffilig;": { "codepoints": [64259], "characters": "\uFB03" }, 1057 + "&fflig;": { "codepoints": [64256], "characters": "\uFB00" }, 1058 + "&ffllig;": { "codepoints": [64260], "characters": "\uFB04" }, 1059 + "&ffr;": { "codepoints": [120099], "characters": "\uD835\uDD23" }, 1060 + "&filig;": { "codepoints": [64257], "characters": "\uFB01" }, 1061 + "&fjlig;": { "codepoints": [102, 106], "characters": "\u0066\u006A" }, 1062 + "&flat;": { "codepoints": [9837], "characters": "\u266D" }, 1063 + "&fllig;": { "codepoints": [64258], "characters": "\uFB02" }, 1064 + "&fltns;": { "codepoints": [9649], "characters": "\u25B1" }, 1065 + "&fnof;": { "codepoints": [402], "characters": "\u0192" }, 1066 + "&fopf;": { "codepoints": [120151], "characters": "\uD835\uDD57" }, 1067 + "&forall;": { "codepoints": [8704], "characters": "\u2200" }, 1068 + "&fork;": { "codepoints": [8916], "characters": "\u22D4" }, 1069 + "&forkv;": { "codepoints": [10969], "characters": "\u2AD9" }, 1070 + "&fpartint;": { "codepoints": [10765], "characters": "\u2A0D" }, 1071 + "&frac12": { "codepoints": [189], "characters": "\u00BD" }, 1072 + "&frac12;": { "codepoints": [189], "characters": "\u00BD" }, 1073 + "&frac13;": { "codepoints": [8531], "characters": "\u2153" }, 1074 + "&frac14": { "codepoints": [188], "characters": "\u00BC" }, 1075 + "&frac14;": { "codepoints": [188], "characters": "\u00BC" }, 1076 + "&frac15;": { "codepoints": [8533], "characters": "\u2155" }, 1077 + "&frac16;": { "codepoints": [8537], "characters": "\u2159" }, 1078 + "&frac18;": { "codepoints": [8539], "characters": "\u215B" }, 1079 + "&frac23;": { "codepoints": [8532], "characters": "\u2154" }, 1080 + "&frac25;": { "codepoints": [8534], "characters": "\u2156" }, 1081 + "&frac34": { "codepoints": [190], "characters": "\u00BE" }, 1082 + "&frac34;": { "codepoints": [190], "characters": "\u00BE" }, 1083 + "&frac35;": { "codepoints": [8535], "characters": "\u2157" }, 1084 + "&frac38;": { "codepoints": [8540], "characters": "\u215C" 
}, 1085 + "&frac45;": { "codepoints": [8536], "characters": "\u2158" }, 1086 + "&frac56;": { "codepoints": [8538], "characters": "\u215A" }, 1087 + "&frac58;": { "codepoints": [8541], "characters": "\u215D" }, 1088 + "&frac78;": { "codepoints": [8542], "characters": "\u215E" }, 1089 + "&frasl;": { "codepoints": [8260], "characters": "\u2044" }, 1090 + "&frown;": { "codepoints": [8994], "characters": "\u2322" }, 1091 + "&fscr;": { "codepoints": [119995], "characters": "\uD835\uDCBB" }, 1092 + "&gE;": { "codepoints": [8807], "characters": "\u2267" }, 1093 + "&gEl;": { "codepoints": [10892], "characters": "\u2A8C" }, 1094 + "&gacute;": { "codepoints": [501], "characters": "\u01F5" }, 1095 + "&gamma;": { "codepoints": [947], "characters": "\u03B3" }, 1096 + "&gammad;": { "codepoints": [989], "characters": "\u03DD" }, 1097 + "&gap;": { "codepoints": [10886], "characters": "\u2A86" }, 1098 + "&gbreve;": { "codepoints": [287], "characters": "\u011F" }, 1099 + "&gcirc;": { "codepoints": [285], "characters": "\u011D" }, 1100 + "&gcy;": { "codepoints": [1075], "characters": "\u0433" }, 1101 + "&gdot;": { "codepoints": [289], "characters": "\u0121" }, 1102 + "&ge;": { "codepoints": [8805], "characters": "\u2265" }, 1103 + "&gel;": { "codepoints": [8923], "characters": "\u22DB" }, 1104 + "&geq;": { "codepoints": [8805], "characters": "\u2265" }, 1105 + "&geqq;": { "codepoints": [8807], "characters": "\u2267" }, 1106 + "&geqslant;": { "codepoints": [10878], "characters": "\u2A7E" }, 1107 + "&ges;": { "codepoints": [10878], "characters": "\u2A7E" }, 1108 + "&gescc;": { "codepoints": [10921], "characters": "\u2AA9" }, 1109 + "&gesdot;": { "codepoints": [10880], "characters": "\u2A80" }, 1110 + "&gesdoto;": { "codepoints": [10882], "characters": "\u2A82" }, 1111 + "&gesdotol;": { "codepoints": [10884], "characters": "\u2A84" }, 1112 + "&gesl;": { "codepoints": [8923, 65024], "characters": "\u22DB\uFE00" }, 1113 + "&gesles;": { "codepoints": [10900], "characters": "\u2A94" }, 1114 
+ "&gfr;": { "codepoints": [120100], "characters": "\uD835\uDD24" }, 1115 + "&gg;": { "codepoints": [8811], "characters": "\u226B" }, 1116 + "&ggg;": { "codepoints": [8921], "characters": "\u22D9" }, 1117 + "&gimel;": { "codepoints": [8503], "characters": "\u2137" }, 1118 + "&gjcy;": { "codepoints": [1107], "characters": "\u0453" }, 1119 + "&gl;": { "codepoints": [8823], "characters": "\u2277" }, 1120 + "&glE;": { "codepoints": [10898], "characters": "\u2A92" }, 1121 + "&gla;": { "codepoints": [10917], "characters": "\u2AA5" }, 1122 + "&glj;": { "codepoints": [10916], "characters": "\u2AA4" }, 1123 + "&gnE;": { "codepoints": [8809], "characters": "\u2269" }, 1124 + "&gnap;": { "codepoints": [10890], "characters": "\u2A8A" }, 1125 + "&gnapprox;": { "codepoints": [10890], "characters": "\u2A8A" }, 1126 + "&gne;": { "codepoints": [10888], "characters": "\u2A88" }, 1127 + "&gneq;": { "codepoints": [10888], "characters": "\u2A88" }, 1128 + "&gneqq;": { "codepoints": [8809], "characters": "\u2269" }, 1129 + "&gnsim;": { "codepoints": [8935], "characters": "\u22E7" }, 1130 + "&gopf;": { "codepoints": [120152], "characters": "\uD835\uDD58" }, 1131 + "&grave;": { "codepoints": [96], "characters": "\u0060" }, 1132 + "&gscr;": { "codepoints": [8458], "characters": "\u210A" }, 1133 + "&gsim;": { "codepoints": [8819], "characters": "\u2273" }, 1134 + "&gsime;": { "codepoints": [10894], "characters": "\u2A8E" }, 1135 + "&gsiml;": { "codepoints": [10896], "characters": "\u2A90" }, 1136 + "&gt": { "codepoints": [62], "characters": "\u003E" }, 1137 + "&gt;": { "codepoints": [62], "characters": "\u003E" }, 1138 + "&gtcc;": { "codepoints": [10919], "characters": "\u2AA7" }, 1139 + "&gtcir;": { "codepoints": [10874], "characters": "\u2A7A" }, 1140 + "&gtdot;": { "codepoints": [8919], "characters": "\u22D7" }, 1141 + "&gtlPar;": { "codepoints": [10645], "characters": "\u2995" }, 1142 + "&gtquest;": { "codepoints": [10876], "characters": "\u2A7C" }, 1143 + "&gtrapprox;": { "codepoints": 
[10886], "characters": "\u2A86" }, 1144 + "&gtrarr;": { "codepoints": [10616], "characters": "\u2978" }, 1145 + "&gtrdot;": { "codepoints": [8919], "characters": "\u22D7" }, 1146 + "&gtreqless;": { "codepoints": [8923], "characters": "\u22DB" }, 1147 + "&gtreqqless;": { "codepoints": [10892], "characters": "\u2A8C" }, 1148 + "&gtrless;": { "codepoints": [8823], "characters": "\u2277" }, 1149 + "&gtrsim;": { "codepoints": [8819], "characters": "\u2273" }, 1150 + "&gvertneqq;": { "codepoints": [8809, 65024], "characters": "\u2269\uFE00" }, 1151 + "&gvnE;": { "codepoints": [8809, 65024], "characters": "\u2269\uFE00" }, 1152 + "&hArr;": { "codepoints": [8660], "characters": "\u21D4" }, 1153 + "&hairsp;": { "codepoints": [8202], "characters": "\u200A" }, 1154 + "&half;": { "codepoints": [189], "characters": "\u00BD" }, 1155 + "&hamilt;": { "codepoints": [8459], "characters": "\u210B" }, 1156 + "&hardcy;": { "codepoints": [1098], "characters": "\u044A" }, 1157 + "&harr;": { "codepoints": [8596], "characters": "\u2194" }, 1158 + "&harrcir;": { "codepoints": [10568], "characters": "\u2948" }, 1159 + "&harrw;": { "codepoints": [8621], "characters": "\u21AD" }, 1160 + "&hbar;": { "codepoints": [8463], "characters": "\u210F" }, 1161 + "&hcirc;": { "codepoints": [293], "characters": "\u0125" }, 1162 + "&hearts;": { "codepoints": [9829], "characters": "\u2665" }, 1163 + "&heartsuit;": { "codepoints": [9829], "characters": "\u2665" }, 1164 + "&hellip;": { "codepoints": [8230], "characters": "\u2026" }, 1165 + "&hercon;": { "codepoints": [8889], "characters": "\u22B9" }, 1166 + "&hfr;": { "codepoints": [120101], "characters": "\uD835\uDD25" }, 1167 + "&hksearow;": { "codepoints": [10533], "characters": "\u2925" }, 1168 + "&hkswarow;": { "codepoints": [10534], "characters": "\u2926" }, 1169 + "&hoarr;": { "codepoints": [8703], "characters": "\u21FF" }, 1170 + "&homtht;": { "codepoints": [8763], "characters": "\u223B" }, 1171 + "&hookleftarrow;": { "codepoints": [8617], 
"characters": "\u21A9" }, 1172 + "&hookrightarrow;": { "codepoints": [8618], "characters": "\u21AA" }, 1173 + "&hopf;": { "codepoints": [120153], "characters": "\uD835\uDD59" }, 1174 + "&horbar;": { "codepoints": [8213], "characters": "\u2015" }, 1175 + "&hscr;": { "codepoints": [119997], "characters": "\uD835\uDCBD" }, 1176 + "&hslash;": { "codepoints": [8463], "characters": "\u210F" }, 1177 + "&hstrok;": { "codepoints": [295], "characters": "\u0127" }, 1178 + "&hybull;": { "codepoints": [8259], "characters": "\u2043" }, 1179 + "&hyphen;": { "codepoints": [8208], "characters": "\u2010" }, 1180 + "&iacute": { "codepoints": [237], "characters": "\u00ED" }, 1181 + "&iacute;": { "codepoints": [237], "characters": "\u00ED" }, 1182 + "&ic;": { "codepoints": [8291], "characters": "\u2063" }, 1183 + "&icirc": { "codepoints": [238], "characters": "\u00EE" }, 1184 + "&icirc;": { "codepoints": [238], "characters": "\u00EE" }, 1185 + "&icy;": { "codepoints": [1080], "characters": "\u0438" }, 1186 + "&iecy;": { "codepoints": [1077], "characters": "\u0435" }, 1187 + "&iexcl": { "codepoints": [161], "characters": "\u00A1" }, 1188 + "&iexcl;": { "codepoints": [161], "characters": "\u00A1" }, 1189 + "&iff;": { "codepoints": [8660], "characters": "\u21D4" }, 1190 + "&ifr;": { "codepoints": [120102], "characters": "\uD835\uDD26" }, 1191 + "&igrave": { "codepoints": [236], "characters": "\u00EC" }, 1192 + "&igrave;": { "codepoints": [236], "characters": "\u00EC" }, 1193 + "&ii;": { "codepoints": [8520], "characters": "\u2148" }, 1194 + "&iiiint;": { "codepoints": [10764], "characters": "\u2A0C" }, 1195 + "&iiint;": { "codepoints": [8749], "characters": "\u222D" }, 1196 + "&iinfin;": { "codepoints": [10716], "characters": "\u29DC" }, 1197 + "&iiota;": { "codepoints": [8489], "characters": "\u2129" }, 1198 + "&ijlig;": { "codepoints": [307], "characters": "\u0133" }, 1199 + "&imacr;": { "codepoints": [299], "characters": "\u012B" }, 1200 + "&image;": { "codepoints": [8465], 
"characters": "\u2111" }, 1201 + "&imagline;": { "codepoints": [8464], "characters": "\u2110" }, 1202 + "&imagpart;": { "codepoints": [8465], "characters": "\u2111" }, 1203 + "&imath;": { "codepoints": [305], "characters": "\u0131" }, 1204 + "&imof;": { "codepoints": [8887], "characters": "\u22B7" }, 1205 + "&imped;": { "codepoints": [437], "characters": "\u01B5" }, 1206 + "&in;": { "codepoints": [8712], "characters": "\u2208" }, 1207 + "&incare;": { "codepoints": [8453], "characters": "\u2105" }, 1208 + "&infin;": { "codepoints": [8734], "characters": "\u221E" }, 1209 + "&infintie;": { "codepoints": [10717], "characters": "\u29DD" }, 1210 + "&inodot;": { "codepoints": [305], "characters": "\u0131" }, 1211 + "&int;": { "codepoints": [8747], "characters": "\u222B" }, 1212 + "&intcal;": { "codepoints": [8890], "characters": "\u22BA" }, 1213 + "&integers;": { "codepoints": [8484], "characters": "\u2124" }, 1214 + "&intercal;": { "codepoints": [8890], "characters": "\u22BA" }, 1215 + "&intlarhk;": { "codepoints": [10775], "characters": "\u2A17" }, 1216 + "&intprod;": { "codepoints": [10812], "characters": "\u2A3C" }, 1217 + "&iocy;": { "codepoints": [1105], "characters": "\u0451" }, 1218 + "&iogon;": { "codepoints": [303], "characters": "\u012F" }, 1219 + "&iopf;": { "codepoints": [120154], "characters": "\uD835\uDD5A" }, 1220 + "&iota;": { "codepoints": [953], "characters": "\u03B9" }, 1221 + "&iprod;": { "codepoints": [10812], "characters": "\u2A3C" }, 1222 + "&iquest": { "codepoints": [191], "characters": "\u00BF" }, 1223 + "&iquest;": { "codepoints": [191], "characters": "\u00BF" }, 1224 + "&iscr;": { "codepoints": [119998], "characters": "\uD835\uDCBE" }, 1225 + "&isin;": { "codepoints": [8712], "characters": "\u2208" }, 1226 + "&isinE;": { "codepoints": [8953], "characters": "\u22F9" }, 1227 + "&isindot;": { "codepoints": [8949], "characters": "\u22F5" }, 1228 + "&isins;": { "codepoints": [8948], "characters": "\u22F4" }, 1229 + "&isinsv;": { "codepoints": 
[8947], "characters": "\u22F3" }, 1230 + "&isinv;": { "codepoints": [8712], "characters": "\u2208" }, 1231 + "&it;": { "codepoints": [8290], "characters": "\u2062" }, 1232 + "&itilde;": { "codepoints": [297], "characters": "\u0129" }, 1233 + "&iukcy;": { "codepoints": [1110], "characters": "\u0456" }, 1234 + "&iuml": { "codepoints": [239], "characters": "\u00EF" }, 1235 + "&iuml;": { "codepoints": [239], "characters": "\u00EF" }, 1236 + "&jcirc;": { "codepoints": [309], "characters": "\u0135" }, 1237 + "&jcy;": { "codepoints": [1081], "characters": "\u0439" }, 1238 + "&jfr;": { "codepoints": [120103], "characters": "\uD835\uDD27" }, 1239 + "&jmath;": { "codepoints": [567], "characters": "\u0237" }, 1240 + "&jopf;": { "codepoints": [120155], "characters": "\uD835\uDD5B" }, 1241 + "&jscr;": { "codepoints": [119999], "characters": "\uD835\uDCBF" }, 1242 + "&jsercy;": { "codepoints": [1112], "characters": "\u0458" }, 1243 + "&jukcy;": { "codepoints": [1108], "characters": "\u0454" }, 1244 + "&kappa;": { "codepoints": [954], "characters": "\u03BA" }, 1245 + "&kappav;": { "codepoints": [1008], "characters": "\u03F0" }, 1246 + "&kcedil;": { "codepoints": [311], "characters": "\u0137" }, 1247 + "&kcy;": { "codepoints": [1082], "characters": "\u043A" }, 1248 + "&kfr;": { "codepoints": [120104], "characters": "\uD835\uDD28" }, 1249 + "&kgreen;": { "codepoints": [312], "characters": "\u0138" }, 1250 + "&khcy;": { "codepoints": [1093], "characters": "\u0445" }, 1251 + "&kjcy;": { "codepoints": [1116], "characters": "\u045C" }, 1252 + "&kopf;": { "codepoints": [120156], "characters": "\uD835\uDD5C" }, 1253 + "&kscr;": { "codepoints": [120000], "characters": "\uD835\uDCC0" }, 1254 + "&lAarr;": { "codepoints": [8666], "characters": "\u21DA" }, 1255 + "&lArr;": { "codepoints": [8656], "characters": "\u21D0" }, 1256 + "&lAtail;": { "codepoints": [10523], "characters": "\u291B" }, 1257 + "&lBarr;": { "codepoints": [10510], "characters": "\u290E" }, 1258 + "&lE;": { "codepoints": 
[8806], "characters": "\u2266" }, 1259 + "&lEg;": { "codepoints": [10891], "characters": "\u2A8B" }, 1260 + "&lHar;": { "codepoints": [10594], "characters": "\u2962" }, 1261 + "&lacute;": { "codepoints": [314], "characters": "\u013A" }, 1262 + "&laemptyv;": { "codepoints": [10676], "characters": "\u29B4" }, 1263 + "&lagran;": { "codepoints": [8466], "characters": "\u2112" }, 1264 + "&lambda;": { "codepoints": [955], "characters": "\u03BB" }, 1265 + "&lang;": { "codepoints": [10216], "characters": "\u27E8" }, 1266 + "&langd;": { "codepoints": [10641], "characters": "\u2991" }, 1267 + "&langle;": { "codepoints": [10216], "characters": "\u27E8" }, 1268 + "&lap;": { "codepoints": [10885], "characters": "\u2A85" }, 1269 + "&laquo": { "codepoints": [171], "characters": "\u00AB" }, 1270 + "&laquo;": { "codepoints": [171], "characters": "\u00AB" }, 1271 + "&larr;": { "codepoints": [8592], "characters": "\u2190" }, 1272 + "&larrb;": { "codepoints": [8676], "characters": "\u21E4" }, 1273 + "&larrbfs;": { "codepoints": [10527], "characters": "\u291F" }, 1274 + "&larrfs;": { "codepoints": [10525], "characters": "\u291D" }, 1275 + "&larrhk;": { "codepoints": [8617], "characters": "\u21A9" }, 1276 + "&larrlp;": { "codepoints": [8619], "characters": "\u21AB" }, 1277 + "&larrpl;": { "codepoints": [10553], "characters": "\u2939" }, 1278 + "&larrsim;": { "codepoints": [10611], "characters": "\u2973" }, 1279 + "&larrtl;": { "codepoints": [8610], "characters": "\u21A2" }, 1280 + "&lat;": { "codepoints": [10923], "characters": "\u2AAB" }, 1281 + "&latail;": { "codepoints": [10521], "characters": "\u2919" }, 1282 + "&late;": { "codepoints": [10925], "characters": "\u2AAD" }, 1283 + "&lates;": { "codepoints": [10925, 65024], "characters": "\u2AAD\uFE00" }, 1284 + "&lbarr;": { "codepoints": [10508], "characters": "\u290C" }, 1285 + "&lbbrk;": { "codepoints": [10098], "characters": "\u2772" }, 1286 + "&lbrace;": { "codepoints": [123], "characters": "\u007B" }, 1287 + "&lbrack;": { 
"codepoints": [91], "characters": "\u005B" }, 1288 + "&lbrke;": { "codepoints": [10635], "characters": "\u298B" }, 1289 + "&lbrksld;": { "codepoints": [10639], "characters": "\u298F" }, 1290 + "&lbrkslu;": { "codepoints": [10637], "characters": "\u298D" }, 1291 + "&lcaron;": { "codepoints": [318], "characters": "\u013E" }, 1292 + "&lcedil;": { "codepoints": [316], "characters": "\u013C" }, 1293 + "&lceil;": { "codepoints": [8968], "characters": "\u2308" }, 1294 + "&lcub;": { "codepoints": [123], "characters": "\u007B" }, 1295 + "&lcy;": { "codepoints": [1083], "characters": "\u043B" }, 1296 + "&ldca;": { "codepoints": [10550], "characters": "\u2936" }, 1297 + "&ldquo;": { "codepoints": [8220], "characters": "\u201C" }, 1298 + "&ldquor;": { "codepoints": [8222], "characters": "\u201E" }, 1299 + "&ldrdhar;": { "codepoints": [10599], "characters": "\u2967" }, 1300 + "&ldrushar;": { "codepoints": [10571], "characters": "\u294B" }, 1301 + "&ldsh;": { "codepoints": [8626], "characters": "\u21B2" }, 1302 + "&le;": { "codepoints": [8804], "characters": "\u2264" }, 1303 + "&leftarrow;": { "codepoints": [8592], "characters": "\u2190" }, 1304 + "&leftarrowtail;": { "codepoints": [8610], "characters": "\u21A2" }, 1305 + "&leftharpoondown;": { "codepoints": [8637], "characters": "\u21BD" }, 1306 + "&leftharpoonup;": { "codepoints": [8636], "characters": "\u21BC" }, 1307 + "&leftleftarrows;": { "codepoints": [8647], "characters": "\u21C7" }, 1308 + "&leftrightarrow;": { "codepoints": [8596], "characters": "\u2194" }, 1309 + "&leftrightarrows;": { "codepoints": [8646], "characters": "\u21C6" }, 1310 + "&leftrightharpoons;": { "codepoints": [8651], "characters": "\u21CB" }, 1311 + "&leftrightsquigarrow;": { "codepoints": [8621], "characters": "\u21AD" }, 1312 + "&leftthreetimes;": { "codepoints": [8907], "characters": "\u22CB" }, 1313 + "&leg;": { "codepoints": [8922], "characters": "\u22DA" }, 1314 + "&leq;": { "codepoints": [8804], "characters": "\u2264" }, 1315 + "&leqq;": { 
"codepoints": [8806], "characters": "\u2266" }, 1316 + "&leqslant;": { "codepoints": [10877], "characters": "\u2A7D" }, 1317 + "&les;": { "codepoints": [10877], "characters": "\u2A7D" }, 1318 + "&lescc;": { "codepoints": [10920], "characters": "\u2AA8" }, 1319 + "&lesdot;": { "codepoints": [10879], "characters": "\u2A7F" }, 1320 + "&lesdoto;": { "codepoints": [10881], "characters": "\u2A81" }, 1321 + "&lesdotor;": { "codepoints": [10883], "characters": "\u2A83" }, 1322 + "&lesg;": { "codepoints": [8922, 65024], "characters": "\u22DA\uFE00" }, 1323 + "&lesges;": { "codepoints": [10899], "characters": "\u2A93" }, 1324 + "&lessapprox;": { "codepoints": [10885], "characters": "\u2A85" }, 1325 + "&lessdot;": { "codepoints": [8918], "characters": "\u22D6" }, 1326 + "&lesseqgtr;": { "codepoints": [8922], "characters": "\u22DA" }, 1327 + "&lesseqqgtr;": { "codepoints": [10891], "characters": "\u2A8B" }, 1328 + "&lessgtr;": { "codepoints": [8822], "characters": "\u2276" }, 1329 + "&lesssim;": { "codepoints": [8818], "characters": "\u2272" }, 1330 + "&lfisht;": { "codepoints": [10620], "characters": "\u297C" }, 1331 + "&lfloor;": { "codepoints": [8970], "characters": "\u230A" }, 1332 + "&lfr;": { "codepoints": [120105], "characters": "\uD835\uDD29" }, 1333 + "&lg;": { "codepoints": [8822], "characters": "\u2276" }, 1334 + "&lgE;": { "codepoints": [10897], "characters": "\u2A91" }, 1335 + "&lhard;": { "codepoints": [8637], "characters": "\u21BD" }, 1336 + "&lharu;": { "codepoints": [8636], "characters": "\u21BC" }, 1337 + "&lharul;": { "codepoints": [10602], "characters": "\u296A" }, 1338 + "&lhblk;": { "codepoints": [9604], "characters": "\u2584" }, 1339 + "&ljcy;": { "codepoints": [1113], "characters": "\u0459" }, 1340 + "&ll;": { "codepoints": [8810], "characters": "\u226A" }, 1341 + "&llarr;": { "codepoints": [8647], "characters": "\u21C7" }, 1342 + "&llcorner;": { "codepoints": [8990], "characters": "\u231E" }, 1343 + "&llhard;": { "codepoints": [10603], "characters": 
"\u296B" }, 1344 + "&lltri;": { "codepoints": [9722], "characters": "\u25FA" }, 1345 + "&lmidot;": { "codepoints": [320], "characters": "\u0140" }, 1346 + "&lmoust;": { "codepoints": [9136], "characters": "\u23B0" }, 1347 + "&lmoustache;": { "codepoints": [9136], "characters": "\u23B0" }, 1348 + "&lnE;": { "codepoints": [8808], "characters": "\u2268" }, 1349 + "&lnap;": { "codepoints": [10889], "characters": "\u2A89" }, 1350 + "&lnapprox;": { "codepoints": [10889], "characters": "\u2A89" }, 1351 + "&lne;": { "codepoints": [10887], "characters": "\u2A87" }, 1352 + "&lneq;": { "codepoints": [10887], "characters": "\u2A87" }, 1353 + "&lneqq;": { "codepoints": [8808], "characters": "\u2268" }, 1354 + "&lnsim;": { "codepoints": [8934], "characters": "\u22E6" }, 1355 + "&loang;": { "codepoints": [10220], "characters": "\u27EC" }, 1356 + "&loarr;": { "codepoints": [8701], "characters": "\u21FD" }, 1357 + "&lobrk;": { "codepoints": [10214], "characters": "\u27E6" }, 1358 + "&longleftarrow;": { "codepoints": [10229], "characters": "\u27F5" }, 1359 + "&longleftrightarrow;": { "codepoints": [10231], "characters": "\u27F7" }, 1360 + "&longmapsto;": { "codepoints": [10236], "characters": "\u27FC" }, 1361 + "&longrightarrow;": { "codepoints": [10230], "characters": "\u27F6" }, 1362 + "&looparrowleft;": { "codepoints": [8619], "characters": "\u21AB" }, 1363 + "&looparrowright;": { "codepoints": [8620], "characters": "\u21AC" }, 1364 + "&lopar;": { "codepoints": [10629], "characters": "\u2985" }, 1365 + "&lopf;": { "codepoints": [120157], "characters": "\uD835\uDD5D" }, 1366 + "&loplus;": { "codepoints": [10797], "characters": "\u2A2D" }, 1367 + "&lotimes;": { "codepoints": [10804], "characters": "\u2A34" }, 1368 + "&lowast;": { "codepoints": [8727], "characters": "\u2217" }, 1369 + "&lowbar;": { "codepoints": [95], "characters": "\u005F" }, 1370 + "&loz;": { "codepoints": [9674], "characters": "\u25CA" }, 1371 + "&lozenge;": { "codepoints": [9674], "characters": "\u25CA" }, 1372 
+ "&lozf;": { "codepoints": [10731], "characters": "\u29EB" }, 1373 + "&lpar;": { "codepoints": [40], "characters": "\u0028" }, 1374 + "&lparlt;": { "codepoints": [10643], "characters": "\u2993" }, 1375 + "&lrarr;": { "codepoints": [8646], "characters": "\u21C6" }, 1376 + "&lrcorner;": { "codepoints": [8991], "characters": "\u231F" }, 1377 + "&lrhar;": { "codepoints": [8651], "characters": "\u21CB" }, 1378 + "&lrhard;": { "codepoints": [10605], "characters": "\u296D" }, 1379 + "&lrm;": { "codepoints": [8206], "characters": "\u200E" }, 1380 + "&lrtri;": { "codepoints": [8895], "characters": "\u22BF" }, 1381 + "&lsaquo;": { "codepoints": [8249], "characters": "\u2039" }, 1382 + "&lscr;": { "codepoints": [120001], "characters": "\uD835\uDCC1" }, 1383 + "&lsh;": { "codepoints": [8624], "characters": "\u21B0" }, 1384 + "&lsim;": { "codepoints": [8818], "characters": "\u2272" }, 1385 + "&lsime;": { "codepoints": [10893], "characters": "\u2A8D" }, 1386 + "&lsimg;": { "codepoints": [10895], "characters": "\u2A8F" }, 1387 + "&lsqb;": { "codepoints": [91], "characters": "\u005B" }, 1388 + "&lsquo;": { "codepoints": [8216], "characters": "\u2018" }, 1389 + "&lsquor;": { "codepoints": [8218], "characters": "\u201A" }, 1390 + "&lstrok;": { "codepoints": [322], "characters": "\u0142" }, 1391 + "&lt": { "codepoints": [60], "characters": "\u003C" }, 1392 + "&lt;": { "codepoints": [60], "characters": "\u003C" }, 1393 + "&ltcc;": { "codepoints": [10918], "characters": "\u2AA6" }, 1394 + "&ltcir;": { "codepoints": [10873], "characters": "\u2A79" }, 1395 + "&ltdot;": { "codepoints": [8918], "characters": "\u22D6" }, 1396 + "&lthree;": { "codepoints": [8907], "characters": "\u22CB" }, 1397 + "&ltimes;": { "codepoints": [8905], "characters": "\u22C9" }, 1398 + "&ltlarr;": { "codepoints": [10614], "characters": "\u2976" }, 1399 + "&ltquest;": { "codepoints": [10875], "characters": "\u2A7B" }, 1400 + "&ltrPar;": { "codepoints": [10646], "characters": "\u2996" }, 1401 + "&ltri;": { 
"codepoints": [9667], "characters": "\u25C3" }, 1402 + "&ltrie;": { "codepoints": [8884], "characters": "\u22B4" }, 1403 + "&ltrif;": { "codepoints": [9666], "characters": "\u25C2" }, 1404 + "&lurdshar;": { "codepoints": [10570], "characters": "\u294A" }, 1405 + "&luruhar;": { "codepoints": [10598], "characters": "\u2966" }, 1406 + "&lvertneqq;": { "codepoints": [8808, 65024], "characters": "\u2268\uFE00" }, 1407 + "&lvnE;": { "codepoints": [8808, 65024], "characters": "\u2268\uFE00" }, 1408 + "&mDDot;": { "codepoints": [8762], "characters": "\u223A" }, 1409 + "&macr": { "codepoints": [175], "characters": "\u00AF" }, 1410 + "&macr;": { "codepoints": [175], "characters": "\u00AF" }, 1411 + "&male;": { "codepoints": [9794], "characters": "\u2642" }, 1412 + "&malt;": { "codepoints": [10016], "characters": "\u2720" }, 1413 + "&maltese;": { "codepoints": [10016], "characters": "\u2720" }, 1414 + "&map;": { "codepoints": [8614], "characters": "\u21A6" }, 1415 + "&mapsto;": { "codepoints": [8614], "characters": "\u21A6" }, 1416 + "&mapstodown;": { "codepoints": [8615], "characters": "\u21A7" }, 1417 + "&mapstoleft;": { "codepoints": [8612], "characters": "\u21A4" }, 1418 + "&mapstoup;": { "codepoints": [8613], "characters": "\u21A5" }, 1419 + "&marker;": { "codepoints": [9646], "characters": "\u25AE" }, 1420 + "&mcomma;": { "codepoints": [10793], "characters": "\u2A29" }, 1421 + "&mcy;": { "codepoints": [1084], "characters": "\u043C" }, 1422 + "&mdash;": { "codepoints": [8212], "characters": "\u2014" }, 1423 + "&measuredangle;": { "codepoints": [8737], "characters": "\u2221" }, 1424 + "&mfr;": { "codepoints": [120106], "characters": "\uD835\uDD2A" }, 1425 + "&mho;": { "codepoints": [8487], "characters": "\u2127" }, 1426 + "&micro": { "codepoints": [181], "characters": "\u00B5" }, 1427 + "&micro;": { "codepoints": [181], "characters": "\u00B5" }, 1428 + "&mid;": { "codepoints": [8739], "characters": "\u2223" }, 1429 + "&midast;": { "codepoints": [42], "characters": 
"\u002A" }, 1430 + "&midcir;": { "codepoints": [10992], "characters": "\u2AF0" }, 1431 + "&middot": { "codepoints": [183], "characters": "\u00B7" }, 1432 + "&middot;": { "codepoints": [183], "characters": "\u00B7" }, 1433 + "&minus;": { "codepoints": [8722], "characters": "\u2212" }, 1434 + "&minusb;": { "codepoints": [8863], "characters": "\u229F" }, 1435 + "&minusd;": { "codepoints": [8760], "characters": "\u2238" }, 1436 + "&minusdu;": { "codepoints": [10794], "characters": "\u2A2A" }, 1437 + "&mlcp;": { "codepoints": [10971], "characters": "\u2ADB" }, 1438 + "&mldr;": { "codepoints": [8230], "characters": "\u2026" }, 1439 + "&mnplus;": { "codepoints": [8723], "characters": "\u2213" }, 1440 + "&models;": { "codepoints": [8871], "characters": "\u22A7" }, 1441 + "&mopf;": { "codepoints": [120158], "characters": "\uD835\uDD5E" }, 1442 + "&mp;": { "codepoints": [8723], "characters": "\u2213" }, 1443 + "&mscr;": { "codepoints": [120002], "characters": "\uD835\uDCC2" }, 1444 + "&mstpos;": { "codepoints": [8766], "characters": "\u223E" }, 1445 + "&mu;": { "codepoints": [956], "characters": "\u03BC" }, 1446 + "&multimap;": { "codepoints": [8888], "characters": "\u22B8" }, 1447 + "&mumap;": { "codepoints": [8888], "characters": "\u22B8" }, 1448 + "&nGg;": { "codepoints": [8921, 824], "characters": "\u22D9\u0338" }, 1449 + "&nGt;": { "codepoints": [8811, 8402], "characters": "\u226B\u20D2" }, 1450 + "&nGtv;": { "codepoints": [8811, 824], "characters": "\u226B\u0338" }, 1451 + "&nLeftarrow;": { "codepoints": [8653], "characters": "\u21CD" }, 1452 + "&nLeftrightarrow;": { "codepoints": [8654], "characters": "\u21CE" }, 1453 + "&nLl;": { "codepoints": [8920, 824], "characters": "\u22D8\u0338" }, 1454 + "&nLt;": { "codepoints": [8810, 8402], "characters": "\u226A\u20D2" }, 1455 + "&nLtv;": { "codepoints": [8810, 824], "characters": "\u226A\u0338" }, 1456 + "&nRightarrow;": { "codepoints": [8655], "characters": "\u21CF" }, 1457 + "&nVDash;": { "codepoints": [8879], 
"characters": "\u22AF" }, 1458 + "&nVdash;": { "codepoints": [8878], "characters": "\u22AE" }, 1459 + "&nabla;": { "codepoints": [8711], "characters": "\u2207" }, 1460 + "&nacute;": { "codepoints": [324], "characters": "\u0144" }, 1461 + "&nang;": { "codepoints": [8736, 8402], "characters": "\u2220\u20D2" }, 1462 + "&nap;": { "codepoints": [8777], "characters": "\u2249" }, 1463 + "&napE;": { "codepoints": [10864, 824], "characters": "\u2A70\u0338" }, 1464 + "&napid;": { "codepoints": [8779, 824], "characters": "\u224B\u0338" }, 1465 + "&napos;": { "codepoints": [329], "characters": "\u0149" }, 1466 + "&napprox;": { "codepoints": [8777], "characters": "\u2249" }, 1467 + "&natur;": { "codepoints": [9838], "characters": "\u266E" }, 1468 + "&natural;": { "codepoints": [9838], "characters": "\u266E" }, 1469 + "&naturals;": { "codepoints": [8469], "characters": "\u2115" }, 1470 + "&nbsp": { "codepoints": [160], "characters": "\u00A0" }, 1471 + "&nbsp;": { "codepoints": [160], "characters": "\u00A0" }, 1472 + "&nbump;": { "codepoints": [8782, 824], "characters": "\u224E\u0338" }, 1473 + "&nbumpe;": { "codepoints": [8783, 824], "characters": "\u224F\u0338" }, 1474 + "&ncap;": { "codepoints": [10819], "characters": "\u2A43" }, 1475 + "&ncaron;": { "codepoints": [328], "characters": "\u0148" }, 1476 + "&ncedil;": { "codepoints": [326], "characters": "\u0146" }, 1477 + "&ncong;": { "codepoints": [8775], "characters": "\u2247" }, 1478 + "&ncongdot;": { "codepoints": [10861, 824], "characters": "\u2A6D\u0338" }, 1479 + "&ncup;": { "codepoints": [10818], "characters": "\u2A42" }, 1480 + "&ncy;": { "codepoints": [1085], "characters": "\u043D" }, 1481 + "&ndash;": { "codepoints": [8211], "characters": "\u2013" }, 1482 + "&ne;": { "codepoints": [8800], "characters": "\u2260" }, 1483 + "&neArr;": { "codepoints": [8663], "characters": "\u21D7" }, 1484 + "&nearhk;": { "codepoints": [10532], "characters": "\u2924" }, 1485 + "&nearr;": { "codepoints": [8599], "characters": "\u2197" }, 
1486 + "&nearrow;": { "codepoints": [8599], "characters": "\u2197" }, 1487 + "&nedot;": { "codepoints": [8784, 824], "characters": "\u2250\u0338" }, 1488 + "&nequiv;": { "codepoints": [8802], "characters": "\u2262" }, 1489 + "&nesear;": { "codepoints": [10536], "characters": "\u2928" }, 1490 + "&nesim;": { "codepoints": [8770, 824], "characters": "\u2242\u0338" }, 1491 + "&nexist;": { "codepoints": [8708], "characters": "\u2204" }, 1492 + "&nexists;": { "codepoints": [8708], "characters": "\u2204" }, 1493 + "&nfr;": { "codepoints": [120107], "characters": "\uD835\uDD2B" }, 1494 + "&ngE;": { "codepoints": [8807, 824], "characters": "\u2267\u0338" }, 1495 + "&nge;": { "codepoints": [8817], "characters": "\u2271" }, 1496 + "&ngeq;": { "codepoints": [8817], "characters": "\u2271" }, 1497 + "&ngeqq;": { "codepoints": [8807, 824], "characters": "\u2267\u0338" }, 1498 + "&ngeqslant;": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" }, 1499 + "&nges;": { "codepoints": [10878, 824], "characters": "\u2A7E\u0338" }, 1500 + "&ngsim;": { "codepoints": [8821], "characters": "\u2275" }, 1501 + "&ngt;": { "codepoints": [8815], "characters": "\u226F" }, 1502 + "&ngtr;": { "codepoints": [8815], "characters": "\u226F" }, 1503 + "&nhArr;": { "codepoints": [8654], "characters": "\u21CE" }, 1504 + "&nharr;": { "codepoints": [8622], "characters": "\u21AE" }, 1505 + "&nhpar;": { "codepoints": [10994], "characters": "\u2AF2" }, 1506 + "&ni;": { "codepoints": [8715], "characters": "\u220B" }, 1507 + "&nis;": { "codepoints": [8956], "characters": "\u22FC" }, 1508 + "&nisd;": { "codepoints": [8954], "characters": "\u22FA" }, 1509 + "&niv;": { "codepoints": [8715], "characters": "\u220B" }, 1510 + "&njcy;": { "codepoints": [1114], "characters": "\u045A" }, 1511 + "&nlArr;": { "codepoints": [8653], "characters": "\u21CD" }, 1512 + "&nlE;": { "codepoints": [8806, 824], "characters": "\u2266\u0338" }, 1513 + "&nlarr;": { "codepoints": [8602], "characters": "\u219A" }, 1514 + "&nldr;": 
{ "codepoints": [8229], "characters": "\u2025" }, 1515 + "&nle;": { "codepoints": [8816], "characters": "\u2270" }, 1516 + "&nleftarrow;": { "codepoints": [8602], "characters": "\u219A" }, 1517 + "&nleftrightarrow;": { "codepoints": [8622], "characters": "\u21AE" }, 1518 + "&nleq;": { "codepoints": [8816], "characters": "\u2270" }, 1519 + "&nleqq;": { "codepoints": [8806, 824], "characters": "\u2266\u0338" }, 1520 + "&nleqslant;": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" }, 1521 + "&nles;": { "codepoints": [10877, 824], "characters": "\u2A7D\u0338" }, 1522 + "&nless;": { "codepoints": [8814], "characters": "\u226E" }, 1523 + "&nlsim;": { "codepoints": [8820], "characters": "\u2274" }, 1524 + "&nlt;": { "codepoints": [8814], "characters": "\u226E" }, 1525 + "&nltri;": { "codepoints": [8938], "characters": "\u22EA" }, 1526 + "&nltrie;": { "codepoints": [8940], "characters": "\u22EC" }, 1527 + "&nmid;": { "codepoints": [8740], "characters": "\u2224" }, 1528 + "&nopf;": { "codepoints": [120159], "characters": "\uD835\uDD5F" }, 1529 + "&not": { "codepoints": [172], "characters": "\u00AC" }, 1530 + "&not;": { "codepoints": [172], "characters": "\u00AC" }, 1531 + "&notin;": { "codepoints": [8713], "characters": "\u2209" }, 1532 + "&notinE;": { "codepoints": [8953, 824], "characters": "\u22F9\u0338" }, 1533 + "&notindot;": { "codepoints": [8949, 824], "characters": "\u22F5\u0338" }, 1534 + "&notinva;": { "codepoints": [8713], "characters": "\u2209" }, 1535 + "&notinvb;": { "codepoints": [8951], "characters": "\u22F7" }, 1536 + "&notinvc;": { "codepoints": [8950], "characters": "\u22F6" }, 1537 + "&notni;": { "codepoints": [8716], "characters": "\u220C" }, 1538 + "&notniva;": { "codepoints": [8716], "characters": "\u220C" }, 1539 + "&notnivb;": { "codepoints": [8958], "characters": "\u22FE" }, 1540 + "&notnivc;": { "codepoints": [8957], "characters": "\u22FD" }, 1541 + "&npar;": { "codepoints": [8742], "characters": "\u2226" }, 1542 + "&nparallel;": { 
"codepoints": [8742], "characters": "\u2226" }, 1543 + "&nparsl;": { "codepoints": [11005, 8421], "characters": "\u2AFD\u20E5" }, 1544 + "&npart;": { "codepoints": [8706, 824], "characters": "\u2202\u0338" }, 1545 + "&npolint;": { "codepoints": [10772], "characters": "\u2A14" }, 1546 + "&npr;": { "codepoints": [8832], "characters": "\u2280" }, 1547 + "&nprcue;": { "codepoints": [8928], "characters": "\u22E0" }, 1548 + "&npre;": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" }, 1549 + "&nprec;": { "codepoints": [8832], "characters": "\u2280" }, 1550 + "&npreceq;": { "codepoints": [10927, 824], "characters": "\u2AAF\u0338" }, 1551 + "&nrArr;": { "codepoints": [8655], "characters": "\u21CF" }, 1552 + "&nrarr;": { "codepoints": [8603], "characters": "\u219B" }, 1553 + "&nrarrc;": { "codepoints": [10547, 824], "characters": "\u2933\u0338" }, 1554 + "&nrarrw;": { "codepoints": [8605, 824], "characters": "\u219D\u0338" }, 1555 + "&nrightarrow;": { "codepoints": [8603], "characters": "\u219B" }, 1556 + "&nrtri;": { "codepoints": [8939], "characters": "\u22EB" }, 1557 + "&nrtrie;": { "codepoints": [8941], "characters": "\u22ED" }, 1558 + "&nsc;": { "codepoints": [8833], "characters": "\u2281" }, 1559 + "&nsccue;": { "codepoints": [8929], "characters": "\u22E1" }, 1560 + "&nsce;": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" }, 1561 + "&nscr;": { "codepoints": [120003], "characters": "\uD835\uDCC3" }, 1562 + "&nshortmid;": { "codepoints": [8740], "characters": "\u2224" }, 1563 + "&nshortparallel;": { "codepoints": [8742], "characters": "\u2226" }, 1564 + "&nsim;": { "codepoints": [8769], "characters": "\u2241" }, 1565 + "&nsime;": { "codepoints": [8772], "characters": "\u2244" }, 1566 + "&nsimeq;": { "codepoints": [8772], "characters": "\u2244" }, 1567 + "&nsmid;": { "codepoints": [8740], "characters": "\u2224" }, 1568 + "&nspar;": { "codepoints": [8742], "characters": "\u2226" }, 1569 + "&nsqsube;": { "codepoints": [8930], "characters": "\u22E2" }, 
1570 + "&nsqsupe;": { "codepoints": [8931], "characters": "\u22E3" }, 1571 + "&nsub;": { "codepoints": [8836], "characters": "\u2284" }, 1572 + "&nsubE;": { "codepoints": [10949, 824], "characters": "\u2AC5\u0338" }, 1573 + "&nsube;": { "codepoints": [8840], "characters": "\u2288" }, 1574 + "&nsubset;": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" }, 1575 + "&nsubseteq;": { "codepoints": [8840], "characters": "\u2288" }, 1576 + "&nsubseteqq;": { "codepoints": [10949, 824], "characters": "\u2AC5\u0338" }, 1577 + "&nsucc;": { "codepoints": [8833], "characters": "\u2281" }, 1578 + "&nsucceq;": { "codepoints": [10928, 824], "characters": "\u2AB0\u0338" }, 1579 + "&nsup;": { "codepoints": [8837], "characters": "\u2285" }, 1580 + "&nsupE;": { "codepoints": [10950, 824], "characters": "\u2AC6\u0338" }, 1581 + "&nsupe;": { "codepoints": [8841], "characters": "\u2289" }, 1582 + "&nsupset;": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" }, 1583 + "&nsupseteq;": { "codepoints": [8841], "characters": "\u2289" }, 1584 + "&nsupseteqq;": { "codepoints": [10950, 824], "characters": "\u2AC6\u0338" }, 1585 + "&ntgl;": { "codepoints": [8825], "characters": "\u2279" }, 1586 + "&ntilde": { "codepoints": [241], "characters": "\u00F1" }, 1587 + "&ntilde;": { "codepoints": [241], "characters": "\u00F1" }, 1588 + "&ntlg;": { "codepoints": [8824], "characters": "\u2278" }, 1589 + "&ntriangleleft;": { "codepoints": [8938], "characters": "\u22EA" }, 1590 + "&ntrianglelefteq;": { "codepoints": [8940], "characters": "\u22EC" }, 1591 + "&ntriangleright;": { "codepoints": [8939], "characters": "\u22EB" }, 1592 + "&ntrianglerighteq;": { "codepoints": [8941], "characters": "\u22ED" }, 1593 + "&nu;": { "codepoints": [957], "characters": "\u03BD" }, 1594 + "&num;": { "codepoints": [35], "characters": "\u0023" }, 1595 + "&numero;": { "codepoints": [8470], "characters": "\u2116" }, 1596 + "&numsp;": { "codepoints": [8199], "characters": "\u2007" }, 1597 + "&nvDash;": { 
"codepoints": [8877], "characters": "\u22AD" }, 1598 + "&nvHarr;": { "codepoints": [10500], "characters": "\u2904" }, 1599 + "&nvap;": { "codepoints": [8781, 8402], "characters": "\u224D\u20D2" }, 1600 + "&nvdash;": { "codepoints": [8876], "characters": "\u22AC" }, 1601 + "&nvge;": { "codepoints": [8805, 8402], "characters": "\u2265\u20D2" }, 1602 + "&nvgt;": { "codepoints": [62, 8402], "characters": "\u003E\u20D2" }, 1603 + "&nvinfin;": { "codepoints": [10718], "characters": "\u29DE" }, 1604 + "&nvlArr;": { "codepoints": [10498], "characters": "\u2902" }, 1605 + "&nvle;": { "codepoints": [8804, 8402], "characters": "\u2264\u20D2" }, 1606 + "&nvlt;": { "codepoints": [60, 8402], "characters": "\u003C\u20D2" }, 1607 + "&nvltrie;": { "codepoints": [8884, 8402], "characters": "\u22B4\u20D2" }, 1608 + "&nvrArr;": { "codepoints": [10499], "characters": "\u2903" }, 1609 + "&nvrtrie;": { "codepoints": [8885, 8402], "characters": "\u22B5\u20D2" }, 1610 + "&nvsim;": { "codepoints": [8764, 8402], "characters": "\u223C\u20D2" }, 1611 + "&nwArr;": { "codepoints": [8662], "characters": "\u21D6" }, 1612 + "&nwarhk;": { "codepoints": [10531], "characters": "\u2923" }, 1613 + "&nwarr;": { "codepoints": [8598], "characters": "\u2196" }, 1614 + "&nwarrow;": { "codepoints": [8598], "characters": "\u2196" }, 1615 + "&nwnear;": { "codepoints": [10535], "characters": "\u2927" }, 1616 + "&oS;": { "codepoints": [9416], "characters": "\u24C8" }, 1617 + "&oacute": { "codepoints": [243], "characters": "\u00F3" }, 1618 + "&oacute;": { "codepoints": [243], "characters": "\u00F3" }, 1619 + "&oast;": { "codepoints": [8859], "characters": "\u229B" }, 1620 + "&ocir;": { "codepoints": [8858], "characters": "\u229A" }, 1621 + "&ocirc": { "codepoints": [244], "characters": "\u00F4" }, 1622 + "&ocirc;": { "codepoints": [244], "characters": "\u00F4" }, 1623 + "&ocy;": { "codepoints": [1086], "characters": "\u043E" }, 1624 + "&odash;": { "codepoints": [8861], "characters": "\u229D" }, 1625 + "&odblac;": 
{ "codepoints": [337], "characters": "\u0151" }, 1626 + "&odiv;": { "codepoints": [10808], "characters": "\u2A38" }, 1627 + "&odot;": { "codepoints": [8857], "characters": "\u2299" }, 1628 + "&odsold;": { "codepoints": [10684], "characters": "\u29BC" }, 1629 + "&oelig;": { "codepoints": [339], "characters": "\u0153" }, 1630 + "&ofcir;": { "codepoints": [10687], "characters": "\u29BF" }, 1631 + "&ofr;": { "codepoints": [120108], "characters": "\uD835\uDD2C" }, 1632 + "&ogon;": { "codepoints": [731], "characters": "\u02DB" }, 1633 + "&ograve": { "codepoints": [242], "characters": "\u00F2" }, 1634 + "&ograve;": { "codepoints": [242], "characters": "\u00F2" }, 1635 + "&ogt;": { "codepoints": [10689], "characters": "\u29C1" }, 1636 + "&ohbar;": { "codepoints": [10677], "characters": "\u29B5" }, 1637 + "&ohm;": { "codepoints": [937], "characters": "\u03A9" }, 1638 + "&oint;": { "codepoints": [8750], "characters": "\u222E" }, 1639 + "&olarr;": { "codepoints": [8634], "characters": "\u21BA" }, 1640 + "&olcir;": { "codepoints": [10686], "characters": "\u29BE" }, 1641 + "&olcross;": { "codepoints": [10683], "characters": "\u29BB" }, 1642 + "&oline;": { "codepoints": [8254], "characters": "\u203E" }, 1643 + "&olt;": { "codepoints": [10688], "characters": "\u29C0" }, 1644 + "&omacr;": { "codepoints": [333], "characters": "\u014D" }, 1645 + "&omega;": { "codepoints": [969], "characters": "\u03C9" }, 1646 + "&omicron;": { "codepoints": [959], "characters": "\u03BF" }, 1647 + "&omid;": { "codepoints": [10678], "characters": "\u29B6" }, 1648 + "&ominus;": { "codepoints": [8854], "characters": "\u2296" }, 1649 + "&oopf;": { "codepoints": [120160], "characters": "\uD835\uDD60" }, 1650 + "&opar;": { "codepoints": [10679], "characters": "\u29B7" }, 1651 + "&operp;": { "codepoints": [10681], "characters": "\u29B9" }, 1652 + "&oplus;": { "codepoints": [8853], "characters": "\u2295" }, 1653 + "&or;": { "codepoints": [8744], "characters": "\u2228" }, 1654 + "&orarr;": { "codepoints": 
[8635], "characters": "\u21BB" }, 1655 + "&ord;": { "codepoints": [10845], "characters": "\u2A5D" }, 1656 + "&order;": { "codepoints": [8500], "characters": "\u2134" }, 1657 + "&orderof;": { "codepoints": [8500], "characters": "\u2134" }, 1658 + "&ordf": { "codepoints": [170], "characters": "\u00AA" }, 1659 + "&ordf;": { "codepoints": [170], "characters": "\u00AA" }, 1660 + "&ordm": { "codepoints": [186], "characters": "\u00BA" }, 1661 + "&ordm;": { "codepoints": [186], "characters": "\u00BA" }, 1662 + "&origof;": { "codepoints": [8886], "characters": "\u22B6" }, 1663 + "&oror;": { "codepoints": [10838], "characters": "\u2A56" }, 1664 + "&orslope;": { "codepoints": [10839], "characters": "\u2A57" }, 1665 + "&orv;": { "codepoints": [10843], "characters": "\u2A5B" }, 1666 + "&oscr;": { "codepoints": [8500], "characters": "\u2134" }, 1667 + "&oslash": { "codepoints": [248], "characters": "\u00F8" }, 1668 + "&oslash;": { "codepoints": [248], "characters": "\u00F8" }, 1669 + "&osol;": { "codepoints": [8856], "characters": "\u2298" }, 1670 + "&otilde": { "codepoints": [245], "characters": "\u00F5" }, 1671 + "&otilde;": { "codepoints": [245], "characters": "\u00F5" }, 1672 + "&otimes;": { "codepoints": [8855], "characters": "\u2297" }, 1673 + "&otimesas;": { "codepoints": [10806], "characters": "\u2A36" }, 1674 + "&ouml": { "codepoints": [246], "characters": "\u00F6" }, 1675 + "&ouml;": { "codepoints": [246], "characters": "\u00F6" }, 1676 + "&ovbar;": { "codepoints": [9021], "characters": "\u233D" }, 1677 + "&par;": { "codepoints": [8741], "characters": "\u2225" }, 1678 + "&para": { "codepoints": [182], "characters": "\u00B6" }, 1679 + "&para;": { "codepoints": [182], "characters": "\u00B6" }, 1680 + "&parallel;": { "codepoints": [8741], "characters": "\u2225" }, 1681 + "&parsim;": { "codepoints": [10995], "characters": "\u2AF3" }, 1682 + "&parsl;": { "codepoints": [11005], "characters": "\u2AFD" }, 1683 + "&part;": { "codepoints": [8706], "characters": "\u2202" }, 1684 
+ "&pcy;": { "codepoints": [1087], "characters": "\u043F" }, 1685 + "&percnt;": { "codepoints": [37], "characters": "\u0025" }, 1686 + "&period;": { "codepoints": [46], "characters": "\u002E" }, 1687 + "&permil;": { "codepoints": [8240], "characters": "\u2030" }, 1688 + "&perp;": { "codepoints": [8869], "characters": "\u22A5" }, 1689 + "&pertenk;": { "codepoints": [8241], "characters": "\u2031" }, 1690 + "&pfr;": { "codepoints": [120109], "characters": "\uD835\uDD2D" }, 1691 + "&phi;": { "codepoints": [966], "characters": "\u03C6" }, 1692 + "&phiv;": { "codepoints": [981], "characters": "\u03D5" }, 1693 + "&phmmat;": { "codepoints": [8499], "characters": "\u2133" }, 1694 + "&phone;": { "codepoints": [9742], "characters": "\u260E" }, 1695 + "&pi;": { "codepoints": [960], "characters": "\u03C0" }, 1696 + "&pitchfork;": { "codepoints": [8916], "characters": "\u22D4" }, 1697 + "&piv;": { "codepoints": [982], "characters": "\u03D6" }, 1698 + "&planck;": { "codepoints": [8463], "characters": "\u210F" }, 1699 + "&planckh;": { "codepoints": [8462], "characters": "\u210E" }, 1700 + "&plankv;": { "codepoints": [8463], "characters": "\u210F" }, 1701 + "&plus;": { "codepoints": [43], "characters": "\u002B" }, 1702 + "&plusacir;": { "codepoints": [10787], "characters": "\u2A23" }, 1703 + "&plusb;": { "codepoints": [8862], "characters": "\u229E" }, 1704 + "&pluscir;": { "codepoints": [10786], "characters": "\u2A22" }, 1705 + "&plusdo;": { "codepoints": [8724], "characters": "\u2214" }, 1706 + "&plusdu;": { "codepoints": [10789], "characters": "\u2A25" }, 1707 + "&pluse;": { "codepoints": [10866], "characters": "\u2A72" }, 1708 + "&plusmn": { "codepoints": [177], "characters": "\u00B1" }, 1709 + "&plusmn;": { "codepoints": [177], "characters": "\u00B1" }, 1710 + "&plussim;": { "codepoints": [10790], "characters": "\u2A26" }, 1711 + "&plustwo;": { "codepoints": [10791], "characters": "\u2A27" }, 1712 + "&pm;": { "codepoints": [177], "characters": "\u00B1" }, 1713 + "&pointint;": { 
"codepoints": [10773], "characters": "\u2A15" }, 1714 + "&popf;": { "codepoints": [120161], "characters": "\uD835\uDD61" }, 1715 + "&pound": { "codepoints": [163], "characters": "\u00A3" }, 1716 + "&pound;": { "codepoints": [163], "characters": "\u00A3" }, 1717 + "&pr;": { "codepoints": [8826], "characters": "\u227A" }, 1718 + "&prE;": { "codepoints": [10931], "characters": "\u2AB3" }, 1719 + "&prap;": { "codepoints": [10935], "characters": "\u2AB7" }, 1720 + "&prcue;": { "codepoints": [8828], "characters": "\u227C" }, 1721 + "&pre;": { "codepoints": [10927], "characters": "\u2AAF" }, 1722 + "&prec;": { "codepoints": [8826], "characters": "\u227A" }, 1723 + "&precapprox;": { "codepoints": [10935], "characters": "\u2AB7" }, 1724 + "&preccurlyeq;": { "codepoints": [8828], "characters": "\u227C" }, 1725 + "&preceq;": { "codepoints": [10927], "characters": "\u2AAF" }, 1726 + "&precnapprox;": { "codepoints": [10937], "characters": "\u2AB9" }, 1727 + "&precneqq;": { "codepoints": [10933], "characters": "\u2AB5" }, 1728 + "&precnsim;": { "codepoints": [8936], "characters": "\u22E8" }, 1729 + "&precsim;": { "codepoints": [8830], "characters": "\u227E" }, 1730 + "&prime;": { "codepoints": [8242], "characters": "\u2032" }, 1731 + "&primes;": { "codepoints": [8473], "characters": "\u2119" }, 1732 + "&prnE;": { "codepoints": [10933], "characters": "\u2AB5" }, 1733 + "&prnap;": { "codepoints": [10937], "characters": "\u2AB9" }, 1734 + "&prnsim;": { "codepoints": [8936], "characters": "\u22E8" }, 1735 + "&prod;": { "codepoints": [8719], "characters": "\u220F" }, 1736 + "&profalar;": { "codepoints": [9006], "characters": "\u232E" }, 1737 + "&profline;": { "codepoints": [8978], "characters": "\u2312" }, 1738 + "&profsurf;": { "codepoints": [8979], "characters": "\u2313" }, 1739 + "&prop;": { "codepoints": [8733], "characters": "\u221D" }, 1740 + "&propto;": { "codepoints": [8733], "characters": "\u221D" }, 1741 + "&prsim;": { "codepoints": [8830], "characters": "\u227E" }, 1742 + 
"&prurel;": { "codepoints": [8880], "characters": "\u22B0" }, 1743 + "&pscr;": { "codepoints": [120005], "characters": "\uD835\uDCC5" }, 1744 + "&psi;": { "codepoints": [968], "characters": "\u03C8" }, 1745 + "&puncsp;": { "codepoints": [8200], "characters": "\u2008" }, 1746 + "&qfr;": { "codepoints": [120110], "characters": "\uD835\uDD2E" }, 1747 + "&qint;": { "codepoints": [10764], "characters": "\u2A0C" }, 1748 + "&qopf;": { "codepoints": [120162], "characters": "\uD835\uDD62" }, 1749 + "&qprime;": { "codepoints": [8279], "characters": "\u2057" }, 1750 + "&qscr;": { "codepoints": [120006], "characters": "\uD835\uDCC6" }, 1751 + "&quaternions;": { "codepoints": [8461], "characters": "\u210D" }, 1752 + "&quatint;": { "codepoints": [10774], "characters": "\u2A16" }, 1753 + "&quest;": { "codepoints": [63], "characters": "\u003F" }, 1754 + "&questeq;": { "codepoints": [8799], "characters": "\u225F" }, 1755 + "&quot": { "codepoints": [34], "characters": "\u0022" }, 1756 + "&quot;": { "codepoints": [34], "characters": "\u0022" }, 1757 + "&rAarr;": { "codepoints": [8667], "characters": "\u21DB" }, 1758 + "&rArr;": { "codepoints": [8658], "characters": "\u21D2" }, 1759 + "&rAtail;": { "codepoints": [10524], "characters": "\u291C" }, 1760 + "&rBarr;": { "codepoints": [10511], "characters": "\u290F" }, 1761 + "&rHar;": { "codepoints": [10596], "characters": "\u2964" }, 1762 + "&race;": { "codepoints": [8765, 817], "characters": "\u223D\u0331" }, 1763 + "&racute;": { "codepoints": [341], "characters": "\u0155" }, 1764 + "&radic;": { "codepoints": [8730], "characters": "\u221A" }, 1765 + "&raemptyv;": { "codepoints": [10675], "characters": "\u29B3" }, 1766 + "&rang;": { "codepoints": [10217], "characters": "\u27E9" }, 1767 + "&rangd;": { "codepoints": [10642], "characters": "\u2992" }, 1768 + "&range;": { "codepoints": [10661], "characters": "\u29A5" }, 1769 + "&rangle;": { "codepoints": [10217], "characters": "\u27E9" }, 1770 + "&raquo": { "codepoints": [187], "characters": 
"\u00BB" }, 1771 + "&raquo;": { "codepoints": [187], "characters": "\u00BB" }, 1772 + "&rarr;": { "codepoints": [8594], "characters": "\u2192" }, 1773 + "&rarrap;": { "codepoints": [10613], "characters": "\u2975" }, 1774 + "&rarrb;": { "codepoints": [8677], "characters": "\u21E5" }, 1775 + "&rarrbfs;": { "codepoints": [10528], "characters": "\u2920" }, 1776 + "&rarrc;": { "codepoints": [10547], "characters": "\u2933" }, 1777 + "&rarrfs;": { "codepoints": [10526], "characters": "\u291E" }, 1778 + "&rarrhk;": { "codepoints": [8618], "characters": "\u21AA" }, 1779 + "&rarrlp;": { "codepoints": [8620], "characters": "\u21AC" }, 1780 + "&rarrpl;": { "codepoints": [10565], "characters": "\u2945" }, 1781 + "&rarrsim;": { "codepoints": [10612], "characters": "\u2974" }, 1782 + "&rarrtl;": { "codepoints": [8611], "characters": "\u21A3" }, 1783 + "&rarrw;": { "codepoints": [8605], "characters": "\u219D" }, 1784 + "&ratail;": { "codepoints": [10522], "characters": "\u291A" }, 1785 + "&ratio;": { "codepoints": [8758], "characters": "\u2236" }, 1786 + "&rationals;": { "codepoints": [8474], "characters": "\u211A" }, 1787 + "&rbarr;": { "codepoints": [10509], "characters": "\u290D" }, 1788 + "&rbbrk;": { "codepoints": [10099], "characters": "\u2773" }, 1789 + "&rbrace;": { "codepoints": [125], "characters": "\u007D" }, 1790 + "&rbrack;": { "codepoints": [93], "characters": "\u005D" }, 1791 + "&rbrke;": { "codepoints": [10636], "characters": "\u298C" }, 1792 + "&rbrksld;": { "codepoints": [10638], "characters": "\u298E" }, 1793 + "&rbrkslu;": { "codepoints": [10640], "characters": "\u2990" }, 1794 + "&rcaron;": { "codepoints": [345], "characters": "\u0159" }, 1795 + "&rcedil;": { "codepoints": [343], "characters": "\u0157" }, 1796 + "&rceil;": { "codepoints": [8969], "characters": "\u2309" }, 1797 + "&rcub;": { "codepoints": [125], "characters": "\u007D" }, 1798 + "&rcy;": { "codepoints": [1088], "characters": "\u0440" }, 1799 + "&rdca;": { "codepoints": [10551], "characters": 
"\u2937" }, 1800 + "&rdldhar;": { "codepoints": [10601], "characters": "\u2969" }, 1801 + "&rdquo;": { "codepoints": [8221], "characters": "\u201D" }, 1802 + "&rdquor;": { "codepoints": [8221], "characters": "\u201D" }, 1803 + "&rdsh;": { "codepoints": [8627], "characters": "\u21B3" }, 1804 + "&real;": { "codepoints": [8476], "characters": "\u211C" }, 1805 + "&realine;": { "codepoints": [8475], "characters": "\u211B" }, 1806 + "&realpart;": { "codepoints": [8476], "characters": "\u211C" }, 1807 + "&reals;": { "codepoints": [8477], "characters": "\u211D" }, 1808 + "&rect;": { "codepoints": [9645], "characters": "\u25AD" }, 1809 + "&reg": { "codepoints": [174], "characters": "\u00AE" }, 1810 + "&reg;": { "codepoints": [174], "characters": "\u00AE" }, 1811 + "&rfisht;": { "codepoints": [10621], "characters": "\u297D" }, 1812 + "&rfloor;": { "codepoints": [8971], "characters": "\u230B" }, 1813 + "&rfr;": { "codepoints": [120111], "characters": "\uD835\uDD2F" }, 1814 + "&rhard;": { "codepoints": [8641], "characters": "\u21C1" }, 1815 + "&rharu;": { "codepoints": [8640], "characters": "\u21C0" }, 1816 + "&rharul;": { "codepoints": [10604], "characters": "\u296C" }, 1817 + "&rho;": { "codepoints": [961], "characters": "\u03C1" }, 1818 + "&rhov;": { "codepoints": [1009], "characters": "\u03F1" }, 1819 + "&rightarrow;": { "codepoints": [8594], "characters": "\u2192" }, 1820 + "&rightarrowtail;": { "codepoints": [8611], "characters": "\u21A3" }, 1821 + "&rightharpoondown;": { "codepoints": [8641], "characters": "\u21C1" }, 1822 + "&rightharpoonup;": { "codepoints": [8640], "characters": "\u21C0" }, 1823 + "&rightleftarrows;": { "codepoints": [8644], "characters": "\u21C4" }, 1824 + "&rightleftharpoons;": { "codepoints": [8652], "characters": "\u21CC" }, 1825 + "&rightrightarrows;": { "codepoints": [8649], "characters": "\u21C9" }, 1826 + "&rightsquigarrow;": { "codepoints": [8605], "characters": "\u219D" }, 1827 + "&rightthreetimes;": { "codepoints": [8908], "characters": 
"\u22CC" }, 1828 + "&ring;": { "codepoints": [730], "characters": "\u02DA" }, 1829 + "&risingdotseq;": { "codepoints": [8787], "characters": "\u2253" }, 1830 + "&rlarr;": { "codepoints": [8644], "characters": "\u21C4" }, 1831 + "&rlhar;": { "codepoints": [8652], "characters": "\u21CC" }, 1832 + "&rlm;": { "codepoints": [8207], "characters": "\u200F" }, 1833 + "&rmoust;": { "codepoints": [9137], "characters": "\u23B1" }, 1834 + "&rmoustache;": { "codepoints": [9137], "characters": "\u23B1" }, 1835 + "&rnmid;": { "codepoints": [10990], "characters": "\u2AEE" }, 1836 + "&roang;": { "codepoints": [10221], "characters": "\u27ED" }, 1837 + "&roarr;": { "codepoints": [8702], "characters": "\u21FE" }, 1838 + "&robrk;": { "codepoints": [10215], "characters": "\u27E7" }, 1839 + "&ropar;": { "codepoints": [10630], "characters": "\u2986" }, 1840 + "&ropf;": { "codepoints": [120163], "characters": "\uD835\uDD63" }, 1841 + "&roplus;": { "codepoints": [10798], "characters": "\u2A2E" }, 1842 + "&rotimes;": { "codepoints": [10805], "characters": "\u2A35" }, 1843 + "&rpar;": { "codepoints": [41], "characters": "\u0029" }, 1844 + "&rpargt;": { "codepoints": [10644], "characters": "\u2994" }, 1845 + "&rppolint;": { "codepoints": [10770], "characters": "\u2A12" }, 1846 + "&rrarr;": { "codepoints": [8649], "characters": "\u21C9" }, 1847 + "&rsaquo;": { "codepoints": [8250], "characters": "\u203A" }, 1848 + "&rscr;": { "codepoints": [120007], "characters": "\uD835\uDCC7" }, 1849 + "&rsh;": { "codepoints": [8625], "characters": "\u21B1" }, 1850 + "&rsqb;": { "codepoints": [93], "characters": "\u005D" }, 1851 + "&rsquo;": { "codepoints": [8217], "characters": "\u2019" }, 1852 + "&rsquor;": { "codepoints": [8217], "characters": "\u2019" }, 1853 + "&rthree;": { "codepoints": [8908], "characters": "\u22CC" }, 1854 + "&rtimes;": { "codepoints": [8906], "characters": "\u22CA" }, 1855 + "&rtri;": { "codepoints": [9657], "characters": "\u25B9" }, 1856 + "&rtrie;": { "codepoints": [8885], 
"characters": "\u22B5" }, 1857 + "&rtrif;": { "codepoints": [9656], "characters": "\u25B8" }, 1858 + "&rtriltri;": { "codepoints": [10702], "characters": "\u29CE" }, 1859 + "&ruluhar;": { "codepoints": [10600], "characters": "\u2968" }, 1860 + "&rx;": { "codepoints": [8478], "characters": "\u211E" }, 1861 + "&sacute;": { "codepoints": [347], "characters": "\u015B" }, 1862 + "&sbquo;": { "codepoints": [8218], "characters": "\u201A" }, 1863 + "&sc;": { "codepoints": [8827], "characters": "\u227B" }, 1864 + "&scE;": { "codepoints": [10932], "characters": "\u2AB4" }, 1865 + "&scap;": { "codepoints": [10936], "characters": "\u2AB8" }, 1866 + "&scaron;": { "codepoints": [353], "characters": "\u0161" }, 1867 + "&sccue;": { "codepoints": [8829], "characters": "\u227D" }, 1868 + "&sce;": { "codepoints": [10928], "characters": "\u2AB0" }, 1869 + "&scedil;": { "codepoints": [351], "characters": "\u015F" }, 1870 + "&scirc;": { "codepoints": [349], "characters": "\u015D" }, 1871 + "&scnE;": { "codepoints": [10934], "characters": "\u2AB6" }, 1872 + "&scnap;": { "codepoints": [10938], "characters": "\u2ABA" }, 1873 + "&scnsim;": { "codepoints": [8937], "characters": "\u22E9" }, 1874 + "&scpolint;": { "codepoints": [10771], "characters": "\u2A13" }, 1875 + "&scsim;": { "codepoints": [8831], "characters": "\u227F" }, 1876 + "&scy;": { "codepoints": [1089], "characters": "\u0441" }, 1877 + "&sdot;": { "codepoints": [8901], "characters": "\u22C5" }, 1878 + "&sdotb;": { "codepoints": [8865], "characters": "\u22A1" }, 1879 + "&sdote;": { "codepoints": [10854], "characters": "\u2A66" }, 1880 + "&seArr;": { "codepoints": [8664], "characters": "\u21D8" }, 1881 + "&searhk;": { "codepoints": [10533], "characters": "\u2925" }, 1882 + "&searr;": { "codepoints": [8600], "characters": "\u2198" }, 1883 + "&searrow;": { "codepoints": [8600], "characters": "\u2198" }, 1884 + "&sect": { "codepoints": [167], "characters": "\u00A7" }, 1885 + "&sect;": { "codepoints": [167], "characters": "\u00A7" }, 
1886 + "&semi;": { "codepoints": [59], "characters": "\u003B" }, 1887 + "&seswar;": { "codepoints": [10537], "characters": "\u2929" }, 1888 + "&setminus;": { "codepoints": [8726], "characters": "\u2216" }, 1889 + "&setmn;": { "codepoints": [8726], "characters": "\u2216" }, 1890 + "&sext;": { "codepoints": [10038], "characters": "\u2736" }, 1891 + "&sfr;": { "codepoints": [120112], "characters": "\uD835\uDD30" }, 1892 + "&sfrown;": { "codepoints": [8994], "characters": "\u2322" }, 1893 + "&sharp;": { "codepoints": [9839], "characters": "\u266F" }, 1894 + "&shchcy;": { "codepoints": [1097], "characters": "\u0449" }, 1895 + "&shcy;": { "codepoints": [1096], "characters": "\u0448" }, 1896 + "&shortmid;": { "codepoints": [8739], "characters": "\u2223" }, 1897 + "&shortparallel;": { "codepoints": [8741], "characters": "\u2225" }, 1898 + "&shy": { "codepoints": [173], "characters": "\u00AD" }, 1899 + "&shy;": { "codepoints": [173], "characters": "\u00AD" }, 1900 + "&sigma;": { "codepoints": [963], "characters": "\u03C3" }, 1901 + "&sigmaf;": { "codepoints": [962], "characters": "\u03C2" }, 1902 + "&sigmav;": { "codepoints": [962], "characters": "\u03C2" }, 1903 + "&sim;": { "codepoints": [8764], "characters": "\u223C" }, 1904 + "&simdot;": { "codepoints": [10858], "characters": "\u2A6A" }, 1905 + "&sime;": { "codepoints": [8771], "characters": "\u2243" }, 1906 + "&simeq;": { "codepoints": [8771], "characters": "\u2243" }, 1907 + "&simg;": { "codepoints": [10910], "characters": "\u2A9E" }, 1908 + "&simgE;": { "codepoints": [10912], "characters": "\u2AA0" }, 1909 + "&siml;": { "codepoints": [10909], "characters": "\u2A9D" }, 1910 + "&simlE;": { "codepoints": [10911], "characters": "\u2A9F" }, 1911 + "&simne;": { "codepoints": [8774], "characters": "\u2246" }, 1912 + "&simplus;": { "codepoints": [10788], "characters": "\u2A24" }, 1913 + "&simrarr;": { "codepoints": [10610], "characters": "\u2972" }, 1914 + "&slarr;": { "codepoints": [8592], "characters": "\u2190" }, 1915 + 
"&smallsetminus;": { "codepoints": [8726], "characters": "\u2216" }, 1916 + "&smashp;": { "codepoints": [10803], "characters": "\u2A33" }, 1917 + "&smeparsl;": { "codepoints": [10724], "characters": "\u29E4" }, 1918 + "&smid;": { "codepoints": [8739], "characters": "\u2223" }, 1919 + "&smile;": { "codepoints": [8995], "characters": "\u2323" }, 1920 + "&smt;": { "codepoints": [10922], "characters": "\u2AAA" }, 1921 + "&smte;": { "codepoints": [10924], "characters": "\u2AAC" }, 1922 + "&smtes;": { "codepoints": [10924, 65024], "characters": "\u2AAC\uFE00" }, 1923 + "&softcy;": { "codepoints": [1100], "characters": "\u044C" }, 1924 + "&sol;": { "codepoints": [47], "characters": "\u002F" }, 1925 + "&solb;": { "codepoints": [10692], "characters": "\u29C4" }, 1926 + "&solbar;": { "codepoints": [9023], "characters": "\u233F" }, 1927 + "&sopf;": { "codepoints": [120164], "characters": "\uD835\uDD64" }, 1928 + "&spades;": { "codepoints": [9824], "characters": "\u2660" }, 1929 + "&spadesuit;": { "codepoints": [9824], "characters": "\u2660" }, 1930 + "&spar;": { "codepoints": [8741], "characters": "\u2225" }, 1931 + "&sqcap;": { "codepoints": [8851], "characters": "\u2293" }, 1932 + "&sqcaps;": { "codepoints": [8851, 65024], "characters": "\u2293\uFE00" }, 1933 + "&sqcup;": { "codepoints": [8852], "characters": "\u2294" }, 1934 + "&sqcups;": { "codepoints": [8852, 65024], "characters": "\u2294\uFE00" }, 1935 + "&sqsub;": { "codepoints": [8847], "characters": "\u228F" }, 1936 + "&sqsube;": { "codepoints": [8849], "characters": "\u2291" }, 1937 + "&sqsubset;": { "codepoints": [8847], "characters": "\u228F" }, 1938 + "&sqsubseteq;": { "codepoints": [8849], "characters": "\u2291" }, 1939 + "&sqsup;": { "codepoints": [8848], "characters": "\u2290" }, 1940 + "&sqsupe;": { "codepoints": [8850], "characters": "\u2292" }, 1941 + "&sqsupset;": { "codepoints": [8848], "characters": "\u2290" }, 1942 + "&sqsupseteq;": { "codepoints": [8850], "characters": "\u2292" }, 1943 + "&squ;": { 
"codepoints": [9633], "characters": "\u25A1" }, 1944 + "&square;": { "codepoints": [9633], "characters": "\u25A1" }, 1945 + "&squarf;": { "codepoints": [9642], "characters": "\u25AA" }, 1946 + "&squf;": { "codepoints": [9642], "characters": "\u25AA" }, 1947 + "&srarr;": { "codepoints": [8594], "characters": "\u2192" }, 1948 + "&sscr;": { "codepoints": [120008], "characters": "\uD835\uDCC8" }, 1949 + "&ssetmn;": { "codepoints": [8726], "characters": "\u2216" }, 1950 + "&ssmile;": { "codepoints": [8995], "characters": "\u2323" }, 1951 + "&sstarf;": { "codepoints": [8902], "characters": "\u22C6" }, 1952 + "&star;": { "codepoints": [9734], "characters": "\u2606" }, 1953 + "&starf;": { "codepoints": [9733], "characters": "\u2605" }, 1954 + "&straightepsilon;": { "codepoints": [1013], "characters": "\u03F5" }, 1955 + "&straightphi;": { "codepoints": [981], "characters": "\u03D5" }, 1956 + "&strns;": { "codepoints": [175], "characters": "\u00AF" }, 1957 + "&sub;": { "codepoints": [8834], "characters": "\u2282" }, 1958 + "&subE;": { "codepoints": [10949], "characters": "\u2AC5" }, 1959 + "&subdot;": { "codepoints": [10941], "characters": "\u2ABD" }, 1960 + "&sube;": { "codepoints": [8838], "characters": "\u2286" }, 1961 + "&subedot;": { "codepoints": [10947], "characters": "\u2AC3" }, 1962 + "&submult;": { "codepoints": [10945], "characters": "\u2AC1" }, 1963 + "&subnE;": { "codepoints": [10955], "characters": "\u2ACB" }, 1964 + "&subne;": { "codepoints": [8842], "characters": "\u228A" }, 1965 + "&subplus;": { "codepoints": [10943], "characters": "\u2ABF" }, 1966 + "&subrarr;": { "codepoints": [10617], "characters": "\u2979" }, 1967 + "&subset;": { "codepoints": [8834], "characters": "\u2282" }, 1968 + "&subseteq;": { "codepoints": [8838], "characters": "\u2286" }, 1969 + "&subseteqq;": { "codepoints": [10949], "characters": "\u2AC5" }, 1970 + "&subsetneq;": { "codepoints": [8842], "characters": "\u228A" }, 1971 + "&subsetneqq;": { "codepoints": [10955], "characters": 
"\u2ACB" }, 1972 + "&subsim;": { "codepoints": [10951], "characters": "\u2AC7" }, 1973 + "&subsub;": { "codepoints": [10965], "characters": "\u2AD5" }, 1974 + "&subsup;": { "codepoints": [10963], "characters": "\u2AD3" }, 1975 + "&succ;": { "codepoints": [8827], "characters": "\u227B" }, 1976 + "&succapprox;": { "codepoints": [10936], "characters": "\u2AB8" }, 1977 + "&succcurlyeq;": { "codepoints": [8829], "characters": "\u227D" }, 1978 + "&succeq;": { "codepoints": [10928], "characters": "\u2AB0" }, 1979 + "&succnapprox;": { "codepoints": [10938], "characters": "\u2ABA" }, 1980 + "&succneqq;": { "codepoints": [10934], "characters": "\u2AB6" }, 1981 + "&succnsim;": { "codepoints": [8937], "characters": "\u22E9" }, 1982 + "&succsim;": { "codepoints": [8831], "characters": "\u227F" }, 1983 + "&sum;": { "codepoints": [8721], "characters": "\u2211" }, 1984 + "&sung;": { "codepoints": [9834], "characters": "\u266A" }, 1985 + "&sup1": { "codepoints": [185], "characters": "\u00B9" }, 1986 + "&sup1;": { "codepoints": [185], "characters": "\u00B9" }, 1987 + "&sup2": { "codepoints": [178], "characters": "\u00B2" }, 1988 + "&sup2;": { "codepoints": [178], "characters": "\u00B2" }, 1989 + "&sup3": { "codepoints": [179], "characters": "\u00B3" }, 1990 + "&sup3;": { "codepoints": [179], "characters": "\u00B3" }, 1991 + "&sup;": { "codepoints": [8835], "characters": "\u2283" }, 1992 + "&supE;": { "codepoints": [10950], "characters": "\u2AC6" }, 1993 + "&supdot;": { "codepoints": [10942], "characters": "\u2ABE" }, 1994 + "&supdsub;": { "codepoints": [10968], "characters": "\u2AD8" }, 1995 + "&supe;": { "codepoints": [8839], "characters": "\u2287" }, 1996 + "&supedot;": { "codepoints": [10948], "characters": "\u2AC4" }, 1997 + "&suphsol;": { "codepoints": [10185], "characters": "\u27C9" }, 1998 + "&suphsub;": { "codepoints": [10967], "characters": "\u2AD7" }, 1999 + "&suplarr;": { "codepoints": [10619], "characters": "\u297B" }, 2000 + "&supmult;": { "codepoints": [10946], 
"characters": "\u2AC2" }, 2001 + "&supnE;": { "codepoints": [10956], "characters": "\u2ACC" }, 2002 + "&supne;": { "codepoints": [8843], "characters": "\u228B" }, 2003 + "&supplus;": { "codepoints": [10944], "characters": "\u2AC0" }, 2004 + "&supset;": { "codepoints": [8835], "characters": "\u2283" }, 2005 + "&supseteq;": { "codepoints": [8839], "characters": "\u2287" }, 2006 + "&supseteqq;": { "codepoints": [10950], "characters": "\u2AC6" }, 2007 + "&supsetneq;": { "codepoints": [8843], "characters": "\u228B" }, 2008 + "&supsetneqq;": { "codepoints": [10956], "characters": "\u2ACC" }, 2009 + "&supsim;": { "codepoints": [10952], "characters": "\u2AC8" }, 2010 + "&supsub;": { "codepoints": [10964], "characters": "\u2AD4" }, 2011 + "&supsup;": { "codepoints": [10966], "characters": "\u2AD6" }, 2012 + "&swArr;": { "codepoints": [8665], "characters": "\u21D9" }, 2013 + "&swarhk;": { "codepoints": [10534], "characters": "\u2926" }, 2014 + "&swarr;": { "codepoints": [8601], "characters": "\u2199" }, 2015 + "&swarrow;": { "codepoints": [8601], "characters": "\u2199" }, 2016 + "&swnwar;": { "codepoints": [10538], "characters": "\u292A" }, 2017 + "&szlig": { "codepoints": [223], "characters": "\u00DF" }, 2018 + "&szlig;": { "codepoints": [223], "characters": "\u00DF" }, 2019 + "&target;": { "codepoints": [8982], "characters": "\u2316" }, 2020 + "&tau;": { "codepoints": [964], "characters": "\u03C4" }, 2021 + "&tbrk;": { "codepoints": [9140], "characters": "\u23B4" }, 2022 + "&tcaron;": { "codepoints": [357], "characters": "\u0165" }, 2023 + "&tcedil;": { "codepoints": [355], "characters": "\u0163" }, 2024 + "&tcy;": { "codepoints": [1090], "characters": "\u0442" }, 2025 + "&tdot;": { "codepoints": [8411], "characters": "\u20DB" }, 2026 + "&telrec;": { "codepoints": [8981], "characters": "\u2315" }, 2027 + "&tfr;": { "codepoints": [120113], "characters": "\uD835\uDD31" }, 2028 + "&there4;": { "codepoints": [8756], "characters": "\u2234" }, 2029 + "&therefore;": { 
"codepoints": [8756], "characters": "\u2234" }, 2030 + "&theta;": { "codepoints": [952], "characters": "\u03B8" }, 2031 + "&thetasym;": { "codepoints": [977], "characters": "\u03D1" }, 2032 + "&thetav;": { "codepoints": [977], "characters": "\u03D1" }, 2033 + "&thickapprox;": { "codepoints": [8776], "characters": "\u2248" }, 2034 + "&thicksim;": { "codepoints": [8764], "characters": "\u223C" }, 2035 + "&thinsp;": { "codepoints": [8201], "characters": "\u2009" }, 2036 + "&thkap;": { "codepoints": [8776], "characters": "\u2248" }, 2037 + "&thksim;": { "codepoints": [8764], "characters": "\u223C" }, 2038 + "&thorn": { "codepoints": [254], "characters": "\u00FE" }, 2039 + "&thorn;": { "codepoints": [254], "characters": "\u00FE" }, 2040 + "&tilde;": { "codepoints": [732], "characters": "\u02DC" }, 2041 + "&times": { "codepoints": [215], "characters": "\u00D7" }, 2042 + "&times;": { "codepoints": [215], "characters": "\u00D7" }, 2043 + "&timesb;": { "codepoints": [8864], "characters": "\u22A0" }, 2044 + "&timesbar;": { "codepoints": [10801], "characters": "\u2A31" }, 2045 + "&timesd;": { "codepoints": [10800], "characters": "\u2A30" }, 2046 + "&tint;": { "codepoints": [8749], "characters": "\u222D" }, 2047 + "&toea;": { "codepoints": [10536], "characters": "\u2928" }, 2048 + "&top;": { "codepoints": [8868], "characters": "\u22A4" }, 2049 + "&topbot;": { "codepoints": [9014], "characters": "\u2336" }, 2050 + "&topcir;": { "codepoints": [10993], "characters": "\u2AF1" }, 2051 + "&topf;": { "codepoints": [120165], "characters": "\uD835\uDD65" }, 2052 + "&topfork;": { "codepoints": [10970], "characters": "\u2ADA" }, 2053 + "&tosa;": { "codepoints": [10537], "characters": "\u2929" }, 2054 + "&tprime;": { "codepoints": [8244], "characters": "\u2034" }, 2055 + "&trade;": { "codepoints": [8482], "characters": "\u2122" }, 2056 + "&triangle;": { "codepoints": [9653], "characters": "\u25B5" }, 2057 + "&triangledown;": { "codepoints": [9663], "characters": "\u25BF" }, 2058 + 
"&triangleleft;": { "codepoints": [9667], "characters": "\u25C3" }, 2059 + "&trianglelefteq;": { "codepoints": [8884], "characters": "\u22B4" }, 2060 + "&triangleq;": { "codepoints": [8796], "characters": "\u225C" }, 2061 + "&triangleright;": { "codepoints": [9657], "characters": "\u25B9" }, 2062 + "&trianglerighteq;": { "codepoints": [8885], "characters": "\u22B5" }, 2063 + "&tridot;": { "codepoints": [9708], "characters": "\u25EC" }, 2064 + "&trie;": { "codepoints": [8796], "characters": "\u225C" }, 2065 + "&triminus;": { "codepoints": [10810], "characters": "\u2A3A" }, 2066 + "&triplus;": { "codepoints": [10809], "characters": "\u2A39" }, 2067 + "&trisb;": { "codepoints": [10701], "characters": "\u29CD" }, 2068 + "&tritime;": { "codepoints": [10811], "characters": "\u2A3B" }, 2069 + "&trpezium;": { "codepoints": [9186], "characters": "\u23E2" }, 2070 + "&tscr;": { "codepoints": [120009], "characters": "\uD835\uDCC9" }, 2071 + "&tscy;": { "codepoints": [1094], "characters": "\u0446" }, 2072 + "&tshcy;": { "codepoints": [1115], "characters": "\u045B" }, 2073 + "&tstrok;": { "codepoints": [359], "characters": "\u0167" }, 2074 + "&twixt;": { "codepoints": [8812], "characters": "\u226C" }, 2075 + "&twoheadleftarrow;": { "codepoints": [8606], "characters": "\u219E" }, 2076 + "&twoheadrightarrow;": { "codepoints": [8608], "characters": "\u21A0" }, 2077 + "&uArr;": { "codepoints": [8657], "characters": "\u21D1" }, 2078 + "&uHar;": { "codepoints": [10595], "characters": "\u2963" }, 2079 + "&uacute": { "codepoints": [250], "characters": "\u00FA" }, 2080 + "&uacute;": { "codepoints": [250], "characters": "\u00FA" }, 2081 + "&uarr;": { "codepoints": [8593], "characters": "\u2191" }, 2082 + "&ubrcy;": { "codepoints": [1118], "characters": "\u045E" }, 2083 + "&ubreve;": { "codepoints": [365], "characters": "\u016D" }, 2084 + "&ucirc": { "codepoints": [251], "characters": "\u00FB" }, 2085 + "&ucirc;": { "codepoints": [251], "characters": "\u00FB" }, 2086 + "&ucy;": { 
"codepoints": [1091], "characters": "\u0443" }, 2087 + "&udarr;": { "codepoints": [8645], "characters": "\u21C5" }, 2088 + "&udblac;": { "codepoints": [369], "characters": "\u0171" }, 2089 + "&udhar;": { "codepoints": [10606], "characters": "\u296E" }, 2090 + "&ufisht;": { "codepoints": [10622], "characters": "\u297E" }, 2091 + "&ufr;": { "codepoints": [120114], "characters": "\uD835\uDD32" }, 2092 + "&ugrave": { "codepoints": [249], "characters": "\u00F9" }, 2093 + "&ugrave;": { "codepoints": [249], "characters": "\u00F9" }, 2094 + "&uharl;": { "codepoints": [8639], "characters": "\u21BF" }, 2095 + "&uharr;": { "codepoints": [8638], "characters": "\u21BE" }, 2096 + "&uhblk;": { "codepoints": [9600], "characters": "\u2580" }, 2097 + "&ulcorn;": { "codepoints": [8988], "characters": "\u231C" }, 2098 + "&ulcorner;": { "codepoints": [8988], "characters": "\u231C" }, 2099 + "&ulcrop;": { "codepoints": [8975], "characters": "\u230F" }, 2100 + "&ultri;": { "codepoints": [9720], "characters": "\u25F8" }, 2101 + "&umacr;": { "codepoints": [363], "characters": "\u016B" }, 2102 + "&uml": { "codepoints": [168], "characters": "\u00A8" }, 2103 + "&uml;": { "codepoints": [168], "characters": "\u00A8" }, 2104 + "&uogon;": { "codepoints": [371], "characters": "\u0173" }, 2105 + "&uopf;": { "codepoints": [120166], "characters": "\uD835\uDD66" }, 2106 + "&uparrow;": { "codepoints": [8593], "characters": "\u2191" }, 2107 + "&updownarrow;": { "codepoints": [8597], "characters": "\u2195" }, 2108 + "&upharpoonleft;": { "codepoints": [8639], "characters": "\u21BF" }, 2109 + "&upharpoonright;": { "codepoints": [8638], "characters": "\u21BE" }, 2110 + "&uplus;": { "codepoints": [8846], "characters": "\u228E" }, 2111 + "&upsi;": { "codepoints": [965], "characters": "\u03C5" }, 2112 + "&upsih;": { "codepoints": [978], "characters": "\u03D2" }, 2113 + "&upsilon;": { "codepoints": [965], "characters": "\u03C5" }, 2114 + "&upuparrows;": { "codepoints": [8648], "characters": "\u21C8" }, 2115 + 
"&urcorn;": { "codepoints": [8989], "characters": "\u231D" }, 2116 + "&urcorner;": { "codepoints": [8989], "characters": "\u231D" }, 2117 + "&urcrop;": { "codepoints": [8974], "characters": "\u230E" }, 2118 + "&uring;": { "codepoints": [367], "characters": "\u016F" }, 2119 + "&urtri;": { "codepoints": [9721], "characters": "\u25F9" }, 2120 + "&uscr;": { "codepoints": [120010], "characters": "\uD835\uDCCA" }, 2121 + "&utdot;": { "codepoints": [8944], "characters": "\u22F0" }, 2122 + "&utilde;": { "codepoints": [361], "characters": "\u0169" }, 2123 + "&utri;": { "codepoints": [9653], "characters": "\u25B5" }, 2124 + "&utrif;": { "codepoints": [9652], "characters": "\u25B4" }, 2125 + "&uuarr;": { "codepoints": [8648], "characters": "\u21C8" }, 2126 + "&uuml": { "codepoints": [252], "characters": "\u00FC" }, 2127 + "&uuml;": { "codepoints": [252], "characters": "\u00FC" }, 2128 + "&uwangle;": { "codepoints": [10663], "characters": "\u29A7" }, 2129 + "&vArr;": { "codepoints": [8661], "characters": "\u21D5" }, 2130 + "&vBar;": { "codepoints": [10984], "characters": "\u2AE8" }, 2131 + "&vBarv;": { "codepoints": [10985], "characters": "\u2AE9" }, 2132 + "&vDash;": { "codepoints": [8872], "characters": "\u22A8" }, 2133 + "&vangrt;": { "codepoints": [10652], "characters": "\u299C" }, 2134 + "&varepsilon;": { "codepoints": [1013], "characters": "\u03F5" }, 2135 + "&varkappa;": { "codepoints": [1008], "characters": "\u03F0" }, 2136 + "&varnothing;": { "codepoints": [8709], "characters": "\u2205" }, 2137 + "&varphi;": { "codepoints": [981], "characters": "\u03D5" }, 2138 + "&varpi;": { "codepoints": [982], "characters": "\u03D6" }, 2139 + "&varpropto;": { "codepoints": [8733], "characters": "\u221D" }, 2140 + "&varr;": { "codepoints": [8597], "characters": "\u2195" }, 2141 + "&varrho;": { "codepoints": [1009], "characters": "\u03F1" }, 2142 + "&varsigma;": { "codepoints": [962], "characters": "\u03C2" }, 2143 + "&varsubsetneq;": { "codepoints": [8842, 65024], "characters": 
"\u228A\uFE00" }, 2144 + "&varsubsetneqq;": { "codepoints": [10955, 65024], "characters": "\u2ACB\uFE00" }, 2145 + "&varsupsetneq;": { "codepoints": [8843, 65024], "characters": "\u228B\uFE00" }, 2146 + "&varsupsetneqq;": { "codepoints": [10956, 65024], "characters": "\u2ACC\uFE00" }, 2147 + "&vartheta;": { "codepoints": [977], "characters": "\u03D1" }, 2148 + "&vartriangleleft;": { "codepoints": [8882], "characters": "\u22B2" }, 2149 + "&vartriangleright;": { "codepoints": [8883], "characters": "\u22B3" }, 2150 + "&vcy;": { "codepoints": [1074], "characters": "\u0432" }, 2151 + "&vdash;": { "codepoints": [8866], "characters": "\u22A2" }, 2152 + "&vee;": { "codepoints": [8744], "characters": "\u2228" }, 2153 + "&veebar;": { "codepoints": [8891], "characters": "\u22BB" }, 2154 + "&veeeq;": { "codepoints": [8794], "characters": "\u225A" }, 2155 + "&vellip;": { "codepoints": [8942], "characters": "\u22EE" }, 2156 + "&verbar;": { "codepoints": [124], "characters": "\u007C" }, 2157 + "&vert;": { "codepoints": [124], "characters": "\u007C" }, 2158 + "&vfr;": { "codepoints": [120115], "characters": "\uD835\uDD33" }, 2159 + "&vltri;": { "codepoints": [8882], "characters": "\u22B2" }, 2160 + "&vnsub;": { "codepoints": [8834, 8402], "characters": "\u2282\u20D2" }, 2161 + "&vnsup;": { "codepoints": [8835, 8402], "characters": "\u2283\u20D2" }, 2162 + "&vopf;": { "codepoints": [120167], "characters": "\uD835\uDD67" }, 2163 + "&vprop;": { "codepoints": [8733], "characters": "\u221D" }, 2164 + "&vrtri;": { "codepoints": [8883], "characters": "\u22B3" }, 2165 + "&vscr;": { "codepoints": [120011], "characters": "\uD835\uDCCB" }, 2166 + "&vsubnE;": { "codepoints": [10955, 65024], "characters": "\u2ACB\uFE00" }, 2167 + "&vsubne;": { "codepoints": [8842, 65024], "characters": "\u228A\uFE00" }, 2168 + "&vsupnE;": { "codepoints": [10956, 65024], "characters": "\u2ACC\uFE00" }, 2169 + "&vsupne;": { "codepoints": [8843, 65024], "characters": "\u228B\uFE00" }, 2170 + "&vzigzag;": { 
"codepoints": [10650], "characters": "\u299A" }, 2171 + "&wcirc;": { "codepoints": [373], "characters": "\u0175" }, 2172 + "&wedbar;": { "codepoints": [10847], "characters": "\u2A5F" }, 2173 + "&wedge;": { "codepoints": [8743], "characters": "\u2227" }, 2174 + "&wedgeq;": { "codepoints": [8793], "characters": "\u2259" }, 2175 + "&weierp;": { "codepoints": [8472], "characters": "\u2118" }, 2176 + "&wfr;": { "codepoints": [120116], "characters": "\uD835\uDD34" }, 2177 + "&wopf;": { "codepoints": [120168], "characters": "\uD835\uDD68" }, 2178 + "&wp;": { "codepoints": [8472], "characters": "\u2118" }, 2179 + "&wr;": { "codepoints": [8768], "characters": "\u2240" }, 2180 + "&wreath;": { "codepoints": [8768], "characters": "\u2240" }, 2181 + "&wscr;": { "codepoints": [120012], "characters": "\uD835\uDCCC" }, 2182 + "&xcap;": { "codepoints": [8898], "characters": "\u22C2" }, 2183 + "&xcirc;": { "codepoints": [9711], "characters": "\u25EF" }, 2184 + "&xcup;": { "codepoints": [8899], "characters": "\u22C3" }, 2185 + "&xdtri;": { "codepoints": [9661], "characters": "\u25BD" }, 2186 + "&xfr;": { "codepoints": [120117], "characters": "\uD835\uDD35" }, 2187 + "&xhArr;": { "codepoints": [10234], "characters": "\u27FA" }, 2188 + "&xharr;": { "codepoints": [10231], "characters": "\u27F7" }, 2189 + "&xi;": { "codepoints": [958], "characters": "\u03BE" }, 2190 + "&xlArr;": { "codepoints": [10232], "characters": "\u27F8" }, 2191 + "&xlarr;": { "codepoints": [10229], "characters": "\u27F5" }, 2192 + "&xmap;": { "codepoints": [10236], "characters": "\u27FC" }, 2193 + "&xnis;": { "codepoints": [8955], "characters": "\u22FB" }, 2194 + "&xodot;": { "codepoints": [10752], "characters": "\u2A00" }, 2195 + "&xopf;": { "codepoints": [120169], "characters": "\uD835\uDD69" }, 2196 + "&xoplus;": { "codepoints": [10753], "characters": "\u2A01" }, 2197 + "&xotime;": { "codepoints": [10754], "characters": "\u2A02" }, 2198 + "&xrArr;": { "codepoints": [10233], "characters": "\u27F9" }, 2199 + 
"&xrarr;": { "codepoints": [10230], "characters": "\u27F6" }, 2200 + "&xscr;": { "codepoints": [120013], "characters": "\uD835\uDCCD" }, 2201 + "&xsqcup;": { "codepoints": [10758], "characters": "\u2A06" }, 2202 + "&xuplus;": { "codepoints": [10756], "characters": "\u2A04" }, 2203 + "&xutri;": { "codepoints": [9651], "characters": "\u25B3" }, 2204 + "&xvee;": { "codepoints": [8897], "characters": "\u22C1" }, 2205 + "&xwedge;": { "codepoints": [8896], "characters": "\u22C0" }, 2206 + "&yacute": { "codepoints": [253], "characters": "\u00FD" }, 2207 + "&yacute;": { "codepoints": [253], "characters": "\u00FD" }, 2208 + "&yacy;": { "codepoints": [1103], "characters": "\u044F" }, 2209 + "&ycirc;": { "codepoints": [375], "characters": "\u0177" }, 2210 + "&ycy;": { "codepoints": [1099], "characters": "\u044B" }, 2211 + "&yen": { "codepoints": [165], "characters": "\u00A5" }, 2212 + "&yen;": { "codepoints": [165], "characters": "\u00A5" }, 2213 + "&yfr;": { "codepoints": [120118], "characters": "\uD835\uDD36" }, 2214 + "&yicy;": { "codepoints": [1111], "characters": "\u0457" }, 2215 + "&yopf;": { "codepoints": [120170], "characters": "\uD835\uDD6A" }, 2216 + "&yscr;": { "codepoints": [120014], "characters": "\uD835\uDCCE" }, 2217 + "&yucy;": { "codepoints": [1102], "characters": "\u044E" }, 2218 + "&yuml": { "codepoints": [255], "characters": "\u00FF" }, 2219 + "&yuml;": { "codepoints": [255], "characters": "\u00FF" }, 2220 + "&zacute;": { "codepoints": [378], "characters": "\u017A" }, 2221 + "&zcaron;": { "codepoints": [382], "characters": "\u017E" }, 2222 + "&zcy;": { "codepoints": [1079], "characters": "\u0437" }, 2223 + "&zdot;": { "codepoints": [380], "characters": "\u017C" }, 2224 + "&zeetrf;": { "codepoints": [8488], "characters": "\u2128" }, 2225 + "&zeta;": { "codepoints": [950], "characters": "\u03B6" }, 2226 + "&zfr;": { "codepoints": [120119], "characters": "\uD835\uDD37" }, 2227 + "&zhcy;": { "codepoints": [1078], "characters": "\u0436" }, 2228 + "&zigrarr;": { 
"codepoints": [8669], "characters": "\u21DD" }, 2229 + "&zopf;": { "codepoints": [120171], "characters": "\uD835\uDD6B" }, 2230 + "&zscr;": { "codepoints": [120015], "characters": "\uD835\uDCCF" }, 2231 + "&zwj;": { "codepoints": [8205], "characters": "\u200D" }, 2232 + "&zwnj;": { "codepoints": [8204], "characters": "\u200C" } 2233 + }
+21
dune-project
···
(lang dune 3.0)
(name html5rw)
(version 0.1.0)

; The html5rw.opam file is generated from the metadata in this file.
(generate_opam_files true)

; TODO: "username" is a placeholder; set it to the real GitHub owner before
; publishing, otherwise the generated opam file points at a non-existent repo.
(source (github username/html5rw))
(license MIT)
; Author and maintainer are taken from the copyright notice in LICENSE.md
; (previously the template placeholders "Author" / "author@example.com").
(authors "Anil Madhavapeddy <anil@recoil.org>")
(maintainers "Anil Madhavapeddy <anil@recoil.org>")

(package
 (name html5rw)
 (synopsis "Pure OCaml HTML5 parser implementing the WHATWG specification")
 (description "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries.")
 (depends
  (ocaml (>= 4.14.0))
  (bytesrw (>= 0.3.0))
  (uutf (>= 1.0.0))
  (re (>= 1.10.0))
  ; yojson is only needed at build time.
  (yojson (and :build (>= 2.0.0)))))
+32
examples/basic_parsing.ml
···
open Bytesrw

(* Basic HTML parsing example.

   Parses a small document from a string, then prints the name of the root
   node, the re-serialised markup, and the extracted text. *)

let html = {|
<!DOCTYPE html>
<html>
<head>
  <title>Hello World</title>
</head>
<body>
  <h1>Welcome</h1>
  <p>This is a <strong>simple</strong> example.</p>
</body>
</html>
|}

let () =
  (* The parser consumes a Bytesrw reader, so wrap the string in one. *)
  let parsed = Html5rw.parse (Bytes.Reader.of_string html) in

  (* Root document node of the parse result. *)
  let root_node = Html5rw.root parsed in
  Printf.printf "Root node: %s\n" root_node.Html5rw.Dom.name;

  (* Serialise the tree back to markup. *)
  Printf.printf "\nParsed and serialized:\n%s\n" (Html5rw.to_string parsed);

  (* Text content only. *)
  Printf.printf "\nText content: %s\n" (Html5rw.to_text parsed)
+122
examples/css_selectors.ml
···
open Bytesrw

(* CSS selector query example.

   Parses a small product listing and runs a series of selector queries
   against it, printing what each query finds. *)

let html = {|
<!DOCTYPE html>
<html>
<head><title>Products</title></head>
<body>
  <div class="container">
    <h1 id="title">Product List</h1>
    <ul class="products">
      <li class="product" data-id="1">
        <span class="name">Widget A</span>
        <span class="price">$10.00</span>
      </li>
      <li class="product" data-id="2">
        <span class="name">Widget B</span>
        <span class="price">$15.00</span>
      </li>
      <li class="product featured" data-id="3">
        <span class="name">Widget C</span>
        <span class="price">$20.00</span>
      </li>
    </ul>
  </div>
</body>
</html>
|}

(* [parse_price text] reads a price such as "$10.00" and returns its numeric
   value, or [None] when [text] is empty or is not a number. Surrounding
   whitespace is ignored and a single leading '$' is optional.

   This never raises: the previous inline [String.sub] + [float_of_string]
   raised [Invalid_argument] on an empty string and [Failure] on anything
   that was not a number. *)
let parse_price text =
  let trimmed = String.trim text in
  let len = String.length trimmed in
  let number =
    if len > 0 && trimmed.[0] = '$' then String.sub trimmed 1 (len - 1)
    else trimmed
  in
  float_of_string_opt number

let () =
  let result = Html5rw.parse (Bytes.Reader.of_string html) in

  (* Find element by ID *)
  Printf.printf "=== ID Selector (#title) ===\n";
  let titles = Html5rw.query result "#title" in
  List.iter (fun node ->
    Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
  ) titles;

  (* Find elements by class *)
  Printf.printf "\n=== Class Selector (.product) ===\n";
  let products = Html5rw.query result ".product" in
  Printf.printf "Found %d products\n" (List.length products);

  (* Find elements by tag *)
  Printf.printf "\n=== Tag Selector (span) ===\n";
  let spans = Html5rw.query result "span" in
  Printf.printf "Found %d span elements\n" (List.length spans);

  (* Find with attribute presence *)
  Printf.printf "\n=== Attribute Presence ([data-id]) ===\n";
  let with_data_id = Html5rw.query result "[data-id]" in
  List.iter (fun node ->
    match Html5rw.get_attr node "data-id" with
    | Some id -> Printf.printf "Found element with data-id=%s\n" id
    | None -> ()
  ) with_data_id;

  (* Find with attribute value *)
  Printf.printf "\n=== Attribute Value ([data-id=\"3\"]) ===\n";
  let featured = Html5rw.query result "[data-id=\"3\"]" in
  List.iter (fun node ->
    Printf.printf "Found: %s\n" (Html5rw.get_text_content node)
  ) featured;

  (* Find with multiple classes: the selector must name both classes so that
     the query matches the heading printed above it. *)
  Printf.printf "\n=== Multiple Classes (.product.featured) ===\n";
  let featured_products = Html5rw.query result ".product.featured" in
  List.iter (fun node ->
    Printf.printf "Featured: %s\n" (Html5rw.get_text_content node)
  ) featured_products;

  (* Check if a node matches a selector *)
  Printf.printf "\n=== Match Check (.featured) ===\n";
  List.iter (fun node ->
    if Html5rw.matches node ".featured" then
      Printf.printf "This product is featured!\n"
  ) products;

  (* Pseudo-class: first-child *)
  Printf.printf "\n=== Pseudo-class (:first-child) ===\n";
  let first = Html5rw.query result "li:first-child" in
  List.iter (fun node ->
    Printf.printf "First li: %s\n" (String.trim (Html5rw.get_text_content node))
  ) first;

  (* Pseudo-class: last-child *)
  Printf.printf "\n=== Pseudo-class (:last-child) ===\n";
  let last = Html5rw.query result "li:last-child" in
  List.iter (fun node ->
    Printf.printf "Last li: %s\n" (String.trim (Html5rw.get_text_content node))
  ) last;

  (* Universal selector *)
  Printf.printf "\n=== Universal Selector (*) ===\n";
  let all = Html5rw.query result "*" in
  Printf.printf "Total elements: %d\n" (List.length all);

  (* Combining queries: find products then filter *)
  Printf.printf "\n=== Combined: Products with price > $15 ===\n";
  List.iter (fun product ->
    (* Walk the product's subtree once and reuse it for both lookups. *)
    let inside = Html5rw.descendants product in
    let matching selector =
      List.filter (fun node -> Html5rw.matches node selector) inside
    in
    List.iter (fun price_span ->
      let price_text = Html5rw.get_text_content price_span in
      match parse_price price_text with
      | Some price when price > 15.0 ->
        (match matching ".name" with
         | name :: _ ->
           Printf.printf " %s: %s\n" (Html5rw.get_text_content name) price_text
         | [] -> ())
      | Some _ | None ->
        (* Not above the threshold, or not a readable price: skip it. *)
        ()
    ) (matching ".price")
  ) products
+57
examples/dom_manipulation.ml
···
open Bytesrw

(* DOM manipulation example.

   Locates the #content element, appends new nodes to it, inspects
   attributes, clones it and lists its descendants, printing the markup
   after each change. *)

let html = {|
<!DOCTYPE html>
<html>
<head><title>DOM Example</title></head>
<body>
  <div id="content">
    <p>Original content</p>
  </div>
</body>
</html>
|}

(* Print [label] followed by the serialised markup of [node]. *)
let show label node =
  Printf.printf "%s:\n%s\n\n" label (Html5rw.Dom.to_html node)

let () =
  let parsed = Html5rw.parse (Bytes.Reader.of_string html) in
  match Html5rw.query parsed "#content" with
  | [] ->
    Printf.printf "Content div not found\n"
  | content_div :: _ ->
    show "Original" content_div;

    (* Build <p class="dynamic">...</p> and attach it to the div. *)
    let paragraph = Html5rw.create_element "p" () in
    let paragraph_text =
      Html5rw.create_text "This paragraph was added programmatically!"
    in
    Html5rw.append_child paragraph paragraph_text;
    Html5rw.set_attr paragraph "class" "dynamic";
    Html5rw.append_child content_div paragraph;
    show "After adding element" content_div;

    (* Attributes can also be supplied when the element is created. *)
    let link_attrs = [("href", "https://example.com"); ("target", "_blank")] in
    let link = Html5rw.create_element "a" ~attrs:link_attrs () in
    Html5rw.append_child link (Html5rw.create_text "Click here");
    Html5rw.append_child content_div link;
    show "After adding link" content_div;

    (* Attribute presence and value. *)
    Printf.printf "Link has href: %b\n" (Html5rw.has_attr link "href");
    let href =
      match Html5rw.get_attr link "href" with
      | Some value -> value
      | None -> "(none)"
    in
    Printf.printf "Link href value: %s\n" href;

    (* Deep clone of the div. *)
    let copy = Html5rw.clone ~deep:true content_div in
    let copy_child_count = List.length copy.Html5rw.Dom.children in
    Printf.printf "\nCloned node children: %d\n" copy_child_count;

    (* Every node below the div. *)
    let below = Html5rw.descendants content_div in
    Printf.printf "Total descendants: %d\n" (List.length below)
+31
examples/dune
···
; Example programs for html5rw, one executable per source file.
;
; They are declared in a single `executables` stanza on purpose: separate
; `executable` stanzas without a `modules` field each claim every module in
; this directory, and dune rejects a module that belongs to several stanzas.
(executables
 (names
  basic_parsing
  css_selectors
  dom_manipulation
  text_extraction
  error_handling
  fragment_parsing
  encoding_detection
  web_scraper)
 (libraries bytesrw html5rw))
+43
examples/encoding_detection.ml
···
open Bytesrw

(* Encoding detection example.

   Parses byte buffers and prints the encoding reported for each, then calls
   the lower-level BOM sniffing and meta prescan functions directly. *)

(* Name of [enc] as a printable string. *)
let encoding_name enc = Html5rw.Encoding.encoding_to_string enc

(* Parse [bytes], then print the reported encoding and the document text. *)
let parse_and_report bytes =
  let parsed = Html5rw.parse_bytes bytes in
  (match Html5rw.encoding parsed with
   | None -> Printf.printf "No encoding detected\n"
   | Some enc -> Printf.printf "Detected encoding: %s\n" (encoding_name enc));
  Printf.printf "Text: %s\n\n" (Html5rw.to_text parsed)

let () =
  Printf.printf "=== Encoding Detection ===\n\n";

  (* Input that starts with a UTF-8 byte order mark. *)
  parse_and_report
    (Bytes.of_string "\xEF\xBB\xBF<html><body>UTF-8 with BOM</body></html>");

  (* Input that declares its charset in a <meta> element. *)
  parse_and_report (Bytes.of_string {|
<html>
<head><meta charset="utf-8"></head>
<body>Encoding from meta tag</body>
</html>
|});

  (* Using low-level encoding functions *)
  Printf.printf "=== Low-level Encoding API ===\n\n";

  let with_bom = Bytes.of_string "\xEF\xBB\xBFHello" in
  (match Html5rw.Encoding.sniff_bom with_bom with
   | None ->
     Printf.printf "No BOM detected\n"
   | Some (enc, offset) ->
     Printf.printf "BOM sniffing result: %s (skip %d bytes)\n"
       (encoding_name enc) offset);

  let with_meta =
    Bytes.of_string
      {|<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">|}
  in
  (match Html5rw.Encoding.prescan_for_meta_charset with_meta with
   | None -> Printf.printf "No charset in prescan\n"
   | Some enc -> Printf.printf "Prescan found: %s\n" (encoding_name enc))
+52
examples/error_handling.ml
···
open Bytesrw

(* Example: parsing malformed HTML, listing the collected parse errors and
   inspecting the tree the parser produced anyway. *)

let malformed_html = {|
<html>
<head>
  <title>Unclosed title
  <meta charset="utf-8">
</head>
<body>
  <div>
    <p>Unclosed paragraph
    <p>Another paragraph (implicitly closes the previous one)
    <span><div>Misnested tags</span></div>
  </div>
  <table>
    <tr><td>Cell 1<td>Cell 2</td>
  </table>
  <!-- Unclosed comment
</body>
</html>
|}

let () =
  print_string "=== Parsing Malformed HTML ===\n\n";

  (* Ask the parser to keep the errors it encounters *)
  let reader = Bytes.Reader.of_string malformed_html in
  let result = Html5rw.parse ~collect_errors:true reader in

  (* Report every collected error with its position and code *)
  let errs = Html5rw.errors result in
  Printf.printf "Parse errors: %d\n\n" (List.length errs);
  let report err =
    let line = Html5rw.error_line err in
    let column = Html5rw.error_column err in
    let code = Html5rw.error_code err in
    Printf.printf " Line %d, Col %d: %s\n" line column code
  in
  List.iter report errs;

  (* Serialise the tree that came out of the parse *)
  print_string "\n=== Recovered DOM Tree ===\n";
  Printf.printf "%s\n" (Html5rw.to_string ~pretty:true ~indent_size:2 result);

  (* Run selector queries against that tree *)
  print_string "\n=== Query Results ===\n";
  let count selector = List.length (Html5rw.query result selector) in
  Printf.printf "Found %d paragraphs\n" (count "p");
  Printf.printf "Found %d table cells\n" (count "td")
+11
examples/fragment_parsing.ml
···
open Bytesrw

(* Example: parsing an HTML fragment with a "ul" fragment context instead of
   parsing a whole document. *)

let fragment_source = "<li>Item 1</li><li>Item 2</li>"

let () =
  (* The fragment context names the element the fragment is parsed inside *)
  let context = Html5rw.make_fragment_context ~tag_name:"ul" () in
  let result =
    Html5rw.parse ~fragment_context:context
      (Bytes.Reader.of_string fragment_source)
  in
  let serialized = Html5rw.to_string result in
  Printf.printf "Fragment parsing result:\n%s\n" serialized
+69
examples/text_extraction.ml
···
open Bytesrw

(* Text extraction example: prints the text of a whole parsed document, then
   the text of selected elements. *)

let html = {|
<!DOCTYPE html>
<html>
<head>
  <title>Article</title>
  <style>body { font-family: sans-serif; }</style>
  <script>console.log("Hello");</script>
</head>
<body>
  <article>
    <h1>The Great HTML5 Parser</h1>
    <p class="intro">
      This is the <em>introduction</em> to an article about
      <strong>HTML parsing</strong> in OCaml.
    </p>
    <p class="content">
      The parser follows the WHATWG specification and handles
      all kinds of malformed HTML gracefully.
    </p>
    <ul>
      <li>Feature 1: Fast parsing</li>
      <li>Feature 2: CSS selectors</li>
      <li>Feature 3: Encoding detection</li>
    </ul>
  </article>
  <footer>
    <p>Copyright 2024</p>
  </footer>
</body>
</html>
|}

let () =
  let doc = Html5rw.parse (Bytes.Reader.of_string html) in
  let text_of = Html5rw.get_text_content in

  (* Whole-document text with the default separator *)
  print_string "=== All Text (default) ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text doc);

  (* Whole-document text with a newline passed as the separator *)
  print_string "=== Text with Newline Separator ===\n";
  Printf.printf "%s\n\n" (Html5rw.to_text ~separator:"\n" doc);

  (* Text of each <article> element *)
  print_string "=== Article Text Only ===\n";
  List.iter
    (fun article -> Printf.printf "%s\n" (text_of article))
    (Html5rw.query doc "article");

  (* Headings and list items printed as labelled fields *)
  print_string "\n=== Structured Extraction ===\n";
  List.iter
    (fun heading -> Printf.printf "Title: %s\n" (text_of heading))
    (Html5rw.query doc "h1");

  let features = Html5rw.query doc "li" in
  print_string "Features:\n";
  List.iter (fun item -> Printf.printf " - %s\n" (text_of item)) features
+170
examples/web_scraper.ml
···
open Bytesrw

(* Practical web scraping example: parses a sample news page and extracts the
   page title, the navigation links, the stories and the tag links. *)

let sample_page = {|
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Tech News - Latest Stories</title>
</head>
<body>
  <header>
    <nav>
      <a href="/">Home</a>
      <a href="/news">News</a>
      <a href="/about">About</a>
    </nav>
  </header>

  <main>
    <article class="story featured">
      <h2><a href="/story/1">Revolutionary AI Breakthrough</a></h2>
      <p class="summary">Scientists announce major advancement in machine learning...</p>
      <span class="author">By Jane Smith</span>
      <time datetime="2024-01-15">January 15, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/2">New Programming Language Released</a></h2>
      <p class="summary">The language promises 10x developer productivity...</p>
      <span class="author">By John Doe</span>
      <time datetime="2024-01-14">January 14, 2024</time>
    </article>

    <article class="story">
      <h2><a href="/story/3">Open Source Project Reaches Milestone</a></h2>
      <p class="summary">Community celebrates 1 million downloads...</p>
      <span class="author">By Alice Chen</span>
      <time datetime="2024-01-13">January 13, 2024</time>
    </article>
  </main>

  <aside>
    <h3>Popular Tags</h3>
    <ul class="tags">
      <li><a href="/tag/ai">AI</a></li>
      <li><a href="/tag/programming">Programming</a></li>
      <li><a href="/tag/opensource">Open Source</a></li>
    </ul>
  </aside>
</body>
</html>
|}

(* One scraped story. [url] and [date] come from attributes, the other text
   fields from element text content. *)
type story = {
  title: string;
  url: string;
  summary: string;
  author: string;
  date: string;
  featured: bool;
}

(* True when [node] is an element whose name, lowercased, equals [tag].
   [tag] is expected to be given in lowercase. *)
let has_tag tag node =
  Html5rw.is_element node
  && String.lowercase_ascii node.Html5rw.Dom.name = tag

(* Split a class attribute value into its tokens. Tokens may be separated by
   any ASCII whitespace (space, tab, LF, FF, CR), not only by a single space,
   and runs of separators do not produce empty tokens. *)
let class_tokens value =
  value
  |> String.map (function '\t' | '\n' | '\x0C' | '\r' -> ' ' | c -> c)
  |> String.split_on_char ' '
  |> List.filter (fun token -> token <> "")

(* Helper to find first child element with given tag name *)
let find_child_by_tag parent tag =
  List.find_opt (has_tag tag) parent.Html5rw.Dom.children

(* Helper to find first descendant element with given tag name.
   Direct children are checked before any of them is searched recursively. *)
let rec find_descendant_by_tag node tag =
  let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
  match List.find_opt (has_tag tag) children with
  | Some found -> Some found
  | None ->
    List.find_map (fun child -> find_descendant_by_tag child tag) children

(* Helper to find first descendant with given class.
   Direct children are checked before any of them is searched recursively. *)
let rec find_by_class node cls =
  let children = List.filter Html5rw.is_element node.Html5rw.Dom.children in
  let has_class n =
    match Html5rw.get_attr n "class" with
    | Some classes -> List.mem cls (class_tokens classes)
    | None -> false
  in
  match List.find_opt has_class children with
  | Some found -> Some found
  | None ->
    List.find_map (fun child -> find_by_class child cls) children

(* Build a [story] from an <article> element. Every field falls back to a
   placeholder when the corresponding element or attribute is missing. *)
let extract_story article =
  (* Find h2 > a for title and URL *)
  let title, url =
    match find_descendant_by_tag article "h2" with
    | Some h2 ->
      (match find_child_by_tag h2 "a" with
       | Some a ->
         (Html5rw.get_text_content a,
          Option.value ~default:"#" (Html5rw.get_attr a "href"))
       | None -> (Html5rw.get_text_content h2, "#"))
    | None -> ("(no title)", "#")
  in
  let summary =
    match find_by_class article "summary" with
    | Some p -> Html5rw.get_text_content p
    | None -> ""
  in
  let author =
    match find_by_class article "author" with
    | Some s -> Html5rw.get_text_content s
    | None -> "Unknown"
  in
  let date =
    match find_descendant_by_tag article "time" with
    | Some t -> Option.value ~default:"" (Html5rw.get_attr t "datetime")
    | None -> ""
  in
  let featured = Html5rw.matches article ".featured" in
  { title; url; summary; author; date; featured }

let () =
  Printf.printf "=== Web Scraping Example ===\n\n";

  let result = Html5rw.parse (Bytes.Reader.of_string sample_page) in

  (* Extract page title *)
  let titles = Html5rw.query result "title" in
  (match titles with
   | t :: _ -> Printf.printf "Page title: %s\n\n" (Html5rw.get_text_content t)
   | [] -> ());

  (* Every link in the document; queried once and reused for both the
     navigation section and the tag section below. *)
  let links = Html5rw.query result "a" in

  (* Navigation links: links that have a "nav" node among their ancestors *)
  Printf.printf "Navigation:\n";
  let nav = List.filter (fun a ->
    List.exists (fun n -> n.Html5rw.Dom.name = "nav") (Html5rw.ancestors a)
  ) links in
  List.iter (fun a ->
    let text = Html5rw.get_text_content a in
    let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
    Printf.printf " %s -> %s\n" text href
  ) nav;

  (* Extract stories *)
  Printf.printf "\nStories:\n";
  let articles = Html5rw.query result "article" in
  List.iter (fun article ->
    let story = extract_story article in
    Printf.printf "\n %s%s\n"
      (if story.featured then "[FEATURED] " else "")
      story.title;
    Printf.printf " URL: %s\n" story.url;
    Printf.printf " Summary: %s\n" story.summary;
    Printf.printf " %s | %s\n" story.author story.date
  ) articles;

  (* Extract tags: links whose href starts with "/tag/" and names a tag
     (the bare prefix "/tag/" on its own is excluded by the length check) *)
  Printf.printf "\nPopular Tags:\n";
  let tag_links = List.filter (fun a ->
    let href = Option.value ~default:"" (Html5rw.get_attr a "href") in
    String.length href > 5 && String.starts_with ~prefix:"/tag/" href
  ) links in
  List.iter (fun a ->
    let tag = Html5rw.get_text_content a in
    let href = Option.value ~default:"#" (Html5rw.get_attr a "href") in
    Printf.printf " #%s (%s)\n" tag href
  ) tag_links
+3
gen/dune
···
; Executable built from gen_entities.ml; it links against yojson.
(executable
 (name gen_entities)
 (libraries yojson))
+152
gen/gen_entities.ml
···
(* Entity table generator for html5rw.
   Reads WHATWG entities.json and generates OCaml code.

   Usage: gen_entities <entities.json> <output.ml>

   The generated file contains:
   - [codepoints_to_string]: UTF-8 encoder for a list of code points
   - [entities]: array of (name, codepoints), sorted by name, one row per name
   - [lookup]: binary search over [entities]
   - [legacy_entities] / [is_legacy]: names that also appear in the JSON
     without a trailing semicolon *)

(* Read the whole of [path] into a string. The channel is opened in binary
   mode so that [in_channel_length] agrees with the number of bytes read, and
   it is closed even when reading fails. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(* Split a JSON key such as "&amp;" or "&amp" into the bare entity name
   ("amp") and a flag telling whether the key ended with a semicolon. *)
let split_entity_name name =
  let len = String.length name in
  let start = if len > 0 && name.[0] = '&' then 1 else 0 in
  let has_semicolon = len > start && name.[len - 1] = ';' in
  let stop = if has_semicolon then len - 1 else len in
  (String.sub name start (stop - start), has_semicolon)

(* Extract the "codepoints" array of one entity record. Returns [] when the
   field is missing or is not an array (such records are skipped by the
   caller). A non-integer element is reported as an error instead of being
   silently turned into code point 0. *)
let codepoints_of_fields name fields =
  match List.assoc_opt "codepoints" fields with
  | Some (`List cps) ->
    List.map (function
      | `Int i -> i
      | _ -> failwith (Printf.sprintf "Entity %s: non-integer codepoint" name)
    ) cps
  | _ -> []

(* Turn the JSON object entries into
   - the entity rows (name, codepoints), sorted by name with exactly one row
     per name; when a name occurs both with and without a semicolon, the
     semicolon record is the one kept;
   - the sorted, duplicate-free list of legacy names (those that occur
     without a semicolon). *)
let collect_entities entries =
  (* name -> (codepoints, came from a semicolon record) *)
  let table = Hashtbl.create 2500 in
  let legacy = ref [] in
  List.iter (fun (name, value) ->
    match value with
    | `Assoc fields ->
      let key, has_semicolon = split_entity_name name in
      let codepoints = codepoints_of_fields name fields in
      if codepoints <> [] then begin
        (match Hashtbl.find_opt table key with
         | Some (_, true) -> () (* semicolon record already stored: keep it *)
         | _ -> Hashtbl.replace table key (codepoints, has_semicolon));
        (* Legacy entities are those that appear without semicolon in the JSON *)
        if not has_semicolon then legacy := key :: !legacy
      end
    | _ -> ()
  ) entries;
  let rows = Hashtbl.fold (fun key (cps, _) acc -> (key, cps) :: acc) table [] in
  (* Sorted with String.compare because the generated lookup uses it too *)
  let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) rows in
  (sorted, List.sort_uniq String.compare !legacy)

(* Source text of the generated [codepoints_to_string] function *)
let codepoints_to_string_src = [
  "let codepoints_to_string cps =\n";
  " let buf = Buffer.create 8 in\n";
  " List.iter (fun cp ->\n";
  " if cp <= 0x7F then\n";
  " Buffer.add_char buf (Char.chr cp)\n";
  " else if cp <= 0x7FF then begin\n";
  " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
  " end else if cp <= 0xFFFF then begin\n";
  " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
  " end else begin\n";
  " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
  " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
  " end\n";
  " ) cps;\n";
  " Buffer.contents buf\n\n";
]

(* Source text of the generated [lookup] function (binary search) *)
let lookup_src = [
  "let lookup name =\n";
  " let rec search lo hi =\n";
  " if lo > hi then None\n";
  " else begin\n";
  " let mid = (lo + hi) / 2 in\n";
  " let (key, cps) = entities.(mid) in\n";
  " let cmp = String.compare name key in\n";
  " if cmp = 0 then Some (codepoints_to_string cps)\n";
  " else if cmp < 0 then search lo (mid - 1)\n";
  " else search (mid + 1) hi\n";
  " end\n";
  " in\n";
  " search 0 (Array.length entities - 1)\n\n";
]

(* Source text of the generated [is_legacy] function (binary search) *)
let is_legacy_src = [
  "let is_legacy name =\n";
  " let rec search lo hi =\n";
  " if lo > hi then false\n";
  " else begin\n";
  " let mid = (lo + hi) / 2 in\n";
  " let cmp = String.compare name legacy_entities.(mid) in\n";
  " if cmp = 0 then true\n";
  " else if cmp < 0 then search lo (mid - 1)\n";
  " else search (mid + 1) hi\n";
  " end\n";
  " in\n";
  " search 0 (Array.length legacy_entities - 1)\n";
]

(* Write the complete generated module to [oc] *)
let emit oc sorted legacy_sorted =
  let emit_lines = List.iter (output_string oc) in

  (* Header *)
  output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n";

  emit_lines codepoints_to_string_src;

  (* The entity array for binary search *)
  output_string oc "let entities = [|\n";
  List.iter (fun (name, codepoints) ->
    let cps_str = String.concat "; " (List.map string_of_int codepoints) in
    Printf.fprintf oc " (%S, [%s]);\n" name cps_str
  ) sorted;
  output_string oc "|]\n\n";

  emit_lines lookup_src;

  (* The legacy entities set *)
  output_string oc "let legacy_entities = [|\n";
  List.iter (fun name -> Printf.fprintf oc " %S;\n" name) legacy_sorted;
  output_string oc "|]\n\n";

  emit_lines is_legacy_src

let () =
  if Array.length Sys.argv < 3 then begin
    prerr_endline "Usage: gen_entities <entities.json> <output.ml>";
    exit 2
  end;
  let json_file = Sys.argv.(1) in
  let out_file = Sys.argv.(2) in

  (* Parse and validate the input before the output file is created, so a
     bad input does not leave a truncated output file behind. *)
  let entries =
    match Yojson.Basic.from_string (read_file json_file) with
    | `Assoc entries -> entries
    | _ -> failwith "Expected JSON object"
  in
  let sorted, legacy_sorted = collect_entities entries in

  let oc = open_out out_file in
  Fun.protect
    ~finally:(fun () -> close_out_noerr oc)
    (fun () ->
      emit oc sorted legacy_sorted;
      (* Explicit close so that a failing final flush is reported; the
         [finally] handler above is then a no-op. *)
      close_out oc);

  Printf.printf "Generated %s with %d entities (%d legacy)\n"
    out_file (List.length sorted) (List.length legacy_sorted)
+35
html5rw.opam
···
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + version: "0.1.0" 4 + synopsis: "Pure OCaml HTML5 parser implementing the WHATWG specification" 5 + description: 6 + "A pure OCaml HTML5 parser that passes the html5lib-tests suite. Implements the WHATWG HTML5 parsing specification including tokenization, tree construction, encoding detection, and CSS selector queries." 7 + maintainer: ["author@example.com"] 8 + authors: ["Author"] 9 + license: "MIT" 10 + homepage: "https://github.com/username/html5rw" 11 + bug-reports: "https://github.com/username/html5rw/issues" 12 + depends: [ 13 + "dune" {>= "3.0"} 14 + "ocaml" {>= "4.14.0"} 15 + "bytesrw" {>= "0.3.0"} 16 + "uutf" {>= "1.0.0"} 17 + "re" {>= "1.10.0"} 18 + "yojson" {build & >= "2.0.0"} 19 + "odoc" {with-doc} 20 + ] 21 + build: [ 22 + ["dune" "subst"] {dev} 23 + [ 24 + "dune" 25 + "build" 26 + "-p" 27 + name 28 + "-j" 29 + jobs 30 + "@install" 31 + "@runtest" {with-test} 32 + "@doc" {with-doc} 33 + ] 34 + ] 35 + dev-repo: "git+https://github.com/username/html5rw.git"
+4
lib/dom/dune
···
; DOM sub-library: internal name html5rw_dom, public name html5rw.dom.
; bytesrw is its only listed library dependency.
(library
 (name html5rw_dom)
 (public_name html5rw.dom)
 (libraries bytesrw))
+8
lib/dom/html5rw_dom.ml
···
(* html5rw.dom - HTML5 DOM types and operations *)

(* Re-export everything Node exposes at the top level of this module. *)
include Node

(* Aliases for the Serialize functions of the same names, so that they can be
   reached through this module as well. *)
let to_html = Serialize.to_html
let to_writer = Serialize.to_writer
let to_test_format = Serialize.to_test_format
let to_text = Serialize.to_text
+159
lib/dom/node.ml
···
(* HTML5 DOM node types *)

(* Payload of a doctype node *)
type doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

type quirks_mode = No_quirks | Quirks | Limited_quirks

(* Every kind of node uses this one record; the [name] field tells the kinds
   apart (see the name constants below). *)
type node = {
  mutable name : string;
  mutable namespace : string option; (* None = html, Some "svg", Some "mathml" *)
  mutable attrs : (string * string) list;
  mutable children : node list;
  mutable parent : node option;
  mutable data : string; (* For text, comment nodes *)
  mutable template_content : node option; (* For <template> elements *)
  mutable doctype : doctype_data option; (* For doctype nodes *)
}

(* Node name constants *)
let document_name = "#document"
let document_fragment_name = "#document-fragment"
let text_name = "#text"
let comment_name = "#comment"
let doctype_name = "!doctype"

(* Names reserved for non-element nodes. Built once rather than on every
   [is_element] call. *)
let non_element_names =
  [text_name; comment_name; document_name; document_fragment_name; doctype_name]

(* Base node constructor - all nodes share this structure.
   The new node has no children and no parent. *)
let make_node ~name ?(namespace=None) ?(attrs=[]) ?(data="") ?template_content ?doctype () = {
  name;
  namespace;
  attrs;
  children = [];
  parent = None;
  data;
  template_content;
  doctype;
}

(* Constructors *)
let create_element name ?(namespace=None) ?(attrs=[]) () =
  make_node ~name ~namespace ~attrs ()

let create_text data =
  make_node ~name:text_name ~data ()

let create_comment data =
  make_node ~name:comment_name ~data ()

let create_document () =
  make_node ~name:document_name ()

let create_document_fragment () =
  make_node ~name:document_fragment_name ()

let create_doctype ?name ?public_id ?system_id () =
  make_node ~name:doctype_name ~doctype:{ name; public_id; system_id } ()

(* A template element is created together with an empty document fragment
   stored in [template_content]. *)
let create_template ?(namespace=None) ?(attrs=[]) () =
  let node = create_element "template" ~namespace ~attrs () in
  node.template_content <- Some (create_document_fragment ());
  node

(* Predicates *)

(* An element is any node whose name is not one of the reserved names *)
let is_element node = not (List.mem node.name non_element_names)

let is_text node = node.name = text_name
let is_comment node = node.name = comment_name
let is_document node = node.name = document_name
let is_document_fragment node = node.name = document_fragment_name
let is_doctype node = node.name = doctype_name
let has_children node = node.children <> []

(* DOM manipulation *)

(* Unlink [child] from the parent recorded in its [parent] field, if any.
   Nodes are compared physically, never structurally: parent links make the
   structure cyclic. *)
let detach child =
  match child.parent with
  | None -> ()
  | Some old_parent ->
    old_parent.children <- List.filter (fun c -> c != child) old_parent.children;
    child.parent <- None

(* [append_child parent child] makes [child] the last child of [parent].
   If [child] is still attached to a parent it is moved, so that it never
   appears in two child lists (or twice in the same one). *)
let append_child parent child =
  detach child;
  child.parent <- Some parent;
  parent.children <- parent.children @ [child]

(* [insert_before parent new_child ref_child] inserts [new_child] right before
   [ref_child] among the children of [parent]. When [ref_child] is not a child
   of [parent], [new_child] is appended at the end. [new_child] is detached
   from its previous parent first. *)
let insert_before parent new_child ref_child =
  if new_child == ref_child && List.memq new_child parent.children then
    (* Inserting a child before itself leaves it where it is *)
    ()
  else begin
    detach new_child;
    new_child.parent <- Some parent;
    let rec insert acc = function
      | [] -> List.rev_append acc [new_child]
      | x :: xs when x == ref_child -> List.rev_append acc (new_child :: x :: xs)
      | x :: xs -> insert (x :: acc) xs
    in
    parent.children <- insert [] parent.children
  end

(* [remove_child parent child] removes [child] from the children of [parent].
   The parent link of [child] is cleared only when it points at [parent];
   a call naming the wrong parent no longer orphans a node that is attached
   elsewhere. *)
let remove_child parent child =
  parent.children <- List.filter (fun c -> c != child) parent.children;
  match child.parent with
  | Some p when p == parent -> child.parent <- None
  | _ -> ()

(* The last child of [parent], provided it is a text node *)
let last_child_text parent =
  match List.rev parent.children with
  | last :: _ when is_text last -> Some last
  | _ -> None

(* [insert_text_at parent text before_node] adds [text] under [parent]:
   at the end when [before_node] is [None], otherwise right before the given
   node. When the node immediately preceding the insertion point is a text
   node, [text] is appended to its data instead of creating a new node. *)
let insert_text_at parent text before_node =
  match before_node with
  | None ->
    (* Append - merge with last child if it's text *)
    (match last_child_text parent with
     | Some txt -> txt.data <- txt.data ^ text
     | None -> append_child parent (create_text text))
  | Some ref_node ->
    (* Find the text node sitting immediately before ref_node, if any *)
    let rec find_prev_text = function
      | [] | [_] -> None
      | prev :: curr :: _ when curr == ref_node && is_text prev -> Some prev
      | _ :: rest -> find_prev_text rest
    in
    match find_prev_text parent.children with
    | Some txt -> txt.data <- txt.data ^ text
    | None -> insert_before parent (create_text text) ref_node

(* Attribute helpers *)
let get_attr node name = List.assoc_opt name node.attrs

(* [set_attr node name value] sets attribute [name] to [value]. An existing
   attribute keeps its position in the list (any further entries with the
   same name are dropped); a new attribute is added at the end. *)
let set_attr node name value =
  let rec replace found = function
    | [] -> if found then [] else [(name, value)]
    | (n, _) :: rest when n = name ->
      if found then replace true rest
      else (name, value) :: replace true rest
    | attr :: rest -> attr :: replace found rest
  in
  node.attrs <- replace false node.attrs

let has_attr node name = List.mem_assoc name node.attrs

(* Tree traversal *)

(* All descendants, each node followed by its own descendants *)
let rec descendants node =
  List.concat_map (fun n -> n :: descendants n) node.children

(* Ancestors from the immediate parent up to the root *)
let ancestors node =
  let rec collect acc n =
    match n.parent with
    | None -> List.rev acc
    | Some p -> collect (p :: acc) p
  in
  collect [] node

(* Concatenated data of [node] itself when it is a text node, otherwise of
   the text nodes reached through its children. A single buffer is used so
   that large trees are not re-concatenated at every level. *)
let get_text_content node =
  let buf = Buffer.create 64 in
  let rec walk n =
    if is_text n then Buffer.add_string buf n.data
    else List.iter walk n.children
  in
  walk node;
  Buffer.contents buf

(* Clone *)

(* [clone ?deep node] copies [node]; the copy has no parent. A shallow copy
   has no children and no template content. A deep copy also clones the
   children and, when present, the template content. *)
let rec clone ?(deep=false) node =
  let new_node = make_node
    ~name:node.name
    ~namespace:node.namespace
    ~attrs:node.attrs
    ~data:node.data
    ?doctype:node.doctype
    ()
  in
  if deep then begin
    new_node.children <- List.map (clone ~deep:true) node.children;
    List.iter (fun c -> c.parent <- Some new_node) new_node.children;
    Option.iter (fun tc ->
      new_node.template_content <- Some (clone ~deep:true tc)
    ) node.template_content
  end;
  new_node
+333
lib/dom/node.mli
···
··· 1 + (** HTML5 DOM Node Types and Operations 2 + 3 + This module provides the DOM node representation used by the HTML5 parser. 4 + Nodes form a tree structure representing HTML documents. The type follows 5 + the WHATWG HTML5 specification for document structure. 6 + 7 + {2 Node Types} 8 + 9 + The HTML5 DOM includes several node types, all represented by the same 10 + record type with different field usage: 11 + 12 + - {b Element nodes}: Regular HTML elements like [<div>], [<p>], [<span>] 13 + - {b Text nodes}: Text content within elements 14 + - {b Comment nodes}: HTML comments [<!-- comment -->] 15 + - {b Document nodes}: The root node representing the entire document 16 + - {b Document fragment nodes}: A lightweight container (used for templates) 17 + - {b Doctype nodes}: The [<!DOCTYPE html>] declaration 18 + 19 + {2 Namespaces} 20 + 21 + Elements can belong to different namespaces: 22 + - [None] or [Some "html"]: HTML namespace (default) 23 + - [Some "svg"]: SVG namespace for embedded SVG content 24 + - [Some "mathml"]: MathML namespace for mathematical notation 25 + 26 + The parser automatically switches namespaces when encountering [<svg>] 27 + or [<math>] elements, as specified by the HTML5 algorithm. 28 + 29 + {2 Tree Structure} 30 + 31 + Nodes form a bidirectional tree: each node has a list of children and 32 + an optional parent reference. Modification functions maintain these 33 + references automatically. 34 + *) 35 + 36 + (** {1 Types} *) 37 + 38 + (** Information associated with a DOCTYPE node. 39 + 40 + In HTML5, the DOCTYPE is primarily used for quirks mode detection. 41 + Most modern HTML5 documents use [<!DOCTYPE html>] which results in 42 + all fields being [None] or the name being [Some "html"]. 
43 + 44 + @see <https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode> 45 + The WHATWG specification for DOCTYPE handling 46 + *) 47 + type doctype_data = { 48 + name : string option; (** The DOCTYPE name, e.g., "html" *) 49 + public_id : string option; (** Public identifier (legacy, rarely used) *) 50 + system_id : string option; (** System identifier (legacy, rarely used) *) 51 + } 52 + 53 + (** Quirks mode setting for the document. 54 + 55 + Quirks mode affects CSS layout behavior for backwards compatibility with 56 + old web content. The HTML5 parser determines quirks mode based on the 57 + DOCTYPE declaration. 58 + 59 + - [No_quirks]: Standards mode - full HTML5/CSS3 behavior 60 + - [Quirks]: Full quirks mode - emulates legacy browser behavior 61 + - [Limited_quirks]: Almost standards mode - limited quirks for specific cases 62 + 63 + @see <https://quirks.spec.whatwg.org/> The Quirks Mode specification 64 + *) 65 + type quirks_mode = No_quirks | Quirks | Limited_quirks 66 + 67 + (** A DOM node in the parsed document tree. 68 + 69 + All node types use the same record structure. 
The [name] field determines 70 + the node type: 71 + - Element: the tag name (e.g., "div", "p") 72 + - Text: "#text" 73 + - Comment: "#comment" 74 + - Document: "#document" 75 + - Document fragment: "#document-fragment" 76 + - Doctype: "!doctype" 77 + 78 + {3 Field Usage by Node Type} 79 + 80 + {v 81 + Node Type | name | namespace | attrs | data | template_content | doctype 82 + ------------------|------------------|-----------|-------|------|------------------|-------- 83 + Element | tag name | Yes | Yes | No | If <template> | No 84 + Text | "#text" | No | No | Yes | No | No 85 + Comment | "#comment" | No | No | Yes | No | No 86 + Document | "#document" | No | No | No | No | No 87 + Document Fragment | "#document-frag" | No | No | No | No | No 88 + Doctype | "!doctype" | No | No | No | No | Yes 89 + v} 90 + *) 91 + type node = { 92 + mutable name : string; 93 + (** Tag name for elements, or special name for other node types *) 94 + 95 + mutable namespace : string option; 96 + (** Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"] *) 97 + 98 + mutable attrs : (string * string) list; 99 + (** Element attributes as (name, value) pairs *) 100 + 101 + mutable children : node list; 102 + (** Child nodes in document order *) 103 + 104 + mutable parent : node option; 105 + (** Parent node, [None] for root nodes *) 106 + 107 + mutable data : string; 108 + (** Text content for text and comment nodes *) 109 + 110 + mutable template_content : node option; 111 + (** Document fragment for [<template>] element contents *) 112 + 113 + mutable doctype : doctype_data option; 114 + (** DOCTYPE information for doctype nodes *) 115 + } 116 + 117 + (** {1 Node Name Constants} 118 + 119 + These constants identify special node types. Compare with [node.name] 120 + to determine the node type. 
121 + *) 122 + 123 + val document_name : string 124 + (** ["#document"] - name for document nodes *) 125 + 126 + val document_fragment_name : string 127 + (** ["#document-fragment"] - name for document fragment nodes *) 128 + 129 + val text_name : string 130 + (** ["#text"] - name for text nodes *) 131 + 132 + val comment_name : string 133 + (** ["#comment"] - name for comment nodes *) 134 + 135 + val doctype_name : string 136 + (** ["!doctype"] - name for doctype nodes *) 137 + 138 + (** {1 Constructors} 139 + 140 + Functions to create new DOM nodes. All nodes start with no parent and 141 + no children. 142 + *) 143 + 144 + val create_element : string -> ?namespace:string option -> 145 + ?attrs:(string * string) list -> unit -> node 146 + (** Create an element node. 147 + 148 + @param name The tag name (e.g., "div", "p", "span") 149 + @param namespace Element namespace: [None] for HTML, [Some "svg"], [Some "mathml"] 150 + @param attrs Initial attributes as (name, value) pairs 151 + 152 + {[ 153 + let div = create_element "div" () 154 + let svg = create_element "rect" ~namespace:(Some "svg") () 155 + let link = create_element "a" ~attrs:[("href", "/")] () 156 + ]} 157 + *) 158 + 159 + val create_text : string -> node 160 + (** Create a text node with the given content. 161 + 162 + {[ 163 + let text = create_text "Hello, world!" 164 + ]} 165 + *) 166 + 167 + val create_comment : string -> node 168 + (** Create a comment node with the given content. 169 + 170 + The content should not include the comment delimiters. 171 + 172 + {[ 173 + let comment = create_comment " This is a comment " 174 + (* Represents: <!-- This is a comment --> *) 175 + ]} 176 + *) 177 + 178 + val create_document : unit -> node 179 + (** Create an empty document node. 180 + 181 + Document nodes are the root of a complete HTML document tree. 182 + *) 183 + 184 + val create_document_fragment : unit -> node 185 + (** Create an empty document fragment. 
186 + 187 + Document fragments are lightweight containers used for: 188 + - Template contents 189 + - Fragment parsing results 190 + - Efficient batch DOM operations 191 + *) 192 + 193 + val create_doctype : ?name:string -> ?public_id:string -> 194 + ?system_id:string -> unit -> node 195 + (** Create a DOCTYPE node. 196 + 197 + For HTML5, use [create_doctype ~name:"html" ()] which produces 198 + [<!DOCTYPE html>]. 199 + 200 + @param name DOCTYPE name (usually "html") 201 + @param public_id Public identifier (legacy) 202 + @param system_id System identifier (legacy) 203 + *) 204 + 205 + val create_template : ?namespace:string option -> 206 + ?attrs:(string * string) list -> unit -> node 207 + (** Create a [<template>] element with its content document fragment. 208 + 209 + Template elements have special semantics: their children are not rendered 210 + directly but stored in a separate document fragment accessible via 211 + [template_content]. 212 + 213 + @see <https://html.spec.whatwg.org/multipage/scripting.html#the-template-element> 214 + The HTML5 template element specification 215 + *) 216 + 217 + (** {1 Node Type Predicates} 218 + 219 + Functions to test what type of node you have. 220 + *) 221 + 222 + val is_element : node -> bool 223 + (** [is_element node] returns [true] if the node is an element node. 224 + 225 + Elements are nodes with HTML tags like [<div>], [<p>], etc. 226 + *) 227 + 228 + val is_text : node -> bool 229 + (** [is_text node] returns [true] if the node is a text node. *) 230 + 231 + val is_comment : node -> bool 232 + (** [is_comment node] returns [true] if the node is a comment node. *) 233 + 234 + val is_document : node -> bool 235 + (** [is_document node] returns [true] if the node is a document node. *) 236 + 237 + val is_document_fragment : node -> bool 238 + (** [is_document_fragment node] returns [true] if the node is a document fragment. 
*) 239 + 240 + val is_doctype : node -> bool 241 + (** [is_doctype node] returns [true] if the node is a DOCTYPE node. *) 242 + 243 + val has_children : node -> bool 244 + (** [has_children node] returns [true] if the node has any children. *) 245 + 246 + (** {1 Tree Manipulation} 247 + 248 + Functions to modify the DOM tree structure. These functions automatically 249 + maintain parent/child references. 250 + *) 251 + 252 + val append_child : node -> node -> unit 253 + (** [append_child parent child] adds [child] as the last child of [parent]. 254 + 255 + The child's parent reference is updated to point to [parent]. 256 + *) 257 + 258 + val insert_before : node -> node -> node -> unit 259 + (** [insert_before parent new_child ref_child] inserts [new_child] before 260 + [ref_child] in [parent]'s children. 261 + 262 + If [ref_child] is not a child of [parent], [new_child] is appended at the end of [parent]'s children instead; no exception is raised. 263 + *) 264 + 265 + val remove_child : node -> node -> unit 266 + (** [remove_child parent child] removes [child] from [parent]'s children. 267 + 268 + The child's parent reference is set to [None]. 269 + *) 270 + 271 + val insert_text_at : node -> string -> node option -> unit 272 + (** [insert_text_at parent text before_node] inserts text content. 273 + 274 + If [before_node] is [None], appends at the end. If the previous sibling 275 + is a text node, the text is merged into it. Otherwise, a new text node 276 + is created. 277 + 278 + This implements the HTML5 parser's text insertion algorithm which 279 + coalesces adjacent text nodes. 280 + *) 281 + 282 + (** {1 Attribute Operations} 283 + 284 + Functions to read and modify element attributes. 285 + *) 286 + 287 + val get_attr : node -> string -> string option 288 + (** [get_attr node name] returns the value of attribute [name], or [None]. *) 289 + 290 + val set_attr : node -> string -> string -> unit 291 + (** [set_attr node name value] sets attribute [name] to [value]. 292 + 293 + If the attribute already exists, it is replaced. 
294 + *) 295 + 296 + val has_attr : node -> string -> bool 297 + (** [has_attr node name] returns [true] if the node has attribute [name]. *) 298 + 299 + (** {1 Tree Traversal} 300 + 301 + Functions to navigate the DOM tree. 302 + *) 303 + 304 + val descendants : node -> node list 305 + (** [descendants node] returns all descendant nodes in document order. 306 + 307 + This performs a depth-first traversal, returning children before 308 + siblings at each level. 309 + *) 310 + 311 + val ancestors : node -> node list 312 + (** [ancestors node] returns all ancestor nodes from parent to root. 313 + 314 + The first element is the immediate parent, the last is the root. 315 + *) 316 + 317 + val get_text_content : node -> string 318 + (** [get_text_content node] returns the concatenated text content. 319 + 320 + For text nodes, returns the text data. For elements, recursively 321 + concatenates all descendant text content. 322 + *) 323 + 324 + (** {1 Cloning} *) 325 + 326 + val clone : ?deep:bool -> node -> node 327 + (** [clone ?deep node] creates a copy of the node. 328 + 329 + @param deep If [true], recursively clone all descendants (default: [false]) 330 + 331 + The cloned node has no parent. Attribute lists are copied by reference 332 + (the list itself is new, but attribute strings are shared). 333 + *)
+301
lib/dom/serialize.ml
···
(* HTML5 DOM serialization *)

open Bytesrw
open Node

(* Void elements that don't have end tags *)
let void_elements = [
  "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
  "link"; "meta"; "source"; "track"; "wbr"
]

(* [is_void name] is [true] when [name] is listed in [void_elements]. *)
let is_void name = List.mem name void_elements

(* Foreign attribute adjustments for test output *)
let foreign_attr_adjustments = [
  "xlink:actuate"; "xlink:arcrole"; "xlink:href"; "xlink:role";
  "xlink:show"; "xlink:title"; "xlink:type"; "xml:lang"; "xml:space";
  "xmlns:xlink"
]

(* Escape text content: [&], [<] and [>] become character references,
   every other byte is copied through unchanged. *)
let escape_text text =
  let buf = Buffer.create (String.length text) in
  String.iter (fun c ->
    match c with
    | '&' -> Buffer.add_string buf "&amp;"
    | '<' -> Buffer.add_string buf "&lt;"
    | '>' -> Buffer.add_string buf "&gt;"
    | c -> Buffer.add_char buf c
  ) text;
  Buffer.contents buf

(* Choose quote character for attribute value: single quotes only when the
   value contains a double quote and no single quote, double quotes otherwise. *)
let choose_attr_quote value =
  if String.contains value '"' && not (String.contains value '\'') then '\''
  else '"'

(* Escape attribute value for use inside [quote_char] quotes.
   [&] is always escaped; only the quote character actually in use is escaped. *)
let escape_attr_value value quote_char =
  let buf = Buffer.create (String.length value) in
  String.iter (fun c ->
    match c with
    | '&' -> Buffer.add_string buf "&amp;"
    | '"' when quote_char = '"' -> Buffer.add_string buf "&quot;"
    | '\'' when quote_char = '\'' -> Buffer.add_string buf "&#39;"
    | c -> Buffer.add_char buf c
  ) value;
  Buffer.contents buf

(* Check if attribute value can be written without quotes.
   The empty string never can. A value is rejected if it contains ASCII
   whitespace or any of: double quote, single quote, [=], [<], [>], backtick.
   [<] is included because the HTML unquoted-attribute-value syntax forbids it;
   such values are simply emitted quoted instead. *)
let can_unquote_attr_value value =
  let forbidden c =
    match c with
    | '>' | '<' | '"' | '\'' | '=' | '`'
    | ' ' | '\t' | '\n' | '\x0C' | '\r' -> true
    | _ -> false
  in
  String.length value > 0 && not (String.exists forbidden value)

(* Serialize start tag [<name attr=value ...>].
   An attribute whose value is the empty string is written as a bare name. *)
let serialize_start_tag name attrs =
  let buf = Buffer.create 64 in
  Buffer.add_char buf '<';
  Buffer.add_string buf name;
  List.iter (fun (key, value) ->
    Buffer.add_char buf ' ';
    Buffer.add_string buf key;
    if value <> "" then begin
      Buffer.add_char buf '=';
      if can_unquote_attr_value value then
        (* No quote characters can be present here, so only '&' gets escaped. *)
        Buffer.add_string buf (escape_attr_value value '"')
      else begin
        let quote = choose_attr_quote value in
        Buffer.add_char buf quote;
        Buffer.add_string buf (escape_attr_value value quote);
        Buffer.add_char buf quote
      end
    end
  ) attrs;
  Buffer.add_char buf '>';
  Buffer.contents buf

(* Serialize end tag *)
let serialize_end_tag name =
  "</" ^ name ^ ">"

(* Convert node to HTML string.

   [pretty] (default [true]) puts each node on its own line, indented by
   [indent * indent_size] spaces, and trims text nodes; whitespace-only text
   nodes then serialize to the empty string and are dropped by their parent.
   An element whose children are all text nodes is kept on a single line.
   The DOCTYPE is always written as [<!DOCTYPE html>]. *)
let rec to_html ?(pretty=true) ?(indent_size=2) ?(indent=0) node =
  let prefix = if pretty then String.make (indent * indent_size) ' ' else "" in
  let newline = if pretty then "\n" else "" in

  match node.name with
  | "#document" ->
    let parts = List.map (to_html ~pretty ~indent_size ~indent:0) node.children in
    String.concat newline (List.filter (fun s -> s <> "") parts)

  | "#document-fragment" ->
    let parts = List.map (to_html ~pretty ~indent_size ~indent) node.children in
    String.concat newline (List.filter (fun s -> s <> "") parts)

  | "#text" ->
    let text = node.data in
    if pretty then
      let trimmed = String.trim text in
      if trimmed = "" then ""
      else prefix ^ escape_text trimmed
    else escape_text text

  | "#comment" ->
    prefix ^ "<!--" ^ node.data ^ "-->"

  | "!doctype" ->
    prefix ^ "<!DOCTYPE html>"

  | name ->
    let open_tag = serialize_start_tag name node.attrs in

    if is_void name then
      prefix ^ open_tag
    else if node.children = [] then
      prefix ^ open_tag ^ serialize_end_tag name
    else begin
      (* Check if all children are text *)
      let all_text = List.for_all is_text node.children in
      if all_text && pretty then
        let text = String.concat "" (List.map (fun c -> c.data) node.children) in
        prefix ^ open_tag ^ escape_text text ^ serialize_end_tag name
      else begin
        let parts = [prefix ^ open_tag] in
        let child_parts = List.filter_map (fun child ->
          let html = to_html ~pretty ~indent_size ~indent:(indent + 1) child in
          if html = "" then None else Some html
        ) node.children in
        let parts = parts @ child_parts @ [prefix ^ serialize_end_tag name] in
        String.concat newline parts
      end
    end

(* Get qualified name for test format: foreign elements are prefixed with
   their namespace ("svg ", "math ", or the raw namespace string). *)
let qualified_name node =
  match node.namespace with
  | Some "svg" -> "svg " ^ node.name
  | Some "mathml" -> "math " ^ node.name
  | Some ns when ns <> "html" -> ns ^ " " ^ node.name
  | _ -> node.name

(* Format attributes for test output: one ["| <padding>name=\"value\""] line
   per attribute, sorted by displayed name. *)
let attrs_to_test_format node indent =
  if node.attrs = [] then []
  else begin
    let padding = String.make (indent + 2) ' ' in
    (* Compute display names first, then sort by display name for canonical output *)
    let with_display_names = List.map (fun (name, value) ->
      let display_name =
        match node.namespace with
        | Some ns when ns <> "html" && List.mem (String.lowercase_ascii name) foreign_attr_adjustments ->
          String.map (fun c -> if c = ':' then ' ' else c) name
        | _ -> name
      in
      (display_name, value)
    ) node.attrs in
    let sorted = List.sort (fun (a, _) (b, _) -> String.compare a b) with_display_names in
    List.map (fun (display_name, value) ->
      Printf.sprintf "| %s%s=\"%s\"" padding display_name value
    ) sorted
  end

(* Convert node to html5lib test format *)
let rec to_test_format ?(indent=0) node =
  match node.name with
  | "#document" | "#document-fragment" ->
    let parts = List.map (to_test_format ~indent:0) node.children in
    String.concat "\n" parts

  | "#comment" ->
    Printf.sprintf "| %s<!-- %s -->" (String.make indent ' ') node.data

  | "!doctype" ->
    let dt = match node.doctype with Some d -> d | None -> { name = None; public_id = None; system_id = None } in
    let name_str = match dt.name with Some n -> " " ^ n | None -> " " in
    let ids_str =
      match dt.public_id, dt.system_id with
      | None, None -> ""
      | pub, sys ->
        let pub_str = match pub with Some p -> p | None -> "" in
        let sys_str = match sys with Some s -> s | None -> "" in
        Printf.sprintf " \"%s\" \"%s\"" pub_str sys_str
    in
    Printf.sprintf "| <!DOCTYPE%s%s>" name_str ids_str

  | "#text" ->
    Printf.sprintf "| %s\"%s\"" (String.make indent ' ') node.data

  | "template" when node.namespace = None || node.namespace = Some "html" ->
    (* HTML templates list their content fragment under a "content" line. *)
    let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
    let attr_lines = attrs_to_test_format node indent in
    let content_line = Printf.sprintf "| %scontent" (String.make (indent + 2) ' ') in
    let content_children =
      match node.template_content with
      | Some tc -> List.map (to_test_format ~indent:(indent + 4)) tc.children
      | None -> []
    in
    String.concat "\n" ([line] @ attr_lines @ [content_line] @ content_children)

  | _ ->
    let line = Printf.sprintf "| %s<%s>" (String.make indent ' ') (qualified_name node) in
    let attr_lines = attrs_to_test_format node indent in
    let child_lines = List.map (to_test_format ~indent:(indent + 2)) node.children in
    String.concat "\n" ([line] @ attr_lines @ child_lines)

(* Extract text content: the data of every text node under [node], in tree
   order, joined with [separator] and optionally trimmed. *)
let to_text ?(separator=" ") ?(strip=true) node =
  let rec collect_text n =
    if is_text n then [n.data]
    else List.concat_map collect_text n.children
  in
  let texts = collect_text node in
  let combined = String.concat separator texts in
  if strip then String.trim combined else combined

(* [renders_empty ~pretty node] is [true] when serializing [node] produces no
   output at all: a text node that is blank after trimming (pretty mode) or has
   empty data (compact mode), or a document / fragment whose children all
   render empty. Elements, comments and doctypes always produce output. *)
let rec renders_empty ~pretty node =
  match node.name with
  | "#text" ->
    if pretty then String.trim node.data = "" else node.data = ""
  | "#document" | "#document-fragment" ->
    List.for_all (renders_empty ~pretty) node.children
  | _ -> false

(* Streaming serialization to a Bytes.Writer.t
   Writes HTML directly to the writer without building the whole document as
   one string. Takes the same options as [to_html] and is meant to produce the
   same output: children that render to nothing are skipped *before* their
   separating newline is written, so whitespace-only text nodes do not leave
   blank lines behind in pretty mode. *)
let rec to_writer ?(pretty=true) ?(indent_size=2) ?(indent=0) (w : Bytes.Writer.t) node =
  let write s = Bytes.Writer.write_string w s in
  let write_prefix () = if pretty then write (String.make (indent * indent_size) ' ') in
  let write_newline () = if pretty then write "\n" in
  let visible children =
    List.filter (fun child -> not (renders_empty ~pretty child)) children
  in
  (* Write the visible children separated (not preceded) by newlines. *)
  let write_separated child_indent children =
    List.iteri (fun idx child ->
      if idx > 0 then write_newline ();
      to_writer ~pretty ~indent_size ~indent:child_indent w child
    ) (visible children)
  in

  match node.name with
  | "#document" ->
    write_separated 0 node.children

  | "#document-fragment" ->
    write_separated indent node.children

  | "#text" ->
    let text = node.data in
    if pretty then begin
      let trimmed = String.trim text in
      if trimmed <> "" then begin
        write_prefix ();
        write (escape_text trimmed)
      end
    end else
      write (escape_text text)

  | "#comment" ->
    write_prefix ();
    write "<!--";
    write node.data;
    write "-->"

  | "!doctype" ->
    write_prefix ();
    write "<!DOCTYPE html>"

  | name ->
    write_prefix ();
    write (serialize_start_tag name node.attrs);

    if not (is_void name) then begin
      if node.children = [] then
        write (serialize_end_tag name)
      else if pretty && List.for_all is_text node.children then begin
        (* Text-only element: keep it on one line. Escaping is per character,
           so escaping each child separately equals escaping the concatenation. *)
        List.iter (fun c -> write (escape_text c.data)) node.children;
        write (serialize_end_tag name)
      end else begin
        List.iter (fun child ->
          write_newline ();
          to_writer ~pretty ~indent_size ~indent:(indent + 1) w child
        ) (visible node.children);
        write_newline ();
        write_prefix ();
        write (serialize_end_tag name)
      end
    end
+19
lib/encoding/bom.ml
···
(* BOM (Byte Order Mark) sniffing *)

(* [sniff data] inspects the first bytes of [data] for a byte order mark.
   Returns [Some (encoding, bom_length_in_bytes)] for a UTF-8 (EF BB BF),
   UTF-16LE (FF FE) or UTF-16BE (FE FF) mark, and [None] otherwise. *)
let sniff data =
  let starts_with signature =
    let n = String.length signature in
    Bytes.length data >= n && Bytes.sub_string data 0 n = signature
  in
  if starts_with "\xEF\xBB\xBF" then Some (Encoding.Utf8, 3)
  else if starts_with "\xFF\xFE" then Some (Encoding.Utf16le, 2)
  else if starts_with "\xFE\xFF" then Some (Encoding.Utf16be, 2)
  else None
+190
lib/encoding/decode.ml
···
(* HTML5 encoding detection and decoding *)

(* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, emitted for input that
   cannot be decoded. *)
let replacement_utf8 = "\xEF\xBF\xBD"

(* Append the UTF-8 encoding of code point [cp] to [buf].
   Callers only pass non-surrogate values in the range 0 .. 0x10FFFF. *)
let add_utf8 buf cp =
  if cp <= 0x7F then
    Buffer.add_char buf (Char.chr cp)
  else if cp <= 0x7FF then begin
    Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end else if cp <= 0xFFFF then begin
    Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end else begin
    Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end

(* [decode_utf16 data ~is_le ~bom_len] transcodes UTF-16 [data] to UTF-8,
   starting [bom_len] bytes in. [is_le] selects little-endian byte order.
   Unpaired surrogates and an odd trailing byte each become U+FFFD. *)
let decode_utf16 data ~is_le ~bom_len =
  let len = Bytes.length data in
  let buf = Buffer.create len in
  (* 16-bit code unit stored at byte offset [pos] (needs pos + 1 < len). *)
  let unit_at pos =
    let b0 = Char.code (Bytes.get data pos) in
    let b1 = Char.code (Bytes.get data (pos + 1)) in
    if is_le then b0 lor (b1 lsl 8) else (b0 lsl 8) lor b1
  in
  let i = ref bom_len in

  while !i + 1 < len do
    let code_unit = unit_at !i in
    i := !i + 2;

    if code_unit >= 0xD800 && code_unit <= 0xDBFF && !i + 1 < len then begin
      (* High surrogate, look for low surrogate *)
      let code_unit2 = unit_at !i in
      if code_unit2 >= 0xDC00 && code_unit2 <= 0xDFFF then begin
        i := !i + 2;
        let high = code_unit - 0xD800 in
        let low = code_unit2 - 0xDC00 in
        add_utf8 buf (0x10000 + ((high lsl 10) lor low))
      end else
        (* Not followed by a low surrogate: replace it and leave the next
           unit to be decoded on its own by the following iteration. *)
        Buffer.add_string buf replacement_utf8
    end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then
      (* Lone surrogate *)
      Buffer.add_string buf replacement_utf8
    else
      add_utf8 buf code_unit
  done;

  (* Odd trailing byte *)
  if !i < len then Buffer.add_string buf replacement_utf8;

  Buffer.contents buf

(* windows-1252 code points for bytes 0x80-0x9F; all other bytes map to the
   code point equal to the byte value. *)
let windows_1252_high = [|
  0x20AC; 0x0081; 0x201A; 0x0192; 0x201E; 0x2026; 0x2020; 0x2021;
  0x02C6; 0x2030; 0x0160; 0x2039; 0x0152; 0x008D; 0x017D; 0x008F;
  0x0090; 0x2018; 0x2019; 0x201C; 0x201D; 0x2022; 0x2013; 0x2014;
  0x02DC; 0x2122; 0x0161; 0x203A; 0x0153; 0x009D; 0x017E; 0x0178;
|]

(* ISO-8859-2 code points for bytes 0xA0-0xFF; bytes below 0xA0 map to the
   code point equal to the byte value. *)
let iso_8859_2_high = [|
  (* 0xA0-0xBF *)
  0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7;
  0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B;
  0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7;
  0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C;
  (* 0xC0-0xFF *)
  0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7;
  0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E;
  0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7;
  0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF;
  0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7;
  0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F;
  0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7;
  0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9;
|]

(* Transcode a single-byte encoding to UTF-8, starting [bom_len] bytes in.
   Bytes in [first .. first + Array.length table - 1] are looked up in
   [table]; every other byte maps to the code point equal to its value. *)
let decode_single_byte data ~bom_len ~first table =
  let len = Bytes.length data in
  let buf = Buffer.create len in
  for i = bom_len to len - 1 do
    let b = Char.code (Bytes.get data i) in
    let idx = b - first in
    let cp = if idx >= 0 && idx < Array.length table then table.(idx) else b in
    add_utf8 buf cp
  done;
  Buffer.contents buf

(* [decode_with_encoding data enc ~bom_len] decodes [data] from [enc] into a
   UTF-8 string, ignoring the first [bom_len] bytes. Malformed input is
   replaced with U+FFFD rather than raising. *)
let decode_with_encoding data enc ~bom_len =
  match enc with
  | Encoding.Utf8 ->
    (* UTF-8: Just validate and replace errors with replacement character *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    (* Skip the BOM by slicing off its *bytes*. [bom_len] is a byte count, but
       every [Uutf.decode] call consumes a whole character, so decoding
       [bom_len] times (as this code used to) threw away real characters at
       the start of the document.
       NOTE(review): Uutf is documented to strip an initial BOM by itself;
       slicing first is correct whether or not it does. *)
    let start = min (max bom_len 0) len in
    let src = Bytes.sub_string data start (len - start) in
    let decoder = Uutf.decoder ~encoding:`UTF_8 (`String src) in
    let rec loop () =
      match Uutf.decode decoder with
      | `Uchar u -> Uutf.Buffer.add_utf_8 buf u; loop ()
      | `Malformed _ -> Buffer.add_string buf replacement_utf8; loop ()
      | `End -> ()
      | `Await -> assert false (* not expected: the source is a complete string *)
    in
    loop ();
    Buffer.contents buf

  | Encoding.Utf16le -> decode_utf16 data ~is_le:true ~bom_len
  | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len

  | Encoding.Windows_1252 ->
    decode_single_byte data ~bom_len ~first:0x80 windows_1252_high

  | Encoding.Iso_8859_2 ->
    decode_single_byte data ~bom_len ~first:0xA0 iso_8859_2_high

  | Encoding.Euc_jp ->
    (* EUC-JP is not decoded properly: ASCII bytes pass through and every
       high byte becomes U+FFFD. *)
    (* This is a simplification - full EUC-JP would need a separate decoder *)
    let len = Bytes.length data in
    let buf = Buffer.create len in
    let s = Bytes.sub_string data bom_len (len - bom_len) in
    String.iter (fun c ->
      if Char.code c <= 0x7F then
        Buffer.add_char buf c
      else
        Buffer.add_string buf replacement_utf8
    ) s;
    Buffer.contents buf

(* [decode data ?transport_encoding ()] picks an encoding for [data] and
   returns [(utf8_text, encoding_used)]. Precedence:
   1. a byte order mark,
   2. [transport_encoding] (e.g. from an HTTP Content-Type), if it is a
      recognised label,
   3. a [<meta>] charset declaration found by the prescan,
   4. UTF-8. *)
let decode data ?transport_encoding () =
  match Bom.sniff data with
  | Some (enc, bom_len) ->
    (decode_with_encoding data enc ~bom_len, enc)
  | None ->
    let from_transport =
      match transport_encoding with
      | Some te -> Labels.normalize_label te
      | None -> None
    in
    let enc =
      match from_transport with
      | Some enc -> enc
      | None ->
        (* The prescan only runs when the transport layer gave no usable label. *)
        (match Prescan.prescan_for_meta_charset data with
         | Some enc -> enc
         | None -> Encoding.Utf8)
    in
    (decode_with_encoding data enc ~bom_len:0, enc)
+4
lib/encoding/dune
···
··· 1 + (library 2 + (name html5rw_encoding) 3 + (public_name html5rw.encoding) 4 + (libraries bytesrw uutf))
+17
lib/encoding/encoding.ml
···
(* HTML5 encoding types *)

(* The encodings this library can detect and decode. *)
type t =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(* [to_string enc] is the lowercase label for [enc]. *)
let to_string enc =
  match enc with
  | Euc_jp -> "euc-jp"
  | Iso_8859_2 -> "iso-8859-2"
  | Windows_1252 -> "windows-1252"
  | Utf16be -> "utf-16be"
  | Utf16le -> "utf-16le"
  | Utf8 -> "utf-8"
+19
lib/encoding/html5rw_encoding.ml
···
(* html5rw.encoding - HTML5 encoding detection and decoding *)

(* Facade module: re-exports the encoding type and the functions of the
   Encoding, Bom, Labels, Prescan and Decode modules under one namespace. *)

(* Same type as [Encoding.t]; the constructors are re-declared so they can be
   used and pattern-matched through this module. *)
type encoding = Encoding.t =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(* Alias of [Encoding.to_string]. *)
let encoding_to_string = Encoding.to_string

(* Alias of [Bom.sniff]. *)
let sniff_bom = Bom.sniff

(* Alias of [Labels.normalize_label]. *)
let normalize_label = Labels.normalize_label

(* Alias of [Prescan.prescan_for_meta_charset]. *)
let prescan_for_meta_charset = Prescan.prescan_for_meta_charset

(* Alias of [Decode.decode]. *)
let decode = Decode.decode
+41
lib/encoding/labels.ml
···
(* Encoding label normalization per WHATWG Encoding Standard *)

(* [normalize_label label] maps an encoding label to an encoding.
   The label is trimmed and compared case-insensitively (ASCII); an empty or
   unrecognised label gives [None]. *)
let normalize_label label =
  match String.lowercase_ascii (String.trim label) with
  | "" -> None
  (* Security: never allow utf-7 *)
  | "utf-7" | "utf7" | "x-utf-7" -> Some Encoding.Windows_1252
  | "utf-8" | "utf8" -> Some Encoding.Utf8
  (* HTML treats latin-1 labels as windows-1252 *)
  | "iso-8859-1" | "iso8859-1" | "latin1" | "latin-1" | "l1"
  | "cp819" | "ibm819" -> Some Encoding.Windows_1252
  | "windows-1252" | "windows1252" | "cp1252" | "x-cp1252" ->
    Some Encoding.Windows_1252
  | "iso-8859-2" | "iso8859-2" | "latin2" | "latin-2" ->
    Some Encoding.Iso_8859_2
  | "euc-jp" | "eucjp" -> Some Encoding.Euc_jp
  (* Default to LE for ambiguous utf-16 *)
  | "utf-16" | "utf16" -> Some Encoding.Utf16le
  | "utf-16le" | "utf16le" -> Some Encoding.Utf16le
  | "utf-16be" | "utf16be" -> Some Encoding.Utf16be
  | _ -> None

(* Like [normalize_label], for labels declared in a [<meta>] element:
   per HTML meta charset handling, a UTF-16 declaration is treated as UTF-8. *)
let normalize_meta_declared label =
  match normalize_label label with
  | Some (Encoding.Utf16le | Encoding.Utf16be) -> Some Encoding.Utf8
  | other -> other
+268
lib/encoding/prescan.ml
···
(* HTML meta charset prescan per WHATWG spec *)

let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20']

(* [true] for TAB, LF, FF, CR and SPACE. *)
let is_ascii_whitespace c = List.mem c ascii_whitespace

let is_ascii_alpha c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')

(* Lowercase an ASCII uppercase letter; every other char is returned as is. *)
let ascii_lower c =
  if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32)
  else c

(* [skip_whitespace data i len] is the first index >= [i] (and <= [len]) that
   does not hold ASCII whitespace. *)
let skip_whitespace data i len =
  let j = ref i in
  while !j < len && is_ascii_whitespace (Bytes.get data !j) do
    incr j
  done;
  !j

(* [strip_whitespace data start len] is the [len]-byte slice of [data] at
   [start], as a string, with leading and trailing ASCII whitespace removed. *)
let strip_whitespace data start len =
  let s = ref start in
  let e = ref (start + len) in
  while !s < !e && is_ascii_whitespace (Bytes.get data !s) do incr s done;
  while !e > !s && is_ascii_whitespace (Bytes.get data (!e - 1)) do decr e done;
  Bytes.sub_string data !s (!e - !s)

(* [extract_charset_from_content content] pulls the charset value out of a
   [content] attribute such as ["text/html; charset=utf-8"].
   It looks for a case-insensitive "charset" followed by optional whitespace
   and '='; an occurrence without '=' is skipped and the search continues.
   The value is either quoted (an unterminated quote gives [None]) or runs up
   to the next whitespace or ';'. *)
let extract_charset_from_content content =
  let len = String.length content in
  let skip_ws j =
    while !j < len && is_ascii_whitespace content.[!j] do incr j done
  in
  let rec find_charset i =
    if i + 7 > len then None
    else if String.lowercase_ascii (String.sub content i 7) <> "charset" then
      find_charset (i + 1)
    else begin
      let j = ref (i + 7) in
      skip_ws j;
      if !j >= len || content.[!j] <> '=' then find_charset (i + 1)
      else begin
        incr j;
        skip_ws j;
        if !j >= len then None
        else begin
          let quote =
            if content.[!j] = '"' || content.[!j] = '\'' then begin
              let q = content.[!j] in
              incr j;
              Some q
            end else None
          in
          let start = !j in
          match quote with
          | Some q ->
            while !j < len && content.[!j] <> q do incr j done;
            if !j >= len then None
            else Some (String.sub content start (!j - start))
          | None ->
            while !j < len &&
                  not (is_ascii_whitespace content.[!j]) &&
                  content.[!j] <> ';' do
              incr j
            done;
            Some (String.sub content start (!j - start))
        end
      end
    end
  in
  find_charset 0

(* [prescan_for_meta_charset data] scans the start of [data] for a
   [<meta charset=...>] or [<meta http-equiv="content-type" content=...>]
   declaration and returns the declared encoding, if any.

   Scanning stops at the first declaration that normalizes to a known
   encoding, at the end of [data], after [max_total] bytes, or once
   [max_non_comment] bytes outside comments have been examined.
   Comments are skipped without counting against [max_non_comment]; an
   unclosed comment ends the scan. *)
let prescan_for_meta_charset data =
  let len = Bytes.length data in
  let max_non_comment = 1024 in
  let max_total = 65536 in
  let i = ref 0 in
  let non_comment = ref 0 in

  let result = ref None in

  (* Skip the remainder of a tag we are not interested in, starting at
     [start]. Returns the index just past the closing '>' (a '>' inside a
     quoted run does not count), or the index at which the input or one of
     the scan limits ran out. Every byte consumed counts as non-comment.

     The position after '>' must be returned as is: the previous code forced
     it to [len] to leave its loop, which also ended the whole prescan after
     the first ordinary tag. *)
  let skip_tag start =
    let j = ref start in
    let in_quote = ref None in
    let closed = ref false in
    while not !closed && !j < len && !j < max_total
          && !non_comment < max_non_comment do
      let c = Bytes.get data !j in
      (match !in_quote with
       | None ->
         if c = '"' || c = '\'' then in_quote := Some c
         else if c = '>' then closed := true
       | Some q ->
         if c = q then in_quote := None);
      incr j;
      incr non_comment
    done;
    !j
  in

  while !result = None && !i < len && !i < max_total && !non_comment < max_non_comment do
    if Bytes.get data !i <> '<' then begin
      incr i;
      incr non_comment
    end
    (* Check for comment *)
    else if !i + 3 < len &&
            Bytes.get data (!i + 1) = '!' &&
            Bytes.get data (!i + 2) = '-' &&
            Bytes.get data (!i + 3) = '-' then begin
      (* Find the terminating "-->". The search starts at the opening dashes
         because the spec lets them double as the closing ones, so "<!-->"
         is a complete comment. *)
      let j = ref (!i + 2) in
      while !j + 2 < len && not (
        Bytes.get data !j = '-' &&
        Bytes.get data (!j + 1) = '-' &&
        Bytes.get data (!j + 2) = '>'
      ) do incr j done;
      if !j + 2 < len then
        i := !j + 3
      else
        (* Unclosed comment: stop scanning. [i] has to move to the end,
           otherwise this loop would never terminate. *)
        i := len
    end
    (* Check for end tag - skip it *)
    else if !i + 1 < len && Bytes.get data (!i + 1) = '/' then
      i := skip_tag (!i + 2)
    (* Check for tag *)
    else if !i + 1 < len && is_ascii_alpha (Bytes.get data (!i + 1)) then begin
      let j = ref (!i + 1) in
      while !j < len && is_ascii_alpha (Bytes.get data !j) do incr j done;
      let tag_name =
        String.lowercase_ascii (Bytes.sub_string data (!i + 1) (!j - !i - 1))
      in

      if tag_name <> "meta" then
        (* Skip non-meta tag *)
        i := skip_tag !j
      else begin
        (* Parse meta tag attributes *)
        let charset = ref None in
        let http_equiv = ref None in
        let content = ref None in
        let k = ref !j in
        let saw_gt = ref false in

        while not !saw_gt && !k < len && !k < max_total do
          let c = Bytes.get data !k in
          if c = '>' then begin
            saw_gt := true;
            incr k
          end else if c = '<' then begin
            (* A new tag starts inside this one: abandon the meta; the outer
               loop moves one byte past its '<' and rescans from there. *)
            k := len
          end else if is_ascii_whitespace c || c = '/' then begin
            incr k
          end else begin
            (* Attribute name *)
            let attr_start = !k in
            while !k < len &&
                  not (is_ascii_whitespace (Bytes.get data !k)) &&
                  Bytes.get data !k <> '=' &&
                  Bytes.get data !k <> '>' &&
                  Bytes.get data !k <> '/' &&
                  Bytes.get data !k <> '<' do
              incr k
            done;
            let attr_name =
              String.lowercase_ascii (Bytes.sub_string data attr_start (!k - attr_start))
            in
            k := skip_whitespace data !k len;

            (* Attribute value, if an '=' follows. An unterminated quoted
               value leaves [value] as [None]. *)
            let value = ref None in
            if !k < len && Bytes.get data !k = '=' then begin
              incr k;
              k := skip_whitespace data !k len;
              if !k < len then begin
                let qc = Bytes.get data !k in
                if qc = '"' || qc = '\'' then begin
                  incr k;
                  let val_start = !k in
                  while !k < len && Bytes.get data !k <> qc do incr k done;
                  if !k < len then begin
                    value := Some (Bytes.sub_string data val_start (!k - val_start));
                    incr k
                  end
                end else begin
                  let val_start = !k in
                  while !k < len &&
                        not (is_ascii_whitespace (Bytes.get data !k)) &&
                        Bytes.get data !k <> '>' &&
                        Bytes.get data !k <> '<' do
                    incr k
                  done;
                  value := Some (Bytes.sub_string data val_start (!k - val_start))
                end
              end
            end;

            if attr_name = "charset" then
              charset := !value
            else if attr_name = "http-equiv" then
              http_equiv := !value
            else if attr_name = "content" then
              content := !value
          end
        done;

        if !saw_gt then begin
          (* Check for charset *)
          (match !charset with
           | Some cs ->
             (match Labels.normalize_meta_declared cs with
              | Some enc -> result := Some enc
              | None -> ())
           | None -> ());

          (* Check for http-equiv="content-type" with content *)
          if !result = None then
            (match !http_equiv, !content with
             | Some he, Some ct when String.lowercase_ascii he = "content-type" ->
               (match extract_charset_from_content ct with
                | Some extracted ->
                  (match Labels.normalize_meta_declared extracted with
                   | Some enc -> result := Some enc
                   | None -> ())
                | None -> ())
             | _ -> ());

          i := !k;
          non_comment := !non_comment + (!k - !j)
        end else begin
          incr i;
          incr non_comment
        end
      end
    end else begin
      incr i;
      incr non_comment
    end
  done;

  !result
+192
lib/entities/decode.ml
···
(* HTML5 entity decoding *)

(* ASCII letter test. *)
let is_alpha c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')

(* ASCII decimal digit test. *)
let is_digit c =
  c >= '0' && c <= '9'

(* ASCII letter or decimal digit. *)
let is_alnum c =
  is_alpha c || is_digit c

(* ASCII hexadecimal digit, either case. *)
let is_hex_digit c =
  is_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')

(* Longest prefix of [name] that [Entity_table] reports as a legacy entity and
   for which a value can be looked up. Returns the value together with the
   length of the matched prefix, or [None] when no prefix qualifies. *)
let longest_legacy_prefix name =
  let rec shrink k =
    if k <= 0 then None
    else
      let prefix = String.sub name 0 k in
      if Entity_table.is_legacy prefix then
        (match Entity_table.lookup prefix with
         | Some value -> Some (value, k)
         | None -> shrink (k - 1))
      else shrink (k - 1)
  in
  shrink (String.length name)

(* [decode_entities_in_text text ~in_attribute] returns [text] with its
   character references decoded.

   - Numeric references ([&#123;], [&#x7B;], semicolon optional) are decoded
     through [Numeric_ref.decode]; when that returns [None], or when no digits
     follow the [&#] / [&#x] prefix, the reference is copied through verbatim.
   - Named references terminated by [;] are looked up with
     [Entity_table.lookup]. When there is no exact match and [in_attribute] is
     false, the longest legacy prefix of the name is decoded instead and the
     remaining characters are treated as ordinary text; otherwise the whole
     reference is copied verbatim.
   - Named references without [;] are decoded only when the name, or its
     longest prefix, is a legacy entity. When [in_attribute] is true such a
     match is left undecoded if the character following the matched name is
     alphanumeric or ['='].
   - Anything else leaves the ['&'] as a literal character. *)
let decode_entities_in_text text ~in_attribute =
  let len = String.length text in
  let buf = Buffer.create len in
  let i = ref 0 in

  (* Attribute-value rule for semicolon-less legacy references: the reference
     is not decoded when the character at [pos] (the one right after the
     matched name) is alphanumeric or '='. *)
  let blocked_at pos =
    in_attribute && pos < len && (is_alnum text.[pos] || text.[pos] = '=')
  in

  while !i < len do
    match String.index_from_opt text !i '&' with
    | None ->
      (* No more ampersands: copy the remainder and stop. *)
      Buffer.add_substring buf text !i (len - !i);
      i := len
    | Some amp ->
      (* Copy the plain text that precedes the ampersand. *)
      if amp > !i then Buffer.add_substring buf text !i (amp - !i);
      i := amp;

      (* Copy text.[amp .. stop-1] unchanged and resume scanning at [stop]. *)
      let copy_through stop =
        Buffer.add_substring buf text amp (stop - amp);
        i := stop
      in
      (* Emit the ampersand literally and resume right after it. *)
      let literal_amp () =
        Buffer.add_char buf '&';
        i := amp + 1
      in

      let start = amp + 1 in
      if start >= len then
        (* Ampersand is the last character of the input. *)
        literal_amp ()
      else if text.[start] = '#' then begin
        (* Numeric reference: "&#" [xX]? digits ";"? *)
        let pos = ref (start + 1) in
        let is_hex = !pos < len && (text.[!pos] = 'x' || text.[!pos] = 'X') in
        if is_hex then incr pos;
        let digit_start = !pos in
        let accepts = if is_hex then is_hex_digit else is_digit in
        while !pos < len && accepts text.[!pos] do incr pos done;
        let digits = String.sub text digit_start (!pos - digit_start) in
        (* A trailing semicolon, when present, belongs to the reference. *)
        let stop = if !pos < len && text.[!pos] = ';' then !pos + 1 else !pos in
        let decoded =
          if digits = "" then None else Numeric_ref.decode digits ~is_hex
        in
        match decoded with
        | Some s ->
          Buffer.add_string buf s;
          i := stop
        | None ->
          (* No digits, or the decoder rejected them: keep the text as-is. *)
          copy_through stop
      end else begin
        (* Named reference: collect the run of alphanumerics after '&'. *)
        let pos = ref start in
        while !pos < len && is_alnum text.[!pos] do incr pos done;
        let name_end = !pos in
        let name = String.sub text start (name_end - start) in
        let has_semicolon = name_end < len && text.[name_end] = ';' in
        if name = "" then
          literal_amp ()
        else if has_semicolon then begin
          match Entity_table.lookup name with
          | Some value ->
            Buffer.add_string buf value;
            i := name_end + 1
          | None ->
            (* No exact match. In text content fall back to the longest legacy
               prefix; in attributes the reference is left untouched. *)
            let fallback =
              if in_attribute then None else longest_legacy_prefix name
            in
            (match fallback with
             | Some (value, matched_len) ->
               Buffer.add_string buf value;
               i := start + matched_len
             | None -> copy_through (name_end + 1))
        end else begin
          (* No semicolon: only legacy entities may match. An exact legacy
             name is tried first and, if its lookup fails, no shorter prefix
             is attempted. *)
          let candidate =
            if Entity_table.is_legacy name then
              (match Entity_table.lookup name with
               | Some value -> Some (value, String.length name)
               | None -> None)
            else longest_legacy_prefix name
          in
          match candidate with
          | Some (value, matched_len)
            when not (blocked_at (start + matched_len)) ->
            Buffer.add_string buf value;
            i := start + matched_len
          | Some _ | None -> literal_amp ()
        end
      end
  done;

  Buffer.contents buf
+8
lib/entities/dune
···
; Entity decoding library, published as the html5rw.entities sub-library.
(library
 (name html5rw_entities)
 (public_name html5rw.entities))

; entity_table.ml is produced at build time by running gen_entities.exe with
; the path of data/entities.json and the path of the target as arguments.
(rule
 (target entity_table.ml)
 (deps ../../data/entities.json)
 (action (run ../../gen/gen_entities.exe %{deps} %{target})))
+13
lib/entities/html5rw_entities.ml
···
(* html5rw.entities - HTML5 entity decoding *)

(* Decode the character references in [text]. [in_attribute] selects the
   attribute-value rules of [Decode.decode_entities_in_text]. *)
let decode text ~in_attribute =
  Decode.decode_entities_in_text text ~in_attribute

(* Decode the digits of a numeric character reference; [is_hex] tells whether
   [text] holds hexadecimal digits. See [Numeric_ref.decode]. *)
let decode_numeric text ~is_hex =
  Numeric_ref.decode text ~is_hex

(* Look up a named entity in the generated entity table. *)
let lookup name =
  Entity_table.lookup name

(* Whether [name] is flagged as a legacy entity by the generated table
   (the decoder uses this for references written without a semicolon). *)
let is_legacy name =
  Entity_table.is_legacy name

(* UTF-8 encoding of a code point. See [Numeric_ref.codepoint_to_utf8]. *)
let codepoint_to_utf8 cp =
  Numeric_ref.codepoint_to_utf8 cp

(* Re-export of the numeric reference module. *)
module Numeric_ref = Numeric_ref
+85
lib/entities/numeric_ref.ml
···
(* HTML5 numeric character reference decoding *)

(* HTML5 spec: numeric character reference replacements (§13.2.5.73).
   Each pair maps a referenced code point to the code point actually emitted.
   The entries MUST stay sorted by their first component: [find_replacement]
   stops scanning as soon as it meets a larger key. *)
let numeric_replacements = [|
  (0x00, 0xFFFD); (* NULL -> REPLACEMENT CHARACTER *)
  (0x80, 0x20AC); (* -> EURO SIGN *)
  (0x82, 0x201A); (* -> SINGLE LOW-9 QUOTATION MARK *)
  (0x83, 0x0192); (* -> LATIN SMALL LETTER F WITH HOOK *)
  (0x84, 0x201E); (* -> DOUBLE LOW-9 QUOTATION MARK *)
  (0x85, 0x2026); (* -> HORIZONTAL ELLIPSIS *)
  (0x86, 0x2020); (* -> DAGGER *)
  (0x87, 0x2021); (* -> DOUBLE DAGGER *)
  (0x88, 0x02C6); (* -> MODIFIER LETTER CIRCUMFLEX ACCENT *)
  (0x89, 0x2030); (* -> PER MILLE SIGN *)
  (0x8A, 0x0160); (* -> LATIN CAPITAL LETTER S WITH CARON *)
  (0x8B, 0x2039); (* -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK *)
  (0x8C, 0x0152); (* -> LATIN CAPITAL LIGATURE OE *)
  (0x8E, 0x017D); (* -> LATIN CAPITAL LETTER Z WITH CARON *)
  (0x91, 0x2018); (* -> LEFT SINGLE QUOTATION MARK *)
  (0x92, 0x2019); (* -> RIGHT SINGLE QUOTATION MARK *)
  (0x93, 0x201C); (* -> LEFT DOUBLE QUOTATION MARK *)
  (0x94, 0x201D); (* -> RIGHT DOUBLE QUOTATION MARK *)
  (0x95, 0x2022); (* -> BULLET *)
  (0x96, 0x2013); (* -> EN DASH *)
  (0x97, 0x2014); (* -> EM DASH *)
  (0x98, 0x02DC); (* -> SMALL TILDE *)
  (0x99, 0x2122); (* -> TRADE MARK SIGN *)
  (0x9A, 0x0161); (* -> LATIN SMALL LETTER S WITH CARON *)
  (0x9B, 0x203A); (* -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK *)
  (0x9C, 0x0153); (* -> LATIN SMALL LIGATURE OE *)
  (0x9E, 0x017E); (* -> LATIN SMALL LETTER Z WITH CARON *)
  (0x9F, 0x0178); (* -> LATIN CAPITAL LETTER Y WITH DIAERESIS *)
|]

(* [find_replacement cp] is [Some v] when the table maps [cp] to [v], and
   [None] otherwise. Linear scan with early exit on the sorted table. *)
let find_replacement cp =
  let rec search i =
    if i >= Array.length numeric_replacements then None
    else
      let (k, v) = numeric_replacements.(i) in
      if k = cp then Some v
      else if k > cp then None
      else search (i + 1)
  in
  search 0

(* [codepoint_to_utf8 cp] encodes [cp] as a 1- to 4-byte UTF-8 sequence.
   The argument is not validated here: [decode] filters surrogates and
   out-of-range values before calling this function, and other callers are
   expected to pass a value in 0 .. 0x10FFFF. *)
let codepoint_to_utf8 cp =
  let buf = Buffer.create 4 in
  if cp <= 0x7F then
    Buffer.add_char buf (Char.chr cp)
  else if cp <= 0x7FF then begin
    Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end else if cp <= 0xFFFF then begin
    Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end else begin
    Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
    Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
  end;
  Buffer.contents buf

let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)

(* [decode text ~is_hex] decodes the digits of a numeric character reference
   (the part between "&#" / "&#x" and the optional ";") to UTF-8.

   Returns [None] when [text] is empty or contains anything other than
   decimal digits (or hexadecimal digits when [is_hex] is true). Otherwise
   returns [Some s] where [s] is:
   - the UTF-8 encoding of the table replacement when the value appears in
     [numeric_replacements] (this covers 0, which becomes U+FFFD);
   - U+FFFD when the value is above 0x10FFFF or is a surrogate;
   - the UTF-8 encoding of the value itself in every other case.

   The digits are accumulated by hand rather than with [int_of_string_opt]:
   that function fails on decimal overflow (so an over-long reference was kept
   as text instead of becoming U+FFFD), can wrap long hexadecimal literals to
   a negative value (which then made [Char.chr] raise), and accepts signs,
   underscores and radix prefixes that are not valid in HTML. *)
let decode text ~is_hex =
  let base = if is_hex then 16 else 10 in
  let digit_value c =
    match c with
    | '0' .. '9' -> Some (Char.code c - Char.code '0')
    | 'a' .. 'f' when is_hex -> Some (Char.code c - Char.code 'a' + 10)
    | 'A' .. 'F' when is_hex -> Some (Char.code c - Char.code 'A' + 10)
    | _ -> None
  in
  let len = String.length text in
  let rec parse idx acc =
    if idx >= len then Some acc
    else
      match digit_value text.[idx] with
      | None -> None
      | Some d ->
        (* Saturate once past the Unicode range: the exact value no longer
           matters and freezing it rules out integer overflow. *)
        let acc = if acc > 0x10FFFF then acc else (acc * base) + d in
        parse (idx + 1) acc
  in
  if len = 0 then None
  else
    match parse 0 0 with
    | None -> None
    | Some cp ->
      (* Apply HTML5 replacements *)
      let cp =
        match find_replacement cp with
        | Some replacement -> replacement
        | None -> cp
      in
      (* Out-of-range values and surrogates become U+FFFD. A zero value never
         reaches this point unchanged: the table already maps it to U+FFFD. *)
      if cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF) then
        Some replacement_char
      else
        Some (codepoint_to_utf8 cp)
+11
lib/html5rw/dune
···
; Top-level html5rw library. It depends on bytesrw and on the html5rw.*
; sub-libraries listed below.
(library
 (name html5rw)
 (public_name html5rw)
 (libraries
  bytesrw
  html5rw.parser
  html5rw.dom
  html5rw.tokenizer
  html5rw.encoding
  html5rw.selector
  html5rw.entities))
+302
lib/html5rw/html5rw.ml
···
(** Html5rw - Pure OCaml HTML5 Parser

    This module provides a complete HTML5 parsing solution following the
    WHATWG specification. It uses bytesrw for streaming input/output.

    {2 Quick Start}

    Parse HTML from a reader:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file:
    {[
      open Bytesrw
      let ic = open_in "page.html" in
      let reader = Bytes.Reader.of_in_channel ic in
      let result = Html5rw.parse reader in
      close_in ic
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]}
*)

(** {1 Sub-modules} *)

(** DOM types and manipulation functions *)
module Dom = Html5rw_dom

(** HTML5 tokenizer *)
module Tokenizer = Html5rw_tokenizer

(** Encoding detection and decoding *)
module Encoding = Html5rw_encoding

(** CSS selector engine *)
module Selector = Html5rw_selector

(** HTML entity decoding *)
module Entities = Html5rw_entities

(** Low-level parser access *)
module Parser = Html5rw_parser

(** {1 Core Types} *)

(** DOM node type. See {!Dom} for manipulation functions. *)
type node = Dom.node

(** Doctype information. Re-exported from {!Dom} so the fields can be used
    without qualification. *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Quirks mode as determined during parsing *)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

(** Character encoding detected or specified *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** A parse error. Inspect it with {!error_code}, {!error_line} and
    {!error_column}. *)
type parse_error = Parser.parse_error

(** Fragment parsing context *)
type fragment_context = Parser.fragment_context

(** Create a fragment parsing context.
    @param tag_name Tag name of the context element
    @param namespace Namespace (None for HTML, Some "svg", Some "mathml")
*)
let make_fragment_context = Parser.make_fragment_context

(** Get the tag name from a fragment context *)
let fragment_context_tag = Parser.fragment_context_tag

(** Get the namespace from a fragment context *)
let fragment_context_namespace = Parser.fragment_context_namespace

(** Get the error code string *)
let error_code = Parser.error_code

(** Get the line number of an error (1-indexed) *)
let error_line = Parser.error_line

(** Get the column number of an error (1-indexed) *)
let error_column = Parser.error_column

(** Result of parsing an HTML document *)
type t = {
  root : node;  (** Root node of the parsed tree. *)
  errors : parse_error list;  (** Errors reported by the parser. *)
  encoding : encoding option;  (** Encoding reported by the parser, if any. *)
}

(* Internal: build the public result record from a [Parser.t] by copying its
   root, errors and encoding through the corresponding [Parser] accessors. *)
let of_parser_result (p : Parser.t) : t =
  { root = Parser.root p; errors = Parser.errors p; encoding = Parser.encoding p }

(** {1 Parsing Functions} *)

(** Parse HTML from a [Bytes.Reader.t].

    This is the primary parsing function. Create a reader from any source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
let parse ?collect_errors ?fragment_context reader =
  of_parser_result (Parser.parse ?collect_errors ?fragment_context reader)

(** Parse raw bytes with automatic encoding detection.

    This function implements the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for <meta charset>
    3. Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
let parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes =
  of_parser_result (Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context bytes)

(** {1 Querying} *)

(** Query the DOM tree with a CSS selector. The selector is evaluated against
    the [root] of the parse result.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
let query t selector = Selector.query t.root selector

(** Check if a node matches a CSS selector. *)
let matches node selector = Selector.matches node selector

(** {1 Serialization} *)

(** Write the DOM tree to a [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_writer ?pretty ?indent_size t writer =
  (* Note the argument order: [Dom.to_writer] takes the writer first. *)
  Dom.to_writer ?pretty ?indent_size writer t.root

(** Serialize the DOM tree to a string.

    Convenience function when the output fits in memory.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
let to_string ?pretty ?indent_size t = Dom.to_html ?pretty ?indent_size t.root

(** Extract text content from the DOM tree.

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
let to_text ?separator ?strip t = Dom.to_text ?separator ?strip t.root

(** Serialize to html5lib test format (for testing). *)
let to_test_format t = Dom.to_test_format t.root

(** {1 Result Accessors} *)

(** Get the root node of the parsed document. *)
let root t = t.root

(** Get parse errors (if error collection was enabled). *)
let errors t = t.errors

(** Get the detected encoding (if parsed from bytes). *)
let encoding t = t.encoding

(** {1 DOM Utilities}

    Common DOM operations are available directly. For the full API,
    see the {!Dom} module.
*)

(** Create an element node.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
let create_element = Dom.create_element

(** Create a text node. *)
let create_text = Dom.create_text

(** Create a comment node. *)
let create_comment = Dom.create_comment

(** Create an empty document node. *)
let create_document = Dom.create_document

(** Create a document fragment node. *)
let create_document_fragment = Dom.create_document_fragment

(** Create a doctype node. *)
let create_doctype = Dom.create_doctype

(** Append a child node to a parent. *)
let append_child = Dom.append_child

(** Insert a node before a reference node. *)
let insert_before = Dom.insert_before

(** Remove a child node from its parent. *)
let remove_child = Dom.remove_child

(** Get an attribute value. *)
let get_attr = Dom.get_attr

(** Set an attribute value. *)
let set_attr = Dom.set_attr

(** Check if a node has an attribute. *)
let has_attr = Dom.has_attr

(** Get all descendant nodes. *)
let descendants = Dom.descendants

(** Get all ancestor nodes (from parent to root). *)
let ancestors = Dom.ancestors

(** Get text content of a node and its descendants. *)
let get_text_content = Dom.get_text_content

(** Clone a node.
    @param deep If true, also clone descendants (default: false)
*)
let clone = Dom.clone

(** {1 Node Predicates} *)

(** Test if a node is an element. *)
let is_element = Dom.is_element

(** Test if a node is a text node. *)
let is_text = Dom.is_text

(** Test if a node is a comment node. *)
let is_comment = Dom.is_comment

(** Test if a node is a document node. *)
let is_document = Dom.is_document

(** Test if a node is a document fragment. *)
let is_document_fragment = Dom.is_document_fragment

(** Test if a node is a doctype node. *)
let is_doctype = Dom.is_doctype

(** Test if a node has children. *)
let has_children = Dom.has_children
+324
lib/html5rw/html5rw.mli
···
(** Html5rw - Pure OCaml HTML5 Parser

    This module provides a complete HTML5 parsing solution following the
    WHATWG specification. It uses bytesrw for streaming input/output.

    {2 Quick Start}

    Parse HTML from a reader:
    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello, world!</p>" in
      let result = Html5rw.parse reader in
      let html = Html5rw.to_string result
    ]}

    Parse from a file:
    {[
      open Bytesrw
      let ic = open_in "page.html" in
      let reader = Bytes.Reader.of_in_channel ic in
      let result = Html5rw.parse reader in
      close_in ic
    ]}

    Query with CSS selectors:
    {[
      let result = Html5rw.parse reader in
      let divs = Html5rw.query result "div.content"
    ]}
*)

(** {1 Sub-modules} *)

(** DOM types and manipulation functions *)
module Dom = Html5rw_dom

(** HTML5 tokenizer *)
module Tokenizer = Html5rw_tokenizer

(** Encoding detection and decoding *)
module Encoding = Html5rw_encoding

(** CSS selector engine *)
module Selector = Html5rw_selector

(** HTML entity decoding *)
module Entities = Html5rw_entities

(** Low-level parser access *)
module Parser = Html5rw_parser

(** {1 Core Types} *)

(** DOM node type. See {!Dom} for manipulation functions. *)
type node = Dom.node

(** Doctype information. Re-exported from {!Dom} so the fields can be used
    without qualification. *)
type doctype_data = Dom.doctype_data = {
  name : string option;
  public_id : string option;
  system_id : string option;
}

(** Quirks mode as determined during parsing *)
type quirks_mode = Dom.quirks_mode = No_quirks | Quirks | Limited_quirks

(** Character encoding detected or specified *)
type encoding = Encoding.encoding =
  | Utf8
  | Utf16le
  | Utf16be
  | Windows_1252
  | Iso_8859_2
  | Euc_jp

(** A parse error encountered during HTML5 parsing.

    HTML5 parsing never fails - the specification defines error recovery
    for all malformed input. However, conformance checkers can report
    these errors. Enable error collection with [~collect_errors:true].

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
    WHATWG parse error definitions
*)
type parse_error = Parser.parse_error

(** Get the error code (e.g., "unexpected-null-character"). *)
val error_code : parse_error -> string

(** Get the line number where the error occurred (1-indexed). *)
val error_line : parse_error -> int

(** Get the column number where the error occurred (1-indexed). *)
val error_column : parse_error -> int

(** Context element for HTML fragment parsing (innerHTML).

    When parsing HTML fragments, you must specify what element would
    contain the fragment. This affects how certain elements are handled.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
    The fragment parsing algorithm
*)
type fragment_context = Parser.fragment_context

(** Create a fragment parsing context.

    @param tag_name Tag name of the context element (e.g., "div", "tr")
    @param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]

    {[
      (* Parse as innerHTML of a <ul> *)
      let ctx = Html5rw.make_fragment_context ~tag_name:"ul" ()

      (* Parse as innerHTML of an SVG <g> element *)
      let ctx = Html5rw.make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
    ]}
*)
val make_fragment_context : tag_name:string -> ?namespace:string option ->
  unit -> fragment_context

(** Get the tag name of a fragment context. *)
val fragment_context_tag : fragment_context -> string

(** Get the namespace of a fragment context. *)
val fragment_context_namespace : fragment_context -> string option

(** Result of parsing an HTML document.

    Contains the parsed DOM tree, any errors encountered, and the
    detected encoding (when parsing from bytes).
*)
type t = {
  root : node;  (** Root node of the parsed tree; also returned by {!root}. *)
  errors : parse_error list;  (** Parse errors; also returned by {!errors}. *)
  encoding : encoding option;  (** Encoding; also returned by {!encoding}. *)
}

(** {1 Parsing Functions} *)

(** Parse HTML from a [Bytes.Reader.t].

    This is the primary parsing function. Create a reader from any source:
    - [Bytes.Reader.of_string s] for strings
    - [Bytes.Reader.of_in_channel ic] for files
    - [Bytes.Reader.of_bytes b] for byte buffers

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<html><body>Hello</body></html>" in
      let result = Html5rw.parse reader
    ]}

    @param collect_errors If true, collect parse errors (default: false)
    @param fragment_context Context element for fragment parsing
*)
val parse : ?collect_errors:bool -> ?fragment_context:fragment_context -> Bytesrw.Bytes.Reader.t -> t

(** Parse raw bytes with automatic encoding detection.

    This function implements the WHATWG encoding sniffing algorithm:
    1. Check for BOM (Byte Order Mark)
    2. Prescan for <meta charset>
    3. Fall back to UTF-8

    @param collect_errors If true, collect parse errors (default: false)
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context element for fragment parsing
*)
val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string -> ?fragment_context:fragment_context -> bytes -> t

(** {1 Querying} *)

(** Query the DOM tree with a CSS selector.

    Supported selectors:
    - Tag: [div], [p], [span]
    - ID: [#myid]
    - Class: [.myclass]
    - Universal: [*]
    - Attribute: [[attr]], [[attr="value"]], [[attr~="value"]], [[attr|="value"]]
    - Pseudo-classes: [:first-child], [:last-child], [:nth-child(n)]
    - Combinators: descendant (space), child (>), adjacent sibling (+), general sibling (~)

    {[
      let divs = Html5rw.query result "div.content > p"
    ]}

    @raise Selector.Selector_error if the selector is invalid
*)
val query : t -> string -> node list

(** Check if a node matches a CSS selector. The second argument is the
    selector string. *)
val matches : node -> string -> bool

(** {1 Serialization} *)

(** Write the DOM tree to a [Bytes.Writer.t].

    {[
      open Bytesrw
      let buf = Buffer.create 1024 in
      let writer = Bytes.Writer.of_buffer buf in
      Html5rw.to_writer result writer;
      Bytes.Writer.write_eod writer;
      let html = Buffer.contents buf
    ]}

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
val to_writer : ?pretty:bool -> ?indent_size:int -> t -> Bytesrw.Bytes.Writer.t -> unit

(** Serialize the DOM tree to a string.

    Convenience function when the output fits in memory.

    @param pretty If true, format with indentation (default: true)
    @param indent_size Number of spaces per indent level (default: 2)
*)
val to_string : ?pretty:bool -> ?indent_size:int -> t -> string

(** Extract text content from the DOM tree.

    @param separator String to insert between text nodes (default: " ")
    @param strip If true, trim whitespace (default: true)
*)
val to_text : ?separator:string -> ?strip:bool -> t -> string

(** Serialize to html5lib test format (for testing). *)
val to_test_format : t -> string

(** {1 Result Accessors} *)

(** Get the root node of the parsed document. *)
val root : t -> node

(** Get parse errors (if error collection was enabled). *)
val errors : t -> parse_error list

(** Get the detected encoding (if parsed from bytes). *)
val encoding : t -> encoding option

(** {1 DOM Utilities}

    Common DOM operations are available directly. For the full API,
    see the {!Dom} module.
*)

(** Create an element node.
    @param namespace None for HTML, Some "svg" or Some "mathml" for foreign content
    @param attrs List of (name, value) attribute pairs
*)
val create_element : string -> ?namespace:string option -> ?attrs:(string * string) list -> unit -> node

(** Create a text node. *)
val create_text : string -> node

(** Create a comment node. *)
val create_comment : string -> node

(** Create an empty document node. *)
val create_document : unit -> node

(** Create a document fragment node. *)
val create_document_fragment : unit -> node

(** Create a doctype node. *)
val create_doctype : ?name:string -> ?public_id:string -> ?system_id:string -> unit -> node

(** Append a child node to a parent. *)
val append_child : node -> node -> unit

(** Insert a node before a reference node. *)
val insert_before : node -> node -> node -> unit

(** Remove a child node from its parent. *)
val remove_child : node -> node -> unit

(** Get an attribute value. *)
val get_attr : node -> string -> string option

(** Set an attribute value. *)
val set_attr : node -> string -> string -> unit

(** Check if a node has an attribute. *)
val has_attr : node -> string -> bool

(** Get all descendant nodes. *)
val descendants : node -> node list

(** Get all ancestor nodes (from parent to root). *)
val ancestors : node -> node list

(** Get text content of a node and its descendants. *)
val get_text_content : node -> string

(** Clone a node.
    @param deep If true, also clone descendants (default: false)
*)
val clone : ?deep:bool -> node -> node

(** {1 Node Predicates} *)

(** Test if a node is an element. *)
val is_element : node -> bool

(** Test if a node is a text node. *)
val is_text : node -> bool

(** Test if a node is a comment node. *)
val is_comment : node -> bool

(** Test if a node is a document node. *)
val is_document : node -> bool

(** Test if a node is a document fragment. *)
val is_document_fragment : node -> bool

(** Test if a node is a doctype node. *)
val is_doctype : node -> bool

(** Test if a node has children. *)
val has_children : node -> bool
+306
lib/parser/constants.ml
···
(* Constant tables taken from the HTML5 specification *)

(* Elements that never take an end tag. *)
let void_elements = [
  "area"; "base"; "br"; "col"; "embed"; "hr"; "img";
  "input"; "link"; "meta"; "source"; "track"; "wbr";
]

(* Elements whose content is raw text. *)
let raw_text_elements = [ "script"; "style" ]

(* Elements whose content is escapable raw text. *)
let escapable_raw_text_elements = [ "textarea"; "title" ]

(* Formatting elements, as used by the adoption agency algorithm. *)
let formatting_elements = [
  "a"; "b"; "big"; "code"; "em"; "font"; "i";
  "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u";
]

(* Elements in the "special" category. *)
let special_elements = [
  "address"; "applet"; "area"; "article"; "aside"; "base"; "basefont";
  "bgsound"; "blockquote"; "body"; "br"; "button"; "caption"; "center";
  "col"; "colgroup"; "dd"; "details"; "dir"; "div"; "dl"; "dt"; "embed";
  "fieldset"; "figcaption"; "figure"; "footer"; "form"; "frame"; "frameset";
  "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "header"; "hgroup"; "hr";
  "html"; "iframe"; "img"; "input"; "keygen"; "li"; "link"; "listing";
  "main"; "marquee"; "menu"; "meta"; "nav"; "noembed"; "noframes";
  "noscript"; "object"; "ol"; "p"; "param"; "plaintext"; "pre"; "script";
  "search"; "section"; "select"; "source"; "style"; "summary"; "table";
  "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title";
  "tr"; "track"; "ul"; "wbr"; "xmp";
]

(* Heading elements h1 through h6. *)
let heading_elements = [ "h1"; "h2"; "h3"; "h4"; "h5"; "h6" ]

(* Elements closed by "generate implied end tags". *)
let implied_end_tags = [
  "dd"; "dt"; "li"; "optgroup"; "option";
  "p"; "rb"; "rp"; "rt"; "rtc";
]

(* Elements closed by "generate all implied end tags thoroughly". *)
let thoroughly_implied_end_tags = [
  "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup";
  "option"; "p"; "rb"; "rp"; "rt"; "rtc";
  "tbody"; "td"; "tfoot"; "th"; "thead"; "tr";
]

(* Element lists used by the various "has an element in scope" checks. *)
let default_scope = [
  "applet"; "caption"; "html"; "table"; "td";
  "th"; "marquee"; "object"; "template";
]

(* List-item scope: the default scope plus the list containers. *)
let list_item_scope = List.append default_scope [ "ol"; "ul" ]

(* Button scope: the default scope plus "button". *)
let button_scope = List.append default_scope [ "button" ]

(* Table scope. *)
let table_scope = [ "html"; "table"; "template" ]

(* Select scope is defined by exclusion: these are the excluded elements. *)
let select_scope_exclude = [ "optgroup"; "option" ]

(* MathML text integration points. *)
let mathml_text_integration = [ "mi"; "mo"; "mn"; "ms"; "mtext" ]

(* MathML attribute names, lowercase form paired with canonical spelling. *)
let mathml_attr_adjustments = [
  ("definitionurl", "definitionURL");
]

(* Rewrite every attribute whose lowercased name appears in
   [mathml_attr_adjustments] to its canonical spelling. Values, order and
   all other attribute names are left untouched. *)
let adjust_mathml_attrs attrs =
  let canonical_name name =
    match
      List.assoc_opt (String.lowercase_ascii name) mathml_attr_adjustments
    with
    | Some adjusted -> adjusted
    | None -> name
  in
  List.map (fun (name, value) -> (canonical_name name, value)) attrs

(* SVG HTML integration points. *)
let svg_html_integration = [ "foreignObject"; "desc"; "title" ]

(* SVG tag names, lowercase form paired with canonical mixed-case spelling. *)
let svg_tag_adjustments = [
  ("altglyph", "altGlyph");
  ("altglyphdef", "altGlyphDef");
  ("altglyphitem", "altGlyphItem");
  ("animatecolor", "animateColor");
  ("animatemotion", "animateMotion");
  ("animatetransform", "animateTransform");
  ("clippath", "clipPath");
  ("feblend", "feBlend");
  ("fecolormatrix", "feColorMatrix");
  ("fecomponenttransfer", "feComponentTransfer");
  ("fecomposite", "feComposite");
  ("feconvolvematrix", "feConvolveMatrix");
  ("fediffuselighting", "feDiffuseLighting");
  ("fedisplacementmap", "feDisplacementMap");
  ("fedistantlight", "feDistantLight");
  ("fedropshadow", "feDropShadow");
  ("feflood", "feFlood");
  ("fefunca", "feFuncA");
  ("fefuncb", "feFuncB");
  ("fefuncg", "feFuncG");
  ("fefuncr", "feFuncR");
  ("fegaussianblur", "feGaussianBlur");
  ("feimage", "feImage");
  ("femerge", "feMerge");
  ("femergenode", "feMergeNode");
  ("femorphology", "feMorphology");
  ("feoffset", "feOffset");
  ("fepointlight", "fePointLight");
  ("fespecularlighting", "feSpecularLighting");
  ("fespotlight", "feSpotLight");
  ("fetile", "feTile");
  ("feturbulence", "feTurbulence");
  ("foreignobject", "foreignObject");
  ("glyphref", "glyphRef");
  ("lineargradient", "linearGradient");
  ("radialgradient", "radialGradient");
  ("textpath", "textPath");
]

(* SVG attribute names, lowercase form paired with canonical spelling. *)
let svg_attr_adjustments = [
  ("attributename", "attributeName");
  ("attributetype", "attributeType");
  ("basefrequency", "baseFrequency");
  ("baseprofile", "baseProfile");
  ("calcmode", "calcMode");
  ("clippathunits", "clipPathUnits");
  ("diffuseconstant", "diffuseConstant");
  ("edgemode", "edgeMode");
  ("filterunits", "filterUnits");
  ("glyphref", "glyphRef");
  ("gradienttransform", "gradientTransform");
  ("gradientunits", "gradientUnits");
  ("kernelmatrix", "kernelMatrix");
  ("kernelunitlength", "kernelUnitLength");
  ("keypoints", "keyPoints");
  ("keysplines", "keySplines");
  ("keytimes", "keyTimes");
  ("lengthadjust", "lengthAdjust");
  ("limitingconeangle", "limitingConeAngle");
  ("markerheight", "markerHeight");
  ("markerunits", "markerUnits");
  ("markerwidth", "markerWidth");
  ("maskcontentunits", "maskContentUnits");
  ("maskunits", "maskUnits");
  ("numoctaves", "numOctaves");
  ("pathlength", "pathLength");
  ("patterncontentunits", "patternContentUnits");
  ("patterntransform", "patternTransform");
  ("patternunits", "patternUnits");
  ("pointsatx", "pointsAtX");
  ("pointsaty", "pointsAtY");
  ("pointsatz", "pointsAtZ");
  ("preservealpha", "preserveAlpha");
  ("preserveaspectratio", "preserveAspectRatio");
  ("primitiveunits", "primitiveUnits");
  ("refx", "refX");
  ("refy", "refY");
  ("repeatcount", "repeatCount");
  ("repeatdur", "repeatDur");
  ("requiredextensions", "requiredExtensions");
  ("requiredfeatures", "requiredFeatures");
  ("specularconstant", "specularConstant");
  ("specularexponent", "specularExponent");
  ("spreadmethod", "spreadMethod");
  ("startoffset", "startOffset");
  ("stddeviation", "stdDeviation");
  ("stitchtiles", "stitchTiles");
  ("surfacescale", "surfaceScale");
  ("systemlanguage", "systemLanguage");
  ("tablevalues", "tableValues");
  ("targetx", "targetX");
  ("targety", "targetY");
  ("textlength", "textLength");
  ("viewbox", "viewBox");
  ("viewtarget", "viewTarget");
  ("xchannelselector", "xChannelSelector");
  ("ychannelselector", "yChannelSelector");
  ("zoomandpan", "zoomAndPan");
]

(* Foreign attributes: qualified name paired with
   (prefix, local name, namespace URI). *)
let foreign_attr_adjustments =
  let xlink_ns = "http://www.w3.org/1999/xlink" in
  let xml_ns = "http://www.w3.org/XML/1998/namespace" in
  let xmlns_ns = "http://www.w3.org/2000/xmlns/" in
  [
    ("xlink:actuate", ("xlink", "actuate", xlink_ns));
    ("xlink:arcrole", ("xlink", "arcrole", xlink_ns));
    ("xlink:href", ("xlink", "href", xlink_ns));
    ("xlink:role", ("xlink", "role", xlink_ns));
    ("xlink:show", ("xlink", "show", xlink_ns));
    ("xlink:title", ("xlink", "title", xlink_ns));
    ("xlink:type", ("xlink", "type", xlink_ns));
    ("xml:lang", ("xml", "lang", xml_ns));
    ("xml:space", ("xml", "space", xml_ns));
    ("xmlns", ("", "xmlns", xmlns_ns));
    ("xmlns:xlink", ("xmlns", "xlink", xmlns_ns));
  ]

(* Quirks mode detection: public identifiers matched in full. *)
let quirky_public_matches = [
  "-//w3o//dtd w3 html strict 3.0//en//";
  "-/w3c/dtd html 4.0 transitional/en";
  "html";
]

let quirky_public_prefixes = [ 207 + "+//silmaril//dtd html pro v0r11 19970101//"; 208 + "-//as//dtd html 3.0 aswedit + extensions//"; 209 + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"; 210 + "-//ietf//dtd html 2.0 level 1//"; 211 + "-//ietf//dtd html 2.0 level 2//"; 212 + "-//ietf//dtd html 2.0 strict level 1//"; 213 + "-//ietf//dtd html 2.0 strict level 2//"; 214 + "-//ietf//dtd html 2.0 strict//"; 215 + "-//ietf//dtd html 2.0//"; 216 + "-//ietf//dtd html 2.1e//"; 217 + "-//ietf//dtd html 3.0//"; 218 + "-//ietf//dtd html 3.2 final//"; 219 + "-//ietf//dtd html 3.2//"; 220 + "-//ietf//dtd html 3//"; 221 + "-//ietf//dtd html level 0//"; 222 + "-//ietf//dtd html level 1//"; 223 + "-//ietf//dtd html level 2//"; 224 + "-//ietf//dtd html level 3//"; 225 + "-//ietf//dtd html strict level 0//"; 226 + "-//ietf//dtd html strict level 1//"; 227 + "-//ietf//dtd html strict level 2//"; 228 + "-//ietf//dtd html strict level 3//"; 229 + "-//ietf//dtd html strict//"; 230 + "-//ietf//dtd html//"; 231 + "-//metrius//dtd metrius presentational//"; 232 + "-//microsoft//dtd internet explorer 2.0 html strict//"; 233 + "-//microsoft//dtd internet explorer 2.0 html//"; 234 + "-//microsoft//dtd internet explorer 2.0 tables//"; 235 + "-//microsoft//dtd internet explorer 3.0 html strict//"; 236 + "-//microsoft//dtd internet explorer 3.0 html//"; 237 + "-//microsoft//dtd internet explorer 3.0 tables//"; 238 + "-//netscape comm. corp.//dtd html//"; 239 + "-//netscape comm. 
corp.//dtd strict html//"; 240 + "-//o'reilly and associates//dtd html 2.0//"; 241 + "-//o'reilly and associates//dtd html extended 1.0//"; 242 + "-//o'reilly and associates//dtd html extended relaxed 1.0//"; 243 + "-//sq//dtd html 2.0 hotmetal + extensions//"; 244 + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"; 245 + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"; 246 + "-//spyglass//dtd html 2.0 extended//"; 247 + "-//sun microsystems corp.//dtd hotjava html//"; 248 + "-//sun microsystems corp.//dtd hotjava strict html//"; 249 + "-//w3c//dtd html 3 1995-03-24//"; 250 + "-//w3c//dtd html 3.2 draft//"; 251 + "-//w3c//dtd html 3.2 final//"; 252 + "-//w3c//dtd html 3.2//"; 253 + "-//w3c//dtd html 3.2s draft//"; 254 + "-//w3c//dtd html 4.0 frameset//"; 255 + "-//w3c//dtd html 4.0 transitional//"; 256 + "-//w3c//dtd html experimental 19960712//"; 257 + "-//w3c//dtd html experimental 970421//"; 258 + "-//w3c//dtd w3 html//"; 259 + "-//w3o//dtd w3 html 3.0//"; 260 + "-//webtechs//dtd mozilla html 2.0//"; 261 + "-//webtechs//dtd mozilla html//"; 262 + ] 263 + 264 + let limited_quirky_public_prefixes = [ 265 + "-//w3c//dtd xhtml 1.0 frameset//"; 266 + "-//w3c//dtd xhtml 1.0 transitional//"; 267 + ] 268 + 269 + let html4_public_prefixes = [ 270 + "-//w3c//dtd html 4.01 frameset//"; 271 + "-//w3c//dtd html 4.01 transitional//"; 272 + ] 273 + 274 + let quirky_system_matches = [ 275 + "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" 276 + ] 277 + 278 + (* Helper functions *) 279 + let is_void = List.mem 280 + let is_formatting = List.mem 281 + let is_special name = List.mem name special_elements 282 + let is_heading = List.mem 283 + 284 + let adjust_svg_tag_name name = 285 + match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with 286 + | Some adjusted -> adjusted 287 + | None -> name 288 + 289 + let adjust_svg_attrs attrs = 290 + List.map (fun (name, value) -> 291 + let adjusted_name = 
292 + match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with 293 + | Some n -> n 294 + | None -> name 295 + in 296 + (adjusted_name, value) 297 + ) attrs 298 + 299 + let adjust_foreign_attrs attrs = 300 + List.map (fun (name, value) -> 301 + match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with 302 + | Some (prefix, local, _ns) -> 303 + if prefix = "" then (local, value) 304 + else (prefix ^ ":" ^ local, value) 305 + | None -> (name, value) 306 + ) attrs
+4
lib/parser/dune
···
··· 1 + (library 2 + (name html5rw_parser) 3 + (public_name html5rw.parser) 4 + (libraries bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector))
+36
lib/parser/html5rw_parser.ml
···
(* html5rw.parser - HTML5 parser with bytesrw-only API *)

(* Public facade of the parser library: re-exports the sibling modules and
   forwards every operation to [Parser], exposing the record fields of
   [Parser.t] and of the [Tree_builder] records through accessor functions. *)

module Dom = Html5rw_dom
module Tokenizer = Html5rw_tokenizer
module Encoding = Html5rw_encoding
module Constants = Constants
module Insertion_mode = Insertion_mode
module Tree_builder = Tree_builder

type parse_error = Parser.parse_error
type fragment_context = Parser.fragment_context
type t = Parser.t

(* parse_error accessors *)
let error_code ({ Tree_builder.code; _ } : parse_error) = code
let error_line ({ Tree_builder.line; _ } : parse_error) = line
let error_column ({ Tree_builder.column; _ } : parse_error) = column

(* fragment_context constructor and accessors *)
let make_fragment_context ~tag_name ?(namespace=None) () : fragment_context =
  { Tree_builder.tag_name = tag_name; namespace = namespace }

let fragment_context_tag ({ Tree_builder.tag_name; _ } : fragment_context) =
  tag_name

let fragment_context_namespace ({ Tree_builder.namespace; _ } : fragment_context) =
  namespace

(* Parsing: optional arguments are passed through untouched so the defaults
   chosen by [Parser] apply. *)
let parse ?collect_errors ?fragment_context reader =
  Parser.parse ?collect_errors ?fragment_context reader

let parse_bytes ?collect_errors ?transport_encoding ?fragment_context data =
  Parser.parse_bytes ?collect_errors ?transport_encoding ?fragment_context data

(* Querying and serialisation *)
let query result selector = Parser.query result selector

let to_writer ?pretty ?indent_size result writer =
  Parser.to_writer ?pretty ?indent_size result writer

let to_string ?pretty ?indent_size result =
  Parser.to_string ?pretty ?indent_size result

let to_text ?separator ?strip result =
  Parser.to_text ?separator ?strip result

let to_test_format result = Parser.to_test_format result

(* Result accessors *)
let root ({ Parser.root; _ } : t) = root
let errors ({ Parser.errors; _ } : t) = errors
let encoding ({ Parser.encoding; _ } : t) = encoding
+207
lib/parser/html5rw_parser.mli
···
(** HTML5 Parser

    This module provides the core HTML5 parsing functionality implementing
    the WHATWG parsing specification. It handles tokenization, tree construction,
    error recovery, and produces a DOM tree.

    For most uses, prefer the top-level {!Html5rw} module which re-exports
    these functions with a simpler interface.

    {2 Parsing Algorithm}

    The HTML5 parsing algorithm is defined by the WHATWG specification and
    consists of several phases:

    {ol
    {- {b Encoding sniffing}: Detect character encoding from BOM, meta tags,
       or transport layer hints}
    {- {b Tokenization}: Convert the input stream into a sequence of tokens
       (start tags, end tags, character data, comments, etc.)}
    {- {b Tree construction}: Build the DOM tree using a state machine with
       multiple insertion modes}}

    The algorithm includes extensive error recovery to handle malformed HTML
    in a consistent way across browsers.

    @see <https://html.spec.whatwg.org/multipage/parsing.html>
      The WHATWG HTML Parsing specification
*)

(** {1 Sub-modules} *)

module Dom = Html5rw_dom
module Tokenizer = Html5rw_tokenizer
module Encoding = Html5rw_encoding
module Constants : sig
  val void_elements : string list
  val formatting_elements : string list
  val special_elements : string list
end
module Insertion_mode : sig
  type t
end
module Tree_builder : sig
  type t
end

(** {1 Types} *)

(** A parse error encountered during parsing.

    HTML5 parsing never fails - it always produces a DOM tree. However,
    the specification defines many error conditions that conformance
    checkers should report. Error collection is optional and disabled
    by default for performance.

    Error codes follow the WHATWG specification naming convention,
    e.g., "unexpected-null-character", "eof-in-tag".

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      The list of HTML5 parse errors
*)
type parse_error

(** Get the error code string.

    Error codes are lowercase with hyphens, matching the WHATWG spec names
    like "unexpected-null-character" or "eof-before-tag-name".
*)
val error_code : parse_error -> string

(** Get the line number where the error occurred (1-indexed). *)
val error_line : parse_error -> int

(** Get the column number where the error occurred (1-indexed). *)
val error_column : parse_error -> int

(** Context element for HTML fragment parsing.

    When parsing an HTML fragment (innerHTML), you need to specify the
    context element that would contain the fragment. This affects how
    the parser handles certain elements.

    For example, parsing [<td>] as a fragment of a [<tr>] works differently
    than parsing it as a fragment of a [<div>].

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments>
      The HTML fragment parsing algorithm
*)
type fragment_context

(** Create a fragment parsing context.

    @param tag_name The tag name of the context element (e.g., "div", "tr")
    @param namespace Namespace: [None] for HTML, [Some "svg"], [Some "mathml"]

    {[
      (* Parse as innerHTML of a table row *)
      let ctx = make_fragment_context ~tag_name:"tr" ()

      (* Parse as innerHTML of an SVG element *)
      let ctx = make_fragment_context ~tag_name:"g" ~namespace:(Some "svg") ()
    ]}
*)
val make_fragment_context : tag_name:string -> ?namespace:string option ->
  unit -> fragment_context

(** Get the tag name of a fragment context. *)
val fragment_context_tag : fragment_context -> string

(** Get the namespace of a fragment context. *)
val fragment_context_namespace : fragment_context -> string option

(** Result of parsing an HTML document or fragment.

    Contains the parsed DOM tree, any errors encountered (if error
    collection was enabled), and the detected encoding (for byte input).
*)
type t

(** {1 Parsing Functions} *)

val parse : ?collect_errors:bool -> ?fragment_context:fragment_context ->
  Bytesrw.Bytes.Reader.t -> t
(** Parse HTML from a byte stream reader.

    This is the primary parsing function. The input must be valid UTF-8
    (or will be converted from detected encoding when using {!parse_bytes}).

    @param collect_errors If [true], collect parse errors (default: [false])
    @param fragment_context Context for fragment parsing (innerHTML)

    {[
      open Bytesrw
      let reader = Bytes.Reader.of_string "<p>Hello</p>"
      let result = parse reader
    ]}
*)

val parse_bytes : ?collect_errors:bool -> ?transport_encoding:string ->
  ?fragment_context:fragment_context -> bytes -> t
(** Parse HTML bytes with automatic encoding detection.

    Implements the WHATWG encoding sniffing algorithm:

    {ol
    {- Check for BOM (UTF-8, UTF-16LE, UTF-16BE)}
    {- Prescan for [<meta charset>] declaration}
    {- Use transport encoding hint if provided}
    {- Fall back to UTF-8}}

    @param collect_errors If [true], collect parse errors (default: [false])
    @param transport_encoding Encoding from HTTP Content-Type header
    @param fragment_context Context for fragment parsing (innerHTML)
*)

(** {1 Result Accessors} *)

val root : t -> Dom.node
(** Get the root node of the parsed document.

    For full document parsing, this is a document node.
    For fragment parsing, this is a document fragment node.
*)

val errors : t -> parse_error list
(** Get parse errors (empty if error collection was disabled). *)

val encoding : t -> Encoding.encoding option
(** Get the detected encoding (only set when using {!parse_bytes}). *)

(** {1 Querying} *)

val query : t -> string -> Dom.node list
(** Query the DOM with a CSS selector.

    See {!Html5rw_selector} for supported selector syntax.

    @raise Html5rw_selector.Selector_error if the selector is invalid
*)

(** {1 Serialization} *)

val to_writer : ?pretty:bool -> ?indent_size:int -> t ->
  Bytesrw.Bytes.Writer.t -> unit
(** Serialize the DOM tree to a byte stream writer.

    @param pretty If [true], format with indentation (default: [true])
    @param indent_size Spaces per indent level (default: [2])
*)

val to_string : ?pretty:bool -> ?indent_size:int -> t -> string
(** Serialize the DOM tree to a string.

    @param pretty If [true], format with indentation (default: [true])
    @param indent_size Spaces per indent level (default: [2])
*)

val to_text : ?separator:string -> ?strip:bool -> t -> string
(** Extract text content from the DOM tree.

    @param separator String between text nodes (default: [" "])
    @param strip If [true], trim whitespace (default: [true])
*)

val to_test_format : t -> string
(** Serialize to html5lib test format.

    This format is used by the html5lib test suite and shows the tree
    structure with indentation and node type prefixes.
*)
+51
lib/parser/insertion_mode.ml
···
(* HTML5 tree builder insertion modes *)

(* One constructor per insertion mode of the tree-construction state
   machine. *)
type t =
  (* Document prologue and head *)
  | Initial
  | Before_html
  | Before_head
  | In_head
  | In_head_noscript
  | After_head
  (* Body and raw text *)
  | In_body
  | Text
  (* Tables *)
  | In_table
  | In_table_text
  | In_caption
  | In_column_group
  | In_table_body
  | In_row
  | In_cell
  (* Select and template *)
  | In_select
  | In_select_in_table
  | In_template
  (* Trailers and framesets *)
  | After_body
  | In_frameset
  | After_frameset
  | After_after_body
  | After_after_frameset

(* Render a mode as its lower-case, space-separated name, e.g.
   [In_table_body] becomes "in table body". *)
let to_string mode =
  match mode with
  | Initial              -> "initial"
  | Before_html          -> "before html"
  | Before_head          -> "before head"
  | In_head              -> "in head"
  | In_head_noscript     -> "in head noscript"
  | After_head           -> "after head"
  | In_body              -> "in body"
  | Text                 -> "text"
  | In_table             -> "in table"
  | In_table_text        -> "in table text"
  | In_caption           -> "in caption"
  | In_column_group      -> "in column group"
  | In_table_body        -> "in table body"
  | In_row               -> "in row"
  | In_cell              -> "in cell"
  | In_select            -> "in select"
  | In_select_in_table   -> "in select in table"
  | In_template          -> "in template"
  | After_body           -> "after body"
  | In_frameset          -> "in frameset"
  | After_frameset       -> "after frameset"
  | After_after_body     -> "after after body"
  | After_after_frameset -> "after after frameset"
+107
lib/parser/parser.ml
···
(* Main parser entry point - bytesrw-only API *)

open Bytesrw

module Dom = Html5rw_dom
module Tokenizer = Html5rw_tokenizer
module Encoding = Html5rw_encoding

type parse_error = Tree_builder.parse_error

type fragment_context = Tree_builder.fragment_context

(* Result of a parse: the tree root, the collected errors (tokenizer errors
   first, then tree-builder errors) and, for [parse_bytes] only, the encoding
   reported by [Encoding.decode]. *)
type t = {
  root : Dom.node;
  errors : parse_error list;
  encoding : Encoding.encoding option;
}

(* A namespace value denotes the HTML namespace when it is absent or
   explicitly "html". *)
let is_html_namespace ns =
  ns = None || ns = Some "html"

(* Tokenizer state required for the content of the HTML element called
   [name], or [None] when the tokenizer should be left alone.  Shared by the
   token sink and by fragment-parsing setup so both use the same table. *)
let content_state_for_element name =
  if List.mem name ["textarea"; "title"] then
    Some Tokenizer.State.Rcdata
  else if List.mem name ["style"; "xmp"; "iframe"; "noembed"; "noframes"] then
    Some Tokenizer.State.Rawtext
  else if name = "script" then
    Some Tokenizer.State.Script_data
  else if name = "plaintext" then
    Some Tokenizer.State.Plaintext
  else
    None

(* Token sink that feeds tokens to tree builder *)
module TreeBuilderSink = struct
  type t = Tree_builder.t

  (* Hand [token] to the tree builder, then tell the tokenizer whether the
     element that is now current requires a different tokenizer state. *)
  let process tb token =
    Tree_builder.process_token tb token;
    (* Check if we need to switch tokenizer state based on current element *)
    (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
    match Tree_builder.current_node tb with
    | Some node when is_html_namespace node.Dom.namespace ->
      (match content_state_for_element node.Dom.name with
       | Some state -> `SwitchTo state
       | None -> `Continue)
    | _ -> `Continue

  let adjusted_current_node_in_html_namespace tb =
    Tree_builder.adjusted_current_node_in_html_namespace tb
end

(* Core parsing function that takes a Bytes.Reader.t.

   [collect_errors] (default [false]) is forwarded to both the tree builder
   and the tokenizer.  [fragment_context], when given, selects fragment
   parsing.  The returned record always has [encoding = None]. *)
let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
  let tb = Tree_builder.create ~collect_errors ?fragment_context () in
  let tokenizer = Tokenizer.create (module TreeBuilderSink) tb ~collect_errors () in

  (* Set tokenizer state for fragment parsing *)
  (* Note: We do NOT set last_start_tag because in fragment parsing, no start tag has been
     emitted. This means end tags won't match as "appropriate end tags" and will be treated
     as raw text in RCDATA/RAWTEXT/Script modes. *)
  (* Only change tokenizer state for HTML namespace contexts - foreign contexts use Data state *)
  (match fragment_context with
   | Some ctx when is_html_namespace ctx.Tree_builder.namespace ->
     (* The context tag name is lower-cased before the lookup. *)
     let name = String.lowercase_ascii ctx.Tree_builder.tag_name in
     (match content_state_for_element name with
      | Some state -> Tokenizer.set_state tokenizer state
      | None -> ())
   | _ -> ());

  Tokenizer.run tokenizer (module TreeBuilderSink) reader;

  let root = Tree_builder.finish tb in
  let tokenizer_errors = Tokenizer.get_errors tokenizer in
  let tree_errors = Tree_builder.get_errors tb in
  (* Convert tokenizer errors to the tree builder's record type so both kinds
     can live in one list. *)
  let converted_tokenizer_errors =
    List.map (fun e ->
      { Tree_builder.code = e.Tokenizer.Errors.code;
        line = e.Tokenizer.Errors.line;
        column = e.Tokenizer.Errors.column }
    ) tokenizer_errors
  in
  let all_errors = converted_tokenizer_errors @ tree_errors in

  { root; errors = all_errors; encoding = None }

(* Parse raw bytes with automatic encoding detection: decode with
   [Encoding.decode], parse the decoded text, and record the encoding that
   was used in the result. *)
let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data =
  let (html, enc) = Encoding.decode data ?transport_encoding () in
  let reader = Bytes.Reader.of_string html in
  let result = parse ~collect_errors ?fragment_context reader in
  { result with encoding = Some enc }

(* Run a selector query against the root of a parse result. *)
let query t selector =
  Html5rw_selector.query t.root selector

(* Serialize to a Bytes.Writer.t *)
let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
  let html = Dom.to_html ~pretty ~indent_size t.root in
  Bytes.Writer.write_string writer html

(* Serialize to string (convenience for when result fits in memory) *)
let to_string ?(pretty=true) ?(indent_size=2) t =
  Dom.to_html ~pretty ~indent_size t.root

(* Extract text content *)
let to_text ?(separator=" ") ?(strip=true) t =
  Dom.to_text ~separator ~strip t.root

(* For testing *)
let to_test_format t =
  Dom.to_test_format t.root
+2520
lib/parser/tree_builder.ml
···
··· 1 + (* HTML5 Tree Builder *) 2 + 3 + module Dom = Html5rw_dom 4 + module Token = Html5rw_tokenizer.Token 5 + module State = Html5rw_tokenizer.State 6 + 7 + type fragment_context = { 8 + tag_name : string; 9 + namespace : string option; 10 + } 11 + 12 + type formatting_entry = 13 + | Marker 14 + | Entry of { 15 + name : string; 16 + attrs : (string * string) list; 17 + node : Dom.node; 18 + } 19 + 20 + type parse_error = { 21 + code : string; 22 + line : int; 23 + column : int; 24 + } 25 + 26 + type t = { 27 + mutable document : Dom.node; 28 + mutable mode : Insertion_mode.t; 29 + mutable original_mode : Insertion_mode.t option; 30 + mutable open_elements : Dom.node list; 31 + mutable active_formatting : formatting_entry list; 32 + mutable head_element : Dom.node option; 33 + mutable form_element : Dom.node option; 34 + mutable frameset_ok : bool; 35 + mutable ignore_lf : bool; 36 + mutable foster_parenting : bool; 37 + mutable pending_table_chars : string list; 38 + mutable template_modes : Insertion_mode.t list; 39 + mutable quirks_mode : Dom.quirks_mode; 40 + mutable errors : parse_error list; 41 + collect_errors : bool; 42 + fragment_context : fragment_context option; 43 + mutable fragment_context_element : Dom.node option; 44 + iframe_srcdoc : bool; 45 + } 46 + 47 + let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () = 48 + let is_fragment = fragment_context <> None in 49 + let doc = if is_fragment then Dom.create_document_fragment () else Dom.create_document () in 50 + let t = { 51 + document = doc; 52 + mode = Insertion_mode.Initial; 53 + original_mode = None; 54 + open_elements = []; 55 + active_formatting = []; 56 + head_element = None; 57 + form_element = None; 58 + frameset_ok = true; 59 + ignore_lf = false; 60 + foster_parenting = false; 61 + pending_table_chars = []; 62 + template_modes = []; 63 + quirks_mode = Dom.No_quirks; 64 + errors = []; 65 + collect_errors; 66 + fragment_context; 67 + fragment_context_element = 
None; 68 + iframe_srcdoc; 69 + } in 70 + (* Initialize fragment parsing *) 71 + (match fragment_context with 72 + | Some ctx -> 73 + let name = String.lowercase_ascii ctx.tag_name in 74 + let ns = ctx.namespace in 75 + (* Create html root *) 76 + let root = Dom.create_element "html" () in 77 + Dom.append_child doc root; 78 + t.open_elements <- [root]; 79 + (* For foreign content contexts, create context element *) 80 + (match ns with 81 + | Some namespace when namespace <> "html" -> 82 + let context_elem = Dom.create_element ctx.tag_name ~namespace:ns () in 83 + Dom.append_child root context_elem; 84 + t.open_elements <- [context_elem; root]; 85 + t.fragment_context_element <- Some context_elem 86 + | _ -> ()); 87 + (* Set initial mode based on context *) 88 + t.mode <- ( 89 + if name = "html" then Insertion_mode.Before_head 90 + else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then 91 + Insertion_mode.In_table_body 92 + else if name = "tr" && (ns = None || ns = Some "html") then 93 + Insertion_mode.In_row 94 + else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then 95 + Insertion_mode.In_cell 96 + else if name = "caption" && (ns = None || ns = Some "html") then 97 + Insertion_mode.In_caption 98 + else if name = "colgroup" && (ns = None || ns = Some "html") then 99 + Insertion_mode.In_column_group 100 + else if name = "table" && (ns = None || ns = Some "html") then 101 + Insertion_mode.In_table 102 + else if name = "template" && (ns = None || ns = Some "html") then begin 103 + t.template_modes <- [Insertion_mode.In_template]; 104 + Insertion_mode.In_template 105 + end 106 + else 107 + Insertion_mode.In_body 108 + ); 109 + t.frameset_ok <- false 110 + | None -> ()); 111 + t 112 + 113 + (* Error handling *) 114 + let parse_error t code = 115 + if t.collect_errors then 116 + t.errors <- { code; line = 0; column = 0 } :: t.errors 117 + 118 + (* Stack helpers *) 119 + let current_node t = 120 + match t.open_elements 
with 121 + | [] -> None 122 + | x :: _ -> Some x 123 + 124 + let adjusted_current_node t = 125 + match t.fragment_context, t.open_elements with 126 + | Some ctx, [_] -> 127 + (* Fragment case: use context element info *) 128 + Some (Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()) 129 + | _, x :: _ -> Some x 130 + | _, [] -> None 131 + 132 + let is_in_html_namespace node = 133 + node.Dom.namespace = None || node.Dom.namespace = Some "html" 134 + 135 + (* Namespace-aware check for "special" elements per WHATWG spec *) 136 + let is_special_element node = 137 + let name = String.lowercase_ascii node.Dom.name in 138 + match node.Dom.namespace with 139 + | None | Some "html" -> Constants.is_special name 140 + | Some "mathml" -> List.mem name ["mi"; "mo"; "mn"; "ms"; "mtext"; "annotation-xml"] 141 + | Some "svg" -> List.mem name ["foreignobject"; "desc"; "title"] 142 + | _ -> false 143 + 144 + let adjusted_current_node_in_html_namespace t = 145 + match adjusted_current_node t with 146 + | Some node -> is_in_html_namespace node 147 + | None -> true 148 + 149 + (* Insertion helpers *) 150 + let appropriate_insertion_place t = 151 + match current_node t with 152 + | None -> (t.document, None) 153 + | Some target -> 154 + if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin 155 + (* Foster parenting per WHATWG spec *) 156 + (* Step 1: Find last (most recent) template and table in stack *) 157 + (* Note: index 0 = top of stack = most recently added *) 158 + let last_template_idx = ref None in 159 + let last_table_idx = ref None in 160 + List.iteri (fun i n -> 161 + (* Take first match (most recent = lowest index) *) 162 + if n.Dom.name = "template" && !last_template_idx = None then last_template_idx := Some i; 163 + if n.Dom.name = "table" && !last_table_idx = None then last_table_idx := Some i 164 + ) t.open_elements; 165 + 166 + (* Step 2-3: If last template is more recent than last table (lower index = more 
recent) *) 167 + match !last_template_idx, !last_table_idx with 168 + | Some ti, None -> 169 + (* No table, use template content *) 170 + let template = List.nth t.open_elements ti in 171 + (match template.Dom.template_content with 172 + | Some tc -> (tc, None) 173 + | None -> (template, None)) 174 + | Some ti, Some tbi when ti < tbi -> 175 + (* Template is more recent than table, use template content *) 176 + let template = List.nth t.open_elements ti in 177 + (match template.Dom.template_content with 178 + | Some tc -> (tc, None) 179 + | None -> (template, None)) 180 + | _, Some tbi -> 181 + (* Use table's parent as foster parent *) 182 + let table = List.nth t.open_elements tbi in 183 + (match table.Dom.parent with 184 + | Some parent -> (parent, Some table) 185 + | None -> 186 + (* Step 6: element above table in stack (index + 1 since 0 is top) *) 187 + if tbi + 1 < List.length t.open_elements then 188 + (List.nth t.open_elements (tbi + 1), None) 189 + else 190 + (t.document, None)) 191 + | None, None -> 192 + (* No table or template, use document *) 193 + (t.document, None) 194 + end else begin 195 + (* If target is a template, insert into its content document fragment *) 196 + match target.Dom.template_content with 197 + | Some tc -> (tc, None) 198 + | None -> (target, None) 199 + end 200 + 201 + let insert_element t name ?(namespace=None) ?(push=false) attrs = 202 + let node = Dom.create_element name ~namespace ~attrs () in 203 + let (parent, before) = appropriate_insertion_place t in 204 + (match before with 205 + | None -> Dom.append_child parent node 206 + | Some ref -> Dom.insert_before parent node ref); 207 + if push then t.open_elements <- node :: t.open_elements; 208 + node 209 + 210 + let insert_element_for_token t (tag : Token.tag) = 211 + insert_element t tag.name ~push:true tag.attrs 212 + 213 + let insert_foreign_element t (tag : Token.tag) namespace = 214 + let attrs = 215 + if namespace = Some "svg" then 216 + Constants.adjust_svg_attrs 
(Constants.adjust_foreign_attrs tag.attrs) 217 + else 218 + Constants.adjust_foreign_attrs tag.attrs 219 + in 220 + let name = 221 + if namespace = Some "svg" then Constants.adjust_svg_tag_name tag.name 222 + else tag.name 223 + in 224 + let node = insert_element t name ~namespace attrs in 225 + t.open_elements <- node :: t.open_elements; 226 + node 227 + 228 + let insert_character t data = 229 + if t.ignore_lf && String.length data > 0 && data.[0] = '\n' then begin 230 + t.ignore_lf <- false; 231 + if String.length data > 1 then begin 232 + let rest = String.sub data 1 (String.length data - 1) in 233 + let (parent, before) = appropriate_insertion_place t in 234 + Dom.insert_text_at parent rest before 235 + end 236 + end else begin 237 + t.ignore_lf <- false; 238 + let (parent, before) = appropriate_insertion_place t in 239 + Dom.insert_text_at parent data before 240 + end 241 + 242 + let insert_comment t data = 243 + let node = Dom.create_comment data in 244 + let (parent, _) = appropriate_insertion_place t in 245 + Dom.append_child parent node 246 + 247 + let insert_comment_to_document t data = 248 + let node = Dom.create_comment data in 249 + Dom.append_child t.document node 250 + 251 + (* Stack manipulation *) 252 + let pop_current t = 253 + match t.open_elements with 254 + | [] -> () 255 + | _ :: rest -> t.open_elements <- rest 256 + 257 + let pop_until t pred = 258 + let rec loop () = 259 + match t.open_elements with 260 + | [] -> () 261 + | x :: rest -> 262 + t.open_elements <- rest; 263 + if not (pred x) then loop () 264 + in 265 + loop () 266 + 267 + let pop_until_tag t name = 268 + pop_until t (fun n -> n.Dom.name = name) 269 + 270 + (* Pop until HTML namespace element with given name *) 271 + let pop_until_html_tag t name = 272 + pop_until t (fun n -> n.Dom.name = name && is_in_html_namespace n) 273 + 274 + let pop_until_one_of t names = 275 + pop_until t (fun n -> List.mem n.Dom.name names) 276 + 277 + (* Pop until HTML namespace element with one of 
given names *)
let pop_until_html_one_of t names =
  let is_target n = List.mem n.Dom.name names && is_in_html_namespace n in
  pop_until t is_target

(* HTML integration point test: an "svg" element whose name is listed in
   [Constants.svg_html_integration], or a "mathml" annotation-xml element
   whose encoding attribute is text/html or application/xhtml+xml
   (compared ASCII case-insensitively). *)
let is_html_integration_point node =
  match node.Dom.namespace with
  | Some "svg" -> List.mem node.Dom.name Constants.svg_html_integration
  | Some "mathml" when node.Dom.name = "annotation-xml" ->
    (match List.assoc_opt "encoding" node.Dom.attrs with
     | None -> false
     | Some enc ->
       (match String.lowercase_ascii enc with
        | "text/html" | "application/xhtml+xml" -> true
        | _ -> false))
  | _ -> false

(* MathML text integration point test: mi, mo, mn, ms or mtext in the
   "mathml" namespace. *)
let is_mathml_text_integration_point node =
  match node.Dom.namespace with
  | Some "mathml" -> List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
  | _ -> false

(* Scope checks - integration points also terminate scope (except for table scope) *)
(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
(* Walks the stack from the current node downwards. Returns true on the first
   HTML-namespace node named in [names]; returns false on the first
   HTML-namespace node named in [exclude_list], on the first integration
   point (when [check_integration_points] is set), or when the stack ends. *)
let has_element_in_scope_impl t names exclude_list ~check_integration_points =
  let rec walk = function
    | [] -> false
    | n :: below ->
      let html = is_in_html_namespace n in
      let named candidates = html && List.mem n.Dom.name candidates in
      named names
      || (not (named exclude_list)
          && not (check_integration_points
                  && (is_html_integration_point n
                      || is_mathml_text_integration_point n))
          && walk below)
  in
  walk t.open_elements

(* Scope bounded by [Constants.default_scope]. *)
let has_element_in_scope t name =
  has_element_in_scope_impl t [name] Constants.default_scope
    ~check_integration_points:true

(* Scope bounded by [Constants.button_scope]. *)
let has_element_in_button_scope t name =
  has_element_in_scope_impl t [name] Constants.button_scope
    ~check_integration_points:true

(* Scope bounded by [Constants.list_item_scope]. *)
let has_element_in_list_item_scope t name =
  has_element_in_scope_impl t [name] Constants.list_item_scope
    ~check_integration_points:true

(* Scope bounded by [Constants.table_scope]; integration points are not
   treated as boundaries here. *)
let has_element_in_table_scope t name =
  has_element_in_scope_impl t [name] Constants.table_scope
    ~check_integration_points:false

(* Select scope: the walk continues only through nodes whose name is in
   [Constants.select_scope_exclude]; any other node ends it. Names are
   compared without a namespace check. *)
let has_element_in_select_scope t name =
  let rec walk = function
    | [] -> false
    | n :: below ->
      n.Dom.name = name
      || (List.mem n.Dom.name Constants.select_scope_exclude && walk below)
  in
  walk t.open_elements

(* Implied end tags *)

(* Pop current nodes while their name is in [Constants.implied_end_tags],
   stopping early at a node named [except] when that argument is given. *)
let generate_implied_end_tags t ?except () =
  let rec loop () =
    match current_node t with
    | Some n
      when List.mem n.Dom.name Constants.implied_end_tags
           && except <> Some n.Dom.name ->
      pop_current t;
      loop ()
    | _ -> ()
  in
  loop ()

(* Pop current nodes while their name is in
   [Constants.thoroughly_implied_end_tags]. *)
let generate_all_implied_end_tags t =
  let rec loop () =
    match current_node t with
    | Some n when List.mem n.Dom.name Constants.thoroughly_implied_end_tags ->
      pop_current t;
      loop ()
    | _ -> ()
  in
  loop ()

(* Active formatting elements *)

(* Push a scope marker on the active formatting list (head = newest). *)
let push_formatting_marker t =
  t.active_formatting <- Marker :: t.active_formatting

let push_formatting_element t node name attrs =
  (* Noah's Ark: remove earlier identical elements (up to 3) *)
  let rec count_and_remove same acc = function
    | [] -> List.rev acc
    | Marker :: rest -> List.rev acc @ (Marker :: rest)
    | Entry e :: rest when e.name = name && e.attrs = attrs ->
      if same >= 2 then
        count_and_remove same acc rest (* Remove this one *)
      else
count_and_remove (same + 1) (Entry e :: acc) rest
    | x :: rest -> count_and_remove same (x :: acc) rest
  in
  t.active_formatting <- count_and_remove 0 [] t.active_formatting;
  t.active_formatting <- Entry { name; attrs; node } :: t.active_formatting

(* Drop active formatting entries from the newest one up to and including the
   nearest marker; the list becomes empty if it holds no marker. *)
let clear_active_formatting_to_marker t =
  let rec after_marker = function
    | [] -> []
    | Marker :: rest -> rest
    | _ :: rest -> after_marker rest
  in
  t.active_formatting <- after_marker t.active_formatting

(* Re-open formatting elements that are listed as active but are no longer on
   the stack of open elements.

   Entries are collected from the newest one backwards, stopping at the first
   marker or at the first entry whose node is still open, and are then
   re-created oldest first. Each re-created element is inserted, pushed on the
   stack, and its list entry is repointed at the new node.

   The matched list cell is aliased ([as entry]) rather than rebuilt with
   [Entry e], so the [==] lookup below compares against the very value stored
   in [t.active_formatting]. NOTE(review): the declaration of [Entry] is not
   visible here; if it wraps a separate record, rebuilding it would allocate a
   fresh block that [==] can never match, and the entry would keep pointing at
   the old node. *)
let reconstruct_active_formatting t =
  let rec find_to_reconstruct acc = function
    | [] | Marker :: _ -> acc
    | (Entry e as entry) :: rest ->
      if List.memq e.node t.open_elements then acc
      else find_to_reconstruct (entry :: acc) rest
  in
  let to_reconstruct = find_to_reconstruct [] t.active_formatting in
  List.iter (fun entry ->
    match entry with
    | Entry e ->
      let node = insert_element t e.name ~push:true e.attrs in
      (* Update the entry to point to new node *)
      t.active_formatting <- List.map (fun x ->
        if x == entry then Entry { e with node } else x
      ) t.active_formatting
    | Marker -> ()
  ) to_reconstruct

(* Adoption agency algorithm - follows WHATWG spec *)
let adoption_agency t tag_name =
  (* Step 1: If the current node is named [tag_name] and no active formatting
     entry carries that name, just pop it. There is no explicit return: the
     outer loop below then stops on its first pass, because its search of the
     active formatting list for [tag_name] finds nothing. *)
  (match current_node t with
   | Some n when n.Dom.name = tag_name ->
     let in_active = List.exists (function
       | Entry e -> e.name = tag_name
       | Marker -> false
     ) t.active_formatting in
     if not in_active then pop_current t
   | _ -> ());

  (* Step 2: Outer loop *)
  let outer_loop_counter = ref 0 in
  let done_flag = ref false in

  while !outer_loop_counter < 8 && not !done_flag do
incr outer_loop_counter;

    (* [drop_nth k lst] is [lst] without its element at index [k]. *)
    let drop_nth k lst = List.filteri (fun i _ -> i <> k) lst in
    (* Unlink [n] from its DOM parent, if it has one. *)
    let detach n =
      match n.Dom.parent with
      | Some p -> Dom.remove_child p n
      | None -> ()
    in

    (* Step 3: Find the newest entry named [tag_name] in the active formatting
       list, not looking past a marker. *)
    let rec find_formatting_index idx = function
      | [] | Marker :: _ -> None
      | Entry e :: rest ->
        if e.name = tag_name then Some (idx, e.node, e.attrs)
        else find_formatting_index (idx + 1) rest
    in

    match find_formatting_index 0 t.active_formatting with
    | None ->
      (* No formatting element found - done *)
      done_flag := true
    | Some (fmt_idx, fmt_node, fmt_attrs) ->

      (* Step 4: Formatting element must still be on the stack of open elements *)
      if not (List.memq fmt_node t.open_elements) then begin
        parse_error t "adoption-agency-1.2";
        t.active_formatting <- drop_nth fmt_idx t.active_formatting;
        done_flag := true
      end
      (* Step 5: An element named [tag_name] must be in scope *)
      else if not (has_element_in_scope t tag_name) then begin
        parse_error t "adoption-agency-1.3";
        done_flag := true
      end else begin
        (* Step 6: Parse error if the formatting element is not the current node *)
        (match current_node t with
         | Some n when n != fmt_node -> parse_error t "adoption-agency-1.3"
         | _ -> ());

        (* Step 7: Furthest block. open_elements is ordered
           [current(top) ... html(bottom)], so the candidates are the entries
           at indices below the formatting element's. Scanning starts right
           next to the formatting element and moves toward the current node;
           the first special element met is the furthest block. *)
        let fmt_stack_idx = ref (-1) in
        List.iteri (fun i n -> if n == fmt_node then fmt_stack_idx := i) t.open_elements;
        let rec nearest_special idx =
          if idx < 0 then None
          else
            let candidate = List.nth t.open_elements idx in
            if is_special_element candidate then Some candidate
            else nearest_special (idx - 1)
        in
        let furthest_block =
          if !fmt_stack_idx <= 0 then None
          else nearest_special (!fmt_stack_idx - 1)
        in

        match furthest_block with
        | None ->
          (* Step 8: No furthest block - pop elements including formatting element *)
          pop_until t (fun n -> n == fmt_node);
          t.active_formatting <- drop_nth fmt_idx t.active_formatting;
          done_flag := true

        | Some fb ->
          (* Step 9: Common ancestor = stack entry right after the formatting
             element in list order (one step toward html). *)
          let rec find_common_ancestor = function
            | [] -> None
            | n :: rest when n == fmt_node ->
              (match rest with
               | [] -> None
               | above :: _ -> Some above)
            | _ :: rest -> find_common_ancestor rest
          in
          let common_ancestor = find_common_ancestor t.open_elements in

          (* Step 10: Bookmark starts after formatting element *)
          let bookmark = ref (fmt_idx + 1) in

          (* Step 11: Let last_node = furthest block *)
          let last_node = ref fb in

          (* Step 12: Inner loop. Walks the stack entries lying between the
             furthest block and the formatting element: entries without an
             active formatting record are dropped from the stack, the others
             are replaced by a fresh element that adopts last_node. *)
          let inner_loop_counter = ref 0 in

          (* Index of the furthest block on the stack *)
          let fb_idx = ref 0 in
          List.iteri (fun i n -> if n == fb then fb_idx := i) t.open_elements;

          (* Start from element after furthest block (toward formatting element) *)
          let node_idx = ref (!fb_idx + 1) in

          while !node_idx < List.length t.open_elements &&
                (List.nth t.open_elements !node_idx) != fmt_node do
            incr inner_loop_counter;
            let cur = List.nth t.open_elements !node_idx in

            (* Step 12.3: Position of [cur] in the active formatting list *)
            let rec find_node_in_formatting idx = function
              | [] -> None
              | Entry e :: _ when e.node == cur -> Some idx
              | _ :: rest -> find_node_in_formatting (idx + 1) rest
            in

            (* Step 12.4: After three iterations, an entry that is found is
               removed from the list and then treated as not found. *)
            let node_fmt_idx =
              match find_node_in_formatting 0 t.active_formatting with
              | Some idx when !inner_loop_counter > 3 ->
                t.active_formatting <- drop_nth idx t.active_formatting;
                if idx < !bookmark then decr bookmark;
                None
              | other -> other
            in

            (* Step 12.5: If node not in active formatting, remove from stack and continue *)
            match node_fmt_idx with
            | None ->
              (* Removing shifts later entries down, so node_idx is left
                 unchanged and now designates the next entry. *)
              t.open_elements <- drop_nth !node_idx t.open_elements

            | Some af_idx ->
              (* Step 12.6: Create new element for node *)
              let (node_name, node_attrs) =
                match List.nth t.active_formatting af_idx with
                | Entry e -> (e.name, e.attrs)
                | Marker -> failwith "unexpected marker"
              in
              let new_node_elem = Dom.create_element node_name ~attrs:node_attrs () in

              (* Point the active formatting entry at the new element *)
              t.active_formatting <- List.mapi (fun i entry ->
                if i = af_idx then
                  Entry { name = node_name; node = new_node_elem; attrs = node_attrs }
                else entry
              ) t.active_formatting;

              (* Swap the new element into the stack slot *)
              let slot = !node_idx in
              t.open_elements <- List.mapi (fun i n ->
                if i = slot then new_node_elem else n
              ) t.open_elements;

              (* Step 12.7: If last_node is furthest block, update bookmark *)
              if !last_node == fb then
                bookmark := af_idx + 1;

              (* Step 12.8: Reparent last_node to new node *)
              detach !last_node;
              Dom.append_child new_node_elem !last_node;

              (* Step 12.9: Let last_node = new node *)
              last_node := new_node_elem;

              (* Move to next element *)
              incr node_idx
          done;

          (* Step 13: Insert last_node into common ancestor *)
          (match common_ancestor with
           | None -> ()
           | Some ca ->
             detach !last_node;
             (* Check if we need foster parenting *)
             if t.foster_parenting
                && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
               (* Insert before the nearest open table when it has a parent;
                  otherwise append to the common ancestor. *)
               match List.find_opt (fun n -> n.Dom.name = "table") t.open_elements with
               | Some table ->
                 (match table.Dom.parent with
                  | Some parent -> Dom.insert_before parent !last_node table
                  | None -> Dom.append_child ca !last_node)
               | None -> Dom.append_child ca !last_node
             end else begin
               (* If common ancestor is template, insert into its content *)
               match ca.Dom.template_content with
               | Some tc -> Dom.append_child tc !last_node
               | None -> Dom.append_child ca !last_node
             end);

          (* Step 14: Create new formatting element *)
          let new_formatting = Dom.create_element tag_name ~attrs:fmt_attrs () in

          (* Step 15: Move children of furthest block to new formatting element *)
          List.iter (fun child ->
            Dom.remove_child fb child;
            Dom.append_child new_formatting child
          ) fb.Dom.children;

          (* Step 16: Append new formatting element to furthest block *)
          Dom.append_child fb new_formatting;

          (* Step 17: Remove old from active formatting, insert new at bookmark *)
          let new_entry = Entry { name = tag_name; node = new_formatting; attrs = fmt_attrs } in
          t.active_formatting <- drop_nth fmt_idx t.active_formatting;
          (* Adjust bookmark since we removed an element *)
          let adjusted_bookmark =
            if fmt_idx < !bookmark then !bookmark - 1 else !bookmark
          in
          let rec insert_at_bookmark idx seen = function
            | [] -> List.rev (new_entry :: seen)
            | x :: rest when idx = adjusted_bookmark ->
              List.rev_append seen (new_entry :: x :: rest)
            | x :: rest -> insert_at_bookmark (idx + 1) (x :: seen) rest
          in
          t.active_formatting <- insert_at_bookmark 0 [] t.active_formatting;

          (* Step 18: Remove formatting element from open elements, insert new after furthest block *)
          (* With the current node at index 0, that means placing
             new_formatting immediately before fb in the list (or at the end
             if fb is not found). *)
          t.open_elements <- List.filter (fun n -> n != fmt_node) t.open_elements;
          let rec insert_before seen = function
            | [] -> List.rev (new_formatting :: seen)
            | n :: rest when n == fb ->
              List.rev_append seen (new_formatting :: n :: rest)
            | n :: rest -> insert_before (n :: seen) rest
          in
          t.open_elements <- insert_before [] t.open_elements
          (* Continue outer loop *)
      end
  done

(* Close p element *)
(* Generates implied end tags except for "p", reports "expected-p" if the
   current node is then not named "p", and pops up to and including the
   nearest "p". *)
let close_p_element t =
  generate_implied_end_tags t ~except:"p" ();
  (match current_node t with
   | Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
   | _ -> ());
  pop_until_tag t "p"

(* Reset insertion mode *)
let reset_insertion_mode t =
  let rec check_node last = function
    | [] -> t.mode <- Insertion_mode.In_body
    | node :: rest ->
      let
is_last = rest = [] in
      (* For the bottom-most stack entry, a fragment context (when one is
         set) stands in for the node itself. *)
      let node_to_check =
        match t.fragment_context with
        | Some ctx when is_last ->
          Dom.create_element ctx.tag_name ~namespace:ctx.namespace ()
        | _ -> node
      in
      let set_mode m = t.mode <- m in
      (match node_to_check.Dom.name with
       | "select" ->
         (* Look further down the stack: a "table" met before any "template"
            selects In_select_in_table. The answer is computed locally and
            never read back from [t.mode], so a mode left over from before
            the reset cannot leak into the result. *)
         let rec table_before_template = function
           | [] -> false
           | n :: below ->
             if n.Dom.name = "template" then false
             else n.Dom.name = "table" || table_before_template below
         in
         set_mode
           (if (not is_last) && table_before_template rest
            then Insertion_mode.In_select_in_table
            else Insertion_mode.In_select)
       | ("td" | "th") when not is_last -> set_mode Insertion_mode.In_cell
       | "tr" -> set_mode Insertion_mode.In_row
       | "tbody" | "thead" | "tfoot" -> set_mode Insertion_mode.In_table_body
       | "caption" -> set_mode Insertion_mode.In_caption
       | "colgroup" -> set_mode Insertion_mode.In_column_group
       | "table" -> set_mode Insertion_mode.In_table
       | "template" ->
         (* Current template insertion mode, or In_template if none is recorded *)
         set_mode
           (match t.template_modes with
            | m :: _ -> m
            | [] -> Insertion_mode.In_template)
       | "head" when not is_last -> set_mode Insertion_mode.In_head
       | "body" -> set_mode Insertion_mode.In_body
       | "frameset" -> set_mode Insertion_mode.In_frameset
       | "html" ->
         set_mode
           (if t.head_element = None then Insertion_mode.Before_head
            else Insertion_mode.After_head)
       | _ when is_last -> set_mode Insertion_mode.In_body
       | _ -> check_node last rest)
  in
  check_node false t.open_elements

let is_whitespace s =
  let
ws = [' '; '\t'; '\n'; '\x0C'; '\r'] in
  String.for_all (fun c -> List.mem c ws) s

(* Mode handlers *)

(* "initial" insertion mode. Whitespace-only character tokens are dropped and
   comments go on the document. A doctype token is appended to the document
   and decides the quirks mode; any other token forces quirks mode. The mode
   then becomes Before_html, and the "any other token" case is reprocessed
   there. *)
let rec process_initial t token =
  match token with
  | Token.Character data when is_whitespace data -> ()
  | Token.Comment data -> insert_comment_to_document t data
  | Token.Doctype dt ->
    Dom.append_child t.document
      (Dom.create_doctype ?name:dt.name ?public_id:dt.public_id
         ?system_id:dt.system_id ());
    (* Quirks mode detection; identifiers are compared lowercased *)
    let lowercase = Option.map String.lowercase_ascii in
    let pub = lowercase dt.public_id in
    let sys = lowercase dt.system_id in
    let pub_starts_with_any prefixes =
      match pub with
      | Some p -> List.exists (fun prefix -> String.starts_with ~prefix p) prefixes
      | None -> false
    in
    let is_quirky () =
      (match pub with
       | Some p -> List.mem p Constants.quirky_public_matches
       | None -> false)
      || pub_starts_with_any Constants.quirky_public_prefixes
      || (match sys with
          | Some s -> List.mem s Constants.quirky_system_matches
          | None -> false)
    in
    if dt.force_quirks || dt.name <> Some "html" || is_quirky () then
      t.quirks_mode <- Dom.Quirks
    else if pub_starts_with_any Constants.limited_quirky_public_prefixes then
      t.quirks_mode <- Dom.Limited_quirks;
    t.mode <- Insertion_mode.Before_html
  | _ ->
    parse_error t "expected-doctype-but-got-other";
    t.quirks_mode <- Dom.Quirks;
    t.mode <- Insertion_mode.Before_html;
    process_token t token

(* "before html" insertion mode. An html start tag creates the root element
   with the tag's attributes. End tags other than head/body/html/br are parse
   errors and are ignored. Everything else creates an attribute-less root and
   is reprocessed in Before_head. *)
and process_before_html t token =
  let open_html attrs =
    let html = insert_element t "html" attrs in
    (* The root element becomes the only entry on the stack *)
    t.open_elements <- [html];
    t.mode <- Insertion_mode.Before_head
  in
  match token with
  | Token.Doctype _ -> parse_error t "unexpected-doctype"
  | Token.Comment data -> insert_comment_to_document t data
  | Token.Character data when is_whitespace data -> ()
  | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
    open_html attrs
  | Token.Tag { kind = Token.End; name; _ }
    when not (List.mem name ["head"; "body"; "html"; "br"]) ->
    parse_error t "unexpected-end-tag"
  | _ ->
    open_html [];
    process_token t token

(* "before head" insertion mode. A head start tag opens the head element with
   the tag's attributes. End tags other than head/body/html/br are parse
   errors and are ignored. Everything else opens an attribute-less head and
   is reprocessed in In_head. *)
and process_before_head t token =
  let open_head attrs =
    let head = insert_element t "head" ~push:true attrs in
    t.head_element <- Some head;
    t.mode <- Insertion_mode.In_head
  in
  match token with
  | Token.Character data when is_whitespace data -> ()
  | Token.Comment data -> insert_comment t data
  | Token.Doctype _ -> parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
  | Token.Tag { kind = Token.Start; name = "head"; attrs; _ } ->
    open_head attrs
  | Token.Tag { kind = Token.End; name; _ }
    when not (List.mem name ["head"; "body"; "html"; "br"]) ->
    parse_error t "unexpected-end-tag"
  | _ ->
    open_head [];
    process_token t token
(* "in head" insertion mode.

   Fix over the previous version: title, noframes, style and noscript start
   tags are now inserted with the attributes carried by the token; they used
   to be inserted with an empty attribute list. *)
and process_in_head t token =
  (* Insert an element whose content is handled in Text mode, remembering the
     mode to come back to. *)
  let start_text_element name attrs self_closing =
    ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
    t.original_mode <- Some t.mode;
    t.mode <- Insertion_mode.Text
  in
  (* Pop the current node and continue in After_head. *)
  let leave_head () =
    pop_current t;
    t.mode <- Insertion_mode.After_head
  in
  let is_html_template n = n.Dom.name = "template" && is_in_html_namespace n in
  match token with
  | Token.Character data when is_whitespace data ->
    insert_character t data
  | Token.Character data ->
    (* Split off the leading whitespace run *)
    let len = String.length data in
    let rec count_leading_ws i =
      if i >= len then i
      else match data.[i] with
        | '\t' | '\n' | '\x0C' | '\r' | ' ' -> count_leading_ws (i + 1)
        | _ -> i
    in
    let ws_count = count_leading_ws 0 in
    (* The leading whitespace is inserted only when the current node already
       has children *)
    if ws_count > 0 then
      (match current_node t with
       | Some n when n.Dom.children <> [] ->
         insert_character t (String.sub data 0 ws_count)
       | _ -> ());
    leave_head ();
    process_token t (Token.Character (String.sub data ws_count (len - ws_count)))
  | Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
  | Token.Tag { kind = Token.Start; name; attrs; _ }
    when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
    (* Inserted without being pushed on the stack *)
    ignore (insert_element t name attrs)
  | Token.Tag { kind = Token.Start; name = "title"; attrs; _ } ->
    start_text_element "title" attrs false
  | Token.Tag { kind = Token.Start; name; attrs; _ }
    when List.mem name ["noframes"; "style"] ->
    start_text_element name attrs false
  | Token.Tag { kind = Token.Start; name = "noscript"; attrs; _ } ->
    (* Scripting is disabled: parse noscript content as HTML *)
    ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing = false });
    t.mode <- Insertion_mode.In_head_noscript
  | Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
    start_text_element "script" attrs self_closing
  | Token.Tag { kind = Token.End; name = "head"; _ } ->
    leave_head ()
  | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
    leave_head ();
    process_token t token
  | Token.Tag { kind = Token.Start; name = "template"; attrs; _ } ->
    let node = Dom.create_template ~attrs () in
    let (parent, _) = appropriate_insertion_place t in
    Dom.append_child parent node;
    t.open_elements <- node :: t.open_elements;
    push_formatting_marker t;
    t.frameset_ok <- false;
    t.mode <- Insertion_mode.In_template;
    t.template_modes <- Insertion_mode.In_template :: t.template_modes
  | Token.Tag { kind = Token.End; name = "template"; _ } ->
    if not (List.exists is_html_template t.open_elements) then
      parse_error t "unexpected-end-tag"
    else begin
      generate_all_implied_end_tags t;
      (match current_node t with
       | Some n when not (is_html_template n) -> parse_error t "unexpected-end-tag"
       | _ -> ());
      pop_until_html_tag t "template";
      clear_active_formatting_to_marker t;
      (* Drop the innermost template insertion mode, if any *)
      t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
      reset_insertion_mode t
    end
  | Token.Tag { kind = Token.Start; name = "head"; _ } ->
    parse_error t "unexpected-start-tag"
  | Token.Tag { kind = Token.End; _ } ->
    parse_error t "unexpected-end-tag"
  | _ ->
    leave_head ();
    process_token t token

(* "in head noscript" insertion mode. Whitespace, comments and a fixed set of
   head-level start tags are delegated to the in-head rules. Tokens that
   cannot stay inside noscript pop it, switch back to In_head and are
   reprocessed. *)
and process_in_head_noscript t token =
  (* Pop noscript, return to In_head and reprocess the current token. *)
  let leave_noscript_and_reprocess () =
    pop_current t;
    t.mode <- Insertion_mode.In_head;
    process_token t token
  in
  match token with
  | Token.Character data when is_whitespace data ->
    process_in_head t token
  | Token.Character _ ->
    parse_error t "unexpected-char-in-noscript";
    leave_noscript_and_reprocess ()
  | Token.Comment _ ->
    process_in_head t token
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["basefont"; "bgsound"; "link"; "meta"; "noframes"; "style"] ->
    process_in_head t token
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["head"; "noscript"] ->
    parse_error t "unexpected-start-tag"
  | Token.Tag { kind = Token.Start; _ } ->
    parse_error t "unexpected-start-tag";
    leave_noscript_and_reprocess ()
  | Token.Tag { kind = Token.End; name = "noscript"; _ } ->
    pop_current t; (* Pop noscript *)
    t.mode <- Insertion_mode.In_head
  | Token.Tag { kind = Token.End; name = "br"; _ } ->
    parse_error t "unexpected-end-tag";
    leave_noscript_and_reprocess ()
  | Token.Tag { kind = Token.End; _ } ->
    parse_error t "unexpected-end-tag"
  | Token.EOF ->
    parse_error t "expected-closing-tag-but-got-eof";
    leave_noscript_and_reprocess ()

and process_after_head t token =
  match token with
  | Token.Character data when is_whitespace data ->
    insert_character t data
  |
Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
  | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
    ignore (insert_element t "body" ~push:true attrs);
    t.frameset_ok <- false;
    t.mode <- Insertion_mode.In_body
  | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } ->
    ignore (insert_element t "frameset" ~push:true attrs);
    t.mode <- Insertion_mode.In_frameset
  | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
    (* An input whose type is "hidden" (ASCII case-insensitive) is reported
       and dropped; any other input opens an implied body and is reprocessed
       in In_body. *)
    (match List.assoc_opt "type" attrs with
     | Some typ when String.lowercase_ascii typ = "hidden" ->
       parse_error t "unexpected-hidden-input-after-head"
     | _ ->
       ignore (insert_element t "body" ~push:true []);
       t.mode <- Insertion_mode.In_body;
       process_token t token)
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
    parse_error t "unexpected-start-tag";
    (* Temporarily put the head element back on the stack, apply the in-head
       rules, then take it off again wherever it ended up. *)
    (match t.head_element with
     | None -> ()
     | Some head ->
       t.open_elements <- head :: t.open_elements;
       process_in_head t token;
       t.open_elements <- List.filter (fun n -> n != head) t.open_elements)
  | Token.Tag { kind = Token.End; name = "template"; _ } ->
    process_in_head t token
  | Token.Tag { kind = Token.End; name; _ } when List.mem name ["body"; "html"; "br"] ->
    ignore (insert_element t "body" ~push:true []);
    t.mode <- Insertion_mode.In_body;
    process_token t token
  | Token.Tag { kind = Token.Start; name = "head"; _ } ->
    parse_error t "unexpected-start-tag"
  | Token.Tag { kind = Token.End; _ } ->
    parse_error t "unexpected-end-tag"
  | _ ->
    (* Anything else: open an implied body and reprocess in In_body *)
    ignore (insert_element t "body" ~push:true []);
    t.mode <- Insertion_mode.In_body;
    process_token t token

and process_in_body t token =
  match token with
  | Token.Character "\x00" ->
    parse_error t "unexpected-null-character"
  | Token.Character data ->
    reconstruct_active_formatting t;
    insert_character t data;
    if not (is_whitespace data) then t.frameset_ok <- false
  | Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; attrs; _ } ->
    parse_error t "unexpected-start-tag";
    if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then
      (* Find the html element (at the bottom of the stack) *)
      let html_elem = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in
      (match html_elem with
       | Some html ->
         List.iter (fun (k, v) ->
           if not (Dom.has_attr html k) then Dom.set_attr html k v
         ) attrs
       | None -> ())
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] ->
    process_in_head t token
  | Token.Tag { kind = Token.End; name = "template"; _ } ->
    process_in_head t token
  | Token.Tag { kind = Token.Start; name = "body"; attrs; _ } ->
    parse_error t "unexpected-start-tag";
    (* Find body element on stack - it should be near the end (html is last) *)
    let body = List.find_opt (fun n -> n.Dom.name = "body") t.open_elements in
    (match body with
     | Some body when not (List.exists (fun n -> n.Dom.name = "template")
t.open_elements) -> 1034 + t.frameset_ok <- false; 1035 + List.iter (fun (k, v) -> 1036 + if not (Dom.has_attr body k) then Dom.set_attr body k v 1037 + ) attrs 1038 + | _ -> ()) 1039 + | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 1040 + if not t.frameset_ok then 1041 + parse_error t "unexpected-start-tag-ignored" 1042 + else begin 1043 + (* Find body element on the stack *) 1044 + let rec find_body_index idx = function 1045 + | [] -> None 1046 + | n :: rest -> 1047 + if n.Dom.name = "body" then Some (idx, n) 1048 + else find_body_index (idx + 1) rest 1049 + in 1050 + match find_body_index 0 t.open_elements with 1051 + | None -> 1052 + parse_error t "unexpected-start-tag-ignored" 1053 + | Some (idx, body_elem) -> 1054 + (* Remove body from its parent (the html element) *) 1055 + (match body_elem.Dom.parent with 1056 + | Some parent -> Dom.remove_child parent body_elem 1057 + | None -> ()); 1058 + (* Pop all elements up to and including body - keep only elements after body_idx *) 1059 + let rec drop n lst = if n <= 0 then lst else match lst with [] -> [] | _ :: rest -> drop (n - 1) rest in 1060 + t.open_elements <- drop (idx + 1) t.open_elements; 1061 + (* Insert frameset element *) 1062 + ignore (insert_element t "frameset" ~push:true attrs); 1063 + t.mode <- Insertion_mode.In_frameset 1064 + end 1065 + | Token.EOF -> 1066 + if t.template_modes <> [] then 1067 + process_in_template t token 1068 + else begin 1069 + let has_unclosed = List.exists (fun n -> 1070 + not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1071 + ) t.open_elements in 1072 + if has_unclosed then parse_error t "expected-closing-tag-but-got-eof" 1073 + end 1074 + | Token.Tag { kind = Token.End; name = "body"; _ } -> 1075 + if not (has_element_in_scope t "body") then 1076 + parse_error t "unexpected-end-tag" 1077 + else begin 1078 + let has_unclosed = List.exists (fun n 
-> 1079 + not (List.mem n.Dom.name ["dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"; "body"; "html"]) 1080 + ) t.open_elements in 1081 + if has_unclosed then parse_error t "end-tag-too-early"; 1082 + t.mode <- Insertion_mode.After_body 1083 + end 1084 + | Token.Tag { kind = Token.End; name = "html"; _ } -> 1085 + if not (has_element_in_scope t "body") then 1086 + parse_error t "unexpected-end-tag" 1087 + else begin 1088 + t.mode <- Insertion_mode.After_body; 1089 + process_token t token 1090 + end 1091 + | Token.Tag { kind = Token.Start; name; attrs; _ } 1092 + when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] -> 1093 + if has_element_in_button_scope t "p" then close_p_element t; 1094 + ignore (insert_element t name ~push:true attrs) 1095 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.heading_elements -> 1096 + if has_element_in_button_scope t "p" then close_p_element t; 1097 + (match current_node t with 1098 + | Some n when List.mem n.Dom.name Constants.heading_elements -> 1099 + parse_error t "unexpected-start-tag"; 1100 + pop_current t 1101 + | _ -> ()); 1102 + ignore (insert_element t name ~push:true attrs) 1103 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["pre"; "listing"] -> 1104 + if has_element_in_button_scope t "p" then close_p_element t; 1105 + ignore (insert_element t name ~push:true attrs); 1106 + t.ignore_lf <- true; 1107 + t.frameset_ok <- false 1108 + | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } -> 1109 + if t.form_element <> None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1110 + parse_error t "unexpected-start-tag" 1111 + else begin 1112 + if has_element_in_button_scope t 
"p" then close_p_element t; 1113 + let form = insert_element t "form" attrs in 1114 + t.open_elements <- form :: t.open_elements; 1115 + if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then 1116 + t.form_element <- Some form 1117 + end 1118 + | Token.Tag { kind = Token.Start; name = "li"; attrs; _ } -> 1119 + t.frameset_ok <- false; 1120 + let rec check = function 1121 + | [] -> () 1122 + | n :: rest -> 1123 + if n.Dom.name = "li" then begin 1124 + generate_implied_end_tags t ~except:"li" (); 1125 + (match current_node t with 1126 + | Some cn when cn.Dom.name <> "li" -> parse_error t "unexpected-start-tag" 1127 + | _ -> ()); 1128 + pop_until_tag t "li" 1129 + end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1130 + () 1131 + else 1132 + check rest 1133 + in 1134 + check t.open_elements; 1135 + if has_element_in_button_scope t "p" then close_p_element t; 1136 + ignore (insert_element t "li" ~push:true attrs) 1137 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["dd"; "dt"] -> 1138 + t.frameset_ok <- false; 1139 + let rec check = function 1140 + | [] -> () 1141 + | n :: rest -> 1142 + if List.mem n.Dom.name ["dd"; "dt"] then begin 1143 + generate_implied_end_tags t ~except:n.Dom.name (); 1144 + (match current_node t with 1145 + | Some cn when cn.Dom.name <> n.Dom.name -> parse_error t "unexpected-start-tag" 1146 + | _ -> ()); 1147 + pop_until_one_of t ["dd"; "dt"] 1148 + end else if is_special_element n && not (List.mem (String.lowercase_ascii n.Dom.name) ["address"; "div"; "p"]) then 1149 + () 1150 + else 1151 + check rest 1152 + in 1153 + check t.open_elements; 1154 + if has_element_in_button_scope t "p" then close_p_element t; 1155 + ignore (insert_element t name ~push:true attrs) 1156 + | Token.Tag { kind = Token.Start; name = "plaintext"; _ } -> 1157 + if has_element_in_button_scope t "p" then close_p_element t; 1158 + ignore (insert_element t 
"plaintext" ~push:true []) 1159 + (* Tokenizer should switch to PLAINTEXT state *) 1160 + | Token.Tag { kind = Token.Start; name = "button"; attrs; _ } -> 1161 + if has_element_in_scope t "button" then begin 1162 + parse_error t "unexpected-start-tag"; 1163 + generate_implied_end_tags t (); 1164 + pop_until_tag t "button" 1165 + end; 1166 + reconstruct_active_formatting t; 1167 + ignore (insert_element t "button" ~push:true attrs); 1168 + t.frameset_ok <- false 1169 + | Token.Tag { kind = Token.End; name; _ } 1170 + when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] -> 1171 + if not (has_element_in_scope t name) then 1172 + parse_error t "unexpected-end-tag" 1173 + else begin 1174 + generate_implied_end_tags t (); 1175 + (match current_node t with 1176 + | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1177 + | _ -> ()); 1178 + pop_until_tag t name 1179 + end 1180 + | Token.Tag { kind = Token.End; name = "form"; _ } -> 1181 + if not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin 1182 + let node = t.form_element in 1183 + t.form_element <- None; 1184 + match node with 1185 + | None -> parse_error t "unexpected-end-tag" 1186 + | Some form_node -> 1187 + if not (has_element_in_scope t "form") then 1188 + parse_error t "unexpected-end-tag" 1189 + else begin 1190 + generate_implied_end_tags t (); 1191 + (match current_node t with 1192 + | Some n when n != form_node -> parse_error t "end-tag-too-early" 1193 + | _ -> ()); 1194 + t.open_elements <- List.filter (fun n -> n != form_node) t.open_elements 1195 + end 1196 + end else begin 1197 + if not (has_element_in_scope t "form") then 1198 + parse_error t "unexpected-end-tag" 1199 + else begin 1200 + generate_implied_end_tags t (); 1201 + (match 
current_node t with 1202 + | Some n when n.Dom.name <> "form" -> parse_error t "end-tag-too-early" 1203 + | _ -> ()); 1204 + pop_until_tag t "form" 1205 + end 1206 + end 1207 + | Token.Tag { kind = Token.End; name = "p"; _ } -> 1208 + if not (has_element_in_button_scope t "p") then begin 1209 + parse_error t "unexpected-end-tag"; 1210 + ignore (insert_element t "p" ~push:true []) 1211 + end; 1212 + close_p_element t 1213 + | Token.Tag { kind = Token.End; name = "li"; _ } -> 1214 + if not (has_element_in_list_item_scope t "li") then 1215 + parse_error t "unexpected-end-tag" 1216 + else begin 1217 + generate_implied_end_tags t ~except:"li" (); 1218 + (match current_node t with 1219 + | Some n when n.Dom.name <> "li" -> parse_error t "end-tag-too-early" 1220 + | _ -> ()); 1221 + pop_until_tag t "li" 1222 + end 1223 + | Token.Tag { kind = Token.End; name; _ } when List.mem name ["dd"; "dt"] -> 1224 + if not (has_element_in_scope t name) then 1225 + parse_error t "unexpected-end-tag" 1226 + else begin 1227 + generate_implied_end_tags t ~except:name (); 1228 + (match current_node t with 1229 + | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1230 + | _ -> ()); 1231 + pop_until_tag t name 1232 + end 1233 + | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.heading_elements -> 1234 + if not (has_element_in_scope_impl t Constants.heading_elements Constants.default_scope ~check_integration_points:true) then 1235 + parse_error t "unexpected-end-tag" 1236 + else begin 1237 + generate_implied_end_tags t (); 1238 + (match current_node t with 1239 + | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1240 + | _ -> ()); 1241 + pop_until_one_of t Constants.heading_elements 1242 + end 1243 + | Token.Tag { kind = Token.Start; name = "a"; attrs; _ } -> 1244 + (* Check for existing <a> in active formatting *) 1245 + let rec find_a = function 1246 + | [] -> None 1247 + | Marker :: _ -> None 1248 + | Entry e :: _ when e.name = 
"a" -> Some e.node 1249 + | _ :: rest -> find_a rest 1250 + in 1251 + (match find_a t.active_formatting with 1252 + | Some existing -> 1253 + parse_error t "unexpected-start-tag"; 1254 + adoption_agency t "a"; 1255 + t.active_formatting <- List.filter (function 1256 + | Entry e -> e.node != existing 1257 + | _ -> true 1258 + ) t.active_formatting; 1259 + t.open_elements <- List.filter (fun n -> n != existing) t.open_elements 1260 + | None -> ()); 1261 + reconstruct_active_formatting t; 1262 + let node = insert_element t "a" attrs in 1263 + t.open_elements <- node :: t.open_elements; 1264 + push_formatting_element t node "a" attrs 1265 + | Token.Tag { kind = Token.Start; name; attrs; _ } 1266 + when List.mem name ["b"; "big"; "code"; "em"; "font"; "i"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1267 + reconstruct_active_formatting t; 1268 + let node = insert_element t name attrs in 1269 + t.open_elements <- node :: t.open_elements; 1270 + push_formatting_element t node name attrs 1271 + | Token.Tag { kind = Token.Start; name = "nobr"; attrs; _ } -> 1272 + if has_element_in_scope t "nobr" then begin 1273 + parse_error t "unexpected-start-tag"; 1274 + adoption_agency t "nobr"; 1275 + (* Remove nobr from active formatting *) 1276 + t.active_formatting <- List.filter (function 1277 + | Entry e -> e.name <> "nobr" 1278 + | Marker -> true 1279 + ) t.active_formatting; 1280 + (* Remove nobr from open elements *) 1281 + t.open_elements <- List.filter (fun n -> n.Dom.name <> "nobr") t.open_elements 1282 + end; 1283 + reconstruct_active_formatting t; 1284 + let node = insert_element t "nobr" attrs in 1285 + t.open_elements <- node :: t.open_elements; 1286 + push_formatting_element t node "nobr" attrs 1287 + | Token.Tag { kind = Token.End; name; _ } 1288 + when List.mem name ["a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small"; "strike"; "strong"; "tt"; "u"] -> 1289 + adoption_agency t name 1290 + | Token.Tag { kind = Token.Start; name; attrs; _ } 1291 + when 
List.mem name ["applet"; "marquee"; "object"] -> 1292 + reconstruct_active_formatting t; 1293 + ignore (insert_element t name ~push:true attrs); 1294 + push_formatting_marker t; 1295 + t.frameset_ok <- false 1296 + | Token.Tag { kind = Token.End; name; _ } 1297 + when List.mem name ["applet"; "marquee"; "object"] -> 1298 + if not (has_element_in_scope t name) then 1299 + parse_error t "unexpected-end-tag" 1300 + else begin 1301 + generate_implied_end_tags t (); 1302 + (match current_node t with 1303 + | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early" 1304 + | _ -> ()); 1305 + pop_until_tag t name; 1306 + clear_active_formatting_to_marker t 1307 + end 1308 + | Token.Tag { kind = Token.Start; name = "table"; attrs; _ } -> 1309 + if t.quirks_mode <> Dom.Quirks && has_element_in_button_scope t "p" then 1310 + close_p_element t; 1311 + ignore (insert_element t "table" ~push:true attrs); 1312 + t.frameset_ok <- false; 1313 + t.mode <- Insertion_mode.In_table 1314 + | Token.Tag { kind = Token.End; name = "br"; _ } -> 1315 + parse_error t "unexpected-end-tag"; 1316 + reconstruct_active_formatting t; 1317 + ignore (insert_element t "br" ~push:true []); 1318 + pop_current t; 1319 + t.frameset_ok <- false 1320 + | Token.Tag { kind = Token.Start; name; attrs; _ } 1321 + when List.mem name ["area"; "br"; "embed"; "img"; "keygen"; "wbr"] -> 1322 + reconstruct_active_formatting t; 1323 + ignore (insert_element t name ~push:true attrs); 1324 + pop_current t; 1325 + t.frameset_ok <- false 1326 + | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } -> 1327 + reconstruct_active_formatting t; 1328 + ignore (insert_element t "input" ~push:true attrs); 1329 + pop_current t; 1330 + let is_hidden = List.exists (fun (k, v) -> 1331 + String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1332 + ) attrs in 1333 + if not is_hidden then t.frameset_ok <- false 1334 + | Token.Tag { kind = Token.Start; name; _ } 1335 + when List.mem name ["param"; 
"source"; "track"] -> 1336 + ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false }); 1337 + pop_current t 1338 + | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1339 + if has_element_in_button_scope t "p" then close_p_element t; 1340 + ignore (insert_element t "hr" ~push:true []); 1341 + pop_current t; 1342 + t.frameset_ok <- false 1343 + | Token.Tag { kind = Token.Start; name = "image"; attrs; _ } -> 1344 + parse_error t "unexpected-start-tag"; 1345 + (* Treat <image> as <img> *) 1346 + reconstruct_active_formatting t; 1347 + ignore (insert_element t "img" ~push:true attrs); 1348 + pop_current t; 1349 + t.frameset_ok <- false 1350 + | Token.Tag { kind = Token.Start; name = "textarea"; attrs; _ } -> 1351 + ignore (insert_element t "textarea" ~push:true attrs); 1352 + t.ignore_lf <- true; 1353 + t.original_mode <- Some t.mode; 1354 + t.frameset_ok <- false; 1355 + t.mode <- Insertion_mode.Text 1356 + | Token.Tag { kind = Token.Start; name = "xmp"; _ } -> 1357 + if has_element_in_button_scope t "p" then close_p_element t; 1358 + reconstruct_active_formatting t; 1359 + t.frameset_ok <- false; 1360 + ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false }); 1361 + t.original_mode <- Some t.mode; 1362 + t.mode <- Insertion_mode.Text 1363 + | Token.Tag { kind = Token.Start; name = "iframe"; _ } -> 1364 + t.frameset_ok <- false; 1365 + ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false }); 1366 + t.original_mode <- Some t.mode; 1367 + t.mode <- Insertion_mode.Text 1368 + | Token.Tag { kind = Token.Start; name = "noembed"; _ } -> 1369 + ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false }); 1370 + t.original_mode <- Some t.mode; 1371 + t.mode <- Insertion_mode.Text 1372 + | Token.Tag { kind = Token.Start; name = "select"; attrs; _ } -> 1373 + 
reconstruct_active_formatting t; 1374 + ignore (insert_element t "select" ~push:true attrs); 1375 + t.frameset_ok <- false; 1376 + if List.mem t.mode [Insertion_mode.In_table; Insertion_mode.In_caption; Insertion_mode.In_table_body; Insertion_mode.In_row; Insertion_mode.In_cell] then 1377 + t.mode <- Insertion_mode.In_select_in_table 1378 + else 1379 + t.mode <- Insertion_mode.In_select 1380 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["optgroup"; "option"] -> 1381 + (match current_node t with 1382 + | Some n when n.Dom.name = "option" -> pop_current t 1383 + | _ -> ()); 1384 + reconstruct_active_formatting t; 1385 + ignore (insert_element t name ~push:true attrs) 1386 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rb"; "rtc"] -> 1387 + if has_element_in_scope t "ruby" then begin 1388 + generate_implied_end_tags t () 1389 + end; 1390 + (match current_node t with 1391 + | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1392 + | _ -> ()); 1393 + ignore (insert_element t name ~push:true attrs) 1394 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["rp"; "rt"] -> 1395 + if has_element_in_scope t "ruby" then begin 1396 + generate_implied_end_tags t ~except:"rtc" () 1397 + end; 1398 + (match current_node t with 1399 + | Some n when n.Dom.name <> "ruby" && n.Dom.name <> "rtc" -> parse_error t "unexpected-start-tag" 1400 + | _ -> ()); 1401 + ignore (insert_element t name ~push:true attrs) 1402 + | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1403 + reconstruct_active_formatting t; 1404 + let adjusted_attrs = Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) in 1405 + ignore (insert_foreign_element t { kind = Token.Start; name = "math"; attrs = adjusted_attrs; self_closing } (Some "mathml")); 1406 + if self_closing then pop_current t 1407 + | Token.Tag { kind = Token.Start; name = "svg"; attrs; 
self_closing } ->
    reconstruct_active_formatting t;
    let adjusted_attrs = Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) in
    ignore (insert_foreign_element t { kind = Token.Start; name = "svg"; attrs = adjusted_attrs; self_closing } (Some "svg"));
    if self_closing then pop_current t
  | Token.Tag { kind = Token.Start; name; attrs; _ }
    when List.mem name ["col"; "frame"] ->
    (* In fragment context, insert these; otherwise ignore *)
    if t.fragment_context = None then
      parse_error t "unexpected-start-tag-ignored"
    else
      ignore (insert_element t name attrs)
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
    parse_error t "unexpected-start-tag"
  | Token.Tag { kind = Token.Start; name; attrs; _ } ->
    (* Any other start tag *)
    reconstruct_active_formatting t;
    ignore (insert_element t name ~push:true attrs)
  | Token.Tag { kind = Token.End; name; _ } ->
    (* Any other end tag *)
    let rec check = function
      | [] -> ()
      | node :: rest ->
        if node.Dom.name = name then begin
          generate_implied_end_tags t ~except:name ();
          (match current_node t with
           | Some n when n.Dom.name <> name -> parse_error t "end-tag-too-early"
           | _ -> ());
          pop_until t (fun n -> n == node)
        end else if is_special_element node then
          parse_error t "unexpected-end-tag"
        else
          check rest
    in
    check t.open_elements

(* Token handler for [Insertion_mode.Text].
   - Character data is inserted as-is.
   - EOF reports "expected-closing-tag-but-got-eof", pops the current node,
     restores [t.original_mode] (falling back to [In_body] when unset) and
     reprocesses the EOF token in the restored mode.
   - Any end tag pops the current node and restores the saved mode.
   - Every other token is ignored. *)
and process_text t token =
  match token with
  | Token.Character data ->
    insert_character t data
  | Token.EOF ->
    parse_error t "expected-closing-tag-but-got-eof";
    pop_current t;
    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body;
    process_token t token
  | Token.Tag { kind = Token.End; _ } ->
    pop_current t;
    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_body
  | _ -> ()

(* Token handler for [Insertion_mode.In_table].
   Table-structure start tags clear the stack back to a table context and
   open the matching section (synthesising an implied [colgroup] / [tbody]
   and reprocessing the token where needed). Tokens that do not belong in a
   table are handed to [process_in_body] with [t.foster_parenting] switched
   on for the duration of that call. *)
and process_in_table t token =
  match token with
  (* Character tokens are buffered via In_table_text when the current node is
     a table-structure element. "template" is part of the list the HTML
     standard gives for this rule; without it, whitespace directly inside a
     template whose mode is "in table" took the "anything else" branch below
     and raised a spurious parse error. *)
  | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "template"; "tfoot"; "thead"; "tr"] | None -> false) ->
    t.pending_table_chars <- [];
    t.original_mode <- Some t.mode;
    t.mode <- Insertion_mode.In_table_text;
    process_token t token
  | Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "caption"; attrs; _ } ->
    clear_stack_back_to_table_context t;
    push_formatting_marker t;
    ignore (insert_element t "caption" ~push:true attrs);
    t.mode <- Insertion_mode.In_caption
  | Token.Tag { kind = Token.Start; name = "colgroup"; attrs; _ } ->
    clear_stack_back_to_table_context t;
    ignore (insert_element t "colgroup" ~push:true attrs);
    t.mode <- Insertion_mode.In_column_group
  | Token.Tag { kind = Token.Start; name = "col"; _ } ->
    (* Implied <colgroup>; the <col> itself is reprocessed in the new mode *)
    clear_stack_back_to_table_context t;
    ignore (insert_element t "colgroup" ~push:true []);
    t.mode <- Insertion_mode.In_column_group;
    process_token t token
  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
    clear_stack_back_to_table_context t;
    ignore (insert_element t name ~push:true attrs);
    t.mode <- Insertion_mode.In_table_body
  | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"; "tr"] ->
    (* Implied <tbody>; the token is reprocessed in the new mode *)
    clear_stack_back_to_table_context t;
    ignore (insert_element t "tbody" ~push:true []);
    t.mode <- Insertion_mode.In_table_body;
    process_token t token
  | Token.Tag { kind = Token.Start; name = "table"; _ } ->
    (* Nested <table>: close the open table first, then reprocess *)
    parse_error t "unexpected-start-tag";
    if has_element_in_table_scope t "table" then begin
      pop_until_tag t "table";
      reset_insertion_mode t;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name = "table"; _ } ->
    if not (has_element_in_table_scope t "table") then
      parse_error t "unexpected-end-tag"
    else begin
      pop_until_tag t "table";
      reset_insertion_mode t
    end
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
    parse_error t "unexpected-end-tag"
  | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["style"; "script"; "template"] ->
    process_in_head t token
  | Token.Tag { kind = Token.End; name = "template"; _ } ->
    process_in_head t token
  | Token.Tag { kind = Token.Start; name = "input"; attrs; _ } ->
    let is_hidden = List.exists (fun (k, v) ->
      String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
    ) attrs in
    if not is_hidden then begin
      (* Non-hidden input is treated like any other misplaced token *)
      parse_error t "unexpected-start-tag";
      t.foster_parenting <- true;
      process_in_body t token;
      t.foster_parenting <- false
    end else begin
      (* Hidden input is inserted in place and popped immediately *)
      parse_error t "unexpected-start-tag";
      ignore (insert_element t "input" ~push:true attrs);
      pop_current t
    end
  | Token.Tag { kind = Token.Start; name = "form"; attrs; _ } ->
    parse_error t "unexpected-start-tag";
    if t.form_element = None && not (List.exists (fun n -> n.Dom.name = "template") t.open_elements) then begin
      (* The form is recorded as the form element pointer, then popped
         straight away so it never stays on the stack inside a table *)
      let form = insert_element t "form" attrs in
      t.open_elements <- form :: t.open_elements;
      t.form_element <- Some form;
      pop_current t
    end
  | Token.EOF ->
    process_in_body t token
  | _ ->
    parse_error t "unexpected-token-in-table";
    t.foster_parenting <- true;
    process_in_body t token;
    t.foster_parenting <- false

(* Pops the stack of open elements until the current node is named "table",
   "template" or "html" (or the stack is empty). *)
and clear_stack_back_to_table_context t =
  let rec loop () =
    match current_node t with
    | Some n when not (List.mem n.Dom.name ["table"; "template"; "html"]) ->
      pop_current t;
      loop ()
    | _ -> ()
  in
  loop ()

(* Token handler for [Insertion_mode.In_table_text].
   Character tokens are accumulated (newest first) in
   [t.pending_table_chars]. The first non-character token flushes the buffer:
   whitespace-only text is inserted normally, anything else is reported and
   inserted with foster parenting enabled. The handler then restores
   [t.original_mode] (falling back to [In_table]) and reprocesses the token. *)
and process_in_table_text t token =
  match token with
  | Token.Character data ->
    if String.contains data '\x00' then begin
      parse_error t "unexpected-null-character";
      (* Only the U+0000 characters are dropped; the rest of the run is kept.
         Previously the whole run was discarded, losing real text whenever a
         NUL arrived in the same character token. *)
      let kept = String.concat "" (String.split_on_char '\x00' data) in
      if kept <> "" then
        t.pending_table_chars <- kept :: t.pending_table_chars
    end else
      t.pending_table_chars <- data :: t.pending_table_chars
  | _ ->
    let pending = String.concat "" (List.rev t.pending_table_chars) in
    t.pending_table_chars <- [];
    (* Nothing buffered (e.g. the only input was NUL): insert nothing rather
       than handing an empty string to insert_character. *)
    if pending <> "" then begin
      if not (is_whitespace pending) then begin
        parse_error t "unexpected-character-in-table";
        t.foster_parenting <- true;
        reconstruct_active_formatting t;
        insert_character t pending;
        t.foster_parenting <- false
      end else
        insert_character t pending
    end;
    t.mode <- Option.value t.original_mode ~default:Insertion_mode.In_table;
    process_token t token

(* Token handler for [Insertion_mode.In_caption].
   [</caption>] closes the caption. Table-structure start tags and [</table>]
   close it implicitly and are then reprocessed in [In_table]. Everything
   else is handled as in body. *)
and process_in_caption t token =
  (* Shared closing sequence for explicit and implicit caption ends. The
     current-node check used to be performed only for an explicit
     </caption>; the implicit paths ran the same steps without reporting a
     mis-nested current node. *)
  let close_caption () =
    generate_implied_end_tags t ();
    (match current_node t with
     | Some n when n.Dom.name <> "caption" -> parse_error t "end-tag-too-early"
     | _ -> ());
    pop_until_tag t "caption";
    clear_active_formatting_to_marker t;
    t.mode <- Insertion_mode.In_table
  in
  match token with
  | Token.Tag { kind = Token.End; name = "caption"; _ } ->
    if not (has_element_in_table_scope t "caption") then
      parse_error t "unexpected-end-tag"
    else
      close_caption ()
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
    if not (has_element_in_table_scope t "caption") then
      parse_error t "unexpected-start-tag"
    else begin
      close_caption ();
      process_token t token
    end
  | Token.Tag { kind = Token.End; name = "table"; _ } ->
    if not (has_element_in_table_scope t "caption") then
      parse_error t "unexpected-end-tag"
    else begin
      close_caption ();
      process_token t token
    end
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["body"; "col"; "colgroup"; "html"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
    parse_error t "unexpected-end-tag"
  | _ ->
    process_in_body t token

(* Token handler for [Insertion_mode.In_column_group].
   Whitespace is inserted, [<col>] is inserted and popped immediately, and
   [</colgroup>] pops the colgroup and returns to [In_table]. Any other token
   closes the colgroup (when it is the current node) and is reprocessed in
   [In_table]; otherwise it is reported and ignored. *)
and process_in_column_group t token =
  match token with
  | Token.Character data when is_whitespace data ->
    insert_character t data
  | Token.Character data ->
    (* Split leading whitespace from non-whitespace *)
    let ws_chars = [' '; '\t'; '\n'; '\x0C'; '\r'] in
    let len = String.length data in
    let ws_end = ref 0 in
    while !ws_end < len && List.mem data.[!ws_end] ws_chars do incr ws_end done;
    if !ws_end > 0 then
      insert_character t (String.sub data 0 !ws_end);
    if !ws_end < len then begin
      (* The non-whitespace remainder follows the "anything else" rule *)
      let remaining = String.sub data !ws_end (len - !ws_end) in
      (match current_node t with
       | Some n when n.Dom.name = "colgroup" ->
         pop_current t;
         t.mode <- Insertion_mode.In_table;
         process_token t (Token.Character remaining)
       | _ ->
         parse_error t "unexpected-token")
    end
  | Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
  | Token.Tag { kind = Token.Start; name = "col"; attrs; _ } ->
    ignore (insert_element t "col" ~push:true attrs);
    pop_current t
  | Token.Tag { kind = Token.End; name = "colgroup"; _ } ->
    (match current_node t with
     | Some n when n.Dom.name <> "colgroup" -> parse_error t "unexpected-end-tag"
     | Some _ -> pop_current t; t.mode <- Insertion_mode.In_table
     | None -> parse_error t "unexpected-end-tag")
  | Token.Tag { kind = Token.End; name = "col"; _ } ->
    parse_error t "unexpected-end-tag"
  | Token.Tag { kind = Token.Start; name = "template"; _ }
  | Token.Tag { kind = Token.End; name = "template"; _ } ->
    process_in_head t token
  | Token.EOF ->
    process_in_body t token
  | _ ->
    (match current_node t with
     | Some n when n.Dom.name = "colgroup" ->
       pop_current t;
       t.mode <- Insertion_mode.In_table;
       process_token t token
     | _ ->
       parse_error t "unexpected-token")

(* Token handler for [Insertion_mode.In_table_body].
   [<tr>] opens a row; [<td>]/[<th>] open an implied row and are reprocessed.
   Section end tags and tokens that start a sibling section close the current
   section and return to [In_table]. Everything else falls through to
   [process_in_table]. *)
and process_in_table_body t token =
  match token with
  | Token.Tag { kind = Token.Start; name = "tr"; attrs; _ } ->
    clear_stack_back_to_table_body_context t;
    ignore (insert_element t "tr" ~push:true attrs);
    t.mode <- Insertion_mode.In_row
  | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["th"; "td"] ->
    (* Cell without a row: synthesise <tr>, then reprocess the cell tag *)
    parse_error t "unexpected-start-tag";
    clear_stack_back_to_table_body_context t;
    ignore (insert_element t "tr" ~push:true []);
    t.mode <- Insertion_mode.In_row;
    process_token t token
  | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
    if not (has_element_in_table_scope t name) then
      parse_error t "unexpected-end-tag"
    else begin
      clear_stack_back_to_table_body_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table
    end
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"] ->
    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
      parse_error t "unexpected-start-tag"
    else begin
      clear_stack_back_to_table_body_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name = "table"; _ } ->
    if not (has_element_in_scope_impl t ["tbody"; "tfoot"; "thead"] Constants.table_scope ~check_integration_points:false) then
      parse_error t "unexpected-end-tag"
    else begin
      clear_stack_back_to_table_body_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"; "tr"] ->
    parse_error t "unexpected-end-tag"
  | _ ->
    process_in_table t token

(* Pops the stack of open elements until the current node is named "tbody",
   "tfoot", "thead", "template" or "html" (or the stack is empty). *)
and clear_stack_back_to_table_body_context t =
  let rec loop () =
    match current_node t with
    | Some n when not (List.mem n.Dom.name ["tbody"; "tfoot"; "thead"; "template"; "html"]) ->
      pop_current t;
      loop ()
    | _ -> ()
  in
  loop ()

(* Token handler for [Insertion_mode.In_row].
   [<td>]/[<th>] open a cell and push a formatting marker. [</tr>] closes the
   row. Tokens that end the row implicitly close it, switch to
   [In_table_body] and are reprocessed. Everything else falls through to
   [process_in_table]. *)
and process_in_row t token =
  match token with
  | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["th"; "td"] ->
    clear_stack_back_to_table_row_context t;
    ignore (insert_element t name ~push:true attrs);
    t.mode <- Insertion_mode.In_cell;
    push_formatting_marker t
  | Token.Tag { kind = Token.End; name = "tr"; _ } ->
    if not (has_element_in_table_scope t "tr") then
      parse_error t "unexpected-end-tag"
    else begin
      clear_stack_back_to_table_row_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table_body
    end
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "tfoot"; "thead"; "tr"] ->
    if not (has_element_in_table_scope t "tr") then
      parse_error t "unexpected-start-tag"
    else begin
      clear_stack_back_to_table_row_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table_body;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name = "table"; _ } ->
    if not (has_element_in_table_scope t "tr") then
      parse_error t "unexpected-end-tag"
    else begin
      clear_stack_back_to_table_row_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table_body;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name; _ } when List.mem name ["tbody"; "tfoot"; "thead"] ->
    if not (has_element_in_table_scope t name) then
      parse_error t "unexpected-end-tag"
    else if not (has_element_in_table_scope t "tr") then
      parse_error t "unexpected-end-tag"
    else begin
      clear_stack_back_to_table_row_context t;
      pop_current t;
      t.mode <- Insertion_mode.In_table_body;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"; "td"; "th"] ->
    parse_error t "unexpected-end-tag"
  | _ ->
    process_in_table t token

(* Pops the stack of open elements until the current node is named "tr",
   "template" or "html" (or the stack is empty). *)
and clear_stack_back_to_table_row_context t =
  let rec loop () =
    match current_node t with
    | Some n when not (List.mem n.Dom.name ["tr"; "template"; "html"]) ->
      pop_current t;
      loop ()
    | _ -> ()
  in
  loop ()

(* Token handler for [Insertion_mode.In_cell].
   [</td>]/[</th>] close the matching cell and return to [In_row].
   Table-structure tokens close the open cell via [close_cell] and are then
   reprocessed. Everything else is handled as in body. *)
and process_in_cell t token =
  match token with
  | Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] ->
    if not (has_element_in_table_scope t name) then
      parse_error t "unexpected-end-tag"
    else begin
      generate_implied_end_tags t ();
      (match current_node t with
       | Some n when not (n.Dom.name = name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
       | _ -> ());
      pop_until_html_tag t name;
      clear_active_formatting_to_marker t;
      t.mode <- Insertion_mode.In_row
    end
  | Token.Tag { kind = Token.Start; name; _ }
    when List.mem name ["caption"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] ->
    if not (has_element_in_scope_impl t ["td"; "th"] Constants.table_scope ~check_integration_points:false) then
      parse_error t "unexpected-start-tag"
    else begin
      close_cell t;
      process_token t token
    end
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
    parse_error t "unexpected-end-tag"
  | Token.Tag { kind = Token.End; name; _ }
    when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] ->
    if not (has_element_in_table_scope t name) then
      parse_error t "unexpected-end-tag"
    else begin
      close_cell t;
      process_token t token
    end
  | _ ->
    process_in_body t token

(* Closes the currently open table cell: generates implied end tags, reports
   "end-tag-too-early" when the current node is not an HTML-namespace td/th,
   pops through the td/th, clears active formatting back to the last marker
   and switches to [In_row]. *)
and close_cell t =
  generate_implied_end_tags t ();
  (match current_node t with
   | Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
   | _ -> ());
  pop_until_html_one_of t ["td"; "th"];
  clear_active_formatting_to_marker t;
  t.mode <- Insertion_mode.In_row

and process_in_select t token =
  match token with
  | Token.Character "\x00" ->
    parse_error t "unexpected-null-character"
  | Token.Character data ->
    reconstruct_active_formatting t;
    insert_character t data
  | Token.Comment data ->
    insert_comment t data
  | Token.Doctype _ ->
    parse_error t "unexpected-doctype"
  | Token.Tag { kind = Token.Start; name = "html"; _ } ->
    process_in_body t token
| Token.Tag { kind = Token.Start; name = "option"; attrs; _ } -> 1842 + (match current_node t with 1843 + | Some n when n.Dom.name = "option" -> pop_current t 1844 + | _ -> ()); 1845 + reconstruct_active_formatting t; 1846 + ignore (insert_element t "option" ~push:true attrs) 1847 + | Token.Tag { kind = Token.Start; name = "optgroup"; attrs; _ } -> 1848 + (match current_node t with 1849 + | Some n when n.Dom.name = "option" -> pop_current t 1850 + | _ -> ()); 1851 + (match current_node t with 1852 + | Some n when n.Dom.name = "optgroup" -> pop_current t 1853 + | _ -> ()); 1854 + reconstruct_active_formatting t; 1855 + ignore (insert_element t "optgroup" ~push:true attrs) 1856 + | Token.Tag { kind = Token.Start; name = "hr"; _ } -> 1857 + (match current_node t with 1858 + | Some n when n.Dom.name = "option" -> pop_current t 1859 + | _ -> ()); 1860 + (match current_node t with 1861 + | Some n when n.Dom.name = "optgroup" -> pop_current t 1862 + | _ -> ()); 1863 + ignore (insert_element t "hr" ~push:true []); 1864 + pop_current t 1865 + | Token.Tag { kind = Token.End; name = "optgroup"; _ } -> 1866 + (match t.open_elements with 1867 + | opt :: optg :: _ when opt.Dom.name = "option" && optg.Dom.name = "optgroup" -> 1868 + pop_current t 1869 + | _ -> ()); 1870 + (match current_node t with 1871 + | Some n when n.Dom.name = "optgroup" -> pop_current t 1872 + | _ -> parse_error t "unexpected-end-tag") 1873 + | Token.Tag { kind = Token.End; name = "option"; _ } -> 1874 + (match current_node t with 1875 + | Some n when n.Dom.name = "option" -> pop_current t 1876 + | _ -> parse_error t "unexpected-end-tag") 1877 + | Token.Tag { kind = Token.End; name = "select"; _ } -> 1878 + if not (has_element_in_select_scope t "select") then 1879 + parse_error t "unexpected-end-tag" 1880 + else begin 1881 + pop_until_tag t "select"; 1882 + reset_insertion_mode t 1883 + end 1884 + | Token.Tag { kind = Token.Start; name = "select"; _ } -> 1885 + parse_error t "unexpected-start-tag"; 1886 + 
(* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1887 + pop_until_tag t "select"; 1888 + reset_insertion_mode t 1889 + | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["input"; "textarea"] -> 1890 + parse_error t "unexpected-start-tag"; 1891 + (* Per spec: in IN_SELECT mode, select is always on the stack - just pop *) 1892 + pop_until_tag t "select"; 1893 + reset_insertion_mode t; 1894 + process_token t token 1895 + | Token.Tag { kind = Token.Start; name = "plaintext"; attrs; _ } -> 1896 + (* plaintext is allowed in select - creates element, parser will switch tokenizer to PLAINTEXT mode *) 1897 + reconstruct_active_formatting t; 1898 + ignore (insert_element t "plaintext" ~push:true attrs) 1899 + | Token.Tag { kind = Token.Start; name = "menuitem"; attrs; _ } -> 1900 + (* menuitem is allowed in select *) 1901 + reconstruct_active_formatting t; 1902 + ignore (insert_element t "menuitem" ~push:true attrs) 1903 + | Token.Tag { kind = Token.Start; name = "keygen"; attrs; _ } -> 1904 + (* keygen is handled specially in select - inserted directly *) 1905 + reconstruct_active_formatting t; 1906 + ignore (insert_element t "keygen" attrs) 1907 + (* Void element, don't push to stack *) 1908 + | Token.Tag { kind = Token.Start; name = "svg"; attrs; self_closing } -> 1909 + reconstruct_active_formatting t; 1910 + let node = insert_foreign_element t { kind = Token.Start; name = "svg"; attrs; self_closing } (Some "svg") in 1911 + if not self_closing then t.open_elements <- node :: t.open_elements 1912 + | Token.Tag { kind = Token.Start; name = "math"; attrs; self_closing } -> 1913 + reconstruct_active_formatting t; 1914 + let node = insert_foreign_element t { kind = Token.Start; name = "math"; attrs; self_closing } (Some "mathml") in 1915 + if not self_closing then t.open_elements <- node :: t.open_elements 1916 + | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["script"; "template"] -> 1917 + process_in_head t token 1918 
+ | Token.Tag { kind = Token.End; name = "template"; _ } -> 1919 + process_in_head t token 1920 + (* Allow certain HTML elements in select - newer spec behavior *) 1921 + | Token.Tag { kind = Token.Start; name; attrs; self_closing } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1922 + reconstruct_active_formatting t; 1923 + let node = insert_element t name attrs in 1924 + if not self_closing then t.open_elements <- node :: t.open_elements 1925 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name ["br"; "img"] -> 1926 + reconstruct_active_formatting t; 1927 + ignore (insert_element t name attrs) 1928 + (* Don't push to stack - void elements *) 1929 + (* Handle formatting elements in select *) 1930 + | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Constants.formatting_elements -> 1931 + reconstruct_active_formatting t; 1932 + let node = insert_element t name ~push:true attrs in 1933 + push_formatting_element t node name attrs 1934 + | Token.Tag { kind = Token.End; name; _ } when List.mem name Constants.formatting_elements -> 1935 + (* Find select element and check if formatting element is inside select *) 1936 + let select_idx = ref None in 1937 + let fmt_idx = ref None in 1938 + List.iteri (fun i n -> 1939 + if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1940 + if n.Dom.name = name then fmt_idx := Some i 1941 + ) t.open_elements; 1942 + (match !fmt_idx, !select_idx with 1943 + | Some fi, Some si when fi < si -> 1944 + (* Formatting element is inside select, run adoption agency *) 1945 + adoption_agency t name 1946 + | Some _, Some _ -> 1947 + (* Formatting element is outside select boundary - parse error, ignore *) 1948 + parse_error t "unexpected-end-tag" 1949 + | Some _, None -> 1950 + adoption_agency t name 1951 + | None, _ -> 1952 + parse_error t "unexpected-end-tag") 1953 + (* End tags for HTML elements allowed in select *) 1954 + | Token.Tag { kind = 
Token.End; name; _ } when List.mem name ["p"; "div"; "span"; "button"; "datalist"; "selectedcontent"] -> 1955 + (* Find select and target indices *) 1956 + let select_idx = ref None in 1957 + let target_idx = ref None in 1958 + List.iteri (fun i n -> 1959 + if n.Dom.name = "select" && !select_idx = None then select_idx := Some i; 1960 + if n.Dom.name = name then target_idx := Some i 1961 + ) t.open_elements; 1962 + (* Only pop if target exists and is inside select (lower index = closer to current) *) 1963 + (match !target_idx, !select_idx with 1964 + | Some ti, Some si when ti < si -> 1965 + (* Pop until we reach the target *) 1966 + let rec pop_to_target () = 1967 + match t.open_elements with 1968 + | [] -> () 1969 + | n :: rest -> 1970 + t.open_elements <- rest; 1971 + if n.Dom.name <> name then pop_to_target () 1972 + in 1973 + pop_to_target () 1974 + | Some _, Some _ -> 1975 + parse_error t "unexpected-end-tag" 1976 + | Some _, None -> 1977 + (* No select on stack, just pop to target *) 1978 + let rec pop_to_target () = 1979 + match t.open_elements with 1980 + | [] -> () 1981 + | n :: rest -> 1982 + t.open_elements <- rest; 1983 + if n.Dom.name <> name then pop_to_target () 1984 + in 1985 + pop_to_target () 1986 + | None, _ -> 1987 + parse_error t "unexpected-end-tag") 1988 + | Token.EOF -> 1989 + process_in_body t token 1990 + | _ -> 1991 + parse_error t "unexpected-token-in-select" 1992 + 1993 + and process_in_select_in_table t token = 1994 + match token with 1995 + | Token.Tag { kind = Token.Start; name; _ } 1996 + when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 1997 + parse_error t "unexpected-start-tag"; 1998 + pop_until_tag t "select"; 1999 + reset_insertion_mode t; 2000 + process_token t token 2001 + | Token.Tag { kind = Token.End; name; _ } 2002 + when List.mem name ["caption"; "table"; "tbody"; "tfoot"; "thead"; "tr"; "td"; "th"] -> 2003 + parse_error t "unexpected-end-tag"; 2004 + if has_element_in_table_scope t 
name then begin 2005 + pop_until_tag t "select"; 2006 + reset_insertion_mode t; 2007 + process_token t token 2008 + end 2009 + | _ -> 2010 + process_in_select t token 2011 + 2012 + and process_in_template t token = 2013 + match token with 2014 + | Token.Character _ | Token.Comment _ | Token.Doctype _ -> 2015 + process_in_body t token 2016 + | Token.Tag { kind = Token.Start; name; _ } 2017 + when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"; "noframes"; "script"; "style"; "template"; "title"] -> 2018 + process_in_head t token 2019 + | Token.Tag { kind = Token.End; name = "template"; _ } -> 2020 + process_in_head t token 2021 + | Token.Tag { kind = Token.Start; name; _ } 2022 + when List.mem name ["caption"; "colgroup"; "tbody"; "tfoot"; "thead"] -> 2023 + t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2024 + t.template_modes <- Insertion_mode.In_table :: t.template_modes; 2025 + t.mode <- Insertion_mode.In_table; 2026 + process_token t token 2027 + | Token.Tag { kind = Token.Start; name = "col"; _ } -> 2028 + t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2029 + t.template_modes <- Insertion_mode.In_column_group :: t.template_modes; 2030 + t.mode <- Insertion_mode.In_column_group; 2031 + process_token t token 2032 + | Token.Tag { kind = Token.Start; name = "tr"; _ } -> 2033 + t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2034 + t.template_modes <- Insertion_mode.In_table_body :: t.template_modes; 2035 + t.mode <- Insertion_mode.In_table_body; 2036 + process_token t token 2037 + | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] -> 2038 + t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2039 + t.template_modes <- Insertion_mode.In_row :: t.template_modes; 2040 + t.mode <- Insertion_mode.In_row; 2041 + process_token t token 2042 + | Token.Tag { kind = Token.Start; _ } -> 2043 + t.template_modes <- 
(match t.template_modes with _ :: rest -> rest | [] -> []); 2044 + t.template_modes <- Insertion_mode.In_body :: t.template_modes; 2045 + t.mode <- Insertion_mode.In_body; 2046 + process_token t token 2047 + | Token.Tag { kind = Token.End; _ } -> 2048 + parse_error t "unexpected-end-tag" 2049 + | Token.EOF -> 2050 + if not (List.exists (fun n -> n.Dom.name = "template" && is_in_html_namespace n) t.open_elements) then 2051 + () (* Stop parsing *) 2052 + else begin 2053 + parse_error t "expected-closing-tag-but-got-eof"; 2054 + pop_until_html_tag t "template"; 2055 + clear_active_formatting_to_marker t; 2056 + t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []); 2057 + reset_insertion_mode t; 2058 + process_token t token 2059 + end 2060 + 2061 + and process_after_body t token = 2062 + match token with 2063 + | Token.Character data when is_whitespace data -> 2064 + process_in_body t token 2065 + | Token.Comment data -> 2066 + (* Insert as last child of html element - html is at bottom of stack *) 2067 + let html_opt = List.find_opt (fun n -> n.Dom.name = "html") t.open_elements in 2068 + (match html_opt with 2069 + | Some html -> Dom.append_child html (Dom.create_comment data) 2070 + | None -> ()) 2071 + | Token.Doctype _ -> 2072 + parse_error t "unexpected-doctype" 2073 + | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2074 + process_in_body t token 2075 + | Token.Tag { kind = Token.End; name = "html"; _ } -> 2076 + if t.fragment_context <> None then 2077 + parse_error t "unexpected-end-tag" 2078 + else 2079 + t.mode <- Insertion_mode.After_after_body 2080 + | Token.EOF -> 2081 + () (* Stop parsing *) 2082 + | _ -> 2083 + parse_error t "unexpected-token-after-body"; 2084 + t.mode <- Insertion_mode.In_body; 2085 + process_token t token 2086 + 2087 + and process_in_frameset t token = 2088 + match token with 2089 + | Token.Character data -> 2090 + (* Extract only whitespace characters and insert them *) 2091 + let whitespace = 
String.to_seq data 2092 + |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2093 + |> String.of_seq in 2094 + if whitespace <> "" then insert_character t whitespace; 2095 + if not (is_whitespace data) then 2096 + parse_error t "unexpected-char-in-frameset" 2097 + | Token.Comment data -> 2098 + insert_comment t data 2099 + | Token.Doctype _ -> 2100 + parse_error t "unexpected-doctype" 2101 + | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2102 + process_in_body t token 2103 + | Token.Tag { kind = Token.Start; name = "frameset"; attrs; _ } -> 2104 + ignore (insert_element t "frameset" ~push:true attrs) 2105 + | Token.Tag { kind = Token.End; name = "frameset"; _ } -> 2106 + (match current_node t with 2107 + | Some n when n.Dom.name = "html" -> parse_error t "unexpected-end-tag" 2108 + | _ -> 2109 + pop_current t; 2110 + if t.fragment_context = None then 2111 + (match current_node t with 2112 + | Some n when n.Dom.name <> "frameset" -> t.mode <- Insertion_mode.After_frameset 2113 + | _ -> ())) 2114 + | Token.Tag { kind = Token.Start; name = "frame"; attrs; _ } -> 2115 + ignore (insert_element t "frame" ~push:true attrs); 2116 + pop_current t 2117 + | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2118 + process_in_head t token 2119 + | Token.EOF -> 2120 + (match current_node t with 2121 + | Some n when n.Dom.name <> "html" -> parse_error t "expected-closing-tag-but-got-eof" 2122 + | _ -> ()) 2123 + | _ -> 2124 + parse_error t "unexpected-token-in-frameset" 2125 + 2126 + and process_after_frameset t token = 2127 + match token with 2128 + | Token.Character data -> 2129 + (* Extract only whitespace characters and insert them *) 2130 + let whitespace = String.to_seq data 2131 + |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2132 + |> String.of_seq in 2133 + if whitespace <> "" then insert_character t whitespace; 2134 + if not (is_whitespace data) then 2135 + parse_error t "unexpected-char-after-frameset" 2136 + | 
Token.Comment data -> 2137 + insert_comment t data 2138 + | Token.Doctype _ -> 2139 + parse_error t "unexpected-doctype" 2140 + | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2141 + process_in_body t token 2142 + | Token.Tag { kind = Token.End; name = "html"; _ } -> 2143 + t.mode <- Insertion_mode.After_after_frameset 2144 + | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2145 + process_in_head t token 2146 + | Token.EOF -> 2147 + () (* Stop parsing *) 2148 + | _ -> 2149 + parse_error t "unexpected-token-after-frameset" 2150 + 2151 + and process_after_after_body t token = 2152 + match token with 2153 + | Token.Comment data -> 2154 + insert_comment_to_document t data 2155 + | Token.Doctype _ -> 2156 + process_in_body t token 2157 + | Token.Character data when is_whitespace data -> 2158 + process_in_body t token 2159 + | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2160 + process_in_body t token 2161 + | Token.EOF -> 2162 + () (* Stop parsing *) 2163 + | _ -> 2164 + parse_error t "unexpected-token-after-after-body"; 2165 + t.mode <- Insertion_mode.In_body; 2166 + process_token t token 2167 + 2168 + and process_after_after_frameset t token = 2169 + match token with 2170 + | Token.Comment data -> 2171 + insert_comment_to_document t data 2172 + | Token.Doctype _ -> 2173 + process_in_body t token 2174 + | Token.Character data -> 2175 + (* Extract only whitespace characters and process using in_body rules *) 2176 + let whitespace = String.to_seq data 2177 + |> Seq.filter (fun c -> List.mem c ['\t'; '\n'; '\x0C'; '\r'; ' ']) 2178 + |> String.of_seq in 2179 + if whitespace <> "" then process_in_body t (Token.Character whitespace); 2180 + if not (is_whitespace data) then 2181 + parse_error t "unexpected-char-after-after-frameset" 2182 + | Token.Tag { kind = Token.Start; name = "html"; _ } -> 2183 + process_in_body t token 2184 + | Token.EOF -> 2185 + () (* Stop parsing *) 2186 + | Token.Tag { kind = Token.Start; name = "noframes"; _ } -> 2187 + 
process_in_head t token 2188 + | _ -> 2189 + parse_error t "unexpected-token-after-after-frameset" 2190 + 2191 + and process_token t token = 2192 + (* Check for HTML integration points (SVG foreignObject, desc, title) *) 2193 + let is_html_integration_point node = 2194 + (* SVG foreignObject, desc, and title are always HTML integration points *) 2195 + if node.Dom.namespace = Some "svg" && 2196 + List.mem node.Dom.name Constants.svg_html_integration then true 2197 + (* annotation-xml is an HTML integration point only with specific encoding values *) 2198 + else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2199 + match List.assoc_opt "encoding" node.Dom.attrs with 2200 + | Some enc -> 2201 + let enc_lower = String.lowercase_ascii enc in 2202 + enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2203 + | None -> false 2204 + else false 2205 + in 2206 + (* Check for MathML text integration points *) 2207 + let is_mathml_text_integration_point node = 2208 + node.Dom.namespace = Some "mathml" && 2209 + List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"] 2210 + in 2211 + (* Foreign content handling *) 2212 + let in_foreign = 2213 + match adjusted_current_node t with 2214 + | None -> false 2215 + | Some node -> 2216 + if is_in_html_namespace node then false 2217 + else begin 2218 + (* At HTML integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2219 + if is_html_integration_point node then begin 2220 + match token with 2221 + | Token.Character _ -> false 2222 + | Token.Tag { kind = Token.Start; _ } -> false 2223 + | _ -> true 2224 + end 2225 + (* At MathML text integration points, characters and start tags (except mglyph/malignmark) use HTML rules *) 2226 + else if is_mathml_text_integration_point node then begin 2227 + match token with 2228 + | Token.Character _ -> false 2229 + | Token.Tag { kind = Token.Start; name; _ } -> 2230 + name = "mglyph" || name = "malignmark" 2231 + | _ -> 
true 2232 + end 2233 + (* Special case: <svg> inside annotation-xml uses HTML rules (creates svg in svg namespace) *) 2234 + else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then begin 2235 + match token with 2236 + | Token.Tag { kind = Token.Start; name; _ } when String.lowercase_ascii name = "svg" -> false 2237 + | _ -> true 2238 + end 2239 + (* Not at integration point - use foreign content rules *) 2240 + (* Breakout handling is done inside process_foreign_content *) 2241 + else true 2242 + end 2243 + in 2244 + 2245 + (* Check if at HTML integration point for special table mode handling *) 2246 + let at_integration_point = 2247 + match adjusted_current_node t with 2248 + | Some node -> 2249 + is_html_integration_point node || is_mathml_text_integration_point node 2250 + | None -> false 2251 + in 2252 + 2253 + if in_foreign then 2254 + process_foreign_content t token 2255 + else if at_integration_point then begin 2256 + (* At integration points, check if in table mode without table in scope *) 2257 + let is_table_mode = List.mem t.mode [In_table; In_table_body; In_row; In_cell; In_caption; In_column_group] in 2258 + let has_table = has_element_in_table_scope t "table" in 2259 + if is_table_mode && not has_table then begin 2260 + match token with 2261 + | Token.Tag { kind = Token.Start; _ } -> 2262 + (* Temporarily use IN_BODY for start tags in table mode without table *) 2263 + let saved_mode = t.mode in 2264 + t.mode <- In_body; 2265 + process_by_mode t token; 2266 + if t.mode = In_body then t.mode <- saved_mode 2267 + | _ -> process_by_mode t token 2268 + end else 2269 + process_by_mode t token 2270 + end else 2271 + process_by_mode t token 2272 + 2273 + (* Pop foreign elements until HTML or integration point *) 2274 + and pop_until_html_or_integration_point t = 2275 + let is_html_integration_point node = 2276 + (* SVG foreignObject, desc, and title are always HTML integration points *) 2277 + if node.Dom.namespace = Some "svg" && 
2278 + List.mem node.Dom.name Constants.svg_html_integration then true 2279 + (* annotation-xml is an HTML integration point only with specific encoding values *) 2280 + else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then 2281 + match List.assoc_opt "encoding" node.Dom.attrs with 2282 + | Some enc -> 2283 + let enc_lower = String.lowercase_ascii enc in 2284 + enc_lower = "text/html" || enc_lower = "application/xhtml+xml" 2285 + | None -> false 2286 + else false 2287 + in 2288 + (* Get fragment context element - only for foreign namespace fragment contexts *) 2289 + let fragment_context_elem = t.fragment_context_element in 2290 + let rec pop () = 2291 + match current_node t with 2292 + | None -> () 2293 + | Some node -> 2294 + if is_in_html_namespace node then () 2295 + else if is_html_integration_point node then () 2296 + (* Don't pop past fragment context element *) 2297 + else (match fragment_context_elem with 2298 + | Some ctx when node == ctx -> () 2299 + | _ -> 2300 + pop_current t; 2301 + pop ()) 2302 + in 2303 + pop () 2304 + 2305 + (* Foreign breakout elements - these break out of foreign content *) 2306 + and is_foreign_breakout_element name = 2307 + List.mem (String.lowercase_ascii name) 2308 + ["b"; "big"; "blockquote"; "body"; "br"; "center"; "code"; "dd"; "div"; "dl"; "dt"; 2309 + "em"; "embed"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "head"; "hr"; "i"; "img"; "li"; 2310 + "listing"; "menu"; "meta"; "nobr"; "ol"; "p"; "pre"; "ruby"; "s"; "small"; "span"; 2311 + "strong"; "strike"; "sub"; "sup"; "table"; "tt"; "u"; "ul"; "var"] 2312 + 2313 + and process_foreign_content t token = 2314 + match token with 2315 + | Token.Character "\x00" -> 2316 + parse_error t "unexpected-null-character"; 2317 + insert_character t "\xEF\xBF\xBD" 2318 + | Token.Character data when is_whitespace data -> 2319 + insert_character t data 2320 + | Token.Character data -> 2321 + insert_character t data; 2322 + t.frameset_ok <- false 2323 + | 
Token.Comment data -> 2324 + insert_comment t data 2325 + | Token.Doctype _ -> 2326 + parse_error t "unexpected-doctype" 2327 + | Token.Tag { kind = Token.Start; name; _ } when is_foreign_breakout_element name -> 2328 + (* Breakout from foreign content - pop until HTML or integration point, reprocess in HTML mode *) 2329 + parse_error t "unexpected-html-element-in-foreign-content"; 2330 + pop_until_html_or_integration_point t; 2331 + reset_insertion_mode t; 2332 + (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2333 + process_by_mode t token 2334 + | Token.Tag { kind = Token.Start; name = "font"; attrs; _ } 2335 + when List.exists (fun (n, _) -> 2336 + let n = String.lowercase_ascii n in 2337 + n = "color" || n = "face" || n = "size") attrs -> 2338 + (* font with color/face/size breaks out of foreign content *) 2339 + parse_error t "unexpected-html-element-in-foreign-content"; 2340 + pop_until_html_or_integration_point t; 2341 + reset_insertion_mode t; 2342 + process_by_mode t token 2343 + | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 2344 + let name = 2345 + match adjusted_current_node t with 2346 + | Some n when n.Dom.namespace = Some "svg" -> Constants.adjust_svg_tag_name name 2347 + | _ -> name 2348 + in 2349 + let attrs = 2350 + match adjusted_current_node t with 2351 + | Some n when n.Dom.namespace = Some "svg" -> 2352 + Constants.adjust_svg_attrs (Constants.adjust_foreign_attrs attrs) 2353 + | Some n when n.Dom.namespace = Some "mathml" -> 2354 + Constants.adjust_mathml_attrs (Constants.adjust_foreign_attrs attrs) 2355 + | _ -> Constants.adjust_foreign_attrs attrs 2356 + in 2357 + let namespace = 2358 + match adjusted_current_node t with 2359 + | Some n -> n.Dom.namespace 2360 + | None -> None 2361 + in 2362 + let node = insert_element t name ~namespace attrs in 2363 + t.open_elements <- node :: t.open_elements; 2364 + if self_closing then pop_current t 2365 + | Token.Tag { kind = Token.End; name; _ } when 
List.mem (String.lowercase_ascii name) ["br"; "p"] -> 2366 + (* Special case: </br> and </p> end tags trigger breakout from foreign content *) 2367 + parse_error t "unexpected-html-element-in-foreign-content"; 2368 + pop_until_html_or_integration_point t; 2369 + reset_insertion_mode t; 2370 + (* Use process_by_mode to force HTML mode processing and avoid infinite loop *) 2371 + process_by_mode t token 2372 + | Token.Tag { kind = Token.End; name; _ } -> 2373 + (* Find matching element per WHATWG spec for foreign content *) 2374 + let is_fragment_context n = 2375 + match t.fragment_context_element with 2376 + | Some ctx -> n == ctx 2377 + | None -> false 2378 + in 2379 + let name_lower = String.lowercase_ascii name in 2380 + (* Walk through stack looking for matching element *) 2381 + let rec find_and_process first_node idx = function 2382 + | [] -> () (* Stack exhausted - ignore tag *) 2383 + | n :: rest -> 2384 + let node_name_lower = String.lowercase_ascii n.Dom.name in 2385 + let is_html = is_in_html_namespace n in 2386 + let name_matches = node_name_lower = name_lower in 2387 + 2388 + (* If first node doesn't match tag name, it's a parse error *) 2389 + if first_node && not name_matches then 2390 + parse_error t "unexpected-end-tag-in-foreign-content"; 2391 + 2392 + (* Check if this node matches the end tag *) 2393 + if name_matches then begin 2394 + (* Fragment context check *) 2395 + if is_fragment_context n then 2396 + parse_error t "unexpected-end-tag-in-fragment-context" 2397 + (* If matched element is in HTML namespace, reprocess via HTML mode *) 2398 + else if is_html then 2399 + process_by_mode t token 2400 + (* Otherwise it's a foreign element - pop everything from this point up *) 2401 + else begin 2402 + (* Pop all elements from current down to and including the matched element *) 2403 + let rec pop_to_idx current_idx = 2404 + if current_idx >= idx then begin 2405 + pop_current t; 2406 + pop_to_idx (current_idx - 1) 2407 + end 2408 + in 2409 + 
pop_to_idx (List.length t.open_elements - 1) 2410 + end 2411 + end 2412 + (* If we hit an HTML element that doesn't match, process via HTML mode *) 2413 + else if is_html then 2414 + process_by_mode t token 2415 + (* Continue searching in the stack *) 2416 + else 2417 + find_and_process false (idx - 1) rest 2418 + in 2419 + find_and_process true (List.length t.open_elements - 1) t.open_elements 2420 + | Token.EOF -> 2421 + process_by_mode t token 2422 + 2423 + and process_by_mode t token = 2424 + match t.mode with 2425 + | Insertion_mode.Initial -> process_initial t token 2426 + | Insertion_mode.Before_html -> process_before_html t token 2427 + | Insertion_mode.Before_head -> process_before_head t token 2428 + | Insertion_mode.In_head -> process_in_head t token 2429 + | Insertion_mode.In_head_noscript -> process_in_head_noscript t token 2430 + | Insertion_mode.After_head -> process_after_head t token 2431 + | Insertion_mode.In_body -> process_in_body t token 2432 + | Insertion_mode.Text -> process_text t token 2433 + | Insertion_mode.In_table -> process_in_table t token 2434 + | Insertion_mode.In_table_text -> process_in_table_text t token 2435 + | Insertion_mode.In_caption -> process_in_caption t token 2436 + | Insertion_mode.In_column_group -> process_in_column_group t token 2437 + | Insertion_mode.In_table_body -> process_in_table_body t token 2438 + | Insertion_mode.In_row -> process_in_row t token 2439 + | Insertion_mode.In_cell -> process_in_cell t token 2440 + | Insertion_mode.In_select -> process_in_select t token 2441 + | Insertion_mode.In_select_in_table -> process_in_select_in_table t token 2442 + | Insertion_mode.In_template -> process_in_template t token 2443 + | Insertion_mode.After_body -> process_after_body t token 2444 + | Insertion_mode.In_frameset -> process_in_frameset t token 2445 + | Insertion_mode.After_frameset -> process_after_frameset t token 2446 + | Insertion_mode.After_after_body -> process_after_after_body t token 2447 + | 
Insertion_mode.After_after_frameset -> process_after_after_frameset t token 2448 + 2449 + (* Populate selectedcontent elements with content from selected option *) 2450 + let find_elements name node = 2451 + let result = ref [] in 2452 + let rec find n = 2453 + if n.Dom.name = name then result := n :: !result; 2454 + List.iter find n.Dom.children 2455 + in 2456 + find node; 2457 + List.rev !result (* Reverse to maintain document order *) 2458 + 2459 + let find_element name node = 2460 + let rec find n = 2461 + if n.Dom.name = name then Some n 2462 + else 2463 + List.find_map find n.Dom.children 2464 + in 2465 + find node 2466 + 2467 + let populate_selectedcontent document = 2468 + let selects = find_elements "select" document in 2469 + List.iter (fun select -> 2470 + match find_element "selectedcontent" select with 2471 + | None -> () 2472 + | Some selectedcontent -> 2473 + let options = find_elements "option" select in 2474 + if options <> [] then begin 2475 + (* Find selected option or use first *) 2476 + let selected_option = 2477 + match List.find_opt (fun opt -> Dom.has_attr opt "selected") options with 2478 + | Some opt -> opt 2479 + | None -> List.hd options 2480 + in 2481 + (* Clone children from selected option to selectedcontent *) 2482 + List.iter (fun child -> 2483 + let cloned = Dom.clone ~deep:true child in 2484 + Dom.append_child selectedcontent cloned 2485 + ) selected_option.Dom.children 2486 + end 2487 + ) selects 2488 + 2489 + let finish t = 2490 + (* Populate selectedcontent elements *) 2491 + populate_selectedcontent t.document; 2492 + (* For fragment parsing, remove the html wrapper and promote children *) 2493 + if t.fragment_context <> None then begin 2494 + match t.document.Dom.children with 2495 + | [root] when root.Dom.name = "html" -> 2496 + (* Move context element's children to root if applicable *) 2497 + (match t.fragment_context_element with 2498 + | Some ctx_elem -> 2499 + (match ctx_elem.Dom.parent with 2500 + | Some p when p == 
root -> 2501 + let ctx_children = ctx_elem.Dom.children in 2502 + List.iter (fun child -> 2503 + Dom.remove_child ctx_elem child; 2504 + Dom.append_child root child 2505 + ) ctx_children; 2506 + Dom.remove_child root ctx_elem 2507 + | _ -> ()) 2508 + | None -> ()); 2509 + (* Promote root's children to document - preserve order *) 2510 + let children_copy = root.Dom.children in 2511 + List.iter (fun child -> 2512 + Dom.remove_child root child; 2513 + Dom.append_child t.document child 2514 + ) children_copy; 2515 + Dom.remove_child t.document root 2516 + | _ -> () 2517 + end; 2518 + t.document 2519 + 2520 + let get_errors t = List.rev t.errors
+4
lib/selector/dune
···
; CSS selector library, published as html5rw.selector.
; Depends on the html5rw.dom library and on re.
(library
 (public_name html5rw.selector)
 (name html5rw_selector)
 (libraries
  html5rw.dom
  re))
+12
lib/selector/html5rw_selector.ml
···
(* html5rw.selector - CSS selector engine.
   This module only re-exports names from the implementation modules. *)

(* Implementation modules exposed under short names *)
module Token = Selector_token
module Ast = Selector_ast

(* Rebinding of the lexer's exception: both names denote the same exception *)
exception Selector_error = Selector_lexer.Selector_error

(* Alias of Selector_parser.parse_selector *)
let parse = Selector_parser.parse_selector

(* Alias of Selector_match.matches *)
let matches = Selector_match.matches

(* Alias of Selector_match.query *)
let query = Selector_match.query
+47
lib/selector/selector_ast.ml
···
(* CSS selector AST types *)

(* Kind of a simple selector *)
type simple_selector_type =
  | Type_tag
  | Type_id
  | Type_class
  | Type_universal
  | Type_attr
  | Type_pseudo

(* One simple selector; every field except the kind is optional *)
type simple_selector = {
  selector_type : simple_selector_type;
  name : string option;
  operator : string option;
  value : string option;
  arg : string option;
}

(* A group of simple selectors *)
type compound_selector = {
  selectors : simple_selector list;
}

(* Compound selectors joined by combinators *)
type complex_selector = {
  parts : (string option * compound_selector) list;
  (* List of (combinator, compound_selector) pairs.
     First element has combinator = None *)
}

(* A list of complex selectors. Its field shares the name [selectors] with
   compound_selector, so record expressions need a type annotation. *)
type selector_list = {
  selectors : complex_selector list;
}

(* Any selector, from a single simple selector up to a selector list *)
type selector =
  | Simple of simple_selector
  | Compound of compound_selector
  | Complex of complex_selector
  | List of selector_list

(* Constructors *)

(* Build a simple selector; omitted optional arguments become None *)
let make_simple selector_type ?name ?operator ?value ?arg () : simple_selector =
  { selector_type = selector_type;
    name = name;
    operator = operator;
    value = value;
    arg = arg }

(* Wrap a list of simple selectors *)
let make_compound simple_selectors : compound_selector =
  { selectors = simple_selectors }

(* Wrap a list of (combinator, compound) pairs *)
let make_complex combinator_pairs : complex_selector =
  { parts = combinator_pairs }

(* Wrap a list of complex selectors *)
let make_list complex_selectors : selector_list =
  { selectors = complex_selectors }
+195
lib/selector/selector_lexer.ml
···
(* CSS selector lexer

   Turns a selector string into a flat list of [Selector_token.t], ending
   with [EOF]. Malformed input raises [Selector_error]. *)

exception Selector_error of string

(* Cursor over the input string. *)
type t = {
  input : string;
  len : int;
  mutable pos : int;
}

let create input = { input; len = String.length input; pos = 0 }

(* Character under the cursor, or [None] at end of input. *)
let peek t =
  if t.pos < t.len then Some t.input.[t.pos]
  else None

(* Move one character forward; a no-op at end of input. *)
let advance t =
  if t.pos < t.len then t.pos <- t.pos + 1

let consume t =
  let c = peek t in
  advance t;
  c

let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C'

(* Any byte above 127 is accepted as a name character, so UTF-8 encoded
   names pass through untouched. *)
let is_name_start c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || Char.code c > 127

let is_name_char c =
  is_name_start c || (c >= '0' && c <= '9')

let skip_whitespace t =
  while t.pos < t.len && is_whitespace t.input.[t.pos] do
    advance t
  done

(* Longest run of name characters at the cursor; may be empty. *)
let read_name t =
  let start = t.pos in
  while t.pos < t.len && is_name_char t.input.[t.pos] do
    advance t
  done;
  String.sub t.input start (t.pos - start)

(* Read a quoted string whose opening [quote] is under the cursor. A
   backslash makes the following character literal.
   Raises [Selector_error] if the string or an escape is unterminated. *)
let read_string t quote =
  advance t; (* Skip opening quote *)
  let buf = Buffer.create 32 in
  let rec loop () =
    match peek t with
    | None -> raise (Selector_error "Unterminated string")
    | Some c when c = quote -> advance t
    | Some '\\' ->
      advance t;
      (match peek t with
       | Some c -> Buffer.add_char buf c; advance t; loop ()
       | None -> raise (Selector_error "Unterminated escape"))
    | Some c ->
      Buffer.add_char buf c;
      advance t;
      loop ()
  in
  loop ();
  Buffer.contents buf

(* Everything up to the next whitespace or ']'. *)
let read_unquoted_attr_value t =
  let start = t.pos in
  while t.pos < t.len &&
        (let c = t.input.[t.pos] in
         not (is_whitespace c) && c <> ']') do
    advance t
  done;
  String.sub t.input start (t.pos - start)

(* Attribute value at the cursor, quoted with either quote style or bare. *)
let read_attr_value t =
  match peek t with
  | Some ('"' | '\'' as quote) -> read_string t quote
  | _ -> read_unquoted_attr_value t

(* [tokenize input] lexes a whole selector.

   Whitespace between two compound selectors becomes [Combinator " "];
   whitespace around '>', '+', '~' and ',' is dropped. Tag names and
   pseudo-class names are lower-cased; attribute names are emitted as [Tag]
   tokens with their case preserved. A pseudo-class argument is emitted as a
   single trimmed [String] token (omitted when empty).

   Raises [Selector_error] on any unexpected character. *)
let tokenize input =
  let t = create input in
  let tokens = ref [] in
  let push tok = tokens := tok :: !tokens in
  let pending_ws = ref false in
  (* Require [c] under the cursor and step over it. *)
  let expect_char c msg =
    if peek t <> Some c then raise (Selector_error msg);
    advance t
  in
  (* Shared tail of "[name op value]": value, optional whitespace, ']'. *)
  let read_value_and_close () =
    skip_whitespace t;
    push (Selector_token.String (read_attr_value t));
    skip_whitespace t;
    expect_char ']' "Expected ]";
    push Selector_token.Attr_end
  in

  while t.pos < t.len do
    let c = t.input.[t.pos] in

    if is_whitespace c then begin
      pending_ws := true;
      skip_whitespace t
    end else if c = '>' || c = '+' || c = '~' then begin
      (* An explicit combinator absorbs the whitespace around it. *)
      pending_ws := false;
      advance t;
      skip_whitespace t;
      push (Selector_token.Combinator (String.make 1 c))
    end else begin
      (* Whitespace only counts as a descendant combinator when it sits
         between two selectors, not at the start or before a comma. *)
      if !pending_ws && !tokens <> [] && c <> ',' then
        push (Selector_token.Combinator " ");
      pending_ws := false;

      match c with
      | '*' ->
        advance t;
        push Selector_token.Universal
      | '#' ->
        advance t;
        let name = read_name t in
        if name = "" then raise (Selector_error "Expected identifier after #");
        push (Selector_token.Id name)
      | '.' ->
        advance t;
        let name = read_name t in
        if name = "" then raise (Selector_error "Expected identifier after .");
        push (Selector_token.Class name)
      | '[' ->
        advance t;
        push Selector_token.Attr_start;
        skip_whitespace t;
        let attr_name = read_name t in
        if attr_name = "" then raise (Selector_error "Expected attribute name");
        push (Selector_token.Tag attr_name);
        skip_whitespace t;
        (match peek t with
         | Some ']' ->
           advance t;
           push Selector_token.Attr_end
         | Some '=' ->
           advance t;
           push (Selector_token.Attr_op "=");
           read_value_and_close ()
         | Some ('~' | '|' | '^' | '$' | '*' as op_c) ->
           advance t;
           if peek t <> Some '=' then
             raise (Selector_error ("Expected = after " ^ String.make 1 op_c));
           advance t;
           push (Selector_token.Attr_op (String.make 1 op_c ^ "="));
           read_value_and_close ()
         | _ -> raise (Selector_error "Unexpected character in attribute selector"))
      | ',' ->
        advance t;
        skip_whitespace t;
        push Selector_token.Comma
      | ':' ->
        advance t;
        push Selector_token.Colon;
        let name = read_name t in
        if name = "" then raise (Selector_error "Expected pseudo-class name");
        (* Pseudo-class names are ASCII case-insensitive; the matcher
           compares against lower-case names, so fold here like tag names. *)
        push (Selector_token.Tag (String.lowercase_ascii name));

        if peek t = Some '(' then begin
          advance t;
          push Selector_token.Paren_open;
          skip_whitespace t;
          (* Read argument until the matching closing paren, which is left
             under the cursor so it can be checked below. *)
          let depth = ref 1 in
          let start = t.pos in
          while !depth > 0 && t.pos < t.len do
            match t.input.[t.pos] with
            | '(' -> incr depth; advance t
            | ')' -> decr depth; if !depth > 0 then advance t
            | _ -> advance t
          done;
          let arg = String.trim (String.sub t.input start (t.pos - start)) in
          if arg <> "" then push (Selector_token.String arg);
          expect_char ')' "Expected )";
          push Selector_token.Paren_close
        end
      | _ when is_name_start c ->
        let name = read_name t in
        push (Selector_token.Tag (String.lowercase_ascii name))
      | _ ->
        raise (Selector_error ("Unexpected character: " ^ String.make 1 c))
    end
  done;

  push Selector_token.EOF;
  List.rev !tokens
+308
lib/selector/selector_match.ml
···
(* CSS selector matching *)

module Dom = Html5rw_dom
open Selector_ast

(* Element nodes are all nodes except text, comment, document,
   document-fragment and doctype nodes, recognised by their reserved names. *)
let is_element node =
  let name = node.Dom.name in
  name <> "#text" && name <> "#comment" && name <> "#document" &&
  name <> "#document-fragment" && name <> "!doctype"

let get_element_children node =
  List.filter is_element node.Dom.children

(* Closest preceding element sibling of [node], skipping non-elements. *)
let get_previous_sibling node =
  match node.Dom.parent with
  | None -> None
  | Some parent ->
    let rec find_prev prev = function
      | [] -> None
      | n :: rest ->
        if n == node then prev
        else if is_element n then find_prev (Some n) rest
        else find_prev prev rest
    in
    find_prev None parent.Dom.children

let is_first_child node =
  match node.Dom.parent with
  | None -> false
  | Some parent ->
    (match get_element_children parent with
     | first :: _ -> first == node
     | [] -> false)

let is_last_child node =
  match node.Dom.parent with
  | None -> false
  | Some parent ->
    (match List.rev (get_element_children parent) with
     | last :: _ -> last == node
     | [] -> false)

let is_first_of_type node =
  match node.Dom.parent with
  | None -> false
  | Some parent ->
    let name = String.lowercase_ascii node.Dom.name in
    let rec find = function
      | [] -> false
      | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node
      | _ :: rest -> find rest
    in
    find (get_element_children parent)

let is_last_of_type node =
  match node.Dom.parent with
  | None -> false
  | Some parent ->
    let name = String.lowercase_ascii node.Dom.name in
    let rec find last = function
      | [] -> (match last with Some l -> l == node | None -> false)
      | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest
      | _ :: rest -> find last rest
    in
    find None (get_element_children parent)

(* 1-based position of [node] among its parent's element children;
   0 when there is no parent or the node is not found. *)
let get_index node =
  match node.Dom.parent with
  | None -> 0
  | Some parent ->
    let rec find idx = function
      | [] -> 0
      | n :: _ when n == node -> idx
      | _ :: rest -> find (idx + 1) rest
    in
    find 1 (get_element_children parent)

(* Like [get_index] but counting only siblings with the same name. *)
let get_type_index node =
  match node.Dom.parent with
  | None -> 0
  | Some parent ->
    let name = String.lowercase_ascii node.Dom.name in
    let rec find idx = function
      | [] -> 0
      | n :: _ when n == node -> idx
      | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest
      | _ :: rest -> find idx rest
    in
    find 1 (get_element_children parent)

(* Parse nth expression: "odd", "even", "3", "2n+1", etc.
   Returns [Some (a, b)] for "an+b" and [None] when the expression is not
   well formed. A malformed coefficient rejects the whole expression rather
   than being read as 0. *)
let parse_nth expr =
  let expr = String.lowercase_ascii (String.trim expr) in
  if expr = "odd" then Some (2, 1)
  else if expr = "even" then Some (2, 0)
  else
    let expr = String.concat "" (String.split_on_char ' ' expr) in
    match String.split_on_char 'n' expr with
    | [ constant ] ->
      (* No 'n' at all: a plain index. *)
      Option.map (fun b -> (0, b)) (int_of_string_opt constant)
    | [ a_part; b_part ] ->
      let a =
        match a_part with
        | "" | "+" -> Some 1
        | "-" -> Some (-1)
        | digits -> int_of_string_opt digits
      in
      let b =
        if b_part = "" then Some 0
        else if b_part.[0] = '+' || b_part.[0] = '-' then int_of_string_opt b_part
        else None (* the offset must carry an explicit sign, as in "2n+1" *)
      in
      (match a, b with
       | Some a, Some b -> Some (a, b)
       | _ -> None)
    | _ -> None

(* Does [index] equal a*n + b for some integer n >= 0? *)
let matches_nth index a b =
  if a = 0 then index = b
  else
    let diff = index - b in
    if a > 0 then diff >= 0 && diff mod a = 0
    else diff <= 0 && diff mod a = 0

let is_ascii_whitespace c =
  c = ' ' || c = '\t' || c = '\n' || c = '\r' || c = '\x0C'

(* Split on runs of ASCII whitespace, never producing empty words. Used for
   class lists and the "~=" operator, whose values are whitespace-separated
   token sets rather than space-separated strings. *)
let split_on_whitespace s =
  let words = ref [] in
  let buf = Buffer.create 16 in
  let flush () =
    if Buffer.length buf > 0 then begin
      words := Buffer.contents buf :: !words;
      Buffer.clear buf
    end
  in
  String.iter
    (fun c -> if is_ascii_whitespace c then flush () else Buffer.add_char buf c)
    s;
  flush ();
  List.rev !words

let has_prefix ~prefix s =
  let n = String.length prefix in
  String.length s >= n && String.sub s 0 n = prefix

let has_suffix ~suffix s =
  let n = String.length suffix in
  let len = String.length s in
  len >= n && String.sub s (len - n) n = suffix

(* Attribute selector test. Attribute names compare case-insensitively;
   values compare exactly. Without an operator only presence is required. *)
let matches_attribute node attr_name operator value =
  let wanted = String.lowercase_ascii attr_name in
  let actual =
    List.find_map
      (fun (k, v) -> if String.lowercase_ascii k = wanted then Some v else None)
      node.Dom.attrs
  in
  match actual, operator with
  | None, _ -> false
  | Some _, None -> true
  | Some attr_value, Some op ->
    let value = Option.value value ~default:"" in
    (match op with
     | "=" -> attr_value = value
     | "~=" ->
       (* Words are never empty, so an empty or whitespace-containing value
          cannot match. *)
       List.mem value (split_on_whitespace attr_value)
     | "|=" -> attr_value = value || has_prefix ~prefix:(value ^ "-") attr_value
     | "^=" -> value <> "" && has_prefix ~prefix:value attr_value
     | "$=" -> value <> "" && has_suffix ~suffix:value attr_value
     | "*=" -> value <> "" && Re.execp (Re.compile (Re.str value)) attr_value
     | _ -> false)

let rec matches_simple node (selector : simple_selector) =
  if not (is_element node) then false
  else
    match selector.selector_type, selector.name with
    | Type_universal, _ -> true
    | Type_tag, Some name ->
      String.lowercase_ascii node.Dom.name = String.lowercase_ascii name
    | Type_id, Some id ->
      (match Dom.get_attr node "id" with
       | Some node_id -> node_id = id
       | None -> false)
    | Type_class, Some cls ->
      (match Dom.get_attr node "class" with
       | Some class_attr -> List.mem cls (split_on_whitespace class_attr)
       | None -> false)
    | Type_attr, Some attr_name ->
      matches_attribute node attr_name selector.operator selector.value
    | Type_pseudo, _ -> matches_pseudo node selector
    | (Type_tag | Type_id | Type_class | Type_attr), None -> false

(* Pseudo-class test; unknown pseudo-classes never match. *)
and matches_pseudo node (selector : simple_selector) =
  (* Shared body of :nth-child / :nth-of-type. An index of 0 means "no
     parent / not found" and must not satisfy any expression. *)
  let nth index_of =
    match selector.arg with
    | None -> false
    | Some arg ->
      (match parse_nth arg with
       | None -> false
       | Some (a, b) ->
         let index = index_of node in
         index > 0 && matches_nth index a b)
  in
  match selector.name with
  | Some "first-child" -> is_first_child node
  | Some "last-child" -> is_last_child node
  | Some "first-of-type" -> is_first_of_type node
  | Some "last-of-type" -> is_last_of_type node
  | Some "only-child" -> is_first_child node && is_last_child node
  | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
  | Some "empty" ->
    (* Whitespace-only text does not count as content. *)
    not (List.exists (fun c ->
        is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "")
      ) node.Dom.children)
  | Some "root" ->
    (match node.Dom.parent with
     | Some p -> p.Dom.name = "#document" || p.Dom.name = "#document-fragment"
     | None -> false)
  | Some "nth-child" -> nth get_index
  | Some "nth-of-type" -> nth get_type_index
  | Some "not" ->
    (match selector.arg with
     | Some arg ->
       (* An argument that fails to parse excludes nothing. *)
       (try
          let inner = Selector_parser.parse_selector arg in
          not (matches_selector node inner)
        with _ -> true)
     | None -> true)
  | _ -> false

and matches_compound node (compound : Selector_ast.compound_selector) =
  List.for_all (matches_simple node) compound.selectors

(* Match from right to left: [node] must satisfy the rightmost compound and
   the rest of the chain must be satisfiable by walking to related nodes. *)
and matches_complex node complex =
  match List.rev complex.parts with
  | [] -> false
  | (combinator, rightmost) :: rest ->
    matches_compound node rightmost && matches_chain node combinator rest

(* [matches_chain current combinator remaining]: [remaining] holds the
   compounds to the left of [current], nearest first. In the AST each
   compound is stored with the combinator on its LEFT, so the relation
   between [current] and the head of [remaining] is [combinator], which was
   stored with [current]; the head's own combinator is handed on to the next
   step. *)
and matches_chain current combinator remaining =
  match remaining with
  | [] -> true
  | (next_combinator, compound) :: rest ->
    let accept candidate =
      matches_compound candidate compound
      && matches_chain candidate next_combinator rest
    in
    (match combinator with
     | Some " " ->
       (* Descendant: any ancestor may work, so keep climbing when the rest
          of the chain fails for a nearer one. *)
       let rec climb n =
         match n.Dom.parent with
         | None -> false
         | Some p -> accept p || climb p
       in
       climb current
     | Some ">" ->
       (match current.Dom.parent with
        | None -> false
        | Some p -> accept p)
     | Some "+" ->
       (match get_previous_sibling current with
        | None -> false
        | Some sib -> accept sib)
     | Some "~" ->
       (* General sibling: try every preceding element sibling. *)
       let rec scan n =
         match get_previous_sibling n with
         | None -> false
         | Some sib -> accept sib || scan sib
       in
       scan current
     | None ->
       (* Not produced by the parser for a non-first part; treat as a
          further constraint on the same node. *)
       accept current
     | Some _ -> false)

and matches_selector node selector =
  match selector with
  | Simple s -> matches_simple node s
  | Compound c -> matches_compound node c
  | Complex c -> matches_complex node c
  | List l -> List.exists (fun c -> matches_complex node c) l.selectors

(* [matches node selector_string] is best-effort: a selector that fails to
   parse matches nothing. *)
let matches node selector_string =
  try
    let selector = Selector_parser.parse_selector selector_string in
    matches_selector node selector
  with _ -> false

(* Depth-first walk that prepends matches to [results] (so the accumulator
   is in reverse document order). *)
let rec query_descendants node selector results =
  List.iter (fun child ->
      if is_element child && matches_selector child selector then
        results := child :: !results;
      query_descendants child selector results;
      (* Also search template content *)
      (match child.Dom.template_content with
       | Some tc -> query_descendants tc selector results
       | None -> ())
    ) node.Dom.children

(* [query root selector_string] returns the matching descendants of [root]
   in traversal order, or [[]] when the selector cannot be parsed. [root]
   itself is never included. *)
let query root selector_string =
  try
    let selector = Selector_parser.parse_selector selector_string in
    let results = ref [] in
    query_descendants root selector results;
    List.rev !results
  with _ -> []
+149
lib/selector/selector_parser.ml
···
(* CSS selector parser

   Consumes the token list produced by [Selector_lexer.tokenize] and builds a
   [Selector_ast.selector]. *)

open Selector_ast
open Selector_token

exception Parse_error of string

(* Token list plus the index of the next token to read. *)
type t = {
  tokens : Selector_token.t list;
  mutable pos : int;
}

let create tokens = { tokens; pos = 0 }

(* Current token; reading past the end yields [EOF]. *)
let peek t =
  match List.nth_opt t.tokens t.pos with
  | Some tok -> tok
  | None -> EOF

(* Step to the next token; the cursor never moves past the end. *)
let advance t =
  match List.nth_opt t.tokens t.pos with
  | Some _ -> t.pos <- t.pos + 1
  | None -> ()

let consume t =
  let tok = peek t in
  advance t;
  tok

(* Step over [expected] or raise [Parse_error]. *)
let expect t expected =
  if peek t = expected then advance t
  else
    let what = match expected with EOF -> "EOF" | _ -> "token" in
    raise (Parse_error ("Expected " ^ what))

(* Parse one simple selector, or return [None] without consuming anything
   when the current token cannot start one. *)
let parse_simple_selector t =
  (* "[name]" or "[name op value]", the opening bracket already consumed. *)
  let attribute () =
    let attr_name =
      match peek t with
      | Tag name -> advance t; name
      | _ -> raise (Parse_error "Expected attribute name")
    in
    match peek t with
    | Attr_end ->
      advance t;
      make_simple Type_attr ~name:attr_name ()
    | Attr_op op ->
      advance t;
      let value =
        match peek t with
        | String v -> advance t; v
        | _ -> raise (Parse_error "Expected attribute value")
      in
      (match peek t with
       | Attr_end -> advance t
       | _ -> raise (Parse_error "Expected ]"));
      make_simple Type_attr ~name:attr_name ~operator:op ~value ()
    | _ -> raise (Parse_error "Expected ] or attribute operator")
  in
  (* ":name" or ":name(arg)", the colon already consumed. *)
  let pseudo () =
    let name =
      match peek t with
      | Tag n -> advance t; n
      | _ -> raise (Parse_error "Expected pseudo-class name")
    in
    let arg =
      match peek t with
      | Paren_open ->
        advance t;
        let inner =
          match peek t with
          | String s -> advance t; Some s
          | _ -> None
        in
        (match peek t with
         | Paren_close -> advance t
         | _ -> raise (Parse_error "Expected )"));
        inner
      | _ -> None
    in
    make_simple Type_pseudo ~name ?arg ()
  in
  match peek t with
  | Tag name -> advance t; Some (make_simple Type_tag ~name ())
  | Universal -> advance t; Some (make_simple Type_universal ())
  | Id name -> advance t; Some (make_simple Type_id ~name ())
  | Class name -> advance t; Some (make_simple Type_class ~name ())
  | Attr_start -> advance t; Some (attribute ())
  | Colon -> advance t; Some (pseudo ())
  | _ -> None

(* A maximal run of simple selectors; [None] when there is none. *)
let parse_compound_selector t =
  let rec gather collected =
    match parse_simple_selector t with
    | Some simple -> gather (simple :: collected)
    | None -> List.rev collected
  in
  match gather [] with
  | [] -> None
  | simples -> Some (make_compound simples)

(* Compound selectors joined by combinators. Each part is stored with the
   combinator that precedes it; the first part has [None]. *)
let parse_complex_selector t =
  match parse_compound_selector t with
  | None -> None
  | Some head ->
    let rec tail rev_parts =
      match peek t with
      | Combinator comb ->
        advance t;
        (match parse_compound_selector t with
         | Some compound -> tail ((Some comb, compound) :: rev_parts)
         | None -> raise (Parse_error "Expected selector after combinator"))
      | _ -> List.rev rev_parts
    in
    Some (make_complex (tail [ (None, head) ]))

(* Parse a comma-separated selector list. A single selector is returned as
   [Complex], several as [List]. Raises [Parse_error] on leftover tokens or
   when nothing was parsed. *)
let parse tokens =
  let t = create tokens in
  let rec collect acc =
    match parse_complex_selector t with
    | None -> acc
    | Some sel ->
      (match peek t with
       | Comma -> advance t; collect (sel :: acc)
       | EOF -> sel :: acc
       | _ -> raise (Parse_error "Unexpected token"))
  in
  let selectors = List.rev (collect []) in
  if peek t <> EOF then raise (Parse_error "Expected end of selector");
  match selectors with
  | [] -> raise (Parse_error "Empty selector")
  | [ only ] -> Complex only
  | several -> List (make_list several)

(* Lex and parse [input]. A blank input is reported with the lexer's
   exception, not [Parse_error]. *)
let parse_selector input =
  if String.trim input = "" then
    raise (Selector_lexer.Selector_error "Empty selector");
  parse (Selector_lexer.tokenize input)
+17
lib/selector/selector_token.ml
···
(* CSS selector token types *)

type t =
  | Tag of string         (* a bare name *)
  | Id of string          (* "#name" *)
  | Class of string       (* ".name" *)
  | Universal             (* "*" *)
  | Attr_start            (* "[" *)
  | Attr_end              (* "]" *)
  | Attr_op of string     (* attribute comparison operator *)
  | String of string      (* a string value *)
  | Combinator of string  (* relation between two compound selectors *)
  | Comma
  | Colon
  | Paren_open
  | Paren_close
  | EOF                   (* end of input *)
+4
lib/tokenizer/dune
···
··· 1 + (library 2 + (name html5rw_tokenizer) 3 + (public_name html5rw.tokenizer) 4 + (libraries bytesrw html5rw.entities))
+12
lib/tokenizer/errors.ml
···
(* HTML5 parse error types *)

(* A parse error: its code and the line/column it was recorded at. *)
type t = {
  code : string;
  line : int;
  column : int;
}

let make ~code ~line ~column = { code; line; column }

(* Renders as "(line,column): code". *)
let to_string { code; line; column } =
  Printf.sprintf "(%d,%d): %s" line column code
+16
lib/tokenizer/html5rw_tokenizer.ml
···
(* html5rw.tokenizer - HTML5 tokenizer with bytesrw-only API

   Facade over the library's internal modules: everything below is a plain
   re-export. *)

(* Supporting modules. *)
module Token = Token
module State = State
module Errors = Errors
module Stream = Stream

(* Signature a token consumer must implement. *)
module type SINK = Tokenizer.SINK

(* A tokenizer feeding a sink of type ['a]. *)
type 'a t = 'a Tokenizer.t

(* Construction and execution. *)
let create = Tokenizer.create
let run = Tokenizer.run

(* State accessors. *)
let get_errors = Tokenizer.get_errors
let set_state = Tokenizer.set_state
let set_last_start_tag = Tokenizer.set_last_start_tag
+83
lib/tokenizer/state.ml
···
(* HTML5 tokenizer states

   One constructor per tokenizer state. The declaration order is kept
   unchanged. *)

type t =
  (* Content states *)
  | Data
  | Rcdata
  | Rawtext
  | Script_data
  | Plaintext
  (* Tags *)
  | Tag_open
  | End_tag_open
  | Tag_name
  (* RCDATA / RAWTEXT end-tag detection *)
  | Rcdata_less_than_sign
  | Rcdata_end_tag_open
  | Rcdata_end_tag_name
  | Rawtext_less_than_sign
  | Rawtext_end_tag_open
  | Rawtext_end_tag_name
  (* Script data, including the escaped and double-escaped variants *)
  | Script_data_less_than_sign
  | Script_data_end_tag_open
  | Script_data_end_tag_name
  | Script_data_escape_start
  | Script_data_escape_start_dash
  | Script_data_escaped
  | Script_data_escaped_dash
  | Script_data_escaped_dash_dash
  | Script_data_escaped_less_than_sign
  | Script_data_escaped_end_tag_open
  | Script_data_escaped_end_tag_name
  | Script_data_double_escape_start
  | Script_data_double_escaped
  | Script_data_double_escaped_dash
  | Script_data_double_escaped_dash_dash
  | Script_data_double_escaped_less_than_sign
  | Script_data_double_escape_end
  (* Attributes *)
  | Before_attribute_name
  | Attribute_name
  | After_attribute_name
  | Before_attribute_value
  | Attribute_value_double_quoted
  | Attribute_value_single_quoted
  | Attribute_value_unquoted
  | After_attribute_value_quoted
  | Self_closing_start_tag
  (* Comments and markup declarations *)
  | Bogus_comment
  | Markup_declaration_open
  | Comment_start
  | Comment_start_dash
  | Comment
  | Comment_less_than_sign
  | Comment_less_than_sign_bang
  | Comment_less_than_sign_bang_dash
  | Comment_less_than_sign_bang_dash_dash
  | Comment_end_dash
  | Comment_end
  | Comment_end_bang
  (* DOCTYPE *)
  | Doctype
  | Before_doctype_name
  | Doctype_name
  | After_doctype_name
  | After_doctype_public_keyword
  | Before_doctype_public_identifier
  | Doctype_public_identifier_double_quoted
  | Doctype_public_identifier_single_quoted
  | After_doctype_public_identifier
  | Between_doctype_public_and_system_identifiers
  | After_doctype_system_keyword
  | Before_doctype_system_identifier
  | Doctype_system_identifier_double_quoted
  | Doctype_system_identifier_single_quoted
  | After_doctype_system_identifier
  | Bogus_doctype
  (* CDATA sections *)
  | Cdata_section
  | Cdata_section_bracket
  | Cdata_section_end
  (* Character references *)
  | Character_reference
  | Named_character_reference
  | Ambiguous_ampersand
  | Numeric_character_reference
  | Hexadecimal_character_reference_start
  | Decimal_character_reference_start
  | Hexadecimal_character_reference
  | Decimal_character_reference
  | Numeric_character_reference_end
+203
lib/tokenizer/stream.ml
···
(* Input stream for tokenizer with position tracking using bytesrw

   This implementation is designed to be as streaming as possible:
   - Reads slices on-demand from the Bytes.Reader.t
   - Only buffers what's needed for lookahead (typically 1-2 chars)
   - Avoids string allocations in hot paths like matches_ci

   Newlines are normalized while reading: CR and CRLF are both delivered as
   a single LF.
*)

open Bytesrw

type t = {
  reader : Bytes.Reader.t;
  (* Current slice and position within it *)
  mutable current_slice : Bytes.Slice.t;
  mutable slice_pos : int;
  (* Lookahead buffer for peek/peek_chars. It holds characters that have
     ALREADY been newline-normalized, so they are handed back verbatim. *)
  mutable lookahead : char list;
  (* Position tracking *)
  mutable line : int;
  mutable column : int;
  (* True when the last byte taken from the reader was CR, so that an
     immediately following LF is dropped. Describes the reader only and is
     never affected by the lookahead buffer. *)
  mutable last_was_cr : bool;
}

(* Create a stream from a Bytes.Reader.t *)
let create_from_reader reader =
  let slice = Bytes.Reader.read reader in
  {
    reader;
    current_slice = slice;
    slice_pos = 0;
    lookahead = [];
    line = 1;
    column = 0;
    last_was_cr = false;
  }

(* Create a stream from a string - discouraged, prefer create_from_reader *)
let create input =
  create_from_reader (Bytes.Reader.of_string input)

let position t = (t.line, t.column)

(* Next byte straight from the reader, bypassing the lookahead buffer and
   fetching a new slice when the current one is used up. *)
let read_source_char t =
  if Bytes.Slice.is_eod t.current_slice then None
  else begin
    if t.slice_pos >= Bytes.Slice.length t.current_slice then begin
      t.current_slice <- Bytes.Reader.read t.reader;
      t.slice_pos <- 0
    end;
    if Bytes.Slice.is_eod t.current_slice then None
    else begin
      let c = Bytes.get (Bytes.Slice.bytes t.current_slice)
          (Bytes.Slice.first t.current_slice + t.slice_pos) in
      t.slice_pos <- t.slice_pos + 1;
      Some c
    end
  end

(* Next char without any newline processing: lookahead first, then the
   reader. *)
let read_raw_char t =
  match t.lookahead with
  | c :: rest ->
    t.lookahead <- rest;
    Some c
  | [] -> read_source_char t

(* Push a char back to lookahead *)
let push_back_char t c =
  t.lookahead <- c :: t.lookahead

(* Read next char with CR/LF normalization.

   Characters coming from the lookahead buffer were normalized when they
   were first read, so they are returned as-is and [last_was_cr] is left
   alone. Resetting the flag on push-back (as was done previously) lost the
   "CR just seen" state, so the LF of a CRLF pair was later delivered as a
   second newline. *)
let read_normalized_char t =
  match t.lookahead with
  | c :: rest ->
    t.lookahead <- rest;
    Some c
  | [] ->
    let rec from_source () =
      match read_source_char t with
      | None ->
        t.last_was_cr <- false;
        None
      | Some '\r' ->
        t.last_was_cr <- true;
        Some '\n' (* CR becomes LF *)
      | Some '\n' when t.last_was_cr ->
        (* Skip LF after CR - it was already converted *)
        t.last_was_cr <- false;
        from_source ()
      | Some c ->
        t.last_was_cr <- false;
        Some c
    in
    from_source ()

(* Next character without consuming it. *)
let peek t =
  match read_normalized_char t with
  | None -> None
  | Some c as result ->
    push_back_char t c;
    result

(* True when no further character will be delivered. Defined through [peek]
   so that it always agrees with it, including when only the LF of a
   trailing CRLF is left in the reader. *)
let is_eof t = peek t = None

(* Read n characters into a list, returns (chars_read, all_read_successfully).
   Nothing is consumed: whatever was read is pushed back. *)
let peek_chars t n =
  let rec collect acc remaining =
    if remaining <= 0 then (List.rev acc, true)
    else match read_normalized_char t with
      | None -> (List.rev acc, false) (* Not enough chars available *)
      | Some c -> collect (c :: acc) (remaining - 1)
  in
  let (chars, success) = collect [] n in
  (* Push back last-read first so the first char ends up at the head. *)
  List.iter (push_back_char t) (List.rev chars);
  (chars, success)

(* peek_n returns Some string only when exactly n chars are available
   Avoid using this in hot paths - prefer peek_chars + direct comparison *)
let peek_n t n =
  let (chars, success) = peek_chars t n in
  if success then Some (String.of_seq (List.to_seq chars))
  else None

(* Consume one character and update line/column. *)
let advance t =
  match read_normalized_char t with
  | None -> ()
  | Some c ->
    if c = '\n' then begin
      t.line <- t.line + 1;
      t.column <- 0
    end else
      t.column <- t.column + 1

let consume t =
  let c = peek t in
  advance t;
  c

(* Consume the next character only if it satisfies [pred]. *)
let consume_if t pred =
  match peek t with
  | Some c when pred c -> advance t; Some c
  | _ -> None

(* Consume the longest run of characters satisfying [pred]. *)
let consume_while t pred =
  let buf = Buffer.create 16 in
  let rec loop () =
    match peek t with
    | Some c when pred c ->
      Buffer.add_char buf c;
      advance t;
      loop ()
    | _ -> ()
  in
  loop ();
  Buffer.contents buf

(* Case-insensitive match without allocating a string
   Compares directly with the char list from peek_chars *)
let matches_ci t s =
  let slen = String.length s in
  let (chars, success) = peek_chars t slen in
  (* On success [chars] has exactly [slen] elements, so indexing [s] with
     the list position is always in range. *)
  let rec same i = function
    | [] -> true
    | c :: rest ->
      Char.lowercase_ascii c = Char.lowercase_ascii (String.unsafe_get s i)
      && same (i + 1) rest
  in
  success && same 0 chars

(* Consume [s] if the upcoming characters match it case-insensitively. *)
let consume_exact_ci t s =
  if matches_ci t s then begin
    for _ = 1 to String.length s do advance t done;
    true
  end else false

(* NOTE: this does not push any character back. It only steps the column
   counter back by one (never below 0); the line counter and the input
   position are left untouched. *)
let reconsume t =
  if t.column > 0 then t.column <- t.column - 1
+39
lib/tokenizer/token.ml
···
(* HTML5 token types *)

type tag_kind = Start | End

(* DOCTYPE token; absent parts are [None]. *)
type doctype = {
  name : string option;
  public_id : string option;
  system_id : string option;
  force_quirks : bool;
}

(* Start or end tag with its attributes as (name, value) pairs. *)
type tag = {
  kind : tag_kind;
  name : string;
  attrs : (string * string) list;
  self_closing : bool;
}

type t =
  | Tag of tag
  | Character of string
  | Comment of string
  | Doctype of doctype
  | EOF

let make_start_tag name attrs self_closing =
  let tag : tag = { kind = Start; name; attrs; self_closing } in
  Tag tag

(* End tags never carry attributes and are never self-closing. *)
let make_end_tag name =
  let tag : tag = { kind = End; name; attrs = []; self_closing = false } in
  Tag tag

(* Every part is optional; [force_quirks] defaults to [false]. *)
let make_doctype ?name ?public_id ?system_id ?(force_quirks=false) () =
  let doctype : doctype = { name; public_id; system_id; force_quirks } in
  Doctype doctype

let make_comment data = Comment data

let make_character data = Character data

let eof = EOF
+1842
lib/tokenizer/tokenizer.ml
···
(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)

(* ASCII character classes used by the tokenizer states. All of them work on
   single bytes; anything outside the listed ranges is rejected. *)
let is_ascii_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
let is_ascii_upper = function 'A' .. 'Z' -> true | _ -> false
let is_ascii_digit = function '0' .. '9' -> true | _ -> false

let is_ascii_hex = function
  | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
  | _ -> false

let is_ascii_alnum c = is_ascii_digit c || is_ascii_alpha c

(* Space, tab, line feed, form feed and carriage return. *)
let is_whitespace = function
  | ' ' | '\t' | '\n' | '\x0C' | '\r' -> true
  | _ -> false

(* Lowercases 'A'..'Z'; every other byte is returned unchanged. *)
let ascii_lower c = Char.lowercase_ascii c

(* Token sink interface *)
module type SINK = sig
  type t

  (* Receives one token. Returning [`SwitchTo s] asks the tokenizer to move to
     state [s]; [`Continue] leaves the tokenizer state alone. *)
  val process : t -> Token.t -> [ `Continue | `SwitchTo of State.t ]

  (* NOTE(review): not called in this part of the file; presumably consulted
     when deciding whether a CDATA section is allowed - confirm at call site. *)
  val adjusted_current_node_in_html_namespace : t -> bool
end

(* Mutable tokenizer state, parameterised over the sink value it feeds. *)
type 'sink t = {
  mutable stream : Stream.t;
  sink : 'sink;
  mutable state : State.t;
  (* State to resume once a character reference has been handled. *)
  mutable return_state : State.t;
  (* Numeric value accumulated for a numeric character reference. *)
  mutable char_ref_code : int;
  mutable temp_buffer : Buffer.t;
  (* Name of the most recently emitted start tag ("" if none yet). *)
  mutable last_start_tag : string;
  mutable current_tag_name : Buffer.t;
  mutable current_tag_kind : Token.tag_kind;
  mutable current_tag_self_closing : bool;
  mutable current_attr_name : Buffer.t;
  mutable current_attr_value : Buffer.t;
  (* Attributes of the tag being built, most recent first. *)
  mutable current_attrs : (string * string) list;
  mutable current_doctype_name : Buffer.t option;
  mutable current_doctype_public : Buffer.t option;
  mutable current_doctype_system : Buffer.t option;
  mutable current_doctype_force_quirks : bool;
  mutable current_comment : Buffer.t;
  (* Character data buffered until the next token is emitted. *)
  mutable pending_chars : Buffer.t;
  (* Recorded parse errors, most recent first. *)
  mutable errors : Errors.t list;
  collect_errors : bool;
}

(* Builds a tokenizer in the [Data] state with empty buffers, feeding [sink].
   The stream starts out as a placeholder over the empty string.
   When [collect_errors] is false (the default), [error] records nothing. *)
let create (type s) (module S : SINK with type t = s) sink ?(collect_errors=false) () =
  {
    stream = Stream.create "";
    sink;
    state = State.Data;
    return_state = State.Data;
    char_ref_code = 0;
    temp_buffer = Buffer.create 64;
    last_start_tag = "";
    current_tag_name = Buffer.create 32;
    current_tag_kind = Token.Start;
    current_tag_self_closing = false;
    current_attr_name = Buffer.create 32;
    current_attr_value = Buffer.create 64;
    current_attrs = [];
    current_doctype_name = None;
    current_doctype_public = None;
    current_doctype_system = None;
    current_doctype_force_quirks = false;
    current_comment = Buffer.create 64;
    pending_chars = Buffer.create 256;
    errors = [];
    collect_errors;
  }

(* Records parse error [code] at the stream's current position. Does nothing
   unless the tokenizer was created with [~collect_errors:true]. *)
let error t code =
  if t.collect_errors then (
    let line, column = Stream.position t.stream in
    let entry = Errors.make ~code ~line ~column in
    t.errors <- entry :: t.errors)

(* The two helpers below only append to [pending_chars]; nothing is handed to
   the sink here. *)

let emit_char t ch =
  Buffer.add_char t.pending_chars ch

let emit_str t text =
  Buffer.add_string t.pending_chars text

(* Resets the tag under construction: empty name, no attributes, not
   self-closing, of the given [kind]. *)
let start_new_tag t kind =
  t.current_tag_kind <- kind;
  t.current_tag_self_closing <- false;
  t.current_attrs <- [];
  Buffer.clear t.current_tag_name

(* Commits the attribute currently being built (if it has a non-empty name)
   and clears the name/value buffers for the next one. An attribute whose name
   is already present on the tag is dropped and reported as
   "duplicate-attribute", so the first occurrence wins. *)
let start_new_attribute t =
  let pending_name = Buffer.contents t.current_attr_name in
  if pending_name <> "" then begin
    if List.mem_assoc pending_name t.current_attrs then
      error t "duplicate-attribute"
    else begin
      let pending_value = Buffer.contents t.current_attr_value in
      t.current_attrs <- (pending_name, pending_value) :: t.current_attrs
    end
  end;
  Buffer.clear t.current_attr_name;
  Buffer.clear t.current_attr_value

(* Committing the last attribute is the same operation as starting a new one. *)
let finish_attribute t =
  start_new_attribute t

(* Resets the doctype under construction: no name, no identifiers, and the
   force-quirks flag off. *)
let start_new_doctype t =
  t.current_doctype_force_quirks <- false;
  t.current_doctype_name <- None;
  t.current_doctype_public <- None;
  t.current_doctype_system <- None

(* emit_current_tag, emit_current_doctype, emit_current_comment are defined locally
inside run *) 112 + 113 + let is_appropriate_end_tag t = 114 + let name = Buffer.contents t.current_tag_name in 115 + String.length t.last_start_tag > 0 && name = t.last_start_tag 116 + 117 + let flush_code_points_consumed_as_char_ref t = 118 + let s = Buffer.contents t.temp_buffer in 119 + match t.return_state with 120 + | State.Attribute_value_double_quoted 121 + | State.Attribute_value_single_quoted 122 + | State.Attribute_value_unquoted -> 123 + Buffer.add_string t.current_attr_value s 124 + | _ -> 125 + emit_str t s 126 + 127 + open Bytesrw 128 + 129 + (* Main tokenization loop *) 130 + let run (type s) t (module S : SINK with type t = s) (reader : Bytes.Reader.t) = 131 + t.stream <- Stream.create_from_reader reader; 132 + t.errors <- []; 133 + 134 + (* Local emit functions with access to S *) 135 + let emit_pending_chars () = 136 + if Buffer.length t.pending_chars > 0 then begin 137 + let data = Buffer.contents t.pending_chars in 138 + Buffer.clear t.pending_chars; 139 + ignore (S.process t.sink (Token.Character data)) 140 + end 141 + in 142 + 143 + let emit token = 144 + emit_pending_chars (); 145 + match S.process t.sink token with 146 + | `Continue -> () 147 + | `SwitchTo new_state -> t.state <- new_state 148 + in 149 + 150 + let emit_current_tag () = 151 + finish_attribute t; 152 + let name = Buffer.contents t.current_tag_name in 153 + let tag = { 154 + Token.kind = t.current_tag_kind; 155 + name; 156 + attrs = List.rev t.current_attrs; 157 + self_closing = t.current_tag_self_closing; 158 + } in 159 + if t.current_tag_kind = Token.Start then 160 + t.last_start_tag <- name; 161 + emit (Token.Tag tag) 162 + in 163 + 164 + let emit_current_doctype () = 165 + let doctype = { 166 + Token.name = Option.map Buffer.contents t.current_doctype_name; 167 + public_id = Option.map Buffer.contents t.current_doctype_public; 168 + system_id = Option.map Buffer.contents t.current_doctype_system; 169 + force_quirks = t.current_doctype_force_quirks; 170 + } in 171 + emit 
(Token.Doctype doctype) 172 + in 173 + 174 + let emit_current_comment () = 175 + emit (Token.Comment (Buffer.contents t.current_comment)) 176 + in 177 + 178 + let rec process_state () = 179 + if Stream.is_eof t.stream && t.state <> State.Data then begin 180 + (* Handle EOF in various states *) 181 + handle_eof () 182 + end else if Stream.is_eof t.stream then begin 183 + emit_pending_chars (); 184 + ignore (S.process t.sink Token.EOF) 185 + end else begin 186 + step (); 187 + process_state () 188 + end 189 + 190 + and handle_eof () = 191 + match t.state with 192 + | State.Data -> 193 + emit_pending_chars (); 194 + ignore (S.process t.sink Token.EOF) 195 + | State.Tag_open -> 196 + error t "eof-before-tag-name"; 197 + emit_char t '<'; 198 + emit_pending_chars (); 199 + ignore (S.process t.sink Token.EOF) 200 + | State.End_tag_open -> 201 + error t "eof-before-tag-name"; 202 + emit_str t "</"; 203 + emit_pending_chars (); 204 + ignore (S.process t.sink Token.EOF) 205 + | State.Tag_name 206 + | State.Before_attribute_name 207 + | State.Attribute_name 208 + | State.After_attribute_name 209 + | State.Before_attribute_value 210 + | State.Attribute_value_double_quoted 211 + | State.Attribute_value_single_quoted 212 + | State.Attribute_value_unquoted 213 + | State.After_attribute_value_quoted 214 + | State.Self_closing_start_tag -> 215 + error t "eof-in-tag"; 216 + emit_pending_chars (); 217 + ignore (S.process t.sink Token.EOF) 218 + | State.Rawtext -> 219 + emit_pending_chars (); 220 + ignore (S.process t.sink Token.EOF) 221 + | State.Rawtext_less_than_sign -> 222 + emit_char t '<'; 223 + emit_pending_chars (); 224 + ignore (S.process t.sink Token.EOF) 225 + | State.Rawtext_end_tag_open -> 226 + emit_str t "</"; 227 + emit_pending_chars (); 228 + ignore (S.process t.sink Token.EOF) 229 + | State.Rawtext_end_tag_name -> 230 + emit_str t "</"; 231 + emit_str t (Buffer.contents t.temp_buffer); 232 + emit_pending_chars (); 233 + ignore (S.process t.sink Token.EOF) 234 + | 
State.Rcdata -> 235 + emit_pending_chars (); 236 + ignore (S.process t.sink Token.EOF) 237 + | State.Rcdata_less_than_sign -> 238 + emit_char t '<'; 239 + emit_pending_chars (); 240 + ignore (S.process t.sink Token.EOF) 241 + | State.Rcdata_end_tag_open -> 242 + emit_str t "</"; 243 + emit_pending_chars (); 244 + ignore (S.process t.sink Token.EOF) 245 + | State.Rcdata_end_tag_name -> 246 + emit_str t "</"; 247 + emit_str t (Buffer.contents t.temp_buffer); 248 + emit_pending_chars (); 249 + ignore (S.process t.sink Token.EOF) 250 + | State.Script_data -> 251 + emit_pending_chars (); 252 + ignore (S.process t.sink Token.EOF) 253 + | State.Script_data_less_than_sign -> 254 + emit_char t '<'; 255 + emit_pending_chars (); 256 + ignore (S.process t.sink Token.EOF) 257 + | State.Script_data_end_tag_open -> 258 + emit_str t "</"; 259 + emit_pending_chars (); 260 + ignore (S.process t.sink Token.EOF) 261 + | State.Script_data_end_tag_name -> 262 + emit_str t "</"; 263 + emit_str t (Buffer.contents t.temp_buffer); 264 + emit_pending_chars (); 265 + ignore (S.process t.sink Token.EOF) 266 + | State.Script_data_escape_start 267 + | State.Script_data_escape_start_dash 268 + | State.Script_data_escaped 269 + | State.Script_data_escaped_dash 270 + | State.Script_data_escaped_dash_dash -> 271 + emit_pending_chars (); 272 + ignore (S.process t.sink Token.EOF) 273 + | State.Script_data_escaped_less_than_sign -> 274 + emit_char t '<'; 275 + emit_pending_chars (); 276 + ignore (S.process t.sink Token.EOF) 277 + | State.Script_data_escaped_end_tag_open -> 278 + emit_str t "</"; 279 + emit_pending_chars (); 280 + ignore (S.process t.sink Token.EOF) 281 + | State.Script_data_escaped_end_tag_name -> 282 + emit_str t "</"; 283 + emit_str t (Buffer.contents t.temp_buffer); 284 + emit_pending_chars (); 285 + ignore (S.process t.sink Token.EOF) 286 + | State.Script_data_double_escape_start 287 + | State.Script_data_double_escaped 288 + | State.Script_data_double_escaped_dash 289 + | 
State.Script_data_double_escaped_dash_dash -> 290 + emit_pending_chars (); 291 + ignore (S.process t.sink Token.EOF) 292 + | State.Script_data_double_escaped_less_than_sign -> 293 + (* '<' was already emitted when entering this state from Script_data_double_escaped *) 294 + emit_pending_chars (); 295 + ignore (S.process t.sink Token.EOF) 296 + | State.Script_data_double_escape_end -> 297 + emit_pending_chars (); 298 + ignore (S.process t.sink Token.EOF) 299 + | State.Plaintext -> 300 + emit_pending_chars (); 301 + ignore (S.process t.sink Token.EOF) 302 + | State.Comment_start 303 + | State.Comment_start_dash 304 + | State.Comment 305 + | State.Comment_less_than_sign 306 + | State.Comment_less_than_sign_bang 307 + | State.Comment_less_than_sign_bang_dash 308 + | State.Comment_less_than_sign_bang_dash_dash 309 + | State.Comment_end_dash 310 + | State.Comment_end 311 + | State.Comment_end_bang -> 312 + error t "eof-in-comment"; 313 + emit_current_comment (); 314 + emit_pending_chars (); 315 + ignore (S.process t.sink Token.EOF) 316 + | State.Bogus_comment -> 317 + emit_current_comment (); 318 + emit_pending_chars (); 319 + ignore (S.process t.sink Token.EOF) 320 + | State.Markup_declaration_open -> 321 + error t "incorrectly-opened-comment"; 322 + Buffer.clear t.current_comment; 323 + emit_current_comment (); 324 + emit_pending_chars (); 325 + ignore (S.process t.sink Token.EOF) 326 + | State.Doctype 327 + | State.Before_doctype_name -> 328 + error t "eof-in-doctype"; 329 + start_new_doctype t; 330 + t.current_doctype_force_quirks <- true; 331 + emit_current_doctype (); 332 + emit_pending_chars (); 333 + ignore (S.process t.sink Token.EOF) 334 + | State.Doctype_name 335 + | State.After_doctype_name 336 + | State.After_doctype_public_keyword 337 + | State.Before_doctype_public_identifier 338 + | State.Doctype_public_identifier_double_quoted 339 + | State.Doctype_public_identifier_single_quoted 340 + | State.After_doctype_public_identifier 341 + | 
State.Between_doctype_public_and_system_identifiers 342 + | State.After_doctype_system_keyword 343 + | State.Before_doctype_system_identifier 344 + | State.Doctype_system_identifier_double_quoted 345 + | State.Doctype_system_identifier_single_quoted 346 + | State.After_doctype_system_identifier -> 347 + error t "eof-in-doctype"; 348 + t.current_doctype_force_quirks <- true; 349 + emit_current_doctype (); 350 + emit_pending_chars (); 351 + ignore (S.process t.sink Token.EOF) 352 + | State.Bogus_doctype -> 353 + emit_current_doctype (); 354 + emit_pending_chars (); 355 + ignore (S.process t.sink Token.EOF) 356 + | State.Cdata_section -> 357 + error t "eof-in-cdata"; 358 + emit_pending_chars (); 359 + ignore (S.process t.sink Token.EOF) 360 + | State.Cdata_section_bracket -> 361 + error t "eof-in-cdata"; 362 + emit_char t ']'; 363 + emit_pending_chars (); 364 + ignore (S.process t.sink Token.EOF) 365 + | State.Cdata_section_end -> 366 + error t "eof-in-cdata"; 367 + emit_str t "]]"; 368 + emit_pending_chars (); 369 + ignore (S.process t.sink Token.EOF) 370 + | State.Character_reference -> 371 + (* state_character_reference never ran, so initialize temp_buffer with & *) 372 + Buffer.clear t.temp_buffer; 373 + Buffer.add_char t.temp_buffer '&'; 374 + flush_code_points_consumed_as_char_ref t; 375 + t.state <- t.return_state; 376 + handle_eof () 377 + | State.Named_character_reference 378 + | State.Numeric_character_reference 379 + | State.Hexadecimal_character_reference_start 380 + | State.Decimal_character_reference_start 381 + | State.Numeric_character_reference_end -> 382 + flush_code_points_consumed_as_char_ref t; 383 + t.state <- t.return_state; 384 + handle_eof () 385 + | State.Ambiguous_ampersand -> 386 + (* Buffer was already flushed when entering this state, just transition *) 387 + t.state <- t.return_state; 388 + handle_eof () 389 + | State.Hexadecimal_character_reference 390 + | State.Decimal_character_reference -> 391 + (* At EOF with collected digits - 
convert the numeric reference *) 392 + error t "missing-semicolon-after-character-reference"; 393 + let code = t.char_ref_code in 394 + let replacement_char = "\xEF\xBF\xBD" in 395 + let result = 396 + if code = 0 then begin 397 + error t "null-character-reference"; 398 + replacement_char 399 + end else if code > 0x10FFFF then begin 400 + error t "character-reference-outside-unicode-range"; 401 + replacement_char 402 + end else if code >= 0xD800 && code <= 0xDFFF then begin 403 + error t "surrogate-character-reference"; 404 + replacement_char 405 + end else 406 + Html5rw_entities.Numeric_ref.codepoint_to_utf8 code 407 + in 408 + Buffer.clear t.temp_buffer; 409 + Buffer.add_string t.temp_buffer result; 410 + flush_code_points_consumed_as_char_ref t; 411 + t.state <- t.return_state; 412 + handle_eof () 413 + 414 + and step () = 415 + match t.state with 416 + | State.Data -> state_data () 417 + | State.Rcdata -> state_rcdata () 418 + | State.Rawtext -> state_rawtext () 419 + | State.Script_data -> state_script_data () 420 + | State.Plaintext -> state_plaintext () 421 + | State.Tag_open -> state_tag_open () 422 + | State.End_tag_open -> state_end_tag_open () 423 + | State.Tag_name -> state_tag_name () 424 + | State.Rcdata_less_than_sign -> state_rcdata_less_than_sign () 425 + | State.Rcdata_end_tag_open -> state_rcdata_end_tag_open () 426 + | State.Rcdata_end_tag_name -> state_rcdata_end_tag_name () 427 + | State.Rawtext_less_than_sign -> state_rawtext_less_than_sign () 428 + | State.Rawtext_end_tag_open -> state_rawtext_end_tag_open () 429 + | State.Rawtext_end_tag_name -> state_rawtext_end_tag_name () 430 + | State.Script_data_less_than_sign -> state_script_data_less_than_sign () 431 + | State.Script_data_end_tag_open -> state_script_data_end_tag_open () 432 + | State.Script_data_end_tag_name -> state_script_data_end_tag_name () 433 + | State.Script_data_escape_start -> state_script_data_escape_start () 434 + | State.Script_data_escape_start_dash -> 
state_script_data_escape_start_dash () 435 + | State.Script_data_escaped -> state_script_data_escaped () 436 + | State.Script_data_escaped_dash -> state_script_data_escaped_dash () 437 + | State.Script_data_escaped_dash_dash -> state_script_data_escaped_dash_dash () 438 + | State.Script_data_escaped_less_than_sign -> state_script_data_escaped_less_than_sign () 439 + | State.Script_data_escaped_end_tag_open -> state_script_data_escaped_end_tag_open () 440 + | State.Script_data_escaped_end_tag_name -> state_script_data_escaped_end_tag_name () 441 + | State.Script_data_double_escape_start -> state_script_data_double_escape_start () 442 + | State.Script_data_double_escaped -> state_script_data_double_escaped () 443 + | State.Script_data_double_escaped_dash -> state_script_data_double_escaped_dash () 444 + | State.Script_data_double_escaped_dash_dash -> state_script_data_double_escaped_dash_dash () 445 + | State.Script_data_double_escaped_less_than_sign -> state_script_data_double_escaped_less_than_sign () 446 + | State.Script_data_double_escape_end -> state_script_data_double_escape_end () 447 + | State.Before_attribute_name -> state_before_attribute_name () 448 + | State.Attribute_name -> state_attribute_name () 449 + | State.After_attribute_name -> state_after_attribute_name () 450 + | State.Before_attribute_value -> state_before_attribute_value () 451 + | State.Attribute_value_double_quoted -> state_attribute_value_double_quoted () 452 + | State.Attribute_value_single_quoted -> state_attribute_value_single_quoted () 453 + | State.Attribute_value_unquoted -> state_attribute_value_unquoted () 454 + | State.After_attribute_value_quoted -> state_after_attribute_value_quoted () 455 + | State.Self_closing_start_tag -> state_self_closing_start_tag () 456 + | State.Bogus_comment -> state_bogus_comment () 457 + | State.Markup_declaration_open -> state_markup_declaration_open () 458 + | State.Comment_start -> state_comment_start () 459 + | State.Comment_start_dash -> 
state_comment_start_dash () 460 + | State.Comment -> state_comment () 461 + | State.Comment_less_than_sign -> state_comment_less_than_sign () 462 + | State.Comment_less_than_sign_bang -> state_comment_less_than_sign_bang () 463 + | State.Comment_less_than_sign_bang_dash -> state_comment_less_than_sign_bang_dash () 464 + | State.Comment_less_than_sign_bang_dash_dash -> state_comment_less_than_sign_bang_dash_dash () 465 + | State.Comment_end_dash -> state_comment_end_dash () 466 + | State.Comment_end -> state_comment_end () 467 + | State.Comment_end_bang -> state_comment_end_bang () 468 + | State.Doctype -> state_doctype () 469 + | State.Before_doctype_name -> state_before_doctype_name () 470 + | State.Doctype_name -> state_doctype_name () 471 + | State.After_doctype_name -> state_after_doctype_name () 472 + | State.After_doctype_public_keyword -> state_after_doctype_public_keyword () 473 + | State.Before_doctype_public_identifier -> state_before_doctype_public_identifier () 474 + | State.Doctype_public_identifier_double_quoted -> state_doctype_public_identifier_double_quoted () 475 + | State.Doctype_public_identifier_single_quoted -> state_doctype_public_identifier_single_quoted () 476 + | State.After_doctype_public_identifier -> state_after_doctype_public_identifier () 477 + | State.Between_doctype_public_and_system_identifiers -> state_between_doctype_public_and_system_identifiers () 478 + | State.After_doctype_system_keyword -> state_after_doctype_system_keyword () 479 + | State.Before_doctype_system_identifier -> state_before_doctype_system_identifier () 480 + | State.Doctype_system_identifier_double_quoted -> state_doctype_system_identifier_double_quoted () 481 + | State.Doctype_system_identifier_single_quoted -> state_doctype_system_identifier_single_quoted () 482 + | State.After_doctype_system_identifier -> state_after_doctype_system_identifier () 483 + | State.Bogus_doctype -> state_bogus_doctype () 484 + | State.Cdata_section -> state_cdata_section () 485 + 
| State.Cdata_section_bracket -> state_cdata_section_bracket () 486 + | State.Cdata_section_end -> state_cdata_section_end () 487 + | State.Character_reference -> state_character_reference () 488 + | State.Named_character_reference -> state_named_character_reference () 489 + | State.Ambiguous_ampersand -> state_ambiguous_ampersand () 490 + | State.Numeric_character_reference -> state_numeric_character_reference () 491 + | State.Hexadecimal_character_reference_start -> state_hexadecimal_character_reference_start () 492 + | State.Decimal_character_reference_start -> state_decimal_character_reference_start () 493 + | State.Hexadecimal_character_reference -> state_hexadecimal_character_reference () 494 + | State.Decimal_character_reference -> state_decimal_character_reference () 495 + | State.Numeric_character_reference_end -> state_numeric_character_reference_end () 496 + 497 + (* State implementations *) 498 + and state_data () = 499 + match Stream.consume t.stream with 500 + | Some '&' -> 501 + t.return_state <- State.Data; 502 + t.state <- State.Character_reference 503 + | Some '<' -> 504 + t.state <- State.Tag_open 505 + | Some '\x00' -> 506 + (* Emit pending chars first, then emit null separately for proper tree builder handling *) 507 + emit_pending_chars (); 508 + error t "unexpected-null-character"; 509 + ignore (S.process t.sink (Token.Character "\x00")) 510 + | Some c -> 511 + emit_char t c 512 + | None -> () 513 + 514 + and state_rcdata () = 515 + match Stream.consume t.stream with 516 + | Some '&' -> 517 + t.return_state <- State.Rcdata; 518 + t.state <- State.Character_reference 519 + | Some '<' -> 520 + t.state <- State.Rcdata_less_than_sign 521 + | Some '\x00' -> 522 + error t "unexpected-null-character"; 523 + emit_str t "\xEF\xBF\xBD" 524 + | Some c -> 525 + emit_char t c 526 + | None -> () 527 + 528 + and state_rawtext () = 529 + match Stream.consume t.stream with 530 + | Some '<' -> 531 + t.state <- State.Rawtext_less_than_sign 532 + | Some '\x00' 
-> 533 + error t "unexpected-null-character"; 534 + emit_str t "\xEF\xBF\xBD" 535 + | Some c -> 536 + emit_char t c 537 + | None -> () 538 + 539 + and state_script_data () = 540 + match Stream.consume t.stream with 541 + | Some '<' -> 542 + t.state <- State.Script_data_less_than_sign 543 + | Some '\x00' -> 544 + error t "unexpected-null-character"; 545 + emit_str t "\xEF\xBF\xBD" 546 + | Some c -> 547 + emit_char t c 548 + | None -> () 549 + 550 + and state_plaintext () = 551 + match Stream.consume t.stream with 552 + | Some '\x00' -> 553 + error t "unexpected-null-character"; 554 + emit_str t "\xEF\xBF\xBD" 555 + | Some c -> 556 + emit_char t c 557 + | None -> () 558 + 559 + and state_tag_open () = 560 + match Stream.peek t.stream with 561 + | Some '!' -> 562 + Stream.advance t.stream; 563 + t.state <- State.Markup_declaration_open 564 + | Some '/' -> 565 + Stream.advance t.stream; 566 + t.state <- State.End_tag_open 567 + | Some c when is_ascii_alpha c -> 568 + start_new_tag t Token.Start; 569 + t.state <- State.Tag_name 570 + | Some '?' 
-> 571 + error t "unexpected-question-mark-instead-of-tag-name"; 572 + Buffer.clear t.current_comment; 573 + t.state <- State.Bogus_comment 574 + | None -> 575 + error t "eof-before-tag-name"; 576 + emit_char t '<' 577 + | Some _ -> 578 + error t "invalid-first-character-of-tag-name"; 579 + emit_char t '<'; 580 + t.state <- State.Data 581 + 582 + and state_end_tag_open () = 583 + match Stream.peek t.stream with 584 + | Some c when is_ascii_alpha c -> 585 + start_new_tag t Token.End; 586 + t.state <- State.Tag_name 587 + | Some '>' -> 588 + Stream.advance t.stream; 589 + error t "missing-end-tag-name"; 590 + t.state <- State.Data 591 + | None -> 592 + error t "eof-before-tag-name"; 593 + emit_str t "</" 594 + | Some _ -> 595 + error t "invalid-first-character-of-tag-name"; 596 + Buffer.clear t.current_comment; 597 + t.state <- State.Bogus_comment 598 + 599 + and state_tag_name () = 600 + match Stream.consume t.stream with 601 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 602 + t.state <- State.Before_attribute_name 603 + | Some '/' -> 604 + t.state <- State.Self_closing_start_tag 605 + | Some '>' -> 606 + t.state <- State.Data; 607 + emit_current_tag () 608 + | Some '\x00' -> 609 + error t "unexpected-null-character"; 610 + Buffer.add_string t.current_tag_name "\xEF\xBF\xBD" 611 + | Some c -> 612 + Buffer.add_char t.current_tag_name (ascii_lower c) 613 + | None -> () 614 + 615 + and state_rcdata_less_than_sign () = 616 + match Stream.peek t.stream with 617 + | Some '/' -> 618 + Stream.advance t.stream; 619 + Buffer.clear t.temp_buffer; 620 + t.state <- State.Rcdata_end_tag_open 621 + | _ -> 622 + emit_char t '<'; 623 + t.state <- State.Rcdata 624 + 625 + and state_rcdata_end_tag_open () = 626 + match Stream.peek t.stream with 627 + | Some c when is_ascii_alpha c -> 628 + start_new_tag t Token.End; 629 + t.state <- State.Rcdata_end_tag_name 630 + | _ -> 631 + emit_str t "</"; 632 + t.state <- State.Rcdata 633 + 634 + and state_rcdata_end_tag_name () = 635 + match 
Stream.peek t.stream with 636 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 637 + Stream.advance t.stream; 638 + t.state <- State.Before_attribute_name 639 + | Some '/' when is_appropriate_end_tag t -> 640 + Stream.advance t.stream; 641 + t.state <- State.Self_closing_start_tag 642 + | Some '>' when is_appropriate_end_tag t -> 643 + Stream.advance t.stream; 644 + t.state <- State.Data; 645 + emit_current_tag () 646 + | Some c when is_ascii_alpha c -> 647 + Stream.advance t.stream; 648 + Buffer.add_char t.current_tag_name (ascii_lower c); 649 + Buffer.add_char t.temp_buffer c 650 + | _ -> 651 + emit_str t "</"; 652 + emit_str t (Buffer.contents t.temp_buffer); 653 + t.state <- State.Rcdata 654 + 655 + and state_rawtext_less_than_sign () = 656 + match Stream.peek t.stream with 657 + | Some '/' -> 658 + Stream.advance t.stream; 659 + Buffer.clear t.temp_buffer; 660 + t.state <- State.Rawtext_end_tag_open 661 + | _ -> 662 + emit_char t '<'; 663 + t.state <- State.Rawtext 664 + 665 + and state_rawtext_end_tag_open () = 666 + match Stream.peek t.stream with 667 + | Some c when is_ascii_alpha c -> 668 + start_new_tag t Token.End; 669 + t.state <- State.Rawtext_end_tag_name 670 + | _ -> 671 + emit_str t "</"; 672 + t.state <- State.Rawtext 673 + 674 + and state_rawtext_end_tag_name () = 675 + match Stream.peek t.stream with 676 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 677 + Stream.advance t.stream; 678 + t.state <- State.Before_attribute_name 679 + | Some '/' when is_appropriate_end_tag t -> 680 + Stream.advance t.stream; 681 + t.state <- State.Self_closing_start_tag 682 + | Some '>' when is_appropriate_end_tag t -> 683 + Stream.advance t.stream; 684 + t.state <- State.Data; 685 + emit_current_tag () 686 + | Some c when is_ascii_alpha c -> 687 + Stream.advance t.stream; 688 + Buffer.add_char t.current_tag_name (ascii_lower c); 689 + Buffer.add_char t.temp_buffer c 690 + | _ -> 691 + emit_str t "</"; 692 + emit_str t 
(Buffer.contents t.temp_buffer); 693 + t.state <- State.Rawtext 694 + 695 + and state_script_data_less_than_sign () = 696 + match Stream.peek t.stream with 697 + | Some '/' -> 698 + Stream.advance t.stream; 699 + Buffer.clear t.temp_buffer; 700 + t.state <- State.Script_data_end_tag_open 701 + | Some '!' -> 702 + Stream.advance t.stream; 703 + t.state <- State.Script_data_escape_start; 704 + emit_str t "<!" 705 + | _ -> 706 + emit_char t '<'; 707 + t.state <- State.Script_data 708 + 709 + and state_script_data_end_tag_open () = 710 + match Stream.peek t.stream with 711 + | Some c when is_ascii_alpha c -> 712 + start_new_tag t Token.End; 713 + t.state <- State.Script_data_end_tag_name 714 + | _ -> 715 + emit_str t "</"; 716 + t.state <- State.Script_data 717 + 718 + and state_script_data_end_tag_name () = 719 + match Stream.peek t.stream with 720 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 721 + Stream.advance t.stream; 722 + t.state <- State.Before_attribute_name 723 + | Some '/' when is_appropriate_end_tag t -> 724 + Stream.advance t.stream; 725 + t.state <- State.Self_closing_start_tag 726 + | Some '>' when is_appropriate_end_tag t -> 727 + Stream.advance t.stream; 728 + t.state <- State.Data; 729 + emit_current_tag () 730 + | Some c when is_ascii_alpha c -> 731 + Stream.advance t.stream; 732 + Buffer.add_char t.current_tag_name (ascii_lower c); 733 + Buffer.add_char t.temp_buffer c 734 + | _ -> 735 + emit_str t "</"; 736 + emit_str t (Buffer.contents t.temp_buffer); 737 + t.state <- State.Script_data 738 + 739 + and state_script_data_escape_start () = 740 + match Stream.peek t.stream with 741 + | Some '-' -> 742 + Stream.advance t.stream; 743 + t.state <- State.Script_data_escape_start_dash; 744 + emit_char t '-' 745 + | _ -> 746 + t.state <- State.Script_data 747 + 748 + and state_script_data_escape_start_dash () = 749 + match Stream.peek t.stream with 750 + | Some '-' -> 751 + Stream.advance t.stream; 752 + t.state <- 
State.Script_data_escaped_dash_dash; 753 + emit_char t '-' 754 + | _ -> 755 + t.state <- State.Script_data 756 + 757 + and state_script_data_escaped () = 758 + match Stream.consume t.stream with 759 + | Some '-' -> 760 + t.state <- State.Script_data_escaped_dash; 761 + emit_char t '-' 762 + | Some '<' -> 763 + t.state <- State.Script_data_escaped_less_than_sign 764 + | Some '\x00' -> 765 + error t "unexpected-null-character"; 766 + emit_str t "\xEF\xBF\xBD" 767 + | Some c -> 768 + emit_char t c 769 + | None -> () 770 + 771 + and state_script_data_escaped_dash () = 772 + match Stream.consume t.stream with 773 + | Some '-' -> 774 + t.state <- State.Script_data_escaped_dash_dash; 775 + emit_char t '-' 776 + | Some '<' -> 777 + t.state <- State.Script_data_escaped_less_than_sign 778 + | Some '\x00' -> 779 + error t "unexpected-null-character"; 780 + t.state <- State.Script_data_escaped; 781 + emit_str t "\xEF\xBF\xBD" 782 + | Some c -> 783 + t.state <- State.Script_data_escaped; 784 + emit_char t c 785 + | None -> () 786 + 787 + and state_script_data_escaped_dash_dash () = 788 + match Stream.consume t.stream with 789 + | Some '-' -> 790 + emit_char t '-' 791 + | Some '<' -> 792 + t.state <- State.Script_data_escaped_less_than_sign 793 + | Some '>' -> 794 + t.state <- State.Script_data; 795 + emit_char t '>' 796 + | Some '\x00' -> 797 + error t "unexpected-null-character"; 798 + t.state <- State.Script_data_escaped; 799 + emit_str t "\xEF\xBF\xBD" 800 + | Some c -> 801 + t.state <- State.Script_data_escaped; 802 + emit_char t c 803 + | None -> () 804 + 805 + and state_script_data_escaped_less_than_sign () = 806 + match Stream.peek t.stream with 807 + | Some '/' -> 808 + Stream.advance t.stream; 809 + Buffer.clear t.temp_buffer; 810 + t.state <- State.Script_data_escaped_end_tag_open 811 + | Some c when is_ascii_alpha c -> 812 + Buffer.clear t.temp_buffer; 813 + emit_char t '<'; 814 + t.state <- State.Script_data_double_escape_start 815 + | _ -> 816 + emit_char t '<'; 
817 + t.state <- State.Script_data_escaped 818 + 819 + and state_script_data_escaped_end_tag_open () = 820 + match Stream.peek t.stream with 821 + | Some c when is_ascii_alpha c -> 822 + start_new_tag t Token.End; 823 + t.state <- State.Script_data_escaped_end_tag_name 824 + | _ -> 825 + emit_str t "</"; 826 + t.state <- State.Script_data_escaped 827 + 828 + and state_script_data_escaped_end_tag_name () = 829 + match Stream.peek t.stream with 830 + | Some ('\t' | '\n' | '\x0C' | ' ') when is_appropriate_end_tag t -> 831 + Stream.advance t.stream; 832 + t.state <- State.Before_attribute_name 833 + | Some '/' when is_appropriate_end_tag t -> 834 + Stream.advance t.stream; 835 + t.state <- State.Self_closing_start_tag 836 + | Some '>' when is_appropriate_end_tag t -> 837 + Stream.advance t.stream; 838 + t.state <- State.Data; 839 + emit_current_tag () 840 + | Some c when is_ascii_alpha c -> 841 + Stream.advance t.stream; 842 + Buffer.add_char t.current_tag_name (ascii_lower c); 843 + Buffer.add_char t.temp_buffer c 844 + | _ -> 845 + emit_str t "</"; 846 + emit_str t (Buffer.contents t.temp_buffer); 847 + t.state <- State.Script_data_escaped 848 + 849 + and state_script_data_double_escape_start () = 850 + match Stream.peek t.stream with 851 + | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 852 + Stream.advance t.stream; 853 + let c = Option.get c_opt in 854 + if Buffer.contents t.temp_buffer = "script" then 855 + t.state <- State.Script_data_double_escaped 856 + else 857 + t.state <- State.Script_data_escaped; 858 + emit_char t c 859 + | Some c when is_ascii_alpha c -> 860 + Stream.advance t.stream; 861 + Buffer.add_char t.temp_buffer (ascii_lower c); 862 + emit_char t c 863 + | _ -> 864 + t.state <- State.Script_data_escaped 865 + 866 + and state_script_data_double_escaped () = 867 + match Stream.consume t.stream with 868 + | Some '-' -> 869 + t.state <- State.Script_data_double_escaped_dash; 870 + emit_char t '-' 871 + | Some '<' -> 872 + t.state <- 
State.Script_data_double_escaped_less_than_sign; 873 + emit_char t '<' 874 + | Some '\x00' -> 875 + error t "unexpected-null-character"; 876 + emit_str t "\xEF\xBF\xBD" 877 + | Some c -> 878 + emit_char t c 879 + | None -> () 880 + 881 + and state_script_data_double_escaped_dash () = 882 + match Stream.consume t.stream with 883 + | Some '-' -> 884 + t.state <- State.Script_data_double_escaped_dash_dash; 885 + emit_char t '-' 886 + | Some '<' -> 887 + t.state <- State.Script_data_double_escaped_less_than_sign; 888 + emit_char t '<' 889 + | Some '\x00' -> 890 + error t "unexpected-null-character"; 891 + t.state <- State.Script_data_double_escaped; 892 + emit_str t "\xEF\xBF\xBD" 893 + | Some c -> 894 + t.state <- State.Script_data_double_escaped; 895 + emit_char t c 896 + | None -> () 897 + 898 + and state_script_data_double_escaped_dash_dash () = 899 + match Stream.consume t.stream with 900 + | Some '-' -> 901 + emit_char t '-' 902 + | Some '<' -> 903 + t.state <- State.Script_data_double_escaped_less_than_sign; 904 + emit_char t '<' 905 + | Some '>' -> 906 + t.state <- State.Script_data; 907 + emit_char t '>' 908 + | Some '\x00' -> 909 + error t "unexpected-null-character"; 910 + t.state <- State.Script_data_double_escaped; 911 + emit_str t "\xEF\xBF\xBD" 912 + | Some c -> 913 + t.state <- State.Script_data_double_escaped; 914 + emit_char t c 915 + | None -> () 916 + 917 + and state_script_data_double_escaped_less_than_sign () = 918 + match Stream.peek t.stream with 919 + | Some '/' -> 920 + Stream.advance t.stream; 921 + Buffer.clear t.temp_buffer; 922 + t.state <- State.Script_data_double_escape_end; 923 + emit_char t '/' 924 + | _ -> 925 + t.state <- State.Script_data_double_escaped 926 + 927 + and state_script_data_double_escape_end () = 928 + match Stream.peek t.stream with 929 + | Some ('\t' | '\n' | '\x0C' | ' ' | '/' | '>') as c_opt -> 930 + Stream.advance t.stream; 931 + let c = Option.get c_opt in 932 + if Buffer.contents t.temp_buffer = "script" then 933 
+ t.state <- State.Script_data_escaped 934 + else 935 + t.state <- State.Script_data_double_escaped; 936 + emit_char t c 937 + | Some c when is_ascii_alpha c -> 938 + Stream.advance t.stream; 939 + Buffer.add_char t.temp_buffer (ascii_lower c); 940 + emit_char t c 941 + | _ -> 942 + t.state <- State.Script_data_double_escaped 943 + 944 + and state_before_attribute_name () = 945 + match Stream.peek t.stream with 946 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 947 + Stream.advance t.stream 948 + | Some '/' | Some '>' | None -> 949 + t.state <- State.After_attribute_name 950 + | Some '=' -> 951 + Stream.advance t.stream; 952 + error t "unexpected-equals-sign-before-attribute-name"; 953 + start_new_attribute t; 954 + Buffer.add_char t.current_attr_name '='; 955 + t.state <- State.Attribute_name 956 + | Some _ -> 957 + start_new_attribute t; 958 + t.state <- State.Attribute_name 959 + 960 + and state_attribute_name () = 961 + match Stream.peek t.stream with 962 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 963 + Stream.advance t.stream; 964 + t.state <- State.After_attribute_name 965 + | Some '/' | Some '>' | None -> 966 + t.state <- State.After_attribute_name 967 + | Some '=' -> 968 + Stream.advance t.stream; 969 + t.state <- State.Before_attribute_value 970 + | Some '\x00' -> 971 + Stream.advance t.stream; 972 + error t "unexpected-null-character"; 973 + Buffer.add_string t.current_attr_name "\xEF\xBF\xBD" 974 + | Some ('"' | '\'' | '<') as c_opt -> 975 + Stream.advance t.stream; 976 + error t "unexpected-character-in-attribute-name"; 977 + Buffer.add_char t.current_attr_name (Option.get c_opt) 978 + | Some c -> 979 + Stream.advance t.stream; 980 + Buffer.add_char t.current_attr_name (ascii_lower c) 981 + 982 + and state_after_attribute_name () = 983 + match Stream.peek t.stream with 984 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 985 + Stream.advance t.stream 986 + | Some '/' -> 987 + Stream.advance t.stream; 988 + t.state <- State.Self_closing_start_tag 989 + | Some '=' -> 990 
+ Stream.advance t.stream; 991 + t.state <- State.Before_attribute_value 992 + | Some '>' -> 993 + Stream.advance t.stream; 994 + t.state <- State.Data; 995 + emit_current_tag () 996 + | None -> () 997 + | Some _ -> 998 + start_new_attribute t; 999 + t.state <- State.Attribute_name 1000 + 1001 + and state_before_attribute_value () = 1002 + match Stream.peek t.stream with 1003 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1004 + Stream.advance t.stream 1005 + | Some '"' -> 1006 + Stream.advance t.stream; 1007 + t.state <- State.Attribute_value_double_quoted 1008 + | Some '\'' -> 1009 + Stream.advance t.stream; 1010 + t.state <- State.Attribute_value_single_quoted 1011 + | Some '>' -> 1012 + Stream.advance t.stream; 1013 + error t "missing-attribute-value"; 1014 + t.state <- State.Data; 1015 + emit_current_tag () 1016 + | _ -> 1017 + t.state <- State.Attribute_value_unquoted 1018 + 1019 + and state_attribute_value_double_quoted () = 1020 + match Stream.consume t.stream with 1021 + | Some '"' -> 1022 + t.state <- State.After_attribute_value_quoted 1023 + | Some '&' -> 1024 + t.return_state <- State.Attribute_value_double_quoted; 1025 + t.state <- State.Character_reference 1026 + | Some '\x00' -> 1027 + error t "unexpected-null-character"; 1028 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1029 + | Some c -> 1030 + Buffer.add_char t.current_attr_value c 1031 + | None -> () 1032 + 1033 + and state_attribute_value_single_quoted () = 1034 + match Stream.consume t.stream with 1035 + | Some '\'' -> 1036 + t.state <- State.After_attribute_value_quoted 1037 + | Some '&' -> 1038 + t.return_state <- State.Attribute_value_single_quoted; 1039 + t.state <- State.Character_reference 1040 + | Some '\x00' -> 1041 + error t "unexpected-null-character"; 1042 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1043 + | Some c -> 1044 + Buffer.add_char t.current_attr_value c 1045 + | None -> () 1046 + 1047 + and state_attribute_value_unquoted () = 1048 + match Stream.peek 
t.stream with 1049 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1050 + Stream.advance t.stream; 1051 + t.state <- State.Before_attribute_name 1052 + | Some '&' -> 1053 + Stream.advance t.stream; 1054 + t.return_state <- State.Attribute_value_unquoted; 1055 + t.state <- State.Character_reference 1056 + | Some '>' -> 1057 + Stream.advance t.stream; 1058 + t.state <- State.Data; 1059 + emit_current_tag () 1060 + | Some '\x00' -> 1061 + Stream.advance t.stream; 1062 + error t "unexpected-null-character"; 1063 + Buffer.add_string t.current_attr_value "\xEF\xBF\xBD" 1064 + | Some ('"' | '\'' | '<' | '=' | '`') as c_opt -> 1065 + Stream.advance t.stream; 1066 + error t "unexpected-character-in-unquoted-attribute-value"; 1067 + Buffer.add_char t.current_attr_value (Option.get c_opt) 1068 + | Some c -> 1069 + Stream.advance t.stream; 1070 + Buffer.add_char t.current_attr_value c 1071 + | None -> () 1072 + 1073 + and state_after_attribute_value_quoted () = 1074 + match Stream.peek t.stream with 1075 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1076 + Stream.advance t.stream; 1077 + t.state <- State.Before_attribute_name 1078 + | Some '/' -> 1079 + Stream.advance t.stream; 1080 + t.state <- State.Self_closing_start_tag 1081 + | Some '>' -> 1082 + Stream.advance t.stream; 1083 + t.state <- State.Data; 1084 + emit_current_tag () 1085 + | None -> () 1086 + | Some _ -> 1087 + error t "missing-whitespace-between-attributes"; 1088 + t.state <- State.Before_attribute_name 1089 + 1090 + and state_self_closing_start_tag () = 1091 + match Stream.peek t.stream with 1092 + | Some '>' -> 1093 + Stream.advance t.stream; 1094 + t.current_tag_self_closing <- true; 1095 + t.state <- State.Data; 1096 + emit_current_tag () 1097 + | None -> () 1098 + | Some _ -> 1099 + error t "unexpected-solidus-in-tag"; 1100 + t.state <- State.Before_attribute_name 1101 + 1102 + and state_bogus_comment () = 1103 + match Stream.consume t.stream with 1104 + | Some '>' -> 1105 + t.state <- State.Data; 1106 + 
emit_current_comment () 1107 + | Some '\x00' -> 1108 + error t "unexpected-null-character"; 1109 + Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1110 + | Some c -> 1111 + Buffer.add_char t.current_comment c 1112 + | None -> () 1113 + 1114 + and state_markup_declaration_open () = 1115 + if Stream.matches_ci t.stream "--" then begin 1116 + ignore (Stream.consume_exact_ci t.stream "--"); 1117 + Buffer.clear t.current_comment; 1118 + t.state <- State.Comment_start 1119 + end else if Stream.matches_ci t.stream "DOCTYPE" then begin 1120 + ignore (Stream.consume_exact_ci t.stream "DOCTYPE"); 1121 + t.state <- State.Doctype 1122 + end else if Stream.matches_ci t.stream "[CDATA[" then begin 1123 + ignore (Stream.consume_exact_ci t.stream "[CDATA["); 1124 + (* CDATA only allowed in foreign content *) 1125 + if S.adjusted_current_node_in_html_namespace t.sink then begin 1126 + error t "cdata-in-html-content"; 1127 + Buffer.clear t.current_comment; 1128 + Buffer.add_string t.current_comment "[CDATA["; 1129 + t.state <- State.Bogus_comment 1130 + end else 1131 + t.state <- State.Cdata_section 1132 + end else begin 1133 + error t "incorrectly-opened-comment"; 1134 + Buffer.clear t.current_comment; 1135 + t.state <- State.Bogus_comment 1136 + end 1137 + 1138 + and state_comment_start () = 1139 + match Stream.peek t.stream with 1140 + | Some '-' -> 1141 + Stream.advance t.stream; 1142 + t.state <- State.Comment_start_dash 1143 + | Some '>' -> 1144 + Stream.advance t.stream; 1145 + error t "abrupt-closing-of-empty-comment"; 1146 + t.state <- State.Data; 1147 + emit_current_comment () 1148 + | _ -> 1149 + t.state <- State.Comment 1150 + 1151 + and state_comment_start_dash () = 1152 + match Stream.peek t.stream with 1153 + | Some '-' -> 1154 + Stream.advance t.stream; 1155 + t.state <- State.Comment_end 1156 + | Some '>' -> 1157 + Stream.advance t.stream; 1158 + error t "abrupt-closing-of-empty-comment"; 1159 + t.state <- State.Data; 1160 + emit_current_comment () 1161 + | None 
-> () 1162 + | Some _ -> 1163 + Buffer.add_char t.current_comment '-'; 1164 + t.state <- State.Comment 1165 + 1166 + and state_comment () = 1167 + match Stream.consume t.stream with 1168 + | Some '<' -> 1169 + Buffer.add_char t.current_comment '<'; 1170 + t.state <- State.Comment_less_than_sign 1171 + | Some '-' -> 1172 + t.state <- State.Comment_end_dash 1173 + | Some '\x00' -> 1174 + error t "unexpected-null-character"; 1175 + Buffer.add_string t.current_comment "\xEF\xBF\xBD" 1176 + | Some c -> 1177 + Buffer.add_char t.current_comment c 1178 + | None -> () 1179 + 1180 + and state_comment_less_than_sign () = 1181 + match Stream.peek t.stream with 1182 + | Some '!' -> 1183 + Stream.advance t.stream; 1184 + Buffer.add_char t.current_comment '!'; 1185 + t.state <- State.Comment_less_than_sign_bang 1186 + | Some '<' -> 1187 + Stream.advance t.stream; 1188 + Buffer.add_char t.current_comment '<' 1189 + | _ -> 1190 + t.state <- State.Comment 1191 + 1192 + and state_comment_less_than_sign_bang () = 1193 + match Stream.peek t.stream with 1194 + | Some '-' -> 1195 + Stream.advance t.stream; 1196 + t.state <- State.Comment_less_than_sign_bang_dash 1197 + | _ -> 1198 + t.state <- State.Comment 1199 + 1200 + and state_comment_less_than_sign_bang_dash () = 1201 + match Stream.peek t.stream with 1202 + | Some '-' -> 1203 + Stream.advance t.stream; 1204 + t.state <- State.Comment_less_than_sign_bang_dash_dash 1205 + | _ -> 1206 + t.state <- State.Comment_end_dash 1207 + 1208 + and state_comment_less_than_sign_bang_dash_dash () = 1209 + match Stream.peek t.stream with 1210 + | Some '>' | None -> 1211 + t.state <- State.Comment_end 1212 + | Some _ -> 1213 + error t "nested-comment"; 1214 + t.state <- State.Comment_end 1215 + 1216 + and state_comment_end_dash () = 1217 + match Stream.peek t.stream with 1218 + | Some '-' -> 1219 + Stream.advance t.stream; 1220 + t.state <- State.Comment_end 1221 + | None -> () 1222 + | Some _ -> 1223 + Buffer.add_char t.current_comment '-'; 1224 + 
t.state <- State.Comment 1225 + 1226 + and state_comment_end () = 1227 + match Stream.peek t.stream with 1228 + | Some '>' -> 1229 + Stream.advance t.stream; 1230 + t.state <- State.Data; 1231 + emit_current_comment () 1232 + | Some '!' -> 1233 + Stream.advance t.stream; 1234 + t.state <- State.Comment_end_bang 1235 + | Some '-' -> 1236 + Stream.advance t.stream; 1237 + Buffer.add_char t.current_comment '-' 1238 + | None -> () 1239 + | Some _ -> 1240 + Buffer.add_string t.current_comment "--"; 1241 + t.state <- State.Comment 1242 + 1243 + and state_comment_end_bang () = 1244 + match Stream.peek t.stream with 1245 + | Some '-' -> 1246 + Stream.advance t.stream; 1247 + Buffer.add_string t.current_comment "--!"; 1248 + t.state <- State.Comment_end_dash 1249 + | Some '>' -> 1250 + Stream.advance t.stream; 1251 + error t "incorrectly-closed-comment"; 1252 + t.state <- State.Data; 1253 + emit_current_comment () 1254 + | None -> () 1255 + | Some _ -> 1256 + Buffer.add_string t.current_comment "--!"; 1257 + t.state <- State.Comment 1258 + 1259 + and state_doctype () = 1260 + match Stream.peek t.stream with 1261 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1262 + Stream.advance t.stream; 1263 + t.state <- State.Before_doctype_name 1264 + | Some '>' -> 1265 + t.state <- State.Before_doctype_name 1266 + | None -> () 1267 + | Some _ -> 1268 + error t "missing-whitespace-before-doctype-name"; 1269 + t.state <- State.Before_doctype_name 1270 + 1271 + and state_before_doctype_name () = 1272 + match Stream.peek t.stream with 1273 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1274 + Stream.advance t.stream 1275 + | Some '\x00' -> 1276 + Stream.advance t.stream; 1277 + error t "unexpected-null-character"; 1278 + start_new_doctype t; 1279 + t.current_doctype_name <- Some (Buffer.create 8); 1280 + Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD"; 1281 + t.state <- State.Doctype_name 1282 + | Some '>' -> 1283 + Stream.advance t.stream; 1284 + error t "missing-doctype-name"; 
1285 + start_new_doctype t; 1286 + t.current_doctype_force_quirks <- true; 1287 + t.state <- State.Data; 1288 + emit_current_doctype () 1289 + | None -> () 1290 + | Some c -> 1291 + Stream.advance t.stream; 1292 + start_new_doctype t; 1293 + t.current_doctype_name <- Some (Buffer.create 8); 1294 + Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c); 1295 + t.state <- State.Doctype_name 1296 + 1297 + and state_doctype_name () = 1298 + match Stream.consume t.stream with 1299 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1300 + t.state <- State.After_doctype_name 1301 + | Some '>' -> 1302 + t.state <- State.Data; 1303 + emit_current_doctype () 1304 + | Some '\x00' -> 1305 + error t "unexpected-null-character"; 1306 + Buffer.add_string (Option.get t.current_doctype_name) "\xEF\xBF\xBD" 1307 + | Some c -> 1308 + Buffer.add_char (Option.get t.current_doctype_name) (ascii_lower c) 1309 + | None -> () 1310 + 1311 + and state_after_doctype_name () = 1312 + match Stream.peek t.stream with 1313 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1314 + Stream.advance t.stream 1315 + | Some '>' -> 1316 + Stream.advance t.stream; 1317 + t.state <- State.Data; 1318 + emit_current_doctype () 1319 + | None -> () 1320 + | Some _ -> 1321 + if Stream.matches_ci t.stream "PUBLIC" then begin 1322 + ignore (Stream.consume_exact_ci t.stream "PUBLIC"); 1323 + t.state <- State.After_doctype_public_keyword 1324 + end else if Stream.matches_ci t.stream "SYSTEM" then begin 1325 + ignore (Stream.consume_exact_ci t.stream "SYSTEM"); 1326 + t.state <- State.After_doctype_system_keyword 1327 + end else begin 1328 + error t "invalid-character-sequence-after-doctype-name"; 1329 + t.current_doctype_force_quirks <- true; 1330 + t.state <- State.Bogus_doctype 1331 + end 1332 + 1333 + and state_after_doctype_public_keyword () = 1334 + match Stream.peek t.stream with 1335 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1336 + Stream.advance t.stream; 1337 + t.state <- State.Before_doctype_public_identifier 
1338 + | Some '"' -> 1339 + Stream.advance t.stream; 1340 + error t "missing-whitespace-after-doctype-public-keyword"; 1341 + t.current_doctype_public <- Some (Buffer.create 32); 1342 + t.state <- State.Doctype_public_identifier_double_quoted 1343 + | Some '\'' -> 1344 + Stream.advance t.stream; 1345 + error t "missing-whitespace-after-doctype-public-keyword"; 1346 + t.current_doctype_public <- Some (Buffer.create 32); 1347 + t.state <- State.Doctype_public_identifier_single_quoted 1348 + | Some '>' -> 1349 + Stream.advance t.stream; 1350 + error t "missing-doctype-public-identifier"; 1351 + t.current_doctype_force_quirks <- true; 1352 + t.state <- State.Data; 1353 + emit_current_doctype () 1354 + | None -> () 1355 + | Some _ -> 1356 + error t "missing-quote-before-doctype-public-identifier"; 1357 + t.current_doctype_force_quirks <- true; 1358 + t.state <- State.Bogus_doctype 1359 + 1360 + and state_before_doctype_public_identifier () = 1361 + match Stream.peek t.stream with 1362 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1363 + Stream.advance t.stream 1364 + | Some '"' -> 1365 + Stream.advance t.stream; 1366 + t.current_doctype_public <- Some (Buffer.create 32); 1367 + t.state <- State.Doctype_public_identifier_double_quoted 1368 + | Some '\'' -> 1369 + Stream.advance t.stream; 1370 + t.current_doctype_public <- Some (Buffer.create 32); 1371 + t.state <- State.Doctype_public_identifier_single_quoted 1372 + | Some '>' -> 1373 + Stream.advance t.stream; 1374 + error t "missing-doctype-public-identifier"; 1375 + t.current_doctype_force_quirks <- true; 1376 + t.state <- State.Data; 1377 + emit_current_doctype () 1378 + | None -> () 1379 + | Some _ -> 1380 + error t "missing-quote-before-doctype-public-identifier"; 1381 + t.current_doctype_force_quirks <- true; 1382 + t.state <- State.Bogus_doctype 1383 + 1384 + and state_doctype_public_identifier_double_quoted () = 1385 + match Stream.consume t.stream with 1386 + | Some '"' -> 1387 + t.state <- 
State.After_doctype_public_identifier 1388 + | Some '\x00' -> 1389 + error t "unexpected-null-character"; 1390 + Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1391 + | Some '>' -> 1392 + error t "abrupt-doctype-public-identifier"; 1393 + t.current_doctype_force_quirks <- true; 1394 + t.state <- State.Data; 1395 + emit_current_doctype () 1396 + | Some c -> 1397 + Buffer.add_char (Option.get t.current_doctype_public) c 1398 + | None -> () 1399 + 1400 + and state_doctype_public_identifier_single_quoted () = 1401 + match Stream.consume t.stream with 1402 + | Some '\'' -> 1403 + t.state <- State.After_doctype_public_identifier 1404 + | Some '\x00' -> 1405 + error t "unexpected-null-character"; 1406 + Buffer.add_string (Option.get t.current_doctype_public) "\xEF\xBF\xBD" 1407 + | Some '>' -> 1408 + error t "abrupt-doctype-public-identifier"; 1409 + t.current_doctype_force_quirks <- true; 1410 + t.state <- State.Data; 1411 + emit_current_doctype () 1412 + | Some c -> 1413 + Buffer.add_char (Option.get t.current_doctype_public) c 1414 + | None -> () 1415 + 1416 + and state_after_doctype_public_identifier () = 1417 + match Stream.peek t.stream with 1418 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1419 + Stream.advance t.stream; 1420 + t.state <- State.Between_doctype_public_and_system_identifiers 1421 + | Some '>' -> 1422 + Stream.advance t.stream; 1423 + t.state <- State.Data; 1424 + emit_current_doctype () 1425 + | Some '"' -> 1426 + Stream.advance t.stream; 1427 + error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1428 + t.current_doctype_system <- Some (Buffer.create 32); 1429 + t.state <- State.Doctype_system_identifier_double_quoted 1430 + | Some '\'' -> 1431 + Stream.advance t.stream; 1432 + error t "missing-whitespace-between-doctype-public-and-system-identifiers"; 1433 + t.current_doctype_system <- Some (Buffer.create 32); 1434 + t.state <- State.Doctype_system_identifier_single_quoted 1435 + | None -> () 1436 + | Some 
_ -> 1437 + error t "missing-quote-before-doctype-system-identifier"; 1438 + t.current_doctype_force_quirks <- true; 1439 + t.state <- State.Bogus_doctype 1440 + 1441 + and state_between_doctype_public_and_system_identifiers () = 1442 + match Stream.peek t.stream with 1443 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1444 + Stream.advance t.stream 1445 + | Some '>' -> 1446 + Stream.advance t.stream; 1447 + t.state <- State.Data; 1448 + emit_current_doctype () 1449 + | Some '"' -> 1450 + Stream.advance t.stream; 1451 + t.current_doctype_system <- Some (Buffer.create 32); 1452 + t.state <- State.Doctype_system_identifier_double_quoted 1453 + | Some '\'' -> 1454 + Stream.advance t.stream; 1455 + t.current_doctype_system <- Some (Buffer.create 32); 1456 + t.state <- State.Doctype_system_identifier_single_quoted 1457 + | None -> () 1458 + | Some _ -> 1459 + error t "missing-quote-before-doctype-system-identifier"; 1460 + t.current_doctype_force_quirks <- true; 1461 + t.state <- State.Bogus_doctype 1462 + 1463 + and state_after_doctype_system_keyword () = 1464 + match Stream.peek t.stream with 1465 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1466 + Stream.advance t.stream; 1467 + t.state <- State.Before_doctype_system_identifier 1468 + | Some '"' -> 1469 + Stream.advance t.stream; 1470 + error t "missing-whitespace-after-doctype-system-keyword"; 1471 + t.current_doctype_system <- Some (Buffer.create 32); 1472 + t.state <- State.Doctype_system_identifier_double_quoted 1473 + | Some '\'' -> 1474 + Stream.advance t.stream; 1475 + error t "missing-whitespace-after-doctype-system-keyword"; 1476 + t.current_doctype_system <- Some (Buffer.create 32); 1477 + t.state <- State.Doctype_system_identifier_single_quoted 1478 + | Some '>' -> 1479 + Stream.advance t.stream; 1480 + error t "missing-doctype-system-identifier"; 1481 + t.current_doctype_force_quirks <- true; 1482 + t.state <- State.Data; 1483 + emit_current_doctype () 1484 + | None -> () 1485 + | Some _ -> 1486 + error t 
"missing-quote-before-doctype-system-identifier"; 1487 + t.current_doctype_force_quirks <- true; 1488 + t.state <- State.Bogus_doctype 1489 + 1490 + and state_before_doctype_system_identifier () = 1491 + match Stream.peek t.stream with 1492 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1493 + Stream.advance t.stream 1494 + | Some '"' -> 1495 + Stream.advance t.stream; 1496 + t.current_doctype_system <- Some (Buffer.create 32); 1497 + t.state <- State.Doctype_system_identifier_double_quoted 1498 + | Some '\'' -> 1499 + Stream.advance t.stream; 1500 + t.current_doctype_system <- Some (Buffer.create 32); 1501 + t.state <- State.Doctype_system_identifier_single_quoted 1502 + | Some '>' -> 1503 + Stream.advance t.stream; 1504 + error t "missing-doctype-system-identifier"; 1505 + t.current_doctype_force_quirks <- true; 1506 + t.state <- State.Data; 1507 + emit_current_doctype () 1508 + | None -> () 1509 + | Some _ -> 1510 + error t "missing-quote-before-doctype-system-identifier"; 1511 + t.current_doctype_force_quirks <- true; 1512 + t.state <- State.Bogus_doctype 1513 + 1514 + and state_doctype_system_identifier_double_quoted () = 1515 + match Stream.consume t.stream with 1516 + | Some '"' -> 1517 + t.state <- State.After_doctype_system_identifier 1518 + | Some '\x00' -> 1519 + error t "unexpected-null-character"; 1520 + Buffer.add_string (Option.get t.current_doctype_system) "\xEF\xBF\xBD" 1521 + | Some '>' -> 1522 + error t "abrupt-doctype-system-identifier"; 1523 + t.current_doctype_force_quirks <- true; 1524 + t.state <- State.Data; 1525 + emit_current_doctype () 1526 + | Some c -> 1527 + Buffer.add_char (Option.get t.current_doctype_system) c 1528 + | None -> () 1529 + 1530 + and state_doctype_system_identifier_single_quoted () = 1531 + match Stream.consume t.stream with 1532 + | Some '\'' -> 1533 + t.state <- State.After_doctype_system_identifier 1534 + | Some '\x00' -> 1535 + error t "unexpected-null-character"; 1536 + Buffer.add_string (Option.get 
t.current_doctype_system) "\xEF\xBF\xBD" 1537 + | Some '>' -> 1538 + error t "abrupt-doctype-system-identifier"; 1539 + t.current_doctype_force_quirks <- true; 1540 + t.state <- State.Data; 1541 + emit_current_doctype () 1542 + | Some c -> 1543 + Buffer.add_char (Option.get t.current_doctype_system) c 1544 + | None -> () 1545 + 1546 + and state_after_doctype_system_identifier () = 1547 + match Stream.peek t.stream with 1548 + | Some ('\t' | '\n' | '\x0C' | ' ') -> 1549 + Stream.advance t.stream 1550 + | Some '>' -> 1551 + Stream.advance t.stream; 1552 + t.state <- State.Data; 1553 + emit_current_doctype () 1554 + | None -> () 1555 + | Some _ -> 1556 + error t "unexpected-character-after-doctype-system-identifier"; 1557 + t.state <- State.Bogus_doctype 1558 + 1559 + and state_bogus_doctype () = 1560 + match Stream.consume t.stream with 1561 + | Some '>' -> 1562 + t.state <- State.Data; 1563 + emit_current_doctype () 1564 + | Some '\x00' -> 1565 + error t "unexpected-null-character" 1566 + | Some _ -> () 1567 + | None -> () 1568 + 1569 + and state_cdata_section () = 1570 + match Stream.consume t.stream with 1571 + | Some ']' -> 1572 + t.state <- State.Cdata_section_bracket 1573 + | Some '\x00' -> 1574 + error t "unexpected-null-character"; 1575 + emit_str t "\xEF\xBF\xBD" 1576 + | Some c -> 1577 + emit_char t c 1578 + | None -> () 1579 + 1580 + and state_cdata_section_bracket () = 1581 + match Stream.peek t.stream with 1582 + | Some ']' -> 1583 + Stream.advance t.stream; 1584 + t.state <- State.Cdata_section_end 1585 + | _ -> 1586 + emit_char t ']'; 1587 + t.state <- State.Cdata_section 1588 + 1589 + and state_cdata_section_end () = 1590 + match Stream.peek t.stream with 1591 + | Some ']' -> 1592 + Stream.advance t.stream; 1593 + emit_char t ']' 1594 + | Some '>' -> 1595 + Stream.advance t.stream; 1596 + t.state <- State.Data 1597 + | _ -> 1598 + emit_str t "]]"; 1599 + t.state <- State.Cdata_section 1600 + 1601 + and state_character_reference () = 1602 + 
Buffer.clear t.temp_buffer; 1603 + Buffer.add_char t.temp_buffer '&'; 1604 + match Stream.peek t.stream with 1605 + | Some c when is_ascii_alnum c -> 1606 + t.state <- State.Named_character_reference 1607 + | Some '#' -> 1608 + Stream.advance t.stream; 1609 + Buffer.add_char t.temp_buffer '#'; 1610 + t.state <- State.Numeric_character_reference 1611 + | _ -> 1612 + flush_code_points_consumed_as_char_ref t; 1613 + t.state <- t.return_state 1614 + 1615 + and state_named_character_reference () = 1616 + (* Collect alphanumeric characters *) 1617 + let rec collect () = 1618 + match Stream.peek t.stream with 1619 + | Some c when is_ascii_alnum c -> 1620 + Stream.advance t.stream; 1621 + Buffer.add_char t.temp_buffer c; 1622 + collect () 1623 + | _ -> () 1624 + in 1625 + collect (); 1626 + 1627 + let has_semicolon = 1628 + match Stream.peek t.stream with 1629 + | Some ';' -> Stream.advance t.stream; Buffer.add_char t.temp_buffer ';'; true 1630 + | _ -> false 1631 + in 1632 + 1633 + (* Try to match entity - buffer contains "&name" or "&name;" *) 1634 + let buf_contents = Buffer.contents t.temp_buffer in 1635 + let name_start = 1 in (* Skip '&' *) 1636 + let name_end = String.length buf_contents - (if has_semicolon then 1 else 0) in 1637 + let entity_name = String.sub buf_contents name_start (name_end - name_start) in 1638 + 1639 + (* Try progressively shorter matches *) 1640 + (* Only match if: 1641 + 1. Full match with semicolon, OR 1642 + 2. 
Legacy entity (can be used without semicolon) *) 1643 + let rec try_match len = 1644 + if len <= 0 then None 1645 + else 1646 + let prefix = String.sub entity_name 0 len in 1647 + let is_full = len = String.length entity_name in 1648 + let would_have_semi = has_semicolon && is_full in 1649 + (* Only use this match if it has semicolon or is a legacy entity *) 1650 + if would_have_semi || Html5rw_entities.is_legacy prefix then 1651 + match Html5rw_entities.lookup prefix with 1652 + | Some decoded -> Some (decoded, len) 1653 + | None -> try_match (len - 1) 1654 + else 1655 + try_match (len - 1) 1656 + in 1657 + 1658 + match try_match (String.length entity_name) with 1659 + | Some (decoded, matched_len) -> 1660 + let full_match = matched_len = String.length entity_name in 1661 + let ends_with_semi = has_semicolon && full_match in 1662 + 1663 + (* Check attribute context restrictions *) 1664 + let in_attribute = match t.return_state with 1665 + | State.Attribute_value_double_quoted 1666 + | State.Attribute_value_single_quoted 1667 + | State.Attribute_value_unquoted -> true 1668 + | _ -> false 1669 + in 1670 + 1671 + let next_char = 1672 + if full_match && not has_semicolon then 1673 + Stream.peek t.stream 1674 + else if not full_match then 1675 + Some entity_name.[matched_len] 1676 + else None 1677 + in 1678 + 1679 + let blocked = in_attribute && not ends_with_semi && 1680 + match next_char with 1681 + | Some '=' -> true 1682 + | Some c when is_ascii_alnum c -> true 1683 + | _ -> false 1684 + in 1685 + 1686 + if blocked then begin 1687 + flush_code_points_consumed_as_char_ref t; 1688 + t.state <- t.return_state 1689 + end else begin 1690 + if not ends_with_semi then 1691 + error t "missing-semicolon-after-character-reference"; 1692 + Buffer.clear t.temp_buffer; 1693 + Buffer.add_string t.temp_buffer decoded; 1694 + flush_code_points_consumed_as_char_ref t; 1695 + (* Emit unconsumed chars after partial match *) 1696 + if not full_match then begin 1697 + let unconsumed = 
String.sub entity_name matched_len (String.length entity_name - matched_len) in 1698 + emit_str t unconsumed; 1699 + (* If there was a semicolon in input but we didn't use the full match, emit the semicolon too *) 1700 + if has_semicolon then 1701 + emit_char t ';' 1702 + end; 1703 + t.state <- t.return_state 1704 + end 1705 + | None -> 1706 + (* No match - check if we should report ambiguous ampersand *) 1707 + if String.length entity_name > 0 then begin 1708 + t.state <- State.Ambiguous_ampersand; 1709 + (* Reset position - we need to emit the ampersand and chars *) 1710 + flush_code_points_consumed_as_char_ref t 1711 + end else begin 1712 + flush_code_points_consumed_as_char_ref t; 1713 + t.state <- t.return_state 1714 + end 1715 + 1716 + and state_ambiguous_ampersand () = 1717 + match Stream.peek t.stream with 1718 + | Some c when is_ascii_alnum c -> 1719 + Stream.advance t.stream; 1720 + (match t.return_state with 1721 + | State.Attribute_value_double_quoted 1722 + | State.Attribute_value_single_quoted 1723 + | State.Attribute_value_unquoted -> 1724 + Buffer.add_char t.current_attr_value c 1725 + | _ -> 1726 + emit_char t c) 1727 + | Some ';' -> 1728 + error t "unknown-named-character-reference"; 1729 + t.state <- t.return_state 1730 + | _ -> 1731 + t.state <- t.return_state 1732 + 1733 + and state_numeric_character_reference () = 1734 + t.char_ref_code <- 0; 1735 + match Stream.peek t.stream with 1736 + | Some (('x' | 'X') as c) -> 1737 + Stream.advance t.stream; 1738 + Buffer.add_char t.temp_buffer c; 1739 + t.state <- State.Hexadecimal_character_reference_start 1740 + | _ -> 1741 + t.state <- State.Decimal_character_reference_start 1742 + 1743 + and state_hexadecimal_character_reference_start () = 1744 + match Stream.peek t.stream with 1745 + | Some c when is_ascii_hex c -> 1746 + t.state <- State.Hexadecimal_character_reference 1747 + | _ -> 1748 + error t "absence-of-digits-in-numeric-character-reference"; 1749 + flush_code_points_consumed_as_char_ref t; 
1750 + t.state <- t.return_state 1751 + 1752 + and state_decimal_character_reference_start () = 1753 + match Stream.peek t.stream with 1754 + | Some c when is_ascii_digit c -> 1755 + t.state <- State.Decimal_character_reference 1756 + | _ -> 1757 + error t "absence-of-digits-in-numeric-character-reference"; 1758 + flush_code_points_consumed_as_char_ref t; 1759 + t.state <- t.return_state 1760 + 1761 + and state_hexadecimal_character_reference () = 1762 + match Stream.peek t.stream with 1763 + | Some c when is_ascii_digit c -> 1764 + Stream.advance t.stream; 1765 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code '0'); 1766 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1767 + | Some c when c >= 'A' && c <= 'F' -> 1768 + Stream.advance t.stream; 1769 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'A' + 10); 1770 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1771 + | Some c when c >= 'a' && c <= 'f' -> 1772 + Stream.advance t.stream; 1773 + t.char_ref_code <- t.char_ref_code * 16 + (Char.code c - Char.code 'a' + 10); 1774 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1775 + | Some ';' -> 1776 + Stream.advance t.stream; 1777 + t.state <- State.Numeric_character_reference_end 1778 + | _ -> 1779 + error t "missing-semicolon-after-character-reference"; 1780 + t.state <- State.Numeric_character_reference_end 1781 + 1782 + and state_decimal_character_reference () = 1783 + match Stream.peek t.stream with 1784 + | Some c when is_ascii_digit c -> 1785 + Stream.advance t.stream; 1786 + t.char_ref_code <- t.char_ref_code * 10 + (Char.code c - Char.code '0'); 1787 + if t.char_ref_code > 0x10FFFF then t.char_ref_code <- 0x10FFFF + 1 1788 + | Some ';' -> 1789 + Stream.advance t.stream; 1790 + t.state <- State.Numeric_character_reference_end 1791 + | _ -> 1792 + error t "missing-semicolon-after-character-reference"; 1793 + t.state <- State.Numeric_character_reference_end 1794 
and state_numeric_character_reference_end () =
  (* Final step of a numeric character reference: validate the accumulated
     code point in [t.char_ref_code], report the matching parse error, pick
     the UTF-8 text to substitute, and hand it to
     [flush_code_points_consumed_as_char_ref] before resuming in
     [t.return_state]. *)
  let cp = t.char_ref_code in
  (* UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER. *)
  let fffd = "\xEF\xBF\xBD" in
  let utf8 n = Html5rw_entities.Numeric_ref.codepoint_to_utf8 n in
  (* U+FDD0..U+FDEF, or any code point whose low 16 bits are FFFE/FFFF.
     Only consulted after the range checks below, i.e. for cp <= 0x10FFFF. *)
  let is_noncharacter =
    (cp >= 0xFDD0 && cp <= 0xFDEF) || (cp land 0xFFFE) = 0xFFFE
  in
  (* Control characters other than tab, line feed and form feed. *)
  let is_flagged_control =
    (cp >= 0x01 && cp <= 0x08)
    || cp = 0x0B
    || (cp >= 0x0D && cp <= 0x1F)
    || (cp >= 0x7F && cp <= 0x9F)
  in
  let substituted =
    if cp = 0 then
      (error t "null-character-reference"; fffd)
    else if cp > 0x10FFFF then
      (error t "character-reference-outside-unicode-range"; fffd)
    else if cp >= 0xD800 && cp <= 0xDFFF then
      (error t "surrogate-character-reference"; fffd)
    else if is_noncharacter then
      (* Reported, but the code point itself is still emitted. *)
      (error t "noncharacter-character-reference"; utf8 cp)
    else if is_flagged_control then begin
      error t "control-character-reference";
      (* Apply Windows-1252 replacement table for 0x80-0x9F; code points
         without an entry are emitted unchanged. *)
      let remapped = Html5rw_entities.Numeric_ref.find_replacement cp in
      utf8 (Option.value remapped ~default:cp)
    end else
      utf8 cp
  in
  (* The temp buffer now holds the decoded text instead of the raw
     "&#..." characters, so the flush emits the decoded form. *)
  Buffer.clear t.temp_buffer;
  Buffer.add_string t.temp_buffer substituted;
  flush_code_points_consumed_as_char_ref t;
  t.state <- t.return_state

in
process_state ()

(* Recorded errors, with the stored list reversed (presumably restoring the
   order in which they were reported; the accumulator is not visible here). *)
let get_errors t = List.rev t.errors

(* Force the tokenizer into [state]. *)
let set_state t state = t.state <- state

(* Overwrite the remembered name of the last start tag. *)
let set_last_start_tag t name = t.last_start_tag <- name
+9
test/adoption_test.ml
···
(* Adoption-agency smoke test: parses a fixed snippet and prints the
   resulting tree in html5lib test format, followed by a blank line. *)
open Bytesrw

module Parser = Html5rw_parser

let markup = "<a><svg><tr><input></a>"

let () =
  print_endline ("=== Test: " ^ markup ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  let rendered = Html5rw_dom.to_test_format (Parser.root parsed) in
  print_endline rendered;
  print_newline ()
+12
test/debug2.ml
···
(* Debug helper for the adoption agency algorithm: parses a fixed snippet
   with error collection enabled and prints the tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<p><b>One<p>Two"

let () =
  (* The document is parsed before anything is printed. *)
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string markup) in
  List.iter print_endline [ "Input: " ^ markup; "Result:" ];
  print_endline (Dom.to_test_format (Parser.root parsed))
+12
test/debug_fragment.ml
···
(* Debug helper: parses a minimal table snippet with error collection
   enabled and prints the tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<table><th>"

let () =
  (* The document is parsed before anything is printed. *)
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string markup) in
  List.iter print_endline [ "Input: " ^ markup; "Result:" ];
  print_endline (Dom.to_test_format (Parser.root parsed))
+28
test/debug_svg.ml
···
(* Debug helper: parses an SVG snippet as a fragment in a <td> context, then
   prints a raw tree dump, the html5lib rendering and the expected output. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print [node] and all of its descendants, one node per line, indenting by
   two spaces per level of [depth]. Nodes without a namespace are reported
   as "html". *)
let rec print_tree depth node =
  let ns = Option.value node.Dom.namespace ~default:"html" in
  let pad = String.make (depth * 2) ' ' in
  Printf.printf "%s%s (ns=%s)\n" pad node.Dom.name ns;
  List.iter (fun child -> print_tree (depth + 1) child) node.Dom.children

let () =
  let input = "<svg><tr><td><title><tr>" in
  let fragment_context = Parser.make_fragment_context ~tag_name:"td" () in
  let parsed =
    Parser.parse ~collect_errors:true ~fragment_context
      (Bytes.Reader.of_string input)
  in
  print_endline ("Input: " ^ input);
  print_endline "Tree structure:";
  print_tree 0 (Parser.root parsed);
  print_endline "";
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed));
  print_endline "";
  List.iter print_endline
    [ "Expected:";
      "| <svg svg>";
      "| <svg tr>";
      "| <svg td>";
      "| <svg title>" ]
+20
test/debug_title.ml
···
(* Debug helper: parses a <title> whose text contains a comment-like
   sequence, then prints the actual tree followed by the expected one. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<!doctype html><title> <!-- </title>--> x"

(* Expected html5lib rendering, one output line per element. *)
let expected_lines =
  [ "| <!DOCTYPE html>";
    "| <html>";
    "| <head>";
    "| <title>";
    "| \" <!-- \"";
    "| <body>";
    "| \"--> x\"" ]

let () =
  (* The document is parsed before anything is printed. *)
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string markup) in
  print_endline ("Input: " ^ markup);
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed));
  print_endline "";
  print_endline "Expected:";
  List.iter print_endline expected_lines
+90
test/dune
···
; Test runner and ad-hoc debug programs for the HTML5 parser.
;
; Every program is a single-module executable with the same dependencies, so
; they are declared together in one stanza.  The explicit (modules ...) list
; is required: several stanzas sharing a directory without it make dune fail
; with "Module ... is used in several stanzas".  It also keeps source files
; that are not listed as executables out of the build.
(executables
 (names
  test_html5lib
  debug_fragment
  debug_svg
  test_table
  test_debug
  test_frameset
  test_whitespace
  test_mi
  test_table_svg
  quick_test
  simple_test
  html_frag_test
  svg_frag_test
  nobr_test
  nobr_debug
  select_debug
  template_debug
  template_debug2
  script_eof_test
  entity_test
  entity_dup_test
  script_attr_test
  frag_debug
  frag_debug2
  frag_debug3
  adoption_test
  template_debug3
  template_debug4
  ns_sens_test)
 (modules
  test_html5lib
  debug_fragment
  debug_svg
  test_table
  test_debug
  test_frameset
  test_whitespace
  test_mi
  test_table_svg
  quick_test
  simple_test
  html_frag_test
  svg_frag_test
  nobr_test
  nobr_debug
  select_debug
  template_debug
  template_debug2
  script_eof_test
  entity_test
  entity_dup_test
  script_attr_test
  frag_debug
  frag_debug2
  frag_debug3
  adoption_test
  template_debug3
  template_debug4
  ns_sens_test)
 (libraries bytesrw html5rw.parser html5rw.dom))
+20
test/entity_dup_test.ml
···
(* Entity case-sensitivity check: parses "&AMp;" and "&amp;" and prints the
   resulting trees in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print a banner for [input], parse it with error collection enabled and
   print the resulting tree. *)
let show input =
  print_endline ("=== Test: " ^ input ^ " ===");
  print_endline ("Input: " ^ input);
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))

let () =
  show "&AMp;";
  print_endline "";
  show "&amp;"
+28
test/entity_test.ml
···
(* Character-reference edge cases: a lone ampersand, a decimal reference
   without a semicolon and an incomplete hexadecimal reference. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print [banner] and [input], parse [input] with error collection enabled
   and print the resulting tree. *)
let show banner input =
  print_endline banner;
  print_endline ("Input: " ^ input);
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))

let () =
  show "=== Test 1: Single & ===" "&";
  print_endline "";
  show "=== Test 2: &#45 (decimal ref) ===" "&#45";
  print_endline "";
  show "=== Test 3: &#X (hex ref incomplete) ===" "&#X"
+39
test/frag_debug.ml
···
(* Debug helper covering several tree-construction cases: templates with the
   adoption agency algorithm, foreignObject/math and namespace sensitivity. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print [banner] and the input line, parse [markup], optionally print
   [label], then print the tree in html5lib format. *)
let show ?label banner markup =
  print_endline banner;
  print_endline ("Input: " ^ markup);
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  Option.iter print_endline label;
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed))

let () =
  (* Test 77 - template with adoption agency *)
  show ~label:"Actual:" "=== Template test 77 ==="
    "<body><template><i><menu>Foo</i>";
  print_newline ();

  (* Template with ordinary content *)
  show "=== Simpler template test ===" "<template><i>X</i></template>";
  print_newline ();

  (* Same markup as test 77, minus the template *)
  show "=== Without template ===" "<i><menu>Foo</i>";
  print_newline ();

  (* Test 31 - foreignObject/math *)
  show "=== Test 31 - foreignObject ==="
    "<div><svg><path><foreignObject><math></div>a";
  print_newline ();

  (* Namespace-sensitivity test; no trailing blank line after the last case *)
  show "=== Namespace sensitivity ==="
    "<body><table><tr><td><svg><td><foreignObject><span></td>Foo"
+40
test/frag_debug2.ml
···
(* Debug helper for end-tag handling inside SVG/foreignObject content and for
   the adoption agency algorithm inside and outside <template>. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print a banner built from [markup] (plus the optional [note] suffix),
   parse [markup], print the tree and finish with a blank line. *)
let show ?(note = "") markup =
  print_endline ("=== Test: " ^ markup ^ note ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* svg end tag handling *)
  show "<div><svg></div>";
  (* foreignObject text integration *)
  show "<div><svg><foreignObject></div>";
  (* math inside foreignObject with end tag *)
  show "<div><svg><foreignObject><math></div>a";
  (* without path element *)
  show "<div><svg><foreignObject><b></div>text";
  (* template adoption agency test *)
  show "<template><b><menu>text</b>";
  (* without template, for comparison *)
  show ~note:" (no template)" "<b><menu>text</b>"
+34
test/frag_debug3.ml
···
(* Debug helper for SVG subtrees, including the foreignObject/math case from
   html5lib test 31. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print a banner for [markup], parse it, print the tree and finish with a
   blank line. *)
let show markup =
  print_endline ("=== Test: " ^ markup ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* Simple svg with child *)
  show "<svg><path></path></svg>text";

  (* The failing test - foreignObject inside svg.
     Expected output for test 31:
     <html>
     <head>
     <body>
     <div>
     <svg svg>
     <svg path>
     <svg foreignObject>
     <math math>
     "a" *)
  show "<div><svg><path><foreignObject><math></div>a";

  (* Simple svg structure *)
  show "<svg><rect/><circle/></svg>"
+10
test/html_frag_test.ml
···
(* Fragment smoke test: parses plain text in a <div> fragment context and
   prints the resulting tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let fragment_context = Parser.make_fragment_context ~tag_name:"div" () in
  let reader = Bytes.Reader.of_string "Hello" in
  let parsed = Parser.parse ~collect_errors:true ~fragment_context reader in
  print_endline (Dom.to_test_format (Parser.root parsed))
+22
test/nobr_debug.ml
···
(* Debug helper: parses "<nobr>X" as a fragment in an SVG <path> context and
   prints both a raw tree dump and the html5lib rendering. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print [node] and its descendants, one per line, prefixing each line with
   [indent] and extending the prefix by one space per level. Nodes without a
   namespace are reported as "html". *)
let rec print_tree indent node =
  let kids = node.Dom.children in
  let ns = Option.value node.Dom.namespace ~default:"html" in
  Printf.printf "%s%s (ns=%s, %d children)\n" indent node.Dom.name ns
    (List.length kids);
  let deeper = indent ^ " " in
  List.iter (print_tree deeper) kids

let () =
  let input = "<nobr>X" in
  print_endline "Starting...";
  let fragment_context =
    Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") ()
  in
  let parsed =
    Parser.parse ~collect_errors:true ~fragment_context
      (Bytes.Reader.of_string input)
  in
  print_endline "\nFinal tree structure:";
  print_tree "" (Parser.root parsed);
  print_endline "\nTest format:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+23
test/nobr_debug2.ml
···
(* Debug helper: builds a tree-builder state directly for an SVG <path>
   fragment context and prints its initial document and open-element stack. *)
module Parser = Html5rw_parser
module Dom = Html5rw_dom
module Tb = Parser.Tree_builder

(* Print [node] and its descendants, one per line, prefixing each line with
   [indent] and extending the prefix by one space per level. Nodes without a
   namespace are reported as "html". *)
let rec print_tree indent node =
  let kids = node.Dom.children in
  let ns = Option.value node.Dom.namespace ~default:"html" in
  Printf.printf "%s%s (ns=%s, %d children)\n" indent node.Dom.name ns
    (List.length kids);
  let deeper = indent ^ " " in
  List.iter (print_tree deeper) kids

let () =
  let input = "<nobr>X" in
  print_endline "Starting...";
  let context = { Tb.tag_name = "path"; namespace = Some "svg" } in

  (* Create parser state directly for inspection *)
  let builder = Tb.create ~collect_errors:true ~fragment_context:context input in
  print_endline "\nInitial tree structure:";
  print_tree "" builder.Tb.document;
  print_endline "\nInitial stack size:";
  let stack = builder.Tb.open_elements in
  Printf.printf "%d elements\n" (List.length stack);
  List.iter (fun el -> Printf.printf " - %s\n" el.Dom.name) stack
+13
test/nobr_test.ml
···
(* Fragment smoke test with progress output: parses "<nobr>X" in an SVG
   <path> fragment context and prints the tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<nobr>X"

let () =
  print_endline "Starting...";
  let fragment_context =
    Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") ()
  in
  print_endline "Created context";
  let parsed =
    Parser.parse ~collect_errors:true ~fragment_context
      (Bytes.Reader.of_string markup)
  in
  print_endline "Parsed";
  print_endline (Dom.to_test_format (Parser.root parsed))
+35
test/ns_sens_test.ml
···
(* Namespace-sensitivity checks: </td> end tags seen while inside SVG
   foreignObject content within a table cell. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print a banner for [markup], parse it, print the tree and finish with a
   blank line. *)
let show markup =
  print_endline ("=== Test: " ^ markup ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* Expected:
     <html>
     <head>
     <body>
     "Foo"
     <table>
     <tbody>
     <tr>
     <td>
     <svg svg>
     <svg td>
     <svg foreignObject>
     <span> *)
  show "<body><table><tr><td><svg><td><foreignObject><span></td>Foo";

  (* Simpler variants of the same situation *)
  show "<table><td><svg><foreignObject></td>text";
  show "<table><td></td>text"
+10
test/quick_test.ml
···
(* Quick check: parses "<nobr>X" in an SVG <path> fragment context and
   prints the tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let fragment_context =
    Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") ()
  in
  let reader = Bytes.Reader.of_string "<nobr>X" in
  let parsed = Parser.parse ~collect_errors:true ~fragment_context reader in
  print_endline (Dom.to_test_format (Parser.root parsed))
+22
test/script_attr_test.ml
···
(* Script-element checks: a script start tag with an attribute, once with an
   end tag cut off by end of input and once complete. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print [banner] and [input], parse [input] with error collection enabled
   and print the resulting tree. *)
let show banner input =
  print_endline banner;
  print_endline ("Input: " ^ input);
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))

let () =
  (* Incomplete script end tag *)
  show "=== Test: script tag with attribute at incomplete end ==="
    "<!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt";
  print_endline "";

  (* Simpler, complete case *)
  show "=== Test: Complete script tag with attribute ==="
    "<script type=text>X</script>"
+12
test/script_eof_test.ml
···
(* Checks end-of-input handling right after a '<' inside a <script>. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<!doctype html><script><"

let () =
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+13
test/select_debug.ml
···
(* Debug helper for nested <select>/<option> handling mixed with formatting
   elements. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<select><b><option><select><option></b></select>X"

let () =
  List.iter print_endline [ "Input:"; markup; "" ];
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+9
test/simple_test.ml
···
(* Minimal smoke test: parses a single paragraph and prints the tree in
   html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  let reader = Bytes.Reader.of_string "<p>Hello</p>" in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline (Dom.to_test_format (Parser.root parsed))
+13
test/svg_frag_test.ml
···
(* Fragment smoke test with progress output: parses plain text in an SVG
   <path> fragment context and prints the tree in html5lib format. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "Hello"

let () =
  print_endline "Starting...";
  let fragment_context =
    Parser.make_fragment_context ~tag_name:"path" ~namespace:(Some "svg") ()
  in
  print_endline "Created context";
  let parsed =
    Parser.parse ~collect_errors:true ~fragment_context
      (Bytes.Reader.of_string markup)
  in
  print_endline "Parsed";
  print_endline (Dom.to_test_format (Parser.root parsed))
+21
test/template_debug.ml
···
(* Debug helper for two <template> cases taken from the html5lib template
   tests (numbers 45 and 91). *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Print [banner] and [input], parse [input] with error collection enabled
   and print the resulting tree. *)
let show banner input =
  print_endline banner;
  print_endline ("Input: " ^ input);
  let parsed = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))

let () =
  (* Template test 45: div inside tr inside template *)
  show "=== Test 1 ===" "<body><template><tr><div></div></tr></template>";

  (* Template test 91: select inside tbody inside nested template *)
  show "\n=== Test 2 ===" "<template><template><tbody><select>"
+13
test/template_debug2.ml
···
(* Debug helper: <i> followed by <menu> inside an unclosed <template>. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<template><i><menu>Foo"

let () =
  print_endline "=== Test: i then menu in template ===";
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Result:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+26
test/template_debug3.ml
···
(* Debug helper comparing adoption agency behaviour for <i><menu>Foo</i>
   inside and outside a <template>. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print a banner built from [markup] (plus the optional [note] suffix),
   parse [markup], print the tree and finish with a blank line. *)
let show ?(note = "") markup =
  print_endline ("=== Test: " ^ markup ^ note ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* Expected:
     <html>
     <head>
     <body>
     <template>
     content
     <i>
     <menu>
     <i>
     "Foo" *)
  show "<body><template><i><menu>Foo</i>";

  show ~note:" (without template)" "<i><menu>Foo</i>"
+29
test/template_debug4.ml
···
(* Debug helper for <template> elements that appear inside SVG content. *)
open Bytesrw

module Parser = Html5rw_parser

(* Print a banner for [markup], parse it, print the tree and finish with a
   blank line. *)
let show markup =
  print_endline ("=== Test: " ^ markup ^ " ===");
  let parsed = Parser.parse (Bytes.Reader.of_string markup) in
  print_endline (Html5rw_dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* Expected:
     <html>
     <head>
     <template>
     content
     <svg svg>
     <svg foo>
     <svg template>
     <svg foreignObject>
     <div>
     <body>
     <div> *)
  show "<template><svg><foo><template><foreignObject><div></template><div>";

  (* The SVG template on its own *)
  show "<svg><template><foreignObject><div></template>text"
+14
test/test_debug.ml
···
(* Debug helper for frameset handling. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* Echo [input], parse it with error collection enabled, print the tree in
   html5lib format and finish with a blank line. *)
let test input =
  Printf.printf "Input: %s\n%!" input;
  let reader = Bytes.Reader.of_string input in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline (Dom.to_test_format (Parser.root parsed));
  print_newline ()

let () =
  (* Frameset tests - exact test input *)
  test "<frameset></frameset>\nfoo"
+15
test/test_frameset.ml
···
(* Debug helper: parses a <frameset> that follows a <param> start tag and
   prints the tree, or the exception and its backtrace if parsing fails. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let () =
  (* Backtraces are only recorded once this is switched on (or when
     OCAMLRUNPARAM=b is set); without it the handler below has nothing to
     print. *)
  Printexc.record_backtrace true;
  let input = "<param><frameset></frameset>" in
  print_endline ("Input: " ^ input);
  try
    let result = Parser.parse ~collect_errors:true (Bytes.Reader.of_string input) in
    print_endline "Tree:";
    print_endline (Dom.to_test_format (Parser.root result))
  with e ->
    (* Capture the backtrace first: later calls in the handler could
       overwrite the recorded one. *)
    let backtrace = Printexc.get_raw_backtrace () in
    print_endline ("Exception: " ^ Printexc.to_string e);
    Printexc.print_raw_backtrace stdout backtrace
+201
test/test_html5lib.ml
···
(* Test runner for html5lib-tests tree construction tests.

   Usage: test_html5lib DIR

   Every [.dat] file directly inside DIR is split into test cases. Each case
   is parsed and the serialised tree is compared with the expected tree.
   Exit status: 0 when every case passes, 1 when at least one fails, 2 on a
   usage error. *)

open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

(* One test case read from a .dat file. *)
type test_case = {
  input : string;                    (* contents of the #data section *)
  expected_tree : string;            (* contents of the #document section *)
  expected_errors : string list;     (* non-blank lines of the #errors section *)
  script_on : bool;                  (* true when a #script-on section exists *)
  fragment_context : string option;  (* trimmed #document-fragment section *)
}

let _is_blank s = String.trim s = ""

(* A line opens a new section when its first character is '#'. *)
let is_section_header line = String.length line > 0 && line.[0] = '#'

(* Parse a single test case from lines.

   The lines are grouped into sections keyed by their (trimmed) header line.
   Sections are accumulated in reverse order, so when a header occurs more
   than once the last occurrence is the one that is looked up. Missing
   sections read as the empty string. *)
let parse_test_case lines =
  let rec parse acc = function
    | [] -> acc
    | line :: rest when is_section_header line ->
      let section = String.trim line in
      let content, remaining = collect_section rest in
      parse ((section, content) :: acc) remaining
    | _ :: rest -> parse acc rest
  and collect_section lines =
    (* Returns the lines up to (not including) the next header, and the rest. *)
    let rec loop acc = function
      | [] -> (List.rev acc, [])
      | line :: rest when is_section_header line -> (List.rev acc, line :: rest)
      | line :: rest -> loop (line :: acc) rest
    in
    loop [] lines
  in
  let sections = parse [] lines in

  let get_section name =
    match List.assoc_opt name sections with
    | Some lines -> String.concat "\n" lines
    | None -> ""
  in

  let data = get_section "#data" in
  let document = get_section "#document" in
  let errors_text = get_section "#errors" in
  let errors =
    String.split_on_char '\n' errors_text
    |> List.filter (fun s -> String.trim s <> "")
  in
  let script_on = List.mem_assoc "#script-on" sections in
  let fragment =
    if List.mem_assoc "#document-fragment" sections then
      Some (get_section "#document-fragment" |> String.trim)
    else None
  in

  {
    input = data;
    expected_tree = document;
    expected_errors = errors;
    script_on;
    fragment_context = fragment;
  }

(* Parse a .dat file into test cases.

   A new test starts at every "#data" line that directly follows an empty
   line; everything before the first such boundary forms the first test. *)
let parse_dat_file content =
  let lines = String.split_on_char '\n' content in
  let rec split_tests current acc = function
    | [] ->
      if current = [] then List.rev acc
      else List.rev (List.rev current :: acc)
    | "" :: "#data" :: rest ->
      (* End of current test, start new one *)
      let new_acc = if current = [] then acc else (List.rev current :: acc) in
      split_tests ["#data"] new_acc rest
    | line :: rest ->
      split_tests (line :: current) acc rest
  in
  let test_groups = split_tests [] [] lines in
  List.filter_map (fun lines ->
    if List.exists (fun l -> l = "#data") lines then
      Some (parse_test_case lines)
    else None
  ) test_groups

(* Strip the "| " prefix from each line and drop blank lines; lines without
   the prefix are kept unchanged. *)
let strip_tree_prefix s =
  let lines = String.split_on_char '\n' s in
  let stripped = List.filter_map (fun line ->
    if String.length line >= 2 && String.sub line 0 2 = "| " then
      Some (String.sub line 2 (String.length line - 2))
    else if String.trim line = "" then None
    else Some line
  ) lines in
  String.concat "\n" stripped

(* Normalize tree output for comparison by removing blank lines. *)
let normalize_tree s =
  let lines = String.split_on_char '\n' s in
  let non_empty = List.filter (fun l -> String.trim l <> "") lines in
  String.concat "\n" non_empty

(* Run one test case.

   Returns [(success, expected, actual, actual_error_count,
   expected_error_count)]. Any exception raised while parsing or serialising
   is reported as a failed test whose "actual" text describes the exception. *)
let run_test test =
  try
    let result =
      match test.fragment_context with
      | Some ctx_str ->
        (* Parse "namespace element" or just "element" *)
        let (namespace, tag_name) =
          match String.split_on_char ' ' ctx_str with
          | [ns; tag] when ns = "svg" -> (Some "svg", tag)
          | [ns; tag] when ns = "math" -> (Some "mathml", tag)
          | [tag] -> (None, tag)
          | _ -> (None, ctx_str)
        in
        let context = Parser.make_fragment_context ~tag_name ~namespace () in
        let reader = Bytes.Reader.of_string test.input in
        Parser.parse ~collect_errors:true ~fragment_context:context reader
      | None ->
        let reader = Bytes.Reader.of_string test.input in
        Parser.parse ~collect_errors:true reader
    in
    let actual_tree = Dom.to_test_format (Parser.root result) in
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    let actual = normalize_tree (strip_tree_prefix actual_tree) in
    (expected = actual, expected, actual,
     List.length (Parser.errors result), List.length test.expected_errors)
  with e ->
    let expected = normalize_tree (strip_tree_prefix test.expected_tree) in
    (false, expected, Printf.sprintf "EXCEPTION: %s" (Printexc.to_string e), 0, 0)

(* Read the whole of [path]. The channel is opened in binary mode so that
   [in_channel_length] matches the number of bytes read, and it is closed
   even when reading raises. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))

(* Run every test case of the .dat file at [path].

   Returns [(passed, failed, failures, filename)] where each failure is
   [(1-based test number, input, expected, actual)], in file order. *)
let run_file path =
  let content = read_file path in

  let tests = parse_dat_file content in
  let filename = Filename.basename path in

  let passed = ref 0 in
  let failed = ref 0 in
  let errors = ref [] in

  List.iteri (fun i test ->
    (* Skip script-on tests since we don't support scripting *)
    if test.script_on then
      ()
    else begin
      let (success, expected, actual, _actual_error_count, _expected_error_count) =
        run_test test
      in
      if success then
        incr passed
      else begin
        incr failed;
        errors := (i + 1, test.input, expected, actual) :: !errors
      end
    end
  ) tests;

  (!passed, !failed, List.rev !errors, filename)

let () =
  (* Fail with a usage message rather than an Invalid_argument exception
     when the directory argument is missing. *)
  if Array.length Sys.argv < 2 then begin
    prerr_endline ("Usage: " ^ Sys.argv.(0) ^ " <tree-construction test directory>");
    exit 2
  end;
  let test_dir = Sys.argv.(1) in
  let files = Sys.readdir test_dir |> Array.to_list in
  let dat_files = List.filter (fun f ->
    Filename.check_suffix f ".dat" &&
    not (String.contains f '/')  (* Skip subdirectories *)
  ) files in

  let total_passed = ref 0 in
  let total_failed = ref 0 in
  let all_errors = ref [] in

  List.iter (fun file ->
    let path = Filename.concat test_dir file in
    if Sys.is_directory path then () else begin
      let (passed, failed, errors, filename) = run_file path in
      total_passed := !total_passed + passed;
      total_failed := !total_failed + failed;
      if errors <> [] then
        all_errors := (filename, errors) :: !all_errors;
      Printf.printf "%s: %d passed, %d failed\n" filename passed failed
    end
  ) (List.sort String.compare dat_files);

  Printf.printf "\n=== Summary ===\n";
  Printf.printf "Total: %d passed, %d failed\n" !total_passed !total_failed;

  if !all_errors <> [] then begin
    Printf.printf "\n=== First failures ===\n";
    (* [all_errors] was built by prepending, so restore file order before
       taking the first ten files (and the first three failures of each). *)
    List.iter (fun (filename, errors) ->
      List.iter (fun (test_num, input, expected, actual) ->
        Printf.printf "\n--- %s test %d ---\n" filename test_num;
        Printf.printf "Input: %s\n" (String.escaped input);
        Printf.printf "Expected:\n%s\n" expected;
        Printf.printf "Actual:\n%s\n" actual
      ) (List.filteri (fun i _ -> i < 3) errors)
    ) (List.filteri (fun i _ -> i < 10) (List.rev !all_errors))
  end;

  exit (if !total_failed > 0 then 1 else 0)
+11
test/test_mi.ml
···
(* Debug helper: HTML <p>/<h1> start tags seen inside a MathML <mi>. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<!doctype html><p><math><mi><p><h1>"

let () =
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+9
test/test_table.ml
···
(* Debug helper: a </b> end tag seen after several nested elements were
   opened inside the <b>. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<b><em><foo><foo><aside></b>"

let () =
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline (Dom.to_test_format (Parser.root parsed))
+11
test/test_table_svg.ml
···
(* Debug helper: a <td> start tag seen inside an SVG <desc> within a table
   cell. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<table><tr><td><svg><desc><td></desc><circle>"

let () =
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root parsed))
+11
test/test_whitespace.ml
···
(* Debug helper: comment-like text and repeated end tags around <style>. *)
open Bytesrw

module Parser = Html5rw_parser
module Dom = Html5rw_dom

let markup = "<style> <!-- </style> --> </style>x"

let () =
  print_endline ("Input: " ^ markup);
  let reader = Bytes.Reader.of_string markup in
  let parsed = Parser.parse ~collect_errors:true reader in
  print_endline "Tree:";
  print_endline (Dom.to_test_format (Parser.root parsed))