OCaml HTML5 parser/serialiser based on Python's JustHTML

more

Changed files
+63 -28
lib
html5_checker
+41 -15
lib/html5_checker/semantic/lang_detecting_checker.ml
··· 9 9 mutable html_locator : (int * int) option; (* line, column *) 10 10 mutable in_body : bool; 11 11 mutable skip_depth : int; (* depth in elements to skip *) 12 + mutable foreign_depth : int; (* depth in SVG/MathML content to skip *) 12 13 mutable text_buffer : Buffer.t; 13 14 mutable char_count : int; 14 15 } ··· 32 33 html_locator = None; 33 34 in_body = false; 34 35 skip_depth = 0; 36 + foreign_depth = 0; 35 37 text_buffer = Buffer.create 4096; 36 38 char_count = 0; 37 39 } ··· 42 44 state.html_locator <- None; 43 45 state.in_body <- false; 44 46 state.skip_depth <- 0; 47 + state.foreign_depth <- 0; 45 48 Buffer.clear state.text_buffer; 46 49 state.char_count <- 0 47 50 51 + (* Namespaces to skip for language detection *) 52 + let svg_namespace = "http://www.w3.org/2000/svg" 53 + let mathml_namespace = "http://www.w3.org/1998/Math/MathML" 54 + 55 + let is_foreign_namespace ns = 56 + ns = svg_namespace || ns = mathml_namespace 57 + 58 + (* Element names that start foreign content (for when namespace isn't set) *) 59 + let is_foreign_element name = 60 + let n = String.lowercase_ascii name in 61 + n = "svg" || n = "math" 62 + 48 63 let get_attr name attrs = 49 64 List.find_map (fun (n, v) -> 50 65 if String.lowercase_ascii n = name then Some v else None ··· 126 141 | "zh-tw" -> "zh-hant" 127 142 | _ -> code 128 143 129 - let start_element state ~name ~namespace:_ ~attrs _collector = 144 + let start_element state ~name ~namespace ~attrs _collector = 130 145 let name_lower = String.lowercase_ascii name in 146 + let ns = Option.value namespace ~default:"" in 131 147 132 148 if name_lower = "html" then begin 133 149 state.html_lang <- get_attr "lang" attrs; ··· 138 154 else if name_lower = "body" then 139 155 state.in_body <- true 140 156 else if state.in_body then begin 157 + (* Track foreign namespace depth (SVG/MathML) *) 158 + if is_foreign_namespace ns || is_foreign_element name then 159 + state.foreign_depth <- state.foreign_depth + 1 160 + else if state.foreign_depth > 0 then 161 + state.foreign_depth <- state.foreign_depth + 1 141 162 (* Check if we should skip this element's text *) 142 - if List.mem name_lower skip_elements then 163 + else if List.mem name_lower skip_elements then 143 164 state.skip_depth <- state.skip_depth + 1 144 165 else begin 145 166 (* Check for different lang attribute *) ··· 154 175 let name_lower = String.lowercase_ascii name in 155 176 if name_lower = "body" then 156 177 state.in_body <- false 157 - else if state.in_body && state.skip_depth > 0 then begin 158 - if List.mem name_lower skip_elements then 159 - state.skip_depth <- state.skip_depth - 1 160 - else begin 161 - (* TODO: properly track nested elements with different lang *) 162 - state.skip_depth <- max 0 (state.skip_depth - 1) 178 + else if state.in_body then begin 179 + (* Track foreign namespace depth *) 180 + if state.foreign_depth > 0 then 181 + state.foreign_depth <- state.foreign_depth - 1 182 + else if state.skip_depth > 0 then begin 183 + if List.mem name_lower skip_elements then 184 + state.skip_depth <- state.skip_depth - 1 185 + else 186 + (* TODO: properly track nested elements with different lang *) 187 + state.skip_depth <- max 0 (state.skip_depth - 1) 163 188 end 164 189 end 165 190 166 191 let characters state text _collector = 167 - if state.in_body && state.skip_depth = 0 && state.char_count < max_chars then begin 192 + if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin 168 193 (* Count Unicode code points, not bytes *) 169 194 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 170 195 let rec process () = ··· 197 222 match Langdetect.detect_with_prob detector text with 198 223 | None -> () 199 224 | Some (detected_lang, prob) when prob > 0.90 -> 200 - let declared_lang = match state.html_lang with 201 - | Some l -> get_lang_code l 225 + (* Get the original declared lang value (preserve exactly as written) *) 226 + let original_declared = match state.html_lang with 227 + | Some l -> l 202 228 | None -> "" 203 229 in 204 230 let detected_code = detected_lang in (* Keep full code like zh-tw *) 205 231 let detected_name = get_language_name detected_lang in 206 232 let suggested_code = get_bcp47_code detected_lang in 207 233 208 - (* Check for language mismatch *) 209 - let base_declared = get_lang_code declared_lang in 234 + (* Check for language mismatch using base codes *) 235 + let base_declared = get_lang_code original_declared in 210 236 let base_detected = get_lang_code detected_code in 211 - if declared_lang = "" then begin 237 + if original_declared = "" then begin 212 238 (* No lang attribute - suggest adding one *) 213 239 Message_collector.add_warning collector 214 240 ~message:(Printf.sprintf ··· 224 250 Message_collector.add_warning collector 225 251 ~message:(Printf.sprintf 226 252 "This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead." 227 - detected_name declared_lang suggested_code) 253 + detected_name original_declared suggested_code) 228 254 ~code:"wrong-lang" 229 255 ~element:"html" 230 256 ()
+22 -13
lib/html5_checker/specialized/h1_checker.ml
··· 2 2 3 3 type state = { 4 4 mutable h1_count : int; 5 + mutable svg_depth : int; (* Track depth inside SVG *) 5 6 } 6 7 7 8 let create () = { 8 9 h1_count = 0; 10 + svg_depth = 0; 9 11 } 10 12 11 13 let reset state = 12 - state.h1_count <- 0 14 + state.h1_count <- 0; 15 + state.svg_depth <- 0 13 16 14 17 let start_element state ~name ~namespace ~attrs collector = 15 18 ignore attrs; 16 - if namespace <> None then () 17 - else begin 18 - let name_lower = String.lowercase_ascii name in 19 - if name_lower = "h1" then begin 20 - state.h1_count <- state.h1_count + 1; 21 - if state.h1_count > 1 then 22 - Message_collector.add_info collector 23 - ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)." 24 - ~code:"multiple-h1" 25 - ~element:name () 26 - end 19 + let name_lower = String.lowercase_ascii name in 20 + (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *) 21 + if name_lower = "svg" then 22 + state.svg_depth <- state.svg_depth + 1 23 + else if namespace <> None || state.svg_depth > 0 then 24 + () (* Skip non-HTML namespace or inside SVG *) 25 + else if name_lower = "h1" then begin 26 + state.h1_count <- state.h1_count + 1; 27 + if state.h1_count > 1 then 28 + Message_collector.add_info collector 29 + ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)." 30 + ~code:"multiple-h1" 31 + ~element:name () 27 32 end 28 33 29 - let end_element _state ~name:_ ~namespace:_ _collector = () 34 + let end_element state ~name ~namespace:_ _collector = 35 + let name_lower = String.lowercase_ascii name in 36 + if name_lower = "svg" && state.svg_depth > 0 then 37 + state.svg_depth <- state.svg_depth - 1 38 + 30 39 let characters _state _text _collector = () 31 40 let end_document _state _collector = () 32 41