OCaml HTML5 parser/serialiser based on Python's JustHTML

more

Changed files
+63 -28
lib
html5_checker
+41 -15
lib/html5_checker/semantic/lang_detecting_checker.ml
··· 9 mutable html_locator : (int * int) option; (* line, column *) 10 mutable in_body : bool; 11 mutable skip_depth : int; (* depth in elements to skip *) 12 mutable text_buffer : Buffer.t; 13 mutable char_count : int; 14 } ··· 32 html_locator = None; 33 in_body = false; 34 skip_depth = 0; 35 text_buffer = Buffer.create 4096; 36 char_count = 0; 37 } ··· 42 state.html_locator <- None; 43 state.in_body <- false; 44 state.skip_depth <- 0; 45 Buffer.clear state.text_buffer; 46 state.char_count <- 0 47 48 let get_attr name attrs = 49 List.find_map (fun (n, v) -> 50 if String.lowercase_ascii n = name then Some v else None ··· 126 | "zh-tw" -> "zh-hant" 127 | _ -> code 128 129 - let start_element state ~name ~namespace:_ ~attrs _collector = 130 let name_lower = String.lowercase_ascii name in 131 132 if name_lower = "html" then begin 133 state.html_lang <- get_attr "lang" attrs; ··· 138 else if name_lower = "body" then 139 state.in_body <- true 140 else if state.in_body then begin 141 (* Check if we should skip this element's text *) 142 - if List.mem name_lower skip_elements then 143 state.skip_depth <- state.skip_depth + 1 144 else begin 145 (* Check for different lang attribute *) ··· 154 let name_lower = String.lowercase_ascii name in 155 if name_lower = "body" then 156 state.in_body <- false 157 - else if state.in_body && state.skip_depth > 0 then begin 158 - if List.mem name_lower skip_elements then 159 - state.skip_depth <- state.skip_depth - 1 160 - else begin 161 - (* TODO: properly track nested elements with different lang *) 162 - state.skip_depth <- max 0 (state.skip_depth - 1) 163 end 164 end 165 166 let characters state text _collector = 167 - if state.in_body && state.skip_depth = 0 && state.char_count < max_chars then begin 168 (* Count Unicode code points, not bytes *) 169 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 170 let rec process () = ··· 197 match Langdetect.detect_with_prob detector text with 198 | None -> () 199 | Some (detected_lang, prob) when prob > 0.90 -> 200 - let declared_lang = match state.html_lang with 201 - | Some l -> get_lang_code l 202 | None -> "" 203 in 204 let detected_code = detected_lang in (* Keep full code like zh-tw *) 205 let detected_name = get_language_name detected_lang in 206 let suggested_code = get_bcp47_code detected_lang in 207 208 - (* Check for language mismatch *) 209 - let base_declared = get_lang_code declared_lang in 210 let base_detected = get_lang_code detected_code in 211 - if declared_lang = "" then begin 212 (* No lang attribute - suggest adding one *) 213 Message_collector.add_warning collector 214 ~message:(Printf.sprintf ··· 224 Message_collector.add_warning collector 225 ~message:(Printf.sprintf 226 "This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead." 227 - detected_name declared_lang suggested_code) 228 ~code:"wrong-lang" 229 ~element:"html" 230 ()
··· 9 mutable html_locator : (int * int) option; (* line, column *) 10 mutable in_body : bool; 11 mutable skip_depth : int; (* depth in elements to skip *) 12 + mutable foreign_depth : int; (* depth in SVG/MathML content to skip *) 13 mutable text_buffer : Buffer.t; 14 mutable char_count : int; 15 } ··· 33 html_locator = None; 34 in_body = false; 35 skip_depth = 0; 36 + foreign_depth = 0; 37 text_buffer = Buffer.create 4096; 38 char_count = 0; 39 } ··· 44 state.html_locator <- None; 45 state.in_body <- false; 46 state.skip_depth <- 0; 47 + state.foreign_depth <- 0; 48 Buffer.clear state.text_buffer; 49 state.char_count <- 0 50 51 + (* Namespaces to skip for language detection *) 52 + let svg_namespace = "http://www.w3.org/2000/svg" 53 + let mathml_namespace = "http://www.w3.org/1998/Math/MathML" 54 + 55 + let is_foreign_namespace ns = 56 + ns = svg_namespace || ns = mathml_namespace 57 + 58 + (* Element names that start foreign content (for when namespace isn't set) *) 59 + let is_foreign_element name = 60 + let n = String.lowercase_ascii name in 61 + n = "svg" || n = "math" 62 + 63 let get_attr name attrs = 64 List.find_map (fun (n, v) -> 65 if String.lowercase_ascii n = name then Some v else None ··· 141 | "zh-tw" -> "zh-hant" 142 | _ -> code 143 144 + let start_element state ~name ~namespace ~attrs _collector = 145 let name_lower = String.lowercase_ascii name in 146 + let ns = Option.value namespace ~default:"" in 147 148 if name_lower = "html" then begin 149 state.html_lang <- get_attr "lang" attrs; ··· 154 else if name_lower = "body" then 155 state.in_body <- true 156 else if state.in_body then begin 157 + (* Track foreign namespace depth (SVG/MathML) *) 158 + if is_foreign_namespace ns || is_foreign_element name then 159 + state.foreign_depth <- state.foreign_depth + 1 160 + else if state.foreign_depth > 0 then 161 + state.foreign_depth <- state.foreign_depth + 1 162 (* Check if we should skip this element's text *) 163 + else if List.mem name_lower skip_elements then 164 state.skip_depth <- state.skip_depth + 1 165 else begin 166 (* Check for different lang attribute *) ··· 175 let name_lower = String.lowercase_ascii name in 176 if name_lower = "body" then 177 state.in_body <- false 178 + else if state.in_body then begin 179 + (* Track foreign namespace depth *) 180 + if state.foreign_depth > 0 then 181 + state.foreign_depth <- state.foreign_depth - 1 182 + else if state.skip_depth > 0 then begin 183 + if List.mem name_lower skip_elements then 184 + state.skip_depth <- state.skip_depth - 1 185 + else 186 + (* TODO: properly track nested elements with different lang *) 187 + state.skip_depth <- max 0 (state.skip_depth - 1) 188 end 189 end 190 191 let characters state text _collector = 192 + if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin 193 (* Count Unicode code points, not bytes *) 194 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 195 let rec process () = ··· 222 match Langdetect.detect_with_prob detector text with 223 | None -> () 224 | Some (detected_lang, prob) when prob > 0.90 -> 225 + (* Get the original declared lang value (preserve exactly as written) *) 226 + let original_declared = match state.html_lang with 227 + | Some l -> l 228 | None -> "" 229 in 230 let detected_code = detected_lang in (* Keep full code like zh-tw *) 231 let detected_name = get_language_name detected_lang in 232 let suggested_code = get_bcp47_code detected_lang in 233 234 + (* Check for language mismatch using base codes *) 235 + let base_declared = get_lang_code original_declared in 236 let base_detected = get_lang_code detected_code in 237 + if original_declared = "" then begin 238 (* No lang attribute - suggest adding one *) 239 Message_collector.add_warning collector 240 ~message:(Printf.sprintf ··· 250 Message_collector.add_warning collector 251 ~message:(Printf.sprintf 252 "This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead." 253 + detected_name original_declared suggested_code) 254 ~code:"wrong-lang" 255 ~element:"html" 256 ()
+22 -13
lib/html5_checker/specialized/h1_checker.ml
··· 2 3 type state = { 4 mutable h1_count : int; 5 } 6 7 let create () = { 8 h1_count = 0; 9 } 10 11 let reset state = 12 - state.h1_count <- 0 13 14 let start_element state ~name ~namespace ~attrs collector = 15 ignore attrs; 16 - if namespace <> None then () 17 - else begin 18 - let name_lower = String.lowercase_ascii name in 19 - if name_lower = "h1" then begin 20 - state.h1_count <- state.h1_count + 1; 21 - if state.h1_count > 1 then 22 - Message_collector.add_info collector 23 - ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)." 24 - ~code:"multiple-h1" 25 - ~element:name () 26 - end 27 end 28 29 - let end_element _state ~name:_ ~namespace:_ _collector = () 30 let characters _state _text _collector = () 31 let end_document _state _collector = () 32
··· 2 3 type state = { 4 mutable h1_count : int; 5 + mutable svg_depth : int; (* Track depth inside SVG *) 6 } 7 8 let create () = { 9 h1_count = 0; 10 + svg_depth = 0; 11 } 12 13 let reset state = 14 + state.h1_count <- 0; 15 + state.svg_depth <- 0 16 17 let start_element state ~name ~namespace ~attrs collector = 18 ignore attrs; 19 + let name_lower = String.lowercase_ascii name in 20 + (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *) 21 + if name_lower = "svg" then 22 + state.svg_depth <- state.svg_depth + 1 23 + else if namespace <> None || state.svg_depth > 0 then 24 + () (* Skip non-HTML namespace or inside SVG *) 25 + else if name_lower = "h1" then begin 26 + state.h1_count <- state.h1_count + 1; 27 + if state.h1_count > 1 then 28 + Message_collector.add_info collector 29 + ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)." 30 + ~code:"multiple-h1" 31 + ~element:name () 32 end 33 34 + let end_element state ~name ~namespace:_ _collector = 35 + let name_lower = String.lowercase_ascii name in 36 + if name_lower = "svg" && state.svg_depth > 0 then 37 + state.svg_depth <- state.svg_depth - 1 38 + 39 let characters _state _text _collector = () 40 let end_document _state _collector = () 41