OCaml HTML5 parser/serialiser based on Python's JustHTML

debug

Changed files
+36 -12
lib
test
+4 -4
lib/check/semantic/lang_detecting_checker.ml
··· 13 13 mutable char_count : int; 14 14 } 15 15 16 - let max_chars = 30720 16 + let max_chars = 8192 (* Reduced from 30720 to avoid slow language detection *) 17 17 let min_chars = 1024 18 18 19 19 (* Elements whose text content we skip for language detection - O(1) lookup *) 20 20 let skip_elements = 21 21 Attr_utils.hashtbl_of_list [ 22 - "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav"; 23 - "pre"; "script"; "select"; "span"; "style"; "summary"; 24 - "td"; "textarea"; "th"; "tr" 22 + "a"; "button"; "code"; "details"; "figcaption"; "form"; "kbd"; "li"; "nav"; 23 + "pre"; "samp"; "script"; "select"; "span"; "style"; "summary"; 24 + "td"; "textarea"; "th"; "tr"; "var"; "xmp" 25 25 ] 26 26 27 27 let is_skip_element name = Hashtbl.mem skip_elements name
+20 -6
lib/check/specialized/normalization_checker.ml
··· 2 2 3 3 Validates that text content is in Unicode Normalization Form C (NFC). *) 4 4 5 - type state = unit [@@warning "-34"] 5 + type state = { 6 + mutable in_raw_text : int; (** Depth inside style/script elements *) 7 + } 6 8 7 - let create () = () 8 - let reset _state = () 9 + let create () = { in_raw_text = 0 } 10 + let reset state = state.in_raw_text <- 0 11 + 12 + (** Elements whose text content is raw text and should be skipped *) 13 + let is_raw_text_element name = 14 + name = "style" || name = "script" || name = "xmp" || name = "textarea" 9 15 10 16 (** Normalize a string to NFC form using uunf. *) 11 17 let normalize_nfc text = ··· 40 46 if end_pos = len then s 41 47 else String.sub s 0 end_pos 42 48 43 - let start_element _state ~element:_ _collector = () 49 + let start_element state ~element _collector = 50 + let name = Tag.tag_to_string element.Element.tag in 51 + if is_raw_text_element name then 52 + state.in_raw_text <- state.in_raw_text + 1 44 53 45 - let end_element _state ~tag:_ _collector = () 54 + let end_element state ~tag _collector = 55 + let name = Tag.tag_to_string tag in 56 + if is_raw_text_element name && state.in_raw_text > 0 then 57 + state.in_raw_text <- state.in_raw_text - 1 46 58 47 - let characters _state text collector = 59 + let characters state text collector = 60 + (* Skip text inside raw text elements like style/script *) 61 + if state.in_raw_text > 0 then () else 48 62 (* Skip empty text or whitespace-only text *) 49 63 let text_trimmed = String.trim text in 50 64 if String.length text_trimmed = 0 then ()
+6 -1
test/test_roundtrip.ml
··· 129 129 Printf.printf "Running roundtrip tests...\n%!"; 130 130 131 131 (* Run tests *) 132 - let results = List.map test_file test_files in 132 + let total = List.length test_files in 133 + let results = List.mapi (fun i path -> 134 + Printf.printf "\r[%d/%d] %s%!" (i + 1) total (Filename.basename path); 135 + test_file path 136 + ) test_files in 137 + Printf.printf "\n%!"; 133 138 134 139 (* Categorize results *) 135 140 let isvalid_tests = List.filter (fun r -> r.test_type = "isvalid") results in
+6 -1
test/test_validator.ml
··· 426 426 Printf.printf "Found %d test files\n%!" (List.length tests); 427 427 428 428 Printf.printf "Running tests...\n%!"; 429 - let results = List.map (run_test messages) tests in 429 + let total = List.length tests in 430 + let results = List.mapi (fun i test -> 431 + Printf.printf "\r[%d/%d] %s%!" (i + 1) total test.relative_path; 432 + run_test messages test 433 + ) tests in 434 + Printf.printf "\n%!"; 430 435 431 436 (* Print failing isvalid tests *) 432 437 let failing_isvalid = List.filter (fun r ->