OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+1
dune-project
··· 24 (bytesrw (>= 0.3.0)) 25 (uutf (>= 1.0.0)) 26 (uuuu (>= 0.3.0)) 27 (odoc :with-doc) 28 (jsont (>= 0.2.0)) 29 (cmdliner (>= 1.3.0))))
··· 24 (bytesrw (>= 0.3.0)) 25 (uutf (>= 1.0.0)) 26 (uuuu (>= 0.3.0)) 27 + (uunf (>= 15.0.0)) 28 (odoc :with-doc) 29 (jsont (>= 0.2.0)) 30 (cmdliner (>= 1.3.0))))
+1
html5rw.opam
··· 15 "bytesrw" {>= "0.3.0"} 16 "uutf" {>= "1.0.0"} 17 "uuuu" {>= "0.3.0"} 18 "odoc" {with-doc} 19 "jsont" {>= "0.2.0"} 20 "cmdliner" {>= "1.3.0"}
··· 15 "bytesrw" {>= "0.3.0"} 16 "uutf" {>= "1.0.0"} 17 "uuuu" {>= "0.3.0"} 18 + "uunf" {>= "15.0.0"} 19 "odoc" {with-doc} 20 "jsont" {>= "0.2.0"} 21 "cmdliner" {>= "1.3.0"}
+1
lib/html5_checker/checker_registry.ml
··· 40 Hashtbl.replace reg "importmap" Importmap_checker.checker; 41 Hashtbl.replace reg "table" Table_checker.checker; 42 Hashtbl.replace reg "mime-type" Mime_type_checker.checker; 43 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 44 (* Hashtbl.replace reg "content" Content_checker.checker; *) 45 reg
··· 40 Hashtbl.replace reg "importmap" Importmap_checker.checker; 41 Hashtbl.replace reg "table" Table_checker.checker; 42 Hashtbl.replace reg "mime-type" Mime_type_checker.checker; 43 + Hashtbl.replace reg "normalization" Normalization_checker.checker; 44 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 45 (* Hashtbl.replace reg "content" Content_checker.checker; *) 46 reg
+55 -3
lib/html5_checker/datatype/dt_language.ml
··· 18 let to_lower s = 19 String.lowercase_ascii s 20 21 (** Validate language tag structure according to BCP 47. 22 - This is a simplified validator that checks structural validity 23 - but does not validate against the IANA registry. *) 24 let validate_language_structure s = 25 if String.length s = 0 then 26 Error "The empty string is not a valid language tag" ··· 74 check_private_use rest 75 (* Primary language: 2-3 letters (ISO 639) *) 76 else if (len = 2 || len = 3) && is_all_alpha first_lower then 77 - Ok () 78 (* Reserved: 4 letters *) 79 else if len = 4 && is_all_alpha first_lower then 80 Error "Found reserved language tag"
··· 18 let to_lower s = 19 String.lowercase_ascii s 20 21 + (** Valid extlang subtags per IANA language-subtag-registry. 22 + Extlangs are 3-letter subtags that follow the primary language. 23 + Each extlang has a specific Prefix requirement. 24 + Here we list the extlang subtag and its required prefix. *) 25 + let valid_extlangs = [ 26 + (* Arabic extlangs (prefix: ar) *) 27 + ("aao", "ar"); ("abh", "ar"); ("abv", "ar"); ("acm", "ar"); ("acq", "ar"); 28 + ("acw", "ar"); ("acx", "ar"); ("acy", "ar"); ("adf", "ar"); ("aeb", "ar"); 29 + ("aec", "ar"); ("afb", "ar"); ("ajp", "ar"); ("apc", "ar"); ("apd", "ar"); 30 + ("arb", "ar"); ("arq", "ar"); ("ars", "ar"); ("ary", "ar"); ("arz", "ar"); 31 + ("auz", "ar"); ("avl", "ar"); ("ayh", "ar"); ("ayl", "ar"); ("ayn", "ar"); 32 + ("ayp", "ar"); ("bbz", "ar"); ("pga", "ar"); ("shu", "ar"); ("ssh", "ar"); 33 + (* Chinese extlangs (prefix: zh) *) 34 + ("cdo", "zh"); ("cjy", "zh"); ("cmn", "zh"); ("cpx", "zh"); ("czh", "zh"); 35 + ("czo", "zh"); ("gan", "zh"); ("hak", "zh"); ("hsn", "zh"); ("lzh", "zh"); 36 + ("mnp", "zh"); ("nan", "zh"); ("wuu", "zh"); ("yue", "zh"); 37 + (* Malay extlangs (prefix: ms) *) 38 + ("bjn", "ms"); ("btj", "ms"); ("bve", "ms"); ("bvu", "ms"); ("coa", "ms"); 39 + ("dup", "ms"); ("hji", "ms"); ("jak", "ms"); ("jax", "ms"); ("kvb", "ms"); 40 + ("kvr", "ms"); ("kxd", "ms"); ("lce", "ms"); ("lcf", "ms"); ("liw", "ms"); 41 + ("max", "ms"); ("meo", "ms"); ("mfa", "ms"); ("mfb", "ms"); ("min", "ms"); 42 + ("mqg", "ms"); ("msi", "ms"); ("mui", "ms"); ("orn", "ms"); ("ors", "ms"); 43 + ("pel", "ms"); ("pse", "ms"); ("tmw", "ms"); ("urk", "ms"); ("vkk", "ms"); 44 + ("vkt", "ms"); ("xmm", "ms"); ("zlm", "ms"); ("zmi", "ms"); 45 + (* Swahili extlangs (prefix: sw) *) 46 + ("swc", "sw"); ("swh", "sw"); 47 + (* Uzbek extlangs (prefix: uz) *) 48 + ("uzn", "uz"); ("uzs", "uz"); 49 + ] 50 + 51 + (** Check if an extlang is valid for the given primary language prefix. *) 52 + let is_valid_extlang prefix extlang = 53 + let prefix_lower = to_lower prefix in 54 + let extlang_lower = to_lower extlang in 55 + match List.assoc_opt extlang_lower valid_extlangs with 56 + | Some required_prefix -> required_prefix = prefix_lower 57 + | None -> false 58 + 59 (** Validate language tag structure according to BCP 47. 60 + This validator checks structural validity and validates extlang subtags 61 + against the IANA registry. *) 62 let validate_language_structure s = 63 if String.length s = 0 then 64 Error "The empty string is not a valid language tag" ··· 112 check_private_use rest 113 (* Primary language: 2-3 letters (ISO 639) *) 114 else if (len = 2 || len = 3) && is_all_alpha first_lower then 115 + (* Check for extlang subtag (3 letters following primary) *) 116 + (match rest with 117 + | second :: _ -> 118 + let second_lower = to_lower second in 119 + let second_len = String.length second_lower in 120 + (* An extlang is exactly 3 alphabetic characters *) 121 + if second_len = 3 && is_all_alpha second_lower then 122 + (* Check if this is a valid extlang for this prefix *) 123 + if is_valid_extlang first_lower second_lower then 124 + Ok () 125 + else 126 + Error (Printf.sprintf "Bad extlang subtag \xe2\x80\x9c%s\xe2\x80\x9d" second_lower) 127 + else 128 + Ok () (* Not an extlang pattern, continue *) 129 + | [] -> Ok ()) 130 (* Reserved: 4 letters *) 131 else if len = 4 && is_all_alpha first_lower then 132 Error "Found reserved language tag"
+1 -1
lib/html5_checker/dune
··· 3 (library 4 (name html5_checker) 5 (public_name html5rw.checker) 6 - (libraries html5rw jsont jsont.bytesrw astring str) 7 )
··· 3 (library 4 (name html5_checker) 5 (public_name html5rw.checker) 6 + (libraries html5rw jsont jsont.bytesrw astring str uunf uutf) 7 )
+9 -7
lib/html5_checker/specialized/language_checker.ml
··· 38 | None -> None 39 40 (** Validate language attribute. *) 41 - let validate_lang_attr value ~location ~element collector = 42 (* First check structural validity *) 43 match Dt_language.Language_or_empty.validate value with 44 | Error msg -> 45 Message_collector.add_error collector 46 - ~message:(Printf.sprintf "Invalid lang attribute: %s" msg) 47 ~code:"invalid-lang" 48 ?location 49 ~element 50 - ~attribute:"lang" 51 () 52 | Ok () -> 53 (* Then check for deprecated subtags *) ··· 84 (* Validate lang attribute *) 85 begin match lang_opt with 86 | Some lang -> 87 - validate_lang_attr lang ~location ~element collector 88 | None -> () 89 end; 90 91 (* Validate xml:lang attribute *) 92 begin match xmllang_opt with 93 | Some xmllang -> 94 - validate_lang_attr xmllang ~location ~element collector 95 | None -> () 96 end; 97 ··· 113 () 114 115 let end_document _state _collector = 116 - (* Note: The "missing lang on html" warning is not produced by default since 117 - the Nu validator only produces it for specific test cases. *) 118 () 119 120 let checker = (module struct
··· 38 | None -> None 39 40 (** Validate language attribute. *) 41 + let validate_lang_attr value ~location ~element ~attribute collector = 42 (* First check structural validity *) 43 match Dt_language.Language_or_empty.validate value with 44 | Error msg -> 45 Message_collector.add_error collector 46 + ~message:(Printf.sprintf 47 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: %s." 48 + value attribute element msg) 49 ~code:"invalid-lang" 50 ?location 51 ~element 52 + ~attribute 53 () 54 | Ok () -> 55 (* Then check for deprecated subtags *) ··· 86 (* Validate lang attribute *) 87 begin match lang_opt with 88 | Some lang -> 89 + validate_lang_attr lang ~location ~element ~attribute:"lang" collector 90 | None -> () 91 end; 92 93 (* Validate xml:lang attribute *) 94 begin match xmllang_opt with 95 | Some xmllang -> 96 + validate_lang_attr xmllang ~location ~element ~attribute:"xml:lang" collector 97 | None -> () 98 end; 99 ··· 115 () 116 117 let end_document _state _collector = 118 + (* Note: The "missing lang on html" warning is only produced for specific 119 + test cases in the Nu validator. We don't produce it by default. *) 120 () 121 122 let checker = (module struct
+49
lib/html5_checker/specialized/normalization_checker.ml
···
··· 1 + (** Unicode normalization checker. 2 + 3 + Validates that text content is in Unicode Normalization Form C (NFC). *) 4 + 5 + type state = unit 6 + 7 + let create () = () 8 + let reset _state = () 9 + 10 + (** Normalize a string to NFC form using uunf. *) 11 + let normalize_nfc text = 12 + Uunf_string.normalize_utf_8 `NFC text 13 + 14 + (** Check if a string is in NFC form. *) 15 + let is_nfc text = 16 + (* A string is in NFC if normalizing it produces the same string *) 17 + let normalized = normalize_nfc text in 18 + text = normalized 19 + 20 + let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = () 21 + 22 + let end_element _state ~name:_ ~namespace:_ _collector = () 23 + 24 + let characters _state text collector = 25 + (* Skip empty text or whitespace-only text *) 26 + let text_trimmed = String.trim text in 27 + if String.length text_trimmed = 0 then () 28 + else if not (is_nfc text_trimmed) then begin 29 + let normalized = normalize_nfc text_trimmed in 30 + Message_collector.add_warning collector 31 + ~message:(Printf.sprintf 32 + "Text run is not in Unicode Normalization Form C. Should instead be \xe2\x80\x9c%s\xe2\x80\x9d. (Copy and paste that into your source document to replace the un-normalized text.)" 33 + normalized) 34 + ~code:"unicode-normalization" 35 + () 36 + end 37 + 38 + let end_document _state _collector = () 39 + 40 + let checker = 41 + (module struct 42 + type nonrec state = state 43 + let create = create 44 + let reset = reset 45 + let start_element = start_element 46 + let end_element = end_element 47 + let characters = characters 48 + let end_document = end_document 49 + end : Checker.S)
+5
lib/html5_checker/specialized/normalization_checker.mli
···
··· 1 + (** Unicode normalization checker. 2 + 3 + Validates that text content is in Unicode Normalization Form C (NFC). *) 4 + 5 + val checker : Checker.t
+28 -4
test/debug_check.ml
··· 1 let () = 2 - let test_file = "validator/tests/html/mime-types/004-novalid.html" in 3 let ic = open_in test_file in 4 let html = really_input_string ic (in_channel_length ic) in 5 close_in ic; 6 let reader = Bytesrw.Bytes.Reader.of_string html in 7 - let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader in 8 let errors = Html5_checker.errors result in 9 let warnings = Html5_checker.warnings result in 10 print_endline "=== Errors ==="; 11 List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors; 12 print_endline "=== Warnings ==="; 13 List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings; 14 if List.length errors > 0 then 15 - print_endline "PASS (has errors)" 16 else 17 - print_endline "FAIL (no errors)"
··· 1 let () = 2 + let test_file = "validator/tests/xhtml/elements/menu/menu-containing-hr-novalid.xhtml" in 3 let ic = open_in test_file in 4 let html = really_input_string ic (in_channel_length ic) in 5 close_in ic; 6 let reader = Bytesrw.Bytes.Reader.of_string html in 7 + let doc = Html5rw.parse ~collect_errors:true reader in 8 + let root = Html5rw.root doc in 9 + print_endline "=== DOM Structure ==="; 10 + let rec print_node indent (node : Html5rw.Dom.node) = 11 + let open Html5rw.Dom in 12 + match node.name with 13 + | "#text" -> 14 + let text = String.trim node.data in 15 + if String.length text > 0 then 16 + Printf.printf "%sTEXT: %s\n" indent text 17 + | "#document" | "#document-fragment" -> 18 + Printf.printf "%s%s\n" indent node.name; 19 + List.iter (print_node (indent ^ " ")) node.children 20 + | "!doctype" -> Printf.printf "%s<!DOCTYPE>\n" indent 21 + | "#comment" -> () 22 + | _ -> 23 + Printf.printf "%s<%s>\n" indent node.name; 24 + List.iter (print_node (indent ^ " ")) node.children 25 + in 26 + print_node "" root; 27 + print_endline "\n=== Now checking ==="; 28 + let reader2 = Bytesrw.Bytes.Reader.of_string html in 29 + let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader2 in 30 let errors = Html5_checker.errors result in 31 let warnings = Html5_checker.warnings result in 32 print_endline "=== Errors ==="; 33 List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors; 34 print_endline "=== Warnings ==="; 35 List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings; 36 + print_endline "\n=== Expected ==="; 37 + print_endline "Element \xe2\x80\x9chr\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cmenu\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"; 38 if List.length errors > 0 then 39 + print_endline "\nPASS (has errors)" 40 else 41 + print_endline "\nFAIL (no errors)"