OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

+12 -2
lib/html5_checker/datatype/dt_media_query.ml
··· 328 328 let color_features = ["color"; "min-color"; "max-color"; "color-index"; "min-color-index"; "max-color-index"; 329 329 "monochrome"; "min-monochrome"; "max-monochrome"] in 330 330 331 + (* Get base feature name for error messages (strip min-/max- prefix) *) 332 + let base_feature = 333 + if String.length feature > 4 && String.sub feature 0 4 = "min-" then 334 + String.sub feature 4 (String.length feature - 4) 335 + else if String.length feature > 4 && String.sub feature 0 4 = "max-" then 336 + String.sub feature 4 (String.length feature - 4) 337 + else 338 + feature 339 + in 340 + 331 341 if List.mem feature length_features then begin 332 342 (* Must be a valid length: number followed by unit *) 333 343 let value = String.trim value in ··· 360 370 let unit_lower = String.lowercase_ascii unit_part in 361 371 if List.mem unit_lower valid_length_units then Ok () 362 372 else if List.mem unit_lower valid_resolution_units then 363 - Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value feature) 373 + Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value base_feature) 364 374 else 365 375 Error "Unknown dimension." 366 376 end ··· 370 380 let is_digit c = c >= '0' && c <= '9' in 371 381 if String.length value > 0 && String.for_all is_digit value then Ok () 372 382 else 373 - Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value feature) 383 + Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value base_feature) 374 384 end else 375 385 Ok () (* Allow other features with any value for now *) 376 386
+3
lib/html5_checker/parse_error_bridge.ml
··· 74 74 else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then 75 75 let element = String.sub s 19 (String.length s - 19) in 76 76 (Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element, "stray-end-tag") 77 + else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then 78 + let tag = String.sub s 19 (String.length s - 19) in 79 + (Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag, "start-tag-in-table") 77 80 else 78 81 (Printf.sprintf "Parse error: %s" s, s) 79 82 with _ -> (Printf.sprintf "Parse error: %s" s, s))
+10
lib/html5_checker/semantic/id_checker.ml
··· 218 218 (* Use specific error for list attribute on input *) 219 219 if ref.attribute = "list" && ref.referring_element = "input" then 220 220 Message_collector.add_typed collector Error_code.List_attr_requires_datalist 221 + else if ref.attribute = "commandfor" then 222 + (* commandfor has a specific expected message format *) 223 + Message_collector.add_error collector 224 + ~message:(Printf.sprintf "The value of the %s attribute of the %s element must be the ID of an element in the same tree as the %s with the %s attribute." 225 + (Error_code.q "commandfor") (Error_code.q ref.referring_element) 226 + (Error_code.q ref.referring_element) (Error_code.q "commandfor")) 227 + ~code:"dangling-id-reference" 228 + ~element:ref.referring_element 229 + ~attribute:ref.attribute 230 + () 221 231 else 222 232 (* Use generic for dangling references - format may vary *) 223 233 Message_collector.add_typed collector
+12 -6
lib/html5_checker/semantic/option_checker.ml
··· 29 29 ) attrs 30 30 31 31 let start_element state ~name ~namespace ~attrs collector = 32 - ignore collector; 33 32 let name_lower = String.lowercase_ascii name in 34 33 35 34 if namespace <> None then () ··· 43 42 | Some v -> String.trim v = "" 44 43 | None -> false 45 44 in 45 + (* Report error for empty label attribute value *) 46 + if label_empty then 47 + Message_collector.add_error collector 48 + ~message:"Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9clabel\xe2\x80\x9d on element \xe2\x80\x9coption\xe2\x80\x9d: Bad non-empty string: Must not be empty." 49 + ~code:"empty-attribute-value" 50 + ~element:"option" 51 + ~attribute:"label" 52 + (); 46 53 let ctx = { has_text = false; has_label; label_empty } in 47 54 state.option_stack <- ctx :: state.option_stack 48 55 end ··· 60 67 | ctx :: rest -> 61 68 state.option_stack <- rest; 62 69 (* Validate: option must have text content or non-empty label *) 63 - if not ctx.has_text then begin 64 - if ctx.label_empty || not ctx.has_label then 65 - (* Has label="" (empty) and no text, or no label at all - error *) 66 - Message_collector.add_typed collector Error_code.Option_empty_without_label 67 - end 70 + (* Note: empty label error is already reported at start_element, 71 + so only report empty option without label when there's no label attribute at all *) 72 + if not ctx.has_text && not ctx.has_label then 73 + Message_collector.add_typed collector Error_code.Option_empty_without_label 68 74 | [] -> () 69 75 end 70 76 end
+22 -4
lib/html5_checker/specialized/aria_checker.ml
··· 34 34 (* Window roles *) 35 35 "alertdialog"; 36 36 37 - (* Abstract roles - not for use in HTML content *) 38 - "command"; "comment"; "composite"; "input"; "landmark"; "range"; 39 - "roletype"; "section"; "sectionhead"; "select"; "structure"; "widget"; 40 - "window"; 37 + (* Note: Abstract roles (command, composite, input, landmark, range, etc.) 38 + are NOT included as they should not be used in HTML content. 39 + Using an abstract role will result in "Discarding unrecognized token" error. *) 41 40 42 41 (* Additional roles *) 43 42 "application"; "columnheader"; "rowheader"; ··· 342 341 end 343 342 | None -> Some "textbox" (* default input type is text *) 344 343 end 344 + (* Check for area element - implicit role depends on href attribute *) 345 + else if element_name = "area" then begin 346 + match List.assoc_opt "href" attrs with 347 + | Some _ -> Some "link" (* area with href has implicit role "link" *) 348 + | None -> Some "generic" (* area without href has no corresponding role, treated as generic *) 349 + end 350 + (* Check for a element - implicit role depends on href attribute *) 351 + else if element_name = "a" then begin 352 + match List.assoc_opt "href" attrs with 353 + | Some _ -> Some "link" (* a with href has implicit role "link" *) 354 + | None -> Some "generic" (* a without href has no corresponding role, treated as generic *) 355 + end 345 356 else 346 357 Hashtbl.find_opt elements_with_implicit_role element_name 347 358 ··· 443 454 | Some role_value -> split_roles role_value 444 455 | None -> [] 445 456 in 457 + 458 + (* Check for unrecognized role tokens *) 459 + List.iter (fun role -> 460 + if not (Hashtbl.mem valid_aria_roles role) then 461 + Message_collector.add_typed collector 462 + (Error_code.Discarding_unrecognized_role { token = role }) 463 + ) explicit_roles; 446 464 447 465 (* Get implicit role for this element *) 448 466 let implicit_role = get_implicit_role name_lower attrs in
+3 -4
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 341 341 (* Check if the name contains colon - not XML serializable *) 342 342 else if String.contains after_prefix ':' then 343 343 Message_collector.add_error collector 344 - ~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d is not serializable as XML 1.0." 345 - attr_name) 344 + ~message:"\xe2\x80\x9cdata-*\xe2\x80\x9d attribute names must be XML 1.0 4th ed. plus Namespaces NCNames." 346 345 ~code:"bad-attribute-name" 347 346 ~element:name ~attribute:attr_name () 348 347 end ··· 486 485 487 486 if has_command && has_aria_expanded then 488 487 Message_collector.add_error collector 489 - ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be specified on \xe2\x80\x9cbutton\xe2\x80\x9d elements that have the \xe2\x80\x9ccommand\xe2\x80\x9d attribute." 488 + ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be used on any element which has a \xe2\x80\x9ccommand\xe2\x80\x9d attribute." 490 489 ~code:"disallowed-attribute" 491 490 ~element:name ~attribute:"aria-expanded" (); 492 491 493 492 if has_popovertarget && has_aria_expanded then 494 493 Message_collector.add_error collector 495 - ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be specified on \xe2\x80\x9cbutton\xe2\x80\x9d elements that have the \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute." 494 + ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be used on any element which has a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute." 496 495 ~code:"disallowed-attribute" 497 496 ~element:name ~attribute:"aria-expanded" () 498 497 end;
+20 -4
lib/html5_checker/specialized/mime_type_checker.ml
··· 9 9 Some (Printf.sprintf 10 10 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Empty value." 11 11 value attr_name element) 12 - else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then 13 - Some (Printf.sprintf 14 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 15 - value attr_name element) 12 + else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then begin 13 + (* Check if this is a semicolon followed by only whitespace *) 14 + let semicolon_pos = try Some (String.index value ';') with Not_found -> None in 15 + match semicolon_pos with 16 + | Some semi_pos -> 17 + let params = String.sub value (semi_pos + 1) (len - semi_pos - 1) in 18 + let params_trimmed = String.trim params in 19 + if params_trimmed = "" then 20 + Some (Printf.sprintf 21 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 22 + value attr_name element) 23 + else 24 + Some (Printf.sprintf 25 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 26 + value attr_name element) 27 + | None -> 28 + Some (Printf.sprintf 29 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 30 + value attr_name element) 31 + end 16 32 else if len > 0 && (value.[0] = ' ' || value.[0] = '\t') then 17 33 Some (Printf.sprintf 18 34 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Expected a token character but saw \xe2\x80\x9c \xe2\x80\x9d instead."
+34 -3
lib/html5_checker/specialized/srcset_sizes_checker.ml
··· 20 20 if String.lowercase_ascii n = name then Some v else None 21 21 ) attrs 22 22 23 + (** Split string on a character while respecting parentheses *) 24 + let split_respecting_parens ~sep s = 25 + let len = String.length s in 26 + let result = ref [] in 27 + let current = Buffer.create 64 in 28 + let depth = ref 0 in 29 + for i = 0 to len - 1 do 30 + let c = s.[i] in 31 + if c = '(' then begin 32 + incr depth; 33 + Buffer.add_char current c 34 + end else if c = ')' then begin 35 + decr depth; 36 + Buffer.add_char current c 37 + end else if c = sep && !depth = 0 then begin 38 + result := Buffer.contents current :: !result; 39 + Buffer.clear current 40 + end else 41 + Buffer.add_char current c 42 + done; 43 + (* Add the last segment *) 44 + result := Buffer.contents current :: !result; 45 + List.rev !result 46 + 47 + (** Split string on commas while respecting parentheses *) 48 + let split_on_comma_respecting_parens s = split_respecting_parens ~sep:',' s 49 + 50 + (** Split string on spaces while respecting parentheses, filtering empty segments *) 51 + let split_on_space_respecting_parens s = 52 + split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "") 53 + 23 54 (** Check if string contains only whitespace *) 24 55 let is_whitespace_only s = 25 56 String.for_all (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r') s ··· 793 824 794 825 (** Parse and validate srcset attribute value *) 795 826 let validate_srcset value element_name has_sizes collector = 796 - let entries = String.split_on_char ',' value in 827 + let entries = split_on_comma_respecting_parens value in 797 828 let has_w_descriptor = ref false in 798 829 let has_x_descriptor = ref false in 799 830 let no_descriptor_url = ref None in (* Track URL of first entry without width descriptor *) ··· 844 875 List.iter (fun entry -> 845 876 let entry = String.trim entry in 846 877 if entry <> "" then begin 847 - (* Split entry into URL and optional descriptor *) 848 - let parts = String.split_on_char ' ' entry |> List.filter (fun s -> s <> "") in 878 + (* Split entry into URL and optional descriptor - respect parentheses *) 879 + let parts = split_on_space_respecting_parens entry in 849 880 (* Check if URL is valid *) 850 881 let check_srcset_url url = 851 882 (* Special schemes that require host/content after :// *)
+40 -10
lib/html5_checker/specialized/svg_checker.ml
··· 286 286 (* Validate xmlns attributes *) 287 287 let validate_xmlns_attr attr value element collector = 288 288 match attr with 289 - | "xmlns" when element = "svg" -> 289 + | "xmlns" -> 290 + (* xmlns on any SVG element must be the SVG namespace *) 290 291 if value <> svg_ns_url then 291 292 Message_collector.add_error collector 292 293 ~message:(Printf.sprintf ··· 348 349 let flag = Str.matched_group 4 d in 349 350 if flag <> "0" && flag <> "1" then begin 350 351 let pos = Str.match_beginning () in 351 - let ctx_end = min (String.length d) (pos + 25) in 352 + (* Context ends right after the invalid flag *) 353 + let flag_end = Str.match_end () in 352 354 let ctx_start = max 0 (pos - 10) in 353 - let context = String.sub d ctx_start (ctx_end - ctx_start) in 355 + let context = String.sub d ctx_start (flag_end - ctx_start) in 354 356 Message_collector.add_error collector 355 357 ~message:(Printf.sprintf 356 358 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cd\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad SVG path data: Expected \xe2\x80\x9c0\xe2\x80\x9d or \xe2\x80\x9c1\xe2\x80\x9d for large-arc-flag for \xe2\x80\x9ca\xe2\x80\x9d command but found \xe2\x80\x9c%s\xe2\x80\x9d instead (context: \xe2\x80\x9c%s\xe2\x80\x9d)." ··· 391 393 | [] -> () 392 394 end; 393 395 396 + (* 2.5 Check stop element is only in linearGradient or radialGradient *) 397 + if name_lower = "stop" then begin 398 + match state.element_stack with 399 + | parent :: _ when (let p = String.lowercase_ascii parent in 400 + p = "lineargradient" || p = "radialgradient") -> () 401 + | parent :: _ -> 402 + Message_collector.add_error collector 403 + ~message:(Printf.sprintf 404 + "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 405 + name parent) 406 + ~element:name 407 + () 408 + | [] -> () 409 + end; 410 + 411 + (* 2.6 Check use element is not nested inside another use element *) 412 + if name_lower = "use" then begin 413 + match state.element_stack with 414 + | parent :: _ when String.lowercase_ascii parent = "use" -> 415 + Message_collector.add_error collector 416 + ~message:(Printf.sprintf 417 + "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 418 + name parent) 419 + ~element:name 420 + () 421 + | _ -> () 422 + end; 423 + 394 424 (* 3. Check duplicate feFunc* in feComponentTransfer *) 395 425 (match state.element_stack with 396 426 | parent :: _ when String.lowercase_ascii parent = "fecomponenttransfer" -> ··· 401 431 Message_collector.add_error collector 402 432 ~message:(Printf.sprintf 403 433 "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cfeComponentTransfer\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 404 - name_lower) 405 - ~element:name_lower 434 + name) 435 + ~element:name 406 436 () 407 437 else 408 438 fect.seen_funcs <- name_lower :: fect.seen_funcs ··· 430 460 Message_collector.add_error collector 431 461 ~message:(Printf.sprintf 432 462 "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point." 433 - attr name_lower) 434 - ~element:name_lower 463 + attr name) 464 + ~element:name 435 465 ~attribute:attr_lower 436 466 () 437 467 (* Validate path data *) 438 468 else if attr_lower = "d" && name_lower = "path" then 439 - validate_path_data value name_lower collector 469 + validate_path_data value name collector 440 470 (* Check if attribute is valid for this element *) 441 471 else if not (is_valid_attr name_lower attr_lower) then 442 472 Message_collector.add_error collector 443 473 ~message:(Printf.sprintf 444 474 "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point." 445 - attr name_lower) 446 - ~element:name_lower 475 + attr name) 476 + ~element:name 447 477 ~attribute:attr_lower 448 478 () 449 479 ) attrs;
+3 -3
lib/html5rw/parser/parser_tree_builder.ml
··· 1178 1178 | Token.Tag { kind = Token.End; name; _ } 1179 1179 when List.mem name ["address"; "article"; "aside"; "blockquote"; "button"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "listing"; "main"; "menu"; "nav"; "ol"; "pre"; "search"; "section"; "summary"; "ul"] -> 1180 1180 if not (has_element_in_scope t name) then 1181 - parse_error t "unexpected-end-tag" 1181 + parse_error t ("unexpected-end-tag:" ^ name) 1182 1182 else begin 1183 1183 generate_implied_end_tags t (); 1184 1184 (match current_node t with ··· 1527 1527 String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden" 1528 1528 ) attrs in 1529 1529 if not is_hidden then begin 1530 - parse_error t "unexpected-start-tag"; 1530 + parse_error t "start-tag-in-table:input"; 1531 1531 t.foster_parenting <- true; 1532 1532 process_in_body t token; 1533 1533 t.foster_parenting <- false 1534 1534 end else begin 1535 - parse_error t "unexpected-start-tag"; 1535 + parse_error t "start-tag-in-table:input"; 1536 1536 ignore (insert_element t "input" ~push:true attrs); 1537 1537 pop_current t 1538 1538 end
+11
test/test_nfc_debug.ml
··· 1 + let () = 2 + let content = In_channel.with_open_text "validator/tests/html-svg/struct-cond-02-t-haswarn.html" (fun ic -> 3 + In_channel.input_all ic 4 + ) in 5 + let reader = Bytesrw.Bytes.Reader.of_string content in 6 + let result = Html5_checker.check ~system_id:"test.html" reader in 7 + let warnings = Html5_checker.warnings result in 8 + Printf.printf "Total warnings: %d\n" (List.length warnings); 9 + List.iter (fun msg -> 10 + Printf.printf "WARNING: %s\n" (Html5_checker.Message.message msg) 11 + ) warnings