OCaml HTML5 parser/serialiser based on Python's JustHTML

rename

Changed files
+14 -598
lib
check
content_model
datatype
element
semantic
specialized
htmlrw_check
test
+14
.gitignore
··· 2 2 _build/ 3 3 *.install 4 4 *.merlin 5 + *.cmi 6 + *.cmo 7 + *.cmx 8 + *.cma 9 + *.cmxa 10 + *.a 11 + *.o 5 12 6 13 # Third-party sources (fetch locally with opam source) 7 14 third_party/ ··· 17 24 _opam/ 18 25 *.html 19 26 validator 27 + 28 + # Session/planning files 29 + PLAN*.md 30 + FIXES.md 31 + CLAUDE.md 32 + *_output.txt 33 + curl_response
-365
PLAN.md
··· 1 - # Plan: Nu HTML Validator Test Suite Integration 2 - 3 - This document describes how to run the Nu HTML Validator test suite against the OCaml `html5_checker` library. 4 - 5 - ## Background 6 - 7 - The Nu HTML Checker (vnu) is the W3C's official HTML validator. Its test suite in `third_party/validator/tests/` contains ~4,300 HTML test files covering HTML5 validation rules including ARIA, microdata, element nesting, required attributes, and more. 8 - 9 - ## Test Suite Structure 10 - 11 - ### Location 12 - ``` 13 - third_party/validator/tests/ 14 - ├── messages.json # Expected error messages keyed by test path 15 - ├── html/ # 2,601 core HTML5 tests 16 - │ ├── attributes/ 17 - │ ├── elements/ 18 - │ ├── microdata/ 19 - │ ├── mime-types/ 20 - │ ├── obsolete/ 21 - │ ├── parser/ 22 - │ └── ... 23 - ├── html-aria/ # 712 ARIA validation tests 24 - ├── html-its/ # 90 internationalization tests 25 - ├── html-rdfa/ # 212 RDFa tests 26 - ├── html-rdfalite/ # 56 RDFa Lite tests 27 - ├── html-svg/ # 517 SVG-in-HTML tests 28 - └── xhtml/ # 110 XHTML tests 29 - ``` 30 - 31 - ### Filename Convention 32 - 33 - Test files use a suffix to indicate expected outcome: 34 - 35 - | Suffix | Meaning | Expected Result | 36 - |--------|---------|-----------------| 37 - | `-isvalid.html` | Valid HTML | No errors, no warnings | 38 - | `-novalid.html` | Invalid HTML | At least one error | 39 - | `-haswarn.html` | Valid with warning | At least one warning | 40 - 41 - ### Expected Messages (messages.json) 42 - 43 - For `-novalid` and `-haswarn` files, `messages.json` contains the expected error/warning message: 44 - 45 - ```json 46 - { 47 - "html-aria/misc/aria-label-div-novalid.html": "The “aria-label” attribute must not be specified on any “div” element unless...", 48 - "html/mime-types/001-novalid.html": "Bad value \"text/html \" for attribute \"type\" on element \"link\": Bad MIME type..." 49 - } 50 - ``` 51 - 52 - Note: Messages use Unicode curly quotes (U+201C `“` and U+201D `”`). 
53 - 54 - ## Implementation Steps 55 - 56 - ### Step 1: Create Messages JSON Parser 57 - 58 - Create `test/validator_messages.ml`: 59 - 60 - ```ocaml 61 - (** Parser for third_party/validator/tests/messages.json *) 62 - 63 - type t = (string, string) Hashtbl.t 64 - (** Maps test file path to expected error message *) 65 - 66 - val load : string -> t 67 - (** [load path] loads messages.json from [path] *) 68 - 69 - val get : t -> string -> string option 70 - (** [get messages test_path] returns expected message for test, if any *) 71 - ``` 72 - 73 - Implementation notes: 74 - - Use `Jsont` library (already a dependency) 75 - - Keys are relative paths like `"html/parser/foo-novalid.html"` 76 - - Values are error message strings with Unicode quotes 77 - 78 - ### Step 2: Create Test File Discovery 79 - 80 - Create logic to find and classify test files: 81 - 82 - ```ocaml 83 - type expected_outcome = 84 - | Valid (* -isvalid.html: expect no errors *) 85 - | Invalid (* -novalid.html: expect error matching messages.json *) 86 - | HasWarning (* -haswarn.html: expect warning matching messages.json *) 87 - 88 - type test_file = { 89 - path : string; (* Full filesystem path *) 90 - relative_path : string; (* Path relative to tests/, used as key in messages.json *) 91 - category : string; (* html, html-aria, etc. *) 92 - expected : expected_outcome; 93 - } 94 - 95 - val discover_tests : string -> test_file list 96 - (** [discover_tests tests_dir] finds all test files recursively *) 97 - 98 - val parse_outcome : string -> expected_outcome 99 - (** [parse_outcome filename] extracts outcome from filename suffix *) 100 - ``` 101 - 102 - ### Step 3: Create Test Runner 103 - 104 - Create `test/test_validator.ml`: 105 - 106 - ```ocaml 107 - (** Test runner for Nu HTML Validator test suite *) 108 - 109 - (** Run a single test, returns (passed, details) *) 110 - let run_test messages test_file = 111 - (* 1. 
Read HTML content *) 112 - let content = read_file test_file.path in 113 - 114 - (* 2. Run validator *) 115 - let result = Html5_checker.check 116 - ~collect_parse_errors:true 117 - ~system_id:test_file.relative_path 118 - (Bytesrw.Bytes.Reader.of_string content) in 119 - 120 - (* 3. Check result against expected outcome *) 121 - match test_file.expected with 122 - | Valid -> 123 - (* Should have no errors or warnings *) 124 - let errors = Html5_checker.errors result in 125 - let warnings = Html5_checker.warnings result in 126 - if errors = [] && warnings = [] then 127 - (true, "OK: No messages") 128 - else 129 - (false, Printf.sprintf "Expected valid but got %d errors, %d warnings" 130 - (List.length errors) (List.length warnings)) 131 - 132 - | Invalid -> 133 - (* Should have at least one error matching expected message *) 134 - let errors = Html5_checker.errors result in 135 - let expected_msg = Validator_messages.get messages test_file.relative_path in 136 - if errors = [] then 137 - (false, "Expected error but got none") 138 - else 139 - check_message_match errors expected_msg 140 - 141 - | HasWarning -> 142 - (* Should have at least one warning matching expected message *) 143 - let warnings = Html5_checker.warnings result in 144 - let expected_msg = Validator_messages.get messages test_file.relative_path in 145 - if warnings = [] then 146 - (false, "Expected warning but got none") 147 - else 148 - check_message_match warnings expected_msg 149 - ``` 150 - 151 - ### Step 4: Message Matching Strategy 152 - 153 - The OCaml checker may produce different message text than the Nu validator. 
Implement flexible matching: 154 - 155 - ```ocaml 156 - (** Check if actual message matches expected *) 157 - let message_matches ~expected ~actual = 158 - (* Strategy 1: Exact match *) 159 - if actual = expected then true 160 - (* Strategy 2: Normalized match (ignore quote style) *) 161 - else if normalize_quotes actual = normalize_quotes expected then true 162 - (* Strategy 3: Substring match *) 163 - else if String.is_substring actual ~substring:(extract_core expected) then true 164 - else false 165 - 166 - (** Normalize Unicode curly quotes to ASCII *) 167 - let normalize_quotes s = 168 - s |> String.map (function 169 - | '\u{201C}' | '\u{201D}' -> '"' (* " " -> " *) 170 - | c -> c) 171 - ``` 172 - 173 - ### Step 5: Test Categories for Selective Running 174 - 175 - Map tests to checker categories for phased enablement: 176 - 177 - ```ocaml 178 - type checker_category = 179 - | Parse_errors (* Built into parser *) 180 - | Nesting (* Nesting_checker *) 181 - | Aria (* Aria_checker *) 182 - | Required_attrs (* Required_attr_checker *) 183 - | Obsolete (* Obsolete_checker *) 184 - | Id_uniqueness (* Id_checker *) 185 - | Table_structure (* Table_checker *) 186 - | Heading_structure (* Heading_checker *) 187 - | Form_validation (* Form_checker *) 188 - | Microdata (* Microdata_checker *) 189 - | Language (* Language_checker *) 190 - | Unknown 191 - 192 - (** Infer category from test path *) 193 - let categorize_test test_file = 194 - match test_file.category, extract_subcategory test_file.relative_path with 195 - | "html-aria", _ -> Aria 196 - | "html", "parser" -> Parse_errors 197 - | "html", "elements" -> Nesting (* mostly *) 198 - | "html", "attributes" -> Required_attrs 199 - | "html", "obsolete" -> Obsolete 200 - | "html", "microdata" -> Microdata 201 - | _ -> Unknown 202 - ``` 203 - 204 - ### Step 6: Dune Build Integration 205 - 206 - Add to `test/dune`: 207 - 208 - ```dune 209 - (executable 210 - (name test_validator) 211 - (modules test_validator 
validator_messages) 212 - (libraries bytesrw html5rw html5rw.checker jsont jsont.bytesrw test_report)) 213 - 214 - (rule 215 - (alias validator-tests) 216 - (deps 217 - (glob_files_rec ../third_party/validator/tests/**/*.html) 218 - ../third_party/validator/tests/messages.json) 219 - (action 220 - (run %{exe:test_validator.exe} ../third_party/validator/tests))) 221 - ``` 222 - 223 - Note: Use separate alias `validator-tests` initially (not `runtest`) since many tests will fail until checkers are integrated. 224 - 225 - ### Step 7: Reporting 226 - 227 - Generate reports compatible with existing test infrastructure: 228 - 229 - ```ocaml 230 - (** Print summary *) 231 - let print_summary results = 232 - let by_category = group_by_category results in 233 - List.iter (fun (cat, tests) -> 234 - let passed = List.filter fst tests |> List.length in 235 - let total = List.length tests in 236 - Printf.printf "%s: %d/%d passed\n" (category_name cat) passed total 237 - ) by_category; 238 - 239 - (* Overall *) 240 - let total_passed = List.filter (fun (p, _) -> p) results |> List.length in 241 - Printf.printf "\nTotal: %d/%d passed\n" total_passed (List.length results) 242 - 243 - (** Generate HTML report *) 244 - let write_html_report results filename = 245 - (* Use Test_report module pattern from other tests *) 246 - ... 247 - ``` 248 - 249 - ## Prerequisites 250 - 251 - Before tests can pass, the following must be completed: 252 - 253 - ### 1. Wire Checker Registry into Html5_checker.check 254 - 255 - In `lib/html5_checker/html5_checker.ml`, the `check` and `check_dom` functions currently only collect parse errors. 
They need to: 256 - 257 - ```ocaml 258 - let check ?collect_parse_errors ?system_id reader = 259 - let doc = Html5rw.parse reader in 260 - let collector = Message_collector.create () in 261 - 262 - (* Collect parse errors if requested *) 263 - if Option.value ~default:false collect_parse_errors then 264 - Parse_error_bridge.collect_parse_errors ?system_id doc 265 - |> List.iter (Message_collector.add collector); 266 - 267 - (* TODO: Run checkers - THIS NEEDS TO BE IMPLEMENTED *) 268 - let registry = Checker_registry.default () in 269 - Dom_walker.walk_registry registry collector (Html5rw.root doc); 270 - 271 - { document = doc; collector; system_id } 272 - ``` 273 - 274 - ### 2. Populate Default Checker Registry 275 - 276 - In `lib/html5_checker/checker_registry.ml`: 277 - 278 - ```ocaml 279 - let default () = 280 - let reg = create () in 281 - register reg "nesting" Nesting_checker.checker; 282 - register reg "aria" Aria_checker.checker; 283 - register reg "required-attrs" Required_attr_checker.checker; 284 - register reg "obsolete" Obsolete_checker.checker; 285 - register reg "id" Id_checker.checker; 286 - register reg "table" Table_checker.checker; 287 - register reg "heading" Heading_checker.checker; 288 - register reg "form" Form_checker.checker; 289 - register reg "microdata" Microdata_checker.checker; 290 - register reg "language" Language_checker.checker; 291 - reg 292 - ``` 293 - 294 - ### 3. Ensure Checkers Produce Compatible Messages 295 - 296 - Review each checker's error messages against `messages.json` to ensure they can be matched. 
May need to: 297 - - Use curly quotes in messages 298 - - Match Nu validator's phrasing 299 - - Include element/attribute names in same format 300 - 301 - ## Phased Rollout 302 - 303 - Run tests incrementally as checkers are integrated: 304 - 305 - | Phase | Command | What's Tested | 306 - |-------|---------|---------------| 307 - | 1 | `--category=parse` | Parse errors only (~200 tests) | 308 - | 2 | `--category=nesting` | + Nesting checker | 309 - | 3 | `--category=aria` | + ARIA checker (~700 tests) | 310 - | 4 | `--category=required` | + Required attributes | 311 - | 5 | (all) | Full suite | 312 - 313 - Implement command-line filtering: 314 - 315 - ```ocaml 316 - let () = 317 - let tests_dir = Sys.argv.(1) in 318 - let category_filter = 319 - if Array.length Sys.argv > 2 then Some Sys.argv.(2) else None in 320 - 321 - let messages = Validator_messages.load (tests_dir ^ "/messages.json") in 322 - let tests = discover_tests tests_dir in 323 - let tests = match category_filter with 324 - | Some cat -> List.filter (fun t -> categorize_test t = cat) tests 325 - | None -> tests in 326 - 327 - run_tests messages tests 328 - ``` 329 - 330 - ## Expected Test Counts by Category 331 - 332 - Based on file counts in `third_party/validator/tests/`: 333 - 334 - | Category | Files | Notes | 335 - |----------|-------|-------| 336 - | html/ | 2,601 | Core HTML5 validation | 337 - | html-aria/ | 712 | ARIA attributes | 338 - | html-svg/ | 517 | SVG embedded in HTML | 339 - | html-rdfa/ | 212 | RDFa semantic markup | 340 - | xhtml/ | 110 | XHTML variant | 341 - | html-its/ | 90 | Internationalization | 342 - | html-rdfalite/ | 56 | RDFa Lite | 343 - | **Total** | **~4,300** | | 344 - 345 - ## Files to Create 346 - 347 - | File | Purpose | 348 - |------|---------| 349 - | `test/validator_messages.ml` | Load and query messages.json | 350 - | `test/test_validator.ml` | Main test runner | 351 - | `test/dune` (modify) | Add build rules | 352 - 353 - ## Success Criteria 354 - 355 - 1. 
All `-isvalid.html` tests pass (no false positives) 356 - 2. All `-novalid.html` tests produce at least one error 357 - 3. All `-haswarn.html` tests produce at least one warning 358 - 4. Message content matches for implemented checkers 359 - 5. HTML report generated for review 360 - 361 - ## Reference 362 - 363 - - Nu HTML Checker: https://validator.github.io/validator/ 364 - - Test harness reference: `third_party/validator/resources/examples/test-harness/validator-tester.py` 365 - - Existing OCaml tests: `test/test_html5lib.ml`, `test/test_tokenizer.ml`
lib/htmlrw_check/attr_utils.ml lib/check/attr_utils.ml
lib/htmlrw_check/attr_utils.mli lib/check/attr_utils.mli
lib/htmlrw_check/checker.ml lib/check/checker.ml
lib/htmlrw_check/checker.mli lib/check/checker.mli
lib/htmlrw_check/checker_registry.ml lib/check/checker_registry.ml
lib/htmlrw_check/checker_registry.mli lib/check/checker_registry.mli
lib/htmlrw_check/content_model/attr_spec.ml lib/check/content_model/attr_spec.ml
lib/htmlrw_check/content_model/attr_spec.mli lib/check/content_model/attr_spec.mli
-26
lib/htmlrw_check/content_model/attribute_spec.ml
··· 1 - type requirement = Required | Optional 2 - 3 - type t = { 4 - name : string; 5 - requirement : requirement; 6 - datatype : string option; 7 - description : string; 8 - } 9 - 10 - let create ~name ?(requirement = Optional) ?datatype ~description () = 11 - { name; requirement; datatype; description } 12 - 13 - let requirement_to_string = function 14 - | Required -> "required" 15 - | Optional -> "optional" 16 - 17 - let to_string t = 18 - let req = requirement_to_string t.requirement in 19 - let dt = 20 - match t.datatype with 21 - | None -> "" 22 - | Some d -> Printf.sprintf " (%s)" d 23 - in 24 - Printf.sprintf "%s [%s]%s: %s" t.name req dt t.description 25 - 26 - let pp fmt t = Format.fprintf fmt "%s" (to_string t)
-40
lib/htmlrw_check/content_model/attribute_spec.mli
··· 1 - (** HTML5 attribute specifications. 2 - 3 - Defines attribute requirements and constraints for HTML5 elements. 4 - See https://html.spec.whatwg.org/multipage/indices.html#attributes-3 *) 5 - 6 - type requirement = 7 - | Required (** Attribute must be present *) 8 - | Optional (** Attribute may be present *) 9 - 10 - type t = { 11 - name : string; 12 - (** Attribute name *) 13 - 14 - requirement : requirement; 15 - (** Whether attribute is required or optional *) 16 - 17 - datatype : string option; 18 - (** Datatype validator name (e.g., "url", "integer", "boolean") *) 19 - 20 - description : string; 21 - (** Human-readable description *) 22 - } 23 - 24 - val create : 25 - name:string -> 26 - ?requirement:requirement -> 27 - ?datatype:string -> 28 - description:string -> 29 - unit -> 30 - t 31 - (** Create an attribute specification. 32 - 33 - @param name Attribute name 34 - @param requirement Whether required or optional (default: Optional) 35 - @param datatype Datatype validator name 36 - @param description Human-readable description *) 37 - 38 - val to_string : t -> string 39 - 40 - val pp : Format.formatter -> t -> unit
lib/htmlrw_check/content_model/category.ml lib/check/content_model/category.ml
lib/htmlrw_check/content_model/category.mli lib/check/content_model/category.mli
lib/htmlrw_check/content_model/content_category.ml lib/check/content_model/content_category.ml
lib/htmlrw_check/content_model/content_category.mli lib/check/content_model/content_category.mli
lib/htmlrw_check/content_model/content_checker.ml lib/check/content_model/content_checker.ml
lib/htmlrw_check/content_model/content_checker.mli lib/check/content_model/content_checker.mli
lib/htmlrw_check/content_model/content_model.ml lib/check/content_model/content_model.ml
lib/htmlrw_check/content_model/content_model.mli lib/check/content_model/content_model.mli
lib/htmlrw_check/content_model/element_registry.ml lib/check/content_model/element_registry.ml
lib/htmlrw_check/content_model/element_registry.mli lib/check/content_model/element_registry.mli
lib/htmlrw_check/content_model/element_spec.ml lib/check/content_model/element_spec.ml
lib/htmlrw_check/content_model/element_spec.mli lib/check/content_model/element_spec.mli
lib/htmlrw_check/content_model/elements_document.ml lib/check/content_model/elements_document.ml
lib/htmlrw_check/content_model/elements_document.mli lib/check/content_model/elements_document.mli
lib/htmlrw_check/content_model/elements_embedded.ml lib/check/content_model/elements_embedded.ml
lib/htmlrw_check/content_model/elements_embedded.mli lib/check/content_model/elements_embedded.mli
lib/htmlrw_check/content_model/elements_form.ml lib/check/content_model/elements_form.ml
lib/htmlrw_check/content_model/elements_form.mli lib/check/content_model/elements_form.mli
lib/htmlrw_check/content_model/elements_interactive.ml lib/check/content_model/elements_interactive.ml
lib/htmlrw_check/content_model/elements_interactive.mli lib/check/content_model/elements_interactive.mli
lib/htmlrw_check/content_model/elements_table.ml lib/check/content_model/elements_table.ml
lib/htmlrw_check/content_model/elements_table.mli lib/check/content_model/elements_table.mli
lib/htmlrw_check/content_model/elements_text.ml lib/check/content_model/elements_text.ml
lib/htmlrw_check/content_model/elements_text.mli lib/check/content_model/elements_text.mli
lib/htmlrw_check/context_tracker.ml lib/check/context_tracker.ml
lib/htmlrw_check/context_tracker.mli lib/check/context_tracker.mli
lib/htmlrw_check/datatype/datatype.cmi

This is a binary file and will not be displayed.

lib/htmlrw_check/datatype/datatype.ml lib/check/datatype/datatype.ml
lib/htmlrw_check/datatype/datatype.mli lib/check/datatype/datatype.mli
lib/htmlrw_check/datatype/datatype_registry.ml lib/check/datatype/datatype_registry.ml
lib/htmlrw_check/datatype/datatype_registry.mli lib/check/datatype/datatype_registry.mli
lib/htmlrw_check/datatype/dt_autocomplete.ml lib/check/datatype/dt_autocomplete.ml
lib/htmlrw_check/datatype/dt_autocomplete.mli lib/check/datatype/dt_autocomplete.mli
lib/htmlrw_check/datatype/dt_boolean.ml lib/check/datatype/dt_boolean.ml
lib/htmlrw_check/datatype/dt_boolean.mli lib/check/datatype/dt_boolean.mli
lib/htmlrw_check/datatype/dt_button_type.ml lib/check/datatype/dt_button_type.ml
lib/htmlrw_check/datatype/dt_button_type.mli lib/check/datatype/dt_button_type.mli
lib/htmlrw_check/datatype/dt_charset.ml lib/check/datatype/dt_charset.ml
lib/htmlrw_check/datatype/dt_charset.mli lib/check/datatype/dt_charset.mli
lib/htmlrw_check/datatype/dt_color.ml lib/check/datatype/dt_color.ml
lib/htmlrw_check/datatype/dt_color.mli lib/check/datatype/dt_color.mli
lib/htmlrw_check/datatype/dt_contenteditable.ml lib/check/datatype/dt_contenteditable.ml
lib/htmlrw_check/datatype/dt_contenteditable.mli lib/check/datatype/dt_contenteditable.mli
lib/htmlrw_check/datatype/dt_coords.ml lib/check/datatype/dt_coords.ml
lib/htmlrw_check/datatype/dt_coords.mli lib/check/datatype/dt_coords.mli
lib/htmlrw_check/datatype/dt_crossorigin.ml lib/check/datatype/dt_crossorigin.ml
lib/htmlrw_check/datatype/dt_crossorigin.mli lib/check/datatype/dt_crossorigin.mli
lib/htmlrw_check/datatype/dt_datetime.ml lib/check/datatype/dt_datetime.ml
lib/htmlrw_check/datatype/dt_datetime.mli lib/check/datatype/dt_datetime.mli
lib/htmlrw_check/datatype/dt_decoding.ml lib/check/datatype/dt_decoding.ml
lib/htmlrw_check/datatype/dt_decoding.mli lib/check/datatype/dt_decoding.mli
lib/htmlrw_check/datatype/dt_dir.ml lib/check/datatype/dt_dir.ml
lib/htmlrw_check/datatype/dt_dir.mli lib/check/datatype/dt_dir.mli
lib/htmlrw_check/datatype/dt_draggable.ml lib/check/datatype/dt_draggable.ml
lib/htmlrw_check/datatype/dt_draggable.mli lib/check/datatype/dt_draggable.mli
lib/htmlrw_check/datatype/dt_email.ml lib/check/datatype/dt_email.ml
lib/htmlrw_check/datatype/dt_email.mli lib/check/datatype/dt_email.mli
lib/htmlrw_check/datatype/dt_enterkeyhint.ml lib/check/datatype/dt_enterkeyhint.ml
lib/htmlrw_check/datatype/dt_enterkeyhint.mli lib/check/datatype/dt_enterkeyhint.mli
lib/htmlrw_check/datatype/dt_fetchpriority.ml lib/check/datatype/dt_fetchpriority.ml
lib/htmlrw_check/datatype/dt_fetchpriority.mli lib/check/datatype/dt_fetchpriority.mli
lib/htmlrw_check/datatype/dt_float.ml lib/check/datatype/dt_float.ml
lib/htmlrw_check/datatype/dt_float.mli lib/check/datatype/dt_float.mli
lib/htmlrw_check/datatype/dt_form_enctype.ml lib/check/datatype/dt_form_enctype.ml
lib/htmlrw_check/datatype/dt_form_enctype.mli lib/check/datatype/dt_form_enctype.mli
lib/htmlrw_check/datatype/dt_form_method.ml lib/check/datatype/dt_form_method.ml
lib/htmlrw_check/datatype/dt_form_method.mli lib/check/datatype/dt_form_method.mli
lib/htmlrw_check/datatype/dt_hash.ml lib/check/datatype/dt_hash.ml
lib/htmlrw_check/datatype/dt_hash.mli lib/check/datatype/dt_hash.mli
lib/htmlrw_check/datatype/dt_hidden.ml lib/check/datatype/dt_hidden.ml
lib/htmlrw_check/datatype/dt_hidden.mli lib/check/datatype/dt_hidden.mli
lib/htmlrw_check/datatype/dt_id.ml lib/check/datatype/dt_id.ml
lib/htmlrw_check/datatype/dt_id.mli lib/check/datatype/dt_id.mli
lib/htmlrw_check/datatype/dt_input_type.ml lib/check/datatype/dt_input_type.ml
lib/htmlrw_check/datatype/dt_input_type.mli lib/check/datatype/dt_input_type.mli
lib/htmlrw_check/datatype/dt_inputmode.ml lib/check/datatype/dt_inputmode.ml
lib/htmlrw_check/datatype/dt_inputmode.mli lib/check/datatype/dt_inputmode.mli
lib/htmlrw_check/datatype/dt_integer.ml lib/check/datatype/dt_integer.ml
lib/htmlrw_check/datatype/dt_integer.mli lib/check/datatype/dt_integer.mli
lib/htmlrw_check/datatype/dt_integrity.ml lib/check/datatype/dt_integrity.ml
lib/htmlrw_check/datatype/dt_integrity.mli lib/check/datatype/dt_integrity.mli
lib/htmlrw_check/datatype/dt_kind.ml lib/check/datatype/dt_kind.ml
lib/htmlrw_check/datatype/dt_kind.mli lib/check/datatype/dt_kind.mli
lib/htmlrw_check/datatype/dt_language.ml lib/check/datatype/dt_language.ml
lib/htmlrw_check/datatype/dt_language.mli lib/check/datatype/dt_language.mli
lib/htmlrw_check/datatype/dt_list_type.ml lib/check/datatype/dt_list_type.ml
lib/htmlrw_check/datatype/dt_list_type.mli lib/check/datatype/dt_list_type.mli
lib/htmlrw_check/datatype/dt_loading.ml lib/check/datatype/dt_loading.ml
lib/htmlrw_check/datatype/dt_loading.mli lib/check/datatype/dt_loading.mli
lib/htmlrw_check/datatype/dt_media_query.ml lib/check/datatype/dt_media_query.ml
lib/htmlrw_check/datatype/dt_media_query.mli lib/check/datatype/dt_media_query.mli
lib/htmlrw_check/datatype/dt_mime.ml lib/check/datatype/dt_mime.ml
lib/htmlrw_check/datatype/dt_mime.mli lib/check/datatype/dt_mime.mli
lib/htmlrw_check/datatype/dt_popover.ml lib/check/datatype/dt_popover.ml
lib/htmlrw_check/datatype/dt_popover.mli lib/check/datatype/dt_popover.mli
lib/htmlrw_check/datatype/dt_preload.ml lib/check/datatype/dt_preload.ml
lib/htmlrw_check/datatype/dt_preload.mli lib/check/datatype/dt_preload.mli
lib/htmlrw_check/datatype/dt_referrer.ml lib/check/datatype/dt_referrer.ml
lib/htmlrw_check/datatype/dt_referrer.mli lib/check/datatype/dt_referrer.mli
lib/htmlrw_check/datatype/dt_sandbox.ml lib/check/datatype/dt_sandbox.ml
lib/htmlrw_check/datatype/dt_sandbox.mli lib/check/datatype/dt_sandbox.mli
lib/htmlrw_check/datatype/dt_scope.ml lib/check/datatype/dt_scope.ml
lib/htmlrw_check/datatype/dt_scope.mli lib/check/datatype/dt_scope.mli
lib/htmlrw_check/datatype/dt_shape.ml lib/check/datatype/dt_shape.ml
lib/htmlrw_check/datatype/dt_shape.mli lib/check/datatype/dt_shape.mli
lib/htmlrw_check/datatype/dt_spellcheck.ml lib/check/datatype/dt_spellcheck.ml
lib/htmlrw_check/datatype/dt_spellcheck.mli lib/check/datatype/dt_spellcheck.mli
lib/htmlrw_check/datatype/dt_srcset.ml lib/check/datatype/dt_srcset.ml
lib/htmlrw_check/datatype/dt_srcset.mli lib/check/datatype/dt_srcset.mli
lib/htmlrw_check/datatype/dt_target.ml lib/check/datatype/dt_target.ml
lib/htmlrw_check/datatype/dt_target.mli lib/check/datatype/dt_target.mli
lib/htmlrw_check/datatype/dt_translate.ml lib/check/datatype/dt_translate.ml
lib/htmlrw_check/datatype/dt_translate.mli lib/check/datatype/dt_translate.mli
lib/htmlrw_check/datatype/dt_url.ml lib/check/datatype/dt_url.ml
lib/htmlrw_check/datatype/dt_url.mli lib/check/datatype/dt_url.mli
lib/htmlrw_check/datatype/dt_wrap.ml lib/check/datatype/dt_wrap.ml
lib/htmlrw_check/datatype/dt_wrap.mli lib/check/datatype/dt_wrap.mli
lib/htmlrw_check/dom_walker.ml lib/check/dom_walker.ml
lib/htmlrw_check/dom_walker.mli lib/check/dom_walker.mli
lib/htmlrw_check/dune lib/check/dune
lib/htmlrw_check/element/attr.ml lib/check/element/attr.ml
lib/htmlrw_check/element/attr.mli lib/check/element/attr.mli
lib/htmlrw_check/element/element.ml lib/check/element/element.ml
lib/htmlrw_check/element/element.mli lib/check/element/element.mli
lib/htmlrw_check/element/tag.ml lib/check/element/tag.ml
lib/htmlrw_check/element/tag.mli lib/check/element/tag.mli
lib/htmlrw_check/error_code.ml lib/check/error_code.ml
lib/htmlrw_check/error_code.mli lib/check/error_code.mli
lib/htmlrw_check/htmlrw_check.ml lib/check/htmlrw_check.ml
lib/htmlrw_check/htmlrw_check.mli lib/check/htmlrw_check.mli
lib/htmlrw_check/message.cmi

This is a binary file and will not be displayed.

lib/htmlrw_check/message.ml lib/check/message.ml
lib/htmlrw_check/message.mli lib/check/message.mli
lib/htmlrw_check/message_collector.ml lib/check/message_collector.ml
lib/htmlrw_check/message_collector.mli lib/check/message_collector.mli
lib/htmlrw_check/message_format.ml lib/check/message_format.ml
lib/htmlrw_check/message_format.mli lib/check/message_format.mli
lib/htmlrw_check/parse_error_bridge.ml lib/check/parse_error_bridge.ml
lib/htmlrw_check/parse_error_bridge.mli lib/check/parse_error_bridge.mli
lib/htmlrw_check/semantic/autofocus_checker.ml lib/check/semantic/autofocus_checker.ml
lib/htmlrw_check/semantic/autofocus_checker.mli lib/check/semantic/autofocus_checker.mli
lib/htmlrw_check/semantic/form_checker.ml lib/check/semantic/form_checker.ml
lib/htmlrw_check/semantic/form_checker.mli lib/check/semantic/form_checker.mli
lib/htmlrw_check/semantic/id_checker.ml lib/check/semantic/id_checker.ml
lib/htmlrw_check/semantic/id_checker.mli lib/check/semantic/id_checker.mli
lib/htmlrw_check/semantic/lang_detecting_checker.ml lib/check/semantic/lang_detecting_checker.ml
lib/htmlrw_check/semantic/lang_detecting_checker.mli lib/check/semantic/lang_detecting_checker.mli
lib/htmlrw_check/semantic/nesting_checker.ml lib/check/semantic/nesting_checker.ml
lib/htmlrw_check/semantic/nesting_checker.mli lib/check/semantic/nesting_checker.mli
lib/htmlrw_check/semantic/obsolete_checker.ml lib/check/semantic/obsolete_checker.ml
lib/htmlrw_check/semantic/obsolete_checker.mli lib/check/semantic/obsolete_checker.mli
lib/htmlrw_check/semantic/option_checker.ml lib/check/semantic/option_checker.ml
lib/htmlrw_check/semantic/option_checker.mli lib/check/semantic/option_checker.mli
lib/htmlrw_check/semantic/required_attr_checker.ml lib/check/semantic/required_attr_checker.ml
lib/htmlrw_check/semantic/required_attr_checker.mli lib/check/semantic/required_attr_checker.mli
lib/htmlrw_check/specialized/aria_checker.ml lib/check/specialized/aria_checker.ml
lib/htmlrw_check/specialized/aria_checker.mli lib/check/specialized/aria_checker.mli
lib/htmlrw_check/specialized/attr_restrictions_checker.ml lib/check/specialized/attr_restrictions_checker.ml
lib/htmlrw_check/specialized/attr_restrictions_checker.mli lib/check/specialized/attr_restrictions_checker.mli
lib/htmlrw_check/specialized/base_checker.ml lib/check/specialized/base_checker.ml
lib/htmlrw_check/specialized/base_checker.mli lib/check/specialized/base_checker.mli
lib/htmlrw_check/specialized/datetime_checker.ml lib/check/specialized/datetime_checker.ml
lib/htmlrw_check/specialized/datetime_checker.mli lib/check/specialized/datetime_checker.mli
lib/htmlrw_check/specialized/dl_checker.ml lib/check/specialized/dl_checker.ml
lib/htmlrw_check/specialized/dl_checker.mli lib/check/specialized/dl_checker.mli
lib/htmlrw_check/specialized/h1_checker.ml lib/check/specialized/h1_checker.ml
lib/htmlrw_check/specialized/h1_checker.mli lib/check/specialized/h1_checker.mli
lib/htmlrw_check/specialized/heading_checker.ml lib/check/specialized/heading_checker.ml
lib/htmlrw_check/specialized/heading_checker.mli lib/check/specialized/heading_checker.mli
lib/htmlrw_check/specialized/importmap_checker.ml lib/check/specialized/importmap_checker.ml
lib/htmlrw_check/specialized/importmap_checker.mli lib/check/specialized/importmap_checker.mli
lib/htmlrw_check/specialized/label_checker.ml lib/check/specialized/label_checker.ml
lib/htmlrw_check/specialized/label_checker.mli lib/check/specialized/label_checker.mli
lib/htmlrw_check/specialized/language_checker.ml lib/check/specialized/language_checker.ml
lib/htmlrw_check/specialized/language_checker.mli lib/check/specialized/language_checker.mli
lib/htmlrw_check/specialized/microdata_checker.ml lib/check/specialized/microdata_checker.ml
lib/htmlrw_check/specialized/microdata_checker.mli lib/check/specialized/microdata_checker.mli
lib/htmlrw_check/specialized/mime_type_checker.ml lib/check/specialized/mime_type_checker.ml
lib/htmlrw_check/specialized/mime_type_checker.mli lib/check/specialized/mime_type_checker.mli
lib/htmlrw_check/specialized/normalization_checker.ml lib/check/specialized/normalization_checker.ml
lib/htmlrw_check/specialized/normalization_checker.mli lib/check/specialized/normalization_checker.mli
lib/htmlrw_check/specialized/picture_checker.ml lib/check/specialized/picture_checker.ml
lib/htmlrw_check/specialized/picture_checker.mli lib/check/specialized/picture_checker.mli
lib/htmlrw_check/specialized/ruby_checker.ml lib/check/specialized/ruby_checker.ml
lib/htmlrw_check/specialized/ruby_checker.mli lib/check/specialized/ruby_checker.mli
lib/htmlrw_check/specialized/source_checker.ml lib/check/specialized/source_checker.ml
lib/htmlrw_check/specialized/source_checker.mli lib/check/specialized/source_checker.mli
lib/htmlrw_check/specialized/srcset_sizes_checker.ml lib/check/specialized/srcset_sizes_checker.ml
lib/htmlrw_check/specialized/srcset_sizes_checker.mli lib/check/specialized/srcset_sizes_checker.mli
lib/htmlrw_check/specialized/svg_checker.ml lib/check/specialized/svg_checker.ml
lib/htmlrw_check/specialized/svg_checker.mli lib/check/specialized/svg_checker.mli
lib/htmlrw_check/specialized/table_checker.ml lib/check/specialized/table_checker.ml
lib/htmlrw_check/specialized/table_checker.mli lib/check/specialized/table_checker.mli
lib/htmlrw_check/specialized/title_checker.ml lib/check/specialized/title_checker.ml
lib/htmlrw_check/specialized/title_checker.mli lib/check/specialized/title_checker.mli
lib/htmlrw_check/specialized/unknown_element_checker.ml lib/check/specialized/unknown_element_checker.ml
lib/htmlrw_check/specialized/unknown_element_checker.mli lib/check/specialized/unknown_element_checker.mli
lib/htmlrw_check/specialized/url_checker.ml lib/check/specialized/url_checker.ml
lib/htmlrw_check/specialized/url_checker.mli lib/check/specialized/url_checker.mli
lib/htmlrw_check/specialized/xhtml_content_checker.ml lib/check/specialized/xhtml_content_checker.ml
lib/htmlrw_check/specialized/xhtml_content_checker.mli lib/check/specialized/xhtml_content_checker.mli
lib/htmlrw_check/xhtml_parser.ml lib/check/xhtml_parser.ml
lib/htmlrw_check/xhtml_parser.mli lib/check/xhtml_parser.mli
-62
test/analyze_failures.ml
··· 1 - (* Quick analysis: find failing test files and print their content *) 2 - 3 - let tests_dir = "validator/tests" 4 - 5 - type expected_outcome = Valid | Invalid | HasWarning | Unknown 6 - 7 - let parse_outcome filename = 8 - (* Check .html *) 9 - if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then Valid 10 - else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then Invalid 11 - else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then HasWarning 12 - (* Check .xhtml *) 13 - else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then Valid 14 - else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then Invalid 15 - else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then HasWarning 16 - else Unknown 17 - 18 - let rec find_files dir = 19 - let entries = Sys.readdir dir |> Array.to_list in 20 - List.concat_map (fun entry -> 21 - let path = Filename.concat dir entry in 22 - if Sys.is_directory path then find_files path 23 - else if parse_outcome (Filename.basename path) <> Unknown then [path] 24 - else [] 25 - ) entries 26 - 27 - let () = 28 - let mode = if Array.length Sys.argv > 1 then Sys.argv.(1) else "novalid" in 29 - let files = find_files tests_dir in 30 - let count = ref 0 in 31 - 32 - List.iter (fun path -> 33 - let outcome = parse_outcome (Filename.basename path) in 34 - let ic = open_in path in 35 - let content = really_input_string ic (in_channel_length ic) in 36 - close_in ic; 37 - 38 - let reader = Bytesrw.Bytes.Reader.of_string content in 39 - let result = Htmlrw_check.check ~collect_parse_errors:true reader in 40 - let errors = Htmlrw_check.errors result in 41 - let warnings = Htmlrw_check.warnings result in 
42 - 43 - let should_print = match mode with 44 - | "isvalid" -> outcome = Valid && (errors <> [] || warnings <> []) && !count < 60 45 - | _ -> outcome = Invalid && errors = [] && !count < 60 46 - in 47 - if should_print then begin 48 - Printf.printf "\n=== %s ===\n" path; 49 - if mode = "isvalid" then begin 50 - if errors <> [] then begin 51 - Printf.printf "ERRORS:\n"; 52 - List.iter (fun e -> Printf.printf " %s\n" e.Htmlrw_check.text) errors 53 - end; 54 - if warnings <> [] then begin 55 - Printf.printf "WARNINGS:\n"; 56 - List.iter (fun w -> Printf.printf " %s\n" w.Htmlrw_check.text) warnings 57 - end 58 - end; 59 - print_endline content; 60 - incr count 61 - end 62 - ) files
-38
test/debug_check.ml
··· 1 - let () = 2 - let test_file = "validator/tests/html/attributes/lang/missing-lang-attribute-haswarn.html" in 3 - let ic = open_in test_file in 4 - let html = really_input_string ic (in_channel_length ic) in 5 - close_in ic; 6 - let reader = Bytesrw.Bytes.Reader.of_string html in 7 - let doc = Html5rw.parse ~collect_errors:true reader in 8 - let root = Html5rw.root doc in 9 - print_endline "=== DOM Structure (with namespaces) ==="; 10 - let rec print_node indent (node : Html5rw.Dom.node) = 11 - let open Html5rw.Dom in 12 - match node.name with 13 - | "#text" -> () 14 - | "#document" | "#document-fragment" -> 15 - Printf.printf "%s%s\n" indent node.name; 16 - List.iter (print_node (indent ^ " ")) node.children 17 - | "!doctype" -> () 18 - | "#comment" -> () 19 - | _ -> 20 - let ns = match node.namespace with Some ns -> ns | None -> "none" in 21 - Printf.printf "%s<%s ns=%s>\n" indent node.name ns; 22 - List.iter (fun (k, v) -> 23 - if k = "foo" then Printf.printf "%s @%s=%s\n" indent k v 24 - ) node.attrs; 25 - List.iter (print_node (indent ^ " ")) node.children 26 - in 27 - print_node "" root; 28 - print_endline "\n=== Checking... ==="; 29 - let reader2 = Bytesrw.Bytes.Reader.of_string html in 30 - let result = Htmlrw_check.check ~collect_parse_errors:true ~system_id:test_file reader2 in 31 - let errors = Htmlrw_check.errors result in 32 - let warnings = Htmlrw_check.warnings result in 33 - print_endline "=== Errors ==="; 34 - List.iter (fun e -> print_endline e.Htmlrw_check.text) errors; 35 - print_endline "\n=== Warnings ==="; 36 - List.iter (fun e -> print_endline e.Htmlrw_check.text) warnings; 37 - print_endline "\n=== Expected ==="; 38 - print_endline "Consider adding a \xe2\x80\x9clang\xe2\x80\x9d attribute to the \xe2\x80\x9chtml\xe2\x80\x9d start tag to declare the language of this document."
-41
test/debug_validator.ml
··· 1 - (** Debug utility for testing individual HTML files against the validator *) 2 - 3 - let () = 4 - if Array.length Sys.argv < 2 then begin 5 - Printf.printf "Usage: debug_validator <html-file>\n"; 6 - exit 1 7 - end; 8 - 9 - let path = Sys.argv.(1) in 10 - let ic = open_in path in 11 - let content = really_input_string ic (in_channel_length ic) in 12 - close_in ic; 13 - 14 - Printf.printf "=== Checking: %s ===\n\n" path; 15 - Printf.printf "Input (%d bytes):\n%s\n\n" (String.length content) content; 16 - 17 - let reader = Bytesrw.Bytes.Reader.of_string content in 18 - let result = Htmlrw_check.check ~collect_parse_errors:true ~system_id:path reader in 19 - 20 - let errors = Htmlrw_check.errors result in 21 - let warnings = Htmlrw_check.warnings result in 22 - 23 - Printf.printf "=== Results ===\n"; 24 - Printf.printf "Errors: %d\n" (List.length errors); 25 - List.iter (fun msg -> 26 - Printf.printf " [ERROR] %s\n" msg.Htmlrw_check.text; 27 - (match msg.Htmlrw_check.location with 28 - | Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column 29 - | None -> ()) 30 - ) errors; 31 - 32 - Printf.printf "Warnings: %d\n" (List.length warnings); 33 - List.iter (fun msg -> 34 - Printf.printf " [WARN] %s\n" msg.Htmlrw_check.text; 35 - (match msg.Htmlrw_check.location with 36 - | Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column 37 - | None -> ()) 38 - ) warnings; 39 - 40 - Printf.printf "\n=== Formatted Output ===\n"; 41 - Printf.printf "%s\n" (Htmlrw_check.to_text result)
-15
test/dune
··· 86 86 (libraries bytesrw html5rw html5rw.check str jsont jsont.bytesrw test_report validator_messages expected_message)) 87 87 88 88 (executable 89 - (name debug_validator) 90 - (modules debug_validator) 91 - (libraries bytesrw html5rw html5rw.check)) 92 - 93 - (executable 94 - (name analyze_failures) 95 - (modules analyze_failures) 96 - (libraries bytesrw html5rw html5rw.check)) 97 - 98 - (executable 99 - (name debug_check) 100 - (modules debug_check) 101 - (libraries html5rw.check bytesrw)) 102 - 103 - (executable 104 89 (name test_roundtrip) 105 90 (modules test_roundtrip) 106 91 (libraries bytesrw html5rw html5rw.check astring test_report))
-11
test/test_nfc_debug.ml
··· 1 - let () = 2 - let content = In_channel.with_open_text "validator/tests/html-svg/struct-cond-02-t-haswarn.html" (fun ic -> 3 - In_channel.input_all ic 4 - ) in 5 - let reader = Bytesrw.Bytes.Reader.of_string content in 6 - let result = Htmlrw_check.check ~system_id:"test.html" reader in 7 - let warnings = Htmlrw_check.warnings result in 8 - Printf.printf "Total warnings: %d\n" (List.length warnings); 9 - List.iter (fun msg -> 10 - Printf.printf "WARNING: %s\n" msg.Htmlrw_check.text 11 - ) warnings