OCaml HTML5 parser/serialiser based on Python's JustHTML

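Fixes: correct odoc markup in interface documentation; title checker now reports a missing title whenever an html element was seen (full document mode)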

Changed files
+31 -24
lib
+3 -3
lib/check/datatype/dt_email.mli
··· 6 (** Valid email address validator. 7 8 Validates a single email address. Uses simplified validation rules: 9 - - Must contain exactly one '@' character 10 - - Local part (before @) must be non-empty 11 - - Domain part (after @) must be non-empty and contain at least one '.' 12 - Only ASCII characters allowed *) 13 module Email : Datatype.S 14
··· 6 (** Valid email address validator. 7 8 Validates a single email address. Uses simplified validation rules: 9 + - Must contain exactly one ['@'] character 10 + - Local part (before ['@']) must be non-empty 11 + - Domain part (after ['@']) must be non-empty and contain at least one ['.'] 12 - Only ASCII characters allowed *) 13 module Email : Datatype.S 14
+1 -1
lib/check/datatype/dt_media_query.mli
··· 8 9 (** Media query validator. 10 11 - Validates CSS media queries used in media attributes and CSS @media rules. 12 13 Examples: 14 - "screen"
··· 8 9 (** Media query validator. 10 11 + Validates CSS media queries used in media attributes and CSS [@media] rules. 12 13 Examples: 14 - "screen"
+3 -3
lib/check/element/element.mli
··· 45 46 (** A typed HTML element. 47 48 - @field tag The element's tag classification 49 - @field attrs Typed attributes parsed from raw input 50 - @field raw_attrs Original attribute name-value pairs for fallback *) 51 type t = { 52 tag : Tag.element_tag; 53 attrs : Attr.t list;
··· 45 46 (** A typed HTML element. 47 48 + - [tag]: The element's tag classification 49 + - [attrs]: Typed attributes parsed from raw input 50 + - [raw_attrs]: Original attribute name-value pairs for fallback *) 51 type t = { 52 tag : Tag.element_tag; 53 attrs : Attr.t list;
+7 -7
lib/check/htmlrw_check.mli
··· 21 22 {2 Handling Specific Errors} 23 24 - Use pattern matching on {!field-message.error_code} for fine-grained control: 25 26 {[ 27 List.iter (fun msg -> ··· 188 189 (** Human-readable text format. 190 191 - {v 192 file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt". 193 - v} *) 194 val to_text : t -> string 195 196 (** JSON format compatible with Nu HTML Validator. 197 198 - {v 199 {"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]} 200 - v} *) 201 val to_json : t -> string 202 203 (** GNU error format for IDE integration. 204 205 - {v 206 file.html:5:3: error: Element "img" is missing required attribute "alt". 207 - v} *) 208 val to_gnu : t -> string 209 210
··· 21 22 {2 Handling Specific Errors} 23 24 + Use pattern matching on [error_code] for fine-grained control: 25 26 {[ 27 List.iter (fun msg -> ··· 188 189 (** Human-readable text format. 190 191 + {v 192 file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt". 193 + v} *) 194 val to_text : t -> string 195 196 (** JSON format compatible with Nu HTML Validator. 197 198 + {v 199 {"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]} 200 + v} *) 201 val to_json : t -> string 202 203 (** GNU error format for IDE integration. 204 205 + {v 206 file.html:5:3: error: Element "img" is missing required attribute "alt". 207 + v} *) 208 val to_gnu : t -> string 209 210
+10 -4
lib/check/specialized/title_checker.ml
··· 1 (** Title element validation checker. *) 2 3 type state = { 4 mutable in_head : bool; 5 mutable head_had_children : bool; (* true if head contained any child elements *) 6 mutable has_title : bool; ··· 10 } 11 12 let create () = { 13 in_head = false; 14 head_had_children = false; 15 has_title = false; ··· 19 } 20 21 let reset state = 22 state.in_head <- false; 23 state.head_had_children <- false; 24 state.has_title <- false; ··· 28 29 let start_element state ~element _collector = 30 (match element.Element.tag with 31 - | Tag.Html `Html -> () 32 | Tag.Html `Head -> 33 state.in_head <- true; 34 state.head_had_children <- false ··· 55 (`Element (`Must_not_be_empty (`Elem "title"))); 56 state.in_title <- false 57 | Tag.Html `Head -> 58 - (* Only report missing title if head had children (was explicit with content). 59 - An empty head was likely implicit (fragment validation from body). *) 60 - if state.in_head && not state.has_title && state.head_had_children then 61 Message_collector.add_typed collector 62 (`Element (`Missing_child (`Parent "head", `Child "title"))); 63 state.in_head <- false
··· 1 (** Title element validation checker. *) 2 3 type state = { 4 + mutable seen_html : bool; (* true if we've seen html element (full document mode) *) 5 mutable in_head : bool; 6 mutable head_had_children : bool; (* true if head contained any child elements *) 7 mutable has_title : bool; ··· 11 } 12 13 let create () = { 14 + seen_html = false; 15 in_head = false; 16 head_had_children = false; 17 has_title = false; ··· 21 } 22 23 let reset state = 24 + state.seen_html <- false; 25 state.in_head <- false; 26 state.head_had_children <- false; 27 state.has_title <- false; ··· 31 32 let start_element state ~element _collector = 33 (match element.Element.tag with 34 + | Tag.Html `Html -> 35 + state.seen_html <- true 36 | Tag.Html `Head -> 37 state.in_head <- true; 38 state.head_had_children <- false ··· 59 (`Element (`Must_not_be_empty (`Elem "title"))); 60 state.in_title <- false 61 | Tag.Html `Head -> 62 + (* Report missing title if: 63 + - We saw an html element (full document mode), OR 64 + - Head had explicit children (was not just an implicit empty head) 65 + An empty head without html element was likely implicit (fragment validation). *) 66 + if state.in_head && not state.has_title && (state.seen_html || state.head_had_children) then 67 Message_collector.add_typed collector 68 (`Element (`Missing_child (`Parent "head", `Child "title"))); 69 state.in_head <- false
+4 -4
lib/html5rw/parser/parser.mli
··· 359 (** Result of parsing an HTML document or fragment. 360 361 This opaque type contains: 362 - - The DOM tree (access via {!root}) 363 - - Parse errors if collection was enabled (access via {!errors}) 364 - - Detected encoding for byte input (access via {!encoding}) 365 *) 366 type t 367 ··· 416 3. {b Transport hint}: Use [transport_encoding] if provided 417 4. {b Fallback}: Use UTF-8 418 419 - The detected encoding is stored in the result (access via {!encoding}). 420 421 {b Prescan details:} 422
··· 359 (** Result of parsing an HTML document or fragment. 360 361 This opaque type contains: 362 + - The DOM tree (access via {!val:root}) 363 + - Parse errors if collection was enabled (access via {!val:errors}) 364 + - Detected encoding for byte input (access via {!val:encoding}) 365 *) 366 type t 367 ··· 416 3. {b Transport hint}: Use [transport_encoding] if provided 417 4. {b Fallback}: Use UTF-8 418 419 + The detected encoding is stored in the result (access via {!val:encoding}). 420 421 {b Prescan details:} 422
+3 -2
lib/js/htmlrw_js.mli
··· 47 (** Validate an HTML string. 48 49 This is the simplest form of validation. Since there's no source element, 50 - the returned {!browser_message}s will not have element references. 51 52 {[ 53 let result = validate_string "<html><body><img></body></html>" in ··· 83 descendants are annotated with data attributes, classes, and optionally 84 tooltips based on the validation results. 85 86 - @param config Annotation configuration. Defaults to {!default_annotation_config}. *) 87 val validate_and_annotate : 88 ?config:annotation_config -> Brr.El.t -> result 89
··· 47 (** Validate an HTML string. 48 49 This is the simplest form of validation. Since there's no source element, 50 + the returned messages will not have element references. 51 52 {[ 53 let result = validate_string "<html><body><img></body></html>" in ··· 83 descendants are annotated with data attributes, classes, and optionally 84 tooltips based on the validation results. 85 86 + @param config Annotation configuration. Defaults to 87 + [Htmlrw_js_types.default_annotation_config]. *) 88 val validate_and_annotate : 89 ?config:annotation_config -> Brr.El.t -> result 90