OCaml HTML5 parser/serialiser based on Python's JustHTML

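Fix doc-comment markup and cross-references in the .mli interfaces; report a missing title whenever an html element was seen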

Changed files
+31 -24
lib
+3 -3
lib/check/datatype/dt_email.mli
··· 6 6 (** Valid email address validator. 7 7 8 8 Validates a single email address. Uses simplified validation rules: 9 - - Must contain exactly one '@' character 10 - - Local part (before @) must be non-empty 11 - - Domain part (after @) must be non-empty and contain at least one '.' 9 + - Must contain exactly one ['@'] character 10 + - Local part (before ['@']) must be non-empty 11 + - Domain part (after ['@']) must be non-empty and contain at least one ['.'] 12 12 - Only ASCII characters allowed *) 13 13 module Email : Datatype.S 14 14
+1 -1
lib/check/datatype/dt_media_query.mli
··· 8 8 9 9 (** Media query validator. 10 10 11 - Validates CSS media queries used in media attributes and CSS @media rules. 11 + Validates CSS media queries used in media attributes and CSS [@media] rules. 12 12 13 13 Examples: 14 14 - "screen"
+3 -3
lib/check/element/element.mli
··· 45 45 46 46 (** A typed HTML element. 47 47 48 - @field tag The element's tag classification 49 - @field attrs Typed attributes parsed from raw input 50 - @field raw_attrs Original attribute name-value pairs for fallback *) 48 + - [tag]: The element's tag classification 49 + - [attrs]: Typed attributes parsed from raw input 50 + - [raw_attrs]: Original attribute name-value pairs for fallback *) 51 51 type t = { 52 52 tag : Tag.element_tag; 53 53 attrs : Attr.t list;
+7 -7
lib/check/htmlrw_check.mli
··· 21 21 22 22 {2 Handling Specific Errors} 23 23 24 - Use pattern matching on {!field-message.error_code} for fine-grained control: 24 + Use pattern matching on [error_code] for fine-grained control: 25 25 26 26 {[ 27 27 List.iter (fun msg -> ··· 188 188 189 189 (** Human-readable text format. 190 190 191 - {v 191 + {v 192 192 file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt". 193 - v} *) 193 + v} *) 194 194 val to_text : t -> string 195 195 196 196 (** JSON format compatible with Nu HTML Validator. 197 197 198 - {v 198 + {v 199 199 {"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]} 200 - v} *) 200 + v} *) 201 201 val to_json : t -> string 202 202 203 203 (** GNU error format for IDE integration. 204 204 205 - {v 205 + {v 206 206 file.html:5:3: error: Element "img" is missing required attribute "alt". 207 - v} *) 207 + v} *) 208 208 val to_gnu : t -> string 209 209 210 210
+10 -4
lib/check/specialized/title_checker.ml
··· 1 1 (** Title element validation checker. *) 2 2 3 3 type state = { 4 + mutable seen_html : bool; (* true if we've seen html element (full document mode) *) 4 5 mutable in_head : bool; 5 6 mutable head_had_children : bool; (* true if head contained any child elements *) 6 7 mutable has_title : bool; ··· 10 11 } 11 12 12 13 let create () = { 14 + seen_html = false; 13 15 in_head = false; 14 16 head_had_children = false; 15 17 has_title = false; ··· 19 21 } 20 22 21 23 let reset state = 24 + state.seen_html <- false; 22 25 state.in_head <- false; 23 26 state.head_had_children <- false; 24 27 state.has_title <- false; ··· 28 31 29 32 let start_element state ~element _collector = 30 33 (match element.Element.tag with 31 - | Tag.Html `Html -> () 34 + | Tag.Html `Html -> 35 + state.seen_html <- true 32 36 | Tag.Html `Head -> 33 37 state.in_head <- true; 34 38 state.head_had_children <- false ··· 55 59 (`Element (`Must_not_be_empty (`Elem "title"))); 56 60 state.in_title <- false 57 61 | Tag.Html `Head -> 58 - (* Only report missing title if head had children (was explicit with content). 59 - An empty head was likely implicit (fragment validation from body). *) 60 - if state.in_head && not state.has_title && state.head_had_children then 62 + (* Report missing title if: 63 + - We saw an html element (full document mode), OR 64 + - Head had explicit children (was not just an implicit empty head) 65 + An empty head without html element was likely implicit (fragment validation). *) 66 + if state.in_head && not state.has_title && (state.seen_html || state.head_had_children) then 61 67 Message_collector.add_typed collector 62 68 (`Element (`Missing_child (`Parent "head", `Child "title"))); 63 69 state.in_head <- false
+4 -4
lib/html5rw/parser/parser.mli
··· 359 359 (** Result of parsing an HTML document or fragment. 360 360 361 361 This opaque type contains: 362 - - The DOM tree (access via {!root}) 363 - - Parse errors if collection was enabled (access via {!errors}) 364 - - Detected encoding for byte input (access via {!encoding}) 362 + - The DOM tree (access via {!val:root}) 363 + - Parse errors if collection was enabled (access via {!val:errors}) 364 + - Detected encoding for byte input (access via {!val:encoding}) 365 365 *) 366 366 type t 367 367 ··· 416 416 3. {b Transport hint}: Use [transport_encoding] if provided 417 417 4. {b Fallback}: Use UTF-8 418 418 419 - The detected encoding is stored in the result (access via {!encoding}). 419 + The detected encoding is stored in the result (access via {!val:encoding}). 420 420 421 421 {b Prescan details:} 422 422
+3 -2
lib/js/htmlrw_js.mli
··· 47 47 (** Validate an HTML string. 48 48 49 49 This is the simplest form of validation. Since there's no source element, 50 - the returned {!browser_message}s will not have element references. 50 + the returned messages will not have element references. 51 51 52 52 {[ 53 53 let result = validate_string "<html><body><img></body></html>" in ··· 83 83 descendants are annotated with data attributes, classes, and optionally 84 84 tooltips based on the validation results. 85 85 86 - @param config Annotation configuration. Defaults to {!default_annotation_config}. *) 86 + @param config Annotation configuration. Defaults to 87 + [Htmlrw_js_types.default_annotation_config]. *) 87 88 val validate_and_annotate : 88 89 ?config:annotation_config -> Brr.El.t -> result 89 90