OCaml HTML5 parser/serialiser based on Python's JustHTML

skip implicit heads in the browser

Changed files
+28 -6
lib
check
specialized
js
+12 -2
lib/check/specialized/title_checker.ml
···
 2  2
 3  3    type state = {
 4  4      mutable in_head : bool;
    5 +    mutable head_had_children : bool; (* true if head contained any child elements *)
 5  6      mutable has_title : bool;
 6  7      mutable in_title : bool;
 7  8      mutable title_has_content : bool;
···
10 11
11 12    let create () = {
12 13      in_head = false;
   14 +    head_had_children = false;
13 15      has_title = false;
14 16      in_title = false;
15 17      title_has_content = false;
···
18 20
19 21    let reset state =
20 22      state.in_head <- false;
   23 +    state.head_had_children <- false;
21 24      state.has_title <- false;
22 25      state.in_title <- false;
23 26      state.title_has_content <- false;
···
27 30      (match element.Element.tag with
28 31       | Tag.Html `Html -> ()
29 32       | Tag.Html `Head ->
30    -        state.in_head <- true
   33 +        state.in_head <- true;
   34 +        state.head_had_children <- false
31 35       | Tag.Html `Title when state.in_head ->
   36 +        state.head_had_children <- true;
32 37          state.has_title <- true;
33 38          state.in_title <- true;
34 39          state.title_has_content <- false;
35 40          state.title_depth <- 0
   41 +     | _ when state.in_head ->
   42 +        (* Any element inside head means head had children *)
   43 +        state.head_had_children <- true
36 44       | _ -> ());
37 45      if state.in_title then
38 46        state.title_depth <- state.title_depth + 1
···
47 55            (`Element (`Must_not_be_empty (`Elem "title")));
48 56        state.in_title <- false
49 57      | Tag.Html `Head ->
50    -      if state.in_head && not state.has_title then
   58 +      (* Only report missing title if head had children (was explicit with content).
   59 +         An empty head was likely implicit (fragment validation from body). *)
   60 +      if state.in_head && not state.has_title && state.head_had_children then
51 61          Message_collector.add_typed collector
52 62            (`Element (`Missing_child (`Parent "head", `Child "title")));
53 63        state.in_head <- false
+16 -4
lib/js/htmlrw_js_dom.ml
···
 81  81
 82  82      (* Build the location map by matching elements *)
 83  83      let loc_to_el =
     84 +      (* Find the starting point in parsed elements that matches the root tag *)
     85 +      let root_tag = String.lowercase_ascii (Jstr.to_string (El.tag_name root)) in
     86 +      let rec find_start = function
     87 +        | [] -> []
     88 +        | h_el :: rest ->
     89 +          if String.lowercase_ascii h_el.Html5rw.Dom.name = root_tag then
     90 +            h_el :: rest
     91 +          else
     92 +            find_start rest
     93 +      in
     94 +      let html5rw_elements_aligned = find_start html5rw_elements in
     95 +
 84  96        let rec match_elements loc_map browser_els html5rw_els =
 85  97          match browser_els, html5rw_els with
 86  98          | [], _ | _, [] -> loc_map
···
 96 108              in
 97 109              match_elements loc_map b_rest h_rest
 98 110            else
 99     -            (* Tags don't match - try to resync by skipping one side *)
100     -            (* This handles cases where browser might have implicit elements *)
101     -            match_elements loc_map b_rest html5rw_els
    111 +            (* Tags don't match - try skipping the parsed element first *)
    112 +            (* This handles cases where parser creates implicit elements *)
    113 +            match_elements loc_map browser_els h_rest
102 114        in
103     -      match_elements LocMap.empty browser_elements html5rw_elements
    115 +      match_elements LocMap.empty browser_elements html5rw_elements_aligned
104 116      in
105 117
106 118      { root; html_source = html; loc_to_el }, html