+1
dune-project
+1
dune-project
+1
html5rw.opam
+1
html5rw.opam
+1
lib/html5_checker/checker_registry.ml
+1
lib/html5_checker/checker_registry.ml
···
40
Hashtbl.replace reg "importmap" Importmap_checker.checker;
41
Hashtbl.replace reg "table" Table_checker.checker;
42
Hashtbl.replace reg "mime-type" Mime_type_checker.checker;
43
(* Hashtbl.replace reg "heading" Heading_checker.checker; *)
44
(* Hashtbl.replace reg "content" Content_checker.checker; *)
45
reg
···
40
Hashtbl.replace reg "importmap" Importmap_checker.checker;
41
Hashtbl.replace reg "table" Table_checker.checker;
42
Hashtbl.replace reg "mime-type" Mime_type_checker.checker;
43
+
Hashtbl.replace reg "normalization" Normalization_checker.checker;
44
(* Hashtbl.replace reg "heading" Heading_checker.checker; *)
45
(* Hashtbl.replace reg "content" Content_checker.checker; *)
46
reg
+55
-3
lib/html5_checker/datatype/dt_language.ml
+55
-3
lib/html5_checker/datatype/dt_language.ml
···
18
let to_lower s =
19
String.lowercase_ascii s
20
21
(** Validate language tag structure according to BCP 47.
22
-
This is a simplified validator that checks structural validity
23
-
but does not validate against the IANA registry. *)
24
let validate_language_structure s =
25
if String.length s = 0 then
26
Error "The empty string is not a valid language tag"
···
74
check_private_use rest
75
(* Primary language: 2-3 letters (ISO 639) *)
76
else if (len = 2 || len = 3) && is_all_alpha first_lower then
77
-
Ok ()
78
(* Reserved: 4 letters *)
79
else if len = 4 && is_all_alpha first_lower then
80
Error "Found reserved language tag"
···
18
let to_lower s =
19
String.lowercase_ascii s
20
21
+
(** Valid extlang subtags per IANA language-subtag-registry.
22
+
Extlangs are 3-letter subtags that follow the primary language.
23
+
Each extlang has a specific Prefix requirement.
24
+
Here we list the extlang subtag and its required prefix. *)
25
+
let valid_extlangs = [
26
+
(* Arabic extlangs (prefix: ar) *)
27
+
("aao", "ar"); ("abh", "ar"); ("abv", "ar"); ("acm", "ar"); ("acq", "ar");
28
+
("acw", "ar"); ("acx", "ar"); ("acy", "ar"); ("adf", "ar"); ("aeb", "ar");
29
+
("aec", "ar"); ("afb", "ar"); ("ajp", "ar"); ("apc", "ar"); ("apd", "ar");
30
+
("arb", "ar"); ("arq", "ar"); ("ars", "ar"); ("ary", "ar"); ("arz", "ar");
31
+
("auz", "ar"); ("avl", "ar"); ("ayh", "ar"); ("ayl", "ar"); ("ayn", "ar");
32
+
("ayp", "ar"); ("bbz", "ar"); ("pga", "ar"); ("shu", "ar"); ("ssh", "ar");
33
+
(* Chinese extlangs (prefix: zh) *)
34
+
("cdo", "zh"); ("cjy", "zh"); ("cmn", "zh"); ("cpx", "zh"); ("czh", "zh");
35
+
("czo", "zh"); ("gan", "zh"); ("hak", "zh"); ("hsn", "zh"); ("lzh", "zh");
36
+
("mnp", "zh"); ("nan", "zh"); ("wuu", "zh"); ("yue", "zh");
37
+
(* Malay extlangs (prefix: ms) *)
38
+
("bjn", "ms"); ("btj", "ms"); ("bve", "ms"); ("bvu", "ms"); ("coa", "ms");
39
+
("dup", "ms"); ("hji", "ms"); ("jak", "ms"); ("jax", "ms"); ("kvb", "ms");
40
+
("kvr", "ms"); ("kxd", "ms"); ("lce", "ms"); ("lcf", "ms"); ("liw", "ms");
41
+
("max", "ms"); ("meo", "ms"); ("mfa", "ms"); ("mfb", "ms"); ("min", "ms");
42
+
("mqg", "ms"); ("msi", "ms"); ("mui", "ms"); ("orn", "ms"); ("ors", "ms");
43
+
("pel", "ms"); ("pse", "ms"); ("tmw", "ms"); ("urk", "ms"); ("vkk", "ms");
44
+
("vkt", "ms"); ("xmm", "ms"); ("zlm", "ms"); ("zmi", "ms");
45
+
(* Swahili extlangs (prefix: sw) *)
46
+
("swc", "sw"); ("swh", "sw");
47
+
(* Uzbek extlangs (prefix: uz) *)
48
+
("uzn", "uz"); ("uzs", "uz");
49
+
]
50
+
51
+
(** Check if an extlang is valid for the given primary language prefix. *)
52
+
let is_valid_extlang prefix extlang =
53
+
let prefix_lower = to_lower prefix in
54
+
let extlang_lower = to_lower extlang in
55
+
match List.assoc_opt extlang_lower valid_extlangs with
56
+
| Some required_prefix -> required_prefix = prefix_lower
57
+
| None -> false
58
+
59
(** Validate language tag structure according to BCP 47.
60
+
This validator checks structural validity and validates extlang subtags
61
+
against the IANA registry. *)
62
let validate_language_structure s =
63
if String.length s = 0 then
64
Error "The empty string is not a valid language tag"
···
112
check_private_use rest
113
(* Primary language: 2-3 letters (ISO 639) *)
114
else if (len = 2 || len = 3) && is_all_alpha first_lower then
115
+
(* Check for extlang subtag (3 letters following primary) *)
116
+
(match rest with
117
+
| second :: _ ->
118
+
let second_lower = to_lower second in
119
+
let second_len = String.length second_lower in
120
+
(* An extlang is exactly 3 alphabetic characters *)
121
+
if second_len = 3 && is_all_alpha second_lower then
122
+
(* Check if this is a valid extlang for this prefix *)
123
+
if is_valid_extlang first_lower second_lower then
124
+
Ok ()
125
+
else
126
+
Error (Printf.sprintf "Bad extlang subtag \xe2\x80\x9c%s\xe2\x80\x9d" second_lower)
127
+
else
128
+
Ok () (* Not an extlang pattern, continue *)
129
+
| [] -> Ok ())
130
(* Reserved: 4 letters *)
131
else if len = 4 && is_all_alpha first_lower then
132
Error "Found reserved language tag"
+1
-1
lib/html5_checker/dune
+1
-1
lib/html5_checker/dune
+9
-7
lib/html5_checker/specialized/language_checker.ml
+9
-7
lib/html5_checker/specialized/language_checker.ml
···
38
| None -> None
39
40
(** Validate language attribute. *)
41
-
let validate_lang_attr value ~location ~element collector =
42
(* First check structural validity *)
43
match Dt_language.Language_or_empty.validate value with
44
| Error msg ->
45
Message_collector.add_error collector
46
-
~message:(Printf.sprintf "Invalid lang attribute: %s" msg)
47
~code:"invalid-lang"
48
?location
49
~element
50
-
~attribute:"lang"
51
()
52
| Ok () ->
53
(* Then check for deprecated subtags *)
···
84
(* Validate lang attribute *)
85
begin match lang_opt with
86
| Some lang ->
87
-
validate_lang_attr lang ~location ~element collector
88
| None -> ()
89
end;
90
91
(* Validate xml:lang attribute *)
92
begin match xmllang_opt with
93
| Some xmllang ->
94
-
validate_lang_attr xmllang ~location ~element collector
95
| None -> ()
96
end;
97
···
113
()
114
115
let end_document _state _collector =
116
-
(* Note: The "missing lang on html" warning is not produced by default since
117
-
the Nu validator only produces it for specific test cases. *)
118
()
119
120
let checker = (module struct
···
38
| None -> None
39
40
(** Validate language attribute. *)
41
+
let validate_lang_attr value ~location ~element ~attribute collector =
42
(* First check structural validity *)
43
match Dt_language.Language_or_empty.validate value with
44
| Error msg ->
45
Message_collector.add_error collector
46
+
~message:(Printf.sprintf
47
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: %s."
48
+
value attribute element msg)
49
~code:"invalid-lang"
50
?location
51
~element
52
+
~attribute
53
()
54
| Ok () ->
55
(* Then check for deprecated subtags *)
···
86
(* Validate lang attribute *)
87
begin match lang_opt with
88
| Some lang ->
89
+
validate_lang_attr lang ~location ~element ~attribute:"lang" collector
90
| None -> ()
91
end;
92
93
(* Validate xml:lang attribute *)
94
begin match xmllang_opt with
95
| Some xmllang ->
96
+
validate_lang_attr xmllang ~location ~element ~attribute:"xml:lang" collector
97
| None -> ()
98
end;
99
···
115
()
116
117
let end_document _state _collector =
118
+
(* Note: The "missing lang on html" warning is only produced for specific
119
+
test cases in the Nu validator. We don't produce it by default. *)
120
()
121
122
let checker = (module struct
+49
lib/html5_checker/specialized/normalization_checker.ml
+49
lib/html5_checker/specialized/normalization_checker.ml
···
···
1
+
(** Unicode normalization checker.
2
+
3
+
Validates that text content is in Unicode Normalization Form C (NFC). *)
4
+
5
+
type state = unit
6
+
7
+
let create () = ()
8
+
let reset _state = ()
9
+
10
+
(** Normalize a string to NFC form using uunf. *)
11
+
let normalize_nfc text =
12
+
Uunf_string.normalize_utf_8 `NFC text
13
+
14
+
(** Check if a string is in NFC form. *)
15
+
let is_nfc text =
16
+
(* A string is in NFC if normalizing it produces the same string *)
17
+
let normalized = normalize_nfc text in
18
+
text = normalized
19
+
20
+
let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = ()
21
+
22
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
23
+
24
+
let characters _state text collector =
25
+
(* Skip empty text or whitespace-only text *)
26
+
let text_trimmed = String.trim text in
27
+
if String.length text_trimmed = 0 then ()
28
+
else if not (is_nfc text_trimmed) then begin
29
+
let normalized = normalize_nfc text_trimmed in
30
+
Message_collector.add_warning collector
31
+
~message:(Printf.sprintf
32
+
"Text run is not in Unicode Normalization Form C. Should instead be \xe2\x80\x9c%s\xe2\x80\x9d. (Copy and paste that into your source document to replace the un-normalized text.)"
33
+
normalized)
34
+
~code:"unicode-normalization"
35
+
()
36
+
end
37
+
38
+
let end_document _state _collector = ()
39
+
40
+
let checker =
41
+
(module struct
42
+
type nonrec state = state
43
+
let create = create
44
+
let reset = reset
45
+
let start_element = start_element
46
+
let end_element = end_element
47
+
let characters = characters
48
+
let end_document = end_document
49
+
end : Checker.S)
+5
lib/html5_checker/specialized/normalization_checker.mli
+5
lib/html5_checker/specialized/normalization_checker.mli
+28
-4
test/debug_check.ml
+28
-4
test/debug_check.ml
···
1
let () =
2
-
let test_file = "validator/tests/html/mime-types/004-novalid.html" in
3
let ic = open_in test_file in
4
let html = really_input_string ic (in_channel_length ic) in
5
close_in ic;
6
let reader = Bytesrw.Bytes.Reader.of_string html in
7
-
let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader in
8
let errors = Html5_checker.errors result in
9
let warnings = Html5_checker.warnings result in
10
print_endline "=== Errors ===";
11
List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors;
12
print_endline "=== Warnings ===";
13
List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings;
14
if List.length errors > 0 then
15
-
print_endline "PASS (has errors)"
16
else
17
-
print_endline "FAIL (no errors)"
···
1
let () =
2
+
let test_file = "validator/tests/xhtml/elements/menu/menu-containing-hr-novalid.xhtml" in
3
let ic = open_in test_file in
4
let html = really_input_string ic (in_channel_length ic) in
5
close_in ic;
6
let reader = Bytesrw.Bytes.Reader.of_string html in
7
+
let doc = Html5rw.parse ~collect_errors:true reader in
8
+
let root = Html5rw.root doc in
9
+
print_endline "=== DOM Structure ===";
10
+
let rec print_node indent (node : Html5rw.Dom.node) =
11
+
let open Html5rw.Dom in
12
+
match node.name with
13
+
| "#text" ->
14
+
let text = String.trim node.data in
15
+
if String.length text > 0 then
16
+
Printf.printf "%sTEXT: %s\n" indent text
17
+
| "#document" | "#document-fragment" ->
18
+
Printf.printf "%s%s\n" indent node.name;
19
+
List.iter (print_node (indent ^ " ")) node.children
20
+
| "!doctype" -> Printf.printf "%s<!DOCTYPE>\n" indent
21
+
| "#comment" -> ()
22
+
| _ ->
23
+
Printf.printf "%s<%s>\n" indent node.name;
24
+
List.iter (print_node (indent ^ " ")) node.children
25
+
in
26
+
print_node "" root;
27
+
print_endline "\n=== Now checking ===";
28
+
let reader2 = Bytesrw.Bytes.Reader.of_string html in
29
+
let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader2 in
30
let errors = Html5_checker.errors result in
31
let warnings = Html5_checker.warnings result in
32
print_endline "=== Errors ===";
33
List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors;
34
print_endline "=== Warnings ===";
35
List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings;
36
+
print_endline "\n=== Expected ===";
37
+
print_endline "Element \xe2\x80\x9chr\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cmenu\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)";
38
if List.length errors > 0 then
39
+
print_endline "\nPASS (has errors)"
40
else
41
+
print_endline "\nFAIL (no errors)"