+41
-15
lib/html5_checker/semantic/lang_detecting_checker.ml
+41
-15
lib/html5_checker/semantic/lang_detecting_checker.ml
···
9
mutable html_locator : (int * int) option; (* line, column *)
10
mutable in_body : bool;
11
mutable skip_depth : int; (* depth in elements to skip *)
12
mutable text_buffer : Buffer.t;
13
mutable char_count : int;
14
}
···
32
html_locator = None;
33
in_body = false;
34
skip_depth = 0;
35
text_buffer = Buffer.create 4096;
36
char_count = 0;
37
}
···
42
state.html_locator <- None;
43
state.in_body <- false;
44
state.skip_depth <- 0;
45
Buffer.clear state.text_buffer;
46
state.char_count <- 0
47
48
let get_attr name attrs =
49
List.find_map (fun (n, v) ->
50
if String.lowercase_ascii n = name then Some v else None
···
126
| "zh-tw" -> "zh-hant"
127
| _ -> code
128
129
-
let start_element state ~name ~namespace:_ ~attrs _collector =
130
let name_lower = String.lowercase_ascii name in
131
132
if name_lower = "html" then begin
133
state.html_lang <- get_attr "lang" attrs;
···
138
else if name_lower = "body" then
139
state.in_body <- true
140
else if state.in_body then begin
141
(* Check if we should skip this element's text *)
142
-
if List.mem name_lower skip_elements then
143
state.skip_depth <- state.skip_depth + 1
144
else begin
145
(* Check for different lang attribute *)
···
154
let name_lower = String.lowercase_ascii name in
155
if name_lower = "body" then
156
state.in_body <- false
157
-
else if state.in_body && state.skip_depth > 0 then begin
158
-
if List.mem name_lower skip_elements then
159
-
state.skip_depth <- state.skip_depth - 1
160
-
else begin
161
-
(* TODO: properly track nested elements with different lang *)
162
-
state.skip_depth <- max 0 (state.skip_depth - 1)
163
end
164
end
165
166
let characters state text _collector =
167
-
if state.in_body && state.skip_depth = 0 && state.char_count < max_chars then begin
168
(* Count Unicode code points, not bytes *)
169
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
170
let rec process () =
···
197
match Langdetect.detect_with_prob detector text with
198
| None -> ()
199
| Some (detected_lang, prob) when prob > 0.90 ->
200
-
let declared_lang = match state.html_lang with
201
-
| Some l -> get_lang_code l
202
| None -> ""
203
in
204
let detected_code = detected_lang in (* Keep full code like zh-tw *)
205
let detected_name = get_language_name detected_lang in
206
let suggested_code = get_bcp47_code detected_lang in
207
208
-
(* Check for language mismatch *)
209
-
let base_declared = get_lang_code declared_lang in
210
let base_detected = get_lang_code detected_code in
211
-
if declared_lang = "" then begin
212
(* No lang attribute - suggest adding one *)
213
Message_collector.add_warning collector
214
~message:(Printf.sprintf
···
224
Message_collector.add_warning collector
225
~message:(Printf.sprintf
226
"This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead."
227
-
detected_name declared_lang suggested_code)
228
~code:"wrong-lang"
229
~element:"html"
230
()
···
9
mutable html_locator : (int * int) option; (* line, column *)
10
mutable in_body : bool;
11
mutable skip_depth : int; (* depth in elements to skip *)
12
+
mutable foreign_depth : int; (* depth in SVG/MathML content to skip *)
13
mutable text_buffer : Buffer.t;
14
mutable char_count : int;
15
}
···
33
html_locator = None;
34
in_body = false;
35
skip_depth = 0;
36
+
foreign_depth = 0;
37
text_buffer = Buffer.create 4096;
38
char_count = 0;
39
}
···
44
state.html_locator <- None;
45
state.in_body <- false;
46
state.skip_depth <- 0;
47
+
state.foreign_depth <- 0;
48
Buffer.clear state.text_buffer;
49
state.char_count <- 0
50
51
+
(* Namespaces to skip for language detection *)
52
+
let svg_namespace = "http://www.w3.org/2000/svg"
53
+
let mathml_namespace = "http://www.w3.org/1998/Math/MathML"
54
+
55
+
let is_foreign_namespace ns =
56
+
ns = svg_namespace || ns = mathml_namespace
57
+
58
+
(* Element names that start foreign content (for when namespace isn't set) *)
59
+
let is_foreign_element name =
60
+
let n = String.lowercase_ascii name in
61
+
n = "svg" || n = "math"
62
+
63
let get_attr name attrs =
64
List.find_map (fun (n, v) ->
65
if String.lowercase_ascii n = name then Some v else None
···
141
| "zh-tw" -> "zh-hant"
142
| _ -> code
143
144
+
let start_element state ~name ~namespace ~attrs _collector =
145
let name_lower = String.lowercase_ascii name in
146
+
let ns = Option.value namespace ~default:"" in
147
148
if name_lower = "html" then begin
149
state.html_lang <- get_attr "lang" attrs;
···
154
else if name_lower = "body" then
155
state.in_body <- true
156
else if state.in_body then begin
157
+
(* Track foreign namespace depth (SVG/MathML) *)
158
+
if is_foreign_namespace ns || is_foreign_element name then
159
+
state.foreign_depth <- state.foreign_depth + 1
160
+
else if state.foreign_depth > 0 then
161
+
state.foreign_depth <- state.foreign_depth + 1
162
(* Check if we should skip this element's text *)
163
+
else if List.mem name_lower skip_elements then
164
state.skip_depth <- state.skip_depth + 1
165
else begin
166
(* Check for different lang attribute *)
···
175
let name_lower = String.lowercase_ascii name in
176
if name_lower = "body" then
177
state.in_body <- false
178
+
else if state.in_body then begin
179
+
(* Track foreign namespace depth *)
180
+
if state.foreign_depth > 0 then
181
+
state.foreign_depth <- state.foreign_depth - 1
182
+
else if state.skip_depth > 0 then begin
183
+
if List.mem name_lower skip_elements then
184
+
state.skip_depth <- state.skip_depth - 1
185
+
else
186
+
(* TODO: properly track nested elements with different lang *)
187
+
state.skip_depth <- max 0 (state.skip_depth - 1)
188
end
189
end
190
191
let characters state text _collector =
192
+
if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
193
(* Count Unicode code points, not bytes *)
194
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
195
let rec process () =
···
222
match Langdetect.detect_with_prob detector text with
223
| None -> ()
224
| Some (detected_lang, prob) when prob > 0.90 ->
225
+
(* Get the original declared lang value (preserve exactly as written) *)
226
+
let original_declared = match state.html_lang with
227
+
| Some l -> l
228
| None -> ""
229
in
230
let detected_code = detected_lang in (* Keep full code like zh-tw *)
231
let detected_name = get_language_name detected_lang in
232
let suggested_code = get_bcp47_code detected_lang in
233
234
+
(* Check for language mismatch using base codes *)
235
+
let base_declared = get_lang_code original_declared in
236
let base_detected = get_lang_code detected_code in
237
+
if original_declared = "" then begin
238
(* No lang attribute - suggest adding one *)
239
Message_collector.add_warning collector
240
~message:(Printf.sprintf
···
250
Message_collector.add_warning collector
251
~message:(Printf.sprintf
252
"This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead."
253
+
detected_name original_declared suggested_code)
254
~code:"wrong-lang"
255
~element:"html"
256
()
+22
-13
lib/html5_checker/specialized/h1_checker.ml
+22
-13
lib/html5_checker/specialized/h1_checker.ml
···
2
3
type state = {
4
mutable h1_count : int;
5
}
6
7
let create () = {
8
h1_count = 0;
9
}
10
11
let reset state =
12
-
state.h1_count <- 0
13
14
let start_element state ~name ~namespace ~attrs collector =
15
ignore attrs;
16
-
if namespace <> None then ()
17
-
else begin
18
-
let name_lower = String.lowercase_ascii name in
19
-
if name_lower = "h1" then begin
20
-
state.h1_count <- state.h1_count + 1;
21
-
if state.h1_count > 1 then
22
-
Message_collector.add_info collector
23
-
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
24
-
~code:"multiple-h1"
25
-
~element:name ()
26
-
end
27
end
28
29
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
30
let characters _state _text _collector = ()
31
let end_document _state _collector = ()
32
···
2
3
type state = {
4
mutable h1_count : int;
5
+
mutable svg_depth : int; (* Track depth inside SVG *)
6
}
7
8
let create () = {
9
h1_count = 0;
10
+
svg_depth = 0;
11
}
12
13
let reset state =
14
+
state.h1_count <- 0;
15
+
state.svg_depth <- 0
16
17
let start_element state ~name ~namespace ~attrs collector =
18
ignore attrs;
19
+
let name_lower = String.lowercase_ascii name in
20
+
(* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *)
21
+
if name_lower = "svg" then
22
+
state.svg_depth <- state.svg_depth + 1
23
+
else if namespace <> None || state.svg_depth > 0 then
24
+
() (* Skip non-HTML namespace or inside SVG *)
25
+
else if name_lower = "h1" then begin
26
+
state.h1_count <- state.h1_count + 1;
27
+
if state.h1_count > 1 then
28
+
Message_collector.add_info collector
29
+
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
30
+
~code:"multiple-h1"
31
+
~element:name ()
32
end
33
34
+
let end_element state ~name ~namespace:_ _collector =
35
+
let name_lower = String.lowercase_ascii name in
36
+
if name_lower = "svg" && state.svg_depth > 0 then
37
+
state.svg_depth <- state.svg_depth - 1
38
+
39
let characters _state _text _collector = ()
40
let end_document _state _collector = ()
41