+41
-15
lib/html5_checker/semantic/lang_detecting_checker.ml
+41
-15
lib/html5_checker/semantic/lang_detecting_checker.ml
···
9
9
mutable html_locator : (int * int) option; (* line, column *)
10
10
mutable in_body : bool;
11
11
mutable skip_depth : int; (* depth in elements to skip *)
12
+
mutable foreign_depth : int; (* depth in SVG/MathML content to skip *)
12
13
mutable text_buffer : Buffer.t;
13
14
mutable char_count : int;
14
15
}
···
32
33
html_locator = None;
33
34
in_body = false;
34
35
skip_depth = 0;
36
+
foreign_depth = 0;
35
37
text_buffer = Buffer.create 4096;
36
38
char_count = 0;
37
39
}
···
42
44
state.html_locator <- None;
43
45
state.in_body <- false;
44
46
state.skip_depth <- 0;
47
+
state.foreign_depth <- 0;
45
48
Buffer.clear state.text_buffer;
46
49
state.char_count <- 0
47
50
51
+
(* Namespaces to skip for language detection *)
52
+
let svg_namespace = "http://www.w3.org/2000/svg"
53
+
let mathml_namespace = "http://www.w3.org/1998/Math/MathML"
54
+
55
+
let is_foreign_namespace ns =
56
+
ns = svg_namespace || ns = mathml_namespace
57
+
58
+
(* Element names that start foreign content (for when namespace isn't set) *)
59
+
let is_foreign_element name =
60
+
let n = String.lowercase_ascii name in
61
+
n = "svg" || n = "math"
62
+
48
63
let get_attr name attrs =
49
64
List.find_map (fun (n, v) ->
50
65
if String.lowercase_ascii n = name then Some v else None
···
126
141
| "zh-tw" -> "zh-hant"
127
142
| _ -> code
128
143
129
-
let start_element state ~name ~namespace:_ ~attrs _collector =
144
+
let start_element state ~name ~namespace ~attrs _collector =
130
145
let name_lower = String.lowercase_ascii name in
146
+
let ns = Option.value namespace ~default:"" in
131
147
132
148
if name_lower = "html" then begin
133
149
state.html_lang <- get_attr "lang" attrs;
···
138
154
else if name_lower = "body" then
139
155
state.in_body <- true
140
156
else if state.in_body then begin
157
+
(* Track foreign namespace depth (SVG/MathML) *)
158
+
if is_foreign_namespace ns || is_foreign_element name then
159
+
state.foreign_depth <- state.foreign_depth + 1
160
+
else if state.foreign_depth > 0 then
161
+
state.foreign_depth <- state.foreign_depth + 1
141
162
(* Check if we should skip this element's text *)
142
-
if List.mem name_lower skip_elements then
163
+
else if List.mem name_lower skip_elements then
143
164
state.skip_depth <- state.skip_depth + 1
144
165
else begin
145
166
(* Check for different lang attribute *)
···
154
175
let name_lower = String.lowercase_ascii name in
155
176
if name_lower = "body" then
156
177
state.in_body <- false
157
-
else if state.in_body && state.skip_depth > 0 then begin
158
-
if List.mem name_lower skip_elements then
159
-
state.skip_depth <- state.skip_depth - 1
160
-
else begin
161
-
(* TODO: properly track nested elements with different lang *)
162
-
state.skip_depth <- max 0 (state.skip_depth - 1)
178
+
else if state.in_body then begin
179
+
(* Track foreign namespace depth *)
180
+
if state.foreign_depth > 0 then
181
+
state.foreign_depth <- state.foreign_depth - 1
182
+
else if state.skip_depth > 0 then begin
183
+
if List.mem name_lower skip_elements then
184
+
state.skip_depth <- state.skip_depth - 1
185
+
else
186
+
(* TODO: properly track nested elements with different lang *)
187
+
state.skip_depth <- max 0 (state.skip_depth - 1)
163
188
end
164
189
end
165
190
166
191
let characters state text _collector =
167
-
if state.in_body && state.skip_depth = 0 && state.char_count < max_chars then begin
192
+
if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
168
193
(* Count Unicode code points, not bytes *)
169
194
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
170
195
let rec process () =
···
197
222
match Langdetect.detect_with_prob detector text with
198
223
| None -> ()
199
224
| Some (detected_lang, prob) when prob > 0.90 ->
200
-
let declared_lang = match state.html_lang with
201
-
| Some l -> get_lang_code l
225
+
(* Get the original declared lang value (preserve exactly as written) *)
226
+
let original_declared = match state.html_lang with
227
+
| Some l -> l
202
228
| None -> ""
203
229
in
204
230
let detected_code = detected_lang in (* Keep full code like zh-tw *)
205
231
let detected_name = get_language_name detected_lang in
206
232
let suggested_code = get_bcp47_code detected_lang in
207
233
208
-
(* Check for language mismatch *)
209
-
let base_declared = get_lang_code declared_lang in
234
+
(* Check for language mismatch using base codes *)
235
+
let base_declared = get_lang_code original_declared in
210
236
let base_detected = get_lang_code detected_code in
211
-
if declared_lang = "" then begin
237
+
if original_declared = "" then begin
212
238
(* No lang attribute - suggest adding one *)
213
239
Message_collector.add_warning collector
214
240
~message:(Printf.sprintf
···
224
250
Message_collector.add_warning collector
225
251
~message:(Printf.sprintf
226
252
"This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead."
227
-
detected_name declared_lang suggested_code)
253
+
detected_name original_declared suggested_code)
228
254
~code:"wrong-lang"
229
255
~element:"html"
230
256
()
+22
-13
lib/html5_checker/specialized/h1_checker.ml
+22
-13
lib/html5_checker/specialized/h1_checker.ml
···
2
2
3
3
type state = {
4
4
mutable h1_count : int;
5
+
mutable svg_depth : int; (* Track depth inside SVG *)
5
6
}
6
7
7
8
let create () = {
8
9
h1_count = 0;
10
+
svg_depth = 0;
9
11
}
10
12
11
13
let reset state =
12
-
state.h1_count <- 0
14
+
state.h1_count <- 0;
15
+
state.svg_depth <- 0
13
16
14
17
let start_element state ~name ~namespace ~attrs collector =
15
18
ignore attrs;
16
-
if namespace <> None then ()
17
-
else begin
18
-
let name_lower = String.lowercase_ascii name in
19
-
if name_lower = "h1" then begin
20
-
state.h1_count <- state.h1_count + 1;
21
-
if state.h1_count > 1 then
22
-
Message_collector.add_info collector
23
-
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
24
-
~code:"multiple-h1"
25
-
~element:name ()
26
-
end
19
+
let name_lower = String.lowercase_ascii name in
20
+
(* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *)
21
+
if name_lower = "svg" then
22
+
state.svg_depth <- state.svg_depth + 1
23
+
else if namespace <> None || state.svg_depth > 0 then
24
+
() (* Skip non-HTML namespace or inside SVG *)
25
+
else if name_lower = "h1" then begin
26
+
state.h1_count <- state.h1_count + 1;
27
+
if state.h1_count > 1 then
28
+
Message_collector.add_info collector
29
+
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
30
+
~code:"multiple-h1"
31
+
~element:name ()
27
32
end
28
33
29
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
34
+
let end_element state ~name ~namespace:_ _collector =
35
+
let name_lower = String.lowercase_ascii name in
36
+
if name_lower = "svg" && state.svg_depth > 0 then
37
+
state.svg_depth <- state.svg_depth - 1
38
+
30
39
let characters _state _text _collector = ()
31
40
let end_document _state _collector = ()
32
41