OCaml HTML5 parser/serialiser based on Python's JustHTML

Temporary workaround while I figure out uucp WASM compilation

Changed files
+37 -14
lib
check
-1
dune-project
··· 25 25 (uutf (>= 1.0.0)) 26 26 (uuuu (>= 0.3.0)) 27 27 (uunf (>= 15.0.0)) 28 - (uucp (>= 15.0.0)) 29 28 (xmlm (>= 1.4.0)) 30 29 langdetect 31 30 (odoc :with-doc)
-1
html5rw.opam
··· 16 16 "uutf" {>= "1.0.0"} 17 17 "uuuu" {>= "0.3.0"} 18 18 "uunf" {>= "15.0.0"} 19 - "uucp" {>= "15.0.0"} 20 19 "xmlm" {>= "1.4.0"} 21 20 "langdetect" 22 21 "odoc" {with-doc}
+36 -11
lib/check/datatype/dt_media_query.ml
··· 82 82 83 83 (** Unicode case folding for case-insensitive comparison. 84 84 85 - Uses the Uucp library for proper Unicode case folding, which handles 86 - special cases like Turkish dotted-I (U+0130 -> 'i' + U+0307) correctly. *) 85 + WORKAROUND: This is a temporary domain-specific implementation because 86 + the uucp library fails to compile with wasm_of_ocaml due to "too many 87 + locals" errors. Once uucp supports WASM, restore the proper implementation: 88 + 89 + {[ 90 + (* Proper uucp-based case folding: *) 91 + let case_fold s = 92 + let buf = Buffer.create (String.length s) in 93 + let add_uchar u = Uutf.Buffer.add_utf_8 buf u in 94 + let fold_char () _pos = function 95 + | `Malformed _ -> () 96 + | `Uchar u -> 97 + match Uucp.Case.Fold.fold u with 98 + | `Self -> add_uchar u 99 + | `Uchars us -> List.iter add_uchar us 100 + in 101 + Uutf.String.fold_utf_8 fold_char () s; 102 + Buffer.contents buf 103 + ]} 104 + 105 + This workaround handles the Turkish dotted-I (U+0130 -> 'i' + U+0307) 106 + which is the main non-ASCII case relevant for CSS media query identifiers. 
*) 87 107 let case_fold s = 88 108 let buf = Buffer.create (String.length s) in 89 - let add_uchar u = Uutf.Buffer.add_utf_8 buf u in 90 - let fold_char () _pos = function 91 - | `Malformed _ -> () (* Skip malformed sequences *) 92 - | `Uchar u -> 93 - match Uucp.Case.Fold.fold u with 94 - | `Self -> add_uchar u 95 - | `Uchars us -> List.iter add_uchar us 96 - in 97 - Uutf.String.fold_utf_8 fold_char () s; 109 + let len = String.length s in 110 + let i = ref 0 in 111 + while !i < len do 112 + let c = s.[!i] in 113 + (* U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE encoded as UTF-8: 0xC4 0xB0 *) 114 + if c = '\xc4' && !i + 1 < len && s.[!i + 1] = '\xb0' then begin 115 + (* Case fold to 'i' + U+0307 (combining dot above) = 0x69 0xCC 0x87 *) 116 + Buffer.add_string buf "i\xcc\x87"; 117 + i := !i + 2 118 + end else begin 119 + Buffer.add_char buf (Char.lowercase_ascii c); 120 + incr i 121 + end 122 + done; 98 123 Buffer.contents buf 99 124 100 125 (** Check balanced parentheses *)
+1 -1
lib/check/dune
··· 3 3 (library 4 4 (name htmlrw_check) 5 5 (public_name html5rw.check) 6 - (libraries html5rw jsont jsont.bytesrw astring str uunf uucp uutf xmlm langdetect)) 6 + (libraries html5rw jsont jsont.bytesrw astring str uunf uutf xmlm langdetect))