simdjson bindings with streaming support

Add CBOR support with RFC 8949 compliance tests

- CBOR decoder using simdjson tape format (C++)
- CBOR encoder (pure OCaml)
- Half-precision float (16-bit) support
- Indefinite-length arrays/maps/strings
- 64/81 RFC 8949 Appendix A tests passing
- Memory leak tests for CBOR operations
- README documentation and API docstrings
- Version bump to 0.2.0

+52
README.md
··· 408 408 print_endline ("Parse error: " ^ e.message) 409 409 ``` 410 410 411 + ## CBOR 412 + 413 + `simdjsont` includes support for [CBOR (RFC 8949)](https://cbor.io/), using the same codecs as JSON. It decodes CBOR into the same internal `Simdjsont.Json.t` representation, allowing you to use your existing data models and validation logic. 414 + 415 + ### CBOR Quick Start 416 + 417 + ```ocaml 418 + type point = { x: int; y: int } 419 + 420 + let point_codec = 421 + let open Simdjsont.Decode in 422 + Obj.field (fun x y -> { x; y }) 423 + |> Obj.mem "x" int ~enc:(fun p -> p.x) 424 + |> Obj.mem "y" int ~enc:(fun p -> p.y) 425 + |> Obj.finish 426 + 427 + (* Encode to CBOR binary string *) 428 + let cbor = Simdjsont.Cbor.encode_string point_codec { x = 10; y = 20 } 429 + 430 + (* Decode from CBOR *) 431 + match Simdjsont.Cbor.decode_string point_codec cbor with 432 + | Ok p -> Printf.printf "Point from CBOR: (%d, %d)\n" p.x p.y 433 + | Error e -> print_endline e 434 + ``` 435 + 436 + ### CBOR Streaming 437 + 438 + CBOR often contains multiple concatenated items. Use `Simdjsont.Cbor.to_seq` to parse these as a sequence: 439 + 440 + ```ocaml 441 + let cbor_data = 442 + Simdjsont.Cbor.encode_string Simdjsont.Decode.int 1 ^ 443 + Simdjsont.Cbor.encode_string Simdjsont.Decode.int 2 ^ 444 + Simdjsont.Cbor.encode_string Simdjsont.Decode.int 3 445 + 446 + let () = 447 + Simdjsont.Cbor.to_seq Simdjsont.Decode.int cbor_data 448 + |> Seq.iter (function 449 + | Ok n -> Printf.printf "Decoded: %d\n" n 450 + | Error e -> Printf.printf "Error: %s\n" e) 451 + ``` 452 + 453 + ### CBOR Limitations 454 + 455 + The CBOR implementation is optimized for JSON-compatible data structures. The following CBOR features are currently NOT supported: 456 + 457 + - **Tags**: While basic tags are skipped, specialized tag processing (e.g., for Date/Time) is not included. 458 + - **Bignums**: Integers are limited to 64-bit signed/unsigned. 459 + - **Byte Strings**: Decoded as strings; binary-only data might fail UTF-8 validation if decoded via `string` codec. 460 + - **Undefined/Simple Values**: Only `true`, `false`, and `null` are supported. 461 + - **Integer Map Keys**: Only string keys (Major type 3) and byte string keys (Major type 2) are supported for objects. 462 + 411 463 ## License 412 464 413 465 ISC
+102
bench/bench_cbor.ml
··· 1 + let read_file path = 2 + let ic = open_in path in 3 + let n = in_channel_length ic in 4 + let s = really_input_string ic n in 5 + close_in ic; 6 + s 7 + 8 + let twitter_json = lazy (read_file "bench/data/twitter.json") 9 + let citm_json = lazy (read_file "bench/data/citm_catalog.json") 10 + 11 + let json_to_cbor_string json_str = 12 + match Simdjsont.Codec.decode_string Simdjsont.Codec.value json_str with 13 + | Ok json -> Simdjsont.Cbor.encode_string Simdjsont.Codec.value json 14 + | Error e -> failwith e 15 + 16 + let twitter_cbor = lazy (json_to_cbor_string (Lazy.force twitter_json)) 17 + let citm_cbor = lazy (json_to_cbor_string (Lazy.force citm_json)) 18 + 19 + let time_it name iterations f = 20 + Gc.full_major (); 21 + let start = Unix.gettimeofday () in 22 + for _ = 1 to iterations do 23 + f () 24 + done; 25 + let elapsed = Unix.gettimeofday () -. start in 26 + let ns_per_op = elapsed /. float_of_int iterations *. 1_000_000_000.0 in 27 + (name, ns_per_op) 28 + 29 + let run_cbor_benchmarks name cbor_str iterations = 30 + Printf.printf "\n=== %s (%d bytes) ===\n\n%!" name (String.length cbor_str); 31 + 32 + Printf.printf "--- DECODE (to AST) ---\n%!"; 33 + 34 + Printf.printf "Running simdjsont decode...\n%!"; 35 + let decode_results = 36 + [ 37 + time_it "simdjsont" iterations (fun () -> 38 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor_str with 39 + | Ok _ -> () 40 + | Error e -> failwith ("Simdjsont error: " ^ e)); 41 + time_it "ocaml-cbor" iterations (fun () -> 42 + let _ = CBOR.Simple.decode cbor_str in 43 + ()); 44 + ] 45 + in 46 + let baseline = List.assoc "ocaml-cbor" decode_results in 47 + List.iter 48 + (fun (n, ns) -> 49 + Printf.printf " %-12s %10.0f ns (%.1fx)\n%!" n ns (baseline /. ns)) 50 + decode_results; 51 + 52 + Printf.printf "\n--- ENCODE (AST to string) ---\n%!"; 53 + 54 + let simdjson_ast = 55 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor_str with 56 + | Ok v -> v 57 + | Error e -> failwith e 58 + in 59 + 60 + let cbor_ast = CBOR.Simple.decode cbor_str in 61 + let encode_results = 62 + [ 63 + time_it "simdjsont" iterations (fun () -> 64 + let _ = 65 + Simdjsont.Cbor.encode_string Simdjsont.Codec.value simdjson_ast 66 + in 67 + ()); 68 + time_it "ocaml-cbor" iterations (fun () -> 69 + let _ = CBOR.Simple.encode cbor_ast in 70 + ()); 71 + ] 72 + in 73 + let baseline = List.assoc "ocaml-cbor" encode_results in 74 + List.iter 75 + (fun (n, ns) -> 76 + Printf.printf " %-12s %10.0f ns (%.1fx)\n%!" n ns (baseline /. ns)) 77 + encode_results 78 + 79 + let () = 80 + Printf.printf "Preparing CBOR data...\n%!"; 81 + let twitter = Lazy.force twitter_cbor in 82 + let citm = Lazy.force citm_cbor in 83 + Printf.printf "CBOR data prepared. Twitter size: %d, CITM size: %d\n%!" 84 + (String.length twitter) (String.length citm); 85 + 86 + (* Printf.printf "Testing tiny CBOR...\n%!"; 87 + let tiny_json = "123" in 88 + let tiny_cbor = json_to_cbor_string tiny_json in 89 + (match Simdjsont.Cbor.decode_string Simdjsont.Codec.value tiny_cbor with 90 + | Ok _ -> Printf.printf "Tiny decode success\n%!" 91 + | Error e -> Printf.printf "Tiny decode failed: %s\n%!" e); 92 + 93 + Printf.printf "Testing array CBOR...\n%!"; 94 + let array_json = "[1, 2, 3]" in 95 + let array_cbor = json_to_cbor_string array_json in 96 + (match Simdjsont.Cbor.decode_string Simdjsont.Codec.value array_cbor with 97 + | Ok _ -> Printf.printf "Array decode success\n%!" 98 + | Error e -> Printf.printf "Array decode failed: %s\n%!" e); *) 99 + Printf.printf "CBOR benchmarks\n%!"; 100 + Printf.printf "==============================\n%!"; 101 + run_cbor_benchmarks "twitter.cbor" twitter 50; 102 + run_cbor_benchmarks "citm_catalog.cbor" citm 20
+4
bench/dune
··· 6 6 (executable 7 7 (name bench_ndjson) 8 8 (libraries simdjsont unix)) 9 + 10 + (executable 11 + (name bench_cbor) 12 + (libraries simdjsont cbor unix))
+3 -3
dune-project
··· 1 1 (lang dune 3.20) 2 2 3 3 (name simdjsont) 4 - (version 0.1.0) 4 + (version 0.2.0) 5 5 6 6 (generate_opam_files true) 7 7 ··· 20 20 21 21 (package 22 22 (name simdjsont) 23 - (synopsis "JSON parsing with simdjson, with support for ndjson streaming") 23 + (synopsis "JSON and CBOR parsing with simdjson, with support for streaming") 24 24 (description 25 - "OCaml bindings to simdjson with support for ndjson streaming. Includes vendored simdjson 4.2.4.") 25 + "OCaml bindings to simdjson with support for ndjson streaming and CBOR encoding/decoding. Includes vendored simdjson 4.2.4.") 26 26 (depends 27 27 (ocaml (>= 5.4.0)) 28 28 integers
+2 -2
lib/dune
··· 4 4 (libraries integers) 5 5 (foreign_stubs 6 6 (language cxx) 7 - (names simdjsont_stubs simdjsont_impl) 8 - (extra_deps simdjson_vendor.cc simdjson_vendor.hh) 7 + (names simdjsont_stubs simdjsont_impl simdcbor) 8 + (extra_deps simdjson_vendor.cc simdjson_vendor.hh simdcbor.hh) 9 9 (flags 10 10 :standard 11 11 -std=c++17
+411
lib/simdcbor.cpp
··· 1 + #include "simdcbor.hh" 2 + #include <cmath> 3 + #include <cstring> 4 + #include <vector> 5 + #include <stdexcept> 6 + 7 + #if defined(_MSC_VER) 8 + #include <stdlib.h> 9 + #define bswap_16(x) _byteswap_ushort(x) 10 + #define bswap_32(x) _byteswap_ulong(x) 11 + #define bswap_64(x) _byteswap_uint64(x) 12 + #else 13 + #define bswap_16(x) __builtin_bswap16(x) 14 + #define bswap_32(x) __builtin_bswap32(x) 15 + #define bswap_64(x) __builtin_bswap64(x) 16 + #endif 17 + 18 + using namespace simdjson; 19 + 20 + namespace { 21 + 22 + class CborReader { 23 + public: 24 + const uint8_t* current; 25 + const uint8_t* end; 26 + dom::document& doc; 27 + size_t tape_idx; 28 + uint8_t* current_string_buf; 29 + const uint8_t* string_buf_start; 30 + 31 + CborReader(const uint8_t* buf, size_t len, dom::document& d) 32 + : current(buf), end(buf + len), doc(d), tape_idx(0), 33 + current_string_buf(d.string_buf.get()), 34 + string_buf_start(d.string_buf.get()) {} 35 + 36 + void append_tape(uint64_t val, internal::tape_type type) { 37 + doc.tape[tape_idx++] = val | (uint64_t(type) << 56); 38 + } 39 + 40 + void append_tape_value(uint64_t val) { 41 + doc.tape[tape_idx++] = val; 42 + } 43 + 44 + size_t reserve_tape() { 45 + return tape_idx++; 46 + } 47 + 48 + void set_tape(size_t idx, uint64_t val, internal::tape_type type) { 49 + doc.tape[idx] = val | (uint64_t(type) << 56); 50 + } 51 + 52 + error_code parse_root() { 53 + size_t root_start = reserve_tape(); 54 + 55 + error_code ec = parse_item(); 56 + if (ec != SUCCESS) return ec; 57 + 58 + size_t root_end = reserve_tape(); 59 + 60 + set_tape(root_start, root_end, internal::tape_type::ROOT); 61 + set_tape(root_end, root_start, internal::tape_type::ROOT); 62 + 63 + return SUCCESS; 64 + } 65 + 66 + error_code parse_item() { 67 + if (current >= end) return EMPTY; 68 + 69 + uint8_t initial = *current++; 70 + uint8_t major = initial >> 5; 71 + uint8_t additional = initial & 0x1F; 72 + 73 + switch (major) { 74 + case 0: return parse_uint(additional); 75 + case 1: return parse_nint(additional); 76 + case 2: return parse_byte_string(additional); 77 + case 3: return parse_text_string(additional); 78 + case 4: return parse_array(additional); 79 + case 5: return parse_map(additional); 80 + case 6: return parse_item(); 81 + case 7: return parse_float_simple(additional); 82 + default: return UNEXPECTED_ERROR; 83 + } 84 + } 85 + 86 + private: 87 + uint64_t read_uint(uint8_t additional, error_code& ec) { 88 + if (additional < 24) { 89 + return additional; 90 + } else if (additional == 24) { 91 + if (current + 1 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 92 + uint8_t v = *current++; 93 + return v; 94 + } else if (additional == 25) { 95 + if (current + 2 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 96 + uint16_t v; 97 + memcpy(&v, current, 2); 98 + current += 2; 99 + return bswap_16(v); 100 + } else if (additional == 26) { 101 + if (current + 4 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 102 + uint32_t v; 103 + memcpy(&v, current, 4); 104 + current += 4; 105 + return bswap_32(v); 106 + } else if (additional == 27) { 107 + if (current + 8 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 108 + uint64_t v; 109 + memcpy(&v, current, 8); 110 + current += 8; 111 + return bswap_64(v); 112 + } else { 113 + ec = UNEXPECTED_ERROR; 114 + return 0; 115 + } 116 + } 117 + 118 + error_code parse_uint(uint8_t additional) { 119 + error_code ec = SUCCESS; 120 + uint64_t val = read_uint(additional, ec); 121 + if (ec != SUCCESS) return ec; 122 + 123 + append_tape(0, internal::tape_type::UINT64); 124 + append_tape_value(val); 125 + return SUCCESS; 126 + } 127 + 128 + error_code parse_nint(uint8_t additional) { 129 + error_code ec = SUCCESS; 130 + uint64_t val = read_uint(additional, ec); 131 + if (ec != SUCCESS) return ec; 132 + 133 + append_tape(0, internal::tape_type::INT64); 134 + int64_t nval = -1 - int64_t(val); 135 + append_tape_value((uint64_t)nval); 136 + return SUCCESS; 137 + } 138 + 139 + error_code parse_byte_string(uint8_t additional) { 140 + if (additional == 31) return UNEXPECTED_ERROR; 141 + 142 + error_code ec = SUCCESS; 143 + uint64_t len = read_uint(additional, ec); 144 + if (ec != SUCCESS) return ec; 145 + 146 + if (current + len > end) return INDEX_OUT_OF_BOUNDS; 147 + 148 + return write_string(current, len); 149 + } 150 + 151 + error_code parse_text_string(uint8_t additional) { 152 + if (additional == 31) { 153 + size_t offset = current_string_buf - string_buf_start; 154 + uint8_t* len_ptr = current_string_buf; 155 + current_string_buf += sizeof(uint32_t); 156 + size_t total_len = 0; 157 + 158 + while (true) { 159 + if (current >= end) return UNEXPECTED_ERROR; 160 + if (*current == 0xFF) { 161 + current++; 162 + break; 163 + } 164 + uint8_t chunk_initial = *current++; 165 + if ((chunk_initial >> 5) != 3) return INCORRECT_TYPE; 166 + 167 + error_code ec = SUCCESS; 168 + uint64_t chunk_len = read_uint(chunk_initial & 0x1F, ec); 169 + if (ec != SUCCESS) return ec; 170 + if (current + chunk_len > end) return INDEX_OUT_OF_BOUNDS; 171 + 172 + if (!simdjson::validate_utf8((const char*)current, chunk_len)) { 173 + return UTF8_ERROR; 174 + } 175 + 176 + memcpy(current_string_buf, current, chunk_len); 177 + current_string_buf += chunk_len; 178 + current += chunk_len; 179 + total_len += chunk_len; 180 + } 181 + 182 + *current_string_buf++ = 0; 183 + uint32_t len32 = (uint32_t)total_len; 184 + memcpy(len_ptr, &len32, sizeof(uint32_t)); 185 + append_tape(offset, internal::tape_type::STRING); 186 + return SUCCESS; 187 + } 188 + 189 + error_code ec = SUCCESS; 190 + uint64_t len = read_uint(additional, ec); 191 + if (ec != SUCCESS) return ec; 192 + 193 + if (current + len > end) return INDEX_OUT_OF_BOUNDS; 194 + 195 + if (!simdjson::validate_utf8((const char*)current, len)) { 196 + return UTF8_ERROR; 197 + } 198 + 199 + return write_string(current, len); 200 + } 201 + 202 + error_code write_string(const uint8_t* ptr, size_t len) { 203 + uint32_t len32 = (uint32_t)len; 204 + size_t offset = current_string_buf - string_buf_start; 205 + 206 + append_tape(offset, internal::tape_type::STRING); 207 + 208 + memcpy(current_string_buf, &len32, sizeof(uint32_t)); 209 + current_string_buf += sizeof(uint32_t); 210 + 211 + memcpy(current_string_buf, ptr, len); 212 + current_string_buf += len; 213 + 214 + *current_string_buf++ = 0; 215 + current += len; 216 + 217 + return SUCCESS; 218 + } 219 + 220 + error_code parse_array(uint8_t additional) { 221 + size_t start_idx = reserve_tape(); 222 + uint64_t count = 0; 223 + 224 + if (additional == 31) { 225 + while (true) { 226 + if (current >= end) return UNEXPECTED_ERROR; 227 + if (*current == 0xFF) { 228 + current++; 229 + break; 230 + } 231 + error_code ec = parse_item(); 232 + if (ec != SUCCESS) return ec; 233 + count++; 234 + } 235 + } else { 236 + error_code ec = SUCCESS; 237 + count = read_uint(additional, ec); 238 + if (ec != SUCCESS) return ec; 239 + 240 + for (uint64_t i = 0; i < count; ++i) { 241 + ec = parse_item(); 242 + if (ec != SUCCESS) return ec; 243 + } 244 + } 245 + 246 + size_t end_idx = reserve_tape(); 247 + size_t next_idx = end_idx + 1; 248 + 249 + uint64_t start_payload = next_idx | (count << 32); 250 + doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_ARRAY) << 56); 251 + doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_ARRAY) << 56); 252 + 253 + return SUCCESS; 254 + } 255 + 256 + error_code parse_map(uint8_t additional) { 257 + size_t start_idx = reserve_tape(); 258 + uint64_t count = 0; 259 + 260 + if (additional == 31) { 261 + while (true) { 262 + if (current >= end) return UNEXPECTED_ERROR; 263 + if (*current == 0xFF) { 264 + current++; 265 + break; 266 + } 267 + 268 + error_code ec = parse_key(); 269 + if (ec != SUCCESS) return ec; 270 + 271 + ec = parse_item(); 272 + if (ec != SUCCESS) return ec; 273 + count++; 274 + } 275 + } else { 276 + error_code ec = SUCCESS; 277 + count = read_uint(additional, ec); 278 + if (ec != SUCCESS) return ec; 279 + 280 + for (uint64_t i = 0; i < count; ++i) { 281 + ec = parse_key(); 282 + if (ec != SUCCESS) return ec; 283 + 284 + ec = parse_item(); 285 + if (ec != SUCCESS) return ec; 286 + } 287 + } 288 + 289 + size_t end_idx = reserve_tape(); 290 + size_t next_idx = end_idx + 1; 291 + 292 + uint64_t start_payload = next_idx | (count << 32); 293 + doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_OBJECT) << 56); 294 + doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_OBJECT) << 56); 295 + 296 + return SUCCESS; 297 + } 298 + 299 + error_code parse_key() { 300 + if (current >= end) return EMPTY; 301 + 302 + uint8_t initial = *current; 303 + uint8_t major = initial >> 5; 304 + 305 + if (major == 3) { 306 + current++; 307 + return parse_text_string(initial & 0x1F); 308 + } else if (major == 2) { 309 + current++; 310 + return parse_byte_string(initial & 0x1F); 311 + } else { 312 + return INCORRECT_TYPE; 313 + } 314 + } 315 + 316 + error_code parse_float_simple(uint8_t additional) { 317 + if (additional < 20) { 318 + return UNEXPECTED_ERROR; 319 + } 320 + switch (additional) { 321 + case 20: 322 + append_tape(0, internal::tape_type::FALSE_VALUE); 323 + return SUCCESS; 324 + case 21: 325 + append_tape(0, internal::tape_type::TRUE_VALUE); 326 + return SUCCESS; 327 + case 22: 328 + append_tape(0, internal::tape_type::NULL_VALUE); 329 + return SUCCESS; 330 + case 23: 331 + append_tape(0, internal::tape_type::NULL_VALUE); 332 + return SUCCESS; 333 + case 24: 334 + if (current + 1 > end) return INDEX_OUT_OF_BOUNDS; 335 + current++; 336 + return UNEXPECTED_ERROR; 337 + case 25: { 338 + if (current + 2 > end) return INDEX_OUT_OF_BOUNDS; 339 + uint16_t v; 340 + memcpy(&v, current, 2); 341 + current += 2; 342 + v = bswap_16(v); 343 + 344 + uint32_t sign = (v >> 15) & 1; 345 + uint32_t exp = (v >> 10) & 0x1F; 346 + uint32_t mant = v & 0x3FF; 347 + 348 + double d; 349 + if (exp == 0) { 350 + d = std::ldexp(mant, -24); 351 + } else if (exp == 31) { 352 + d = (mant == 0) ? INFINITY : NAN; 353 + } else { 354 + d = std::ldexp(mant + 1024, exp - 25); 355 + } 356 + if (sign) d = -d; 357 + 358 + append_tape(0, internal::tape_type::DOUBLE); 359 + uint64_t d_as_u64; 360 + memcpy(&d_as_u64, &d, 8); 361 + append_tape_value(d_as_u64); 362 + return SUCCESS; 363 + } 364 + case 26: { 365 + if (current + 4 > end) return INDEX_OUT_OF_BOUNDS; 366 + uint32_t v; 367 + memcpy(&v, current, 4); 368 + current += 4; 369 + v = bswap_32(v); 370 + float f; 371 + memcpy(&f, &v, 4); 372 + 373 + append_tape(0, internal::tape_type::DOUBLE); 374 + double d = f; 375 + uint64_t d_as_u64; 376 + memcpy(&d_as_u64, &d, 8); 377 + append_tape_value(d_as_u64); 378 + return SUCCESS; 379 + } 380 + case 27: { 381 + if (current + 8 > end) return INDEX_OUT_OF_BOUNDS; 382 + uint64_t v; 383 + memcpy(&v, current, 8); 384 + current += 8; 385 + v = bswap_64(v); 386 + 387 + append_tape(0, internal::tape_type::DOUBLE); 388 + append_tape_value(v); 389 + return SUCCESS; 390 + } 391 + default: 392 + return UNEXPECTED_ERROR; 393 + } 394 + } 395 + }; 396 + 397 + } 398 + 399 + simdjson::error_code simdcbor::parse(const uint8_t* buf, size_t len, simdjson::dom::parser& parser, size_t& bytes_read) { 400 + auto err = parser.doc.allocate(len * 8 + 4096); 401 + if (err != SUCCESS) return err; 402 + 403 + CborReader reader(buf, len, parser.doc); 404 + err = reader.parse_root(); 405 + bytes_read = reader.current - buf; 406 + return err; 407 + } 408 + 409 + simdjson::dom::element simdcbor::get_root(simdjson::dom::parser& parser) { 410 + return parser.doc.root(); 411 + }
+12
lib/simdcbor.hh
··· 1 + #ifndef SIMDCBOR_HH 2 + #define SIMDCBOR_HH 3 + 4 + #include "simdjson_vendor.hh" 5 + 6 + class simdcbor { 7 + public: 8 + static simdjson::error_code parse(const uint8_t* buf, size_t len, simdjson::dom::parser& parser, size_t& bytes_read); 9 + static simdjson::dom::element get_root(simdjson::dom::parser& parser); 10 + }; 11 + 12 + #endif
+3
lib/simdjson_vendor.hh
··· 1 + class simdcbor; 1 2 /* auto-generated on 2025-12-17 20:32:36 -0500. version 4.2.4 Do not edit! */ 2 3 /* including simdjson.h: */ 3 4 /* begin file simdjson.h */ ··· 5110 5111 * This class cannot be copied, only moved, to avoid unintended allocations. 5111 5112 */ 5112 5113 class document { 5114 + friend class ::simdcbor; 5113 5115 public: 5114 5116 /** 5115 5117 * Create a document container with zero capacity. ··· 5211 5213 * @note This is not thread safe: one parser cannot produce two documents at the same time! 5212 5214 */ 5213 5215 class parser { 5216 + friend class ::simdcbor; 5214 5217 public: 5215 5218 /** 5216 5219 * Create a JSON parser.
+51
lib/simdjsont.ml
··· 97 97 let validate = Validate.is_valid 98 98 let decode codec s = Codec.decode_string codec s 99 99 let encode codec v = Codec.encode_string codec v 100 + 101 + (** {1 CBOR} *) 102 + 103 + module Cbor = struct 104 + external parse_cbor : 105 + Raw.parser -> Raw.buffer -> int -> Raw.error_code * Raw.element option 106 + = "simdjson_parse_cbor" 107 + 108 + external cbor_next : 109 + Raw.parser -> 110 + Raw.buffer -> 111 + int -> 112 + int -> 113 + Raw.error_code * Raw.element option * int = "simdjson_cbor_next" 114 + 115 + let decode_string codec s = 116 + let parser = Raw.create_parser () in 117 + let buf = Raw.buffer_of_string s in 118 + let len = String.length s in 119 + match parse_cbor parser buf len with 120 + | Raw.Success, Some elt -> Codec.decode_element codec elt 121 + | Raw.Success, None -> Error "Internal error: success but no element" 122 + | err, _ -> Error (Raw.error_message err) 123 + 124 + let to_seq codec s = 125 + let parser = Raw.create_parser () in 126 + let buf = Raw.buffer_of_string s in 127 + let len = String.length s in 128 + let rec aux offset () = 129 + if offset >= len then Seq.Nil 130 + else 131 + match cbor_next parser buf len offset with 132 + | Raw.Success, Some elt, new_offset -> 133 + let res = Codec.decode_element codec elt in 134 + Seq.Cons (res, aux new_offset) 135 + | Raw.Success, None, _ -> Seq.Nil 136 + | Raw.Empty, _, _ -> Seq.Nil 137 + | err, _, _ -> 138 + let msg = Raw.error_message err in 139 + Seq.Cons (Error msg, fun () -> Seq.Nil) 140 + in 141 + aux 0 142 + 143 + let encode_to_buffer codec v = 144 + let ast = Codec.to_json codec v in 145 + Simdjsont_cbor_encode.to_buffer ast 146 + 147 + let encode_string codec v = 148 + let ast = Codec.to_json codec v in 149 + Simdjsont_cbor_encode.to_string ast 150 + end
+50
lib/simdjsont.mli
··· 91 91 92 92 val encode : 'a Codec.t -> 'a -> string 93 93 (** Convenience alias for {!Codec.encode_string}. *) 94 + 95 + (** {1 CBOR} 96 + 97 + CBOR (Concise Binary Object Representation, RFC 8949) support using the same 98 + codec infrastructure as JSON. The same codecs work for both formats. 99 + 100 + {b Supported CBOR types:} 101 + - Integers (major types 0, 1) including 64-bit 102 + - Floats (major type 7) including half-precision (16-bit) 103 + - Text strings (major type 3) including indefinite-length 104 + - Arrays (major type 4) including indefinite-length 105 + - Maps (major type 5) with string keys, including indefinite-length 106 + - Simple values: true, false, null 107 + 108 + {b Not supported} (returns error or skipped): 109 + - Byte strings (major type 2) 110 + - Tagged values (major type 6) 111 + - Integer map keys 112 + - Bignums, undefined, other simple values *) 113 + 114 + module Cbor : sig 115 + val decode_string : 'a Codec.t -> string -> ('a, string) result 116 + (** [decode_string codec cbor] decodes a CBOR byte string using [codec]. 117 + 118 + Uses the same codec type as JSON decoding, so you can reuse codecs: 119 + {[ 120 + let point = Simdjsont.Cbor.decode_string point_codec cbor_bytes 121 + ]} *) 122 + 123 + val to_seq : 'a Codec.t -> string -> ('a, string) result Seq.t 124 + (** [to_seq codec data] parses multiple concatenated CBOR items from [data]. 125 + 126 + Returns a sequence of decode results, one per CBOR item. Useful for 127 + processing streams of CBOR-encoded records: 128 + {[ 129 + Simdjsont.Cbor.to_seq event_codec cbor_stream 130 + |> Seq.iter (function 131 + | Ok event -> process event 132 + | Error msg -> log_error msg) 133 + ]} *) 134 + 135 + val encode_string : 'a Codec.t -> 'a -> string 136 + (** [encode_string codec value] encodes [value] to a CBOR byte string. 137 + 138 + The result is a binary string containing valid CBOR. *) 139 + 140 + val encode_to_buffer : 'a Codec.t -> 'a -> Buffer.t 141 + (** [encode_to_buffer codec value] encodes [value] to a fresh buffer 142 + containing CBOR bytes. *) 143 + end
+105
lib/simdjsont_cbor_encode.ml
··· 1 + module Json = Simdjsont_json 2 + 3 + let write_byte buf b = Buffer.add_char buf (Char.unsafe_chr b) 4 + 5 + let write_uint_body buf v = 6 + if v < 24L then write_byte buf (Int64.to_int v) 7 + else if v <= 0xFFL then ( 8 + write_byte buf 24; 9 + write_byte buf (Int64.to_int v)) 10 + else if v <= 0xFFFFL then ( 11 + write_byte buf 25; 12 + let v_int = Int64.to_int v in 13 + write_byte buf ((v_int lsr 8) land 0xFF); 14 + write_byte buf (v_int land 0xFF)) 15 + else if v <= 0xFFFFFFFFL then ( 16 + write_byte buf 26; 17 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 24) land 0xFF); 18 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 16) land 0xFF); 19 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 8) land 0xFF); 20 + write_byte buf (Int64.to_int v land 0xFF)) 21 + else ( 22 + write_byte buf 27; 23 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 56) land 0xFF); 24 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 48) land 0xFF); 25 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 40) land 0xFF); 26 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 32) land 0xFF); 27 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 24) land 0xFF); 28 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 16) land 0xFF); 29 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 8) land 0xFF); 30 + write_byte buf (Int64.to_int v land 0xFF)) 31 + 32 + 33 + let write_head buf major v = 34 + let major_bits = major lsl 5 in 35 + if v < 24L then write_byte buf (major_bits lor Int64.to_int v) 36 + else if v <= 0xFFL then ( 37 + write_byte buf (major_bits lor 24); 38 + write_byte buf (Int64.to_int v)) 39 + else if v <= 0xFFFFL then ( 40 + write_byte buf (major_bits lor 25); 41 + let v_int = Int64.to_int v in 42 + write_byte buf ((v_int lsr 8) land 0xFF); 43 + write_byte buf (v_int land 0xFF)) 44 + else if v <= 0xFFFFFFFFL then ( 45 + write_byte buf (major_bits lor 26); 46 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 24) land 0xFF); 47 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 16) land 0xFF); 48 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 8) land 0xFF); 49 + write_byte buf (Int64.to_int v land 0xFF)) 50 + else ( 51 + write_byte buf (major_bits lor 27); 52 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 56) land 0xFF); 53 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 48) land 0xFF); 54 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 40) land 0xFF); 55 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 32) land 0xFF); 56 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 24) land 0xFF); 57 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 16) land 0xFF); 58 + write_byte buf (Int64.to_int (Int64.shift_right_logical v 8) land 0xFF); 59 + write_byte buf (Int64.to_int v land 0xFF)) 60 + 61 + let rec write buf = function 62 + | Json.Null -> write_byte buf 0xF6 63 + | Json.Bool false -> write_byte buf 0xF4 64 + | Json.Bool true -> write_byte buf 0xF5 65 + | Json.Int i -> 66 + if i >= 0L then write_head buf 0 i else write_head buf 1 (Int64.lognot i) 67 + | Json.Float f -> 68 + write_byte buf 0xFB; 69 + let i = Int64.bits_of_float f in 70 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 56) land 0xFF); 71 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 48) land 0xFF); 72 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 40) land 0xFF); 73 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 32) land 0xFF); 74 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 24) land 0xFF); 75 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 16) land 0xFF); 76 + write_byte buf (Int64.to_int (Int64.shift_right_logical i 8) land 0xFF); 77 + write_byte buf (Int64.to_int i land 0xFF) 78 + | Json.String s -> 79 + let len = String.length s in 80 + write_head buf 3 (Int64.of_int len); 81 + Buffer.add_string buf s 82 + | Json.Array items -> 83 + let len = List.length items in 84 + write_head buf 4 (Int64.of_int len); 85 + List.iter (write buf) items 86 + | Json.Object members -> 87 + let len = List.length members in 88 + write_head buf 5 (Int64.of_int len); 89 + List.iter 90 + (fun (k, v) -> 91 + let klen = String.length k in 92 + write_head buf 3 (Int64.of_int klen); 93 + Buffer.add_string buf k; 94 + write buf v) 95 + members 96 + 97 + let to_buffer ?(capacity = 256) v = 98 + let buf = Buffer.create capacity in 99 + write buf v; 100 + buf 101 + 102 + let to_string v = 103 + let buf = Buffer.create 256 in 104 + write buf v; 105 + Buffer.contents buf
+34 -2
lib/simdjsont_codec.ml
··· 12 12 13 13 type 'a decoder = string list -> Simdjsont_raw.element -> 'a 14 14 type encoder = Buffer.t -> unit 15 - type 'a t = { decode : 'a decoder; encode : 'a -> encoder } 15 + 16 + type 'a t = { 17 + decode : 'a decoder; 18 + encode : 'a -> encoder; 19 + to_json : 'a -> Simdjsont_json.t; 20 + } 16 21 17 22 let decode_element codec elt = 18 23 try Ok (codec.decode [] elt) ··· 36 41 buf 37 42 38 43 let encode_string codec v = Buffer.contents (encode_to_buffer codec v) 44 + let to_json codec v = codec.to_json v 39 45 40 46 let write_escaped_string buf s = 41 47 Buffer.add_char buf '"'; ··· 62 68 | Simdjsont_raw.Null -> () 63 69 | _ -> decode_error path "expected null"); 64 70 encode = (fun () buf -> Buffer.add_string buf "null"); 71 + to_json = (fun () -> Simdjsont_json.Null); 65 72 } 66 73 67 74 let bool : bool t = ··· 73 80 | Error _ -> decode_error path "expected bool"); 74 81 encode = 75 82 (fun b buf -> Buffer.add_string buf (if b then "true" else "false")); 83 + to_json = (fun b -> Simdjsont_json.Bool b); 76 84 } 77 85 78 86 let int : int t = ··· 83 91 | Ok i -> Int64.to_int i 84 92 | Error _ -> decode_error path "expected int"); 85 93 encode = (fun i buf -> Buffer.add_string buf (string_of_int i)); 94 + to_json = (fun i -> Simdjsont_json.Int (Int64.of_int i)); 86 95 } 87 96 88 97 let int64 : int64 t = ··· 93 102 | Ok i -> i 94 103 | Error _ -> decode_error path "expected int64"); 95 104 encode = (fun i buf -> Buffer.add_string buf (Int64.to_string i)); 105 + to_json = (fun i -> Simdjsont_json.Int i); 96 106 } 97 107 98 108 let float : float t = ··· 119 129 if Float.is_finite f then 120 130 Buffer.add_string buf (Printf.sprintf "%.17g" f) 121 131 else Buffer.add_string buf "null"); 132 + to_json = (fun f -> Simdjsont_json.Float f); 122 133 } 123 134 124 135 let string : string t = ··· 129 140 | Ok s -> s 130 141 | Error _ -> decode_error path "expected string"); 131 142 encode = (fun s buf -> write_escaped_string buf s); 143 + to_json = (fun s -> Simdjsont_json.String s); 132 144 } 133 145 134 146 let list (item : 'a t) : 'a list t = ··· 160 172 item.encode v buf) 161 173 xs); 162 174 Buffer.add_char buf ']'); 175 + to_json = (fun items -> Simdjsont_json.Array (List.map item.to_json items)); 163 176 } 164 177 165 178 let array (item : 'a t) : 'a array t = ··· 196 209 done 197 210 end; 198 211 Buffer.add_char buf ']'); 212 + to_json = 213 + (fun items -> 214 + Simdjsont_json.Array (List.map item.to_json (Array.to_list items))); 199 215 } 200 216 201 217 let optional (inner : 'a t) : 'a option t = ··· 210 226 match opt with 211 227 | None -> Buffer.add_string buf "null" 212 228 | Some v -> inner.encode v buf); 229 + to_json = 230 + (function None -> Simdjsont_json.Null | Some v -> inner.to_json v); 213 231 } 214 232 215 233 let map (f : 'a -> 'b) (g : 'b -> 'a) (codec : 'a t) : 'b t = 216 234 { 217 235 decode = (fun path elt -> f (codec.decode path elt)); 218 236 encode = (fun v buf -> codec.encode (g v) buf); 237 + to_json = (fun v -> codec.to_json (g v)); 219 238 } 220 239 221 240 let decode_obj_field path (obj : Simdjsont_raw.object_) name (dec : 'a decoder) ··· 239 258 type ('o, 'dec) builder = { 240 259 dec : string list -> Simdjsont_raw.object_ -> 'dec; 241 260 enc : 'o -> Buffer.t -> bool -> bool; 261 + ast : 'o -> (string * Simdjsont_json.t) list; 242 262 } 243 263 244 264 let field constructor = 245 265 { 246 266 dec = (fun _path _obj -> constructor); 247 267 enc = (fun _v _buf first -> first); 268 + ast = (fun _v -> []); 248 269 } 249 270 250 271 let mem name (codec : 'a t) ~enc:(get : 'o -> 'a) ··· 262 283 Buffer.add_char buf ':'; 263 284 codec.encode (get v) buf; 264 285 false); 286 + ast = (fun v -> (name, codec.to_json (get v)) :: builder.ast v); 265 287 } 266 288 267 289 let opt_mem name (codec : 'a t) ~enc:(get : 'o -> 'a option) ··· 282 304 Buffer.add_char buf ':'; 283 305 codec.encode inner_v buf; 284 306 false); 307 + ast = 308 + (fun v -> 309 + match get v with 310 + | None -> builder.ast v 311 + | Some inner_v -> (name, codec.to_json inner_v) :: builder.ast v); 285 312 } 286 313 287 314 let finish (builder : ('o, 'o) builder) : 'o t = ··· 296 323 Buffer.add_char buf '{'; 297 324 let _ = builder.enc v buf true in 298 325 Buffer.add_char buf '}'); 326 + to_json = (fun v -> Simdjsont_json.Object (builder.ast v)); 299 327 } 300 328 end 301 329 ··· 345 373 Simdjsont_json.Object (List.rev !members) 346 374 | Error _ -> Simdjsont_json.Object []) 347 375 in 348 - { decode = decode_value; encode = (fun v buf -> Simdjsont_json.write buf v) } 376 + { 377 + decode = decode_value; 378 + encode = (fun v buf -> Simdjsont_json.write buf v); 379 + to_json = (fun v -> v); 380 + }
+3
lib/simdjsont_codec.mli
··· 36 36 val encode_string : 'a t -> 'a -> string 37 37 (** Encode a value to a JSON string using the given codec. *) 38 38 39 + val to_json : 'a t -> 'a -> Simdjsont_json.t 40 + (** Convert a value to JSON AST using the given codec. *) 41 + 39 42 val null : unit t 40 43 (** Codec for the JSON [null] value. *) 41 44
+72
lib/simdjsont_stubs.cpp
··· 1 1 #include "simdjson_vendor.hh" 2 + #include "simdcbor.hh" 2 3 #include <cmath> 3 4 4 5 extern "C" { ··· 940 941 DocumentStreamState *state = DocStream_val(stream); 941 942 CAMLreturn(Val_long(state->stream.size_in_bytes())); 942 943 } 944 + 945 + /* ========================================================================= */ 946 + /* CBOR operations */ 947 + /* ========================================================================= */ 948 + 949 + extern "C" CAMLprim value simdjson_parse_cbor(value parser, value buf, value len) { 950 + CAMLparam3(parser, buf, len); 951 + CAMLlocal3(result, element_val, some_val); 952 + 953 + dom::parser *p = Parser_val(parser); 954 + const uint8_t *data = (const uint8_t *)Caml_ba_data_val(buf); 955 + size_t length = Long_val(len); 956 + 957 + size_t bytes_read = 0; 958 + auto error = simdcbor::parse(data, length, *p, bytes_read); 959 + 960 + result = caml_alloc_tuple(2); 961 + Store_field(result, 0, Val_int(error_code_to_int(error))); 962 + 963 + if (error == SUCCESS) { 964 + element_val = caml_alloc_custom(&element_ops, sizeof(dom::element), 0, 1); 965 + Element_val(element_val) = simdcbor::get_root(*p); 966 + 967 + some_val = caml_alloc(1, 0); 968 + Store_field(some_val, 0, element_val); 969 + Store_field(result, 1, some_val); 970 + } else { 971 + Store_field(result, 1, Val_none); 972 + } 973 + 974 + CAMLreturn(result); 975 + } 976 + 977 + extern "C" CAMLprim value simdjson_cbor_next(value parser, value buf, value len, value offset_val) { 978 + CAMLparam4(parser, buf, len, offset_val); 979 + CAMLlocal3(result, element_val, some_val); 980 + 981 + dom::parser *p = Parser_val(parser); 982 + const uint8_t *data = (const uint8_t *)Caml_ba_data_val(buf); 983 + size_t length = Long_val(len); 984 + size_t offset = Long_val(offset_val); 985 + 986 + if (offset >= length) { 987 + result = caml_alloc_tuple(3); 988 + Store_field(result, 0, Val_int(error_code_to_int(EMPTY))); 989 + Store_field(result, 1, Val_none); 990 + Store_field(result, 2, Val_long(offset)); 991 + CAMLreturn(result); 992 + } 993 + 994 + size_t bytes_read = 0; 995 + auto error = simdcbor::parse(data + offset, length - offset, *p, bytes_read); 996 + 997 + result = caml_alloc_tuple(3); 998 + Store_field(result, 0, Val_int(error_code_to_int(error))); 999 + 1000 + if (error == SUCCESS) { 1001 + element_val = caml_alloc_custom(&element_ops, sizeof(dom::element), 0, 1); 1002 + Element_val(element_val) = simdcbor::get_root(*p); 1003 + 1004 + some_val = caml_alloc(1, 0); 1005 + Store_field(some_val, 0, element_val); 1006 + Store_field(result, 1, some_val); 1007 + Store_field(result, 2, Val_long(offset + bytes_read)); 1008 + } else { 1009 + Store_field(result, 1, Val_none); 1010 + Store_field(result, 2, Val_long(offset)); 1011 + } 1012 + 1013 + CAMLreturn(result); 1014 + }
+3 -3
simdjsont.opam
··· 1 1 # This file is generated by dune, edit dune-project instead 2 2 opam-version: "2.0" 3 - version: "0.1.0" 4 - synopsis: "JSON parsing with simdjson, with support for ndjson streaming" 3 + version: "0.2.0" 4 + synopsis: "JSON and CBOR parsing with simdjson, with support for streaming" 5 5 description: 6 - "OCaml bindings to simdjson with support for ndjson streaming. Includes vendored simdjson 4.2.4." 6 + "OCaml bindings to simdjson with support for ndjson streaming and CBOR encoding/decoding. Includes vendored simdjson 4.2.4." 7 7 maintainer: ["Gabriel Díaz"] 8 8 authors: ["Gabriel Díaz"] 9 9 license: "ISC"
+8
test/dune
··· 9 9 (test 10 10 (name test_memory) 11 11 (libraries simdjsont alcotest)) 12 + 13 + (test 14 + (name test_cbor) 15 + (libraries simdjsont alcotest)) 16 + 17 + (test 18 + (name test_cbor_compliance) 19 + (libraries simdjsont))
+71
test/test_cbor.ml
··· 1 + (* Test CBOR parsing and encoding *) 2 + 3 + let test_decode_int () = 4 + (* CBOR encoding of integer 42: 0x18 0x2a *) 5 + let cbor_int = "\x18\x2a" in 6 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.int cbor_int with 7 + | Ok n -> Alcotest.(check int) "decode int 42" 42 n 8 + | Error e -> Alcotest.fail ("decode_int failed: " ^ e) 9 + 10 + let test_decode_small_int () = 11 + (* Small integers 0-23 are encoded in a single byte *) 12 + let cbor_int = "\x05" in 13 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.int cbor_int with 14 + | Ok n -> Alcotest.(check int) "decode int 5" 5 n 15 + | Error e -> Alcotest.fail ("decode_small_int failed: " ^ e) 16 + 17 + let test_decode_string () = 18 + (* CBOR text string "hello": 0x65 h e l l o *) 19 + let cbor_str = "\x65hello" in 20 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.string cbor_str with 21 + | Ok s -> Alcotest.(check string) "decode string" "hello" s 22 + | Error e -> Alcotest.fail ("decode_string failed: " ^ e) 23 + 24 + let test_decode_array () = 25 + let cbor_arr = "\x83\x01\x02\x03" in 26 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.(list int) cbor_arr with 27 + | Ok lst -> Alcotest.(check (list int)) "decode array" [ 1; 2; 3 ] lst 28 + | Error e -> Alcotest.fail ("decode_array failed: " ^ e) 29 + 30 + let test_decode_bool () = 31 + (* CBOR true: 0xf5, false: 0xf4 *) 32 + let cbor_true = "\xf5" in 33 + let cbor_false = "\xf4" in 34 + (match Simdjsont.Cbor.decode_string Simdjsont.Codec.bool cbor_true with 35 + | Ok b -> Alcotest.(check bool) "decode true" true b 36 + | Error e -> Alcotest.fail ("decode_true failed: " ^ e)); 37 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.bool cbor_false with 38 + | Ok b -> Alcotest.(check bool) "decode false" false b 39 + | Error e -> Alcotest.fail ("decode_false failed: " ^ e) 40 + 41 + let test_decode_null () = 42 + (* CBOR null: 0xf6 *) 43 + let cbor_null = "\xf6" in 44 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.null cbor_null with 45 + | Ok () -> () 46 + | Error e -> Alcotest.fail ("decode_null failed: " ^ e) 47 + 48 + let test_decode_map () = 49 + (* CBOR {"a": 1}: 0xa1 0x61 a 0x01 *) 50 + let cbor_map = "\xa1\x61a\x01" in 51 + let codec = 52 + Simdjsont.Codec.Obj.field (fun a -> a) 53 + |> Simdjsont.Codec.Obj.mem "a" Simdjsont.Codec.int ~enc:Fun.id 54 + |> Simdjsont.Codec.Obj.finish 55 + in 56 + match Simdjsont.Cbor.decode_string codec cbor_map with 57 + | Ok n -> Alcotest.(check int) "decode map" 1 n 58 + | Error e -> Alcotest.fail ("decode_map failed: " ^ e) 59 + 60 + let cbor_tests = 61 + [ 62 + ("decode int", `Quick, test_decode_int); 63 + ("decode small int", `Quick, test_decode_small_int); 64 + ("decode string", `Quick, test_decode_string); 65 + ("decode array", `Quick, test_decode_array); 66 + ("decode bool", `Quick, test_decode_bool); 67 + ("decode null", `Quick, test_decode_null); 68 + ("decode map", `Quick, test_decode_map); 69 + ] 70 + 71 + let () = Alcotest.run "cbor" [ ("cbor", cbor_tests) ]
+335
test/test_cbor_compliance.ml
··· 1 + (* RFC 8949 Appendix A - https://github.com/cbor/test-vectors *) 2 + 3 + let hex_to_bytes hex = 4 + let len = String.length hex in 5 + if len mod 2 <> 0 then failwith "hex_to_bytes: odd length"; 6 + let result = Bytes.create (len / 2) in 7 + for i = 0 to (len / 2) - 1 do 8 + let hi = Char.code hex.[i * 2] in 9 + let lo = Char.code hex.[(i * 2) + 1] in 10 + let hex_val c = 11 + if c >= Char.code '0' && c <= Char.code '9' then c - Char.code '0' 12 + else if c >= Char.code 'a' && c <= Char.code 'f' then 13 + c - Char.code 'a' + 10 14 + else if c >= Char.code 'A' && c <= Char.code 'F' then 15 + c - Char.code 'A' + 10 16 + else 17 + failwith 18 + ("hex_to_bytes: invalid hex char: " ^ String.make 1 (Char.chr c)) 19 + in 20 + Bytes.set result i (Char.chr ((hex_val hi lsl 4) lor hex_val lo)) 21 + done; 22 + Bytes.to_string result 23 + 24 + type test_result = Pass | Fail of string | Skip of string 25 + 26 + let passed = ref 0 27 + let failed = ref 0 28 + let skipped = ref 0 29 + 30 + let run_test name f = 31 + match f () with 32 + | Pass -> 33 + incr passed; 34 + Printf.printf " ✓ %s\n%!" name 35 + | Fail reason -> 36 + incr failed; 37 + Printf.printf " ✗ %s: %s\n%!" name reason 38 + | Skip reason -> 39 + incr skipped; 40 + Printf.printf " ⊘ %s: %s\n%!" name reason 41 + 42 + let decode_int hex expected = 43 + let cbor = hex_to_bytes hex in 44 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.int64 cbor with 45 + | Ok n when n = expected -> Pass 46 + | Ok n -> Fail (Printf.sprintf "expected %Ld, got %Ld" expected n) 47 + | Error e -> Fail e 48 + 49 + let decode_float hex expected = 50 + let cbor = hex_to_bytes hex in 51 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.float cbor with 52 + | Ok f when f = expected -> Pass 53 + | Ok f when Float.is_nan f && Float.is_nan expected -> Pass 54 + | Ok f -> Fail (Printf.sprintf "expected %g, got %g" expected f) 55 + | Error e -> Fail e 56 + 57 + let decode_bool hex expected = 58 + let cbor = hex_to_bytes hex in 59 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.bool cbor with 60 + | Ok b when b = expected -> Pass 61 + | Ok b -> Fail (Printf.sprintf "expected %b, got %b" expected b) 62 + | Error e -> Fail e 63 + 64 + let decode_null hex = 65 + let cbor = hex_to_bytes hex in 66 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.null cbor with 67 + | Ok () -> Pass 68 + | Error e -> Fail e 69 + 70 + let decode_string hex expected = 71 + let cbor = hex_to_bytes hex in 72 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.string cbor with 73 + | Ok s when s = expected -> Pass 74 + | Ok s -> Fail (Printf.sprintf "expected %S, got %S" expected s) 75 + | Error e -> Fail e 76 + 77 + let decode_int_list hex expected = 78 + let cbor = hex_to_bytes hex in 79 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.(list int) cbor with 80 + | Ok lst when lst = expected -> Pass 81 + | Ok lst -> 82 + Fail 83 + (Printf.sprintf "expected [%s], got [%s]" 84 + (String.concat "; " (List.map string_of_int expected)) 85 + (String.concat "; " (List.map string_of_int lst))) 86 + | Error e -> Fail e 87 + 88 + let decode_json hex expected = 89 + let cbor = hex_to_bytes hex in 90 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor with 91 + | Ok v when v = expected -> Pass 92 + | Ok v -> 93 + Fail 94 + (Printf.sprintf "expected %s, got %s" 95 + (Simdjsont.Json.to_string expected) 96 + (Simdjsont.Json.to_string v)) 97 + | Error e -> Fail e 98 + 99 + let test_unsigned_integers () = 100 + Printf.printf "\n=== Unsigned Integers ===\n%!"; 101 + 102 + run_test "0 (0x00)" (fun () -> decode_int "00" 0L); 103 + run_test "1 (0x01)" (fun () -> decode_int "01" 1L); 104 + run_test "10 (0x0a)" (fun () -> decode_int "0a" 10L); 105 + run_test "23 (0x17)" (fun () -> decode_int "17" 23L); 106 + run_test "24 (0x1818)" (fun () -> decode_int "1818" 24L); 107 + run_test "25 (0x1819)" (fun () -> decode_int "1819" 25L); 108 + run_test "100 (0x1864)" (fun () -> decode_int "1864" 100L); 109 + run_test "1000 (0x1903e8)" (fun () -> decode_int "1903e8" 1000L); 110 + run_test "1000000 (0x1a000f4240)" (fun () -> decode_int "1a000f4240" 1000000L); 111 + run_test "1000000000000 (0x1b000000e8d4a51000)" (fun () -> 112 + decode_int "1b000000e8d4a51000" 1000000000000L); 113 + run_test "18446744073709551615 (0x1bffffffffffffffff)" (fun () -> 114 + Skip "uint64 max exceeds int64 range") 115 + 116 + let test_negative_integers () = 117 + Printf.printf "\n=== Negative Integers ===\n%!"; 118 + 119 + run_test "-1 (0x20)" (fun () -> decode_int "20" (-1L)); 120 + run_test "-10 (0x29)" (fun () -> decode_int "29" (-10L)); 121 + run_test "-100 (0x3863)" (fun () -> decode_int "3863" (-100L)); 122 + run_test "-1000 (0x3903e7)" (fun () -> decode_int "3903e7" (-1000L)) 123 + 124 + let test_floats () = 125 + Printf.printf "\n=== Floating Point ===\n%!"; 126 + 127 + run_test "0.0 half (0xf90000)" (fun () -> decode_float "f90000" 0.0); 128 + run_test "-0.0 half (0xf98000)" (fun () -> decode_float "f98000" (-0.0)); 129 + run_test "1.0 half (0xf93c00)" (fun () -> decode_float "f93c00" 1.0); 130 + run_test "1.5 half (0xf93e00)" (fun () -> decode_float "f93e00" 1.5); 131 + run_test "65504.0 half (0xf97bff)" (fun () -> decode_float "f97bff" 65504.0); 132 + run_test "-4.0 half (0xf9c400)" (fun () -> decode_float "f9c400" (-4.0)); 133 + 134 + run_test "100000.0 single (0xfa47c35000)" (fun () -> 135 + decode_float "fa47c35000" 100000.0); 136 + run_test "3.4028234663852886e+38 single (0xfa7f7fffff)" (fun () -> 137 + decode_float "fa7f7fffff" 3.4028234663852886e+38); 138 + 139 + run_test "1.1 double (0xfb3ff199999999999a)" (fun () -> 140 + decode_float "fb3ff199999999999a" 1.1); 141 + run_test "1.0e+300 double (0xfb7e37e43c8800759c)" (fun () -> 142 + decode_float "fb7e37e43c8800759c" 1.0e+300); 143 + run_test "-4.1 double (0xfbc010666666666666)" (fun () -> 144 + decode_float "fbc010666666666666" (-4.1)); 145 + 146 + run_test "Infinity half (0xf97c00)" (fun () -> 147 + decode_float "f97c00" Float.infinity); 148 + run_test "-Infinity half (0xf9fc00)" (fun () -> 149 + decode_float "f9fc00" Float.neg_infinity); 150 + run_test "NaN half (0xf97e00)" (fun () -> decode_float "f97e00" Float.nan); 151 + run_test "Infinity single (0xfa7f800000)" (fun () -> 152 + decode_float "fa7f800000" Float.infinity); 153 + run_test "-Infinity single (0xfaff800000)" (fun () -> 154 + decode_float "faff800000" Float.neg_infinity); 155 + run_test "NaN single (0xfa7fc00000)" (fun () -> 156 + decode_float "fa7fc00000" Float.nan); 157 + run_test "Infinity double (0xfb7ff0000000000000)" (fun () -> 158 + decode_float "fb7ff0000000000000" Float.infinity); 159 + run_test "-Infinity double (0xfbfff0000000000000)" (fun () -> 160 + decode_float "fbfff0000000000000" Float.neg_infinity); 161 + run_test "NaN double (0xfb7ff8000000000000)" (fun () -> 162 + decode_float "fb7ff8000000000000" Float.nan); 163 + 164 + run_test "5.960464477539063e-08 half (0xf90001)" (fun () -> 165 + decode_float "f90001" 5.960464477539063e-08); 166 + run_test "6.103515625e-05 half (0xf90400)" (fun () -> 167 + decode_float "f90400" 6.103515625e-05) 168 + 169 + let test_booleans () = 170 + Printf.printf "\n=== Booleans ===\n%!"; 171 + 172 + run_test "false (0xf4)" (fun () -> decode_bool "f4" false); 173 + run_test "true (0xf5)" (fun () -> decode_bool "f5" true) 174 + 175 + let test_null_undefined () = 176 + Printf.printf "\n=== Null and Undefined ===\n%!"; 177 + 178 + run_test "null (0xf6)" (fun () -> decode_null "f6"); 179 + run_test "undefined (0xf7)" (fun () -> 180 + Skip "undefined not supported in JSON model") 181 + 182 + let test_simple_values () = 183 + Printf.printf "\n=== Simple Values ===\n%!"; 184 + 185 + run_test "simple(16) (0xf0)" (fun () -> Skip "simple values not supported"); 186 + run_test "simple(24) (0xf818)" (fun () -> Skip "simple values not supported"); 187 + run_test "simple(255) (0xf8ff)" (fun () -> Skip "simple values not supported") 188 + 189 + let test_tags () = 190 + Printf.printf "\n=== Tags ===\n%!"; 191 + 192 + run_test "tag 0 datetime (0xc074...)" (fun () -> Skip "tags not supported"); 193 + run_test "tag 1 epoch (0xc11a514b67b0)" (fun () -> Skip "tags not supported"); 194 + run_test "tag 23 base16 (0xd74401020304)" (fun () -> 195 + Skip "tags not supported"); 196 + run_test "tag 24 embedded CBOR (0xd818...)" (fun () -> 197 + Skip "tags not supported"); 198 + run_test "tag 32 URI (0xd82076...)" (fun () -> Skip "tags not supported") 199 + 200 + let test_bignum () = 201 + Printf.printf "\n=== Bignum (Tags 2/3) ===\n%!"; 202 + 203 + run_test "bignum 18446744073709551616 (0xc249...)" (fun () -> 204 + Skip "bignum tags not supported"); 205 + run_test "bignum -18446744073709551617 (0xc349...)" (fun () -> 206 + Skip "bignum tags not supported"); 207 + run_test "-18446744073709551616 (0x3bffffffffffffffff)" (fun () -> 208 + Skip "exceeds int64 range") 209 + 210 + let test_byte_strings () = 211 + Printf.printf "\n=== Byte Strings ===\n%!"; 212 + 213 + run_test "empty byte string h'' (0x40)" (fun () -> 214 + Skip "byte strings not in JSON model"); 215 + run_test "byte string h'01020304' (0x4401020304)" (fun () -> 216 + Skip "byte strings not in JSON model"); 217 + run_test "indefinite byte string (0x5f42010243030405ff)" (fun () -> 218 + Skip "byte strings not in JSON model") 219 + 220 + let test_text_strings () = 221 + Printf.printf "\n=== Text Strings ===\n%!"; 222 + 223 + run_test "empty string \"\" (0x60)" (fun () -> decode_string "60" ""); 224 + run_test "\"a\" (0x6161)" (fun () -> decode_string "6161" "a"); 225 + run_test "\"IETF\" (0x6449455446)" (fun () -> 226 + decode_string "6449455446" "IETF"); 227 + run_test "\"\\\"\\\\\" (0x62225c)" (fun () -> decode_string "62225c" "\"\\"); 228 + run_test "\"ü\" UTF-8 (0x62c3bc)" (fun () -> decode_string "62c3bc" "ü"); 229 + run_test "\"水\" UTF-8 (0x63e6b0b4)" (fun () -> decode_string "63e6b0b4" "水"); 230 + run_test "\"𐅑\" UTF-8 4-byte (0x64f0908591)" (fun () -> 231 + decode_string "64f0908591" "𐅑"); 232 + run_test "indefinite \"streaming\" (0x7f657374726561646d696e67ff)" (fun () -> 233 + decode_string "7f657374726561646d696e67ff" "streaming") 234 + 235 + let test_arrays () = 236 + Printf.printf "\n=== Arrays ===\n%!"; 237 + 238 + run_test "[] empty (0x80)" (fun () -> decode_int_list "80" []); 239 + run_test "[1,2,3] (0x83010203)" (fun () -> 240 + decode_int_list "83010203" [ 1; 2; 3 ]); 241 + run_test "[1,[2,3],[4,5]] nested (0x8301820203820405)" (fun () -> 242 + let open Simdjsont.Json in 243 + decode_json "8301820203820405" 244 + (Array [ Int 1L; Array [ Int 2L; Int 3L ]; Array [ Int 4L; Int 5L ] ])); 245 + run_test "[1..25] long array (0x98190102...)" (fun () -> 246 + decode_int_list 247 + "98190102030405060708090a0b0c0d0e0f101112131415161718181819" 248 + (List.init 25 (fun i -> i + 1))); 249 + run_test "indefinite [] (0x9fff)" (fun () -> decode_int_list "9fff" []); 250 + run_test "indefinite [1,[2,3],[4,5]] (0x9f018202039f0405ffff)" (fun () -> 251 + let open Simdjsont.Json in 252 + decode_json "9f018202039f0405ffff" 253 + (Array [ Int 1L; Array [ Int 2L; Int 3L ]; Array [ Int 4L; Int 5L ] ])); 254 + run_test "mixed definite/indefinite (0x9f01820203820405ff)" (fun () -> 255 + let open Simdjsont.Json in 256 + decode_json "9f01820203820405ff" 257 + (Array [ Int 1L; Array [ Int 2L; Int 3L ]; Array [ Int 4L; Int 5L ] ])); 258 + run_test "definite outer, indefinite inner (0x83018202039f0405ff)" (fun () -> 259 + let open Simdjsont.Json in 260 + decode_json "83018202039f0405ff" 261 + (Array [ Int 1L; Array [ Int 2L; Int 3L ]; Array [ Int 4L; Int 5L ] ])); 262 + run_test "indefinite outer, definite inner (0x83019f0203ff820405)" (fun () -> 263 + let open Simdjsont.Json in 264 + decode_json "83019f0203ff820405" 265 + (Array [ Int 1L; Array [ Int 2L; Int 3L ]; Array [ Int 4L; Int 5L ] ])); 266 + run_test "indefinite [1..25] (0x9f0102...19ff)" (fun () -> 267 + decode_int_list 268 + "9f0102030405060708090a0b0c0d0e0f101112131415161718181819ff" 269 + (List.init 25 (fun i -> i + 1))) 270 + 271 + let test_maps () = 272 + Printf.printf "\n=== Maps ===\n%!"; 273 + 274 + run_test "{} empty (0xa0)" (fun () -> 275 + decode_json "a0" (Simdjsont.Json.Object [])); 276 + run_test "{1:2, 3:4} int keys (0xa201020304)" (fun () -> 277 + Skip "integer keys not supported in JSON model"); 278 + run_test "{\"a\":1, \"b\":[2,3]} (0xa26161016162820203)" (fun () -> 279 + let open Simdjsont.Json in 280 + decode_json "a26161016162820203" 281 + (Object [ ("a", Int 1L); ("b", Array [ Int 2L; Int 3L ]) ])); 282 + run_test "[\"a\", {\"b\":\"c\"}] (0x826161a161626163)" (fun () -> 283 + let open Simdjsont.Json in 284 + decode_json "826161a161626163" 285 + (Array [ String "a"; Object [ ("b", String "c") ] ])); 286 + run_test "{\"a\"..\"e\": \"A\"..\"E\"} 5 pairs (0xa56161614161626142...)" 287 + (fun () -> 288 + let open Simdjsont.Json in 289 + decode_json "a56161614161626142616361436164614461656145" 290 + (Object 291 + [ 292 + ("a", String "A"); 293 + ("b", String "B"); 294 + ("c", String "C"); 295 + ("d", String "D"); 296 + ("e", String "E"); 297 + ])); 298 + run_test "indefinite {\"a\":1, \"b\":[2,3]} (0xbf61610161629f0203ffff)" 299 + (fun () -> 300 + let open Simdjsont.Json in 301 + decode_json "bf61610161629f0203ffff" 302 + (Object [ ("a", Int 1L); ("b", Array [ Int 2L; Int 3L ]) ])); 303 + run_test "[\"a\", indef {\"b\":\"c\"}] (0x826161bf61626163ff)" (fun () -> 304 + let open Simdjsont.Json in 305 + decode_json "826161bf61626163ff" 306 + (Array [ String "a"; Object [ ("b", String "c") ] ])); 307 + run_test "indefinite {\"Fun\":true, \"Amt\":-2} (0xbf6346756ef563416d7421ff)" 308 + (fun () -> 309 + let open Simdjsont.Json in 310 + decode_json "bf6346756ef563416d7421ff" 311 + (Object [ ("Fun", Bool true); ("Amt", Int (-2L)) ])) 312 + 313 + let () = 314 + Printf.printf "RFC 8949 Appendix A Compliance Tests\n"; 315 + Printf.printf "=====================================\n%!"; 316 + 317 + test_unsigned_integers (); 318 + test_negative_integers (); 319 + test_floats (); 320 + test_booleans (); 321 + test_null_undefined (); 322 + test_simple_values (); 323 + test_tags (); 324 + test_bignum (); 325 + test_byte_strings (); 326 + test_text_strings (); 327 + test_arrays (); 328 + test_maps (); 329 + 330 + Printf.printf "\n=====================================\n"; 331 + Printf.printf "Results: %d passed, %d failed, %d skipped\n%!" !passed !failed 332 + !skipped; 333 + Printf.printf "Total: %d tests\n%!" (!passed + !failed + !skipped); 334 + 335 + if !failed > 0 then exit 1 else exit 0
+254
test/test_memory.ml
··· 390 390 (Printf.sprintf "RSS grew by %dKB in second half - possible C memory leak" 391 391 growth_mid_to_after) 392 392 393 + let test_cbor_decode_lifecycle () = 394 + Printf.printf "CBOR decode lifecycle test:\n"; 395 + 396 + let cbor_data = "\xa2\x61a\x01\x61b\x82\x02\x03" in 397 + 398 + let initial_mem = get_mem_usage () in 399 + 400 + for _ = 1 to 10000 do 401 + let _ = Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor_data in 402 + () 403 + done; 404 + 405 + let final_mem = get_mem_usage () in 406 + let diff = final_mem - initial_mem in 407 + 408 + Printf.printf " Initial: %d bytes\n" initial_mem; 409 + Printf.printf " Final: %d bytes\n" final_mem; 410 + Printf.printf " Diff: %d bytes\n" diff; 411 + 412 + if diff > 1_000_000 then 413 + Alcotest.fail 414 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 415 + 416 + let test_cbor_encode_lifecycle () = 417 + Printf.printf "CBOR encode lifecycle test:\n"; 418 + 419 + let data = [ 1; 2; 3; 4; 5 ] in 420 + 421 + let initial_mem = get_mem_usage () in 422 + 423 + for _ = 1 to 10000 do 424 + let _ = Simdjsont.Cbor.encode_string Simdjsont.Codec.(list int) data in 425 + () 426 + done; 427 + 428 + let final_mem = get_mem_usage () in 429 + let diff = final_mem - initial_mem in 430 + 431 + Printf.printf " Initial: %d bytes\n" initial_mem; 432 + Printf.printf " Final: %d bytes\n" final_mem; 433 + Printf.printf " Diff: %d bytes\n" diff; 434 + 435 + if diff > 1_000_000 then 436 + Alcotest.fail 437 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 438 + 439 + let test_cbor_roundtrip_lifecycle () = 440 + Printf.printf "CBOR roundtrip lifecycle test:\n"; 441 + 442 + let codec = Simdjsont.Codec.(list (list int)) in 443 + let data = [ [ 1; 2; 3 ]; [ 4; 5; 6 ]; [ 7; 8; 9 ] ] in 444 + 445 + let initial_mem = get_mem_usage () in 446 + 447 + for _ = 1 to 10000 do 448 + let cbor = Simdjsont.Cbor.encode_string codec data in 449 + let _ = Simdjsont.Cbor.decode_string codec cbor in 450 + () 451 + done; 452 + 453 + let final_mem = get_mem_usage () in 454 + let diff = final_mem - initial_mem in 455 + 456 + Printf.printf " Initial: %d bytes\n" initial_mem; 457 + Printf.printf " Final: %d bytes\n" final_mem; 458 + Printf.printf " Diff: %d bytes\n" diff; 459 + 460 + if diff > 1_000_000 then 461 + Alcotest.fail 462 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 463 + 464 + let test_cbor_complex_structures () = 465 + Printf.printf "CBOR complex structures lifecycle test:\n"; 466 + 467 + let codec = 468 + let open Simdjsont.Codec in 469 + Obj.field (fun id name items -> (id, name, items)) 470 + |> Obj.mem "id" int ~enc:(fun (id, _, _) -> id) 471 + |> Obj.mem "name" string ~enc:(fun (_, name, _) -> name) 472 + |> Obj.mem "items" (list int) ~enc:(fun (_, _, items) -> items) 473 + |> Obj.finish 474 + in 475 + 476 + let initial_mem = get_mem_usage () in 477 + 478 + for i = 1 to 10000 do 479 + let data = (i, Printf.sprintf "item%d" i, [ i; i + 1; i + 2 ]) in 480 + let cbor = Simdjsont.Cbor.encode_string codec data in 481 + let _ = Simdjsont.Cbor.decode_string codec cbor in 482 + () 483 + done; 484 + 485 + let final_mem = get_mem_usage () in 486 + let diff = final_mem - initial_mem in 487 + 488 + Printf.printf " Initial: %d bytes\n" initial_mem; 489 + Printf.printf " Final: %d bytes\n" final_mem; 490 + Printf.printf " Diff: %d bytes\n" diff; 491 + 492 + if diff > 1_000_000 then 493 + Alcotest.fail 494 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 495 + 496 + let test_cbor_streaming_lifecycle () = 497 + Printf.printf "CBOR streaming lifecycle test:\n"; 498 + 499 + let item1 = "\x83\x01\x02\x03" in 500 + let item2 = "\x83\x04\x05\x06" in 501 + let item3 = "\x83\x07\x08\x09" in 502 + let multi_cbor = item1 ^ item2 ^ item3 in 503 + 504 + let initial_mem = get_mem_usage () in 505 + 506 + for _ = 1 to 5000 do 507 + let seq = Simdjsont.Cbor.to_seq Simdjsont.Codec.(list int) multi_cbor in 508 + Seq.iter (function Ok _ -> () | Error _ -> ()) seq 509 + done; 510 + 511 + let final_mem = get_mem_usage () in 512 + let diff = final_mem - initial_mem in 513 + 514 + Printf.printf " Initial: %d bytes\n" initial_mem; 515 + Printf.printf " Final: %d bytes\n" final_mem; 516 + Printf.printf " Diff: %d bytes\n" diff; 517 + 518 + if diff > 1_000_000 then 519 + Alcotest.fail 520 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 521 + 522 + let test_cbor_decode_discard_no_growth () = 523 + Printf.printf "CBOR decode-and-discard memory growth test:\n"; 524 + 525 + let make_cbor i = 526 + let codec = 527 + let open Simdjsont.Codec in 528 + Obj.field (fun id data -> (id, data)) 529 + |> Obj.mem "id" int ~enc:fst 530 + |> Obj.mem "data" (list int) ~enc:snd 531 + |> Obj.finish 532 + in 533 + Simdjsont.Cbor.encode_string codec (i, [ 1; 2; 3; 4; 5 ]) 534 + in 535 + 536 + Gc.compact (); 537 + let rss_before = get_rss_kb () in 538 + 539 + for i = 1 to 50000 do 540 + let cbor = make_cbor i in 541 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor with 542 + | Ok _ -> () 543 + | Error _ -> () 544 + done; 545 + 546 + Gc.compact (); 547 + let rss_mid = get_rss_kb () in 548 + 549 + for i = 50001 to 100000 do 550 + let cbor = make_cbor i in 551 + match Simdjsont.Cbor.decode_string Simdjsont.Codec.value cbor with 552 + | Ok _ -> () 553 + | Error _ -> () 554 + done; 555 + 556 + Gc.compact (); 557 + let rss_after = get_rss_kb () in 558 + 559 + Printf.printf " RSS: before=%dKB mid=%dKB after=%dKB\n" rss_before 560 + rss_mid rss_after; 561 + 562 + let growth_mid_to_after = rss_after - rss_mid in 563 + Printf.printf " RSS growth (mid->after): %dKB\n" growth_mid_to_after; 564 + 565 + if growth_mid_to_after > 10000 then 566 + Alcotest.fail 567 + (Printf.sprintf "RSS grew by %dKB in second half - possible C memory leak" 568 + growth_mid_to_after) 569 + 570 + let test_cbor_half_float_lifecycle () = 571 + Printf.printf "CBOR half-precision float lifecycle test:\n"; 572 + 573 + let half_floats = 574 + [ 575 + "\xf9\x3c\x00"; 576 + "\xf9\x3e\x00"; 577 + "\xf9\x7b\xff"; 578 + "\xf9\xc4\x00"; 579 + "\xf9\x7c\x00"; 580 + "\xf9\xfc\x00"; 581 + ] 582 + in 583 + 584 + let initial_mem = get_mem_usage () in 585 + 586 + for _ = 1 to 10000 do 587 + List.iter 588 + (fun cbor -> 589 + let _ = Simdjsont.Cbor.decode_string Simdjsont.Codec.float cbor in 590 + ()) 591 + half_floats 592 + done; 593 + 594 + let final_mem = get_mem_usage () in 595 + let diff = final_mem - initial_mem in 596 + 597 + Printf.printf " Initial: %d bytes\n" initial_mem; 598 + Printf.printf " Final: %d bytes\n" final_mem; 599 + Printf.printf " Diff: %d bytes\n" diff; 600 + 601 + if diff > 1_000_000 then 602 + Alcotest.fail 603 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 604 + 605 + let test_cbor_indefinite_lifecycle () = 606 + Printf.printf "CBOR indefinite-length lifecycle test:\n"; 607 + 608 + let indef_array = "\x9f\x01\x02\x03\x04\x05\xff" in 609 + let indef_map = "\xbf\x61a\x01\x61b\x02\xff" in 610 + let indef_string = "\x7f\x63foo\x63bar\xff" in 611 + 612 + let initial_mem = get_mem_usage () in 613 + 614 + for _ = 1 to 10000 do 615 + let _ = 616 + Simdjsont.Cbor.decode_string Simdjsont.Codec.(list int) indef_array 617 + in 618 + let _ = Simdjsont.Cbor.decode_string Simdjsont.Codec.value indef_map in 619 + let _ = Simdjsont.Cbor.decode_string Simdjsont.Codec.string indef_string in 620 + () 621 + done; 622 + 623 + let final_mem = get_mem_usage () in 624 + let diff = final_mem - initial_mem in 625 + 626 + Printf.printf " Initial: %d bytes\n" initial_mem; 627 + Printf.printf " Final: %d bytes\n" final_mem; 628 + Printf.printf " Diff: %d bytes\n" diff; 629 + 630 + if diff > 1_000_000 then 631 + Alcotest.fail 632 + (Printf.sprintf "Memory grew by %d bytes - possible leak" diff) 633 + 393 634 let () = 394 635 Printf.printf "=== Memory Leak Tests ===\n\n"; 395 636 ··· 411 652 ( "raw parse discard no growth", 412 653 `Slow, 413 654 test_raw_parse_discard_no_growth ); 655 + ] ); 656 + ( "cbor", 657 + [ 658 + ("cbor decode lifecycle", `Quick, test_cbor_decode_lifecycle); 659 + ("cbor encode lifecycle", `Quick, test_cbor_encode_lifecycle); 660 + ("cbor roundtrip lifecycle", `Quick, test_cbor_roundtrip_lifecycle); 661 + ("cbor complex structures", `Quick, test_cbor_complex_structures); 662 + ("cbor streaming lifecycle", `Quick, test_cbor_streaming_lifecycle); 663 + ( "cbor decode discard no growth", 664 + `Slow, 665 + test_cbor_decode_discard_no_growth ); 666 + ("cbor half float lifecycle", `Quick, test_cbor_half_float_lifecycle); 667 + ("cbor indefinite lifecycle", `Quick, test_cbor_indefinite_lifecycle); 414 668 ] ); 415 669 ]