Minimal SQLite key-value store for OCaml
at main 654 lines 20 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6(* Pure OCaml B-tree backed key-value store with SQLite-compatible file format. 7 8 The file format is a valid SQLite database: 9 - Page 1: 100-byte database header + sqlite_master table B-tree 10 - Page 2+: user data tables using Table B-tree with Record encoding 11 - In-memory hashtable for O(1) key→rowid lookups *) 12 13let page_size = 4096 14let magic = "SQLite format 3\000" 15 16(* Re-export Btree.Record.value so users don't need to depend on btree *) 17 18type value = Btree.Record.value = 19 | Vnull 20 | Vint of int64 21 | Vfloat of float 22 | Vblob of string 23 | Vtext of string 24 25let pp_value = Btree.Record.pp_value 26 27(* Schema types *) 28 29type column = { 30 col_name : string; 31 col_affinity : string; 32 col_is_rowid_alias : bool; 33} 34 35type schema = { tbl_name : string; columns : column list; sql : string } 36 37(* Per-table state *) 38type kv_table = { 39 btree : Btree.Table.t; 40 keys : (string, int64) Hashtbl.t; 41 mutable next_rowid : int64; 42} 43 44type generic_table = { g_btree : Btree.Table.t; g_schema : schema } 45 46type t = { 47 pager : Btree.Pager.t; 48 mutable data : kv_table option; 49 mutable named_tables : (string * kv_table) list; 50 mutable all_tables : generic_table list; 51} 52 53let pp ppf t = 54 let names = List.map (fun gt -> gt.g_schema.tbl_name) t.all_tables in 55 Fmt.pf ppf "sqlite(%a)" Fmt.(list ~sep:(any ",") string) names 56 57(* CREATE TABLE parser *) 58 59let is_space = function ' ' | '\t' | '\n' | '\r' -> true | _ -> false 60 61(* Split a string by commas, respecting nested parentheses *) 62let split_respecting_parens s = 63 let len = String.length s in 64 let buf = Buffer.create 64 in 65 let parts = ref [] in 66 let depth = ref 0 in 67 for i = 0 to len - 1 do 68 match s.[i] with 69 | '(' -> 70 incr depth; 71 Buffer.add_char buf '(' 72 | ')' -> 73 decr depth; 74 Buffer.add_char buf ')' 75 | ',' when !depth = 0 -> 76 parts := String.trim (Buffer.contents buf) :: !parts; 77 Buffer.clear buf 78 | c -> Buffer.add_char buf c 79 done; 80 let last = String.trim (Buffer.contents buf) in 81 if last <> "" then parts := last :: !parts; 82 List.rev !parts 83 84(* Find the position of the matching closing paren *) 85let matching_paren s start = 86 let len = String.length s in 87 let rec loop i depth = 88 if i >= len then None 89 else 90 match s.[i] with 91 | '(' -> loop (i + 1) (depth + 1) 92 | ')' -> if depth = 0 then Some i else loop (i + 1) (depth - 1) 93 | '\'' -> 94 (* Skip single-quoted string literal *) 95 let rec skip j = 96 if j >= len then loop j 0 97 else if s.[j] = '\'' then 98 if j + 1 < len && s.[j + 1] = '\'' then skip (j + 2) 99 else loop (j + 1) depth 100 else skip (j + 1) 101 in 102 skip (i + 1) 103 | '"' -> 104 let rec skip j = 105 if j >= len then loop j 0 106 else if s.[j] = '"' then loop (j + 1) depth 107 else skip (j + 1) 108 in 109 skip (i + 1) 110 | _ -> loop (i + 1) depth 111 in 112 loop (start + 1) 0 113 114(* Consume parenthesized content into buf, advancing i past the closing paren *) 115let add_paren_content buf s i len = 116 Buffer.add_char buf '('; 117 let depth = ref 1 in 118 incr i; 119 while !i < len && !depth > 0 do 120 (match s.[!i] with '(' -> incr depth | ')' -> decr depth | _ -> ()); 121 Buffer.add_char buf s.[!i]; 122 incr i 123 done 124 125(* Tokenize a column definition into words, handling quoted identifiers 126 and parenthesized type parameters like DECIMAL(10,2) *) 127let tokenize s = 128 let len = String.length s in 129 let buf = Buffer.create 16 in 130 let tokens = ref [] in 131 let flush () = 132 if Buffer.length buf > 0 then begin 133 tokens := Buffer.contents buf :: !tokens; 134 Buffer.clear buf 135 end 136 in 137 let i = ref 0 in 138 while !i < len do 139 match s.[!i] with 140 | c when is_space c -> 141 flush (); 142 incr i 143 | '(' -> 144 (* Include parenthesized content as part of current token *) 145 add_paren_content buf s i len 146 | '"' -> 147 (* Double-quoted identifier: strip quotes *) 148 incr i; 149 while !i < len && s.[!i] <> '"' do 150 Buffer.add_char buf s.[!i]; 151 incr i 152 done; 153 if !i < len then incr i 154 | '[' -> 155 (* Bracket-quoted identifier: strip brackets *) 156 incr i; 157 while !i < len && s.[!i] <> ']' do 158 Buffer.add_char buf s.[!i]; 159 incr i 160 done; 161 if !i < len then incr i 162 | '`' -> 163 (* Backtick-quoted identifier: strip backticks *) 164 incr i; 165 while !i < len && s.[!i] <> '`' do 166 Buffer.add_char buf s.[!i]; 167 incr i 168 done; 169 if !i < len then incr i 170 | c -> 171 Buffer.add_char buf c; 172 incr i 173 done; 174 flush (); 175 List.rev !tokens 176 177(* Keywords that start constraint clauses in column definitions *) 178let constraint_keywords = 179 [ 180 "PRIMARY"; 181 "NOT"; 182 "UNIQUE"; 183 "DEFAULT"; 184 "CHECK"; 185 "REFERENCES"; 186 "COLLATE"; 187 "GENERATED"; 188 "AUTOINCREMENT"; 189 "ASC"; 190 "DESC"; 191 "ON"; 192 "CONSTRAINT"; 193 ] 194 195(* Check if a column def is a table-level constraint *) 196let is_table_constraint s = 197 let upper = String.uppercase_ascii (String.trim s) in 198 let starts_with prefix = 199 String.length upper >= String.length prefix 200 && String.sub upper 0 (String.length prefix) = prefix 201 in 202 starts_with "PRIMARY KEY(" || starts_with "PRIMARY KEY " 203 || starts_with "UNIQUE(" || starts_with "UNIQUE " || starts_with "FOREIGN KEY" 204 || starts_with "CONSTRAINT " || starts_with "CHECK(" || starts_with "CHECK " 205 206let parse_column_def s = 207 if is_table_constraint s then None 208 else 209 let tokens = tokenize s in 210 match tokens with 211 | [] -> None 212 | name :: rest -> 213 (* Collect type tokens until we hit a constraint keyword *) 214 let rec collect_type acc = function 215 | [] -> (List.rev acc, []) 216 | tok :: _ as all 217 when List.mem (String.uppercase_ascii tok) constraint_keywords -> 218 (List.rev acc, all) 219 | tok :: tl -> collect_type (tok :: acc) tl 220 in 221 let type_tokens, constraint_tokens = collect_type [] rest in 222 let affinity = String.concat " " type_tokens in 223 (* INTEGER PRIMARY KEY is a rowid alias *) 224 let is_rowid_alias = 225 String.uppercase_ascii affinity = "INTEGER" 226 && 227 let rec has_pk = function 228 | "PRIMARY" :: "KEY" :: _ -> true 229 | _ :: tl -> has_pk tl 230 | [] -> false 231 in 232 has_pk (List.map String.uppercase_ascii constraint_tokens) 233 in 234 Some 235 { 236 col_name = name; 237 col_affinity = affinity; 238 col_is_rowid_alias = is_rowid_alias; 239 } 240 241let parse_create_table sql = 242 match String.index_opt sql '(' with 243 | None -> [] 244 | Some start -> ( 245 match matching_paren sql start with 246 | None -> [] 247 | Some body_end -> 248 let body = String.sub sql (start + 1) (body_end - start - 1) in 249 let parts = split_respecting_parens body in 250 List.filter_map parse_column_def parts) 251 252(* Standard kv table schema *) 253let kv_columns = 254 [ 255 { col_name = "key"; col_affinity = "TEXT"; col_is_rowid_alias = false }; 256 { col_name = "value"; col_affinity = "BLOB"; col_is_rowid_alias = false }; 257 ] 258 259let table_sql name = Fmt.str "CREATE TABLE %s (key TEXT, value BLOB)" name 260 261let kv_schema name = 262 { tbl_name = name; columns = kv_columns; sql = table_sql name } 263 264(* Decode a Record payload into (key, value) *) 265let decode_kv payload = 266 match Btree.Record.decode payload with 267 | [ Btree.Record.Vtext k; Btree.Record.Vblob v ] 268 | [ Btree.Record.Vtext k; Btree.Record.Vtext v ] -> 269 Some (k, v) 270 | _ -> None 271 272(* Scan a table B-tree to build key→rowid map *) 273let scan_table btree = 274 let keys = Hashtbl.create 64 in 275 let next_rowid = ref 1L in 276 Btree.Table.iter btree (fun rowid payload -> 277 (match decode_kv payload with 278 | Some (k, _) -> Hashtbl.replace keys k rowid 279 | None -> ()); 280 if rowid >= !next_rowid then next_rowid := Int64.add rowid 1L); 281 (keys, !next_rowid) 282 283(* Write the 100-byte SQLite database header *) 284let write_db_header buf ~page_count = 285 Bytes.blit_string magic 0 buf 0 16; 286 Btree.Page.set_u16_be buf 16 page_size; 287 Bytes.set_uint8 buf 18 1; 288 Bytes.set_uint8 buf 19 1; 289 Bytes.set_uint8 buf 20 0; 290 Bytes.set_uint8 buf 21 64; 291 Bytes.set_uint8 buf 22 32; 292 Bytes.set_uint8 buf 23 32; 293 Btree.Page.set_u32_be buf 24 1; 294 Btree.Page.set_u32_be buf 28 page_count; 295 Btree.Page.set_u32_be buf 32 0; 296 Btree.Page.set_u32_be buf 36 0; 297 Btree.Page.set_u32_be buf 40 1; 298 Btree.Page.set_u32_be buf 44 4; 299 Btree.Page.set_u32_be buf 48 0; 300 Btree.Page.set_u32_be buf 52 0; 301 Btree.Page.set_u32_be buf 56 1; 302 Btree.Page.set_u32_be buf 60 0; 303 Btree.Page.set_u32_be buf 64 0; 304 Btree.Page.set_u32_be buf 68 0; 305 (* Offsets 72-91: reserved for expansion, must be zero (spec section 1.2) *) 306 Bytes.fill buf 72 20 '\x00'; 307 Btree.Page.set_u32_be buf 92 1; 308 Btree.Page.set_u32_be buf 96 3046000 309 310(* Build a sqlite_master table leaf cell *) 311let master_cell ~rowid ~name ~root_page ~sql = 312 let record = 313 Btree.Record.encode 314 [ 315 Btree.Record.Vtext "table"; 316 Btree.Record.Vtext name; 317 Btree.Record.Vtext name; 318 Btree.Record.Vint (Int64.of_int root_page); 319 Btree.Record.Vtext sql; 320 ] 321 in 322 let payload_varint = 323 Btree.Varint.encode (Int64.of_int (String.length record)) 324 in 325 let rowid_varint = Btree.Varint.encode rowid in 326 payload_varint ^ rowid_varint ^ record 327 328(* Write page 1: db header + sqlite_master leaf table *) 329let rebuild_page1 t = 330 let buf = Bytes.create page_size in 331 write_db_header buf ~page_count:(Btree.Pager.page_count t.pager); 332 (* Leaf table header at offset 100 *) 333 Bytes.set_uint8 buf 100 0x0d; 334 Btree.Page.set_u16_be buf 101 0; 335 Bytes.set_uint8 buf 107 0; 336 (* Collect all tables from all_tables *) 337 let tables = 338 List.map 339 (fun gt -> 340 (gt.g_schema.tbl_name, Btree.Table.root_page gt.g_btree, gt.g_schema.sql)) 341 t.all_tables 342 in 343 let n = List.length tables in 344 Btree.Page.set_u16_be buf 103 n; 345 (* Build cells from end of page *) 346 let cell_content_start = ref page_size in 347 let cell_ptrs = Array.make n 0 in 348 List.iteri 349 (fun i (name, root_page, sql) -> 350 let cell = 351 master_cell ~rowid:(Int64.of_int (i + 1)) ~name ~root_page ~sql 352 in 353 let cell_len = String.length cell in 354 cell_content_start := !cell_content_start - cell_len; 355 Bytes.blit_string cell 0 buf !cell_content_start cell_len; 356 cell_ptrs.(i) <- !cell_content_start) 357 tables; 358 Btree.Page.set_u16_be buf 105 !cell_content_start; 359 (* Cell pointer array at offset 108 (100 + 8 byte leaf header) *) 360 Array.iteri 361 (fun i ptr -> Btree.Page.set_u16_be buf (108 + (i * 2)) ptr) 362 cell_ptrs; 363 Btree.Pager.write t.pager 1 (Bytes.unsafe_to_string buf) 364 365(* Initialize a new kv_table on a fresh page *) 366let new_kv_table pager = 367 let btree = Btree.Table.v pager in 368 { btree; keys = Hashtbl.create 64; next_rowid = 1L } 369 370let mkdirs_for path = 371 match Eio.Path.split path with 372 | None -> () 373 | Some (fs, p) -> ( 374 let dir = Filename.dirname p in 375 if dir <> "." && dir <> "/" then 376 try Eio.Path.mkdirs ~exists_ok:true ~perm:0o755 Eio.Path.(fs / dir) 377 with Eio.Io _ -> ()) 378 379let v ~sw path = 380 mkdirs_for path; 381 let file = 382 Eio.Path.open_out ~sw ~create:(`Or_truncate 0o644) path |> fun f -> 383 (f :> Eio.File.rw_ty Eio.Resource.t) 384 in 385 let pager = Btree.Pager.v ~page_size file in 386 (* Allocate page 1 for db header + sqlite_master *) 387 let _page1 = Btree.Pager.allocate pager in 388 (* Create kv data table on page 2 *) 389 let kv = new_kv_table pager in 390 let gt = { g_btree = kv.btree; g_schema = kv_schema "kv" } in 391 let t = { pager; data = Some kv; named_tables = []; all_tables = [ gt ] } in 392 rebuild_page1 t; 393 t 394 395(* Extract named kv tables (non-kv tables with kv schema) from all_tables *) 396let extract_named_kv_tables all_tables = 397 List.filter_map 398 (fun gt -> 399 let name = gt.g_schema.tbl_name in 400 if name = "kv" then None 401 else 402 match gt.g_schema.columns with 403 | [ 404 { col_name = "key"; col_affinity = "TEXT"; _ }; 405 { col_name = "value"; col_affinity = "BLOB"; _ }; 406 ] -> 407 let keys, next_rowid = scan_table gt.g_btree in 408 Some (name, { btree = gt.g_btree; keys; next_rowid }) 409 | _ -> None) 410 all_tables 411 412let in_memory () = 413 let pager = Btree.Pager.mem ~page_size () in 414 let _page1 = Btree.Pager.allocate pager in 415 let kv = new_kv_table pager in 416 let gt = { g_btree = kv.btree; g_schema = kv_schema "kv" } in 417 let t = { pager; data = Some kv; named_tables = []; all_tables = [ gt ] } in 418 rebuild_page1 t; 419 t 420 421let open_ ~sw path = 422 let file = 423 Eio.Path.open_out ~sw ~create:`Never path |> fun f -> 424 (f :> Eio.File.rw_ty Eio.Resource.t) 425 in 426 let pager = Btree.Pager.v ~page_size file in 427 if Btree.Pager.page_count pager = 0 then failwith "Database file is empty"; 428 (* Read page 1 and validate *) 429 let page1 = Btree.Pager.read pager 1 in 430 if String.sub page1 0 16 <> magic then failwith "Not a SQLite database"; 431 let ps = Btree.Page.u16_be page1 16 in 432 if ps <> page_size then Fmt.failwith "Unsupported page size: %d" ps; 433 (* Parse sqlite_master at offset 100 *) 434 let header = Btree.Page.parse_header page1 100 in 435 let ptrs = Btree.Page.cell_pointers page1 100 header in 436 let raw_tables = ref [] in 437 for i = 0 to header.Btree.Page.cell_count - 1 do 438 let cell, _ = 439 Btree.Cell.parse_table_leaf page1 ptrs.(i) ~usable_size:page_size 440 in 441 match Btree.Record.decode cell.Btree.Cell.payload with 442 | [ 443 Btree.Record.Vtext "table"; 444 Btree.Record.Vtext name; 445 _; 446 Btree.Record.Vint root; 447 Btree.Record.Vtext sql; 448 ] -> 449 raw_tables := (name, Int64.to_int root, sql) :: !raw_tables 450 | _ -> () 451 done; 452 let raw_tables = List.rev !raw_tables in 453 (* Build generic_table for every table *) 454 let all_tables = 455 List.map 456 (fun (name, root, sql) -> 457 let btree = Btree.Table.open_ pager ~root_page:root in 458 let columns = parse_create_table sql in 459 let schema = { tbl_name = name; columns; sql } in 460 { g_btree = btree; g_schema = schema }) 461 raw_tables 462 in 463 (* Try to find "kv" table for backward compat *) 464 let data = 465 match List.find_opt (fun gt -> gt.g_schema.tbl_name = "kv") all_tables with 466 | None -> None 467 | Some gt -> 468 let keys, next_rowid = scan_table gt.g_btree in 469 Some { btree = gt.g_btree; keys; next_rowid } 470 in 471 let named = extract_named_kv_tables all_tables in 472 { pager; data; named_tables = named; all_tables } 473 474(* Get the kv_table, raising if no kv table exists *) 475let kv t = 476 match t.data with 477 | Some d -> d 478 | None -> failwith "No 'kv' table in this database" 479 480(* KV operations *) 481 482let find t key = 483 let d = kv t in 484 match Hashtbl.find_opt d.keys key with 485 | None -> None 486 | Some rowid -> ( 487 match Btree.Table.find d.btree rowid with 488 | None -> None 489 | Some payload -> ( 490 match decode_kv payload with Some (_, v) -> Some v | None -> None)) 491 492let put t key value = 493 let kv = kv t in 494 let record = 495 Btree.Record.encode [ Btree.Record.Vtext key; Btree.Record.Vblob value ] 496 in 497 (match Hashtbl.find_opt kv.keys key with 498 | Some old_rowid -> Btree.Table.delete kv.btree old_rowid 499 | None -> ()); 500 let rowid = kv.next_rowid in 501 kv.next_rowid <- Int64.add kv.next_rowid 1L; 502 Btree.Table.insert kv.btree ~rowid record; 503 Hashtbl.replace kv.keys key rowid 504 505let delete t key = 506 let d = kv t in 507 match Hashtbl.find_opt d.keys key with 508 | None -> () 509 | Some rowid -> 510 Btree.Table.delete d.btree rowid; 511 Hashtbl.remove d.keys key 512 513let mem t key = 514 let d = kv t in 515 Hashtbl.mem d.keys key 516 517let iter t ~f = 518 let d = kv t in 519 Btree.Table.iter d.btree (fun _rowid payload -> 520 match decode_kv payload with Some (k, v) -> f k v | None -> ()) 521 522let fold t ~init ~f = 523 let acc = ref init in 524 iter t ~f:(fun k v -> acc := f k v !acc); 525 !acc 526 527let sync t = 528 rebuild_page1 t; 529 Btree.Pager.sync t.pager 530 531let close t = sync t 532 533(* Generic read API *) 534 535let tables t = List.map (fun gt -> gt.g_schema) t.all_tables 536 537let table t name = 538 match List.find_opt (fun gt -> gt.g_schema.tbl_name = name) t.all_tables with 539 | Some gt -> gt 540 | None -> Fmt.failwith "No table %S found in database" name 541 542(* Find the index of the rowid alias column, if any *) 543let rowid_alias_index columns = 544 let rec find i = function 545 | [] -> None 546 | c :: _ when c.col_is_rowid_alias -> Some i 547 | _ :: rest -> find (i + 1) rest 548 in 549 find 0 columns 550 551(* Apply rowid substitution and trailing Vnull padding *) 552let fixup_values ~schema ~rowid values = 553 let n_cols = List.length schema.columns in 554 let len = List.length values in 555 let values = 556 if len < n_cols then 557 values @ List.init (n_cols - len) (fun _ -> Btree.Record.Vnull) 558 else values 559 in 560 match rowid_alias_index schema.columns with 561 | None -> values 562 | Some idx -> 563 List.mapi 564 (fun i v -> 565 if i = idx then 566 match v with 567 | Btree.Record.Vnull -> Btree.Record.Vint rowid 568 | v -> v 569 else v) 570 values 571 572let iter_table t name ~f = 573 let gt = table t name in 574 let schema = gt.g_schema in 575 Btree.Table.iter gt.g_btree (fun rowid payload -> 576 let values = Btree.Record.decode payload in 577 let values = fixup_values ~schema ~rowid values in 578 f rowid values) 579 580let fold_table t name ~init ~f = 581 let acc = ref init in 582 iter_table t name ~f:(fun rowid values -> acc := f rowid values !acc); 583 !acc 584 585let read_table t name = 586 fold_table t name ~init:[] ~f:(fun rowid values acc -> (rowid, values) :: acc) 587 |> List.rev 588 589(* Namespaced Tables *) 590 591module Table = struct 592 type db = t 593 type t = { parent : db; name : string; kv : kv_table } 594 595 let valid_name name = 596 String.length name > 0 597 && (let first = name.[0] in 598 (first >= 'a' && first <= 'z') 599 || (first >= 'A' && first <= 'Z') 600 || first = '_') 601 && String.for_all 602 (fun c -> 603 (c >= 'a' && c <= 'z') 604 || (c >= 'A' && c <= 'Z') 605 || (c >= '0' && c <= '9') 606 || c = '_') 607 name 608 609 let create parent ~name = 610 if not (valid_name name) then Fmt.invalid_arg "Invalid table name: %S" name; 611 match List.assoc_opt name parent.named_tables with 612 | Some kv -> { parent; name; kv } 613 | None -> 614 let kv = new_kv_table parent.pager in 615 let gt = { g_btree = kv.btree; g_schema = kv_schema name } in 616 parent.named_tables <- (name, kv) :: parent.named_tables; 617 parent.all_tables <- parent.all_tables @ [ gt ]; 618 { parent; name; kv } 619 620 let find t key = 621 match Hashtbl.find_opt t.kv.keys key with 622 | None -> None 623 | Some rowid -> ( 624 match Btree.Table.find t.kv.btree rowid with 625 | None -> None 626 | Some payload -> ( 627 match decode_kv payload with Some (_, v) -> Some v | None -> None)) 628 629 let put t key value = 630 let kv = t.kv in 631 let record = 632 Btree.Record.encode [ Btree.Record.Vtext key; Btree.Record.Vblob value ] 633 in 634 (match Hashtbl.find_opt kv.keys key with 635 | Some old_rowid -> Btree.Table.delete kv.btree old_rowid 636 | None -> ()); 637 let rowid = kv.next_rowid in 638 kv.next_rowid <- Int64.add kv.next_rowid 1L; 639 Btree.Table.insert kv.btree ~rowid record; 640 Hashtbl.replace kv.keys key rowid 641 642 let delete t key = 643 match Hashtbl.find_opt t.kv.keys key with 644 | None -> () 645 | Some rowid -> 646 Btree.Table.delete t.kv.btree rowid; 647 Hashtbl.remove t.kv.keys key 648 649 let mem t key = Hashtbl.mem t.kv.keys key 650 651 let iter t ~f = 652 Btree.Table.iter t.kv.btree (fun _rowid payload -> 653 match decode_kv payload with Some (k, v) -> f k v | None -> ()) 654end