(*--------------------------------------------------------------------------- Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. SPDX-License-Identifier: MIT ---------------------------------------------------------------------------*) (* Pure OCaml B-tree backed key-value store with SQLite-compatible file format. The file format is a valid SQLite database: - Page 1: 100-byte database header + sqlite_master table B-tree - Page 2+: user data tables using Table B-tree with Record encoding - In-memory hashtable for O(1) key→rowid lookups *) let page_size = 4096 let magic = "SQLite format 3\000" (* Re-export Btree.Record.value so users don't need to depend on btree *) type value = Btree.Record.value = | Vnull | Vint of int64 | Vfloat of float | Vblob of string | Vtext of string let pp_value = Btree.Record.pp_value (* Schema types *) type column = { col_name : string; col_affinity : string; col_is_rowid_alias : bool; } type schema = { tbl_name : string; columns : column list; sql : string } (* Per-table state *) type kv_table = { btree : Btree.Table.t; keys : (string, int64) Hashtbl.t; mutable next_rowid : int64; } type generic_table = { g_btree : Btree.Table.t; g_schema : schema } type t = { pager : Btree.Pager.t; mutable data : kv_table option; mutable named_tables : (string * kv_table) list; mutable all_tables : generic_table list; } let pp ppf t = let names = List.map (fun gt -> gt.g_schema.tbl_name) t.all_tables in Fmt.pf ppf "sqlite(%a)" Fmt.(list ~sep:(any ",") string) names (* CREATE TABLE parser *) let is_space = function ' ' | '\t' | '\n' | '\r' -> true | _ -> false (* Split a string by commas, respecting nested parentheses *) let split_respecting_parens s = let len = String.length s in let buf = Buffer.create 64 in let parts = ref [] in let depth = ref 0 in for i = 0 to len - 1 do match s.[i] with | '(' -> incr depth; Buffer.add_char buf '(' | ')' -> decr depth; Buffer.add_char buf ')' | ',' when !depth = 0 -> parts := String.trim (Buffer.contents buf) :: !parts; Buffer.clear buf | c -> Buffer.add_char buf c done; let last = String.trim (Buffer.contents buf) in if last <> "" then parts := last :: !parts; List.rev !parts (* Find the position of the matching closing paren *) let matching_paren s start = let len = String.length s in let rec loop i depth = if i >= len then None else match s.[i] with | '(' -> loop (i + 1) (depth + 1) | ')' -> if depth = 0 then Some i else loop (i + 1) (depth - 1) | '\'' -> (* Skip single-quoted string literal *) let rec skip j = if j >= len then loop j 0 else if s.[j] = '\'' then if j + 1 < len && s.[j + 1] = '\'' then skip (j + 2) else loop (j + 1) depth else skip (j + 1) in skip (i + 1) | '"' -> let rec skip j = if j >= len then loop j 0 else if s.[j] = '"' then loop (j + 1) depth else skip (j + 1) in skip (i + 1) | _ -> loop (i + 1) depth in loop (start + 1) 0 (* Consume parenthesized content into buf, advancing i past the closing paren *) let add_paren_content buf s i len = Buffer.add_char buf '('; let depth = ref 1 in incr i; while !i < len && !depth > 0 do (match s.[!i] with '(' -> incr depth | ')' -> decr depth | _ -> ()); Buffer.add_char buf s.[!i]; incr i done (* Tokenize a column definition into words, handling quoted identifiers and parenthesized type parameters like DECIMAL(10,2) *) let tokenize s = let len = String.length s in let buf = Buffer.create 16 in let tokens = ref [] in let flush () = if Buffer.length buf > 0 then begin tokens := Buffer.contents buf :: !tokens; Buffer.clear buf end in let i = ref 0 in while !i < len do match s.[!i] with | c when is_space c -> flush (); incr i | '(' -> (* Include parenthesized content as part of current token *) add_paren_content buf s i len | '"' -> (* Double-quoted identifier: strip quotes *) incr i; while !i < len && s.[!i] <> '"' do Buffer.add_char buf s.[!i]; incr i done; if !i < len then incr i | '[' -> (* Bracket-quoted identifier: strip brackets *) incr i; while !i < len && s.[!i] <> ']' do Buffer.add_char buf s.[!i]; incr i done; if !i < len then incr i | '`' -> (* Backtick-quoted identifier: strip backticks *) incr i; while !i < len && s.[!i] <> '`' do Buffer.add_char buf s.[!i]; incr i done; if !i < len then incr i | c -> Buffer.add_char buf c; incr i done; flush (); List.rev !tokens (* Keywords that start constraint clauses in column definitions *) let constraint_keywords = [ "PRIMARY"; "NOT"; "UNIQUE"; "DEFAULT"; "CHECK"; "REFERENCES"; "COLLATE"; "GENERATED"; "AUTOINCREMENT"; "ASC"; "DESC"; "ON"; "CONSTRAINT"; ] (* Check if a column def is a table-level constraint *) let is_table_constraint s = let upper = String.uppercase_ascii (String.trim s) in let starts_with prefix = String.length upper >= String.length prefix && String.sub upper 0 (String.length prefix) = prefix in starts_with "PRIMARY KEY(" || starts_with "PRIMARY KEY " || starts_with "UNIQUE(" || starts_with "UNIQUE " || starts_with "FOREIGN KEY" || starts_with "CONSTRAINT " || starts_with "CHECK(" || starts_with "CHECK " let parse_column_def s = if is_table_constraint s then None else let tokens = tokenize s in match tokens with | [] -> None | name :: rest -> (* Collect type tokens until we hit a constraint keyword *) let rec collect_type acc = function | [] -> (List.rev acc, []) | tok :: _ as all when List.mem (String.uppercase_ascii tok) constraint_keywords -> (List.rev acc, all) | tok :: tl -> collect_type (tok :: acc) tl in let type_tokens, constraint_tokens = collect_type [] rest in let affinity = String.concat " " type_tokens in (* INTEGER PRIMARY KEY is a rowid alias *) let is_rowid_alias = String.uppercase_ascii affinity = "INTEGER" && let rec has_pk = function | "PRIMARY" :: "KEY" :: _ -> true | _ :: tl -> has_pk tl | [] -> false in has_pk (List.map String.uppercase_ascii constraint_tokens) in Some { col_name = name; col_affinity = affinity; col_is_rowid_alias = is_rowid_alias; } let parse_create_table sql = match String.index_opt sql '(' with | None -> [] | Some start -> ( match matching_paren sql start with | None -> [] | Some body_end -> let body = String.sub sql (start + 1) (body_end - start - 1) in let parts = split_respecting_parens body in List.filter_map parse_column_def parts) (* Standard kv table schema *) let kv_columns = [ { col_name = "key"; col_affinity = "TEXT"; col_is_rowid_alias = false }; { col_name = "value"; col_affinity = "BLOB"; col_is_rowid_alias = false }; ] let table_sql name = Fmt.str "CREATE TABLE %s (key TEXT, value BLOB)" name let kv_schema name = { tbl_name = name; columns = kv_columns; sql = table_sql name } (* Decode a Record payload into (key, value) *) let decode_kv payload = match Btree.Record.decode payload with | [ Btree.Record.Vtext k; Btree.Record.Vblob v ] | [ Btree.Record.Vtext k; Btree.Record.Vtext v ] -> Some (k, v) | _ -> None (* Scan a table B-tree to build key→rowid map *) let scan_table btree = let keys = Hashtbl.create 64 in let next_rowid = ref 1L in Btree.Table.iter btree (fun rowid payload -> (match decode_kv payload with | Some (k, _) -> Hashtbl.replace keys k rowid | None -> ()); if rowid >= !next_rowid then next_rowid := Int64.add rowid 1L); (keys, !next_rowid) (* Write the 100-byte SQLite database header *) let write_db_header buf ~page_count = Bytes.blit_string magic 0 buf 0 16; Btree.Page.set_u16_be buf 16 page_size; Bytes.set_uint8 buf 18 1; Bytes.set_uint8 buf 19 1; Bytes.set_uint8 buf 20 0; Bytes.set_uint8 buf 21 64; Bytes.set_uint8 buf 22 32; Bytes.set_uint8 buf 23 32; Btree.Page.set_u32_be buf 24 1; Btree.Page.set_u32_be buf 28 page_count; Btree.Page.set_u32_be buf 32 0; Btree.Page.set_u32_be buf 36 0; Btree.Page.set_u32_be buf 40 1; Btree.Page.set_u32_be buf 44 4; Btree.Page.set_u32_be buf 48 0; Btree.Page.set_u32_be buf 52 0; Btree.Page.set_u32_be buf 56 1; Btree.Page.set_u32_be buf 60 0; Btree.Page.set_u32_be buf 64 0; Btree.Page.set_u32_be buf 68 0; (* Offsets 72-91: reserved for expansion, must be zero (spec section 1.2) *) Bytes.fill buf 72 20 '\x00'; Btree.Page.set_u32_be buf 92 1; Btree.Page.set_u32_be buf 96 3046000 (* Build a sqlite_master table leaf cell *) let master_cell ~rowid ~name ~root_page ~sql = let record = Btree.Record.encode [ Btree.Record.Vtext "table"; Btree.Record.Vtext name; Btree.Record.Vtext name; Btree.Record.Vint (Int64.of_int root_page); Btree.Record.Vtext sql; ] in let payload_varint = Btree.Varint.encode (Int64.of_int (String.length record)) in let rowid_varint = Btree.Varint.encode rowid in payload_varint ^ rowid_varint ^ record (* Write page 1: db header + sqlite_master leaf table *) let rebuild_page1 t = let buf = Bytes.create page_size in write_db_header buf ~page_count:(Btree.Pager.page_count t.pager); (* Leaf table header at offset 100 *) Bytes.set_uint8 buf 100 0x0d; Btree.Page.set_u16_be buf 101 0; Bytes.set_uint8 buf 107 0; (* Collect all tables from all_tables *) let tables = List.map (fun gt -> (gt.g_schema.tbl_name, Btree.Table.root_page gt.g_btree, gt.g_schema.sql)) t.all_tables in let n = List.length tables in Btree.Page.set_u16_be buf 103 n; (* Build cells from end of page *) let cell_content_start = ref page_size in let cell_ptrs = Array.make n 0 in List.iteri (fun i (name, root_page, sql) -> let cell = master_cell ~rowid:(Int64.of_int (i + 1)) ~name ~root_page ~sql in let cell_len = String.length cell in cell_content_start := !cell_content_start - cell_len; Bytes.blit_string cell 0 buf !cell_content_start cell_len; cell_ptrs.(i) <- !cell_content_start) tables; Btree.Page.set_u16_be buf 105 !cell_content_start; (* Cell pointer array at offset 108 (100 + 8 byte leaf header) *) Array.iteri (fun i ptr -> Btree.Page.set_u16_be buf (108 + (i * 2)) ptr) cell_ptrs; Btree.Pager.write t.pager 1 (Bytes.unsafe_to_string buf) (* Initialize a new kv_table on a fresh page *) let new_kv_table pager = let btree = Btree.Table.v pager in { btree; keys = Hashtbl.create 64; next_rowid = 1L } let mkdirs_for path = match Eio.Path.split path with | None -> () | Some (fs, p) -> ( let dir = Filename.dirname p in if dir <> "." && dir <> "/" then try Eio.Path.mkdirs ~exists_ok:true ~perm:0o755 Eio.Path.(fs / dir) with Eio.Io _ -> ()) let v ~sw path = mkdirs_for path; let file = Eio.Path.open_out ~sw ~create:(`Or_truncate 0o644) path |> fun f -> (f :> Eio.File.rw_ty Eio.Resource.t) in let pager = Btree.Pager.v ~page_size file in (* Allocate page 1 for db header + sqlite_master *) let _page1 = Btree.Pager.allocate pager in (* Create kv data table on page 2 *) let kv = new_kv_table pager in let gt = { g_btree = kv.btree; g_schema = kv_schema "kv" } in let t = { pager; data = Some kv; named_tables = []; all_tables = [ gt ] } in rebuild_page1 t; t (* Extract named kv tables (non-kv tables with kv schema) from all_tables *) let extract_named_kv_tables all_tables = List.filter_map (fun gt -> let name = gt.g_schema.tbl_name in if name = "kv" then None else match gt.g_schema.columns with | [ { col_name = "key"; col_affinity = "TEXT"; _ }; { col_name = "value"; col_affinity = "BLOB"; _ }; ] -> let keys, next_rowid = scan_table gt.g_btree in Some (name, { btree = gt.g_btree; keys; next_rowid }) | _ -> None) all_tables let in_memory () = let pager = Btree.Pager.mem ~page_size () in let _page1 = Btree.Pager.allocate pager in let kv = new_kv_table pager in let gt = { g_btree = kv.btree; g_schema = kv_schema "kv" } in let t = { pager; data = Some kv; named_tables = []; all_tables = [ gt ] } in rebuild_page1 t; t let open_ ~sw path = let file = Eio.Path.open_out ~sw ~create:`Never path |> fun f -> (f :> Eio.File.rw_ty Eio.Resource.t) in let pager = Btree.Pager.v ~page_size file in if Btree.Pager.page_count pager = 0 then failwith "Database file is empty"; (* Read page 1 and validate *) let page1 = Btree.Pager.read pager 1 in if String.sub page1 0 16 <> magic then failwith "Not a SQLite database"; let ps = Btree.Page.u16_be page1 16 in if ps <> page_size then Fmt.failwith "Unsupported page size: %d" ps; (* Parse sqlite_master at offset 100 *) let header = Btree.Page.parse_header page1 100 in let ptrs = Btree.Page.cell_pointers page1 100 header in let raw_tables = ref [] in for i = 0 to header.Btree.Page.cell_count - 1 do let cell, _ = Btree.Cell.parse_table_leaf page1 ptrs.(i) ~usable_size:page_size in match Btree.Record.decode cell.Btree.Cell.payload with | [ Btree.Record.Vtext "table"; Btree.Record.Vtext name; _; Btree.Record.Vint root; Btree.Record.Vtext sql; ] -> raw_tables := (name, Int64.to_int root, sql) :: !raw_tables | _ -> () done; let raw_tables = List.rev !raw_tables in (* Build generic_table for every table *) let all_tables = List.map (fun (name, root, sql) -> let btree = Btree.Table.open_ pager ~root_page:root in let columns = parse_create_table sql in let schema = { tbl_name = name; columns; sql } in { g_btree = btree; g_schema = schema }) raw_tables in (* Try to find "kv" table for backward compat *) let data = match List.find_opt (fun gt -> gt.g_schema.tbl_name = "kv") all_tables with | None -> None | Some gt -> let keys, next_rowid = scan_table gt.g_btree in Some { btree = gt.g_btree; keys; next_rowid } in let named = extract_named_kv_tables all_tables in { pager; data; named_tables = named; all_tables } (* Get the kv_table, raising if no kv table exists *) let kv t = match t.data with | Some d -> d | None -> failwith "No 'kv' table in this database" (* KV operations *) let find t key = let d = kv t in match Hashtbl.find_opt d.keys key with | None -> None | Some rowid -> ( match Btree.Table.find d.btree rowid with | None -> None | Some payload -> ( match decode_kv payload with Some (_, v) -> Some v | None -> None)) let put t key value = let kv = kv t in let record = Btree.Record.encode [ Btree.Record.Vtext key; Btree.Record.Vblob value ] in (match Hashtbl.find_opt kv.keys key with | Some old_rowid -> Btree.Table.delete kv.btree old_rowid | None -> ()); let rowid = kv.next_rowid in kv.next_rowid <- Int64.add kv.next_rowid 1L; Btree.Table.insert kv.btree ~rowid record; Hashtbl.replace kv.keys key rowid let delete t key = let d = kv t in match Hashtbl.find_opt d.keys key with | None -> () | Some rowid -> Btree.Table.delete d.btree rowid; Hashtbl.remove d.keys key let mem t key = let d = kv t in Hashtbl.mem d.keys key let iter t ~f = let d = kv t in Btree.Table.iter d.btree (fun _rowid payload -> match decode_kv payload with Some (k, v) -> f k v | None -> ()) let fold t ~init ~f = let acc = ref init in iter t ~f:(fun k v -> acc := f k v !acc); !acc let sync t = rebuild_page1 t; Btree.Pager.sync t.pager let close t = sync t (* Generic read API *) let tables t = List.map (fun gt -> gt.g_schema) t.all_tables let table t name = match List.find_opt (fun gt -> gt.g_schema.tbl_name = name) t.all_tables with | Some gt -> gt | None -> Fmt.failwith "No table %S found in database" name (* Find the index of the rowid alias column, if any *) let rowid_alias_index columns = let rec find i = function | [] -> None | c :: _ when c.col_is_rowid_alias -> Some i | _ :: rest -> find (i + 1) rest in find 0 columns (* Apply rowid substitution and trailing Vnull padding *) let fixup_values ~schema ~rowid values = let n_cols = List.length schema.columns in let len = List.length values in let values = if len < n_cols then values @ List.init (n_cols - len) (fun _ -> Btree.Record.Vnull) else values in match rowid_alias_index schema.columns with | None -> values | Some idx -> List.mapi (fun i v -> if i = idx then match v with | Btree.Record.Vnull -> Btree.Record.Vint rowid | v -> v else v) values let iter_table t name ~f = let gt = table t name in let schema = gt.g_schema in Btree.Table.iter gt.g_btree (fun rowid payload -> let values = Btree.Record.decode payload in let values = fixup_values ~schema ~rowid values in f rowid values) let fold_table t name ~init ~f = let acc = ref init in iter_table t name ~f:(fun rowid values -> acc := f rowid values !acc); !acc let read_table t name = fold_table t name ~init:[] ~f:(fun rowid values acc -> (rowid, values) :: acc) |> List.rev (* Namespaced Tables *) module Table = struct type db = t type t = { parent : db; name : string; kv : kv_table } let valid_name name = String.length name > 0 && (let first = name.[0] in (first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first = '_') && String.for_all (fun c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c = '_') name let create parent ~name = if not (valid_name name) then Fmt.invalid_arg "Invalid table name: %S" name; match List.assoc_opt name parent.named_tables with | Some kv -> { parent; name; kv } | None -> let kv = new_kv_table parent.pager in let gt = { g_btree = kv.btree; g_schema = kv_schema name } in parent.named_tables <- (name, kv) :: parent.named_tables; parent.all_tables <- parent.all_tables @ [ gt ]; { parent; name; kv } let find t key = match Hashtbl.find_opt t.kv.keys key with | None -> None | Some rowid -> ( match Btree.Table.find t.kv.btree rowid with | None -> None | Some payload -> ( match decode_kv payload with Some (_, v) -> Some v | None -> None)) let put t key value = let kv = t.kv in let record = Btree.Record.encode [ Btree.Record.Vtext key; Btree.Record.Vblob value ] in (match Hashtbl.find_opt kv.keys key with | Some old_rowid -> Btree.Table.delete kv.btree old_rowid | None -> ()); let rowid = kv.next_rowid in kv.next_rowid <- Int64.add kv.next_rowid 1L; Btree.Table.insert kv.btree ~rowid record; Hashtbl.replace kv.keys key rowid let delete t key = match Hashtbl.find_opt t.kv.keys key with | None -> () | Some rowid -> Btree.Table.delete t.kv.btree rowid; Hashtbl.remove t.kv.keys key let mem t key = Hashtbl.mem t.kv.keys key let iter t ~f = Btree.Table.iter t.kv.btree (fun _rowid payload -> match decode_kv payload with Some (k, v) -> f k v | None -> ()) end