Reference-counted, deduplicated string store.
at main 292 lines 10 kB view raw
1/* text-store - A store for storing reference-counted, de-duplicated strings. 2 * Copyright (C) 2025 Jeremia Dominguez <me@jeremia.dev> 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation, either version 3 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 */ 17 18#include <alloca.h> 19#include <stdalign.h> 20#include <stdbool.h> 21#include <stddef.h> 22#include <stdint.h> 23#include <stdlib.h> 24#include <string.h> 25 26#include <textstore.h> 27 28#include "rapidhash.h" 29 30/** 31 * TODO: Make all of this easier to follow. 32 * In hindsight, I realize how much all of this relies on specific pointer math and such. 33 * It may be better to split some things into functions, macros or structs? 34 * This may be helped by splitting the free-queue into its own allocation, rather than 35 * sharing space with the text buffer? 36 */ 37 38/* Size of struct-of-array member's items. */ 39#define SoASize (sizeof(uint32_t) + sizeof(textstore_slice) + (sizeof(uint32_t) * 2) + sizeof(bool)) 40 41static inline textstore_handle to_handle(uint32_t index, uint32_t generation) { 42 uint64_t handle = (uint64_t)generation << 32; 43 handle += index; 44 return handle; 45} 46 47/** 48 * Allocates enough room on the hash map for n items, and sets the store's pointers. 49 * 50 * Note: Use on a temporary copy. 51 */ 52static inline void textstore_alloc(textstore_t *store, size_t n) { 53 store->map = (uint32_t *)calloc(n, SoASize); 54 store->key = (textstore_slice *)(store->map + n); 55 store->ref = (uint32_t *)(store->key + n); 56 store->generation = (uint32_t *)(store->ref + n); 57 store->active = (bool *)(store->generation + n); 58} 59 60static void textstore_put(textstore_t *store, char *str, uint32_t len, uint32_t value) { 61 uint32_t *map = store->map; 62 uint32_t capacity = store->hashmap_cap; 63 uint32_t idx = rapidhash(str, len) % capacity; 64 while (map[idx] != 0) { /* The 0 index is already reserved for empty string. */ 65 idx = (idx + 1) % capacity; 66 } 67 map[idx] = value; 68} 69 70static void textstore_rehash(textstore_t *store) { 71 char *buffer = store->buffer; 72 textstore_slice *slices = store->key + 1; /* Skip index 0, empty string. */ 73 bool *active = store->active + 1; 74 uint32_t count = store->count - 1; 75 memset(store->map, 0, sizeof(uint32_t) * store->hashmap_cap); 76 for (uint32_t i = 0; i < count; i++) { 77 if (active[i]) { 78 textstore_slice slice = slices[i]; 79 textstore_put(store, &buffer[slice.offset], slice.len, i + 1); 80 } 81 } 82} 83 84static textstore_handle textstore_find(textstore_t *store, char *str, uint32_t len) { 85 assert(str != NULL); 86 assert(len != 0); 87 88 char *buffer = store->buffer; 89 uint32_t hashmap_cap = store->hashmap_cap; 90 uint32_t *map = store->map; 91 textstore_slice *key = store->key; 92 bool *active = store->active; 93 94 uint32_t index = rapidhash(str, len) % hashmap_cap; 95 while (map[index] != 0) { 96 uint32_t idx = map[index]; 97 if (key[idx].len != len || !active[idx]) { 98 index = (index + 1) % hashmap_cap; 99 continue; 100 } 101 if (memcmp(str, &buffer[key[idx].offset], len) == 0) { 102 return to_handle(idx, store->generation[idx]); 103 } 104 } 105 106 return TEXTSTORE_HANDLE_ERROR; 107} 108 109/** 110 * Appends text to the store's buffer. 111 */ 112static void textstore_append(textstore_t *store, char *str, uint32_t len) { 113 if (len + 1 + store->length > store->buffer_cap) { 114 uint32_t new_capacity= store->buffer_cap * 2; 115 while (new_capacity < len + 1 + store->length) { 116 new_capacity *= 2; 117 } 118 store->buffer = realloc(store->buffer, new_capacity); 119 memset(&store->buffer[store->buffer_cap], 0, new_capacity - store->buffer_cap); 120 store->buffer_cap = new_capacity; 121 } 122 memcpy(&store->buffer[store->length], str, len); 123 store->buffer[store->length + len] = 0; 124 store->length += len + 1; 125} 126 127textstore_handle textstore_intern(textstore_t *store, char *str) { 128 return textstore_intern_len(store, str, strlen(str)); 129} 130 131textstore_handle textstore_intern_len(textstore_t *store, char *str, uint32_t len) { 132 textstore_handle handle = to_handle(0, 0); 133 uint32_t index; 134 135 if (len == 0) { /* Special case empty string. */ 136 textstore_retain(store, handle); 137 return handle; 138 } 139 if (str == NULL) return TEXTSTORE_HANDLE_ERROR; 140 141 handle = textstore_find(store, str, len); 142 if (handle != TEXTSTORE_HANDLE_ERROR) { 143 textstore_retain(store, handle); 144 return handle; 145 } 146 147 uint32_t offset = store->length; 148 textstore_append(store, str, len); 149 150 151 if (store->available > 0) { 152 /* Pop a freed handle index off the end of the queue. */ 153 index = ((uint32_t *)store->buffer)[--store->available]; 154 store->generation[index]++; 155 } else { 156 index = store->count++; 157 store->generation[index] = 0; 158 } 159 160 textstore_put(store, str, len, index); 161 handle = to_handle(index, store->generation[index]); 162 store->key[index] = (textstore_slice){offset, len}; 163 store->ref[index] = 1; 164 store->active[index] = true; 165 166 if (index > ((store->hashmap_cap/4)*3)) { 167 /* At 3/4 capacity, grow the hash-table. */ 168 textstore_t copy = *store; 169 textstore_alloc(&copy, copy.hashmap_cap * 2); 170 memcpy(copy.key, store->key, copy.count * sizeof(textstore_slice)); 171 memcpy(copy.ref, store->ref, copy.count * sizeof(uint32_t)); 172 memcpy(copy.generation, store->generation, copy.count * sizeof(uint32_t)); 173 memcpy(copy.active, store->active, copy.count * sizeof(bool)); 174 copy.hashmap_cap *= 2; 175 free(store->map); 176 *store = copy; 177 textstore_rehash(store); 178 } 179 180 return handle; 181} 182 183int textstore_gc(textstore_t *store) { 184 textstore_slice *strings = store->key; 185 uint32_t *refs = store->ref; 186 bool *actives = store->active; 187 uint32_t count = store->count; 188 uint32_t available = store->available; 189 190 /** 191 * As a note for whoever is unfortunate enough to interact with my code (likely myself). 192 * To minimize the number of allocations, the textstore keeps the queue of freed handles 193 * and the actual text, in contiguous memory. The free queue is at the start of the buffer, 194 * and new strings get added to the end of the buffer. This works out, because the queue list 195 * only grows when textstore_gc() is caled, when all handles with 0 references are collected. 196 * 197 * The free-queue is last-in-first-out. 198 * 199 * [ 01 ] [ 02 ] H e l l o W o r l d ! ... 200 * ^ 201 * store->available = 2 202 * 203 * So when a new string is interned, and there are handles available to use, the last item is 204 * popped off the stack. 205 * 206 * [ 01 ] [ 02 ] H e l l o W o r l d ! ... 207 * ^ 208 * store->available = 1 209 */ 210 211 // Pass 1: Count how many strings need to be collected, and determine how 212 // much space is needed for the new buffer. 213 uint32_t collect = 0; 214 uint32_t new_length = available * sizeof(uint32_t); 215 for (uint32_t i = 0; i < count; i++) { 216 if (actives[i]) { 217 if (refs[i] > 0) { 218 new_length += strings[i].len + 1; 219 } else { 220 collect += 1; 221 new_length += sizeof(uint32_t); 222 } 223 } 224 } 225 226 // If no strings are in need of collecting, stop early. 227 if (collect == 0) return 0; 228 229 // Pass 2: Generate new buffer from the active strings. 230 if (new_length < store->buffer_cap) 231 new_length = store->buffer_cap; 232 char *new_buffer = malloc(new_length); 233 if (new_buffer == NULL) return 1; 234 memset(new_buffer, 0, new_length); 235 uint32_t new_offset = (collect + available) * sizeof(uint32_t); 236 // copy existing free queue to the new one. 237 if (available) 238 memcpy(new_buffer, store->buffer, available * sizeof(uint32_t)); 239 uint32_t *queue_buffer = (uint32_t *)new_buffer; 240 uint32_t queue_offset = available; 241 242 for (uint32_t i = 0; i < count; i++) { 243 if (actives[i]) { 244 if (refs[i] == 0) { 245 queue_buffer[queue_offset++] = i; 246 actives[i] = false; 247 strings[i] = (textstore_slice){0}; 248 continue; 249 } 250 // Copy string to new location, and shift offsets. 251 memcpy(&new_buffer[new_offset], &store->buffer[strings[i].offset], strings[i].len); 252 strings[i].offset = new_offset; 253 new_offset += strings[i].len + 1; 254 } 255 } 256 257 free(store->buffer); 258 store->buffer = new_buffer; 259 store->buffer_cap = new_length; 260 store->available = collect + available; 261 262 textstore_rehash(store); 263 return 0; 264} 265 266int textstore_init(textstore_t *store, uint32_t initial_buffer) { 267 textstore_t temp = {0}; 268 if (initial_buffer == 0) initial_buffer = 1; 269 temp.buffer = (char *)calloc(initial_buffer, 1); 270 if (temp.buffer == NULL) return 1; 271 temp.buffer_cap = initial_buffer; 272 273 temp.hashmap_cap = 8; 274 textstore_alloc(&temp, temp.hashmap_cap); 275 276 /* Special-case empty string ("") for index 0. */ 277 temp.key[0] = (textstore_slice){ 0, 0 }; 278 temp.ref[0] = 1; 279 temp.active[0] = true; 280 temp.length++; 281 temp.count++; 282 283 *store = temp; 284 return 0; 285} 286 287void textstore_free(textstore_t *store) { 288 free(store->buffer); 289 free(store->map); 290 291 *store = (textstore_t){0}; 292}