/* text-store - A store for storing reference-counted, de-duplicated strings. * Copyright (C) 2025 Jeremia Dominguez * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "rapidhash.h" /** * TODO: Make all of this easier to follow. * In hindsight, I realize how much all of this relies on specific pointer math and such. * It may be better to split some things into functions, macros or structs? * This may be helped by splitting the free-queue into its own allocation, rather than * sharing space with the text buffer? */ /* Size of struct-of-array member's items. */ #define SoASize (sizeof(uint32_t) + sizeof(textstore_slice) + (sizeof(uint32_t) * 2) + sizeof(bool)) static inline textstore_handle to_handle(uint32_t index, uint32_t generation) { uint64_t handle = (uint64_t)generation << 32; handle += index; return handle; } /** * Allocates enough room on the hash map for n items, and sets the store's pointers. * * Note: Use on a temporary copy. */ static inline void textstore_alloc(textstore_t *store, size_t n) { store->map = (uint32_t *)calloc(n, SoASize); store->key = (textstore_slice *)(store->map + n); store->ref = (uint32_t *)(store->key + n); store->generation = (uint32_t *)(store->ref + n); store->active = (bool *)(store->generation + n); } static void textstore_put(textstore_t *store, char *str, uint32_t len, uint32_t value) { uint32_t *map = store->map; uint32_t capacity = store->hashmap_cap; uint32_t idx = rapidhash(str, len) % capacity; while (map[idx] != 0) { /* The 0 index is already reserved for empty string. */ idx = (idx + 1) % capacity; } map[idx] = value; } static void textstore_rehash(textstore_t *store) { char *buffer = store->buffer; textstore_slice *slices = store->key + 1; /* Skip index 0, empty string. */ bool *active = store->active + 1; uint32_t count = store->count - 1; memset(store->map, 0, sizeof(uint32_t) * store->hashmap_cap); for (uint32_t i = 0; i < count; i++) { if (active[i]) { textstore_slice slice = slices[i]; textstore_put(store, &buffer[slice.offset], slice.len, i + 1); } } } static textstore_handle textstore_find(textstore_t *store, char *str, uint32_t len) { assert(str != NULL); assert(len != 0); char *buffer = store->buffer; uint32_t hashmap_cap = store->hashmap_cap; uint32_t *map = store->map; textstore_slice *key = store->key; bool *active = store->active; uint32_t index = rapidhash(str, len) % hashmap_cap; while (map[index] != 0) { uint32_t idx = map[index]; if (key[idx].len != len || !active[idx]) { index = (index + 1) % hashmap_cap; continue; } if (memcmp(str, &buffer[key[idx].offset], len) == 0) { return to_handle(idx, store->generation[idx]); } } return TEXTSTORE_HANDLE_ERROR; } /** * Appends text to the store's buffer. */ static void textstore_append(textstore_t *store, char *str, uint32_t len) { if (len + 1 + store->length > store->buffer_cap) { uint32_t new_capacity= store->buffer_cap * 2; while (new_capacity < len + 1 + store->length) { new_capacity *= 2; } store->buffer = realloc(store->buffer, new_capacity); memset(&store->buffer[store->buffer_cap], 0, new_capacity - store->buffer_cap); store->buffer_cap = new_capacity; } memcpy(&store->buffer[store->length], str, len); store->buffer[store->length + len] = 0; store->length += len + 1; } textstore_handle textstore_intern(textstore_t *store, char *str) { return textstore_intern_len(store, str, strlen(str)); } textstore_handle textstore_intern_len(textstore_t *store, char *str, uint32_t len) { textstore_handle handle = to_handle(0, 0); uint32_t index; if (len == 0) { /* Special case empty string. */ textstore_retain(store, handle); return handle; } if (str == NULL) return TEXTSTORE_HANDLE_ERROR; handle = textstore_find(store, str, len); if (handle != TEXTSTORE_HANDLE_ERROR) { textstore_retain(store, handle); return handle; } uint32_t offset = store->length; textstore_append(store, str, len); if (store->available > 0) { /* Pop a freed handle index off the end of the queue. */ index = ((uint32_t *)store->buffer)[--store->available]; store->generation[index]++; } else { index = store->count++; store->generation[index] = 0; } textstore_put(store, str, len, index); handle = to_handle(index, store->generation[index]); store->key[index] = (textstore_slice){offset, len}; store->ref[index] = 1; store->active[index] = true; if (index > ((store->hashmap_cap/4)*3)) { /* At 3/4 capacity, grow the hash-table. */ textstore_t copy = *store; textstore_alloc(©, copy.hashmap_cap * 2); memcpy(copy.key, store->key, copy.count * sizeof(textstore_slice)); memcpy(copy.ref, store->ref, copy.count * sizeof(uint32_t)); memcpy(copy.generation, store->generation, copy.count * sizeof(uint32_t)); memcpy(copy.active, store->active, copy.count * sizeof(bool)); copy.hashmap_cap *= 2; free(store->map); *store = copy; textstore_rehash(store); } return handle; } int textstore_gc(textstore_t *store) { textstore_slice *strings = store->key; uint32_t *refs = store->ref; bool *actives = store->active; uint32_t count = store->count; uint32_t available = store->available; /** * As a note for whoever is unfortunate enough to interact with my code (likely myself). * To minimize the number of allocations, the textstore keeps the queue of freed handles * and the actual text, in contiguous memory. The free queue is at the start of the buffer, * and new strings get added to the end of the buffer. This works out, because the queue list * only grows when textstore_gc() is caled, when all handles with 0 references are collected. * * The free-queue is last-in-first-out. * * [ 01 ] [ 02 ] H e l l o W o r l d ! ... * ^ * store->available = 2 * * So when a new string is interned, and there are handles available to use, the last item is * popped off the stack. * * [ 01 ] [ 02 ] H e l l o W o r l d ! ... * ^ * store->available = 1 */ // Pass 1: Count how many strings need to be collected, and determine how // much space is needed for the new buffer. uint32_t collect = 0; uint32_t new_length = available * sizeof(uint32_t); for (uint32_t i = 0; i < count; i++) { if (actives[i]) { if (refs[i] > 0) { new_length += strings[i].len + 1; } else { collect += 1; new_length += sizeof(uint32_t); } } } // If no strings are in need of collecting, stop early. if (collect == 0) return 0; // Pass 2: Generate new buffer from the active strings. if (new_length < store->buffer_cap) new_length = store->buffer_cap; char *new_buffer = malloc(new_length); if (new_buffer == NULL) return 1; memset(new_buffer, 0, new_length); uint32_t new_offset = (collect + available) * sizeof(uint32_t); // copy existing free queue to the new one. if (available) memcpy(new_buffer, store->buffer, available * sizeof(uint32_t)); uint32_t *queue_buffer = (uint32_t *)new_buffer; uint32_t queue_offset = available; for (uint32_t i = 0; i < count; i++) { if (actives[i]) { if (refs[i] == 0) { queue_buffer[queue_offset++] = i; actives[i] = false; strings[i] = (textstore_slice){0}; continue; } // Copy string to new location, and shift offsets. memcpy(&new_buffer[new_offset], &store->buffer[strings[i].offset], strings[i].len); strings[i].offset = new_offset; new_offset += strings[i].len + 1; } } free(store->buffer); store->buffer = new_buffer; store->buffer_cap = new_length; store->available = collect + available; textstore_rehash(store); return 0; } int textstore_init(textstore_t *store, uint32_t initial_buffer) { textstore_t temp = {0}; if (initial_buffer == 0) initial_buffer = 1; temp.buffer = (char *)calloc(initial_buffer, 1); if (temp.buffer == NULL) return 1; temp.buffer_cap = initial_buffer; temp.hashmap_cap = 8; textstore_alloc(&temp, temp.hashmap_cap); /* Special-case empty string ("") for index 0. */ temp.key[0] = (textstore_slice){ 0, 0 }; temp.ref[0] = 1; temp.active[0] = true; temp.length++; temp.count++; *store = temp; return 0; } void textstore_free(textstore_t *store) { free(store->buffer); free(store->map); *store = (textstore_t){0}; }