Reference-counted, deduplicated string store.
1/* text-store - A store for storing reference-counted, de-duplicated strings.
2 * Copyright (C) 2025 Jeremia Dominguez <me@jeremia.dev>
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18#include <alloca.h>
19#include <stdalign.h>
20#include <stdbool.h>
21#include <stddef.h>
22#include <stdint.h>
23#include <stdlib.h>
24#include <string.h>
25
26#include <textstore.h>
27
28#include "rapidhash.h"
29
30/**
31 * TODO: Make all of this easier to follow.
32 * In hindsight, I realize how much all of this relies on specific pointer math and such.
33 * It may be better to split some things into functions, macros or structs?
34 * This may be helped by splitting the free-queue into its own allocation, rather than
35 * sharing space with the text buffer?
36 */
37
38/* Size of struct-of-array member's items. */
39#define SoASize (sizeof(uint32_t) + sizeof(textstore_slice) + (sizeof(uint32_t) * 2) + sizeof(bool))
40
41static inline textstore_handle to_handle(uint32_t index, uint32_t generation) {
42 uint64_t handle = (uint64_t)generation << 32;
43 handle += index;
44 return handle;
45}
46
47/**
48 * Allocates enough room on the hash map for n items, and sets the store's pointers.
49 *
50 * Note: Use on a temporary copy.
51 */
52static inline void textstore_alloc(textstore_t *store, size_t n) {
53 store->map = (uint32_t *)calloc(n, SoASize);
54 store->key = (textstore_slice *)(store->map + n);
55 store->ref = (uint32_t *)(store->key + n);
56 store->generation = (uint32_t *)(store->ref + n);
57 store->active = (bool *)(store->generation + n);
58}
59
60static void textstore_put(textstore_t *store, char *str, uint32_t len, uint32_t value) {
61 uint32_t *map = store->map;
62 uint32_t capacity = store->hashmap_cap;
63 uint32_t idx = rapidhash(str, len) % capacity;
64 while (map[idx] != 0) { /* The 0 index is already reserved for empty string. */
65 idx = (idx + 1) % capacity;
66 }
67 map[idx] = value;
68}
69
70static void textstore_rehash(textstore_t *store) {
71 char *buffer = store->buffer;
72 textstore_slice *slices = store->key + 1; /* Skip index 0, empty string. */
73 bool *active = store->active + 1;
74 uint32_t count = store->count - 1;
75 memset(store->map, 0, sizeof(uint32_t) * store->hashmap_cap);
76 for (uint32_t i = 0; i < count; i++) {
77 if (active[i]) {
78 textstore_slice slice = slices[i];
79 textstore_put(store, &buffer[slice.offset], slice.len, i + 1);
80 }
81 }
82}
83
84static textstore_handle textstore_find(textstore_t *store, char *str, uint32_t len) {
85 assert(str != NULL);
86 assert(len != 0);
87
88 char *buffer = store->buffer;
89 uint32_t hashmap_cap = store->hashmap_cap;
90 uint32_t *map = store->map;
91 textstore_slice *key = store->key;
92 bool *active = store->active;
93
94 uint32_t index = rapidhash(str, len) % hashmap_cap;
95 while (map[index] != 0) {
96 uint32_t idx = map[index];
97 if (key[idx].len != len || !active[idx]) {
98 index = (index + 1) % hashmap_cap;
99 continue;
100 }
101 if (memcmp(str, &buffer[key[idx].offset], len) == 0) {
102 return to_handle(idx, store->generation[idx]);
103 }
104 }
105
106 return TEXTSTORE_HANDLE_ERROR;
107}
108
109/**
110 * Appends text to the store's buffer.
111 */
112static void textstore_append(textstore_t *store, char *str, uint32_t len) {
113 if (len + 1 + store->length > store->buffer_cap) {
114 uint32_t new_capacity= store->buffer_cap * 2;
115 while (new_capacity < len + 1 + store->length) {
116 new_capacity *= 2;
117 }
118 store->buffer = realloc(store->buffer, new_capacity);
119 memset(&store->buffer[store->buffer_cap], 0, new_capacity - store->buffer_cap);
120 store->buffer_cap = new_capacity;
121 }
122 memcpy(&store->buffer[store->length], str, len);
123 store->buffer[store->length + len] = 0;
124 store->length += len + 1;
125}
126
127textstore_handle textstore_intern(textstore_t *store, char *str) {
128 return textstore_intern_len(store, str, strlen(str));
129}
130
131textstore_handle textstore_intern_len(textstore_t *store, char *str, uint32_t len) {
132 textstore_handle handle = to_handle(0, 0);
133 uint32_t index;
134
135 if (len == 0) { /* Special case empty string. */
136 textstore_retain(store, handle);
137 return handle;
138 }
139 if (str == NULL) return TEXTSTORE_HANDLE_ERROR;
140
141 handle = textstore_find(store, str, len);
142 if (handle != TEXTSTORE_HANDLE_ERROR) {
143 textstore_retain(store, handle);
144 return handle;
145 }
146
147 uint32_t offset = store->length;
148 textstore_append(store, str, len);
149
150
151 if (store->available > 0) {
152 /* Pop a freed handle index off the end of the queue. */
153 index = ((uint32_t *)store->buffer)[--store->available];
154 store->generation[index]++;
155 } else {
156 index = store->count++;
157 store->generation[index] = 0;
158 }
159
160 textstore_put(store, str, len, index);
161 handle = to_handle(index, store->generation[index]);
162 store->key[index] = (textstore_slice){offset, len};
163 store->ref[index] = 1;
164 store->active[index] = true;
165
166 if (index > ((store->hashmap_cap/4)*3)) {
167 /* At 3/4 capacity, grow the hash-table. */
168 textstore_t copy = *store;
169 textstore_alloc(©, copy.hashmap_cap * 2);
170 memcpy(copy.key, store->key, copy.count * sizeof(textstore_slice));
171 memcpy(copy.ref, store->ref, copy.count * sizeof(uint32_t));
172 memcpy(copy.generation, store->generation, copy.count * sizeof(uint32_t));
173 memcpy(copy.active, store->active, copy.count * sizeof(bool));
174 copy.hashmap_cap *= 2;
175 free(store->map);
176 *store = copy;
177 textstore_rehash(store);
178 }
179
180 return handle;
181}
182
183int textstore_gc(textstore_t *store) {
184 textstore_slice *strings = store->key;
185 uint32_t *refs = store->ref;
186 bool *actives = store->active;
187 uint32_t count = store->count;
188 uint32_t available = store->available;
189
190 /**
191 * As a note for whoever is unfortunate enough to interact with my code (likely myself).
192 * To minimize the number of allocations, the textstore keeps the queue of freed handles
193 * and the actual text, in contiguous memory. The free queue is at the start of the buffer,
194 * and new strings get added to the end of the buffer. This works out, because the queue list
195 * only grows when textstore_gc() is caled, when all handles with 0 references are collected.
196 *
197 * The free-queue is last-in-first-out.
198 *
199 * [ 01 ] [ 02 ] H e l l o W o r l d ! ...
200 * ^
201 * store->available = 2
202 *
203 * So when a new string is interned, and there are handles available to use, the last item is
204 * popped off the stack.
205 *
206 * [ 01 ] [ 02 ] H e l l o W o r l d ! ...
207 * ^
208 * store->available = 1
209 */
210
211 // Pass 1: Count how many strings need to be collected, and determine how
212 // much space is needed for the new buffer.
213 uint32_t collect = 0;
214 uint32_t new_length = available * sizeof(uint32_t);
215 for (uint32_t i = 0; i < count; i++) {
216 if (actives[i]) {
217 if (refs[i] > 0) {
218 new_length += strings[i].len + 1;
219 } else {
220 collect += 1;
221 new_length += sizeof(uint32_t);
222 }
223 }
224 }
225
226 // If no strings are in need of collecting, stop early.
227 if (collect == 0) return 0;
228
229 // Pass 2: Generate new buffer from the active strings.
230 if (new_length < store->buffer_cap)
231 new_length = store->buffer_cap;
232 char *new_buffer = malloc(new_length);
233 if (new_buffer == NULL) return 1;
234 memset(new_buffer, 0, new_length);
235 uint32_t new_offset = (collect + available) * sizeof(uint32_t);
236 // copy existing free queue to the new one.
237 if (available)
238 memcpy(new_buffer, store->buffer, available * sizeof(uint32_t));
239 uint32_t *queue_buffer = (uint32_t *)new_buffer;
240 uint32_t queue_offset = available;
241
242 for (uint32_t i = 0; i < count; i++) {
243 if (actives[i]) {
244 if (refs[i] == 0) {
245 queue_buffer[queue_offset++] = i;
246 actives[i] = false;
247 strings[i] = (textstore_slice){0};
248 continue;
249 }
250 // Copy string to new location, and shift offsets.
251 memcpy(&new_buffer[new_offset], &store->buffer[strings[i].offset], strings[i].len);
252 strings[i].offset = new_offset;
253 new_offset += strings[i].len + 1;
254 }
255 }
256
257 free(store->buffer);
258 store->buffer = new_buffer;
259 store->buffer_cap = new_length;
260 store->available = collect + available;
261
262 textstore_rehash(store);
263 return 0;
264}
265
266int textstore_init(textstore_t *store, uint32_t initial_buffer) {
267 textstore_t temp = {0};
268 if (initial_buffer == 0) initial_buffer = 1;
269 temp.buffer = (char *)calloc(initial_buffer, 1);
270 if (temp.buffer == NULL) return 1;
271 temp.buffer_cap = initial_buffer;
272
273 temp.hashmap_cap = 8;
274 textstore_alloc(&temp, temp.hashmap_cap);
275
276 /* Special-case empty string ("") for index 0. */
277 temp.key[0] = (textstore_slice){ 0, 0 };
278 temp.ref[0] = 1;
279 temp.active[0] = true;
280 temp.length++;
281 temp.count++;
282
283 *store = temp;
284 return 0;
285}
286
287void textstore_free(textstore_t *store) {
288 free(store->buffer);
289 free(store->map);
290
291 *store = (textstore_t){0};
292}