// simdjson bindings with streaming support
#include "simdcbor.hh"

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>
6
// Byte-order reversal helpers. bswap_16 / bswap_32 / bswap_64 reverse the
// bytes of a 16-, 32- or 64-bit unsigned value. With MSVC they expand to the
// _byteswap_* intrinsics declared in <stdlib.h>; with every other compiler
// they expand to the __builtin_bswap* builtins.
#if defined(_MSC_VER)
#include <stdlib.h>
#define bswap_16(x) _byteswap_ushort(x)
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)
#else
#define bswap_16(x) __builtin_bswap16(x)
#define bswap_32(x) __builtin_bswap32(x)
#define bswap_64(x) __builtin_bswap64(x)
#endif
17
18using namespace simdjson;
19
20namespace {
21
22class CborReader {
23public:
24 const uint8_t* current;
25 const uint8_t* end;
26 dom::document& doc;
27 size_t tape_idx;
28 uint8_t* current_string_buf;
29 const uint8_t* string_buf_start;
30
31 CborReader(const uint8_t* buf, size_t len, dom::document& d)
32 : current(buf), end(buf + len), doc(d), tape_idx(0),
33 current_string_buf(d.string_buf.get()),
34 string_buf_start(d.string_buf.get()) {}
35
36 void append_tape(uint64_t val, internal::tape_type type) {
37 doc.tape[tape_idx++] = val | (uint64_t(type) << 56);
38 }
39
40 void append_tape_value(uint64_t val) {
41 doc.tape[tape_idx++] = val;
42 }
43
44 size_t reserve_tape() {
45 return tape_idx++;
46 }
47
48 void set_tape(size_t idx, uint64_t val, internal::tape_type type) {
49 doc.tape[idx] = val | (uint64_t(type) << 56);
50 }
51
52 error_code parse_root() {
53 size_t root_start = reserve_tape();
54
55 error_code ec = parse_item();
56 if (ec != SUCCESS) return ec;
57
58 size_t root_end = reserve_tape();
59
60 set_tape(root_start, root_end, internal::tape_type::ROOT);
61 set_tape(root_end, root_start, internal::tape_type::ROOT);
62
63 return SUCCESS;
64 }
65
66 error_code parse_item() {
67 if (current >= end) return EMPTY;
68
69 uint8_t initial = *current++;
70 uint8_t major = initial >> 5;
71 uint8_t additional = initial & 0x1F;
72
73 switch (major) {
74 case 0: return parse_uint(additional);
75 case 1: return parse_nint(additional);
76 case 2: return parse_byte_string(additional);
77 case 3: return parse_text_string(additional);
78 case 4: return parse_array(additional);
79 case 5: return parse_map(additional);
80 case 6: return parse_item();
81 case 7: return parse_float_simple(additional);
82 default: return UNEXPECTED_ERROR;
83 }
84 }
85
86private:
87 uint64_t read_uint(uint8_t additional, error_code& ec) {
88 if (additional < 24) {
89 return additional;
90 } else if (additional == 24) {
91 if (current + 1 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; }
92 uint8_t v = *current++;
93 return v;
94 } else if (additional == 25) {
95 if (current + 2 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; }
96 uint16_t v;
97 memcpy(&v, current, 2);
98 current += 2;
99 return bswap_16(v);
100 } else if (additional == 26) {
101 if (current + 4 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; }
102 uint32_t v;
103 memcpy(&v, current, 4);
104 current += 4;
105 return bswap_32(v);
106 } else if (additional == 27) {
107 if (current + 8 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; }
108 uint64_t v;
109 memcpy(&v, current, 8);
110 current += 8;
111 return bswap_64(v);
112 } else {
113 ec = UNEXPECTED_ERROR;
114 return 0;
115 }
116 }
117
118 error_code parse_uint(uint8_t additional) {
119 error_code ec = SUCCESS;
120 uint64_t val = read_uint(additional, ec);
121 if (ec != SUCCESS) return ec;
122
123 append_tape(0, internal::tape_type::UINT64);
124 append_tape_value(val);
125 return SUCCESS;
126 }
127
128 error_code parse_nint(uint8_t additional) {
129 error_code ec = SUCCESS;
130 uint64_t val = read_uint(additional, ec);
131 if (ec != SUCCESS) return ec;
132
133 append_tape(0, internal::tape_type::INT64);
134 int64_t nval = -1 - int64_t(val);
135 append_tape_value((uint64_t)nval);
136 return SUCCESS;
137 }
138
139 error_code parse_byte_string(uint8_t additional) {
140 if (additional == 31) return UNEXPECTED_ERROR;
141
142 error_code ec = SUCCESS;
143 uint64_t len = read_uint(additional, ec);
144 if (ec != SUCCESS) return ec;
145
146 if (current + len > end) return INDEX_OUT_OF_BOUNDS;
147
148 return write_string(current, len);
149 }
150
151 error_code parse_text_string(uint8_t additional) {
152 if (additional == 31) {
153 size_t offset = current_string_buf - string_buf_start;
154 uint8_t* len_ptr = current_string_buf;
155 current_string_buf += sizeof(uint32_t);
156 size_t total_len = 0;
157
158 while (true) {
159 if (current >= end) return UNEXPECTED_ERROR;
160 if (*current == 0xFF) {
161 current++;
162 break;
163 }
164 uint8_t chunk_initial = *current++;
165 if ((chunk_initial >> 5) != 3) return INCORRECT_TYPE;
166
167 error_code ec = SUCCESS;
168 uint64_t chunk_len = read_uint(chunk_initial & 0x1F, ec);
169 if (ec != SUCCESS) return ec;
170 if (current + chunk_len > end) return INDEX_OUT_OF_BOUNDS;
171
172 if (!simdjson::validate_utf8((const char*)current, chunk_len)) {
173 return UTF8_ERROR;
174 }
175
176 memcpy(current_string_buf, current, chunk_len);
177 current_string_buf += chunk_len;
178 current += chunk_len;
179 total_len += chunk_len;
180 }
181
182 *current_string_buf++ = 0;
183 uint32_t len32 = (uint32_t)total_len;
184 memcpy(len_ptr, &len32, sizeof(uint32_t));
185 append_tape(offset, internal::tape_type::STRING);
186 return SUCCESS;
187 }
188
189 error_code ec = SUCCESS;
190 uint64_t len = read_uint(additional, ec);
191 if (ec != SUCCESS) return ec;
192
193 if (current + len > end) return INDEX_OUT_OF_BOUNDS;
194
195 if (!simdjson::validate_utf8((const char*)current, len)) {
196 return UTF8_ERROR;
197 }
198
199 return write_string(current, len);
200 }
201
202 error_code write_string(const uint8_t* ptr, size_t len) {
203 uint32_t len32 = (uint32_t)len;
204 size_t offset = current_string_buf - string_buf_start;
205
206 append_tape(offset, internal::tape_type::STRING);
207
208 memcpy(current_string_buf, &len32, sizeof(uint32_t));
209 current_string_buf += sizeof(uint32_t);
210
211 memcpy(current_string_buf, ptr, len);
212 current_string_buf += len;
213
214 *current_string_buf++ = 0;
215 current += len;
216
217 return SUCCESS;
218 }
219
220 error_code parse_array(uint8_t additional) {
221 size_t start_idx = reserve_tape();
222 uint64_t count = 0;
223
224 if (additional == 31) {
225 while (true) {
226 if (current >= end) return UNEXPECTED_ERROR;
227 if (*current == 0xFF) {
228 current++;
229 break;
230 }
231 error_code ec = parse_item();
232 if (ec != SUCCESS) return ec;
233 count++;
234 }
235 } else {
236 error_code ec = SUCCESS;
237 count = read_uint(additional, ec);
238 if (ec != SUCCESS) return ec;
239
240 for (uint64_t i = 0; i < count; ++i) {
241 ec = parse_item();
242 if (ec != SUCCESS) return ec;
243 }
244 }
245
246 size_t end_idx = reserve_tape();
247 size_t next_idx = end_idx + 1;
248
249 uint64_t start_payload = next_idx | (count << 32);
250 doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_ARRAY) << 56);
251 doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_ARRAY) << 56);
252
253 return SUCCESS;
254 }
255
256 error_code parse_map(uint8_t additional) {
257 size_t start_idx = reserve_tape();
258 uint64_t count = 0;
259
260 if (additional == 31) {
261 while (true) {
262 if (current >= end) return UNEXPECTED_ERROR;
263 if (*current == 0xFF) {
264 current++;
265 break;
266 }
267
268 error_code ec = parse_key();
269 if (ec != SUCCESS) return ec;
270
271 ec = parse_item();
272 if (ec != SUCCESS) return ec;
273 count++;
274 }
275 } else {
276 error_code ec = SUCCESS;
277 count = read_uint(additional, ec);
278 if (ec != SUCCESS) return ec;
279
280 for (uint64_t i = 0; i < count; ++i) {
281 ec = parse_key();
282 if (ec != SUCCESS) return ec;
283
284 ec = parse_item();
285 if (ec != SUCCESS) return ec;
286 }
287 }
288
289 size_t end_idx = reserve_tape();
290 size_t next_idx = end_idx + 1;
291
292 uint64_t start_payload = next_idx | (count << 32);
293 doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_OBJECT) << 56);
294 doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_OBJECT) << 56);
295
296 return SUCCESS;
297 }
298
299 error_code parse_key() {
300 if (current >= end) return EMPTY;
301
302 uint8_t initial = *current;
303 uint8_t major = initial >> 5;
304
305 if (major == 3) {
306 current++;
307 return parse_text_string(initial & 0x1F);
308 } else if (major == 2) {
309 current++;
310 return parse_byte_string(initial & 0x1F);
311 } else {
312 return INCORRECT_TYPE;
313 }
314 }
315
316 error_code parse_float_simple(uint8_t additional) {
317 if (additional < 20) {
318 return UNEXPECTED_ERROR;
319 }
320 switch (additional) {
321 case 20:
322 append_tape(0, internal::tape_type::FALSE_VALUE);
323 return SUCCESS;
324 case 21:
325 append_tape(0, internal::tape_type::TRUE_VALUE);
326 return SUCCESS;
327 case 22:
328 append_tape(0, internal::tape_type::NULL_VALUE);
329 return SUCCESS;
330 case 23:
331 append_tape(0, internal::tape_type::NULL_VALUE);
332 return SUCCESS;
333 case 24:
334 if (current + 1 > end) return INDEX_OUT_OF_BOUNDS;
335 current++;
336 return UNEXPECTED_ERROR;
337 case 25: {
338 if (current + 2 > end) return INDEX_OUT_OF_BOUNDS;
339 uint16_t v;
340 memcpy(&v, current, 2);
341 current += 2;
342 v = bswap_16(v);
343
344 uint32_t sign = (v >> 15) & 1;
345 uint32_t exp = (v >> 10) & 0x1F;
346 uint32_t mant = v & 0x3FF;
347
348 double d;
349 if (exp == 0) {
350 d = std::ldexp(mant, -24);
351 } else if (exp == 31) {
352 d = (mant == 0) ? INFINITY : NAN;
353 } else {
354 d = std::ldexp(mant + 1024, exp - 25);
355 }
356 if (sign) d = -d;
357
358 append_tape(0, internal::tape_type::DOUBLE);
359 uint64_t d_as_u64;
360 memcpy(&d_as_u64, &d, 8);
361 append_tape_value(d_as_u64);
362 return SUCCESS;
363 }
364 case 26: {
365 if (current + 4 > end) return INDEX_OUT_OF_BOUNDS;
366 uint32_t v;
367 memcpy(&v, current, 4);
368 current += 4;
369 v = bswap_32(v);
370 float f;
371 memcpy(&f, &v, 4);
372
373 append_tape(0, internal::tape_type::DOUBLE);
374 double d = f;
375 uint64_t d_as_u64;
376 memcpy(&d_as_u64, &d, 8);
377 append_tape_value(d_as_u64);
378 return SUCCESS;
379 }
380 case 27: {
381 if (current + 8 > end) return INDEX_OUT_OF_BOUNDS;
382 uint64_t v;
383 memcpy(&v, current, 8);
384 current += 8;
385 v = bswap_64(v);
386
387 append_tape(0, internal::tape_type::DOUBLE);
388 append_tape_value(v);
389 return SUCCESS;
390 }
391 default:
392 return UNEXPECTED_ERROR;
393 }
394 }
395};
396
397}
398
399simdjson::error_code simdcbor::parse(const uint8_t* buf, size_t len, simdjson::dom::parser& parser, size_t& bytes_read) {
400 auto err = parser.doc.allocate(len * 8 + 4096);
401 if (err != SUCCESS) return err;
402
403 CborReader reader(buf, len, parser.doc);
404 err = reader.parse_root();
405 bytes_read = reader.current - buf;
406 return err;
407}
408
409simdjson::dom::element simdcbor::get_root(simdjson::dom::parser& parser) {
410 return parser.doc.root();
411}