simdjson bindings with streaming support
at main 13 kB view raw
1#include "simdcbor.hh" 2#include <cmath> 3#include <cstring> 4#include <vector> 5#include <stdexcept> 6 7#if defined(_MSC_VER) 8#include <stdlib.h> 9#define bswap_16(x) _byteswap_ushort(x) 10#define bswap_32(x) _byteswap_ulong(x) 11#define bswap_64(x) _byteswap_uint64(x) 12#else 13#define bswap_16(x) __builtin_bswap16(x) 14#define bswap_32(x) __builtin_bswap32(x) 15#define bswap_64(x) __builtin_bswap64(x) 16#endif 17 18using namespace simdjson; 19 20namespace { 21 22class CborReader { 23public: 24 const uint8_t* current; 25 const uint8_t* end; 26 dom::document& doc; 27 size_t tape_idx; 28 uint8_t* current_string_buf; 29 const uint8_t* string_buf_start; 30 31 CborReader(const uint8_t* buf, size_t len, dom::document& d) 32 : current(buf), end(buf + len), doc(d), tape_idx(0), 33 current_string_buf(d.string_buf.get()), 34 string_buf_start(d.string_buf.get()) {} 35 36 void append_tape(uint64_t val, internal::tape_type type) { 37 doc.tape[tape_idx++] = val | (uint64_t(type) << 56); 38 } 39 40 void append_tape_value(uint64_t val) { 41 doc.tape[tape_idx++] = val; 42 } 43 44 size_t reserve_tape() { 45 return tape_idx++; 46 } 47 48 void set_tape(size_t idx, uint64_t val, internal::tape_type type) { 49 doc.tape[idx] = val | (uint64_t(type) << 56); 50 } 51 52 error_code parse_root() { 53 size_t root_start = reserve_tape(); 54 55 error_code ec = parse_item(); 56 if (ec != SUCCESS) return ec; 57 58 size_t root_end = reserve_tape(); 59 60 set_tape(root_start, root_end, internal::tape_type::ROOT); 61 set_tape(root_end, root_start, internal::tape_type::ROOT); 62 63 return SUCCESS; 64 } 65 66 error_code parse_item() { 67 if (current >= end) return EMPTY; 68 69 uint8_t initial = *current++; 70 uint8_t major = initial >> 5; 71 uint8_t additional = initial & 0x1F; 72 73 switch (major) { 74 case 0: return parse_uint(additional); 75 case 1: return parse_nint(additional); 76 case 2: return parse_byte_string(additional); 77 case 3: return parse_text_string(additional); 78 case 4: return parse_array(additional); 79 case 5: return parse_map(additional); 80 case 6: return parse_item(); 81 case 7: return parse_float_simple(additional); 82 default: return UNEXPECTED_ERROR; 83 } 84 } 85 86private: 87 uint64_t read_uint(uint8_t additional, error_code& ec) { 88 if (additional < 24) { 89 return additional; 90 } else if (additional == 24) { 91 if (current + 1 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 92 uint8_t v = *current++; 93 return v; 94 } else if (additional == 25) { 95 if (current + 2 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 96 uint16_t v; 97 memcpy(&v, current, 2); 98 current += 2; 99 return bswap_16(v); 100 } else if (additional == 26) { 101 if (current + 4 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 102 uint32_t v; 103 memcpy(&v, current, 4); 104 current += 4; 105 return bswap_32(v); 106 } else if (additional == 27) { 107 if (current + 8 > end) { ec = INDEX_OUT_OF_BOUNDS; return 0; } 108 uint64_t v; 109 memcpy(&v, current, 8); 110 current += 8; 111 return bswap_64(v); 112 } else { 113 ec = UNEXPECTED_ERROR; 114 return 0; 115 } 116 } 117 118 error_code parse_uint(uint8_t additional) { 119 error_code ec = SUCCESS; 120 uint64_t val = read_uint(additional, ec); 121 if (ec != SUCCESS) return ec; 122 123 append_tape(0, internal::tape_type::UINT64); 124 append_tape_value(val); 125 return SUCCESS; 126 } 127 128 error_code parse_nint(uint8_t additional) { 129 error_code ec = SUCCESS; 130 uint64_t val = read_uint(additional, ec); 131 if (ec != SUCCESS) return ec; 132 133 append_tape(0, internal::tape_type::INT64); 134 int64_t nval = -1 - int64_t(val); 135 append_tape_value((uint64_t)nval); 136 return SUCCESS; 137 } 138 139 error_code parse_byte_string(uint8_t additional) { 140 if (additional == 31) return UNEXPECTED_ERROR; 141 142 error_code ec = SUCCESS; 143 uint64_t len = read_uint(additional, ec); 144 if (ec != SUCCESS) return ec; 145 146 if (current + len > end) return INDEX_OUT_OF_BOUNDS; 147 148 return write_string(current, len); 149 } 150 151 error_code parse_text_string(uint8_t additional) { 152 if (additional == 31) { 153 size_t offset = current_string_buf - string_buf_start; 154 uint8_t* len_ptr = current_string_buf; 155 current_string_buf += sizeof(uint32_t); 156 size_t total_len = 0; 157 158 while (true) { 159 if (current >= end) return UNEXPECTED_ERROR; 160 if (*current == 0xFF) { 161 current++; 162 break; 163 } 164 uint8_t chunk_initial = *current++; 165 if ((chunk_initial >> 5) != 3) return INCORRECT_TYPE; 166 167 error_code ec = SUCCESS; 168 uint64_t chunk_len = read_uint(chunk_initial & 0x1F, ec); 169 if (ec != SUCCESS) return ec; 170 if (current + chunk_len > end) return INDEX_OUT_OF_BOUNDS; 171 172 if (!simdjson::validate_utf8((const char*)current, chunk_len)) { 173 return UTF8_ERROR; 174 } 175 176 memcpy(current_string_buf, current, chunk_len); 177 current_string_buf += chunk_len; 178 current += chunk_len; 179 total_len += chunk_len; 180 } 181 182 *current_string_buf++ = 0; 183 uint32_t len32 = (uint32_t)total_len; 184 memcpy(len_ptr, &len32, sizeof(uint32_t)); 185 append_tape(offset, internal::tape_type::STRING); 186 return SUCCESS; 187 } 188 189 error_code ec = SUCCESS; 190 uint64_t len = read_uint(additional, ec); 191 if (ec != SUCCESS) return ec; 192 193 if (current + len > end) return INDEX_OUT_OF_BOUNDS; 194 195 if (!simdjson::validate_utf8((const char*)current, len)) { 196 return UTF8_ERROR; 197 } 198 199 return write_string(current, len); 200 } 201 202 error_code write_string(const uint8_t* ptr, size_t len) { 203 uint32_t len32 = (uint32_t)len; 204 size_t offset = current_string_buf - string_buf_start; 205 206 append_tape(offset, internal::tape_type::STRING); 207 208 memcpy(current_string_buf, &len32, sizeof(uint32_t)); 209 current_string_buf += sizeof(uint32_t); 210 211 memcpy(current_string_buf, ptr, len); 212 current_string_buf += len; 213 214 *current_string_buf++ = 0; 215 current += len; 216 217 return SUCCESS; 218 } 219 220 error_code parse_array(uint8_t additional) { 221 size_t start_idx = reserve_tape(); 222 uint64_t count = 0; 223 224 if (additional == 31) { 225 while (true) { 226 if (current >= end) return UNEXPECTED_ERROR; 227 if (*current == 0xFF) { 228 current++; 229 break; 230 } 231 error_code ec = parse_item(); 232 if (ec != SUCCESS) return ec; 233 count++; 234 } 235 } else { 236 error_code ec = SUCCESS; 237 count = read_uint(additional, ec); 238 if (ec != SUCCESS) return ec; 239 240 for (uint64_t i = 0; i < count; ++i) { 241 ec = parse_item(); 242 if (ec != SUCCESS) return ec; 243 } 244 } 245 246 size_t end_idx = reserve_tape(); 247 size_t next_idx = end_idx + 1; 248 249 uint64_t start_payload = next_idx | (count << 32); 250 doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_ARRAY) << 56); 251 doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_ARRAY) << 56); 252 253 return SUCCESS; 254 } 255 256 error_code parse_map(uint8_t additional) { 257 size_t start_idx = reserve_tape(); 258 uint64_t count = 0; 259 260 if (additional == 31) { 261 while (true) { 262 if (current >= end) return UNEXPECTED_ERROR; 263 if (*current == 0xFF) { 264 current++; 265 break; 266 } 267 268 error_code ec = parse_key(); 269 if (ec != SUCCESS) return ec; 270 271 ec = parse_item(); 272 if (ec != SUCCESS) return ec; 273 count++; 274 } 275 } else { 276 error_code ec = SUCCESS; 277 count = read_uint(additional, ec); 278 if (ec != SUCCESS) return ec; 279 280 for (uint64_t i = 0; i < count; ++i) { 281 ec = parse_key(); 282 if (ec != SUCCESS) return ec; 283 284 ec = parse_item(); 285 if (ec != SUCCESS) return ec; 286 } 287 } 288 289 size_t end_idx = reserve_tape(); 290 size_t next_idx = end_idx + 1; 291 292 uint64_t start_payload = next_idx | (count << 32); 293 doc.tape[start_idx] = start_payload | (uint64_t(internal::tape_type::START_OBJECT) << 56); 294 doc.tape[end_idx] = start_idx | (uint64_t(internal::tape_type::END_OBJECT) << 56); 295 296 return SUCCESS; 297 } 298 299 error_code parse_key() { 300 if (current >= end) return EMPTY; 301 302 uint8_t initial = *current; 303 uint8_t major = initial >> 5; 304 305 if (major == 3) { 306 current++; 307 return parse_text_string(initial & 0x1F); 308 } else if (major == 2) { 309 current++; 310 return parse_byte_string(initial & 0x1F); 311 } else { 312 return INCORRECT_TYPE; 313 } 314 } 315 316 error_code parse_float_simple(uint8_t additional) { 317 if (additional < 20) { 318 return UNEXPECTED_ERROR; 319 } 320 switch (additional) { 321 case 20: 322 append_tape(0, internal::tape_type::FALSE_VALUE); 323 return SUCCESS; 324 case 21: 325 append_tape(0, internal::tape_type::TRUE_VALUE); 326 return SUCCESS; 327 case 22: 328 append_tape(0, internal::tape_type::NULL_VALUE); 329 return SUCCESS; 330 case 23: 331 append_tape(0, internal::tape_type::NULL_VALUE); 332 return SUCCESS; 333 case 24: 334 if (current + 1 > end) return INDEX_OUT_OF_BOUNDS; 335 current++; 336 return UNEXPECTED_ERROR; 337 case 25: { 338 if (current + 2 > end) return INDEX_OUT_OF_BOUNDS; 339 uint16_t v; 340 memcpy(&v, current, 2); 341 current += 2; 342 v = bswap_16(v); 343 344 uint32_t sign = (v >> 15) & 1; 345 uint32_t exp = (v >> 10) & 0x1F; 346 uint32_t mant = v & 0x3FF; 347 348 double d; 349 if (exp == 0) { 350 d = std::ldexp(mant, -24); 351 } else if (exp == 31) { 352 d = (mant == 0) ? INFINITY : NAN; 353 } else { 354 d = std::ldexp(mant + 1024, exp - 25); 355 } 356 if (sign) d = -d; 357 358 append_tape(0, internal::tape_type::DOUBLE); 359 uint64_t d_as_u64; 360 memcpy(&d_as_u64, &d, 8); 361 append_tape_value(d_as_u64); 362 return SUCCESS; 363 } 364 case 26: { 365 if (current + 4 > end) return INDEX_OUT_OF_BOUNDS; 366 uint32_t v; 367 memcpy(&v, current, 4); 368 current += 4; 369 v = bswap_32(v); 370 float f; 371 memcpy(&f, &v, 4); 372 373 append_tape(0, internal::tape_type::DOUBLE); 374 double d = f; 375 uint64_t d_as_u64; 376 memcpy(&d_as_u64, &d, 8); 377 append_tape_value(d_as_u64); 378 return SUCCESS; 379 } 380 case 27: { 381 if (current + 8 > end) return INDEX_OUT_OF_BOUNDS; 382 uint64_t v; 383 memcpy(&v, current, 8); 384 current += 8; 385 v = bswap_64(v); 386 387 append_tape(0, internal::tape_type::DOUBLE); 388 append_tape_value(v); 389 return SUCCESS; 390 } 391 default: 392 return UNEXPECTED_ERROR; 393 } 394 } 395}; 396 397} 398 399simdjson::error_code simdcbor::parse(const uint8_t* buf, size_t len, simdjson::dom::parser& parser, size_t& bytes_read) { 400 auto err = parser.doc.allocate(len * 8 + 4096); 401 if (err != SUCCESS) return err; 402 403 CborReader reader(buf, len, parser.doc); 404 err = reader.parse_root(); 405 bytes_read = reader.current - buf; 406 return err; 407} 408 409simdjson::dom::element simdcbor::get_root(simdjson::dom::parser& parser) { 410 return parser.doc.root(); 411}