Add UTF8 string functions · huwcampbell.com/icicle@ff81d31

+3 -3

data/example/demographics.psv

··· 18 18 maggie|age|2|1989-12-17 19 19 maggie|gender|f|1989-12-17 20 20 homer|injury|{"location":"head","severity":2}"|1989-12-17 21 - homer|injury|{"location":"head","severity":1}"|1990-01-01 22 - homer|injury|{"location":"arm","severity":4}"|1994-01-01 23 - homer|injury|{"location":"torso","severity":3}"|1999-01-01 21 + homer|injury|{"location":"heAd","severity":1}"|1990-01-01 22 + homer|injury|{"location":"aRm","severity":4}"|1994-01-01 23 + homer|injury|{"location":"TORSO","severity":3}"|1999-01-01 24 24 homer|injury|{"location":"torso","severity":1,"admitted":"2010-01-02"}"|2010-01-01

+1 -1

data/sea/20-simple.h

··· 52 52 static idouble_t INLINE idouble_cosh (idouble_t x) { return cosh(x); } 53 53 static idouble_t INLINE idouble_div (idouble_t x, idouble_t y) { return x / y; } 54 54 static idouble_t INLINE idouble_exp (idouble_t x) { return exp(x); } 55 - static idouble_t INLINE idouble_is_valid(idouble_t x) { return isfinite(x); } 55 + static ibool_t INLINE idouble_is_valid(idouble_t x) { return isfinite(x); } 56 56 static idouble_t INLINE idouble_log (idouble_t x) { return log(x); } 57 57 static idouble_t INLINE idouble_mul (idouble_t x, idouble_t y) { return x * y; } 58 58 static idouble_t INLINE idouble_neg (idouble_t x) { return -x; }

+463

data/sea/22-utf8.h

··· 1 + #include "21-time.h" 2 + 3 + // Check if the string is a valid utf8 string. 4 + static ibool_t istring_is_valid(const istring_t str) { 5 + const char *s = (const istring_t) str; 6 + 7 + while ('\0' != *s) { 8 + if (0xf0 == (0xf8 & *s)) { 9 + // ensure each of the 3 following bytes in this 4-byte 10 + // utf8 codepoint began with 0b10xxxxxx 11 + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) || 12 + (0x80 != (0xc0 & s[3]))) { 13 + return ifalse; 14 + } 15 + 16 + // ensure that our utf8 codepoint ended after 4 bytes 17 + if (0x80 == (0xc0 & s[4])) { 18 + return ifalse; 19 + } 20 + 21 + // ensure that the top 5 bits of this 4-byte utf8 22 + // codepoint were not 0, as then we could have used 23 + // one of the smaller encodings 24 + if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) { 25 + return ifalse; 26 + } 27 + 28 + // 4-byte utf8 code point (began with 0b11110xxx) 29 + s += 4; 30 + } else if (0xe0 == (0xf0 & *s)) { 31 + // ensure each of the 2 following bytes in this 3-byte 32 + // utf8 codepoint began with 0b10xxxxxx 33 + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) { 34 + return ifalse; 35 + } 36 + 37 + // ensure that our utf8 codepoint ended after 3 bytes 38 + if (0x80 == (0xc0 & s[3])) { 39 + return ifalse; 40 + } 41 + 42 + // ensure that the top 5 bits of this 3-byte utf8 43 + // codepoint were not 0, as then we could have used 44 + // one of the smaller encodings 45 + if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) { 46 + return ifalse; 47 + } 48 + 49 + // 3-byte utf8 code point (began with 0b1110xxxx) 50 + s += 3; 51 + } else if (0xc0 == (0xe0 & *s)) { 52 + // ensure the 1 following byte in this 2-byte 53 + // utf8 codepoint began with 0b10xxxxxx 54 + if (0x80 != (0xc0 & s[1])) { 55 + return ifalse; 56 + } 57 + 58 + // ensure that our utf8 codepoint ended after 2 bytes 59 + if (0x80 == (0xc0 & s[2])) { 60 + return ifalse; 61 + } 62 + 63 + // ensure that the top 4 bits of this 2-byte utf8 64 + // codepoint were not 0, as then we could have used 65 + // one of the smaller encodings 66 + if (0 == (0x1e & s[0])) { 67 + return ifalse; 68 + } 69 + 70 + // 2-byte utf8 code point (began with 0b110xxxxx) 71 + s += 2; 72 + } else if (0x00 == (0x80 & *s)) { 73 + // 1-byte ascii (began with 0b0xxxxxxx) 74 + s += 1; 75 + } else { 76 + // we have an invalid 0b1xxxxxxx utf8 code point entry 77 + return ifalse; 78 + } 79 + } 80 + 81 + return itrue; 82 + } 83 + 84 + // Calculate the length of a string. This counts 85 + // code points, so modifiers *will* be counted. as 86 + // characters. 87 + static iint_t istring_length(const istring_t str) { 88 + const unsigned char *s = (const unsigned char *)str; 89 + iint_t length = 0; 90 + 91 + while ('\0' != *s) { 92 + if (0xf0 == (0xf8 & *s)) { 93 + // 4-byte utf8 code point (began with 0b11110xxx) 94 + s += 4; 95 + } else if (0xe0 == (0xf0 & *s)) { 96 + // 3-byte utf8 code point (began with 0b1110xxxx) 97 + s += 3; 98 + } else if (0xc0 == (0xe0 & *s)) { 99 + // 2-byte utf8 code point (began with 0b110xxxxx) 100 + s += 2; 101 + } else { // if (0x00 == (0x80 & *s)) { 102 + // 1-byte ascii (began with 0b0xxxxxxx) 103 + s += 1; 104 + } 105 + 106 + // no matter the bytes we marched s forward by, it was 107 + // only 1 utf8 codepoint 108 + length++; 109 + } 110 + 111 + return length; 112 + } 113 + 114 + // Sets out_codepoint to the next utf8 codepoint in str, 115 + // and returns the address of the utf8 codepoint after the current one in str. 116 + static istring_t utf8codepoint(const istring_t str, 117 + int32_t * out_codepoint) { 118 + const char *s = (const char *)str; 119 + 120 + if (0xf0 == (0xf8 & s[0])) { 121 + // 4 byte utf8 codepoint 122 + *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | 123 + ((0x3f & s[2]) << 6) | (0x3f & s[3]); 124 + s += 4; 125 + } else if (0xe0 == (0xf0 & s[0])) { 126 + // 3 byte utf8 codepoint 127 + *out_codepoint = 128 + ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]); 129 + s += 3; 130 + } else if (0xc0 == (0xe0 & s[0])) { 131 + // 2 byte utf8 codepoint 132 + *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]); 133 + s += 2; 134 + } else { 135 + // 1 byte utf8 codepoint otherwise 136 + *out_codepoint = s[0]; 137 + s += 1; 138 + } 139 + 140 + return s; 141 + } 142 + 143 + // Return the size of a codepoint in bytes 144 + static size_t utf8codepointsize(int32_t chr) { 145 + if (0 == ((int32_t)0xffffff80 & chr)) { 146 + return 1; 147 + } else if (0 == ((int32_t)0xfffff800 & chr)) { 148 + return 2; 149 + } else if (0 == ((int32_t)0xffff0000 & chr)) { 150 + return 3; 151 + } else { // if (0 == ((int)0xffe00000 & chr)) { 152 + return 4; 153 + } 154 + } 155 + 156 + static istring_t utf8catcodepoint(const istring_t str, int32_t chr, size_t n) { 157 + char *s = (char *)str; 158 + 159 + if (0 == ((int32_t)0xffffff80 & chr)) { 160 + // 1-byte/7-bit ascii 161 + // (0b0xxxxxxx) 162 + if (n < 1) { 163 + return NULL; 164 + } 165 + s[0] = (char)chr; 166 + s += 1; 167 + } else if (0 == ((int32_t)0xfffff800 & chr)) { 168 + // 2-byte/11-bit utf8 code point 169 + // (0b110xxxxx 0b10xxxxxx) 170 + if (n < 2) { 171 + return NULL; 172 + } 173 + s[0] = 0xc0 | (char)(chr >> 6); 174 + s[1] = 0x80 | (char)(chr & 0x3f); 175 + s += 2; 176 + } else if (0 == ((int32_t)0xffff0000 & chr)) { 177 + // 3-byte/16-bit utf8 code point 178 + // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) 179 + if (n < 3) { 180 + return NULL; 181 + } 182 + s[0] = 0xe0 | (char)(chr >> 12); 183 + s[1] = 0x80 | (char)((chr >> 6) & 0x3f); 184 + s[2] = 0x80 | (char)(chr & 0x3f); 185 + s += 3; 186 + } else { // if (0 == ((int)0xffe00000 & chr)) { 187 + // 4-byte/21-bit utf8 code point 188 + // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) 189 + if (n < 4) { 190 + return NULL; 191 + } 192 + s[0] = 0xf0 | (char)(chr >> 18); 193 + s[1] = 0x80 | (char)((chr >> 12) & 0x3f); 194 + s[2] = 0x80 | (char)((chr >> 6) & 0x3f); 195 + s[3] = 0x80 | (char)(chr & 0x3f); 196 + s += 4; 197 + } 198 + 199 + return s; 200 + } 201 + 202 + // Make a codepoint lower case if possible. 203 + static int32_t utf8lwrcodepoint(int32_t cp) { 204 + if (((0x0041 <= cp) && (0x005a >= cp)) || 205 + ((0x00c0 <= cp) && (0x00d6 >= cp)) || 206 + ((0x00d8 <= cp) && (0x00de >= cp)) || 207 + ((0x0391 <= cp) && (0x03a1 >= cp)) || 208 + ((0x03a3 <= cp) && (0x03ab >= cp))) { 209 + cp += 32; 210 + } else if (((0x0100 <= cp) && (0x012f >= cp)) || 211 + ((0x0132 <= cp) && (0x0137 >= cp)) || 212 + ((0x014a <= cp) && (0x0177 >= cp)) || 213 + ((0x0182 <= cp) && (0x0185 >= cp)) || 214 + ((0x01a0 <= cp) && (0x01a5 >= cp)) || 215 + ((0x01de <= cp) && (0x01ef >= cp)) || 216 + ((0x01f8 <= cp) && (0x021f >= cp)) || 217 + ((0x0222 <= cp) && (0x0233 >= cp)) || 218 + ((0x0246 <= cp) && (0x024f >= cp)) || 219 + ((0x03d8 <= cp) && (0x03ef >= cp))) { 220 + cp |= 0x1; 221 + } else if (((0x0139 <= cp) && (0x0148 >= cp)) || 222 + ((0x0179 <= cp) && (0x017e >= cp)) || 223 + ((0x01af <= cp) && (0x01b0 >= cp)) || 224 + ((0x01b3 <= cp) && (0x01b6 >= cp)) || 225 + ((0x01cd <= cp) && (0x01dc >= cp))) { 226 + cp += 1; 227 + cp &= ~0x1; 228 + } else { 229 + switch (cp) { 230 + default: break; 231 + case 0x0178: cp = 0x00ff; break; 232 + case 0x0243: cp = 0x0180; break; 233 + case 0x018e: cp = 0x01dd; break; 234 + case 0x023d: cp = 0x019a; break; 235 + case 0x0220: cp = 0x019e; break; 236 + case 0x01b7: cp = 0x0292; break; 237 + case 0x01c4: cp = 0x01c6; break; 238 + case 0x01c7: cp = 0x01c9; break; 239 + case 0x01ca: cp = 0x01cc; break; 240 + case 0x01f1: cp = 0x01f3; break; 241 + case 0x01f7: cp = 0x01bf; break; 242 + case 0x0187: cp = 0x0188; break; 243 + case 0x018b: cp = 0x018c; break; 244 + case 0x0191: cp = 0x0192; break; 245 + case 0x0198: cp = 0x0199; break; 246 + case 0x01a7: cp = 0x01a8; break; 247 + case 0x01ac: cp = 0x01ad; break; 248 + case 0x01af: cp = 0x01b0; break; 249 + case 0x01b8: cp = 0x01b9; break; 250 + case 0x01bc: cp = 0x01bd; break; 251 + case 0x01f4: cp = 0x01f5; break; 252 + case 0x023b: cp = 0x023c; break; 253 + case 0x0241: cp = 0x0242; break; 254 + case 0x03fd: cp = 0x037b; break; 255 + case 0x03fe: cp = 0x037c; break; 256 + case 0x03ff: cp = 0x037d; break; 257 + case 0x037f: cp = 0x03f3; break; 258 + case 0x0386: cp = 0x03ac; break; 259 + case 0x0388: cp = 0x03ad; break; 260 + case 0x0389: cp = 0x03ae; break; 261 + case 0x038a: cp = 0x03af; break; 262 + case 0x038c: cp = 0x03cc; break; 263 + case 0x038e: cp = 0x03cd; break; 264 + case 0x038f: cp = 0x03ce; break; 265 + case 0x0370: cp = 0x0371; break; 266 + case 0x0372: cp = 0x0373; break; 267 + case 0x0376: cp = 0x0377; break; 268 + case 0x03f4: cp = 0x03d1; break; 269 + case 0x03cf: cp = 0x03d7; break; 270 + case 0x03f9: cp = 0x03f2; break; 271 + case 0x03f7: cp = 0x03f8; break; 272 + case 0x03fa: cp = 0x03fb; break; 273 + }; 274 + } 275 + 276 + return cp; 277 + } 278 + 279 + 280 + static int32_t utf8uprcodepoint(int32_t cp) { 281 + if (((0x0061 <= cp) && (0x007a >= cp)) || 282 + ((0x00e0 <= cp) && (0x00f6 >= cp)) || 283 + ((0x00f8 <= cp) && (0x00fe >= cp)) || 284 + ((0x03b1 <= cp) && (0x03c1 >= cp)) || 285 + ((0x03c3 <= cp) && (0x03cb >= cp))) { 286 + cp -= 32; 287 + } else if (((0x0100 <= cp) && (0x012f >= cp)) || 288 + ((0x0132 <= cp) && (0x0137 >= cp)) || 289 + ((0x014a <= cp) && (0x0177 >= cp)) || 290 + ((0x0182 <= cp) && (0x0185 >= cp)) || 291 + ((0x01a0 <= cp) && (0x01a5 >= cp)) || 292 + ((0x01de <= cp) && (0x01ef >= cp)) || 293 + ((0x01f8 <= cp) && (0x021f >= cp)) || 294 + ((0x0222 <= cp) && (0x0233 >= cp)) || 295 + ((0x0246 <= cp) && (0x024f >= cp)) || 296 + ((0x03d8 <= cp) && (0x03ef >= cp))) { 297 + cp &= ~0x1; 298 + } else if (((0x0139 <= cp) && (0x0148 >= cp)) || 299 + ((0x0179 <= cp) && (0x017e >= cp)) || 300 + ((0x01af <= cp) && (0x01b0 >= cp)) || 301 + ((0x01b3 <= cp) && (0x01b6 >= cp)) || 302 + ((0x01cd <= cp) && (0x01dc >= cp))) { 303 + cp -= 1; 304 + cp |= 0x1; 305 + } else { 306 + switch (cp) { 307 + default: break; 308 + case 0x00ff: cp = 0x0178; break; 309 + case 0x0180: cp = 0x0243; break; 310 + case 0x01dd: cp = 0x018e; break; 311 + case 0x019a: cp = 0x023d; break; 312 + case 0x019e: cp = 0x0220; break; 313 + case 0x0292: cp = 0x01b7; break; 314 + case 0x01c6: cp = 0x01c4; break; 315 + case 0x01c9: cp = 0x01c7; break; 316 + case 0x01cc: cp = 0x01ca; break; 317 + case 0x01f3: cp = 0x01f1; break; 318 + case 0x01bf: cp = 0x01f7; break; 319 + case 0x0188: cp = 0x0187; break; 320 + case 0x018c: cp = 0x018b; break; 321 + case 0x0192: cp = 0x0191; break; 322 + case 0x0199: cp = 0x0198; break; 323 + case 0x01a8: cp = 0x01a7; break; 324 + case 0x01ad: cp = 0x01ac; break; 325 + case 0x01b0: cp = 0x01af; break; 326 + case 0x01b9: cp = 0x01b8; break; 327 + case 0x01bd: cp = 0x01bc; break; 328 + case 0x01f5: cp = 0x01f4; break; 329 + case 0x023c: cp = 0x023b; break; 330 + case 0x0242: cp = 0x0241; break; 331 + case 0x037b: cp = 0x03fd; break; 332 + case 0x037c: cp = 0x03fe; break; 333 + case 0x037d: cp = 0x03ff; break; 334 + case 0x03f3: cp = 0x037f; break; 335 + case 0x03ac: cp = 0x0386; break; 336 + case 0x03ad: cp = 0x0388; break; 337 + case 0x03ae: cp = 0x0389; break; 338 + case 0x03af: cp = 0x038a; break; 339 + case 0x03cc: cp = 0x038c; break; 340 + case 0x03cd: cp = 0x038e; break; 341 + case 0x03ce: cp = 0x038f; break; 342 + case 0x0371: cp = 0x0370; break; 343 + case 0x0373: cp = 0x0372; break; 344 + case 0x0377: cp = 0x0376; break; 345 + case 0x03d1: cp = 0x03f4; break; 346 + case 0x03d7: cp = 0x03cf; break; 347 + case 0x03f2: cp = 0x03f9; break; 348 + case 0x03f8: cp = 0x03f7; break; 349 + case 0x03fb: cp = 0x03fa; break; 350 + }; 351 + } 352 + 353 + return cp; 354 + } 355 + 356 + // Create a new icicle string with the case requested. 357 + static istring_t INLINE istring_to_case(anemone_mempool_t *into, int32_t (*change_codepoint)(int32_t), istring_t val) { 358 + // String pointers 359 + istring_t ret, watch, next; 360 + // String pointers use in the slow path (see below). 361 + istring_t slowret, slowwatch; 362 + 363 + // pointer to the write location 364 + char* work; 365 + 366 + // current code point 367 + int32_t cp; 368 + 369 + // Size variables. 370 + size_t val_size, written, remaining; 371 + 372 + val_size = strlen(val); 373 + ret = (istring_t) anemone_mempool_alloc (into, val_size + 1); 374 + written = 0; 375 + work = (char*) ret; 376 + watch = val; 377 + next = utf8codepoint(watch, &cp); 378 + 379 + // Loop through the input string's code 380 + // points, converting them to the required 381 + // case, and writing them to the output. 382 + while (cp != 0) { 383 + const int32_t lwr_cp = change_codepoint(cp); 384 + const size_t size = utf8codepointsize(lwr_cp); 385 + 386 + // Go to the slow path if we would 387 + // otherwise overrun the buffer. 388 + if (written + size > val_size) 389 + goto slowpath; 390 + 391 + // Write the code point and get a 392 + // new work pointer. 393 + work = (char*) utf8catcodepoint(work, lwr_cp, size); 394 + watch = next; 395 + next = utf8codepoint(watch, &cp); 396 + written += size; 397 + } 398 + 399 + *work = (char) 0; 400 + return ret; 401 + 402 + // This should be very unusual, and only happen 403 + // when we have codepoints which have a case 404 + // version longer than the input. Even then, it 405 + // should only happen for the last few characters, 406 + // so the extra work should be minimal. 407 + // 408 + // We measure the size of the remaining buffer 409 + // we need to allocate, allocate it, then run 410 + // over the input again copying the lower case 411 + // version into it. 412 + // 413 + // Unfortunately, we will be stuck with an unused 414 + // buffer allocated. 415 + slowpath: 416 + 417 + remaining = 0; 418 + // We need a variable to track the input (like watch) 419 + // but will need to reset if after measurement, so 420 + // make a new one. 421 + slowwatch = watch; 422 + 423 + // Measurement pass. 424 + while (cp != 0) { 425 + const int32_t lwr_cp = change_codepoint(cp); 426 + const size_t size = utf8codepointsize(lwr_cp); 427 + 428 + remaining += size; 429 + slowwatch = next; 430 + next = utf8codepoint(slowwatch, &cp); 431 + } 432 + 433 + // Allocate a new string to fit the buffer and copy 434 + // what we've already transformed into it. 435 + // We'll just have to live with the extra allocation. 436 + slowret = (istring_t) anemone_mempool_alloc(into, written + remaining + 1); 437 + memcpy ((char *) slowret, ret, written); 438 + 439 + // Reset our pointers for the start of the loop. 440 + slowwatch = watch; 441 + next = utf8codepoint(slowwatch, &cp); 442 + work = (char*) slowret + written; 443 + 444 + // Write pass. 445 + while (cp != 0) { 446 + const int32_t lwr_cp = utf8lwrcodepoint(cp); 447 + const size_t size = utf8codepointsize(lwr_cp); 448 + 449 + work = (char*) utf8catcodepoint(work, lwr_cp, size); 450 + slowwatch = next; 451 + next = utf8codepoint(slowwatch, &cp); 452 + } 453 + 454 + return slowret; 455 + } 456 + 457 + static istring_t INLINE istring_to_lower(anemone_mempool_t *into, istring_t val) { 458 + return istring_to_case(into, &utf8lwrcodepoint, val); 459 + } 460 + 461 + static istring_t INLINE istring_to_upper(anemone_mempool_t *into, istring_t val) { 462 + return istring_to_case(into, &utf8uprcodepoint , val); 463 + }

+1 -1

data/sea/30-array.h

··· 1 - #include "21-time.h" 1 + #include "22-utf8.h" 2 2 3 3 static iint_t INLINE iarray_size(iint_t count) 4 4 {

+15

icicle-compiler/src/Icicle/Sea/FromAvalanche/Prim.hs

··· 60 60 PrimMinimal (M.PrimBuiltinFun (M.PrimBuiltinMath fun)) 61 61 -> PDFun (seaOfPrimBuiltinMath fun) Nothing 62 62 63 + PrimMinimal (M.PrimText op) 64 + -> seaOfPrimText op 65 + 63 66 PrimMinimal (M.PrimTime op) 64 67 -> PDFun 65 68 ( prefixOfValType TimeT <> seaOfPrimTime op ) Nothing ··· 162 165 -> prefixOfValType (ArrayT t) <> "length" 163 166 _ 164 167 -> seaError "seaOfPrimProject" p 168 + 169 + 170 + seaOfPrimText :: M.PrimText -> PrimDoc 171 + seaOfPrimText p 172 + = case p of 173 + M.PrimStrLen -> 174 + PDFun ( prefixOfValType StringT <> "length" ) Nothing 175 + M.PrimStrToLower -> 176 + PDAlloc ( prefixOfValType StringT <> "to_lower") Nothing 177 + M.PrimStrToUpper -> 178 + PDAlloc ( prefixOfValType StringT <> "to_upper") Nothing 179 + 165 180 166 181 seaOfPrimUnsafe :: PrimUnsafe -> PrimDoc 167 182 seaOfPrimUnsafe p

+17

icicle-data/src/Icicle/Common/Exp/Prim/Eval.hs

··· 15 15 16 16 import qualified Data.Map as Map 17 17 import qualified Data.List as List 18 + import qualified Data.Text as Text 18 19 19 20 20 21 -- | Evaluate a primitive, given list of argument values ··· 299 300 | otherwise 300 301 -> primError 301 302 303 + PrimText PrimStrLen 304 + | [VBase (VString x)] <- vs 305 + -> return $ VBase $ VInt $ Text.length x 306 + | otherwise 307 + -> primError 302 308 309 + PrimText PrimStrToLower 310 + | [VBase (VString x)] <- vs 311 + -> return $ VBase $ VString $ Text.toLower x 312 + | otherwise 313 + -> primError 314 + 315 + PrimText PrimStrToUpper 316 + | [VBase (VString x)] <- vs 317 + -> return $ VBase $ VString $ Text.toUpper x 318 + | otherwise 319 + -> primError 303 320 -- Time stuff 304 321 PrimTime PrimTimeDaysDifference 305 322 | [VBase (VTime a), VBase (VTime b)] <- vs

+25

icicle-data/src/Icicle/Common/Exp/Prim/Minimal.hs

··· 9 9 , PrimRelation (..) 10 10 , PrimLogical (..) 11 11 , PrimConst (..) 12 + , PrimText (..) 12 13 , PrimTime (..) 13 14 , PrimPair (..) 14 15 , PrimStruct (..) ··· 42 43 | PrimConst !PrimConst -- ^ Literal value constructors 43 44 | PrimPair !PrimPair -- ^ Pair projections 44 45 | PrimStruct !PrimStruct -- ^ Struct projections 46 + | PrimText !PrimText -- ^ Text primitives 45 47 | PrimTime !PrimTime -- ^ Time/date primitives 46 48 | PrimBuiltinFun !PrimBuiltinFun 47 49 deriving (Eq, Ord, Show, Generic, NanEq) ··· 86 88 | PrimConstRight !ValType !ValType 87 89 deriving (Eq, Ord, Show, Generic, NanEq) 88 90 91 + -- | Text primitives 92 + data PrimText 93 + = PrimStrLen 94 + | PrimStrToLower 95 + | PrimStrToUpper 96 + deriving (Eq, Ord, Show, Enum, Bounded, Generic, NanEq) 97 + 89 98 -- | Time primitives 90 99 data PrimTime 91 100 = PrimTimeDaysDifference ··· 116 125 instance NFData PrimRelation 117 126 instance NFData PrimLogical 118 127 instance NFData PrimConst 128 + instance NFData PrimText 119 129 instance NFData PrimTime 120 130 instance NFData PrimStruct 121 131 instance NFData Prim ··· 210 220 PrimConst (PrimConstRight a b) 211 221 -> FunT [funOfVal b] (SumT a b) 212 222 223 + -- Text 224 + PrimText PrimStrLen 225 + -> FunT [funOfVal StringT] IntT 226 + PrimText PrimStrToLower 227 + -> FunT [funOfVal StringT] StringT 228 + PrimText PrimStrToUpper 229 + -> FunT [funOfVal StringT] StringT 230 + 231 + -- Time 213 232 PrimTime PrimTimeDaysDifference 214 233 -> FunT [funOfVal TimeT, funOfVal TimeT] IntT 215 234 PrimTime PrimTimeDaysJulianEpoch ··· 269 288 pretty (PrimConstLeft _a _b) = "left#" 270 289 pretty (PrimConstRight _a _b) = "right#" 271 290 291 + instance Pretty PrimText where 292 + pretty PrimStrLen = "strlen#" 293 + pretty PrimStrToLower = "tolower#" 294 + pretty PrimStrToUpper = "toupper#" 295 + 272 296 instance Pretty PrimTime where 273 297 pretty PrimTimeDaysDifference = "Time_daysDifference#" 274 298 pretty PrimTimeDaysJulianEpoch = "Time_daysJulianEpoch#" ··· 294 318 pretty (PrimRelation p _t) = pretty p 295 319 pretty (PrimLogical p) = pretty p 296 320 pretty (PrimConst p) = pretty p 321 + pretty (PrimText p) = pretty p 297 322 pretty (PrimTime p) = pretty p 298 323 pretty (PrimPair p) = pretty p 299 324 pretty (PrimStruct p) = pretty p

+16

icicle-source/src/Icicle/Source/Eval.hs

··· 29 29 import P 30 30 import Data.List (zip, nubBy, groupBy, take) 31 31 import qualified Data.List as List 32 + import qualified Data.Text as Text 32 33 import qualified Data.Map as Map 33 34 34 35 data EvalError a n ··· 340 341 BuiltinMath Truncate 341 342 | [VDouble i] <- args 342 343 -> return $ VDouble $ fromIntegral (truncate i :: Int) 344 + | otherwise -> err 345 + 346 + BuiltinText StrLen 347 + | [VString s] <- args 348 + -> return $ VInt $ fromIntegral (Text.length s) 349 + | otherwise -> err 350 + 351 + BuiltinText ToLower 352 + | [VString s] <- args 353 + -> return $ VString $ Text.toLower s 354 + | otherwise -> err 355 + 356 + BuiltinText ToUpper 357 + | [VString s] <- args 358 + -> return $ VString $ Text.toUpper s 343 359 | otherwise -> err 344 360 345 361 BuiltinTime DaysBetween

+7 -2

icicle-source/src/Icicle/Source/Parser/Constructor.hs

··· 46 46 47 47 -- | Convert an expression to a pattern. 48 48 -- 49 + -- This is used in the parsing stage when a pattern 50 + -- is required, we parse as an expression, then coerce 51 + -- to a pattern. 52 + -- 49 53 -- Obviously, not all expressions can be converted 50 54 -- in this way, but all valid patterns can be parsed 51 55 -- as an expression. ··· 54 58 -- using a separate parser for patterns has the benefit 55 59 -- that quite tricky things like tuple comma fixity is 56 60 -- handled the same in the patterns as the expressions 57 - -- they match. 58 - checkPat :: Q.Exp T.SourcePos Var -> Parser (Q.Pattern Var) 61 + -- they match, and we don't have to duplicate parser 62 + -- logic. 63 + checkPat :: Monad m => Q.Exp T.SourcePos Var -> m (Q.Pattern Var) 59 64 checkPat exp = 60 65 case exp of 61 66 -- Variables are simple, just underscore default

+1

icicle-source/src/Icicle/Source/PrettyAnnot.hs

··· 88 88 Lit{} -> False 89 89 Fun f -> case f of 90 90 BuiltinMath{} -> False 91 + BuiltinText{} -> False 91 92 BuiltinTime{} -> False 92 93 BuiltinData{} -> True 93 94 BuiltinArray{} -> True

+21 -2

icicle-source/src/Icicle/Source/Query/Builtin.hs

··· 12 12 13 13 data BuiltinFun 14 14 = BuiltinMath !BuiltinMath 15 + | BuiltinText !BuiltinText 15 16 | BuiltinTime !BuiltinTime 16 17 | BuiltinData !BuiltinData 17 18 | BuiltinArray !BuiltinArray 18 19 | BuiltinMap !BuiltinMap 19 20 deriving (Show, Eq, Ord, Generic) 20 21 21 - -- | Functions wired into the Parser 22 - -- these can't be introduced into 22 + -- | Functions wired into the Parser. 23 + -- These can't be introduced into 23 24 -- the environment as they are made 24 25 -- with KeyWords, and are instead 25 26 -- directly written in by the Parser. ··· 28 29 [ fmap BuiltinTime [minBound..maxBound] 29 30 ] 30 31 32 + -- | Functions wired in through the type 33 + -- checker. These are parsed normally, 34 + -- but their definitions are wired in 35 + -- to their primitives. 31 36 listOfIntroducedFuns :: [BuiltinFun] 32 37 listOfIntroducedFuns = concat 33 38 [ fmap BuiltinMath [minBound..maxBound] 39 + , fmap BuiltinText [minBound..maxBound] 34 40 , fmap BuiltinData [minBound..maxBound] 35 41 , fmap BuiltinArray [minBound..maxBound] 36 42 , fmap BuiltinMap [minBound..maxBound] ··· 58 64 | Truncate 59 65 deriving (Show, Eq, Ord, Enum, Bounded, Generic) 60 66 67 + data BuiltinText 68 + = StrLen 69 + | ToLower 70 + | ToUpper 71 + deriving (Show, Eq, Ord, Enum, Bounded, Generic) 72 + 61 73 data BuiltinTime 62 74 = DaysBetween 63 75 | DaysJulianEpoch ··· 94 106 instance NFData BuiltinData 95 107 instance NFData BuiltinMap 96 108 instance NFData BuiltinArray 109 + instance NFData BuiltinText 97 110 98 111 -------------------------------------------------------------------------------- 99 112 100 113 instance Pretty BuiltinFun where 101 114 pretty (BuiltinMath b) = pretty b 115 + pretty (BuiltinText b) = pretty b 102 116 pretty (BuiltinTime b) = pretty b 103 117 pretty (BuiltinData b) = pretty b 104 118 pretty (BuiltinArray b) = pretty b ··· 124 138 pretty Ceiling = "ceil" 125 139 pretty Round = "round" 126 140 pretty Truncate = "trunc" 141 + 142 + instance Pretty BuiltinText where 143 + pretty StrLen = "strlen" 144 + pretty ToLower = "tolower" 145 + pretty ToUpper = "toupper" 127 146 128 147 instance Pretty BuiltinTime where 129 148 pretty DaysBetween = "days between"

+1

icicle-source/src/Icicle/Source/Query/Exp.hs

··· 16 16 , Fun 17 17 , BuiltinFun (..) 18 18 , BuiltinMath (..) 19 + , BuiltinText (..) 19 20 , BuiltinTime (..) 20 21 , BuiltinData (..) 21 22 , BuiltinArray (..)

+2 -1

icicle-source/src/Icicle/Source/Query/Function.hs

··· 47 47 builtinDefinitions a_fresh = do 48 48 traverse (buildResolved a_fresh) listOfIntroducedFuns 49 49 50 - -- | Build an individual function from the its primitive definition. 50 + -- | Build an individual function from its primitive definition. 51 + -- 51 52 -- This is a little bit tricky, as we can't under apply function 52 53 -- definitions, so we need to create the arguments to the function 53 54 -- as well. It's as if we wrote something like this in the prelude

+8

icicle-source/src/Icicle/Source/Query/Prim.hs

··· 111 111 Fun (BuiltinMath Truncate) 112 112 -> fNumDefinitely $ \at -> ([at], IntT) 113 113 114 + 115 + Fun (BuiltinText StrLen) 116 + -> f0 [StringT] IntT 117 + Fun (BuiltinText ToLower) 118 + -> f0 [StringT] StringT 119 + Fun (BuiltinText ToUpper) 120 + -> f0 [StringT] StringT 121 + 114 122 Fun (BuiltinTime DaysBetween) 115 123 -> f0 [TimeT, TimeT] IntT 116 124 Fun (BuiltinTime DaysJulianEpoch)

+8

icicle-source/src/Icicle/Source/ToCore/Prim.hs

··· 109 109 110 110 go (Fun (BuiltinMath f)) 111 111 = gomath f 112 + go (Fun (BuiltinText f)) 113 + = gotext f 112 114 go (Fun (BuiltinTime f)) 113 115 = gotime f 114 116 go (Fun (BuiltinData f)) ··· 214 216 = convertError 215 217 $ ConvertErrorPrimNoArguments ann 2 p 216 218 219 + gotext StrLen 220 + = return $ primmin $ Min.PrimText Min.PrimStrLen 221 + gotext ToLower 222 + = return $ primmin $ Min.PrimText Min.PrimStrToLower 223 + gotext ToUpper 224 + = return $ primmin $ Min.PrimText Min.PrimStrToUpper 217 225 218 226 -- Source built-in primitives supported by other language fragments 219 227 gotime DaysBetween

Configure Feed

Configure Feed