Merge: glibc: 2.40-66 -> 2.40-142, fix CVE-2025-8058 (#428072)

pyrox.dev / nixpkgs

fork atom

lol

fork atom

authored by

Maximilian Bosch and committed by

GitHub 7 months ago a936217a 583e3d6d

+17093 -1

2 changed files

expand all

unified split

pkgs

development

libraries

glibc

2.40-master.patch

common.nix

+17092

pkgs/development/libraries/glibc/2.40-master.patch

··· 26216 26216 struct abort_msg_s *buf = __mmap (NULL, total, 26217 26217 PROT_READ | PROT_WRITE, 26218 26218 MAP_ANON | MAP_PRIVATE, -1, 0); 26219 + 26220 + commit aef8f8d6a947b290162393e1d717c7aee96fef8e 26221 + Author: H.J. Lu <hjl.tools@gmail.com> 26222 + Date: Tue Dec 17 18:41:45 2024 +0800 26223 + 26224 + Hide all malloc functions from compiler [BZ #32366] 26225 + 26226 + Since -1 isn't a power of two, compiler may reject it, hide memalign from 26227 + Clang 19 which issues an error: 26228 + 26229 + tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment] 26230 + 86 | p = memalign (-1, pagesize); 26231 + | ^~ 26232 + tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment] 26233 + 86 | p = memalign (-1, pagesize); 26234 + | ^~ 26235 + 26236 + Update tst-malloc-aux.h to hide all malloc functions and include it in 26237 + all malloc tests to prevent compiler from optimizing out any malloc 26238 + functions. 26239 + 26240 + Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366. 26241 + 26242 + Signed-off-by: H.J. Lu <hjl.tools@gmail.com> 26243 + Reviewed-by: Sam James <sam@gentoo.org> 26244 + (cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce) 26245 + 26246 + diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c 26247 + index 2c02f5f700..f072b9f24b 100644 26248 + --- a/malloc/tst-mallinfo2.c 26249 + +++ b/malloc/tst-mallinfo2.c 26250 + @@ -23,6 +23,8 @@ 26251 + #include <stdlib.h> 26252 + #include <support/check.h> 26253 + 26254 + +#include "tst-malloc-aux.h" 26255 + + 26256 + /* This is not specifically needed for the test, but (1) does 26257 + something to the data so gcc doesn't optimize it away, and (2) may 26258 + help when developing future tests. 
*/ 26259 + diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h 26260 + index 54908b4a24..3e1b61ce34 100644 26261 + --- a/malloc/tst-malloc-aux.h 26262 + +++ b/malloc/tst-malloc-aux.h 26263 + @@ -22,20 +22,35 @@ 26264 + 26265 + #include <stddef.h> 26266 + #include <stdlib.h> 26267 + - 26268 + -static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc; 26269 + -static void *(*volatile calloc_indirect)(size_t, size_t) = calloc; 26270 + -static void *(*volatile malloc_indirect)(size_t) = malloc; 26271 + -static void *(*volatile realloc_indirect)(void*, size_t) = realloc; 26272 + +#include <malloc.h> 26273 + + 26274 + +static __typeof (aligned_alloc) * volatile aligned_alloc_indirect 26275 + + = aligned_alloc; 26276 + +static __typeof (calloc) * volatile calloc_indirect = calloc; 26277 + +static __typeof (malloc) * volatile malloc_indirect = malloc; 26278 + +static __typeof (memalign) * volatile memalign_indirect = memalign; 26279 + +static __typeof (posix_memalign) * volatile posix_memalign_indirect 26280 + + = posix_memalign; 26281 + +static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc; 26282 + +static __typeof (realloc) * volatile realloc_indirect = realloc; 26283 + +static __typeof (valloc) * volatile valloc_indirect = valloc; 26284 + 26285 + #undef aligned_alloc 26286 + #undef calloc 26287 + #undef malloc 26288 + +#undef memalign 26289 + +#undef posix_memalign 26290 + +#undef pvalloc 26291 + #undef realloc 26292 + +#undef valloc 26293 + 26294 + #define aligned_alloc aligned_alloc_indirect 26295 + #define calloc calloc_indirect 26296 + #define malloc malloc_indirect 26297 + +#define memalign memalign_indirect 26298 + +#define posix_memalign posix_memalign_indirect 26299 + +#define pvalloc pvalloc_indirect 26300 + #define realloc realloc_indirect 26301 + +#define valloc valloc_indirect 26302 + 26303 + #endif /* TST_MALLOC_AUX_H */ 26304 + diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c 26305 + 
index c7b1d65e5c..65fa91f6fd 100644 26306 + --- a/malloc/tst-malloc-backtrace.c 26307 + +++ b/malloc/tst-malloc-backtrace.c 26308 + @@ -22,6 +22,8 @@ 26309 + #include <support/support.h> 26310 + #include <libc-diag.h> 26311 + 26312 + +#include "tst-malloc-aux.h" 26313 + + 26314 + #define SIZE 4096 26315 + 26316 + /* Wrap free with a function to prevent gcc from optimizing it out. */ 26317 + diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c 26318 + index 563f6413d2..ac9770d3f9 100644 26319 + --- a/malloc/tst-memalign.c 26320 + +++ b/malloc/tst-memalign.c 26321 + @@ -23,6 +23,8 @@ 26322 + #include <unistd.h> 26323 + #include <libc-diag.h> 26324 + 26325 + +#include "tst-malloc-aux.h" 26326 + + 26327 + static int errors = 0; 26328 + 26329 + static void 26330 + diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c 26331 + index 01dd07004d..63a7e2bc8e 100644 26332 + --- a/malloc/tst-safe-linking.c 26333 + +++ b/malloc/tst-safe-linking.c 26334 + @@ -26,6 +26,8 @@ 26335 + #include <support/capture_subprocess.h> 26336 + #include <support/check.h> 26337 + 26338 + +#include "tst-malloc-aux.h" 26339 + + 26340 + /* Run CALLBACK and check that the data on standard error equals 26341 + EXPECTED. 
*/ 26342 + static void 26343 + diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c 26344 + index 9bab8c6470..0243d3dfd4 100644 26345 + --- a/malloc/tst-valloc.c 26346 + +++ b/malloc/tst-valloc.c 26347 + @@ -23,6 +23,8 @@ 26348 + #include <unistd.h> 26349 + #include <libc-diag.h> 26350 + 26351 + +#include "tst-malloc-aux.h" 26352 + + 26353 + static int errors = 0; 26354 + 26355 + static void 26356 + 26357 + commit be48b8f6ad0ec6d0d6b1d2f45eb59bf8e8c67dd7 26358 + Author: Sam James <sam@gentoo.org> 26359 + Date: Fri Jan 10 03:03:47 2025 +0000 26360 + 26361 + malloc: obscure calloc use in tst-calloc 26362 + 26363 + Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and 26364 + f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from 26365 + the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change. 26366 + 26367 + First, include tst-malloc-aux.h, but then use `volatile` variables 26368 + for size. 26369 + 26370 + The test passes without the tst-malloc-aux.h change but IMO we want 26371 + it there for consistency and to avoid future problems (possibly silent). 26372 + 26373 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 26374 + (cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1) 26375 + 26376 + diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c 26377 + index 01f17f9e65..5a8c7ab121 100644 26378 + --- a/malloc/tst-calloc.c 26379 + +++ b/malloc/tst-calloc.c 26380 + @@ -23,6 +23,7 @@ 26381 + #include <stdio.h> 26382 + #include <libc-diag.h> 26383 + 26384 + +#include "tst-malloc-aux.h" 26385 + 26386 + /* Number of samples per size. */ 26387 + #define N 50000 26388 + @@ -94,16 +95,19 @@ random_test (void) 26389 + static void 26390 + null_test (void) 26391 + { 26392 + + /* Obscure allocation size from the compiler. */ 26393 + + volatile size_t max_size = UINT_MAX; 26394 + + volatile size_t zero_size = 0; 26395 + /* If the size is 0 the result is implementation defined. Just make 26396 + sure the program doesn't crash. 
The result of calloc is 26397 + deliberately ignored, so do not warn about that. */ 26398 + DIAG_PUSH_NEEDS_COMMENT; 26399 + DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result"); 26400 + calloc (0, 0); 26401 + - calloc (0, UINT_MAX); 26402 + - calloc (UINT_MAX, 0); 26403 + - calloc (0, ~((size_t) 0)); 26404 + - calloc (~((size_t) 0), 0); 26405 + + calloc (0, max_size); 26406 + + calloc (max_size, 0); 26407 + + calloc (0, ~((size_t) zero_size)); 26408 + + calloc (~((size_t) zero_size), 0); 26409 + DIAG_POP_NEEDS_COMMENT; 26410 + } 26411 + 26412 + 26413 + commit 85668221974db44459527e04d04f77ca8f8e3115 26414 + Author: H.J. Lu <hjl.tools@gmail.com> 26415 + Date: Fri Jan 24 18:53:13 2025 +0800 26416 + 26417 + stdlib: Test using setenv with updated environ [BZ #32588] 26418 + 26419 + Add a test for setenv with updated environ. Verify that BZ #32588 is 26420 + fixed. 26421 + 26422 + Signed-off-by: H.J. Lu <hjl.tools@gmail.com> 26423 + Reviewed-by: Florian Weimer <fweimer@redhat.com> 26424 + (cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da) 26425 + 26426 + diff --git a/stdlib/Makefile b/stdlib/Makefile 26427 + index 8213fa83ef..d3a84fa641 100644 26428 + --- a/stdlib/Makefile 26429 + +++ b/stdlib/Makefile 26430 + @@ -307,6 +307,7 @@ tests := \ 26431 + tst-setcontext9 \ 26432 + tst-setcontext10 \ 26433 + tst-setcontext11 \ 26434 + + tst-setenv-environ \ 26435 + tst-stdbit-Wconversion \ 26436 + tst-stdbit-builtins \ 26437 + tst-stdc_bit_ceil \ 26438 + diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c 26439 + new file mode 100644 26440 + index 0000000000..02fcef96d0 26441 + --- /dev/null 26442 + +++ b/stdlib/tst-setenv-environ.c 26443 + @@ -0,0 +1,36 @@ 26444 + +/* Test using setenv with updated environ. 26445 + + Copyright (C) 2025 Free Software Foundation, Inc. 26446 + + This file is part of the GNU C Library. 
26447 + + 26448 + + The GNU C Library is free software; you can redistribute it and/or 26449 + + modify it under the terms of the GNU Lesser General Public 26450 + + License as published by the Free Software Foundation; either 26451 + + version 2.1 of the License, or (at your option) any later version. 26452 + + 26453 + + The GNU C Library is distributed in the hope that it will be useful, 26454 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 26455 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 26456 + + Lesser General Public License for more details. 26457 + + 26458 + + You should have received a copy of the GNU Lesser General Public 26459 + + License along with the GNU C Library; if not, see 26460 + + <https://www.gnu.org/licenses/>. */ 26461 + + 26462 + +#include <stdlib.h> 26463 + +#include <support/check.h> 26464 + + 26465 + +extern char **environ; 26466 + + 26467 + +int 26468 + +do_test (void) 26469 + +{ 26470 + + char *valp; 26471 + + static char *dummy_environ[] = { NULL }; 26472 + + environ = dummy_environ; 26473 + + setenv ("A", "1", 0); 26474 + + valp = getenv ("A"); 26475 + + TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0'); 26476 + + return 0; 26477 + +} 26478 + + 26479 + +#include <support/test-driver.c> 26480 + 26481 + commit e899ca3651f8c5e01bf3420cfb34aad97d093f74 26482 + Author: John David Anglin <danglin@gcc.gnu.org> 26483 + Date: Wed Jan 29 16:51:16 2025 -0500 26484 + 26485 + nptl: Correct stack size attribute when stack grows up [BZ #32574] 26486 + 26487 + Set stack size attribute to the size of the mmap'd region only 26488 + when the size of the remaining stack space is less than the size 26489 + of the mmap'd region. 26490 + 26491 + This was reversed. As a result, the initial stack size was only 26492 + 135168 bytes. On architectures where the stack grows down, the 26493 + initial stack size is approximately 8384512 bytes with the default 26494 + rlimit settings. 
The small main stack size on hppa broke 26495 + applications like ruby that check for stack overflows. 26496 + 26497 + Signed-off-by: John David Anglin <dave.anglin@bell.net> 26498 + 26499 + diff --git a/nptl/pthread_getattr_np.c b/nptl/pthread_getattr_np.c 26500 + index 1e91874767..3ce34437bc 100644 26501 + --- a/nptl/pthread_getattr_np.c 26502 + +++ b/nptl/pthread_getattr_np.c 26503 + @@ -145,9 +145,9 @@ __pthread_getattr_np (pthread_t thread_id, pthread_attr_t *attr) 26504 + > (size_t) iattr->stackaddr - last_to) 26505 + iattr->stacksize = (size_t) iattr->stackaddr - last_to; 26506 + #else 26507 + - /* The limit might be too high. */ 26508 + + /* The limit might be too low. */ 26509 + if ((size_t) iattr->stacksize 26510 + - > to - (size_t) iattr->stackaddr) 26511 + + < to - (size_t) iattr->stackaddr) 26512 + iattr->stacksize = to - (size_t) iattr->stackaddr; 26513 + #endif 26514 + /* We succeed and no need to look further. */ 26515 + 26516 + commit d6c156c326999f144cb5b73d29982108d549ad8a 26517 + Author: Siddhesh Poyarekar <siddhesh@sourceware.org> 26518 + Date: Fri Jan 31 12:16:30 2025 -0500 26519 + 26520 + assert: Add test for CVE-2025-0395 26521 + 26522 + Use the __progname symbol to override the program name to induce the 26523 + failure that CVE-2025-0395 describes. 
26524 + 26525 + This is related to BZ #32582 26526 + 26527 + Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org> 26528 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 26529 + (cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2) 26530 + 26531 + diff --git a/assert/Makefile b/assert/Makefile 26532 + index 35dc908ddb..c0fe660bd6 100644 26533 + --- a/assert/Makefile 26534 + +++ b/assert/Makefile 26535 + @@ -38,6 +38,7 @@ tests := \ 26536 + test-assert-perr \ 26537 + tst-assert-c++ \ 26538 + tst-assert-g++ \ 26539 + + tst-assert-sa-2025-0001 \ 26540 + # tests 26541 + 26542 + ifeq ($(have-cxx-thread_local),yes) 26543 + diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c 26544 + new file mode 100644 26545 + index 0000000000..102cb0078d 26546 + --- /dev/null 26547 + +++ b/assert/tst-assert-sa-2025-0001.c 26548 + @@ -0,0 +1,92 @@ 26549 + +/* Test for CVE-2025-0395. 26550 + + Copyright The GNU Toolchain Authors. 26551 + + This file is part of the GNU C Library. 26552 + + 26553 + + The GNU C Library is free software; you can redistribute it and/or 26554 + + modify it under the terms of the GNU Lesser General Public 26555 + + License as published by the Free Software Foundation; either 26556 + + version 2.1 of the License, or (at your option) any later version. 26557 + + 26558 + + The GNU C Library is distributed in the hope that it will be useful, 26559 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 26560 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 26561 + + Lesser General Public License for more details. 26562 + + 26563 + + You should have received a copy of the GNU Lesser General Public 26564 + + License along with the GNU C Library; if not, see 26565 + + <https://www.gnu.org/licenses/>. */ 26566 + + 26567 + +/* Test that a large enough __progname does not result in a buffer overflow 26568 + + when printing an assertion failure. This was CVE-2025-0395. 
*/ 26569 + +#include <assert.h> 26570 + +#include <inttypes.h> 26571 + +#include <signal.h> 26572 + +#include <stdbool.h> 26573 + +#include <string.h> 26574 + +#include <sys/mman.h> 26575 + +#include <support/check.h> 26576 + +#include <support/support.h> 26577 + +#include <support/xstdio.h> 26578 + +#include <support/xunistd.h> 26579 + + 26580 + +extern const char *__progname; 26581 + + 26582 + +int 26583 + +do_test (int argc, char **argv) 26584 + +{ 26585 + + 26586 + + support_need_proc ("Reads /proc/self/maps to add guards to writable maps."); 26587 + + ignore_stderr (); 26588 + + 26589 + + /* XXX assumes that the assert is on a 2 digit line number. */ 26590 + + const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n"; 26591 + + 26592 + + int ret = fprintf (stderr, prompt, __FILE__); 26593 + + if (ret < 0) 26594 + + FAIL_EXIT1 ("fprintf failed: %m\n"); 26595 + + 26596 + + size_t pagesize = getpagesize (); 26597 + + size_t namesize = pagesize - 1 - ret; 26598 + + 26599 + + /* Alter the progname so that the assert message fills the entire page. */ 26600 + + char progname[namesize]; 26601 + + memset (progname, 'A', namesize - 1); 26602 + + progname[namesize - 1] = '\0'; 26603 + + __progname = progname; 26604 + + 26605 + + FILE *f = xfopen ("/proc/self/maps", "r"); 26606 + + char *line = NULL; 26607 + + size_t len = 0; 26608 + + uintptr_t prev_to = 0; 26609 + + 26610 + + /* Pad the beginning of every writable mapping with a PROT_NONE map. This 26611 + + ensures that the mmap in the assert_fail path never ends up below a 26612 + + writable map and will terminate immediately in case of a buffer 26613 + + overflow. 
*/ 26614 + + while (xgetline (&line, &len, f)) 26615 + + { 26616 + + uintptr_t from, to; 26617 + + char perm[4]; 26618 + + 26619 + + sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", 26620 + + &from, &to, 26621 + + &perm[0], &perm[1], &perm[2], &perm[3]); 26622 + + 26623 + + bool writable = (memchr (perm, 'w', 4) != NULL); 26624 + + 26625 + + if (prev_to != 0 && from - prev_to > pagesize && writable) 26626 + + xmmap ((void *) from - pagesize, pagesize, PROT_NONE, 26627 + + MAP_ANONYMOUS | MAP_PRIVATE, 0); 26628 + + 26629 + + prev_to = to; 26630 + + } 26631 + + 26632 + + xfclose (f); 26633 + + 26634 + + assert (argc < 1); 26635 + + return 0; 26636 + +} 26637 + + 26638 + +#define EXPECTED_SIGNAL SIGABRT 26639 + +#define TEST_FUNCTION_ARGV do_test 26640 + +#include <support/test-driver.c> 26641 + 26642 + commit 523f85558152a1b9cced6d669f758c27677775ba 26643 + Author: John David Anglin <danglin@gcc.gnu.org> 26644 + Date: Tue Feb 25 15:57:53 2025 -0500 26645 + 26646 + math: Add optimization barrier to ensure a1 + u.d is not reused [BZ #30664] 26647 + 26648 + A number of fma tests started to fail on hppa when gcc was changed to 26649 + use Ranger rather than EVRP. Eventually I found that the value of 26650 + a1 + u.d in this is block of code was being computed in FE_TOWARDZERO 26651 + mode and not the original rounding mode: 26652 + 26653 + if (TININESS_AFTER_ROUNDING) 26654 + { 26655 + w.d = a1 + u.d; 26656 + if (w.ieee.exponent == 109) 26657 + return w.d * 0x1p-108; 26658 + } 26659 + 26660 + This caused the exponent value to be wrong and the wrong return path 26661 + to be used. 26662 + 26663 + Here we add an optimization barrier after the rounding mode is reset 26664 + to ensure that the previous value of a1 + u.d is not reused. 
26665 + 26666 + Signed-off-by: John David Anglin <dave.anglin@bell.net> 26667 + 26668 + diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c 26669 + index c5f5abdc68..79a3cd721d 100644 26670 + --- a/sysdeps/ieee754/dbl-64/s_fma.c 26671 + +++ b/sysdeps/ieee754/dbl-64/s_fma.c 26672 + @@ -244,6 +244,9 @@ __fma (double x, double y, double z) 26673 + /* Reset rounding mode and test for inexact simultaneously. */ 26674 + int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0; 26675 + 26676 + + /* Ensure value of a1 + u.d is not reused. */ 26677 + + a1 = math_opt_barrier (a1); 26678 + + 26679 + if (__glibc_likely (adjust == 0)) 26680 + { 26681 + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) 26682 + 26683 + commit ff10623706ea0096f3af7b38a3330ffb7fb15ae7 26684 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 26685 + Date: Mon Sep 9 13:00:01 2024 +0100 26686 + 26687 + aarch64: Avoid redundant MOVs in AdvSIMD F32 logs 26688 + 26689 + Since the last operation is destructive, the first argument to the FMA 26690 + also has to be the first argument to the special-case in order to 26691 + avoid unnecessary MOVs. Reorder arguments and adjust special-case 26692 + bounds to facilitate this. 
26693 + 26694 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 26695 + (cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675) 26696 + 26697 + diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c 26698 + index 9347422a77..82228b599a 100644 26699 + --- a/sysdeps/aarch64/fpu/log10f_advsimd.c 26700 + +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c 26701 + @@ -22,11 +22,11 @@ 26702 + 26703 + static const struct data 26704 + { 26705 + - uint32x4_t min_norm; 26706 + + uint32x4_t off, offset_lower_bound; 26707 + uint16x8_t special_bound; 26708 + + uint32x4_t mantissa_mask; 26709 + float32x4_t poly[8]; 26710 + float32x4_t inv_ln10, ln2; 26711 + - uint32x4_t off, mantissa_mask; 26712 + } data = { 26713 + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in 26714 + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ 26715 + @@ -35,18 +35,22 @@ static const struct data 26716 + V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, 26717 + .ln2 = V4 (0x1.62e43p-1f), 26718 + .inv_ln10 = V4 (0x1.bcb7b2p-2f), 26719 + - .min_norm = V4 (0x00800000), 26720 + - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ 26721 + + /* Lower bound is the smallest positive normal float 0x00800000. For 26722 + + optimised register use subnormals are detected after offset has been 26723 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ 26724 + + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), 26725 + + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ 26726 + .off = V4 (0x3f2aaaab), /* 0.666667. 
*/ 26727 + .mantissa_mask = V4 (0x007fffff), 26728 + }; 26729 + 26730 + static float32x4_t VPCS_ATTR NOINLINE 26731 + -special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, 26732 + - uint16x4_t cmp) 26733 + +special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, 26734 + + uint16x4_t cmp, const struct data *d) 26735 + { 26736 + /* Fall back to scalar code. */ 26737 + - return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); 26738 + + return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), 26739 + + vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); 26740 + } 26741 + 26742 + /* Fast implementation of AdvSIMD log10f, 26743 + @@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, 26744 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) 26745 + { 26746 + const struct data *d = ptr_barrier (&data); 26747 + - uint32x4_t u = vreinterpretq_u32_f32 (x); 26748 + - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), 26749 + - vget_low_u16 (d->special_bound)); 26750 + + 26751 + + /* To avoid having to mov x out of the way, keep u after offset has been 26752 + + applied, and recover x by adding the offset back in the special-case 26753 + + handler. */ 26754 + + uint32x4_t u_off = vreinterpretq_u32_f32 (x); 26755 + 26756 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 26757 + - u = vsubq_u32 (u, d->off); 26758 + + u_off = vsubq_u32 (u_off, d->off); 26759 + float32x4_t n = vcvtq_f32_s32 ( 26760 + - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ 26761 + - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); 26762 + + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. 
*/ 26763 + + 26764 + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), 26765 + + vget_low_u16 (d->special_bound)); 26766 + + 26767 + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); 26768 + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); 26769 + 26770 + /* y = log10(1+r) + n * log10(2). */ 26771 + @@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) 26772 + y = vmulq_f32 (y, d->inv_ln10); 26773 + 26774 + if (__glibc_unlikely (v_any_u16h (special))) 26775 + - return special_case (x, y, poly, r2, special); 26776 + + return special_case (y, u_off, poly, r2, special, d); 26777 + return vfmaq_f32 (y, poly, r2); 26778 + } 26779 + libmvec_hidden_def (V_NAME_F1 (log10)) 26780 + diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c 26781 + index db21836749..84effe4fe9 100644 26782 + --- a/sysdeps/aarch64/fpu/log2f_advsimd.c 26783 + +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c 26784 + @@ -22,9 +22,9 @@ 26785 + 26786 + static const struct data 26787 + { 26788 + - uint32x4_t min_norm; 26789 + + uint32x4_t off, offset_lower_bound; 26790 + uint16x8_t special_bound; 26791 + - uint32x4_t off, mantissa_mask; 26792 + + uint32x4_t mantissa_mask; 26793 + float32x4_t poly[9]; 26794 + } data = { 26795 + /* Coefficients generated using Remez algorithm approximate 26796 + @@ -34,18 +34,22 @@ static const struct data 26797 + V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), 26798 + V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), 26799 + V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, 26800 + - .min_norm = V4 (0x00800000), 26801 + - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ 26802 + + /* Lower bound is the smallest positive normal float 0x00800000. For 26803 + + optimised register use subnormals are detected after offset has been 26804 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). 
*/ 26805 + + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), 26806 + + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ 26807 + .off = V4 (0x3f2aaaab), /* 0.666667. */ 26808 + .mantissa_mask = V4 (0x007fffff), 26809 + }; 26810 + 26811 + static float32x4_t VPCS_ATTR NOINLINE 26812 + -special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, 26813 + - uint16x4_t cmp) 26814 + +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, 26815 + + uint16x4_t cmp, const struct data *d) 26816 + { 26817 + /* Fall back to scalar code. */ 26818 + - return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); 26819 + + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), 26820 + + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); 26821 + } 26822 + 26823 + /* Fast implementation for single precision AdvSIMD log2, 26824 + @@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, 26825 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) 26826 + { 26827 + const struct data *d = ptr_barrier (&data); 26828 + - uint32x4_t u = vreinterpretq_u32_f32 (x); 26829 + - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), 26830 + - vget_low_u16 (d->special_bound)); 26831 + + 26832 + + /* To avoid having to mov x out of the way, keep u after offset has been 26833 + + applied, and recover x by adding the offset back in the special-case 26834 + + handler. */ 26835 + + uint32x4_t u_off = vreinterpretq_u32_f32 (x); 26836 + 26837 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 26838 + - u = vsubq_u32 (u, d->off); 26839 + + u_off = vsubq_u32 (u_off, d->off); 26840 + float32x4_t n = vcvtq_f32_s32 ( 26841 + - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ 26842 + - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); 26843 + + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. 
*/ 26844 + + 26845 + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), 26846 + + vget_low_u16 (d->special_bound)); 26847 + + 26848 + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); 26849 + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); 26850 + 26851 + /* y = log2(1+r) + n. */ 26852 + @@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) 26853 + float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); 26854 + 26855 + if (__glibc_unlikely (v_any_u16h (special))) 26856 + - return special_case (x, n, p, r, special); 26857 + + return special_case (n, u_off, p, r, special, d); 26858 + return vfmaq_f32 (n, p, r); 26859 + } 26860 + libmvec_hidden_def (V_NAME_F1 (log2)) 26861 + diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c 26862 + index 3c0d0fcdc7..c20dbfd6c0 100644 26863 + --- a/sysdeps/aarch64/fpu/logf_advsimd.c 26864 + +++ b/sysdeps/aarch64/fpu/logf_advsimd.c 26865 + @@ -21,20 +21,22 @@ 26866 + 26867 + static const struct data 26868 + { 26869 + - uint32x4_t min_norm; 26870 + + uint32x4_t off, offset_lower_bound; 26871 + uint16x8_t special_bound; 26872 + + uint32x4_t mantissa_mask; 26873 + float32x4_t poly[7]; 26874 + - float32x4_t ln2, tiny_bound; 26875 + - uint32x4_t off, mantissa_mask; 26876 + + float32x4_t ln2; 26877 + } data = { 26878 + /* 3.34 ulp error. */ 26879 + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), 26880 + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), 26881 + V4 (-0x1.ffffc8p-2f) }, 26882 + .ln2 = V4 (0x1.62e43p-1f), 26883 + - .tiny_bound = V4 (0x1p-126), 26884 + - .min_norm = V4 (0x00800000), 26885 + - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ 26886 + + /* Lower bound is the smallest positive normal float 0x00800000. 
For 26887 + + optimised register use subnormals are detected after offset has been 26888 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ 26889 + + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), 26890 + + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ 26891 + .off = V4 (0x3f2aaaab), /* 0.666667. */ 26892 + .mantissa_mask = V4 (0x007fffff) 26893 + }; 26894 + @@ -42,32 +44,37 @@ static const struct data 26895 + #define P(i) d->poly[7 - i] 26896 + 26897 + static float32x4_t VPCS_ATTR NOINLINE 26898 + -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, 26899 + - uint16x4_t cmp) 26900 + +special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, 26901 + + uint16x4_t cmp, const struct data *d) 26902 + { 26903 + /* Fall back to scalar code. */ 26904 + - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); 26905 + + return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), 26906 + + vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); 26907 + } 26908 + 26909 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) 26910 + { 26911 + const struct data *d = ptr_barrier (&data); 26912 + float32x4_t n, p, q, r, r2, y; 26913 + - uint32x4_t u; 26914 + + uint32x4_t u, u_off; 26915 + uint16x4_t cmp; 26916 + 26917 + - u = vreinterpretq_u32_f32 (x); 26918 + - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), 26919 + - vget_low_u16 (d->special_bound)); 26920 + + /* To avoid having to mov x out of the way, keep u after offset has been 26921 + + applied, and recover x by adding the offset back in the special-case 26922 + + handler. */ 26923 + + u_off = vreinterpretq_u32_f32 (x); 26924 + 26925 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 26926 + - u = vsubq_u32 (u, d->off); 26927 + + u_off = vsubq_u32 (u_off, d->off); 26928 + n = vcvtq_f32_s32 ( 26929 + - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. 
*/ 26930 + - u = vandq_u32 (u, d->mantissa_mask); 26931 + + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ 26932 + + u = vandq_u32 (u_off, d->mantissa_mask); 26933 + u = vaddq_u32 (u, d->off); 26934 + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); 26935 + 26936 + + cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), 26937 + + vget_low_u16 (d->special_bound)); 26938 + + 26939 + /* y = log(1+r) + n*ln2. */ 26940 + r2 = vmulq_f32 (r, r); 26941 + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ 26942 + @@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) 26943 + p = vfmaq_f32 (r, d->ln2, n); 26944 + 26945 + if (__glibc_unlikely (v_any_u16h (cmp))) 26946 + - return special_case (x, y, r2, p, cmp); 26947 + + return special_case (p, u_off, y, r2, cmp, d); 26948 + return vfmaq_f32 (p, y, r2); 26949 + } 26950 + libmvec_hidden_def (V_NAME_F1 (log)) 26951 + 26952 + commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7 26953 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 26954 + Date: Thu Sep 19 17:34:02 2024 +0100 26955 + 26956 + AArch64: Add vector logp1 alias for log1p 26957 + 26958 + This enables vectorisation of C23 logp1, which is an alias for log1p. 26959 + There are no new tests or ulp entries because the new symbols are simply 26960 + aliases. 
26961 + 26962 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 26963 + (cherry picked from commit 751a5502bea1d13551c62c47bb9bd25bff870cda) 26964 + 26965 + diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h 26966 + index 08a41c46ad..5019e8e25c 100644 26967 + --- a/bits/libm-simd-decl-stubs.h 26968 + +++ b/bits/libm-simd-decl-stubs.h 26969 + @@ -253,6 +253,17 @@ 26970 + #define __DECL_SIMD_log1pf64x 26971 + #define __DECL_SIMD_log1pf128x 26972 + 26973 + +#define __DECL_SIMD_logp1 26974 + +#define __DECL_SIMD_logp1f 26975 + +#define __DECL_SIMD_logp1l 26976 + +#define __DECL_SIMD_logp1f16 26977 + +#define __DECL_SIMD_logp1f32 26978 + +#define __DECL_SIMD_logp1f64 26979 + +#define __DECL_SIMD_logp1f128 26980 + +#define __DECL_SIMD_logp1f32x 26981 + +#define __DECL_SIMD_logp1f64x 26982 + +#define __DECL_SIMD_logp1f128x 26983 + + 26984 + #define __DECL_SIMD_atanh 26985 + #define __DECL_SIMD_atanhf 26986 + #define __DECL_SIMD_atanhl 26987 + diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h 26988 + index 6cb594b6ff..92856becc4 100644 26989 + --- a/math/bits/mathcalls.h 26990 + +++ b/math/bits/mathcalls.h 26991 + @@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x)); 26992 + __MATHCALL (log10p1,, (_Mdouble_ __x)); 26993 + 26994 + /* Return log(1 + X). 
*/ 26995 + -__MATHCALL (logp1,, (_Mdouble_ __x)); 26996 + +__MATHCALL_VEC (logp1,, (_Mdouble_ __x)); 26997 + #endif 26998 + 26999 + #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99 27000 + diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions 27001 + index cc15ce2d1e..015211f5f4 100644 27002 + --- a/sysdeps/aarch64/fpu/Versions 27003 + +++ b/sysdeps/aarch64/fpu/Versions 27004 + @@ -135,4 +135,11 @@ libmvec { 27005 + _ZGVsMxv_tanh; 27006 + _ZGVsMxv_tanhf; 27007 + } 27008 + + GLIBC_2.41 { 27009 + + _ZGVnN2v_logp1; 27010 + + _ZGVnN2v_logp1f; 27011 + + _ZGVnN4v_logp1f; 27012 + + _ZGVsMxv_logp1; 27013 + + _ZGVsMxv_logp1f; 27014 + + } 27015 + } 27016 + diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h 27017 + index 097d403ffe..5909bb4ce9 100644 27018 + --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h 27019 + +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h 27020 + @@ -36,6 +36,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot)); 27021 + libmvec_hidden_proto (V_NAME_F1(log10)); 27022 + libmvec_hidden_proto (V_NAME_F1(log1p)); 27023 + libmvec_hidden_proto (V_NAME_F1(log2)); 27024 + +libmvec_hidden_proto (V_NAME_F1(logp1)); 27025 + libmvec_hidden_proto (V_NAME_F1(log)); 27026 + libmvec_hidden_proto (V_NAME_F2(pow)); 27027 + libmvec_hidden_proto (V_NAME_F1(sin)); 27028 + diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h 27029 + index 7484150131..f295fe185d 100644 27030 + --- a/sysdeps/aarch64/fpu/bits/math-vector.h 27031 + +++ b/sysdeps/aarch64/fpu/bits/math-vector.h 27032 + @@ -113,6 +113,10 @@ 27033 + # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 27034 + # undef __DECL_SIMD_log2f 27035 + # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 27036 + +# undef __DECL_SIMD_logp1 27037 + +# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64 27038 + +# undef __DECL_SIMD_logp1f 27039 + +# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64 27040 + # undef __DECL_SIMD_pow 27041 + # define 
__DECL_SIMD_pow __DECL_SIMD_aarch64 27042 + # undef __DECL_SIMD_powf 27043 + @@ -180,6 +184,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); 27044 + __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); 27045 + __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); 27046 + __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); 27047 + +__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t); 27048 + __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); 27049 + __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); 27050 + __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); 27051 + @@ -207,6 +212,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); 27052 + __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); 27053 + __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); 27054 + __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); 27055 + +__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t); 27056 + __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); 27057 + __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); 27058 + __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); 27059 + @@ -239,6 +245,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); 27060 + __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); 27061 + __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); 27062 + __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); 27063 + +__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t); 27064 + __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); 27065 + __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); 27066 + __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); 27067 + @@ -266,6 +273,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); 27068 + __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); 27069 + __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); 27070 + __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); 27071 + +__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t); 27072 + __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); 27073 + __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); 27074 + __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, 
__sv_bool_t); 27075 + diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c 27076 + index ffc418fc9c..114064c696 100644 27077 + --- a/sysdeps/aarch64/fpu/log1p_advsimd.c 27078 + +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c 27079 + @@ -127,3 +127,5 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) 27080 + 27081 + return vfmaq_f64 (y, f2, p); 27082 + } 27083 + + 27084 + +strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) 27085 + diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c 27086 + index 04f7e5720e..b21cfb2c90 100644 27087 + --- a/sysdeps/aarch64/fpu/log1p_sve.c 27088 + +++ b/sysdeps/aarch64/fpu/log1p_sve.c 27089 + @@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) 27090 + 27091 + return y; 27092 + } 27093 + + 27094 + +strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) 27095 + diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c 27096 + index dc15334a85..8cfa28fb8a 100644 27097 + --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c 27098 + +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c 27099 + @@ -128,3 +128,6 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) 27100 + } 27101 + libmvec_hidden_def (V_NAME_F1 (log1p)) 27102 + HALF_WIDTH_ALIAS_F1 (log1p) 27103 + +strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) 27104 + +libmvec_hidden_def (V_NAME_F1 (logp1)) 27105 + +HALF_WIDTH_ALIAS_F1 (logp1) 27106 + diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c 27107 + index f645cc997e..5256d5e94c 100644 27108 + --- a/sysdeps/aarch64/fpu/log1pf_sve.c 27109 + +++ b/sysdeps/aarch64/fpu/log1pf_sve.c 27110 + @@ -98,3 +98,5 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) 27111 + 27112 + return y; 27113 + } 27114 + + 27115 + +strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) 27116 + diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 27117 + 
index b685106954..98687cae0d 100644 27118 + --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 27119 + +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 27120 + @@ -128,3 +128,8 @@ GLIBC_2.40 _ZGVsMxvv_hypot F 27121 + GLIBC_2.40 _ZGVsMxvv_hypotf F 27122 + GLIBC_2.40 _ZGVsMxvv_pow F 27123 + GLIBC_2.40 _ZGVsMxvv_powf F 27124 + +GLIBC_2.41 _ZGVnN2v_logp1 F 27125 + +GLIBC_2.41 _ZGVnN2v_logp1f F 27126 + +GLIBC_2.41 _ZGVnN4v_logp1f F 27127 + +GLIBC_2.41 _ZGVsMxv_logp1 F 27128 + +GLIBC_2.41 _ZGVsMxv_logp1f F 27129 + 27130 + commit 354aeaf2130c1484007025563fe87c997f07324a 27131 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 27132 + Date: Mon Sep 23 15:26:12 2024 +0100 27133 + 27134 + AArch64: Improve codegen in SVE expf & related routines 27135 + 27136 + Reduce MOV and MOVPRFX by improving special-case handling. Use inline 27137 + helper to duplicate the entire computation between the special- and 27138 + non-special case branches, removing the contention for z0 between x 27139 + and the return value. 27140 + 27141 + Also rearrange some MLAs and MLSs - by making the multiplicand the 27142 + destination we can avoid a MOVPRFX in several cases. Also change which 27143 + constants go in the vector used for lanewise ops - the last lane is no 27144 + longer wasted. 27145 + 27146 + Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the 27147 + comment that explains it. Fixed - worst-case ULP for exp2f moves 27148 + around but it doesn't change significantly for either routine. 27149 + 27150 + Worst-case error for coshf increases due to passing x to exp rather 27151 + than abs(x) - updated the comment, but does not require regen-ulps. 
27152 + 27153 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 27154 + (cherry picked from commit 7b8c134b5460ed933d610fa92ed1227372b68fdc) 27155 + 27156 + diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c 27157 + index e5d8a299c6..7ad6efa0fc 100644 27158 + --- a/sysdeps/aarch64/fpu/coshf_sve.c 27159 + +++ b/sysdeps/aarch64/fpu/coshf_sve.c 27160 + @@ -23,37 +23,42 @@ 27161 + static const struct data 27162 + { 27163 + struct sv_expf_data expf_consts; 27164 + - uint32_t special_bound; 27165 + + float special_bound; 27166 + } data = { 27167 + .expf_consts = SV_EXPF_DATA, 27168 + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ 27169 + - .special_bound = 0x42ad496c, 27170 + + .special_bound = 0x1.5a92d8p+6, 27171 + }; 27172 + 27173 + static svfloat32_t NOINLINE 27174 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) 27175 + +special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, 27176 + + svbool_t pg) 27177 + { 27178 + - return sv_call_f32 (coshf, x, y, pg); 27179 + + return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e), 27180 + + pg); 27181 + } 27182 + 27183 + /* Single-precision vector cosh, using vector expf. 27184 + - Maximum error is 1.89 ULP: 27185 + - _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 27186 + - want 0x1.f00adcp+127. */ 27187 + + Maximum error is 2.77 ULP: 27188 + + _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 27189 + + want 0x1.e4594cp+2. */ 27190 + svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) 27191 + { 27192 + const struct data *d = ptr_barrier (&data); 27193 + 27194 + - svfloat32_t ax = svabs_x (pg, x); 27195 + - svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); 27196 + + svbool_t special = svacge (pg, x, d->special_bound); 27197 + 27198 + - /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ 27199 + - svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); 27200 + - svfloat32_t half_t = svmul_x (pg, t, 0.5); 27201 + - svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); 27202 + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 27203 + + Note that x is passed to exp here, rather than |x|. This is to avoid using 27204 + + destructive unary ABS for better register usage. However it means the 27205 + + routine is not exactly symmetrical, as the exp helper is slightly less 27206 + + accurate in the negative range. */ 27207 + + svfloat32_t e = expf_inline (x, pg, &d->expf_consts); 27208 + + svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5); 27209 + + svfloat32_t half_over_e = svdivr_x (pg, e, 0.5); 27210 + 27211 + if (__glibc_unlikely (svptest_any (pg, special))) 27212 + - return special_case (x, svadd_x (pg, half_t, half_over_t), special); 27213 + + return special_case (x, half_e, half_over_e, special); 27214 + 27215 + - return svadd_x (pg, half_t, half_over_t); 27216 + + return svadd_x (svptrue_b32 (), half_e, half_over_e); 27217 + } 27218 + diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c 27219 + index e09b2f3b27..8aa3fa9c43 100644 27220 + --- a/sysdeps/aarch64/fpu/exp10f_sve.c 27221 + +++ b/sysdeps/aarch64/fpu/exp10f_sve.c 27222 + @@ -18,74 +18,83 @@ 27223 + <https://www.gnu.org/licenses/>. */ 27224 + 27225 + #include "sv_math.h" 27226 + -#include "poly_sve_f32.h" 27227 + 27228 + -/* For x < -SpecialBound, the result is subnormal and not handled correctly by 27229 + +/* For x < -Thres, the result is subnormal and not handled correctly by 27230 + FEXPA. 
*/ 27231 + -#define SpecialBound 37.9 27232 + +#define Thres 37.9 27233 + 27234 + static const struct data 27235 + { 27236 + - float poly[5]; 27237 + - float shift, log10_2, log2_10_hi, log2_10_lo, special_bound; 27238 + + float log2_10_lo, c0, c2, c4; 27239 + + float c1, c3, log10_2; 27240 + + float shift, log2_10_hi, thres; 27241 + } data = { 27242 + /* Coefficients generated using Remez algorithm with minimisation of relative 27243 + error. 27244 + rel error: 0x1.89dafa3p-24 27245 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] 27246 + maxerr: 0.52 +0.5 ulp. */ 27247 + - .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f, 27248 + - 0x1.12b41ap-1f }, 27249 + + .c0 = 0x1.26bb16p+1f, 27250 + + .c1 = 0x1.5350d2p+1f, 27251 + + .c2 = 0x1.04744ap+1f, 27252 + + .c3 = 0x1.2d8176p+0f, 27253 + + .c4 = 0x1.12b41ap-1f, 27254 + /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ 27255 + - .shift = 0x1.903f8p17f, 27256 + + .shift = 0x1.803f8p17f, 27257 + .log10_2 = 0x1.a934fp+1, 27258 + .log2_10_hi = 0x1.344136p-2, 27259 + .log2_10_lo = -0x1.ec10cp-27, 27260 + - .special_bound = SpecialBound, 27261 + + .thres = Thres, 27262 + }; 27263 + 27264 + -static svfloat32_t NOINLINE 27265 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27266 + +static inline svfloat32_t 27267 + +sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) 27268 + { 27269 + - return sv_call_f32 (exp10f, x, y, special); 27270 + -} 27271 + - 27272 + -/* Single-precision SVE exp10f routine. Implements the same algorithm 27273 + - as AdvSIMD exp10f. 27274 + - Worst case error is 1.02 ULPs. 27275 + - _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 27276 + - want 0x1.ba5f9cp-1. 
*/ 27277 + -svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) 27278 + -{ 27279 + - const struct data *d = ptr_barrier (&data); 27280 + /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), 27281 + with poly(r) in [1/sqrt(2), sqrt(2)] and 27282 + x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ 27283 + 27284 + - /* Load some constants in quad-word chunks to minimise memory access (last 27285 + - lane is wasted). */ 27286 + - svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2); 27287 + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); 27288 + 27289 + /* n = round(x/(log10(2)/N)). */ 27290 + svfloat32_t shift = sv_f32 (d->shift); 27291 + - svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); 27292 + - svfloat32_t n = svsub_x (pg, z, shift); 27293 + + svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); 27294 + + svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); 27295 + 27296 + /* r = x - n*log10(2)/N. */ 27297 + - svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); 27298 + - r = svmls_lane (r, n, log10_2_and_inv, 2); 27299 + + svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); 27300 + + r = svmls_lane (r, n, lane_consts, 0); 27301 + 27302 + - svbool_t special = svacgt (pg, x, d->special_bound); 27303 + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); 27304 + 27305 + /* Polynomial evaluation: poly(r) ~ exp10(r)-1. 
*/ 27306 + - svfloat32_t r2 = svmul_x (pg, r, r); 27307 + - svfloat32_t poly 27308 + - = svmla_x (pg, svmul_x (pg, r, d->poly[0]), 27309 + - sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2); 27310 + - 27311 + - if (__glibc_unlikely (svptest_any (pg, special))) 27312 + - return special_case (x, svmla_x (pg, scale, scale, poly), special); 27313 + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); 27314 + + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); 27315 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27316 + + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); 27317 + + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); 27318 + + svfloat32_t poly = svmla_x (pg, p0, r2, p14); 27319 + 27320 + return svmla_x (pg, scale, scale, poly); 27321 + } 27322 + + 27323 + +static svfloat32_t NOINLINE 27324 + +special_case (svfloat32_t x, svbool_t special, const struct data *d) 27325 + +{ 27326 + + return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d), 27327 + + special); 27328 + +} 27329 + + 27330 + +/* Single-precision SVE exp10f routine. Implements the same algorithm 27331 + + as AdvSIMD exp10f. 27332 + + Worst case error is 1.02 ULPs. 27333 + + _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 27334 + + want 0x1.ba5f9cp-1. 
*/ 27335 + +svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) 27336 + +{ 27337 + + const struct data *d = ptr_barrier (&data); 27338 + + svbool_t special = svacgt (pg, x, d->thres); 27339 + + if (__glibc_unlikely (svptest_any (special, special))) 27340 + + return special_case (x, special, d); 27341 + + return sv_exp10f_inline (x, pg, d); 27342 + +} 27343 + diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c 27344 + index 8a686e3e05..c6216bed9e 100644 27345 + --- a/sysdeps/aarch64/fpu/exp2f_sve.c 27346 + +++ b/sysdeps/aarch64/fpu/exp2f_sve.c 27347 + @@ -24,54 +24,64 @@ 27348 + 27349 + static const struct data 27350 + { 27351 + - float poly[5]; 27352 + + float c0, c2, c4, c1, c3; 27353 + float shift, thres; 27354 + } data = { 27355 + - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for 27356 + - compatibility with polynomial helpers. */ 27357 + - .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f, 27358 + - 0x1.59977ap-10f }, 27359 + + /* Coefficients copied from the polynomial in AdvSIMD variant. */ 27360 + + .c0 = 0x1.62e422p-1f, 27361 + + .c1 = 0x1.ebf9bcp-3f, 27362 + + .c2 = 0x1.c6bd32p-5f, 27363 + + .c3 = 0x1.3ce9e4p-7f, 27364 + + .c4 = 0x1.59977ap-10f, 27365 + /* 1.5*2^17 + 127. */ 27366 + - .shift = 0x1.903f8p17f, 27367 + + .shift = 0x1.803f8p17f, 27368 + /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled 27369 + correctly by FEXPA. */ 27370 + .thres = Thres, 27371 + }; 27372 + 27373 + -static svfloat32_t NOINLINE 27374 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27375 + -{ 27376 + - return sv_call_f32 (exp2f, x, y, special); 27377 + -} 27378 + - 27379 + -/* Single-precision SVE exp2f routine. Implements the same algorithm 27380 + - as AdvSIMD exp2f. 27381 + - Worst case error is 1.04 ULPs. 27382 + - SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0 27383 + - want 0x1.ba7ebp+0. 
*/ 27384 + -svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) 27385 + +static inline svfloat32_t 27386 + +sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) 27387 + { 27388 + - const struct data *d = ptr_barrier (&data); 27389 + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 27390 + x = n + r, with r in [-1/2, 1/2]. */ 27391 + - svfloat32_t shift = sv_f32 (d->shift); 27392 + - svfloat32_t z = svadd_x (pg, x, shift); 27393 + - svfloat32_t n = svsub_x (pg, z, shift); 27394 + - svfloat32_t r = svsub_x (pg, x, n); 27395 + + svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift); 27396 + + svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift); 27397 + + svfloat32_t r = svsub_x (svptrue_b32 (), x, n); 27398 + 27399 + - svbool_t special = svacgt (pg, x, d->thres); 27400 + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); 27401 + 27402 + /* Polynomial evaluation: poly(r) ~ exp2(r)-1. 27403 + Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for 27404 + coefficients 1 to 4, and apply most significant coefficient directly. 
*/ 27405 + - svfloat32_t r2 = svmul_x (pg, r, r); 27406 + - svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1); 27407 + - svfloat32_t p0 = svmul_x (pg, r, d->poly[0]); 27408 + + svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); 27409 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27410 + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); 27411 + + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); 27412 + + svfloat32_t p14 = svmla_x (pg, p12, r2, p34); 27413 + + svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); 27414 + svfloat32_t poly = svmla_x (pg, p0, r2, p14); 27415 + 27416 + - if (__glibc_unlikely (svptest_any (pg, special))) 27417 + - return special_case (x, svmla_x (pg, scale, scale, poly), special); 27418 + - 27419 + return svmla_x (pg, scale, scale, poly); 27420 + } 27421 + + 27422 + +static svfloat32_t NOINLINE 27423 + +special_case (svfloat32_t x, svbool_t special, const struct data *d) 27424 + +{ 27425 + + return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d), 27426 + + special); 27427 + +} 27428 + + 27429 + +/* Single-precision SVE exp2f routine. Implements the same algorithm 27430 + + as AdvSIMD exp2f. 27431 + + Worst case error is 1.04 ULPs. 27432 + + _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 27433 + + want 0x1.ba6a64p-1. */ 27434 + +svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) 27435 + +{ 27436 + + const struct data *d = ptr_barrier (&data); 27437 + + svbool_t special = svacgt (pg, x, d->thres); 27438 + + if (__glibc_unlikely (svptest_any (special, special))) 27439 + + return special_case (x, special, d); 27440 + + return sv_exp2f_inline (x, pg, d); 27441 + +} 27442 + diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c 27443 + index 3ba79bc4f1..da93e01b87 100644 27444 + --- a/sysdeps/aarch64/fpu/expf_sve.c 27445 + +++ b/sysdeps/aarch64/fpu/expf_sve.c 27446 + @@ -18,33 +18,25 @@ 27447 + <https://www.gnu.org/licenses/>. 
*/ 27448 + 27449 + #include "sv_math.h" 27450 + +#include "sv_expf_inline.h" 27451 + + 27452 + +/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled 27453 + + correctly by FEXPA. */ 27454 + +#define Thres 0x1.5d5e2ap+6f 27455 + 27456 + static const struct data 27457 + { 27458 + - float poly[5]; 27459 + - float inv_ln2, ln2_hi, ln2_lo, shift, thres; 27460 + + struct sv_expf_data d; 27461 + + float thres; 27462 + } data = { 27463 + - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for 27464 + - compatibility with polynomial helpers. */ 27465 + - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, 27466 + - 0x1.0e4020p-7f }, 27467 + - .inv_ln2 = 0x1.715476p+0f, 27468 + - .ln2_hi = 0x1.62e4p-1f, 27469 + - .ln2_lo = 0x1.7f7d1cp-20f, 27470 + - /* 1.5*2^17 + 127. */ 27471 + - .shift = 0x1.903f8p17f, 27472 + - /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled 27473 + - correctly by FEXPA. */ 27474 + - .thres = 0x1.5d5e2ap+6f, 27475 + + .d = SV_EXPF_DATA, 27476 + + .thres = Thres, 27477 + }; 27478 + 27479 + -#define C(i) sv_f32 (d->poly[i]) 27480 + -#define ExponentBias 0x3f800000 27481 + - 27482 + static svfloat32_t NOINLINE 27483 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27484 + +special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) 27485 + { 27486 + - return sv_call_f32 (expf, x, y, special); 27487 + + return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special); 27488 + } 27489 + 27490 + /* Optimised single-precision SVE exp function. 27491 + @@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27492 + svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) 27493 + { 27494 + const struct data *d = ptr_barrier (&data); 27495 + - 27496 + - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 27497 + - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ 27498 + - 27499 + - /* Load some constants in quad-word chunks to minimise memory access (last 27500 + - lane is wasted). */ 27501 + - svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); 27502 + - 27503 + - /* n = round(x/(ln2/N)). */ 27504 + - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); 27505 + - svfloat32_t n = svsub_x (pg, z, d->shift); 27506 + - 27507 + - /* r = x - n*ln2/N. */ 27508 + - svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); 27509 + - r = svmls_lane (r, n, invln2_and_ln2, 2); 27510 + - 27511 + - /* scale = 2^(n/N). */ 27512 + svbool_t is_special_case = svacgt (pg, x, d->thres); 27513 + - svfloat32_t scale = svexpa (svreinterpret_u32 (z)); 27514 + - 27515 + - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ 27516 + - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); 27517 + - svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); 27518 + - svfloat32_t r2 = svmul_x (pg, r, r); 27519 + - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); 27520 + - svfloat32_t p0 = svmul_x (pg, r, C (0)); 27521 + - svfloat32_t poly = svmla_x (pg, p0, r2, p14); 27522 + - 27523 + if (__glibc_unlikely (svptest_any (pg, is_special_case))) 27524 + - return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); 27525 + - 27526 + - return svmla_x (pg, scale, scale, poly); 27527 + + return special_case (x, is_special_case, &d->d); 27528 + + return expf_inline (x, pg, &d->d); 27529 + } 27530 + diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h 27531 + index 23963b5f8e..6166df6553 100644 27532 + --- a/sysdeps/aarch64/fpu/sv_expf_inline.h 27533 + +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h 27534 + @@ -24,19 +24,20 @@ 27535 + 27536 + struct sv_expf_data 27537 + { 27538 + - float poly[5]; 27539 + - float inv_ln2, ln2_hi, ln2_lo, shift; 27540 + + float c1, c3, inv_ln2; 27541 + + float ln2_lo, c0, c2, c4; 27542 + + float ln2_hi, shift; 27543 + }; 27544 + 27545 + /* 
Coefficients copied from the polynomial in AdvSIMD variant, reversed for 27546 + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ 27547 + #define SV_EXPF_DATA \ 27548 + { \ 27549 + - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ 27550 + - 0x1.0e4020p-7f }, \ 27551 + - \ 27552 + - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ 27553 + - .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ 27554 + + /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ 27555 + + .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ 27556 + + .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ 27557 + + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ 27558 + + .shift = 0x1.803f8p17f, \ 27559 + } 27560 + 27561 + #define C(i) sv_f32 (d->poly[i]) 27562 + @@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) 27563 + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 27564 + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 27565 + 27566 + - /* Load some constants in quad-word chunks to minimise memory access. */ 27567 + - svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); 27568 + + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); 27569 + 27570 + /* n = round(x/(ln2/N)). */ 27571 + - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); 27572 + + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); 27573 + svfloat32_t n = svsub_x (pg, z, d->shift); 27574 + 27575 + /* r = x - n*ln2/N. */ 27576 + - svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); 27577 + - r = svmls_lane (r, n, c4_invln2_and_ln2, 3); 27578 + + svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); 27579 + + r = svmls_lane (r, n, lane_consts, 0); 27580 + 27581 + /* scale = 2^(n/N). 
*/ 27582 + - svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); 27583 + + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); 27584 + 27585 + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ 27586 + - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); 27587 + - svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); 27588 + - svfloat32_t r2 = svmul_f32_x (pg, r, r); 27589 + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); 27590 + + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); 27591 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27592 + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); 27593 + - svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); 27594 + + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); 27595 + svfloat32_t poly = svmla_x (pg, p0, r2, p14); 27596 + 27597 + return svmla_x (pg, scale, scale, poly); 27598 + 27599 + commit c4373426e3a85ec483a0f412c2a7c6cdfa32ccdb 27600 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 27601 + Date: Mon Sep 23 15:30:20 2024 +0100 27602 + 27603 + AArch64: Improve codegen in SVE F32 logs 27604 + 27605 + Reduce MOVPRFXs by using unpredicated (non-destructive) instructions 27606 + where possible. Similar to the recent change to AdvSIMD F32 logs, 27607 + adjust special-case arguments and bounds to allow for more optimal 27608 + register usage. For all 3 routines one MOVPRFX remains in the 27609 + reduction, which cannot be avoided as immediate AND and ASR are both 27610 + destructive. 
27611 + 27612 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 27613 + (cherry picked from commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d) 27614 + 27615 + diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c 27616 + index bdbb49cd32..7913679f67 100644 27617 + --- a/sysdeps/aarch64/fpu/log10f_sve.c 27618 + +++ b/sysdeps/aarch64/fpu/log10f_sve.c 27619 + @@ -24,6 +24,7 @@ static const struct data 27620 + float poly_0246[4]; 27621 + float poly_1357[4]; 27622 + float ln2, inv_ln10; 27623 + + uint32_t off, lower; 27624 + } data = { 27625 + .poly_1357 = { 27626 + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs 27627 + @@ -35,18 +36,23 @@ static const struct data 27628 + -0x1.0fc92cp-4f }, 27629 + .ln2 = 0x1.62e43p-1f, 27630 + .inv_ln10 = 0x1.bcb7b2p-2f, 27631 + + .off = 0x3f2aaaab, 27632 + + /* Lower bound is the smallest positive normal float 0x00800000. For 27633 + + optimised register use subnormals are detected after offset has been 27634 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ 27635 + + .lower = 0x00800000 - 0x3f2aaaab 27636 + }; 27637 + 27638 + -#define Min 0x00800000 27639 + -#define Max 0x7f800000 27640 + -#define Thres 0x7f000000 /* Max - Min. */ 27641 + -#define Offset 0x3f2aaaab /* 0.666667. */ 27642 + +#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. 
*/ 27643 + #define MantissaMask 0x007fffff 27644 + 27645 + static svfloat32_t NOINLINE 27646 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27647 + +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, 27648 + + svbool_t cmp) 27649 + { 27650 + - return sv_call_f32 (log10f, x, y, special); 27651 + + return sv_call_f32 ( 27652 + + log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), 27653 + + svmla_x (svptrue_b32 (), p, r2, y), cmp); 27654 + } 27655 + 27656 + /* Optimised implementation of SVE log10f using the same algorithm and 27657 + @@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 27658 + svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) 27659 + { 27660 + const struct data *d = ptr_barrier (&data); 27661 + - svuint32_t ix = svreinterpret_u32 (x); 27662 + - svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); 27663 + + 27664 + + svuint32_t u_off = svreinterpret_u32 (x); 27665 + + 27666 + + u_off = svsub_x (pg, u_off, d->off); 27667 + + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres); 27668 + 27669 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 27670 + - ix = svsub_x (pg, ix, Offset); 27671 + svfloat32_t n = svcvt_f32_x ( 27672 + - pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ 27673 + - ix = svand_x (pg, ix, MantissaMask); 27674 + - ix = svadd_x (pg, ix, Offset); 27675 + + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */ 27676 + + svuint32_t ix = svand_x (pg, u_off, MantissaMask); 27677 + + ix = svadd_x (pg, ix, d->off); 27678 + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); 27679 + 27680 + /* y = log10(1+r) + n*log10(2) 27681 + log10(1+r) ~ r * InvLn(10) + P(r) 27682 + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for 27683 + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). 
*/ 27684 + - svfloat32_t r2 = svmul_x (pg, r, r); 27685 + - svfloat32_t r4 = svmul_x (pg, r2, r2); 27686 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27687 + + svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2); 27688 + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); 27689 + svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); 27690 + svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); 27691 + @@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) 27692 + hi = svmul_x (pg, hi, d->inv_ln10); 27693 + 27694 + if (__glibc_unlikely (svptest_any (pg, special))) 27695 + - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), 27696 + - special); 27697 + - return svmla_x (pg, hi, r2, y); 27698 + + return special_case (u_off, hi, r2, y, special); 27699 + + return svmla_x (svptrue_b32 (), hi, r2, y); 27700 + } 27701 + diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c 27702 + index 5031c42483..939d89bfb9 100644 27703 + --- a/sysdeps/aarch64/fpu/log2f_sve.c 27704 + +++ b/sysdeps/aarch64/fpu/log2f_sve.c 27705 + @@ -23,6 +23,7 @@ static const struct data 27706 + { 27707 + float poly_02468[5]; 27708 + float poly_1357[4]; 27709 + + uint32_t off, lower; 27710 + } data = { 27711 + .poly_1357 = { 27712 + /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs 27713 + @@ -32,18 +33,23 @@ static const struct data 27714 + }, 27715 + .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, 27716 + 0x1.9d8ecap-3f, 0x1.9e495p-3f }, 27717 + + .off = 0x3f2aaaab, 27718 + + /* Lower bound is the smallest positive normal float 0x00800000. For 27719 + + optimised register use subnormals are detected after offset has been 27720 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). 
*/ 27721 + + .lower = 0x00800000 - 0x3f2aaaab 27722 + }; 27723 + 27724 + -#define Min (0x00800000) 27725 + -#define Max (0x7f800000) 27726 + -#define Thres (0x7f000000) /* Max - Min. */ 27727 + +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */ 27728 + #define MantissaMask (0x007fffff) 27729 + -#define Off (0x3f2aaaab) /* 0.666667. */ 27730 + 27731 + static svfloat32_t NOINLINE 27732 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) 27733 + +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, 27734 + + svbool_t cmp) 27735 + { 27736 + - return sv_call_f32 (log2f, x, y, cmp); 27737 + + return sv_call_f32 ( 27738 + + log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), 27739 + + svmla_x (svptrue_b32 (), p, r2, y), cmp); 27740 + } 27741 + 27742 + /* Optimised implementation of SVE log2f, using the same algorithm 27743 + @@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) 27744 + { 27745 + const struct data *d = ptr_barrier (&data); 27746 + 27747 + - svuint32_t u = svreinterpret_u32 (x); 27748 + - svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); 27749 + + svuint32_t u_off = svreinterpret_u32 (x); 27750 + + 27751 + + u_off = svsub_x (pg, u_off, d->off); 27752 + + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); 27753 + 27754 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 27755 + - u = svsub_x (pg, u, Off); 27756 + svfloat32_t n = svcvt_f32_x ( 27757 + - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ 27758 + - u = svand_x (pg, u, MantissaMask); 27759 + - u = svadd_x (pg, u, Off); 27760 + + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ 27761 + + svuint32_t u = svand_x (pg, u_off, MantissaMask); 27762 + + u = svadd_x (pg, u, d->off); 27763 + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); 27764 + 27765 + /* y = log2(1+r) + n. 
*/ 27766 + - svfloat32_t r2 = svmul_x (pg, r, r); 27767 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27768 + 27769 + /* Evaluate polynomial using pairwise Horner scheme. */ 27770 + svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); 27771 + @@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) 27772 + y = svmla_x (pg, q_01, r2, y); 27773 + 27774 + if (__glibc_unlikely (svptest_any (pg, special))) 27775 + - return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); 27776 + - return svmla_x (pg, n, r, y); 27777 + + return special_case (u_off, n, r, y, special); 27778 + + return svmla_x (svptrue_b32 (), n, r, y); 27779 + } 27780 + diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c 27781 + index d64e810cfe..5b9324678d 100644 27782 + --- a/sysdeps/aarch64/fpu/logf_sve.c 27783 + +++ b/sysdeps/aarch64/fpu/logf_sve.c 27784 + @@ -24,6 +24,7 @@ static const struct data 27785 + float poly_0135[4]; 27786 + float poly_246[3]; 27787 + float ln2; 27788 + + uint32_t off, lower; 27789 + } data = { 27790 + .poly_0135 = { 27791 + /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so 27792 + @@ -32,19 +33,24 @@ static const struct data 27793 + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f 27794 + }, 27795 + .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, 27796 + - .ln2 = 0x1.62e43p-1f 27797 + + .ln2 = 0x1.62e43p-1f, 27798 + + .off = 0x3f2aaaab, 27799 + + /* Lower bound is the smallest positive normal float 0x00800000. For 27800 + + optimised register use subnormals are detected after offset has been 27801 + + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ 27802 + + .lower = 0x00800000 - 0x3f2aaaab 27803 + }; 27804 + 27805 + -#define Min (0x00800000) 27806 + -#define Max (0x7f800000) 27807 + -#define Thresh (0x7f000000) /* Max - Min. */ 27808 + +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. 
*/ 27809 + #define Mask (0x007fffff) 27810 + -#define Off (0x3f2aaaab) /* 0.666667. */ 27811 + 27812 + static svfloat32_t NOINLINE 27813 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) 27814 + +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, 27815 + + svbool_t cmp) 27816 + { 27817 + - return sv_call_f32 (logf, x, y, cmp); 27818 + + return sv_call_f32 ( 27819 + + logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), 27820 + + svmla_x (svptrue_b32 (), p, r2, y), cmp); 27821 + } 27822 + 27823 + /* Optimised implementation of SVE logf, using the same algorithm and 27824 + @@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) 27825 + { 27826 + const struct data *d = ptr_barrier (&data); 27827 + 27828 + - svuint32_t u = svreinterpret_u32 (x); 27829 + - svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); 27830 + + svuint32_t u_off = svreinterpret_u32 (x); 27831 + + 27832 + + u_off = svsub_x (pg, u_off, d->off); 27833 + + svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); 27834 + 27835 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ 27836 + - u = svsub_x (pg, u, Off); 27837 + svfloat32_t n = svcvt_f32_x ( 27838 + - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ 27839 + - u = svand_x (pg, u, Mask); 27840 + - u = svadd_x (pg, u, Off); 27841 + + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ 27842 + + 27843 + + svuint32_t u = svand_x (pg, u_off, Mask); 27844 + + u = svadd_x (pg, u, d->off); 27845 + svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); 27846 + 27847 + /* y = log(1+r) + n*ln2. */ 27848 + - svfloat32_t r2 = svmul_x (pg, r, r); 27849 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 27850 + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
*/ 27851 + svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); 27852 + svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); 27853 + @@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) 27854 + p = svmla_x (pg, r, n, d->ln2); 27855 + 27856 + if (__glibc_unlikely (svptest_any (pg, cmp))) 27857 + - return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); 27858 + + return special_case (u_off, p, r2, y, cmp); 27859 + return svmla_x (pg, p, r2, y); 27860 + } 27861 + 27862 + commit 520240173029fd03388ec01db9a5359291cbbd27 27863 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 27864 + Date: Mon Sep 23 15:32:14 2024 +0100 27865 + 27866 + AArch64: Improve codegen in users of AdvSIMD log1pf helper 27867 + 27868 + log1pf is quite register-intensive - use fewer registers for the 27869 + polynomial, and make various changes to shorten dependency chains in 27870 + parent routines. There is now no spilling with GCC 14. Accuracy moves 27871 + around a little - comments adjusted accordingly but does not require 27872 + regen-ulps. 27873 + 27874 + Use the helper in log1pf as well, instead of having separate 27875 + implementations. The more accurate polynomial means special-casing can 27876 + be simplified, and the shorter dependency chain avoids the usual dance 27877 + around v0, which is otherwise difficult. 27878 + 27879 + There is a small duplication of vectors containing 1.0f (or 0x3f800000) - 27880 + GCC is not currently able to efficiently handle values which fit in FMOV 27881 + but not MOVI, and are reinterpreted to integer. There may be potential 27882 + for more optimisation if this is fixed. 
27883 + 27884 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 27885 + (cherry picked from commit 5bc100bd4b7e00db3009ae93d25d303341545d23) 27886 + 27887 + diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c 27888 + index 8916dcbf40..004474acf9 100644 27889 + --- a/sysdeps/aarch64/fpu/acoshf_advsimd.c 27890 + +++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c 27891 + @@ -25,35 +25,32 @@ const static struct data 27892 + { 27893 + struct v_log1pf_data log1pf_consts; 27894 + uint32x4_t one; 27895 + - uint16x4_t thresh; 27896 + -} data = { 27897 + - .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, 27898 + - .one = V4 (0x3f800000), 27899 + - .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ 27900 + -}; 27901 + +} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; 27902 + + 27903 + +#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ 27904 + 27905 + static float32x4_t NOINLINE VPCS_ATTR 27906 + special_case (float32x4_t x, float32x4_t y, uint16x4_t special, 27907 + - const struct v_log1pf_data d) 27908 + + const struct v_log1pf_data *d) 27909 + { 27910 + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); 27911 + } 27912 + 27913 + /* Vector approximation for single-precision acosh, based on log1p. Maximum 27914 + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it 27915 + - is 2.78 ULP: 27916 + - __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 27917 + - want 0x1.ef9ea2p-3. 27918 + + is 3.00 ULP: 27919 + + _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 27920 + + want 0x1.ef0a7cp-4. 27921 + With exceptions disabled, we can compute u with a shorter dependency chain, 27922 + - which gives maximum error of 3.07 ULP: 27923 + - __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 27924 + - want 0x1.fbc7f4p-4. */ 27925 + + which gives maximum error of 3.22 ULP: 27926 + + _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 27927 + + want 0x1.fdcdd2p-5. 
*/ 27928 + 27929 + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) 27930 + { 27931 + const struct data *d = ptr_barrier (&data); 27932 + uint32x4_t ix = vreinterpretq_u32_f32 (x); 27933 + - uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); 27934 + + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); 27935 + 27936 + #if WANT_SIMD_EXCEPT 27937 + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use 27938 + @@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) 27939 + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); 27940 + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); 27941 + #else 27942 + - float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); 27943 + - float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); 27944 + + float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); 27945 + + float32x4_t u 27946 + + = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); 27947 + #endif 27948 + 27949 + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); 27950 + 27951 + if (__glibc_unlikely (v_any_u16h (special))) 27952 + - return special_case (x, y, special, d->log1pf_consts); 27953 + - return log1pf_inline (y, d->log1pf_consts); 27954 + + return special_case (x, y, special, &d->log1pf_consts); 27955 + + return log1pf_inline (y, &d->log1pf_consts); 27956 + } 27957 + libmvec_hidden_def (V_NAME_F1 (acosh)) 27958 + HALF_WIDTH_ALIAS_F1 (acosh) 27959 + diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c 27960 + index 09fd8a6143..eb789b91b6 100644 27961 + --- a/sysdeps/aarch64/fpu/asinhf_advsimd.c 27962 + +++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c 27963 + @@ -20,16 +20,16 @@ 27964 + #include "v_math.h" 27965 + #include "v_log1pf_inline.h" 27966 + 27967 + -#define SignMask v_u32 (0x80000000) 27968 + - 27969 + const static struct data 27970 + { 27971 + struct v_log1pf_data log1pf_consts; 27972 + + float32x4_t 
one; 27973 + uint32x4_t big_bound; 27974 + #if WANT_SIMD_EXCEPT 27975 + uint32x4_t tiny_bound; 27976 + #endif 27977 + } data = { 27978 + + .one = V4 (1), 27979 + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, 27980 + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ 27981 + #if WANT_SIMD_EXCEPT 27982 + @@ -38,20 +38,27 @@ const static struct data 27983 + }; 27984 + 27985 + static float32x4_t NOINLINE VPCS_ATTR 27986 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 27987 + +special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, 27988 + + uint32x4_t special, const struct data *d) 27989 + { 27990 + - return v_call_f32 (asinhf, x, y, special); 27991 + + return v_call_f32 ( 27992 + + asinhf, x, 27993 + + vreinterpretq_f32_u32 (veorq_u32 ( 27994 + + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), 27995 + + special); 27996 + } 27997 + 27998 + /* Single-precision implementation of vector asinh(x), using vector log1p. 27999 + - Worst-case error is 2.66 ULP, at roughly +/-0.25: 28000 + - __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ 28001 + + Worst-case error is 2.59 ULP: 28002 + + _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 28003 + + want 0x1.d449c4p-3. */ 28004 + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) 28005 + { 28006 + const struct data *dat = ptr_barrier (&data); 28007 + - uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); 28008 + - float32x4_t ax = vreinterpretq_f32_u32 (iax); 28009 + + float32x4_t ax = vabsq_f32 (x); 28010 + + uint32x4_t iax = vreinterpretq_u32_f32 (ax); 28011 + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); 28012 + + uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); 28013 + float32x4_t special_arg = x; 28014 + 28015 + #if WANT_SIMD_EXCEPT 28016 + @@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) 28017 + /* asinh(x) = log(x + sqrt(x * x + 1)). 
28018 + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ 28019 + float32x4_t d 28020 + - = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); 28021 + - float32x4_t y = log1pf_inline ( 28022 + - vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); 28023 + + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); 28024 + + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); 28025 + 28026 + if (__glibc_unlikely (v_any_u32 (special))) 28027 + - return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); 28028 + - return vbslq_f32 (SignMask, x, y); 28029 + + return special_case (special_arg, sign, y, special, dat); 28030 + + return vreinterpretq_f32_u32 (veorq_u32 ( 28031 + + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); 28032 + } 28033 + libmvec_hidden_def (V_NAME_F1 (asinh)) 28034 + HALF_WIDTH_ALIAS_F1 (asinh) 28035 + diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c 28036 + index ae488f7b54..818b6c92ad 100644 28037 + --- a/sysdeps/aarch64/fpu/atanhf_advsimd.c 28038 + +++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c 28039 + @@ -40,15 +40,17 @@ const static struct data 28040 + #define Half v_u32 (0x3f000000) 28041 + 28042 + static float32x4_t NOINLINE VPCS_ATTR 28043 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 28044 + +special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, 28045 + + uint32x4_t special) 28046 + { 28047 + - return v_call_f32 (atanhf, x, y, special); 28048 + + return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), 28049 + + vmulq_f32 (halfsign, y), special); 28050 + } 28051 + 28052 + /* Approximation for vector single-precision atanh(x) using modified log1p. 28053 + - The maximum error is 3.08 ULP: 28054 + - __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 28055 + - want 0x1.ffcb82p-5. 
*/ 28056 + + The maximum error is 2.93 ULP: 28057 + + _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 28058 + + want 0x1.f4dcf8p-5. */ 28059 + VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) 28060 + { 28061 + const struct data *d = ptr_barrier (&data); 28062 + @@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) 28063 + uint32x4_t special = vcgeq_u32 (iax, d->one); 28064 + #endif 28065 + 28066 + - float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); 28067 + - y = log1pf_inline (y, d->log1pf_consts); 28068 + + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), 28069 + + vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); 28070 + + y = log1pf_inline (y, &d->log1pf_consts); 28071 + 28072 + + /* If exceptions not required, pass ax to special-case for shorter dependency 28073 + + chain. If exceptions are required ax will have been zerofied, so have to 28074 + + pass x. */ 28075 + if (__glibc_unlikely (v_any_u32 (special))) 28076 + - return special_case (x, vmulq_f32 (halfsign, y), special); 28077 + +#if WANT_SIMD_EXCEPT 28078 + + return special_case (x, halfsign, y, special); 28079 + +#else 28080 + + return special_case (ax, halfsign, y, special); 28081 + +#endif 28082 + return vmulq_f32 (halfsign, y); 28083 + } 28084 + libmvec_hidden_def (V_NAME_F1 (atanh)) 28085 + diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c 28086 + index 8cfa28fb8a..00006fc703 100644 28087 + --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c 28088 + +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c 28089 + @@ -18,114 +18,79 @@ 28090 + <https://www.gnu.org/licenses/>. 
*/ 28091 + 28092 + #include "v_math.h" 28093 + -#include "poly_advsimd_f32.h" 28094 + +#include "v_log1pf_inline.h" 28095 + + 28096 + +#if WANT_SIMD_EXCEPT 28097 + 28098 + const static struct data 28099 + { 28100 + - float32x4_t poly[8], ln2; 28101 + - uint32x4_t tiny_bound, minus_one, four, thresh; 28102 + - int32x4_t three_quarters; 28103 + + uint32x4_t minus_one, thresh; 28104 + + struct v_log1pf_data d; 28105 + } data = { 28106 + - .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients 28107 + - (1, -0.5) are not stored as they can be generated more 28108 + - efficiently. */ 28109 + - V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), 28110 + - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), 28111 + - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, 28112 + - .ln2 = V4 (0x1.62e43p-1f), 28113 + - .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ 28114 + - .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ 28115 + + .d = V_LOG1PF_CONSTANTS_TABLE, 28116 + + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ 28117 + .minus_one = V4 (0xbf800000), 28118 + - .four = V4 (0x40800000), 28119 + - .three_quarters = V4 (0x3f400000) 28120 + }; 28121 + 28122 + -static inline float32x4_t 28123 + -eval_poly (float32x4_t m, const float32x4_t *p) 28124 + -{ 28125 + - /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. 
*/ 28126 + - float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); 28127 + - float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); 28128 + - float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); 28129 + - float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); 28130 + - 28131 + - float32x4_t m2 = vmulq_f32 (m, m); 28132 + - float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); 28133 + - float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); 28134 + - float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); 28135 + - 28136 + - float32x4_t m4 = vmulq_f32 (m2, m2); 28137 + - float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); 28138 + - return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); 28139 + -} 28140 + +/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ 28141 + +# define TinyBound v_u32 (0x34000000) 28142 + 28143 + static float32x4_t NOINLINE VPCS_ATTR 28144 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 28145 + +special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) 28146 + { 28147 + - return v_call_f32 (log1pf, x, y, special); 28148 + + /* Side-step special lanes so fenv exceptions are not triggered 28149 + + inadvertently. */ 28150 + + float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); 28151 + + return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); 28152 + } 28153 + 28154 + -/* Vector log1pf approximation using polynomial on reduced interval. Accuracy 28155 + - is roughly 2.02 ULP: 28156 + - log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ 28157 + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case 28158 + + error is 1.69 ULP: 28159 + + _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 28160 + + want 0x1.cfcbdcp-3. 
*/ 28161 + VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) 28162 + { 28163 + const struct data *d = ptr_barrier (&data); 28164 + - 28165 + uint32x4_t ix = vreinterpretq_u32_f32 (x); 28166 + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); 28167 + + 28168 + uint32x4_t special_cases 28169 + - = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), 28170 + + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), 28171 + vcgeq_u32 (ix, d->minus_one)); 28172 + - float32x4_t special_arg = x; 28173 + 28174 + -#if WANT_SIMD_EXCEPT 28175 + if (__glibc_unlikely (v_any_u32 (special_cases))) 28176 + - /* Side-step special lanes so fenv exceptions are not triggered 28177 + - inadvertently. */ 28178 + - x = v_zerofy_f32 (x, special_cases); 28179 + -#endif 28180 + + return special_case (x, special_cases, d); 28181 + 28182 + - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m 28183 + - is in [-0.25, 0.5]): 28184 + - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). 28185 + - 28186 + - We approximate log1p(m) with a polynomial, then scale by 28187 + - k*log(2). Instead of doing this directly, we use an intermediate 28188 + - scale factor s = 4*k*log(2) to ensure the scale is representable 28189 + - as a normalised fp32 number. */ 28190 + + return log1pf_inline (x, &d->d); 28191 + +} 28192 + 28193 + - float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); 28194 + +#else 28195 + 28196 + - /* Choose k to scale x to the range [-1/4, 1/2]. */ 28197 + - int32x4_t k 28198 + - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), 28199 + - v_s32 (0xff800000)); 28200 + - uint32x4_t ku = vreinterpretq_u32_s32 (k); 28201 + +const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; 28202 + 28203 + - /* Scale x by exponent manipulation. 
*/ 28204 + - float32x4_t m_scale 28205 + - = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); 28206 + +static float32x4_t NOINLINE VPCS_ATTR 28207 + +special_case (float32x4_t x, uint32x4_t cmp) 28208 + +{ 28209 + + return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); 28210 + +} 28211 + 28212 + - /* Scale up to ensure that the scale factor is representable as normalised 28213 + - fp32 number, and scale m down accordingly. */ 28214 + - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); 28215 + - m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); 28216 + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case 28217 + + error is 1.63 ULP: 28218 + + _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 28219 + + want 0x1.fdcb16p-3. */ 28220 + +VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) 28221 + +{ 28222 + + uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), 28223 + + vcaleq_f32 (x, v_f32 (0x1p127f))); 28224 + 28225 + - /* Evaluate polynomial on the reduced interval. */ 28226 + - float32x4_t p = eval_poly (m_scale, d->poly); 28227 + + if (__glibc_unlikely (v_any_u32 (special_cases))) 28228 + + return special_case (x, special_cases); 28229 + 28230 + - /* The scale factor to be applied back at the end - by multiplying float(k) 28231 + - by 2^-23 we get the unbiased exponent of k. */ 28232 + - float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); 28233 + + return log1pf_inline (x, ptr_barrier (&data)); 28234 + +} 28235 + 28236 + - /* Apply the scaling back. 
*/ 28237 + - float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); 28238 + +#endif 28239 + 28240 + - if (__glibc_unlikely (v_any_u32 (special_cases))) 28241 + - return special_case (special_arg, y, special_cases); 28242 + - return y; 28243 + -} 28244 + libmvec_hidden_def (V_NAME_F1 (log1p)) 28245 + HALF_WIDTH_ALIAS_F1 (log1p) 28246 + strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) 28247 + diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h 28248 + index 643a6cdcfc..73e45a942e 100644 28249 + --- a/sysdeps/aarch64/fpu/v_log1pf_inline.h 28250 + +++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h 28251 + @@ -25,54 +25,81 @@ 28252 + 28253 + struct v_log1pf_data 28254 + { 28255 + - float32x4_t poly[8], ln2; 28256 + uint32x4_t four; 28257 + int32x4_t three_quarters; 28258 + + float c0, c3, c5, c7; 28259 + + float32x4_t c4, c6, c1, c2, ln2; 28260 + }; 28261 + 28262 + /* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients 28263 + (1, -0.5) are not stored as they can be generated more efficiently. 
*/ 28264 + #define V_LOG1PF_CONSTANTS_TABLE \ 28265 + { \ 28266 + - .poly \ 28267 + - = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ 28268 + - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ 28269 + - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ 28270 + - .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ 28271 + - .three_quarters = V4 (0x3f400000) \ 28272 + + .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ 28273 + + .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ 28274 + + .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ 28275 + + .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ 28276 + + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ 28277 + + .three_quarters = V4 (0x3f400000) \ 28278 + } 28279 + 28280 + static inline float32x4_t 28281 + -eval_poly (float32x4_t m, const float32x4_t *c) 28282 + +eval_poly (float32x4_t m, const struct v_log1pf_data *d) 28283 + { 28284 + - /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine 28285 + - uses split Estrin, but this way reduces register pressure in the calling 28286 + - routine). */ 28287 + - float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); 28288 + + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. 
*/ 28289 + + float32x4_t c0357 = vld1q_f32 (&d->c0); 28290 + + float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); 28291 + float32x4_t m2 = vmulq_f32 (m, m); 28292 + - q = vfmaq_f32 (m, m2, q); 28293 + - float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); 28294 + + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); 28295 + + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); 28296 + + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); 28297 + + float32x4_t p = vfmaq_f32 (p45, m2, p67); 28298 + + p = vfmaq_f32 (p23, m2, p); 28299 + + p = vfmaq_f32 (d->c1, m, p); 28300 + p = vmulq_f32 (m2, p); 28301 + - return vfmaq_f32 (q, m2, p); 28302 + + p = vfmaq_f32 (m, m2, p); 28303 + + return vfmaq_f32 (p, m2, q); 28304 + } 28305 + 28306 + static inline float32x4_t 28307 + -log1pf_inline (float32x4_t x, const struct v_log1pf_data d) 28308 + +log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) 28309 + { 28310 + - /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no 28311 + - special-case handling. See that file for details of the algorithm. */ 28312 + + /* Helper for calculating log(x + 1). */ 28313 + + 28314 + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m 28315 + + is in [-0.25, 0.5]): 28316 + + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). 28317 + + 28318 + + We approximate log1p(m) with a polynomial, then scale by 28319 + + k*log(2). Instead of doing this directly, we use an intermediate 28320 + + scale factor s = 4*k*log(2) to ensure the scale is representable 28321 + + as a normalised fp32 number. */ 28322 + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); 28323 + + 28324 + + /* Choose k to scale x to the range [-1/4, 1/2]. 
*/ 28325 + int32x4_t k 28326 + - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), 28327 + + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), 28328 + v_s32 (0xff800000)); 28329 + uint32x4_t ku = vreinterpretq_u32_s32 (k); 28330 + - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); 28331 + + 28332 + + /* Scale up to ensure that the scale factor is representable as normalised 28333 + + fp32 number, and scale m down accordingly. */ 28334 + + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); 28335 + + 28336 + + /* Scale x by exponent manipulation. */ 28337 + float32x4_t m_scale 28338 + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); 28339 + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); 28340 + - float32x4_t p = eval_poly (m_scale, d.poly); 28341 + + 28342 + + /* Evaluate polynomial on the reduced interval. */ 28343 + + float32x4_t p = eval_poly (m_scale, d); 28344 + + 28345 + + /* The scale factor to be applied back at the end - by multiplying float(k) 28346 + + by 2^-23 we get the unbiased exponent of k. */ 28347 + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); 28348 + - return vfmaq_f32 (p, scale_back, d.ln2); 28349 + + 28350 + + /* Apply the scaling back. */ 28351 + + return vfmaq_f32 (p, scale_back, d->ln2); 28352 + } 28353 + 28354 + #endif 28355 + 28356 + commit a947a43b95bbea53ec50df058b42392fd5ea52b6 28357 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 28358 + Date: Mon Sep 23 15:32:53 2024 +0100 28359 + 28360 + AArch64: Improve codegen in users of ADVSIMD expm1f helper 28361 + 28362 + Rearrange operations so MOV is not necessary in reduction or around 28363 + the special-case handler. Reduce memory access by using more indexed 28364 + MLAs in polynomial. 
28365 + 28366 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 28367 + (cherry picked from commit 7900ac490db32f6bccff812733f00280dde34e27) 28368 + 28369 + diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c 28370 + index a0616ec754..8303ca296e 100644 28371 + --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c 28372 + +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c 28373 + @@ -18,27 +18,18 @@ 28374 + <https://www.gnu.org/licenses/>. */ 28375 + 28376 + #include "v_math.h" 28377 + -#include "poly_advsimd_f32.h" 28378 + +#include "v_expm1f_inline.h" 28379 + 28380 + static const struct data 28381 + { 28382 + - float32x4_t poly[5]; 28383 + - float invln2_and_ln2[4]; 28384 + - float32x4_t shift; 28385 + - int32x4_t exponent_bias; 28386 + + struct v_expm1f_data d; 28387 + #if WANT_SIMD_EXCEPT 28388 + uint32x4_t thresh; 28389 + #else 28390 + float32x4_t oflow_bound; 28391 + #endif 28392 + } data = { 28393 + - /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ 28394 + - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), 28395 + - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, 28396 + - /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ 28397 + - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, 28398 + - .shift = V4 (0x1.8p23f), 28399 + - .exponent_bias = V4 (0x3f800000), 28400 + + .d = V_EXPM1F_DATA, 28401 + #if !WANT_SIMD_EXCEPT 28402 + /* Value above which expm1f(x) should overflow. 
Absolute value of the 28403 + underflow bound is greater than this, so it catches both cases - there is 28404 + @@ -55,67 +46,38 @@ static const struct data 28405 + #define TinyBound v_u32 (0x34000000 << 1) 28406 + 28407 + static float32x4_t VPCS_ATTR NOINLINE 28408 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 28409 + +special_case (float32x4_t x, uint32x4_t special, const struct data *d) 28410 + { 28411 + - return v_call_f32 (expm1f, x, y, special); 28412 + + return v_call_f32 ( 28413 + + expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); 28414 + } 28415 + 28416 + /* Single-precision vector exp(x) - 1 function. 28417 + - The maximum error is 1.51 ULP: 28418 + - _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 28419 + - want 0x1.e2fb94p-2. */ 28420 + + The maximum error is 1.62 ULP: 28421 + + _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 28422 + + want 0x1.da9f44p-2. */ 28423 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) 28424 + { 28425 + const struct data *d = ptr_barrier (&data); 28426 + - uint32x4_t ix = vreinterpretq_u32_f32 (x); 28427 + 28428 + #if WANT_SIMD_EXCEPT 28429 + + uint32x4_t ix = vreinterpretq_u32_f32 (x); 28430 + /* If fp exceptions are to be triggered correctly, fall back to scalar for 28431 + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for 28432 + shift-left by 1, and compare with thresh which was left-shifted offline - 28433 + this is effectively an absolute compare. */ 28434 + uint32x4_t special 28435 + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); 28436 + - if (__glibc_unlikely (v_any_u32 (special))) 28437 + - x = v_zerofy_f32 (x, special); 28438 + #else 28439 + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. 
*/ 28440 + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); 28441 + #endif 28442 + 28443 + - /* Reduce argument to smaller range: 28444 + - Let i = round(x / ln2) 28445 + - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. 28446 + - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 28447 + - where 2^i is exact because i is an integer. */ 28448 + - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); 28449 + - float32x4_t j 28450 + - = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); 28451 + - int32x4_t i = vcvtq_s32_f32 (j); 28452 + - float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); 28453 + - f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); 28454 + - 28455 + - /* Approximate expm1(f) using polynomial. 28456 + - Taylor expansion for expm1(x) has the form: 28457 + - x + ax^2 + bx^3 + cx^4 .... 28458 + - So we calculate the polynomial P(f) = a + bf + cf^2 + ... 28459 + - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ 28460 + - float32x4_t p = v_horner_4_f32 (f, d->poly); 28461 + - p = vfmaq_f32 (f, vmulq_f32 (f, f), p); 28462 + - 28463 + - /* Assemble the result. 28464 + - expm1(x) ~= 2^i * (p + 1) - 1 28465 + - Let t = 2^i. */ 28466 + - int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); 28467 + - float32x4_t t = vreinterpretq_f32_s32 (u); 28468 + - 28469 + if (__glibc_unlikely (v_any_u32 (special))) 28470 + - return special_case (vreinterpretq_f32_u32 (ix), 28471 + - vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), 28472 + - special); 28473 + + return special_case (x, special, d); 28474 + 28475 + /* expm1(x) ~= p * t + (t - 1). 
*/ 28476 + - return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); 28477 + + return expm1f_inline (x, &d->d); 28478 + } 28479 + libmvec_hidden_def (V_NAME_F1 (expm1)) 28480 + HALF_WIDTH_ALIAS_F1 (expm1) 28481 + diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c 28482 + index 6bb7482dc2..c6ed7598e7 100644 28483 + --- a/sysdeps/aarch64/fpu/sinhf_advsimd.c 28484 + +++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c 28485 + @@ -23,15 +23,13 @@ 28486 + static const struct data 28487 + { 28488 + struct v_expm1f_data expm1f_consts; 28489 + - uint32x4_t halff; 28490 + #if WANT_SIMD_EXCEPT 28491 + uint32x4_t tiny_bound, thresh; 28492 + #else 28493 + - uint32x4_t oflow_bound; 28494 + + float32x4_t oflow_bound; 28495 + #endif 28496 + } data = { 28497 + .expm1f_consts = V_EXPM1F_DATA, 28498 + - .halff = V4 (0x3f000000), 28499 + #if WANT_SIMD_EXCEPT 28500 + /* 0x1.6a09e8p-32, below which expm1f underflows. */ 28501 + .tiny_bound = V4 (0x2fb504f4), 28502 + @@ -39,14 +37,15 @@ static const struct data 28503 + .thresh = V4 (0x12fbbbb3), 28504 + #else 28505 + /* 0x1.61814ep+6, above which expm1f helper overflows. */ 28506 + - .oflow_bound = V4 (0x42b0c0a7), 28507 + + .oflow_bound = V4 (0x1.61814ep+6), 28508 + #endif 28509 + }; 28510 + 28511 + static float32x4_t NOINLINE VPCS_ATTR 28512 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 28513 + +special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign, 28514 + + uint32x4_t special) 28515 + { 28516 + - return v_call_f32 (sinhf, x, y, special); 28517 + + return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special); 28518 + } 28519 + 28520 + /* Approximation for vector single-precision sinh(x) using expm1. 
28521 + @@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) 28522 + 28523 + uint32x4_t ix = vreinterpretq_u32_f32 (x); 28524 + float32x4_t ax = vabsq_f32 (x); 28525 + - uint32x4_t iax = vreinterpretq_u32_f32 (ax); 28526 + - uint32x4_t sign = veorq_u32 (ix, iax); 28527 + - float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff)); 28528 + + float32x4_t halfsign = vreinterpretq_f32_u32 ( 28529 + + vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5)))); 28530 + 28531 + #if WANT_SIMD_EXCEPT 28532 + - uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh); 28533 + + uint32x4_t special = vcgeq_u32 ( 28534 + + vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh); 28535 + ax = v_zerofy_f32 (ax, special); 28536 + #else 28537 + - uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound); 28538 + + uint32x4_t special = vcageq_f32 (x, d->oflow_bound); 28539 + #endif 28540 + 28541 + /* Up to the point that expm1f overflows, we can use it to calculate sinhf 28542 + @@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) 28543 + /* Fall back to the scalar variant for any lanes that should trigger an 28544 + exception. */ 28545 + if (__glibc_unlikely (v_any_u32 (special))) 28546 + - return special_case (x, vmulq_f32 (t, halfsign), special); 28547 + + return special_case (x, t, halfsign, special); 28548 + 28549 + return vmulq_f32 (t, halfsign); 28550 + } 28551 + diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c 28552 + index 50defd6ef0..3ced9b7a41 100644 28553 + --- a/sysdeps/aarch64/fpu/tanhf_advsimd.c 28554 + +++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c 28555 + @@ -28,13 +28,16 @@ static const struct data 28556 + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ 28557 + .boring_bound = V4 (0x41102cb3), 28558 + .large_bound = V4 (0x7f800000), 28559 + - .onef = V4 (0x3f800000), 28560 + }; 28561 + 28562 + static float32x4_t NOINLINE VPCS_ATTR 28563 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 28564 + +special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring, 28565 + + float32x4_t q, uint32x4_t special) 28566 + { 28567 + - return v_call_f32 (tanhf, x, y, special); 28568 + + return v_call_f32 ( 28569 + + tanhf, x, 28570 + + vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))), 28571 + + special); 28572 + } 28573 + 28574 + /* Approximation for single-precision vector tanh(x), using a simplified 28575 + @@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) 28576 + uint32x4_t iax = vreinterpretq_u32_f32 (ax); 28577 + uint32x4_t sign = veorq_u32 (ix, iax); 28578 + uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); 28579 + - float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); 28580 + + /* expm1 exponent bias is 1.0f reinterpreted to int. */ 28581 + + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( 28582 + + sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); 28583 + 28584 + #if WANT_SIMD_EXCEPT 28585 + /* If fp exceptions are to be triggered properly, set all special and boring 28586 + @@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) 28587 + 28588 + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ 28589 + float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); 28590 + - float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); 28591 + + 28592 + if (__glibc_unlikely (v_any_u32 (special))) 28593 + - return special_case (vreinterpretq_f32_u32 (ix), 28594 + - vbslq_f32 (is_boring, boring, y), special); 28595 + + return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q, 28596 + + special); 28597 + + 28598 + + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); 28599 + return vbslq_f32 (is_boring, boring, y); 28600 + } 28601 + libmvec_hidden_def (V_NAME_F1 (tanh)) 28602 + diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h 28603 + index 59b552da6b..1daedfdd51 100644 28604 + --- a/sysdeps/aarch64/fpu/v_expm1f_inline.h 28605 + +++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h 28606 + @@ -21,48 +21,47 @@ 28607 + #define AARCH64_FPU_V_EXPM1F_INLINE_H 28608 + 28609 + #include "v_math.h" 28610 + -#include "poly_advsimd_f32.h" 28611 + +#include "math_config.h" 28612 + 28613 + struct v_expm1f_data 28614 + { 28615 + - float32x4_t poly[5]; 28616 + - float invln2_and_ln2[4]; 28617 + - float32x4_t shift; 28618 + + float32x4_t c0, c2; 28619 + int32x4_t exponent_bias; 28620 + + float c1, c3, inv_ln2, c4; 28621 + + float ln2_hi, ln2_lo; 28622 + }; 28623 + 28624 + /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, 28625 + - log(2)/2]. Exponent bias is asuint(1.0f). 28626 + - invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */ 28627 + + log(2)/2]. Exponent bias is asuint(1.0f). 
*/ 28628 + #define V_EXPM1F_DATA \ 28629 + { \ 28630 + - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \ 28631 + - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \ 28632 + - .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \ 28633 + - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ 28634 + + .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ 28635 + + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ 28636 + + .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ 28637 + + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ 28638 + } 28639 + 28640 + static inline float32x4_t 28641 + expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) 28642 + { 28643 + - /* Helper routine for calculating exp(x) - 1. 28644 + - Copied from v_expm1f_1u6.c, with all special-case handling removed - the 28645 + - calling routine should handle special values if required. */ 28646 + + /* Helper routine for calculating exp(x) - 1. */ 28647 + + 28648 + + float32x2_t ln2 = vld1_f32 (&d->ln2_hi); 28649 + + float32x4_t lane_consts = vld1q_f32 (&d->c1); 28650 + 28651 + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ 28652 + - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); 28653 + - float32x4_t j 28654 + - = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); 28655 + + float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); 28656 + int32x4_t i = vcvtq_s32_f32 (j); 28657 + - float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); 28658 + - f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); 28659 + + float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); 28660 + + f = vfmsq_lane_f32 (f, j, ln2, 1); 28661 + 28662 + - /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). 28663 + - Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses 28664 + - Horner. */ 28665 + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). 
*/ 28666 + float32x4_t f2 = vmulq_f32 (f, f); 28667 + float32x4_t f4 = vmulq_f32 (f2, f2); 28668 + - float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); 28669 + + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); 28670 + + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); 28671 + + float32x4_t p = vfmaq_f32 (p01, f2, p23); 28672 + + p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); 28673 + p = vfmaq_f32 (f, f2, p); 28674 + 28675 + /* t = 2^i. */ 28676 + 28677 + commit 68f2eb20de698675ddc74068c2cd03fee29207df 28678 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 28679 + Date: Mon Sep 23 15:33:31 2024 +0100 28680 + 28681 + AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines 28682 + 28683 + This operation can be simplified to use simpler multiply-round-convert 28684 + sequence, which uses fewer instructions and constants. 28685 + 28686 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 28687 + (cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb) 28688 + 28689 + diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c 28690 + index 3924c9ce44..11a89b1530 100644 28691 + --- a/sysdeps/aarch64/fpu/cos_advsimd.c 28692 + +++ b/sysdeps/aarch64/fpu/cos_advsimd.c 28693 + @@ -22,7 +22,7 @@ 28694 + static const struct data 28695 + { 28696 + float64x2_t poly[7]; 28697 + - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; 28698 + + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; 28699 + } data = { 28700 + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
*/ 28701 + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), 28702 + @@ -30,11 +30,9 @@ static const struct data 28703 + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), 28704 + V2 (-0x1.9e9540300a1p-41) }, 28705 + .inv_pi = V2 (0x1.45f306dc9c883p-2), 28706 + - .half_pi = V2 (0x1.921fb54442d18p+0), 28707 + .pi_1 = V2 (0x1.921fb54442d18p+1), 28708 + .pi_2 = V2 (0x1.1a62633145c06p-53), 28709 + .pi_3 = V2 (0x1.c1cd129024e09p-106), 28710 + - .shift = V2 (0x1.8p52), 28711 + .range_val = V2 (0x1p23) 28712 + }; 28713 + 28714 + @@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) 28715 + #endif 28716 + 28717 + /* n = rint((|x|+pi/2)/pi) - 0.5. */ 28718 + - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); 28719 + - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); 28720 + - n = vsubq_f64 (n, d->shift); 28721 + - n = vsubq_f64 (n, v_f64 (0.5)); 28722 + + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); 28723 + + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); 28724 + + n = vsubq_f64 (n, v_f64 (0.5f)); 28725 + 28726 + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ 28727 + r = vfmsq_f64 (r, d->pi_1, n); 28728 + diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c 28729 + index d0c285b03a..85a1b37373 100644 28730 + --- a/sysdeps/aarch64/fpu/cosf_advsimd.c 28731 + +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c 28732 + @@ -22,7 +22,7 @@ 28733 + static const struct data 28734 + { 28735 + float32x4_t poly[4]; 28736 + - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; 28737 + + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; 28738 + } data = { 28739 + /* 1.886 ulp error. 
*/ 28740 + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), 28741 + @@ -33,8 +33,6 @@ static const struct data 28742 + .pi_3 = V4 (-0x1.ee59dap-49f), 28743 + 28744 + .inv_pi = V4 (0x1.45f306p-2f), 28745 + - .shift = V4 (0x1.8p+23f), 28746 + - .half_pi = V4 (0x1.921fb6p0f), 28747 + .range_val = V4 (0x1p20f) 28748 + }; 28749 + 28750 + @@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) 28751 + #endif 28752 + 28753 + /* n = rint((|x|+pi/2)/pi) - 0.5. */ 28754 + - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); 28755 + - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); 28756 + - n = vsubq_f32 (n, d->shift); 28757 + + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); 28758 + + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); 28759 + n = vsubq_f32 (n, v_f32 (0.5f)); 28760 + 28761 + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ 28762 + diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c 28763 + index 99d2e647aa..5c9cb72620 100644 28764 + --- a/sysdeps/aarch64/fpu/expf_advsimd.c 28765 + +++ b/sysdeps/aarch64/fpu/expf_advsimd.c 28766 + @@ -22,7 +22,7 @@ 28767 + static const struct data 28768 + { 28769 + float32x4_t poly[5]; 28770 + - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; 28771 + + float32x4_t inv_ln2, ln2_hi, ln2_lo; 28772 + uint32x4_t exponent_bias; 28773 + #if !WANT_SIMD_EXCEPT 28774 + float32x4_t special_bound, scale_thresh; 28775 + @@ -31,7 +31,6 @@ static const struct data 28776 + /* maxerr: 1.45358 +0.5 ulp. 
*/ 28777 + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), 28778 + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, 28779 + - .shift = V4 (0x1.8p23f), 28780 + .inv_ln2 = V4 (0x1.715476p+0f), 28781 + .ln2_hi = V4 (0x1.62e4p-1f), 28782 + .ln2_lo = V4 (0x1.7f7d1cp-20f), 28783 + @@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 28784 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) 28785 + { 28786 + const struct data *d = ptr_barrier (&data); 28787 + - float32x4_t n, r, r2, scale, p, q, poly, z; 28788 + + float32x4_t n, r, r2, scale, p, q, poly; 28789 + uint32x4_t cmp, e; 28790 + 28791 + #if WANT_SIMD_EXCEPT 28792 + @@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) 28793 + 28794 + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 28795 + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 28796 + - z = vfmaq_f32 (d->shift, x, d->inv_ln2); 28797 + - n = vsubq_f32 (z, d->shift); 28798 + + n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); 28799 + r = vfmsq_f32 (x, n, d->ln2_hi); 28800 + r = vfmsq_f32 (r, n, d->ln2_lo); 28801 + - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); 28802 + + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); 28803 + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 28804 + 28805 + #if !WANT_SIMD_EXCEPT 28806 + diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c 28807 + index a0d9d3b819..718125cbad 100644 28808 + --- a/sysdeps/aarch64/fpu/sin_advsimd.c 28809 + +++ b/sysdeps/aarch64/fpu/sin_advsimd.c 28810 + @@ -22,7 +22,7 @@ 28811 + static const struct data 28812 + { 28813 + float64x2_t poly[7]; 28814 + - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; 28815 + + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; 28816 + } data = { 28817 + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), 28818 + V2 (-0x1.a01a019936f27p-13), V2 
(0x1.71de37a97d93ep-19), 28819 + @@ -34,12 +34,13 @@ static const struct data 28820 + .pi_1 = V2 (0x1.921fb54442d18p+1), 28821 + .pi_2 = V2 (0x1.1a62633145c06p-53), 28822 + .pi_3 = V2 (0x1.c1cd129024e09p-106), 28823 + - .shift = V2 (0x1.8p52), 28824 + }; 28825 + 28826 + #if WANT_SIMD_EXCEPT 28827 + -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ 28828 + -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ 28829 + +/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ 28830 + +# define TinyBound v_u64 (0x3020000000000000) 28831 + +/* RangeVal - TinyBound. */ 28832 + +# define Thresh v_u64 (0x1160000000000000) 28833 + #endif 28834 + 28835 + #define C(i) d->poly[i] 28836 + @@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) 28837 + fenv). These lanes will be fixed by special-case handler later. */ 28838 + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); 28839 + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); 28840 + - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); 28841 + + r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); 28842 + #else 28843 + r = x; 28844 + cmp = vcageq_f64 (x, d->range_val); 28845 + #endif 28846 + 28847 + /* n = rint(|x|/pi). */ 28848 + - n = vfmaq_f64 (d->shift, d->inv_pi, r); 28849 + - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); 28850 + - n = vsubq_f64 (n, d->shift); 28851 + + n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); 28852 + + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); 28853 + 28854 + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ 28855 + r = vfmsq_f64 (r, d->pi_1, n); 28856 + diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c 28857 + index 375dfc3331..6ee9a23d5b 100644 28858 + --- a/sysdeps/aarch64/fpu/sinf_advsimd.c 28859 + +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c 28860 + @@ -22,7 +22,7 @@ 28861 + static const struct data 28862 + { 28863 + float32x4_t poly[4]; 28864 + - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; 28865 + + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; 28866 + } data = { 28867 + /* 1.886 ulp error. */ 28868 + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), 28869 + @@ -33,13 +33,14 @@ static const struct data 28870 + .pi_3 = V4 (-0x1.ee59dap-49f), 28871 + 28872 + .inv_pi = V4 (0x1.45f306p-2f), 28873 + - .shift = V4 (0x1.8p+23f), 28874 + .range_val = V4 (0x1p20f) 28875 + }; 28876 + 28877 + #if WANT_SIMD_EXCEPT 28878 + -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ 28879 + -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ 28880 + +/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ 28881 + +# define TinyBound v_u32 (0x22000000) 28882 + +/* RangeVal - TinyBound. */ 28883 + +# define Thresh v_u32 (0x27800000) 28884 + #endif 28885 + 28886 + #define C(i) d->poly[i] 28887 + @@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) 28888 + /* If fenv exceptions are to be triggered correctly, set any special lanes 28889 + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by 28890 + special-case handler later. 
*/ 28891 + - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); 28892 + + r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); 28893 + #else 28894 + r = x; 28895 + cmp = vcageq_f32 (x, d->range_val); 28896 + #endif 28897 + 28898 + - /* n = rint(|x|/pi) */ 28899 + - n = vfmaq_f32 (d->shift, d->inv_pi, r); 28900 + - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); 28901 + - n = vsubq_f32 (n, d->shift); 28902 + + /* n = rint(|x|/pi). */ 28903 + + n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); 28904 + + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); 28905 + 28906 + - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ 28907 + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ 28908 + r = vfmsq_f32 (r, d->pi_1, n); 28909 + r = vfmsq_f32 (r, d->pi_2, n); 28910 + r = vfmsq_f32 (r, d->pi_3, n); 28911 + 28912 + - /* y = sin(r) */ 28913 + + /* y = sin(r). */ 28914 + r2 = vmulq_f32 (r, r); 28915 + y = vfmaq_f32 (C (2), C (3), r2); 28916 + y = vfmaq_f32 (C (1), y, r2); 28917 + 28918 + commit 9ff7559b274eb0dbce2cbcf87284c1d30d47a2d6 28919 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 28920 + Date: Mon Oct 28 14:58:35 2024 +0000 28921 + 28922 + AArch64: Small optimisation in AdvSIMD erf and erfc 28923 + 28924 + In both routines, reduce register pressure such that GCC 14 emits no 28925 + spills for erf and fewer spills for erfc. Also use more efficient 28926 + comparison for the special-case in erf. 28927 + 28928 + Benchtests show erf improves by 6.4%, erfc by 1.0%. 
28929 + 28930 + (cherry picked from commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252) 28931 + 28932 + diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c 28933 + index 19cbb7d0f4..c0116735e4 100644 28934 + --- a/sysdeps/aarch64/fpu/erf_advsimd.c 28935 + +++ b/sysdeps/aarch64/fpu/erf_advsimd.c 28936 + @@ -22,19 +22,21 @@ 28937 + static const struct data 28938 + { 28939 + float64x2_t third; 28940 + - float64x2_t tenth, two_over_five, two_over_fifteen; 28941 + - float64x2_t two_over_nine, two_over_fortyfive; 28942 + + float64x2_t tenth, two_over_five, two_over_nine; 28943 + + double two_over_fifteen, two_over_fortyfive; 28944 + float64x2_t max, shift; 28945 + + uint64x2_t max_idx; 28946 + #if WANT_SIMD_EXCEPT 28947 + float64x2_t tiny_bound, huge_bound, scale_minus_one; 28948 + #endif 28949 + } data = { 28950 + + .max_idx = V2 (768), 28951 + .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ 28952 + - .two_over_fifteen = V2 (0x1.1111111111111p-3), 28953 + + .two_over_fifteen = 0x1.1111111111111p-3, 28954 + .tenth = V2 (-0x1.999999999999ap-4), 28955 + .two_over_five = V2 (-0x1.999999999999ap-2), 28956 + .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), 28957 + - .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5), 28958 + + .two_over_fortyfive = 0x1.6c16c16c16c17p-5, 28959 + .max = V2 (5.9921875), /* 6 - 1/128. */ 28960 + .shift = V2 (0x1p45), 28961 + #if WANT_SIMD_EXCEPT 28962 + @@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) 28963 + float64x2_t a = vabsq_f64 (x); 28964 + /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs 28965 + to return expected results. */ 28966 + - uint64x2_t a_le_max = vcleq_f64 (a, dat->max); 28967 + - uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); 28968 + + uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); 28969 + + uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); 28970 + 28971 + #if WANT_SIMD_EXCEPT 28972 + /* |x| huge or tiny. 
*/ 28973 + @@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) 28974 + segfault. */ 28975 + uint64x2_t i 28976 + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); 28977 + - i = vbslq_u64 (a_le_max, i, v_u64 (768)); 28978 + + i = vbslq_u64 (a_le_max, i, dat->max_idx); 28979 + struct entry e = lookup (i); 28980 + 28981 + float64x2_t r = vsubq_f64 (z, shift); 28982 + @@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) 28983 + float64x2_t d2 = vmulq_f64 (d, d); 28984 + float64x2_t r2 = vmulq_f64 (r, r); 28985 + 28986 + + float64x2_t two_over_fifteen_and_fortyfive 28987 + + = vld1q_f64 (&dat->two_over_fifteen); 28988 + + 28989 + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ 28990 + float64x2_t p1 = r; 28991 + float64x2_t p2 28992 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); 28993 + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); 28994 + - float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); 28995 + + float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, 28996 + + two_over_fifteen_and_fortyfive, 0); 28997 + p4 = vfmsq_f64 (dat->tenth, r2, p4); 28998 + - float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); 28999 + + float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, 29000 + + two_over_fifteen_and_fortyfive, 1); 29001 + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); 29002 + 29003 + float64x2_t p34 = vfmaq_f64 (p3, d, p4); 29004 + diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c 29005 + index f1b3bfe830..2f2f755c46 100644 29006 + --- a/sysdeps/aarch64/fpu/erfc_advsimd.c 29007 + +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c 29008 + @@ -24,8 +24,8 @@ static const struct data 29009 + { 29010 + uint64x2_t offset, table_scale; 29011 + float64x2_t max, shift; 29012 + - float64x2_t p20, p40, p41, p42; 29013 + - float64x2_t 
p51, p52; 29014 + + float64x2_t p20, p40, p41, p51; 29015 + + double p42, p52; 29016 + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; 29017 + #if WANT_SIMD_EXCEPT 29018 + float64x2_t uflow_bound; 29019 + @@ -41,9 +41,9 @@ static const struct data 29020 + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ 29021 + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ 29022 + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ 29023 + - .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ 29024 + + .p42 = 0x1.1111111111111p-3, /* 2/15. */ 29025 + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ 29026 + - .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ 29027 + + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ 29028 + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ 29029 + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, 29030 + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, 29031 + @@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) 29032 + float64x2_t p1 = r; 29033 + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); 29034 + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); 29035 + - float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); 29036 + + float64x2_t p42_p52 = vld1q_f64 (&dat->p42); 29037 + + float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); 29038 + p4 = vfmsq_f64 (dat->p40, r2, p4); 29039 + - float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); 29040 + + float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); 29041 + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); 29042 + /* Compute p_i using recurrence relation: 29043 + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. 
*/ 29044 + 29045 + commit 76c923fe9d09befc8131205659d99cb9ac97460a 29046 + Author: Joe Ramsay <Joe.Ramsay@arm.com> 29047 + Date: Fri Nov 1 15:48:54 2024 +0000 29048 + 29049 + AArch64: Remove SVE erf and erfc tables 29050 + 29051 + By using a combination of mask-and-add instead of the shift-based 29052 + index calculation the routines can share the same table as other 29053 + variants with no performance degradation. 29054 + 29055 + The tables change name because of other changes in downstream AOR. 29056 + 29057 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 29058 + (cherry picked from commit 2d82d781a539ce8e82178fc1fa2c99ae1884e7fe) 29059 + 29060 + diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile 29061 + index 234a6c457c..be8541f649 100644 29062 + --- a/sysdeps/aarch64/fpu/Makefile 29063 + +++ b/sysdeps/aarch64/fpu/Makefile 29064 + @@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ 29065 + v_log10_data \ 29066 + erf_data \ 29067 + erff_data \ 29068 + - sv_erf_data \ 29069 + - sv_erff_data \ 29070 + v_exp_tail_data \ 29071 + erfc_data \ 29072 + erfcf_data \ 29073 + diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c 29074 + index c0116735e4..a48092e838 100644 29075 + --- a/sysdeps/aarch64/fpu/erf_advsimd.c 29076 + +++ b/sysdeps/aarch64/fpu/erf_advsimd.c 29077 + @@ -58,8 +58,8 @@ static inline struct entry 29078 + lookup (uint64x2_t i) 29079 + { 29080 + struct entry e; 29081 + - float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), 29082 + - e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); 29083 + + float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), 29084 + + e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); 29085 + e.erf = vuzp1q_f64 (e1, e2); 29086 + e.scale = vuzp2q_f64 (e1, e2); 29087 + return e; 29088 + diff --git a/sysdeps/aarch64/fpu/erf_data.c b/sysdeps/aarch64/fpu/erf_data.c 29089 + index 
6d2dcd235c..ea01fad7ca 100644 29090 + --- a/sysdeps/aarch64/fpu/erf_data.c 29091 + +++ b/sysdeps/aarch64/fpu/erf_data.c 29092 + @@ -19,14 +19,14 @@ 29093 + 29094 + #include "vecmath_config.h" 29095 + 29096 + -/* Lookup table used in erf. 29097 + +/* Lookup table used in vector erf. 29098 + For each possible rounded input r (multiples of 1/128), between 29099 + r = 0.0 and r = 6.0 (769 values): 29100 + - - the first entry __erff_data.tab.erf contains the values of erf(r), 29101 + - - the second entry __erff_data.tab.scale contains the values of 29102 + + - the first entry __v_erff_data.tab.erf contains the values of erf(r), 29103 + + - the second entry __v_erff_data.tab.scale contains the values of 29104 + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the 29105 + algorithm, since lookup is performed only for x >= 1/64-1/512. */ 29106 + -const struct erf_data __erf_data = { 29107 + +const struct v_erf_data __v_erf_data = { 29108 + .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, 29109 + { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, 29110 + { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, 29111 + diff --git a/sysdeps/aarch64/fpu/erf_sve.c b/sysdeps/aarch64/fpu/erf_sve.c 29112 + index 7d51417406..671d55a02b 100644 29113 + --- a/sysdeps/aarch64/fpu/erf_sve.c 29114 + +++ b/sysdeps/aarch64/fpu/erf_sve.c 29115 + @@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg) 29116 + svfloat64_t a = svabs_x (pg, x); 29117 + svfloat64_t shift = sv_f64 (dat->shift); 29118 + svfloat64_t z = svadd_x (pg, a, shift); 29119 + - svuint64_t i 29120 + - = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); 29121 + + svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff); 29122 + + i = svadd_x (pg, i, i); 29123 + 29124 + /* Lookup without shortcut for small values but with predicate to avoid 29125 + segfault for large values and NaNs. 
*/ 29126 + svfloat64_t r = svsub_x (pg, z, shift); 29127 + - svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); 29128 + - svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); 29129 + + svfloat64_t erfr 29130 + + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i); 29131 + + svfloat64_t scale 29132 + + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i); 29133 + 29134 + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ 29135 + svfloat64_t d = svsub_x (pg, a, r); 29136 + diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c 29137 + index 2f2f755c46..d05eac61a2 100644 29138 + --- a/sysdeps/aarch64/fpu/erfc_advsimd.c 29139 + +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c 29140 + @@ -69,9 +69,9 @@ lookup (uint64x2_t i) 29141 + { 29142 + struct entry e; 29143 + float64x2_t e1 29144 + - = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); 29145 + + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); 29146 + float64x2_t e2 29147 + - = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); 29148 + + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); 29149 + e.erfc = vuzp1q_f64 (e1, e2); 29150 + e.scale = vuzp2q_f64 (e1, e2); 29151 + return e; 29152 + diff --git a/sysdeps/aarch64/fpu/erfc_data.c b/sysdeps/aarch64/fpu/erfc_data.c 29153 + index 76a94e4681..8dc6a8c42c 100644 29154 + --- a/sysdeps/aarch64/fpu/erfc_data.c 29155 + +++ b/sysdeps/aarch64/fpu/erfc_data.c 29156 + @@ -19,14 +19,14 @@ 29157 + 29158 + #include "vecmath_config.h" 29159 + 29160 + -/* Lookup table used in erfc. 29161 + +/* Lookup table used in vector erfc. 
29162 + For each possible rounded input r (multiples of 1/128), between 29163 + r = 0.0 and r = ~27.0 (3488 values): 29164 + - - the first entry __erfc_data.tab.erfc contains the values of erfc(r), 29165 + - - the second entry __erfc_data.tab.scale contains the values of 29166 + + - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r), 29167 + + - the second entry __v_erfc_data.tab.scale contains the values of 29168 + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore 29169 + they are scaled by a large enough value 2^128 (fits in 8bit). */ 29170 + -const struct erfc_data __erfc_data = { 29171 + +const struct v_erfc_data __v_erfc_data = { 29172 + .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, 29173 + { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, 29174 + { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, 29175 + diff --git a/sysdeps/aarch64/fpu/erfc_sve.c b/sysdeps/aarch64/fpu/erfc_sve.c 29176 + index c17d3e4484..703926ee41 100644 29177 + --- a/sysdeps/aarch64/fpu/erfc_sve.c 29178 + +++ b/sysdeps/aarch64/fpu/erfc_sve.c 29179 + @@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) 29180 + 29181 + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ 29182 + i = svadd_x (pg, i, i); 29183 + - const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; 29184 + + const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr; 29185 + svfloat64_t erfcr = svld1_gather_index (pg, p, i); 29186 + svfloat64_t scale = svld1_gather_index (pg, p + 1, i); 29187 + 29188 + diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c 29189 + index ca5bc3ab33..59b0b0d64b 100644 29190 + --- a/sysdeps/aarch64/fpu/erfcf_advsimd.c 29191 + +++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c 29192 + @@ -62,13 +62,13 @@ lookup (uint32x4_t i) 29193 + { 29194 + struct entry e; 29195 + float32x2_t t0 29196 + - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); 29197 + + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); 29198 + float32x2_t t1 29199 + - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); 29200 + + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); 29201 + float32x2_t t2 29202 + - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); 29203 + + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); 29204 + float32x2_t t3 29205 + - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); 29206 + + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); 29207 + float32x4_t e1 = vcombine_f32 (t0, t1); 29208 + float32x4_t e2 = vcombine_f32 (t2, t3); 29209 + e.erfc = vuzp1q_f32 (e1, e2); 29210 + diff --git a/sysdeps/aarch64/fpu/erfcf_data.c b/sysdeps/aarch64/fpu/erfcf_data.c 29211 + index 77fb889a78..d45087bbb9 100644 29212 + --- a/sysdeps/aarch64/fpu/erfcf_data.c 29213 + +++ b/sysdeps/aarch64/fpu/erfcf_data.c 29214 + @@ -19,14 +19,14 @@ 29215 + 29216 + #include "vecmath_config.h" 29217 + 29218 + -/* Lookup table used in erfcf. 29219 + +/* Lookup table used in vector erfcf. 
29220 + For each possible rounded input r (multiples of 1/64), between 29221 + r = 0.0 and r = 10.0625 (645 values): 29222 + - - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), 29223 + - - the second entry __erfcf_data.tab.scale contains the values of 29224 + + - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r), 29225 + + - the second entry __v_erfcf_data.tab.scale contains the values of 29226 + 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore 29227 + they are scaled by a large enough value 2^47 (fits in 8 bits). */ 29228 + -const struct erfcf_data __erfcf_data = { 29229 + +const struct v_erfcf_data __v_erfcf_data = { 29230 + .tab = { { 0x1p47, 0x1.20dd76p47 }, 29231 + { 0x1.f6f944p46, 0x1.20cb68p47 }, 29232 + { 0x1.edf3aap46, 0x1.209546p47 }, 29233 + diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c 29234 + index 48d1677eb4..ecacb933ac 100644 29235 + --- a/sysdeps/aarch64/fpu/erfcf_sve.c 29236 + +++ b/sysdeps/aarch64/fpu/erfcf_sve.c 29237 + @@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) 29238 + 29239 + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. 
*/ 29240 + i = svmul_x (pg, i, 2); 29241 + - const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; 29242 + + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; 29243 + svfloat32_t erfcr = svld1_gather_index (pg, p, i); 29244 + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); 29245 + 29246 + diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c 29247 + index f2fe6ff236..db39e789b6 100644 29248 + --- a/sysdeps/aarch64/fpu/erff_advsimd.c 29249 + +++ b/sysdeps/aarch64/fpu/erff_advsimd.c 29250 + @@ -47,10 +47,10 @@ static inline struct entry 29251 + lookup (uint32x4_t i) 29252 + { 29253 + struct entry e; 29254 + - float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); 29255 + - float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); 29256 + - float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); 29257 + - float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); 29258 + + float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); 29259 + + float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); 29260 + + float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); 29261 + + float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); 29262 + float32x4_t e1 = vcombine_f32 (t0, t1); 29263 + float32x4_t e2 = vcombine_f32 (t2, t3); 29264 + e.erf = vuzp1q_f32 (e1, e2); 29265 + diff --git a/sysdeps/aarch64/fpu/erff_data.c b/sysdeps/aarch64/fpu/erff_data.c 29266 + index 9a32940915..da38aed205 100644 29267 + --- a/sysdeps/aarch64/fpu/erff_data.c 29268 + +++ b/sysdeps/aarch64/fpu/erff_data.c 29269 + @@ -19,14 +19,14 @@ 29270 + 29271 + #include "vecmath_config.h" 29272 + 29273 + -/* Lookup table used in erff. 29274 + +/* Lookup table used in vector erff. 
29275 + For each possible rounded input r (multiples of 1/128), between 29276 + r = 0.0 and r = 4.0 (513 values): 29277 + - - the first entry __erff_data.tab.erf contains the values of erf(r), 29278 + - - the second entry __erff_data.tab.scale contains the values of 29279 + + - the first entry __v_erff_data.tab.erf contains the values of erf(r), 29280 + + - the second entry __v_erff_data.tab.scale contains the values of 29281 + 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the 29282 + algorithm, since lookup is performed only for x >= 1/64-1/512. */ 29283 + -const struct erff_data __erff_data = { 29284 + +const struct v_erff_data __v_erff_data = { 29285 + .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, 29286 + { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, 29287 + { 0x1.20d770p-6, 0x1.20cb68p+0 }, 29288 + diff --git a/sysdeps/aarch64/fpu/erff_sve.c b/sysdeps/aarch64/fpu/erff_sve.c 29289 + index 38f00db9be..0e382eb09a 100644 29290 + --- a/sysdeps/aarch64/fpu/erff_sve.c 29291 + +++ b/sysdeps/aarch64/fpu/erff_sve.c 29292 + @@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) 29293 + 29294 + svfloat32_t shift = sv_f32 (dat->shift); 29295 + svfloat32_t z = svadd_x (pg, a, shift); 29296 + - svuint32_t i 29297 + - = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); 29298 + - 29299 + - /* Saturate lookup index. */ 29300 + - i = svsel (a_ge_max, sv_u32 (512), i); 29301 + + svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff); 29302 + + i = svadd_x (pg, i, i); 29303 + 29304 + /* r and erf(r) set to 0 for |x| below min. */ 29305 + svfloat32_t r = svsub_z (a_gt_min, z, shift); 29306 + - svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); 29307 + + svfloat32_t erfr 29308 + + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i); 29309 + 29310 + /* scale set to 2/sqrt(pi) for |x| below min. 
*/ 29311 + - svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); 29312 + + svfloat32_t scale 29313 + + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i); 29314 + scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); 29315 + 29316 + /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */ 29317 + diff --git a/sysdeps/aarch64/fpu/sv_erf_data.c b/sysdeps/aarch64/fpu/sv_erf_data.c 29318 + deleted file mode 100644 29319 + index a53878f893..0000000000 29320 + --- a/sysdeps/aarch64/fpu/sv_erf_data.c 29321 + +++ /dev/null 29322 + @@ -1,1570 +0,0 @@ 29323 + -/* Table for SVE erf approximation 29324 + - 29325 + - Copyright (C) 2024 Free Software Foundation, Inc. 29326 + - This file is part of the GNU C Library. 29327 + - 29328 + - The GNU C Library is free software; you can redistribute it and/or 29329 + - modify it under the terms of the GNU Lesser General Public 29330 + - License as published by the Free Software Foundation; either 29331 + - version 2.1 of the License, or (at your option) any later version. 29332 + - 29333 + - The GNU C Library is distributed in the hope that it will be useful, 29334 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 29335 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 29336 + - Lesser General Public License for more details. 29337 + - 29338 + - You should have received a copy of the GNU Lesser General Public 29339 + - License along with the GNU C Library; if not, see 29340 + - <https://www.gnu.org/licenses/>. */ 29341 + - 29342 + -#include "vecmath_config.h" 29343 + - 29344 + -/* Lookup table used in vector erf. 29345 + - For each possible rounded input r (multiples of 1/128), between 29346 + - r = 0.0 and r = 6.0 (769 values): 29347 + - - the first entry __erf_data.tab.erf contains the values of erf(r), 29348 + - - the second entry __erf_data.tab.scale contains the values of 29349 + - 2/sqrt(pi)*exp(-r^2). 
Note that indices 0 and 1 are never hit by the 29350 + - algorithm, since lookup is performed only for x >= 1/64-1/512. */ 29351 + -const struct sv_erf_data __sv_erf_data = { 29352 + - .erf = { 0x0.0000000000000p+0, 29353 + - 0x1.20dbf3deb1340p-7, 29354 + - 0x1.20d77083f17a0p-6, 29355 + - 0x1.b137e0cf584dcp-6, 29356 + - 0x1.20c5645dd2538p-5, 29357 + - 0x1.68e5d3bbc9526p-5, 29358 + - 0x1.b0fafef135745p-5, 29359 + - 0x1.f902a77bd3821p-5, 29360 + - 0x1.207d480e90658p-4, 29361 + - 0x1.44703e87e8593p-4, 29362 + - 0x1.68591a1e83b5dp-4, 29363 + - 0x1.8c36beb8a8d23p-4, 29364 + - 0x1.b0081148a873ap-4, 29365 + - 0x1.d3cbf7e70a4b3p-4, 29366 + - 0x1.f78159ec8bb50p-4, 29367 + - 0x1.0d939005f65e5p-3, 29368 + - 0x1.1f5e1a35c3b89p-3, 29369 + - 0x1.311fc15f56d14p-3, 29370 + - 0x1.42d7fc2f64959p-3, 29371 + - 0x1.548642321d7c6p-3, 29372 + - 0x1.662a0bdf7a89fp-3, 29373 + - 0x1.77c2d2a765f9ep-3, 29374 + - 0x1.895010fdbdbfdp-3, 29375 + - 0x1.9ad142662e14dp-3, 29376 + - 0x1.ac45e37fe2526p-3, 29377 + - 0x1.bdad72110a648p-3, 29378 + - 0x1.cf076d1233237p-3, 29379 + - 0x1.e05354b96ff36p-3, 29380 + - 0x1.f190aa85540e2p-3, 29381 + - 0x1.015f78a3dcf3dp-2, 29382 + - 0x1.09eed6982b948p-2, 29383 + - 0x1.127631eb8de32p-2, 29384 + - 0x1.1af54e232d609p-2, 29385 + - 0x1.236bef825d9a2p-2, 29386 + - 0x1.2bd9db0f7827fp-2, 29387 + - 0x1.343ed6989b7d9p-2, 29388 + - 0x1.3c9aa8b84bedap-2, 29389 + - 0x1.44ed18d9f6462p-2, 29390 + - 0x1.4d35ef3e5372ep-2, 29391 + - 0x1.5574f4ffac98ep-2, 29392 + - 0x1.5da9f415ff23fp-2, 29393 + - 0x1.65d4b75b00471p-2, 29394 + - 0x1.6df50a8dff772p-2, 29395 + - 0x1.760aba57a76bfp-2, 29396 + - 0x1.7e15944d9d3e4p-2, 29397 + - 0x1.861566f5fd3c0p-2, 29398 + - 0x1.8e0a01cab516bp-2, 29399 + - 0x1.95f3353cbb146p-2, 29400 + - 0x1.9dd0d2b721f39p-2, 29401 + - 0x1.a5a2aca209394p-2, 29402 + - 0x1.ad68966569a87p-2, 29403 + - 0x1.b522646bbda68p-2, 29404 + - 0x1.bccfec24855b8p-2, 29405 + - 0x1.c4710406a65fcp-2, 29406 + - 0x1.cc058392a6d2dp-2, 29407 + - 0x1.d38d4354c3bd0p-2, 29408 + - 
0x1.db081ce6e2a48p-2, 29409 + - 0x1.e275eaf25e458p-2, 29410 + - 0x1.e9d68931ae650p-2, 29411 + - 0x1.f129d471eabb1p-2, 29412 + - 0x1.f86faa9428f9dp-2, 29413 + - 0x1.ffa7ea8eb5fd0p-2, 29414 + - 0x1.03693a371519cp-1, 29415 + - 0x1.06f794ab2cae7p-1, 29416 + - 0x1.0a7ef5c18edd2p-1, 29417 + - 0x1.0dff4f247f6c6p-1, 29418 + - 0x1.1178930ada115p-1, 29419 + - 0x1.14eab43841b55p-1, 29420 + - 0x1.1855a5fd3dd50p-1, 29421 + - 0x1.1bb95c3746199p-1, 29422 + - 0x1.1f15cb50bc4dep-1, 29423 + - 0x1.226ae840d4d70p-1, 29424 + - 0x1.25b8a88b6dd7fp-1, 29425 + - 0x1.28ff0240d52cdp-1, 29426 + - 0x1.2c3debfd7d6c1p-1, 29427 + - 0x1.2f755ce9a21f4p-1, 29428 + - 0x1.32a54cb8db67bp-1, 29429 + - 0x1.35cdb3a9a144dp-1, 29430 + - 0x1.38ee8a84beb71p-1, 29431 + - 0x1.3c07ca9cb4f9ep-1, 29432 + - 0x1.3f196dcd0f135p-1, 29433 + - 0x1.42236e79a5fa6p-1, 29434 + - 0x1.4525c78dd5966p-1, 29435 + - 0x1.4820747ba2dc2p-1, 29436 + - 0x1.4b13713ad3513p-1, 29437 + - 0x1.4dfeba47f63ccp-1, 29438 + - 0x1.50e24ca35fd2cp-1, 29439 + - 0x1.53be25d016a4fp-1, 29440 + - 0x1.569243d2b3a9bp-1, 29441 + - 0x1.595ea53035283p-1, 29442 + - 0x1.5c2348ecc4dc3p-1, 29443 + - 0x1.5ee02e8a71a53p-1, 29444 + - 0x1.61955607dd15dp-1, 29445 + - 0x1.6442bfdedd397p-1, 29446 + - 0x1.66e86d0312e82p-1, 29447 + - 0x1.69865ee075011p-1, 29448 + - 0x1.6c1c9759d0e5fp-1, 29449 + - 0x1.6eab18c74091bp-1, 29450 + - 0x1.7131e5f496a5ap-1, 29451 + - 0x1.73b1021fc0cb8p-1, 29452 + - 0x1.762870f720c6fp-1, 29453 + - 0x1.78983697dc96fp-1, 29454 + - 0x1.7b00578c26037p-1, 29455 + - 0x1.7d60d8c979f7bp-1, 29456 + - 0x1.7fb9bfaed8078p-1, 29457 + - 0x1.820b1202f27fbp-1, 29458 + - 0x1.8454d5f25760dp-1, 29459 + - 0x1.8697120d92a4ap-1, 29460 + - 0x1.88d1cd474a2e0p-1, 29461 + - 0x1.8b050ef253c37p-1, 29462 + - 0x1.8d30debfc572ep-1, 29463 + - 0x1.8f5544bd00c04p-1, 29464 + - 0x1.91724951b8fc6p-1, 29465 + - 0x1.9387f53df5238p-1, 29466 + - 0x1.959651980da31p-1, 29467 + - 0x1.979d67caa6631p-1, 29468 + - 0x1.999d4192a5715p-1, 29469 + - 0x1.9b95e8fd26abap-1, 29470 + - 
0x1.9d8768656cc42p-1, 29471 + - 0x1.9f71ca72cffb6p-1, 29472 + - 0x1.a1551a16aaeafp-1, 29473 + - 0x1.a331628a45b92p-1, 29474 + - 0x1.a506af4cc00f4p-1, 29475 + - 0x1.a6d50c20fa293p-1, 29476 + - 0x1.a89c850b7d54dp-1, 29477 + - 0x1.aa5d265064366p-1, 29478 + - 0x1.ac16fc7143263p-1, 29479 + - 0x1.adca142b10f98p-1, 29480 + - 0x1.af767a741088bp-1, 29481 + - 0x1.b11c3c79bb424p-1, 29482 + - 0x1.b2bb679ead19cp-1, 29483 + - 0x1.b4540978921eep-1, 29484 + - 0x1.b5e62fce16095p-1, 29485 + - 0x1.b771e894d602ep-1, 29486 + - 0x1.b8f741ef54f83p-1, 29487 + - 0x1.ba764a2af2b78p-1, 29488 + - 0x1.bbef0fbde6221p-1, 29489 + - 0x1.bd61a1453ab44p-1, 29490 + - 0x1.bece0d82d1a5cp-1, 29491 + - 0x1.c034635b66e23p-1, 29492 + - 0x1.c194b1d49a184p-1, 29493 + - 0x1.c2ef0812fc1bdp-1, 29494 + - 0x1.c443755820d64p-1, 29495 + - 0x1.c5920900b5fd1p-1, 29496 + - 0x1.c6dad2829ec62p-1, 29497 + - 0x1.c81de16b14cefp-1, 29498 + - 0x1.c95b455cce69dp-1, 29499 + - 0x1.ca930e0e2a825p-1, 29500 + - 0x1.cbc54b476248dp-1, 29501 + - 0x1.ccf20ce0c0d27p-1, 29502 + - 0x1.ce1962c0e0d8bp-1, 29503 + - 0x1.cf3b5cdaf0c39p-1, 29504 + - 0x1.d0580b2cfd249p-1, 29505 + - 0x1.d16f7dbe41ca0p-1, 29506 + - 0x1.d281c49d818d0p-1, 29507 + - 0x1.d38eefdf64fddp-1, 29508 + - 0x1.d4970f9ce00d9p-1, 29509 + - 0x1.d59a33f19ed42p-1, 29510 + - 0x1.d6986cfa798e7p-1, 29511 + - 0x1.d791cad3eff01p-1, 29512 + - 0x1.d8865d98abe01p-1, 29513 + - 0x1.d97635600bb89p-1, 29514 + - 0x1.da61623cb41e0p-1, 29515 + - 0x1.db47f43b2980dp-1, 29516 + - 0x1.dc29fb60715afp-1, 29517 + - 0x1.dd0787a8bb39dp-1, 29518 + - 0x1.dde0a90611a0dp-1, 29519 + - 0x1.deb56f5f12d28p-1, 29520 + - 0x1.df85ea8db188ep-1, 29521 + - 0x1.e0522a5dfda73p-1, 29522 + - 0x1.e11a3e8cf4eb8p-1, 29523 + - 0x1.e1de36c75ba58p-1, 29524 + - 0x1.e29e22a89d766p-1, 29525 + - 0x1.e35a11b9b61cep-1, 29526 + - 0x1.e4121370224ccp-1, 29527 + - 0x1.e4c6372cd8927p-1, 29528 + - 0x1.e5768c3b4a3fcp-1, 29529 + - 0x1.e62321d06c5e0p-1, 29530 + - 0x1.e6cc0709c8a0dp-1, 29531 + - 0x1.e7714aec96534p-1, 29532 + - 
0x1.e812fc64db369p-1, 29533 + - 0x1.e8b12a44944a8p-1, 29534 + - 0x1.e94be342e6743p-1, 29535 + - 0x1.e9e335fb56f87p-1, 29536 + - 0x1.ea7730ed0bbb9p-1, 29537 + - 0x1.eb07e27a133aap-1, 29538 + - 0x1.eb9558e6b42cep-1, 29539 + - 0x1.ec1fa258c4beap-1, 29540 + - 0x1.eca6ccd709544p-1, 29541 + - 0x1.ed2ae6489ac1ep-1, 29542 + - 0x1.edabfc7453e63p-1, 29543 + - 0x1.ee2a1d004692cp-1, 29544 + - 0x1.eea5557137ae0p-1, 29545 + - 0x1.ef1db32a2277cp-1, 29546 + - 0x1.ef93436bc2daap-1, 29547 + - 0x1.f006135426b26p-1, 29548 + - 0x1.f0762fde45ee6p-1, 29549 + - 0x1.f0e3a5e1a1788p-1, 29550 + - 0x1.f14e8211e8c55p-1, 29551 + - 0x1.f1b6d0fea5f4dp-1, 29552 + - 0x1.f21c9f12f0677p-1, 29553 + - 0x1.f27ff89525acfp-1, 29554 + - 0x1.f2e0e9a6a8b09p-1, 29555 + - 0x1.f33f7e43a706bp-1, 29556 + - 0x1.f39bc242e43e6p-1, 29557 + - 0x1.f3f5c1558b19ep-1, 29558 + - 0x1.f44d870704911p-1, 29559 + - 0x1.f4a31ebcd47dfp-1, 29560 + - 0x1.f4f693b67bd77p-1, 29561 + - 0x1.f547f10d60597p-1, 29562 + - 0x1.f59741b4b97cfp-1, 29563 + - 0x1.f5e4907982a07p-1, 29564 + - 0x1.f62fe80272419p-1, 29565 + - 0x1.f67952cff6282p-1, 29566 + - 0x1.f6c0db3c34641p-1, 29567 + - 0x1.f7068b7b10fd9p-1, 29568 + - 0x1.f74a6d9a38383p-1, 29569 + - 0x1.f78c8b812d498p-1, 29570 + - 0x1.f7cceef15d631p-1, 29571 + - 0x1.f80ba18636f07p-1, 29572 + - 0x1.f848acb544e95p-1, 29573 + - 0x1.f88419ce4e184p-1, 29574 + - 0x1.f8bdf1fb78370p-1, 29575 + - 0x1.f8f63e416ebffp-1, 29576 + - 0x1.f92d077f8d56dp-1, 29577 + - 0x1.f96256700da8ep-1, 29578 + - 0x1.f99633a838a57p-1, 29579 + - 0x1.f9c8a7989af0dp-1, 29580 + - 0x1.f9f9ba8d3c733p-1, 29581 + - 0x1.fa2974addae45p-1, 29582 + - 0x1.fa57ddfe27376p-1, 29583 + - 0x1.fa84fe5e05c8dp-1, 29584 + - 0x1.fab0dd89d1309p-1, 29585 + - 0x1.fadb831a9f9c3p-1, 29586 + - 0x1.fb04f6868a944p-1, 29587 + - 0x1.fb2d3f20f9101p-1, 29588 + - 0x1.fb54641aebbc9p-1, 29589 + - 0x1.fb7a6c834b5a2p-1, 29590 + - 0x1.fb9f5f4739170p-1, 29591 + - 0x1.fbc3433260ca5p-1, 29592 + - 0x1.fbe61eef4cf6ap-1, 29593 + - 0x1.fc07f907bc794p-1, 29594 + - 
0x1.fc28d7e4f9cd0p-1, 29595 + - 0x1.fc48c1d033c7ap-1, 29596 + - 0x1.fc67bcf2d7b8fp-1, 29597 + - 0x1.fc85cf56ecd38p-1, 29598 + - 0x1.fca2fee770c79p-1, 29599 + - 0x1.fcbf5170b578bp-1, 29600 + - 0x1.fcdacca0bfb73p-1, 29601 + - 0x1.fcf57607a6e7cp-1, 29602 + - 0x1.fd0f5317f582fp-1, 29603 + - 0x1.fd2869270a56fp-1, 29604 + - 0x1.fd40bd6d7a785p-1, 29605 + - 0x1.fd58550773cb5p-1, 29606 + - 0x1.fd6f34f52013ap-1, 29607 + - 0x1.fd85621b0876dp-1, 29608 + - 0x1.fd9ae142795e3p-1, 29609 + - 0x1.fdafb719e6a69p-1, 29610 + - 0x1.fdc3e835500b3p-1, 29611 + - 0x1.fdd7790ea5bc0p-1, 29612 + - 0x1.fdea6e062d0c9p-1, 29613 + - 0x1.fdfccb62e52d3p-1, 29614 + - 0x1.fe0e9552ebdd6p-1, 29615 + - 0x1.fe1fcfebe2083p-1, 29616 + - 0x1.fe307f2b503d0p-1, 29617 + - 0x1.fe40a6f70af4bp-1, 29618 + - 0x1.fe504b1d9696cp-1, 29619 + - 0x1.fe5f6f568b301p-1, 29620 + - 0x1.fe6e1742f7cf6p-1, 29621 + - 0x1.fe7c466dc57a1p-1, 29622 + - 0x1.fe8a004c19ae6p-1, 29623 + - 0x1.fe97483db8670p-1, 29624 + - 0x1.fea4218d6594ap-1, 29625 + - 0x1.feb08f7146046p-1, 29626 + - 0x1.febc950b3fa75p-1, 29627 + - 0x1.fec835695932ep-1, 29628 + - 0x1.fed37386190fbp-1, 29629 + - 0x1.fede5248e38f4p-1, 29630 + - 0x1.fee8d486585eep-1, 29631 + - 0x1.fef2fd00af31ap-1, 29632 + - 0x1.fefcce6813974p-1, 29633 + - 0x1.ff064b5afffbep-1, 29634 + - 0x1.ff0f766697c76p-1, 29635 + - 0x1.ff18520700971p-1, 29636 + - 0x1.ff20e0a7ba8c2p-1, 29637 + - 0x1.ff2924a3f7a83p-1, 29638 + - 0x1.ff312046f2339p-1, 29639 + - 0x1.ff38d5cc4227fp-1, 29640 + - 0x1.ff404760319b4p-1, 29641 + - 0x1.ff47772010262p-1, 29642 + - 0x1.ff4e671a85425p-1, 29643 + - 0x1.ff55194fe19dfp-1, 29644 + - 0x1.ff5b8fb26f5f6p-1, 29645 + - 0x1.ff61cc26c1578p-1, 29646 + - 0x1.ff67d08401202p-1, 29647 + - 0x1.ff6d9e943c231p-1, 29648 + - 0x1.ff733814af88cp-1, 29649 + - 0x1.ff789eb6130c9p-1, 29650 + - 0x1.ff7dd41ce2b4dp-1, 29651 + - 0x1.ff82d9e1a76d8p-1, 29652 + - 0x1.ff87b1913e853p-1, 29653 + - 0x1.ff8c5cad200a5p-1, 29654 + - 0x1.ff90dcaba4096p-1, 29655 + - 0x1.ff9532f846ab0p-1, 29656 + - 
0x1.ff9960f3eb327p-1, 29657 + - 0x1.ff9d67f51ddbap-1, 29658 + - 0x1.ffa14948549a7p-1, 29659 + - 0x1.ffa506302ebaep-1, 29660 + - 0x1.ffa89fe5b3625p-1, 29661 + - 0x1.ffac17988ef4bp-1, 29662 + - 0x1.ffaf6e6f4f5c0p-1, 29663 + - 0x1.ffb2a5879f35ep-1, 29664 + - 0x1.ffb5bdf67fe6fp-1, 29665 + - 0x1.ffb8b8c88295fp-1, 29666 + - 0x1.ffbb970200110p-1, 29667 + - 0x1.ffbe599f4f9d9p-1, 29668 + - 0x1.ffc10194fcb64p-1, 29669 + - 0x1.ffc38fcffbb7cp-1, 29670 + - 0x1.ffc60535dd7f5p-1, 29671 + - 0x1.ffc862a501fd7p-1, 29672 + - 0x1.ffcaa8f4c9beap-1, 29673 + - 0x1.ffccd8f5c66d1p-1, 29674 + - 0x1.ffcef371ea4d7p-1, 29675 + - 0x1.ffd0f92cb6ba7p-1, 29676 + - 0x1.ffd2eae369a07p-1, 29677 + - 0x1.ffd4c94d29fdbp-1, 29678 + - 0x1.ffd6951b33686p-1, 29679 + - 0x1.ffd84ef9009eep-1, 29680 + - 0x1.ffd9f78c7524ap-1, 29681 + - 0x1.ffdb8f7605ee7p-1, 29682 + - 0x1.ffdd1750e1220p-1, 29683 + - 0x1.ffde8fb314ebfp-1, 29684 + - 0x1.ffdff92db56e5p-1, 29685 + - 0x1.ffe1544d01ccbp-1, 29686 + - 0x1.ffe2a1988857cp-1, 29687 + - 0x1.ffe3e19349dc7p-1, 29688 + - 0x1.ffe514bbdc197p-1, 29689 + - 0x1.ffe63b8c8b5f7p-1, 29690 + - 0x1.ffe7567b7b5e1p-1, 29691 + - 0x1.ffe865fac722bp-1, 29692 + - 0x1.ffe96a78a04a9p-1, 29693 + - 0x1.ffea645f6d6dap-1, 29694 + - 0x1.ffeb5415e7c44p-1, 29695 + - 0x1.ffec39ff380b9p-1, 29696 + - 0x1.ffed167b12ac2p-1, 29697 + - 0x1.ffede9e5d3262p-1, 29698 + - 0x1.ffeeb49896c6dp-1, 29699 + - 0x1.ffef76e956a9fp-1, 29700 + - 0x1.fff0312b010b5p-1, 29701 + - 0x1.fff0e3ad91ec2p-1, 29702 + - 0x1.fff18ebe2b0e1p-1, 29703 + - 0x1.fff232a72b48ep-1, 29704 + - 0x1.fff2cfb0453d9p-1, 29705 + - 0x1.fff3661e9569dp-1, 29706 + - 0x1.fff3f634b79f9p-1, 29707 + - 0x1.fff48032dbe40p-1, 29708 + - 0x1.fff50456dab8cp-1, 29709 + - 0x1.fff582dc48d30p-1, 29710 + - 0x1.fff5fbfc8a439p-1, 29711 + - 0x1.fff66feee5129p-1, 29712 + - 0x1.fff6dee89352ep-1, 29713 + - 0x1.fff7491cd4af6p-1, 29714 + - 0x1.fff7aebcff755p-1, 29715 + - 0x1.fff80ff8911fdp-1, 29716 + - 0x1.fff86cfd3e657p-1, 29717 + - 0x1.fff8c5f702ccfp-1, 29718 + - 
0x1.fff91b102fca8p-1, 29719 + - 0x1.fff96c717b695p-1, 29720 + - 0x1.fff9ba420e834p-1, 29721 + - 0x1.fffa04a7928b1p-1, 29722 + - 0x1.fffa4bc63ee9ap-1, 29723 + - 0x1.fffa8fc0e5f33p-1, 29724 + - 0x1.fffad0b901755p-1, 29725 + - 0x1.fffb0ecebee1bp-1, 29726 + - 0x1.fffb4a210b172p-1, 29727 + - 0x1.fffb82cd9dcbfp-1, 29728 + - 0x1.fffbb8f1049c6p-1, 29729 + - 0x1.fffbeca6adbe9p-1, 29730 + - 0x1.fffc1e08f25f5p-1, 29731 + - 0x1.fffc4d3120aa1p-1, 29732 + - 0x1.fffc7a37857d2p-1, 29733 + - 0x1.fffca53375ce3p-1, 29734 + - 0x1.fffcce3b57bffp-1, 29735 + - 0x1.fffcf564ab6b7p-1, 29736 + - 0x1.fffd1ac4135f9p-1, 29737 + - 0x1.fffd3e6d5cd87p-1, 29738 + - 0x1.fffd607387b07p-1, 29739 + - 0x1.fffd80e8ce0dap-1, 29740 + - 0x1.fffd9fdeabccep-1, 29741 + - 0x1.fffdbd65e5ad0p-1, 29742 + - 0x1.fffdd98e903b2p-1, 29743 + - 0x1.fffdf46816833p-1, 29744 + - 0x1.fffe0e0140857p-1, 29745 + - 0x1.fffe26683972ap-1, 29746 + - 0x1.fffe3daa95b18p-1, 29747 + - 0x1.fffe53d558ae9p-1, 29748 + - 0x1.fffe68f4fa777p-1, 29749 + - 0x1.fffe7d156d244p-1, 29750 + - 0x1.fffe904222101p-1, 29751 + - 0x1.fffea2860ee1ep-1, 29752 + - 0x1.fffeb3ebb267bp-1, 29753 + - 0x1.fffec47d19457p-1, 29754 + - 0x1.fffed443e2787p-1, 29755 + - 0x1.fffee34943b15p-1, 29756 + - 0x1.fffef1960d85dp-1, 29757 + - 0x1.fffeff32af7afp-1, 29758 + - 0x1.ffff0c273bea2p-1, 29759 + - 0x1.ffff187b6bc0ep-1, 29760 + - 0x1.ffff2436a21dcp-1, 29761 + - 0x1.ffff2f5fefcaap-1, 29762 + - 0x1.ffff39fe16963p-1, 29763 + - 0x1.ffff44178c8d2p-1, 29764 + - 0x1.ffff4db27f146p-1, 29765 + - 0x1.ffff56d4d5e5ep-1, 29766 + - 0x1.ffff5f8435efcp-1, 29767 + - 0x1.ffff67c604180p-1, 29768 + - 0x1.ffff6f9f67e55p-1, 29769 + - 0x1.ffff77154e0d6p-1, 29770 + - 0x1.ffff7e2c6aea2p-1, 29771 + - 0x1.ffff84e93cd75p-1, 29772 + - 0x1.ffff8b500e77cp-1, 29773 + - 0x1.ffff9164f8e46p-1, 29774 + - 0x1.ffff972be5c59p-1, 29775 + - 0x1.ffff9ca891572p-1, 29776 + - 0x1.ffffa1de8c582p-1, 29777 + - 0x1.ffffa6d13de73p-1, 29778 + - 0x1.ffffab83e54b8p-1, 29779 + - 0x1.ffffaff99bac4p-1, 29780 + - 
0x1.ffffb43555b5fp-1, 29781 + - 0x1.ffffb839e52f3p-1, 29782 + - 0x1.ffffbc09fa7cdp-1, 29783 + - 0x1.ffffbfa82616bp-1, 29784 + - 0x1.ffffc316d9ed0p-1, 29785 + - 0x1.ffffc6586abf6p-1, 29786 + - 0x1.ffffc96f1165ep-1, 29787 + - 0x1.ffffcc5cec0c1p-1, 29788 + - 0x1.ffffcf23ff5fcp-1, 29789 + - 0x1.ffffd1c637b2bp-1, 29790 + - 0x1.ffffd4456a10dp-1, 29791 + - 0x1.ffffd6a3554a1p-1, 29792 + - 0x1.ffffd8e1a2f22p-1, 29793 + - 0x1.ffffdb01e8546p-1, 29794 + - 0x1.ffffdd05a75eap-1, 29795 + - 0x1.ffffdeee4f810p-1, 29796 + - 0x1.ffffe0bd3e852p-1, 29797 + - 0x1.ffffe273c15b7p-1, 29798 + - 0x1.ffffe41314e06p-1, 29799 + - 0x1.ffffe59c6698bp-1, 29800 + - 0x1.ffffe710d565ep-1, 29801 + - 0x1.ffffe8717232dp-1, 29802 + - 0x1.ffffe9bf4098cp-1, 29803 + - 0x1.ffffeafb377d5p-1, 29804 + - 0x1.ffffec2641a9ep-1, 29805 + - 0x1.ffffed413e5b7p-1, 29806 + - 0x1.ffffee4d01cd6p-1, 29807 + - 0x1.ffffef4a55bd4p-1, 29808 + - 0x1.fffff039f9e8fp-1, 29809 + - 0x1.fffff11ca4876p-1, 29810 + - 0x1.fffff1f302bc1p-1, 29811 + - 0x1.fffff2bdb904dp-1, 29812 + - 0x1.fffff37d63a36p-1, 29813 + - 0x1.fffff43297019p-1, 29814 + - 0x1.fffff4dde0118p-1, 29815 + - 0x1.fffff57fc4a95p-1, 29816 + - 0x1.fffff618c3da6p-1, 29817 + - 0x1.fffff6a956450p-1, 29818 + - 0x1.fffff731ee681p-1, 29819 + - 0x1.fffff7b2f8ed6p-1, 29820 + - 0x1.fffff82cdcf1bp-1, 29821 + - 0x1.fffff89ffc4aap-1, 29822 + - 0x1.fffff90cb3c81p-1, 29823 + - 0x1.fffff9735b73bp-1, 29824 + - 0x1.fffff9d446cccp-1, 29825 + - 0x1.fffffa2fc5015p-1, 29826 + - 0x1.fffffa8621251p-1, 29827 + - 0x1.fffffad7a2652p-1, 29828 + - 0x1.fffffb248c39dp-1, 29829 + - 0x1.fffffb6d1e95dp-1, 29830 + - 0x1.fffffbb196132p-1, 29831 + - 0x1.fffffbf22c1e2p-1, 29832 + - 0x1.fffffc2f171e3p-1, 29833 + - 0x1.fffffc688a9cfp-1, 29834 + - 0x1.fffffc9eb76acp-1, 29835 + - 0x1.fffffcd1cbc28p-1, 29836 + - 0x1.fffffd01f36afp-1, 29837 + - 0x1.fffffd2f57d68p-1, 29838 + - 0x1.fffffd5a2041fp-1, 29839 + - 0x1.fffffd8271d12p-1, 29840 + - 0x1.fffffda86faa9p-1, 29841 + - 0x1.fffffdcc3b117p-1, 29842 + - 
0x1.fffffdedf37edp-1, 29843 + - 0x1.fffffe0db6b91p-1, 29844 + - 0x1.fffffe2ba0ea5p-1, 29845 + - 0x1.fffffe47ccb60p-1, 29846 + - 0x1.fffffe62534d4p-1, 29847 + - 0x1.fffffe7b4c81ep-1, 29848 + - 0x1.fffffe92ced93p-1, 29849 + - 0x1.fffffea8ef9cfp-1, 29850 + - 0x1.fffffebdc2ec6p-1, 29851 + - 0x1.fffffed15bcbap-1, 29852 + - 0x1.fffffee3cc32cp-1, 29853 + - 0x1.fffffef5251c2p-1, 29854 + - 0x1.ffffff0576917p-1, 29855 + - 0x1.ffffff14cfb92p-1, 29856 + - 0x1.ffffff233ee1dp-1, 29857 + - 0x1.ffffff30d18e8p-1, 29858 + - 0x1.ffffff3d9480fp-1, 29859 + - 0x1.ffffff4993c46p-1, 29860 + - 0x1.ffffff54dab72p-1, 29861 + - 0x1.ffffff5f74141p-1, 29862 + - 0x1.ffffff6969fb8p-1, 29863 + - 0x1.ffffff72c5fb6p-1, 29864 + - 0x1.ffffff7b91176p-1, 29865 + - 0x1.ffffff83d3d07p-1, 29866 + - 0x1.ffffff8b962bep-1, 29867 + - 0x1.ffffff92dfba2p-1, 29868 + - 0x1.ffffff99b79d2p-1, 29869 + - 0x1.ffffffa0248e8p-1, 29870 + - 0x1.ffffffa62ce54p-1, 29871 + - 0x1.ffffffabd69b4p-1, 29872 + - 0x1.ffffffb127525p-1, 29873 + - 0x1.ffffffb624592p-1, 29874 + - 0x1.ffffffbad2affp-1, 29875 + - 0x1.ffffffbf370cdp-1, 29876 + - 0x1.ffffffc355dfdp-1, 29877 + - 0x1.ffffffc733572p-1, 29878 + - 0x1.ffffffcad3626p-1, 29879 + - 0x1.ffffffce39b67p-1, 29880 + - 0x1.ffffffd169d0cp-1, 29881 + - 0x1.ffffffd466fa5p-1, 29882 + - 0x1.ffffffd7344aap-1, 29883 + - 0x1.ffffffd9d4aabp-1, 29884 + - 0x1.ffffffdc4ad7ap-1, 29885 + - 0x1.ffffffde9964ep-1, 29886 + - 0x1.ffffffe0c2bf0p-1, 29887 + - 0x1.ffffffe2c92dbp-1, 29888 + - 0x1.ffffffe4aed5ep-1, 29889 + - 0x1.ffffffe675bbdp-1, 29890 + - 0x1.ffffffe81fc4ep-1, 29891 + - 0x1.ffffffe9aeb97p-1, 29892 + - 0x1.ffffffeb24467p-1, 29893 + - 0x1.ffffffec81ff2p-1, 29894 + - 0x1.ffffffedc95e7p-1, 29895 + - 0x1.ffffffeefbc85p-1, 29896 + - 0x1.fffffff01a8b6p-1, 29897 + - 0x1.fffffff126e1ep-1, 29898 + - 0x1.fffffff221f30p-1, 29899 + - 0x1.fffffff30cd3fp-1, 29900 + - 0x1.fffffff3e8892p-1, 29901 + - 0x1.fffffff4b606fp-1, 29902 + - 0x1.fffffff57632dp-1, 29903 + - 0x1.fffffff629e44p-1, 29904 + - 
0x1.fffffff6d1e56p-1, 29905 + - 0x1.fffffff76ef3fp-1, 29906 + - 0x1.fffffff801c1fp-1, 29907 + - 0x1.fffffff88af67p-1, 29908 + - 0x1.fffffff90b2e3p-1, 29909 + - 0x1.fffffff982fc1p-1, 29910 + - 0x1.fffffff9f2e9fp-1, 29911 + - 0x1.fffffffa5b790p-1, 29912 + - 0x1.fffffffabd229p-1, 29913 + - 0x1.fffffffb18582p-1, 29914 + - 0x1.fffffffb6d844p-1, 29915 + - 0x1.fffffffbbd0aap-1, 29916 + - 0x1.fffffffc0748fp-1, 29917 + - 0x1.fffffffc4c96cp-1, 29918 + - 0x1.fffffffc8d462p-1, 29919 + - 0x1.fffffffcc9a41p-1, 29920 + - 0x1.fffffffd01f89p-1, 29921 + - 0x1.fffffffd36871p-1, 29922 + - 0x1.fffffffd678edp-1, 29923 + - 0x1.fffffffd954aep-1, 29924 + - 0x1.fffffffdbff2ap-1, 29925 + - 0x1.fffffffde7ba0p-1, 29926 + - 0x1.fffffffe0cd16p-1, 29927 + - 0x1.fffffffe2f664p-1, 29928 + - 0x1.fffffffe4fa30p-1, 29929 + - 0x1.fffffffe6daf7p-1, 29930 + - 0x1.fffffffe89b0cp-1, 29931 + - 0x1.fffffffea3c9ap-1, 29932 + - 0x1.fffffffebc1a9p-1, 29933 + - 0x1.fffffffed2c21p-1, 29934 + - 0x1.fffffffee7dc8p-1, 29935 + - 0x1.fffffffefb847p-1, 29936 + - 0x1.ffffffff0dd2bp-1, 29937 + - 0x1.ffffffff1ede9p-1, 29938 + - 0x1.ffffffff2ebdap-1, 29939 + - 0x1.ffffffff3d843p-1, 29940 + - 0x1.ffffffff4b453p-1, 29941 + - 0x1.ffffffff58126p-1, 29942 + - 0x1.ffffffff63fc3p-1, 29943 + - 0x1.ffffffff6f121p-1, 29944 + - 0x1.ffffffff79626p-1, 29945 + - 0x1.ffffffff82fabp-1, 29946 + - 0x1.ffffffff8be77p-1, 29947 + - 0x1.ffffffff94346p-1, 29948 + - 0x1.ffffffff9bec8p-1, 29949 + - 0x1.ffffffffa319fp-1, 29950 + - 0x1.ffffffffa9c63p-1, 29951 + - 0x1.ffffffffaffa4p-1, 29952 + - 0x1.ffffffffb5be5p-1, 29953 + - 0x1.ffffffffbb1a2p-1, 29954 + - 0x1.ffffffffc014ep-1, 29955 + - 0x1.ffffffffc4b56p-1, 29956 + - 0x1.ffffffffc901cp-1, 29957 + - 0x1.ffffffffccfffp-1, 29958 + - 0x1.ffffffffd0b56p-1, 29959 + - 0x1.ffffffffd4271p-1, 29960 + - 0x1.ffffffffd759dp-1, 29961 + - 0x1.ffffffffda520p-1, 29962 + - 0x1.ffffffffdd13cp-1, 29963 + - 0x1.ffffffffdfa2dp-1, 29964 + - 0x1.ffffffffe202dp-1, 29965 + - 0x1.ffffffffe4371p-1, 29966 + - 
0x1.ffffffffe642ap-1, 29967 + - 0x1.ffffffffe8286p-1, 29968 + - 0x1.ffffffffe9eb0p-1, 29969 + - 0x1.ffffffffeb8d0p-1, 29970 + - 0x1.ffffffffed10ap-1, 29971 + - 0x1.ffffffffee782p-1, 29972 + - 0x1.ffffffffefc57p-1, 29973 + - 0x1.fffffffff0fa7p-1, 29974 + - 0x1.fffffffff218fp-1, 29975 + - 0x1.fffffffff3227p-1, 29976 + - 0x1.fffffffff4188p-1, 29977 + - 0x1.fffffffff4fc9p-1, 29978 + - 0x1.fffffffff5cfdp-1, 29979 + - 0x1.fffffffff6939p-1, 29980 + - 0x1.fffffffff748ep-1, 29981 + - 0x1.fffffffff7f0dp-1, 29982 + - 0x1.fffffffff88c5p-1, 29983 + - 0x1.fffffffff91c6p-1, 29984 + - 0x1.fffffffff9a1bp-1, 29985 + - 0x1.fffffffffa1d2p-1, 29986 + - 0x1.fffffffffa8f6p-1, 29987 + - 0x1.fffffffffaf92p-1, 29988 + - 0x1.fffffffffb5b0p-1, 29989 + - 0x1.fffffffffbb58p-1, 29990 + - 0x1.fffffffffc095p-1, 29991 + - 0x1.fffffffffc56dp-1, 29992 + - 0x1.fffffffffc9e8p-1, 29993 + - 0x1.fffffffffce0dp-1, 29994 + - 0x1.fffffffffd1e1p-1, 29995 + - 0x1.fffffffffd56cp-1, 29996 + - 0x1.fffffffffd8b3p-1, 29997 + - 0x1.fffffffffdbbap-1, 29998 + - 0x1.fffffffffde86p-1, 29999 + - 0x1.fffffffffe11dp-1, 30000 + - 0x1.fffffffffe380p-1, 30001 + - 0x1.fffffffffe5b6p-1, 30002 + - 0x1.fffffffffe7c0p-1, 30003 + - 0x1.fffffffffe9a2p-1, 30004 + - 0x1.fffffffffeb60p-1, 30005 + - 0x1.fffffffffecfbp-1, 30006 + - 0x1.fffffffffee77p-1, 30007 + - 0x1.fffffffffefd6p-1, 30008 + - 0x1.ffffffffff11ap-1, 30009 + - 0x1.ffffffffff245p-1, 30010 + - 0x1.ffffffffff359p-1, 30011 + - 0x1.ffffffffff457p-1, 30012 + - 0x1.ffffffffff542p-1, 30013 + - 0x1.ffffffffff61bp-1, 30014 + - 0x1.ffffffffff6e3p-1, 30015 + - 0x1.ffffffffff79bp-1, 30016 + - 0x1.ffffffffff845p-1, 30017 + - 0x1.ffffffffff8e2p-1, 30018 + - 0x1.ffffffffff973p-1, 30019 + - 0x1.ffffffffff9f8p-1, 30020 + - 0x1.ffffffffffa73p-1, 30021 + - 0x1.ffffffffffae4p-1, 30022 + - 0x1.ffffffffffb4cp-1, 30023 + - 0x1.ffffffffffbadp-1, 30024 + - 0x1.ffffffffffc05p-1, 30025 + - 0x1.ffffffffffc57p-1, 30026 + - 0x1.ffffffffffca2p-1, 30027 + - 0x1.ffffffffffce7p-1, 30028 + - 
0x1.ffffffffffd27p-1, 30029 + - 0x1.ffffffffffd62p-1, 30030 + - 0x1.ffffffffffd98p-1, 30031 + - 0x1.ffffffffffdcap-1, 30032 + - 0x1.ffffffffffdf8p-1, 30033 + - 0x1.ffffffffffe22p-1, 30034 + - 0x1.ffffffffffe49p-1, 30035 + - 0x1.ffffffffffe6cp-1, 30036 + - 0x1.ffffffffffe8dp-1, 30037 + - 0x1.ffffffffffeabp-1, 30038 + - 0x1.ffffffffffec7p-1, 30039 + - 0x1.ffffffffffee1p-1, 30040 + - 0x1.ffffffffffef8p-1, 30041 + - 0x1.fffffffffff0ep-1, 30042 + - 0x1.fffffffffff22p-1, 30043 + - 0x1.fffffffffff34p-1, 30044 + - 0x1.fffffffffff45p-1, 30045 + - 0x1.fffffffffff54p-1, 30046 + - 0x1.fffffffffff62p-1, 30047 + - 0x1.fffffffffff6fp-1, 30048 + - 0x1.fffffffffff7bp-1, 30049 + - 0x1.fffffffffff86p-1, 30050 + - 0x1.fffffffffff90p-1, 30051 + - 0x1.fffffffffff9ap-1, 30052 + - 0x1.fffffffffffa2p-1, 30053 + - 0x1.fffffffffffaap-1, 30054 + - 0x1.fffffffffffb1p-1, 30055 + - 0x1.fffffffffffb8p-1, 30056 + - 0x1.fffffffffffbep-1, 30057 + - 0x1.fffffffffffc3p-1, 30058 + - 0x1.fffffffffffc8p-1, 30059 + - 0x1.fffffffffffcdp-1, 30060 + - 0x1.fffffffffffd1p-1, 30061 + - 0x1.fffffffffffd5p-1, 30062 + - 0x1.fffffffffffd9p-1, 30063 + - 0x1.fffffffffffdcp-1, 30064 + - 0x1.fffffffffffdfp-1, 30065 + - 0x1.fffffffffffe2p-1, 30066 + - 0x1.fffffffffffe4p-1, 30067 + - 0x1.fffffffffffe7p-1, 30068 + - 0x1.fffffffffffe9p-1, 30069 + - 0x1.fffffffffffebp-1, 30070 + - 0x1.fffffffffffedp-1, 30071 + - 0x1.fffffffffffeep-1, 30072 + - 0x1.ffffffffffff0p-1, 30073 + - 0x1.ffffffffffff1p-1, 30074 + - 0x1.ffffffffffff3p-1, 30075 + - 0x1.ffffffffffff4p-1, 30076 + - 0x1.ffffffffffff5p-1, 30077 + - 0x1.ffffffffffff6p-1, 30078 + - 0x1.ffffffffffff7p-1, 30079 + - 0x1.ffffffffffff7p-1, 30080 + - 0x1.ffffffffffff8p-1, 30081 + - 0x1.ffffffffffff9p-1, 30082 + - 0x1.ffffffffffff9p-1, 30083 + - 0x1.ffffffffffffap-1, 30084 + - 0x1.ffffffffffffbp-1, 30085 + - 0x1.ffffffffffffbp-1, 30086 + - 0x1.ffffffffffffbp-1, 30087 + - 0x1.ffffffffffffcp-1, 30088 + - 0x1.ffffffffffffcp-1, 30089 + - 0x1.ffffffffffffdp-1, 30090 + - 
0x1.ffffffffffffdp-1, 30091 + - 0x1.ffffffffffffdp-1, 30092 + - 0x1.ffffffffffffdp-1, 30093 + - 0x1.ffffffffffffep-1, 30094 + - 0x1.ffffffffffffep-1, 30095 + - 0x1.ffffffffffffep-1, 30096 + - 0x1.ffffffffffffep-1, 30097 + - 0x1.ffffffffffffep-1, 30098 + - 0x1.ffffffffffffep-1, 30099 + - 0x1.fffffffffffffp-1, 30100 + - 0x1.fffffffffffffp-1, 30101 + - 0x1.fffffffffffffp-1, 30102 + - 0x1.fffffffffffffp-1, 30103 + - 0x1.fffffffffffffp-1, 30104 + - 0x1.fffffffffffffp-1, 30105 + - 0x1.fffffffffffffp-1, 30106 + - 0x1.fffffffffffffp-1, 30107 + - 0x1.fffffffffffffp-1, 30108 + - 0x1.fffffffffffffp-1, 30109 + - 0x1.fffffffffffffp-1, 30110 + - 0x1.0000000000000p+0, 30111 + - 0x1.0000000000000p+0, 30112 + - 0x1.0000000000000p+0, 30113 + - 0x1.0000000000000p+0, 30114 + - 0x1.0000000000000p+0, 30115 + - 0x1.0000000000000p+0, 30116 + - 0x1.0000000000000p+0, 30117 + - 0x1.0000000000000p+0, 30118 + - 0x1.0000000000000p+0, 30119 + - 0x1.0000000000000p+0, 30120 + - 0x1.0000000000000p+0, 30121 + - }, 30122 + - .scale = { 0x1.20dd750429b6dp+0, 30123 + - 0x1.20d8f1975c85dp+0, 30124 + - 0x1.20cb67bd452c7p+0, 30125 + - 0x1.20b4d8bac36c1p+0, 30126 + - 0x1.209546ad13ccfp+0, 30127 + - 0x1.206cb4897b148p+0, 30128 + - 0x1.203b261cd0052p+0, 30129 + - 0x1.2000a00ae3804p+0, 30130 + - 0x1.1fbd27cdc72d3p+0, 30131 + - 0x1.1f70c3b4f2cc7p+0, 30132 + - 0x1.1f1b7ae44867fp+0, 30133 + - 0x1.1ebd5552f795bp+0, 30134 + - 0x1.1e565bca400d4p+0, 30135 + - 0x1.1de697e413d28p+0, 30136 + - 0x1.1d6e14099944ap+0, 30137 + - 0x1.1cecdb718d61cp+0, 30138 + - 0x1.1c62fa1e869b6p+0, 30139 + - 0x1.1bd07cdd189acp+0, 30140 + - 0x1.1b357141d95d5p+0, 30141 + - 0x1.1a91e5a748165p+0, 30142 + - 0x1.19e5e92b964abp+0, 30143 + - 0x1.19318bae53a04p+0, 30144 + - 0x1.1874ddcdfce24p+0, 30145 + - 0x1.17aff0e56ec10p+0, 30146 + - 0x1.16e2d7093cd8cp+0, 30147 + - 0x1.160da304ed92fp+0, 30148 + - 0x1.153068581b781p+0, 30149 + - 0x1.144b3b337c90cp+0, 30150 + - 0x1.135e3075d076bp+0, 30151 + - 0x1.12695da8b5bdep+0, 30152 + - 0x1.116cd8fd67618p+0, 
30153 + - 0x1.1068b94962e5ep+0, 30154 + - 0x1.0f5d1602f7e41p+0, 30155 + - 0x1.0e4a073dc1b91p+0, 30156 + - 0x1.0d2fa5a70c168p+0, 30157 + - 0x1.0c0e0a8223359p+0, 30158 + - 0x1.0ae54fa490722p+0, 30159 + - 0x1.09b58f724416bp+0, 30160 + - 0x1.087ee4d9ad247p+0, 30161 + - 0x1.07416b4fbfe7cp+0, 30162 + - 0x1.05fd3ecbec297p+0, 30163 + - 0x1.04b27bc403d30p+0, 30164 + - 0x1.03613f2812dafp+0, 30165 + - 0x1.0209a65e29545p+0, 30166 + - 0x1.00abcf3e187a9p+0, 30167 + - 0x1.fe8fb01a47307p-1, 30168 + - 0x1.fbbbbef34b4b2p-1, 30169 + - 0x1.f8dc092d58ff8p-1, 30170 + - 0x1.f5f0cdaf15313p-1, 30171 + - 0x1.f2fa4c16c0019p-1, 30172 + - 0x1.eff8c4b1375dbp-1, 30173 + - 0x1.ecec7870ebca7p-1, 30174 + - 0x1.e9d5a8e4c934ep-1, 30175 + - 0x1.e6b4982f158b9p-1, 30176 + - 0x1.e38988fc46e72p-1, 30177 + - 0x1.e054be79d3042p-1, 30178 + - 0x1.dd167c4cf9d2ap-1, 30179 + - 0x1.d9cf06898cdafp-1, 30180 + - 0x1.d67ea1a8b5368p-1, 30181 + - 0x1.d325927fb9d89p-1, 30182 + - 0x1.cfc41e36c7df9p-1, 30183 + - 0x1.cc5a8a3fbea40p-1, 30184 + - 0x1.c8e91c4d01368p-1, 30185 + - 0x1.c5701a484ef9dp-1, 30186 + - 0x1.c1efca49a5011p-1, 30187 + - 0x1.be68728e29d5dp-1, 30188 + - 0x1.bada596f25436p-1, 30189 + - 0x1.b745c55905bf8p-1, 30190 + - 0x1.b3aafcc27502ep-1, 30191 + - 0x1.b00a46237d5bep-1, 30192 + - 0x1.ac63e7ecc1411p-1, 30193 + - 0x1.a8b8287ec6a09p-1, 30194 + - 0x1.a5074e2157620p-1, 30195 + - 0x1.a1519efaf889ep-1, 30196 + - 0x1.9d97610879642p-1, 30197 + - 0x1.99d8da149c13fp-1, 30198 + - 0x1.96164fafd8de3p-1, 30199 + - 0x1.925007283d7aap-1, 30200 + - 0x1.8e86458169af8p-1, 30201 + - 0x1.8ab94f6caa71dp-1, 30202 + - 0x1.86e9694134b9ep-1, 30203 + - 0x1.8316d6f48133dp-1, 30204 + - 0x1.7f41dc12c9e89p-1, 30205 + - 0x1.7b6abbb7aaf19p-1, 30206 + - 0x1.7791b886e7403p-1, 30207 + - 0x1.73b714a552763p-1, 30208 + - 0x1.6fdb11b1e0c34p-1, 30209 + - 0x1.6bfdf0beddaf5p-1, 30210 + - 0x1.681ff24b4ab04p-1, 30211 + - 0x1.6441563c665d4p-1, 30212 + - 0x1.60625bd75d07bp-1, 30213 + - 0x1.5c8341bb23767p-1, 30214 + - 0x1.58a445da7c74cp-1, 30215 + - 
0x1.54c5a57629db0p-1, 30216 + - 0x1.50e79d1749ac9p-1, 30217 + - 0x1.4d0a6889dfd9fp-1, 30218 + - 0x1.492e42d78d2c5p-1, 30219 + - 0x1.4553664273d24p-1, 30220 + - 0x1.417a0c4049fd0p-1, 30221 + - 0x1.3da26d759aef5p-1, 30222 + - 0x1.39ccc1b136d5ap-1, 30223 + - 0x1.35f93fe7d1b3dp-1, 30224 + - 0x1.32281e2fd1a92p-1, 30225 + - 0x1.2e5991bd4cbfcp-1, 30226 + - 0x1.2a8dcede3673bp-1, 30227 + - 0x1.26c508f6bd0ffp-1, 30228 + - 0x1.22ff727dd6f7bp-1, 30229 + - 0x1.1f3d3cf9ffe5ap-1, 30230 + - 0x1.1b7e98fe26217p-1, 30231 + - 0x1.17c3b626c7a11p-1, 30232 + - 0x1.140cc3173f007p-1, 30233 + - 0x1.1059ed7740313p-1, 30234 + - 0x1.0cab61f084b93p-1, 30235 + - 0x1.09014c2ca74dap-1, 30236 + - 0x1.055bd6d32e8d7p-1, 30237 + - 0x1.01bb2b87c6968p-1, 30238 + - 0x1.fc3ee5d1524b0p-2, 30239 + - 0x1.f511a91a67d2ap-2, 30240 + - 0x1.edeeee0959518p-2, 30241 + - 0x1.e6d6ffaa65a25p-2, 30242 + - 0x1.dfca26f5bbf88p-2, 30243 + - 0x1.d8c8aace11e63p-2, 30244 + - 0x1.d1d2cfff91594p-2, 30245 + - 0x1.cae8d93f1d7b6p-2, 30246 + - 0x1.c40b0729ed547p-2, 30247 + - 0x1.bd3998457afdap-2, 30248 + - 0x1.b674c8ffc6283p-2, 30249 + - 0x1.afbcd3afe8ab6p-2, 30250 + - 0x1.a911f096fbc26p-2, 30251 + - 0x1.a27455e14c93cp-2, 30252 + - 0x1.9be437a7de946p-2, 30253 + - 0x1.9561c7f23a47bp-2, 30254 + - 0x1.8eed36b886d93p-2, 30255 + - 0x1.8886b1e5ecfd1p-2, 30256 + - 0x1.822e655b417e6p-2, 30257 + - 0x1.7be47af1f5d89p-2, 30258 + - 0x1.75a91a7f4d2edp-2, 30259 + - 0x1.6f7c69d7d3ef8p-2, 30260 + - 0x1.695e8cd31867ep-2, 30261 + - 0x1.634fa54fa285fp-2, 30262 + - 0x1.5d4fd33729015p-2, 30263 + - 0x1.575f3483021c3p-2, 30264 + - 0x1.517de540ce2a3p-2, 30265 + - 0x1.4babff975a04cp-2, 30266 + - 0x1.45e99bcbb7915p-2, 30267 + - 0x1.4036d0468a7a2p-2, 30268 + - 0x1.3a93b1998736cp-2, 30269 + - 0x1.35005285227f1p-2, 30270 + - 0x1.2f7cc3fe6f423p-2, 30271 + - 0x1.2a09153529381p-2, 30272 + - 0x1.24a55399ea239p-2, 30273 + - 0x1.1f518ae487dc8p-2, 30274 + - 0x1.1a0dc51a9934dp-2, 30275 + - 0x1.14da0a961fd14p-2, 30276 + - 0x1.0fb6620c550afp-2, 30277 + - 
0x1.0aa2d09497f2bp-2, 30278 + - 0x1.059f59af7a906p-2, 30279 + - 0x1.00abff4dec7a3p-2, 30280 + - 0x1.f79183b101c5bp-3, 30281 + - 0x1.edeb406d9c824p-3, 30282 + - 0x1.e4652fadcb6b2p-3, 30283 + - 0x1.daff4969c0b04p-3, 30284 + - 0x1.d1b982c501370p-3, 30285 + - 0x1.c893ce1dcbef7p-3, 30286 + - 0x1.bf8e1b1ca2279p-3, 30287 + - 0x1.b6a856c3ed54fp-3, 30288 + - 0x1.ade26b7fbed95p-3, 30289 + - 0x1.a53c4135a6526p-3, 30290 + - 0x1.9cb5bd549b111p-3, 30291 + - 0x1.944ec2e4f5630p-3, 30292 + - 0x1.8c07329874652p-3, 30293 + - 0x1.83deeada4d25ap-3, 30294 + - 0x1.7bd5c7df3fe9cp-3, 30295 + - 0x1.73eba3b5b07b7p-3, 30296 + - 0x1.6c205655be71fp-3, 30297 + - 0x1.6473b5b15a7a1p-3, 30298 + - 0x1.5ce595c455b0ap-3, 30299 + - 0x1.5575c8a468361p-3, 30300 + - 0x1.4e241e912c305p-3, 30301 + - 0x1.46f066040a832p-3, 30302 + - 0x1.3fda6bc016994p-3, 30303 + - 0x1.38e1fae1d6a9dp-3, 30304 + - 0x1.3206dceef5f87p-3, 30305 + - 0x1.2b48d9e5dea1cp-3, 30306 + - 0x1.24a7b84d38971p-3, 30307 + - 0x1.1e233d434b813p-3, 30308 + - 0x1.17bb2c8d41535p-3, 30309 + - 0x1.116f48a6476ccp-3, 30310 + - 0x1.0b3f52ce8c383p-3, 30311 + - 0x1.052b0b1a174eap-3, 30312 + - 0x1.fe6460fef4680p-4, 30313 + - 0x1.f2a901ccafb37p-4, 30314 + - 0x1.e723726b824a9p-4, 30315 + - 0x1.dbd32ac4c99b0p-4, 30316 + - 0x1.d0b7a0f921e7cp-4, 30317 + - 0x1.c5d0497c09e74p-4, 30318 + - 0x1.bb1c972f23e50p-4, 30319 + - 0x1.b09bfb7d11a83p-4, 30320 + - 0x1.a64de673e8837p-4, 30321 + - 0x1.9c31c6df3b1b8p-4, 30322 + - 0x1.92470a61b6965p-4, 30323 + - 0x1.888d1d8e510a3p-4, 30324 + - 0x1.7f036c0107294p-4, 30325 + - 0x1.75a96077274bap-4, 30326 + - 0x1.6c7e64e7281cbp-4, 30327 + - 0x1.6381e2980956bp-4, 30328 + - 0x1.5ab342383d177p-4, 30329 + - 0x1.5211ebf41880bp-4, 30330 + - 0x1.499d478bca735p-4, 30331 + - 0x1.4154bc68d75c3p-4, 30332 + - 0x1.3937b1b319259p-4, 30333 + - 0x1.31458e6542847p-4, 30334 + - 0x1.297db960e4f63p-4, 30335 + - 0x1.21df9981f8e53p-4, 30336 + - 0x1.1a6a95b1e786fp-4, 30337 + - 0x1.131e14fa1625dp-4, 30338 + - 0x1.0bf97e95f2a64p-4, 30339 + - 
0x1.04fc3a0481321p-4, 30340 + - 0x1.fc4b5e32d6259p-5, 30341 + - 0x1.eeea8c1b1db93p-5, 30342 + - 0x1.e1d4cf1e2450ap-5, 30343 + - 0x1.d508f9a1ea64ep-5, 30344 + - 0x1.c885df3451a07p-5, 30345 + - 0x1.bc4a54a84e834p-5, 30346 + - 0x1.b055303221015p-5, 30347 + - 0x1.a4a549829587ep-5, 30348 + - 0x1.993979e14fffdp-5, 30349 + - 0x1.8e109c4622913p-5, 30350 + - 0x1.83298d717210ep-5, 30351 + - 0x1.78832c03aa2b1p-5, 30352 + - 0x1.6e1c5893c380bp-5, 30353 + - 0x1.63f3f5c4de13bp-5, 30354 + - 0x1.5a08e85af27e0p-5, 30355 + - 0x1.505a174e9c929p-5, 30356 + - 0x1.46e66be002240p-5, 30357 + - 0x1.3dacd1a8d8ccdp-5, 30358 + - 0x1.34ac36ad8dafep-5, 30359 + - 0x1.2be38b6d92415p-5, 30360 + - 0x1.2351c2f2d1449p-5, 30361 + - 0x1.1af5d2e04f3f6p-5, 30362 + - 0x1.12ceb37ff9bc3p-5, 30363 + - 0x1.0adb5fcfa8c75p-5, 30364 + - 0x1.031ad58d56279p-5, 30365 + - 0x1.f7182a851bca2p-6, 30366 + - 0x1.e85c449e377f2p-6, 30367 + - 0x1.da0005e5f28dfp-6, 30368 + - 0x1.cc0180af00a8bp-6, 30369 + - 0x1.be5ecd2fcb5f9p-6, 30370 + - 0x1.b1160991ff737p-6, 30371 + - 0x1.a4255a00b9f03p-6, 30372 + - 0x1.978ae8b55ce1bp-6, 30373 + - 0x1.8b44e6031383ep-6, 30374 + - 0x1.7f5188610ddc8p-6, 30375 + - 0x1.73af0c737bb45p-6, 30376 + - 0x1.685bb5134ef13p-6, 30377 + - 0x1.5d55cb54cd53ap-6, 30378 + - 0x1.529b9e8cf9a1ep-6, 30379 + - 0x1.482b8455dc491p-6, 30380 + - 0x1.3e03d891b37dep-6, 30381 + - 0x1.3422fd6d12e2bp-6, 30382 + - 0x1.2a875b5ffab56p-6, 30383 + - 0x1.212f612dee7fbp-6, 30384 + - 0x1.181983e5133ddp-6, 30385 + - 0x1.0f443edc5ce49p-6, 30386 + - 0x1.06ae13b0d3255p-6, 30387 + - 0x1.fcab1483ea7fcp-7, 30388 + - 0x1.ec72615a894c4p-7, 30389 + - 0x1.dcaf3691fc448p-7, 30390 + - 0x1.cd5ec93c12431p-7, 30391 + - 0x1.be7e5ac24963bp-7, 30392 + - 0x1.b00b38d6b3575p-7, 30393 + - 0x1.a202bd6372dcep-7, 30394 + - 0x1.94624e78e0fafp-7, 30395 + - 0x1.87275e3a6869dp-7, 30396 + - 0x1.7a4f6aca256cbp-7, 30397 + - 0x1.6dd7fe3358230p-7, 30398 + - 0x1.61beae53b72b7p-7, 30399 + - 0x1.56011cc3b036dp-7, 30400 + - 0x1.4a9cf6bda3f4cp-7, 30401 + - 
0x1.3f8ff5042a88ep-7, 30402 + - 0x1.34d7dbc76d7e5p-7, 30403 + - 0x1.2a727a89a3f14p-7, 30404 + - 0x1.205dac02bd6b9p-7, 30405 + - 0x1.1697560347b25p-7, 30406 + - 0x1.0d1d69569b82dp-7, 30407 + - 0x1.03ede1a45bfeep-7, 30408 + - 0x1.f60d8aa2a88f2p-8, 30409 + - 0x1.e4cc4abf7d065p-8, 30410 + - 0x1.d4143a9dfe965p-8, 30411 + - 0x1.c3e1a5f5c077cp-8, 30412 + - 0x1.b430ecf4a83a8p-8, 30413 + - 0x1.a4fe83fb9db25p-8, 30414 + - 0x1.9646f35a76623p-8, 30415 + - 0x1.8806d70b2fc36p-8, 30416 + - 0x1.7a3ade6c8b3e4p-8, 30417 + - 0x1.6cdfcbfc1e263p-8, 30418 + - 0x1.5ff2750fe7820p-8, 30419 + - 0x1.536fc18f7ce5cp-8, 30420 + - 0x1.4754abacdf1dcp-8, 30421 + - 0x1.3b9e3f9d06e3fp-8, 30422 + - 0x1.30499b503957fp-8, 30423 + - 0x1.2553ee2a336bfp-8, 30424 + - 0x1.1aba78ba3af89p-8, 30425 + - 0x1.107a8c7323a6ep-8, 30426 + - 0x1.06918b6355624p-8, 30427 + - 0x1.f9f9cfd9c3035p-9, 30428 + - 0x1.e77448fb66bb9p-9, 30429 + - 0x1.d58da68fd1170p-9, 30430 + - 0x1.c4412bf4b8f0bp-9, 30431 + - 0x1.b38a3af2e55b4p-9, 30432 + - 0x1.a3645330550ffp-9, 30433 + - 0x1.93cb11a30d765p-9, 30434 + - 0x1.84ba3004a50d0p-9, 30435 + - 0x1.762d84469c18fp-9, 30436 + - 0x1.6821000795a03p-9, 30437 + - 0x1.5a90b00981d93p-9, 30438 + - 0x1.4d78bba8ca5fdp-9, 30439 + - 0x1.40d564548fad7p-9, 30440 + - 0x1.34a305080681fp-9, 30441 + - 0x1.28de11c5031ebp-9, 30442 + - 0x1.1d83170fbf6fbp-9, 30443 + - 0x1.128eb96be8798p-9, 30444 + - 0x1.07fdb4dafea5fp-9, 30445 + - 0x1.fb99b8b8279e1p-10, 30446 + - 0x1.e7f232d9e2630p-10, 30447 + - 0x1.d4fed7195d7e8p-10, 30448 + - 0x1.c2b9cf7f893bfp-10, 30449 + - 0x1.b11d702b3deb1p-10, 30450 + - 0x1.a024365f771bdp-10, 30451 + - 0x1.8fc8c794b03b5p-10, 30452 + - 0x1.8005f08d6f1efp-10, 30453 + - 0x1.70d6a46e07ddap-10, 30454 + - 0x1.6235fbd7a4345p-10, 30455 + - 0x1.541f340697987p-10, 30456 + - 0x1.468dadf4080abp-10, 30457 + - 0x1.397ced7af2b15p-10, 30458 + - 0x1.2ce898809244ep-10, 30459 + - 0x1.20cc76202c5fap-10, 30460 + - 0x1.15246dda49d47p-10, 30461 + - 0x1.09ec86c75d497p-10, 30462 + - 0x1.fe41cd9bb4eeep-11, 30463 + 
- 0x1.e97ba3b77f306p-11, 30464 + - 0x1.d57f524723822p-11, 30465 + - 0x1.c245d4b998479p-11, 30466 + - 0x1.afc85e0f82e12p-11, 30467 + - 0x1.9e005769dbc1dp-11, 30468 + - 0x1.8ce75e9f6f8a0p-11, 30469 + - 0x1.7c7744d9378f7p-11, 30470 + - 0x1.6caa0d3582fe9p-11, 30471 + - 0x1.5d79eb71e893bp-11, 30472 + - 0x1.4ee1429bf7cc0p-11, 30473 + - 0x1.40daa3c89f5b6p-11, 30474 + - 0x1.3360ccd23db3ap-11, 30475 + - 0x1.266ea71d4f71ap-11, 30476 + - 0x1.19ff4663ae9dfp-11, 30477 + - 0x1.0e0de78654d1ep-11, 30478 + - 0x1.0295ef6591848p-11, 30479 + - 0x1.ef25d37f49fe1p-12, 30480 + - 0x1.da01102b5f851p-12, 30481 + - 0x1.c5b5412dcafadp-12, 30482 + - 0x1.b23a5a23e4210p-12, 30483 + - 0x1.9f8893d8fd1c1p-12, 30484 + - 0x1.8d986a4187285p-12, 30485 + - 0x1.7c629a822bc9ep-12, 30486 + - 0x1.6be02102b3520p-12, 30487 + - 0x1.5c0a378c90bcap-12, 30488 + - 0x1.4cda5374ea275p-12, 30489 + - 0x1.3e4a23d1f4702p-12, 30490 + - 0x1.30538fbb77ecdp-12, 30491 + - 0x1.22f0b496539bdp-12, 30492 + - 0x1.161be46ad3b50p-12, 30493 + - 0x1.09cfa445b00ffp-12, 30494 + - 0x1.fc0d55470cf51p-13, 30495 + - 0x1.e577bbcd49935p-13, 30496 + - 0x1.cfd4a5adec5bfp-13, 30497 + - 0x1.bb1a9657ce465p-13, 30498 + - 0x1.a740684026555p-13, 30499 + - 0x1.943d4a1d1ed39p-13, 30500 + - 0x1.8208bc334a6a5p-13, 30501 + - 0x1.709a8db59f25cp-13, 30502 + - 0x1.5feada379d8b7p-13, 30503 + - 0x1.4ff207314a102p-13, 30504 + - 0x1.40a8c1949f75ep-13, 30505 + - 0x1.3207fb7420eb9p-13, 30506 + - 0x1.2408e9ba3327fp-13, 30507 + - 0x1.16a501f0e42cap-13, 30508 + - 0x1.09d5f819c9e29p-13, 30509 + - 0x1.fb2b792b40a22p-14, 30510 + - 0x1.e3bcf436a1a95p-14, 30511 + - 0x1.cd55277c18d05p-14, 30512 + - 0x1.b7e94604479dcp-14, 30513 + - 0x1.a36eec00926ddp-14, 30514 + - 0x1.8fdc1b2dcf7b9p-14, 30515 + - 0x1.7d2737527c3f9p-14, 30516 + - 0x1.6b4702d7d5849p-14, 30517 + - 0x1.5a329b7d30748p-14, 30518 + - 0x1.49e17724f4d41p-14, 30519 + - 0x1.3a4b60ba9aa4dp-14, 30520 + - 0x1.2b6875310f785p-14, 30521 + - 0x1.1d312098e9dbap-14, 30522 + - 0x1.0f9e1b4dd36dfp-14, 30523 + - 
0x1.02a8673a94691p-14, 30524 + - 0x1.ec929a665b449p-15, 30525 + - 0x1.d4f4b4c8e09edp-15, 30526 + - 0x1.be6abbb10a5aap-15, 30527 + - 0x1.a8e8cc1fadef6p-15, 30528 + - 0x1.94637d5bacfdbp-15, 30529 + - 0x1.80cfdc72220cfp-15, 30530 + - 0x1.6e2367dc27f95p-15, 30531 + - 0x1.5c540b4936fd2p-15, 30532 + - 0x1.4b581b8d170fcp-15, 30533 + - 0x1.3b2652b06c2b2p-15, 30534 + - 0x1.2bb5cc22e5db6p-15, 30535 + - 0x1.1cfe010e2052dp-15, 30536 + - 0x1.0ef6c4c84a0fep-15, 30537 + - 0x1.01984165a5f36p-15, 30538 + - 0x1.e9b5e8d00ce76p-16, 30539 + - 0x1.d16f5716c6c1ap-16, 30540 + - 0x1.ba4f035d60e02p-16, 30541 + - 0x1.a447b7b03f045p-16, 30542 + - 0x1.8f4ccca7fc90dp-16, 30543 + - 0x1.7b5223dac7336p-16, 30544 + - 0x1.684c227fcacefp-16, 30545 + - 0x1.562fac4329b48p-16, 30546 + - 0x1.44f21e49054f2p-16, 30547 + - 0x1.34894a5e24657p-16, 30548 + - 0x1.24eb7254ccf83p-16, 30549 + - 0x1.160f438c70913p-16, 30550 + - 0x1.07ebd2a2d2844p-16, 30551 + - 0x1.f4f12e9ab070ap-17, 30552 + - 0x1.db5ad0b27805cp-17, 30553 + - 0x1.c304efa2c6f4ep-17, 30554 + - 0x1.abe09e9144b5ep-17, 30555 + - 0x1.95df988e76644p-17, 30556 + - 0x1.80f439b4ee04bp-17, 30557 + - 0x1.6d11788a69c64p-17, 30558 + - 0x1.5a2adfa0b4bc4p-17, 30559 + - 0x1.4834877429b8fp-17, 30560 + - 0x1.37231085c7d9ap-17, 30561 + - 0x1.26eb9daed6f7ep-17, 30562 + - 0x1.1783ceac28910p-17, 30563 + - 0x1.08e1badf0fcedp-17, 30564 + - 0x1.f5f7d88472604p-18, 30565 + - 0x1.db92b5212fb8dp-18, 30566 + - 0x1.c282cd3957edap-18, 30567 + - 0x1.aab7abace48dcp-18, 30568 + - 0x1.94219bfcb4928p-18, 30569 + - 0x1.7eb1a2075864dp-18, 30570 + - 0x1.6a597219a93d9p-18, 30571 + - 0x1.570b69502f313p-18, 30572 + - 0x1.44ba864670882p-18, 30573 + - 0x1.335a62115bce2p-18, 30574 + - 0x1.22df298214423p-18, 30575 + - 0x1.133d96ae7e0ddp-18, 30576 + - 0x1.046aeabcfcdecp-18, 30577 + - 0x1.ecb9cfe1d8642p-19, 30578 + - 0x1.d21397ead99cbp-19, 30579 + - 0x1.b8d094c86d374p-19, 30580 + - 0x1.a0df0f0c626dcp-19, 30581 + - 0x1.8a2e269750a39p-19, 30582 + - 0x1.74adc8f4064d3p-19, 30583 + - 
0x1.604ea819f007cp-19, 30584 + - 0x1.4d0231928c6f9p-19, 30585 + - 0x1.3aba85fe22e1fp-19, 30586 + - 0x1.296a70f414053p-19, 30587 + - 0x1.1905613b3abf2p-19, 30588 + - 0x1.097f6156f32c5p-19, 30589 + - 0x1.f59a20caf6695p-20, 30590 + - 0x1.d9c73698fb1dcp-20, 30591 + - 0x1.bf716c6168baep-20, 30592 + - 0x1.a6852c6b58392p-20, 30593 + - 0x1.8eefd70594a88p-20, 30594 + - 0x1.789fb715aae95p-20, 30595 + - 0x1.6383f726a8e04p-20, 30596 + - 0x1.4f8c96f26a26ap-20, 30597 + - 0x1.3caa61607f920p-20, 30598 + - 0x1.2acee2f5ecdb8p-20, 30599 + - 0x1.19ec60b1242edp-20, 30600 + - 0x1.09f5cf4dd2877p-20, 30601 + - 0x1.f5bd95d8730d8p-21, 30602 + - 0x1.d9371e2ff7c35p-21, 30603 + - 0x1.be41de54d155ap-21, 30604 + - 0x1.a4c89e08ef4f3p-21, 30605 + - 0x1.8cb738399b12cp-21, 30606 + - 0x1.75fa8dbc84becp-21, 30607 + - 0x1.608078a70dcbcp-21, 30608 + - 0x1.4c37c0394d094p-21, 30609 + - 0x1.39100d5687bfep-21, 30610 + - 0x1.26f9df8519bd6p-21, 30611 + - 0x1.15e6827001f18p-21, 30612 + - 0x1.05c803e4831c1p-21, 30613 + - 0x1.ed22548cffd35p-22, 30614 + - 0x1.d06ad6ecdf971p-22, 30615 + - 0x1.b551c847fbc96p-22, 30616 + - 0x1.9bc09f112b494p-22, 30617 + - 0x1.83a1ff0aa239dp-22, 30618 + - 0x1.6ce1aa3fd7bddp-22, 30619 + - 0x1.576c72b514859p-22, 30620 + - 0x1.43302cc4a0da8p-22, 30621 + - 0x1.301ba221dc9bbp-22, 30622 + - 0x1.1e1e857adc568p-22, 30623 + - 0x1.0d2966b1746f7p-22, 30624 + - 0x1.fa5b4f49cc6b2p-23, 30625 + - 0x1.dc3ae30b55c16p-23, 30626 + - 0x1.bfd7555a3bd68p-23, 30627 + - 0x1.a517d9e61628ap-23, 30628 + - 0x1.8be4f8f6c951fp-23, 30629 + - 0x1.74287ded49339p-23, 30630 + - 0x1.5dcd669f2cd34p-23, 30631 + - 0x1.48bfd38302870p-23, 30632 + - 0x1.34ecf8a3c124ap-23, 30633 + - 0x1.22430f521cbcfp-23, 30634 + - 0x1.10b1488aeb235p-23, 30635 + - 0x1.0027c00a263a6p-23, 30636 + - 0x1.e12ee004efc37p-24, 30637 + - 0x1.c3e44ae32b16bp-24, 30638 + - 0x1.a854ea14102a8p-24, 30639 + - 0x1.8e6761569f45dp-24, 30640 + - 0x1.7603bac345f65p-24, 30641 + - 0x1.5f1353cdad001p-24, 30642 + - 0x1.4980cb3c80949p-24, 30643 + - 
0x1.3537f00b6ad4dp-24, 30644 + - 0x1.2225b12bffc68p-24, 30645 + - 0x1.10380e1adb7e9p-24, 30646 + - 0x1.febc107d5efaap-25, 30647 + - 0x1.df0f2a0ee6946p-25, 30648 + - 0x1.c14b2188bcee4p-25, 30649 + - 0x1.a553644f7f07dp-25, 30650 + - 0x1.8b0cfce0579dfp-25, 30651 + - 0x1.725e7c5dd20f7p-25, 30652 + - 0x1.5b2fe547a1340p-25, 30653 + - 0x1.456a974e92e93p-25, 30654 + - 0x1.30f93c3699078p-25, 30655 + - 0x1.1dc7b5b978cf8p-25, 30656 + - 0x1.0bc30c5d52f15p-25, 30657 + - 0x1.f5b2be65a0c7fp-26, 30658 + - 0x1.d5f3a8dea7357p-26, 30659 + - 0x1.b82915b03515bp-26, 30660 + - 0x1.9c3517e789488p-26, 30661 + - 0x1.81fb7df06136ep-26, 30662 + - 0x1.6961b8d641d06p-26, 30663 + - 0x1.524ec4d916caep-26, 30664 + - 0x1.3cab1343d18d1p-26, 30665 + - 0x1.2860757487a01p-26, 30666 + - 0x1.155a09065d4f7p-26, 30667 + - 0x1.0384250e4c9fcp-26, 30668 + - 0x1.e59890b926c78p-27, 30669 + - 0x1.c642116a8a9e3p-27, 30670 + - 0x1.a8e405e651ab6p-27, 30671 + - 0x1.8d5f98114f872p-27, 30672 + - 0x1.7397c5a66e307p-27, 30673 + - 0x1.5b71456c5a4c4p-27, 30674 + - 0x1.44d26de513197p-27, 30675 + - 0x1.2fa31d6371537p-27, 30676 + - 0x1.1bcca373b7b43p-27, 30677 + - 0x1.0939ab853339fp-27, 30678 + - 0x1.efac5187b2863p-28, 30679 + - 0x1.cf1e86235d0e6p-28, 30680 + - 0x1.b0a68a2128babp-28, 30681 + - 0x1.9423165bc4444p-28, 30682 + - 0x1.7974e743dea3cp-28, 30683 + - 0x1.607e9eacd1050p-28, 30684 + - 0x1.4924a74dec728p-28, 30685 + - 0x1.334d19e0c2160p-28, 30686 + - 0x1.1edfa3c5f5ccap-28, 30687 + - 0x1.0bc56f1b54701p-28, 30688 + - 0x1.f3d2185e047d9p-29, 30689 + - 0x1.d26cb87945e87p-29, 30690 + - 0x1.b334fac4b9f99p-29, 30691 + - 0x1.96076f7918d1cp-29, 30692 + - 0x1.7ac2d72fc2c63p-29, 30693 + - 0x1.614801550319ep-29, 30694 + - 0x1.4979ac8b28926p-29, 30695 + - 0x1.333c68e2d0548p-29, 30696 + - 0x1.1e767bce37dd7p-29, 30697 + - 0x1.0b0fc5b6d05a0p-29, 30698 + - 0x1.f1e3523b41d7dp-30, 30699 + - 0x1.d00de6608effep-30, 30700 + - 0x1.b0778b7b3301ap-30, 30701 + - 0x1.92fb04ec0f6cfp-30, 30702 + - 0x1.77756ec9f78fap-30, 30703 + - 
0x1.5dc61922d5a06p-30, 30704 + - 0x1.45ce65699ff6dp-30, 30705 + - 0x1.2f71a5f159970p-30, 30706 + - 0x1.1a94ff571654fp-30, 30707 + - 0x1.071f4bbea09ecp-30, 30708 + - 0x1.e9f1ff8ddd774p-31, 30709 + - 0x1.c818223a202c7p-31, 30710 + - 0x1.a887bd2b4404dp-31, 30711 + - 0x1.8b1a336c5eb6bp-31, 30712 + - 0x1.6fab63324088ap-31, 30713 + - 0x1.56197e30205bap-31, 30714 + - 0x1.3e44e45301b92p-31, 30715 + - 0x1.281000bfe4c3fp-31, 30716 + - 0x1.135f28f2d50b4p-31, 30717 + - 0x1.00187dded5975p-31, 30718 + - 0x1.dc479de0ef001p-32, 30719 + - 0x1.bad4fdad3caa1p-32, 30720 + - 0x1.9baed3ed27ab8p-32, 30721 + - 0x1.7ead9ce4285bbp-32, 30722 + - 0x1.63ac6b4edc88ep-32, 30723 + - 0x1.4a88be2a6390cp-32, 30724 + - 0x1.332259185f1a0p-32, 30725 + - 0x1.1d5b1f3793044p-32, 30726 + - 0x1.0916f04b6e18bp-32, 30727 + - 0x1.ec77101de6926p-33, 30728 + - 0x1.c960bf23153e0p-33, 30729 + - 0x1.a8bd20fc65ef7p-33, 30730 + - 0x1.8a61745ec7d1dp-33, 30731 + - 0x1.6e25d0e756261p-33, 30732 + - 0x1.53e4f7d1666cbp-33, 30733 + - 0x1.3b7c27a7ddb0ep-33, 30734 + - 0x1.24caf2c32af14p-33, 30735 + - 0x1.0fb3186804d0fp-33, 30736 + - 0x1.f830c0bb41fd7p-34, 30737 + - 0x1.d3c0f1a91c846p-34, 30738 + - 0x1.b1e5acf351d87p-34, 30739 + - 0x1.92712d259ce66p-34, 30740 + - 0x1.7538c60a04476p-34, 30741 + - 0x1.5a14b04b47879p-34, 30742 + - 0x1.40dfd87456f4cp-34, 30743 + - 0x1.2977b1172b9d5p-34, 30744 + - 0x1.13bc07e891491p-34, 30745 + - 0x1.ff1dbb4300811p-35, 30746 + - 0x1.d9a880f306bd8p-35, 30747 + - 0x1.b6e45220b55e0p-35, 30748 + - 0x1.96a0b33f2c4dap-35, 30749 + - 0x1.78b07e9e924acp-35, 30750 + - 0x1.5ce9ab1670dd2p-35, 30751 + - 0x1.4325167006bb0p-35, 30752 + - 0x1.2b3e53538ff3fp-35, 30753 + - 0x1.15137a7f44864p-35, 30754 + - 0x1.0084ff125639dp-35, 30755 + - 0x1.daeb0b7311ec7p-36, 30756 + - 0x1.b7937d1c40c52p-36, 30757 + - 0x1.96d082f59ab06p-36, 30758 + - 0x1.7872d9fa10aadp-36, 30759 + - 0x1.5c4e8e37bc7d0p-36, 30760 + - 0x1.423ac0df49a40p-36, 30761 + - 0x1.2a117230ad284p-36, 30762 + - 0x1.13af4f04f9998p-36, 30763 + - 
0x1.fde703724e560p-37, 30764 + - 0x1.d77f0c82e7641p-37, 30765 + - 0x1.b3ee02611d7ddp-37, 30766 + - 0x1.92ff33023d5bdp-37, 30767 + - 0x1.7481a9e69f53fp-37, 30768 + - 0x1.5847eda620959p-37, 30769 + - 0x1.3e27c1fcc74bdp-37, 30770 + - 0x1.25f9ee0b923dcp-37, 30771 + - 0x1.0f9a0686531ffp-37, 30772 + - 0x1.f5cc7718082afp-38, 30773 + - 0x1.cf7e53d6a2ca5p-38, 30774 + - 0x1.ac0f5f3229372p-38, 30775 + - 0x1.8b498644847eap-38, 30776 + - 0x1.6cfa9bcca59dcp-38, 30777 + - 0x1.50f411d4fd2cdp-38, 30778 + - 0x1.370ab8327af5ep-38, 30779 + - 0x1.1f167f88c6b6ep-38, 30780 + - 0x1.08f24085d4597p-38, 30781 + - 0x1.e8f70e181d619p-39, 30782 + - 0x1.c324c20e337dcp-39, 30783 + - 0x1.a03261574b54ep-39, 30784 + - 0x1.7fe903cdf5855p-39, 30785 + - 0x1.6215c58da3450p-39, 30786 + - 0x1.46897d4b69fc6p-39, 30787 + - 0x1.2d1877d731b7bp-39, 30788 + - 0x1.159a386b11517p-39, 30789 + - 0x1.ffd27ae9393cep-40, 30790 + - 0x1.d7c593130dd0bp-40, 30791 + - 0x1.b2cd607c79bcfp-40, 30792 + - 0x1.90ae4d3405651p-40, 30793 + - 0x1.71312dd1759e2p-40, 30794 + - 0x1.5422ef5d8949dp-40, 30795 + - 0x1.39544b0ecc957p-40, 30796 + - 0x1.20997f73e73ddp-40, 30797 + - 0x1.09ca0eaacd277p-40, 30798 + - 0x1.e9810295890ecp-41, 30799 + - 0x1.c2b45b5aa4a1dp-41, 30800 + - 0x1.9eee068fa7596p-41, 30801 + - 0x1.7df2b399c10a8p-41, 30802 + - 0x1.5f8b87a31bd85p-41, 30803 + - 0x1.4385c96e9a2d9p-41, 30804 + - 0x1.29b2933ef4cbcp-41, 30805 + - 0x1.11e68a6378f8ap-41, 30806 + - 0x1.f7f338086a86bp-42, 30807 + - 0x1.cf8d7d9ce040ap-42, 30808 + - 0x1.aa577251ae484p-42, 30809 + - 0x1.8811d739efb5ep-42, 30810 + - 0x1.68823e52970bep-42, 30811 + - 0x1.4b72ae68e8b4cp-42, 30812 + - 0x1.30b14dbe876bcp-42, 30813 + - 0x1.181012ef86610p-42, 30814 + - 0x1.01647ba798744p-42, 30815 + - 0x1.d90e917701675p-43, 30816 + - 0x1.b2a87e86d0c8ap-43, 30817 + - 0x1.8f53dcb377293p-43, 30818 + - 0x1.6ed2f2515e933p-43, 30819 + - 0x1.50ecc9ed47f19p-43, 30820 + - 0x1.356cd5ce7799ep-43, 30821 + - 0x1.1c229a587ab78p-43, 30822 + - 0x1.04e15ecc7f3f6p-43, 30823 + - 
0x1.deffc7e6a6017p-44, 30824 + - 0x1.b7b040832f310p-44, 30825 + - 0x1.938e021f36d76p-44, 30826 + - 0x1.7258610b3b233p-44, 30827 + - 0x1.53d3bfc82a909p-44, 30828 + - 0x1.37c92babdc2fdp-44, 30829 + - 0x1.1e06010120f6ap-44, 30830 + - 0x1.065b9616170d4p-44, 30831 + - 0x1.e13dd96b3753ap-45, 30832 + - 0x1.b950d32467392p-45, 30833 + - 0x1.94a72263259a5p-45, 30834 + - 0x1.72fd93e036cdcp-45, 30835 + - 0x1.54164576929abp-45, 30836 + - 0x1.37b83c521fe96p-45, 30837 + - 0x1.1daf033182e96p-45, 30838 + - 0x1.05ca50205d26ap-45, 30839 + - 0x1.dfbb6235639fap-46, 30840 + - 0x1.b7807e294781fp-46, 30841 + - 0x1.9298add70a734p-46, 30842 + - 0x1.70beaf9c7ffb6p-46, 30843 + - 0x1.51b2cd6709222p-46, 30844 + - 0x1.353a6cf7f7fffp-46, 30845 + - 0x1.1b1fa8cbe84a7p-46, 30846 + - 0x1.0330f0fd69921p-46, 30847 + - 0x1.da81670f96f9bp-47, 30848 + - 0x1.b24a16b4d09aap-47, 30849 + - 0x1.8d6eeb6efdbd6p-47, 30850 + - 0x1.6ba91ac734785p-47, 30851 + - 0x1.4cb7966770ab5p-47, 30852 + - 0x1.305e9721d0981p-47, 30853 + - 0x1.1667311fff70ap-47, 30854 + - 0x1.fd3de10d62855p-48, 30855 + - 0x1.d1aefbcd48d0cp-48, 30856 + - 0x1.a9cc93c25aca9p-48, 30857 + - 0x1.85487ee3ea735p-48, 30858 + - 0x1.63daf8b4b1e0cp-48, 30859 + - 0x1.45421e69a6ca1p-48, 30860 + - 0x1.294175802d99ap-48, 30861 + - 0x1.0fa17bf41068fp-48, 30862 + - 0x1.f05e82aae2bb9p-49, 30863 + - 0x1.c578101b29058p-49, 30864 + - 0x1.9e39dc5dd2f7cp-49, 30865 + - 0x1.7a553a728bbf2p-49, 30866 + - 0x1.5982008db1304p-49, 30867 + - 0x1.3b7e00422e51bp-49, 30868 + - 0x1.200c898d9ee3ep-49, 30869 + - 0x1.06f5f7eb65a56p-49, 30870 + - 0x1.e00e9148a1d25p-50, 30871 + - 0x1.b623734024e92p-50, 30872 + - 0x1.8fd4e01891bf8p-50, 30873 + - 0x1.6cd44c7470d89p-50, 30874 + - 0x1.4cd9c04158cd7p-50, 30875 + - 0x1.2fa34bf5c8344p-50, 30876 + - 0x1.14f4890ff2461p-50, 30877 + - 0x1.f92c49dfa4df5p-51, 30878 + - 0x1.ccaaea71ab0dfp-51, 30879 + - 0x1.a40829f001197p-51, 30880 + - 0x1.7eef13b59e96cp-51, 30881 + - 0x1.5d11e1a252bf5p-51, 30882 + - 0x1.3e296303b2297p-51, 30883 + - 
0x1.21f47009f43cep-51, 30884 + - 0x1.083768c5e4541p-51, 30885 + - 0x1.e1777d831265ep-52, 30886 + - 0x1.b69f10b0191b5p-52, 30887 + - 0x1.8f8a3a05b5b52p-52, 30888 + - 0x1.6be573c40c8e7p-52, 30889 + - 0x1.4b645ba991fdbp-52, 30890 + - 0x1.2dc119095729fp-52, 30891 + - }, 30892 + -}; 30893 + diff --git a/sysdeps/aarch64/fpu/sv_erff_data.c b/sysdeps/aarch64/fpu/sv_erff_data.c 30894 + deleted file mode 100644 30895 + index 6dcd72af69..0000000000 30896 + --- a/sysdeps/aarch64/fpu/sv_erff_data.c 30897 + +++ /dev/null 30898 + @@ -1,1058 +0,0 @@ 30899 + -/* Table for SVE erff approximation 30900 + - 30901 + - Copyright (C) 2024 Free Software Foundation, Inc. 30902 + - This file is part of the GNU C Library. 30903 + - 30904 + - The GNU C Library is free software; you can redistribute it and/or 30905 + - modify it under the terms of the GNU Lesser General Public 30906 + - License as published by the Free Software Foundation; either 30907 + - version 2.1 of the License, or (at your option) any later version. 30908 + - 30909 + - The GNU C Library is distributed in the hope that it will be useful, 30910 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 30911 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 30912 + - Lesser General Public License for more details. 30913 + - 30914 + - You should have received a copy of the GNU Lesser General Public 30915 + - License along with the GNU C Library; if not, see 30916 + - <https://www.gnu.org/licenses/>. */ 30917 + - 30918 + -#include "vecmath_config.h" 30919 + - 30920 + -/* Lookup table used in SVE erff. 30921 + - For each possible rounded input r (multiples of 1/128), between 30922 + - r = 0.0 and r = 4.0 (513 values): 30923 + - - __erff_data.erf contains the values of erf(r), 30924 + - - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2). 30925 + - Note that indices 0 and 1 are never hit by the algorithm, since lookup is 30926 + - performed only for x >= 1/64-1/512. 
*/ 30927 + -const struct sv_erff_data __sv_erff_data = { 30928 + - .erf = { 0x0.000000p+0, 30929 + - 0x1.20dbf4p-7, 30930 + - 0x1.20d770p-6, 30931 + - 0x1.b137e0p-6, 30932 + - 0x1.20c564p-5, 30933 + - 0x1.68e5d4p-5, 30934 + - 0x1.b0fafep-5, 30935 + - 0x1.f902a8p-5, 30936 + - 0x1.207d48p-4, 30937 + - 0x1.44703ep-4, 30938 + - 0x1.68591ap-4, 30939 + - 0x1.8c36bep-4, 30940 + - 0x1.b00812p-4, 30941 + - 0x1.d3cbf8p-4, 30942 + - 0x1.f7815ap-4, 30943 + - 0x1.0d9390p-3, 30944 + - 0x1.1f5e1ap-3, 30945 + - 0x1.311fc2p-3, 30946 + - 0x1.42d7fcp-3, 30947 + - 0x1.548642p-3, 30948 + - 0x1.662a0cp-3, 30949 + - 0x1.77c2d2p-3, 30950 + - 0x1.895010p-3, 30951 + - 0x1.9ad142p-3, 30952 + - 0x1.ac45e4p-3, 30953 + - 0x1.bdad72p-3, 30954 + - 0x1.cf076ep-3, 30955 + - 0x1.e05354p-3, 30956 + - 0x1.f190aap-3, 30957 + - 0x1.015f78p-2, 30958 + - 0x1.09eed6p-2, 30959 + - 0x1.127632p-2, 30960 + - 0x1.1af54ep-2, 30961 + - 0x1.236bf0p-2, 30962 + - 0x1.2bd9dcp-2, 30963 + - 0x1.343ed6p-2, 30964 + - 0x1.3c9aa8p-2, 30965 + - 0x1.44ed18p-2, 30966 + - 0x1.4d35f0p-2, 30967 + - 0x1.5574f4p-2, 30968 + - 0x1.5da9f4p-2, 30969 + - 0x1.65d4b8p-2, 30970 + - 0x1.6df50ap-2, 30971 + - 0x1.760abap-2, 30972 + - 0x1.7e1594p-2, 30973 + - 0x1.861566p-2, 30974 + - 0x1.8e0a02p-2, 30975 + - 0x1.95f336p-2, 30976 + - 0x1.9dd0d2p-2, 30977 + - 0x1.a5a2acp-2, 30978 + - 0x1.ad6896p-2, 30979 + - 0x1.b52264p-2, 30980 + - 0x1.bccfecp-2, 30981 + - 0x1.c47104p-2, 30982 + - 0x1.cc0584p-2, 30983 + - 0x1.d38d44p-2, 30984 + - 0x1.db081cp-2, 30985 + - 0x1.e275eap-2, 30986 + - 0x1.e9d68ap-2, 30987 + - 0x1.f129d4p-2, 30988 + - 0x1.f86faap-2, 30989 + - 0x1.ffa7eap-2, 30990 + - 0x1.03693ap-1, 30991 + - 0x1.06f794p-1, 30992 + - 0x1.0a7ef6p-1, 30993 + - 0x1.0dff50p-1, 30994 + - 0x1.117894p-1, 30995 + - 0x1.14eab4p-1, 30996 + - 0x1.1855a6p-1, 30997 + - 0x1.1bb95cp-1, 30998 + - 0x1.1f15ccp-1, 30999 + - 0x1.226ae8p-1, 31000 + - 0x1.25b8a8p-1, 31001 + - 0x1.28ff02p-1, 31002 + - 0x1.2c3decp-1, 31003 + - 0x1.2f755cp-1, 31004 + - 0x1.32a54cp-1, 31005 + 
- 0x1.35cdb4p-1, 31006 + - 0x1.38ee8ap-1, 31007 + - 0x1.3c07cap-1, 31008 + - 0x1.3f196ep-1, 31009 + - 0x1.42236ep-1, 31010 + - 0x1.4525c8p-1, 31011 + - 0x1.482074p-1, 31012 + - 0x1.4b1372p-1, 31013 + - 0x1.4dfebap-1, 31014 + - 0x1.50e24cp-1, 31015 + - 0x1.53be26p-1, 31016 + - 0x1.569244p-1, 31017 + - 0x1.595ea6p-1, 31018 + - 0x1.5c2348p-1, 31019 + - 0x1.5ee02ep-1, 31020 + - 0x1.619556p-1, 31021 + - 0x1.6442c0p-1, 31022 + - 0x1.66e86ep-1, 31023 + - 0x1.69865ep-1, 31024 + - 0x1.6c1c98p-1, 31025 + - 0x1.6eab18p-1, 31026 + - 0x1.7131e6p-1, 31027 + - 0x1.73b102p-1, 31028 + - 0x1.762870p-1, 31029 + - 0x1.789836p-1, 31030 + - 0x1.7b0058p-1, 31031 + - 0x1.7d60d8p-1, 31032 + - 0x1.7fb9c0p-1, 31033 + - 0x1.820b12p-1, 31034 + - 0x1.8454d6p-1, 31035 + - 0x1.869712p-1, 31036 + - 0x1.88d1cep-1, 31037 + - 0x1.8b050ep-1, 31038 + - 0x1.8d30dep-1, 31039 + - 0x1.8f5544p-1, 31040 + - 0x1.91724ap-1, 31041 + - 0x1.9387f6p-1, 31042 + - 0x1.959652p-1, 31043 + - 0x1.979d68p-1, 31044 + - 0x1.999d42p-1, 31045 + - 0x1.9b95e8p-1, 31046 + - 0x1.9d8768p-1, 31047 + - 0x1.9f71cap-1, 31048 + - 0x1.a1551ap-1, 31049 + - 0x1.a33162p-1, 31050 + - 0x1.a506b0p-1, 31051 + - 0x1.a6d50cp-1, 31052 + - 0x1.a89c86p-1, 31053 + - 0x1.aa5d26p-1, 31054 + - 0x1.ac16fcp-1, 31055 + - 0x1.adca14p-1, 31056 + - 0x1.af767ap-1, 31057 + - 0x1.b11c3cp-1, 31058 + - 0x1.b2bb68p-1, 31059 + - 0x1.b4540ap-1, 31060 + - 0x1.b5e630p-1, 31061 + - 0x1.b771e8p-1, 31062 + - 0x1.b8f742p-1, 31063 + - 0x1.ba764ap-1, 31064 + - 0x1.bbef10p-1, 31065 + - 0x1.bd61a2p-1, 31066 + - 0x1.bece0ep-1, 31067 + - 0x1.c03464p-1, 31068 + - 0x1.c194b2p-1, 31069 + - 0x1.c2ef08p-1, 31070 + - 0x1.c44376p-1, 31071 + - 0x1.c5920ap-1, 31072 + - 0x1.c6dad2p-1, 31073 + - 0x1.c81de2p-1, 31074 + - 0x1.c95b46p-1, 31075 + - 0x1.ca930ep-1, 31076 + - 0x1.cbc54cp-1, 31077 + - 0x1.ccf20cp-1, 31078 + - 0x1.ce1962p-1, 31079 + - 0x1.cf3b5cp-1, 31080 + - 0x1.d0580cp-1, 31081 + - 0x1.d16f7ep-1, 31082 + - 0x1.d281c4p-1, 31083 + - 0x1.d38ef0p-1, 31084 + - 0x1.d49710p-1, 31085 + 
- 0x1.d59a34p-1, 31086 + - 0x1.d6986cp-1, 31087 + - 0x1.d791cap-1, 31088 + - 0x1.d8865ep-1, 31089 + - 0x1.d97636p-1, 31090 + - 0x1.da6162p-1, 31091 + - 0x1.db47f4p-1, 31092 + - 0x1.dc29fcp-1, 31093 + - 0x1.dd0788p-1, 31094 + - 0x1.dde0aap-1, 31095 + - 0x1.deb570p-1, 31096 + - 0x1.df85eap-1, 31097 + - 0x1.e0522ap-1, 31098 + - 0x1.e11a3ep-1, 31099 + - 0x1.e1de36p-1, 31100 + - 0x1.e29e22p-1, 31101 + - 0x1.e35a12p-1, 31102 + - 0x1.e41214p-1, 31103 + - 0x1.e4c638p-1, 31104 + - 0x1.e5768cp-1, 31105 + - 0x1.e62322p-1, 31106 + - 0x1.e6cc08p-1, 31107 + - 0x1.e7714ap-1, 31108 + - 0x1.e812fcp-1, 31109 + - 0x1.e8b12ap-1, 31110 + - 0x1.e94be4p-1, 31111 + - 0x1.e9e336p-1, 31112 + - 0x1.ea7730p-1, 31113 + - 0x1.eb07e2p-1, 31114 + - 0x1.eb9558p-1, 31115 + - 0x1.ec1fa2p-1, 31116 + - 0x1.eca6ccp-1, 31117 + - 0x1.ed2ae6p-1, 31118 + - 0x1.edabfcp-1, 31119 + - 0x1.ee2a1ep-1, 31120 + - 0x1.eea556p-1, 31121 + - 0x1.ef1db4p-1, 31122 + - 0x1.ef9344p-1, 31123 + - 0x1.f00614p-1, 31124 + - 0x1.f07630p-1, 31125 + - 0x1.f0e3a6p-1, 31126 + - 0x1.f14e82p-1, 31127 + - 0x1.f1b6d0p-1, 31128 + - 0x1.f21ca0p-1, 31129 + - 0x1.f27ff8p-1, 31130 + - 0x1.f2e0eap-1, 31131 + - 0x1.f33f7ep-1, 31132 + - 0x1.f39bc2p-1, 31133 + - 0x1.f3f5c2p-1, 31134 + - 0x1.f44d88p-1, 31135 + - 0x1.f4a31ep-1, 31136 + - 0x1.f4f694p-1, 31137 + - 0x1.f547f2p-1, 31138 + - 0x1.f59742p-1, 31139 + - 0x1.f5e490p-1, 31140 + - 0x1.f62fe8p-1, 31141 + - 0x1.f67952p-1, 31142 + - 0x1.f6c0dcp-1, 31143 + - 0x1.f7068cp-1, 31144 + - 0x1.f74a6ep-1, 31145 + - 0x1.f78c8cp-1, 31146 + - 0x1.f7cceep-1, 31147 + - 0x1.f80ba2p-1, 31148 + - 0x1.f848acp-1, 31149 + - 0x1.f8841ap-1, 31150 + - 0x1.f8bdf2p-1, 31151 + - 0x1.f8f63ep-1, 31152 + - 0x1.f92d08p-1, 31153 + - 0x1.f96256p-1, 31154 + - 0x1.f99634p-1, 31155 + - 0x1.f9c8a8p-1, 31156 + - 0x1.f9f9bap-1, 31157 + - 0x1.fa2974p-1, 31158 + - 0x1.fa57dep-1, 31159 + - 0x1.fa84fep-1, 31160 + - 0x1.fab0dep-1, 31161 + - 0x1.fadb84p-1, 31162 + - 0x1.fb04f6p-1, 31163 + - 0x1.fb2d40p-1, 31164 + - 0x1.fb5464p-1, 31165 + 
- 0x1.fb7a6cp-1, 31166 + - 0x1.fb9f60p-1, 31167 + - 0x1.fbc344p-1, 31168 + - 0x1.fbe61ep-1, 31169 + - 0x1.fc07fap-1, 31170 + - 0x1.fc28d8p-1, 31171 + - 0x1.fc48c2p-1, 31172 + - 0x1.fc67bcp-1, 31173 + - 0x1.fc85d0p-1, 31174 + - 0x1.fca2fep-1, 31175 + - 0x1.fcbf52p-1, 31176 + - 0x1.fcdaccp-1, 31177 + - 0x1.fcf576p-1, 31178 + - 0x1.fd0f54p-1, 31179 + - 0x1.fd286ap-1, 31180 + - 0x1.fd40bep-1, 31181 + - 0x1.fd5856p-1, 31182 + - 0x1.fd6f34p-1, 31183 + - 0x1.fd8562p-1, 31184 + - 0x1.fd9ae2p-1, 31185 + - 0x1.fdafb8p-1, 31186 + - 0x1.fdc3e8p-1, 31187 + - 0x1.fdd77ap-1, 31188 + - 0x1.fdea6ep-1, 31189 + - 0x1.fdfcccp-1, 31190 + - 0x1.fe0e96p-1, 31191 + - 0x1.fe1fd0p-1, 31192 + - 0x1.fe3080p-1, 31193 + - 0x1.fe40a6p-1, 31194 + - 0x1.fe504cp-1, 31195 + - 0x1.fe5f70p-1, 31196 + - 0x1.fe6e18p-1, 31197 + - 0x1.fe7c46p-1, 31198 + - 0x1.fe8a00p-1, 31199 + - 0x1.fe9748p-1, 31200 + - 0x1.fea422p-1, 31201 + - 0x1.feb090p-1, 31202 + - 0x1.febc96p-1, 31203 + - 0x1.fec836p-1, 31204 + - 0x1.fed374p-1, 31205 + - 0x1.fede52p-1, 31206 + - 0x1.fee8d4p-1, 31207 + - 0x1.fef2fep-1, 31208 + - 0x1.fefccep-1, 31209 + - 0x1.ff064cp-1, 31210 + - 0x1.ff0f76p-1, 31211 + - 0x1.ff1852p-1, 31212 + - 0x1.ff20e0p-1, 31213 + - 0x1.ff2924p-1, 31214 + - 0x1.ff3120p-1, 31215 + - 0x1.ff38d6p-1, 31216 + - 0x1.ff4048p-1, 31217 + - 0x1.ff4778p-1, 31218 + - 0x1.ff4e68p-1, 31219 + - 0x1.ff551ap-1, 31220 + - 0x1.ff5b90p-1, 31221 + - 0x1.ff61ccp-1, 31222 + - 0x1.ff67d0p-1, 31223 + - 0x1.ff6d9ep-1, 31224 + - 0x1.ff7338p-1, 31225 + - 0x1.ff789ep-1, 31226 + - 0x1.ff7dd4p-1, 31227 + - 0x1.ff82dap-1, 31228 + - 0x1.ff87b2p-1, 31229 + - 0x1.ff8c5cp-1, 31230 + - 0x1.ff90dcp-1, 31231 + - 0x1.ff9532p-1, 31232 + - 0x1.ff9960p-1, 31233 + - 0x1.ff9d68p-1, 31234 + - 0x1.ffa14ap-1, 31235 + - 0x1.ffa506p-1, 31236 + - 0x1.ffa8a0p-1, 31237 + - 0x1.ffac18p-1, 31238 + - 0x1.ffaf6ep-1, 31239 + - 0x1.ffb2a6p-1, 31240 + - 0x1.ffb5bep-1, 31241 + - 0x1.ffb8b8p-1, 31242 + - 0x1.ffbb98p-1, 31243 + - 0x1.ffbe5ap-1, 31244 + - 0x1.ffc102p-1, 31245 + 
- 0x1.ffc390p-1, 31246 + - 0x1.ffc606p-1, 31247 + - 0x1.ffc862p-1, 31248 + - 0x1.ffcaa8p-1, 31249 + - 0x1.ffccd8p-1, 31250 + - 0x1.ffcef4p-1, 31251 + - 0x1.ffd0fap-1, 31252 + - 0x1.ffd2eap-1, 31253 + - 0x1.ffd4cap-1, 31254 + - 0x1.ffd696p-1, 31255 + - 0x1.ffd84ep-1, 31256 + - 0x1.ffd9f8p-1, 31257 + - 0x1.ffdb90p-1, 31258 + - 0x1.ffdd18p-1, 31259 + - 0x1.ffde90p-1, 31260 + - 0x1.ffdffap-1, 31261 + - 0x1.ffe154p-1, 31262 + - 0x1.ffe2a2p-1, 31263 + - 0x1.ffe3e2p-1, 31264 + - 0x1.ffe514p-1, 31265 + - 0x1.ffe63cp-1, 31266 + - 0x1.ffe756p-1, 31267 + - 0x1.ffe866p-1, 31268 + - 0x1.ffe96ap-1, 31269 + - 0x1.ffea64p-1, 31270 + - 0x1.ffeb54p-1, 31271 + - 0x1.ffec3ap-1, 31272 + - 0x1.ffed16p-1, 31273 + - 0x1.ffedeap-1, 31274 + - 0x1.ffeeb4p-1, 31275 + - 0x1.ffef76p-1, 31276 + - 0x1.fff032p-1, 31277 + - 0x1.fff0e4p-1, 31278 + - 0x1.fff18ep-1, 31279 + - 0x1.fff232p-1, 31280 + - 0x1.fff2d0p-1, 31281 + - 0x1.fff366p-1, 31282 + - 0x1.fff3f6p-1, 31283 + - 0x1.fff480p-1, 31284 + - 0x1.fff504p-1, 31285 + - 0x1.fff582p-1, 31286 + - 0x1.fff5fcp-1, 31287 + - 0x1.fff670p-1, 31288 + - 0x1.fff6dep-1, 31289 + - 0x1.fff74ap-1, 31290 + - 0x1.fff7aep-1, 31291 + - 0x1.fff810p-1, 31292 + - 0x1.fff86cp-1, 31293 + - 0x1.fff8c6p-1, 31294 + - 0x1.fff91cp-1, 31295 + - 0x1.fff96cp-1, 31296 + - 0x1.fff9bap-1, 31297 + - 0x1.fffa04p-1, 31298 + - 0x1.fffa4cp-1, 31299 + - 0x1.fffa90p-1, 31300 + - 0x1.fffad0p-1, 31301 + - 0x1.fffb0ep-1, 31302 + - 0x1.fffb4ap-1, 31303 + - 0x1.fffb82p-1, 31304 + - 0x1.fffbb8p-1, 31305 + - 0x1.fffbecp-1, 31306 + - 0x1.fffc1ep-1, 31307 + - 0x1.fffc4ep-1, 31308 + - 0x1.fffc7ap-1, 31309 + - 0x1.fffca6p-1, 31310 + - 0x1.fffccep-1, 31311 + - 0x1.fffcf6p-1, 31312 + - 0x1.fffd1ap-1, 31313 + - 0x1.fffd3ep-1, 31314 + - 0x1.fffd60p-1, 31315 + - 0x1.fffd80p-1, 31316 + - 0x1.fffda0p-1, 31317 + - 0x1.fffdbep-1, 31318 + - 0x1.fffddap-1, 31319 + - 0x1.fffdf4p-1, 31320 + - 0x1.fffe0ep-1, 31321 + - 0x1.fffe26p-1, 31322 + - 0x1.fffe3ep-1, 31323 + - 0x1.fffe54p-1, 31324 + - 0x1.fffe68p-1, 31325 + 
- 0x1.fffe7ep-1, 31326 + - 0x1.fffe90p-1, 31327 + - 0x1.fffea2p-1, 31328 + - 0x1.fffeb4p-1, 31329 + - 0x1.fffec4p-1, 31330 + - 0x1.fffed4p-1, 31331 + - 0x1.fffee4p-1, 31332 + - 0x1.fffef2p-1, 31333 + - 0x1.ffff00p-1, 31334 + - 0x1.ffff0cp-1, 31335 + - 0x1.ffff18p-1, 31336 + - 0x1.ffff24p-1, 31337 + - 0x1.ffff30p-1, 31338 + - 0x1.ffff3ap-1, 31339 + - 0x1.ffff44p-1, 31340 + - 0x1.ffff4ep-1, 31341 + - 0x1.ffff56p-1, 31342 + - 0x1.ffff60p-1, 31343 + - 0x1.ffff68p-1, 31344 + - 0x1.ffff70p-1, 31345 + - 0x1.ffff78p-1, 31346 + - 0x1.ffff7ep-1, 31347 + - 0x1.ffff84p-1, 31348 + - 0x1.ffff8cp-1, 31349 + - 0x1.ffff92p-1, 31350 + - 0x1.ffff98p-1, 31351 + - 0x1.ffff9cp-1, 31352 + - 0x1.ffffa2p-1, 31353 + - 0x1.ffffa6p-1, 31354 + - 0x1.ffffacp-1, 31355 + - 0x1.ffffb0p-1, 31356 + - 0x1.ffffb4p-1, 31357 + - 0x1.ffffb8p-1, 31358 + - 0x1.ffffbcp-1, 31359 + - 0x1.ffffc0p-1, 31360 + - 0x1.ffffc4p-1, 31361 + - 0x1.ffffc6p-1, 31362 + - 0x1.ffffcap-1, 31363 + - 0x1.ffffccp-1, 31364 + - 0x1.ffffd0p-1, 31365 + - 0x1.ffffd2p-1, 31366 + - 0x1.ffffd4p-1, 31367 + - 0x1.ffffd6p-1, 31368 + - 0x1.ffffd8p-1, 31369 + - 0x1.ffffdcp-1, 31370 + - 0x1.ffffdep-1, 31371 + - 0x1.ffffdep-1, 31372 + - 0x1.ffffe0p-1, 31373 + - 0x1.ffffe2p-1, 31374 + - 0x1.ffffe4p-1, 31375 + - 0x1.ffffe6p-1, 31376 + - 0x1.ffffe8p-1, 31377 + - 0x1.ffffe8p-1, 31378 + - 0x1.ffffeap-1, 31379 + - 0x1.ffffeap-1, 31380 + - 0x1.ffffecp-1, 31381 + - 0x1.ffffeep-1, 31382 + - 0x1.ffffeep-1, 31383 + - 0x1.fffff0p-1, 31384 + - 0x1.fffff0p-1, 31385 + - 0x1.fffff2p-1, 31386 + - 0x1.fffff2p-1, 31387 + - 0x1.fffff2p-1, 31388 + - 0x1.fffff4p-1, 31389 + - 0x1.fffff4p-1, 31390 + - 0x1.fffff4p-1, 31391 + - 0x1.fffff6p-1, 31392 + - 0x1.fffff6p-1, 31393 + - 0x1.fffff6p-1, 31394 + - 0x1.fffff8p-1, 31395 + - 0x1.fffff8p-1, 31396 + - 0x1.fffff8p-1, 31397 + - 0x1.fffff8p-1, 31398 + - 0x1.fffffap-1, 31399 + - 0x1.fffffap-1, 31400 + - 0x1.fffffap-1, 31401 + - 0x1.fffffap-1, 31402 + - 0x1.fffffap-1, 31403 + - 0x1.fffffap-1, 31404 + - 0x1.fffffcp-1, 31405 + 
- 0x1.fffffcp-1, 31406 + - 0x1.fffffcp-1, 31407 + - 0x1.fffffcp-1, 31408 + - 0x1.fffffcp-1, 31409 + - 0x1.fffffcp-1, 31410 + - 0x1.fffffcp-1, 31411 + - 0x1.fffffcp-1, 31412 + - 0x1.fffffep-1, 31413 + - 0x1.fffffep-1, 31414 + - 0x1.fffffep-1, 31415 + - 0x1.fffffep-1, 31416 + - 0x1.fffffep-1, 31417 + - 0x1.fffffep-1, 31418 + - 0x1.fffffep-1, 31419 + - 0x1.fffffep-1, 31420 + - 0x1.fffffep-1, 31421 + - 0x1.fffffep-1, 31422 + - 0x1.fffffep-1, 31423 + - 0x1.fffffep-1, 31424 + - 0x1.fffffep-1, 31425 + - 0x1.fffffep-1, 31426 + - 0x1.fffffep-1, 31427 + - 0x1.fffffep-1, 31428 + - 0x1.fffffep-1, 31429 + - 0x1.fffffep-1, 31430 + - 0x1.000000p+0, 31431 + - 0x1.000000p+0, 31432 + - 0x1.000000p+0, 31433 + - 0x1.000000p+0, 31434 + - 0x1.000000p+0, 31435 + - 0x1.000000p+0, 31436 + - 0x1.000000p+0, 31437 + - 0x1.000000p+0, 31438 + - 0x1.000000p+0, 31439 + - 0x1.000000p+0, 31440 + - 0x1.000000p+0, 31441 + - }, 31442 + - .scale = { 0x1.20dd76p+0, 31443 + - 0x1.20d8f2p+0, 31444 + - 0x1.20cb68p+0, 31445 + - 0x1.20b4d8p+0, 31446 + - 0x1.209546p+0, 31447 + - 0x1.206cb4p+0, 31448 + - 0x1.203b26p+0, 31449 + - 0x1.2000a0p+0, 31450 + - 0x1.1fbd28p+0, 31451 + - 0x1.1f70c4p+0, 31452 + - 0x1.1f1b7ap+0, 31453 + - 0x1.1ebd56p+0, 31454 + - 0x1.1e565cp+0, 31455 + - 0x1.1de698p+0, 31456 + - 0x1.1d6e14p+0, 31457 + - 0x1.1cecdcp+0, 31458 + - 0x1.1c62fap+0, 31459 + - 0x1.1bd07cp+0, 31460 + - 0x1.1b3572p+0, 31461 + - 0x1.1a91e6p+0, 31462 + - 0x1.19e5eap+0, 31463 + - 0x1.19318cp+0, 31464 + - 0x1.1874dep+0, 31465 + - 0x1.17aff0p+0, 31466 + - 0x1.16e2d8p+0, 31467 + - 0x1.160da4p+0, 31468 + - 0x1.153068p+0, 31469 + - 0x1.144b3cp+0, 31470 + - 0x1.135e30p+0, 31471 + - 0x1.12695ep+0, 31472 + - 0x1.116cd8p+0, 31473 + - 0x1.1068bap+0, 31474 + - 0x1.0f5d16p+0, 31475 + - 0x1.0e4a08p+0, 31476 + - 0x1.0d2fa6p+0, 31477 + - 0x1.0c0e0ap+0, 31478 + - 0x1.0ae550p+0, 31479 + - 0x1.09b590p+0, 31480 + - 0x1.087ee4p+0, 31481 + - 0x1.07416cp+0, 31482 + - 0x1.05fd3ep+0, 31483 + - 0x1.04b27cp+0, 31484 + - 0x1.036140p+0, 31485 + 
- 0x1.0209a6p+0, 31486 + - 0x1.00abd0p+0, 31487 + - 0x1.fe8fb0p-1, 31488 + - 0x1.fbbbbep-1, 31489 + - 0x1.f8dc0ap-1, 31490 + - 0x1.f5f0cep-1, 31491 + - 0x1.f2fa4cp-1, 31492 + - 0x1.eff8c4p-1, 31493 + - 0x1.ecec78p-1, 31494 + - 0x1.e9d5a8p-1, 31495 + - 0x1.e6b498p-1, 31496 + - 0x1.e38988p-1, 31497 + - 0x1.e054bep-1, 31498 + - 0x1.dd167cp-1, 31499 + - 0x1.d9cf06p-1, 31500 + - 0x1.d67ea2p-1, 31501 + - 0x1.d32592p-1, 31502 + - 0x1.cfc41ep-1, 31503 + - 0x1.cc5a8ap-1, 31504 + - 0x1.c8e91cp-1, 31505 + - 0x1.c5701ap-1, 31506 + - 0x1.c1efcap-1, 31507 + - 0x1.be6872p-1, 31508 + - 0x1.bada5ap-1, 31509 + - 0x1.b745c6p-1, 31510 + - 0x1.b3aafcp-1, 31511 + - 0x1.b00a46p-1, 31512 + - 0x1.ac63e8p-1, 31513 + - 0x1.a8b828p-1, 31514 + - 0x1.a5074ep-1, 31515 + - 0x1.a1519ep-1, 31516 + - 0x1.9d9762p-1, 31517 + - 0x1.99d8dap-1, 31518 + - 0x1.961650p-1, 31519 + - 0x1.925008p-1, 31520 + - 0x1.8e8646p-1, 31521 + - 0x1.8ab950p-1, 31522 + - 0x1.86e96ap-1, 31523 + - 0x1.8316d6p-1, 31524 + - 0x1.7f41dcp-1, 31525 + - 0x1.7b6abcp-1, 31526 + - 0x1.7791b8p-1, 31527 + - 0x1.73b714p-1, 31528 + - 0x1.6fdb12p-1, 31529 + - 0x1.6bfdf0p-1, 31530 + - 0x1.681ff2p-1, 31531 + - 0x1.644156p-1, 31532 + - 0x1.60625cp-1, 31533 + - 0x1.5c8342p-1, 31534 + - 0x1.58a446p-1, 31535 + - 0x1.54c5a6p-1, 31536 + - 0x1.50e79ep-1, 31537 + - 0x1.4d0a68p-1, 31538 + - 0x1.492e42p-1, 31539 + - 0x1.455366p-1, 31540 + - 0x1.417a0cp-1, 31541 + - 0x1.3da26ep-1, 31542 + - 0x1.39ccc2p-1, 31543 + - 0x1.35f940p-1, 31544 + - 0x1.32281ep-1, 31545 + - 0x1.2e5992p-1, 31546 + - 0x1.2a8dcep-1, 31547 + - 0x1.26c508p-1, 31548 + - 0x1.22ff72p-1, 31549 + - 0x1.1f3d3cp-1, 31550 + - 0x1.1b7e98p-1, 31551 + - 0x1.17c3b6p-1, 31552 + - 0x1.140cc4p-1, 31553 + - 0x1.1059eep-1, 31554 + - 0x1.0cab62p-1, 31555 + - 0x1.09014cp-1, 31556 + - 0x1.055bd6p-1, 31557 + - 0x1.01bb2cp-1, 31558 + - 0x1.fc3ee6p-2, 31559 + - 0x1.f511aap-2, 31560 + - 0x1.edeeeep-2, 31561 + - 0x1.e6d700p-2, 31562 + - 0x1.dfca26p-2, 31563 + - 0x1.d8c8aap-2, 31564 + - 0x1.d1d2d0p-2, 31565 + 
- 0x1.cae8dap-2, 31566 + - 0x1.c40b08p-2, 31567 + - 0x1.bd3998p-2, 31568 + - 0x1.b674c8p-2, 31569 + - 0x1.afbcd4p-2, 31570 + - 0x1.a911f0p-2, 31571 + - 0x1.a27456p-2, 31572 + - 0x1.9be438p-2, 31573 + - 0x1.9561c8p-2, 31574 + - 0x1.8eed36p-2, 31575 + - 0x1.8886b2p-2, 31576 + - 0x1.822e66p-2, 31577 + - 0x1.7be47ap-2, 31578 + - 0x1.75a91ap-2, 31579 + - 0x1.6f7c6ap-2, 31580 + - 0x1.695e8cp-2, 31581 + - 0x1.634fa6p-2, 31582 + - 0x1.5d4fd4p-2, 31583 + - 0x1.575f34p-2, 31584 + - 0x1.517de6p-2, 31585 + - 0x1.4bac00p-2, 31586 + - 0x1.45e99cp-2, 31587 + - 0x1.4036d0p-2, 31588 + - 0x1.3a93b2p-2, 31589 + - 0x1.350052p-2, 31590 + - 0x1.2f7cc4p-2, 31591 + - 0x1.2a0916p-2, 31592 + - 0x1.24a554p-2, 31593 + - 0x1.1f518ap-2, 31594 + - 0x1.1a0dc6p-2, 31595 + - 0x1.14da0ap-2, 31596 + - 0x1.0fb662p-2, 31597 + - 0x1.0aa2d0p-2, 31598 + - 0x1.059f5ap-2, 31599 + - 0x1.00ac00p-2, 31600 + - 0x1.f79184p-3, 31601 + - 0x1.edeb40p-3, 31602 + - 0x1.e46530p-3, 31603 + - 0x1.daff4ap-3, 31604 + - 0x1.d1b982p-3, 31605 + - 0x1.c893cep-3, 31606 + - 0x1.bf8e1cp-3, 31607 + - 0x1.b6a856p-3, 31608 + - 0x1.ade26cp-3, 31609 + - 0x1.a53c42p-3, 31610 + - 0x1.9cb5bep-3, 31611 + - 0x1.944ec2p-3, 31612 + - 0x1.8c0732p-3, 31613 + - 0x1.83deeap-3, 31614 + - 0x1.7bd5c8p-3, 31615 + - 0x1.73eba4p-3, 31616 + - 0x1.6c2056p-3, 31617 + - 0x1.6473b6p-3, 31618 + - 0x1.5ce596p-3, 31619 + - 0x1.5575c8p-3, 31620 + - 0x1.4e241ep-3, 31621 + - 0x1.46f066p-3, 31622 + - 0x1.3fda6cp-3, 31623 + - 0x1.38e1fap-3, 31624 + - 0x1.3206dcp-3, 31625 + - 0x1.2b48dap-3, 31626 + - 0x1.24a7b8p-3, 31627 + - 0x1.1e233ep-3, 31628 + - 0x1.17bb2cp-3, 31629 + - 0x1.116f48p-3, 31630 + - 0x1.0b3f52p-3, 31631 + - 0x1.052b0cp-3, 31632 + - 0x1.fe6460p-4, 31633 + - 0x1.f2a902p-4, 31634 + - 0x1.e72372p-4, 31635 + - 0x1.dbd32ap-4, 31636 + - 0x1.d0b7a0p-4, 31637 + - 0x1.c5d04ap-4, 31638 + - 0x1.bb1c98p-4, 31639 + - 0x1.b09bfcp-4, 31640 + - 0x1.a64de6p-4, 31641 + - 0x1.9c31c6p-4, 31642 + - 0x1.92470ap-4, 31643 + - 0x1.888d1ep-4, 31644 + - 0x1.7f036cp-4, 31645 + 
- 0x1.75a960p-4, 31646 + - 0x1.6c7e64p-4, 31647 + - 0x1.6381e2p-4, 31648 + - 0x1.5ab342p-4, 31649 + - 0x1.5211ecp-4, 31650 + - 0x1.499d48p-4, 31651 + - 0x1.4154bcp-4, 31652 + - 0x1.3937b2p-4, 31653 + - 0x1.31458ep-4, 31654 + - 0x1.297dbap-4, 31655 + - 0x1.21df9ap-4, 31656 + - 0x1.1a6a96p-4, 31657 + - 0x1.131e14p-4, 31658 + - 0x1.0bf97ep-4, 31659 + - 0x1.04fc3ap-4, 31660 + - 0x1.fc4b5ep-5, 31661 + - 0x1.eeea8cp-5, 31662 + - 0x1.e1d4d0p-5, 31663 + - 0x1.d508fap-5, 31664 + - 0x1.c885e0p-5, 31665 + - 0x1.bc4a54p-5, 31666 + - 0x1.b05530p-5, 31667 + - 0x1.a4a54ap-5, 31668 + - 0x1.99397ap-5, 31669 + - 0x1.8e109cp-5, 31670 + - 0x1.83298ep-5, 31671 + - 0x1.78832cp-5, 31672 + - 0x1.6e1c58p-5, 31673 + - 0x1.63f3f6p-5, 31674 + - 0x1.5a08e8p-5, 31675 + - 0x1.505a18p-5, 31676 + - 0x1.46e66cp-5, 31677 + - 0x1.3dacd2p-5, 31678 + - 0x1.34ac36p-5, 31679 + - 0x1.2be38cp-5, 31680 + - 0x1.2351c2p-5, 31681 + - 0x1.1af5d2p-5, 31682 + - 0x1.12ceb4p-5, 31683 + - 0x1.0adb60p-5, 31684 + - 0x1.031ad6p-5, 31685 + - 0x1.f7182ap-6, 31686 + - 0x1.e85c44p-6, 31687 + - 0x1.da0006p-6, 31688 + - 0x1.cc0180p-6, 31689 + - 0x1.be5ecep-6, 31690 + - 0x1.b1160ap-6, 31691 + - 0x1.a4255ap-6, 31692 + - 0x1.978ae8p-6, 31693 + - 0x1.8b44e6p-6, 31694 + - 0x1.7f5188p-6, 31695 + - 0x1.73af0cp-6, 31696 + - 0x1.685bb6p-6, 31697 + - 0x1.5d55ccp-6, 31698 + - 0x1.529b9ep-6, 31699 + - 0x1.482b84p-6, 31700 + - 0x1.3e03d8p-6, 31701 + - 0x1.3422fep-6, 31702 + - 0x1.2a875cp-6, 31703 + - 0x1.212f62p-6, 31704 + - 0x1.181984p-6, 31705 + - 0x1.0f443ep-6, 31706 + - 0x1.06ae14p-6, 31707 + - 0x1.fcab14p-7, 31708 + - 0x1.ec7262p-7, 31709 + - 0x1.dcaf36p-7, 31710 + - 0x1.cd5ecap-7, 31711 + - 0x1.be7e5ap-7, 31712 + - 0x1.b00b38p-7, 31713 + - 0x1.a202bep-7, 31714 + - 0x1.94624ep-7, 31715 + - 0x1.87275ep-7, 31716 + - 0x1.7a4f6ap-7, 31717 + - 0x1.6dd7fep-7, 31718 + - 0x1.61beaep-7, 31719 + - 0x1.56011cp-7, 31720 + - 0x1.4a9cf6p-7, 31721 + - 0x1.3f8ff6p-7, 31722 + - 0x1.34d7dcp-7, 31723 + - 0x1.2a727ap-7, 31724 + - 0x1.205dacp-7, 31725 + 
- 0x1.169756p-7, 31726 + - 0x1.0d1d6ap-7, 31727 + - 0x1.03ede2p-7, 31728 + - 0x1.f60d8ap-8, 31729 + - 0x1.e4cc4ap-8, 31730 + - 0x1.d4143ap-8, 31731 + - 0x1.c3e1a6p-8, 31732 + - 0x1.b430ecp-8, 31733 + - 0x1.a4fe84p-8, 31734 + - 0x1.9646f4p-8, 31735 + - 0x1.8806d8p-8, 31736 + - 0x1.7a3adep-8, 31737 + - 0x1.6cdfccp-8, 31738 + - 0x1.5ff276p-8, 31739 + - 0x1.536fc2p-8, 31740 + - 0x1.4754acp-8, 31741 + - 0x1.3b9e40p-8, 31742 + - 0x1.30499cp-8, 31743 + - 0x1.2553eep-8, 31744 + - 0x1.1aba78p-8, 31745 + - 0x1.107a8cp-8, 31746 + - 0x1.06918cp-8, 31747 + - 0x1.f9f9d0p-9, 31748 + - 0x1.e77448p-9, 31749 + - 0x1.d58da6p-9, 31750 + - 0x1.c4412cp-9, 31751 + - 0x1.b38a3ap-9, 31752 + - 0x1.a36454p-9, 31753 + - 0x1.93cb12p-9, 31754 + - 0x1.84ba30p-9, 31755 + - 0x1.762d84p-9, 31756 + - 0x1.682100p-9, 31757 + - 0x1.5a90b0p-9, 31758 + - 0x1.4d78bcp-9, 31759 + - 0x1.40d564p-9, 31760 + - 0x1.34a306p-9, 31761 + - 0x1.28de12p-9, 31762 + - 0x1.1d8318p-9, 31763 + - 0x1.128ebap-9, 31764 + - 0x1.07fdb4p-9, 31765 + - 0x1.fb99b8p-10, 31766 + - 0x1.e7f232p-10, 31767 + - 0x1.d4fed8p-10, 31768 + - 0x1.c2b9d0p-10, 31769 + - 0x1.b11d70p-10, 31770 + - 0x1.a02436p-10, 31771 + - 0x1.8fc8c8p-10, 31772 + - 0x1.8005f0p-10, 31773 + - 0x1.70d6a4p-10, 31774 + - 0x1.6235fcp-10, 31775 + - 0x1.541f34p-10, 31776 + - 0x1.468daep-10, 31777 + - 0x1.397ceep-10, 31778 + - 0x1.2ce898p-10, 31779 + - 0x1.20cc76p-10, 31780 + - 0x1.15246ep-10, 31781 + - 0x1.09ec86p-10, 31782 + - 0x1.fe41cep-11, 31783 + - 0x1.e97ba4p-11, 31784 + - 0x1.d57f52p-11, 31785 + - 0x1.c245d4p-11, 31786 + - 0x1.afc85ep-11, 31787 + - 0x1.9e0058p-11, 31788 + - 0x1.8ce75ep-11, 31789 + - 0x1.7c7744p-11, 31790 + - 0x1.6caa0ep-11, 31791 + - 0x1.5d79ecp-11, 31792 + - 0x1.4ee142p-11, 31793 + - 0x1.40daa4p-11, 31794 + - 0x1.3360ccp-11, 31795 + - 0x1.266ea8p-11, 31796 + - 0x1.19ff46p-11, 31797 + - 0x1.0e0de8p-11, 31798 + - 0x1.0295f0p-11, 31799 + - 0x1.ef25d4p-12, 31800 + - 0x1.da0110p-12, 31801 + - 0x1.c5b542p-12, 31802 + - 0x1.b23a5ap-12, 31803 + - 
0x1.9f8894p-12, 31804 + - 0x1.8d986ap-12, 31805 + - 0x1.7c629ap-12, 31806 + - 0x1.6be022p-12, 31807 + - 0x1.5c0a38p-12, 31808 + - 0x1.4cda54p-12, 31809 + - 0x1.3e4a24p-12, 31810 + - 0x1.305390p-12, 31811 + - 0x1.22f0b4p-12, 31812 + - 0x1.161be4p-12, 31813 + - 0x1.09cfa4p-12, 31814 + - 0x1.fc0d56p-13, 31815 + - 0x1.e577bcp-13, 31816 + - 0x1.cfd4a6p-13, 31817 + - 0x1.bb1a96p-13, 31818 + - 0x1.a74068p-13, 31819 + - 0x1.943d4ap-13, 31820 + - 0x1.8208bcp-13, 31821 + - 0x1.709a8ep-13, 31822 + - 0x1.5feadap-13, 31823 + - 0x1.4ff208p-13, 31824 + - 0x1.40a8c2p-13, 31825 + - 0x1.3207fcp-13, 31826 + - 0x1.2408eap-13, 31827 + - 0x1.16a502p-13, 31828 + - 0x1.09d5f8p-13, 31829 + - 0x1.fb2b7ap-14, 31830 + - 0x1.e3bcf4p-14, 31831 + - 0x1.cd5528p-14, 31832 + - 0x1.b7e946p-14, 31833 + - 0x1.a36eecp-14, 31834 + - 0x1.8fdc1cp-14, 31835 + - 0x1.7d2738p-14, 31836 + - 0x1.6b4702p-14, 31837 + - 0x1.5a329cp-14, 31838 + - 0x1.49e178p-14, 31839 + - 0x1.3a4b60p-14, 31840 + - 0x1.2b6876p-14, 31841 + - 0x1.1d3120p-14, 31842 + - 0x1.0f9e1cp-14, 31843 + - 0x1.02a868p-14, 31844 + - 0x1.ec929ap-15, 31845 + - 0x1.d4f4b4p-15, 31846 + - 0x1.be6abcp-15, 31847 + - 0x1.a8e8ccp-15, 31848 + - 0x1.94637ep-15, 31849 + - 0x1.80cfdcp-15, 31850 + - 0x1.6e2368p-15, 31851 + - 0x1.5c540cp-15, 31852 + - 0x1.4b581cp-15, 31853 + - 0x1.3b2652p-15, 31854 + - 0x1.2bb5ccp-15, 31855 + - 0x1.1cfe02p-15, 31856 + - 0x1.0ef6c4p-15, 31857 + - 0x1.019842p-15, 31858 + - 0x1.e9b5e8p-16, 31859 + - 0x1.d16f58p-16, 31860 + - 0x1.ba4f04p-16, 31861 + - 0x1.a447b8p-16, 31862 + - 0x1.8f4cccp-16, 31863 + - 0x1.7b5224p-16, 31864 + - 0x1.684c22p-16, 31865 + - 0x1.562facp-16, 31866 + - 0x1.44f21ep-16, 31867 + - 0x1.34894ap-16, 31868 + - 0x1.24eb72p-16, 31869 + - 0x1.160f44p-16, 31870 + - 0x1.07ebd2p-16, 31871 + - 0x1.f4f12ep-17, 31872 + - 0x1.db5ad0p-17, 31873 + - 0x1.c304f0p-17, 31874 + - 0x1.abe09ep-17, 31875 + - 0x1.95df98p-17, 31876 + - 0x1.80f43ap-17, 31877 + - 0x1.6d1178p-17, 31878 + - 0x1.5a2ae0p-17, 31879 + - 0x1.483488p-17, 31880 + 
- 0x1.372310p-17, 31881 + - 0x1.26eb9ep-17, 31882 + - 0x1.1783cep-17, 31883 + - 0x1.08e1bap-17, 31884 + - 0x1.f5f7d8p-18, 31885 + - 0x1.db92b6p-18, 31886 + - 0x1.c282cep-18, 31887 + - 0x1.aab7acp-18, 31888 + - 0x1.94219cp-18, 31889 + - 0x1.7eb1a2p-18, 31890 + - 0x1.6a5972p-18, 31891 + - 0x1.570b6ap-18, 31892 + - 0x1.44ba86p-18, 31893 + - 0x1.335a62p-18, 31894 + - 0x1.22df2ap-18, 31895 + - 0x1.133d96p-18, 31896 + - 0x1.046aeap-18, 31897 + - 0x1.ecb9d0p-19, 31898 + - 0x1.d21398p-19, 31899 + - 0x1.b8d094p-19, 31900 + - 0x1.a0df10p-19, 31901 + - 0x1.8a2e26p-19, 31902 + - 0x1.74adc8p-19, 31903 + - 0x1.604ea8p-19, 31904 + - 0x1.4d0232p-19, 31905 + - 0x1.3aba86p-19, 31906 + - 0x1.296a70p-19, 31907 + - 0x1.190562p-19, 31908 + - 0x1.097f62p-19, 31909 + - 0x1.f59a20p-20, 31910 + - 0x1.d9c736p-20, 31911 + - 0x1.bf716cp-20, 31912 + - 0x1.a6852cp-20, 31913 + - 0x1.8eefd8p-20, 31914 + - 0x1.789fb8p-20, 31915 + - 0x1.6383f8p-20, 31916 + - 0x1.4f8c96p-20, 31917 + - 0x1.3caa62p-20, 31918 + - 0x1.2acee2p-20, 31919 + - 0x1.19ec60p-20, 31920 + - 0x1.09f5d0p-20, 31921 + - 0x1.f5bd96p-21, 31922 + - 0x1.d9371ep-21, 31923 + - 0x1.be41dep-21, 31924 + - 0x1.a4c89ep-21, 31925 + - 0x1.8cb738p-21, 31926 + - 0x1.75fa8ep-21, 31927 + - 0x1.608078p-21, 31928 + - 0x1.4c37c0p-21, 31929 + - 0x1.39100ep-21, 31930 + - 0x1.26f9e0p-21, 31931 + - 0x1.15e682p-21, 31932 + - 0x1.05c804p-21, 31933 + - 0x1.ed2254p-22, 31934 + - 0x1.d06ad6p-22, 31935 + - 0x1.b551c8p-22, 31936 + - 0x1.9bc0a0p-22, 31937 + - 0x1.83a200p-22, 31938 + - 0x1.6ce1aap-22, 31939 + - 0x1.576c72p-22, 31940 + - 0x1.43302cp-22, 31941 + - 0x1.301ba2p-22, 31942 + - 0x1.1e1e86p-22, 31943 + - 0x1.0d2966p-22, 31944 + - 0x1.fa5b50p-23, 31945 + - 0x1.dc3ae4p-23, 31946 + - 0x1.bfd756p-23, 31947 + - 0x1.a517dap-23, 31948 + - 0x1.8be4f8p-23, 31949 + - 0x1.74287ep-23, 31950 + - 0x1.5dcd66p-23, 31951 + - 0x1.48bfd4p-23, 31952 + - 0x1.34ecf8p-23, 31953 + - 0x1.224310p-23, 31954 + - 0x1.10b148p-23, 31955 + - }, 31956 + -}; 31957 + diff --git 
a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h 31958 + index 7f0a8aa5f2..862eefaf8f 100644 31959 + --- a/sysdeps/aarch64/fpu/vecmath_config.h 31960 + +++ b/sysdeps/aarch64/fpu/vecmath_config.h 31961 + @@ -75,49 +75,37 @@ extern const struct v_log10_data 31962 + } table[1 << V_LOG10_TABLE_BITS]; 31963 + } __v_log10_data attribute_hidden; 31964 + 31965 + -extern const struct erff_data 31966 + +extern const struct v_erff_data 31967 + { 31968 + struct 31969 + { 31970 + float erf, scale; 31971 + } tab[513]; 31972 + -} __erff_data attribute_hidden; 31973 + +} __v_erff_data attribute_hidden; 31974 + 31975 + -extern const struct sv_erff_data 31976 + -{ 31977 + - float erf[513]; 31978 + - float scale[513]; 31979 + -} __sv_erff_data attribute_hidden; 31980 + - 31981 + -extern const struct erf_data 31982 + +extern const struct v_erf_data 31983 + { 31984 + struct 31985 + { 31986 + double erf, scale; 31987 + } tab[769]; 31988 + -} __erf_data attribute_hidden; 31989 + - 31990 + -extern const struct sv_erf_data 31991 + -{ 31992 + - double erf[769]; 31993 + - double scale[769]; 31994 + -} __sv_erf_data attribute_hidden; 31995 + +} __v_erf_data attribute_hidden; 31996 + 31997 + -extern const struct erfc_data 31998 + +extern const struct v_erfc_data 31999 + { 32000 + struct 32001 + { 32002 + double erfc, scale; 32003 + } tab[3488]; 32004 + -} __erfc_data attribute_hidden; 32005 + +} __v_erfc_data attribute_hidden; 32006 + 32007 + -extern const struct erfcf_data 32008 + +extern const struct v_erfcf_data 32009 + { 32010 + struct 32011 + { 32012 + float erfc, scale; 32013 + } tab[645]; 32014 + -} __erfcf_data attribute_hidden; 32015 + +} __v_erfcf_data attribute_hidden; 32016 + 32017 + /* Some data for AdvSIMD and SVE pow's internal exp and log. 
*/ 32018 + #define V_POW_EXP_TABLE_BITS 8 32019 + 32020 + commit 4148940836eee07d1138da6f1805280eeb8217e3 32021 + Author: Pierre Blanchard <pierre.blanchard@arm.com> 32022 + Date: Mon Dec 9 15:53:04 2024 +0000 32023 + 32024 + AArch64: Improve codegen in AdvSIMD pow 32025 + 32026 + Remove spurious ADRP. Improve memory access by shuffling constants and 32027 + using more indexed MLAs. 32028 + 32029 + A few more optimisation with no impact on accuracy 32030 + - force fmas contraction 32031 + - switch from shift-aided rint to rint instruction 32032 + 32033 + Between 1 and 5% throughput improvement on Neoverse 32034 + V1 depending on benchmark. 32035 + 32036 + (cherry picked from commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea) 32037 + 32038 + diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c 32039 + index 3c91e3e183..81e134ac2f 100644 32040 + --- a/sysdeps/aarch64/fpu/pow_advsimd.c 32041 + +++ b/sysdeps/aarch64/fpu/pow_advsimd.c 32042 + @@ -22,9 +22,6 @@ 32043 + /* Defines parameters of the approximation and scalar fallback. 
*/ 32044 + #include "finite_pow.h" 32045 + 32046 + -#define VecSmallExp v_u64 (SmallExp) 32047 + -#define VecThresExp v_u64 (ThresExp) 32048 + - 32049 + #define VecSmallPowX v_u64 (SmallPowX) 32050 + #define VecThresPowX v_u64 (ThresPowX) 32051 + #define VecSmallPowY v_u64 (SmallPowY) 32052 + @@ -32,36 +29,48 @@ 32053 + 32054 + static const struct data 32055 + { 32056 + - float64x2_t log_poly[6]; 32057 + - float64x2_t exp_poly[3]; 32058 + - float64x2_t ln2_hi, ln2_lo; 32059 + - float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx; 32060 + uint64x2_t inf; 32061 + + float64x2_t small_powx; 32062 + + uint64x2_t offset, mask; 32063 + + uint64x2_t mask_sub_0, mask_sub_1; 32064 + + float64x2_t log_c0, log_c2, log_c4, log_c5; 32065 + + double log_c1, log_c3; 32066 + + double ln2_lo, ln2_hi; 32067 + + uint64x2_t small_exp, thres_exp; 32068 + + double ln2_lo_n, ln2_hi_n; 32069 + + double inv_ln2_n, exp_c2; 32070 + + float64x2_t exp_c0, exp_c1; 32071 + } data = { 32072 + + /* Power threshold. */ 32073 + + .inf = V2 (0x7ff0000000000000), 32074 + + .small_powx = V2 (0x1p-126), 32075 + + .offset = V2 (Off), 32076 + + .mask = V2 (0xfffULL << 52), 32077 + + .mask_sub_0 = V2 (1ULL << 52), 32078 + + .mask_sub_1 = V2 (52ULL << 52), 32079 + /* Coefficients copied from v_pow_log_data.c 32080 + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] 32081 + Coefficients are scaled to match the scaling during evaluation. 
*/ 32082 + - .log_poly 32083 + - = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2), 32084 + - V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4), 32085 + - V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) }, 32086 + - .ln2_hi = V2 (0x1.62e42fefa3800p-1), 32087 + - .ln2_lo = V2 (0x1.ef35793c76730p-45), 32088 + + .log_c0 = V2 (0x1.555555555556p-2 * -2), 32089 + + .log_c1 = -0x1.0000000000006p-2 * -2, 32090 + + .log_c2 = V2 (0x1.999999959554ep-3 * 4), 32091 + + .log_c3 = -0x1.555555529a47ap-3 * 4, 32092 + + .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), 32093 + + .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), 32094 + + .ln2_hi = 0x1.62e42fefa3800p-1, 32095 + + .ln2_lo = 0x1.ef35793c76730p-45, 32096 + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 32097 + (0.550 without fma) if |x| < ln2/512. */ 32098 + - .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), 32099 + - V2 (0x1.5555576a5adcep-5) }, 32100 + - .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ 32101 + - .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ 32102 + - .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ 32103 + - .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), 32104 + - .small_powx = V2 (0x1p-126), 32105 + - .inf = V2 (0x7ff0000000000000) 32106 + + .exp_c0 = V2 (0x1.fffffffffffd4p-2), 32107 + + .exp_c1 = V2 (0x1.5555571d6ef9p-3), 32108 + + .exp_c2 = 0x1.5555576a5adcep-5, 32109 + + .small_exp = V2 (0x3c90000000000000), 32110 + + .thres_exp = V2 (0x03f0000000000000), 32111 + + .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ 32112 + + .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. 
*/ 32113 + + .ln2_lo_n = -0x1.c610ca86c3899p-45, 32114 + }; 32115 + 32116 + -#define A(i) data.log_poly[i] 32117 + -#define C(i) data.exp_poly[i] 32118 + - 32119 + /* This version implements an algorithm close to scalar pow but 32120 + - does not implement the trick in the exp's specialcase subroutine to avoid 32121 + double-rounding, 32122 + @@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) 32123 + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. 32124 + The range is split into N subintervals. 32125 + The ith subinterval contains z and c is near its center. */ 32126 + - uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); 32127 + - int64x2_t k 32128 + - = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ 32129 + - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); 32130 + + uint64x2_t tmp = vsubq_u64 (ix, d->offset); 32131 + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); 32132 + + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); 32133 + float64x2_t z = vreinterpretq_f64_u64 (iz); 32134 + float64x2_t kd = vcvtq_f64_s64 (k); 32135 + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ 32136 + @@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) 32137 + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ 32138 + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); 32139 + /* k*Ln2 + log(c) + r. */ 32140 + - float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); 32141 + + float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); 32142 + + float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); 32143 + float64x2_t t2 = vaddq_f64 (t1, r); 32144 + - float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); 32145 + + float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); 32146 + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); 32147 + /* Evaluation is optimized assuming superscalar pipelined execution. 
*/ 32148 + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); 32149 + @@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) 32150 + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); 32151 + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); 32152 + /* p = log1p(r) - r - A[0]*r*r. */ 32153 + - float64x2_t a56 = vfmaq_f64 (A (4), r, A (5)); 32154 + - float64x2_t a34 = vfmaq_f64 (A (2), r, A (3)); 32155 + - float64x2_t a12 = vfmaq_f64 (A (0), r, A (1)); 32156 + + float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); 32157 + + float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); 32158 + + float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); 32159 + + float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); 32160 + float64x2_t p = vfmaq_f64 (a34, ar2, a56); 32161 + p = vfmaq_f64 (a12, ar2, p); 32162 + p = vmulq_f64 (ar3, p); 32163 + @@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail) 32164 + 32165 + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ 32166 + static inline float64x2_t 32167 + -v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) 32168 + +v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) 32169 + { 32170 + /* Fallback to scalar exp_inline for all lanes if any lane 32171 + contains value of x s.t. |x| <= 2^-54 or >= 512. */ 32172 + - uint64x2_t abstop 32173 + - = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52); 32174 + - uint64x2_t uoflowx 32175 + - = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); 32176 + + uint64x2_t uoflowx = vcgeq_u64 ( 32177 + + vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), 32178 + + d->thres_exp); 32179 + if (__glibc_unlikely (v_any_u64 (uoflowx))) 32180 + - return exp_special_case (x, xtail); 32181 + + return exp_special_case (x, vnegq_f64 (neg_xtail)); 32182 + 32183 + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. 
*/ 32184 + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ 32185 + - float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); 32186 + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ 32187 + - float64x2_t kd = vaddq_f64 (z, d->shift); 32188 + - uint64x2_t ki = vreinterpretq_u64_f64 (kd); 32189 + - kd = vsubq_f64 (kd, d->shift); 32190 + - float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); 32191 + - r = vfmsq_f64 (r, kd, d->ln2_lo_n); 32192 + + float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); 32193 + + float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); 32194 + + float64x2_t kd = vrndnq_f64 (z); 32195 + + uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); 32196 + + float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); 32197 + + float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); 32198 + + r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); 32199 + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ 32200 + - r = vaddq_f64 (r, xtail); 32201 + + r = vsubq_f64 (r, neg_xtail); 32202 + /* 2^(k/N) ~= scale. */ 32203 + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); 32204 + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); 32205 + @@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) 32206 + sbits = vaddq_u64 (sbits, top); 32207 + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ 32208 + float64x2_t r2 = vmulq_f64 (r, r); 32209 + - float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); 32210 + - tmp = vfmaq_f64 (C (0), r, tmp); 32211 + + float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); 32212 + + tmp = vfmaq_f64 (d->exp_c0, r, tmp); 32213 + tmp = vfmaq_f64 (r, r2, tmp); 32214 + float64x2_t scale = vreinterpretq_f64_u64 (sbits); 32215 + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there 32216 + @@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) 32217 + { 32218 + /* Normalize subnormal x so exponent becomes negative. 
*/ 32219 + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( 32220 + - vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52))))); 32221 + - vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); 32222 + + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); 32223 + + vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); 32224 + vix = vbslq_u64 (sub_x, vix_norm, vix); 32225 + } 32226 + } 32227 + @@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) 32228 + 32229 + /* Vector Exp(y_loghi, y_loglo). */ 32230 + float64x2_t vehi = vmulq_f64 (y, vhi); 32231 + - float64x2_t velo = vmulq_f64 (y, vlo); 32232 + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); 32233 + - velo = vsubq_f64 (velo, vemi); 32234 + - return v_exp_inline (vehi, velo, d); 32235 + + float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); 32236 + + return v_exp_inline (vehi, neg_velo, d); 32237 + } 32238 + 32239 + commit ae04f63087415eba9060143608b03db693854bb7 32240 + Author: Pierre Blanchard <pierre.blanchard@arm.com> 32241 + Date: Mon Dec 9 15:54:34 2024 +0000 32242 + 32243 + AArch64: Improve codegen in AdvSIMD logs 32244 + 32245 + Remove spurious ADRP and a few MOVs. 32246 + Reduce memory access by using more indexed MLAs in polynomial. 32247 + Align notation so that algorithms are easier to compare. 32248 + Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%). 32249 + Update error threshold in AdvSIMD log (now matches SVE log). 32250 + 32251 + (cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7) 32252 + 32253 + diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c 32254 + index c065aaebae..f69ed21c39 100644 32255 + --- a/sysdeps/aarch64/fpu/log10_advsimd.c 32256 + +++ b/sysdeps/aarch64/fpu/log10_advsimd.c 32257 + @@ -18,36 +18,36 @@ 32258 + <https://www.gnu.org/licenses/>. 
*/ 32259 + 32260 + #include "v_math.h" 32261 + -#include "poly_advsimd_f64.h" 32262 + - 32263 + -#define N (1 << V_LOG10_TABLE_BITS) 32264 + 32265 + static const struct data 32266 + { 32267 + - uint64x2_t min_norm; 32268 + + uint64x2_t off, sign_exp_mask, offset_lower_bound; 32269 + uint32x4_t special_bound; 32270 + - float64x2_t poly[5]; 32271 + - float64x2_t invln10, log10_2, ln2; 32272 + - uint64x2_t sign_exp_mask; 32273 + + double invln10, log10_2; 32274 + + double c1, c3; 32275 + + float64x2_t c0, c2, c4; 32276 + } data = { 32277 + /* Computed from log coefficients divided by log(10) then rounded to double 32278 + precision. */ 32279 + - .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), 32280 + - V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), 32281 + - V2 (-0x1.287461742fee4p-4) }, 32282 + - .ln2 = V2 (0x1.62e42fefa39efp-1), 32283 + - .invln10 = V2 (0x1.bcb7b1526e50ep-2), 32284 + - .log10_2 = V2 (0x1.34413509f79ffp-2), 32285 + - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ 32286 + - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ 32287 + + .c0 = V2 (-0x1.bcb7b1526e506p-3), 32288 + + .c1 = 0x1.287a7636be1d1p-3, 32289 + + .c2 = V2 (-0x1.bcb7b158af938p-4), 32290 + + .c3 = 0x1.63c78734e6d07p-4, 32291 + + .c4 = V2 (-0x1.287461742fee4p-4), 32292 + + .invln10 = 0x1.bcb7b1526e50ep-2, 32293 + + .log10_2 = 0x1.34413509f79ffp-2, 32294 + + .off = V2 (0x3fe6900900000000), 32295 + .sign_exp_mask = V2 (0xfff0000000000000), 32296 + + /* Lower bound is 0x0010000000000000. For 32297 + + optimised register use subnormals are detected after offset has been 32298 + + subtracted, so lower bound - offset (which wraps around). */ 32299 + + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), 32300 + + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. 
*/ 32301 + }; 32302 + 32303 + -#define Off v_u64 (0x3fe6900900000000) 32304 + +#define N (1 << V_LOG10_TABLE_BITS) 32305 + #define IndexMask (N - 1) 32306 + 32307 + -#define T(s, i) __v_log10_data.s[i] 32308 + - 32309 + struct entry 32310 + { 32311 + float64x2_t invc; 32312 + @@ -70,10 +70,11 @@ lookup (uint64x2_t i) 32313 + } 32314 + 32315 + static float64x2_t VPCS_ATTR NOINLINE 32316 + -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, 32317 + - uint32x2_t special) 32318 + +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, 32319 + + uint32x2_t special, const struct data *d) 32320 + { 32321 + - return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); 32322 + + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); 32323 + + return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); 32324 + } 32325 + 32326 + /* Fast implementation of double-precision vector log10 32327 + @@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, 32328 + float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) 32329 + { 32330 + const struct data *d = ptr_barrier (&data); 32331 + - uint64x2_t ix = vreinterpretq_u64_f64 (x); 32332 + - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), 32333 + - vget_low_u32 (d->special_bound)); 32334 + + 32335 + + /* To avoid having to mov x out of the way, keep u after offset has been 32336 + + applied, and recover x by adding the offset back in the special-case 32337 + + handler. */ 32338 + + uint64x2_t u = vreinterpretq_u64_f64 (x); 32339 + + uint64x2_t u_off = vsubq_u64 (u, d->off); 32340 + 32341 + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. 32342 + The range is split into N subintervals. 32343 + The ith subinterval contains z and c is near its center. 
*/ 32344 + - uint64x2_t tmp = vsubq_u64 (ix, Off); 32345 + - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); 32346 + - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); 32347 + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); 32348 + + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); 32349 + float64x2_t z = vreinterpretq_f64_u64 (iz); 32350 + 32351 + - struct entry e = lookup (tmp); 32352 + + struct entry e = lookup (u_off); 32353 + + 32354 + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), 32355 + + vget_low_u32 (d->special_bound)); 32356 + 32357 + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ 32358 + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); 32359 + @@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) 32360 + 32361 + /* hi = r / log(10) + log10(c) + k*log10(2). 32362 + Constants in v_log10_data.c are computed (in extended precision) as 32363 + - e.log10c := e.logc * ivln10. */ 32364 + - float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); 32365 + + e.log10c := e.logc * invln10. */ 32366 + + float64x2_t cte = vld1q_f64 (&d->invln10); 32367 + + float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); 32368 + 32369 + /* y = log10(1+r) + n * log10(2). */ 32370 + - float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); 32371 + + hi = vfmaq_laneq_f64 (hi, kd, cte, 1); 32372 + 32373 + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ 32374 + float64x2_t r2 = vmulq_f64 (r, r); 32375 + - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); 32376 + + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); 32377 + + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); 32378 + + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); 32379 + + y = vfmaq_f64 (y, d->c4, r2); 32380 + + y = vfmaq_f64 (p, y, r2); 32381 + 32382 + if (__glibc_unlikely (v_any_u32h (special))) 32383 + - return special_case (x, y, hi, r2, special); 32384 + - return vfmaq_f64 (hi, r2, y); 32385 + + return special_case (hi, u_off, y, r2, special, d); 32386 + + return vfmaq_f64 (hi, y, r2); 32387 + } 32388 + diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c 32389 + index 4057c552d8..1eea1f86eb 100644 32390 + --- a/sysdeps/aarch64/fpu/log2_advsimd.c 32391 + +++ b/sysdeps/aarch64/fpu/log2_advsimd.c 32392 + @@ -18,31 +18,33 @@ 32393 + <https://www.gnu.org/licenses/>. */ 32394 + 32395 + #include "v_math.h" 32396 + -#include "poly_advsimd_f64.h" 32397 + - 32398 + -#define N (1 << V_LOG2_TABLE_BITS) 32399 + 32400 + static const struct data 32401 + { 32402 + - uint64x2_t min_norm; 32403 + + uint64x2_t off, sign_exp_mask, offset_lower_bound; 32404 + uint32x4_t special_bound; 32405 + - float64x2_t poly[5]; 32406 + - float64x2_t invln2; 32407 + - uint64x2_t sign_exp_mask; 32408 + + float64x2_t c0, c2; 32409 + + double c1, c3, invln2, c4; 32410 + } data = { 32411 + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 32412 + and N = 128, then scaled by log2(e) in extended precision and rounded back 32413 + to double precision. */ 32414 + - .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), 32415 + - V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), 32416 + - V2 (-0x1.ec738d616fe26p-3) }, 32417 + - .invln2 = V2 (0x1.71547652b82fep0), 32418 + - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). 
*/ 32419 + - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ 32420 + + .c0 = V2 (-0x1.71547652b8300p-1), 32421 + + .c1 = 0x1.ec709dc340953p-2, 32422 + + .c2 = V2 (-0x1.71547651c8f35p-2), 32423 + + .c3 = 0x1.2777ebe12dda5p-2, 32424 + + .c4 = -0x1.ec738d616fe26p-3, 32425 + + .invln2 = 0x1.71547652b82fep0, 32426 + + .off = V2 (0x3fe6900900000000), 32427 + .sign_exp_mask = V2 (0xfff0000000000000), 32428 + + /* Lower bound is 0x0010000000000000. For 32429 + + optimised register use subnormals are detected after offset has been 32430 + + subtracted, so lower bound - offset (which wraps around). */ 32431 + + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), 32432 + + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ 32433 + }; 32434 + 32435 + -#define Off v_u64 (0x3fe6900900000000) 32436 + +#define N (1 << V_LOG2_TABLE_BITS) 32437 + #define IndexMask (N - 1) 32438 + 32439 + struct entry 32440 + @@ -67,10 +69,11 @@ lookup (uint64x2_t i) 32441 + } 32442 + 32443 + static float64x2_t VPCS_ATTR NOINLINE 32444 + -special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, 32445 + - uint32x2_t special) 32446 + +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, 32447 + + uint32x2_t special, const struct data *d) 32448 + { 32449 + - return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); 32450 + + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); 32451 + + return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); 32452 + } 32453 + 32454 + /* Double-precision vector log2 routine. 
Implements the same algorithm as 32455 + @@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, 32456 + float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) 32457 + { 32458 + const struct data *d = ptr_barrier (&data); 32459 + - uint64x2_t ix = vreinterpretq_u64_f64 (x); 32460 + - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), 32461 + - vget_low_u32 (d->special_bound)); 32462 + + 32463 + + /* To avoid having to mov x out of the way, keep u after offset has been 32464 + + applied, and recover x by adding the offset back in the special-case 32465 + + handler. */ 32466 + + uint64x2_t u = vreinterpretq_u64_f64 (x); 32467 + + uint64x2_t u_off = vsubq_u64 (u, d->off); 32468 + 32469 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 32470 + The range is split into N subintervals. 32471 + The ith subinterval contains z and c is near its center. */ 32472 + - uint64x2_t tmp = vsubq_u64 (ix, Off); 32473 + - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); 32474 + - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); 32475 + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); 32476 + + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); 32477 + float64x2_t z = vreinterpretq_f64_u64 (iz); 32478 + 32479 + - struct entry e = lookup (tmp); 32480 + + struct entry e = lookup (u_off); 32481 + 32482 + - /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ 32483 + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), 32484 + + vget_low_u32 (d->special_bound)); 32485 + 32486 + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ 32487 + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); 32488 + float64x2_t kd = vcvtq_f64_s64 (k); 32489 + - float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); 32490 + + 32491 + + float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); 32492 + + float64x2_t hi 32493 + + = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); 32494 + 32495 + float64x2_t r2 = vmulq_f64 (r, r); 32496 + - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); 32497 + - w = vaddq_f64 (kd, w); 32498 + + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); 32499 + + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); 32500 + + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); 32501 + + y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); 32502 + + y = vfmaq_f64 (p, r2, y); 32503 + 32504 + if (__glibc_unlikely (v_any_u32h (special))) 32505 + - return special_case (x, y, w, r2, special); 32506 + - return vfmaq_f64 (w, r2, y); 32507 + + return special_case (hi, u_off, y, r2, special, d); 32508 + + return vfmaq_f64 (hi, y, r2); 32509 + } 32510 + diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c 32511 + index 015a6da7d7..b1a27fbc29 100644 32512 + --- a/sysdeps/aarch64/fpu/log_advsimd.c 32513 + +++ b/sysdeps/aarch64/fpu/log_advsimd.c 32514 + @@ -21,27 +21,29 @@ 32515 + 32516 + static const struct data 32517 + { 32518 + - uint64x2_t min_norm; 32519 + + uint64x2_t off, sign_exp_mask, offset_lower_bound; 32520 + uint32x4_t special_bound; 32521 + - float64x2_t poly[5]; 32522 + - float64x2_t ln2; 32523 + - uint64x2_t sign_exp_mask; 32524 + + float64x2_t c0, c2; 32525 + + double c1, c3, ln2, c4; 32526 + } data = { 32527 + - /* Worst-case error: 1.17 + 0.5 ulp. 32528 + - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. 
*/ 32529 + - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), 32530 + - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), 32531 + - V2 (-0x1.554e550bd501ep-3) }, 32532 + - .ln2 = V2 (0x1.62e42fefa39efp-1), 32533 + - .min_norm = V2 (0x0010000000000000), 32534 + - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ 32535 + - .sign_exp_mask = V2 (0xfff0000000000000) 32536 + + /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ 32537 + + .c0 = V2 (-0x1.ffffffffffff7p-2), 32538 + + .c1 = 0x1.55555555170d4p-2, 32539 + + .c2 = V2 (-0x1.0000000399c27p-2), 32540 + + .c3 = 0x1.999b2e90e94cap-3, 32541 + + .c4 = -0x1.554e550bd501ep-3, 32542 + + .ln2 = 0x1.62e42fefa39efp-1, 32543 + + .sign_exp_mask = V2 (0xfff0000000000000), 32544 + + .off = V2 (0x3fe6900900000000), 32545 + + /* Lower bound is 0x0010000000000000. For 32546 + + optimised register use subnormals are detected after offset has been 32547 + + subtracted, so lower bound - offset (which wraps around). */ 32548 + + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), 32549 + + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). 
*/ 32550 + }; 32551 + 32552 + -#define A(i) d->poly[i] 32553 + #define N (1 << V_LOG_TABLE_BITS) 32554 + #define IndexMask (N - 1) 32555 + -#define Off v_u64 (0x3fe6900900000000) 32556 + 32557 + struct entry 32558 + { 32559 + @@ -64,48 +66,56 @@ lookup (uint64x2_t i) 32560 + } 32561 + 32562 + static float64x2_t VPCS_ATTR NOINLINE 32563 + -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, 32564 + - uint32x2_t cmp) 32565 + +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, 32566 + + uint32x2_t special, const struct data *d) 32567 + { 32568 + - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); 32569 + + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); 32570 + + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); 32571 + } 32572 + 32573 + +/* Double-precision vector log routine. 32574 + + The maximum observed error is 2.17 ULP: 32575 + + _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 32576 + + want 0x1.ffffff1cca045p-2. */ 32577 + float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) 32578 + { 32579 + const struct data *d = ptr_barrier (&data); 32580 + - float64x2_t z, r, r2, p, y, kd, hi; 32581 + - uint64x2_t ix, iz, tmp; 32582 + - uint32x2_t cmp; 32583 + - int64x2_t k; 32584 + - struct entry e; 32585 + 32586 + - ix = vreinterpretq_u64_f64 (x); 32587 + - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), 32588 + - vget_low_u32 (d->special_bound)); 32589 + + /* To avoid having to mov x out of the way, keep u after offset has been 32590 + + applied, and recover x by adding the offset back in the special-case 32591 + + handler. */ 32592 + + uint64x2_t u = vreinterpretq_u64_f64 (x); 32593 + + uint64x2_t u_off = vsubq_u64 (u, d->off); 32594 + 32595 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 32596 + The range is split into N subintervals. 32597 + The ith subinterval contains z and c is near its center. 
*/ 32598 + - tmp = vsubq_u64 (ix, Off); 32599 + - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ 32600 + - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); 32601 + - z = vreinterpretq_f64_u64 (iz); 32602 + - e = lookup (tmp); 32603 + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); 32604 + + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); 32605 + + float64x2_t z = vreinterpretq_f64_u64 (iz); 32606 + + 32607 + + struct entry e = lookup (u_off); 32608 + + 32609 + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), 32610 + + vget_low_u32 (d->special_bound)); 32611 + 32612 + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ 32613 + - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); 32614 + - kd = vcvtq_f64_s64 (k); 32615 + + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); 32616 + + float64x2_t kd = vcvtq_f64_s64 (k); 32617 + 32618 + /* hi = r + log(c) + k*Ln2. */ 32619 + - hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); 32620 + + float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); 32621 + + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); 32622 + + 32623 + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ 32624 + - r2 = vmulq_f64 (r, r); 32625 + - y = vfmaq_f64 (A (2), A (3), r); 32626 + - p = vfmaq_f64 (A (0), A (1), r); 32627 + - y = vfmaq_f64 (y, A (4), r2); 32628 + - y = vfmaq_f64 (p, y, r2); 32629 + - 32630 + - if (__glibc_unlikely (v_any_u32h (cmp))) 32631 + - return special_case (x, y, hi, r2, cmp); 32632 + + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); 32633 + + float64x2_t r2 = vmulq_f64 (r, r); 32634 + + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); 32635 + + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); 32636 + + y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); 32637 + + y = vfmaq_f64 (p, r2, y); 32638 + + 32639 + + if (__glibc_unlikely (v_any_u32h (special))) 32640 + + return special_case (hi, u_off, y, r2, special, d); 32641 + return vfmaq_f64 (hi, y, r2); 32642 + } 32643 + 32644 + commit 2aed9796bfb17b257e63b12cefdb7ff60be09626 32645 + Author: Pierre Blanchard <pierre.blanchard@arm.com> 32646 + Date: Mon Dec 9 15:55:39 2024 +0000 32647 + 32648 + AArch64: Improve codegen in users of ADVSIMD log1p helper 32649 + 32650 + Add inline helper for log1p and rearrange operations so MOV 32651 + is not necessary in reduction or around the special-case handler. 32652 + Reduce memory access by using more indexed MLAs in polynomial. 32653 + Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%). 
32654 + 32655 + (cherry picked from commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8) 32656 + 32657 + diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c 32658 + index c88283cf11..a98f4a2e4d 100644 32659 + --- a/sysdeps/aarch64/fpu/acosh_advsimd.c 32660 + +++ b/sysdeps/aarch64/fpu/acosh_advsimd.c 32661 + @@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) 32662 + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); 32663 + #endif 32664 + 32665 + - float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); 32666 + - float64x2_t y; 32667 + - y = vaddq_f64 (x, v_f64 (1)); 32668 + + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); 32669 + + float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); 32670 + y = vmulq_f64 (y, xm1); 32671 + y = vsqrtq_f64 (y); 32672 + y = vaddq_f64 (xm1, y); 32673 + diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c 32674 + index 3c3d0bd6ad..eb9769aeac 100644 32675 + --- a/sysdeps/aarch64/fpu/atanh_advsimd.c 32676 + +++ b/sysdeps/aarch64/fpu/atanh_advsimd.c 32677 + @@ -23,15 +23,19 @@ 32678 + const static struct data 32679 + { 32680 + struct v_log1p_data log1p_consts; 32681 + - uint64x2_t one, half; 32682 + + uint64x2_t one; 32683 + + uint64x2_t sign_mask; 32684 + } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, 32685 + .one = V2 (0x3ff0000000000000), 32686 + - .half = V2 (0x3fe0000000000000) }; 32687 + + .sign_mask = V2 (0x8000000000000000) }; 32688 + 32689 + static float64x2_t VPCS_ATTR NOINLINE 32690 + -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) 32691 + +special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, 32692 + + uint64x2_t special, const struct data *d) 32693 + { 32694 + - return v_call_f64 (atanh, x, y, special); 32695 + + y = log1p_inline (y, &d->log1p_consts); 32696 + + return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), 32697 + + vmulq_f64 (halfsign, y), special); 32698 + } 32699 + 32700 + /* Approximation for 
vector double-precision atanh(x) using modified log1p. 32701 + @@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) 32702 + { 32703 + const struct data *d = ptr_barrier (&data); 32704 + 32705 + + float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); 32706 + float64x2_t ax = vabsq_f64 (x); 32707 + uint64x2_t ia = vreinterpretq_u64_f64 (ax); 32708 + - uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); 32709 + uint64x2_t special = vcgeq_u64 (ia, d->one); 32710 + - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); 32711 + 32712 + #if WANT_SIMD_EXCEPT 32713 + ax = v_zerofy_f64 (ax, special); 32714 + @@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) 32715 + 32716 + float64x2_t y; 32717 + y = vaddq_f64 (ax, ax); 32718 + - y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); 32719 + - y = log1p_inline (y, &d->log1p_consts); 32720 + + y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); 32721 + 32722 + if (__glibc_unlikely (v_any_u64 (special))) 32723 + - return special_case (x, vmulq_f64 (y, halfsign), special); 32724 + +#if WANT_SIMD_EXCEPT 32725 + + return special_case (x, halfsign, y, special, d); 32726 + +#else 32727 + + return special_case (ax, halfsign, y, special, d); 32728 + +#endif 32729 + + 32730 + + y = log1p_inline (y, &d->log1p_consts); 32731 + return vmulq_f64 (y, halfsign); 32732 + } 32733 + diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c 32734 + index 114064c696..1263587201 100644 32735 + --- a/sysdeps/aarch64/fpu/log1p_advsimd.c 32736 + +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c 32737 + @@ -17,43 +17,26 @@ 32738 + License along with the GNU C Library; if not, see 32739 + <https://www.gnu.org/licenses/>. 
*/ 32740 + 32741 + -#include "v_math.h" 32742 + -#include "poly_advsimd_f64.h" 32743 + +#define WANT_V_LOG1P_K0_SHORTCUT 0 32744 + +#include "v_log1p_inline.h" 32745 + 32746 + const static struct data 32747 + { 32748 + - float64x2_t poly[19], ln2[2]; 32749 + - uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; 32750 + - int64x2_t one_top; 32751 + -} data = { 32752 + - /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ 32753 + - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), 32754 + - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), 32755 + - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), 32756 + - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), 32757 + - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), 32758 + - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), 32759 + - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), 32760 + - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), 32761 + - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), 32762 + - V2 (-0x1.cfa7385bdb37ep-6) }, 32763 + - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, 32764 + - /* top32(asuint64(sqrt(2)/2)) << 32. */ 32765 + - .hf_rt2_top = V2 (0x3fe6a09e00000000), 32766 + - /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. 
*/ 32767 + - .one_m_hf_rt2_top = V2 (0x00095f6200000000), 32768 + - .umask = V2 (0x000fffff00000000), 32769 + - .one_top = V2 (0x3ff), 32770 + - .inf = V2 (0x7ff0000000000000), 32771 + - .minus_one = V2 (0xbff0000000000000) 32772 + -}; 32773 + + struct v_log1p_data d; 32774 + + uint64x2_t inf, minus_one; 32775 + +} data = { .d = V_LOG1P_CONSTANTS_TABLE, 32776 + + .inf = V2 (0x7ff0000000000000), 32777 + + .minus_one = V2 (0xbff0000000000000) }; 32778 + 32779 + #define BottomMask v_u64 (0xffffffff) 32780 + 32781 + -static float64x2_t VPCS_ATTR NOINLINE 32782 + -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) 32783 + +static float64x2_t NOINLINE VPCS_ATTR 32784 + +special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) 32785 + { 32786 + - return v_call_f64 (log1p, x, y, special); 32787 + + /* Side-step special lanes so fenv exceptions are not triggered 32788 + + inadvertently. */ 32789 + + float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); 32790 + + return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); 32791 + } 32792 + 32793 + /* Vector log1p approximation using polynomial on reduced interval. 
Routine is 32794 + @@ -66,66 +49,14 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) 32795 + const struct data *d = ptr_barrier (&data); 32796 + uint64x2_t ix = vreinterpretq_u64_f64 (x); 32797 + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); 32798 + - uint64x2_t special = vcgeq_u64 (ia, d->inf); 32799 + 32800 + -#if WANT_SIMD_EXCEPT 32801 + - special = vorrq_u64 (special, 32802 + - vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); 32803 + - if (__glibc_unlikely (v_any_u64 (special))) 32804 + - x = v_zerofy_f64 (x, special); 32805 + -#else 32806 + - special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); 32807 + -#endif 32808 + + uint64x2_t special_cases 32809 + + = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); 32810 + 32811 + - /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f 32812 + - is in [sqrt(2)/2, sqrt(2)]): 32813 + - log1p(x) = k*log(2) + log1p(f). 32814 + + if (__glibc_unlikely (v_any_u64 (special_cases))) 32815 + + return special_case (x, special_cases, d); 32816 + 32817 + - f may not be representable exactly, so we need a correction term: 32818 + - let m = round(1 + x), c = (1 + x) - m. 32819 + - c << m: at very small x, log1p(x) ~ x, hence: 32820 + - log(1+x) - log(m) ~ c/m. 32821 + - 32822 + - We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ 32823 + - 32824 + - /* Obtain correctly scaled k by manipulation in the exponent. 32825 + - The scalar algorithm casts down to 32-bit at this point to calculate k and 32826 + - u_red. We stay in double-width to obtain f and k, using the same constants 32827 + - as the scalar algorithm but shifted left by 32. 
*/ 32828 + - float64x2_t m = vaddq_f64 (x, v_f64 (1)); 32829 + - uint64x2_t mi = vreinterpretq_u64_f64 (m); 32830 + - uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); 32831 + - 32832 + - int64x2_t ki 32833 + - = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); 32834 + - float64x2_t k = vcvtq_f64_s64 (ki); 32835 + - 32836 + - /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ 32837 + - uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); 32838 + - uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); 32839 + - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); 32840 + - 32841 + - /* Correction term c/m. */ 32842 + - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); 32843 + - 32844 + - /* Approximate log1p(x) on the reduced input using a polynomial. Because 32845 + - log1p(0)=0 we choose an approximation of the form: 32846 + - x + C0*x^2 + C1*x^3 + C2x^4 + ... 32847 + - Hence approximation has the form f + f^2 * P(f) 32848 + - where P(x) = C0 + C1*x + C2x^2 + ... 32849 + - Assembling this all correctly is dealt with at the final step. 
*/ 32850 + - float64x2_t f2 = vmulq_f64 (f, f); 32851 + - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); 32852 + - 32853 + - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); 32854 + - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); 32855 + - float64x2_t y = vaddq_f64 (ylo, yhi); 32856 + - 32857 + - if (__glibc_unlikely (v_any_u64 (special))) 32858 + - return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), 32859 + - special); 32860 + - 32861 + - return vfmaq_f64 (y, f2, p); 32862 + + return log1p_inline (x, &d->d); 32863 + } 32864 + 32865 + strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) 32866 + diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h 32867 + index 242e43b6ee..834ff65adf 100644 32868 + --- a/sysdeps/aarch64/fpu/v_log1p_inline.h 32869 + +++ b/sysdeps/aarch64/fpu/v_log1p_inline.h 32870 + @@ -21,29 +21,30 @@ 32871 + #define AARCH64_FPU_V_LOG1P_INLINE_H 32872 + 32873 + #include "v_math.h" 32874 + -#include "poly_advsimd_f64.h" 32875 + 32876 + struct v_log1p_data 32877 + { 32878 + - float64x2_t poly[19], ln2[2]; 32879 + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; 32880 + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; 32881 + int64x2_t one_top; 32882 + + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; 32883 + + double ln2[2]; 32884 + }; 32885 + 32886 + /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ 32887 + #define V_LOG1P_CONSTANTS_TABLE \ 32888 + { \ 32889 + - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ 32890 + - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ 32891 + - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ 32892 + - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ 32893 + - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ 32894 + - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ 32895 + - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ 32896 + - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ 32897 + - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ 32898 + - V2 (-0x1.cfa7385bdb37ep-6) }, \ 32899 + - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ 32900 + + .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ 32901 + + .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ 32902 + + .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ 32903 + + .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ 32904 + + .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ 32905 + + .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ 32906 + + .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ 32907 + + .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ 32908 + + .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ 32909 + + .c18 = -0x1.cfa7385bdb37ep-6, \ 32910 + + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ 32911 + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ 32912 + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ 32913 + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ 32914 + @@ -51,19 +52,45 @@ struct v_log1p_data 32915 + 32916 + #define BottomMask v_u64 (0xffffffff) 32917 + 32918 + +static inline float64x2_t 32919 + +eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) 32920 + +{ 32921 + + /* 
Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ 32922 + + float64x2_t c13 = vld1q_f64 (&d->c1); 32923 + + float64x2_t c57 = vld1q_f64 (&d->c5); 32924 + + float64x2_t c911 = vld1q_f64 (&d->c9); 32925 + + float64x2_t c1315 = vld1q_f64 (&d->c13); 32926 + + float64x2_t c1718 = vld1q_f64 (&d->c17); 32927 + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); 32928 + + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); 32929 + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); 32930 + + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); 32931 + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); 32932 + + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); 32933 + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); 32934 + + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); 32935 + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); 32936 + + float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); 32937 + + p = vfmaq_f64 (p1415, m2, p); 32938 + + p = vfmaq_f64 (p1213, m2, p); 32939 + + p = vfmaq_f64 (p1011, m2, p); 32940 + + p = vfmaq_f64 (p89, m2, p); 32941 + + p = vfmaq_f64 (p67, m2, p); 32942 + + p = vfmaq_f64 (p45, m2, p); 32943 + + p = vfmaq_f64 (p23, m2, p); 32944 + + return vfmaq_f64 (p01, m2, p); 32945 + +} 32946 + + 32947 + static inline float64x2_t 32948 + log1p_inline (float64x2_t x, const struct v_log1p_data *d) 32949 + { 32950 + - /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several 32951 + - modifications: 32952 + + /* Helper for calculating log(x + 1): 32953 + - No special-case handling - this should be dealt with by the caller. 32954 + - - Pairwise Horner polynomial evaluation for improved accuracy. 32955 + - Optionally simulate the shortcut for k=0, used in the scalar routine, 32956 + - using v_sel, for improved accuracy when the argument to log1p is close to 32957 + - 0. 
This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in 32958 + - the source of the caller before including this file. 32959 + - See v_log1pf_2u1.c for details of the algorithm. */ 32960 + - float64x2_t m = vaddq_f64 (x, v_f64 (1)); 32961 + + using v_sel, for improved accuracy when the argument to log1p is close 32962 + + to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 32963 + + in the source of the caller before including this file. */ 32964 + + float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); 32965 + uint64x2_t mi = vreinterpretq_u64_f64 (m); 32966 + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); 32967 + 32968 + @@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) 32969 + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ 32970 + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); 32971 + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); 32972 + - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); 32973 + + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); 32974 + 32975 + /* Correction term c/m. */ 32976 + - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); 32977 + + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); 32978 + 32979 + #ifndef WANT_V_LOG1P_K0_SHORTCUT 32980 + -#error \ 32981 + - "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" 32982 + +# error \ 32983 + + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" 32984 + #elif WANT_V_LOG1P_K0_SHORTCUT 32985 + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is 32986 + that the approximation is solely the polynomial. 
*/ 32987 + @@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) 32988 + 32989 + /* Approximate log1p(f) on the reduced input using a polynomial. */ 32990 + float64x2_t f2 = vmulq_f64 (f, f); 32991 + - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); 32992 + + float64x2_t p = eval_poly (f, f2, d); 32993 + 32994 + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ 32995 + - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); 32996 + - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); 32997 + + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); 32998 + + float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); 32999 + + float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); 33000 + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); 33001 + } 33002 + 33003 + 33004 + commit 9170b921fa49d2ef37141506837baaae92c7d3f8 33005 + Author: Joana Cruz <Joana.Cruz@arm.com> 33006 + Date: Tue Dec 17 14:47:31 2024 +0000 33007 + 33008 + AArch64: Improve codegen of AdvSIMD logf function family 33009 + 33010 + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. 33011 + 8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log, 33012 + and 2% for log10. 33013 + 33014 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 33015 + (cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3) 33016 + 33017 + diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c 33018 + index 82228b599a..0d792c3df9 100644 33019 + --- a/sysdeps/aarch64/fpu/log10f_advsimd.c 33020 + +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c 33021 + @@ -18,21 +18,25 @@ 33022 + <https://www.gnu.org/licenses/>. 
*/ 33023 + 33024 + #include "v_math.h" 33025 + -#include "poly_advsimd_f32.h" 33026 + 33027 + static const struct data 33028 + { 33029 + + float32x4_t c0, c2, c4, c6, inv_ln10, ln2; 33030 + uint32x4_t off, offset_lower_bound; 33031 + uint16x8_t special_bound; 33032 + uint32x4_t mantissa_mask; 33033 + - float32x4_t poly[8]; 33034 + - float32x4_t inv_ln10, ln2; 33035 + + float c1, c3, c5, c7; 33036 + } data = { 33037 + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in 33038 + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ 33039 + - .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), 33040 + - V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), 33041 + - V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, 33042 + + .c0 = V4 (-0x1.bcb79cp-3f), 33043 + + .c1 = 0x1.2879c8p-3f, 33044 + + .c2 = V4 (-0x1.bcd472p-4f), 33045 + + .c3 = 0x1.6408f8p-4f, 33046 + + .c4 = V4 (-0x1.246f8p-4f), 33047 + + .c5 = 0x1.f0e514p-5f, 33048 + + .c6 = V4 (-0x1.0fc92cp-4f), 33049 + + .c7 = 0x1.f5f76ap-5f, 33050 + .ln2 = V4 (0x1.62e43p-1f), 33051 + .inv_ln10 = V4 (0x1.bcb7b2p-2f), 33052 + /* Lower bound is the smallest positive normal float 0x00800000. For 33053 + @@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, 33054 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) 33055 + { 33056 + const struct data *d = ptr_barrier (&data); 33057 + - 33058 + + float32x4_t c1357 = vld1q_f32 (&d->c1); 33059 + /* To avoid having to mov x out of the way, keep u after offset has been 33060 + applied, and recover x by adding the offset back in the special-case 33061 + handler. */ 33062 + @@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) 33063 + 33064 + /* y = log10(1+r) + n * log10(2). 
*/ 33065 + float32x4_t r2 = vmulq_f32 (r, r); 33066 + - float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); 33067 + + 33068 + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); 33069 + + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); 33070 + + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); 33071 + + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); 33072 + + 33073 + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); 33074 + + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); 33075 + + float32x4_t poly = vfmaq_f32 (c01, r2, p27); 33076 + + 33077 + /* y = Log10(2) * n + poly * InvLn(10). */ 33078 + float32x4_t y = vfmaq_f32 (r, d->ln2, n); 33079 + y = vmulq_f32 (y, d->inv_ln10); 33080 + diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c 33081 + index 84effe4fe9..116c36c8e2 100644 33082 + --- a/sysdeps/aarch64/fpu/log2f_advsimd.c 33083 + +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c 33084 + @@ -18,22 +18,27 @@ 33085 + <https://www.gnu.org/licenses/>. */ 33086 + 33087 + #include "v_math.h" 33088 + -#include "poly_advsimd_f32.h" 33089 + 33090 + static const struct data 33091 + { 33092 + + float32x4_t c0, c2, c4, c6, c8; 33093 + uint32x4_t off, offset_lower_bound; 33094 + uint16x8_t special_bound; 33095 + uint32x4_t mantissa_mask; 33096 + - float32x4_t poly[9]; 33097 + + float c1, c3, c5, c7; 33098 + } data = { 33099 + /* Coefficients generated using Remez algorithm approximate 33100 + log2(1+r)/r for r in [ -1/3, 1/3 ]. 33101 + rel error: 0x1.c4c4b0cp-26. */ 33102 + - .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ 33103 + - V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), 33104 + - V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), 33105 + - V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, 33106 + + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). 
*/ 33107 + + .c1 = -0x1.715458p-1f, 33108 + + .c2 = V4 (0x1.ec701cp-2f), 33109 + + .c3 = -0x1.7171a4p-2f, 33110 + + .c4 = V4 (0x1.27a0b8p-2f), 33111 + + .c5 = -0x1.e5143ep-3f, 33112 + + .c6 = V4 (0x1.9d8ecap-3f), 33113 + + .c7 = -0x1.c675bp-3f, 33114 + + .c8 = V4 (0x1.9e495p-3f), 33115 + /* Lower bound is the smallest positive normal float 0x00800000. For 33116 + optimised register use subnormals are detected after offset has been 33117 + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ 33118 + @@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) 33119 + 33120 + /* y = log2(1+r) + n. */ 33121 + float32x4_t r2 = vmulq_f32 (r, r); 33122 + - float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); 33123 + + 33124 + + float32x4_t c1357 = vld1q_f32 (&d->c1); 33125 + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); 33126 + + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); 33127 + + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); 33128 + + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); 33129 + + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); 33130 + + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); 33131 + + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); 33132 + + float32x4_t p = vfmaq_f32 (c01, r2, p28); 33133 + 33134 + if (__glibc_unlikely (v_any_u16h (special))) 33135 + return special_case (n, u_off, p, r, special, d); 33136 + return vfmaq_f32 (n, p, r); 33137 + } 33138 + + 33139 + libmvec_hidden_def (V_NAME_F1 (log2)) 33140 + HALF_WIDTH_ALIAS_F1 (log2) 33141 + diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c 33142 + index c20dbfd6c0..d9e64c732d 100644 33143 + --- a/sysdeps/aarch64/fpu/logf_advsimd.c 33144 + +++ b/sysdeps/aarch64/fpu/logf_advsimd.c 33145 + @@ -21,16 +21,19 @@ 33146 + 33147 + static const struct data 33148 + { 33149 + - uint32x4_t off, offset_lower_bound; 33150 + + float32x4_t c2, c4, c6, ln2; 33151 + + uint32x4_t off, offset_lower_bound, 
mantissa_mask; 33152 + uint16x8_t special_bound; 33153 + - uint32x4_t mantissa_mask; 33154 + - float32x4_t poly[7]; 33155 + - float32x4_t ln2; 33156 + + float c1, c3, c5, c0; 33157 + } data = { 33158 + /* 3.34 ulp error. */ 33159 + - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), 33160 + - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), 33161 + - V4 (-0x1.ffffc8p-2f) }, 33162 + + .c0 = -0x1.3e737cp-3f, 33163 + + .c1 = 0x1.5a9aa2p-3f, 33164 + + .c2 = V4 (-0x1.4f9934p-3f), 33165 + + .c3 = 0x1.961348p-3f, 33166 + + .c4 = V4 (-0x1.00187cp-2f), 33167 + + .c5 = 0x1.555d7cp-2f, 33168 + + .c6 = V4 (-0x1.ffffc8p-2f), 33169 + .ln2 = V4 (0x1.62e43p-1f), 33170 + /* Lower bound is the smallest positive normal float 0x00800000. For 33171 + optimised register use subnormals are detected after offset has been 33172 + @@ -41,8 +44,6 @@ static const struct data 33173 + .mantissa_mask = V4 (0x007fffff) 33174 + }; 33175 + 33176 + -#define P(i) d->poly[7 - i] 33177 + - 33178 + static float32x4_t VPCS_ATTR NOINLINE 33179 + special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, 33180 + uint16x4_t cmp, const struct data *d) 33181 + @@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, 33182 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) 33183 + { 33184 + const struct data *d = ptr_barrier (&data); 33185 + - float32x4_t n, p, q, r, r2, y; 33186 + - uint32x4_t u, u_off; 33187 + - uint16x4_t cmp; 33188 + + float32x4_t c1350 = vld1q_f32 (&d->c1); 33189 + 33190 + /* To avoid having to mov x out of the way, keep u after offset has been 33191 + applied, and recover x by adding the offset back in the special-case 33192 + handler. */ 33193 + - u_off = vreinterpretq_u32_f32 (x); 33194 + + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); 33195 + 33196 + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. 
*/ 33197 + - u_off = vsubq_u32 (u_off, d->off); 33198 + - n = vcvtq_f32_s32 ( 33199 + + float32x4_t n = vcvtq_f32_s32 ( 33200 + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ 33201 + - u = vandq_u32 (u_off, d->mantissa_mask); 33202 + - u = vaddq_u32 (u, d->off); 33203 + - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); 33204 + + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), 33205 + + vget_low_u16 (d->special_bound)); 33206 + 33207 + - cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), 33208 + - vget_low_u16 (d->special_bound)); 33209 + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); 33210 + + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); 33211 + 33212 + /* y = log(1+r) + n*ln2. */ 33213 + - r2 = vmulq_f32 (r, r); 33214 + + float32x4_t r2 = vmulq_f32 (r, r); 33215 + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ 33216 + - p = vfmaq_f32 (P (5), P (6), r); 33217 + - q = vfmaq_f32 (P (3), P (4), r); 33218 + - y = vfmaq_f32 (P (1), P (2), r); 33219 + - p = vfmaq_f32 (p, P (7), r2); 33220 + + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); 33221 + + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); 33222 + + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); 33223 + + p = vfmaq_laneq_f32 (p, r2, c1350, 3); 33224 + + 33225 + q = vfmaq_f32 (q, p, r2); 33226 + y = vfmaq_f32 (y, q, r2); 33227 + p = vfmaq_f32 (r, d->ln2, n); 33228 + 33229 + commit 41dc9e7c2d80bc5e886950b8a7bd21f77c9793b3 33230 + Author: Joana Cruz <Joana.Cruz@arm.com> 33231 + Date: Tue Dec 17 14:49:30 2024 +0000 33232 + 33233 + AArch64: Improve codegen of AdvSIMD atan(2)(f) 33234 + 33235 + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. 33236 + 8% improvement in throughput microbenchmark on Neoverse V1. 
33237 + 33238 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 33239 + (cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e) 33240 + 33241 + diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c 33242 + index b1e7a9b8fc..1a8f02109f 100644 33243 + --- a/sysdeps/aarch64/fpu/atan2_advsimd.c 33244 + +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c 33245 + @@ -23,40 +23,57 @@ 33246 + 33247 + static const struct data 33248 + { 33249 + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; 33250 + float64x2_t pi_over_2; 33251 + - float64x2_t poly[20]; 33252 + + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; 33253 + + uint64x2_t zeroinfnan, minustwo; 33254 + } data = { 33255 + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on 33256 + - the interval [2**-1022, 1.0]. */ 33257 + - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), 33258 + - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), 33259 + - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), 33260 + - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), 33261 + - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), 33262 + - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), 33263 + - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), 33264 + - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), 33265 + - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), 33266 + - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, 33267 + + [2**-1022, 1.0]. 
*/ 33268 + + .c0 = V2 (-0x1.5555555555555p-2), 33269 + + .c1 = 0x1.99999999996c1p-3, 33270 + + .c2 = V2 (-0x1.2492492478f88p-3), 33271 + + .c3 = 0x1.c71c71bc3951cp-4, 33272 + + .c4 = V2 (-0x1.745d160a7e368p-4), 33273 + + .c5 = 0x1.3b139b6a88ba1p-4, 33274 + + .c6 = V2 (-0x1.11100ee084227p-4), 33275 + + .c7 = 0x1.e1d0f9696f63bp-5, 33276 + + .c8 = V2 (-0x1.aebfe7b418581p-5), 33277 + + .c9 = 0x1.842dbe9b0d916p-5, 33278 + + .c10 = V2 (-0x1.5d30140ae5e99p-5), 33279 + + .c11 = 0x1.338e31eb2fbbcp-5, 33280 + + .c12 = V2 (-0x1.00e6eece7de8p-5), 33281 + + .c13 = 0x1.860897b29e5efp-6, 33282 + + .c14 = V2 (-0x1.0051381722a59p-6), 33283 + + .c15 = 0x1.14e9dc19a4a4ep-7, 33284 + + .c16 = V2 (-0x1.d0062b42fe3bfp-9), 33285 + + .c17 = 0x1.17739e210171ap-10, 33286 + + .c18 = V2 (-0x1.ab24da7be7402p-13), 33287 + + .c19 = 0x1.358851160a528p-16, 33288 + .pi_over_2 = V2 (0x1.921fb54442d18p+0), 33289 + + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), 33290 + + .minustwo = V2 (0xc000000000000000), 33291 + }; 33292 + 33293 + #define SignMask v_u64 (0x8000000000000000) 33294 + 33295 + /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ 33296 + static float64x2_t VPCS_ATTR NOINLINE 33297 + -special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) 33298 + +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, 33299 + + uint64x2_t sign_xy, uint64x2_t cmp) 33300 + { 33301 + + /* Account for the sign of x and y. */ 33302 + + ret = vreinterpretq_f64_u64 ( 33303 + + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); 33304 + return v_call2_f64 (atan2, y, x, ret, cmp); 33305 + } 33306 + 33307 + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ 33308 + static inline uint64x2_t 33309 + -zeroinfnan (uint64x2_t i) 33310 + +zeroinfnan (uint64x2_t i, const struct data *d) 33311 + { 33312 + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). 
*/ 33313 + - return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), 33314 + - v_u64 (2 * asuint64 (INFINITY) - 1)); 33315 + + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); 33316 + } 33317 + 33318 + /* Fast implementation of vector atan2. 33319 + @@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i) 33320 + want 0x1.92d628ab678cfp-1. */ 33321 + float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) 33322 + { 33323 + - const struct data *data_ptr = ptr_barrier (&data); 33324 + + const struct data *d = ptr_barrier (&data); 33325 + 33326 + uint64x2_t ix = vreinterpretq_u64_f64 (x); 33327 + uint64x2_t iy = vreinterpretq_u64_f64 (y); 33328 + 33329 + - uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); 33330 + + uint64x2_t special_cases 33331 + + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); 33332 + 33333 + uint64x2_t sign_x = vandq_u64 (ix, SignMask); 33334 + uint64x2_t sign_y = vandq_u64 (iy, SignMask); 33335 + @@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) 33336 + float64x2_t ay = vabsq_f64 (y); 33337 + 33338 + uint64x2_t pred_xlt0 = vcltzq_f64 (x); 33339 + - uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); 33340 + + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); 33341 + 33342 + /* Set up z for call to atan. */ 33343 + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); 33344 + - float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); 33345 + - float64x2_t z = vdivq_f64 (n, d); 33346 + + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); 33347 + + float64x2_t z = vdivq_f64 (n, q); 33348 + 33349 + /* Work out the correct shift. 
*/ 33350 + - float64x2_t shift = vreinterpretq_f64_u64 ( 33351 + - vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); 33352 + + float64x2_t shift 33353 + + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); 33354 + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); 33355 + - shift = vmulq_f64 (shift, data_ptr->pi_over_2); 33356 + + shift = vmulq_f64 (shift, d->pi_over_2); 33357 + 33358 + /* Calculate the polynomial approximation. 33359 + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of 33360 + @@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) 33361 + float64x2_t x2 = vmulq_f64 (z2, z2); 33362 + float64x2_t x4 = vmulq_f64 (x2, x2); 33363 + float64x2_t x8 = vmulq_f64 (x4, x4); 33364 + - float64x2_t ret 33365 + - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), 33366 + - v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); 33367 + + 33368 + + float64x2_t c13 = vld1q_f64 (&d->c1); 33369 + + float64x2_t c57 = vld1q_f64 (&d->c5); 33370 + + float64x2_t c911 = vld1q_f64 (&d->c9); 33371 + + float64x2_t c1315 = vld1q_f64 (&d->c13); 33372 + + float64x2_t c1719 = vld1q_f64 (&d->c17); 33373 + + 33374 + + /* estrin_7. */ 33375 + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); 33376 + + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); 33377 + + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); 33378 + + 33379 + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); 33380 + + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); 33381 + + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); 33382 + + 33383 + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); 33384 + + 33385 + + /* estrin_11. 
*/ 33386 + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); 33387 + + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); 33388 + + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); 33389 + + 33390 + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); 33391 + + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); 33392 + + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); 33393 + + 33394 + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); 33395 + + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); 33396 + + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); 33397 + + 33398 + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); 33399 + + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); 33400 + + 33401 + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); 33402 + 33403 + /* Finalize. y = shift + z + z^3 * P(z^2). */ 33404 + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); 33405 + ret = vaddq_f64 (ret, shift); 33406 + 33407 + + if (__glibc_unlikely (v_any_u64 (special_cases))) 33408 + + return special_case (y, x, ret, sign_xy, special_cases); 33409 + + 33410 + /* Account for the sign of x and y. 
*/ 33411 + ret = vreinterpretq_f64_u64 ( 33412 + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); 33413 + 33414 + - if (__glibc_unlikely (v_any_u64 (special_cases))) 33415 + - return special_case (y, x, ret, special_cases); 33416 + - 33417 + return ret; 33418 + } 33419 + diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c 33420 + index 56e610caf1..88daacd76c 100644 33421 + --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c 33422 + +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c 33423 + @@ -22,34 +22,39 @@ 33424 + 33425 + static const struct data 33426 + { 33427 + - float32x4_t poly[8]; 33428 + - float32x4_t pi_over_2; 33429 + + float32x4_t c0, pi_over_2, c4, c6, c2; 33430 + + float c1, c3, c5, c7; 33431 + + uint32x4_t comp_const; 33432 + } data = { 33433 + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on 33434 + [2**-128, 1.0]. 33435 + Generated using fpminimax between FLT_MIN and 1. */ 33436 + - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), 33437 + - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), 33438 + - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, 33439 + - .pi_over_2 = V4 (0x1.921fb6p+0f), 33440 + + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, 33441 + + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, 33442 + + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, 33443 + + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, 33444 + + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), 33445 + }; 33446 + 33447 + #define SignMask v_u32 (0x80000000) 33448 + 33449 + /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ 33450 + static float32x4_t VPCS_ATTR NOINLINE 33451 + -special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) 33452 + +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, 33453 + + uint32x4_t sign_xy, uint32x4_t cmp) 33454 + { 33455 + + /* Account for the sign of y. 
*/ 33456 + + ret = vreinterpretq_f32_u32 ( 33457 + + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); 33458 + return v_call2_f32 (atan2f, y, x, ret, cmp); 33459 + } 33460 + 33461 + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ 33462 + static inline uint32x4_t 33463 + -zeroinfnan (uint32x4_t i) 33464 + +zeroinfnan (uint32x4_t i, const struct data *d) 33465 + { 33466 + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ 33467 + - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), 33468 + - v_u32 (2 * 0x7f800000lu - 1)); 33469 + + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); 33470 + } 33471 + 33472 + /* Fast implementation of vector atan2f. Maximum observed error is 33473 + @@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i) 33474 + want 0x1.967f00p-1. */ 33475 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) 33476 + { 33477 + - const struct data *data_ptr = ptr_barrier (&data); 33478 + + const struct data *d = ptr_barrier (&data); 33479 + 33480 + uint32x4_t ix = vreinterpretq_u32_f32 (x); 33481 + uint32x4_t iy = vreinterpretq_u32_f32 (y); 33482 + 33483 + - uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); 33484 + + uint32x4_t special_cases 33485 + + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); 33486 + 33487 + uint32x4_t sign_x = vandq_u32 (ix, SignMask); 33488 + uint32x4_t sign_y = vandq_u32 (iy, SignMask); 33489 + @@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) 33490 + 33491 + /* Set up z for call to atanf. */ 33492 + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); 33493 + - float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); 33494 + - float32x4_t z = vdivq_f32 (n, d); 33495 + + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); 33496 + + float32x4_t z = vdivq_f32 (n, q); 33497 + 33498 + /* Work out the correct shift. 
*/ 33499 + float32x4_t shift = vreinterpretq_f32_u32 ( 33500 + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); 33501 + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); 33502 + - shift = vmulq_f32 (shift, data_ptr->pi_over_2); 33503 + + shift = vmulq_f32 (shift, d->pi_over_2); 33504 + 33505 + /* Calculate the polynomial approximation. 33506 + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, 33507 + @@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) 33508 + float32x4_t z2 = vmulq_f32 (z, z); 33509 + float32x4_t z4 = vmulq_f32 (z2, z2); 33510 + 33511 + - float32x4_t ret = vfmaq_f32 ( 33512 + - v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, 33513 + - vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); 33514 + + float32x4_t c1357 = vld1q_f32 (&d->c1); 33515 + + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); 33516 + + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); 33517 + + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); 33518 + + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); 33519 + + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); 33520 + + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); 33521 + + 33522 + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); 33523 + 33524 + /* y = shift + z * P(z^2). */ 33525 + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); 33526 + 33527 + - /* Account for the sign of y. */ 33528 + - ret = vreinterpretq_f32_u32 ( 33529 + - veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); 33530 + - 33531 + if (__glibc_unlikely (v_any_u32 (special_cases))) 33532 + { 33533 + - return special_case (y, x, ret, special_cases); 33534 + + return special_case (y, x, ret, sign_xy, special_cases); 33535 + } 33536 + 33537 + - return ret; 33538 + + /* Account for the sign of y. 
*/ 33539 + + return vreinterpretq_f32_u32 ( 33540 + + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); 33541 + } 33542 + libmvec_hidden_def (V_NAME_F2 (atan2)) 33543 + HALF_WIDTH_ALIAS_F2(atan2) 33544 + diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c 33545 + index a962be0f78..14f1809796 100644 33546 + --- a/sysdeps/aarch64/fpu/atan_advsimd.c 33547 + +++ b/sysdeps/aarch64/fpu/atan_advsimd.c 33548 + @@ -22,21 +22,22 @@ 33549 + 33550 + static const struct data 33551 + { 33552 + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; 33553 + float64x2_t pi_over_2; 33554 + - float64x2_t poly[20]; 33555 + + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; 33556 + } data = { 33557 + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on 33558 + [2**-1022, 1.0]. */ 33559 + - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), 33560 + - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), 33561 + - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), 33562 + - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), 33563 + - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), 33564 + - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), 33565 + - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), 33566 + - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), 33567 + - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), 33568 + - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, 33569 + + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, 33570 + + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, 33571 + + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, 33572 + + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, 33573 + + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, 33574 + + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, 33575 + + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, 33576 
+ + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, 33577 + + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, 33578 + + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, 33579 + .pi_over_2 = V2 (0x1.921fb54442d18p+0), 33580 + }; 33581 + 33582 + @@ -52,6 +53,11 @@ static const struct data 33583 + float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) 33584 + { 33585 + const struct data *d = ptr_barrier (&data); 33586 + + float64x2_t c13 = vld1q_f64 (&d->c1); 33587 + + float64x2_t c57 = vld1q_f64 (&d->c5); 33588 + + float64x2_t c911 = vld1q_f64 (&d->c9); 33589 + + float64x2_t c1315 = vld1q_f64 (&d->c13); 33590 + + float64x2_t c1719 = vld1q_f64 (&d->c17); 33591 + 33592 + /* Small cases, infs and nans are supported by our approximation technique, 33593 + but do not set fenv flags correctly. Only trigger special case if we need 33594 + @@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) 33595 + float64x2_t x2 = vmulq_f64 (z2, z2); 33596 + float64x2_t x4 = vmulq_f64 (x2, x2); 33597 + float64x2_t x8 = vmulq_f64 (x4, x4); 33598 + - float64x2_t y 33599 + - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), 33600 + - v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); 33601 + + 33602 + + /* estrin_7. */ 33603 + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); 33604 + + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); 33605 + + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); 33606 + + 33607 + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); 33608 + + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); 33609 + + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); 33610 + + 33611 + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); 33612 + + 33613 + + /* estrin_11. 
*/ 33614 + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); 33615 + + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); 33616 + + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); 33617 + + 33618 + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); 33619 + + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); 33620 + + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); 33621 + + 33622 + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); 33623 + + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); 33624 + + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); 33625 + + 33626 + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); 33627 + + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); 33628 + + 33629 + + float64x2_t y = vfmaq_f64 (p07, p819, x8); 33630 + 33631 + /* Finalize. y = shift + z + z^3 * P(z^2). */ 33632 + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); 33633 + 33634 + commit bf2b60a56036c951a798845223a2e04cc48507e4 33635 + Author: Joana Cruz <Joana.Cruz@arm.com> 33636 + Date: Tue Dec 17 14:50:33 2024 +0000 33637 + 33638 + AArch64: Improve codegen of AdvSIMD expf family 33639 + 33640 + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. 33641 + Also use intrinsics instead of native operations. 33642 + expf: 3% improvement in throughput microbenchmark on Neoverse V1, exp2f: 5%, 33643 + exp10f: 13%, coshf: 14%. 
33644 + 33645 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 33646 + (cherry picked from commit cff9648d0b50d19cdaf685f6767add040d4e1a8e) 33647 + 33648 + diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c 33649 + index c1ab4923b8..cd5c866521 100644 33650 + --- a/sysdeps/aarch64/fpu/coshf_advsimd.c 33651 + +++ b/sysdeps/aarch64/fpu/coshf_advsimd.c 33652 + @@ -23,19 +23,27 @@ 33653 + static const struct data 33654 + { 33655 + struct v_expf_data expf_consts; 33656 + - uint32x4_t tiny_bound, special_bound; 33657 + + uint32x4_t tiny_bound; 33658 + + float32x4_t bound; 33659 + +#if WANT_SIMD_EXCEPT 33660 + + uint32x4_t special_bound; 33661 + +#endif 33662 + } data = { 33663 + .expf_consts = V_EXPF_DATA, 33664 + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ 33665 + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ 33666 + + .bound = V4 (0x1.5a92d8p+6), 33667 + +#if WANT_SIMD_EXCEPT 33668 + .special_bound = V4 (0x42ad496c), 33669 + +#endif 33670 + }; 33671 + 33672 + #if !WANT_SIMD_EXCEPT 33673 + static float32x4_t NOINLINE VPCS_ATTR 33674 + -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) 33675 + +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, 33676 + + uint32x4_t special) 33677 + { 33678 + - return v_call_f32 (coshf, x, y, special); 33679 + + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); 33680 + } 33681 + #endif 33682 + 33683 + @@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) 33684 + { 33685 + const struct data *d = ptr_barrier (&data); 33686 + 33687 + - float32x4_t ax = vabsq_f32 (x); 33688 + - uint32x4_t iax = vreinterpretq_u32_f32 (ax); 33689 + - uint32x4_t special = vcgeq_u32 (iax, d->special_bound); 33690 + - 33691 + #if WANT_SIMD_EXCEPT 33692 + /* If fp exceptions are to be triggered correctly, fall back to the scalar 33693 + variant for all inputs if any input is a special value 
or above the bound 33694 + at which expf overflows. */ 33695 + + float32x4_t ax = vabsq_f32 (x); 33696 + + uint32x4_t iax = vreinterpretq_u32_f32 (ax); 33697 + + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); 33698 + if (__glibc_unlikely (v_any_u32 (special))) 33699 + return v_call_f32 (coshf, x, x, v_u32 (-1)); 33700 + 33701 + @@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) 33702 + input to 0, which will generate no exceptions. */ 33703 + if (__glibc_unlikely (v_any_u32 (tiny))) 33704 + ax = v_zerofy_f32 (ax, tiny); 33705 + + float32x4_t t = v_expf_inline (ax, &d->expf_consts); 33706 + +#else 33707 + + uint32x4_t special = vcageq_f32 (x, d->bound); 33708 + + float32x4_t t = v_expf_inline (x, &d->expf_consts); 33709 + #endif 33710 + 33711 + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ 33712 + - float32x4_t t = v_expf_inline (ax, &d->expf_consts); 33713 + float32x4_t half_t = vmulq_n_f32 (t, 0.5); 33714 + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); 33715 + 33716 + @@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) 33717 + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); 33718 + #else 33719 + if (__glibc_unlikely (v_any_u32 (special))) 33720 + - return special_case (x, vaddq_f32 (half_t, half_over_t), special); 33721 + + return special_case (x, half_t, half_over_t, special); 33722 + #endif 33723 + 33724 + return vaddq_f32 (half_t, half_over_t); 33725 + diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c 33726 + index cf53e73290..55d9cd83f2 100644 33727 + --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c 33728 + +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c 33729 + @@ -18,16 +18,15 @@ 33730 + <https://www.gnu.org/licenses/>. 
*/ 33731 + 33732 + #include "v_math.h" 33733 + -#include "poly_advsimd_f32.h" 33734 + 33735 + #define ScaleBound 192.0f 33736 + 33737 + static const struct data 33738 + { 33739 + - float32x4_t poly[5]; 33740 + - float log10_2_and_inv[4]; 33741 + - float32x4_t shift; 33742 + - 33743 + + float32x4_t c0, c1, c3; 33744 + + float log10_2_high, log10_2_low, c2, c4; 33745 + + float32x4_t inv_log10_2, special_bound; 33746 + + uint32x4_t exponent_bias, special_offset, special_bias; 33747 + #if !WANT_SIMD_EXCEPT 33748 + float32x4_t scale_thresh; 33749 + #endif 33750 + @@ -37,19 +36,24 @@ static const struct data 33751 + rel error: 0x1.89dafa3p-24 33752 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] 33753 + maxerr: 1.85943 +0.5 ulp. */ 33754 + - .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), 33755 + - V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, 33756 + - .shift = V4 (0x1.8p23f), 33757 + - 33758 + - /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ 33759 + - .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, 33760 + + .c0 = V4 (0x1.26bb16p+1f), 33761 + + .c1 = V4 (0x1.5350d2p+1f), 33762 + + .c2 = 0x1.04744ap+1f, 33763 + + .c3 = V4 (0x1.2d8176p+0f), 33764 + + .c4 = 0x1.12b41ap-1f, 33765 + + .inv_log10_2 = V4 (0x1.a934fp+1), 33766 + + .log10_2_high = 0x1.344136p-2, 33767 + + .log10_2_low = 0x1.ec10cp-27, 33768 + + /* rint (log2 (2^127 / (1 + sqrt (2)))). */ 33769 + + .special_bound = V4 (126.0f), 33770 + + .exponent_bias = V4 (0x3f800000), 33771 + + .special_offset = V4 (0x82000000), 33772 + + .special_bias = V4 (0x7f000000), 33773 + #if !WANT_SIMD_EXCEPT 33774 + .scale_thresh = V4 (ScaleBound) 33775 + #endif 33776 + }; 33777 + 33778 + -#define ExponentBias v_u32 (0x3f800000) 33779 + - 33780 + #if WANT_SIMD_EXCEPT 33781 + 33782 + # define SpecialBound 38.0f /* rint(log10(2^127)). 
*/ 33783 + @@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) 33784 + 33785 + #else 33786 + 33787 + -# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ 33788 + -# define SpecialOffset v_u32 (0x82000000) 33789 + -# define SpecialBias v_u32 (0x7f000000) 33790 + +# define SpecialBound 126.0f 33791 + 33792 + static float32x4_t VPCS_ATTR NOINLINE 33793 + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 33794 + float32x4_t scale, const struct data *d) 33795 + { 33796 + /* 2^n may overflow, break it up into s1*s2. */ 33797 + - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); 33798 + - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); 33799 + + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); 33800 + + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); 33801 + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); 33802 + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); 33803 + float32x4_t r2 = vmulq_f32 (s1, s1); 33804 + @@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) 33805 + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), 33806 + with poly(r) in [1/sqrt(2), sqrt(2)] and 33807 + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. 
*/ 33808 + - float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); 33809 + - float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); 33810 + - float32x4_t n = vsubq_f32 (z, d->shift); 33811 + - float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); 33812 + - r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); 33813 + - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); 33814 + + float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); 33815 + + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); 33816 + + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); 33817 + + r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); 33818 + + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); 33819 + 33820 + - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); 33821 + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 33822 + 33823 + #if !WANT_SIMD_EXCEPT 33824 + - uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); 33825 + + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); 33826 + #endif 33827 + 33828 + float32x4_t r2 = vmulq_f32 (r, r); 33829 + - float32x4_t poly 33830 + - = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), 33831 + - v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); 33832 + + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); 33833 + + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); 33834 + + float32x4_t p14 = vfmaq_f32 (p12, r2, p34); 33835 + + float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); 33836 + 33837 + if (__glibc_unlikely (v_any_u32 (cmp))) 33838 + #if WANT_SIMD_EXCEPT 33839 + diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c 33840 + index 69e0b193a1..a4220da63c 100644 33841 + --- a/sysdeps/aarch64/fpu/exp2f_advsimd.c 33842 + +++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c 33843 + @@ -21,24 +21,28 @@ 33844 + 33845 + static const struct data 33846 + { 33847 + - float32x4_t poly[5]; 33848 + 
- uint32x4_t exponent_bias; 33849 + + float32x4_t c1, c3; 33850 + + uint32x4_t exponent_bias, special_offset, special_bias; 33851 + #if !WANT_SIMD_EXCEPT 33852 + - float32x4_t special_bound, scale_thresh; 33853 + + float32x4_t scale_thresh, special_bound; 33854 + #endif 33855 + + float c0, c2, c4, zero; 33856 + } data = { 33857 + /* maxerr: 1.962 ulp. */ 33858 + - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), 33859 + - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, 33860 + + .c0 = 0x1.59977ap-10f, 33861 + + .c1 = V4 (0x1.3ce9e4p-7f), 33862 + + .c2 = 0x1.c6bd32p-5f, 33863 + + .c3 = V4 (0x1.ebf9bcp-3f), 33864 + + .c4 = 0x1.62e422p-1f, 33865 + .exponent_bias = V4 (0x3f800000), 33866 + + .special_offset = V4 (0x82000000), 33867 + + .special_bias = V4 (0x7f000000), 33868 + #if !WANT_SIMD_EXCEPT 33869 + .special_bound = V4 (126.0f), 33870 + .scale_thresh = V4 (192.0f), 33871 + #endif 33872 + }; 33873 + 33874 + -#define C(i) d->poly[i] 33875 + - 33876 + #if WANT_SIMD_EXCEPT 33877 + 33878 + # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ 33879 + @@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) 33880 + 33881 + #else 33882 + 33883 + -# define SpecialOffset v_u32 (0x82000000) 33884 + -# define SpecialBias v_u32 (0x7f000000) 33885 + - 33886 + static float32x4_t VPCS_ATTR NOINLINE 33887 + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 33888 + float32x4_t scale, const struct data *d) 33889 + { 33890 + /* 2^n may overflow, break it up into s1*s2. 
*/ 33891 + - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); 33892 + - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); 33893 + + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); 33894 + + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); 33895 + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); 33896 + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); 33897 + float32x4_t r2 = vmulq_f32 (s1, s1); 33898 + @@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 33899 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) 33900 + { 33901 + const struct data *d = ptr_barrier (&data); 33902 + - float32x4_t n, r, r2, scale, p, q, poly; 33903 + - uint32x4_t cmp, e; 33904 + 33905 + #if WANT_SIMD_EXCEPT 33906 + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ 33907 + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); 33908 + - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); 33909 + + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); 33910 + float32x4_t xm = x; 33911 + /* If any lanes are special, mask them with 1 and retain a copy of x to allow 33912 + special_case to fix special lanes later. This is only necessary if fenv 33913 + @@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) 33914 + x = vbslq_f32 (cmp, v_f32 (1), x); 33915 + #endif 33916 + 33917 + - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 33918 + - x = n + r, with r in [-1/2, 1/2]. */ 33919 + - n = vrndaq_f32 (x); 33920 + - r = vsubq_f32 (x, n); 33921 + - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); 33922 + - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 33923 + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 33924 + + x = n + r, with r in [-1/2, 1/2]. 
*/ 33925 + + float32x4_t n = vrndaq_f32 (x); 33926 + + float32x4_t r = vsubq_f32 (x, n); 33927 + + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); 33928 + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 33929 + 33930 + #if !WANT_SIMD_EXCEPT 33931 + - cmp = vcagtq_f32 (n, d->special_bound); 33932 + + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); 33933 + #endif 33934 + 33935 + - r2 = vmulq_f32 (r, r); 33936 + - p = vfmaq_f32 (C (1), C (0), r); 33937 + - q = vfmaq_f32 (C (3), C (2), r); 33938 + + float32x4_t c024 = vld1q_f32 (&d->c0); 33939 + + float32x4_t r2 = vmulq_f32 (r, r); 33940 + + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); 33941 + + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); 33942 + q = vfmaq_f32 (q, p, r2); 33943 + - p = vmulq_f32 (C (4), r); 33944 + - poly = vfmaq_f32 (p, q, r2); 33945 + + p = vmulq_laneq_f32 (r, c024, 2); 33946 + + float32x4_t poly = vfmaq_f32 (p, q, r2); 33947 + 33948 + if (__glibc_unlikely (v_any_u32 (cmp))) 33949 + #if WANT_SIMD_EXCEPT 33950 + diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c 33951 + index 5c9cb72620..70f137e2e5 100644 33952 + --- a/sysdeps/aarch64/fpu/expf_advsimd.c 33953 + +++ b/sysdeps/aarch64/fpu/expf_advsimd.c 33954 + @@ -21,20 +21,25 @@ 33955 + 33956 + static const struct data 33957 + { 33958 + - float32x4_t poly[5]; 33959 + - float32x4_t inv_ln2, ln2_hi, ln2_lo; 33960 + - uint32x4_t exponent_bias; 33961 + + float32x4_t c1, c3, c4, inv_ln2; 33962 + + float ln2_hi, ln2_lo, c0, c2; 33963 + + uint32x4_t exponent_bias, special_offset, special_bias; 33964 + #if !WANT_SIMD_EXCEPT 33965 + float32x4_t special_bound, scale_thresh; 33966 + #endif 33967 + } data = { 33968 + /* maxerr: 1.45358 +0.5 ulp. 
*/ 33969 + - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), 33970 + - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, 33971 + + .c0 = 0x1.0e4020p-7f, 33972 + + .c1 = V4 (0x1.573e2ep-5f), 33973 + + .c2 = 0x1.555e66p-3f, 33974 + + .c3 = V4 (0x1.fffdb6p-2f), 33975 + + .c4 = V4 (0x1.ffffecp-1f), 33976 + .inv_ln2 = V4 (0x1.715476p+0f), 33977 + - .ln2_hi = V4 (0x1.62e4p-1f), 33978 + - .ln2_lo = V4 (0x1.7f7d1cp-20f), 33979 + + .ln2_hi = 0x1.62e4p-1f, 33980 + + .ln2_lo = 0x1.7f7d1cp-20f, 33981 + .exponent_bias = V4 (0x3f800000), 33982 + + .special_offset = V4 (0x82000000), 33983 + + .special_bias = V4 (0x7f000000), 33984 + #if !WANT_SIMD_EXCEPT 33985 + .special_bound = V4 (126.0f), 33986 + .scale_thresh = V4 (192.0f), 33987 + @@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) 33988 + 33989 + #else 33990 + 33991 + -# define SpecialOffset v_u32 (0x82000000) 33992 + -# define SpecialBias v_u32 (0x7f000000) 33993 + - 33994 + static float32x4_t VPCS_ATTR NOINLINE 33995 + special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 33996 + float32x4_t scale, const struct data *d) 33997 + { 33998 + /* 2^n may overflow, break it up into s1*s2. */ 33999 + - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); 34000 + - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); 34001 + + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); 34002 + + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); 34003 + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); 34004 + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); 34005 + float32x4_t r2 = vmulq_f32 (s1, s1); 34006 + + // (s2 + p*s2)*s1 = s2(p+1)s1 34007 + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); 34008 + /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ 34009 + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); 34010 + @@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, 34011 + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) 34012 + { 34013 + const struct data *d = ptr_barrier (&data); 34014 + - float32x4_t n, r, r2, scale, p, q, poly; 34015 + - uint32x4_t cmp, e; 34016 + + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); 34017 + 34018 + #if WANT_SIMD_EXCEPT 34019 + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ 34020 + - cmp = vcgeq_u32 ( 34021 + + uint32x4_t cmp = vcgeq_u32 ( 34022 + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), 34023 + TinyBound), 34024 + SpecialBound); 34025 + @@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) 34026 + 34027 + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 34028 + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 34029 + - n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); 34030 + - r = vfmsq_f32 (x, n, d->ln2_hi); 34031 + - r = vfmsq_f32 (r, n, d->ln2_lo); 34032 + - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); 34033 + - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 34034 + + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); 34035 + + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); 34036 + + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); 34037 + + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); 34038 + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 34039 + 34040 + #if !WANT_SIMD_EXCEPT 34041 + - cmp = vcagtq_f32 (n, d->special_bound); 34042 + + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); 34043 + #endif 34044 + 34045 + - r2 = vmulq_f32 (r, r); 34046 + - p = vfmaq_f32 (C (1), C (0), r); 34047 + - q = vfmaq_f32 (C (3), C (2), r); 34048 + + float32x4_t r2 = vmulq_f32 (r, r); 34049 + + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); 
34050 + + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); 34051 + q = vfmaq_f32 (q, p, r2); 34052 + - p = vmulq_f32 (C (4), r); 34053 + - poly = vfmaq_f32 (p, q, r2); 34054 + + p = vmulq_f32 (d->c4, r); 34055 + + float32x4_t poly = vfmaq_f32 (p, q, r2); 34056 + 34057 + if (__glibc_unlikely (v_any_u32 (cmp))) 34058 + #if WANT_SIMD_EXCEPT 34059 + diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h 34060 + index 08b06e0a6b..eacd2af241 100644 34061 + --- a/sysdeps/aarch64/fpu/v_expf_inline.h 34062 + +++ b/sysdeps/aarch64/fpu/v_expf_inline.h 34063 + @@ -24,50 +24,45 @@ 34064 + 34065 + struct v_expf_data 34066 + { 34067 + - float32x4_t poly[5]; 34068 + - float32x4_t shift; 34069 + - float invln2_and_ln2[4]; 34070 + + float ln2_hi, ln2_lo, c0, c2; 34071 + + float32x4_t inv_ln2, c1, c3, c4; 34072 + + /* asuint(1.0f). */ 34073 + + uint32x4_t exponent_bias; 34074 + }; 34075 + 34076 + /* maxerr: 1.45358 +0.5 ulp. */ 34077 + #define V_EXPF_DATA \ 34078 + { \ 34079 + - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ 34080 + - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ 34081 + - .shift = V4 (0x1.8p23f), \ 34082 + - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ 34083 + + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ 34084 + + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ 34085 + + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ 34086 + + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ 34087 + } 34088 + 34089 + -#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ 34090 + -#define C(i) d->poly[i] 34091 + - 34092 + static inline float32x4_t 34093 + v_expf_inline (float32x4_t x, const struct v_expf_data *d) 34094 + { 34095 + - /* Helper routine for calculating exp(x). 34096 + + /* Helper routine for calculating exp(ax). 
34097 + Copied from v_expf.c, with all special-case handling removed - the 34098 + calling routine should handle special values if required. */ 34099 + 34100 + - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 34101 + - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 34102 + - float32x4_t n, r, z; 34103 + - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); 34104 + - z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); 34105 + - n = vsubq_f32 (z, d->shift); 34106 + - r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); 34107 + - r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); 34108 + - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); 34109 + - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); 34110 + + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 34111 + + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 34112 + + float32x4_t ax = vabsq_f32 (x); 34113 + + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); 34114 + + float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); 34115 + + float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); 34116 + + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); 34117 + + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); 34118 + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); 34119 + 34120 + /* Custom order-4 Estrin avoids building high order monomial. 
*/ 34121 + float32x4_t r2 = vmulq_f32 (r, r); 34122 + - float32x4_t p, q, poly; 34123 + - p = vfmaq_f32 (C (1), C (0), r); 34124 + - q = vfmaq_f32 (C (3), C (2), r); 34125 + + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); 34126 + + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); 34127 + q = vfmaq_f32 (q, p, r2); 34128 + - p = vmulq_f32 (C (4), r); 34129 + - poly = vfmaq_f32 (p, q, r2); 34130 + + p = vmulq_f32 (d->c4, r); 34131 + + float32x4_t poly = vfmaq_f32 (p, q, r2); 34132 + return vfmaq_f32 (scale, poly, scale); 34133 + } 34134 + - 34135 + #endif 34136 + 34137 + commit abfd20ebbd2883f2c6e5f16709f7b9781c3c8068 34138 + Author: Luna Lamb <luna.lamb@arm.com> 34139 + Date: Fri Jan 3 19:00:12 2025 +0000 34140 + 34141 + AArch64: Improve codegen in AdvSIMD asinh 34142 + 34143 + Improves memory access and removes spills. 34144 + Load the polynomial evaluation coefficients into 2 vectors and use lanewise 34145 + MLAs. Reduces MOVs 6->3 , LDR 11->5, STR/STP 2->0, ADRP 3->2. 34146 + 34147 + (cherry picked from commit 140b985e5a2071000122b3cb63ebfe88cf21dd29) 34148 + 34149 + diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c 34150 + index 6207e7da95..2739f98b39 100644 34151 + --- a/sysdeps/aarch64/fpu/asinh_advsimd.c 34152 + +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c 34153 + @@ -20,41 +20,71 @@ 34154 + #include "v_math.h" 34155 + #include "poly_advsimd_f64.h" 34156 + 34157 + -#define A(i) v_f64 (__v_log_data.poly[i]) 34158 + -#define N (1 << V_LOG_TABLE_BITS) 34159 + -#define IndexMask (N - 1) 34160 + - 34161 + const static struct data 34162 + { 34163 + - float64x2_t poly[18]; 34164 + - uint64x2_t off, huge_bound, abs_mask; 34165 + - float64x2_t ln2, tiny_bound; 34166 + + uint64x2_t huge_bound, abs_mask, off, mask; 34167 + +#if WANT_SIMD_EXCEPT 34168 + + float64x2_t tiny_bound; 34169 + +#endif 34170 + + float64x2_t lc0, lc2; 34171 + + double lc1, lc3, ln2, lc4; 34172 + + 34173 + + float64x2_t c0, c2, c4, c6, c8, c10, 
c12, c14, c16, c17; 34174 + + double c1, c3, c5, c7, c9, c11, c13, c15; 34175 + + 34176 + } data = { 34177 + - .off = V2 (0x3fe6900900000000), 34178 + - .ln2 = V2 (0x1.62e42fefa39efp-1), 34179 + - .huge_bound = V2 (0x5fe0000000000000), 34180 + + 34181 + +#if WANT_SIMD_EXCEPT 34182 + .tiny_bound = V2 (0x1p-26), 34183 + - .abs_mask = V2 (0x7fffffffffffffff), 34184 + +#endif 34185 + /* Even terms of polynomial s.t. asinh(x) is approximated by 34186 + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). 34187 + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ 34188 + - .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), 34189 + - V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), 34190 + - V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), 34191 + - V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), 34192 + - V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), 34193 + - V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), 34194 + - V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), 34195 + - V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), 34196 + - V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, 34197 + + 34198 + + .c0 = V2 (-0x1.55555555554a7p-3), 34199 + + .c1 = 0x1.3333333326c7p-4, 34200 + + .c2 = V2 (-0x1.6db6db68332e6p-5), 34201 + + .c3 = 0x1.f1c71b26fb40dp-6, 34202 + + .c4 = V2 (-0x1.6e8b8b654a621p-6), 34203 + + .c5 = 0x1.1c4daa9e67871p-6, 34204 + + .c6 = V2 (-0x1.c9871d10885afp-7), 34205 + + .c7 = 0x1.7a16e8d9d2ecfp-7, 34206 + + .c8 = V2 (-0x1.3ddca533e9f54p-7), 34207 + + .c9 = 0x1.0becef748dafcp-7, 34208 + + .c10 = V2 (-0x1.b90c7099dd397p-8), 34209 + + .c11 = 0x1.541f2bb1ffe51p-8, 34210 + + .c12 = V2 (-0x1.d217026a669ecp-9), 34211 + + .c13 = 0x1.0b5c7977aaf7p-9, 34212 + + .c14 = V2 (-0x1.e0f37daef9127p-11), 34213 + + .c15 = 0x1.388b5fe542a6p-12, 34214 + + .c16 = V2 (-0x1.021a48685e287p-14), 34215 + + .c17 = V2 (0x1.93d4ba83d34dap-18), 34216 + + 34217 + + .lc0 = V2 
(-0x1.ffffffffffff7p-2), 34218 + + .lc1 = 0x1.55555555170d4p-2, 34219 + + .lc2 = V2 (-0x1.0000000399c27p-2), 34220 + + .lc3 = 0x1.999b2e90e94cap-3, 34221 + + .lc4 = -0x1.554e550bd501ep-3, 34222 + + .ln2 = 0x1.62e42fefa39efp-1, 34223 + + 34224 + + .off = V2 (0x3fe6900900000000), 34225 + + .huge_bound = V2 (0x5fe0000000000000), 34226 + + .abs_mask = V2 (0x7fffffffffffffff), 34227 + + .mask = V2 (0xfffULL << 52), 34228 + }; 34229 + 34230 + static float64x2_t NOINLINE VPCS_ATTR 34231 + -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) 34232 + +special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, 34233 + + uint64x2_t special) 34234 + { 34235 + + /* Copy sign. */ 34236 + + y = vbslq_f64 (abs_mask, y, x); 34237 + return v_call_f64 (asinh, x, y, special); 34238 + } 34239 + 34240 + +#define N (1 << V_LOG_TABLE_BITS) 34241 + +#define IndexMask (N - 1) 34242 + + 34243 + struct entry 34244 + { 34245 + float64x2_t invc; 34246 + @@ -76,27 +106,34 @@ lookup (uint64x2_t i) 34247 + } 34248 + 34249 + static inline float64x2_t 34250 + -log_inline (float64x2_t x, const struct data *d) 34251 + +log_inline (float64x2_t xm, const struct data *d) 34252 + { 34253 + - /* Double-precision vector log, copied from ordinary vector log with some 34254 + - cosmetic modification and special-cases removed. 
*/ 34255 + - uint64x2_t ix = vreinterpretq_u64_f64 (x); 34256 + - uint64x2_t tmp = vsubq_u64 (ix, d->off); 34257 + - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); 34258 + - uint64x2_t iz 34259 + - = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); 34260 + + 34261 + + uint64x2_t u = vreinterpretq_u64_f64 (xm); 34262 + + uint64x2_t u_off = vsubq_u64 (u, d->off); 34263 + + 34264 + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); 34265 + + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); 34266 + float64x2_t z = vreinterpretq_f64_u64 (iz); 34267 + - struct entry e = lookup (tmp); 34268 + + 34269 + + struct entry e = lookup (u_off); 34270 + + 34271 + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ 34272 + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); 34273 + float64x2_t kd = vcvtq_f64_s64 (k); 34274 + - float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); 34275 + + 34276 + + /* hi = r + log(c) + k*Ln2. */ 34277 + + float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); 34278 + + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); 34279 + + 34280 + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ 34281 + + float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); 34282 + float64x2_t r2 = vmulq_f64 (r, r); 34283 + - float64x2_t y = vfmaq_f64 (A (2), A (3), r); 34284 + - float64x2_t p = vfmaq_f64 (A (0), A (1), r); 34285 + - y = vfmaq_f64 (y, A (4), r2); 34286 + - y = vfmaq_f64 (p, y, r2); 34287 + - y = vfmaq_f64 (hi, y, r2); 34288 + - return y; 34289 + + float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); 34290 + + float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); 34291 + + y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); 34292 + + y = vfmaq_f64 (p, r2, y); 34293 + + return vfmaq_f64 (hi, y, r2); 34294 + } 34295 + 34296 + /* Double-precision implementation of vector asinh(x). 
34297 + @@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d) 34298 + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 34299 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise 34300 + where log(x) is an optimized log approximation, and P(x) is a polynomial 34301 + - shared with the scalar routine. The greatest observed error 3.29 ULP, in 34302 + + shared with the scalar routine. The greatest observed error 2.79 ULP, in 34303 + |x| >= 1: 34304 + - __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 34305 + - want 0x1.ffffcfd0e2352p-1. */ 34306 + + _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 34307 + + want 0x1.ffffd003219ddp-1. */ 34308 + VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) 34309 + { 34310 + const struct data *d = ptr_barrier (&data); 34311 + - 34312 + float64x2_t ax = vabsq_f64 (x); 34313 + - uint64x2_t iax = vreinterpretq_u64_f64 (ax); 34314 + 34315 + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); 34316 + - uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); 34317 + 34318 + #if WANT_SIMD_EXCEPT 34319 + + uint64x2_t iax = vreinterpretq_u64_f64 (ax); 34320 + + uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); 34321 + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); 34322 + special = vorrq_u64 (special, tiny); 34323 + +#else 34324 + + uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); 34325 + #endif 34326 + 34327 + /* Option 1: |x| >= 1. 34328 + @@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) 34329 + overflow, and tiny lanes, which will underflow, by setting them to 0. They 34330 + will be fixed later, either by selecting x or falling back to the scalar 34331 + special-case. The largest observed error in this region is 1.47 ULPs: 34332 + - __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 34333 + - want 0x1.c1d6bf874019cp-1. */ 34334 + + _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 34335 + + want 0x1.c1d6bf874019cp-1. 
*/ 34336 + float64x2_t option_2 = v_f64 (0); 34337 + + 34338 + if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1)))) 34339 + { 34340 + + 34341 + #if WANT_SIMD_EXCEPT 34342 + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); 34343 + #endif 34344 + - float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), 34345 + - z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), 34346 + - z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); 34347 + - float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); 34348 + - option_2 = vfmaq_f64 (ax, p, x3); 34349 + + float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); 34350 + + /* Order-17 Pairwise Horner scheme. */ 34351 + + float64x2_t c13 = vld1q_f64 (&d->c1); 34352 + + float64x2_t c57 = vld1q_f64 (&d->c5); 34353 + + float64x2_t c911 = vld1q_f64 (&d->c9); 34354 + + float64x2_t c1315 = vld1q_f64 (&d->c13); 34355 + + 34356 + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); 34357 + + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); 34358 + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); 34359 + + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); 34360 + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); 34361 + + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); 34362 + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); 34363 + + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); 34364 + + float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); 34365 + + 34366 + + float64x2_t p = vfmaq_f64 (p1415, z2, p1617); 34367 + + p = vfmaq_f64 (p1213, z2, p); 34368 + + p = vfmaq_f64 (p1011, z2, p); 34369 + + p = vfmaq_f64 (p89, z2, p); 34370 + + 34371 + + p = vfmaq_f64 (p67, z2, p); 34372 + + p = vfmaq_f64 (p45, z2, p); 34373 + + 34374 + + p = vfmaq_f64 (p23, z2, p); 34375 + + 34376 + + p = vfmaq_f64 (p01, z2, p); 34377 + + option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); 34378 + #if WANT_SIMD_EXCEPT 34379 + option_2 = vbslq_f64 (tiny, x, option_2); 34380 + #endif 34381 + @@ 
-167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) 34382 + 34383 + /* Choose the right option for each lane. */ 34384 + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); 34385 + - /* Copy sign. */ 34386 + - y = vbslq_f64 (d->abs_mask, y, x); 34387 + - 34388 + if (__glibc_unlikely (v_any_u64 (special))) 34389 + - return special_case (x, y, special); 34390 + - return y; 34391 + + { 34392 + + return special_case (x, y, d->abs_mask, special); 34393 + + } 34394 + + /* Copy sign. */ 34395 + + return vbslq_f64 (d->abs_mask, y, x); 34396 + } 34397 + 34398 + commit 5f45c0f91eae99b7d49f5c63b900441eb3491213 34399 + Author: Luna Lamb <luna.lamb@arm.com> 34400 + Date: Fri Jan 3 19:02:52 2025 +0000 34401 + 34402 + AArch64: Improve codegen in SVE tans 34403 + 34404 + Improves memory access. 34405 + Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return. 34406 + Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move mov away from return. 34407 + 34408 + (cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc) 34409 + 34410 + diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c 34411 + index b2e4447316..a7318fd417 100644 34412 + --- a/sysdeps/aarch64/fpu/tan_sve.c 34413 + +++ b/sysdeps/aarch64/fpu/tan_sve.c 34414 + @@ -22,24 +22,38 @@ 34415 + 34416 + static const struct data 34417 + { 34418 + - double poly[9]; 34419 + - double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; 34420 + + double c2, c4, c6, c8; 34421 + + double poly_1357[4]; 34422 + + double c0, inv_half_pi; 34423 + + double half_pi_hi, half_pi_lo, range_val; 34424 + } data = { 34425 + /* Polynomial generated with FPMinimax. 
*/ 34426 + - .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, 34427 + - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, 34428 + - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 34429 + - 0x1.4e4fd14147622p-12, }, 34430 + + .c2 = 0x1.ba1ba1bb46414p-5, 34431 + + .c4 = 0x1.226e5e5ecdfa3p-7, 34432 + + .c6 = 0x1.7ea75d05b583ep-10, 34433 + + .c8 = 0x1.4e4fd14147622p-12, 34434 + + .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, 34435 + + 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, 34436 + + .c0 = 0x1.5555555555556p-2, 34437 + + .inv_half_pi = 0x1.45f306dc9c883p-1, 34438 + .half_pi_hi = 0x1.921fb54442d18p0, 34439 + .half_pi_lo = 0x1.1a62633145c07p-54, 34440 + - .inv_half_pi = 0x1.45f306dc9c883p-1, 34441 + .range_val = 0x1p23, 34442 + - .shift = 0x1.8p52, 34443 + }; 34444 + 34445 + static svfloat64_t NOINLINE 34446 + -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) 34447 + +special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, 34448 + + svbool_t special) 34449 + { 34450 + + svbool_t use_recip = svcmpeq ( 34451 + + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); 34452 + + 34453 + + svfloat64_t n = svmad_x (pg, p, p, -1); 34454 + + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); 34455 + + svfloat64_t swap = n; 34456 + + n = svneg_m (n, use_recip, d); 34457 + + d = svsel (use_recip, swap, d); 34458 + + svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); 34459 + return sv_call_f64 (tan, x, y, special); 34460 + } 34461 + 34462 + @@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) 34463 + svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) 34464 + { 34465 + const struct data *dat = ptr_barrier (&data); 34466 + - 34467 + - /* Invert condition to catch NaNs and Infs as well as large values. 
*/ 34468 + - svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); 34469 + - 34470 + + svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); 34471 + /* q = nearest integer to 2 * x / pi. */ 34472 + - svfloat64_t shift = sv_f64 (dat->shift); 34473 + - svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); 34474 + - q = svsub_x (pg, q, shift); 34475 + - svint64_t qi = svcvt_s64_x (pg, q); 34476 + + svfloat64_t q = svmul_lane (x, half_pi_c0, 1); 34477 + + q = svrinta_x (pg, q); 34478 + 34479 + /* Use q to reduce x to r in [-pi/4, pi/4], by: 34480 + r = x - q * pi/2, in extended precision. */ 34481 + @@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) 34482 + r = svmls_lane (r, q, half_pi, 1); 34483 + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle 34484 + formula. */ 34485 + - r = svmul_x (pg, r, 0.5); 34486 + + r = svmul_x (svptrue_b64 (), r, 0.5); 34487 + 34488 + /* Approximate tan(r) using order 8 polynomial. 34489 + tan(x) is odd, so polynomial has the form: 34490 + @@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) 34491 + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... 34492 + Then compute the approximation by: 34493 + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ 34494 + - svfloat64_t r2 = svmul_x (pg, r, r); 34495 + - svfloat64_t r4 = svmul_x (pg, r2, r2); 34496 + - svfloat64_t r8 = svmul_x (pg, r4, r4); 34497 + + 34498 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 34499 + + svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); 34500 + + svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); 34501 + /* Use offset version coeff array by 1 to evaluate from C1 onwards. 
*/ 34502 + - svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); 34503 + - p = svmad_x (pg, p, r2, dat->poly[0]); 34504 + - p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); 34505 + + svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); 34506 + + svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); 34507 + + 34508 + + /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ 34509 + + svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); 34510 + + svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); 34511 + + svfloat64_t p03 = svmla_x (pg, p01, p23, r4); 34512 + + 34513 + + svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); 34514 + + svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); 34515 + + svfloat64_t p47 = svmla_x (pg, p45, p67, r4); 34516 + + 34517 + + svfloat64_t p = svmla_x (pg, p03, p47, r8); 34518 + + 34519 + + svfloat64_t z = svmul_x (svptrue_b64 (), p, r); 34520 + + z = svmul_x (svptrue_b64 (), r2, z); 34521 + + z = svmla_lane (z, r, half_pi_c0, 0); 34522 + + p = svmla_x (pg, r, r2, z); 34523 + 34524 + /* Recombination uses double-angle formula: 34525 + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) 34526 + and reciprocity around pi/2: 34527 + tan(x) = 1 / (tan(pi/2 - x)) 34528 + to assemble result using change-of-sign and conditional selection of 34529 + - numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ 34530 + - svbool_t use_recip 34531 + - = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); 34532 + + numerator/denominator dependent on odd/even-ness of q (quadrant). */ 34533 + + 34534 + + /* Invert condition to catch NaNs and Infs as well as large values. 
*/ 34535 + + svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); 34536 + + 34537 + + if (__glibc_unlikely (svptest_any (pg, special))) 34538 + + { 34539 + + return special_case (x, p, q, pg, special); 34540 + + } 34541 + + svbool_t use_recip = svcmpeq ( 34542 + + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); 34543 + 34544 + svfloat64_t n = svmad_x (pg, p, p, -1); 34545 + - svfloat64_t d = svmul_x (pg, p, 2); 34546 + + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); 34547 + svfloat64_t swap = n; 34548 + n = svneg_m (n, use_recip, d); 34549 + d = svsel (use_recip, swap, d); 34550 + - if (__glibc_unlikely (svptest_any (pg, special))) 34551 + - return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); 34552 + return svdiv_x (pg, n, d); 34553 + } 34554 + diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c 34555 + index f342583241..e850fb4882 100644 34556 + --- a/sysdeps/aarch64/fpu/tanf_sve.c 34557 + +++ b/sysdeps/aarch64/fpu/tanf_sve.c 34558 + @@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) 34559 + { 34560 + const struct data *d = ptr_barrier (&data); 34561 + 34562 + - /* Determine whether input is too large to perform fast regression. */ 34563 + - svbool_t cmp = svacge (pg, x, d->range_val); 34564 + - 34565 + svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); 34566 + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); 34567 + 34568 + /* n = rint(x/(pi/2)). */ 34569 + - svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); 34570 + - svfloat32_t n = svsub_x (pg, q, d->shift); 34571 + + svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); 34572 + /* n is already a signed integer, simply convert it. */ 34573 + svint32_t in = svcvt_s32_x (pg, n); 34574 + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. 
*/ 34575 + svint32_t alt = svand_x (pg, in, 1); 34576 + svbool_t pred_alt = svcmpne (pg, alt, 0); 34577 + - 34578 + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ 34579 + svfloat32_t r; 34580 + r = svmls_lane (x, n, pi_vals, 0); 34581 + @@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) 34582 + 34583 + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], 34584 + using Estrin on z^2. */ 34585 + - svfloat32_t z2 = svmul_x (pg, z, z); 34586 + + svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); 34587 + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); 34588 + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); 34589 + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); 34590 + @@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) 34591 + 34592 + svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); 34593 + 34594 + - /* Transform result back, if necessary. */ 34595 + - svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); 34596 + - 34597 + /* No need to pass pg to specialcase here since cmp is a strict subset, 34598 + guaranteed by the cmpge above. */ 34599 + + 34600 + + /* Determine whether input is too large to perform fast regression. */ 34601 + + svbool_t cmp = svacge (pg, x, d->range_val); 34602 + if (__glibc_unlikely (svptest_any (pg, cmp))) 34603 + - return special_case (x, svsel (pred_alt, inv_y, y), cmp); 34604 + + return special_case (x, svdivr_x (pg, y, 1.0f), cmp); 34605 + 34606 + + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); 34607 + return svsel (pred_alt, inv_y, y); 34608 + } 34609 + 34610 + commit ab5ba6c188159bb5e12be95cd90458924c2fe592 34611 + Author: Yat Long Poon <yatlong.poon@arm.com> 34612 + Date: Fri Jan 3 19:07:30 2025 +0000 34613 + 34614 + AArch64: Improve codegen for SVE logs 34615 + 34616 + Reduce memory access by using lanewise MLA and moving constants to struct 34617 + and reduce number of MOVPRFXs. 
34618 + Update maximum ULP error for double log_sve from 1 to 2. 34619 + Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%). 34620 + 34621 + (cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6) 34622 + 34623 + diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c 34624 + index ab7362128d..f1cad2759a 100644 34625 + --- a/sysdeps/aarch64/fpu/log10_sve.c 34626 + +++ b/sysdeps/aarch64/fpu/log10_sve.c 34627 + @@ -23,28 +23,49 @@ 34628 + #define Min 0x0010000000000000 34629 + #define Max 0x7ff0000000000000 34630 + #define Thres 0x7fe0000000000000 /* Max - Min. */ 34631 + -#define Off 0x3fe6900900000000 34632 + #define N (1 << V_LOG10_TABLE_BITS) 34633 + 34634 + +static const struct data 34635 + +{ 34636 + + double c0, c2; 34637 + + double c1, c3; 34638 + + double invln10, log10_2; 34639 + + double c4; 34640 + + uint64_t off; 34641 + +} data = { 34642 + + .c0 = -0x1.bcb7b1526e506p-3, 34643 + + .c1 = 0x1.287a7636be1d1p-3, 34644 + + .c2 = -0x1.bcb7b158af938p-4, 34645 + + .c3 = 0x1.63c78734e6d07p-4, 34646 + + .c4 = -0x1.287461742fee4p-4, 34647 + + .invln10 = 0x1.bcb7b1526e50ep-2, 34648 + + .log10_2 = 0x1.34413509f79ffp-2, 34649 + + .off = 0x3fe6900900000000, 34650 + +}; 34651 + + 34652 + static svfloat64_t NOINLINE 34653 + -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) 34654 + +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, 34655 + + svbool_t special, const struct data *d) 34656 + { 34657 + - return sv_call_f64 (log10, x, y, special); 34658 + + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); 34659 + + return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); 34660 + } 34661 + 34662 + -/* SVE log10 algorithm. 34663 + +/* Double-precision SVE log10 routine. 34664 + Maximum measured error is 2.46 ulps. 34665 + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 34666 + want 0x1.fffbdf6eaa667p-6. 
*/ 34667 + svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) 34668 + { 34669 + + const struct data *d = ptr_barrier (&data); 34670 + + 34671 + svuint64_t ix = svreinterpret_u64 (x); 34672 + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); 34673 + 34674 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 34675 + The range is split into N subintervals. 34676 + The ith subinterval contains z and c is near its center. */ 34677 + - svuint64_t tmp = svsub_x (pg, ix, Off); 34678 + + svuint64_t tmp = svsub_x (pg, ix, d->off); 34679 + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); 34680 + i = svand_x (pg, i, (N - 1) << 1); 34681 + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); 34682 + @@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) 34683 + svfloat64_t r = svmad_x (pg, invc, z, -1.0); 34684 + 34685 + /* hi = log(c) + k*log(2). */ 34686 + - svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); 34687 + - svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); 34688 + + svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10); 34689 + + svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0); 34690 + + svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1); 34691 + 34692 + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ 34693 + - svfloat64_t r2 = svmul_x (pg, r, r); 34694 + - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); 34695 + + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); 34696 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 34697 + + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); 34698 + + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); 34699 + + y = svmla_x (pg, y, r2, d->c4); 34700 + + y = svmla_x (pg, p, r2, y); 34701 + 34702 + if (__glibc_unlikely (svptest_any (pg, special))) 34703 + - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), 34704 + - special); 34705 + + return special_case (hi, tmp, y, r2, special, d); 34706 + return svmla_x (pg, hi, r2, y); 34707 + } 34708 + diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c 34709 + index 743fa2a913..908e638246 100644 34710 + --- a/sysdeps/aarch64/fpu/log2_sve.c 34711 + +++ b/sysdeps/aarch64/fpu/log2_sve.c 34712 + @@ -21,15 +21,32 @@ 34713 + #include "poly_sve_f64.h" 34714 + 34715 + #define N (1 << V_LOG2_TABLE_BITS) 34716 + -#define Off 0x3fe6900900000000 34717 + #define Max (0x7ff0000000000000) 34718 + #define Min (0x0010000000000000) 34719 + #define Thresh (0x7fe0000000000000) /* Max - Min. 
*/ 34720 + 34721 + +static const struct data 34722 + +{ 34723 + + double c0, c2; 34724 + + double c1, c3; 34725 + + double invln2, c4; 34726 + + uint64_t off; 34727 + +} data = { 34728 + + .c0 = -0x1.71547652b83p-1, 34729 + + .c1 = 0x1.ec709dc340953p-2, 34730 + + .c2 = -0x1.71547651c8f35p-2, 34731 + + .c3 = 0x1.2777ebe12dda5p-2, 34732 + + .c4 = -0x1.ec738d616fe26p-3, 34733 + + .invln2 = 0x1.71547652b82fep0, 34734 + + .off = 0x3fe6900900000000, 34735 + +}; 34736 + + 34737 + static svfloat64_t NOINLINE 34738 + -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) 34739 + +special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, 34740 + + svbool_t special, const struct data *d) 34741 + { 34742 + - return sv_call_f64 (log2, x, y, cmp); 34743 + + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); 34744 + + return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); 34745 + } 34746 + 34747 + /* Double-precision SVE log2 routine. 34748 + @@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) 34749 + want 0x1.fffb34198d9ddp-5. */ 34750 + svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) 34751 + { 34752 + + const struct data *d = ptr_barrier (&data); 34753 + + 34754 + svuint64_t ix = svreinterpret_u64 (x); 34755 + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); 34756 + 34757 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 34758 + The range is split into N subintervals. 34759 + The ith subinterval contains z and c is near its center. 
*/ 34760 + - svuint64_t tmp = svsub_x (pg, ix, Off); 34761 + + svuint64_t tmp = svsub_x (pg, ix, d->off); 34762 + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); 34763 + i = svand_x (pg, i, (N - 1) << 1); 34764 + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); 34765 + @@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) 34766 + 34767 + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ 34768 + 34769 + + svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2); 34770 + svfloat64_t r = svmad_x (pg, invc, z, -1.0); 34771 + - svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); 34772 + - 34773 + - svfloat64_t r2 = svmul_x (pg, r, r); 34774 + - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); 34775 + + svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0); 34776 + w = svadd_x (pg, k, w); 34777 + 34778 + + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); 34779 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 34780 + + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); 34781 + + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); 34782 + + y = svmla_lane_f64 (y, r2, invln2_and_c4, 1); 34783 + + y = svmla_x (pg, p, r2, y); 34784 + + 34785 + if (__glibc_unlikely (svptest_any (pg, special))) 34786 + - return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), 34787 + - special); 34788 + + return special_case (w, tmp, y, r2, special, d); 34789 + return svmla_x (pg, w, r2, y); 34790 + } 34791 + diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c 34792 + index 9b689f2ec7..044223400b 100644 34793 + --- a/sysdeps/aarch64/fpu/log_sve.c 34794 + +++ b/sysdeps/aarch64/fpu/log_sve.c 34795 + @@ -19,39 +19,54 @@ 34796 + 34797 + #include "sv_math.h" 34798 + 34799 + -#define P(i) sv_f64 (__v_log_data.poly[i]) 34800 + #define N (1 << V_LOG_TABLE_BITS) 34801 + -#define Off (0x3fe6900900000000) 
34802 + -#define MaxTop (0x7ff) 34803 + -#define MinTop (0x001) 34804 + -#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ 34805 + +#define Max (0x7ff0000000000000) 34806 + +#define Min (0x0010000000000000) 34807 + +#define Thresh (0x7fe0000000000000) /* Max - Min. */ 34808 + + 34809 + +static const struct data 34810 + +{ 34811 + + double c0, c2; 34812 + + double c1, c3; 34813 + + double ln2, c4; 34814 + + uint64_t off; 34815 + +} data = { 34816 + + .c0 = -0x1.ffffffffffff7p-2, 34817 + + .c1 = 0x1.55555555170d4p-2, 34818 + + .c2 = -0x1.0000000399c27p-2, 34819 + + .c3 = 0x1.999b2e90e94cap-3, 34820 + + .c4 = -0x1.554e550bd501ep-3, 34821 + + .ln2 = 0x1.62e42fefa39efp-1, 34822 + + .off = 0x3fe6900900000000, 34823 + +}; 34824 + 34825 + static svfloat64_t NOINLINE 34826 + -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) 34827 + +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, 34828 + + svbool_t special, const struct data *d) 34829 + { 34830 + - return sv_call_f64 (log, x, y, cmp); 34831 + + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); 34832 + + return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); 34833 + } 34834 + 34835 + -/* SVE port of AdvSIMD log algorithm. 34836 + - Maximum measured error is 2.17 ulp: 34837 + - SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 34838 + - want 0x1.ffffff1cca045p-2. */ 34839 + +/* Double-precision SVE log routine. 34840 + + Maximum measured error is 2.64 ulp: 34841 + + SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 34842 + + want 0x1.fffffffe88cafp+6. 
*/ 34843 + svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) 34844 + { 34845 + + const struct data *d = ptr_barrier (&data); 34846 + + 34847 + svuint64_t ix = svreinterpret_u64 (x); 34848 + - svuint64_t top = svlsr_x (pg, ix, 52); 34849 + - svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); 34850 + + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); 34851 + 34852 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 34853 + The range is split into N subintervals. 34854 + The ith subinterval contains z and c is near its center. */ 34855 + - svuint64_t tmp = svsub_x (pg, ix, Off); 34856 + + svuint64_t tmp = svsub_x (pg, ix, d->off); 34857 + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. 34858 + The actual value of i is double this due to table layout. */ 34859 + svuint64_t i 34860 + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); 34861 + - svint64_t k 34862 + - = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ 34863 + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); 34864 + svfloat64_t z = svreinterpret_f64 (iz); 34865 + /* Lookup in 2 global lists (length N). */ 34866 + @@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) 34867 + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); 34868 + 34869 + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ 34870 + - svfloat64_t r = svmad_x (pg, invc, z, -1); 34871 + - svfloat64_t kd = svcvt_f64_x (pg, k); 34872 + + svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); 34873 + /* hi = r + log(c) + k*Ln2. 
*/ 34874 + - svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); 34875 + + svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); 34876 + + svfloat64_t r = svmad_x (pg, invc, z, -1); 34877 + + svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); 34878 + + hi = svadd_x (pg, r, hi); 34879 + + 34880 + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ 34881 + - svfloat64_t r2 = svmul_x (pg, r, r); 34882 + - svfloat64_t y = svmla_x (pg, P (2), r, P (3)); 34883 + - svfloat64_t p = svmla_x (pg, P (0), r, P (1)); 34884 + - y = svmla_x (pg, y, r2, P (4)); 34885 + + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); 34886 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 34887 + + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); 34888 + + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); 34889 + + y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); 34890 + y = svmla_x (pg, p, r2, y); 34891 + 34892 + - if (__glibc_unlikely (svptest_any (pg, cmp))) 34893 + - return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); 34894 + + if (__glibc_unlikely (svptest_any (pg, special))) 34895 + + return special_case (hi, tmp, y, r2, special, d); 34896 + return svmla_x (pg, hi, r2, y); 34897 + } 34898 + diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps 34899 + index 6c96304611..b76c38dac2 100644 34900 + --- a/sysdeps/aarch64/libm-test-ulps 34901 + +++ b/sysdeps/aarch64/libm-test-ulps 34902 + @@ -1460,7 +1460,7 @@ float: 2 34903 + ldouble: 1 34904 + 34905 + Function: "log_sve": 34906 + -double: 1 34907 + +double: 2 34908 + float: 3 34909 + 34910 + Function: "log_towardzero": 34911 + 34912 + commit aa7c61ea15e27ae14717e065a5d4c50baa472851 34913 + Author: Yat Long Poon <yatlong.poon@arm.com> 34914 + Date: Fri Jan 3 19:09:05 2025 +0000 34915 + 34916 + AArch64: Improve codegen for SVE log1pf users 34917 + 34918 + Reduce memory access by using lanewise MLA and reduce number 
of MOVPRFXs. 34919 + Move log1pf implementation to inline helper function. 34920 + Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%). 34921 + 34922 + (cherry picked from commit 91c1fadba338752bf514cd4cca057b27b1b10eed) 34923 + 34924 + diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c 34925 + index 2110894e62..491365e24d 100644 34926 + --- a/sysdeps/aarch64/fpu/acoshf_sve.c 34927 + +++ b/sysdeps/aarch64/fpu/acoshf_sve.c 34928 + @@ -17,23 +17,26 @@ 34929 + License along with the GNU C Library; if not, see 34930 + <https://www.gnu.org/licenses/>. */ 34931 + 34932 + +#include "sv_math.h" 34933 + +#include "sv_log1pf_inline.h" 34934 + + 34935 + #define One 0x3f800000 34936 + #define Thres 0x20000000 /* asuint(0x1p64) - One. */ 34937 + 34938 + -#include "sv_log1pf_inline.h" 34939 + - 34940 + static svfloat32_t NOINLINE 34941 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 34942 + +special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special) 34943 + { 34944 + + svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f); 34945 + + svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ()); 34946 + return sv_call_f32 (acoshf, x, y, special); 34947 + } 34948 + 34949 + /* Single-precision SVE acosh(x) routine. Implements the same algorithm as 34950 + vector acoshf and log1p. 34951 + 34952 + - Maximum error is 2.78 ULPs: 34953 + - SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 34954 + - want 0x1.f45b3cp-4. */ 34955 + + Maximum error is 2.47 ULPs: 34956 + + SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4 34957 + + want 0x1.e435a2p-4. 
*/ 34958 + svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) 34959 + { 34960 + svuint32_t ix = svreinterpret_u32 (x); 34961 + @@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) 34962 + 34963 + svfloat32_t xm1 = svsub_x (pg, x, 1.0f); 34964 + svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); 34965 + - svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); 34966 + + svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u)); 34967 + 34968 + if (__glibc_unlikely (svptest_any (pg, special))) 34969 + - return special_case (x, y, special); 34970 + - return y; 34971 + + return special_case (xm1, tmp, special); 34972 + + return sv_log1pf_inline (tmp, pg); 34973 + } 34974 + diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c 34975 + index d85c3a685c..b7f253bf32 100644 34976 + --- a/sysdeps/aarch64/fpu/asinhf_sve.c 34977 + +++ b/sysdeps/aarch64/fpu/asinhf_sve.c 34978 + @@ -20,20 +20,23 @@ 34979 + #include "sv_math.h" 34980 + #include "sv_log1pf_inline.h" 34981 + 34982 + -#define BigBound (0x5f800000) /* asuint(0x1p64). */ 34983 + +#define BigBound 0x5f800000 /* asuint(0x1p64). */ 34984 + 34985 + static svfloat32_t NOINLINE 34986 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 34987 + +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special) 34988 + { 34989 + + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); 34990 + + y = svreinterpret_f32 ( 34991 + + svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y))); 34992 + return sv_call_f32 (asinhf, x, y, special); 34993 + } 34994 + 34995 + /* Single-precision SVE asinh(x) routine. Implements the same algorithm as 34996 + vector asinhf and log1p. 34997 + 34998 + - Maximum error is 2.48 ULPs: 34999 + - SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4 35000 + - want 0x1.ffbbb8p-4. 
*/ 35001 + + Maximum error is 1.92 ULPs: 35002 + + SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2 35003 + + want -0x1.fd0bc8p-2. */ 35004 + svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) 35005 + { 35006 + svfloat32_t ax = svabs_x (pg, x); 35007 + @@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) 35008 + = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); 35009 + 35010 + if (__glibc_unlikely (svptest_any (pg, special))) 35011 + - return special_case ( 35012 + - x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), 35013 + - special); 35014 + + return special_case (iax, sign, y, special); 35015 + return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); 35016 + } 35017 + diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c 35018 + index dae83041ef..2d3005bbc8 100644 35019 + --- a/sysdeps/aarch64/fpu/atanhf_sve.c 35020 + +++ b/sysdeps/aarch64/fpu/atanhf_sve.c 35021 + @@ -17,21 +17,25 @@ 35022 + License along with the GNU C Library; if not, see 35023 + <https://www.gnu.org/licenses/>. */ 35024 + 35025 + +#include "sv_math.h" 35026 + #include "sv_log1pf_inline.h" 35027 + 35028 + #define One (0x3f800000) 35029 + #define Half (0x3f000000) 35030 + 35031 + static svfloat32_t NOINLINE 35032 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 35033 + +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign, 35034 + + svfloat32_t y, svbool_t special) 35035 + { 35036 + + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); 35037 + + y = svmul_x (svptrue_b32 (), halfsign, y); 35038 + return sv_call_f32 (atanhf, x, y, special); 35039 + } 35040 + 35041 + /* Approximation for vector single-precision atanh(x) using modified log1p. 35042 + - The maximum error is 2.28 ULP: 35043 + - _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5 35044 + - want 0x1.ffbbb6p-5. 
*/ 35045 + + The maximum error is 1.99 ULP: 35046 + + _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5 35047 + + want 0x1.f1f4f6p-5. */ 35048 + svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) 35049 + { 35050 + svfloat32_t ax = svabs_x (pg, x); 35051 + @@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) 35052 + y = sv_log1pf_inline (y, pg); 35053 + 35054 + if (__glibc_unlikely (svptest_any (pg, special))) 35055 + - return special_case (x, svmul_x (pg, halfsign, y), special); 35056 + + return special_case (iax, sign, halfsign, y, special); 35057 + 35058 + return svmul_x (pg, halfsign, y); 35059 + } 35060 + diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c 35061 + index 5256d5e94c..18a185c838 100644 35062 + --- a/sysdeps/aarch64/fpu/log1pf_sve.c 35063 + +++ b/sysdeps/aarch64/fpu/log1pf_sve.c 35064 + @@ -18,30 +18,13 @@ 35065 + <https://www.gnu.org/licenses/>. */ 35066 + 35067 + #include "sv_math.h" 35068 + -#include "poly_sve_f32.h" 35069 + - 35070 + -static const struct data 35071 + -{ 35072 + - float poly[8]; 35073 + - float ln2, exp_bias; 35074 + - uint32_t four, three_quarters; 35075 + -} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as 35076 + - this can be fmov-ed directly instead of including it in 35077 + - the main load-and-mla polynomial schedule. 
*/ 35078 + - 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, 35079 + - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 35080 + - 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, 35081 + - .ln2 = 0x1.62e43p-1f, 35082 + - .exp_bias = 0x1p-23f, 35083 + - .four = 0x40800000, 35084 + - .three_quarters = 0x3f400000}; 35085 + - 35086 + -#define SignExponentMask 0xff800000 35087 + +#include "sv_log1pf_inline.h" 35088 + 35089 + static svfloat32_t NOINLINE 35090 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 35091 + +special_case (svfloat32_t x, svbool_t special) 35092 + { 35093 + - return sv_call_f32 (log1pf, x, y, special); 35094 + + return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()), 35095 + + special); 35096 + } 35097 + 35098 + /* Vector log1pf approximation using polynomial on reduced interval. Worst-case 35099 + @@ -50,53 +33,14 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 35100 + want 0x1.9f323ep-2. */ 35101 + svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) 35102 + { 35103 + - const struct data *d = ptr_barrier (&data); 35104 + /* x < -1, Inf/Nan. */ 35105 + svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); 35106 + special = svorn_z (pg, special, svcmpge (pg, x, -1)); 35107 + 35108 + - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m 35109 + - is in [-0.25, 0.5]): 35110 + - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). 35111 + - 35112 + - We approximate log1p(m) with a polynomial, then scale by 35113 + - k*log(2). Instead of doing this directly, we use an intermediate 35114 + - scale factor s = 4*k*log(2) to ensure the scale is representable 35115 + - as a normalised fp32 number. */ 35116 + - svfloat32_t m = svadd_x (pg, x, 1); 35117 + - 35118 + - /* Choose k to scale x to the range [-1/4, 1/2]. 
*/ 35119 + - svint32_t k 35120 + - = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), 35121 + - sv_s32 (SignExponentMask)); 35122 + - 35123 + - /* Scale x by exponent manipulation. */ 35124 + - svfloat32_t m_scale = svreinterpret_f32 ( 35125 + - svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); 35126 + - 35127 + - /* Scale up to ensure that the scale factor is representable as normalised 35128 + - fp32 number, and scale m down accordingly. */ 35129 + - svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); 35130 + - m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); 35131 + - 35132 + - /* Evaluate polynomial on reduced interval. */ 35133 + - svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), 35134 + - ms4 = svmul_x (pg, ms2, ms2); 35135 + - svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); 35136 + - p = svmad_x (pg, m_scale, p, -0.5); 35137 + - p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); 35138 + - 35139 + - /* The scale factor to be applied back at the end - by multiplying float(k) 35140 + - by 2^-23 we get the unbiased exponent of k. */ 35141 + - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); 35142 + - 35143 + - /* Apply the scaling back. 
*/ 35144 + - svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); 35145 + - 35146 + if (__glibc_unlikely (svptest_any (pg, special))) 35147 + - return special_case (x, y, special); 35148 + + return special_case (x, special); 35149 + 35150 + - return y; 35151 + + return sv_log1pf_inline (x, pg); 35152 + } 35153 + 35154 + strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) 35155 + diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h 35156 + index b94b2da055..850297d615 100644 35157 + --- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h 35158 + +++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h 35159 + @@ -22,55 +22,76 @@ 35160 + 35161 + #include "sv_math.h" 35162 + #include "vecmath_config.h" 35163 + -#include "poly_sve_f32.h" 35164 + + 35165 + +#define SignExponentMask 0xff800000 35166 + 35167 + static const struct sv_log1pf_data 35168 + { 35169 + - float32_t poly[9]; 35170 + - float32_t ln2; 35171 + - float32_t scale_back; 35172 + + float c0, c2, c4, c6; 35173 + + float c1, c3, c5, c7; 35174 + + float ln2, exp_bias, quarter; 35175 + + uint32_t four, three_quarters; 35176 + } sv_log1pf_data = { 35177 + - /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */ 35178 + - .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, 35179 + - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, 35180 + - -0x1.6f0d5ep-5f }, 35181 + - .scale_back = 0x1.0p-23f, 35182 + - .ln2 = 0x1.62e43p-1f, 35183 + + /* Do not store first term of polynomial, which is -0.5, as 35184 + + this can be fmov-ed directly instead of including it in 35185 + + the main load-and-mla polynomial schedule. 
*/ 35186 + + .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f, 35187 + + .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f, 35188 + + .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f, 35189 + + .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000, 35190 + + .three_quarters = 0x3f400000, 35191 + }; 35192 + 35193 + -static inline svfloat32_t 35194 + -eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) 35195 + -{ 35196 + - svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); 35197 + - svfloat32_t m2 = svmul_x (pg, m, m); 35198 + - svfloat32_t q = svmla_x (pg, m, m2, p_12); 35199 + - svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); 35200 + - p = svmul_x (pg, m2, p); 35201 + - 35202 + - return svmla_x (pg, q, m2, p); 35203 + -} 35204 + - 35205 + static inline svfloat32_t 35206 + sv_log1pf_inline (svfloat32_t x, svbool_t pg) 35207 + { 35208 + const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); 35209 + 35210 + - svfloat32_t m = svadd_x (pg, x, 1.0f); 35211 + - 35212 + - svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), 35213 + - svreinterpret_s32 (svdup_f32 (0.75f))); 35214 + - ks = svand_x (pg, ks, 0xff800000); 35215 + - svuint32_t k = svreinterpret_u32 (ks); 35216 + - svfloat32_t s = svreinterpret_f32 ( 35217 + - svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); 35218 + - 35219 + - svfloat32_t m_scale 35220 + - = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); 35221 + - m_scale 35222 + - = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); 35223 + - svfloat32_t p = eval_poly (m_scale, d->poly, pg); 35224 + - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); 35225 + - return svmla_x (pg, p, scale_back, d->ln2); 35226 + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m 35227 + + is in [-0.25, 0.5]): 35228 + + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). 
35229 + + 35230 + + We approximate log1p(m) with a polynomial, then scale by 35231 + + k*log(2). Instead of doing this directly, we use an intermediate 35232 + + scale factor s = 4*k*log(2) to ensure the scale is representable 35233 + + as a normalised fp32 number. */ 35234 + + svfloat32_t m = svadd_x (pg, x, 1); 35235 + + 35236 + + /* Choose k to scale x to the range [-1/4, 1/2]. */ 35237 + + svint32_t k 35238 + + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), 35239 + + sv_s32 (SignExponentMask)); 35240 + + 35241 + + /* Scale x by exponent manipulation. */ 35242 + + svfloat32_t m_scale = svreinterpret_f32 ( 35243 + + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); 35244 + + 35245 + + /* Scale up to ensure that the scale factor is representable as normalised 35246 + + fp32 number, and scale m down accordingly. */ 35247 + + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); 35248 + + svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2); 35249 + + m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2)); 35250 + + 35251 + + /* Evaluate polynomial on reduced interval. 
*/ 35252 + + svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale); 35253 + + 35254 + + svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1); 35255 + + svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0); 35256 + + svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1); 35257 + + svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2); 35258 + + svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3); 35259 + + 35260 + + svfloat32_t p = svmla_x (pg, p45, p67, ms2); 35261 + + p = svmla_x (pg, p23, p, ms2); 35262 + + p = svmla_x (pg, p01, p, ms2); 35263 + + 35264 + + p = svmad_x (pg, m_scale, p, -0.5); 35265 + + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); 35266 + + 35267 + + /* The scale factor to be applied back at the end - by multiplying float(k) 35268 + + by 2^-23 we get the unbiased exponent of k. */ 35269 + + svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1); 35270 + + return svmla_lane_f32 (p, scale_back, fconst, 0); 35271 + } 35272 + 35273 + #endif 35274 + 35275 + commit d983f14c304df2d880c7b01e904e4a889064b9b3 35276 + Author: Luna Lamb <luna.lamb@arm.com> 35277 + Date: Fri Jan 3 20:15:17 2025 +0000 35278 + 35279 + AArch64: Improve codegen in SVE expm1f and users 35280 + 35281 + Use unpredicated muls, use absolute compare and improve memory access. 35282 + Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput 35283 + microbenchmark on Neoverse V1. 35284 + 35285 + (cherry picked from commit f86b4cf87581cf1e45702b07880679ffa0b1f47a) 35286 + 35287 + diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c 35288 + index 7c852125cd..05a66400d4 100644 35289 + --- a/sysdeps/aarch64/fpu/expm1f_sve.c 35290 + +++ b/sysdeps/aarch64/fpu/expm1f_sve.c 35291 + @@ -18,7 +18,6 @@ 35292 + <https://www.gnu.org/licenses/>. 
*/ 35293 + 35294 + #include "sv_math.h" 35295 + -#include "poly_sve_f32.h" 35296 + 35297 + /* Largest value of x for which expm1(x) should round to -1. */ 35298 + #define SpecialBound 0x1.5ebc4p+6f 35299 + @@ -28,20 +27,17 @@ static const struct data 35300 + /* These 4 are grouped together so they can be loaded as one quadword, then 35301 + used with _lane forms of svmla/svmls. */ 35302 + float c2, c4, ln2_hi, ln2_lo; 35303 + - float c0, c1, c3, inv_ln2, special_bound, shift; 35304 + + float c0, inv_ln2, c1, c3, special_bound; 35305 + } data = { 35306 + /* Generated using fpminimax. */ 35307 + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, 35308 + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, 35309 + - .c4 = 0x1.6b55a2p-10, 35310 + + .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f, 35311 + + .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f, 35312 + + .ln2_hi = 0x1.62e4p-1f, 35313 + 35314 + - .special_bound = SpecialBound, .shift = 0x1.8p23f, 35315 + - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, 35316 + - .ln2_lo = 0x1.7f7d1cp-20f, 35317 + }; 35318 + 35319 + -#define C(i) sv_f32 (d->c##i) 35320 + - 35321 + static svfloat32_t NOINLINE 35322 + special_case (svfloat32_t x, svbool_t pg) 35323 + { 35324 + @@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) 35325 + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. 35326 + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 35327 + where 2^i is exact because i is an integer. */ 35328 + - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); 35329 + - j = svsub_x (pg, j, d->shift); 35330 + - svint32_t i = svcvt_s32_x (pg, j); 35331 + + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); 35332 + + j = svrinta_x (pg, j); 35333 + 35334 + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); 35335 + f = svmls_lane (f, j, lane_constants, 3); 35336 + @@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) 35337 + x + ax^2 + bx^3 + cx^4 .... 
35338 + So we calculate the polynomial P(f) = a + bf + cf^2 + ... 35339 + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ 35340 + - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); 35341 + - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); 35342 + - svfloat32_t f2 = svmul_x (pg, f, f); 35343 + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); 35344 + + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); 35345 + + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); 35346 + svfloat32_t p = svmla_x (pg, p12, f2, p34); 35347 + - p = svmla_x (pg, C (0), f, p); 35348 + + 35349 + + p = svmla_x (pg, sv_f32 (d->c0), f, p); 35350 + p = svmla_x (pg, f, f2, p); 35351 + 35352 + /* Assemble the result. 35353 + expm1(x) ~= 2^i * (p + 1) - 1 35354 + Let t = 2^i. */ 35355 + - svfloat32_t t = svreinterpret_f32 ( 35356 + - svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); 35357 + - return svmla_x (pg, svsub_x (pg, t, 1), p, t); 35358 + + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); 35359 + + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); 35360 + } 35361 + diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c 35362 + index 6c204b57a2..50dd386774 100644 35363 + --- a/sysdeps/aarch64/fpu/sinhf_sve.c 35364 + +++ b/sysdeps/aarch64/fpu/sinhf_sve.c 35365 + @@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg) 35366 + if (__glibc_unlikely (svptest_any (pg, special))) 35367 + return special_case (x, svmul_x (pg, t, halfsign), special); 35368 + 35369 + - return svmul_x (pg, t, halfsign); 35370 + + return svmul_x (svptrue_b32 (), t, halfsign); 35371 + } 35372 + diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h 35373 + index 5b72451222..e46ddda543 100644 35374 + --- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h 35375 + +++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h 35376 + @@ -27,21 +27,18 @@ 
struct sv_expm1f_data 35377 + /* These 4 are grouped together so they can be loaded as one quadword, then 35378 + used with _lane forms of svmla/svmls. */ 35379 + float32_t c2, c4, ln2_hi, ln2_lo; 35380 + - float32_t c0, c1, c3, inv_ln2, shift; 35381 + + float c0, inv_ln2, c1, c3, special_bound; 35382 + }; 35383 + 35384 + /* Coefficients generated using fpminimax. */ 35385 + #define SV_EXPM1F_DATA \ 35386 + { \ 35387 + - .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ 35388 + - .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ 35389 + + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \ 35390 + + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \ 35391 + \ 35392 + - .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ 35393 + - .ln2_lo = 0x1.7f7d1cp-20f, \ 35394 + + .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \ 35395 + } 35396 + 35397 + -#define C(i) sv_f32 (d->c##i) 35398 + - 35399 + static inline svfloat32_t 35400 + expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) 35401 + { 35402 + @@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) 35403 + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. 35404 + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 35405 + where 2^i is exact because i is an integer. */ 35406 + - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); 35407 + - j = svsub_x (pg, j, d->shift); 35408 + - svint32_t i = svcvt_s32_x (pg, j); 35409 + + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); 35410 + + j = svrinta_x (pg, j); 35411 + 35412 + svfloat32_t f = svmls_lane (x, j, lane_constants, 2); 35413 + f = svmls_lane (f, j, lane_constants, 3); 35414 + @@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) 35415 + x + ax^2 + bx^3 + cx^4 .... 35416 + So we calculate the polynomial P(f) = a + bf + cf^2 + ... 
35417 + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ 35418 + - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); 35419 + - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); 35420 + - svfloat32_t f2 = svmul_x (pg, f, f); 35421 + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); 35422 + + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); 35423 + + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); 35424 + svfloat32_t p = svmla_x (pg, p12, f2, p34); 35425 + - p = svmla_x (pg, C (0), f, p); 35426 + + p = svmla_x (pg, sv_f32 (d->c0), f, p); 35427 + p = svmla_x (pg, f, f2, p); 35428 + 35429 + /* Assemble the result. 35430 + expm1(x) ~= 2^i * (p + 1) - 1 35431 + Let t = 2^i. */ 35432 + - svfloat32_t t = svscale_x (pg, sv_f32 (1), i); 35433 + - return svmla_x (pg, svsub_x (pg, t, 1), p, t); 35434 + + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); 35435 + + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); 35436 + } 35437 + 35438 + #endif 35439 + diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c 35440 + index 0b94523cf5..80dd679346 100644 35441 + --- a/sysdeps/aarch64/fpu/tanhf_sve.c 35442 + +++ b/sysdeps/aarch64/fpu/tanhf_sve.c 35443 + @@ -19,20 +19,27 @@ 35444 + 35445 + #include "sv_expm1f_inline.h" 35446 + 35447 + +/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */ 35448 + +#define BoringBound 0x1.205966p+3f 35449 + + 35450 + static const struct data 35451 + { 35452 + struct sv_expm1f_data expm1f_consts; 35453 + - uint32_t boring_bound, onef; 35454 + + uint32_t onef, special_bound; 35455 + + float boring_bound; 35456 + } data = { 35457 + .expm1f_consts = SV_EXPM1F_DATA, 35458 + - /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ 35459 + - .boring_bound = 0x41102cb3, 35460 + .onef = 0x3f800000, 35461 + + .special_bound = 0x7f800000, 35462 + + .boring_bound = BoringBound, 35463 + }; 35464 + 35465 + static svfloat32_t NOINLINE 35466 + -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) 35467 + +special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring, 35468 + + svfloat32_t boring, svfloat32_t q, svbool_t special) 35469 + { 35470 + + svfloat32_t y 35471 + + = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0))); 35472 + return sv_call_f32 (tanhf, x, y, special); 35473 + } 35474 + 35475 + @@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) 35476 + svfloat32_t ax = svabs_x (pg, x); 35477 + svuint32_t iax = svreinterpret_u32 (ax); 35478 + svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); 35479 + - svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); 35480 + svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); 35481 + - 35482 + - svbool_t special = svcmpgt (pg, iax, 0x7f800000); 35483 + + svbool_t special = svcmpgt (pg, iax, d->special_bound); 35484 + + svbool_t is_boring = svacgt (pg, x, d->boring_bound); 35485 + 35486 + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ 35487 + - svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts); 35488 + - svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); 35489 + + svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg, 35490 + + &d->expm1f_consts); 35491 + + 35492 + if (__glibc_unlikely (svptest_any (pg, special))) 35493 + - return special_case (x, svsel_f32 (is_boring, boring, y), special); 35494 + + return special_case (x, pg, is_boring, boring, q, special); 35495 + + svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); 35496 + return svsel_f32 (is_boring, boring, y); 35497 + } 35498 + 35499 + commit 0ff6a9ff79bca9384ce4ba20e8942d39cc377a14 35500 + Author: Luna Lamb <luna.lamb@arm.com> 35501 + Date: Thu Feb 13 17:52:09 2025 +0000 35502 + 35503 + Aarch64: Improve codegen in SVE asinh 35504 + 35505 + Use unpredicated muls, use lanewise mla's and improve memory access. 35506 + 1% regression in throughput microbenchmark on Neoverse V1. 35507 + 35508 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 35509 + (cherry picked from commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4) 35510 + 35511 + diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c 35512 + index 28dc5c4587..fe8715e06c 100644 35513 + --- a/sysdeps/aarch64/fpu/asinh_sve.c 35514 + +++ b/sysdeps/aarch64/fpu/asinh_sve.c 35515 + @@ -18,36 +18,49 @@ 35516 + <https://www.gnu.org/licenses/>. */ 35517 + 35518 + #include "sv_math.h" 35519 + -#include "poly_sve_f64.h" 35520 + 35521 + #define SignMask (0x8000000000000000) 35522 + #define One (0x3ff0000000000000) 35523 + #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). 
*/ 35524 + +#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) 35525 + 35526 + static const struct data 35527 + { 35528 + - double poly[18]; 35529 + - double ln2, p3, p1, p4, p0, p2; 35530 + - uint64_t n; 35531 + - uint64_t off; 35532 + + double even_coeffs[9]; 35533 + + double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; 35534 + + uint64_t off, mask; 35535 + 35536 + } data = { 35537 + - /* Polynomial generated using Remez on [2^-26, 1]. */ 35538 + - .poly 35539 + - = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, 35540 + - 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, 35541 + - -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, 35542 + - 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, 35543 + - -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, 35544 + - 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 }, 35545 + + /* Polynomial generated using Remez on [2^-26, 1]. 
*/ 35546 + + .even_coeffs ={ 35547 + + -0x1.55555555554a7p-3, 35548 + + -0x1.6db6db68332e6p-5, 35549 + + -0x1.6e8b8b654a621p-6, 35550 + + -0x1.c9871d10885afp-7, 35551 + + -0x1.3ddca533e9f54p-7, 35552 + + -0x1.b90c7099dd397p-8, 35553 + + -0x1.d217026a669ecp-9, 35554 + + -0x1.e0f37daef9127p-11, 35555 + + -0x1.021a48685e287p-14, }, 35556 + + 35557 + + .c1 = 0x1.3333333326c7p-4, 35558 + + .c3 = 0x1.f1c71b26fb40dp-6, 35559 + + .c5 = 0x1.1c4daa9e67871p-6, 35560 + + .c7 = 0x1.7a16e8d9d2ecfp-7, 35561 + + .c9 = 0x1.0becef748dafcp-7, 35562 + + .c11 = 0x1.541f2bb1ffe51p-8, 35563 + + .c13 = 0x1.0b5c7977aaf7p-9, 35564 + + .c15 = 0x1.388b5fe542a6p-12, 35565 + + .c17 = 0x1.93d4ba83d34dap-18, 35566 + + 35567 + .ln2 = 0x1.62e42fefa39efp-1, 35568 + .p0 = -0x1.ffffffffffff7p-2, 35569 + .p1 = 0x1.55555555170d4p-2, 35570 + .p2 = -0x1.0000000399c27p-2, 35571 + .p3 = 0x1.999b2e90e94cap-3, 35572 + .p4 = -0x1.554e550bd501ep-3, 35573 + - .n = 1 << V_LOG_TABLE_BITS, 35574 + - .off = 0x3fe6900900000000 35575 + + .off = 0x3fe6900900000000, 35576 + + .mask = 0xfffULL << 52, 35577 + }; 35578 + 35579 + static svfloat64_t NOINLINE 35580 + @@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) 35581 + of the algorithm used. 
*/ 35582 + 35583 + svuint64_t ix = svreinterpret_u64 (x); 35584 + - svuint64_t tmp = svsub_x (pg, ix, d->off); 35585 + - svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), 35586 + - (d->n - 1) << 1); 35587 + - svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); 35588 + - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); 35589 + + svuint64_t i_off = svsub_x (pg, ix, d->off); 35590 + + svuint64_t i 35591 + + = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask); 35592 + + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask)); 35593 + svfloat64_t z = svreinterpret_f64 (iz); 35594 + 35595 + svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); 35596 + @@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) 35597 + svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1); 35598 + 35599 + svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); 35600 + - svfloat64_t kd = svcvt_f64_x (pg, k); 35601 + + svfloat64_t kd 35602 + + = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52)); 35603 + 35604 + svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0); 35605 + - svfloat64_t r2 = svmul_x (pg, r, r); 35606 + - 35607 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 35608 + svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1); 35609 + - 35610 + svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0); 35611 + + 35612 + y = svmla_lane (y, r2, p1_p4, 1); 35613 + y = svmla_x (pg, p, r2, y); 35614 + y = svmla_x (pg, hi, r2, y); 35615 + @@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) 35616 + svuint64_t iax = svbic_x (pg, ix, SignMask); 35617 + svuint64_t sign = svand_x (pg, ix, SignMask); 35618 + svfloat64_t ax = svreinterpret_f64 (iax); 35619 + - 35620 + svbool_t ge1 = svcmpge (pg, iax, One); 35621 + svbool_t special = svcmpge (pg, iax, Thres); 35622 + 35623 + @@ -120,7 +131,7 @@ svfloat64_t 
SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) 35624 + svfloat64_t option_1 = sv_f64 (0); 35625 + if (__glibc_likely (svptest_any (pg, ge1))) 35626 + { 35627 + - svfloat64_t x2 = svmul_x (pg, ax, ax); 35628 + + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); 35629 + option_1 = __sv_log_inline ( 35630 + svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg); 35631 + } 35632 + @@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) 35633 + The largest observed error in this region is 1.51 ULPs: 35634 + _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 35635 + want 0x1.c1e649ee2681dp-1. */ 35636 + + 35637 + svfloat64_t option_2 = sv_f64 (0); 35638 + if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1)))) 35639 + { 35640 + - svfloat64_t x2 = svmul_x (pg, ax, ax); 35641 + - svfloat64_t x4 = svmul_x (pg, x2, x2); 35642 + - svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly); 35643 + - option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); 35644 + + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); 35645 + + svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2); 35646 + + /* Order-17 Pairwise Horner scheme. 
*/ 35647 + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); 35648 + + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); 35649 + + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); 35650 + + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); 35651 + + 35652 + + svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); 35653 + + svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); 35654 + + svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); 35655 + + svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); 35656 + + svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); 35657 + + svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); 35658 + + svfloat64_t p1213 35659 + + = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); 35660 + + svfloat64_t p1415 35661 + + = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); 35662 + + svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); 35663 + + 35664 + + svfloat64_t p = svmla_x (pg, p1415, x4, p1617); 35665 + + p = svmla_x (pg, p1213, x4, p); 35666 + + p = svmla_x (pg, p1011, x4, p); 35667 + + p = svmla_x (pg, p89, x4, p); 35668 + + 35669 + + p = svmla_x (pg, p67, x4, p); 35670 + + p = svmla_x (pg, p45, x4, p); 35671 + + 35672 + + p = svmla_x (pg, p23, x4, p); 35673 + + 35674 + + p = svmla_x (pg, p01, x4, p); 35675 + + 35676 + + option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); 35677 + } 35678 + 35679 + - /* Choose the right option for each lane. 
*/ 35680 + - svfloat64_t y = svsel (ge1, option_1, option_2); 35681 + - 35682 + if (__glibc_unlikely (svptest_any (pg, special))) 35683 + return special_case ( 35684 + - x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)), 35685 + + x, 35686 + + svreinterpret_f64 (sveor_x ( 35687 + + pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), 35688 + special); 35689 + + 35690 + + /* Choose the right option for each lane. */ 35691 + + svfloat64_t y = svsel (ge1, option_1, option_2); 35692 + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); 35693 + } 35694 + 35695 + commit 4b0bb84eb7e52a135c873fd9d0fc6c30599aedf4 35696 + Author: Luna Lamb <luna.lamb@arm.com> 35697 + Date: Thu Feb 13 17:54:46 2025 +0000 35698 + 35699 + Aarch64: Improve codegen in SVE exp and users, and update expf_inline 35700 + 35701 + Use unpredicted muls, and improve memory access. 35702 + 7%, 3% and 1% improvement in throughput microbenchmark on Neoverse V1, 35703 + for exp, exp2 and cosh respectively. 35704 + 35705 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 35706 + (cherry picked from commit c0ff447edf19bd4630fe79adf5e8b896405b059f) 35707 + 35708 + diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c 35709 + index 919f34604a..e375dd8a34 100644 35710 + --- a/sysdeps/aarch64/fpu/cosh_sve.c 35711 + +++ b/sysdeps/aarch64/fpu/cosh_sve.c 35712 + @@ -23,7 +23,7 @@ static const struct data 35713 + { 35714 + float64_t poly[3]; 35715 + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; 35716 + - uint64_t index_mask, special_bound; 35717 + + uint64_t special_bound; 35718 + } data = { 35719 + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, 35720 + 0x1.5555576a59599p-5, }, 35721 + @@ -35,14 +35,16 @@ static const struct data 35722 + .shift = 0x1.8p+52, 35723 + .thres = 704.0, 35724 + 35725 + - .index_mask = 0xff, 35726 + /* 0x1.6p9, above which exp overflows. 
*/ 35727 + .special_bound = 0x4086000000000000, 35728 + }; 35729 + 35730 + static svfloat64_t NOINLINE 35731 + -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) 35732 + +special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) 35733 + { 35734 + + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); 35735 + + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); 35736 + + svfloat64_t y = svadd_x (pg, half_t, half_over_t); 35737 + return sv_call_f64 (cosh, x, y, special); 35738 + } 35739 + 35740 + @@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) 35741 + 35742 + svuint64_t u = svreinterpret_u64 (z); 35743 + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); 35744 + - svuint64_t i = svand_x (pg, u, d->index_mask); 35745 + + svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); 35746 + 35747 + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); 35748 + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); 35749 + y = svmla_x (pg, sv_f64 (1.0), r, y); 35750 + - y = svmul_x (pg, r, y); 35751 + + y = svmul_x (svptrue_b64 (), r, y); 35752 + 35753 + /* s = 2^(n/N). */ 35754 + u = svld1_gather_index (pg, __v_exp_tail_data, i); 35755 + @@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) 35756 + /* Up to the point that exp overflows, we can use it to calculate cosh by 35757 + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ 35758 + svfloat64_t t = exp_inline (ax, pg, d); 35759 + - svfloat64_t half_t = svmul_x (pg, t, 0.5); 35760 + - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); 35761 + 35762 + /* Fall back to scalar for any special cases. 
*/ 35763 + if (__glibc_unlikely (svptest_any (pg, special))) 35764 + - return special_case (x, svadd_x (pg, half_t, half_over_t), special); 35765 + + return special_case (x, pg, t, special); 35766 + 35767 + + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); 35768 + + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); 35769 + return svadd_x (pg, half_t, half_over_t); 35770 + } 35771 + diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c 35772 + index ddf64708cb..bfd3fb9e19 100644 35773 + --- a/sysdeps/aarch64/fpu/exp10_sve.c 35774 + +++ b/sysdeps/aarch64/fpu/exp10_sve.c 35775 + @@ -18,21 +18,23 @@ 35776 + <https://www.gnu.org/licenses/>. */ 35777 + 35778 + #include "sv_math.h" 35779 + -#include "poly_sve_f64.h" 35780 + 35781 + #define SpecialBound 307.0 /* floor (log10 (2^1023)). */ 35782 + 35783 + static const struct data 35784 + { 35785 + - double poly[5]; 35786 + + double c1, c3, c2, c4, c0; 35787 + double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; 35788 + } data = { 35789 + /* Coefficients generated using Remez algorithm. 35790 + rel error: 0x1.9fcb9b3p-60 35791 + abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] 35792 + max ulp err 0.52 +0.5. */ 35793 + - .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, 35794 + - 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, 35795 + + .c0 = 0x1.26bb1bbb55516p1, 35796 + + .c1 = 0x1.53524c73cd32ap1, 35797 + + .c2 = 0x1.0470591daeafbp1, 35798 + + .c3 = 0x1.2bd77b1361ef6p0, 35799 + + .c4 = 0x1.142b5d54e9621p-1, 35800 + /* 1.5*2^46+1023. This value is further explained below. */ 35801 + .shift = 0x1.800000000ffc0p+46, 35802 + .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ 35803 + @@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, 35804 + /* |n| > 1280 => 2^(n) overflows. 
*/ 35805 + svbool_t p_cmp = svacgt (pg, n, d->scale_thres); 35806 + 35807 + - svfloat64_t r1 = svmul_x (pg, s1, s1); 35808 + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); 35809 + svfloat64_t r2 = svmla_x (pg, s2, s2, y); 35810 + - svfloat64_t r0 = svmul_x (pg, r2, s1); 35811 + + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); 35812 + 35813 + return svsel (p_cmp, r1, r0); 35814 + } 35815 + @@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) 35816 + comes at significant performance cost. */ 35817 + svuint64_t u = svreinterpret_u64 (z); 35818 + svfloat64_t scale = svexpa (u); 35819 + - 35820 + + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); 35821 + /* Approximate exp10(r) using polynomial. */ 35822 + - svfloat64_t r2 = svmul_x (pg, r, r); 35823 + - svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2, 35824 + - sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1)); 35825 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 35826 + + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); 35827 + + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); 35828 + + svfloat64_t p14 = svmla_x (pg, p12, p34, r2); 35829 + + 35830 + + svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); 35831 + 35832 + /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound 35833 + multiplication may overflow, so use special case routine. */ 35834 + diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c 35835 + index 22848ebfa5..5dfb77cdbc 100644 35836 + --- a/sysdeps/aarch64/fpu/exp2_sve.c 35837 + +++ b/sysdeps/aarch64/fpu/exp2_sve.c 35838 + @@ -18,7 +18,6 @@ 35839 + <https://www.gnu.org/licenses/>. 
*/ 35840 + 35841 + #include "sv_math.h" 35842 + -#include "poly_sve_f64.h" 35843 + 35844 + #define N (1 << V_EXP_TABLE_BITS) 35845 + 35846 + @@ -27,15 +26,15 @@ 35847 + 35848 + static const struct data 35849 + { 35850 + - double poly[4]; 35851 + + double c0, c2; 35852 + + double c1, c3; 35853 + double shift, big_bound, uoflow_bound; 35854 + } data = { 35855 + /* Coefficients are computed using Remez algorithm with 35856 + minimisation of the absolute error. */ 35857 + - .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, 35858 + - 0x1.3b2abf5571ad8p-7 }, 35859 + - .shift = 0x1.8p52 / N, 35860 + - .uoflow_bound = UOFlowBound, 35861 + + .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, 35862 + + .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, 35863 + + .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, 35864 + .big_bound = BigBound, 35865 + }; 35866 + 35867 + @@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, 35868 + /* |n| > 1280 => 2^(n) overflows. */ 35869 + svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); 35870 + 35871 + - svfloat64_t r1 = svmul_x (pg, s1, s1); 35872 + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); 35873 + svfloat64_t r2 = svmla_x (pg, s2, s2, y); 35874 + - svfloat64_t r0 = svmul_x (pg, r2, s1); 35875 + + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); 35876 + 35877 + return svsel (p_cmp, r1, r0); 35878 + } 35879 + @@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) 35880 + svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); 35881 + svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); 35882 + 35883 + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); 35884 + /* Approximate exp2(r) using polynomial. 
*/ 35885 + - svfloat64_t r2 = svmul_x (pg, r, r); 35886 + - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); 35887 + - svfloat64_t y = svmul_x (pg, r, p); 35888 + - 35889 + + /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ 35890 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 35891 + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); 35892 + + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); 35893 + + svfloat64_t p = svmla_x (pg, p01, p23, r2); 35894 + + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); 35895 + /* Assemble exp2(x) = exp2(r) * scale. */ 35896 + if (__glibc_unlikely (svptest_any (pg, special))) 35897 + return special_case (pg, scale, y, kd, d); 35898 + diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c 35899 + index aabaaa1d61..b2421d493f 100644 35900 + --- a/sysdeps/aarch64/fpu/exp_sve.c 35901 + +++ b/sysdeps/aarch64/fpu/exp_sve.c 35902 + @@ -21,12 +21,15 @@ 35903 + 35904 + static const struct data 35905 + { 35906 + - double poly[4]; 35907 + + double c0, c2; 35908 + + double c1, c3; 35909 + double ln2_hi, ln2_lo, inv_ln2, shift, thres; 35910 + + 35911 + } data = { 35912 + - .poly = { /* ulp error: 0.53. */ 35913 + - 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, 35914 + - 0x1.1111266d28935p-7 }, 35915 + + .c0 = 0x1.fffffffffdbcdp-2, 35916 + + .c1 = 0x1.555555555444cp-3, 35917 + + .c2 = 0x1.555573c6a9f7dp-5, 35918 + + .c3 = 0x1.1111266d28935p-7, 35919 + .ln2_hi = 0x1.62e42fefa3800p-1, 35920 + .ln2_lo = 0x1.ef35793c76730p-45, 35921 + /* 1/ln2. */ 35922 + @@ -36,7 +39,6 @@ static const struct data 35923 + .thres = 704.0, 35924 + }; 35925 + 35926 + -#define C(i) sv_f64 (d->poly[i]) 35927 + #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ 35928 + /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ 35929 + #define SpecialBias1 0x7000000000000000 /* 0x1p769. 
*/ 35930 + @@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) 35931 + svuint64_t b 35932 + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ 35933 + 35934 + - /* Set s1 to generate overflow depending on sign of exponent n. */ 35935 + - svfloat64_t s1 = svreinterpret_f64 ( 35936 + - svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ 35937 + - /* Offset s to avoid overflow in final result if n is below threshold. */ 35938 + + /* Set s1 to generate overflow depending on sign of exponent n, 35939 + + ie. s1 = 0x70...0 - b. */ 35940 + + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); 35941 + + /* Offset s to avoid overflow in final result if n is below threshold. 35942 + + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ 35943 + svfloat64_t s2 = svreinterpret_f64 ( 35944 + - svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), 35945 + - b)); /* as_u64 (s) - 0x3010...0 + b. */ 35946 + + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); 35947 + 35948 + /* |n| > 1280 => 2^(n) overflows. */ 35949 + svbool_t p_cmp = svacgt (pg, n, 1280.0); 35950 + 35951 + - svfloat64_t r1 = svmul_x (pg, s1, s1); 35952 + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); 35953 + svfloat64_t r2 = svmla_x (pg, s2, s2, y); 35954 + - svfloat64_t r0 = svmul_x (pg, r2, s1); 35955 + + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); 35956 + 35957 + return svsel (p_cmp, r1, r0); 35958 + } 35959 + @@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) 35960 + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); 35961 + svuint64_t u = svreinterpret_u64 (z); 35962 + svfloat64_t n = svsub_x (pg, z, d->shift); 35963 + - 35964 + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); 35965 + /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. 
*/ 35966 + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); 35967 + svfloat64_t r = svmls_lane (x, n, ln2, 0); 35968 + r = svmls_lane (r, n, ln2, 1); 35969 + 35970 + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */ 35971 + - svfloat64_t r2 = svmul_x (pg, r, r); 35972 + - svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); 35973 + - svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); 35974 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 35975 + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); 35976 + + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); 35977 + svfloat64_t p04 = svmla_x (pg, p01, p23, r2); 35978 + svfloat64_t y = svmla_x (pg, r, p04, r2); 35979 + 35980 + diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h 35981 + index 6166df6553..75781fb4dd 100644 35982 + --- a/sysdeps/aarch64/fpu/sv_expf_inline.h 35983 + +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h 35984 + @@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) 35985 + /* scale = 2^(n/N). */ 35986 + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); 35987 + 35988 + - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ 35989 + + /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. 
*/ 35990 + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); 35991 + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); 35992 + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 35993 + @@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) 35994 + 35995 + return svmla_x (pg, scale, scale, poly); 35996 + } 35997 + - 35998 + #endif 35999 + 36000 + commit 194185c28954dfa11a6ded8b32f34fee680d3218 36001 + Author: Yat Long Poon <yatlong.poon@arm.com> 36002 + Date: Thu Feb 13 18:00:50 2025 +0000 36003 + 36004 + AArch64: Improve codegen for SVE erfcf 36005 + 36006 + Reduce number of MOV/MOVPRFXs and use unpredicated FMUL. 36007 + Replace MUL with LSL. Speedup on Neoverse V1: 6%. 36008 + 36009 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 36010 + (cherry picked from commit f5ff34cb3c75ec1061c75bb9188b3c1176426947) 36011 + 36012 + diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c 36013 + index ecacb933ac..e4869263e3 100644 36014 + --- a/sysdeps/aarch64/fpu/erfcf_sve.c 36015 + +++ b/sysdeps/aarch64/fpu/erfcf_sve.c 36016 + @@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) 36017 + svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); 36018 + 36019 + /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ 36020 + - i = svmul_x (pg, i, 2); 36021 + + i = svlsl_x (svptrue_b32 (), i, 1); 36022 + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; 36023 + svfloat32_t erfcr = svld1_gather_index (pg, p, i); 36024 + svfloat32_t scale = svld1_gather_index (pg, p + 1, i); 36025 + @@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) 36026 + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). 
*/ 36027 + svfloat32_t r = svsub_x (pg, z, shift); 36028 + svfloat32_t d = svsub_x (pg, a, r); 36029 + - svfloat32_t d2 = svmul_x (pg, d, d); 36030 + - svfloat32_t r2 = svmul_x (pg, r, r); 36031 + + svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); 36032 + + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); 36033 + 36034 + svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); 36035 + - svfloat32_t third = svdup_lane (coeffs, 0); 36036 + 36037 + svfloat32_t p1 = r; 36038 + - svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); 36039 + - svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); 36040 + + svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); 36041 + + svfloat32_t p3 36042 + + = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); 36043 + svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); 36044 + p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); 36045 + 36046 + 36047 + commit 7dc549c5a4af3c32689147550144397116404d22 36048 + Author: Yat Long Poon <yatlong.poon@arm.com> 36049 + Date: Thu Feb 13 18:02:01 2025 +0000 36050 + 36051 + AArch64: Improve codegen for SVE pow 36052 + 36053 + Move constants to struct. Improve memory access with indexed/unpredicated 36054 + instructions. Eliminate register spills. Speedup on Neoverse V1: 24%. 36055 + 36056 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 36057 + (cherry picked from commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6) 36058 + 36059 + diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c 36060 + index 4c0bf8956c..4242d22a49 100644 36061 + --- a/sysdeps/aarch64/fpu/pow_sve.c 36062 + +++ b/sysdeps/aarch64/fpu/pow_sve.c 36063 + @@ -44,19 +44,18 @@ 36064 + 36065 + /* Data is defined in v_pow_log_data.c. */ 36066 + #define N_LOG (1 << V_POW_LOG_TABLE_BITS) 36067 + -#define A __v_pow_log_data.poly 36068 + #define Off 0x3fe6955500000000 36069 + 36070 + /* Data is defined in v_pow_exp_data.c. 
*/ 36071 + #define N_EXP (1 << V_POW_EXP_TABLE_BITS) 36072 + #define SignBias (0x800 << V_POW_EXP_TABLE_BITS) 36073 + -#define C __v_pow_exp_data.poly 36074 + #define SmallExp 0x3c9 /* top12(0x1p-54). */ 36075 + #define BigExp 0x408 /* top12(512.). */ 36076 + #define ThresExp 0x03f /* BigExp - SmallExp. */ 36077 + #define HugeExp 0x409 /* top12(1024.). */ 36078 + 36079 + /* Constants associated with pow. */ 36080 + +#define SmallBoundX 0x1p-126 36081 + #define SmallPowX 0x001 /* top12(0x1p-126). */ 36082 + #define BigPowX 0x7ff /* top12(INFINITY). */ 36083 + #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ 36084 + @@ -64,6 +63,31 @@ 36085 + #define BigPowY 0x43e /* top12(0x1.749p62). */ 36086 + #define ThresPowY 0x080 /* BigPowY - SmallPowY. */ 36087 + 36088 + +static const struct data 36089 + +{ 36090 + + double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; 36091 + + double log_c1, log_c3, log_c5, off; 36092 + + double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; 36093 + + double exp_c0, exp_c1; 36094 + +} data = { 36095 + + .log_c0 = -0x1p-1, 36096 + + .log_c1 = -0x1.555555555556p-1, 36097 + + .log_c2 = 0x1.0000000000006p-1, 36098 + + .log_c3 = 0x1.999999959554ep-1, 36099 + + .log_c4 = -0x1.555555529a47ap-1, 36100 + + .log_c5 = -0x1.2495b9b4845e9p0, 36101 + + .log_c6 = 0x1.0002b8b263fc3p0, 36102 + + .off = Off, 36103 + + .exp_c0 = 0x1.fffffffffffd4p-2, 36104 + + .exp_c1 = 0x1.5555571d6ef9p-3, 36105 + + .exp_c2 = 0x1.5555576a5adcep-5, 36106 + + .ln2_hi = 0x1.62e42fefa3800p-1, 36107 + + .ln2_lo = 0x1.ef35793c76730p-45, 36108 + + .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, 36109 + + .ln2_over_n_hi = 0x1.62e42fefc0000p-9, 36110 + + .ln2_over_n_lo = -0x1.c610ca86c3899p-45, 36111 + +}; 36112 + + 36113 + /* Check if x is an integer. 
*/ 36114 + static inline svbool_t 36115 + sv_isint (svbool_t pg, svfloat64_t x) 36116 + @@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x) 36117 + static inline svbool_t 36118 + sv_isodd (svbool_t pg, svfloat64_t x) 36119 + { 36120 + - svfloat64_t y = svmul_x (pg, x, 0.5); 36121 + + svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); 36122 + return sv_isnotint (pg, y); 36123 + } 36124 + 36125 + @@ -121,7 +145,7 @@ zeroinfnan (uint64_t i) 36126 + static inline svbool_t 36127 + sv_zeroinfnan (svbool_t pg, svuint64_t i) 36128 + { 36129 + - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), 36130 + + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 36131 + 2 * asuint64 (INFINITY) - 1); 36132 + } 36133 + 36134 + @@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, 36135 + additional 15 bits precision. IX is the bit representation of x, but 36136 + normalized in the subnormal range using the sign bit for the exponent. */ 36137 + static inline svfloat64_t 36138 + -sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) 36139 + +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail, 36140 + + const struct data *d) 36141 + { 36142 + /* x = 2^k z; where z is in range [Off,2*Off) and exact. 36143 + The range is split into N subintervals. 36144 + The ith subinterval contains z and c is near its center. 
*/ 36145 + - svuint64_t tmp = svsub_x (pg, ix, Off); 36146 + + svuint64_t tmp = svsub_x (pg, ix, d->off); 36147 + svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), 36148 + sv_u64 (N_LOG - 1)); 36149 + svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); 36150 + - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); 36151 + + svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52)); 36152 + svfloat64_t z = svreinterpret_f64 (iz); 36153 + svfloat64_t kd = svcvt_f64_x (pg, k); 36154 + 36155 + @@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) 36156 + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ 36157 + svfloat64_t r = svmad_x (pg, z, invc, -1.0); 36158 + /* k*Ln2 + log(c) + r. */ 36159 + - svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); 36160 + + 36161 + + svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi); 36162 + + svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0); 36163 + svfloat64_t t2 = svadd_x (pg, t1, r); 36164 + - svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); 36165 + + svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1); 36166 + svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); 36167 + 36168 + /* Evaluation is optimized assuming superscalar pipelined execution. */ 36169 + - svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ 36170 + - svfloat64_t ar2 = svmul_x (pg, r, ar); 36171 + - svfloat64_t ar3 = svmul_x (pg, r, ar2); 36172 + + 36173 + + svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0); 36174 + + svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0); 36175 + + svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar); 36176 + + svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2); 36177 + /* k*Ln2 + log(c) + r + A[0]*r*r. 
*/ 36178 + svfloat64_t hi = svadd_x (pg, t2, ar2); 36179 + - svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); 36180 + + svfloat64_t lo3 = svmls_x (pg, ar2, ar, r); 36181 + svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); 36182 + /* p = log1p(r) - r - A[0]*r*r. */ 36183 + /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * 36184 + A[6])))). */ 36185 + - svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); 36186 + - svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); 36187 + - svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); 36188 + + 36189 + + svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); 36190 + + svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); 36191 + + svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); 36192 + + svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); 36193 + svfloat64_t p = svmla_x (pg, a34, ar2, a56); 36194 + p = svmla_x (pg, a12, ar2, p); 36195 + - p = svmul_x (pg, ar3, p); 36196 + + p = svmul_x (svptrue_b64 (), ar3, p); 36197 + svfloat64_t lo = svadd_x ( 36198 + - pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); 36199 + + pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); 36200 + svfloat64_t y = svadd_x (pg, hi, lo); 36201 + *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); 36202 + return y; 36203 + } 36204 + 36205 + +static inline svfloat64_t 36206 + +sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, 36207 + + svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, 36208 + + svuint64_t *ki, const struct data *d) 36209 + +{ 36210 + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ 36211 + + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. 
*/ 36212 + + svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); 36213 + + svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); 36214 + + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ 36215 + + svfloat64_t kd = svrinta_x (pg, z); 36216 + + *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); 36217 + + 36218 + + svfloat64_t ln2_over_n_hilo 36219 + + = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); 36220 + + svfloat64_t r = x; 36221 + + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); 36222 + + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); 36223 + + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ 36224 + + r = svadd_x (pg, r, xtail); 36225 + + /* 2^(k/N) ~= scale. */ 36226 + + svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); 36227 + + svuint64_t top 36228 + + = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); 36229 + + /* This is only a valid scale when -1023*N < k < 1024*N. */ 36230 + + *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); 36231 + + *sbits = svadd_x (pg, *sbits, top); 36232 + + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ 36233 + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); 36234 + + *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); 36235 + + *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); 36236 + + *tmp = svmla_x (pg, r, r2, *tmp); 36237 + + svfloat64_t scale = svreinterpret_f64 (*sbits); 36238 + + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there 36239 + + is no spurious underflow here even without fma. */ 36240 + + z = svmla_x (pg, scale, scale, *tmp); 36241 + + return z; 36242 + +} 36243 + + 36244 + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. 36245 + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. 
*/ 36246 + static inline svfloat64_t 36247 + sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, 36248 + - svuint64_t sign_bias) 36249 + + svuint64_t sign_bias, const struct data *d) 36250 + { 36251 + /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) 36252 + and other cases of large values of x (scale * (1 + TMP) oflow). */ 36253 + @@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, 36254 + /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ 36255 + svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); 36256 + 36257 + - /* Conditions special, uflow and oflow are all expressed as uoflow && 36258 + - something, hence do not bother computing anything if no lane in uoflow is 36259 + - true. */ 36260 + - svbool_t special = svpfalse_b (); 36261 + - svbool_t uflow = svpfalse_b (); 36262 + - svbool_t oflow = svpfalse_b (); 36263 + + svfloat64_t tmp; 36264 + + svuint64_t sbits, ki; 36265 + if (__glibc_unlikely (svptest_any (pg, uoflow))) 36266 + { 36267 + + svfloat64_t z 36268 + + = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); 36269 + + 36270 + /* |x| is tiny (|x| <= 0x1p-54). */ 36271 + - uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); 36272 + + svbool_t uflow 36273 + + = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); 36274 + uflow = svand_z (pg, uoflow, uflow); 36275 + /* |x| is huge (|x| >= 1024). */ 36276 + - oflow = svcmpge (pg, abstop, HugeExp); 36277 + + svbool_t oflow = svcmpge (pg, abstop, HugeExp); 36278 + oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); 36279 + + 36280 + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow 36281 + - or underflow. */ 36282 + - special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); 36283 + + or underflow. */ 36284 + + svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); 36285 + + 36286 + + /* Update result with special and large cases. 
*/ 36287 + + z = sv_call_specialcase (tmp, sbits, ki, z, special); 36288 + + 36289 + + /* Handle underflow and overflow. */ 36290 + + svbool_t x_is_neg = svcmplt (pg, x, 0); 36291 + + svuint64_t sign_mask 36292 + + = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); 36293 + + svfloat64_t res_uoflow 36294 + + = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); 36295 + + res_uoflow = svreinterpret_f64 ( 36296 + + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); 36297 + + /* Avoid spurious underflow for tiny x. */ 36298 + + svfloat64_t res_spurious_uflow 36299 + + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); 36300 + + 36301 + + z = svsel (oflow, res_uoflow, z); 36302 + + z = svsel (uflow, res_spurious_uflow, z); 36303 + + return z; 36304 + } 36305 + 36306 + - /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ 36307 + - /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ 36308 + - svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); 36309 + - /* z - kd is in [-1, 1] in non-nearest rounding modes. */ 36310 + - svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); 36311 + - svfloat64_t kd = svadd_x (pg, z, shift); 36312 + - svuint64_t ki = svreinterpret_u64 (kd); 36313 + - kd = svsub_x (pg, kd, shift); 36314 + - svfloat64_t r = x; 36315 + - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); 36316 + - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); 36317 + - /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ 36318 + - r = svadd_x (pg, r, xtail); 36319 + - /* 2^(k/N) ~= scale. */ 36320 + - svuint64_t idx = svand_x (pg, ki, N_EXP - 1); 36321 + - svuint64_t top 36322 + - = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); 36323 + - /* This is only a valid scale when -1023*N < k < 1024*N. 
*/ 36324 + - svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); 36325 + - sbits = svadd_x (pg, sbits, top); 36326 + - /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ 36327 + - svfloat64_t r2 = svmul_x (pg, r, r); 36328 + - svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); 36329 + - tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); 36330 + - tmp = svmla_x (pg, r, r2, tmp); 36331 + - svfloat64_t scale = svreinterpret_f64 (sbits); 36332 + - /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there 36333 + - is no spurious underflow here even without fma. */ 36334 + - z = svmla_x (pg, scale, scale, tmp); 36335 + - 36336 + - /* Update result with special and large cases. */ 36337 + - if (__glibc_unlikely (svptest_any (pg, special))) 36338 + - z = sv_call_specialcase (tmp, sbits, ki, z, special); 36339 + - 36340 + - /* Handle underflow and overflow. */ 36341 + - svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); 36342 + - svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); 36343 + - svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); 36344 + - svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); 36345 + - res_uoflow = svreinterpret_f64 ( 36346 + - svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); 36347 + - z = svsel (oflow, res_uoflow, z); 36348 + - /* Avoid spurious underflow for tiny x. 
*/ 36349 + - svfloat64_t res_spurious_uflow 36350 + - = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); 36351 + - z = svsel (uflow, res_spurious_uflow, z); 36352 + - 36353 + - return z; 36354 + + return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); 36355 + } 36356 + 36357 + static inline double 36358 + @@ -341,47 +384,39 @@ pow_sc (double x, double y) 36359 + 36360 + svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) 36361 + { 36362 + + const struct data *d = ptr_barrier (&data); 36363 + + 36364 + /* This preamble handles special case conditions used in the final scalar 36365 + fallbacks. It also updates ix and sign_bias, that are used in the core 36366 + computation too, i.e., exp( y * log (x) ). */ 36367 + svuint64_t vix0 = svreinterpret_u64 (x); 36368 + svuint64_t viy0 = svreinterpret_u64 (y); 36369 + - svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); 36370 + 36371 + /* Negative x cases. */ 36372 + - svuint64_t sign_bit = svlsr_m (pg, vix0, 63); 36373 + - svbool_t xisneg = svcmpeq (pg, sign_bit, 1); 36374 + + svbool_t xisneg = svcmplt (pg, x, 0); 36375 + 36376 + /* Set sign_bias and ix depending on sign of x and nature of y. */ 36377 + - svbool_t yisnotint_xisneg = svpfalse_b (); 36378 + + svbool_t yint_or_xpos = pg; 36379 + svuint64_t sign_bias = sv_u64 (0); 36380 + svuint64_t vix = vix0; 36381 + - svuint64_t vtopx1 = vtopx0; 36382 + if (__glibc_unlikely (svptest_any (pg, xisneg))) 36383 + { 36384 + /* Determine nature of y. */ 36385 + - yisnotint_xisneg = sv_isnotint (xisneg, y); 36386 + - svbool_t yisint_xisneg = sv_isint (xisneg, y); 36387 + + yint_or_xpos = sv_isint (xisneg, y); 36388 + svbool_t yisodd_xisneg = sv_isodd (xisneg, y); 36389 + /* ix set to abs(ix) if y is integer. 
*/ 36390 + - vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); 36391 + - vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); 36392 + + vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); 36393 + /* Set to SignBias if x is negative and y is odd. */ 36394 + sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); 36395 + } 36396 + 36397 + - /* Special cases of x or y: zero, inf and nan. */ 36398 + - svbool_t xspecial = sv_zeroinfnan (pg, vix0); 36399 + - svbool_t yspecial = sv_zeroinfnan (pg, viy0); 36400 + - svbool_t special = svorr_z (pg, xspecial, yspecial); 36401 + - 36402 + /* Small cases of x: |x| < 0x1p-126. */ 36403 + - svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); 36404 + - svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); 36405 + - if (__glibc_unlikely (svptest_any (pg, xsmall))) 36406 + + svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); 36407 + + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) 36408 + { 36409 + /* Normalize subnormal x so exponent becomes negative. */ 36410 + - svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); 36411 + + svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); 36412 + + svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); 36413 + 36414 + svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); 36415 + vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); 36416 + @@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) 36417 + 36418 + /* y_hi = log(ix, &y_lo). */ 36419 + svfloat64_t vlo; 36420 + - svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); 36421 + + svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); 36422 + 36423 + /* z = exp(y_hi, y_lo, sign_bias). 
*/ 36424 + - svfloat64_t vehi = svmul_x (pg, y, vhi); 36425 + - svfloat64_t velo = svmul_x (pg, y, vlo); 36426 + - svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); 36427 + - velo = svsub_x (pg, velo, vemi); 36428 + - svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); 36429 + + svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); 36430 + + svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); 36431 + + svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); 36432 + + svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); 36433 + 36434 + /* Cases of finite y and finite negative x. */ 36435 + - vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); 36436 + + vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); 36437 + + 36438 + + /* Special cases of x or y: zero, inf and nan. */ 36439 + + svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); 36440 + + svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); 36441 + + svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); 36442 + 36443 + /* Cases of zero/inf/nan x or y. */ 36444 + - if (__glibc_unlikely (svptest_any (pg, special))) 36445 + + if (__glibc_unlikely (svptest_any (svptrue_b64 (), special))) 36446 + vz = sv_call2_f64 (pow_sc, x, y, vz, special); 36447 + 36448 + return vz; 36449 + 36450 + commit 06fd8ad78f35a6cc65dc7c6c08ce55faf6ad079d 36451 + Author: Yat Long Poon <yatlong.poon@arm.com> 36452 + Date: Thu Feb 13 18:03:04 2025 +0000 36453 + 36454 + AArch64: Improve codegen for SVE powf 36455 + 36456 + Improve memory access with indexed/unpredicated instructions. 36457 + Eliminate register spills. Speedup on Neoverse V1: 3%. 
36458 + 36459 + Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 36460 + (cherry picked from commit 95e807209b680257a9afe81a507754f1565dbb4d) 36461 + 36462 + diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c 36463 + index 4f6a142325..08d7019a18 100644 36464 + --- a/sysdeps/aarch64/fpu/powf_sve.c 36465 + +++ b/sysdeps/aarch64/fpu/powf_sve.c 36466 + @@ -26,7 +26,6 @@ 36467 + #define Tlogc __v_powf_data.logc 36468 + #define Texp __v_powf_data.scale 36469 + #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) 36470 + -#define Shift 0x1.8p52 36471 + #define Norm 0x1p23f /* 0x4b000000. */ 36472 + 36473 + /* Overall ULP error bound for pow is 2.6 ulp 36474 + @@ -36,7 +35,7 @@ static const struct data 36475 + double log_poly[4]; 36476 + double exp_poly[3]; 36477 + float uflow_bound, oflow_bound, small_bound; 36478 + - uint32_t sign_bias, sign_mask, subnormal_bias, off; 36479 + + uint32_t sign_bias, subnormal_bias, off; 36480 + } data = { 36481 + /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of 36482 + V_POWF_EXP2_N. */ 36483 + @@ -53,7 +52,6 @@ static const struct data 36484 + .small_bound = 0x1p-126f, 36485 + .off = 0x3f35d000, 36486 + .sign_bias = SignBias, 36487 + - .sign_mask = 0x80000000, 36488 + .subnormal_bias = 0x0b800000, /* 23 << 23. */ 36489 + }; 36490 + 36491 + @@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x) 36492 + static inline svbool_t 36493 + sv_zeroinfnan (svbool_t pg, svuint32_t i) 36494 + { 36495 + - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), 36496 + + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 36497 + 2u * 0x7f800000 - 1); 36498 + } 36499 + 36500 + @@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z) 36501 + } 36502 + 36503 + /* Scalar fallback for special case routines with custom signature. 
*/ 36504 + -static inline svfloat32_t 36505 + -sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) 36506 + +static svfloat32_t NOINLINE 36507 + +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) 36508 + { 36509 + + /* Special cases of x or y: zero, inf and nan. */ 36510 + + svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); 36511 + + svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); 36512 + + svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); 36513 + + 36514 + svbool_t p = svpfirst (cmp, svpfalse ()); 36515 + while (svptest_any (cmp, p)) 36516 + { 36517 + @@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, 36518 + 36519 + /* Polynomial to approximate log1p(r)/ln2. */ 36520 + svfloat64_t logx = A (0); 36521 + - logx = svmla_x (pg, A (1), r, logx); 36522 + - logx = svmla_x (pg, A (2), r, logx); 36523 + - logx = svmla_x (pg, A (3), r, logx); 36524 + - logx = svmla_x (pg, y0, r, logx); 36525 + + logx = svmad_x (pg, r, logx, A (1)); 36526 + + logx = svmad_x (pg, r, logx, A (2)); 36527 + + logx = svmad_x (pg, r, logx, A (3)); 36528 + + logx = svmad_x (pg, r, logx, y0); 36529 + *pylogx = svmul_x (pg, y, logx); 36530 + 36531 + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ 36532 + - svfloat64_t kd = svadd_x (pg, *pylogx, Shift); 36533 + - svuint64_t ki = svreinterpret_u64 (kd); 36534 + - kd = svsub_x (pg, kd, Shift); 36535 + + svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); 36536 + + svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); 36537 + 36538 + r = svsub_x (pg, *pylogx, kd); 36539 + 36540 + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). 
*/ 36541 + - svuint64_t t 36542 + - = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); 36543 + - svuint64_t ski = svadd_x (pg, ki, sign_bias); 36544 + - t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); 36545 + + svuint64_t t = svld1_gather_index ( 36546 + + svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); 36547 + + svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); 36548 + + t = svadd_x (svptrue_b64 (), t, 36549 + + svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); 36550 + svfloat64_t s = svreinterpret_f64 (t); 36551 + 36552 + svfloat64_t p = C (0); 36553 + p = svmla_x (pg, C (1), p, r); 36554 + p = svmla_x (pg, C (2), p, r); 36555 + - p = svmla_x (pg, s, p, svmul_x (pg, s, r)); 36556 + + p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); 36557 + 36558 + return p; 36559 + } 36560 + @@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, 36561 + { 36562 + const svbool_t ptrue = svptrue_b64 (); 36563 + 36564 + - /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in 36565 + - order to perform core computation in double precision. */ 36566 + + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two 36567 + + * in order to perform core computation in double precision. 
*/ 36568 + const svbool_t pg_lo = svunpklo (pg); 36569 + const svbool_t pg_hi = svunpkhi (pg); 36570 + - svfloat64_t y_lo = svcvt_f64_x ( 36571 + - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); 36572 + - svfloat64_t y_hi = svcvt_f64_x ( 36573 + - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); 36574 + - svfloat32_t z = svreinterpret_f32 (iz); 36575 + - svfloat64_t z_lo = svcvt_f64_x ( 36576 + - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); 36577 + - svfloat64_t z_hi = svcvt_f64_x ( 36578 + - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); 36579 + + svfloat64_t y_lo 36580 + + = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); 36581 + + svfloat64_t y_hi 36582 + + = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); 36583 + + svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); 36584 + + svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); 36585 + svuint64_t i_lo = svunpklo (i); 36586 + svuint64_t i_hi = svunpkhi (i); 36587 + svint64_t k_lo = svunpklo (k); 36588 + @@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, 36589 + /* Implementation of SVE powf. 36590 + Provides the same accuracy as AdvSIMD powf, since it relies on the same 36591 + algorithm. The theoretical maximum error is under 2.60 ULPs. 36592 + - Maximum measured error is 2.56 ULPs: 36593 + - SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 36594 + - want 0x1.fd4b06p+127. */ 36595 + + Maximum measured error is 2.57 ULPs: 36596 + + SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 36597 + + want 0x1.fff862p+127. 
*/ 36598 + svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) 36599 + { 36600 + const struct data *d = ptr_barrier (&data); 36601 + @@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) 36602 + svuint32_t viy0 = svreinterpret_u32 (y); 36603 + 36604 + /* Negative x cases. */ 36605 + - svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); 36606 + - svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); 36607 + + svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); 36608 + 36609 + /* Set sign_bias and ix depending on sign of x and nature of y. */ 36610 + - svbool_t yisnotint_xisneg = svpfalse_b (); 36611 + + svbool_t yint_or_xpos = pg; 36612 + svuint32_t sign_bias = sv_u32 (0); 36613 + svuint32_t vix = vix0; 36614 + if (__glibc_unlikely (svptest_any (pg, xisneg))) 36615 + { 36616 + /* Determine nature of y. */ 36617 + - yisnotint_xisneg = svisnotint (xisneg, y); 36618 + - svbool_t yisint_xisneg = svisint (xisneg, y); 36619 + + yint_or_xpos = svisint (xisneg, y); 36620 + svbool_t yisodd_xisneg = svisodd (xisneg, y); 36621 + /* ix set to abs(ix) if y is integer. */ 36622 + - vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); 36623 + + vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); 36624 + /* Set to SignBias if x is negative and y is odd. */ 36625 + sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); 36626 + } 36627 + @@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) 36628 + svbool_t cmp = svorr_z (pg, xspecial, yspecial); 36629 + 36630 + /* Small cases of x: |x| < 0x1p-126. */ 36631 + - svbool_t xsmall = svaclt (pg, x, d->small_bound); 36632 + - if (__glibc_unlikely (svptest_any (pg, xsmall))) 36633 + + svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); 36634 + + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) 36635 + { 36636 + /* Normalize subnormal x so exponent becomes negative. 
*/ 36637 + svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); 36638 + @@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) 36639 + vix = svsel (xsmall, vix_norm, vix); 36640 + } 36641 + /* Part of core computation carried in working precision. */ 36642 + - svuint32_t tmp = svsub_x (pg, vix, d->off); 36643 + - svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), 36644 + - V_POWF_LOG2_N - 1); 36645 + - svuint32_t top = svand_x (pg, tmp, 0xff800000); 36646 + - svuint32_t iz = svsub_x (pg, vix, top); 36647 + - svint32_t k 36648 + - = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); 36649 + - 36650 + - /* Compute core in extended precision and return intermediate ylogx results to 36651 + - handle cases of underflow and underflow in exp. */ 36652 + + svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off); 36653 + + svuint32_t i = svand_x ( 36654 + + yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), 36655 + + V_POWF_LOG2_N - 1); 36656 + + svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000); 36657 + + svuint32_t iz = svsub_x (yint_or_xpos, vix, top); 36658 + + svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top), 36659 + + (23 - V_POWF_EXP2_TABLE_BITS)); 36660 + + 36661 + + /* Compute core in extended precision and return intermediate ylogx results 36662 + + * to handle cases of underflow and underflow in exp. */ 36663 + svfloat32_t ylogx; 36664 + - svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); 36665 + + svfloat32_t ret 36666 + + = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d); 36667 + 36668 + /* Handle exp special cases of underflow and overflow. 
*/ 36669 + - svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); 36670 + + svuint32_t sign 36671 + + = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); 36672 + svfloat32_t ret_oflow 36673 + - = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); 36674 + + = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY))); 36675 + svfloat32_t ret_uflow = svreinterpret_f32 (sign); 36676 + - ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); 36677 + - ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); 36678 + + ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret); 36679 + + ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret); 36680 + 36681 + /* Cases of finite y and finite negative x. */ 36682 + - ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); 36683 + + ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf (""))); 36684 + 36685 + - if (__glibc_unlikely (svptest_any (pg, cmp))) 36686 + - return sv_call_powf_sc (x, y, ret, cmp); 36687 + + if (__glibc_unlikely (svptest_any (cmp, cmp))) 36688 + + return sv_call_powf_sc (x, y, ret); 36689 + 36690 + return ret; 36691 + } 36692 + 36693 + commit fd9a3a36fdcf14d1678c469e8b9033a46aa6c6fb 36694 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 36695 + Date: Thu Feb 27 20:34:34 2025 +0000 36696 + 36697 + Revert "AArch64: Add vector logp1 alias for log1p" 36698 + 36699 + This reverts commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7. 
36700 + 36701 + diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h 36702 + index 5019e8e25c..08a41c46ad 100644 36703 + --- a/bits/libm-simd-decl-stubs.h 36704 + +++ b/bits/libm-simd-decl-stubs.h 36705 + @@ -253,17 +253,6 @@ 36706 + #define __DECL_SIMD_log1pf64x 36707 + #define __DECL_SIMD_log1pf128x 36708 + 36709 + -#define __DECL_SIMD_logp1 36710 + -#define __DECL_SIMD_logp1f 36711 + -#define __DECL_SIMD_logp1l 36712 + -#define __DECL_SIMD_logp1f16 36713 + -#define __DECL_SIMD_logp1f32 36714 + -#define __DECL_SIMD_logp1f64 36715 + -#define __DECL_SIMD_logp1f128 36716 + -#define __DECL_SIMD_logp1f32x 36717 + -#define __DECL_SIMD_logp1f64x 36718 + -#define __DECL_SIMD_logp1f128x 36719 + - 36720 + #define __DECL_SIMD_atanh 36721 + #define __DECL_SIMD_atanhf 36722 + #define __DECL_SIMD_atanhl 36723 + diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h 36724 + index 92856becc4..6cb594b6ff 100644 36725 + --- a/math/bits/mathcalls.h 36726 + +++ b/math/bits/mathcalls.h 36727 + @@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x)); 36728 + __MATHCALL (log10p1,, (_Mdouble_ __x)); 36729 + 36730 + /* Return log(1 + X). 
*/ 36731 + -__MATHCALL_VEC (logp1,, (_Mdouble_ __x)); 36732 + +__MATHCALL (logp1,, (_Mdouble_ __x)); 36733 + #endif 36734 + 36735 + #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99 36736 + diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions 36737 + index 015211f5f4..cc15ce2d1e 100644 36738 + --- a/sysdeps/aarch64/fpu/Versions 36739 + +++ b/sysdeps/aarch64/fpu/Versions 36740 + @@ -135,11 +135,4 @@ libmvec { 36741 + _ZGVsMxv_tanh; 36742 + _ZGVsMxv_tanhf; 36743 + } 36744 + - GLIBC_2.41 { 36745 + - _ZGVnN2v_logp1; 36746 + - _ZGVnN2v_logp1f; 36747 + - _ZGVnN4v_logp1f; 36748 + - _ZGVsMxv_logp1; 36749 + - _ZGVsMxv_logp1f; 36750 + - } 36751 + } 36752 + diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h 36753 + index 5909bb4ce9..097d403ffe 100644 36754 + --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h 36755 + +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h 36756 + @@ -36,7 +36,6 @@ libmvec_hidden_proto (V_NAME_F2(hypot)); 36757 + libmvec_hidden_proto (V_NAME_F1(log10)); 36758 + libmvec_hidden_proto (V_NAME_F1(log1p)); 36759 + libmvec_hidden_proto (V_NAME_F1(log2)); 36760 + -libmvec_hidden_proto (V_NAME_F1(logp1)); 36761 + libmvec_hidden_proto (V_NAME_F1(log)); 36762 + libmvec_hidden_proto (V_NAME_F2(pow)); 36763 + libmvec_hidden_proto (V_NAME_F1(sin)); 36764 + diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h 36765 + index f295fe185d..7484150131 100644 36766 + --- a/sysdeps/aarch64/fpu/bits/math-vector.h 36767 + +++ b/sysdeps/aarch64/fpu/bits/math-vector.h 36768 + @@ -113,10 +113,6 @@ 36769 + # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 36770 + # undef __DECL_SIMD_log2f 36771 + # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 36772 + -# undef __DECL_SIMD_logp1 36773 + -# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64 36774 + -# undef __DECL_SIMD_logp1f 36775 + -# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64 36776 + # undef __DECL_SIMD_pow 36777 + # define 
__DECL_SIMD_pow __DECL_SIMD_aarch64 36778 + # undef __DECL_SIMD_powf 36779 + @@ -184,7 +180,6 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); 36780 + __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); 36781 + __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); 36782 + __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); 36783 + -__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t); 36784 + __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); 36785 + __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); 36786 + __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); 36787 + @@ -212,7 +207,6 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); 36788 + __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); 36789 + __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); 36790 + __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); 36791 + -__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t); 36792 + __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); 36793 + __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); 36794 + __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); 36795 + @@ -245,7 +239,6 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); 36796 + __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); 36797 + __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); 36798 + __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); 36799 + -__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t); 36800 + __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); 36801 + __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); 36802 + __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); 36803 + @@ -273,7 +266,6 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); 36804 + __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); 36805 + __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); 36806 + __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); 36807 + -__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t); 36808 + __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); 36809 + __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); 36810 + __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, 
__sv_bool_t); 36811 + diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c 36812 + index 1263587201..9d18578ce6 100644 36813 + --- a/sysdeps/aarch64/fpu/log1p_advsimd.c 36814 + +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c 36815 + @@ -58,5 +58,3 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) 36816 + 36817 + return log1p_inline (x, &d->d); 36818 + } 36819 + - 36820 + -strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) 36821 + diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c 36822 + index b21cfb2c90..04f7e5720e 100644 36823 + --- a/sysdeps/aarch64/fpu/log1p_sve.c 36824 + +++ b/sysdeps/aarch64/fpu/log1p_sve.c 36825 + @@ -116,5 +116,3 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) 36826 + 36827 + return y; 36828 + } 36829 + - 36830 + -strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) 36831 + diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c 36832 + index 00006fc703..f2d47962fe 100644 36833 + --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c 36834 + +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c 36835 + @@ -93,6 +93,3 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) 36836 + 36837 + libmvec_hidden_def (V_NAME_F1 (log1p)) 36838 + HALF_WIDTH_ALIAS_F1 (log1p) 36839 + -strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) 36840 + -libmvec_hidden_def (V_NAME_F1 (logp1)) 36841 + -HALF_WIDTH_ALIAS_F1 (logp1) 36842 + diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c 36843 + index 18a185c838..4f17c44e2d 100644 36844 + --- a/sysdeps/aarch64/fpu/log1pf_sve.c 36845 + +++ b/sysdeps/aarch64/fpu/log1pf_sve.c 36846 + @@ -42,5 +42,3 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) 36847 + 36848 + return sv_log1pf_inline (x, pg); 36849 + } 36850 + - 36851 + -strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) 36852 + diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 
b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 36853 + index 98687cae0d..b685106954 100644 36854 + --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 36855 + +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist 36856 + @@ -128,8 +128,3 @@ GLIBC_2.40 _ZGVsMxvv_hypot F 36857 + GLIBC_2.40 _ZGVsMxvv_hypotf F 36858 + GLIBC_2.40 _ZGVsMxvv_pow F 36859 + GLIBC_2.40 _ZGVsMxvv_powf F 36860 + -GLIBC_2.41 _ZGVnN2v_logp1 F 36861 + -GLIBC_2.41 _ZGVnN2v_logp1f F 36862 + -GLIBC_2.41 _ZGVnN4v_logp1f F 36863 + -GLIBC_2.41 _ZGVsMxv_logp1 F 36864 + -GLIBC_2.41 _ZGVsMxv_logp1f F 36865 + 36866 + commit 64896b7d329809127035fde42768a6f7eeffed75 36867 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 36868 + Date: Wed Aug 7 14:43:47 2024 +0100 36869 + 36870 + AArch64: Improve generic strlen 36871 + 36872 + Improve performance by handling another 16 bytes before entering the loop. 36873 + Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final 36874 + size computation to avoid increasing latency. On Neoverse V1 performance 36875 + of the random strlen benchmark improves by 4.6%. 36876 + 36877 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 36878 + (cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7) 36879 + 36880 + diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S 36881 + index ab2a576cdb..352fb40d3a 100644 36882 + --- a/sysdeps/aarch64/strlen.S 36883 + +++ b/sysdeps/aarch64/strlen.S 36884 + @@ -1,4 +1,5 @@ 36885 + -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. 36886 + +/* Generic optimized strlen using SIMD. 36887 + + Copyright (C) 2012-2024 Free Software Foundation, Inc. 36888 + 36889 + This file is part of the GNU C Library. 
36890 + 36891 + @@ -56,36 +57,50 @@ ENTRY (STRLEN) 36892 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ 36893 + fmov synd, dend 36894 + lsr synd, synd, shift 36895 + - cbz synd, L(loop) 36896 + + cbz synd, L(next16) 36897 + 36898 + rbit synd, synd 36899 + clz result, synd 36900 + lsr result, result, 2 36901 + ret 36902 + 36903 + +L(next16): 36904 + + ldr data, [src, 16] 36905 + + cmeq vhas_nul.16b, vdata.16b, 0 36906 + + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ 36907 + + fmov synd, dend 36908 + + cbz synd, L(loop) 36909 + + add src, src, 16 36910 + +#ifndef __AARCH64EB__ 36911 + + rbit synd, synd 36912 + +#endif 36913 + + sub result, src, srcin 36914 + + clz tmp, synd 36915 + + add result, result, tmp, lsr 2 36916 + + ret 36917 + + 36918 + .p2align 5 36919 + L(loop): 36920 + - ldr data, [src, 16] 36921 + + ldr data, [src, 32]! 36922 + cmeq vhas_nul.16b, vdata.16b, 0 36923 + - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b 36924 + + addhn vend.8b, vhas_nul.8h, vhas_nul.8h 36925 + fmov synd, dend 36926 + cbnz synd, L(loop_end) 36927 + - ldr data, [src, 32]! 36928 + + ldr data, [src, 16] 36929 + cmeq vhas_nul.16b, vdata.16b, 0 36930 + - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b 36931 + + addhn vend.8b, vhas_nul.8h, vhas_nul.8h 36932 + fmov synd, dend 36933 + cbz synd, L(loop) 36934 + - sub src, src, 16 36935 + + add src, src, 16 36936 + L(loop_end): 36937 + - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ 36938 + - sub result, src, srcin 36939 + - fmov synd, dend 36940 + + sub result, shift, src, lsl 2 /* (srcin - src) << 2. 
*/ 36941 + #ifndef __AARCH64EB__ 36942 + rbit synd, synd 36943 + + sub result, result, 3 36944 + #endif 36945 + - add result, result, 16 36946 + clz tmp, synd 36947 + - add result, result, tmp, lsr 2 36948 + + sub result, tmp, result 36949 + + lsr result, result, 2 36950 + ret 36951 + 36952 + END (STRLEN) 36953 + 36954 + commit 544fb349d35efd5f86ed7e482759ff21496a32fd 36955 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 36956 + Date: Mon Sep 9 15:26:47 2024 +0100 36957 + 36958 + AArch64: Optimize memset 36959 + 36960 + Improve small memsets by avoiding branches and use overlapping stores. 36961 + Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes 36962 + other than 64 and 128. Performance of random memset benchmark improves by 24% 36963 + on Neoverse N1. 36964 + 36965 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 36966 + (cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8) 36967 + 36968 + diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S 36969 + index 7ef77ee8c9..caafb019e2 100644 36970 + --- a/sysdeps/aarch64/memset.S 36971 + +++ b/sysdeps/aarch64/memset.S 36972 + @@ -1,4 +1,5 @@ 36973 + -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. 36974 + +/* Generic optimized memset using SIMD. 36975 + + Copyright (C) 2012-2024 Free Software Foundation, Inc. 36976 + 36977 + This file is part of the GNU C Library. 36978 + 36979 + @@ -17,7 +18,6 @@ 36980 + <https://www.gnu.org/licenses/>. */ 36981 + 36982 + #include <sysdep.h> 36983 + -#include "memset-reg.h" 36984 + 36985 + #ifndef MEMSET 36986 + # define MEMSET memset 36987 + @@ -25,130 +25,132 @@ 36988 + 36989 + /* Assumptions: 36990 + * 36991 + - * ARMv8-a, AArch64, unaligned accesses 36992 + + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 
36993 + * 36994 + */ 36995 + 36996 + -ENTRY (MEMSET) 36997 + +#define dstin x0 36998 + +#define val x1 36999 + +#define valw w1 37000 + +#define count x2 37001 + +#define dst x3 37002 + +#define dstend x4 37003 + +#define zva_val x5 37004 + +#define off x3 37005 + +#define dstend2 x5 37006 + 37007 + +ENTRY (MEMSET) 37008 + PTR_ARG (0) 37009 + SIZE_ARG (2) 37010 + 37011 + dup v0.16B, valw 37012 + + cmp count, 16 37013 + + b.lo L(set_small) 37014 + + 37015 + add dstend, dstin, count 37016 + + cmp count, 64 37017 + + b.hs L(set_128) 37018 + 37019 + - cmp count, 96 37020 + - b.hi L(set_long) 37021 + - cmp count, 16 37022 + - b.hs L(set_medium) 37023 + - mov val, v0.D[0] 37024 + + /* Set 16..63 bytes. */ 37025 + + mov off, 16 37026 + + and off, off, count, lsr 1 37027 + + sub dstend2, dstend, off 37028 + + str q0, [dstin] 37029 + + str q0, [dstin, off] 37030 + + str q0, [dstend2, -16] 37031 + + str q0, [dstend, -16] 37032 + + ret 37033 + 37034 + + .p2align 4 37035 + /* Set 0..15 bytes. */ 37036 + - tbz count, 3, 1f 37037 + - str val, [dstin] 37038 + - str val, [dstend, -8] 37039 + - ret 37040 + - nop 37041 + -1: tbz count, 2, 2f 37042 + - str valw, [dstin] 37043 + - str valw, [dstend, -4] 37044 + +L(set_small): 37045 + + add dstend, dstin, count 37046 + + cmp count, 4 37047 + + b.lo 2f 37048 + + lsr off, count, 3 37049 + + sub dstend2, dstend, off, lsl 2 37050 + + str s0, [dstin] 37051 + + str s0, [dstin, off, lsl 2] 37052 + + str s0, [dstend2, -4] 37053 + + str s0, [dstend, -4] 37054 + ret 37055 + + 37056 + + /* Set 0..3 bytes. */ 37057 + 2: cbz count, 3f 37058 + + lsr off, count, 1 37059 + strb valw, [dstin] 37060 + - tbz count, 1, 3f 37061 + - strh valw, [dstend, -2] 37062 + + strb valw, [dstin, off] 37063 + + strb valw, [dstend, -1] 37064 + 3: ret 37065 + 37066 + - /* Set 17..96 bytes. 
*/ 37067 + -L(set_medium): 37068 + - str q0, [dstin] 37069 + - tbnz count, 6, L(set96) 37070 + - str q0, [dstend, -16] 37071 + - tbz count, 5, 1f 37072 + - str q0, [dstin, 16] 37073 + - str q0, [dstend, -32] 37074 + -1: ret 37075 + - 37076 + .p2align 4 37077 + - /* Set 64..96 bytes. Write 64 bytes from the start and 37078 + - 32 bytes from the end. */ 37079 + -L(set96): 37080 + - str q0, [dstin, 16] 37081 + +L(set_128): 37082 + + bic dst, dstin, 15 37083 + + cmp count, 128 37084 + + b.hi L(set_long) 37085 + + stp q0, q0, [dstin] 37086 + stp q0, q0, [dstin, 32] 37087 + + stp q0, q0, [dstend, -64] 37088 + stp q0, q0, [dstend, -32] 37089 + ret 37090 + 37091 + - .p2align 3 37092 + - nop 37093 + + .p2align 4 37094 + L(set_long): 37095 + - and valw, valw, 255 37096 + - bic dst, dstin, 15 37097 + str q0, [dstin] 37098 + - cmp count, 256 37099 + - ccmp valw, 0, 0, cs 37100 + - b.eq L(try_zva) 37101 + -L(no_zva): 37102 + - sub count, dstend, dst /* Count is 16 too large. */ 37103 + - sub dst, dst, 16 /* Dst is biased by -32. */ 37104 + - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ 37105 + -1: stp q0, q0, [dst, 32] 37106 + - stp q0, q0, [dst, 64]! 37107 + -L(tail64): 37108 + - subs count, count, 64 37109 + - b.hi 1b 37110 + -2: stp q0, q0, [dstend, -64] 37111 + + str q0, [dst, 16] 37112 + + tst valw, 255 37113 + + b.ne L(no_zva) 37114 + +#ifndef ZVA64_ONLY 37115 + + mrs zva_val, dczid_el0 37116 + + and zva_val, zva_val, 31 37117 + + cmp zva_val, 4 /* ZVA size is 64 bytes. */ 37118 + + b.ne L(zva_128) 37119 + +#endif 37120 + + stp q0, q0, [dst, 32] 37121 + + bic dst, dstin, 63 37122 + + sub count, dstend, dst /* Count is now 64 too large. */ 37123 + + sub count, count, 64 + 64 /* Adjust count and bias for loop. */ 37124 + + 37125 + + /* Write last bytes before ZVA loop. 
*/ 37126 + + stp q0, q0, [dstend, -64] 37127 + stp q0, q0, [dstend, -32] 37128 + + 37129 + + .p2align 4 37130 + +L(zva64_loop): 37131 + + add dst, dst, 64 37132 + + dc zva, dst 37133 + + subs count, count, 64 37134 + + b.hi L(zva64_loop) 37135 + ret 37136 + 37137 + -L(try_zva): 37138 + -#ifndef ZVA64_ONLY 37139 + .p2align 3 37140 + - mrs tmp1, dczid_el0 37141 + - tbnz tmp1w, 4, L(no_zva) 37142 + - and tmp1w, tmp1w, 15 37143 + - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ 37144 + - b.ne L(zva_128) 37145 + - nop 37146 + -#endif 37147 + - /* Write the first and last 64 byte aligned block using stp rather 37148 + - than using DC ZVA. This is faster on some cores. 37149 + - */ 37150 + - .p2align 4 37151 + -L(zva_64): 37152 + - str q0, [dst, 16] 37153 + +L(no_zva): 37154 + + sub count, dstend, dst /* Count is 32 too large. */ 37155 + + sub count, count, 64 + 32 /* Adjust count and bias for loop. */ 37156 + +L(no_zva_loop): 37157 + stp q0, q0, [dst, 32] 37158 + - bic dst, dst, 63 37159 + stp q0, q0, [dst, 64] 37160 + - stp q0, q0, [dst, 96] 37161 + - sub count, dstend, dst /* Count is now 128 too large. */ 37162 + - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ 37163 + - add dst, dst, 128 37164 + -1: dc zva, dst 37165 + add dst, dst, 64 37166 + subs count, count, 64 37167 + - b.hi 1b 37168 + - stp q0, q0, [dst, 0] 37169 + - stp q0, q0, [dst, 32] 37170 + + b.hi L(no_zva_loop) 37171 + stp q0, q0, [dstend, -64] 37172 + stp q0, q0, [dstend, -32] 37173 + ret 37174 + 37175 + #ifndef ZVA64_ONLY 37176 + - .p2align 3 37177 + + .p2align 4 37178 + L(zva_128): 37179 + - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ 37180 + - b.ne L(zva_other) 37181 + + cmp zva_val, 5 /* ZVA size is 128 bytes. */ 37182 + + b.ne L(no_zva) 37183 + 37184 + - str q0, [dst, 16] 37185 + stp q0, q0, [dst, 32] 37186 + stp q0, q0, [dst, 64] 37187 + stp q0, q0, [dst, 96] 37188 + bic dst, dst, 127 37189 + sub count, dstend, dst /* Count is now 128 too large. 
*/ 37190 + - sub count, count, 128+128 /* Adjust count and bias for loop. */ 37191 + - add dst, dst, 128 37192 + -1: dc zva, dst 37193 + - add dst, dst, 128 37194 + + sub count, count, 128 + 128 /* Adjust count and bias for loop. */ 37195 + +1: add dst, dst, 128 37196 + + dc zva, dst 37197 + subs count, count, 128 37198 + b.hi 1b 37199 + stp q0, q0, [dstend, -128] 37200 + @@ -156,35 +158,6 @@ L(zva_128): 37201 + stp q0, q0, [dstend, -64] 37202 + stp q0, q0, [dstend, -32] 37203 + ret 37204 + - 37205 + -L(zva_other): 37206 + - mov tmp2w, 4 37207 + - lsl zva_lenw, tmp2w, tmp1w 37208 + - add tmp1, zva_len, 64 /* Max alignment bytes written. */ 37209 + - cmp count, tmp1 37210 + - blo L(no_zva) 37211 + - 37212 + - sub tmp2, zva_len, 1 37213 + - add tmp1, dst, zva_len 37214 + - add dst, dst, 16 37215 + - subs count, tmp1, dst /* Actual alignment bytes to write. */ 37216 + - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ 37217 + - beq 2f 37218 + -1: stp q0, q0, [dst], 64 37219 + - stp q0, q0, [dst, -32] 37220 + - subs count, count, 64 37221 + - b.hi 1b 37222 + -2: mov dst, tmp1 37223 + - sub count, dstend, tmp1 /* Remaining bytes to write. */ 37224 + - subs count, count, zva_len 37225 + - b.lo 4f 37226 + -3: dc zva, dst 37227 + - add dst, dst, zva_len 37228 + - subs count, count, zva_len 37229 + - b.hs 3b 37230 + -4: add count, count, zva_len 37231 + - sub dst, dst, 32 /* Bias dst for tail loop. */ 37232 + - b L(tail64) 37233 + #endif 37234 + 37235 + END (MEMSET) 37236 + 37237 + commit 41eb2f8b5847079caca90a74659456adbb80ec29 37238 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 37239 + Date: Mon Nov 25 18:43:08 2024 +0000 37240 + 37241 + AArch64: Remove zva_128 from memset 37242 + 37243 + Remove ZVA 128 support from memset - the new memset no longer 37244 + guarantees count >= 256, which can result in underflow and a 37245 + crash if ZVA size is 128 ([1]). 
Since only one CPU uses a ZVA 37246 + size of 128 and its memcpy implementation was removed in commit 37247 + e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special 37248 + case too. 37249 + 37250 + [1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html 37251 + 37252 + Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com> 37253 + (cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d) 37254 + 37255 + diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S 37256 + index caafb019e2..71814d0b2f 100644 37257 + --- a/sysdeps/aarch64/memset.S 37258 + +++ b/sysdeps/aarch64/memset.S 37259 + @@ -104,7 +104,7 @@ L(set_long): 37260 + mrs zva_val, dczid_el0 37261 + and zva_val, zva_val, 31 37262 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ 37263 + - b.ne L(zva_128) 37264 + + b.ne L(no_zva) 37265 + #endif 37266 + stp q0, q0, [dst, 32] 37267 + bic dst, dstin, 63 37268 + @@ -137,28 +137,5 @@ L(no_zva_loop): 37269 + stp q0, q0, [dstend, -32] 37270 + ret 37271 + 37272 + -#ifndef ZVA64_ONLY 37273 + - .p2align 4 37274 + -L(zva_128): 37275 + - cmp zva_val, 5 /* ZVA size is 128 bytes. */ 37276 + - b.ne L(no_zva) 37277 + - 37278 + - stp q0, q0, [dst, 32] 37279 + - stp q0, q0, [dst, 64] 37280 + - stp q0, q0, [dst, 96] 37281 + - bic dst, dst, 127 37282 + - sub count, dstend, dst /* Count is now 128 too large. */ 37283 + - sub count, count, 128 + 128 /* Adjust count and bias for loop. 
*/ 37284 + -1: add dst, dst, 128 37285 + - dc zva, dst 37286 + - subs count, count, 128 37287 + - b.hi 1b 37288 + - stp q0, q0, [dstend, -128] 37289 + - stp q0, q0, [dstend, -96] 37290 + - stp q0, q0, [dstend, -64] 37291 + - stp q0, q0, [dstend, -32] 37292 + - ret 37293 + -#endif 37294 + - 37295 + END (MEMSET) 37296 + libc_hidden_builtin_def (MEMSET) 37297 + 37298 + commit 27fa0268ead054810a5e2669d0b5bb88ceb05b05 37299 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 37300 + Date: Wed Jul 24 15:17:47 2024 +0100 37301 + 37302 + math: Improve layout of expf data 37303 + 37304 + GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch 37305 + changes the exp2f_data struct slightly so that the fields are better aligned. 37306 + As a result on targets that support them, load-pair instructions accessing 37307 + poly_scaled and invln2_scaled are now 16-byte aligned. 37308 + 37309 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 37310 + (cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a) 37311 + 37312 + diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h 37313 + index 729f22cd4f..dc07ebd459 100644 37314 + --- a/sysdeps/ieee754/flt-32/math_config.h 37315 + +++ b/sysdeps/ieee754/flt-32/math_config.h 37316 + @@ -166,9 +166,9 @@ extern const struct exp2f_data 37317 + uint64_t tab[1 << EXP2F_TABLE_BITS]; 37318 + double shift_scaled; 37319 + double poly[EXP2F_POLY_ORDER]; 37320 + - double shift; 37321 + double invln2_scaled; 37322 + double poly_scaled[EXP2F_POLY_ORDER]; 37323 + + double shift; 37324 + } __exp2f_data attribute_hidden; 37325 + 37326 + #define LOGF_TABLE_BITS 4 37327 + 37328 + commit 7038970f1f485fb660606f0c596f432fdef250f6 37329 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 37330 + Date: Tue Dec 24 18:01:59 2024 +0000 37331 + 37332 + AArch64: Add SVE memset 37333 + 37334 + Add SVE memset based on the generic memset with predicated load for sizes < 16. 
37335 + Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned 37336 + stores for the last 64 bytes. Performance of random memset benchmark improves 37337 + by ~2% on Neoverse V1. 37338 + 37339 + Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com> 37340 + (cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548) 37341 + 37342 + diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile 37343 + index 3e251cc234..6880ebc035 100644 37344 + --- a/sysdeps/aarch64/multiarch/Makefile 37345 + +++ b/sysdeps/aarch64/multiarch/Makefile 37346 + @@ -16,6 +16,7 @@ sysdep_routines += \ 37347 + memset_kunpeng \ 37348 + memset_mops \ 37349 + memset_oryon1 \ 37350 + + memset_sve_zva64 \ 37351 + memset_zva64 \ 37352 + strlen_asimd \ 37353 + strlen_generic \ 37354 + diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c 37355 + index b2fda541f9..1f101a719b 100644 37356 + --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c 37357 + +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c 37358 + @@ -61,6 +61,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 37359 + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) 37360 + #if HAVE_AARCH64_SVE_ASM 37361 + IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx) 37362 + + IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64) 37363 + #endif 37364 + IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops) 37365 + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) 37366 + diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c 37367 + index bd063c16c9..4f65295e77 100644 37368 + --- a/sysdeps/aarch64/multiarch/memset.c 37369 + +++ b/sysdeps/aarch64/multiarch/memset.c 37370 + @@ -36,6 +36,7 @@ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; 37371 + extern __typeof (__redirect_memset) __memset_generic 
attribute_hidden; 37372 + extern __typeof (__redirect_memset) __memset_mops attribute_hidden; 37373 + extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden; 37374 + +extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden; 37375 + 37376 + static inline __typeof (__redirect_memset) * 37377 + select_memset_ifunc (void) 37378 + @@ -49,6 +50,9 @@ select_memset_ifunc (void) 37379 + { 37380 + if (IS_A64FX (midr) && zva_size == 256) 37381 + return __memset_a64fx; 37382 + + 37383 + + if (zva_size == 64) 37384 + + return __memset_sve_zva64; 37385 + } 37386 + 37387 + if (IS_ORYON1 (midr) && zva_size == 64) 37388 + diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S 37389 + new file mode 100644 37390 + index 0000000000..7fb40fdd9e 37391 + --- /dev/null 37392 + +++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S 37393 + @@ -0,0 +1,123 @@ 37394 + +/* Optimized memset for SVE. 37395 + + Copyright (C) 2025 Free Software Foundation, Inc. 37396 + + 37397 + + This file is part of the GNU C Library. 37398 + + 37399 + + The GNU C Library is free software; you can redistribute it and/or 37400 + + modify it under the terms of the GNU Lesser General Public 37401 + + License as published by the Free Software Foundation; either 37402 + + version 2.1 of the License, or (at your option) any later version. 37403 + + 37404 + + The GNU C Library is distributed in the hope that it will be useful, 37405 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 37406 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37407 + + Lesser General Public License for more details. 37408 + + 37409 + + You should have received a copy of the GNU Lesser General Public 37410 + + License along with the GNU C Library. If not, see 37411 + + <https://www.gnu.org/licenses/>. 
*/ 37412 + + 37413 + +#include <sysdep.h> 37414 + + 37415 + +/* Assumptions: 37416 + + * 37417 + + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. 37418 + + * ZVA size is 64. 37419 + + */ 37420 + + 37421 + +#if HAVE_AARCH64_SVE_ASM 37422 + + 37423 + +.arch armv8.2-a+sve 37424 + + 37425 + +#define dstin x0 37426 + +#define val x1 37427 + +#define valw w1 37428 + +#define count x2 37429 + +#define dst x3 37430 + +#define dstend x4 37431 + +#define zva_val x5 37432 + +#define vlen x5 37433 + +#define off x3 37434 + +#define dstend2 x5 37435 + + 37436 + +ENTRY (__memset_sve_zva64) 37437 + + dup v0.16B, valw 37438 + + cmp count, 16 37439 + + b.lo L(set_16) 37440 + + 37441 + + add dstend, dstin, count 37442 + + cmp count, 64 37443 + + b.hs L(set_128) 37444 + + 37445 + + /* Set 16..63 bytes. */ 37446 + + mov off, 16 37447 + + and off, off, count, lsr 1 37448 + + sub dstend2, dstend, off 37449 + + str q0, [dstin] 37450 + + str q0, [dstin, off] 37451 + + str q0, [dstend2, -16] 37452 + + str q0, [dstend, -16] 37453 + + ret 37454 + + 37455 + + .p2align 4 37456 + +L(set_16): 37457 + + whilelo p0.b, xzr, count 37458 + + st1b z0.b, p0, [dstin] 37459 + + ret 37460 + + 37461 + + .p2align 4 37462 + +L(set_128): 37463 + + bic dst, dstin, 15 37464 + + cmp count, 128 37465 + + b.hi L(set_long) 37466 + + stp q0, q0, [dstin] 37467 + + stp q0, q0, [dstin, 32] 37468 + + stp q0, q0, [dstend, -64] 37469 + + stp q0, q0, [dstend, -32] 37470 + + ret 37471 + + 37472 + + .p2align 4 37473 + +L(set_long): 37474 + + cmp count, 256 37475 + + b.lo L(no_zva) 37476 + + tst valw, 255 37477 + + b.ne L(no_zva) 37478 + + 37479 + + str q0, [dstin] 37480 + + str q0, [dst, 16] 37481 + + bic dst, dstin, 31 37482 + + stp q0, q0, [dst, 32] 37483 + + bic dst, dstin, 63 37484 + + sub count, dstend, dst /* Count is now 64 too large. */ 37485 + + sub count, count, 128 /* Adjust count and bias for loop. */ 37486 + + 37487 + + sub x8, dstend, 1 /* Write last bytes before ZVA loop. 
*/ 37488 + + bic x8, x8, 15 37489 + + stp q0, q0, [x8, -48] 37490 + + str q0, [x8, -16] 37491 + + str q0, [dstend, -16] 37492 + + 37493 + + .p2align 4 37494 + +L(zva64_loop): 37495 + + add dst, dst, 64 37496 + + dc zva, dst 37497 + + subs count, count, 64 37498 + + b.hi L(zva64_loop) 37499 + + ret 37500 + + 37501 + +L(no_zva): 37502 + + str q0, [dstin] 37503 + + sub count, dstend, dst /* Count is 16 too large. */ 37504 + + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ 37505 + +L(no_zva_loop): 37506 + + stp q0, q0, [dst, 16] 37507 + + stp q0, q0, [dst, 48] 37508 + + add dst, dst, 64 37509 + + subs count, count, 64 37510 + + b.hi L(no_zva_loop) 37511 + + stp q0, q0, [dstend, -64] 37512 + + stp q0, q0, [dstend, -32] 37513 + + ret 37514 + + 37515 + +END (__memset_sve_zva64) 37516 + +#endif 37517 + 37518 + commit d6175a44e95fe443d0fbfed37a9ff7424f1e2661 37519 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 37520 + Date: Thu Feb 27 16:28:52 2025 +0000 37521 + 37522 + AArch64: Use prefer_sve_ifuncs for SVE memset 37523 + 37524 + Use prefer_sve_ifuncs for SVE memset just like memcpy. 
37525 + 37526 + Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com> 37527 + (cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4) 37528 + 37529 + diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c 37530 + index 4f65295e77..bb1e865c97 100644 37531 + --- a/sysdeps/aarch64/multiarch/memset.c 37532 + +++ b/sysdeps/aarch64/multiarch/memset.c 37533 + @@ -51,7 +51,7 @@ select_memset_ifunc (void) 37534 + if (IS_A64FX (midr) && zva_size == 256) 37535 + return __memset_a64fx; 37536 + 37537 + - if (zva_size == 64) 37538 + + if (prefer_sve_ifuncs && zva_size == 64) 37539 + return __memset_sve_zva64; 37540 + } 37541 + 37542 + 37543 + commit d8e8342369831808b00324790c8809ba33408ee7 37544 + Author: Wilco Dijkstra <wilco.dijkstra@arm.com> 37545 + Date: Fri Dec 13 15:43:07 2024 +0000 37546 + 37547 + math: Improve layout of exp/exp10 data 37548 + 37549 + GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch 37550 + changes the exp_data struct slightly so that the fields are better aligned 37551 + and without gaps. As a result on targets that support them, more load-pair 37552 + instructions are used in exp. Exp10 is improved by moving invlog10_2N later 37553 + so that neglog10_2hiN and neglog10_2loN can be loaded using load-pair. 37554 + 37555 + The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on 37556 + Neoverse V2. Exp10 improves by 1.5%. 
37557 + 37558 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 37559 + (cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7) 37560 + 37561 + diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h 37562 + index ef87cfa6be..05515fd95a 100644 37563 + --- a/sysdeps/ieee754/dbl-64/math_config.h 37564 + +++ b/sysdeps/ieee754/dbl-64/math_config.h 37565 + @@ -195,16 +195,18 @@ check_uflow (double x) 37566 + extern const struct exp_data 37567 + { 37568 + double invln2N; 37569 + - double shift; 37570 + double negln2hiN; 37571 + double negln2loN; 37572 + double poly[4]; /* Last four coefficients. */ 37573 + + double shift; 37574 + + 37575 + double exp2_shift; 37576 + double exp2_poly[EXP2_POLY_ORDER]; 37577 + - double invlog10_2N; 37578 + + 37579 + double neglog10_2hiN; 37580 + double neglog10_2loN; 37581 + double exp10_poly[5]; 37582 + + double invlog10_2N; 37583 + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; 37584 + } __exp_data attribute_hidden; 37585 + 37586 + 37587 + commit 3e820e17a8cef84645d83b67abcbc3f88c7fd268 37588 + Author: Michael Jeanson <mjeanson@efficios.com> 37589 + Date: Fri Feb 14 13:54:22 2025 -0500 37590 + 37591 + nptl: clear the whole rseq area before registration 37592 + 37593 + Due to the extensible nature of the rseq area we can't explictly 37594 + initialize fields that are not part of the ABI yet. It was agreed with 37595 + upstream that all new fields will be documented as zero initialized by 37596 + userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will 37597 + validate the content of all fields during registration. 37598 + 37599 + Replace the explicit field initialization with a memset of the whole 37600 + rseq area which will cover fields as they are added to future kernels. 
37601 + 37602 + Signed-off-by: Michael Jeanson <mjeanson@efficios.com> 37603 + Reviewed-by: Florian Weimer <fweimer@redhat.com> 37604 + (cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4) 37605 + 37606 + diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c 37607 + index 7803e19fd1..ed10185e37 100644 37608 + --- a/sysdeps/nptl/dl-tls_init_tp.c 37609 + +++ b/sysdeps/nptl/dl-tls_init_tp.c 37610 + @@ -23,6 +23,7 @@ 37611 + #include <tls.h> 37612 + #include <rseq-internal.h> 37613 + #include <thread_pointer.h> 37614 + +#include <dl-symbol-redir-ifunc.h> 37615 + 37616 + #define TUNABLE_NAMESPACE pthread 37617 + #include <dl-tunables.h> 37618 + diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h 37619 + index ef3eab1fef..76de2b7ff0 100644 37620 + --- a/sysdeps/unix/sysv/linux/rseq-internal.h 37621 + +++ b/sysdeps/unix/sysv/linux/rseq-internal.h 37622 + @@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) 37623 + but still expected size 32. */ 37624 + size = RSEQ_AREA_SIZE_INITIAL; 37625 + 37626 + - /* Initialize the rseq fields that are read by the kernel on 37627 + - registration, there is no guarantee that struct pthread is 37628 + - cleared on all architectures. */ 37629 + + /* Initialize the whole rseq area to zero prior to registration. */ 37630 + + memset (&self->rseq_area, 0, size); 37631 + + 37632 + + /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by 37633 + + the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. 
*/ 37634 + THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); 37635 + - THREAD_SETMEM (self, rseq_area.cpu_id_start, 0); 37636 + - THREAD_SETMEM (self, rseq_area.rseq_cs, 0); 37637 + - THREAD_SETMEM (self, rseq_area.flags, 0); 37638 + 37639 + int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, 37640 + size, 0, RSEQ_SIG); 37641 + 37642 + commit ee1ab9302363066b49cf8862b96664ed35eda81c 37643 + Author: Sunil K Pandey <skpgkp2@gmail.com> 37644 + Date: Mon Mar 10 10:24:07 2025 -0700 37645 + 37646 + x86_64: Add tanh with FMA 37647 + 37648 + On Skylake, it improves tanh bench performance by: 37649 + 37650 + Before After Improvement 37651 + max 110.89 95.826 14% 37652 + min 20.966 20.157 4% 37653 + mean 30.9601 29.8431 4% 37654 + 37655 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 37656 + (cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85) 37657 + 37658 + diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c 37659 + index 673a97102d..13063db04e 100644 37660 + --- a/sysdeps/ieee754/dbl-64/s_tanh.c 37661 + +++ b/sysdeps/ieee754/dbl-64/s_tanh.c 37662 + @@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $"; 37663 + 37664 + static const double one = 1.0, two = 2.0, tiny = 1.0e-300; 37665 + 37666 + +#ifndef SECTION 37667 + +# define SECTION 37668 + +#endif 37669 + + 37670 + +SECTION 37671 + double 37672 + __tanh (double x) 37673 + { 37674 + diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile 37675 + index cbe09d49f4..0f69f7089c 100644 37676 + --- a/sysdeps/x86_64/fpu/multiarch/Makefile 37677 + +++ b/sysdeps/x86_64/fpu/multiarch/Makefile 37678 + @@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2 37679 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 37680 + CFLAGS-s_sin-fma.c = -mfma -mavx2 37681 + CFLAGS-s_tan-fma.c = -mfma -mavx2 37682 + +CFLAGS-s_tanh-fma.c = -mfma -mavx2 37683 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 37684 + 37685 + 
CFLAGS-e_exp2f-fma.c = -mfma -mavx2 37686 + @@ -92,6 +93,7 @@ libm-sysdep_routines += \ 37687 + s_sinf-sse2 \ 37688 + s_tan-avx \ 37689 + s_tan-fma \ 37690 + + s_tanh-fma \ 37691 + s_trunc-sse4_1 \ 37692 + s_truncf-sse4_1 \ 37693 + # libm-sysdep_routines 37694 + diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c 37695 + new file mode 100644 37696 + index 0000000000..1b808b1227 37697 + --- /dev/null 37698 + +++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c 37699 + @@ -0,0 +1,11 @@ 37700 + +#define __tanh __tanh_fma 37701 + +#define __expm1 __expm1_fma 37702 + + 37703 + +/* NB: __expm1 may be expanded to __expm1_fma in the following 37704 + + prototypes. */ 37705 + +extern long double __expm1l (long double); 37706 + +extern long double __expm1f128 (long double); 37707 + + 37708 + +#define SECTION __attribute__ ((section (".text.fma"))) 37709 + + 37710 + +#include <sysdeps/ieee754/dbl-64/s_tanh.c> 37711 + diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c 37712 + new file mode 100644 37713 + index 0000000000..5539b6c61c 37714 + --- /dev/null 37715 + +++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c 37716 + @@ -0,0 +1,31 @@ 37717 + +/* Multiple versions of tanh. 37718 + + Copyright (C) 2025 Free Software Foundation, Inc. 37719 + + This file is part of the GNU C Library. 37720 + + 37721 + + The GNU C Library is free software; you can redistribute it and/or 37722 + + modify it under the terms of the GNU Lesser General Public 37723 + + License as published by the Free Software Foundation; either 37724 + + version 2.1 of the License, or (at your option) any later version. 37725 + + 37726 + + The GNU C Library is distributed in the hope that it will be useful, 37727 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 37728 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37729 + + Lesser General Public License for more details. 
37730 + + 37731 + + You should have received a copy of the GNU Lesser General Public 37732 + + License along with the GNU C Library; if not, see 37733 + + <https://www.gnu.org/licenses/>. */ 37734 + + 37735 + +#include <sysdeps/x86/isa-level.h> 37736 + +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL 37737 + + 37738 + +extern double __redirect_tanh (double); 37739 + + 37740 + +# define SYMBOL_NAME tanh 37741 + +# include "ifunc-fma.h" 37742 + + 37743 + +libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ()); 37744 + + 37745 + +# define __tanh __tanh_sse2 37746 + +#endif 37747 + +#include <sysdeps/ieee754/dbl-64/s_tanh.c> 37748 + 37749 + commit e854f6d37cbeabb9130fed74b587befad8b4ba08 37750 + Author: Sunil K Pandey <skpgkp2@gmail.com> 37751 + Date: Sat Mar 8 08:51:10 2025 -0800 37752 + 37753 + x86_64: Add sinh with FMA 37754 + 37755 + On SPR, it improves sinh bench performance by: 37756 + 37757 + Before After Improvement 37758 + reciprocal-throughput 14.2017 11.815 17% 37759 + latency 36.4917 35.2114 4% 37760 + 37761 + Reviewed-by: H.J. 
Lu <hjl.tools@gmail.com> 37762 + (cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe) 37763 + 37764 + diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs 37765 + index 7b1ac46a39..2fcb2fabf8 100644 37766 + --- a/benchtests/sinh-inputs 37767 + +++ b/benchtests/sinh-inputs 37768 + @@ -1,6 +1,7 @@ 37769 + ## args: double 37770 + ## ret: double 37771 + ## includes: math.h 37772 + +## name: workload-random 37773 + 0x1.bcb6129b5ff2bp8 37774 + -0x1.63057386325ebp9 37775 + 0x1.62f1d7dc4e8bfp9 37776 + diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c 37777 + index b4b5857ddd..3f787967f9 100644 37778 + --- a/sysdeps/ieee754/dbl-64/e_sinh.c 37779 + +++ b/sysdeps/ieee754/dbl-64/e_sinh.c 37780 + @@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $"; 37781 + 37782 + static const double one = 1.0, shuge = 1.0e307; 37783 + 37784 + +#ifndef SECTION 37785 + +# define SECTION 37786 + +#endif 37787 + + 37788 + +SECTION 37789 + double 37790 + __ieee754_sinh (double x) 37791 + { 37792 + @@ -90,4 +95,7 @@ __ieee754_sinh (double x) 37793 + /* |x| > overflowthresold, sinh(x) overflow */ 37794 + return math_narrow_eval (x * shuge); 37795 + } 37796 + + 37797 + +#ifndef __ieee754_sinh 37798 + libm_alias_finite (__ieee754_sinh, __sinh) 37799 + +#endif 37800 + diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile 37801 + index 0f69f7089c..b527cab8d1 100644 37802 + --- a/sysdeps/x86_64/fpu/multiarch/Makefile 37803 + +++ b/sysdeps/x86_64/fpu/multiarch/Makefile 37804 + @@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2 37805 + CFLAGS-e_log-fma.c = -mfma -mavx2 37806 + CFLAGS-e_log2-fma.c = -mfma -mavx2 37807 + CFLAGS-e_pow-fma.c = -mfma -mavx2 37808 + +CFLAGS-e_sinh-fma.c = -mfma -mavx2 37809 + CFLAGS-s_atan-fma.c = -mfma -mavx2 37810 + CFLAGS-s_expm1-fma.c = -mfma -mavx2 37811 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 37812 + @@ -67,6 +68,7 @@ libm-sysdep_routines += \ 
37813 + e_logf-fma \ 37814 + e_pow-fma \ 37815 + e_powf-fma \ 37816 + + e_sinh-fma \ 37817 + s_atan-avx \ 37818 + s_atan-fma \ 37819 + s_ceil-sse4_1 \ 37820 + diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c 37821 + new file mode 100644 37822 + index 0000000000..e0e1e39a7a 37823 + --- /dev/null 37824 + +++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c 37825 + @@ -0,0 +1,12 @@ 37826 + +#define __ieee754_sinh __ieee754_sinh_fma 37827 + +#define __ieee754_exp __ieee754_exp_fma 37828 + +#define __expm1 __expm1_fma 37829 + + 37830 + +/* NB: __expm1 may be expanded to __expm1_fma in the following 37831 + + prototypes. */ 37832 + +extern long double __expm1l (long double); 37833 + +extern long double __expm1f128 (long double); 37834 + + 37835 + +#define SECTION __attribute__ ((section (".text.fma"))) 37836 + + 37837 + +#include <sysdeps/ieee754/dbl-64/e_sinh.c> 37838 + diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c 37839 + new file mode 100644 37840 + index 0000000000..3d3c18ccdf 37841 + --- /dev/null 37842 + +++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c 37843 + @@ -0,0 +1,35 @@ 37844 + +/* Multiple versions of sinh. 37845 + + Copyright (C) 2025 Free Software Foundation, Inc. 37846 + + This file is part of the GNU C Library. 37847 + + 37848 + + The GNU C Library is free software; you can redistribute it and/or 37849 + + modify it under the terms of the GNU Lesser General Public 37850 + + License as published by the Free Software Foundation; either 37851 + + version 2.1 of the License, or (at your option) any later version. 37852 + + 37853 + + The GNU C Library is distributed in the hope that it will be useful, 37854 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 37855 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37856 + + Lesser General Public License for more details. 
37857 + + 37858 + + You should have received a copy of the GNU Lesser General Public 37859 + + License along with the GNU C Library; if not, see 37860 + + <https://www.gnu.org/licenses/>. */ 37861 + + 37862 + +#include <sysdeps/x86/isa-level.h> 37863 + +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL 37864 + +# include <libm-alias-finite.h> 37865 + + 37866 + +extern double __redirect_ieee754_sinh (double); 37867 + + 37868 + +# define SYMBOL_NAME ieee754_sinh 37869 + +# include "ifunc-fma.h" 37870 + + 37871 + +libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh, 37872 + + IFUNC_SELECTOR ()); 37873 + + 37874 + +libm_alias_finite (__ieee754_sinh, __sinh) 37875 + + 37876 + +# define __ieee754_sinh __ieee754_sinh_sse2 37877 + +#endif 37878 + +#include <sysdeps/ieee754/dbl-64/e_sinh.c> 37879 + 37880 + commit e5f5dfdda28def8362896bdb1748bb27dfc8be73 37881 + Author: Sunil K Pandey <skpgkp2@gmail.com> 37882 + Date: Wed Mar 5 16:13:38 2025 -0800 37883 + 37884 + x86_64: Add atanh with FMA 37885 + 37886 + On SPR, it improves atanh bench performance by: 37887 + 37888 + Before After Improvement 37889 + reciprocal-throughput 15.1715 14.8628 2% 37890 + latency 57.1941 56.1883 2% 37891 + 37892 + Reviewed-by: H.J. 
Lu <hjl.tools@gmail.com> 37893 + (cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8) 37894 + 37895 + diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs 37896 + index 455aa65b65..4985293254 100644 37897 + --- a/benchtests/atanh-inputs 37898 + +++ b/benchtests/atanh-inputs 37899 + @@ -1,6 +1,7 @@ 37900 + ## args: double 37901 + ## ret: double 37902 + ## includes: math.h 37903 + +## name: workload-random 37904 + 0x1.5a2730bacd94ap-1 37905 + -0x1.b57eb40fc048ep-21 37906 + -0x1.c0b185fb450e2p-17 37907 + diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c 37908 + index 11a2a45799..05ac0a1b30 100644 37909 + --- a/sysdeps/ieee754/dbl-64/e_atanh.c 37910 + +++ b/sysdeps/ieee754/dbl-64/e_atanh.c 37911 + @@ -44,6 +44,11 @@ 37912 + 37913 + static const double huge = 1e300; 37914 + 37915 + +#ifndef SECTION 37916 + +# define SECTION 37917 + +#endif 37918 + + 37919 + +SECTION 37920 + double 37921 + __ieee754_atanh (double x) 37922 + { 37923 + @@ -73,4 +78,7 @@ __ieee754_atanh (double x) 37924 + 37925 + return copysign (t, x); 37926 + } 37927 + + 37928 + +#ifndef __ieee754_atanh 37929 + libm_alias_finite (__ieee754_atanh, __atanh) 37930 + +#endif 37931 + diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile 37932 + index b527cab8d1..bc479b42d2 100644 37933 + --- a/sysdeps/x86_64/fpu/multiarch/Makefile 37934 + +++ b/sysdeps/x86_64/fpu/multiarch/Makefile 37935 + @@ -1,6 +1,7 @@ 37936 + ifeq ($(subdir),math) 37937 + CFLAGS-e_asin-fma.c = -mfma -mavx2 37938 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 37939 + +CFLAGS-e_atanh-fma.c = -mfma -mavx2 37940 + CFLAGS-e_exp-fma.c = -mfma -mavx2 37941 + CFLAGS-e_log-fma.c = -mfma -mavx2 37942 + CFLAGS-e_log2-fma.c = -mfma -mavx2 37943 + @@ -57,6 +58,7 @@ libm-sysdep_routines += \ 37944 + e_asin-fma \ 37945 + e_atan2-avx \ 37946 + e_atan2-fma \ 37947 + + e_atanh-fma \ 37948 + e_exp-avx \ 37949 + e_exp-fma \ 37950 + e_exp2f-fma \ 37951 + diff --git 
a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c 37952 + new file mode 100644 37953 + index 0000000000..c3f2f9e550 37954 + --- /dev/null 37955 + +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c 37956 + @@ -0,0 +1,6 @@ 37957 + +#define __ieee754_atanh __ieee754_atanh_fma 37958 + +#define __log1p __log1p_fma 37959 + + 37960 + +#define SECTION __attribute__ ((section (".text.fma"))) 37961 + + 37962 + +#include <sysdeps/ieee754/dbl-64/e_atanh.c> 37963 + diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c 37964 + new file mode 100644 37965 + index 0000000000..d2b785dfc0 37966 + --- /dev/null 37967 + +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c 37968 + @@ -0,0 +1,34 @@ 37969 + +/* Multiple versions of atanh. 37970 + + Copyright (C) 2025 Free Software Foundation, Inc. 37971 + + This file is part of the GNU C Library. 37972 + + 37973 + + The GNU C Library is free software; you can redistribute it and/or 37974 + + modify it under the terms of the GNU Lesser General Public 37975 + + License as published by the Free Software Foundation; either 37976 + + version 2.1 of the License, or (at your option) any later version. 37977 + + 37978 + + The GNU C Library is distributed in the hope that it will be useful, 37979 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 37980 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37981 + + Lesser General Public License for more details. 37982 + + 37983 + + You should have received a copy of the GNU Lesser General Public 37984 + + License along with the GNU C Library; if not, see 37985 + + <https://www.gnu.org/licenses/>. 
*/ 37986 + + 37987 + +#include <sysdeps/x86/isa-level.h> 37988 + +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL 37989 + +# include <libm-alias-finite.h> 37990 + + 37991 + +extern double __redirect_ieee754_atanh (double); 37992 + + 37993 + +# define SYMBOL_NAME ieee754_atanh 37994 + +# include "ifunc-fma.h" 37995 + + 37996 + +libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); 37997 + + 37998 + +libm_alias_finite (__ieee754_atanh, __atanh) 37999 + + 38000 + +# define __ieee754_atanh __ieee754_atanh_sse2 38001 + +#endif 38002 + +#include <sysdeps/ieee754/dbl-64/e_atanh.c> 38003 + 38004 + commit 8fc492bb4234edc1a5e8c3b7f76ba345ea7109ec 38005 + Author: Florian Weimer <fweimer@redhat.com> 38006 + Date: Fri Mar 28 09:26:06 2025 +0100 38007 + 38008 + x86: Skip XSAVE state size reset if ISA level requires XSAVE 38009 + 38010 + If we have to use XSAVE or XSAVEC trampolines, do not adjust the size 38011 + information they need. Technically, it is an operator error to try to 38012 + run with -XSAVE,-XSAVEC on such builds, but this change here disables 38013 + some unnecessary code with higher ISA levels and simplifies testing. 38014 + 38015 + Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922 38016 + ("x86-64: Don't use SSE resolvers for ISA level 3 or above"). 38017 + 38018 + Reviewed-by: H.J. 
Lu <hjl.tools@gmail.com> 38019 + (cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02) 38020 + 38021 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38022 + index c096dd390a..b5b264db7f 100644 38023 + --- a/sysdeps/x86/cpu-features.c 38024 + +++ b/sysdeps/x86/cpu-features.c 38025 + @@ -24,6 +24,7 @@ 38026 + #include <dl-cacheinfo.h> 38027 + #include <dl-minsigstacksize.h> 38028 + #include <dl-hwcap2.h> 38029 + +#include <gcc-macros.h> 38030 + 38031 + extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) 38032 + attribute_hidden; 38033 + @@ -1119,6 +1120,9 @@ no_cpuid: 38034 + TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); 38035 + #endif 38036 + 38037 + + /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build 38038 + + requires AVX and therefore XSAVE or XSAVEC support. */ 38039 + +#ifndef GCCMACRO__AVX__ 38040 + bool disable_xsave_features = false; 38041 + 38042 + if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE)) 38043 + @@ -1172,6 +1176,7 @@ no_cpuid: 38044 + 38045 + CPU_FEATURE_UNSET (cpu_features, FMA4); 38046 + } 38047 + +#endif 38048 + 38049 + #ifdef __x86_64__ 38050 + GLRO(dl_hwcap) = HWCAP_X86_64; 38051 + 38052 + commit df22af58f66e6815c054b1c56249356c2994935a 38053 + Author: Florian Weimer <fweimer@redhat.com> 38054 + Date: Fri Mar 28 09:26:59 2025 +0100 38055 + 38056 + x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810) 38057 + 38058 + Previously, the initialization code reused the xsave_state_full_size 38059 + member of struct cpu_features for the TLSDESC state size. However, 38060 + the tunable processing code assumes that this member has the 38061 + original XSAVE (non-compact) state size, so that it can use its 38062 + value if XSAVEC is disabled via tunable. 38063 + 38064 + This change uses a separate variable and not a struct member because 38065 + the value is only needed in ld.so and the static libc, but not in 38066 + libc.so. 
As a result, struct cpu_features layout does not change, 38067 + helping a future backport of this change. 38068 + 38069 + Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64: 38070 + Update _dl_tlsdesc_dynamic to preserve AMX registers"). 38071 + 38072 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 38073 + (cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf) 38074 + 38075 + diff --git a/NEWS b/NEWS 38076 + index 57feba81cd..7a6985f5dd 100644 38077 + --- a/NEWS 38078 + +++ b/NEWS 38079 + @@ -22,6 +22,7 @@ The following bugs are resolved with this release: 38080 + [32231] elf: Change ldconfig auxcache magic number 38081 + [32245] glibc -Wstringop-overflow= build failure on hppa 38082 + [32470] x86: Avoid integer truncation with large cache sizes 38083 + + [32810] Crash on x86-64 if XSAVEC disable via tunable 38084 + 38085 + Version 2.40 38086 + 38087 + diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile 38088 + index 5311b594af..8819fba1b7 100644 38089 + --- a/sysdeps/x86/Makefile 38090 + +++ b/sysdeps/x86/Makefile 38091 + @@ -21,6 +21,9 @@ tests += \ 38092 + tst-cpu-features-supports-static \ 38093 + tst-get-cpu-features \ 38094 + tst-get-cpu-features-static \ 38095 + + tst-gnu2-tls2-x86-noxsave \ 38096 + + tst-gnu2-tls2-x86-noxsavec \ 38097 + + tst-gnu2-tls2-x86-noxsavexsavec \ 38098 + tst-hwcap-tunables \ 38099 + # tests 38100 + tests-static += \ 38101 + @@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse 38102 + CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell 38103 + CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell 38104 + CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell 38105 + + 38106 + +LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy 38107 + +LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy 38108 + +LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy 38109 + + 38110 + +# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled 38111 + +# via tunable. 
38112 + +tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE 38113 + +tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC 38114 + +tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC 38115 + +$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ 38116 + +$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ 38117 + +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ 38118 + + $(objpfx)tst-gnu2-tls2mod0.so \ 38119 + + $(objpfx)tst-gnu2-tls2mod1.so \ 38120 + + $(objpfx)tst-gnu2-tls2mod2.so 38121 + endif 38122 + 38123 + ifeq ($(subdir),math) 38124 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38125 + index b5b264db7f..ec27337337 100644 38126 + --- a/sysdeps/x86/cpu-features.c 38127 + +++ b/sysdeps/x86/cpu-features.c 38128 + @@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) 38129 + # include <dl-cet.h> 38130 + #endif 38131 + 38132 + +unsigned long int _dl_x86_features_tlsdesc_state_size; 38133 + + 38134 + static void 38135 + update_active (struct cpu_features *cpu_features) 38136 + { 38137 + @@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features) 38138 + = xsave_state_full_size; 38139 + cpu_features->xsave_state_full_size 38140 + = xsave_state_full_size; 38141 + + _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; 38142 + 38143 + /* Check if XSAVEC is available. */ 38144 + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) 38145 + @@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features) 38146 + = ALIGN_UP ((amx_size 38147 + + TLSDESC_CALL_REGISTER_SAVE_AREA), 38148 + 64); 38149 + - /* Set xsave_state_full_size to the compact AMX 38150 + - state size for XSAVEC. NB: xsave_state_full_size 38151 + - is only used in _dl_tlsdesc_dynamic_xsave and 38152 + - _dl_tlsdesc_dynamic_xsavec. */ 38153 + - cpu_features->xsave_state_full_size = amx_size; 38154 + + /* Set TLSDESC state size to the compact AMX 38155 + + state size for XSAVEC. 
*/ 38156 + + _dl_x86_features_tlsdesc_state_size = amx_size; 38157 + #endif 38158 + cpu_features->xsave_state_size 38159 + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, 38160 + diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c 38161 + index ccc6b64dc2..a0b31d80f6 100644 38162 + --- a/sysdeps/x86/cpu-tunables.c 38163 + +++ b/sysdeps/x86/cpu-tunables.c 38164 + @@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) 38165 + /* Update xsave_state_size to XSAVE state size. */ 38166 + cpu_features->xsave_state_size 38167 + = cpu_features->xsave_state_full_size; 38168 + + _dl_x86_features_tlsdesc_state_size 38169 + + = cpu_features->xsave_state_full_size; 38170 + CPU_FEATURE_UNSET (cpu_features, XSAVEC); 38171 + } 38172 + } 38173 + diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c 38174 + index 49eeb5f70a..41100a908a 100644 38175 + --- a/sysdeps/x86/dl-diagnostics-cpu.c 38176 + +++ b/sysdeps/x86/dl-diagnostics-cpu.c 38177 + @@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void) 38178 + cpu_features->xsave_state_size); 38179 + print_cpu_features_value ("xsave_state_full_size", 38180 + cpu_features->xsave_state_full_size); 38181 + + print_cpu_features_value ("tlsdesc_state_full_size", 38182 + + _dl_x86_features_tlsdesc_state_size); 38183 + print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); 38184 + print_cpu_features_value ("shared_cache_size", 38185 + cpu_features->shared_cache_size); 38186 + diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h 38187 + index aaae44f0e1..03c71387dd 100644 38188 + --- a/sysdeps/x86/include/cpu-features.h 38189 + +++ b/sysdeps/x86/include/cpu-features.h 38190 + @@ -934,8 +934,6 @@ struct cpu_features 38191 + /* The full state size for XSAVE when XSAVEC is disabled by 38192 + 38193 + GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC 38194 + - 38195 + - and the AMX state size when XSAVEC is available. 
38196 + */ 38197 + unsigned int xsave_state_full_size; 38198 + /* Data cache size for use in memory and string routines, typically 38199 + @@ -989,6 +987,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) 38200 + 38201 + #define __get_cpu_features() _dl_x86_get_cpu_features() 38202 + 38203 + +#if IS_IN (rtld) || IS_IN (libc) 38204 + +/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to 38205 + + xsave_state_size from struct cpu_features, this includes additional 38206 + + registers. */ 38207 + +extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; 38208 + +#endif 38209 + + 38210 + #if defined (_LIBC) && !IS_IN (nonlib) 38211 + /* Unused for x86. */ 38212 + # define INIT_ARCH() 38213 + diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c 38214 + new file mode 100644 38215 + index 0000000000..f0024c143d 38216 + --- /dev/null 38217 + +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c 38218 + @@ -0,0 +1 @@ 38219 + +#include <elf/tst-gnu2-tls2.c> 38220 + diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c 38221 + new file mode 100644 38222 + index 0000000000..f0024c143d 38223 + --- /dev/null 38224 + +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c 38225 + @@ -0,0 +1 @@ 38226 + +#include <elf/tst-gnu2-tls2.c> 38227 + diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c 38228 + new file mode 100644 38229 + index 0000000000..f0024c143d 38230 + --- /dev/null 38231 + +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c 38232 + @@ -0,0 +1 @@ 38233 + +#include <elf/tst-gnu2-tls2.c> 38234 + diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h 38235 + index 9f02cfc3eb..44d948696f 100644 38236 + --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h 38237 + +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h 38238 + @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: 38239 + # endif 38240 + 
#else 38241 + /* Allocate stack space of the required size to save the state. */ 38242 + - sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP 38243 + + sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP 38244 + #endif 38245 + /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, 38246 + r10 and r11. */ 38247 + 38248 + commit a87d9a2c2cc17a3b22fd3be8d106336f4dcf2042 38249 + Author: Florian Weimer <fweimer@redhat.com> 38250 + Date: Mon Mar 31 21:33:18 2025 +0200 38251 + 38252 + x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread 38253 + 38254 + This fixes a test build failure on Hurd. 38255 + 38256 + Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate 38257 + variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)"). 38258 + 38259 + Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> 38260 + (cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c) 38261 + 38262 + diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile 38263 + index 8819fba1b7..01b0192ddf 100644 38264 + --- a/sysdeps/x86/Makefile 38265 + +++ b/sysdeps/x86/Makefile 38266 + @@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy 38267 + tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE 38268 + tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC 38269 + tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC 38270 + +$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) 38271 + +$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) 38272 + +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) 38273 + $(objpfx)tst-gnu2-tls2-x86-noxsave.out \ 38274 + $(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ 38275 + $(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ 38276 + 38277 + commit 8fe27af20c8b25b84e12bcd52353862a95044aa2 38278 + Author: Noah Goldstein <goldstein.w.n@gmail.com> 
38279 + Date: Wed Aug 14 14:37:30 2024 +0800 38280 + 38281 + x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path 38282 + 38283 + This is just a refactor and there should be no behavioral change from 38284 + this commit. 38285 + 38286 + The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob 38287 + for controlling whether we use non-temporal memset rather than having 38288 + extra logic based on vendor. 38289 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 38290 + 38291 + (cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577) 38292 + 38293 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38294 + index ec27337337..8841020b36 100644 38295 + --- a/sysdeps/x86/cpu-features.c 38296 + +++ b/sysdeps/x86/cpu-features.c 38297 + @@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features) 38298 + unsigned int stepping = 0; 38299 + enum cpu_features_kind kind; 38300 + 38301 + + /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is, 38302 + + as of writing this, we only have benchmarks indicatings it profitability 38303 + + on Intel/AMD. */ 38304 + + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38305 + + |= bit_arch_Avoid_Non_Temporal_Memset; 38306 + + 38307 + cpu_features->cachesize_non_temporal_divisor = 4; 38308 + #if !HAS_CPUID 38309 + if (__get_cpuid_max (0, 0) == 0) 38310 + @@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features) 38311 + 38312 + update_active (cpu_features); 38313 + 38314 + + /* Benchmarks indicate non-temporal memset can be profitable on Intel 38315 + + hardware. 
*/ 38316 + + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38317 + + &= ~bit_arch_Avoid_Non_Temporal_Memset; 38318 + + 38319 + if (family == 0x06) 38320 + { 38321 + model += extended_model; 38322 + @@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht 38323 + 38324 + ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx; 38325 + 38326 + + /* Benchmarks indicate non-temporal memset can be profitable on AMD 38327 + + hardware. */ 38328 + + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38329 + + &= ~bit_arch_Avoid_Non_Temporal_Memset; 38330 + + 38331 + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) 38332 + { 38333 + /* Since the FMA4 bit is in CPUID_INDEX_80000001 and 38334 + diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h 38335 + index ac97414b5b..7b1b61c096 100644 38336 + --- a/sysdeps/x86/dl-cacheinfo.h 38337 + +++ b/sysdeps/x86/dl-cacheinfo.h 38338 + @@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) 38339 + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) 38340 + rep_movsb_threshold = 2112; 38341 + 38342 + - /* Non-temporal stores are more performant on Intel and AMD hardware above 38343 + - non_temporal_threshold. Enable this for both Intel and AMD hardware. */ 38344 + - unsigned long int memset_non_temporal_threshold = SIZE_MAX; 38345 + - if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) 38346 + - && (cpu_features->basic.kind == arch_kind_intel 38347 + - || cpu_features->basic.kind == arch_kind_amd)) 38348 + - memset_non_temporal_threshold = non_temporal_threshold; 38349 + - 38350 + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of 38351 + cases slower than the vectorized path (and for some alignments, 38352 + it is really slow, check BZ #30994). 
*/ 38353 + @@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) 38354 + if (tunable_size != 0) 38355 + shared = tunable_size; 38356 + 38357 + + /* Non-temporal stores are more performant on some hardware above 38358 + + non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both 38359 + + Intel and AMD hardware. */ 38360 + + unsigned long int memset_non_temporal_threshold = SIZE_MAX; 38361 + + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) 38362 + + memset_non_temporal_threshold = non_temporal_threshold; 38363 + + 38364 + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); 38365 + if (tunable_size > minimum_non_temporal_threshold 38366 + && tunable_size <= maximum_non_temporal_threshold) 38367 + 38368 + commit 7c6bd71b4dbdadab34e4fd21ec09b86b32daf443 38369 + Author: Sunil K Pandey <skpgkp2@gmail.com> 38370 + Date: Thu Apr 3 13:00:45 2025 -0700 38371 + 38372 + x86: Optimize xstate size calculation 38373 + 38374 + Scan xstate IDs up to the maximum supported xstate ID. Remove the 38375 + separate AMX xstate calculation. Instead, exclude the AMX space from 38376 + the start of TILECFG to the end of TILEDATA in xsave_state_size. 38377 + 38378 + Completed validation on SKL/SKX/SPR/SDE and compared xsave state size 38379 + with "ld.so --list-diagnostics" option, no regression. 38380 + 38381 + Co-Authored-By: H.J. Lu <hjl.tools@gmail.com> 38382 + Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com> 38383 + (cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d) 38384 + 38385 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38386 + index 8841020b36..1d5e2a0072 100644 38387 + --- a/sysdeps/x86/cpu-features.c 38388 + +++ b/sysdeps/x86/cpu-features.c 38389 + @@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features) 38390 + /* Check if XSAVEC is available. 
*/ 38391 + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) 38392 + { 38393 + - unsigned int xstate_comp_offsets[32]; 38394 + - unsigned int xstate_comp_sizes[32]; 38395 + -#ifdef __x86_64__ 38396 + - unsigned int xstate_amx_comp_offsets[32]; 38397 + - unsigned int xstate_amx_comp_sizes[32]; 38398 + - unsigned int amx_ecx; 38399 + -#endif 38400 + + unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; 38401 + + unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; 38402 + unsigned int i; 38403 + 38404 + xstate_comp_offsets[0] = 0; 38405 + @@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features) 38406 + xstate_comp_offsets[2] = 576; 38407 + xstate_comp_sizes[0] = 160; 38408 + xstate_comp_sizes[1] = 256; 38409 + -#ifdef __x86_64__ 38410 + - xstate_amx_comp_offsets[0] = 0; 38411 + - xstate_amx_comp_offsets[1] = 160; 38412 + - xstate_amx_comp_offsets[2] = 576; 38413 + - xstate_amx_comp_sizes[0] = 160; 38414 + - xstate_amx_comp_sizes[1] = 256; 38415 + -#endif 38416 + 38417 + - for (i = 2; i < 32; i++) 38418 + + for (i = 2; i <= X86_XSTATE_MAX_ID; i++) 38419 + { 38420 + if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) 38421 + { 38422 + __cpuid_count (0xd, i, eax, ebx, ecx, edx); 38423 + -#ifdef __x86_64__ 38424 + - /* Include this in xsave_state_full_size. */ 38425 + - amx_ecx = ecx; 38426 + - xstate_amx_comp_sizes[i] = eax; 38427 + - if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) 38428 + - { 38429 + - /* Exclude this from xsave_state_size. 
*/ 38430 + - ecx = 0; 38431 + - xstate_comp_sizes[i] = 0; 38432 + - } 38433 + - else 38434 + -#endif 38435 + - xstate_comp_sizes[i] = eax; 38436 + + xstate_comp_sizes[i] = eax; 38437 + } 38438 + else 38439 + { 38440 + -#ifdef __x86_64__ 38441 + - amx_ecx = 0; 38442 + - xstate_amx_comp_sizes[i] = 0; 38443 + -#endif 38444 + ecx = 0; 38445 + xstate_comp_sizes[i] = 0; 38446 + } 38447 + @@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features) 38448 + { 38449 + xstate_comp_offsets[i] 38450 + = (xstate_comp_offsets[i - 1] 38451 + - + xstate_comp_sizes[i -1]); 38452 + + + xstate_comp_sizes[i - 1]); 38453 + if ((ecx & (1 << 1)) != 0) 38454 + xstate_comp_offsets[i] 38455 + = ALIGN_UP (xstate_comp_offsets[i], 64); 38456 + -#ifdef __x86_64__ 38457 + - xstate_amx_comp_offsets[i] 38458 + - = (xstate_amx_comp_offsets[i - 1] 38459 + - + xstate_amx_comp_sizes[i - 1]); 38460 + - if ((amx_ecx & (1 << 1)) != 0) 38461 + - xstate_amx_comp_offsets[i] 38462 + - = ALIGN_UP (xstate_amx_comp_offsets[i], 38463 + - 64); 38464 + -#endif 38465 + } 38466 + } 38467 + 38468 + /* Use XSAVEC. */ 38469 + unsigned int size 38470 + - = xstate_comp_offsets[31] + xstate_comp_sizes[31]; 38471 + + = (xstate_comp_offsets[X86_XSTATE_MAX_ID] 38472 + + + xstate_comp_sizes[X86_XSTATE_MAX_ID]); 38473 + if (size) 38474 + { 38475 + + size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, 38476 + + 64); 38477 + #ifdef __x86_64__ 38478 + - unsigned int amx_size 38479 + - = (xstate_amx_comp_offsets[31] 38480 + - + xstate_amx_comp_sizes[31]); 38481 + - amx_size 38482 + - = ALIGN_UP ((amx_size 38483 + - + TLSDESC_CALL_REGISTER_SAVE_AREA), 38484 + - 64); 38485 + - /* Set TLSDESC state size to the compact AMX 38486 + - state size for XSAVEC. */ 38487 + - _dl_x86_features_tlsdesc_state_size = amx_size; 38488 + + _dl_x86_features_tlsdesc_state_size = size; 38489 + + /* Exclude the AMX space from the start of TILECFG 38490 + + space to the end of TILEDATA space. 
If CPU 38491 + + doesn't support AMX, TILECFG offset is the same 38492 + + as TILEDATA + 1 offset. Otherwise, they are 38493 + + multiples of 64. */ 38494 + + size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1] 38495 + + - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]); 38496 + #endif 38497 + - cpu_features->xsave_state_size 38498 + - = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, 38499 + - 64); 38500 + + cpu_features->xsave_state_size = size; 38501 + CPU_FEATURE_SET (cpu_features, XSAVEC); 38502 + } 38503 + } 38504 + diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h 38505 + index 7359149e17..1d6cabd816 100644 38506 + --- a/sysdeps/x86/sysdep.h 38507 + +++ b/sysdeps/x86/sysdep.h 38508 + @@ -102,6 +102,9 @@ 38509 + | (1 << X86_XSTATE_ZMM_ID) \ 38510 + | (1 << X86_XSTATE_APX_F_ID)) 38511 + 38512 + +/* The maximum supported xstate ID. */ 38513 + +# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID 38514 + + 38515 + /* AMX state mask. */ 38516 + # define AMX_STATE_SAVE_MASK \ 38517 + ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) 38518 + @@ -123,6 +126,9 @@ 38519 + | (1 << X86_XSTATE_K_ID) \ 38520 + | (1 << X86_XSTATE_ZMM_H_ID)) 38521 + 38522 + +/* The maximum supported xstate ID. */ 38523 + +# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID 38524 + + 38525 + /* States to be included in xsave_state_size. */ 38526 + # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK 38527 + #endif 38528 + 38529 + commit 44f92df8007d57f82b1518e219a0dbb60389ef2c 38530 + Author: Sunil K Pandey <skpgkp2@gmail.com> 38531 + Date: Thu Apr 3 18:14:20 2025 -0700 38532 + 38533 + x86: Add ARL/PTL/CWF model detection support 38534 + 38535 + - Add ARROWLAKE model detection. 38536 + - Add PANTHERLAKE model detection. 38537 + - Add CLEARWATERFOREST model detection. 38538 + 38539 + Intel® Architecture Instruction Set Extensions Programming Reference 38540 + https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2. 38541 + 38542 + No regression, validated model detection on SDE. 
38543 + 38544 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 38545 + (cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096) 38546 + 38547 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38548 + index 1d5e2a0072..7f21a8227e 100644 38549 + --- a/sysdeps/x86/cpu-features.c 38550 + +++ b/sysdeps/x86/cpu-features.c 38551 + @@ -512,6 +512,7 @@ enum 38552 + INTEL_ATOM_GOLDMONT, 38553 + INTEL_ATOM_GOLDMONT_PLUS, 38554 + INTEL_ATOM_SIERRAFOREST, 38555 + + INTEL_ATOM_CLEARWATERFOREST, 38556 + INTEL_ATOM_GRANDRIDGE, 38557 + INTEL_ATOM_TREMONT, 38558 + 38559 + @@ -539,6 +540,7 @@ enum 38560 + INTEL_BIGCORE_METEORLAKE, 38561 + INTEL_BIGCORE_LUNARLAKE, 38562 + INTEL_BIGCORE_ARROWLAKE, 38563 + + INTEL_BIGCORE_PANTHERLAKE, 38564 + INTEL_BIGCORE_GRANITERAPIDS, 38565 + 38566 + /* Mixed (bigcore + atom SOC). */ 38567 + @@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model, 38568 + return INTEL_ATOM_GOLDMONT_PLUS; 38569 + case 0xAF: 38570 + return INTEL_ATOM_SIERRAFOREST; 38571 + + case 0xDD: 38572 + + return INTEL_ATOM_CLEARWATERFOREST; 38573 + case 0xB6: 38574 + return INTEL_ATOM_GRANDRIDGE; 38575 + case 0x86: 38576 + @@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model, 38577 + return INTEL_BIGCORE_METEORLAKE; 38578 + case 0xbd: 38579 + return INTEL_BIGCORE_LUNARLAKE; 38580 + + case 0xb5: 38581 + + case 0xc5: 38582 + case 0xc6: 38583 + return INTEL_BIGCORE_ARROWLAKE; 38584 + + case 0xCC: 38585 + + return INTEL_BIGCORE_PANTHERLAKE; 38586 + case 0xAD: 38587 + case 0xAE: 38588 + return INTEL_BIGCORE_GRANITERAPIDS; 38589 + @@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features) 38590 + Default tuned atom microarch. 38591 + case INTEL_ATOM_SIERRAFOREST: 38592 + case INTEL_ATOM_GRANDRIDGE: 38593 + + case INTEL_ATOM_CLEARWATERFOREST: 38594 + */ 38595 + 38596 + /* Bigcore/Default Tuning. 
*/ 38597 + @@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features) 38598 + case INTEL_BIGCORE_METEORLAKE: 38599 + case INTEL_BIGCORE_LUNARLAKE: 38600 + case INTEL_BIGCORE_ARROWLAKE: 38601 + + case INTEL_BIGCORE_PANTHERLAKE: 38602 + case INTEL_BIGCORE_SAPPHIRERAPIDS: 38603 + case INTEL_BIGCORE_EMERALDRAPIDS: 38604 + case INTEL_BIGCORE_GRANITERAPIDS: 38605 + 38606 + commit 9ee8083c4edbe5e92af7aabb23261309f03ef05c 38607 + Author: Sunil K Pandey <sunil.k.pandey@intel.com> 38608 + Date: Fri Apr 11 08:52:52 2025 -0700 38609 + 38610 + x86: Handle unknown Intel processor with default tuning 38611 + 38612 + Enable default tuning for unknown Intel processor. 38613 + 38614 + Tested on x86, no regression. 38615 + 38616 + Co-Authored-By: H.J. Lu <hjl.tools@gmail.com> 38617 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 38618 + (cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c) 38619 + 38620 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38621 + index 7f21a8227e..1a6e694abf 100644 38622 + --- a/sysdeps/x86/cpu-features.c 38623 + +++ b/sysdeps/x86/cpu-features.c 38624 + @@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load 38625 + "Incorrect index_arch_Fast_Unaligned_Load"); 38626 + 38627 + 38628 + -/* Intel Family-6 microarch list. */ 38629 + -enum 38630 + +/* Intel microarch list. */ 38631 + +enum intel_microarch 38632 + { 38633 + /* Atom processors. 
*/ 38634 + INTEL_ATOM_BONNELL, 38635 + @@ -555,7 +555,7 @@ enum 38636 + INTEL_UNKNOWN, 38637 + }; 38638 + 38639 + -static unsigned int 38640 + +static enum intel_microarch 38641 + intel_get_fam6_microarch (unsigned int model, 38642 + __attribute__ ((unused)) unsigned int stepping) 38643 + { 38644 + @@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features) 38645 + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38646 + &= ~bit_arch_Avoid_Non_Temporal_Memset; 38647 + 38648 + + enum intel_microarch microarch = INTEL_UNKNOWN; 38649 + if (family == 0x06) 38650 + { 38651 + model += extended_model; 38652 + - unsigned int microarch 38653 + - = intel_get_fam6_microarch (model, stepping); 38654 + + microarch = intel_get_fam6_microarch (model, stepping); 38655 + 38656 + + /* Disable TSX on some processors to avoid TSX on kernels that 38657 + + weren't updated with the latest microcode package (which 38658 + + disables broken feature by default). */ 38659 + switch (microarch) 38660 + { 38661 + - /* Atom / KNL tuning. */ 38662 + - case INTEL_ATOM_BONNELL: 38663 + - /* BSF is slow on Bonnell. */ 38664 + - cpu_features->preferred[index_arch_Slow_BSF] 38665 + - |= bit_arch_Slow_BSF; 38666 + - break; 38667 + - 38668 + - /* Unaligned load versions are faster than SSSE3 38669 + - on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ 38670 + - case INTEL_ATOM_AIRMONT: 38671 + - case INTEL_ATOM_SILVERMONT: 38672 + - case INTEL_ATOM_GOLDMONT: 38673 + - case INTEL_ATOM_GOLDMONT_PLUS: 38674 + - 38675 + - /* Knights Landing. Enable Silvermont optimizations. 
*/ 38676 + - case INTEL_KNIGHTS_LANDING: 38677 + - 38678 + - cpu_features->preferred[index_arch_Fast_Unaligned_Load] 38679 + - |= (bit_arch_Fast_Unaligned_Load 38680 + - | bit_arch_Fast_Unaligned_Copy 38681 + - | bit_arch_Prefer_PMINUB_for_stringop 38682 + - | bit_arch_Slow_SSE4_2); 38683 + - break; 38684 + - 38685 + - case INTEL_ATOM_TREMONT: 38686 + - /* Enable rep string instructions, unaligned load, unaligned 38687 + - copy, pminub and avoid SSE 4.2 on Tremont. */ 38688 + - cpu_features->preferred[index_arch_Fast_Rep_String] 38689 + - |= (bit_arch_Fast_Rep_String 38690 + - | bit_arch_Fast_Unaligned_Load 38691 + - | bit_arch_Fast_Unaligned_Copy 38692 + - | bit_arch_Prefer_PMINUB_for_stringop 38693 + - | bit_arch_Slow_SSE4_2); 38694 + - break; 38695 + - 38696 + - /* 38697 + - Default tuned Knights microarch. 38698 + - case INTEL_KNIGHTS_MILL: 38699 + - */ 38700 + - 38701 + - /* 38702 + - Default tuned atom microarch. 38703 + - case INTEL_ATOM_SIERRAFOREST: 38704 + - case INTEL_ATOM_GRANDRIDGE: 38705 + - case INTEL_ATOM_CLEARWATERFOREST: 38706 + - */ 38707 + - 38708 + - /* Bigcore/Default Tuning. */ 38709 + default: 38710 + - default_tuning: 38711 + - /* Unknown family 0x06 processors. Assuming this is one 38712 + - of Core i3/i5/i7 processors if AVX is available. */ 38713 + - if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) 38714 + - break; 38715 + - 38716 + - enable_modern_features: 38717 + - /* Rep string instructions, unaligned load, unaligned copy, 38718 + - and pminub are fast on Intel Core i3, i5 and i7. */ 38719 + - cpu_features->preferred[index_arch_Fast_Rep_String] 38720 + - |= (bit_arch_Fast_Rep_String 38721 + - | bit_arch_Fast_Unaligned_Load 38722 + - | bit_arch_Fast_Unaligned_Copy 38723 + - | bit_arch_Prefer_PMINUB_for_stringop); 38724 + break; 38725 + 38726 + - case INTEL_BIGCORE_NEHALEM: 38727 + - case INTEL_BIGCORE_WESTMERE: 38728 + - /* Older CPUs prefer non-temporal stores at lower threshold. 
*/ 38729 + - cpu_features->cachesize_non_temporal_divisor = 8; 38730 + - goto enable_modern_features; 38731 + - 38732 + - /* Older Bigcore microarch (smaller non-temporal store 38733 + - threshold). */ 38734 + - case INTEL_BIGCORE_SANDYBRIDGE: 38735 + - case INTEL_BIGCORE_IVYBRIDGE: 38736 + - case INTEL_BIGCORE_HASWELL: 38737 + - case INTEL_BIGCORE_BROADWELL: 38738 + - cpu_features->cachesize_non_temporal_divisor = 8; 38739 + - goto default_tuning; 38740 + - 38741 + - /* Newer Bigcore microarch (larger non-temporal store 38742 + - threshold). */ 38743 + - case INTEL_BIGCORE_SKYLAKE_AVX512: 38744 + - case INTEL_BIGCORE_CANNONLAKE: 38745 + - /* Benchmarks indicate non-temporal memset is not 38746 + - necessarily profitable on SKX (and in some cases much 38747 + - worse). This is likely unique to SKX due its it unique 38748 + - mesh interconnect (not present on ICX or BWD). Disable 38749 + - non-temporal on all Skylake servers. */ 38750 + - cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38751 + - |= bit_arch_Avoid_Non_Temporal_Memset; 38752 + - case INTEL_BIGCORE_COMETLAKE: 38753 + - case INTEL_BIGCORE_SKYLAKE: 38754 + - case INTEL_BIGCORE_KABYLAKE: 38755 + - case INTEL_BIGCORE_ICELAKE: 38756 + - case INTEL_BIGCORE_TIGERLAKE: 38757 + - case INTEL_BIGCORE_ROCKETLAKE: 38758 + - case INTEL_BIGCORE_RAPTORLAKE: 38759 + - case INTEL_BIGCORE_METEORLAKE: 38760 + - case INTEL_BIGCORE_LUNARLAKE: 38761 + - case INTEL_BIGCORE_ARROWLAKE: 38762 + - case INTEL_BIGCORE_PANTHERLAKE: 38763 + - case INTEL_BIGCORE_SAPPHIRERAPIDS: 38764 + - case INTEL_BIGCORE_EMERALDRAPIDS: 38765 + - case INTEL_BIGCORE_GRANITERAPIDS: 38766 + - cpu_features->cachesize_non_temporal_divisor = 2; 38767 + - goto default_tuning; 38768 + - 38769 + - /* Default tuned Mixed (bigcore + atom SOC). 
*/ 38770 + - case INTEL_MIXED_LAKEFIELD: 38771 + - case INTEL_MIXED_ALDERLAKE: 38772 + - cpu_features->cachesize_non_temporal_divisor = 2; 38773 + - goto default_tuning; 38774 + - } 38775 + - 38776 + - /* Disable TSX on some processors to avoid TSX on kernels that 38777 + - weren't updated with the latest microcode package (which 38778 + - disables broken feature by default). */ 38779 + - switch (microarch) 38780 + - { 38781 + case INTEL_BIGCORE_SKYLAKE_AVX512: 38782 + /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ 38783 + if (stepping <= 5) 38784 + @@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features) 38785 + 38786 + case INTEL_BIGCORE_KABYLAKE: 38787 + /* NB: Although the errata documents that for model == 0x8e 38788 + - (kabylake skylake client), only 0xb stepping or lower are 38789 + - impacted, the intention of the errata was to disable TSX on 38790 + - all client processors on all steppings. Include 0xc 38791 + - stepping which is an Intel Core i7-8665U, a client mobile 38792 + - processor. */ 38793 + + (kabylake skylake client), only 0xb stepping or lower are 38794 + + impacted, the intention of the errata was to disable TSX on 38795 + + all client processors on all steppings. Include 0xc 38796 + + stepping which is an Intel Core i7-8665U, a client mobile 38797 + + processor. */ 38798 + if (stepping > 0xc) 38799 + break; 38800 + /* Fall through. 
*/ 38801 + case INTEL_BIGCORE_SKYLAKE: 38802 + - /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for 38803 + - processors listed in: 38804 + - 38805 + -https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html 38806 + - */ 38807 + - disable_tsx: 38808 + - CPU_FEATURE_UNSET (cpu_features, HLE); 38809 + - CPU_FEATURE_UNSET (cpu_features, RTM); 38810 + - CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); 38811 + - break; 38812 + + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for 38813 + + processors listed in: 38814 + + 38815 + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html 38816 + + */ 38817 + +disable_tsx: 38818 + + CPU_FEATURE_UNSET (cpu_features, HLE); 38819 + + CPU_FEATURE_UNSET (cpu_features, RTM); 38820 + + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); 38821 + + break; 38822 + 38823 + case INTEL_BIGCORE_HASWELL: 38824 + - /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working 38825 + - TSX. Haswell also include other model numbers that have 38826 + - working TSX. */ 38827 + - if (model == 0x3f && stepping >= 4) 38828 + + /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working 38829 + + TSX. Haswell also includes other model numbers that have 38830 + + working TSX. */ 38831 + + if (model == 0x3f && stepping >= 4) 38832 + break; 38833 + 38834 + - CPU_FEATURE_UNSET (cpu_features, RTM); 38835 + - break; 38836 + + CPU_FEATURE_UNSET (cpu_features, RTM); 38837 + + break; 38838 + } 38839 + } 38840 + 38841 + + switch (microarch) 38842 + + { 38843 + + /* Atom / KNL tuning. */ 38844 + + case INTEL_ATOM_BONNELL: 38845 + + /* BSF is slow on Bonnell. */ 38846 + + cpu_features->preferred[index_arch_Slow_BSF] 38847 + + |= bit_arch_Slow_BSF; 38848 + + break; 38849 + + 38850 + + /* Unaligned load versions are faster than SSSE3 38851 + + on Airmont, Silvermont, Goldmont, and Goldmont Plus. 
*/ 38852 + + case INTEL_ATOM_AIRMONT: 38853 + + case INTEL_ATOM_SILVERMONT: 38854 + + case INTEL_ATOM_GOLDMONT: 38855 + + case INTEL_ATOM_GOLDMONT_PLUS: 38856 + + 38857 + + /* Knights Landing. Enable Silvermont optimizations. */ 38858 + + case INTEL_KNIGHTS_LANDING: 38859 + + 38860 + + cpu_features->preferred[index_arch_Fast_Unaligned_Load] 38861 + + |= (bit_arch_Fast_Unaligned_Load 38862 + + | bit_arch_Fast_Unaligned_Copy 38863 + + | bit_arch_Prefer_PMINUB_for_stringop 38864 + + | bit_arch_Slow_SSE4_2); 38865 + + break; 38866 + + 38867 + + case INTEL_ATOM_TREMONT: 38868 + + /* Enable rep string instructions, unaligned load, unaligned 38869 + + copy, pminub and avoid SSE 4.2 on Tremont. */ 38870 + + cpu_features->preferred[index_arch_Fast_Rep_String] 38871 + + |= (bit_arch_Fast_Rep_String 38872 + + | bit_arch_Fast_Unaligned_Load 38873 + + | bit_arch_Fast_Unaligned_Copy 38874 + + | bit_arch_Prefer_PMINUB_for_stringop 38875 + + | bit_arch_Slow_SSE4_2); 38876 + + break; 38877 + + 38878 + + /* 38879 + + Default tuned Knights microarch. 38880 + + case INTEL_KNIGHTS_MILL: 38881 + + */ 38882 + + 38883 + + /* 38884 + + Default tuned atom microarch. 38885 + + case INTEL_ATOM_SIERRAFOREST: 38886 + + case INTEL_ATOM_GRANDRIDGE: 38887 + + case INTEL_ATOM_CLEARWATERFOREST: 38888 + + */ 38889 + + 38890 + + /* Bigcore/Default Tuning. */ 38891 + + default: 38892 + + default_tuning: 38893 + + /* Unknown Intel processors. Assuming this is one of Core 38894 + + i3/i5/i7 processors if AVX is available. */ 38895 + + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) 38896 + + break; 38897 + + 38898 + + enable_modern_features: 38899 + + /* Rep string instructions, unaligned load, unaligned copy, 38900 + + and pminub are fast on Intel Core i3, i5 and i7. 
*/ 38901 + + cpu_features->preferred[index_arch_Fast_Rep_String] 38902 + + |= (bit_arch_Fast_Rep_String 38903 + + | bit_arch_Fast_Unaligned_Load 38904 + + | bit_arch_Fast_Unaligned_Copy 38905 + + | bit_arch_Prefer_PMINUB_for_stringop); 38906 + + break; 38907 + + 38908 + + case INTEL_BIGCORE_NEHALEM: 38909 + + case INTEL_BIGCORE_WESTMERE: 38910 + + /* Older CPUs prefer non-temporal stores at lower threshold. */ 38911 + + cpu_features->cachesize_non_temporal_divisor = 8; 38912 + + goto enable_modern_features; 38913 + + 38914 + + /* Older Bigcore microarch (smaller non-temporal store 38915 + + threshold). */ 38916 + + case INTEL_BIGCORE_SANDYBRIDGE: 38917 + + case INTEL_BIGCORE_IVYBRIDGE: 38918 + + case INTEL_BIGCORE_HASWELL: 38919 + + case INTEL_BIGCORE_BROADWELL: 38920 + + cpu_features->cachesize_non_temporal_divisor = 8; 38921 + + goto default_tuning; 38922 + + 38923 + + /* Newer Bigcore microarch (larger non-temporal store 38924 + + threshold). */ 38925 + + case INTEL_BIGCORE_SKYLAKE_AVX512: 38926 + + case INTEL_BIGCORE_CANNONLAKE: 38927 + + /* Benchmarks indicate non-temporal memset is not 38928 + + necessarily profitable on SKX (and in some cases much 38929 + + worse). This is likely unique to SKX due to its unique 38930 + + mesh interconnect (not present on ICX or BWD). Disable 38931 + + non-temporal on all Skylake servers. 
*/ 38932 + + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] 38933 + + |= bit_arch_Avoid_Non_Temporal_Memset; 38934 + + /* fallthrough */ 38935 + + case INTEL_BIGCORE_COMETLAKE: 38936 + + case INTEL_BIGCORE_SKYLAKE: 38937 + + case INTEL_BIGCORE_KABYLAKE: 38938 + + case INTEL_BIGCORE_ICELAKE: 38939 + + case INTEL_BIGCORE_TIGERLAKE: 38940 + + case INTEL_BIGCORE_ROCKETLAKE: 38941 + + case INTEL_BIGCORE_RAPTORLAKE: 38942 + + case INTEL_BIGCORE_METEORLAKE: 38943 + + case INTEL_BIGCORE_LUNARLAKE: 38944 + + case INTEL_BIGCORE_ARROWLAKE: 38945 + + case INTEL_BIGCORE_PANTHERLAKE: 38946 + + case INTEL_BIGCORE_SAPPHIRERAPIDS: 38947 + + case INTEL_BIGCORE_EMERALDRAPIDS: 38948 + + case INTEL_BIGCORE_GRANITERAPIDS: 38949 + + /* Default tuned Mixed (bigcore + atom SOC). */ 38950 + + case INTEL_MIXED_LAKEFIELD: 38951 + + case INTEL_MIXED_ALDERLAKE: 38952 + + cpu_features->cachesize_non_temporal_divisor = 2; 38953 + + goto default_tuning; 38954 + + } 38955 + 38956 + /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER 38957 + if AVX512ER is available. Don't use AVX512 to avoid lower CPU 38958 + 38959 + commit d8a1a1aef7a58b991505b9a1349a40736dec3abf 38960 + Author: H.J. Lu <hjl.tools@gmail.com> 38961 + Date: Sat Apr 12 08:37:29 2025 -0700 38962 + 38963 + x86: Detect Intel Diamond Rapids 38964 + 38965 + Detect Intel Diamond Rapids and tune it similar to Intel Granite Rapids. 38966 + 38967 + Signed-off-by: H.J. 
Lu <hjl.tools@gmail.com> 38968 + Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com> 38969 + (cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0) 38970 + 38971 + diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c 38972 + index 1a6e694abf..52a2f03bdd 100644 38973 + --- a/sysdeps/x86/cpu-features.c 38974 + +++ b/sysdeps/x86/cpu-features.c 38975 + @@ -542,6 +542,7 @@ enum intel_microarch 38976 + INTEL_BIGCORE_ARROWLAKE, 38977 + INTEL_BIGCORE_PANTHERLAKE, 38978 + INTEL_BIGCORE_GRANITERAPIDS, 38979 + + INTEL_BIGCORE_DIAMONDRAPIDS, 38980 + 38981 + /* Mixed (bigcore + atom SOC). */ 38982 + INTEL_MIXED_LAKEFIELD, 38983 + @@ -817,6 +818,16 @@ disable_tsx: 38984 + break; 38985 + } 38986 + } 38987 + + else if (family == 19) 38988 + + switch (model) 38989 + + { 38990 + + case 0x01: 38991 + + microarch = INTEL_BIGCORE_DIAMONDRAPIDS; 38992 + + break; 38993 + + 38994 + + default: 38995 + + break; 38996 + + } 38997 + 38998 + switch (microarch) 38999 + { 39000 + @@ -926,6 +937,7 @@ disable_tsx: 39001 + case INTEL_BIGCORE_SAPPHIRERAPIDS: 39002 + case INTEL_BIGCORE_EMERALDRAPIDS: 39003 + case INTEL_BIGCORE_GRANITERAPIDS: 39004 + + case INTEL_BIGCORE_DIAMONDRAPIDS: 39005 + /* Default tuned Mixed (bigcore + atom SOC). */ 39006 + case INTEL_MIXED_LAKEFIELD: 39007 + case INTEL_MIXED_ALDERLAKE: 39008 + 39009 + commit 736e6735053f12181d3d287898dd5fdb9e8baf59 39010 + Author: Frank Barrus <frankbarrus_sw@shaggy.cc> 39011 + Date: Wed Dec 4 07:55:02 2024 -0500 39012 + 39013 + pthreads NPTL: lost wakeup fix 2 39014 + 39015 + This fixes the lost wakeup (from a bug in signal stealing) with a change 39016 + in the usage of g_signals[] in the condition variable internal state. 39017 + It also completely eliminates the concept and handling of signal stealing, 39018 + as well as the need for signalers to block to wait for waiters to wake 39019 + up every time there is a G1/G2 switch. This greatly reduces the average 39020 + and maximum latency for pthread_cond_signal. 
39021 + 39022 + The g_signals[] field now contains a signal count that is relative to 39023 + the current g1_start value. Since it is a 32-bit field, and the LSB is 39024 + still reserved (though not currently used anymore), it has a 31-bit value 39025 + that corresponds to the low 31 bits of the sequence number in g1_start. 39026 + (since g1_start also has an LSB flag, this means bits 31:1 in g_signals 39027 + correspond to bits 31:1 in g1_start, plus the current signal count) 39028 + 39029 + By making the signal count relative to g1_start, there is no longer 39030 + any ambiguity or A/B/A issue, and thus any checks before blocking, 39031 + including the futex call itself, are guaranteed not to block if the G1/G2 39032 + switch occurs, even if the signal count remains the same. This allows 39033 + initially safely blocking in G2 until the switch to G1 occurs, and 39034 + then transitioning from G1 to a new G1 or G2, and always being able to 39035 + distinguish the state change. This removes the race condition and A/B/A 39036 + problems that otherwise occurred if a late (pre-empted) waiter were to 39037 + resume just as the futex call attempted to block on g_signal since 39038 + otherwise there was no last opportunity to re-check things like whether 39039 + the current G1 group was already closed. 39040 + 39041 + By fixing these issues, the signal stealing code can be eliminated, 39042 + since there is no concept of signal stealing anymore. The code to block 39043 + for all waiters to exit g_refs can also be removed, since any waiters 39044 + that are still in the g_refs region can be guaranteed to safely wake 39045 + up and exit. If there are still any left at this time, they are all 39046 + sent one final futex wakeup to ensure that they are not blocked any 39047 + longer, but there is no need for the signaller to block and wait for 39048 + them to wake up and exit the g_refs region. 
39049 + 39050 + The signal count is then effectively "zeroed" but since it is now 39051 + relative to g1_start, this is done by advancing it to a new value that 39052 + can be observed by any pending blocking waiters. Any late waiters can 39053 + always tell the difference, and can thus just cleanly exit if they are 39054 + in a stale G1 or G2. They can never steal a signal from the current 39055 + G1 if they are not in the current G1, since the signal value that has 39056 + to match in the cmpxchg has the low 31 bits of the g1_start value 39057 + contained in it, and that's first checked, and then it won't match if 39058 + there's a G1/G2 change. 39059 + 39060 + Note: the 31-bit sequence number used in g_signals is designed to 39061 + handle wrap-around when checking the signal count, but if the entire 39062 + 31-bit wraparound (2 billion signals) occurs while there is still a 39063 + late waiter that has not yet resumed, and it happens to then match 39064 + the current g1_start low bits, and the pre-emption occurs after the 39065 + normal "closed group" checks (which are 64-bit) but then hits the 39066 + futex syscall and signal consuming code, then an A/B/A issue could 39067 + still result and cause an incorrect assumption about whether it 39068 + should block. This particular scenario seems unlikely in practice. 39069 + Note that once awake from the futex, the waiter would notice the 39070 + closed group before consuming the signal (since that's still a 64-bit 39071 + check that would not be aliased in the wrap-around in g_signals), 39072 + so the biggest impact would be blocking on the futex until the next 39073 + full wakeup from a G1/G2 switch. 
39074 + 39075 + Signed-off-by: Frank Barrus <frankbarrus_sw@shaggy.cc> 39076 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39077 + (cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a) 39078 + 39079 + diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c 39080 + index 3487557bb8..4855b8899f 100644 39081 + --- a/nptl/pthread_cond_common.c 39082 + +++ b/nptl/pthread_cond_common.c 39083 + @@ -201,7 +201,6 @@ static bool __attribute__ ((unused)) 39084 + __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39085 + unsigned int *g1index, int private) 39086 + { 39087 + - const unsigned int maxspin = 0; 39088 + unsigned int g1 = *g1index; 39089 + 39090 + /* If there is no waiter in G2, we don't do anything. The expression may 39091 + @@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39092 + * New waiters arriving concurrently with the group switching will all go 39093 + into G2 until we atomically make the switch. Waiters existing in G2 39094 + are not affected. 39095 + - * Waiters in G1 will be closed out immediately by setting a flag in 39096 + - __g_signals, which will prevent waiters from blocking using a futex on 39097 + - __g_signals and also notifies them that the group is closed. As a 39098 + - result, they will eventually remove their group reference, allowing us 39099 + - to close switch group roles. */ 39100 + - 39101 + - /* First, set the closed flag on __g_signals. This tells waiters that are 39102 + - about to wait that they shouldn't do that anymore. This basically 39103 + - serves as an advance notification of the upcoming change to __g1_start; 39104 + - waiters interpret it as if __g1_start was larger than their waiter 39105 + - sequence position. 
This allows us to change __g1_start after waiting 39106 + - for all existing waiters with group references to leave, which in turn 39107 + - makes recovery after stealing a signal simpler because it then can be 39108 + - skipped if __g1_start indicates that the group is closed (otherwise, 39109 + - we would have to recover always because waiters don't know how big their 39110 + - groups are). Relaxed MO is fine. */ 39111 + - atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1); 39112 + - 39113 + - /* Wait until there are no group references anymore. The fetch-or operation 39114 + - injects us into the modification order of __g_refs; release MO ensures 39115 + - that waiters incrementing __g_refs after our fetch-or see the previous 39116 + - changes to __g_signals and to __g1_start that had to happen before we can 39117 + - switch this G1 and alias with an older group (we have two groups, so 39118 + - aliasing requires switching group roles twice). Note that nobody else 39119 + - can have set the wake-request flag, so we do not have to act upon it. 39120 + - 39121 + - Also note that it is harmless if older waiters or waiters from this G1 39122 + - get a group reference after we have quiesced the group because it will 39123 + - remain closed for them either because of the closed flag in __g_signals 39124 + - or the later update to __g1_start. New waiters will never arrive here 39125 + - but instead continue to go into the still current G2. */ 39126 + - unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0); 39127 + - while ((r >> 1) > 0) 39128 + - { 39129 + - for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--) 39130 + - { 39131 + - /* TODO Back off. */ 39132 + - r = atomic_load_relaxed (cond->__data.__g_refs + g1); 39133 + - } 39134 + - if ((r >> 1) > 0) 39135 + - { 39136 + - /* There is still a waiter after spinning. Set the wake-request 39137 + - flag and block. 
Relaxed MO is fine because this is just about 39138 + - this futex word. 39139 + - 39140 + - Update r to include the set wake-request flag so that the upcoming 39141 + - futex_wait only blocks if the flag is still set (otherwise, we'd 39142 + - violate the basic client-side futex protocol). */ 39143 + - r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1; 39144 + - 39145 + - if ((r >> 1) > 0) 39146 + - futex_wait_simple (cond->__data.__g_refs + g1, r, private); 39147 + - /* Reload here so we eventually see the most recent value even if we 39148 + - do not spin. */ 39149 + - r = atomic_load_relaxed (cond->__data.__g_refs + g1); 39150 + - } 39151 + - } 39152 + - /* Acquire MO so that we synchronize with the release operation that waiters 39153 + - use to decrement __g_refs and thus happen after the waiters we waited 39154 + - for. */ 39155 + - atomic_thread_fence_acquire (); 39156 + + * Waiters in G1 will be closed out immediately by the advancing of 39157 + + __g_signals to the next "lowseq" (low 31 bits of the new g1_start), 39158 + + which will prevent waiters from blocking using a futex on 39159 + + __g_signals since it provides enough signals for all possible 39160 + + remaining waiters. As a result, they can each consume a signal 39161 + + and they will eventually remove their group reference. */ 39162 + 39163 + /* Update __g1_start, which finishes closing this group. The value we add 39164 + will never be negative because old_orig_size can only be zero when we 39165 + switch groups the first time after a condvar was initialized, in which 39166 + - case G1 will be at index 1 and we will add a value of 1. See above for 39167 + - why this takes place after waiting for quiescence of the group. 39168 + + case G1 will be at index 1 and we will add a value of 1. 39169 + Relaxed MO is fine because the change comes with no additional 39170 + constraints that others would have to observe. 
*/ 39171 + __condvar_add_g1_start_relaxed (cond, 39172 + (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); 39173 + 39174 + - /* Now reopen the group, thus enabling waiters to again block using the 39175 + - futex controlled by __g_signals. Release MO so that observers that see 39176 + - no signals (and thus can block) also see the write __g1_start and thus 39177 + - that this is now a new group (see __pthread_cond_wait_common for the 39178 + - matching acquire MO loads). */ 39179 + - atomic_store_release (cond->__data.__g_signals + g1, 0); 39180 + + unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; 39181 + + 39182 + + /* If any waiters still hold group references (and thus could be blocked), 39183 + + then wake them all up now and prevent any running ones from blocking. 39184 + + This is effectively a catch-all for any possible current or future 39185 + + bugs that can allow the group size to reach 0 before all G1 waiters 39186 + + have been awakened or at least given signals to consume, or any 39187 + + other case that can leave blocked (or about to block) older waiters.. */ 39188 + + if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0) 39189 + + { 39190 + + /* First advance signals to the end of the group (i.e. enough signals 39191 + + for the entire G1 group) to ensure that waiters which have not 39192 + + yet blocked in the futex will not block. 39193 + + Note that in the vast majority of cases, this should never 39194 + + actually be necessary, since __g_signals will have enough 39195 + + signals for the remaining g_refs waiters. As an optimization, 39196 + + we could check this first before proceeding, although that 39197 + + could still leave the potential for futex lost wakeup bugs 39198 + + if the signal count was non-zero but the futex wakeup 39199 + + was somehow lost. 
*/ 39200 + + atomic_store_release (cond->__data.__g_signals + g1, lowseq); 39201 + + 39202 + + futex_wake (cond->__data.__g_signals + g1, INT_MAX, private); 39203 + + } 39204 + 39205 + /* At this point, the old G1 is now a valid new G2 (but not in use yet). 39206 + No old waiter can neither grab a signal nor acquire a reference without 39207 + @@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39208 + g1 ^= 1; 39209 + *g1index ^= 1; 39210 + 39211 + + /* Now advance the new G1 g_signals to the new lowseq, giving it 39212 + + an effective signal count of 0 to start. */ 39213 + + atomic_store_release (cond->__data.__g_signals + g1, lowseq); 39214 + + 39215 + /* These values are just observed by signalers, and thus protected by the 39216 + lock. */ 39217 + unsigned int orig_size = wseq - (old_g1_start + old_orig_size); 39218 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 39219 + index 66786c7b90..3d290e39c8 100644 39220 + --- a/nptl/pthread_cond_wait.c 39221 + +++ b/nptl/pthread_cond_wait.c 39222 + @@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg) 39223 + signaled), and a reference count. 39224 + 39225 + The group reference count is used to maintain the number of waiters that 39226 + - are using the group's futex. Before a group can change its role, the 39227 + - reference count must show that no waiters are using the futex anymore; this 39228 + - prevents ABA issues on the futex word. 39229 + + are using the group's futex. 39230 + 39231 + To represent which intervals in the waiter sequence the groups cover (and 39232 + thus also which group slot contains G1 or G2), we use a 64b counter to 39233 + @@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg) 39234 + last reference. 39235 + * Reference count used by waiters concurrently with signalers that have 39236 + acquired the condvar-internal lock. 39237 + - __g_signals: The number of signals that can still be consumed. 
39238 + + __g_signals: The number of signals that can still be consumed, relative to 39239 + + the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits 39240 + + 31 to 1 of g1_start with the signal count added) 39241 + * Used as a futex word by waiters. Used concurrently by waiters and 39242 + signalers. 39243 + - * LSB is true iff this group has been completely signaled (i.e., it is 39244 + - closed). 39245 + + * LSB is currently reserved and 0. 39246 + __g_size: Waiters remaining in this group (i.e., which have not been 39247 + signaled yet. 39248 + * Accessed by signalers and waiters that cancel waiting (both do so only 39249 + @@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg) 39250 + sufficient because if a waiter can see a sufficiently large value, it could 39251 + have also consume a signal in the waiters group. 39252 + 39253 + - Waiters try to grab a signal from __g_signals without holding a reference 39254 + - count, which can lead to stealing a signal from a more recent group after 39255 + - their own group was already closed. They cannot always detect whether they 39256 + - in fact did because they do not know when they stole, but they can 39257 + - conservatively add a signal back to the group they stole from; if they 39258 + - did so unnecessarily, all that happens is a spurious wake-up. To make this 39259 + - even less likely, __g1_start contains the index of the current g2 too, 39260 + - which allows waiters to check if there aliasing on the group slots; if 39261 + - there wasn't, they didn't steal from the current G1, which means that the 39262 + - G1 they stole from must have been already closed and they do not need to 39263 + - fix anything. 
39264 + - 39265 + It is essential that the last field in pthread_cond_t is __g_signals[1]: 39266 + The previous condvar used a pointer-sized field in pthread_cond_t, so a 39267 + PTHREAD_COND_INITIALIZER from that condvar implementation might only 39268 + @@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39269 + { 39270 + while (1) 39271 + { 39272 + + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39273 + + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39274 + + 39275 + /* Spin-wait first. 39276 + Note that spinning first without checking whether a timeout 39277 + passed might lead to what looks like a spurious wake-up even 39278 + @@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39279 + having to compare against the current time seems to be the right 39280 + choice from a performance perspective for most use cases. */ 39281 + unsigned int spin = maxspin; 39282 + - while (signals == 0 && spin > 0) 39283 + + while (spin > 0 && ((int)(signals - lowseq) < 2)) 39284 + { 39285 + /* Check that we are not spinning on a group that's already 39286 + closed. */ 39287 + - if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) 39288 + - goto done; 39289 + + if (seq < (g1_start >> 1)) 39290 + + break; 39291 + 39292 + /* TODO Back off. */ 39293 + 39294 + /* Reload signals. See above for MO. */ 39295 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39296 + + g1_start = __condvar_load_g1_start_relaxed (cond); 39297 + + lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39298 + spin--; 39299 + } 39300 + 39301 + - /* If our group will be closed as indicated by the flag on signals, 39302 + - don't bother grabbing a signal. */ 39303 + - if (signals & 1) 39304 + - goto done; 39305 + - 39306 + - /* If there is an available signal, don't block. 
*/ 39307 + - if (signals != 0) 39308 + + if (seq < (g1_start >> 1)) 39309 + + { 39310 + + /* If the group is closed already, 39311 + + then this waiter originally had enough extra signals to 39312 + + consume, up until the time its group was closed. */ 39313 + + goto done; 39314 + + } 39315 + + 39316 + + /* If there is an available signal, don't block. 39317 + + If __g1_start has advanced at all, then we must be in G1 39318 + + by now, perhaps in the process of switching back to an older 39319 + + G2, but in either case we're allowed to consume the available 39320 + + signal and should not block anymore. */ 39321 + + if ((int)(signals - lowseq) >= 2) 39322 + break; 39323 + 39324 + /* No signals available after spinning, so prepare to block. 39325 + We first acquire a group reference and use acquire MO for that so 39326 + that we synchronize with the dummy read-modify-write in 39327 + __condvar_quiesce_and_switch_g1 if we read from that. In turn, 39328 + - in this case this will make us see the closed flag on __g_signals 39329 + - that designates a concurrent attempt to reuse the group's slot. 39330 + + in this case this will make us see the advancement of __g_signals 39331 + + to the upcoming new g1_start that occurs with a concurrent 39332 + + attempt to reuse the group's slot. 39333 + We use acquire MO for the __g_signals check to make the 39334 + __g1_start check work (see spinning above). 39335 + Note that the group reference acquisition will not mask the 39336 + @@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39337 + an atomic read-modify-write operation and thus extend the release 39338 + sequence. 
*/ 39339 + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); 39340 + - if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) 39341 + - || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) 39342 + + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39343 + + g1_start = __condvar_load_g1_start_relaxed (cond); 39344 + + lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39345 + + 39346 + + if (seq < (g1_start >> 1)) 39347 + { 39348 + - /* Our group is closed. Wake up any signalers that might be 39349 + - waiting. */ 39350 + + /* group is closed already, so don't block */ 39351 + __condvar_dec_grefs (cond, g, private); 39352 + goto done; 39353 + } 39354 + 39355 + + if ((int)(signals - lowseq) >= 2) 39356 + + { 39357 + + /* a signal showed up or G1/G2 switched after we grabbed the refcount */ 39358 + + __condvar_dec_grefs (cond, g, private); 39359 + + break; 39360 + + } 39361 + + 39362 + // Now block. 39363 + struct _pthread_cleanup_buffer buffer; 39364 + struct _condvar_cleanup_buffer cbuffer; 39365 + @@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39366 + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); 39367 + 39368 + err = __futex_abstimed_wait_cancelable64 ( 39369 + - cond->__data.__g_signals + g, 0, clockid, abstime, private); 39370 + + cond->__data.__g_signals + g, signals, clockid, abstime, private); 39371 + 39372 + __pthread_cleanup_pop (&buffer, 0); 39373 + 39374 + @@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39375 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39376 + } 39377 + 39378 + + if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) 39379 + + goto done; 39380 + } 39381 + /* Try to grab a signal. Use acquire MO so that we see an up-to-date value 39382 + of __g1_start below (see spinning above for a similar case). 
In 39383 + @@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39384 + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, 39385 + &signals, signals - 2)); 39386 + 39387 + - /* We consumed a signal but we could have consumed from a more recent group 39388 + - that aliased with ours due to being in the same group slot. If this 39389 + - might be the case our group must be closed as visible through 39390 + - __g1_start. */ 39391 + - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39392 + - if (seq < (g1_start >> 1)) 39393 + - { 39394 + - /* We potentially stole a signal from a more recent group but we do not 39395 + - know which group we really consumed from. 39396 + - We do not care about groups older than current G1 because they are 39397 + - closed; we could have stolen from these, but then we just add a 39398 + - spurious wake-up for the current groups. 39399 + - We will never steal a signal from current G2 that was really intended 39400 + - for G2 because G2 never receives signals (until it becomes G1). We 39401 + - could have stolen a signal from G2 that was conservatively added by a 39402 + - previous waiter that also thought it stole a signal -- but given that 39403 + - that signal was added unnecessarily, it's not a problem if we steal 39404 + - it. 39405 + - Thus, the remaining case is that we could have stolen from the current 39406 + - G1, where "current" means the __g1_start value we observed. However, 39407 + - if the current G1 does not have the same slot index as we do, we did 39408 + - not steal from it and do not need to undo that. This is the reason 39409 + - for putting a bit with G2's index into__g1_start as well. */ 39410 + - if (((g1_start & 1) ^ 1) == g) 39411 + - { 39412 + - /* We have to conservatively undo our potential mistake of stealing 39413 + - a signal. 
We can stop trying to do that when the current G1 39414 + - changes because other spinning waiters will notice this too and 39415 + - __condvar_quiesce_and_switch_g1 has checked that there are no 39416 + - futex waiters anymore before switching G1. 39417 + - Relaxed MO is fine for the __g1_start load because we need to 39418 + - merely be able to observe this fact and not have to observe 39419 + - something else as well. 39420 + - ??? Would it help to spin for a little while to see whether the 39421 + - current G1 gets closed? This might be worthwhile if the group is 39422 + - small or close to being closed. */ 39423 + - unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); 39424 + - while (__condvar_load_g1_start_relaxed (cond) == g1_start) 39425 + - { 39426 + - /* Try to add a signal. We don't need to acquire the lock 39427 + - because at worst we can cause a spurious wake-up. If the 39428 + - group is in the process of being closed (LSB is true), this 39429 + - has an effect similar to us adding a signal. */ 39430 + - if (((s & 1) != 0) 39431 + - || atomic_compare_exchange_weak_relaxed 39432 + - (cond->__data.__g_signals + g, &s, s + 2)) 39433 + - { 39434 + - /* If we added a signal, we also need to add a wake-up on 39435 + - the futex. We also need to do that if we skipped adding 39436 + - a signal because the group is being closed because 39437 + - while __condvar_quiesce_and_switch_g1 could have closed 39438 + - the group, it might still be waiting for futex waiters to 39439 + - leave (and one of those waiters might be the one we stole 39440 + - the signal from, which cause it to block using the 39441 + - futex). */ 39442 + - futex_wake (cond->__data.__g_signals + g, 1, private); 39443 + - break; 39444 + - } 39445 + - /* TODO Back off. */ 39446 + - } 39447 + - } 39448 + - } 39449 + - 39450 + done: 39451 + 39452 + /* Confirm that we have been woken. 
We do that before acquiring the mutex 39453 + 39454 + commit 88d999d840e77c9917f08870094a23ce42294848 39455 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 39456 + Date: Wed Dec 4 07:55:22 2024 -0500 39457 + 39458 + nptl: Update comments and indentation for new condvar implementation 39459 + 39460 + Some comments were wrong after the most recent commit. This fixes that. 39461 + 39462 + Also fixing indentation where it was using spaces instead of tabs. 39463 + 39464 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 39465 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39466 + (cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3) 39467 + 39468 + diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c 39469 + index 4855b8899f..3475d15123 100644 39470 + --- a/nptl/pthread_cond_common.c 39471 + +++ b/nptl/pthread_cond_common.c 39472 + @@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39473 + * New waiters arriving concurrently with the group switching will all go 39474 + into G2 until we atomically make the switch. Waiters existing in G2 39475 + are not affected. 39476 + - * Waiters in G1 will be closed out immediately by the advancing of 39477 + - __g_signals to the next "lowseq" (low 31 bits of the new g1_start), 39478 + + * Waiters in G1 have already received a signal and been woken. If they 39479 + + haven't woken yet, they will be closed out immediately by the advancing 39480 + + of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), 39481 + which will prevent waiters from blocking using a futex on 39482 + __g_signals since it provides enough signals for all possible 39483 + remaining waiters. 
As a result, they can each consume a signal 39484 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 39485 + index 3d290e39c8..ad2cee7d59 100644 39486 + --- a/nptl/pthread_cond_wait.c 39487 + +++ b/nptl/pthread_cond_wait.c 39488 + @@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg) 39489 + figure out whether they are in a group that has already been completely 39490 + signaled (i.e., if the current G1 starts at a later position that the 39491 + waiter's position). Waiters cannot determine whether they are currently 39492 + - in G2 or G1 -- but they do not have too because all they are interested in 39493 + + in G2 or G1 -- but they do not have to because all they are interested in 39494 + is whether there are available signals, and they always start in G2 (whose 39495 + group slot they know because of the bit in the waiter sequence. Signalers 39496 + will simply fill the right group until it is completely signaled and can 39497 + @@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39498 + } 39499 + 39500 + /* Now wait until a signal is available in our group or it is closed. 39501 + - Acquire MO so that if we observe a value of zero written after group 39502 + + Acquire MO so that if we observe (signals == lowseq) after group 39503 + switching in __condvar_quiesce_and_switch_g1, we synchronize with that 39504 + store and will see the prior update of __g1_start done while switching 39505 + groups too. */ 39506 + @@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39507 + { 39508 + while (1) 39509 + { 39510 + - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39511 + - unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39512 + + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39513 + + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39514 + 39515 + /* Spin-wait first. 
39516 + Note that spinning first without checking whether a timeout 39517 + @@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39518 + 39519 + /* Reload signals. See above for MO. */ 39520 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39521 + - g1_start = __condvar_load_g1_start_relaxed (cond); 39522 + - lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39523 + + g1_start = __condvar_load_g1_start_relaxed (cond); 39524 + + lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39525 + spin--; 39526 + } 39527 + 39528 + - if (seq < (g1_start >> 1)) 39529 + + if (seq < (g1_start >> 1)) 39530 + { 39531 + - /* If the group is closed already, 39532 + + /* If the group is closed already, 39533 + then this waiter originally had enough extra signals to 39534 + consume, up until the time its group was closed. */ 39535 + goto done; 39536 + - } 39537 + + } 39538 + 39539 + /* If there is an available signal, don't block. 39540 + - If __g1_start has advanced at all, then we must be in G1 39541 + + If __g1_start has advanced at all, then we must be in G1 39542 + by now, perhaps in the process of switching back to an older 39543 + G2, but in either case we're allowed to consume the available 39544 + signal and should not block anymore. */ 39545 + @@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39546 + sequence. */ 39547 + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); 39548 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39549 + - g1_start = __condvar_load_g1_start_relaxed (cond); 39550 + - lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39551 + + g1_start = __condvar_load_g1_start_relaxed (cond); 39552 + + lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; 39553 + 39554 + - if (seq < (g1_start >> 1)) 39555 + + if (seq < (g1_start >> 1)) 39556 + { 39557 + - /* group is closed already, so don't block */ 39558 + + /* group is closed already, so don't block */ 39559 + __condvar_dec_grefs (cond, g, private); 39560 + goto done; 39561 + } 39562 + 39563 + if ((int)(signals - lowseq) >= 2) 39564 + { 39565 + - /* a signal showed up or G1/G2 switched after we grabbed the refcount */ 39566 + + /* a signal showed up or G1/G2 switched after we grabbed the 39567 + + refcount */ 39568 + __condvar_dec_grefs (cond, g, private); 39569 + break; 39570 + - } 39571 + + } 39572 + 39573 + // Now block. 39574 + struct _pthread_cleanup_buffer buffer; 39575 + @@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39576 + if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) 39577 + goto done; 39578 + } 39579 + - /* Try to grab a signal. Use acquire MO so that we see an up-to-date value 39580 + - of __g1_start below (see spinning above for a similar case). In 39581 + - particular, if we steal from a more recent group, we will also see a 39582 + - more recent __g1_start below. */ 39583 + + /* Try to grab a signal. See above for MO. (if we do another loop 39584 + + iteration we need to see the correct value of g1_start) */ 39585 + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, 39586 + &signals, signals - 2)); 39587 + 39588 + 39589 + commit 136a29f9d0a3924828d5a16be82d054637517c95 39590 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 39591 + Date: Wed Dec 4 07:55:50 2024 -0500 39592 + 39593 + nptl: Remove unnecessary catch-all-wake in condvar group switch 39594 + 39595 + This wake is unnecessary. We only switch groups after every sleeper in a group 39596 + has been woken. Sure, they may take a while to actually wake up and may still 39597 + hold a reference, but waking them a second time doesn't speed that up. 
Instead 39598 + this just makes the code more complicated and may hide problems. 39599 + 39600 + In particular this safety wake wouldn't even have helped with the bug that was 39601 + fixed by Barrus' patch: The bug there was that pthread_cond_signal would not 39602 + switch g1 when it should, so we wouldn't even have entered this code path. 39603 + 39604 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 39605 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39606 + (cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e) 39607 + 39608 + diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c 39609 + index 3475d15123..30b8eee149 100644 39610 + --- a/nptl/pthread_cond_common.c 39611 + +++ b/nptl/pthread_cond_common.c 39612 + @@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39613 + * New waiters arriving concurrently with the group switching will all go 39614 + into G2 until we atomically make the switch. Waiters existing in G2 39615 + are not affected. 39616 + - * Waiters in G1 have already received a signal and been woken. If they 39617 + - haven't woken yet, they will be closed out immediately by the advancing 39618 + - of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), 39619 + - which will prevent waiters from blocking using a futex on 39620 + - __g_signals since it provides enough signals for all possible 39621 + - remaining waiters. As a result, they can each consume a signal 39622 + - and they will eventually remove their group reference. */ 39623 + + * Waiters in G1 have already received a signal and been woken. */ 39624 + 39625 + /* Update __g1_start, which finishes closing this group. 
The value we add 39626 + will never be negative because old_orig_size can only be zero when we 39627 + @@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 39628 + 39629 + unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; 39630 + 39631 + - /* If any waiters still hold group references (and thus could be blocked), 39632 + - then wake them all up now and prevent any running ones from blocking. 39633 + - This is effectively a catch-all for any possible current or future 39634 + - bugs that can allow the group size to reach 0 before all G1 waiters 39635 + - have been awakened or at least given signals to consume, or any 39636 + - other case that can leave blocked (or about to block) older waiters.. */ 39637 + - if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0) 39638 + - { 39639 + - /* First advance signals to the end of the group (i.e. enough signals 39640 + - for the entire G1 group) to ensure that waiters which have not 39641 + - yet blocked in the futex will not block. 39642 + - Note that in the vast majority of cases, this should never 39643 + - actually be necessary, since __g_signals will have enough 39644 + - signals for the remaining g_refs waiters. As an optimization, 39645 + - we could check this first before proceeding, although that 39646 + - could still leave the potential for futex lost wakeup bugs 39647 + - if the signal count was non-zero but the futex wakeup 39648 + - was somehow lost. */ 39649 + - atomic_store_release (cond->__data.__g_signals + g1, lowseq); 39650 + - 39651 + - futex_wake (cond->__data.__g_signals + g1, INT_MAX, private); 39652 + - } 39653 + - 39654 + /* At this point, the old G1 is now a valid new G2 (but not in use yet). 39655 + No old waiter can neither grab a signal nor acquire a reference without 39656 + noticing that __g1_start is larger. 
39657 + 39658 + commit 2a259b6d77dc5bdab5c8f4ee0e69572d5699d4bf 39659 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 39660 + Date: Wed Dec 4 07:56:13 2024 -0500 39661 + 39662 + nptl: Remove unnecessary quadruple check in pthread_cond_wait 39663 + 39664 + pthread_cond_wait was checking whether it was in a closed group no less than 39665 + four times. Checking once is enough. Here are the four checks: 39666 + 39667 + 1. While spin-waiting. This was dead code: maxspin is set to 0 and has been 39668 + for years. 39669 + 2. Before deciding to go to sleep, and before incrementing grefs: I kept this 39670 + 3. After incrementing grefs. There is no reason to think that the group would 39671 + close while we do an atomic increment. Obviously it could close at any 39672 + point, but that doesn't mean we have to recheck after every step. This 39673 + check was equally good as check 2, except it has to do more work. 39674 + 4. When we find ourselves in a group that has a signal. We only get here after 39675 + we check that we're not in a closed group. There is no need to check again. 39676 + The check would only have helped in cases where the compare_exchange in the 39677 + next line would also have failed. Relying on the compare_exchange is fine. 39678 + 39679 + Removing the duplicate checks clarifies the code. 
39680 + 39681 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 39682 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39683 + (cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1) 39684 + 39685 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 39686 + index ad2cee7d59..cfdd13bb87 100644 39687 + --- a/nptl/pthread_cond_wait.c 39688 + +++ b/nptl/pthread_cond_wait.c 39689 + @@ -366,7 +366,6 @@ static __always_inline int 39690 + __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39691 + clockid_t clockid, const struct __timespec64 *abstime) 39692 + { 39693 + - const int maxspin = 0; 39694 + int err; 39695 + int result = 0; 39696 + 39697 + @@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39698 + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39699 + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39700 + 39701 + - /* Spin-wait first. 39702 + - Note that spinning first without checking whether a timeout 39703 + - passed might lead to what looks like a spurious wake-up even 39704 + - though we should return ETIMEDOUT (e.g., if the caller provides 39705 + - an absolute timeout that is clearly in the past). However, 39706 + - (1) spurious wake-ups are allowed, (2) it seems unlikely that a 39707 + - user will (ab)use pthread_cond_wait as a check for whether a 39708 + - point in time is in the past, and (3) spinning first without 39709 + - having to compare against the current time seems to be the right 39710 + - choice from a performance perspective for most use cases. */ 39711 + - unsigned int spin = maxspin; 39712 + - while (spin > 0 && ((int)(signals - lowseq) < 2)) 39713 + - { 39714 + - /* Check that we are not spinning on a group that's already 39715 + - closed. */ 39716 + - if (seq < (g1_start >> 1)) 39717 + - break; 39718 + - 39719 + - /* TODO Back off. */ 39720 + - 39721 + - /* Reload signals. See above for MO. 
*/ 39722 + - signals = atomic_load_acquire (cond->__data.__g_signals + g); 39723 + - g1_start = __condvar_load_g1_start_relaxed (cond); 39724 + - lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39725 + - spin--; 39726 + - } 39727 + - 39728 + if (seq < (g1_start >> 1)) 39729 + { 39730 + /* If the group is closed already, 39731 + @@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39732 + an atomic read-modify-write operation and thus extend the release 39733 + sequence. */ 39734 + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); 39735 + - signals = atomic_load_acquire (cond->__data.__g_signals + g); 39736 + - g1_start = __condvar_load_g1_start_relaxed (cond); 39737 + - lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39738 + - 39739 + - if (seq < (g1_start >> 1)) 39740 + - { 39741 + - /* group is closed already, so don't block */ 39742 + - __condvar_dec_grefs (cond, g, private); 39743 + - goto done; 39744 + - } 39745 + - 39746 + - if ((int)(signals - lowseq) >= 2) 39747 + - { 39748 + - /* a signal showed up or G1/G2 switched after we grabbed the 39749 + - refcount */ 39750 + - __condvar_dec_grefs (cond, g, private); 39751 + - break; 39752 + - } 39753 + 39754 + // Now block. 39755 + struct _pthread_cleanup_buffer buffer; 39756 + @@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39757 + /* Reload signals. See above for MO. */ 39758 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39759 + } 39760 + - 39761 + - if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) 39762 + - goto done; 39763 + } 39764 + /* Try to grab a signal. See above for MO. 
(if we do another loop 39765 + iteration we need to see the correct value of g1_start) */ 39766 + 39767 + commit a2465f4293ecc37ac4650fbd02e517bc6fd801c6 39768 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 39769 + Date: Wed Dec 4 07:56:38 2024 -0500 39770 + 39771 + nptl: Remove g_refs from condition variables 39772 + 39773 + This variable used to be needed to wait in group switching until all sleepers 39774 + have confirmed that they have woken. This is no longer needed. Nothing waits 39775 + on this variable so there is no need to track how many threads are currently 39776 + asleep in each group. 39777 + 39778 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 39779 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39780 + (cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c) 39781 + 39782 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 39783 + index cfdd13bb87..411fc0380b 100644 39784 + --- a/nptl/pthread_cond_wait.c 39785 + +++ b/nptl/pthread_cond_wait.c 39786 + @@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, 39787 + } 39788 + } 39789 + 39790 + -/* Wake up any signalers that might be waiting. */ 39791 + -static void 39792 + -__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) 39793 + -{ 39794 + - /* Release MO to synchronize-with the acquire load in 39795 + - __condvar_quiesce_and_switch_g1. */ 39796 + - if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) 39797 + - { 39798 + - /* Clear the wake-up request flag before waking up. We do not need more 39799 + - than relaxed MO and it doesn't matter if we apply this for an aliased 39800 + - group because we wake all futex waiters right after clearing the 39801 + - flag. 
*/ 39802 + - atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); 39803 + - futex_wake (cond->__data.__g_refs + g, INT_MAX, private); 39804 + - } 39805 + -} 39806 + - 39807 + /* Clean-up for cancellation of waiters waiting for normal signals. We cancel 39808 + our registration as a waiter, confirm we have woken up, and re-acquire the 39809 + mutex. */ 39810 + @@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg) 39811 + pthread_cond_t *cond = cbuffer->cond; 39812 + unsigned g = cbuffer->wseq & 1; 39813 + 39814 + - __condvar_dec_grefs (cond, g, cbuffer->private); 39815 + - 39816 + __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); 39817 + /* FIXME With the current cancellation implementation, it is possible that 39818 + a thread is cancelled after it has returned from a syscall. This could 39819 + @@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg) 39820 + sufficient because if a waiter can see a sufficiently large value, it could 39821 + have also consume a signal in the waiters group. 39822 + 39823 + - It is essential that the last field in pthread_cond_t is __g_signals[1]: 39824 + - The previous condvar used a pointer-sized field in pthread_cond_t, so a 39825 + - PTHREAD_COND_INITIALIZER from that condvar implementation might only 39826 + - initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes 39827 + - in total instead of the 48 we need). __g_signals[1] is not accessed before 39828 + - the first group switch (G2 starts at index 0), which will set its value to 39829 + - zero after a harmless fetch-or whose return value is ignored. This 39830 + - effectively completes initialization. 
39831 + - 39832 + 39833 + Limitations: 39834 + * This condvar isn't designed to allow for more than 39835 + @@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39836 + if ((int)(signals - lowseq) >= 2) 39837 + break; 39838 + 39839 + - /* No signals available after spinning, so prepare to block. 39840 + - We first acquire a group reference and use acquire MO for that so 39841 + - that we synchronize with the dummy read-modify-write in 39842 + - __condvar_quiesce_and_switch_g1 if we read from that. In turn, 39843 + - in this case this will make us see the advancement of __g_signals 39844 + - to the upcoming new g1_start that occurs with a concurrent 39845 + - attempt to reuse the group's slot. 39846 + - We use acquire MO for the __g_signals check to make the 39847 + - __g1_start check work (see spinning above). 39848 + - Note that the group reference acquisition will not mask the 39849 + - release MO when decrementing the reference count because we use 39850 + - an atomic read-modify-write operation and thus extend the release 39851 + - sequence. */ 39852 + - atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); 39853 + - 39854 + // Now block. 39855 + struct _pthread_cleanup_buffer buffer; 39856 + struct _condvar_cleanup_buffer cbuffer; 39857 + @@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39858 + 39859 + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) 39860 + { 39861 + - __condvar_dec_grefs (cond, g, private); 39862 + - /* If we timed out, we effectively cancel waiting. Note that 39863 + - we have decremented __g_refs before cancellation, so that a 39864 + - deadlock between waiting for quiescence of our group in 39865 + - __condvar_quiesce_and_switch_g1 and us trying to acquire 39866 + - the lock during cancellation is not possible. */ 39867 + + /* If we timed out, we effectively cancel waiting. 
*/ 39868 + __condvar_cancel_waiting (cond, seq, g, private); 39869 + result = err; 39870 + goto done; 39871 + } 39872 + - else 39873 + - __condvar_dec_grefs (cond, g, private); 39874 + 39875 + /* Reload signals. See above for MO. */ 39876 + signals = atomic_load_acquire (cond->__data.__g_signals + g); 39877 + diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c 39878 + index 1336e9c79d..bdcb45c536 100644 39879 + --- a/nptl/tst-cond22.c 39880 + +++ b/nptl/tst-cond22.c 39881 + @@ -106,13 +106,13 @@ do_test (void) 39882 + status = 1; 39883 + } 39884 + 39885 + - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", 39886 + + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", 39887 + c.__data.__wseq.__value32.__high, 39888 + c.__data.__wseq.__value32.__low, 39889 + c.__data.__g1_start.__value32.__high, 39890 + c.__data.__g1_start.__value32.__low, 39891 + - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], 39892 + - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], 39893 + + c.__data.__g_signals[0], c.__data.__g_size[0], 39894 + + c.__data.__g_signals[1], c.__data.__g_size[1], 39895 + c.__data.__g1_orig_size, c.__data.__wrefs); 39896 + 39897 + if (pthread_create (&th, NULL, tf, (void *) 1l) != 0) 39898 + @@ -152,13 +152,13 @@ do_test (void) 39899 + status = 1; 39900 + } 39901 + 39902 + - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", 39903 + + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", 39904 + c.__data.__wseq.__value32.__high, 39905 + c.__data.__wseq.__value32.__low, 39906 + c.__data.__g1_start.__value32.__high, 39907 + c.__data.__g1_start.__value32.__low, 39908 + - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], 39909 + - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], 39910 + + c.__data.__g_signals[0], c.__data.__g_size[0], 39911 + + c.__data.__g_signals[1], c.__data.__g_size[1], 39912 + c.__data.__g1_orig_size, 
c.__data.__wrefs); 39913 + 39914 + return status; 39915 + diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h 39916 + index df54eef6f7..a3d482f80f 100644 39917 + --- a/sysdeps/nptl/bits/thread-shared-types.h 39918 + +++ b/sysdeps/nptl/bits/thread-shared-types.h 39919 + @@ -95,8 +95,7 @@ struct __pthread_cond_s 39920 + { 39921 + __atomic_wide_counter __wseq; 39922 + __atomic_wide_counter __g1_start; 39923 + - unsigned int __g_refs[2] __LOCK_ALIGNMENT; 39924 + - unsigned int __g_size[2]; 39925 + + unsigned int __g_size[2] __LOCK_ALIGNMENT; 39926 + unsigned int __g1_orig_size; 39927 + unsigned int __wrefs; 39928 + unsigned int __g_signals[2]; 39929 + diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h 39930 + index 3d4f4a756c..9af75d6eae 100644 39931 + --- a/sysdeps/nptl/pthread.h 39932 + +++ b/sysdeps/nptl/pthread.h 39933 + @@ -152,7 +152,7 @@ enum 39934 + 39935 + 39936 + /* Conditional variable handling. */ 39937 + -#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } } 39938 + +#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } 39939 + 39940 + 39941 + /* Cleanup buffers */ 39942 + 39943 + commit fa110993a6390ae5c97dff613ef02b59ec78c5da 39944 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 39945 + Date: Wed Dec 4 08:03:44 2024 -0500 39946 + 39947 + nptl: Use a single loop in pthread_cond_wait instaed of a nested loop 39948 + 39949 + The loop was a little more complicated than necessary. There was only one 39950 + break statement out of the inner loop, and the outer loop was nearly empty. 39951 + So just remove the outer loop, moving its code to the one break statement in 39952 + the inner loop. This allows us to replace all gotos with break statements. 
39953 + 39954 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 39955 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 39956 + (cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674) 39957 + 39958 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 39959 + index 411fc0380b..683cb2b133 100644 39960 + --- a/nptl/pthread_cond_wait.c 39961 + +++ b/nptl/pthread_cond_wait.c 39962 + @@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39963 + return err; 39964 + } 39965 + 39966 + - /* Now wait until a signal is available in our group or it is closed. 39967 + - Acquire MO so that if we observe (signals == lowseq) after group 39968 + - switching in __condvar_quiesce_and_switch_g1, we synchronize with that 39969 + - store and will see the prior update of __g1_start done while switching 39970 + - groups too. */ 39971 + - unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 39972 + - 39973 + - do 39974 + - { 39975 + + 39976 + while (1) 39977 + { 39978 + + /* Now wait until a signal is available in our group or it is closed. 39979 + + Acquire MO so that if we observe (signals == lowseq) after group 39980 + + switching in __condvar_quiesce_and_switch_g1, we synchronize with that 39981 + + store and will see the prior update of __g1_start done while switching 39982 + + groups too. */ 39983 + + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 39984 + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 39985 + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 39986 + 39987 + @@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39988 + /* If the group is closed already, 39989 + then this waiter originally had enough extra signals to 39990 + consume, up until the time its group was closed. 
*/ 39991 + - goto done; 39992 + + break; 39993 + } 39994 + 39995 + /* If there is an available signal, don't block. 39996 + @@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 39997 + G2, but in either case we're allowed to consume the available 39998 + signal and should not block anymore. */ 39999 + if ((int)(signals - lowseq) >= 2) 40000 + - break; 40001 + + { 40002 + + /* Try to grab a signal. See above for MO. (if we do another loop 40003 + + iteration we need to see the correct value of g1_start) */ 40004 + + if (atomic_compare_exchange_weak_acquire ( 40005 + + cond->__data.__g_signals + g, 40006 + + &signals, signals - 2)) 40007 + + break; 40008 + + else 40009 + + continue; 40010 + + } 40011 + 40012 + // Now block. 40013 + struct _pthread_cleanup_buffer buffer; 40014 + @@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40015 + /* If we timed out, we effectively cancel waiting. */ 40016 + __condvar_cancel_waiting (cond, seq, g, private); 40017 + result = err; 40018 + - goto done; 40019 + + break; 40020 + } 40021 + - 40022 + - /* Reload signals. See above for MO. */ 40023 + - signals = atomic_load_acquire (cond->__data.__g_signals + g); 40024 + } 40025 + - } 40026 + - /* Try to grab a signal. See above for MO. (if we do another loop 40027 + - iteration we need to see the correct value of g1_start) */ 40028 + - while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, 40029 + - &signals, signals - 2)); 40030 + - 40031 + - done: 40032 + 40033 + /* Confirm that we have been woken. 
We do that before acquiring the mutex 40034 + to allow for execution of pthread_cond_destroy while having acquired the 40035 + 40036 + commit afbf0d46850dcd1b626d892ad8fde2162067ddc7 40037 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 40038 + Date: Wed Dec 4 08:04:10 2024 -0500 40039 + 40040 + nptl: Fix indentation 40041 + 40042 + In my previous change I turned a nested loop into a simple loop. I'm doing 40043 + the resulting indentation changes in a separate commit to make the diff on 40044 + the previous commit easier to review. 40045 + 40046 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 40047 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 40048 + (cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da) 40049 + 40050 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 40051 + index 683cb2b133..7fc9dadf15 100644 40052 + --- a/nptl/pthread_cond_wait.c 40053 + +++ b/nptl/pthread_cond_wait.c 40054 + @@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40055 + } 40056 + 40057 + 40058 + - while (1) 40059 + - { 40060 + - /* Now wait until a signal is available in our group or it is closed. 40061 + - Acquire MO so that if we observe (signals == lowseq) after group 40062 + - switching in __condvar_quiesce_and_switch_g1, we synchronize with that 40063 + - store and will see the prior update of __g1_start done while switching 40064 + - groups too. */ 40065 + - unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 40066 + - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 40067 + - unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 40068 + - 40069 + - if (seq < (g1_start >> 1)) 40070 + - { 40071 + - /* If the group is closed already, 40072 + - then this waiter originally had enough extra signals to 40073 + - consume, up until the time its group was closed. 
*/ 40074 + - break; 40075 + - } 40076 + - 40077 + - /* If there is an available signal, don't block. 40078 + - If __g1_start has advanced at all, then we must be in G1 40079 + - by now, perhaps in the process of switching back to an older 40080 + - G2, but in either case we're allowed to consume the available 40081 + - signal and should not block anymore. */ 40082 + - if ((int)(signals - lowseq) >= 2) 40083 + - { 40084 + - /* Try to grab a signal. See above for MO. (if we do another loop 40085 + - iteration we need to see the correct value of g1_start) */ 40086 + - if (atomic_compare_exchange_weak_acquire ( 40087 + - cond->__data.__g_signals + g, 40088 + + while (1) 40089 + + { 40090 + + /* Now wait until a signal is available in our group or it is closed. 40091 + + Acquire MO so that if we observe (signals == lowseq) after group 40092 + + switching in __condvar_quiesce_and_switch_g1, we synchronize with that 40093 + + store and will see the prior update of __g1_start done while switching 40094 + + groups too. */ 40095 + + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 40096 + + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 40097 + + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 40098 + + 40099 + + if (seq < (g1_start >> 1)) 40100 + + { 40101 + + /* If the group is closed already, 40102 + + then this waiter originally had enough extra signals to 40103 + + consume, up until the time its group was closed. */ 40104 + + break; 40105 + + } 40106 + + 40107 + + /* If there is an available signal, don't block. 40108 + + If __g1_start has advanced at all, then we must be in G1 40109 + + by now, perhaps in the process of switching back to an older 40110 + + G2, but in either case we're allowed to consume the available 40111 + + signal and should not block anymore. */ 40112 + + if ((int)(signals - lowseq) >= 2) 40113 + + { 40114 + + /* Try to grab a signal. See above for MO. 
(if we do another loop 40115 + + iteration we need to see the correct value of g1_start) */ 40116 + + if (atomic_compare_exchange_weak_acquire ( 40117 + + cond->__data.__g_signals + g, 40118 + &signals, signals - 2)) 40119 + - break; 40120 + - else 40121 + - continue; 40122 + - } 40123 + - 40124 + - // Now block. 40125 + - struct _pthread_cleanup_buffer buffer; 40126 + - struct _condvar_cleanup_buffer cbuffer; 40127 + - cbuffer.wseq = wseq; 40128 + - cbuffer.cond = cond; 40129 + - cbuffer.mutex = mutex; 40130 + - cbuffer.private = private; 40131 + - __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); 40132 + - 40133 + - err = __futex_abstimed_wait_cancelable64 ( 40134 + - cond->__data.__g_signals + g, signals, clockid, abstime, private); 40135 + - 40136 + - __pthread_cleanup_pop (&buffer, 0); 40137 + - 40138 + - if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) 40139 + - { 40140 + - /* If we timed out, we effectively cancel waiting. */ 40141 + - __condvar_cancel_waiting (cond, seq, g, private); 40142 + - result = err; 40143 + break; 40144 + - } 40145 + + else 40146 + + continue; 40147 + } 40148 + 40149 + + // Now block. 40150 + + struct _pthread_cleanup_buffer buffer; 40151 + + struct _condvar_cleanup_buffer cbuffer; 40152 + + cbuffer.wseq = wseq; 40153 + + cbuffer.cond = cond; 40154 + + cbuffer.mutex = mutex; 40155 + + cbuffer.private = private; 40156 + + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); 40157 + + 40158 + + err = __futex_abstimed_wait_cancelable64 ( 40159 + + cond->__data.__g_signals + g, signals, clockid, abstime, private); 40160 + + 40161 + + __pthread_cleanup_pop (&buffer, 0); 40162 + + 40163 + + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) 40164 + + { 40165 + + /* If we timed out, we effectively cancel waiting. 
*/ 40166 + + __condvar_cancel_waiting (cond, seq, g, private); 40167 + + result = err; 40168 + + break; 40169 + + } 40170 + + } 40171 + + 40172 + /* Confirm that we have been woken. We do that before acquiring the mutex 40173 + to allow for execution of pthread_cond_destroy while having acquired the 40174 + mutex. */ 40175 + 40176 + commit 2ad69497346cc20ef4d568108f1de49b2f451c55 40177 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 40178 + Date: Wed Dec 4 08:04:54 2024 -0500 40179 + 40180 + nptl: rename __condvar_quiesce_and_switch_g1 40181 + 40182 + This function no longer waits for threads to leave g1, so rename it to 40183 + __condvar_switch_g1 40184 + 40185 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 40186 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 40187 + (cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867) 40188 + 40189 + diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c 40190 + index aada91639a..38bba17bfc 100644 40191 + --- a/nptl/pthread_cond_broadcast.c 40192 + +++ b/nptl/pthread_cond_broadcast.c 40193 + @@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) 40194 + cond->__data.__g_size[g1] << 1); 40195 + cond->__data.__g_size[g1] = 0; 40196 + 40197 + - /* We need to wake G1 waiters before we quiesce G1 below. */ 40198 + + /* We need to wake G1 waiters before we switch G1 below. */ 40199 + /* TODO Only set it if there are indeed futex waiters. We could 40200 + also try to move this out of the critical section in cases when 40201 + G2 is empty (and we don't need to quiesce). */ 40202 + @@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) 40203 + 40204 + /* G1 is complete. Step (2) is next unless there are no waiters in G2, in 40205 + which case we can stop. 
*/ 40206 + - if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) 40207 + + if (__condvar_switch_g1 (cond, wseq, &g1, private)) 40208 + { 40209 + /* Step (3): Send signals to all waiters in the old G2 / new G1. */ 40210 + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 40211 + diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c 40212 + index 30b8eee149..5044273cc2 100644 40213 + --- a/nptl/pthread_cond_common.c 40214 + +++ b/nptl/pthread_cond_common.c 40215 + @@ -189,16 +189,15 @@ __condvar_get_private (int flags) 40216 + return FUTEX_SHARED; 40217 + } 40218 + 40219 + -/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to 40220 + - leave G1, converts G1 into a fresh G2, and then switches group roles so that 40221 + - the former G2 becomes the new G1 ending at the current __wseq value when we 40222 + - eventually make the switch (WSEQ is just an observation of __wseq by the 40223 + - signaler). 40224 + +/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2, 40225 + + and then switches group roles so that the former G2 becomes the new G1 40226 + + ending at the current __wseq value when we eventually make the switch 40227 + + (WSEQ is just an observation of __wseq by the signaler). 40228 + If G2 is empty, it will not switch groups because then it would create an 40229 + empty G1 which would require switching groups again on the next signal. 40230 + Returns false iff groups were not switched because G2 was empty. 
*/ 40231 + static bool __attribute__ ((unused)) 40232 + -__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40233 + +__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40234 + unsigned int *g1index, int private) 40235 + { 40236 + unsigned int g1 = *g1index; 40237 + @@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40238 + + cond->__data.__g_size[g1 ^ 1]) == 0) 40239 + return false; 40240 + 40241 + - /* Now try to close and quiesce G1. We have to consider the following kinds 40242 + - of waiters: 40243 + + /* We have to consider the following kinds of waiters: 40244 + * Waiters from less recent groups than G1 are not affected because 40245 + nothing will change for them apart from __g1_start getting larger. 40246 + * New waiters arriving concurrently with the group switching will all go 40247 + @@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40248 + are not affected. 40249 + * Waiters in G1 have already received a signal and been woken. */ 40250 + 40251 + - /* Update __g1_start, which finishes closing this group. The value we add 40252 + - will never be negative because old_orig_size can only be zero when we 40253 + - switch groups the first time after a condvar was initialized, in which 40254 + - case G1 will be at index 1 and we will add a value of 1. 40255 + - Relaxed MO is fine because the change comes with no additional 40256 + - constraints that others would have to observe. */ 40257 + + /* Update __g1_start, which closes this group. The value we add will never 40258 + + be negative because old_orig_size can only be zero when we switch groups 40259 + + the first time after a condvar was initialized, in which case G1 will be 40260 + + at index 1 and we will add a value of 1. Relaxed MO is fine because the 40261 + + change comes with no additional constraints that others would have to 40262 + + observe. 
*/ 40263 + __condvar_add_g1_start_relaxed (cond, 40264 + (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); 40265 + 40266 + diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c 40267 + index 43d6286ecd..f095497142 100644 40268 + --- a/nptl/pthread_cond_signal.c 40269 + +++ b/nptl/pthread_cond_signal.c 40270 + @@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond) 40271 + bool do_futex_wake = false; 40272 + 40273 + /* If G1 is still receiving signals, we put the signal there. If not, we 40274 + - check if G2 has waiters, and if so, quiesce and switch G1 to the former 40275 + - G2; if this results in a new G1 with waiters (G2 might have cancellations 40276 + - already, see __condvar_quiesce_and_switch_g1), we put the signal in the 40277 + - new G1. */ 40278 + + check if G2 has waiters, and if so, switch G1 to the former G2; if this 40279 + + results in a new G1 with waiters (G2 might have cancellations already, 40280 + + see __condvar_switch_g1), we put the signal in the new G1. */ 40281 + if ((cond->__data.__g_size[g1] != 0) 40282 + - || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) 40283 + + || __condvar_switch_g1 (cond, wseq, &g1, private)) 40284 + { 40285 + /* Add a signal. Relaxed MO is fine because signaling does not need to 40286 + - establish a happens-before relation (see above). We do not mask the 40287 + - release-MO store when initializing a group in 40288 + - __condvar_quiesce_and_switch_g1 because we use an atomic 40289 + - read-modify-write and thus extend that store's release sequence. */ 40290 + + establish a happens-before relation (see above). We do not mask the 40291 + + release-MO store when initializing a group in __condvar_switch_g1 40292 + + because we use an atomic read-modify-write and thus extend that 40293 + + store's release sequence. */ 40294 + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); 40295 + cond->__data.__g_size[g1]--; 40296 + /* TODO Only set it if there are indeed futex waiters. 
*/ 40297 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 40298 + index 7fc9dadf15..80bb728211 100644 40299 + --- a/nptl/pthread_cond_wait.c 40300 + +++ b/nptl/pthread_cond_wait.c 40301 + @@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40302 + because we do not need to establish any happens-before relation with 40303 + signalers (see __pthread_cond_signal); modification order alone 40304 + establishes a total order of waiters/signals. We do need acquire MO 40305 + - to synchronize with group reinitialization in 40306 + - __condvar_quiesce_and_switch_g1. */ 40307 + + to synchronize with group reinitialization in __condvar_switch_g1. */ 40308 + uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); 40309 + /* Find our group's index. We always go into what was G2 when we acquired 40310 + our position. */ 40311 + @@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40312 + { 40313 + /* Now wait until a signal is available in our group or it is closed. 40314 + Acquire MO so that if we observe (signals == lowseq) after group 40315 + - switching in __condvar_quiesce_and_switch_g1, we synchronize with that 40316 + - store and will see the prior update of __g1_start done while switching 40317 + - groups too. */ 40318 + + switching in __condvar_switch_g1, we synchronize with that store and 40319 + + will see the prior update of __g1_start done while switching groups 40320 + + too. */ 40321 + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 40322 + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 40323 + unsigned int lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; 40324 + 40325 + commit 7f71824b8039b8afc150dd5c881b61faf10675ef 40326 + Author: Malte Skarupke <malteskarupke@fastmail.fm> 40327 + Date: Wed Dec 4 08:05:40 2024 -0500 40328 + 40329 + nptl: Use all of g1_start and g_signals 40330 + 40331 + The LSB of g_signals was unused. The LSB of g1_start was used to indicate 40332 + which group is G2. This was used to always go to sleep in pthread_cond_wait 40333 + if a waiter is in G2. A comment earlier in the file says that this is not 40334 + correct to do: 40335 + 40336 + "Waiters cannot determine whether they are currently in G2 or G1 -- but they 40337 + do not have to because all they are interested in is whether there are 40338 + available signals" 40339 + 40340 + I either would have had to update the comment, or get rid of the check. I 40341 + chose to get rid of the check. In fact I don't quite know why it was there. 40342 + There will never be available signals for group G2, so we didn't need the 40343 + special case. Even if there were, this would just be a spurious wake. This 40344 + might have caught some cases where the count has wrapped around, but it 40345 + wouldn't reliably do that, (and even if it did, why would you want to force a 40346 + sleep in that case?) and we don't support that many concurrent waiters 40347 + anyway. Getting rid of it allows us to use one more bit, making us more 40348 + robust to wraparound. 
40349 + 40350 + Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm> 40351 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 40352 + (cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706) 40353 + 40354 + diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c 40355 + index 38bba17bfc..51afa62adf 100644 40356 + --- a/nptl/pthread_cond_broadcast.c 40357 + +++ b/nptl/pthread_cond_broadcast.c 40358 + @@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) 40359 + { 40360 + /* Add as many signals as the remaining size of the group. */ 40361 + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 40362 + - cond->__data.__g_size[g1] << 1); 40363 + + cond->__data.__g_size[g1]); 40364 + cond->__data.__g_size[g1] = 0; 40365 + 40366 + /* We need to wake G1 waiters before we switch G1 below. */ 40367 + @@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) 40368 + { 40369 + /* Step (3): Send signals to all waiters in the old G2 / new G1. */ 40370 + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 40371 + - cond->__data.__g_size[g1] << 1); 40372 + + cond->__data.__g_size[g1]); 40373 + cond->__data.__g_size[g1] = 0; 40374 + /* TODO Only set it if there are indeed futex waiters. */ 40375 + do_futex_wake = true; 40376 + diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c 40377 + index 5044273cc2..389402913c 100644 40378 + --- a/nptl/pthread_cond_common.c 40379 + +++ b/nptl/pthread_cond_common.c 40380 + @@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40381 + behavior. 40382 + Note that this works correctly for a zero-initialized condvar too. 
*/ 40383 + unsigned int old_orig_size = __condvar_get_orig_size (cond); 40384 + - uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; 40385 + - if (((unsigned) (wseq - old_g1_start - old_orig_size) 40386 + - + cond->__data.__g_size[g1 ^ 1]) == 0) 40387 + + uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond); 40388 + + uint64_t new_g1_start = old_g1_start + old_orig_size; 40389 + + if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0) 40390 + return false; 40391 + 40392 + /* We have to consider the following kinds of waiters: 40393 + @@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40394 + are not affected. 40395 + * Waiters in G1 have already received a signal and been woken. */ 40396 + 40397 + - /* Update __g1_start, which closes this group. The value we add will never 40398 + - be negative because old_orig_size can only be zero when we switch groups 40399 + - the first time after a condvar was initialized, in which case G1 will be 40400 + - at index 1 and we will add a value of 1. Relaxed MO is fine because the 40401 + - change comes with no additional constraints that others would have to 40402 + - observe. */ 40403 + - __condvar_add_g1_start_relaxed (cond, 40404 + - (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); 40405 + - 40406 + - unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; 40407 + + /* Update __g1_start, which closes this group. Relaxed MO is fine because 40408 + + the change comes with no additional constraints that others would have 40409 + + to observe. */ 40410 + + __condvar_add_g1_start_relaxed (cond, old_orig_size); 40411 + 40412 + /* At this point, the old G1 is now a valid new G2 (but not in use yet). 
40413 + No old waiter can neither grab a signal nor acquire a reference without 40414 + @@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, 40415 + g1 ^= 1; 40416 + *g1index ^= 1; 40417 + 40418 + - /* Now advance the new G1 g_signals to the new lowseq, giving it 40419 + + /* Now advance the new G1 g_signals to the new g1_start, giving it 40420 + an effective signal count of 0 to start. */ 40421 + - atomic_store_release (cond->__data.__g_signals + g1, lowseq); 40422 + + atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start); 40423 + 40424 + /* These values are just observed by signalers, and thus protected by the 40425 + lock. */ 40426 + - unsigned int orig_size = wseq - (old_g1_start + old_orig_size); 40427 + + unsigned int orig_size = wseq - new_g1_start; 40428 + __condvar_set_orig_size (cond, orig_size); 40429 + /* Use and addition to not loose track of cancellations in what was 40430 + previously G2. */ 40431 + diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c 40432 + index f095497142..fa3a5c3d8f 100644 40433 + --- a/nptl/pthread_cond_signal.c 40434 + +++ b/nptl/pthread_cond_signal.c 40435 + @@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond) 40436 + release-MO store when initializing a group in __condvar_switch_g1 40437 + because we use an atomic read-modify-write and thus extend that 40438 + store's release sequence. */ 40439 + - atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); 40440 + + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1); 40441 + cond->__data.__g_size[g1]--; 40442 + /* TODO Only set it if there are indeed futex waiters. 
*/ 40443 + do_futex_wake = true; 40444 + diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c 40445 + index 80bb728211..0f1dfcb595 100644 40446 + --- a/nptl/pthread_cond_wait.c 40447 + +++ b/nptl/pthread_cond_wait.c 40448 + @@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, 40449 + not hold a reference on the group. */ 40450 + __condvar_acquire_lock (cond, private); 40451 + 40452 + - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; 40453 + + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 40454 + if (g1_start > seq) 40455 + { 40456 + /* Our group is closed, so someone provided enough signals for it. 40457 + @@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg) 40458 + * Waiters fetch-add while having acquire the mutex associated with the 40459 + condvar. Signalers load it and fetch-xor it concurrently. 40460 + __g1_start: Starting position of G1 (inclusive) 40461 + - * LSB is index of current G2. 40462 + * Modified by signalers while having acquired the condvar-internal lock 40463 + and observed concurrently by waiters. 40464 + __g1_orig_size: Initial size of G1 40465 + @@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg) 40466 + * Reference count used by waiters concurrently with signalers that have 40467 + acquired the condvar-internal lock. 40468 + __g_signals: The number of signals that can still be consumed, relative to 40469 + - the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits 40470 + - 31 to 1 of g1_start with the signal count added) 40471 + + the current g1_start. (i.e. g1_start with the signal count added) 40472 + * Used as a futex word by waiters. Used concurrently by waiters and 40473 + signalers. 40474 + - * LSB is currently reserved and 0. 40475 + __g_size: Waiters remaining in this group (i.e., which have not been 40476 + signaled yet. 
40477 + * Accessed by signalers and waiters that cancel waiting (both do so only 40478 + @@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40479 + too. */ 40480 + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); 40481 + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); 40482 + - unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; 40483 + 40484 + - if (seq < (g1_start >> 1)) 40485 + + if (seq < g1_start) 40486 + { 40487 + /* If the group is closed already, 40488 + then this waiter originally had enough extra signals to 40489 + @@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, 40490 + by now, perhaps in the process of switching back to an older 40491 + G2, but in either case we're allowed to consume the available 40492 + signal and should not block anymore. */ 40493 + - if ((int)(signals - lowseq) >= 2) 40494 + + if ((int)(signals - (unsigned int)g1_start) > 0) 40495 + { 40496 + /* Try to grab a signal. See above for MO. (if we do another loop 40497 + iteration we need to see the correct value of g1_start) */ 40498 + if (atomic_compare_exchange_weak_acquire ( 40499 + cond->__data.__g_signals + g, 40500 + - &signals, signals - 2)) 40501 + + &signals, signals - 1)) 40502 + break; 40503 + else 40504 + continue; 40505 + 40506 + commit 8d3dd23e3de8b4c6e4b94f8bbfab971c3b8a55be 40507 + Author: Florian Weimer <fweimer@redhat.com> 40508 + Date: Thu Mar 13 06:07:07 2025 +0100 40509 + 40510 + nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786) 40511 + 40512 + The new initializer and struct layout does not initialize the 40513 + __g_signals field in the old struct layout before the change in 40514 + commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove 40515 + g_refs from condition variables"). Bring back fields at the end 40516 + of struct __pthread_cond_s, so that they are again zero-initialized. 
40517 + 40518 + Reviewed-by: Sam James <sam@gentoo.org> 40519 + 40520 + diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h 40521 + index a3d482f80f..bccc2003ec 100644 40522 + --- a/sysdeps/nptl/bits/thread-shared-types.h 40523 + +++ b/sysdeps/nptl/bits/thread-shared-types.h 40524 + @@ -99,6 +99,8 @@ struct __pthread_cond_s 40525 + unsigned int __g1_orig_size; 40526 + unsigned int __wrefs; 40527 + unsigned int __g_signals[2]; 40528 + + unsigned int __unused_initialized_1; 40529 + + unsigned int __unused_initialized_2; 40530 + }; 40531 + 40532 + typedef unsigned int __tss_t; 40533 + diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h 40534 + index 9af75d6eae..e0f24418fe 100644 40535 + --- a/sysdeps/nptl/pthread.h 40536 + +++ b/sysdeps/nptl/pthread.h 40537 + @@ -152,7 +152,7 @@ enum 40538 + 40539 + 40540 + /* Conditional variable handling. */ 40541 + -#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } 40542 + +#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } 40543 + 40544 + 40545 + /* Cleanup buffers */ 40546 + 40547 + commit 33b33e9dd0ff26158b1b83cc4347a39c073e490e 40548 + Author: Arjun Shankar <arjun@redhat.com> 40549 + Date: Fri Oct 18 16:03:25 2024 +0200 40550 + 40551 + libio: Fix a deadlock after fork in popen 40552 + 40553 + popen modifies its file handler book-keeping under a lock that wasn't 40554 + being taken during fork. This meant that a concurrent popen and fork 40555 + could end up copying the lock in a "locked" state into the fork child, 40556 + where subsequently calling popen would lead to a deadlock due to the 40557 + already (spuriously) held lock. 40558 + 40559 + This commit fixes the deadlock by appropriately taking the lock before 40560 + fork, and releasing/resetting it in the parent/child after the fork. 40561 + 40562 + A new test for concurrent popen and fork is also added. 
It consistently 40563 + hangs (and therefore fails via timeout) without the fix applied. 40564 + Reviewed-by: Florian Weimer <fweimer@redhat.com> 40565 + 40566 + (cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f) 40567 + 40568 + diff --git a/libio/Makefile b/libio/Makefile 40569 + index 5292baa4e0..7faba230ac 100644 40570 + --- a/libio/Makefile 40571 + +++ b/libio/Makefile 40572 + @@ -117,6 +117,7 @@ tests = \ 40573 + tst-mmap-offend \ 40574 + tst-mmap-setvbuf \ 40575 + tst-mmap2-eofsync \ 40576 + + tst-popen-fork \ 40577 + tst-popen1 \ 40578 + tst-setvbuf1 \ 40579 + tst-sprintf-chk-ub \ 40580 + diff --git a/libio/iopopen.c b/libio/iopopen.c 40581 + index d01cb0648e..352513a291 100644 40582 + --- a/libio/iopopen.c 40583 + +++ b/libio/iopopen.c 40584 + @@ -57,6 +57,26 @@ unlock (void *not_used) 40585 + } 40586 + #endif 40587 + 40588 + +/* These lock/unlock/resetlock functions are used during fork. */ 40589 + + 40590 + +void 40591 + +_IO_proc_file_chain_lock (void) 40592 + +{ 40593 + + _IO_lock_lock (proc_file_chain_lock); 40594 + +} 40595 + + 40596 + +void 40597 + +_IO_proc_file_chain_unlock (void) 40598 + +{ 40599 + + _IO_lock_unlock (proc_file_chain_lock); 40600 + +} 40601 + + 40602 + +void 40603 + +_IO_proc_file_chain_resetlock (void) 40604 + +{ 40605 + + _IO_lock_init (proc_file_chain_lock); 40606 + +} 40607 + + 40608 + /* POSIX states popen shall ensure that any streams from previous popen() 40609 + calls that remain open in the parent process should be closed in the new 40610 + child process. 40611 + diff --git a/libio/libioP.h b/libio/libioP.h 40612 + index 616253fcd0..a83a411fdf 100644 40613 + --- a/libio/libioP.h 40614 + +++ b/libio/libioP.h 40615 + @@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock) 40616 + extern void _IO_enable_locks (void) __THROW; 40617 + libc_hidden_proto (_IO_enable_locks) 40618 + 40619 + +/* Functions for operating popen's proc_file_chain_lock during fork. 
*/ 40620 + + 40621 + +extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden; 40622 + +extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden; 40623 + +extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden; 40624 + + 40625 + /* Default jumptable functions. */ 40626 + 40627 + extern int _IO_default_underflow (FILE *) __THROW; 40628 + diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c 40629 + new file mode 100644 40630 + index 0000000000..1df30fc6c0 40631 + --- /dev/null 40632 + +++ b/libio/tst-popen-fork.c 40633 + @@ -0,0 +1,80 @@ 40634 + +/* Test concurrent popen and fork. 40635 + + Copyright (C) 2024 Free Software Foundation, Inc. 40636 + + This file is part of the GNU C Library. 40637 + + 40638 + + The GNU C Library is free software; you can redistribute it and/or 40639 + + modify it under the terms of the GNU Lesser General Public 40640 + + License as published by the Free Software Foundation; either 40641 + + version 2.1 of the License, or (at your option) any later version. 40642 + + 40643 + + The GNU C Library is distributed in the hope that it will be useful, 40644 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 40645 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 40646 + + Lesser General Public License for more details. 40647 + + 40648 + + You should have received a copy of the GNU Lesser General Public 40649 + + License along with the GNU C Library; if not, see 40650 + + <https://www.gnu.org/licenses/>. 
*/ 40651 + + 40652 + +#include <stdio.h> 40653 + +#include <stdatomic.h> 40654 + +#include <pthread.h> 40655 + +#include <unistd.h> 40656 + +#include <sys/wait.h> 40657 + + 40658 + +#include <support/check.h> 40659 + +#include <support/xthread.h> 40660 + +#include <support/xunistd.h> 40661 + + 40662 + +static void 40663 + +popen_and_pclose (void) 40664 + +{ 40665 + + FILE *f = popen ("true", "r"); 40666 + + TEST_VERIFY_EXIT (f != NULL); 40667 + + pclose (f); 40668 + + return; 40669 + +} 40670 + + 40671 + +static atomic_bool done = ATOMIC_VAR_INIT (0); 40672 + + 40673 + +static void * 40674 + +popen_and_pclose_forever (__attribute__ ((unused)) 40675 + + void *arg) 40676 + +{ 40677 + + while (!atomic_load_explicit (&done, memory_order_acquire)) 40678 + + popen_and_pclose (); 40679 + + return NULL; 40680 + +} 40681 + + 40682 + +static int 40683 + +do_test (void) 40684 + +{ 40685 + + 40686 + + /* Repeatedly call popen in a loop during the entire test. */ 40687 + + pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL); 40688 + + 40689 + + /* Repeatedly fork off and reap child processes one-by-one. 40690 + + Each child calls popen once, then exits, leading to the possibility 40691 + + that a child forks *during* our own popen call, thus inheriting any 40692 + + intermediate popen state, possibly including lock state(s). */ 40693 + + for (int i = 0; i < 100; i++) 40694 + + { 40695 + + int cpid = xfork (); 40696 + + 40697 + + if (cpid == 0) 40698 + + { 40699 + + popen_and_pclose (); 40700 + + _exit (0); 40701 + + } 40702 + + else 40703 + + xwaitpid (cpid, NULL, 0); 40704 + + } 40705 + + 40706 + + /* Stop calling popen. 
*/ 40707 + + atomic_store_explicit (&done, 1, memory_order_release); 40708 + + xpthread_join (t); 40709 + + 40710 + + return 0; 40711 + +} 40712 + + 40713 + +#include <support/test-driver.c> 40714 + diff --git a/posix/fork.c b/posix/fork.c 40715 + index 298765a1ff..cf9b80e7c0 100644 40716 + --- a/posix/fork.c 40717 + +++ b/posix/fork.c 40718 + @@ -62,6 +62,7 @@ __libc_fork (void) 40719 + call_function_static_weak (__nss_database_fork_prepare_parent, 40720 + &nss_database_data); 40721 + 40722 + + _IO_proc_file_chain_lock (); 40723 + _IO_list_lock (); 40724 + 40725 + /* Acquire malloc locks. This needs to come last because fork 40726 + @@ -92,6 +93,7 @@ __libc_fork (void) 40727 + 40728 + /* Reset locks in the I/O code. */ 40729 + _IO_list_resetlock (); 40730 + + _IO_proc_file_chain_resetlock (); 40731 + 40732 + call_function_static_weak (__nss_database_fork_subprocess, 40733 + &nss_database_data); 40734 + @@ -121,6 +123,7 @@ __libc_fork (void) 40735 + 40736 + /* We execute this even if the 'fork' call failed. */ 40737 + _IO_list_unlock (); 40738 + + _IO_proc_file_chain_unlock (); 40739 + } 40740 + 40741 + /* Run the handlers registered for the parent. */ 40742 + 40743 + commit 7c3c9ae28685a9142a8cfa3521bbca74c1007d0b 40744 + Author: Arjun Shankar <arjun@redhat.com> 40745 + Date: Fri Oct 25 09:33:45 2024 +0200 40746 + 40747 + libio: Correctly link tst-popen-fork against libpthread 40748 + 40749 + tst-popen-fork failed to build for Hurd due to not being linked with 40750 + libpthread. This commit fixes that. 40751 + 40752 + Tested with build-many-glibcs.py for i686-gnu. 
40753 + 40754 + Reviewed-by: Florian Weimer <fweimer@redhat.com> 40755 + (cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad) 40756 + 40757 + diff --git a/libio/Makefile b/libio/Makefile 40758 + index 7faba230ac..f2e98f96eb 100644 40759 + --- a/libio/Makefile 40760 + +++ b/libio/Makefile 40761 + @@ -142,6 +142,8 @@ tests = \ 40762 + tst_wscanf \ 40763 + # tests 40764 + 40765 + +$(objpfx)tst-popen-fork: $(shared-thread-library) 40766 + + 40767 + tests-internal = tst-vtables tst-vtables-interposed 40768 + 40769 + ifeq (yes,$(build-shared)) 40770 + 40771 + commit 8667345b83c8ca528a093d4db53f57a1bb1688e4 40772 + Author: Florian Weimer <fweimer@redhat.com> 40773 + Date: Thu Feb 13 21:56:52 2025 +0100 40774 + 40775 + elf: Keep using minimal malloc after early DTV resize (bug 32412) 40776 + 40777 + If an auditor loads many TLS-using modules during startup, it is 40778 + possible to trigger DTV resizing. Previously, the DTV was marked 40779 + as allocated by the main malloc afterwards, even if the minimal 40780 + malloc was still in use. With this change, _dl_resize_dtv marks 40781 + the resized DTV as allocated with the minimal malloc. 40782 + 40783 + The new test reuses TLS-using modules from other auditing tests. 
40784 + 40785 + Reviewed-by: DJ Delorie <dj@redhat.com> 40786 + (cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb) 40787 + 40788 + diff --git a/elf/Makefile b/elf/Makefile 40789 + index dc686c3bff..be64c59887 100644 40790 + --- a/elf/Makefile 40791 + +++ b/elf/Makefile 40792 + @@ -378,6 +378,7 @@ tests += \ 40793 + tst-align3 \ 40794 + tst-audit-tlsdesc \ 40795 + tst-audit-tlsdesc-dlopen \ 40796 + + tst-audit-tlsdesc-dlopen2 \ 40797 + tst-audit1 \ 40798 + tst-audit2 \ 40799 + tst-audit8 \ 40800 + @@ -817,6 +818,7 @@ modules-names += \ 40801 + tst-auditmanymod8 \ 40802 + tst-auditmanymod9 \ 40803 + tst-auditmod-tlsdesc \ 40804 + + tst-auditmod-tlsdesc2 \ 40805 + tst-auditmod1 \ 40806 + tst-auditmod11 \ 40807 + tst-auditmod12 \ 40808 + @@ -3040,6 +3042,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so 40809 + tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so 40810 + $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so 40811 + tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so 40812 + +$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ 40813 + + $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) 40814 + +tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so 40815 + 40816 + $(objpfx)tst-dlmopen-twice.out: \ 40817 + $(objpfx)tst-dlmopen-twice-mod1.so \ 40818 + diff --git a/elf/dl-tls.c b/elf/dl-tls.c 40819 + index 3d529b722c..b13e752358 100644 40820 + --- a/elf/dl-tls.c 40821 + +++ b/elf/dl-tls.c 40822 + @@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) 40823 + if (newp == NULL) 40824 + oom (); 40825 + memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); 40826 + +#ifdef SHARED 40827 + + /* Auditors can trigger a DTV resize event while the full malloc 40828 + + is not yet in use. Mark the new DTV allocation as the 40829 + + initial allocation. 
*/ 40830 + + if (!__rtld_malloc_is_complete ()) 40831 + + GL(dl_initial_dtv) = &newp[1]; 40832 + +#endif 40833 + } 40834 + else 40835 + { 40836 + diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c 40837 + new file mode 100644 40838 + index 0000000000..7ba2c4129a 40839 + --- /dev/null 40840 + +++ b/elf/tst-audit-tlsdesc-dlopen2.c 40841 + @@ -0,0 +1,46 @@ 40842 + +/* Loading TLS-using modules from auditors (bug 32412). Main program. 40843 + + Copyright (C) 2021-2025 Free Software Foundation, Inc. 40844 + + This file is part of the GNU C Library. 40845 + + 40846 + + The GNU C Library is free software; you can redistribute it and/or 40847 + + modify it under the terms of the GNU Lesser General Public 40848 + + License as published by the Free Software Foundation; either 40849 + + version 2.1 of the License, or (at your option) any later version. 40850 + + 40851 + + The GNU C Library is distributed in the hope that it will be useful, 40852 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 40853 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 40854 + + Lesser General Public License for more details. 40855 + + 40856 + + You should have received a copy of the GNU Lesser General Public 40857 + + License along with the GNU C Library; if not, see 40858 + + <https://www.gnu.org/licenses/>. */ 40859 + + 40860 + +#include <support/xdlfcn.h> 40861 + +#include <stdio.h> 40862 + + 40863 + +static int 40864 + +do_test (void) 40865 + +{ 40866 + + puts ("info: start of main program"); 40867 + + 40868 + + /* Load TLS-using modules, to trigger DTV resizing. The dynamic 40869 + + linker will load them again (requiring their own TLS) because the 40870 + + dlopen calls from the auditor were in the auditing namespace. 
*/ 40871 + + for (int i = 1; i <= 19; ++i) 40872 + + { 40873 + + char dso[30]; 40874 + + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); 40875 + + char sym[30]; 40876 + + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); 40877 + + 40878 + + void *handle = xdlopen (dso, RTLD_LAZY); 40879 + + int (*func) (void) = xdlsym (handle, sym); 40880 + + /* Trigger TLS allocation. */ 40881 + + func (); 40882 + + } 40883 + + 40884 + + return 0; 40885 + +} 40886 + + 40887 + +#include <support/test-driver.c> 40888 + diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c 40889 + new file mode 100644 40890 + index 0000000000..50275cd34d 40891 + --- /dev/null 40892 + +++ b/elf/tst-auditmod-tlsdesc2.c 40893 + @@ -0,0 +1,59 @@ 40894 + +/* Loading TLS-using modules from auditors (bug 32412). Audit module. 40895 + + Copyright (C) 2021-2025 Free Software Foundation, Inc. 40896 + + This file is part of the GNU C Library. 40897 + + 40898 + + The GNU C Library is free software; you can redistribute it and/or 40899 + + modify it under the terms of the GNU Lesser General Public 40900 + + License as published by the Free Software Foundation; either 40901 + + version 2.1 of the License, or (at your option) any later version. 40902 + + 40903 + + The GNU C Library is distributed in the hope that it will be useful, 40904 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 40905 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 40906 + + Lesser General Public License for more details. 40907 + + 40908 + + You should have received a copy of the GNU Lesser General Public 40909 + + License along with the GNU C Library; if not, see 40910 + + <https://www.gnu.org/licenses/>. 
*/ 40911 + + 40912 + +#include <dlfcn.h> 40913 + +#include <link.h> 40914 + +#include <stdbool.h> 40915 + +#include <stdio.h> 40916 + +#include <unistd.h> 40917 + + 40918 + +unsigned int 40919 + +la_version (unsigned int version) 40920 + +{ 40921 + + /* Open some modules, to trigger DTV resizing before the switch to 40922 + + the main malloc. */ 40923 + + for (int i = 1; i <= 19; ++i) 40924 + + { 40925 + + char dso[30]; 40926 + + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); 40927 + + char sym[30]; 40928 + + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); 40929 + + 40930 + + void *handle = dlopen (dso, RTLD_LAZY); 40931 + + if (handle == NULL) 40932 + + { 40933 + + printf ("error: dlmopen from auditor: %s\n", dlerror ()); 40934 + + fflush (stdout); 40935 + + _exit (1); 40936 + + } 40937 + + int (*func) (void) = dlsym (handle, sym); 40938 + + if (func == NULL) 40939 + + { 40940 + + printf ("error: dlsym from auditor: %s\n", dlerror ()); 40941 + + fflush (stdout); 40942 + + _exit (1); 40943 + + } 40944 + + /* Trigger TLS allocation. */ 40945 + + func (); 40946 + + } 40947 + + 40948 + + puts ("info: TLS-using modules loaded from auditor"); 40949 + + fflush (stdout); 40950 + + 40951 + + return LAV_CURRENT; 40952 + +} 40953 + 40954 + commit b3002f303cedb8262cbc1ec22999ea36482efa0e 40955 + Author: Florian Weimer <fweimer@redhat.com> 40956 + Date: Tue May 20 19:36:02 2025 +0200 40957 + 40958 + support: Use const char * argument in support_capture_subprogram_self_sgid 40959 + 40960 + The function does not modify the passed-in string, so make this clear 40961 + via the prototype. 
40962 + 40963 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 40964 + (cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a) 40965 + 40966 + diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h 40967 + index 93b7245d2a..5406d9f6c0 100644 40968 + --- a/support/capture_subprocess.h 40969 + +++ b/support/capture_subprocess.h 40970 + @@ -45,8 +45,7 @@ struct support_capture_subprocess support_capture_subprogram 40971 + /* Copy the running program into a setgid binary and run it with CHILD_ID 40972 + argument. If execution is successful, return the exit status of the child 40973 + program, otherwise return a non-zero failure exit code. */ 40974 + -int support_capture_subprogram_self_sgid 40975 + - (char *child_id); 40976 + +int support_capture_subprogram_self_sgid (const char *child_id); 40977 + 40978 + /* Deallocate the subprocess data captured by 40979 + support_capture_subprocess. */ 40980 + diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c 40981 + index 53847194cb..2383481911 100644 40982 + --- a/support/support_capture_subprocess.c 40983 + +++ b/support/support_capture_subprocess.c 40984 + @@ -110,7 +110,7 @@ support_capture_subprogram (const char *file, char *const argv[], 40985 + safely make it SGID with the TARGET group ID. Then runs the 40986 + executable. 
*/ 40987 + static int 40988 + -copy_and_spawn_sgid (char *child_id, gid_t gid) 40989 + +copy_and_spawn_sgid (const char *child_id, gid_t gid) 40990 + { 40991 + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", 40992 + test_dir, (intmax_t) getpid ()); 40993 + @@ -182,7 +182,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid) 40994 + ret = 0; 40995 + infd = outfd = -1; 40996 + 40997 + - char * const args[] = {execname, child_id, NULL}; 40998 + + char * const args[] = {execname, (char *) child_id, NULL}; 40999 + 41000 + status = support_subprogram_wait (args[0], args); 41001 + 41002 + @@ -211,7 +211,7 @@ err: 41003 + } 41004 + 41005 + int 41006 + -support_capture_subprogram_self_sgid (char *child_id) 41007 + +support_capture_subprogram_self_sgid (const char *child_id) 41008 + { 41009 + gid_t target = 0; 41010 + const int count = 64; 41011 + 41012 + commit 61dcce21e06834f7248a8d516c9ec20788fc728c 41013 + Author: Florian Weimer <fweimer@redhat.com> 41014 + Date: Mon Dec 23 13:57:55 2024 +0100 41015 + 41016 + support: Add support_record_failure_barrier 41017 + 41018 + This can be used to stop execution after a TEST_COMPARE_BLOB 41019 + failure, for example. 41020 + 41021 + (cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e) 41022 + 41023 + diff --git a/support/check.h b/support/check.h 41024 + index 7ea22c7a2c..8f41e5b99f 100644 41025 + --- a/support/check.h 41026 + +++ b/support/check.h 41027 + @@ -207,6 +207,9 @@ void support_record_failure_reset (void); 41028 + failures or not. */ 41029 + int support_record_failure_is_failed (void); 41030 + 41031 + +/* Terminate the process if any failures have been encountered so far. 
*/ 41032 + +void support_record_failure_barrier (void); 41033 + + 41034 + __END_DECLS 41035 + 41036 + #endif /* SUPPORT_CHECK_H */ 41037 + diff --git a/support/support_record_failure.c b/support/support_record_failure.c 41038 + index 978123701d..72ee2b232f 100644 41039 + --- a/support/support_record_failure.c 41040 + +++ b/support/support_record_failure.c 41041 + @@ -112,3 +112,13 @@ support_record_failure_is_failed (void) 41042 + synchronization for reliable test error reporting anyway. */ 41043 + return __atomic_load_n (&state->failed, __ATOMIC_RELAXED); 41044 + } 41045 + + 41046 + +void 41047 + +support_record_failure_barrier (void) 41048 + +{ 41049 + + if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED)) 41050 + + { 41051 + + puts ("error: exiting due to previous errors"); 41052 + + exit (1); 41053 + + } 41054 + +} 41055 + 41056 + commit 079ac4a172a8f6ba37acf1e80e57f5042d2c7561 41057 + Author: Florian Weimer <fweimer@redhat.com> 41058 + Date: Tue May 20 19:45:06 2025 +0200 41059 + 41060 + elf: Test case for bug 32976 (CVE-2025-4802) 41061 + 41062 + Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically 41063 + linked binaries, using support_capture_subprogram_self_sgid. 
41064 + 41065 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 41066 + (cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2) 41067 + 41068 + diff --git a/elf/Makefile b/elf/Makefile 41069 + index be64c59887..afd4eb6fdd 100644 41070 + --- a/elf/Makefile 41071 + +++ b/elf/Makefile 41072 + @@ -266,6 +266,7 @@ tests-static-normal := \ 41073 + tst-array1-static \ 41074 + tst-array5-static \ 41075 + tst-dl-iter-static \ 41076 + + tst-dlopen-sgid \ 41077 + tst-dst-static \ 41078 + tst-env-setuid-static \ 41079 + tst-getauxval-static \ 41080 + @@ -859,6 +860,7 @@ modules-names += \ 41081 + tst-dlmopen-twice-mod1 \ 41082 + tst-dlmopen-twice-mod2 \ 41083 + tst-dlmopen1mod \ 41084 + + tst-dlopen-sgid-mod \ 41085 + tst-dlopen-tlsreinitmod1 \ 41086 + tst-dlopen-tlsreinitmod2 \ 41087 + tst-dlopen-tlsreinitmod3 \ 41088 + @@ -3153,3 +3155,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so 41089 + tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so 41090 + $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so 41091 + tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so 41092 + + 41093 + +$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so 41094 + diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c 41095 + new file mode 100644 41096 + index 0000000000..5eb79eef48 41097 + --- /dev/null 41098 + +++ b/elf/tst-dlopen-sgid-mod.c 41099 + @@ -0,0 +1 @@ 41100 + +/* Opening this object should not succeed. */ 41101 + diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c 41102 + new file mode 100644 41103 + index 0000000000..47829a405e 41104 + --- /dev/null 41105 + +++ b/elf/tst-dlopen-sgid.c 41106 + @@ -0,0 +1,104 @@ 41107 + +/* Test case for ignored LD_LIBRARY_PATH in static startug (bug 32976). 41108 + + Copyright (C) 2025 Free Software Foundation, Inc. 41109 + + This file is part of the GNU C Library. 
41110 + + 41111 + + The GNU C Library is free software; you can redistribute it and/or 41112 + + modify it under the terms of the GNU Lesser General Public 41113 + + License as published by the Free Software Foundation; either 41114 + + version 2.1 of the License, or (at your option) any later version. 41115 + + 41116 + + The GNU C Library is distributed in the hope that it will be useful, 41117 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 41118 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 41119 + + Lesser General Public License for more details. 41120 + + 41121 + + You should have received a copy of the GNU Lesser General Public 41122 + + License along with the GNU C Library; if not, see 41123 + + <https://www.gnu.org/licenses/>. */ 41124 + + 41125 + +#include <dlfcn.h> 41126 + +#include <gnu/lib-names.h> 41127 + +#include <stddef.h> 41128 + +#include <stdint.h> 41129 + +#include <stdlib.h> 41130 + +#include <string.h> 41131 + +#include <support/capture_subprocess.h> 41132 + +#include <support/check.h> 41133 + +#include <support/support.h> 41134 + +#include <support/temp_file.h> 41135 + +#include <unistd.h> 41136 + + 41137 + +/* This is the name of our test object. Use a custom module for 41138 + + testing, so that this object does not get picked up from the system 41139 + + path. */ 41140 + +static const char dso_name[] = "tst-dlopen-sgid-mod.so"; 41141 + + 41142 + +/* Used to mark the recursive invocation. */ 41143 + +static const char magic_argument[] = "run-actual-test"; 41144 + + 41145 + +static int 41146 + +do_test (void) 41147 + +{ 41148 + +/* Pathname of the directory that receives the shared objects this 41149 + + test attempts to load. */ 41150 + + char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); 41151 + + 41152 + + /* This is supposed to be ignored and stripped. */ 41153 + + TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); 41154 + + 41155 + + /* Copy of libc.so.6. 
*/ 41156 + + { 41157 + + char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); 41158 + + char *to = xasprintf ("%s/%s", libdir, LIBC_SO); 41159 + + add_temp_file (to); 41160 + + support_copy_file (from, to); 41161 + + free (to); 41162 + + free (from); 41163 + + } 41164 + + 41165 + + /* Copy of the test object. */ 41166 + + { 41167 + + char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); 41168 + + char *to = xasprintf ("%s/%s", libdir, dso_name); 41169 + + add_temp_file (to); 41170 + + support_copy_file (from, to); 41171 + + free (to); 41172 + + free (from); 41173 + + } 41174 + + 41175 + + TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); 41176 + + 41177 + + free (libdir); 41178 + + 41179 + + return 0; 41180 + +} 41181 + + 41182 + +static void 41183 + +alternative_main (int argc, char **argv) 41184 + +{ 41185 + + if (argc == 2 && strcmp (argv[1], magic_argument) == 0) 41186 + + { 41187 + + if (getgid () == getegid ()) 41188 + + /* This can happen if the file system is mounted nosuid. */ 41189 + + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", 41190 + + (intmax_t) getgid ()); 41191 + + 41192 + + /* Should be removed due to SGID. 
*/ 41193 + + TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); 41194 + + 41195 + + TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); 41196 + + { 41197 + + const char *message = dlerror (); 41198 + + TEST_COMPARE_STRING (message, 41199 + + "tst-dlopen-sgid-mod.so:" 41200 + + " cannot open shared object file:" 41201 + + " No such file or directory"); 41202 + + } 41203 + + 41204 + + support_record_failure_barrier (); 41205 + + exit (EXIT_SUCCESS); 41206 + + } 41207 + +} 41208 + + 41209 + +#define PREPARE alternative_main 41210 + +#include <support/test-driver.c> 41211 + 41212 + commit 56e75b810ac39b0e390be5b66397dca0cdfa4d80 41213 + Author: Sunil K Pandey <sunil.k.pandey@intel.com> 41214 + Date: Tue May 20 10:07:27 2025 -0700 41215 + 41216 + x86_64: Fix typo in ifunc-impl-list.c. 41217 + 41218 + Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c. 41219 + 41220 + Reviewed-by: H.J. Lu <hjl.tools@gmail.com> 41221 + (cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c) 41222 + 41223 + diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c 41224 + index 0bbb71bbbf..3db45db39b 100644 41225 + --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c 41226 + +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c 41227 + @@ -922,7 +922,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 41228 + (CPU_FEATURE_USABLE (AVX2) 41229 + && CPU_FEATURE_USABLE (BMI2)), 41230 + __wcsncpy_avx2) 41231 + - X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, 41232 + + X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, 41233 + 1, 41234 + __wcsncpy_generic)) 41235 + 41236 + @@ -952,7 +952,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 41237 + (CPU_FEATURE_USABLE (AVX2) 41238 + && CPU_FEATURE_USABLE (BMI2)), 41239 + __wcpncpy_avx2) 41240 + - X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, 41241 + + X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, 41242 + 1, 41243 + __wcpncpy_generic)) 41244 + 41245 + 41246 + commit 
c8e10f14328518954072df64aafd574e67cfdde5 41247 + Author: Florian Weimer <fweimer@redhat.com> 41248 + Date: Wed May 21 08:43:32 2025 +0200 41249 + 41250 + elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987) 41251 + 41252 + This should really move into support_capture_subprogram_self_sgid. 41253 + 41254 + Reviewed-by: Sam James <sam@gentoo.org> 41255 + (cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2) 41256 + 41257 + diff --git a/NEWS b/NEWS 41258 + index 7a6985f5dd..4b290ad4bf 100644 41259 + --- a/NEWS 41260 + +++ b/NEWS 41261 + @@ -23,6 +23,7 @@ The following bugs are resolved with this release: 41262 + [32245] glibc -Wstringop-overflow= build failure on hppa 41263 + [32470] x86: Avoid integer truncation with large cache sizes 41264 + [32810] Crash on x86-64 if XSAVEC disable via tunable 41265 + + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid 41266 + 41267 + Version 2.40 41268 + 41269 + diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c 41270 + index 47829a405e..5688b79f2e 100644 41271 + --- a/elf/tst-dlopen-sgid.c 41272 + +++ b/elf/tst-dlopen-sgid.c 41273 + @@ -26,6 +26,8 @@ 41274 + #include <support/check.h> 41275 + #include <support/support.h> 41276 + #include <support/temp_file.h> 41277 + +#include <support/test-driver.h> 41278 + +#include <sys/wait.h> 41279 + #include <unistd.h> 41280 + 41281 + /* This is the name of our test object. 
Use a custom module for 41282 + @@ -66,10 +68,16 @@ do_test (void) 41283 + free (from); 41284 + } 41285 + 41286 + - TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); 41287 + - 41288 + free (libdir); 41289 + 41290 + + int status = support_capture_subprogram_self_sgid (magic_argument); 41291 + + 41292 + + if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) 41293 + + return EXIT_UNSUPPORTED; 41294 + + 41295 + + if (!WIFEXITED (status)) 41296 + + FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); 41297 + + 41298 + return 0; 41299 + } 41300 + 41301 + 41302 + commit 42a5a940c974d02540c8da26d6374c744d148cb9 41303 + Author: Carlos O'Donell <carlos@redhat.com> 41304 + Date: Wed Jun 11 09:19:17 2025 -0400 41305 + 41306 + ppc64le: Revert "powerpc: Optimized strncmp for power10" (CVE-2025-5745) 41307 + 41308 + This reverts commit 23f0d81608d0ca6379894ef81670cf30af7fd081 41309 + 41310 + Reason for revert: Power10 strncmp clobbers non-volatile vector 41311 + registers (Bug 33060) 41312 + 41313 + Tested on ppc64le with no regressions. 41314 + 41315 + (cherry picked from commit 63c60101ce7c5eac42be90f698ba02099b41b965) 41316 + 41317 + diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S 41318 + deleted file mode 100644 41319 + index d4ba76acae..0000000000 41320 + --- a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S 41321 + +++ /dev/null 41322 + @@ -1,271 +0,0 @@ 41323 + -/* Optimized strncmp implementation for PowerPC64/POWER10. 41324 + - Copyright (C) 2024 Free Software Foundation, Inc. 41325 + - This file is part of the GNU C Library. 41326 + - 41327 + - The GNU C Library is free software; you can redistribute it and/or 41328 + - modify it under the terms of the GNU Lesser General Public 41329 + - License as published by the Free Software Foundation; either 41330 + - version 2.1 of the License, or (at your option) any later version. 
41331 + - 41332 + - The GNU C Library is distributed in the hope that it will be useful, 41333 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 41334 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 41335 + - Lesser General Public License for more details. 41336 + - 41337 + - You should have received a copy of the GNU Lesser General Public 41338 + - License along with the GNU C Library; if not, see 41339 + - <https://www.gnu.org/licenses/>. */ 41340 + - 41341 + -#include <sysdep.h> 41342 + - 41343 + -/* Implements the function 41344 + - 41345 + - int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n) 41346 + - 41347 + - The implementation uses unaligned doubleword access to avoid specialized 41348 + - code paths depending of data alignment for first 32 bytes and uses 41349 + - vectorised loops after that. */ 41350 + - 41351 + -#ifndef STRNCMP 41352 + -# define STRNCMP strncmp 41353 + -#endif 41354 + - 41355 + -/* TODO: Change this to actual instructions when minimum binutils is upgraded 41356 + - to 2.27. Macros are defined below for these newer instructions in order 41357 + - to maintain compatibility. */ 41358 + - 41359 + -#define LXVP(xtp,dq,ra) \ 41360 + - .long(((6)<<(32-6)) \ 41361 + - | ((((xtp)-32)>>1)<<(32-10)) \ 41362 + - | ((1)<<(32-11)) \ 41363 + - | ((ra)<<(32-16)) \ 41364 + - | dq) 41365 + - 41366 + -#define COMPARE_16(vreg1,vreg2,offset) \ 41367 + - lxv vreg1+32,offset(r3); \ 41368 + - lxv vreg2+32,offset(r4); \ 41369 + - vcmpnezb. v7,vreg1,vreg2; \ 41370 + - bne cr6,L(different); \ 41371 + - cmpldi cr7,r5,16; \ 41372 + - ble cr7,L(ret0); \ 41373 + - addi r5,r5,-16; 41374 + - 41375 + -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ 41376 + - LXVP(vreg1+32,offset,r3); \ 41377 + - LXVP(vreg2+32,offset,r4); \ 41378 + - vcmpnezb. v7,vreg1+1,vreg2+1; \ 41379 + - bne cr6,L(label1); \ 41380 + - vcmpnezb. 
v7,vreg1,vreg2; \ 41381 + - bne cr6,L(label2); \ 41382 + - cmpldi cr7,r5,32; \ 41383 + - ble cr7,L(ret0); \ 41384 + - addi r5,r5,-32; 41385 + - 41386 + -#define TAIL_FIRST_16B(vreg1,vreg2) \ 41387 + - vctzlsbb r6,v7; \ 41388 + - cmpld cr7,r5,r6; \ 41389 + - ble cr7,L(ret0); \ 41390 + - vextubrx r5,r6,vreg1; \ 41391 + - vextubrx r4,r6,vreg2; \ 41392 + - subf r3,r4,r5; \ 41393 + - blr; 41394 + - 41395 + -#define TAIL_SECOND_16B(vreg1,vreg2) \ 41396 + - vctzlsbb r6,v7; \ 41397 + - addi r0,r6,16; \ 41398 + - cmpld cr7,r5,r0; \ 41399 + - ble cr7,L(ret0); \ 41400 + - vextubrx r5,r6,vreg1; \ 41401 + - vextubrx r4,r6,vreg2; \ 41402 + - subf r3,r4,r5; \ 41403 + - blr; 41404 + - 41405 + -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ 41406 + - sldi r6,len_reg,56; \ 41407 + - lxvl 32+v4,reg1,r6; \ 41408 + - lxvl 32+v5,reg2,r6; \ 41409 + - add reg1,reg1,len_reg; \ 41410 + - add reg2,reg2,len_reg; \ 41411 + - vcmpnezb v7,v4,v5; \ 41412 + - vctzlsbb r6,v7; \ 41413 + - cmpld cr7,r6,len_reg; \ 41414 + - blt cr7,L(different); \ 41415 + - cmpld cr7,r5,len_reg; \ 41416 + - ble cr7,L(ret0); \ 41417 + - sub r5,r5,len_reg; \ 41418 + - 41419 + - /* TODO: change this to .machine power10 when the minimum required 41420 + - binutils allows it. */ 41421 + - .machine power9 41422 + -ENTRY_TOCLESS (STRNCMP, 4) 41423 + - /* Check if size is 0. */ 41424 + - cmpdi cr0,r5,0 41425 + - beq cr0,L(ret0) 41426 + - andi. r7,r3,4095 41427 + - andi. r8,r4,4095 41428 + - cmpldi cr0,r7,4096-16 41429 + - cmpldi cr1,r8,4096-16 41430 + - bgt cr0,L(crosses) 41431 + - bgt cr1,L(crosses) 41432 + - COMPARE_16(v4,v5,0) 41433 + - addi r3,r3,16 41434 + - addi r4,r4,16 41435 + - 41436 + -L(crosses): 41437 + - andi. r7,r3,15 41438 + - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ 41439 + - andi. r9,r4,15 41440 + - subfic r8,r9,16 /* r8(nalign2) = 16 - (str2 & 15). 
*/ 41441 + - cmpld cr7,r7,r8 41442 + - beq cr7,L(same_aligned) 41443 + - blt cr7,L(nalign1_min) 41444 + - 41445 + - /* nalign2 is minimum and s2 pointer is aligned. */ 41446 + - CHECK_N_BYTES(r3,r4,r8) 41447 + - /* Are we on the 64B hunk which crosses a page? */ 41448 + - andi. r10,r3,63 /* Determine offset into 64B hunk. */ 41449 + - andi. r8,r3,15 /* The offset into the 16B hunk. */ 41450 + - neg r7,r3 41451 + - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ 41452 + - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ 41453 + - beq L(compare_64_pagecross) 41454 + - mtctr r7 41455 + - b L(compare_64B_unaligned) 41456 + - 41457 + - /* nalign1 is minimum and s1 pointer is aligned. */ 41458 + -L(nalign1_min): 41459 + - CHECK_N_BYTES(r3,r4,r7) 41460 + - /* Are we on the 64B hunk which crosses a page? */ 41461 + - andi. r10,r4,63 /* Determine offset into 64B hunk. */ 41462 + - andi. r8,r4,15 /* The offset into the 16B hunk. */ 41463 + - neg r7,r4 41464 + - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ 41465 + - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ 41466 + - beq L(compare_64_pagecross) 41467 + - mtctr r7 41468 + - 41469 + - .p2align 5 41470 + -L(compare_64B_unaligned): 41471 + - COMPARE_16(v4,v5,0) 41472 + - COMPARE_16(v4,v5,16) 41473 + - COMPARE_16(v4,v5,32) 41474 + - COMPARE_16(v4,v5,48) 41475 + - addi r3,r3,64 41476 + - addi r4,r4,64 41477 + - bdnz L(compare_64B_unaligned) 41478 + - 41479 + - /* Cross the page boundary of s2, carefully. Only for first 41480 + - iteration we have to get the count of 64B blocks to be checked. 41481 + - From second iteration and beyond, loop counter is always 63. 
*/ 41482 + -L(compare_64_pagecross): 41483 + - li r11, 63 41484 + - mtctr r11 41485 + - cmpldi r10,16 41486 + - ble L(cross_4) 41487 + - cmpldi r10,32 41488 + - ble L(cross_3) 41489 + - cmpldi r10,48 41490 + - ble L(cross_2) 41491 + -L(cross_1): 41492 + - CHECK_N_BYTES(r3,r4,r9) 41493 + - CHECK_N_BYTES(r3,r4,r8) 41494 + - COMPARE_16(v4,v5,0) 41495 + - COMPARE_16(v4,v5,16) 41496 + - COMPARE_16(v4,v5,32) 41497 + - addi r3,r3,48 41498 + - addi r4,r4,48 41499 + - b L(compare_64B_unaligned) 41500 + -L(cross_2): 41501 + - COMPARE_16(v4,v5,0) 41502 + - addi r3,r3,16 41503 + - addi r4,r4,16 41504 + - CHECK_N_BYTES(r3,r4,r9) 41505 + - CHECK_N_BYTES(r3,r4,r8) 41506 + - COMPARE_16(v4,v5,0) 41507 + - COMPARE_16(v4,v5,16) 41508 + - addi r3,r3,32 41509 + - addi r4,r4,32 41510 + - b L(compare_64B_unaligned) 41511 + -L(cross_3): 41512 + - COMPARE_16(v4,v5,0) 41513 + - COMPARE_16(v4,v5,16) 41514 + - addi r3,r3,32 41515 + - addi r4,r4,32 41516 + - CHECK_N_BYTES(r3,r4,r9) 41517 + - CHECK_N_BYTES(r3,r4,r8) 41518 + - COMPARE_16(v4,v5,0) 41519 + - addi r3,r3,16 41520 + - addi r4,r4,16 41521 + - b L(compare_64B_unaligned) 41522 + -L(cross_4): 41523 + - COMPARE_16(v4,v5,0) 41524 + - COMPARE_16(v4,v5,16) 41525 + - COMPARE_16(v4,v5,32) 41526 + - addi r3,r3,48 41527 + - addi r4,r4,48 41528 + - CHECK_N_BYTES(r3,r4,r9) 41529 + - CHECK_N_BYTES(r3,r4,r8) 41530 + - b L(compare_64B_unaligned) 41531 + - 41532 + -L(same_aligned): 41533 + - CHECK_N_BYTES(r3,r4,r7) 41534 + - /* Align s1 to 32B and adjust s2 address. 41535 + - Use lxvp only if both s1 and s2 are 32B aligned. */ 41536 + - COMPARE_16(v4,v5,0) 41537 + - COMPARE_16(v4,v5,16) 41538 + - COMPARE_16(v4,v5,32) 41539 + - COMPARE_16(v4,v5,48) 41540 + - addi r3,r3,64 41541 + - addi r4,r4,64 41542 + - COMPARE_16(v4,v5,0) 41543 + - COMPARE_16(v4,v5,16) 41544 + - addi r5,r5,32 41545 + - 41546 + - clrldi r6,r3,59 41547 + - subfic r7,r6,32 41548 + - add r3,r3,r7 41549 + - add r4,r4,r7 41550 + - subf r5,r7,r5 41551 + - andi. 
r7,r4,0x1F 41552 + - beq cr0,L(32B_aligned_loop) 41553 + - 41554 + - .p2align 5 41555 + -L(16B_aligned_loop): 41556 + - COMPARE_16(v4,v5,0) 41557 + - COMPARE_16(v4,v5,16) 41558 + - COMPARE_16(v4,v5,32) 41559 + - COMPARE_16(v4,v5,48) 41560 + - addi r3,r3,64 41561 + - addi r4,r4,64 41562 + - b L(16B_aligned_loop) 41563 + - 41564 + - /* Calculate and return the difference. */ 41565 + -L(different): 41566 + - TAIL_FIRST_16B(v4,v5) 41567 + - 41568 + - .p2align 5 41569 + -L(32B_aligned_loop): 41570 + - COMPARE_32(v14,v16,0,tail1,tail2) 41571 + - COMPARE_32(v18,v20,32,tail3,tail4) 41572 + - COMPARE_32(v22,v24,64,tail5,tail6) 41573 + - COMPARE_32(v26,v28,96,tail7,tail8) 41574 + - addi r3,r3,128 41575 + - addi r4,r4,128 41576 + - b L(32B_aligned_loop) 41577 + - 41578 + -L(tail1): TAIL_FIRST_16B(v15,v17) 41579 + -L(tail2): TAIL_SECOND_16B(v14,v16) 41580 + -L(tail3): TAIL_FIRST_16B(v19,v21) 41581 + -L(tail4): TAIL_SECOND_16B(v18,v20) 41582 + -L(tail5): TAIL_FIRST_16B(v23,v25) 41583 + -L(tail6): TAIL_SECOND_16B(v22,v24) 41584 + -L(tail7): TAIL_FIRST_16B(v27,v29) 41585 + -L(tail8): TAIL_SECOND_16B(v26,v28) 41586 + - 41587 + - .p2align 5 41588 + -L(ret0): 41589 + - li r3,0 41590 + - blr 41591 + - 41592 + -END(STRNCMP) 41593 + -libc_hidden_builtin_def(strncmp) 41594 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile 41595 + index b847c19049..a38ff46448 100644 41596 + --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile 41597 + +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile 41598 + @@ -34,7 +34,7 @@ ifneq (,$(filter %le,$(config-machine))) 41599 + sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ 41600 + memmove-power10 memset-power10 rawmemchr-power9 \ 41601 + rawmemchr-power10 strcmp-power9 strcmp-power10 \ 41602 + - strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \ 41603 + + strncmp-power9 strcpy-power9 stpcpy-power9 \ 41604 + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 41605 + 
endif 41606 + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops 41607 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 41608 + index 2bb47d3527..30fd89e109 100644 41609 + --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 41610 + +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 41611 + @@ -164,9 +164,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 41612 + /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */ 41613 + IFUNC_IMPL (i, name, strncmp, 41614 + #ifdef __LITTLE_ENDIAN__ 41615 + - IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1 41616 + - && hwcap & PPC_FEATURE_HAS_VSX, 41617 + - __strncmp_power10) 41618 + IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 41619 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, 41620 + __strncmp_power9) 41621 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S 41622 + deleted file mode 100644 41623 + index d7026c12e2..0000000000 41624 + --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S 41625 + +++ /dev/null 41626 + @@ -1,25 +0,0 @@ 41627 + -/* Copyright (C) 2024 Free Software Foundation, Inc. 41628 + - This file is part of the GNU C Library. 41629 + - 41630 + - The GNU C Library is free software; you can redistribute it and/or 41631 + - modify it under the terms of the GNU Lesser General Public 41632 + - License as published by the Free Software Foundation; either 41633 + - version 2.1 of the License, or (at your option) any later version. 41634 + - 41635 + - The GNU C Library is distributed in the hope that it will be useful, 41636 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 41637 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 41638 + - Lesser General Public License for more details. 
41639 + - 41640 + - You should have received a copy of the GNU Lesser General Public 41641 + - License along with the GNU C Library; if not, see 41642 + - <https://www.gnu.org/licenses/>. */ 41643 + - 41644 + -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) 41645 + -#define STRNCMP __strncmp_power10 41646 + - 41647 + -#undef libc_hidden_builtin_def 41648 + -#define libc_hidden_builtin_def(name) 41649 + - 41650 + -#include <sysdeps/powerpc/powerpc64/le/power10/strncmp.S> 41651 + -#endif 41652 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c 41653 + index a5ed67f766..6178f4a432 100644 41654 + --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c 41655 + +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c 41656 + @@ -29,7 +29,6 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden; 41657 + extern __typeof (strncmp) __strncmp_power8 attribute_hidden; 41658 + # ifdef __LITTLE_ENDIAN__ 41659 + extern __typeof (strncmp) __strncmp_power9 attribute_hidden; 41660 + -extern __typeof (strncmp) __strncmp_power10 attribute_hidden; 41661 + # endif 41662 + # undef strncmp 41663 + 41664 + @@ -37,9 +36,6 @@ extern __typeof (strncmp) __strncmp_power10 attribute_hidden; 41665 + ifunc symbol properly. */ 41666 + libc_ifunc_redirected (__redirect_strncmp, strncmp, 41667 + # ifdef __LITTLE_ENDIAN__ 41668 + - (hwcap2 & PPC_FEATURE2_ARCH_3_1 41669 + - && hwcap & PPC_FEATURE_HAS_VSX) 41670 + - ? __strncmp_power10 : 41671 + (hwcap2 & PPC_FEATURE2_ARCH_3_00 41672 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) 41673 + ? 
__strncmp_power9 : 41674 + 41675 + commit 2ad6e55ea5cb23af5af7af35d5f80cd93032f96a 41676 + Author: Carlos O'Donell <carlos@redhat.com> 41677 + Date: Wed Jun 11 09:43:50 2025 -0400 41678 + 41679 + ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702) 41680 + 41681 + This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56 41682 + 41683 + This change is in the chain of the final revert that fixes the CVE 41684 + i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f 41685 + 41686 + Reason for revert: Power10 strcmp clobbers non-volatile vector 41687 + registers (Bug 33056) 41688 + 41689 + Tested on ppc64le with no regressions. 41690 + 41691 + (cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201) 41692 + 41693 + diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S 41694 + index f0d6732a25..00f1e9c170 100644 41695 + --- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S 41696 + +++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S 41697 + @@ -62,7 +62,7 @@ 41698 + lxvl 32+v5,reg2,r0; \ 41699 + add reg1,reg1,len_reg; \ 41700 + add reg2,reg2,len_reg; \ 41701 + - vcmpnezb v7,v4,v5; \ 41702 + + vcmpnezb. v7,v4,v5; \ 41703 + vctzlsbb r6,v7; \ 41704 + cmpld cr7,r6,len_reg; \ 41705 + blt cr7,L(different); \ 41706 + @@ -72,110 +72,70 @@ 41707 + 41708 + .machine power9 41709 + ENTRY_TOCLESS (STRCMP, 4) 41710 + - andi. r7,r3,4095 41711 + - andi. r8,r4,4095 41712 + - cmpldi cr0,r7,4096-16 41713 + - cmpldi cr1,r8,4096-16 41714 + - bgt cr0,L(crosses) 41715 + - bgt cr1,L(crosses) 41716 + - COMPARE_16(v4,v5,0) 41717 + - 41718 + -L(crosses): 41719 + - andi. r7,r3,15 41720 + - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ 41721 + - andi. r9,r4,15 41722 + - subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). 
*/ 41723 + - cmpld cr7,r7,r5 41724 + - beq cr7,L(same_aligned) 41725 + - blt cr7,L(nalign1_min) 41726 + + li r11,16 41727 + + /* eq bit of cr1 used as swap status flag to indicate if 41728 + + source pointers were swapped. */ 41729 + + crclr 4*cr1+eq 41730 + + vspltisb v19,-1 41731 + + andi. r7,r3,15 41732 + + sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ 41733 + + andi. r9,r4,15 41734 + + sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ 41735 + + cmpld cr7,r7,r5 41736 + + beq cr7,L(same_aligned) 41737 + + blt cr7,L(nalign1_min) 41738 + + /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the 41739 + + pointer which is closer to the next 16B boundary so that only 41740 + + one CHECK_N_BYTES is needed before entering the loop below. */ 41741 + + mr r8,r4 41742 + + mr r4,r3 41743 + + mr r3,r8 41744 + + mr r12,r7 41745 + + mr r7,r5 41746 + + mr r5,r12 41747 + + crset 4*cr1+eq /* Set bit on swapping source pointers. */ 41748 + 41749 + - /* nalign2 is minimum and s2 pointer is aligned. */ 41750 + - CHECK_N_BYTES(r3,r4,r5) 41751 + - /* Are we on the 64B hunk which crosses a page? */ 41752 + - andi. r10,r3,63 /* Determine offset into 64B hunk. */ 41753 + - andi. r8,r3,15 /* The offset into the 16B hunk. */ 41754 + - neg r7,r3 41755 + - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ 41756 + - rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ 41757 + - beq L(compare_64_pagecross) 41758 + - mtctr r7 41759 + - b L(compare_64B_unaligned) 41760 + - 41761 + - /* nalign1 is minimum and s1 pointer is aligned. */ 41762 + + .p2align 5 41763 + L(nalign1_min): 41764 + CHECK_N_BYTES(r3,r4,r7) 41765 + - /* Are we on the 64B hunk which crosses a page? */ 41766 + - andi. r10,r4,63 /* Determine offset into 64B hunk. */ 41767 + - andi. r8,r4,15 /* The offset into the 16B hunk. */ 41768 + - neg r7,r4 41769 + - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ 41770 + - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. 
*/ 41771 + - beq L(compare_64_pagecross) 41772 + - mtctr r7 41773 + 41774 + .p2align 5 41775 + -L(compare_64B_unaligned): 41776 + - COMPARE_16(v4,v5,0) 41777 + - COMPARE_16(v4,v5,16) 41778 + - COMPARE_16(v4,v5,32) 41779 + - COMPARE_16(v4,v5,48) 41780 + - addi r3,r3,64 41781 + - addi r4,r4,64 41782 + - bdnz L(compare_64B_unaligned) 41783 + +L(s1_aligned): 41784 + + /* r9 and r5 is number of bytes to be read after and before 41785 + + page boundary correspondingly. */ 41786 + + sub r5,r5,r7 41787 + + subfic r9,r5,16 41788 + + /* Now let r7 hold the count of quadwords which can be 41789 + + checked without crossing a page boundary. quadword offset is 41790 + + (str2>>4)&0xFF. */ 41791 + + rlwinm r7,r4,28,0xFF 41792 + + /* Below check is required only for first iteration. For second 41793 + + iteration and beyond, the new loop counter is always 255. */ 41794 + + cmpldi r7,255 41795 + + beq L(L3) 41796 + + /* Get the initial loop count by 255-((str2>>4)&0xFF). */ 41797 + + subfic r11,r7,255 41798 + 41799 + - /* Cross the page boundary of s2, carefully. Only for first 41800 + - iteration we have to get the count of 64B blocks to be checked. 41801 + - From second iteration and beyond, loop counter is always 63. 
*/ 41802 + -L(compare_64_pagecross): 41803 + - li r11, 63 41804 + + .p2align 5 41805 + +L(L1): 41806 + mtctr r11 41807 + - cmpldi r10,16 41808 + - ble L(cross_4) 41809 + - cmpldi r10,32 41810 + - ble L(cross_3) 41811 + - cmpldi r10,48 41812 + - ble L(cross_2) 41813 + -L(cross_1): 41814 + - CHECK_N_BYTES(r3,r4,r9) 41815 + - CHECK_N_BYTES(r3,r4,r8) 41816 + - COMPARE_16(v4,v5,0) 41817 + - COMPARE_16(v4,v5,16) 41818 + - COMPARE_16(v4,v5,32) 41819 + - addi r3,r3,48 41820 + - addi r4,r4,48 41821 + - b L(compare_64B_unaligned) 41822 + -L(cross_2): 41823 + - COMPARE_16(v4,v5,0) 41824 + - addi r3,r3,16 41825 + - addi r4,r4,16 41826 + - CHECK_N_BYTES(r3,r4,r9) 41827 + - CHECK_N_BYTES(r3,r4,r8) 41828 + - COMPARE_16(v4,v5,0) 41829 + - COMPARE_16(v4,v5,16) 41830 + - addi r3,r3,32 41831 + - addi r4,r4,32 41832 + - b L(compare_64B_unaligned) 41833 + -L(cross_3): 41834 + - COMPARE_16(v4,v5,0) 41835 + - COMPARE_16(v4,v5,16) 41836 + - addi r3,r3,32 41837 + - addi r4,r4,32 41838 + - CHECK_N_BYTES(r3,r4,r9) 41839 + - CHECK_N_BYTES(r3,r4,r8) 41840 + - COMPARE_16(v4,v5,0) 41841 + + 41842 + + .p2align 5 41843 + +L(L2): 41844 + + COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ 41845 + addi r3,r3,16 41846 + addi r4,r4,16 41847 + - b L(compare_64B_unaligned) 41848 + -L(cross_4): 41849 + - COMPARE_16(v4,v5,0) 41850 + - COMPARE_16(v4,v5,16) 41851 + - COMPARE_16(v4,v5,32) 41852 + - addi r3,r3,48 41853 + - addi r4,r4,48 41854 + + bdnz L(L2) 41855 + + /* Cross the page boundary of s2, carefully. */ 41856 + + 41857 + + .p2align 5 41858 + +L(L3): 41859 + + CHECK_N_BYTES(r3,r4,r5) 41860 + CHECK_N_BYTES(r3,r4,r9) 41861 + - CHECK_N_BYTES(r3,r4,r8) 41862 + - b L(compare_64B_unaligned) 41863 + + li r11,255 /* Load the new loop counter. */ 41864 + + b L(L1) 41865 + 41866 + + .p2align 5 41867 + L(same_aligned): 41868 + CHECK_N_BYTES(r3,r4,r7) 41869 + /* Align s1 to 32B and adjust s2 address. 41870 + @@ -208,7 +168,18 @@ L(16B_aligned_loop): 41871 + 41872 + /* Calculate and return the difference. 
*/ 41873 + L(different): 41874 + - TAIL(v4,v5) 41875 + + vctzlsbb r6,v7 41876 + + vextubrx r5,r6,v4 41877 + + vextubrx r4,r6,v5 41878 + + bt 4*cr1+eq,L(swapped) 41879 + + subf r3,r4,r5 41880 + + blr 41881 + + 41882 + + /* If src pointers were swapped, then swap the 41883 + + indices and calculate the return value. */ 41884 + +L(swapped): 41885 + + subf r3,r5,r4 41886 + + blr 41887 + 41888 + .p2align 5 41889 + L(32B_aligned_loop): 41890 + 41891 + commit 672f31b90e501b4ba10ba12ab4c6051f77589912 41892 + Author: Carlos O'Donell <carlos@redhat.com> 41893 + Date: Wed Jun 11 09:33:45 2025 -0400 41894 + 41895 + ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059) 41896 + 41897 + This reverts commit b9182c793caa05df5d697427c0538936e6396d4b 41898 + 41899 + Reason for revert: Power10 memchr clobbers v20 vector register 41900 + (Bug 33059) 41901 + 41902 + This is not a security issue, unlike CVE-2025-5745 and 41903 + CVE-2025-5702. 41904 + 41905 + Tested on ppc64le without regression. 41906 + 41907 + (cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8) 41908 + 41909 + diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S 41910 + deleted file mode 100644 41911 + index 53e5716d72..0000000000 41912 + --- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S 41913 + +++ /dev/null 41914 + @@ -1,315 +0,0 @@ 41915 + -/* Optimized memchr implementation for POWER10 LE. 41916 + - Copyright (C) 2021-2024 Free Software Foundation, Inc. 41917 + - This file is part of the GNU C Library. 41918 + - 41919 + - The GNU C Library is free software; you can redistribute it and/or 41920 + - modify it under the terms of the GNU Lesser General Public 41921 + - License as published by the Free Software Foundation; either 41922 + - version 2.1 of the License, or (at your option) any later version. 
41923 + - 41924 + - The GNU C Library is distributed in the hope that it will be useful, 41925 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 41926 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 41927 + - Lesser General Public License for more details. 41928 + - 41929 + - You should have received a copy of the GNU Lesser General Public 41930 + - License along with the GNU C Library; if not, see 41931 + - <https://www.gnu.org/licenses/>. */ 41932 + - 41933 + -#include <sysdep.h> 41934 + - 41935 + -# ifndef MEMCHR 41936 + -# define MEMCHR __memchr 41937 + -# endif 41938 + -# define M_VREG_ZERO v20 41939 + -# define M_OFF_START_LOOP 256 41940 + -# define MEMCHR_SUBTRACT_VECTORS \ 41941 + - vsububm v4,v4,v18; \ 41942 + - vsububm v5,v5,v18; \ 41943 + - vsububm v6,v6,v18; \ 41944 + - vsububm v7,v7,v18; 41945 + -# define M_TAIL(vreg,increment) \ 41946 + - vctzlsbb r4,vreg; \ 41947 + - cmpld r5,r4; \ 41948 + - ble L(null); \ 41949 + - addi r4,r4,increment; \ 41950 + - add r3,r6,r4; \ 41951 + - blr 41952 + - 41953 + -/* TODO: Replace macros by the actual instructions when minimum binutils becomes 41954 + - >= 2.35. This is used to keep compatibility with older versions. */ 41955 + -#define M_VEXTRACTBM(rt,vrb) \ 41956 + - .long(((4)<<(32-6)) \ 41957 + - | ((rt)<<(32-11)) \ 41958 + - | ((8)<<(32-16)) \ 41959 + - | ((vrb)<<(32-21)) \ 41960 + - | 1602) 41961 + - 41962 + -#define M_LXVP(xtp,dq,ra) \ 41963 + - .long(((6)<<(32-6)) \ 41964 + - | ((((xtp)-32)>>1)<<(32-10)) \ 41965 + - | ((1)<<(32-11)) \ 41966 + - | ((ra)<<(32-16)) \ 41967 + - | dq) 41968 + - 41969 + -#define CHECK16B(vreg,offset,addr,label) \ 41970 + - lxv vreg+32,offset(addr); \ 41971 + - vcmpequb. vreg,vreg,v18; \ 41972 + - bne cr6,L(label); \ 41973 + - cmpldi r5,16; \ 41974 + - ble L(null); \ 41975 + - addi r5,r5,-16; 41976 + - 41977 + -/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # 41978 + - of bytes already checked. 
*/ 41979 + -#define CHECK64B(offset,addr,label) \ 41980 + - M_LXVP(v4+32,offset,addr); \ 41981 + - M_LXVP(v6+32,offset+32,addr); \ 41982 + - MEMCHR_SUBTRACT_VECTORS; \ 41983 + - vminub v14,v4,v5; \ 41984 + - vminub v15,v6,v7; \ 41985 + - vminub v16,v14,v15; \ 41986 + - vcmpequb. v0,v16,M_VREG_ZERO; \ 41987 + - beq cr6,$+12; \ 41988 + - li r7,offset; \ 41989 + - b L(label); \ 41990 + - cmpldi r5,64; \ 41991 + - ble L(null); \ 41992 + - addi r5,r5,-64 41993 + - 41994 + -/* Implements the function 41995 + - void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */ 41996 + - 41997 + - .machine power9 41998 + - 41999 + -ENTRY_TOCLESS (MEMCHR) 42000 + - CALL_MCOUNT 3 42001 + - 42002 + - cmpldi r5,0 42003 + - beq L(null) 42004 + - mr r0,r5 42005 + - xori r6,r4,0xff 42006 + - 42007 + - mtvsrd v18+32,r4 /* matching char in v18 */ 42008 + - mtvsrd v19+32,r6 /* non matching char in v19 */ 42009 + - 42010 + - vspltb v18,v18,7 /* replicate */ 42011 + - vspltb v19,v19,7 /* replicate */ 42012 + - vspltisb M_VREG_ZERO,0 42013 + - 42014 + - /* Next 16B-aligned address. Prepare address for L(aligned). */ 42015 + - addi r6,r3,16 42016 + - clrrdi r6,r6,4 42017 + - 42018 + - /* Align data and fill bytes not loaded with non matching char. */ 42019 + - lvx v0,0,r3 42020 + - lvsr v1,0,r3 42021 + - vperm v0,v19,v0,v1 42022 + - 42023 + - vcmpequb. v6,v0,v18 42024 + - bne cr6,L(found) 42025 + - sub r4,r6,r3 42026 + - cmpld r5,r4 42027 + - ble L(null) 42028 + - sub r5,r5,r4 42029 + - 42030 + - /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is 42031 + - optimized for longer strings, so checking the first bytes in 16B 42032 + - chunks benefits a lot small strings. 
*/ 42033 + - .p2align 5 42034 + -L(aligned): 42035 + - cmpldi r5,0 42036 + - beq L(null) 42037 + - 42038 + - CHECK16B(v0,0,r6,tail1) 42039 + - CHECK16B(v1,16,r6,tail2) 42040 + - CHECK16B(v2,32,r6,tail3) 42041 + - CHECK16B(v3,48,r6,tail4) 42042 + - CHECK16B(v4,64,r6,tail5) 42043 + - CHECK16B(v5,80,r6,tail6) 42044 + - CHECK16B(v6,96,r6,tail7) 42045 + - CHECK16B(v7,112,r6,tail8) 42046 + - CHECK16B(v8,128,r6,tail9) 42047 + - CHECK16B(v9,144,r6,tail10) 42048 + - CHECK16B(v10,160,r6,tail11) 42049 + - CHECK16B(v0,176,r6,tail12) 42050 + - CHECK16B(v1,192,r6,tail13) 42051 + - CHECK16B(v2,208,r6,tail14) 42052 + - CHECK16B(v3,224,r6,tail15) 42053 + - 42054 + - cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to 42055 + - choose how we will perform the main loop. */ 42056 + - 42057 + - /* Prepare address for the loop. */ 42058 + - addi r4,r3,M_OFF_START_LOOP 42059 + - clrrdi r4,r4,6 42060 + - sub r6,r4,r3 42061 + - sub r5,r0,r6 42062 + - addi r6,r4,128 42063 + - 42064 + - /* If c == 0, use the loop without the vsububm. */ 42065 + - beq cr5,L(loop) 42066 + - 42067 + - /* This is very similar to the block after L(loop), the difference is 42068 + - that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract 42069 + - each byte loaded by the char we are looking for, this way we can keep 42070 + - using vminub to merge the results and checking for nulls. */ 42071 + - .p2align 5 42072 + -L(memchr_loop): 42073 + - CHECK64B(0,r4,pre_tail_64b) 42074 + - CHECK64B(64,r4,pre_tail_64b) 42075 + - addi r4,r4,256 42076 + - 42077 + - CHECK64B(0,r6,tail_64b) 42078 + - CHECK64B(64,r6,tail_64b) 42079 + - addi r6,r6,256 42080 + - 42081 + - CHECK64B(0,r4,pre_tail_64b) 42082 + - CHECK64B(64,r4,pre_tail_64b) 42083 + - addi r4,r4,256 42084 + - 42085 + - CHECK64B(0,r6,tail_64b) 42086 + - CHECK64B(64,r6,tail_64b) 42087 + - addi r6,r6,256 42088 + - 42089 + - b L(memchr_loop) 42090 + - /* Switch to a more aggressive approach checking 64B each time. 
Use 2 42091 + - pointers 128B apart and unroll the loop once to make the pointer 42092 + - updates and usages separated enough to avoid stalls waiting for 42093 + - address calculation. */ 42094 + - .p2align 5 42095 + -L(loop): 42096 + -#undef MEMCHR_SUBTRACT_VECTORS 42097 + -#define MEMCHR_SUBTRACT_VECTORS /* nothing */ 42098 + - CHECK64B(0,r4,pre_tail_64b) 42099 + - CHECK64B(64,r4,pre_tail_64b) 42100 + - addi r4,r4,256 42101 + - 42102 + - CHECK64B(0,r6,tail_64b) 42103 + - CHECK64B(64,r6,tail_64b) 42104 + - addi r6,r6,256 42105 + - 42106 + - CHECK64B(0,r4,pre_tail_64b) 42107 + - CHECK64B(64,r4,pre_tail_64b) 42108 + - addi r4,r4,256 42109 + - 42110 + - CHECK64B(0,r6,tail_64b) 42111 + - CHECK64B(64,r6,tail_64b) 42112 + - addi r6,r6,256 42113 + - 42114 + - b L(loop) 42115 + - 42116 + - .p2align 5 42117 + -L(pre_tail_64b): 42118 + - mr r6,r4 42119 + -L(tail_64b): 42120 + - /* OK, we found a null byte. Let's look for it in the current 64-byte 42121 + - block and mark it in its corresponding VR. lxvp vx,0(ry) puts the 42122 + - low 16B bytes into vx+1, and the high into vx, so the order here is 42123 + - v5, v4, v7, v6. */ 42124 + - vcmpequb v1,v5,M_VREG_ZERO 42125 + - vcmpequb v2,v4,M_VREG_ZERO 42126 + - vcmpequb v3,v7,M_VREG_ZERO 42127 + - vcmpequb v4,v6,M_VREG_ZERO 42128 + - 42129 + - /* Take into account the other 64B blocks we had already checked. */ 42130 + - add r6,r6,r7 42131 + - /* Extract first bit of each byte. */ 42132 + - M_VEXTRACTBM(r8,v1) 42133 + - M_VEXTRACTBM(r9,v2) 42134 + - M_VEXTRACTBM(r10,v3) 42135 + - M_VEXTRACTBM(r11,v4) 42136 + - 42137 + - /* Shift each value into their corresponding position. */ 42138 + - sldi r9,r9,16 42139 + - sldi r10,r10,32 42140 + - sldi r11,r11,48 42141 + - 42142 + - /* Merge the results. */ 42143 + - or r8,r8,r9 42144 + - or r9,r10,r11 42145 + - or r11,r9,r8 42146 + - 42147 + - cnttzd r0,r11 /* Count trailing zeros before the match. 
*/ 42148 + - cmpld r5,r0 42149 + - ble L(null) 42150 + - add r3,r6,r0 /* Compute final address. */ 42151 + - blr 42152 + - 42153 + - .p2align 5 42154 + -L(tail1): 42155 + - M_TAIL(v0,0) 42156 + - 42157 + - .p2align 5 42158 + -L(tail2): 42159 + - M_TAIL(v1,16) 42160 + - 42161 + - .p2align 5 42162 + -L(tail3): 42163 + - M_TAIL(v2,32) 42164 + - 42165 + - .p2align 5 42166 + -L(tail4): 42167 + - M_TAIL(v3,48) 42168 + - 42169 + - .p2align 5 42170 + -L(tail5): 42171 + - M_TAIL(v4,64) 42172 + - 42173 + - .p2align 5 42174 + -L(tail6): 42175 + - M_TAIL(v5,80) 42176 + - 42177 + - .p2align 5 42178 + -L(tail7): 42179 + - M_TAIL(v6,96) 42180 + - 42181 + - .p2align 5 42182 + -L(tail8): 42183 + - M_TAIL(v7,112) 42184 + - 42185 + - .p2align 5 42186 + -L(tail9): 42187 + - M_TAIL(v8,128) 42188 + - 42189 + - .p2align 5 42190 + -L(tail10): 42191 + - M_TAIL(v9,144) 42192 + - 42193 + - .p2align 5 42194 + -L(tail11): 42195 + - M_TAIL(v10,160) 42196 + - 42197 + - .p2align 5 42198 + -L(tail12): 42199 + - M_TAIL(v0,176) 42200 + - 42201 + - .p2align 5 42202 + -L(tail13): 42203 + - M_TAIL(v1,192) 42204 + - 42205 + - .p2align 5 42206 + -L(tail14): 42207 + - M_TAIL(v2,208) 42208 + - 42209 + - .p2align 5 42210 + -L(tail15): 42211 + - M_TAIL(v3,224) 42212 + - 42213 + - .p2align 5 42214 + -L(found): 42215 + - vctzlsbb r7,v6 42216 + - cmpld r5,r7 42217 + - ble L(null) 42218 + - add r3,r3,r7 42219 + - blr 42220 + - 42221 + - .p2align 5 42222 + -L(null): 42223 + - li r3,0 42224 + - blr 42225 + - 42226 + -END (MEMCHR) 42227 + - 42228 + -weak_alias (__memchr, memchr) 42229 + -libc_hidden_builtin_def (memchr) 42230 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile 42231 + index a38ff46448..fa1107dfd9 100644 42232 + --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile 42233 + +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile 42234 + @@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ 42235 + 
strncase-power8 42236 + 42237 + ifneq (,$(filter %le,$(config-machine))) 42238 + -sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ 42239 + - memmove-power10 memset-power10 rawmemchr-power9 \ 42240 + - rawmemchr-power10 strcmp-power9 strcmp-power10 \ 42241 + - strncmp-power9 strcpy-power9 stpcpy-power9 \ 42242 + +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ 42243 + + rawmemchr-power9 rawmemchr-power10 \ 42244 + + strcmp-power9 strcmp-power10 strncmp-power9 \ 42245 + + strcpy-power9 stpcpy-power9 \ 42246 + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 42247 + endif 42248 + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops 42249 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42250 + index 30fd89e109..9b3e617306 100644 42251 + --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42252 + +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42253 + @@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 42254 + 42255 + /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */ 42256 + IFUNC_IMPL (i, name, memchr, 42257 + -#ifdef __LITTLE_ENDIAN__ 42258 + - IFUNC_IMPL_ADD (array, i, memchr, 42259 + - hwcap2 & PPC_FEATURE2_ARCH_3_1 42260 + - && hwcap & PPC_FEATURE_HAS_VSX, 42261 + - __memchr_power10) 42262 + -#endif 42263 + IFUNC_IMPL_ADD (array, i, memchr, 42264 + hwcap2 & PPC_FEATURE2_ARCH_2_07 42265 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, 42266 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S 42267 + deleted file mode 100644 42268 + index 7d35ef28a9..0000000000 42269 + --- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S 42270 + +++ /dev/null 42271 + @@ -1,28 +0,0 @@ 42272 + -/* Optimized memchr implementation for POWER10/PPC64. 42273 + - Copyright (C) 2016-2024 Free Software Foundation, Inc. 
42274 + - This file is part of the GNU C Library. 42275 + - 42276 + - The GNU C Library is free software; you can redistribute it and/or 42277 + - modify it under the terms of the GNU Lesser General Public 42278 + - License as published by the Free Software Foundation; either 42279 + - version 2.1 of the License, or (at your option) any later version. 42280 + - 42281 + - The GNU C Library is distributed in the hope that it will be useful, 42282 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 42283 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 42284 + - Lesser General Public License for more details. 42285 + - 42286 + - You should have received a copy of the GNU Lesser General Public 42287 + - License along with the GNU C Library; if not, see 42288 + - <https://www.gnu.org/licenses/>. */ 42289 + - 42290 + -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) 42291 + -#define MEMCHR __memchr_power10 42292 + - 42293 + -#undef libc_hidden_builtin_def 42294 + -#define libc_hidden_builtin_def(name) 42295 + -#undef weak_alias 42296 + -#define weak_alias(name,alias) 42297 + - 42298 + -#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S> 42299 + -#endif 42300 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c 42301 + index 57d23e7b18..b4655dfcaa 100644 42302 + --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c 42303 + +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c 42304 + @@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; 42305 + extern __typeof (__memchr) __memchr_power7 attribute_hidden; 42306 + extern __typeof (__memchr) __memchr_power8 attribute_hidden; 42307 + 42308 + -# ifdef __LITTLE_ENDIAN__ 42309 + -extern __typeof (__memchr) __memchr_power10 attribute_hidden; 42310 + -# endif 42311 + /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle 42312 + ifunc symbol properly. 
*/ 42313 + libc_ifunc (__memchr, 42314 + -# ifdef __LITTLE_ENDIAN__ 42315 + - (hwcap2 & PPC_FEATURE2_ARCH_3_1 42316 + - && hwcap & PPC_FEATURE_HAS_VSX) 42317 + - ? __memchr_power10 : 42318 + -# endif 42319 + - (hwcap2 & PPC_FEATURE2_ARCH_2_07 42320 + - && hwcap & PPC_FEATURE_HAS_ALTIVEC) 42321 + - ? __memchr_power8 : 42322 + - (hwcap & PPC_FEATURE_ARCH_2_06) 42323 + - ? __memchr_power7 42324 + - : __memchr_ppc); 42325 + + (hwcap2 & PPC_FEATURE2_ARCH_2_07 42326 + + && hwcap & PPC_FEATURE_HAS_ALTIVEC) 42327 + + ? __memchr_power8 : 42328 + + (hwcap & PPC_FEATURE_ARCH_2_06) 42329 + + ? __memchr_power7 42330 + + : __memchr_ppc); 42331 + 42332 + weak_alias (__memchr, memchr) 42333 + libc_hidden_builtin_def (memchr) 42334 + 42335 + commit 7e12550b8e3a11764a4a9090ce6bd3fc23fc8a8e 42336 + Author: Carlos O'Donell <carlos@redhat.com> 42337 + Date: Mon Jun 16 13:09:57 2025 -0400 42338 + 42339 + ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702) 42340 + 42341 + This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f 42342 + 42343 + Reason for revert: Power10 strcmp clobbers non-volatile vector 42344 + registers (Bug 33056) 42345 + 42346 + Tested on ppc64le without regression. 42347 + 42348 + (cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc) 42349 + 42350 + diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S 42351 + deleted file mode 100644 42352 + index 00f1e9c170..0000000000 42353 + --- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S 42354 + +++ /dev/null 42355 + @@ -1,204 +0,0 @@ 42356 + -/* Optimized strcmp implementation for PowerPC64/POWER10. 42357 + - Copyright (C) 2021-2024 Free Software Foundation, Inc. 42358 + - This file is part of the GNU C Library. 
42359 + - 42360 + - The GNU C Library is free software; you can redistribute it and/or 42361 + - modify it under the terms of the GNU Lesser General Public 42362 + - License as published by the Free Software Foundation; either 42363 + - version 2.1 of the License, or (at your option) any later version. 42364 + - 42365 + - The GNU C Library is distributed in the hope that it will be useful, 42366 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 42367 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 42368 + - Lesser General Public License for more details. 42369 + - 42370 + - You should have received a copy of the GNU Lesser General Public 42371 + - License along with the GNU C Library; if not, see 42372 + - <https://www.gnu.org/licenses/>. */ 42373 + -#include <sysdep.h> 42374 + - 42375 + -#ifndef STRCMP 42376 + -# define STRCMP strcmp 42377 + -#endif 42378 + - 42379 + -/* Implements the function 42380 + - int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ 42381 + - 42382 + -/* TODO: Change this to actual instructions when minimum binutils is upgraded 42383 + - to 2.27. Macros are defined below for these newer instructions in order 42384 + - to maintain compatibility. */ 42385 + - 42386 + -#define LXVP(xtp,dq,ra) \ 42387 + - .long(((6)<<(32-6)) \ 42388 + - | ((((xtp)-32)>>1)<<(32-10)) \ 42389 + - | ((1)<<(32-11)) \ 42390 + - | ((ra)<<(32-16)) \ 42391 + - | dq) 42392 + - 42393 + -#define COMPARE_16(vreg1,vreg2,offset) \ 42394 + - lxv vreg1+32,offset(r3); \ 42395 + - lxv vreg2+32,offset(r4); \ 42396 + - vcmpnezb. v7,vreg1,vreg2; \ 42397 + - bne cr6,L(different); \ 42398 + - 42399 + -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ 42400 + - LXVP(vreg1+32,offset,r3); \ 42401 + - LXVP(vreg2+32,offset,r4); \ 42402 + - vcmpnezb. v7,vreg1+1,vreg2+1; \ 42403 + - bne cr6,L(label1); \ 42404 + - vcmpnezb. 
v7,vreg1,vreg2; \ 42405 + - bne cr6,L(label2); \ 42406 + - 42407 + -#define TAIL(vreg1,vreg2) \ 42408 + - vctzlsbb r6,v7; \ 42409 + - vextubrx r5,r6,vreg1; \ 42410 + - vextubrx r4,r6,vreg2; \ 42411 + - subf r3,r4,r5; \ 42412 + - blr; \ 42413 + - 42414 + -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ 42415 + - sldi r0,len_reg,56; \ 42416 + - lxvl 32+v4,reg1,r0; \ 42417 + - lxvl 32+v5,reg2,r0; \ 42418 + - add reg1,reg1,len_reg; \ 42419 + - add reg2,reg2,len_reg; \ 42420 + - vcmpnezb. v7,v4,v5; \ 42421 + - vctzlsbb r6,v7; \ 42422 + - cmpld cr7,r6,len_reg; \ 42423 + - blt cr7,L(different); \ 42424 + - 42425 + - /* TODO: change this to .machine power10 when the minimum required 42426 + - binutils allows it. */ 42427 + - 42428 + - .machine power9 42429 + -ENTRY_TOCLESS (STRCMP, 4) 42430 + - li r11,16 42431 + - /* eq bit of cr1 used as swap status flag to indicate if 42432 + - source pointers were swapped. */ 42433 + - crclr 4*cr1+eq 42434 + - vspltisb v19,-1 42435 + - andi. r7,r3,15 42436 + - sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ 42437 + - andi. r9,r4,15 42438 + - sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ 42439 + - cmpld cr7,r7,r5 42440 + - beq cr7,L(same_aligned) 42441 + - blt cr7,L(nalign1_min) 42442 + - /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the 42443 + - pointer which is closer to the next 16B boundary so that only 42444 + - one CHECK_N_BYTES is needed before entering the loop below. */ 42445 + - mr r8,r4 42446 + - mr r4,r3 42447 + - mr r3,r8 42448 + - mr r12,r7 42449 + - mr r7,r5 42450 + - mr r5,r12 42451 + - crset 4*cr1+eq /* Set bit on swapping source pointers. */ 42452 + - 42453 + - .p2align 5 42454 + -L(nalign1_min): 42455 + - CHECK_N_BYTES(r3,r4,r7) 42456 + - 42457 + - .p2align 5 42458 + -L(s1_aligned): 42459 + - /* r9 and r5 is number of bytes to be read after and before 42460 + - page boundary correspondingly. 
*/ 42461 + - sub r5,r5,r7 42462 + - subfic r9,r5,16 42463 + - /* Now let r7 hold the count of quadwords which can be 42464 + - checked without crossing a page boundary. quadword offset is 42465 + - (str2>>4)&0xFF. */ 42466 + - rlwinm r7,r4,28,0xFF 42467 + - /* Below check is required only for first iteration. For second 42468 + - iteration and beyond, the new loop counter is always 255. */ 42469 + - cmpldi r7,255 42470 + - beq L(L3) 42471 + - /* Get the initial loop count by 255-((str2>>4)&0xFF). */ 42472 + - subfic r11,r7,255 42473 + - 42474 + - .p2align 5 42475 + -L(L1): 42476 + - mtctr r11 42477 + - 42478 + - .p2align 5 42479 + -L(L2): 42480 + - COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ 42481 + - addi r3,r3,16 42482 + - addi r4,r4,16 42483 + - bdnz L(L2) 42484 + - /* Cross the page boundary of s2, carefully. */ 42485 + - 42486 + - .p2align 5 42487 + -L(L3): 42488 + - CHECK_N_BYTES(r3,r4,r5) 42489 + - CHECK_N_BYTES(r3,r4,r9) 42490 + - li r11,255 /* Load the new loop counter. */ 42491 + - b L(L1) 42492 + - 42493 + - .p2align 5 42494 + -L(same_aligned): 42495 + - CHECK_N_BYTES(r3,r4,r7) 42496 + - /* Align s1 to 32B and adjust s2 address. 42497 + - Use lxvp only if both s1 and s2 are 32B aligned. */ 42498 + - COMPARE_16(v4,v5,0) 42499 + - COMPARE_16(v4,v5,16) 42500 + - COMPARE_16(v4,v5,32) 42501 + - COMPARE_16(v4,v5,48) 42502 + - addi r3,r3,64 42503 + - addi r4,r4,64 42504 + - COMPARE_16(v4,v5,0) 42505 + - COMPARE_16(v4,v5,16) 42506 + - 42507 + - clrldi r6,r3,59 42508 + - subfic r5,r6,32 42509 + - add r3,r3,r5 42510 + - add r4,r4,r5 42511 + - andi. r5,r4,0x1F 42512 + - beq cr0,L(32B_aligned_loop) 42513 + - 42514 + - .p2align 5 42515 + -L(16B_aligned_loop): 42516 + - COMPARE_16(v4,v5,0) 42517 + - COMPARE_16(v4,v5,16) 42518 + - COMPARE_16(v4,v5,32) 42519 + - COMPARE_16(v4,v5,48) 42520 + - addi r3,r3,64 42521 + - addi r4,r4,64 42522 + - b L(16B_aligned_loop) 42523 + - 42524 + - /* Calculate and return the difference. 
*/ 42525 + -L(different): 42526 + - vctzlsbb r6,v7 42527 + - vextubrx r5,r6,v4 42528 + - vextubrx r4,r6,v5 42529 + - bt 4*cr1+eq,L(swapped) 42530 + - subf r3,r4,r5 42531 + - blr 42532 + - 42533 + - /* If src pointers were swapped, then swap the 42534 + - indices and calculate the return value. */ 42535 + -L(swapped): 42536 + - subf r3,r5,r4 42537 + - blr 42538 + - 42539 + - .p2align 5 42540 + -L(32B_aligned_loop): 42541 + - COMPARE_32(v14,v16,0,tail1,tail2) 42542 + - COMPARE_32(v18,v20,32,tail3,tail4) 42543 + - COMPARE_32(v22,v24,64,tail5,tail6) 42544 + - COMPARE_32(v26,v28,96,tail7,tail8) 42545 + - addi r3,r3,128 42546 + - addi r4,r4,128 42547 + - b L(32B_aligned_loop) 42548 + - 42549 + -L(tail1): TAIL(v15,v17) 42550 + -L(tail2): TAIL(v14,v16) 42551 + -L(tail3): TAIL(v19,v21) 42552 + -L(tail4): TAIL(v18,v20) 42553 + -L(tail5): TAIL(v23,v25) 42554 + -L(tail6): TAIL(v22,v24) 42555 + -L(tail7): TAIL(v27,v29) 42556 + -L(tail8): TAIL(v26,v28) 42557 + - 42558 + -END (STRCMP) 42559 + -libc_hidden_builtin_def (strcmp) 42560 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile 42561 + index fa1107dfd9..9f15f3207f 100644 42562 + --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile 42563 + +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile 42564 + @@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ 42565 + ifneq (,$(filter %le,$(config-machine))) 42566 + sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ 42567 + rawmemchr-power9 rawmemchr-power10 \ 42568 + - strcmp-power9 strcmp-power10 strncmp-power9 \ 42569 + - strcpy-power9 stpcpy-power9 \ 42570 + + strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ 42571 + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 42572 + endif 42573 + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops 42574 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 
b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42575 + index 9b3e617306..78443b7f34 100644 42576 + --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42577 + +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c 42578 + @@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 42579 + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ 42580 + IFUNC_IMPL (i, name, strcmp, 42581 + #ifdef __LITTLE_ENDIAN__ 42582 + - IFUNC_IMPL_ADD (array, i, strcmp, 42583 + - (hwcap2 & PPC_FEATURE2_ARCH_3_1) 42584 + - && (hwcap & PPC_FEATURE_HAS_VSX), 42585 + - __strcmp_power10) 42586 + IFUNC_IMPL_ADD (array, i, strcmp, 42587 + hwcap2 & PPC_FEATURE2_ARCH_3_00 42588 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, 42589 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S 42590 + deleted file mode 100644 42591 + index 1a9f6069f5..0000000000 42592 + --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S 42593 + +++ /dev/null 42594 + @@ -1,26 +0,0 @@ 42595 + -/* Optimized strcmp implementation for POWER10/PPC64. 42596 + - Copyright (C) 2021-2024 Free Software Foundation, Inc. 42597 + - This file is part of the GNU C Library. 42598 + - 42599 + - The GNU C Library is free software; you can redistribute it and/or 42600 + - modify it under the terms of the GNU Lesser General Public 42601 + - License as published by the Free Software Foundation; either 42602 + - version 2.1 of the License, or (at your option) any later version. 42603 + - 42604 + - The GNU C Library is distributed in the hope that it will be useful, 42605 + - but WITHOUT ANY WARRANTY; without even the implied warranty of 42606 + - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 42607 + - Lesser General Public License for more details. 
42608 + - 42609 + - You should have received a copy of the GNU Lesser General Public 42610 + - License along with the GNU C Library; if not, see 42611 + - <https://www.gnu.org/licenses/>. */ 42612 + - 42613 + -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) 42614 + -#define STRCMP __strcmp_power10 42615 + - 42616 + -#undef libc_hidden_builtin_def 42617 + -#define libc_hidden_builtin_def(name) 42618 + - 42619 + -#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S> 42620 + -#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ 42621 + diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c 42622 + index ff32496fab..06b9b4090f 100644 42623 + --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c 42624 + +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c 42625 + @@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; 42626 + extern __typeof (strcmp) __strcmp_power8 attribute_hidden; 42627 + # ifdef __LITTLE_ENDIAN__ 42628 + extern __typeof (strcmp) __strcmp_power9 attribute_hidden; 42629 + -extern __typeof (strcmp) __strcmp_power10 attribute_hidden; 42630 + # endif 42631 + 42632 + # undef strcmp 42633 + 42634 + libc_ifunc_redirected (__redirect_strcmp, strcmp, 42635 + # ifdef __LITTLE_ENDIAN__ 42636 + - (hwcap2 & PPC_FEATURE2_ARCH_3_1 42637 + - && hwcap & PPC_FEATURE_HAS_VSX) 42638 + - ? __strcmp_power10 : 42639 + (hwcap2 & PPC_FEATURE2_ARCH_3_00 42640 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) 42641 + ? __strcmp_power9 : 42642 + 42643 + commit 23a02e382c8ffebfed00a082d8898f1aa468b5da 42644 + Author: Florian Weimer <fweimer@redhat.com> 42645 + Date: Wed May 21 16:47:34 2025 +0200 42646 + 42647 + support: Pick group in support_capture_subprogram_self_sgid if UID == 0 42648 + 42649 + When running as root, it is likely that we can run under any group. 42650 + Pick a harmless group from /etc/group in this case. 
42651 + 42652 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 42653 + (cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6) 42654 + 42655 + diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c 42656 + index 2383481911..1cb344eb04 100644 42657 + --- a/support/support_capture_subprocess.c 42658 + +++ b/support/support_capture_subprocess.c 42659 + @@ -21,7 +21,11 @@ 42660 + 42661 + #include <errno.h> 42662 + #include <fcntl.h> 42663 + +#include <grp.h> 42664 + +#include <scratch_buffer.h> 42665 + +#include <stdio_ext.h> 42666 + #include <stdlib.h> 42667 + +#include <string.h> 42668 + #include <support/check.h> 42669 + #include <support/xunistd.h> 42670 + #include <support/xsocket.h> 42671 + @@ -210,10 +214,48 @@ err: 42672 + return status; 42673 + } 42674 + 42675 + +/* Returns true if a group with NAME has been found, and writes its 42676 + + GID to *TARGET. */ 42677 + +static bool 42678 + +find_sgid_group (gid_t *target, const char *name) 42679 + +{ 42680 + + /* Do not use getgrname_r because it does not work in statically 42681 + + linked binaries if the system libc is different. 
*/ 42682 + + FILE *fp = fopen ("/etc/group", "rce"); 42683 + + if (fp == NULL) 42684 + + return false; 42685 + + __fsetlocking (fp, FSETLOCKING_BYCALLER); 42686 + + 42687 + + bool ok = false; 42688 + + struct scratch_buffer buf; 42689 + + scratch_buffer_init (&buf); 42690 + + while (true) 42691 + + { 42692 + + struct group grp; 42693 + + struct group *result = NULL; 42694 + + int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); 42695 + + if (status == 0 && result != NULL) 42696 + + { 42697 + + if (strcmp (result->gr_name, name) == 0) 42698 + + { 42699 + + *target = result->gr_gid; 42700 + + ok = true; 42701 + + break; 42702 + + } 42703 + + } 42704 + + else if (errno != ERANGE) 42705 + + break; 42706 + + else if (!scratch_buffer_grow (&buf)) 42707 + + break; 42708 + + } 42709 + + scratch_buffer_free (&buf); 42710 + + fclose (fp); 42711 + + return ok; 42712 + +} 42713 + + 42714 + int 42715 + support_capture_subprogram_self_sgid (const char *child_id) 42716 + { 42717 + - gid_t target = 0; 42718 + const int count = 64; 42719 + gid_t groups[count]; 42720 + 42721 + @@ -225,6 +267,7 @@ support_capture_subprogram_self_sgid (const char *child_id) 42722 + (intmax_t) getuid ()); 42723 + 42724 + gid_t current = getgid (); 42725 + + gid_t target = current; 42726 + for (int i = 0; i < ret; ++i) 42727 + { 42728 + if (groups[i] != current) 42729 + @@ -234,9 +277,16 @@ support_capture_subprogram_self_sgid (const char *child_id) 42730 + } 42731 + } 42732 + 42733 + - if (target == 0) 42734 + - FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", 42735 + - (intmax_t) getuid ()); 42736 + + if (target == current) 42737 + + { 42738 + + /* If running as root, try to find a harmless group for SGID. 
*/ 42739 + + if (getuid () != 0 42740 + + || (!find_sgid_group (&target, "nogroup") 42741 + + && !find_sgid_group (&target, "bin") 42742 + + && !find_sgid_group (&target, "daemon"))) 42743 + + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", 42744 + + (intmax_t) getuid ()); 42745 + + } 42746 + 42747 + return copy_and_spawn_sgid (child_id, target); 42748 + } 42749 + 42750 + commit dbc83657e290bdad3245259be80fb84cbe10304c 42751 + Author: Florian Weimer <fweimer@redhat.com> 42752 + Date: Thu May 22 14:36:37 2025 +0200 42753 + 42754 + Fix error reporting (false negatives) in SGID tests 42755 + 42756 + And simplify the interface of support_capture_subprogram_self_sgid. 42757 + 42758 + Use the existing framework for temporary directories (now with 42759 + mode 0700) and directory/file deletion. Handle all execution 42760 + errors within support_capture_subprogram_self_sgid. In particular, 42761 + this includes test failures because the invoked program did not 42762 + exit with exit status zero. Existing tests that expect exit 42763 + status 42 are adjusted to use zero instead. 42764 + 42765 + In addition, fix callers not to call exit (0) with test failures 42766 + pending (which may mask them, especially when running with --direct). 42767 + 42768 + Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2 42769 + ("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)"). 
42770 + 42771 + Reviewed-by: Carlos O'Donell <carlos@redhat.com> 42772 + (cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7) 42773 + 42774 + diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c 42775 + index 5688b79f2e..8aec52e19f 100644 42776 + --- a/elf/tst-dlopen-sgid.c 42777 + +++ b/elf/tst-dlopen-sgid.c 42778 + @@ -70,13 +70,7 @@ do_test (void) 42779 + 42780 + free (libdir); 42781 + 42782 + - int status = support_capture_subprogram_self_sgid (magic_argument); 42783 + - 42784 + - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) 42785 + - return EXIT_UNSUPPORTED; 42786 + - 42787 + - if (!WIFEXITED (status)) 42788 + - FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); 42789 + + support_capture_subprogram_self_sgid (magic_argument); 42790 + 42791 + return 0; 42792 + } 42793 + diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c 42794 + index a47219047f..233eec7631 100644 42795 + --- a/elf/tst-env-setuid-tunables.c 42796 + +++ b/elf/tst-env-setuid-tunables.c 42797 + @@ -105,10 +105,7 @@ do_test (int argc, char **argv) 42798 + 42799 + if (ret != 0) 42800 + exit (1); 42801 + - 42802 + - /* Special return code to make sure that the child executed all the way 42803 + - through. */ 42804 + - exit (42); 42805 + + return 0; 42806 + } 42807 + else 42808 + { 42809 + @@ -127,18 +124,7 @@ do_test (int argc, char **argv) 42810 + continue; 42811 + } 42812 + 42813 + - int status = support_capture_subprogram_self_sgid (buf); 42814 + - 42815 + - /* Bail out early if unsupported. 
*/ 42816 + - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) 42817 + - return EXIT_UNSUPPORTED; 42818 + - 42819 + - if (WEXITSTATUS (status) != 42) 42820 + - { 42821 + - printf (" [%d] child failed with status %d\n", i, 42822 + - WEXITSTATUS (status)); 42823 + - support_record_failure (); 42824 + - } 42825 + + support_capture_subprogram_self_sgid (buf); 42826 + } 42827 + return 0; 42828 + } 42829 + diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c 42830 + index 59f2ffeb88..ee3f058468 100644 42831 + --- a/elf/tst-env-setuid.c 42832 + +++ b/elf/tst-env-setuid.c 42833 + @@ -147,10 +147,7 @@ do_test (int argc, char **argv) 42834 + 42835 + if (ret != 0) 42836 + exit (1); 42837 + - 42838 + - /* Special return code to make sure that the child executed all the way 42839 + - through. */ 42840 + - exit (42); 42841 + + return 0; 42842 + } 42843 + else 42844 + { 42845 + @@ -174,17 +171,7 @@ do_test (int argc, char **argv) 42846 + free (profilepath); 42847 + } 42848 + 42849 + - int status = support_capture_subprogram_self_sgid (SETGID_CHILD); 42850 + - 42851 + - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) 42852 + - exit (EXIT_UNSUPPORTED); 42853 + - 42854 + - if (WEXITSTATUS (status) != 42) 42855 + - { 42856 + - printf (" child failed with status %d\n", 42857 + - WEXITSTATUS (status)); 42858 + - support_record_failure (); 42859 + - } 42860 + + support_capture_subprogram_self_sgid (SETGID_CHILD); 42861 + 42862 + return 0; 42863 + } 42864 + diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c 42865 + index cc26ed6d15..cefee58d46 100644 42866 + --- a/stdlib/tst-secure-getenv.c 42867 + +++ b/stdlib/tst-secure-getenv.c 42868 + @@ -57,13 +57,7 @@ do_test (void) 42869 + exit (1); 42870 + } 42871 + 42872 + - int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); 42873 + - 42874 + - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) 42875 + - return EXIT_UNSUPPORTED; 42876 + - 42877 + - if (!WIFEXITED (status)) 42878 + - FAIL_EXIT1 ("Unexpected exit status 
%d from child process\n", status); 42879 + + support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); 42880 + 42881 + return 0; 42882 + } 42883 + @@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) 42884 + if (secure_getenv ("PATH") != NULL) 42885 + FAIL_EXIT (4, "PATH variable not filtered out\n"); 42886 + 42887 + + support_record_failure_barrier (); 42888 + exit (EXIT_SUCCESS); 42889 + } 42890 + } 42891 + diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h 42892 + index 5406d9f6c0..57bb941e7d 100644 42893 + --- a/support/capture_subprocess.h 42894 + +++ b/support/capture_subprocess.h 42895 + @@ -42,10 +42,12 @@ struct support_capture_subprocess support_capture_subprocess 42896 + struct support_capture_subprocess support_capture_subprogram 42897 + (const char *file, char *const argv[], char *const envp[]); 42898 + 42899 + -/* Copy the running program into a setgid binary and run it with CHILD_ID 42900 + - argument. If execution is successful, return the exit status of the child 42901 + - program, otherwise return a non-zero failure exit code. */ 42902 + -int support_capture_subprogram_self_sgid (const char *child_id); 42903 + +/* Copy the running program into a setgid binary and run it with 42904 + + CHILD_ID argument. If the program exits with a non-zero status, 42905 + + exit with that exit status (or status 1 if the program did not exit 42906 + + normally). If the test cannot be performed, exit with 42907 + + EXIT_UNSUPPORTED. */ 42908 + +void support_capture_subprogram_self_sgid (const char *child_id); 42909 + 42910 + /* Deallocate the subprocess data captured by 42911 + support_capture_subprocess. 
*/ 42912 + diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c 42913 + index 1cb344eb04..cbc6951064 100644 42914 + --- a/support/support_capture_subprocess.c 42915 + +++ b/support/support_capture_subprocess.c 42916 + @@ -31,6 +31,7 @@ 42917 + #include <support/xsocket.h> 42918 + #include <support/xspawn.h> 42919 + #include <support/support.h> 42920 + +#include <support/temp_file.h> 42921 + #include <support/test-driver.h> 42922 + 42923 + static void 42924 + @@ -113,105 +114,44 @@ support_capture_subprogram (const char *file, char *const argv[], 42925 + /* Copies the executable into a restricted directory, so that we can 42926 + safely make it SGID with the TARGET group ID. Then runs the 42927 + executable. */ 42928 + -static int 42929 + +static void 42930 + copy_and_spawn_sgid (const char *child_id, gid_t gid) 42931 + { 42932 + - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", 42933 + - test_dir, (intmax_t) getpid ()); 42934 + + char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); 42935 + char *execname = xasprintf ("%s/bin", dirname); 42936 + - int infd = -1; 42937 + - int outfd = -1; 42938 + - int ret = 1, status = 1; 42939 + - 42940 + - TEST_VERIFY (mkdir (dirname, 0700) == 0); 42941 + - if (support_record_failure_is_failed ()) 42942 + - goto err; 42943 + + add_temp_file (execname); 42944 + 42945 + - infd = open ("/proc/self/exe", O_RDONLY); 42946 + - if (infd < 0) 42947 + + if (access ("/proc/self/exe", R_OK) != 0) 42948 + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); 42949 + 42950 + - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); 42951 + - TEST_VERIFY (outfd >= 0); 42952 + - if (support_record_failure_is_failed ()) 42953 + - goto err; 42954 + - 42955 + - char buf[4096]; 42956 + - for (;;) 42957 + - { 42958 + - ssize_t rdcount = read (infd, buf, sizeof (buf)); 42959 + - TEST_VERIFY (rdcount >= 0); 42960 + - if (support_record_failure_is_failed ()) 42961 + - goto err; 
42962 + - if (rdcount == 0) 42963 + - break; 42964 + - char *p = buf; 42965 + - char *end = buf + rdcount; 42966 + - while (p != end) 42967 + - { 42968 + - ssize_t wrcount = write (outfd, buf, end - p); 42969 + - if (wrcount == 0) 42970 + - errno = ENOSPC; 42971 + - TEST_VERIFY (wrcount > 0); 42972 + - if (support_record_failure_is_failed ()) 42973 + - goto err; 42974 + - p += wrcount; 42975 + - } 42976 + - } 42977 + + support_copy_file ("/proc/self/exe", execname); 42978 + 42979 + - bool chowned = false; 42980 + - TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) 42981 + - || errno == EPERM); 42982 + - if (support_record_failure_is_failed ()) 42983 + - goto err; 42984 + - else if (!chowned) 42985 + - { 42986 + - ret = 77; 42987 + - goto err; 42988 + - } 42989 + + if (chown (execname, getuid (), gid) != 0) 42990 + + FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", 42991 + + execname, (intmax_t) gid); 42992 + 42993 + - TEST_VERIFY (fchmod (outfd, 02750) == 0); 42994 + - if (support_record_failure_is_failed ()) 42995 + - goto err; 42996 + - TEST_VERIFY (close (outfd) == 0); 42997 + - if (support_record_failure_is_failed ()) 42998 + - goto err; 42999 + - TEST_VERIFY (close (infd) == 0); 43000 + - if (support_record_failure_is_failed ()) 43001 + - goto err; 43002 + + if (chmod (execname, 02750) != 0) 43003 + + FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); 43004 + 43005 + /* We have the binary, now spawn the subprocess. Avoid using 43006 + support_subprogram because we only want the program exit status, not the 43007 + contents. 
*/ 43008 + - ret = 0; 43009 + - infd = outfd = -1; 43010 + 43011 + char * const args[] = {execname, (char *) child_id, NULL}; 43012 + + int status = support_subprogram_wait (args[0], args); 43013 + 43014 + - status = support_subprogram_wait (args[0], args); 43015 + + free (execname); 43016 + + free (dirname); 43017 + 43018 + -err: 43019 + - if (outfd >= 0) 43020 + - close (outfd); 43021 + - if (infd >= 0) 43022 + - close (infd); 43023 + - if (execname != NULL) 43024 + - { 43025 + - unlink (execname); 43026 + - free (execname); 43027 + - } 43028 + - if (dirname != NULL) 43029 + + if (WIFEXITED (status)) 43030 + { 43031 + - rmdir (dirname); 43032 + - free (dirname); 43033 + + if (WEXITSTATUS (status) == 0) 43034 + + return; 43035 + + else 43036 + + exit (WEXITSTATUS (status)); 43037 + } 43038 + - 43039 + - if (ret == 77) 43040 + - FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n"); 43041 + - if (ret != 0) 43042 + - FAIL_EXIT1 ("Failed to make sgid executable for test\n"); 43043 + - 43044 + - return status; 43045 + + else 43046 + + FAIL_EXIT1 ("subprogram failed with status %d", status); 43047 + } 43048 + 43049 + /* Returns true if a group with NAME has been found, and writes its 43050 + @@ -253,7 +193,7 @@ find_sgid_group (gid_t *target, const char *name) 43051 + return ok; 43052 + } 43053 + 43054 + -int 43055 + +void 43056 + support_capture_subprogram_self_sgid (const char *child_id) 43057 + { 43058 + const int count = 64; 43059 + @@ -288,7 +228,7 @@ support_capture_subprogram_self_sgid (const char *child_id) 43060 + (intmax_t) getuid ()); 43061 + } 43062 + 43063 + - return copy_and_spawn_sgid (child_id, target); 43064 + + copy_and_spawn_sgid (child_id, target); 43065 + } 43066 + 43067 + void 43068 + 43069 + commit 2eb180377b96771b8368b0915669c8c7b267e739 43070 + Author: Florian Weimer <fweimer@redhat.com> 43071 + Date: Mon Jul 21 21:43:49 2025 +0200 43072 + 43073 + posix: Fix double-free after allocation failure in regcomp (bug 33185) 43074 + 43075 + If 
a memory allocation failure occurs during bracket expression 43076 + parsing in regcomp, a double-free error may result. 43077 + 43078 + Reported-by: Anastasia Belova <abelova@astralinux.ru> 43079 + Co-authored-by: Paul Eggert <eggert@cs.ucla.edu> 43080 + Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org> 43081 + (cherry picked from commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d) 43082 + 43083 + diff --git a/NEWS b/NEWS 43084 + index 4b290ad4bf..253b07ae99 100644 43085 + --- a/NEWS 43086 + +++ b/NEWS 43087 + @@ -24,6 +24,7 @@ The following bugs are resolved with this release: 43088 + [32470] x86: Avoid integer truncation with large cache sizes 43089 + [32810] Crash on x86-64 if XSAVEC disable via tunable 43090 + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid 43091 + + [33185] Fix double-free after allocation failure in regcomp 43092 + 43093 + Version 2.40 43094 + 43095 + diff --git a/posix/Makefile b/posix/Makefile 43096 + index 2c598cd20a..830278a423 100644 43097 + --- a/posix/Makefile 43098 + +++ b/posix/Makefile 43099 + @@ -303,6 +303,7 @@ tests := \ 43100 + tst-posix_spawn-setsid \ 43101 + tst-preadwrite \ 43102 + tst-preadwrite64 \ 43103 + + tst-regcomp-bracket-free \ 43104 + tst-regcomp-truncated \ 43105 + tst-regex \ 43106 + tst-regex2 \ 43107 + diff --git a/posix/regcomp.c b/posix/regcomp.c 43108 + index 5380d3c7b9..6595bb3c0d 100644 43109 + --- a/posix/regcomp.c 43110 + +++ b/posix/regcomp.c 43111 + @@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 43112 + { 43113 + #ifdef RE_ENABLE_I18N 43114 + free_charset (mbcset); 43115 + + mbcset = NULL; 43116 + #endif 43117 + /* Build a tree for simple bracket. 
*/ 43118 + br_token.type = SIMPLE_BRACKET; 43119 + @@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 43120 + parse_bracket_exp_free_return: 43121 + re_free (sbcset); 43122 + #ifdef RE_ENABLE_I18N 43123 + - free_charset (mbcset); 43124 + + if (__glibc_likely (mbcset != NULL)) 43125 + + free_charset (mbcset); 43126 + #endif /* RE_ENABLE_I18N */ 43127 + return NULL; 43128 + } 43129 + diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c 43130 + new file mode 100644 43131 + index 0000000000..3c091d8c44 43132 + --- /dev/null 43133 + +++ b/posix/tst-regcomp-bracket-free.c 43134 + @@ -0,0 +1,176 @@ 43135 + +/* Test regcomp bracket parsing with injected allocation failures (bug 33185). 43136 + + Copyright (C) 2025 Free Software Foundation, Inc. 43137 + + This file is part of the GNU C Library. 43138 + + 43139 + + The GNU C Library is free software; you can redistribute it and/or 43140 + + modify it under the terms of the GNU Lesser General Public 43141 + + License as published by the Free Software Foundation; either 43142 + + version 2.1 of the License, or (at your option) any later version. 43143 + + 43144 + + The GNU C Library is distributed in the hope that it will be useful, 43145 + + but WITHOUT ANY WARRANTY; without even the implied warranty of 43146 + + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 43147 + + Lesser General Public License for more details. 43148 + + 43149 + + You should have received a copy of the GNU Lesser General Public 43150 + + License along with the GNU C Library; if not, see 43151 + + <https://www.gnu.org/licenses/>. */ 43152 + + 43153 + +/* This test invokes regcomp multiple times, failing one memory 43154 + + allocation in each call. The function call should fail with 43155 + + REG_ESPACE (or succeed if it can recover from the allocation 43156 + + failure). Previously, there was double-free bug. 
*/ 43157 + + 43158 + +#include <errno.h> 43159 + +#include <regex.h> 43160 + +#include <stdio.h> 43161 + +#include <string.h> 43162 + +#include <support/check.h> 43163 + +#include <support/namespace.h> 43164 + +#include <support/support.h> 43165 + + 43166 + +/* Data structure allocated via MAP_SHARED, so that writes from the 43167 + + subprocess are visible. */ 43168 + +struct shared_data 43169 + +{ 43170 + + /* Number of tracked allocations performed so far. */ 43171 + + volatile unsigned int allocation_count; 43172 + + 43173 + + /* If this number is reached, one allocation fails. */ 43174 + + volatile unsigned int failing_allocation; 43175 + + 43176 + + /* The subprocess stores the expected name here. */ 43177 + + char name[100]; 43178 + +}; 43179 + + 43180 + +/* Allocation count in shared mapping. */ 43181 + +static struct shared_data *shared; 43182 + + 43183 + +/* Returns true if a failure should be injected for this allocation. */ 43184 + +static bool 43185 + +fail_this_allocation (void) 43186 + +{ 43187 + + if (shared != NULL) 43188 + + { 43189 + + unsigned int count = shared->allocation_count; 43190 + + shared->allocation_count = count + 1; 43191 + + return count == shared->failing_allocation; 43192 + + } 43193 + + else 43194 + + return false; 43195 + +} 43196 + + 43197 + +/* Failure-injecting wrappers for allocation functions used by glibc. 
*/ 43198 + + 43199 + +void * 43200 + +malloc (size_t size) 43201 + +{ 43202 + + if (fail_this_allocation ()) 43203 + + { 43204 + + errno = ENOMEM; 43205 + + return NULL; 43206 + + } 43207 + + extern __typeof (malloc) __libc_malloc; 43208 + + return __libc_malloc (size); 43209 + +} 43210 + + 43211 + +void * 43212 + +calloc (size_t a, size_t b) 43213 + +{ 43214 + + if (fail_this_allocation ()) 43215 + + { 43216 + + errno = ENOMEM; 43217 + + return NULL; 43218 + + } 43219 + + extern __typeof (calloc) __libc_calloc; 43220 + + return __libc_calloc (a, b); 43221 + +} 43222 + + 43223 + +void * 43224 + +realloc (void *ptr, size_t size) 43225 + +{ 43226 + + if (fail_this_allocation ()) 43227 + + { 43228 + + errno = ENOMEM; 43229 + + return NULL; 43230 + + } 43231 + + extern __typeof (realloc) __libc_realloc; 43232 + + return __libc_realloc (ptr, size); 43233 + +} 43234 + + 43235 + +/* No-op subprocess to verify that support_isolate_in_subprocess does 43236 + + not perform any heap allocations. */ 43237 + +static void 43238 + +no_op (void *ignored) 43239 + +{ 43240 + +} 43241 + + 43242 + +/* Perform a regcomp call in a subprocess. Used to count its 43243 + + allocations. */ 43244 + +static void 43245 + +initialize (void *regexp1) 43246 + +{ 43247 + + const char *regexp = regexp1; 43248 + + 43249 + + shared->allocation_count = 0; 43250 + + 43251 + + regex_t reg; 43252 + + TEST_COMPARE (regcomp (&reg, regexp, 0), 0); 43253 + +} 43254 + + 43255 + +/* Perform regcomp in a subprocess with fault injection. 
*/ 43256 + +static void 43257 + +test_in_subprocess (void *regexp1) 43258 + +{ 43259 + + const char *regexp = regexp1; 43260 + + unsigned int inject_at = shared->failing_allocation; 43261 + + 43262 + + regex_t reg; 43263 + + int ret = regcomp (&reg, regexp, 0); 43264 + + 43265 + + if (ret != 0) 43266 + + { 43267 + + TEST_COMPARE (ret, REG_ESPACE); 43268 + + printf ("info: allocation %u failure results in return value %d," 43269 + + " error %s (%d)\n", 43270 + + inject_at, ret, strerrorname_np (errno), errno); 43271 + + } 43272 + +} 43273 + + 43274 + +static int 43275 + +do_test (void) 43276 + +{ 43277 + + char regexp[] = "[:alpha:]"; 43278 + + 43279 + + shared = support_shared_allocate (sizeof (*shared)); 43280 + + 43281 + + /* Disable fault injection. */ 43282 + + shared->failing_allocation = ~0U; 43283 + + 43284 + + support_isolate_in_subprocess (no_op, NULL); 43285 + + TEST_COMPARE (shared->allocation_count, 0); 43286 + + 43287 + + support_isolate_in_subprocess (initialize, regexp); 43288 + + 43289 + + /* The number of allocations in the successful case, plus some 43290 + + slack. Once the number of expected allocations is exceeded, 43291 + + injecting further failures does not make a difference. */ 43292 + + unsigned int maximum_allocation_count = shared->allocation_count; 43293 + + printf ("info: successful call performs %u allocations\n", 43294 + + maximum_allocation_count); 43295 + + maximum_allocation_count += 10; 43296 + + 43297 + + for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count; 43298 + + ++inject_at) 43299 + + { 43300 + + shared->allocation_count = 0; 43301 + + shared->failing_allocation = inject_at; 43302 + + support_isolate_in_subprocess (test_in_subprocess, regexp); 43303 + + } 43304 + + 43305 + + support_shared_free (shared); 43306 + + 43307 + + return 0; 43308 + +} 43309 + + 43310 + +#include <support/test-driver.c>

+1 -1

pkgs/development/libraries/glibc/common.nix

··· 68 68 /* 69 69 No tarballs for stable upstream branch, only https://sourceware.org/git/glibc.git and using git would complicate bootstrapping. 70 70 $ git fetch --all -p && git checkout origin/release/2.40/master && git describe 71 - glibc-2.40-66-g7d4b6bcae9 71 + glibc-2.40-142-g2eb180377b 72 72 $ git show --minimal --reverse glibc-2.40.. ':!ADVISORIES' > 2.40-master.patch 73 73 74 74 To compare the archive contents zdiff can be used.