···2621626216 struct abort_msg_s *buf = __mmap (NULL, total,
2621726217 PROT_READ | PROT_WRITE,
2621826218 MAP_ANON | MAP_PRIVATE, -1, 0);
2621926219+2622026220+commit aef8f8d6a947b290162393e1d717c7aee96fef8e
2622126221+Author: H.J. Lu <hjl.tools@gmail.com>
2622226222+Date: Tue Dec 17 18:41:45 2024 +0800
2622326223+2622426224+ Hide all malloc functions from compiler [BZ #32366]
2622526225+2622626226+ Since -1 isn't a power of two, compiler may reject it, hide memalign from
2622726227+ Clang 19 which issues an error:
2622826228+2622926229+ tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment]
2623026230+ 86 | p = memalign (-1, pagesize);
2623126231+ | ^~
2623226232+ tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment]
2623326233+ 86 | p = memalign (-1, pagesize);
2623426234+ | ^~
2623526235+2623626236+ Update tst-malloc-aux.h to hide all malloc functions and include it in
2623726237+ all malloc tests to prevent compiler from optimizing out any malloc
2623826238+ functions.
2623926239+2624026240+ Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366.
2624126241+2624226242+ Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
2624326243+ Reviewed-by: Sam James <sam@gentoo.org>
2624426244+ (cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce)
2624526245+2624626246+diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c
2624726247+index 2c02f5f700..f072b9f24b 100644
2624826248+--- a/malloc/tst-mallinfo2.c
2624926249++++ b/malloc/tst-mallinfo2.c
2625026250+@@ -23,6 +23,8 @@
2625126251+ #include <stdlib.h>
2625226252+ #include <support/check.h>
2625326253+2625426254++#include "tst-malloc-aux.h"
2625526255++
2625626256+ /* This is not specifically needed for the test, but (1) does
2625726257+ something to the data so gcc doesn't optimize it away, and (2) may
2625826258+ help when developing future tests. */
2625926259+diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h
2626026260+index 54908b4a24..3e1b61ce34 100644
2626126261+--- a/malloc/tst-malloc-aux.h
2626226262++++ b/malloc/tst-malloc-aux.h
2626326263+@@ -22,20 +22,35 @@
2626426264+2626526265+ #include <stddef.h>
2626626266+ #include <stdlib.h>
2626726267+-
2626826268+-static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc;
2626926269+-static void *(*volatile calloc_indirect)(size_t, size_t) = calloc;
2627026270+-static void *(*volatile malloc_indirect)(size_t) = malloc;
2627126271+-static void *(*volatile realloc_indirect)(void*, size_t) = realloc;
2627226272++#include <malloc.h>
2627326273++
2627426274++static __typeof (aligned_alloc) * volatile aligned_alloc_indirect
2627526275++ = aligned_alloc;
2627626276++static __typeof (calloc) * volatile calloc_indirect = calloc;
2627726277++static __typeof (malloc) * volatile malloc_indirect = malloc;
2627826278++static __typeof (memalign) * volatile memalign_indirect = memalign;
2627926279++static __typeof (posix_memalign) * volatile posix_memalign_indirect
2628026280++ = posix_memalign;
2628126281++static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc;
2628226282++static __typeof (realloc) * volatile realloc_indirect = realloc;
2628326283++static __typeof (valloc) * volatile valloc_indirect = valloc;
2628426284+2628526285+ #undef aligned_alloc
2628626286+ #undef calloc
2628726287+ #undef malloc
2628826288++#undef memalign
2628926289++#undef posix_memalign
2629026290++#undef pvalloc
2629126291+ #undef realloc
2629226292++#undef valloc
2629326293+2629426294+ #define aligned_alloc aligned_alloc_indirect
2629526295+ #define calloc calloc_indirect
2629626296+ #define malloc malloc_indirect
2629726297++#define memalign memalign_indirect
2629826298++#define posix_memalign posix_memalign_indirect
2629926299++#define pvalloc pvalloc_indirect
2630026300+ #define realloc realloc_indirect
2630126301++#define valloc valloc_indirect
2630226302+2630326303+ #endif /* TST_MALLOC_AUX_H */
2630426304+diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c
2630526305+index c7b1d65e5c..65fa91f6fd 100644
2630626306+--- a/malloc/tst-malloc-backtrace.c
2630726307++++ b/malloc/tst-malloc-backtrace.c
2630826308+@@ -22,6 +22,8 @@
2630926309+ #include <support/support.h>
2631026310+ #include <libc-diag.h>
2631126311+2631226312++#include "tst-malloc-aux.h"
2631326313++
2631426314+ #define SIZE 4096
2631526315+2631626316+ /* Wrap free with a function to prevent gcc from optimizing it out. */
2631726317+diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c
2631826318+index 563f6413d2..ac9770d3f9 100644
2631926319+--- a/malloc/tst-memalign.c
2632026320++++ b/malloc/tst-memalign.c
2632126321+@@ -23,6 +23,8 @@
2632226322+ #include <unistd.h>
2632326323+ #include <libc-diag.h>
2632426324+2632526325++#include "tst-malloc-aux.h"
2632626326++
2632726327+ static int errors = 0;
2632826328+2632926329+ static void
2633026330+diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
2633126331+index 01dd07004d..63a7e2bc8e 100644
2633226332+--- a/malloc/tst-safe-linking.c
2633326333++++ b/malloc/tst-safe-linking.c
2633426334+@@ -26,6 +26,8 @@
2633526335+ #include <support/capture_subprocess.h>
2633626336+ #include <support/check.h>
2633726337+2633826338++#include "tst-malloc-aux.h"
2633926339++
2634026340+ /* Run CALLBACK and check that the data on standard error equals
2634126341+ EXPECTED. */
2634226342+ static void
2634326343+diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c
2634426344+index 9bab8c6470..0243d3dfd4 100644
2634526345+--- a/malloc/tst-valloc.c
2634626346++++ b/malloc/tst-valloc.c
2634726347+@@ -23,6 +23,8 @@
2634826348+ #include <unistd.h>
2634926349+ #include <libc-diag.h>
2635026350+2635126351++#include "tst-malloc-aux.h"
2635226352++
2635326353+ static int errors = 0;
2635426354+2635526355+ static void
2635626356+2635726357+commit be48b8f6ad0ec6d0d6b1d2f45eb59bf8e8c67dd7
2635826358+Author: Sam James <sam@gentoo.org>
2635926359+Date: Fri Jan 10 03:03:47 2025 +0000
2636026360+2636126361+ malloc: obscure calloc use in tst-calloc
2636226362+2636326363+ Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and
2636426364+ f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from
2636526365+ the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change.
2636626366+2636726367+ First, include tst-malloc-aux.h, but then use `volatile` variables
2636826368+ for size.
2636926369+2637026370+ The test passes without the tst-malloc-aux.h change but IMO we want
2637126371+ it there for consistency and to avoid future problems (possibly silent).
2637226372+2637326373+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2637426374+ (cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1)
2637526375+2637626376+diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c
2637726377+index 01f17f9e65..5a8c7ab121 100644
2637826378+--- a/malloc/tst-calloc.c
2637926379++++ b/malloc/tst-calloc.c
2638026380+@@ -23,6 +23,7 @@
2638126381+ #include <stdio.h>
2638226382+ #include <libc-diag.h>
2638326383+2638426384++#include "tst-malloc-aux.h"
2638526385+2638626386+ /* Number of samples per size. */
2638726387+ #define N 50000
2638826388+@@ -94,16 +95,19 @@ random_test (void)
2638926389+ static void
2639026390+ null_test (void)
2639126391+ {
2639226392++ /* Obscure allocation size from the compiler. */
2639326393++ volatile size_t max_size = UINT_MAX;
2639426394++ volatile size_t zero_size = 0;
2639526395+ /* If the size is 0 the result is implementation defined. Just make
2639626396+ sure the program doesn't crash. The result of calloc is
2639726397+ deliberately ignored, so do not warn about that. */
2639826398+ DIAG_PUSH_NEEDS_COMMENT;
2639926399+ DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result");
2640026400+ calloc (0, 0);
2640126401+- calloc (0, UINT_MAX);
2640226402+- calloc (UINT_MAX, 0);
2640326403+- calloc (0, ~((size_t) 0));
2640426404+- calloc (~((size_t) 0), 0);
2640526405++ calloc (0, max_size);
2640626406++ calloc (max_size, 0);
2640726407++ calloc (0, ~((size_t) zero_size));
2640826408++ calloc (~((size_t) zero_size), 0);
2640926409+ DIAG_POP_NEEDS_COMMENT;
2641026410+ }
2641126411+2641226412+2641326413+commit 85668221974db44459527e04d04f77ca8f8e3115
2641426414+Author: H.J. Lu <hjl.tools@gmail.com>
2641526415+Date: Fri Jan 24 18:53:13 2025 +0800
2641626416+2641726417+ stdlib: Test using setenv with updated environ [BZ #32588]
2641826418+2641926419+ Add a test for setenv with updated environ. Verify that BZ #32588 is
2642026420+ fixed.
2642126421+2642226422+ Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
2642326423+ Reviewed-by: Florian Weimer <fweimer@redhat.com>
2642426424+ (cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
2642526425+2642626426+diff --git a/stdlib/Makefile b/stdlib/Makefile
2642726427+index 8213fa83ef..d3a84fa641 100644
2642826428+--- a/stdlib/Makefile
2642926429++++ b/stdlib/Makefile
2643026430+@@ -307,6 +307,7 @@ tests := \
2643126431+ tst-setcontext9 \
2643226432+ tst-setcontext10 \
2643326433+ tst-setcontext11 \
2643426434++ tst-setenv-environ \
2643526435+ tst-stdbit-Wconversion \
2643626436+ tst-stdbit-builtins \
2643726437+ tst-stdc_bit_ceil \
2643826438+diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
2643926439+new file mode 100644
2644026440+index 0000000000..02fcef96d0
2644126441+--- /dev/null
2644226442++++ b/stdlib/tst-setenv-environ.c
2644326443+@@ -0,0 +1,36 @@
2644426444++/* Test using setenv with updated environ.
2644526445++ Copyright (C) 2025 Free Software Foundation, Inc.
2644626446++ This file is part of the GNU C Library.
2644726447++
2644826448++ The GNU C Library is free software; you can redistribute it and/or
2644926449++ modify it under the terms of the GNU Lesser General Public
2645026450++ License as published by the Free Software Foundation; either
2645126451++ version 2.1 of the License, or (at your option) any later version.
2645226452++
2645326453++ The GNU C Library is distributed in the hope that it will be useful,
2645426454++ but WITHOUT ANY WARRANTY; without even the implied warranty of
2645526455++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2645626456++ Lesser General Public License for more details.
2645726457++
2645826458++ You should have received a copy of the GNU Lesser General Public
2645926459++ License along with the GNU C Library; if not, see
2646026460++ <https://www.gnu.org/licenses/>. */
2646126461++
2646226462++#include <stdlib.h>
2646326463++#include <support/check.h>
2646426464++
2646526465++extern char **environ;
2646626466++
2646726467++int
2646826468++do_test (void)
2646926469++{
2647026470++ char *valp;
2647126471++ static char *dummy_environ[] = { NULL };
2647226472++ environ = dummy_environ;
2647326473++ setenv ("A", "1", 0);
2647426474++ valp = getenv ("A");
2647526475++ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
2647626476++ return 0;
2647726477++}
2647826478++
2647926479++#include <support/test-driver.c>
2648026480+2648126481+commit e899ca3651f8c5e01bf3420cfb34aad97d093f74
2648226482+Author: John David Anglin <danglin@gcc.gnu.org>
2648326483+Date: Wed Jan 29 16:51:16 2025 -0500
2648426484+2648526485+ nptl: Correct stack size attribute when stack grows up [BZ #32574]
2648626486+2648726487+ Set stack size attribute to the size of the mmap'd region only
2648826488+ when the size of the remaining stack space is less than the size
2648926489+ of the mmap'd region.
2649026490+2649126491+ This was reversed. As a result, the initial stack size was only
2649226492+ 135168 bytes. On architectures where the stack grows down, the
2649326493+ initial stack size is approximately 8384512 bytes with the default
2649426494+ rlimit settings. The small main stack size on hppa broke
2649526495+ applications like ruby that check for stack overflows.
2649626496+2649726497+ Signed-off-by: John David Anglin <dave.anglin@bell.net>
2649826498+2649926499+diff --git a/nptl/pthread_getattr_np.c b/nptl/pthread_getattr_np.c
2650026500+index 1e91874767..3ce34437bc 100644
2650126501+--- a/nptl/pthread_getattr_np.c
2650226502++++ b/nptl/pthread_getattr_np.c
2650326503+@@ -145,9 +145,9 @@ __pthread_getattr_np (pthread_t thread_id, pthread_attr_t *attr)
2650426504+ > (size_t) iattr->stackaddr - last_to)
2650526505+ iattr->stacksize = (size_t) iattr->stackaddr - last_to;
2650626506+ #else
2650726507+- /* The limit might be too high. */
2650826508++ /* The limit might be too low. */
2650926509+ if ((size_t) iattr->stacksize
2651026510+- > to - (size_t) iattr->stackaddr)
2651126511++ < to - (size_t) iattr->stackaddr)
2651226512+ iattr->stacksize = to - (size_t) iattr->stackaddr;
2651326513+ #endif
2651426514+ /* We succeed and no need to look further. */
2651526515+2651626516+commit d6c156c326999f144cb5b73d29982108d549ad8a
2651726517+Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
2651826518+Date: Fri Jan 31 12:16:30 2025 -0500
2651926519+2652026520+ assert: Add test for CVE-2025-0395
2652126521+2652226522+ Use the __progname symbol to override the program name to induce the
2652326523+ failure that CVE-2025-0395 describes.
2652426524+2652526525+ This is related to BZ #32582
2652626526+2652726527+ Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
2652826528+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
2652926529+ (cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
2653026530+2653126531+diff --git a/assert/Makefile b/assert/Makefile
2653226532+index 35dc908ddb..c0fe660bd6 100644
2653326533+--- a/assert/Makefile
2653426534++++ b/assert/Makefile
2653526535+@@ -38,6 +38,7 @@ tests := \
2653626536+ test-assert-perr \
2653726537+ tst-assert-c++ \
2653826538+ tst-assert-g++ \
2653926539++ tst-assert-sa-2025-0001 \
2654026540+ # tests
2654126541+2654226542+ ifeq ($(have-cxx-thread_local),yes)
2654326543+diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
2654426544+new file mode 100644
2654526545+index 0000000000..102cb0078d
2654626546+--- /dev/null
2654726547++++ b/assert/tst-assert-sa-2025-0001.c
2654826548+@@ -0,0 +1,92 @@
2654926549++/* Test for CVE-2025-0395.
2655026550++ Copyright The GNU Toolchain Authors.
2655126551++ This file is part of the GNU C Library.
2655226552++
2655326553++ The GNU C Library is free software; you can redistribute it and/or
2655426554++ modify it under the terms of the GNU Lesser General Public
2655526555++ License as published by the Free Software Foundation; either
2655626556++ version 2.1 of the License, or (at your option) any later version.
2655726557++
2655826558++ The GNU C Library is distributed in the hope that it will be useful,
2655926559++ but WITHOUT ANY WARRANTY; without even the implied warranty of
2656026560++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2656126561++ Lesser General Public License for more details.
2656226562++
2656326563++ You should have received a copy of the GNU Lesser General Public
2656426564++ License along with the GNU C Library; if not, see
2656526565++ <https://www.gnu.org/licenses/>. */
2656626566++
2656726567++/* Test that a large enough __progname does not result in a buffer overflow
2656826568++ when printing an assertion failure. This was CVE-2025-0395. */
2656926569++#include <assert.h>
2657026570++#include <inttypes.h>
2657126571++#include <signal.h>
2657226572++#include <stdbool.h>
2657326573++#include <string.h>
2657426574++#include <sys/mman.h>
2657526575++#include <support/check.h>
2657626576++#include <support/support.h>
2657726577++#include <support/xstdio.h>
2657826578++#include <support/xunistd.h>
2657926579++
2658026580++extern const char *__progname;
2658126581++
2658226582++int
2658326583++do_test (int argc, char **argv)
2658426584++{
2658526585++
2658626586++ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
2658726587++ ignore_stderr ();
2658826588++
2658926589++ /* XXX assumes that the assert is on a 2 digit line number. */
2659026590++ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
2659126591++
2659226592++ int ret = fprintf (stderr, prompt, __FILE__);
2659326593++ if (ret < 0)
2659426594++ FAIL_EXIT1 ("fprintf failed: %m\n");
2659526595++
2659626596++ size_t pagesize = getpagesize ();
2659726597++ size_t namesize = pagesize - 1 - ret;
2659826598++
2659926599++ /* Alter the progname so that the assert message fills the entire page. */
2660026600++ char progname[namesize];
2660126601++ memset (progname, 'A', namesize - 1);
2660226602++ progname[namesize - 1] = '\0';
2660326603++ __progname = progname;
2660426604++
2660526605++ FILE *f = xfopen ("/proc/self/maps", "r");
2660626606++ char *line = NULL;
2660726607++ size_t len = 0;
2660826608++ uintptr_t prev_to = 0;
2660926609++
2661026610++ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
2661126611++ ensures that the mmap in the assert_fail path never ends up below a
2661226612++ writable map and will terminate immediately in case of a buffer
2661326613++ overflow. */
2661426614++ while (xgetline (&line, &len, f))
2661526615++ {
2661626616++ uintptr_t from, to;
2661726617++ char perm[4];
2661826618++
2661926619++ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
2662026620++ &from, &to,
2662126621++ &perm[0], &perm[1], &perm[2], &perm[3]);
2662226622++
2662326623++ bool writable = (memchr (perm, 'w', 4) != NULL);
2662426624++
2662526625++ if (prev_to != 0 && from - prev_to > pagesize && writable)
2662626626++ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
2662726627++ MAP_ANONYMOUS | MAP_PRIVATE, 0);
2662826628++
2662926629++ prev_to = to;
2663026630++ }
2663126631++
2663226632++ xfclose (f);
2663326633++
2663426634++ assert (argc < 1);
2663526635++ return 0;
2663626636++}
2663726637++
2663826638++#define EXPECTED_SIGNAL SIGABRT
2663926639++#define TEST_FUNCTION_ARGV do_test
2664026640++#include <support/test-driver.c>
2664126641+2664226642+commit 523f85558152a1b9cced6d669f758c27677775ba
2664326643+Author: John David Anglin <danglin@gcc.gnu.org>
2664426644+Date: Tue Feb 25 15:57:53 2025 -0500
2664526645+2664626646+ math: Add optimization barrier to ensure a1 + u.d is not reused [BZ #30664]
2664726647+2664826648+ A number of fma tests started to fail on hppa when gcc was changed to
2664926649+ use Ranger rather than EVRP. Eventually I found that the value of
2665026650+ a1 + u.d in this block of code was being computed in FE_TOWARDZERO
2665126651+ mode and not the original rounding mode:
2665226652+2665326653+ if (TININESS_AFTER_ROUNDING)
2665426654+ {
2665526655+ w.d = a1 + u.d;
2665626656+ if (w.ieee.exponent == 109)
2665726657+ return w.d * 0x1p-108;
2665826658+ }
2665926659+2666026660+ This caused the exponent value to be wrong and the wrong return path
2666126661+ to be used.
2666226662+2666326663+ Here we add an optimization barrier after the rounding mode is reset
2666426664+ to ensure that the previous value of a1 + u.d is not reused.
2666526665+2666626666+ Signed-off-by: John David Anglin <dave.anglin@bell.net>
2666726667+2666826668+diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c
2666926669+index c5f5abdc68..79a3cd721d 100644
2667026670+--- a/sysdeps/ieee754/dbl-64/s_fma.c
2667126671++++ b/sysdeps/ieee754/dbl-64/s_fma.c
2667226672+@@ -244,6 +244,9 @@ __fma (double x, double y, double z)
2667326673+ /* Reset rounding mode and test for inexact simultaneously. */
2667426674+ int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0;
2667526675+2667626676++ /* Ensure value of a1 + u.d is not reused. */
2667726677++ a1 = math_opt_barrier (a1);
2667826678++
2667926679+ if (__glibc_likely (adjust == 0))
2668026680+ {
2668126681+ if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
2668226682+2668326683+commit ff10623706ea0096f3af7b38a3330ffb7fb15ae7
2668426684+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2668526685+Date: Mon Sep 9 13:00:01 2024 +0100
2668626686+2668726687+ aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
2668826688+2668926689+ Since the last operation is destructive, the first argument to the FMA
2669026690+ also has to be the first argument to the special-case in order to
2669126691+ avoid unnecessary MOVs. Reorder arguments and adjust special-case
2669226692+ bounds to facilitate this.
2669326693+2669426694+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2669526695+ (cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675)
2669626696+2669726697+diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
2669826698+index 9347422a77..82228b599a 100644
2669926699+--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
2670026700++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
2670126701+@@ -22,11 +22,11 @@
2670226702+2670326703+ static const struct data
2670426704+ {
2670526705+- uint32x4_t min_norm;
2670626706++ uint32x4_t off, offset_lower_bound;
2670726707+ uint16x8_t special_bound;
2670826708++ uint32x4_t mantissa_mask;
2670926709+ float32x4_t poly[8];
2671026710+ float32x4_t inv_ln10, ln2;
2671126711+- uint32x4_t off, mantissa_mask;
2671226712+ } data = {
2671326713+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
2671426714+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
2671526715+@@ -35,18 +35,22 @@ static const struct data
2671626716+ V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
2671726717+ .ln2 = V4 (0x1.62e43p-1f),
2671826718+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
2671926719+- .min_norm = V4 (0x00800000),
2672026720+- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
2672126721++ /* Lower bound is the smallest positive normal float 0x00800000. For
2672226722++ optimised register use subnormals are detected after offset has been
2672326723++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2672426724++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
2672526725++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
2672626726+ .off = V4 (0x3f2aaaab), /* 0.666667. */
2672726727+ .mantissa_mask = V4 (0x007fffff),
2672826728+ };
2672926729+2673026730+ static float32x4_t VPCS_ATTR NOINLINE
2673126731+-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
2673226732+- uint16x4_t cmp)
2673326733++special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
2673426734++ uint16x4_t cmp, const struct data *d)
2673526735+ {
2673626736+ /* Fall back to scalar code. */
2673726737+- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
2673826738++ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
2673926739++ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
2674026740+ }
2674126741+2674226742+ /* Fast implementation of AdvSIMD log10f,
2674326743+@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
2674426744+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
2674526745+ {
2674626746+ const struct data *d = ptr_barrier (&data);
2674726747+- uint32x4_t u = vreinterpretq_u32_f32 (x);
2674826748+- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
2674926749+- vget_low_u16 (d->special_bound));
2675026750++
2675126751++ /* To avoid having to mov x out of the way, keep u after offset has been
2675226752++ applied, and recover x by adding the offset back in the special-case
2675326753++ handler. */
2675426754++ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
2675526755+2675626756+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2675726757+- u = vsubq_u32 (u, d->off);
2675826758++ u_off = vsubq_u32 (u_off, d->off);
2675926759+ float32x4_t n = vcvtq_f32_s32 (
2676026760+- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
2676126761+- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
2676226762++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
2676326763++
2676426764++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
2676526765++ vget_low_u16 (d->special_bound));
2676626766++
2676726767++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
2676826768+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
2676926769+2677026770+ /* y = log10(1+r) + n * log10(2). */
2677126771+@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
2677226772+ y = vmulq_f32 (y, d->inv_ln10);
2677326773+2677426774+ if (__glibc_unlikely (v_any_u16h (special)))
2677526775+- return special_case (x, y, poly, r2, special);
2677626776++ return special_case (y, u_off, poly, r2, special, d);
2677726777+ return vfmaq_f32 (y, poly, r2);
2677826778+ }
2677926779+ libmvec_hidden_def (V_NAME_F1 (log10))
2678026780+diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
2678126781+index db21836749..84effe4fe9 100644
2678226782+--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
2678326783++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
2678426784+@@ -22,9 +22,9 @@
2678526785+2678626786+ static const struct data
2678726787+ {
2678826788+- uint32x4_t min_norm;
2678926789++ uint32x4_t off, offset_lower_bound;
2679026790+ uint16x8_t special_bound;
2679126791+- uint32x4_t off, mantissa_mask;
2679226792++ uint32x4_t mantissa_mask;
2679326793+ float32x4_t poly[9];
2679426794+ } data = {
2679526795+ /* Coefficients generated using Remez algorithm approximate
2679626796+@@ -34,18 +34,22 @@ static const struct data
2679726797+ V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
2679826798+ V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
2679926799+ V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
2680026800+- .min_norm = V4 (0x00800000),
2680126801+- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
2680226802++ /* Lower bound is the smallest positive normal float 0x00800000. For
2680326803++ optimised register use subnormals are detected after offset has been
2680426804++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2680526805++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
2680626806++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
2680726807+ .off = V4 (0x3f2aaaab), /* 0.666667. */
2680826808+ .mantissa_mask = V4 (0x007fffff),
2680926809+ };
2681026810+2681126811+ static float32x4_t VPCS_ATTR NOINLINE
2681226812+-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
2681326813+- uint16x4_t cmp)
2681426814++special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
2681526815++ uint16x4_t cmp, const struct data *d)
2681626816+ {
2681726817+ /* Fall back to scalar code. */
2681826818+- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
2681926819++ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
2682026820++ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
2682126821+ }
2682226822+2682326823+ /* Fast implementation for single precision AdvSIMD log2,
2682426824+@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
2682526825+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
2682626826+ {
2682726827+ const struct data *d = ptr_barrier (&data);
2682826828+- uint32x4_t u = vreinterpretq_u32_f32 (x);
2682926829+- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
2683026830+- vget_low_u16 (d->special_bound));
2683126831++
2683226832++ /* To avoid having to mov x out of the way, keep u after offset has been
2683326833++ applied, and recover x by adding the offset back in the special-case
2683426834++ handler. */
2683526835++ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
2683626836+2683726837+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2683826838+- u = vsubq_u32 (u, d->off);
2683926839++ u_off = vsubq_u32 (u_off, d->off);
2684026840+ float32x4_t n = vcvtq_f32_s32 (
2684126841+- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
2684226842+- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
2684326843++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
2684426844++
2684526845++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
2684626846++ vget_low_u16 (d->special_bound));
2684726847++
2684826848++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
2684926849+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
2685026850+2685126851+ /* y = log2(1+r) + n. */
2685226852+@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
2685326853+ float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
2685426854+2685526855+ if (__glibc_unlikely (v_any_u16h (special)))
2685626856+- return special_case (x, n, p, r, special);
2685726857++ return special_case (n, u_off, p, r, special, d);
2685826858+ return vfmaq_f32 (n, p, r);
2685926859+ }
2686026860+ libmvec_hidden_def (V_NAME_F1 (log2))
2686126861+diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
2686226862+index 3c0d0fcdc7..c20dbfd6c0 100644
2686326863+--- a/sysdeps/aarch64/fpu/logf_advsimd.c
2686426864++++ b/sysdeps/aarch64/fpu/logf_advsimd.c
2686526865+@@ -21,20 +21,22 @@
2686626866+2686726867+ static const struct data
2686826868+ {
2686926869+- uint32x4_t min_norm;
2687026870++ uint32x4_t off, offset_lower_bound;
2687126871+ uint16x8_t special_bound;
2687226872++ uint32x4_t mantissa_mask;
2687326873+ float32x4_t poly[7];
2687426874+- float32x4_t ln2, tiny_bound;
2687526875+- uint32x4_t off, mantissa_mask;
2687626876++ float32x4_t ln2;
2687726877+ } data = {
2687826878+ /* 3.34 ulp error. */
2687926879+ .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
2688026880+ V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
2688126881+ V4 (-0x1.ffffc8p-2f) },
2688226882+ .ln2 = V4 (0x1.62e43p-1f),
2688326883+- .tiny_bound = V4 (0x1p-126),
2688426884+- .min_norm = V4 (0x00800000),
2688526885+- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
2688626886++ /* Lower bound is the smallest positive normal float 0x00800000. For
2688726887++ optimised register use subnormals are detected after offset has been
2688826888++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2688926889++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
2689026890++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
2689126891+ .off = V4 (0x3f2aaaab), /* 0.666667. */
2689226892+ .mantissa_mask = V4 (0x007fffff)
2689326893+ };
2689426894+@@ -42,32 +44,37 @@ static const struct data
2689526895+ #define P(i) d->poly[7 - i]
2689626896+2689726897+ static float32x4_t VPCS_ATTR NOINLINE
2689826898+-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
2689926899+- uint16x4_t cmp)
2690026900++special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
2690126901++ uint16x4_t cmp, const struct data *d)
2690226902+ {
2690326903+ /* Fall back to scalar code. */
2690426904+- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
2690526905++ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
2690626906++ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
2690726907+ }
2690826908+2690926909+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
2691026910+ {
2691126911+ const struct data *d = ptr_barrier (&data);
2691226912+ float32x4_t n, p, q, r, r2, y;
2691326913+- uint32x4_t u;
2691426914++ uint32x4_t u, u_off;
2691526915+ uint16x4_t cmp;
2691626916+2691726917+- u = vreinterpretq_u32_f32 (x);
2691826918+- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
2691926919+- vget_low_u16 (d->special_bound));
2692026920++ /* To avoid having to mov x out of the way, keep u after offset has been
2692126921++ applied, and recover x by adding the offset back in the special-case
2692226922++ handler. */
2692326923++ u_off = vreinterpretq_u32_f32 (x);
2692426924+2692526925+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2692626926+- u = vsubq_u32 (u, d->off);
2692726927++ u_off = vsubq_u32 (u_off, d->off);
2692826928+ n = vcvtq_f32_s32 (
2692926929+- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
2693026930+- u = vandq_u32 (u, d->mantissa_mask);
2693126931++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
2693226932++ u = vandq_u32 (u_off, d->mantissa_mask);
2693326933+ u = vaddq_u32 (u, d->off);
2693426934+ r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
2693526935+2693626936++ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
2693726937++ vget_low_u16 (d->special_bound));
2693826938++
2693926939+ /* y = log(1+r) + n*ln2. */
2694026940+ r2 = vmulq_f32 (r, r);
2694126941+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
2694226942+@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
2694326943+ p = vfmaq_f32 (r, d->ln2, n);
2694426944+2694526945+ if (__glibc_unlikely (v_any_u16h (cmp)))
2694626946+- return special_case (x, y, r2, p, cmp);
2694726947++ return special_case (p, u_off, y, r2, cmp, d);
2694826948+ return vfmaq_f32 (p, y, r2);
2694926949+ }
2695026950+ libmvec_hidden_def (V_NAME_F1 (log))
2695126951+2695226952+commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7
2695326953+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2695426954+Date: Thu Sep 19 17:34:02 2024 +0100
2695526955+2695626956+ AArch64: Add vector logp1 alias for log1p
2695726957+2695826958+ This enables vectorisation of C23 logp1, which is an alias for log1p.
2695926959+ There are no new tests or ulp entries because the new symbols are simply
2696026960+ aliases.
2696126961+2696226962+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2696326963+ (cherry picked from commit 751a5502bea1d13551c62c47bb9bd25bff870cda)
2696426964+2696526965+diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
2696626966+index 08a41c46ad..5019e8e25c 100644
2696726967+--- a/bits/libm-simd-decl-stubs.h
2696826968++++ b/bits/libm-simd-decl-stubs.h
2696926969+@@ -253,6 +253,17 @@
2697026970+ #define __DECL_SIMD_log1pf64x
2697126971+ #define __DECL_SIMD_log1pf128x
2697226972+2697326973++#define __DECL_SIMD_logp1
2697426974++#define __DECL_SIMD_logp1f
2697526975++#define __DECL_SIMD_logp1l
2697626976++#define __DECL_SIMD_logp1f16
2697726977++#define __DECL_SIMD_logp1f32
2697826978++#define __DECL_SIMD_logp1f64
2697926979++#define __DECL_SIMD_logp1f128
2698026980++#define __DECL_SIMD_logp1f32x
2698126981++#define __DECL_SIMD_logp1f64x
2698226982++#define __DECL_SIMD_logp1f128x
2698326983++
2698426984+ #define __DECL_SIMD_atanh
2698526985+ #define __DECL_SIMD_atanhf
2698626986+ #define __DECL_SIMD_atanhl
2698726987+diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
2698826988+index 6cb594b6ff..92856becc4 100644
2698926989+--- a/math/bits/mathcalls.h
2699026990++++ b/math/bits/mathcalls.h
2699126991+@@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x));
2699226992+ __MATHCALL (log10p1,, (_Mdouble_ __x));
2699326993+2699426994+ /* Return log(1 + X). */
2699526995+-__MATHCALL (logp1,, (_Mdouble_ __x));
2699626996++__MATHCALL_VEC (logp1,, (_Mdouble_ __x));
2699726997+ #endif
2699826998+2699926999+ #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99
2700027000+diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
2700127001+index cc15ce2d1e..015211f5f4 100644
2700227002+--- a/sysdeps/aarch64/fpu/Versions
2700327003++++ b/sysdeps/aarch64/fpu/Versions
2700427004+@@ -135,4 +135,11 @@ libmvec {
2700527005+ _ZGVsMxv_tanh;
2700627006+ _ZGVsMxv_tanhf;
2700727007+ }
2700827008++ GLIBC_2.41 {
2700927009++ _ZGVnN2v_logp1;
2701027010++ _ZGVnN2v_logp1f;
2701127011++ _ZGVnN4v_logp1f;
2701227012++ _ZGVsMxv_logp1;
2701327013++ _ZGVsMxv_logp1f;
2701427014++ }
2701527015+ }
2701627016+diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
2701727017+index 097d403ffe..5909bb4ce9 100644
2701827018+--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
2701927019++++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
2702027020+@@ -36,6 +36,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot));
2702127021+ libmvec_hidden_proto (V_NAME_F1(log10));
2702227022+ libmvec_hidden_proto (V_NAME_F1(log1p));
2702327023+ libmvec_hidden_proto (V_NAME_F1(log2));
2702427024++libmvec_hidden_proto (V_NAME_F1(logp1));
2702527025+ libmvec_hidden_proto (V_NAME_F1(log));
2702627026+ libmvec_hidden_proto (V_NAME_F2(pow));
2702727027+ libmvec_hidden_proto (V_NAME_F1(sin));
2702827028+diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
2702927029+index 7484150131..f295fe185d 100644
2703027030+--- a/sysdeps/aarch64/fpu/bits/math-vector.h
2703127031++++ b/sysdeps/aarch64/fpu/bits/math-vector.h
2703227032+@@ -113,6 +113,10 @@
2703327033+ # define __DECL_SIMD_log2 __DECL_SIMD_aarch64
2703427034+ # undef __DECL_SIMD_log2f
2703527035+ # define __DECL_SIMD_log2f __DECL_SIMD_aarch64
2703627036++# undef __DECL_SIMD_logp1
2703727037++# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64
2703827038++# undef __DECL_SIMD_logp1f
2703927039++# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64
2704027040+ # undef __DECL_SIMD_pow
2704127041+ # define __DECL_SIMD_pow __DECL_SIMD_aarch64
2704227042+ # undef __DECL_SIMD_powf
2704327043+@@ -180,6 +184,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
2704427044+ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
2704527045+ __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
2704627046+ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
2704727047++__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
2704827048+ __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
2704927049+ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
2705027050+ __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
2705127051+@@ -207,6 +212,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
2705227052+ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
2705327053+ __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
2705427054+ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
2705527055++__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
2705627056+ __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
2705727057+ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
2705827058+ __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
2705927059+@@ -239,6 +245,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
2706027060+ __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
2706127061+ __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
2706227062+ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
2706327063++__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
2706427064+ __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
2706527065+ __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
2706627066+ __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
2706727067+@@ -266,6 +273,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
2706827068+ __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
2706927069+ __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
2707027070+ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
2707127071++__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
2707227072+ __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
2707327073+ __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
2707427074+ __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
2707527075+diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
2707627076+index ffc418fc9c..114064c696 100644
2707727077+--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
2707827078++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
2707927079+@@ -127,3 +127,5 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
2708027080+2708127081+ return vfmaq_f64 (y, f2, p);
2708227082+ }
2708327083++
2708427084++strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1))
2708527085+diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
2708627086+index 04f7e5720e..b21cfb2c90 100644
2708727087+--- a/sysdeps/aarch64/fpu/log1p_sve.c
2708827088++++ b/sysdeps/aarch64/fpu/log1p_sve.c
2708927089+@@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
2709027090+2709127091+ return y;
2709227092+ }
2709327093++
2709427094++strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1))
2709527095+diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
2709627096+index dc15334a85..8cfa28fb8a 100644
2709727097+--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
2709827098++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
2709927099+@@ -128,3 +128,6 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
2710027100+ }
2710127101+ libmvec_hidden_def (V_NAME_F1 (log1p))
2710227102+ HALF_WIDTH_ALIAS_F1 (log1p)
2710327103++strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
2710427104++libmvec_hidden_def (V_NAME_F1 (logp1))
2710527105++HALF_WIDTH_ALIAS_F1 (logp1)
2710627106+diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
2710727107+index f645cc997e..5256d5e94c 100644
2710827108+--- a/sysdeps/aarch64/fpu/log1pf_sve.c
2710927109++++ b/sysdeps/aarch64/fpu/log1pf_sve.c
2711027110+@@ -98,3 +98,5 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
2711127111+2711227112+ return y;
2711327113+ }
2711427114++
2711527115++strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1))
2711627116+diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
2711727117+index b685106954..98687cae0d 100644
2711827118+--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
2711927119++++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
2712027120+@@ -128,3 +128,8 @@ GLIBC_2.40 _ZGVsMxvv_hypot F
2712127121+ GLIBC_2.40 _ZGVsMxvv_hypotf F
2712227122+ GLIBC_2.40 _ZGVsMxvv_pow F
2712327123+ GLIBC_2.40 _ZGVsMxvv_powf F
2712427124++GLIBC_2.41 _ZGVnN2v_logp1 F
2712527125++GLIBC_2.41 _ZGVnN2v_logp1f F
2712627126++GLIBC_2.41 _ZGVnN4v_logp1f F
2712727127++GLIBC_2.41 _ZGVsMxv_logp1 F
2712827128++GLIBC_2.41 _ZGVsMxv_logp1f F
2712927129+2713027130+commit 354aeaf2130c1484007025563fe87c997f07324a
2713127131+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2713227132+Date: Mon Sep 23 15:26:12 2024 +0100
2713327133+2713427134+ AArch64: Improve codegen in SVE expf & related routines
2713527135+2713627136+ Reduce MOV and MOVPRFX by improving special-case handling. Use inline
2713727137+ helper to duplicate the entire computation between the special- and
2713827138+ non-special case branches, removing the contention for z0 between x
2713927139+ and the return value.
2714027140+2714127141+ Also rearrange some MLAs and MLSs - by making the multiplicand the
2714227142+ destination we can avoid a MOVPRFX in several cases. Also change which
2714327143+ constants go in the vector used for lanewise ops - the last lane is no
2714427144+ longer wasted.
2714627146+ Spotted that shift was incorrect in exp2f and exp10f, w.r.t. the
2714727147+ comment that explains it. Fixed - worst-case ULP for exp2f moves
2714827148+ around but it doesn't change significantly for either routine.
2714927149+2715027150+ Worst-case error for coshf increases due to passing x to exp rather
2715127151+ than abs(x) - updated the comment, but does not require regen-ulps.
2715227152+2715327153+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2715427154+ (cherry picked from commit 7b8c134b5460ed933d610fa92ed1227372b68fdc)
2715527155+2715627156+diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
2715727157+index e5d8a299c6..7ad6efa0fc 100644
2715827158+--- a/sysdeps/aarch64/fpu/coshf_sve.c
2715927159++++ b/sysdeps/aarch64/fpu/coshf_sve.c
2716027160+@@ -23,37 +23,42 @@
2716127161+ static const struct data
2716227162+ {
2716327163+ struct sv_expf_data expf_consts;
2716427164+- uint32_t special_bound;
2716527165++ float special_bound;
2716627166+ } data = {
2716727167+ .expf_consts = SV_EXPF_DATA,
2716827168+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
2716927169+- .special_bound = 0x42ad496c,
2717027170++ .special_bound = 0x1.5a92d8p+6,
2717127171+ };
2717227172+2717327173+ static svfloat32_t NOINLINE
2717427174+-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
2717527175++special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
2717627176++ svbool_t pg)
2717727177+ {
2717827178+- return sv_call_f32 (coshf, x, y, pg);
2717927179++ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
2718027180++ pg);
2718127181+ }
2718227182+2718327183+ /* Single-precision vector cosh, using vector expf.
2718427184+- Maximum error is 1.89 ULP:
2718527185+- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
2718627186+- want 0x1.f00adcp+127. */
2718727187++ Maximum error is 2.77 ULP:
2718827188++ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
2718927189++ want 0x1.e4594cp+2. */
2719027190+ svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
2719127191+ {
2719227192+ const struct data *d = ptr_barrier (&data);
2719327193+2719427194+- svfloat32_t ax = svabs_x (pg, x);
2719527195+- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
2719627196++ svbool_t special = svacge (pg, x, d->special_bound);
2719727197+2719827198+- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
2719927199+- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
2720027200+- svfloat32_t half_t = svmul_x (pg, t, 0.5);
2720127201+- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
2720227202++ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
2720327203++ Note that x is passed to exp here, rather than |x|. This is to avoid using
2720427204++ destructive unary ABS for better register usage. However it means the
2720527205++ routine is not exactly symmetrical, as the exp helper is slightly less
2720627206++ accurate in the negative range. */
2720727207++ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
2720827208++ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
2720927209++ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
2721027210+2721127211+ if (__glibc_unlikely (svptest_any (pg, special)))
2721227212+- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
2721327213++ return special_case (x, half_e, half_over_e, special);
2721427214+2721527215+- return svadd_x (pg, half_t, half_over_t);
2721627216++ return svadd_x (svptrue_b32 (), half_e, half_over_e);
2721727217+ }
2721827218+diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
2721927219+index e09b2f3b27..8aa3fa9c43 100644
2722027220+--- a/sysdeps/aarch64/fpu/exp10f_sve.c
2722127221++++ b/sysdeps/aarch64/fpu/exp10f_sve.c
2722227222+@@ -18,74 +18,83 @@
2722327223+ <https://www.gnu.org/licenses/>. */
2722427224+2722527225+ #include "sv_math.h"
2722627226+-#include "poly_sve_f32.h"
2722727227+2722827228+-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
2722927229++/* For x < -Thres, the result is subnormal and not handled correctly by
2723027230+ FEXPA. */
2723127231+-#define SpecialBound 37.9
2723227232++#define Thres 37.9
2723327233+2723427234+ static const struct data
2723527235+ {
2723627236+- float poly[5];
2723727237+- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
2723827238++ float log2_10_lo, c0, c2, c4;
2723927239++ float c1, c3, log10_2;
2724027240++ float shift, log2_10_hi, thres;
2724127241+ } data = {
2724227242+ /* Coefficients generated using Remez algorithm with minimisation of relative
2724327243+ error.
2724427244+ rel error: 0x1.89dafa3p-24
2724527245+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
2724627246+ maxerr: 0.52 +0.5 ulp. */
2724727247+- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
2724827248+- 0x1.12b41ap-1f },
2724927249++ .c0 = 0x1.26bb16p+1f,
2725027250++ .c1 = 0x1.5350d2p+1f,
2725127251++ .c2 = 0x1.04744ap+1f,
2725227252++ .c3 = 0x1.2d8176p+0f,
2725327253++ .c4 = 0x1.12b41ap-1f,
2725427254+ /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
2725527255+- .shift = 0x1.903f8p17f,
2725627256++ .shift = 0x1.803f8p17f,
2725727257+ .log10_2 = 0x1.a934fp+1,
2725827258+ .log2_10_hi = 0x1.344136p-2,
2725927259+ .log2_10_lo = -0x1.ec10cp-27,
2726027260+- .special_bound = SpecialBound,
2726127261++ .thres = Thres,
2726227262+ };
2726327263+2726427264+-static svfloat32_t NOINLINE
2726527265+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2726627266++static inline svfloat32_t
2726727267++sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
2726827268+ {
2726927269+- return sv_call_f32 (exp10f, x, y, special);
2727027270+-}
2727127271+-
2727227272+-/* Single-precision SVE exp10f routine. Implements the same algorithm
2727327273+- as AdvSIMD exp10f.
2727427274+- Worst case error is 1.02 ULPs.
2727527275+- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
2727627276+- want 0x1.ba5f9cp-1. */
2727727277+-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
2727827278+-{
2727927279+- const struct data *d = ptr_barrier (&data);
2728027280+ /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
2728127281+ with poly(r) in [1/sqrt(2), sqrt(2)] and
2728227282+ x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
2728327283+2728427284+- /* Load some constants in quad-word chunks to minimise memory access (last
2728527285+- lane is wasted). */
2728627286+- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
2728727287++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
2728827288+2728927289+ /* n = round(x/(log10(2)/N)). */
2729027290+ svfloat32_t shift = sv_f32 (d->shift);
2729127291+- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
2729227292+- svfloat32_t n = svsub_x (pg, z, shift);
2729327293++ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
2729427294++ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
2729527295+2729627296+ /* r = x - n*log10(2)/N. */
2729727297+- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
2729827298+- r = svmls_lane (r, n, log10_2_and_inv, 2);
2729927299++ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
2730027300++ r = svmls_lane (r, n, lane_consts, 0);
2730127301+2730227302+- svbool_t special = svacgt (pg, x, d->special_bound);
2730327303+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
2730427304+2730527305+ /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
2730627306+- svfloat32_t r2 = svmul_x (pg, r, r);
2730727307+- svfloat32_t poly
2730827308+- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
2730927309+- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
2731027310+-
2731127311+- if (__glibc_unlikely (svptest_any (pg, special)))
2731227312+- return special_case (x, svmla_x (pg, scale, scale, poly), special);
2731327313++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
2731427314++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
2731527315++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2731627316++ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
2731727317++ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
2731827318++ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
2731927319+2732027320+ return svmla_x (pg, scale, scale, poly);
2732127321+ }
2732227322++
2732327323++static svfloat32_t NOINLINE
2732427324++special_case (svfloat32_t x, svbool_t special, const struct data *d)
2732527325++{
2732627326++ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
2732727327++ special);
2732827328++}
2732927329++
2733027330++/* Single-precision SVE exp10f routine. Implements the same algorithm
2733127331++ as AdvSIMD exp10f.
2733227332++ Worst case error is 1.02 ULPs.
2733327333++ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
2733427334++ want 0x1.ba5f9cp-1. */
2733527335++svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
2733627336++{
2733727337++ const struct data *d = ptr_barrier (&data);
2733827338++ svbool_t special = svacgt (pg, x, d->thres);
2733927339++ if (__glibc_unlikely (svptest_any (special, special)))
2734027340++ return special_case (x, special, d);
2734127341++ return sv_exp10f_inline (x, pg, d);
2734227342++}
2734327343+diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
2734427344+index 8a686e3e05..c6216bed9e 100644
2734527345+--- a/sysdeps/aarch64/fpu/exp2f_sve.c
2734627346++++ b/sysdeps/aarch64/fpu/exp2f_sve.c
2734727347+@@ -24,54 +24,64 @@
2734827348+2734927349+ static const struct data
2735027350+ {
2735127351+- float poly[5];
2735227352++ float c0, c2, c4, c1, c3;
2735327353+ float shift, thres;
2735427354+ } data = {
2735527355+- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
2735627356+- compatibility with polynomial helpers. */
2735727357+- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
2735827358+- 0x1.59977ap-10f },
2735927359++ /* Coefficients copied from the polynomial in AdvSIMD variant. */
2736027360++ .c0 = 0x1.62e422p-1f,
2736127361++ .c1 = 0x1.ebf9bcp-3f,
2736227362++ .c2 = 0x1.c6bd32p-5f,
2736327363++ .c3 = 0x1.3ce9e4p-7f,
2736427364++ .c4 = 0x1.59977ap-10f,
2736527365+ /* 1.5*2^17 + 127. */
2736627366+- .shift = 0x1.903f8p17f,
2736727367++ .shift = 0x1.803f8p17f,
2736827368+ /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
2736927369+ correctly by FEXPA. */
2737027370+ .thres = Thres,
2737127371+ };
2737227372+2737327373+-static svfloat32_t NOINLINE
2737427374+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2737527375+-{
2737627376+- return sv_call_f32 (exp2f, x, y, special);
2737727377+-}
2737827378+-
2737927379+-/* Single-precision SVE exp2f routine. Implements the same algorithm
2738027380+- as AdvSIMD exp2f.
2738127381+- Worst case error is 1.04 ULPs.
2738227382+- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
2738327383+- want 0x1.ba7ebp+0. */
2738427384+-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
2738527385++static inline svfloat32_t
2738627386++sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
2738727387+ {
2738827388+- const struct data *d = ptr_barrier (&data);
2738927389+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
2739027390+ x = n + r, with r in [-1/2, 1/2]. */
2739127391+- svfloat32_t shift = sv_f32 (d->shift);
2739227392+- svfloat32_t z = svadd_x (pg, x, shift);
2739327393+- svfloat32_t n = svsub_x (pg, z, shift);
2739427394+- svfloat32_t r = svsub_x (pg, x, n);
2739527395++ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
2739627396++ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
2739727397++ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
2739827398+2739927399+- svbool_t special = svacgt (pg, x, d->thres);
2740027400+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
2740127401+2740227402+ /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
2740327403+ Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
2740427404+ coefficients 1 to 4, and apply most significant coefficient directly. */
2740527405+- svfloat32_t r2 = svmul_x (pg, r, r);
2740627406+- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
2740727407+- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
2740827408++ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
2740927409++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2741027410++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
2741127411++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
2741227412++ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
2741327413++ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
2741427414+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
2741527415+2741627416+- if (__glibc_unlikely (svptest_any (pg, special)))
2741727417+- return special_case (x, svmla_x (pg, scale, scale, poly), special);
2741827418+-
2741927419+ return svmla_x (pg, scale, scale, poly);
2742027420+ }
2742127421++
2742227422++static svfloat32_t NOINLINE
2742327423++special_case (svfloat32_t x, svbool_t special, const struct data *d)
2742427424++{
2742527425++ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
2742627426++ special);
2742727427++}
2742827428++
2742927429++/* Single-precision SVE exp2f routine. Implements the same algorithm
2743027430++ as AdvSIMD exp2f.
2743127431++ Worst case error is 1.04 ULPs.
2743227432++ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
2743327433++ want 0x1.ba6a64p-1. */
2743427434++svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
2743527435++{
2743627436++ const struct data *d = ptr_barrier (&data);
2743727437++ svbool_t special = svacgt (pg, x, d->thres);
2743827438++ if (__glibc_unlikely (svptest_any (special, special)))
2743927439++ return special_case (x, special, d);
2744027440++ return sv_exp2f_inline (x, pg, d);
2744127441++}
2744227442+diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
2744327443+index 3ba79bc4f1..da93e01b87 100644
2744427444+--- a/sysdeps/aarch64/fpu/expf_sve.c
2744527445++++ b/sysdeps/aarch64/fpu/expf_sve.c
2744627446+@@ -18,33 +18,25 @@
2744727447+ <https://www.gnu.org/licenses/>. */
2744827448+2744927449+ #include "sv_math.h"
2745027450++#include "sv_expf_inline.h"
2745127451++
2745227452++/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
2745327453++ correctly by FEXPA. */
2745427454++#define Thres 0x1.5d5e2ap+6f
2745527455+2745627456+ static const struct data
2745727457+ {
2745827458+- float poly[5];
2745927459+- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
2746027460++ struct sv_expf_data d;
2746127461++ float thres;
2746227462+ } data = {
2746327463+- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
2746427464+- compatibility with polynomial helpers. */
2746527465+- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
2746627466+- 0x1.0e4020p-7f },
2746727467+- .inv_ln2 = 0x1.715476p+0f,
2746827468+- .ln2_hi = 0x1.62e4p-1f,
2746927469+- .ln2_lo = 0x1.7f7d1cp-20f,
2747027470+- /* 1.5*2^17 + 127. */
2747127471+- .shift = 0x1.903f8p17f,
2747227472+- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
2747327473+- correctly by FEXPA. */
2747427474+- .thres = 0x1.5d5e2ap+6f,
2747527475++ .d = SV_EXPF_DATA,
2747627476++ .thres = Thres,
2747727477+ };
2747827478+2747927479+-#define C(i) sv_f32 (d->poly[i])
2748027480+-#define ExponentBias 0x3f800000
2748127481+-
2748227482+ static svfloat32_t NOINLINE
2748327483+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2748427484++special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
2748527485+ {
2748627486+- return sv_call_f32 (expf, x, y, special);
2748727487++ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
2748827488+ }
2748927489+2749027490+ /* Optimised single-precision SVE exp function.
2749127491+@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2749227492+ svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
2749327493+ {
2749427494+ const struct data *d = ptr_barrier (&data);
2749527495+-
2749627496+- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
2749727497+- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
2749827498+-
2749927499+- /* Load some constants in quad-word chunks to minimise memory access (last
2750027500+- lane is wasted). */
2750127501+- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
2750227502+-
2750327503+- /* n = round(x/(ln2/N)). */
2750427504+- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
2750527505+- svfloat32_t n = svsub_x (pg, z, d->shift);
2750627506+-
2750727507+- /* r = x - n*ln2/N. */
2750827508+- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
2750927509+- r = svmls_lane (r, n, invln2_and_ln2, 2);
2751027510+-
2751127511+- /* scale = 2^(n/N). */
2751227512+ svbool_t is_special_case = svacgt (pg, x, d->thres);
2751327513+- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
2751427514+-
2751527515+- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
2751627516+- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
2751727517+- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
2751827518+- svfloat32_t r2 = svmul_x (pg, r, r);
2751927519+- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
2752027520+- svfloat32_t p0 = svmul_x (pg, r, C (0));
2752127521+- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
2752227522+-
2752327523+ if (__glibc_unlikely (svptest_any (pg, is_special_case)))
2752427524+- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
2752527525+-
2752627526+- return svmla_x (pg, scale, scale, poly);
2752727527++ return special_case (x, is_special_case, &d->d);
2752827528++ return expf_inline (x, pg, &d->d);
2752927529+ }
2753027530+diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
2753127531+index 23963b5f8e..6166df6553 100644
2753227532+--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
2753327533++++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
2753427534+@@ -24,19 +24,20 @@
2753527535+2753627536+ struct sv_expf_data
2753727537+ {
2753827538+- float poly[5];
2753927539+- float inv_ln2, ln2_hi, ln2_lo, shift;
2754027540++ float c1, c3, inv_ln2;
2754127541++ float ln2_lo, c0, c2, c4;
2754227542++ float ln2_hi, shift;
2754327543+ };
2754427544+2754527545+ /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
2754627546+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
2754727547+ #define SV_EXPF_DATA \
2754827548+ { \
2754927549+- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
2755027550+- 0x1.0e4020p-7f }, \
2755127551+- \
2755227552+- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
2755327553+- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
2755427554++ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
2755527555++ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
2755627556++ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
2755727557++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
2755827558++ .shift = 0x1.803f8p17f, \
2755927559+ }
2756027560+2756127561+ #define C(i) sv_f32 (d->poly[i])
2756227562+@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
2756327563+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
2756427564+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
2756527565+2756627566+- /* Load some constants in quad-word chunks to minimise memory access. */
2756727567+- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
2756827568++ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
2756927569+2757027570+ /* n = round(x/(ln2/N)). */
2757127571+- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
2757227572++ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
2757327573+ svfloat32_t n = svsub_x (pg, z, d->shift);
2757427574+2757527575+ /* r = x - n*ln2/N. */
2757627576+- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
2757727577+- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
2757827578++ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
2757927579++ r = svmls_lane (r, n, lane_consts, 0);
2758027580+2758127581+ /* scale = 2^(n/N). */
2758227582+- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
2758327583++ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
2758427584+2758527585+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
2758627586+- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
2758727587+- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
2758827588+- svfloat32_t r2 = svmul_f32_x (pg, r, r);
2758927589++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
2759027590++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
2759127591++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2759227592+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
2759327593+- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
2759427594++ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
2759527595+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
2759627596+2759727597+ return svmla_x (pg, scale, scale, poly);
2759827598+2759927599+commit c4373426e3a85ec483a0f412c2a7c6cdfa32ccdb
2760027600+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2760127601+Date: Mon Sep 23 15:30:20 2024 +0100
2760227602+2760327603+ AArch64: Improve codegen in SVE F32 logs
2760427604+2760527605+ Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
2760627606+ where possible. Similar to the recent change to AdvSIMD F32 logs,
2760727607+ adjust special-case arguments and bounds to allow for more optimal
2760827608+ register usage. For all 3 routines one MOVPRFX remains in the
2760927609+ reduction, which cannot be avoided as immediate AND and ASR are both
2761027610+ destructive.
2761127611+2761227612+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2761327613+ (cherry picked from commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d)
2761427614+2761527615+diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
2761627616+index bdbb49cd32..7913679f67 100644
2761727617+--- a/sysdeps/aarch64/fpu/log10f_sve.c
2761827618++++ b/sysdeps/aarch64/fpu/log10f_sve.c
2761927619+@@ -24,6 +24,7 @@ static const struct data
2762027620+ float poly_0246[4];
2762127621+ float poly_1357[4];
2762227622+ float ln2, inv_ln10;
2762327623++ uint32_t off, lower;
2762427624+ } data = {
2762527625+ .poly_1357 = {
2762627626+ /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
2762727627+@@ -35,18 +36,23 @@ static const struct data
2762827628+ -0x1.0fc92cp-4f },
2762927629+ .ln2 = 0x1.62e43p-1f,
2763027630+ .inv_ln10 = 0x1.bcb7b2p-2f,
2763127631++ .off = 0x3f2aaaab,
2763227632++ /* Lower bound is the smallest positive normal float 0x00800000. For
2763327633++ optimised register use subnormals are detected after offset has been
2763527635++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2763527635++ .lower = 0x00800000 - 0x3f2aaaab
2763627636+ };
2763727637+2763827638+-#define Min 0x00800000
2763927639+-#define Max 0x7f800000
2764027640+-#define Thres 0x7f000000 /* Max - Min. */
2764127641+-#define Offset 0x3f2aaaab /* 0.666667. */
2764227642++#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
2764327643+ #define MantissaMask 0x007fffff
2764427644+2764527645+ static svfloat32_t NOINLINE
2764627646+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2764727647++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
2764827648++ svbool_t cmp)
2764927649+ {
2765027650+- return sv_call_f32 (log10f, x, y, special);
2765127651++ return sv_call_f32 (
2765227652++ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
2765327653++ svmla_x (svptrue_b32 (), p, r2, y), cmp);
2765427654+ }
2765527655+2765627656+ /* Optimised implementation of SVE log10f using the same algorithm and
2765727657+@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
2765827658+ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
2765927659+ {
2766027660+ const struct data *d = ptr_barrier (&data);
2766127661+- svuint32_t ix = svreinterpret_u32 (x);
2766227662+- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
2766327663++
2766427664++ svuint32_t u_off = svreinterpret_u32 (x);
2766527665++
2766627666++ u_off = svsub_x (pg, u_off, d->off);
2766727667++ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
2766827668+2766927669+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2767027670+- ix = svsub_x (pg, ix, Offset);
2767127671+ svfloat32_t n = svcvt_f32_x (
2767227672+- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
2767327673+- ix = svand_x (pg, ix, MantissaMask);
2767427674+- ix = svadd_x (pg, ix, Offset);
2767527675++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
2767627676++ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
2767727677++ ix = svadd_x (pg, ix, d->off);
2767827678+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
2767927679+2768027680+ /* y = log10(1+r) + n*log10(2)
2768127681+ log10(1+r) ~ r * InvLn(10) + P(r)
2768227682+ where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
2768327683+ log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
2768427684+- svfloat32_t r2 = svmul_x (pg, r, r);
2768527685+- svfloat32_t r4 = svmul_x (pg, r2, r2);
2768627686++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2768727687++ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
2768827688+ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
2768927689+ svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
2769027690+ svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
2769127691+@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
2769227692+ hi = svmul_x (pg, hi, d->inv_ln10);
2769327693+2769427694+ if (__glibc_unlikely (svptest_any (pg, special)))
2769527695+- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
2769627696+- special);
2769727697+- return svmla_x (pg, hi, r2, y);
2769827698++ return special_case (u_off, hi, r2, y, special);
2769927699++ return svmla_x (svptrue_b32 (), hi, r2, y);
2770027700+ }
2770127701+diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
2770227702+index 5031c42483..939d89bfb9 100644
2770327703+--- a/sysdeps/aarch64/fpu/log2f_sve.c
2770427704++++ b/sysdeps/aarch64/fpu/log2f_sve.c
2770527705+@@ -23,6 +23,7 @@ static const struct data
2770627706+ {
2770727707+ float poly_02468[5];
2770827708+ float poly_1357[4];
2770927709++ uint32_t off, lower;
2771027710+ } data = {
2771127711+ .poly_1357 = {
2771227712+ /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
2771327713+@@ -32,18 +33,23 @@ static const struct data
2771427714+ },
2771527715+ .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
2771627716+ 0x1.9d8ecap-3f, 0x1.9e495p-3f },
2771727717++ .off = 0x3f2aaaab,
2771827718++ /* Lower bound is the smallest positive normal float 0x00800000. For
2771927719++ optimised register use subnormals are detected after offset has been
2772127721++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2772127721++ .lower = 0x00800000 - 0x3f2aaaab
2772227722+ };
2772327723+2772427724+-#define Min (0x00800000)
2772527725+-#define Max (0x7f800000)
2772627726+-#define Thres (0x7f000000) /* Max - Min. */
2772727727++#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
2772827728+ #define MantissaMask (0x007fffff)
2772927729+-#define Off (0x3f2aaaab) /* 0.666667. */
2773027730+2773127731+ static svfloat32_t NOINLINE
2773227732+-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
2773327733++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
2773427734++ svbool_t cmp)
2773527735+ {
2773627736+- return sv_call_f32 (log2f, x, y, cmp);
2773727737++ return sv_call_f32 (
2773827738++ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
2773927739++ svmla_x (svptrue_b32 (), p, r2, y), cmp);
2774027740+ }
2774127741+2774227742+ /* Optimised implementation of SVE log2f, using the same algorithm
2774327743+@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
2774427744+ {
2774527745+ const struct data *d = ptr_barrier (&data);
2774627746+2774727747+- svuint32_t u = svreinterpret_u32 (x);
2774827748+- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
2774927749++ svuint32_t u_off = svreinterpret_u32 (x);
2775027750++
2775127751++ u_off = svsub_x (pg, u_off, d->off);
2775227752++ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
2775327753+2775427754+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2775527755+- u = svsub_x (pg, u, Off);
2775627756+ svfloat32_t n = svcvt_f32_x (
2775727757+- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
2775827758+- u = svand_x (pg, u, MantissaMask);
2775927759+- u = svadd_x (pg, u, Off);
2776027760++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
2776127761++ svuint32_t u = svand_x (pg, u_off, MantissaMask);
2776227762++ u = svadd_x (pg, u, d->off);
2776327763+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
2776427764+2776527765+ /* y = log2(1+r) + n. */
2776627766+- svfloat32_t r2 = svmul_x (pg, r, r);
2776727767++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2776827768+2776927769+ /* Evaluate polynomial using pairwise Horner scheme. */
2777027770+ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
2777127771+@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
2777227772+ y = svmla_x (pg, q_01, r2, y);
2777327773+2777427774+ if (__glibc_unlikely (svptest_any (pg, special)))
2777527775+- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
2777627776+- return svmla_x (pg, n, r, y);
2777727777++ return special_case (u_off, n, r, y, special);
2777827778++ return svmla_x (svptrue_b32 (), n, r, y);
2777927779+ }
2778027780+diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
2778127781+index d64e810cfe..5b9324678d 100644
2778227782+--- a/sysdeps/aarch64/fpu/logf_sve.c
2778327783++++ b/sysdeps/aarch64/fpu/logf_sve.c
2778427784+@@ -24,6 +24,7 @@ static const struct data
2778527785+ float poly_0135[4];
2778627786+ float poly_246[3];
2778727787+ float ln2;
2778827788++ uint32_t off, lower;
2778927789+ } data = {
2779027790+ .poly_0135 = {
2779127791+ /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
2779227792+@@ -32,19 +33,24 @@ static const struct data
2779327793+ -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
2779427794+ },
2779527795+ .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
2779627796+- .ln2 = 0x1.62e43p-1f
2779727797++ .ln2 = 0x1.62e43p-1f,
2779827798++ .off = 0x3f2aaaab,
2779927799++ /* Lower bound is the smallest positive normal float 0x00800000. For
2780027800++ optimised register use subnormals are detected after offset has been
2780227802++ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
2780227802++ .lower = 0x00800000 - 0x3f2aaaab
2780327803+ };
2780427804+2780527805+-#define Min (0x00800000)
2780627806+-#define Max (0x7f800000)
2780727807+-#define Thresh (0x7f000000) /* Max - Min. */
2780827808++#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
2780927809+ #define Mask (0x007fffff)
2781027810+-#define Off (0x3f2aaaab) /* 0.666667. */
2781127811+2781227812+ static svfloat32_t NOINLINE
2781327813+-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
2781427814++special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
2781527815++ svbool_t cmp)
2781627816+ {
2781727817+- return sv_call_f32 (logf, x, y, cmp);
2781827818++ return sv_call_f32 (
2781927819++ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
2782027820++ svmla_x (svptrue_b32 (), p, r2, y), cmp);
2782127821+ }
2782227822+2782327823+ /* Optimised implementation of SVE logf, using the same algorithm and
2782427824+@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
2782527825+ {
2782627826+ const struct data *d = ptr_barrier (&data);
2782727827+2782827828+- svuint32_t u = svreinterpret_u32 (x);
2782927829+- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
2783027830++ svuint32_t u_off = svreinterpret_u32 (x);
2783127831++
2783227832++ u_off = svsub_x (pg, u_off, d->off);
2783327833++ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
2783427834+2783527835+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
2783627836+- u = svsub_x (pg, u, Off);
2783727837+ svfloat32_t n = svcvt_f32_x (
2783827838+- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
2783927839+- u = svand_x (pg, u, Mask);
2784027840+- u = svadd_x (pg, u, Off);
2784127841++ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
2784227842++
2784327843++ svuint32_t u = svand_x (pg, u_off, Mask);
2784427844++ u = svadd_x (pg, u, d->off);
2784527845+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
2784627846+2784727847+ /* y = log(1+r) + n*ln2. */
2784827848+- svfloat32_t r2 = svmul_x (pg, r, r);
2784927849++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
2785027850+ /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
2785127851+ svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
2785227852+ svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
2785327853+@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
2785427854+ p = svmla_x (pg, r, n, d->ln2);
2785527855+2785627856+ if (__glibc_unlikely (svptest_any (pg, cmp)))
2785727857+- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
2785827858++ return special_case (u_off, p, r2, y, cmp);
2785927859+ return svmla_x (pg, p, r2, y);
2786027860+ }
2786127861+2786227862+commit 520240173029fd03388ec01db9a5359291cbbd27
2786327863+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2786427864+Date: Mon Sep 23 15:32:14 2024 +0100
2786527865+2786627866+ AArch64: Improve codegen in users of AdvSIMD log1pf helper
2786727867+2786827868+ log1pf is quite register-intensive - use fewer registers for the
2786927869+ polynomial, and make various changes to shorten dependency chains in
2787027870+ parent routines. There is now no spilling with GCC 14. Accuracy moves
2787127871+ around a little - comments adjusted accordingly but does not require
2787227872+ regen-ulps.
2787327873+2787427874+ Use the helper in log1pf as well, instead of having separate
2787527875+ implementations. The more accurate polynomial means special-casing can
2787627876+ be simplified, and the shorter dependency chain avoids the usual dance
2787727877+ around v0, which is otherwise difficult.
2787827878+2787927879+ There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
2788027880+ GCC is not currently able to efficiently handle values which fit in FMOV
2788127881+ but not MOVI, and are reinterpreted to integer. There may be potential
2788227882+ for more optimisation if this is fixed.
2788327883+2788427884+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2788527885+ (cherry picked from commit 5bc100bd4b7e00db3009ae93d25d303341545d23)
2788627886+2788727887+diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
2788827888+index 8916dcbf40..004474acf9 100644
2788927889+--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
2789027890++++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
2789127891+@@ -25,35 +25,32 @@ const static struct data
2789227892+ {
2789327893+ struct v_log1pf_data log1pf_consts;
2789427894+ uint32x4_t one;
2789527895+- uint16x4_t thresh;
2789627896+-} data = {
2789727897+- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
2789827898+- .one = V4 (0x3f800000),
2789927899+- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
2790027900+-};
2790127901++} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
2790227902++
2790327903++#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
2790427904+2790527905+ static float32x4_t NOINLINE VPCS_ATTR
2790627906+ special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
2790727907+- const struct v_log1pf_data d)
2790827908++ const struct v_log1pf_data *d)
2790927909+ {
2791027910+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
2791127911+ }
2791227912+2791327913+ /* Vector approximation for single-precision acosh, based on log1p. Maximum
2791427914+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
2791527915+- is 2.78 ULP:
2791627916+- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
2791727917+- want 0x1.ef9ea2p-3.
2791827918++ is 3.00 ULP:
2791927919++ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
2792027920++ want 0x1.ef0a7cp-4.
2792127921+ With exceptions disabled, we can compute u with a shorter dependency chain,
2792227922+- which gives maximum error of 3.07 ULP:
2792327923+- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
2792427924+- want 0x1.fbc7f4p-4. */
2792527925++ which gives maximum error of 3.22 ULP:
2792627926++ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
2792727927++ want 0x1.fdcdd2p-5. */
2792827928+2792927929+ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
2793027930+ {
2793127931+ const struct data *d = ptr_barrier (&data);
2793227932+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
2793327933+- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
2793427934++ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
2793527935+2793627936+ #if WANT_SIMD_EXCEPT
2793727937+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
2793827938+@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
2793927939+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
2794027940+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
2794127941+ #else
2794227942+- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
2794327943+- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
2794427944++ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
2794527945++ float32x4_t u
2794627946++ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
2794727947+ #endif
2794827948+2794927949+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
2795027950+2795127951+ if (__glibc_unlikely (v_any_u16h (special)))
2795227952+- return special_case (x, y, special, d->log1pf_consts);
2795327953+- return log1pf_inline (y, d->log1pf_consts);
2795427954++ return special_case (x, y, special, &d->log1pf_consts);
2795527955++ return log1pf_inline (y, &d->log1pf_consts);
2795627956+ }
2795727957+ libmvec_hidden_def (V_NAME_F1 (acosh))
2795827958+ HALF_WIDTH_ALIAS_F1 (acosh)
2795927959+diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
2796027960+index 09fd8a6143..eb789b91b6 100644
2796127961+--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
2796227962++++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
2796327963+@@ -20,16 +20,16 @@
2796427964+ #include "v_math.h"
2796527965+ #include "v_log1pf_inline.h"
2796627966+2796727967+-#define SignMask v_u32 (0x80000000)
2796827968+-
2796927969+ const static struct data
2797027970+ {
2797127971+ struct v_log1pf_data log1pf_consts;
2797227972++ float32x4_t one;
2797327973+ uint32x4_t big_bound;
2797427974+ #if WANT_SIMD_EXCEPT
2797527975+ uint32x4_t tiny_bound;
2797627976+ #endif
2797727977+ } data = {
2797827978++ .one = V4 (1),
2797927979+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
2798027980+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
2798127981+ #if WANT_SIMD_EXCEPT
2798227982+@@ -38,20 +38,27 @@ const static struct data
2798327983+ };
2798427984+2798527985+ static float32x4_t NOINLINE VPCS_ATTR
2798627986+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2798727987++special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
2798827988++ uint32x4_t special, const struct data *d)
2798927989+ {
2799027990+- return v_call_f32 (asinhf, x, y, special);
2799127991++ return v_call_f32 (
2799227992++ asinhf, x,
2799327993++ vreinterpretq_f32_u32 (veorq_u32 (
2799427994++ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
2799527995++ special);
2799627996+ }
2799727997+2799827998+ /* Single-precision implementation of vector asinh(x), using vector log1p.
2799927999+- Worst-case error is 2.66 ULP, at roughly +/-0.25:
2800028000+- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
2800128001++ Worst-case error is 2.59 ULP:
2800228002++ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
2800328003++ want 0x1.d449c4p-3. */
2800428004+ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
2800528005+ {
2800628006+ const struct data *dat = ptr_barrier (&data);
2800728007+- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
2800828008+- float32x4_t ax = vreinterpretq_f32_u32 (iax);
2800928009++ float32x4_t ax = vabsq_f32 (x);
2801028010++ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
2801128011+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
2801228012++ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
2801328013+ float32x4_t special_arg = x;
2801428014+2801528015+ #if WANT_SIMD_EXCEPT
2801628016+@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
2801728017+ /* asinh(x) = log(x + sqrt(x * x + 1)).
2801828018+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
2801928019+ float32x4_t d
2802028020+- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
2802128021+- float32x4_t y = log1pf_inline (
2802228022+- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
2802328023++ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
2802428024++ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
2802528025+2802628026+ if (__glibc_unlikely (v_any_u32 (special)))
2802728027+- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
2802828028+- return vbslq_f32 (SignMask, x, y);
2802928029++ return special_case (special_arg, sign, y, special, dat);
2803028030++ return vreinterpretq_f32_u32 (veorq_u32 (
2803128031++ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
2803228032+ }
2803328033+ libmvec_hidden_def (V_NAME_F1 (asinh))
2803428034+ HALF_WIDTH_ALIAS_F1 (asinh)
2803528035+diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
2803628036+index ae488f7b54..818b6c92ad 100644
2803728037+--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
2803828038++++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
2803928039+@@ -40,15 +40,17 @@ const static struct data
2804028040+ #define Half v_u32 (0x3f000000)
2804128041+2804228042+ static float32x4_t NOINLINE VPCS_ATTR
2804328043+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2804428044++special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
2804528045++ uint32x4_t special)
2804628046+ {
2804728047+- return v_call_f32 (atanhf, x, y, special);
2804828048++ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
2804928049++ vmulq_f32 (halfsign, y), special);
2805028050+ }
2805128051+2805228052+ /* Approximation for vector single-precision atanh(x) using modified log1p.
2805328053+- The maximum error is 3.08 ULP:
2805428054+- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
2805528055+- want 0x1.ffcb82p-5. */
2805628056++ The maximum error is 2.93 ULP:
2805728057++ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
2805828058++ want 0x1.f4dcf8p-5. */
2805928059+ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
2806028060+ {
2806128061+ const struct data *d = ptr_barrier (&data);
2806228062+@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
2806328063+ uint32x4_t special = vcgeq_u32 (iax, d->one);
2806428064+ #endif
2806528065+2806628066+- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
2806728067+- y = log1pf_inline (y, d->log1pf_consts);
2806828068++ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
2806928069++ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
2807028070++ y = log1pf_inline (y, &d->log1pf_consts);
2807128071+2807228072++ /* If exceptions not required, pass ax to special-case for shorter dependency
2807328073++ chain. If exceptions are required ax will have been zerofied, so have to
2807428074++ pass x. */
2807528075+ if (__glibc_unlikely (v_any_u32 (special)))
2807628076+- return special_case (x, vmulq_f32 (halfsign, y), special);
2807728077++#if WANT_SIMD_EXCEPT
2807828078++ return special_case (x, halfsign, y, special);
2807928079++#else
2808028080++ return special_case (ax, halfsign, y, special);
2808128081++#endif
2808228082+ return vmulq_f32 (halfsign, y);
2808328083+ }
2808428084+ libmvec_hidden_def (V_NAME_F1 (atanh))
2808528085+diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
2808628086+index 8cfa28fb8a..00006fc703 100644
2808728087+--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
2808828088++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
2808928089+@@ -18,114 +18,79 @@
2809028090+ <https://www.gnu.org/licenses/>. */
2809128091+2809228092+ #include "v_math.h"
2809328093+-#include "poly_advsimd_f32.h"
2809428094++#include "v_log1pf_inline.h"
2809528095++
2809628096++#if WANT_SIMD_EXCEPT
2809728097+2809828098+ const static struct data
2809928099+ {
2810028100+- float32x4_t poly[8], ln2;
2810128101+- uint32x4_t tiny_bound, minus_one, four, thresh;
2810228102+- int32x4_t three_quarters;
2810328103++ uint32x4_t minus_one, thresh;
2810428104++ struct v_log1pf_data d;
2810528105+ } data = {
2810628106+- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
2810728107+- (1, -0.5) are not stored as they can be generated more
2810828108+- efficiently. */
2810928109+- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
2811028110+- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
2811128111+- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
2811228112+- .ln2 = V4 (0x1.62e43p-1f),
2811328113+- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
2811428114+- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
2811528115++ .d = V_LOG1PF_CONSTANTS_TABLE,
2811628116++ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
2811728117+ .minus_one = V4 (0xbf800000),
2811828118+- .four = V4 (0x40800000),
2811928119+- .three_quarters = V4 (0x3f400000)
2812028120+ };
2812128121+2812228122+-static inline float32x4_t
2812328123+-eval_poly (float32x4_t m, const float32x4_t *p)
2812428124+-{
2812528125+- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
2812628126+- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
2812728127+- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
2812828128+- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
2812928129+- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
2813028130+-
2813128131+- float32x4_t m2 = vmulq_f32 (m, m);
2813228132+- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
2813328133+- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
2813428134+- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
2813528135+-
2813628136+- float32x4_t m4 = vmulq_f32 (m2, m2);
2813728137+- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
2813828138+- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
2813928139+-}
2814028140++/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
2814128141++# define TinyBound v_u32 (0x34000000)
2814228142+2814328143+ static float32x4_t NOINLINE VPCS_ATTR
2814428144+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2814528145++special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
2814628146+ {
2814728147+- return v_call_f32 (log1pf, x, y, special);
2814828148++ /* Side-step special lanes so fenv exceptions are not triggered
2814928149++ inadvertently. */
2815028150++ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
2815128151++ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
2815228152+ }
2815328153+2815428154+-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
2815528155+- is roughly 2.02 ULP:
2815628156+- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
2815728157++/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
2815828158++ error is 1.69 ULP:
2815928159++ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
2816028160++ want 0x1.cfcbdcp-3. */
2816128161+ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
2816228162+ {
2816328163+ const struct data *d = ptr_barrier (&data);
2816428164+-
2816528165+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
2816628166+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
2816728167++
2816828168+ uint32x4_t special_cases
2816928169+- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
2817028170++ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
2817128171+ vcgeq_u32 (ix, d->minus_one));
2817228172+- float32x4_t special_arg = x;
2817328173+2817428174+-#if WANT_SIMD_EXCEPT
2817528175+ if (__glibc_unlikely (v_any_u32 (special_cases)))
2817628176+- /* Side-step special lanes so fenv exceptions are not triggered
2817728177+- inadvertently. */
2817828178+- x = v_zerofy_f32 (x, special_cases);
2817928179+-#endif
2818028180++ return special_case (x, special_cases, d);
2818128181+2818228182+- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
2818328183+- is in [-0.25, 0.5]):
2818428184+- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
2818528185+-
2818628186+- We approximate log1p(m) with a polynomial, then scale by
2818728187+- k*log(2). Instead of doing this directly, we use an intermediate
2818828188+- scale factor s = 4*k*log(2) to ensure the scale is representable
2818928189+- as a normalised fp32 number. */
2819028190++ return log1pf_inline (x, &d->d);
2819128191++}
2819228192+2819328193+- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
2819428194++#else
2819528195+2819628196+- /* Choose k to scale x to the range [-1/4, 1/2]. */
2819728197+- int32x4_t k
2819828198+- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
2819928199+- v_s32 (0xff800000));
2820028200+- uint32x4_t ku = vreinterpretq_u32_s32 (k);
2820128201++const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
2820228202+2820328203+- /* Scale x by exponent manipulation. */
2820428204+- float32x4_t m_scale
2820528205+- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
2820628206++static float32x4_t NOINLINE VPCS_ATTR
2820728207++special_case (float32x4_t x, uint32x4_t cmp)
2820828208++{
2820928209++ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
2821028210++}
2821128211+2821228212+- /* Scale up to ensure that the scale factor is representable as normalised
2821328213+- fp32 number, and scale m down accordingly. */
2821428214+- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
2821528215+- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
2821628216++/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
2821728217++ error is 1.63 ULP:
2821828218++ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
2821928219++ want 0x1.fdcb16p-3. */
2822028220++VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
2822128221++{
2822228222++ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
2822328223++ vcaleq_f32 (x, v_f32 (0x1p127f)));
2822428224+2822528225+- /* Evaluate polynomial on the reduced interval. */
2822628226+- float32x4_t p = eval_poly (m_scale, d->poly);
2822728227++ if (__glibc_unlikely (v_any_u32 (special_cases)))
2822828228++ return special_case (x, special_cases);
2822928229+2823028230+- /* The scale factor to be applied back at the end - by multiplying float(k)
2823128231+- by 2^-23 we get the unbiased exponent of k. */
2823228232+- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
2823328233++ return log1pf_inline (x, ptr_barrier (&data));
2823428234++}
2823528235+2823628236+- /* Apply the scaling back. */
2823728237+- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
2823828238++#endif
2823928239+2824028240+- if (__glibc_unlikely (v_any_u32 (special_cases)))
2824128241+- return special_case (special_arg, y, special_cases);
2824228242+- return y;
2824328243+-}
2824428244+ libmvec_hidden_def (V_NAME_F1 (log1p))
2824528245+ HALF_WIDTH_ALIAS_F1 (log1p)
2824628246+ strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
2824728247+diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
2824828248+index 643a6cdcfc..73e45a942e 100644
2824928249+--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
2825028250++++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
2825128251+@@ -25,54 +25,81 @@
2825228252+2825328253+ struct v_log1pf_data
2825428254+ {
2825528255+- float32x4_t poly[8], ln2;
2825628256+ uint32x4_t four;
2825728257+ int32x4_t three_quarters;
2825828258++ float c0, c3, c5, c7;
2825928259++ float32x4_t c4, c6, c1, c2, ln2;
2826028260+ };
2826128261+2826228262+ /* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
2826328263+ (1, -0.5) are not stored as they can be generated more efficiently. */
2826428264+ #define V_LOG1PF_CONSTANTS_TABLE \
2826528265+ { \
2826628266+- .poly \
2826728267+- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
2826828268+- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
2826928269+- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
2827028270+- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
2827128271+- .three_quarters = V4 (0x3f400000) \
2827228272++ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
2827328273++ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
2827428274++ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
2827528275++ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
2827628276++ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
2827728277++ .three_quarters = V4 (0x3f400000) \
2827828278+ }
2827928279+2828028280+ static inline float32x4_t
2828128281+-eval_poly (float32x4_t m, const float32x4_t *c)
2828228282++eval_poly (float32x4_t m, const struct v_log1pf_data *d)
2828328283+ {
2828428284+- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
2828528285+- uses split Estrin, but this way reduces register pressure in the calling
2828628286+- routine). */
2828728287+- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
2828828288++ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
2828928289++ float32x4_t c0357 = vld1q_f32 (&d->c0);
2829028290++ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
2829128291+ float32x4_t m2 = vmulq_f32 (m, m);
2829228292+- q = vfmaq_f32 (m, m2, q);
2829328293+- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
2829428294++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
2829528295++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
2829628296++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
2829728297++ float32x4_t p = vfmaq_f32 (p45, m2, p67);
2829828298++ p = vfmaq_f32 (p23, m2, p);
2829928299++ p = vfmaq_f32 (d->c1, m, p);
2830028300+ p = vmulq_f32 (m2, p);
2830128301+- return vfmaq_f32 (q, m2, p);
2830228302++ p = vfmaq_f32 (m, m2, p);
2830328303++ return vfmaq_f32 (p, m2, q);
2830428304+ }
2830528305+2830628306+ static inline float32x4_t
2830728307+-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
2830828308++log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
2830928309+ {
2831028310+- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
2831128311+- special-case handling. See that file for details of the algorithm. */
2831228312++ /* Helper for calculating log(x + 1). */
2831328313++
2831428314++ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
2831528315++ is in [-0.25, 0.5]):
2831628316++ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
2831728317++
2831828318++ We approximate log1p(m) with a polynomial, then add k*log(2) to
2831928319++ the result. Instead of forming 2^-k directly to compute m, we use
2832028320++ an intermediate scale factor s = 4 * 2^-k to ensure the scale is
2832128321++ representable as a normalised fp32 number. */
2832228322+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
2832328323++
2832428324++ /* Choose k to scale x to the range [-1/4, 1/2]. */
2832528325+ int32x4_t k
2832628326+- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
2832728327++ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
2832828328+ v_s32 (0xff800000));
2832928329+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
2833028330+- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
2833128331++
2833228332++ /* Scale up to ensure that the scale factor is representable as normalised
2833328333++ fp32 number, and scale m down accordingly. */
2833428334++ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
2833528335++
2833628336++ /* Scale x by exponent manipulation. */
2833728337+ float32x4_t m_scale
2833828338+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
2833928339+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
2834028340+- float32x4_t p = eval_poly (m_scale, d.poly);
2834128341++
2834228342++ /* Evaluate polynomial on the reduced interval. */
2834328343++ float32x4_t p = eval_poly (m_scale, d);
2834428344++
2834528345++ /* The scale factor to be applied back at the end - by multiplying float(k)
2834628346++ by 2^-23 we get the unbiased exponent of k. */
2834728347+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
2834828348+- return vfmaq_f32 (p, scale_back, d.ln2);
2834928349++
2835028350++ /* Apply the scaling back. */
2835128351++ return vfmaq_f32 (p, scale_back, d->ln2);
2835228352+ }
2835328353+2835428354+ #endif
2835528355+2835628356+commit a947a43b95bbea53ec50df058b42392fd5ea52b6
2835728357+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2835828358+Date: Mon Sep 23 15:32:53 2024 +0100
2835928359+2836028360+ AArch64: Improve codegen in users of ADVSIMD expm1f helper
2836128361+2836228362+ Rearrange operations so MOV is not necessary in reduction or around
2836328363+ the special-case handler. Reduce memory access by using more indexed
2836428364+ MLAs in polynomial.
2836528365+2836628366+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2836728367+ (cherry picked from commit 7900ac490db32f6bccff812733f00280dde34e27)
2836828368+2836928369+diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
2837028370+index a0616ec754..8303ca296e 100644
2837128371+--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
2837228372++++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
2837328373+@@ -18,27 +18,18 @@
2837428374+ <https://www.gnu.org/licenses/>. */
2837528375+2837628376+ #include "v_math.h"
2837728377+-#include "poly_advsimd_f32.h"
2837828378++#include "v_expm1f_inline.h"
2837928379+2838028380+ static const struct data
2838128381+ {
2838228382+- float32x4_t poly[5];
2838328383+- float invln2_and_ln2[4];
2838428384+- float32x4_t shift;
2838528385+- int32x4_t exponent_bias;
2838628386++ struct v_expm1f_data d;
2838728387+ #if WANT_SIMD_EXCEPT
2838828388+ uint32x4_t thresh;
2838928389+ #else
2839028390+ float32x4_t oflow_bound;
2839128391+ #endif
2839228392+ } data = {
2839328393+- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
2839428394+- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
2839528395+- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
2839628396+- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
2839728397+- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
2839828398+- .shift = V4 (0x1.8p23f),
2839928399+- .exponent_bias = V4 (0x3f800000),
2840028400++ .d = V_EXPM1F_DATA,
2840128401+ #if !WANT_SIMD_EXCEPT
2840228402+ /* Value above which expm1f(x) should overflow. Absolute value of the
2840328403+ underflow bound is greater than this, so it catches both cases - there is
2840428404+@@ -55,67 +46,38 @@ static const struct data
2840528405+ #define TinyBound v_u32 (0x34000000 << 1)
2840628406+2840728407+ static float32x4_t VPCS_ATTR NOINLINE
2840828408+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2840928409++special_case (float32x4_t x, uint32x4_t special, const struct data *d)
2841028410+ {
2841128411+- return v_call_f32 (expm1f, x, y, special);
2841228412++ return v_call_f32 (
2841328413++ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
2841428414+ }
2841528415+2841628416+ /* Single-precision vector exp(x) - 1 function.
2841728417+- The maximum error is 1.51 ULP:
2841828418+- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
2841928419+- want 0x1.e2fb94p-2. */
2842028420++ The maximum error is 1.62 ULP:
2842128421++ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
2842228422++ want 0x1.da9f44p-2. */
2842328423+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
2842428424+ {
2842528425+ const struct data *d = ptr_barrier (&data);
2842628426+- uint32x4_t ix = vreinterpretq_u32_f32 (x);
2842728427+2842828428+ #if WANT_SIMD_EXCEPT
2842928429++ uint32x4_t ix = vreinterpretq_u32_f32 (x);
2843028430+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
2843128431+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
2843228432+ shift-left by 1, and compare with thresh which was left-shifted offline -
2843328433+ this is effectively an absolute compare. */
2843428434+ uint32x4_t special
2843528435+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
2843628436+- if (__glibc_unlikely (v_any_u32 (special)))
2843728437+- x = v_zerofy_f32 (x, special);
2843828438+ #else
2843928439+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
2844028440+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
2844128441+ #endif
2844228442+2844328443+- /* Reduce argument to smaller range:
2844428444+- Let i = round(x / ln2)
2844528445+- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
2844628446+- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
2844728447+- where 2^i is exact because i is an integer. */
2844828448+- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
2844928449+- float32x4_t j
2845028450+- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
2845128451+- int32x4_t i = vcvtq_s32_f32 (j);
2845228452+- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
2845328453+- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
2845428454+-
2845528455+- /* Approximate expm1(f) using polynomial.
2845628456+- Taylor expansion for expm1(x) has the form:
2845728457+- x + ax^2 + bx^3 + cx^4 ....
2845828458+- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
2845928459+- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
2846028460+- float32x4_t p = v_horner_4_f32 (f, d->poly);
2846128461+- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
2846228462+-
2846328463+- /* Assemble the result.
2846428464+- expm1(x) ~= 2^i * (p + 1) - 1
2846528465+- Let t = 2^i. */
2846628466+- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
2846728467+- float32x4_t t = vreinterpretq_f32_s32 (u);
2846828468+-
2846928469+ if (__glibc_unlikely (v_any_u32 (special)))
2847028470+- return special_case (vreinterpretq_f32_u32 (ix),
2847128471+- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
2847228472+- special);
2847328473++ return special_case (x, special, d);
2847428474+2847528475+ /* expm1(x) ~= p * t + (t - 1). */
2847628476+- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
2847728477++ return expm1f_inline (x, &d->d);
2847828478+ }
2847928479+ libmvec_hidden_def (V_NAME_F1 (expm1))
2848028480+ HALF_WIDTH_ALIAS_F1 (expm1)
2848128481+diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
2848228482+index 6bb7482dc2..c6ed7598e7 100644
2848328483+--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
2848428484++++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
2848528485+@@ -23,15 +23,13 @@
2848628486+ static const struct data
2848728487+ {
2848828488+ struct v_expm1f_data expm1f_consts;
2848928489+- uint32x4_t halff;
2849028490+ #if WANT_SIMD_EXCEPT
2849128491+ uint32x4_t tiny_bound, thresh;
2849228492+ #else
2849328493+- uint32x4_t oflow_bound;
2849428494++ float32x4_t oflow_bound;
2849528495+ #endif
2849628496+ } data = {
2849728497+ .expm1f_consts = V_EXPM1F_DATA,
2849828498+- .halff = V4 (0x3f000000),
2849928499+ #if WANT_SIMD_EXCEPT
2850028500+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
2850128501+ .tiny_bound = V4 (0x2fb504f4),
2850228502+@@ -39,14 +37,15 @@ static const struct data
2850328503+ .thresh = V4 (0x12fbbbb3),
2850428504+ #else
2850528505+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
2850628506+- .oflow_bound = V4 (0x42b0c0a7),
2850728507++ .oflow_bound = V4 (0x1.61814ep+6),
2850828508+ #endif
2850928509+ };
2851028510+2851128511+ static float32x4_t NOINLINE VPCS_ATTR
2851228512+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2851328513++special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
2851428514++ uint32x4_t special)
2851528515+ {
2851628516+- return v_call_f32 (sinhf, x, y, special);
2851728517++ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
2851828518+ }
2851928519+2852028520+ /* Approximation for vector single-precision sinh(x) using expm1.
2852128521+@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
2852228522+2852328523+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
2852428524+ float32x4_t ax = vabsq_f32 (x);
2852528525+- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
2852628526+- uint32x4_t sign = veorq_u32 (ix, iax);
2852728527+- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
2852828528++ float32x4_t halfsign = vreinterpretq_f32_u32 (
2852928529++ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
2853028530+2853128531+ #if WANT_SIMD_EXCEPT
2853228532+- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
2853328533++ uint32x4_t special = vcgeq_u32 (
2853428534++ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
2853528535+ ax = v_zerofy_f32 (ax, special);
2853628536+ #else
2853728537+- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
2853828538++ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
2853928539+ #endif
2854028540+2854128541+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
2854228542+@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
2854328543+ /* Fall back to the scalar variant for any lanes that should trigger an
2854428544+ exception. */
2854528545+ if (__glibc_unlikely (v_any_u32 (special)))
2854628546+- return special_case (x, vmulq_f32 (t, halfsign), special);
2854728547++ return special_case (x, t, halfsign, special);
2854828548+2854928549+ return vmulq_f32 (t, halfsign);
2855028550+ }
2855128551+diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
2855228552+index 50defd6ef0..3ced9b7a41 100644
2855328553+--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
2855428554++++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
2855528555+@@ -28,13 +28,16 @@ static const struct data
2855628556+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
2855728557+ .boring_bound = V4 (0x41102cb3),
2855828558+ .large_bound = V4 (0x7f800000),
2855928559+- .onef = V4 (0x3f800000),
2856028560+ };
2856128561+2856228562+ static float32x4_t NOINLINE VPCS_ATTR
2856328563+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
2856428564++special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
2856528565++ float32x4_t q, uint32x4_t special)
2856628566+ {
2856728567+- return v_call_f32 (tanhf, x, y, special);
2856828568++ return v_call_f32 (
2856928569++ tanhf, x,
2857028570++ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
2857128571++ special);
2857228572+ }
2857328573+2857428574+ /* Approximation for single-precision vector tanh(x), using a simplified
2857528575+@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
2857628576+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
2857728577+ uint32x4_t sign = veorq_u32 (ix, iax);
2857828578+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
2857928579+- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
2858028580++ /* expm1 exponent bias is 1.0f reinterpreted to int. */
2858128581++ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
2858228582++ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
2858328583+2858428584+ #if WANT_SIMD_EXCEPT
2858528585+ /* If fp exceptions are to be triggered properly, set all special and boring
2858628586+@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
2858728587+2858828588+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
2858928589+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
2859028590+- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
2859128591++
2859228592+ if (__glibc_unlikely (v_any_u32 (special)))
2859328593+- return special_case (vreinterpretq_f32_u32 (ix),
2859428594+- vbslq_f32 (is_boring, boring, y), special);
2859528595++ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
2859628596++ special);
2859728597++
2859828598++ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
2859928599+ return vbslq_f32 (is_boring, boring, y);
2860028600+ }
2860128601+ libmvec_hidden_def (V_NAME_F1 (tanh))
2860228602+diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
2860328603+index 59b552da6b..1daedfdd51 100644
2860428604+--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
2860528605++++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
2860628606+@@ -21,48 +21,47 @@
2860728607+ #define AARCH64_FPU_V_EXPM1F_INLINE_H
2860828608+2860928609+ #include "v_math.h"
2861028610+-#include "poly_advsimd_f32.h"
2861128611++#include "math_config.h"
2861228612+2861328613+ struct v_expm1f_data
2861428614+ {
2861528615+- float32x4_t poly[5];
2861628616+- float invln2_and_ln2[4];
2861728617+- float32x4_t shift;
2861828618++ float32x4_t c0, c2;
2861928619+ int32x4_t exponent_bias;
2862028620++ float c1, c3, inv_ln2, c4;
2862128621++ float ln2_hi, ln2_lo;
2862228622+ };
2862328623+2862428624+ /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
2862528625+- log(2)/2]. Exponent bias is asuint(1.0f).
2862628626+- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
2862728627++ log(2)/2]. Exponent bias is asuint(1.0f). */
2862828628+ #define V_EXPM1F_DATA \
2862928629+ { \
2863028630+- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
2863128631+- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
2863228632+- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
2863328633+- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
2863428634++ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
2863528635++ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
2863628636++ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
2863728637++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
2863828638+ }
2863928639+2864028640+ static inline float32x4_t
2864128641+ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
2864228642+ {
2864328643+- /* Helper routine for calculating exp(x) - 1.
2864428644+- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
2864528645+- calling routine should handle special values if required. */
2864628646++ /* Helper routine for calculating exp(x) - 1. */
2864728647++
2864828648++ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
2864928649++ float32x4_t lane_consts = vld1q_f32 (&d->c1);
2865028650+2865128651+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
2865228652+- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
2865328653+- float32x4_t j
2865428654+- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
2865528655++ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
2865628656+ int32x4_t i = vcvtq_s32_f32 (j);
2865728657+- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
2865828658+- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
2865928659++ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
2866028660++ f = vfmsq_lane_f32 (f, j, ln2, 1);
2866128661+2866228662+- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
2866328663+- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
2866428664+- Horner. */
2866528665++ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
2866628666+ float32x4_t f2 = vmulq_f32 (f, f);
2866728667+ float32x4_t f4 = vmulq_f32 (f2, f2);
2866828668+- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
2866928669++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
2867028670++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
2867128671++ float32x4_t p = vfmaq_f32 (p01, f2, p23);
2867228672++ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
2867328673+ p = vfmaq_f32 (f, f2, p);
2867428674+2867528675+ /* t = 2^i. */
2867628676+2867728677+commit 68f2eb20de698675ddc74068c2cd03fee29207df
2867828678+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2867928679+Date: Mon Sep 23 15:33:31 2024 +0100
2868028680+2868128681+ AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
2868328683+ This operation can be simplified to use a simpler multiply-round-convert
2868428684+ sequence, which uses fewer instructions and constants.
2868528685+2868628686+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2868728687+ (cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb)
2868828688+2868928689+diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
2869028690+index 3924c9ce44..11a89b1530 100644
2869128691+--- a/sysdeps/aarch64/fpu/cos_advsimd.c
2869228692++++ b/sysdeps/aarch64/fpu/cos_advsimd.c
2869328693+@@ -22,7 +22,7 @@
2869428694+ static const struct data
2869528695+ {
2869628696+ float64x2_t poly[7];
2869728697+- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
2869828698++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
2869928699+ } data = {
2870028700+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
2870128701+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
2870228702+@@ -30,11 +30,9 @@ static const struct data
2870328703+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
2870428704+ V2 (-0x1.9e9540300a1p-41) },
2870528705+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
2870628706+- .half_pi = V2 (0x1.921fb54442d18p+0),
2870728707+ .pi_1 = V2 (0x1.921fb54442d18p+1),
2870828708+ .pi_2 = V2 (0x1.1a62633145c06p-53),
2870928709+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
2871028710+- .shift = V2 (0x1.8p52),
2871128711+ .range_val = V2 (0x1p23)
2871228712+ };
2871328713+2871428714+@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
2871528715+ #endif
2871628716+2871728717+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
2871828718+- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
2871928719+- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
2872028720+- n = vsubq_f64 (n, d->shift);
2872128721+- n = vsubq_f64 (n, v_f64 (0.5));
2872228722++ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
2872328723++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
2872428724++ n = vsubq_f64 (n, v_f64 (0.5f));
2872528725+2872628726+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
2872728727+ r = vfmsq_f64 (r, d->pi_1, n);
2872828728+diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
2872928729+index d0c285b03a..85a1b37373 100644
2873028730+--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
2873128731++++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
2873228732+@@ -22,7 +22,7 @@
2873328733+ static const struct data
2873428734+ {
2873528735+ float32x4_t poly[4];
2873628736+- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
2873728737++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
2873828738+ } data = {
2873928739+ /* 1.886 ulp error. */
2874028740+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
2874128741+@@ -33,8 +33,6 @@ static const struct data
2874228742+ .pi_3 = V4 (-0x1.ee59dap-49f),
2874328743+2874428744+ .inv_pi = V4 (0x1.45f306p-2f),
2874528745+- .shift = V4 (0x1.8p+23f),
2874628746+- .half_pi = V4 (0x1.921fb6p0f),
2874728747+ .range_val = V4 (0x1p20f)
2874828748+ };
2874928749+2875028750+@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
2875128751+ #endif
2875228752+2875328753+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
2875428754+- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
2875528755+- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
2875628756+- n = vsubq_f32 (n, d->shift);
2875728757++ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
2875828758++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
2875928759+ n = vsubq_f32 (n, v_f32 (0.5f));
2876028760+2876128761+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
2876228762+diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
2876328763+index 99d2e647aa..5c9cb72620 100644
2876428764+--- a/sysdeps/aarch64/fpu/expf_advsimd.c
2876528765++++ b/sysdeps/aarch64/fpu/expf_advsimd.c
2876628766+@@ -22,7 +22,7 @@
2876728767+ static const struct data
2876828768+ {
2876928769+ float32x4_t poly[5];
2877028770+- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
2877128771++ float32x4_t inv_ln2, ln2_hi, ln2_lo;
2877228772+ uint32x4_t exponent_bias;
2877328773+ #if !WANT_SIMD_EXCEPT
2877428774+ float32x4_t special_bound, scale_thresh;
2877528775+@@ -31,7 +31,6 @@ static const struct data
2877628776+ /* maxerr: 1.45358 +0.5 ulp. */
2877728777+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
2877828778+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
2877928779+- .shift = V4 (0x1.8p23f),
2878028780+ .inv_ln2 = V4 (0x1.715476p+0f),
2878128781+ .ln2_hi = V4 (0x1.62e4p-1f),
2878228782+ .ln2_lo = V4 (0x1.7f7d1cp-20f),
2878328783+@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
2878428784+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
2878528785+ {
2878628786+ const struct data *d = ptr_barrier (&data);
2878728787+- float32x4_t n, r, r2, scale, p, q, poly, z;
2878828788++ float32x4_t n, r, r2, scale, p, q, poly;
2878928789+ uint32x4_t cmp, e;
2879028790+2879128791+ #if WANT_SIMD_EXCEPT
2879228792+@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
2879328793+2879428794+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
2879528795+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
2879628796+- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
2879728797+- n = vsubq_f32 (z, d->shift);
2879828798++ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
2879928799+ r = vfmsq_f32 (x, n, d->ln2_hi);
2880028800+ r = vfmsq_f32 (r, n, d->ln2_lo);
2880128801+- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
2880228802++ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
2880328803+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
2880428804+2880528805+ #if !WANT_SIMD_EXCEPT
2880628806+diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
2880728807+index a0d9d3b819..718125cbad 100644
2880828808+--- a/sysdeps/aarch64/fpu/sin_advsimd.c
2880928809++++ b/sysdeps/aarch64/fpu/sin_advsimd.c
2881028810+@@ -22,7 +22,7 @@
2881128811+ static const struct data
2881228812+ {
2881328813+ float64x2_t poly[7];
2881428814+- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
2881528815++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
2881628816+ } data = {
2881728817+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
2881828818+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
2881928819+@@ -34,12 +34,13 @@ static const struct data
2882028820+ .pi_1 = V2 (0x1.921fb54442d18p+1),
2882128821+ .pi_2 = V2 (0x1.1a62633145c06p-53),
2882228822+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
2882328823+- .shift = V2 (0x1.8p52),
2882428824+ };
2882528825+2882628826+ #if WANT_SIMD_EXCEPT
2882728827+-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
2882828828+-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
2882928829++/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
2883028830++# define TinyBound v_u64 (0x3020000000000000)
2883128831++/* RangeVal - TinyBound. */
2883228832++# define Thresh v_u64 (0x1160000000000000)
2883328833+ #endif
2883428834+2883528835+ #define C(i) d->poly[i]
2883628836+@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
2883728837+ fenv). These lanes will be fixed by special-case handler later. */
2883828838+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
2883928839+ cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
2884028840+- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
2884128841++ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
2884228842+ #else
2884328843+ r = x;
2884428844+ cmp = vcageq_f64 (x, d->range_val);
2884528845+ #endif
2884628846+2884728847+ /* n = rint(|x|/pi). */
2884828848+- n = vfmaq_f64 (d->shift, d->inv_pi, r);
2884928849+- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
2885028850+- n = vsubq_f64 (n, d->shift);
2885128851++ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
2885228852++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
2885328853+2885428854+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
2885528855+ r = vfmsq_f64 (r, d->pi_1, n);
2885628856+diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
2885728857+index 375dfc3331..6ee9a23d5b 100644
2885828858+--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
2885928859++++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
2886028860+@@ -22,7 +22,7 @@
2886128861+ static const struct data
2886228862+ {
2886328863+ float32x4_t poly[4];
2886428864+- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
2886528865++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
2886628866+ } data = {
2886728867+ /* 1.886 ulp error. */
2886828868+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
2886928869+@@ -33,13 +33,14 @@ static const struct data
2887028870+ .pi_3 = V4 (-0x1.ee59dap-49f),
2887128871+2887228872+ .inv_pi = V4 (0x1.45f306p-2f),
2887328873+- .shift = V4 (0x1.8p+23f),
2887428874+ .range_val = V4 (0x1p20f)
2887528875+ };
2887628876+2887728877+ #if WANT_SIMD_EXCEPT
2887828878+-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
2887928879+-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
2888028880++/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
2888128881++# define TinyBound v_u32 (0x22000000)
2888228882++/* RangeVal - TinyBound. */
2888328883++# define Thresh v_u32 (0x27800000)
2888428884+ #endif
2888528885+2888628886+ #define C(i) d->poly[i]
2888728887+@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
2888828888+ /* If fenv exceptions are to be triggered correctly, set any special lanes
2888928889+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
2889028890+ special-case handler later. */
2889128891+- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
2889228892++ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
2889328893+ #else
2889428894+ r = x;
2889528895+ cmp = vcageq_f32 (x, d->range_val);
2889628896+ #endif
2889728897+2889828898+- /* n = rint(|x|/pi) */
2889928899+- n = vfmaq_f32 (d->shift, d->inv_pi, r);
2890028900+- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
2890128901+- n = vsubq_f32 (n, d->shift);
2890228902++ /* n = rint(|x|/pi). */
2890328903++ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
2890428904++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
2890528905+2890628906+- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
2890728907++ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
2890828908+ r = vfmsq_f32 (r, d->pi_1, n);
2890928909+ r = vfmsq_f32 (r, d->pi_2, n);
2891028910+ r = vfmsq_f32 (r, d->pi_3, n);
2891128911+2891228912+- /* y = sin(r) */
2891328913++ /* y = sin(r). */
2891428914+ r2 = vmulq_f32 (r, r);
2891528915+ y = vfmaq_f32 (C (2), C (3), r2);
2891628916+ y = vfmaq_f32 (C (1), y, r2);
2891728917+2891828918+commit 9ff7559b274eb0dbce2cbcf87284c1d30d47a2d6
2891928919+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2892028920+Date: Mon Oct 28 14:58:35 2024 +0000
2892128921+2892228922+ AArch64: Small optimisation in AdvSIMD erf and erfc
2892328923+2892428924+ In both routines, reduce register pressure such that GCC 14 emits no
2892528925+ spills for erf and fewer spills for erfc. Also use more efficient
2892628926+ comparison for the special-case in erf.
2892728927+2892828928+ Benchtests show erf improves by 6.4%, erfc by 1.0%.
2892928929+2893028930+ (cherry picked from commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252)
2893128931+2893228932+diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
2893328933+index 19cbb7d0f4..c0116735e4 100644
2893428934+--- a/sysdeps/aarch64/fpu/erf_advsimd.c
2893528935++++ b/sysdeps/aarch64/fpu/erf_advsimd.c
2893628936+@@ -22,19 +22,21 @@
2893728937+ static const struct data
2893828938+ {
2893928939+ float64x2_t third;
2894028940+- float64x2_t tenth, two_over_five, two_over_fifteen;
2894128941+- float64x2_t two_over_nine, two_over_fortyfive;
2894228942++ float64x2_t tenth, two_over_five, two_over_nine;
2894328943++ double two_over_fifteen, two_over_fortyfive;
2894428944+ float64x2_t max, shift;
2894528945++ uint64x2_t max_idx;
2894628946+ #if WANT_SIMD_EXCEPT
2894728947+ float64x2_t tiny_bound, huge_bound, scale_minus_one;
2894828948+ #endif
2894928949+ } data = {
2895028950++ .max_idx = V2 (768),
2895128951+ .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
2895228952+- .two_over_fifteen = V2 (0x1.1111111111111p-3),
2895328953++ .two_over_fifteen = 0x1.1111111111111p-3,
2895428954+ .tenth = V2 (-0x1.999999999999ap-4),
2895528955+ .two_over_five = V2 (-0x1.999999999999ap-2),
2895628956+ .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
2895728957+- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
2895828958++ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
2895928959+ .max = V2 (5.9921875), /* 6 - 1/128. */
2896028960+ .shift = V2 (0x1p45),
2896128961+ #if WANT_SIMD_EXCEPT
2896228962+@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
2896328963+ float64x2_t a = vabsq_f64 (x);
2896428964+ /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
2896528965+ to return expected results. */
2896628966+- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
2896728967+- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
2896828968++ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
2896928969++ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
2897028970+2897128971+ #if WANT_SIMD_EXCEPT
2897228972+ /* |x| huge or tiny. */
2897328973+@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
2897428974+ segfault. */
2897528975+ uint64x2_t i
2897628976+ = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
2897728977+- i = vbslq_u64 (a_le_max, i, v_u64 (768));
2897828978++ i = vbslq_u64 (a_le_max, i, dat->max_idx);
2897928979+ struct entry e = lookup (i);
2898028980+2898128981+ float64x2_t r = vsubq_f64 (z, shift);
2898228982+@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
2898328983+ float64x2_t d2 = vmulq_f64 (d, d);
2898428984+ float64x2_t r2 = vmulq_f64 (r, r);
2898528985+2898628986++ float64x2_t two_over_fifteen_and_fortyfive
2898728987++ = vld1q_f64 (&dat->two_over_fifteen);
2898828988++
2898928989+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
2899028990+ float64x2_t p1 = r;
2899128991+ float64x2_t p2
2899228992+ = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
2899328993+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
2899428994+- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
2899528995++ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
2899628996++ two_over_fifteen_and_fortyfive, 0);
2899728997+ p4 = vfmsq_f64 (dat->tenth, r2, p4);
2899828998+- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
2899928999++ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
2900029000++ two_over_fifteen_and_fortyfive, 1);
2900129001+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
2900229002+2900329003+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
2900429004+diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
2900529005+index f1b3bfe830..2f2f755c46 100644
2900629006+--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
2900729007++++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
2900829008+@@ -24,8 +24,8 @@ static const struct data
2900929009+ {
2901029010+ uint64x2_t offset, table_scale;
2901129011+ float64x2_t max, shift;
2901229012+- float64x2_t p20, p40, p41, p42;
2901329013+- float64x2_t p51, p52;
2901429014++ float64x2_t p20, p40, p41, p51;
2901529015++ double p42, p52;
2901629016+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
2901729017+ #if WANT_SIMD_EXCEPT
2901829018+ float64x2_t uflow_bound;
2901929019+@@ -41,9 +41,9 @@ static const struct data
2902029020+ .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
2902129021+ .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
2902229022+ .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
2902329023+- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
2902429024++ .p42 = 0x1.1111111111111p-3, /* 2/15. */
2902529025+ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
2902629026+- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
2902729027++ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
2902829028+ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
2902929029+ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
2903029030+ .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
2903129031+@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
2903229032+ float64x2_t p1 = r;
2903329033+ float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
2903429034+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
2903529035+- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
2903629036++ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
2903729037++ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
2903829038+ p4 = vfmsq_f64 (dat->p40, r2, p4);
2903929039+- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
2904029040++ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
2904129041+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
2904229042+ /* Compute p_i using recurrence relation:
2904329043+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
2904429044+2904529045+commit 76c923fe9d09befc8131205659d99cb9ac97460a
2904629046+Author: Joe Ramsay <Joe.Ramsay@arm.com>
2904729047+Date: Fri Nov 1 15:48:54 2024 +0000
2904829048+2904929049+ AArch64: Remove SVE erf and erfc tables
2905029050+2905129051+ By using a combination of mask-and-add instead of the shift-based
2905229052+ index calculation the routines can share the same table as other
2905329053+ variants with no performance degradation.
2905429054+2905529055+ The tables change name because of other changes in downstream AOR.
2905629056+2905729057+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2905829058+ (cherry picked from commit 2d82d781a539ce8e82178fc1fa2c99ae1884e7fe)
2905929059+2906029060+diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
2906129061+index 234a6c457c..be8541f649 100644
2906229062+--- a/sysdeps/aarch64/fpu/Makefile
2906329063++++ b/sysdeps/aarch64/fpu/Makefile
2906429064+@@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
2906529065+ v_log10_data \
2906629066+ erf_data \
2906729067+ erff_data \
2906829068+- sv_erf_data \
2906929069+- sv_erff_data \
2907029070+ v_exp_tail_data \
2907129071+ erfc_data \
2907229072+ erfcf_data \
2907329073+diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
2907429074+index c0116735e4..a48092e838 100644
2907529075+--- a/sysdeps/aarch64/fpu/erf_advsimd.c
2907629076++++ b/sysdeps/aarch64/fpu/erf_advsimd.c
2907729077+@@ -58,8 +58,8 @@ static inline struct entry
2907829078+ lookup (uint64x2_t i)
2907929079+ {
2908029080+ struct entry e;
2908129081+- float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
2908229082+- e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
2908329083++ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
2908429084++ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
2908529085+ e.erf = vuzp1q_f64 (e1, e2);
2908629086+ e.scale = vuzp2q_f64 (e1, e2);
2908729087+ return e;
2908829088+diff --git a/sysdeps/aarch64/fpu/erf_data.c b/sysdeps/aarch64/fpu/erf_data.c
2908929089+index 6d2dcd235c..ea01fad7ca 100644
2909029090+--- a/sysdeps/aarch64/fpu/erf_data.c
2909129091++++ b/sysdeps/aarch64/fpu/erf_data.c
2909229092+@@ -19,14 +19,14 @@
2909329093+2909429094+ #include "vecmath_config.h"
2909529095+2909629096+-/* Lookup table used in erf.
2909729097++/* Lookup table used in vector erf.
2909829098+ For each possible rounded input r (multiples of 1/128), between
2909929099+ r = 0.0 and r = 6.0 (769 values):
2910029100+- - the first entry __erff_data.tab.erf contains the values of erf(r),
2910129101+- - the second entry __erff_data.tab.scale contains the values of
2910229102++ - the first entry __v_erf_data.tab.erf contains the values of erf(r),
2910329103++ - the second entry __v_erf_data.tab.scale contains the values of
2910429104+ 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
2910529105+ algorithm, since lookup is performed only for x >= 1/64-1/512. */
2910629106+-const struct erf_data __erf_data = {
2910729107++const struct v_erf_data __v_erf_data = {
2910829108+ .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
2910929109+ { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
2911029110+ { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
2911129111+diff --git a/sysdeps/aarch64/fpu/erf_sve.c b/sysdeps/aarch64/fpu/erf_sve.c
2911229112+index 7d51417406..671d55a02b 100644
2911329113+--- a/sysdeps/aarch64/fpu/erf_sve.c
2911429114++++ b/sysdeps/aarch64/fpu/erf_sve.c
2911529115+@@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
2911629116+ svfloat64_t a = svabs_x (pg, x);
2911729117+ svfloat64_t shift = sv_f64 (dat->shift);
2911829118+ svfloat64_t z = svadd_x (pg, a, shift);
2911929119+- svuint64_t i
2912029120+- = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
2912129121++ svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
2912229122++ i = svadd_x (pg, i, i);
2912329123+2912429124+ /* Lookup without shortcut for small values but with predicate to avoid
2912529125+ segfault for large values and NaNs. */
2912629126+ svfloat64_t r = svsub_x (pg, z, shift);
2912729127+- svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
2912829128+- svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
2912929129++ svfloat64_t erfr
2913029130++ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
2913129131++ svfloat64_t scale
2913229132++ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
2913329133+2913429134+ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
2913529135+ svfloat64_t d = svsub_x (pg, a, r);
2913629136+diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
2913729137+index 2f2f755c46..d05eac61a2 100644
2913829138+--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
2913929139++++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
2914029140+@@ -69,9 +69,9 @@ lookup (uint64x2_t i)
2914129141+ {
2914229142+ struct entry e;
2914329143+ float64x2_t e1
2914429144+- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
2914529145++ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
2914629146+ float64x2_t e2
2914729147+- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
2914829148++ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
2914929149+ e.erfc = vuzp1q_f64 (e1, e2);
2915029150+ e.scale = vuzp2q_f64 (e1, e2);
2915129151+ return e;
2915229152+diff --git a/sysdeps/aarch64/fpu/erfc_data.c b/sysdeps/aarch64/fpu/erfc_data.c
2915329153+index 76a94e4681..8dc6a8c42c 100644
2915429154+--- a/sysdeps/aarch64/fpu/erfc_data.c
2915529155++++ b/sysdeps/aarch64/fpu/erfc_data.c
2915629156+@@ -19,14 +19,14 @@
2915729157+2915829158+ #include "vecmath_config.h"
2915929159+2916029160+-/* Lookup table used in erfc.
2916129161++/* Lookup table used in vector erfc.
2916229162+ For each possible rounded input r (multiples of 1/128), between
2916329163+ r = 0.0 and r = ~27.0 (3488 values):
2916429164+- - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
2916529165+- - the second entry __erfc_data.tab.scale contains the values of
2916629166++ - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
2916729167++ - the second entry __v_erfc_data.tab.scale contains the values of
2916829168+ 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
2916929169+ they are scaled by a large enough value 2^128 (fits in 8bit). */
2917029170+-const struct erfc_data __erfc_data = {
2917129171++const struct v_erfc_data __v_erfc_data = {
2917229172+ .tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
2917329173+ { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
2917429174+ { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
2917529175+diff --git a/sysdeps/aarch64/fpu/erfc_sve.c b/sysdeps/aarch64/fpu/erfc_sve.c
2917629176+index c17d3e4484..703926ee41 100644
2917729177+--- a/sysdeps/aarch64/fpu/erfc_sve.c
2917829178++++ b/sysdeps/aarch64/fpu/erfc_sve.c
2917929179+@@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
2918029180+2918129181+ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
2918229182+ i = svadd_x (pg, i, i);
2918329183+- const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
2918429184++ const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
2918529185+ svfloat64_t erfcr = svld1_gather_index (pg, p, i);
2918629186+ svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
2918729187+2918829188+diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c
2918929189+index ca5bc3ab33..59b0b0d64b 100644
2919029190+--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
2919129191++++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
2919229192+@@ -62,13 +62,13 @@ lookup (uint32x4_t i)
2919329193+ {
2919429194+ struct entry e;
2919529195+ float32x2_t t0
2919629196+- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
2919729197++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
2919829198+ float32x2_t t1
2919929199+- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
2920029200++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
2920129201+ float32x2_t t2
2920229202+- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
2920329203++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
2920429204+ float32x2_t t3
2920529205+- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
2920629206++ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
2920729207+ float32x4_t e1 = vcombine_f32 (t0, t1);
2920829208+ float32x4_t e2 = vcombine_f32 (t2, t3);
2920929209+ e.erfc = vuzp1q_f32 (e1, e2);
2921029210+diff --git a/sysdeps/aarch64/fpu/erfcf_data.c b/sysdeps/aarch64/fpu/erfcf_data.c
2921129211+index 77fb889a78..d45087bbb9 100644
2921229212+--- a/sysdeps/aarch64/fpu/erfcf_data.c
2921329213++++ b/sysdeps/aarch64/fpu/erfcf_data.c
2921429214+@@ -19,14 +19,14 @@
2921529215+2921629216+ #include "vecmath_config.h"
2921729217+2921829218+-/* Lookup table used in erfcf.
2921929219++/* Lookup table used in vector erfcf.
2922029220+ For each possible rounded input r (multiples of 1/64), between
2922129221+ r = 0.0 and r = 10.0625 (645 values):
2922229222+- - the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
2922329223+- - the second entry __erfcf_data.tab.scale contains the values of
2922429224++ - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
2922529225++ - the second entry __v_erfcf_data.tab.scale contains the values of
2922629226+ 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
2922729227+ they are scaled by a large enough value 2^47 (fits in 8 bits). */
2922829228+-const struct erfcf_data __erfcf_data = {
2922929229++const struct v_erfcf_data __v_erfcf_data = {
2923029230+ .tab = { { 0x1p47, 0x1.20dd76p47 },
2923129231+ { 0x1.f6f944p46, 0x1.20cb68p47 },
2923229232+ { 0x1.edf3aap46, 0x1.209546p47 },
2923329233+diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
2923429234+index 48d1677eb4..ecacb933ac 100644
2923529235+--- a/sysdeps/aarch64/fpu/erfcf_sve.c
2923629236++++ b/sysdeps/aarch64/fpu/erfcf_sve.c
2923729237+@@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
2923829238+2923929239+ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
2924029240+ i = svmul_x (pg, i, 2);
2924129241+- const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
2924229242++ const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
2924329243+ svfloat32_t erfcr = svld1_gather_index (pg, p, i);
2924429244+ svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
2924529245+2924629246+diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c
2924729247+index f2fe6ff236..db39e789b6 100644
2924829248+--- a/sysdeps/aarch64/fpu/erff_advsimd.c
2924929249++++ b/sysdeps/aarch64/fpu/erff_advsimd.c
2925029250+@@ -47,10 +47,10 @@ static inline struct entry
2925129251+ lookup (uint32x4_t i)
2925229252+ {
2925329253+ struct entry e;
2925429254+- float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
2925529255+- float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
2925629256+- float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
2925729257+- float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
2925829258++ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
2925929259++ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
2926029260++ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
2926129261++ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
2926229262+ float32x4_t e1 = vcombine_f32 (t0, t1);
2926329263+ float32x4_t e2 = vcombine_f32 (t2, t3);
2926429264+ e.erf = vuzp1q_f32 (e1, e2);
2926529265+diff --git a/sysdeps/aarch64/fpu/erff_data.c b/sysdeps/aarch64/fpu/erff_data.c
2926629266+index 9a32940915..da38aed205 100644
2926729267+--- a/sysdeps/aarch64/fpu/erff_data.c
2926829268++++ b/sysdeps/aarch64/fpu/erff_data.c
2926929269+@@ -19,14 +19,14 @@
2927029270+2927129271+ #include "vecmath_config.h"
2927229272+2927329273+-/* Lookup table used in erff.
2927429274++/* Lookup table used in vector erff.
2927529275+ For each possible rounded input r (multiples of 1/128), between
2927629276+ r = 0.0 and r = 4.0 (513 values):
2927729277+- - the first entry __erff_data.tab.erf contains the values of erf(r),
2927829278+- - the second entry __erff_data.tab.scale contains the values of
2927929279++ - the first entry __v_erff_data.tab.erf contains the values of erf(r),
2928029280++ - the second entry __v_erff_data.tab.scale contains the values of
2928129281+ 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
2928229282+ algorithm, since lookup is performed only for x >= 1/64-1/512. */
2928329283+-const struct erff_data __erff_data = {
2928429284++const struct v_erff_data __v_erff_data = {
2928529285+ .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
2928629286+ { 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
2928729287+ { 0x1.20d770p-6, 0x1.20cb68p+0 },
2928829288+diff --git a/sysdeps/aarch64/fpu/erff_sve.c b/sysdeps/aarch64/fpu/erff_sve.c
2928929289+index 38f00db9be..0e382eb09a 100644
2929029290+--- a/sysdeps/aarch64/fpu/erff_sve.c
2929129291++++ b/sysdeps/aarch64/fpu/erff_sve.c
2929229292+@@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
2929329293+2929429294+ svfloat32_t shift = sv_f32 (dat->shift);
2929529295+ svfloat32_t z = svadd_x (pg, a, shift);
2929629296+- svuint32_t i
2929729297+- = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
2929829298+-
2929929299+- /* Saturate lookup index. */
2930029300+- i = svsel (a_ge_max, sv_u32 (512), i);
2930129301++ svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
2930229302++ i = svadd_x (pg, i, i);
2930329303+2930429304+ /* r and erf(r) set to 0 for |x| below min. */
2930529305+ svfloat32_t r = svsub_z (a_gt_min, z, shift);
2930629306+- svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
2930729307++ svfloat32_t erfr
2930829308++ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
2930929309+2931029310+ /* scale set to 2/sqrt(pi) for |x| below min. */
2931129311+- svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
2931229312++ svfloat32_t scale
2931329313++ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
2931429314+ scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
2931529315+2931629316+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */
2931729317+diff --git a/sysdeps/aarch64/fpu/sv_erf_data.c b/sysdeps/aarch64/fpu/sv_erf_data.c
2931829318+deleted file mode 100644
2931929319+index a53878f893..0000000000
2932029320+--- a/sysdeps/aarch64/fpu/sv_erf_data.c
2932129321++++ /dev/null
2932229322+@@ -1,1570 +0,0 @@
2932329323+-/* Table for SVE erf approximation
2932429324+-
2932529325+- Copyright (C) 2024 Free Software Foundation, Inc.
2932629326+- This file is part of the GNU C Library.
2932729327+-
2932829328+- The GNU C Library is free software; you can redistribute it and/or
2932929329+- modify it under the terms of the GNU Lesser General Public
2933029330+- License as published by the Free Software Foundation; either
2933129331+- version 2.1 of the License, or (at your option) any later version.
2933229332+-
2933329333+- The GNU C Library is distributed in the hope that it will be useful,
2933429334+- but WITHOUT ANY WARRANTY; without even the implied warranty of
2933529335+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2933629336+- Lesser General Public License for more details.
2933729337+-
2933829338+- You should have received a copy of the GNU Lesser General Public
2933929339+- License along with the GNU C Library; if not, see
2934029340+- <https://www.gnu.org/licenses/>. */
2934129341+-
2934229342+-#include "vecmath_config.h"
2934329343+-
2934429344+-/* Lookup table used in vector erf.
2934529345+- For each possible rounded input r (multiples of 1/128), between
2934629346+- r = 0.0 and r = 6.0 (769 values):
2934729347+- - the first entry __erf_data.tab.erf contains the values of erf(r),
2934829348+- - the second entry __erf_data.tab.scale contains the values of
2934929349+- 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
2935029350+- algorithm, since lookup is performed only for x >= 1/64-1/512. */
2935129351+-const struct sv_erf_data __sv_erf_data = {
2935229352+- .erf = { 0x0.0000000000000p+0,
2935329353+- 0x1.20dbf3deb1340p-7,
2935429354+- 0x1.20d77083f17a0p-6,
2935529355+- 0x1.b137e0cf584dcp-6,
2935629356+- 0x1.20c5645dd2538p-5,
2935729357+- 0x1.68e5d3bbc9526p-5,
2935829358+- 0x1.b0fafef135745p-5,
2935929359+- 0x1.f902a77bd3821p-5,
2936029360+- 0x1.207d480e90658p-4,
2936129361+- 0x1.44703e87e8593p-4,
2936229362+- 0x1.68591a1e83b5dp-4,
2936329363+- 0x1.8c36beb8a8d23p-4,
2936429364+- 0x1.b0081148a873ap-4,
2936529365+- 0x1.d3cbf7e70a4b3p-4,
2936629366+- 0x1.f78159ec8bb50p-4,
2936729367+- 0x1.0d939005f65e5p-3,
2936829368+- 0x1.1f5e1a35c3b89p-3,
2936929369+- 0x1.311fc15f56d14p-3,
2937029370+- 0x1.42d7fc2f64959p-3,
2937129371+- 0x1.548642321d7c6p-3,
2937229372+- 0x1.662a0bdf7a89fp-3,
2937329373+- 0x1.77c2d2a765f9ep-3,
2937429374+- 0x1.895010fdbdbfdp-3,
2937529375+- 0x1.9ad142662e14dp-3,
2937629376+- 0x1.ac45e37fe2526p-3,
2937729377+- 0x1.bdad72110a648p-3,
2937829378+- 0x1.cf076d1233237p-3,
2937929379+- 0x1.e05354b96ff36p-3,
2938029380+- 0x1.f190aa85540e2p-3,
2938129381+- 0x1.015f78a3dcf3dp-2,
2938229382+- 0x1.09eed6982b948p-2,
2938329383+- 0x1.127631eb8de32p-2,
2938429384+- 0x1.1af54e232d609p-2,
2938529385+- 0x1.236bef825d9a2p-2,
2938629386+- 0x1.2bd9db0f7827fp-2,
2938729387+- 0x1.343ed6989b7d9p-2,
2938829388+- 0x1.3c9aa8b84bedap-2,
2938929389+- 0x1.44ed18d9f6462p-2,
2939029390+- 0x1.4d35ef3e5372ep-2,
2939129391+- 0x1.5574f4ffac98ep-2,
2939229392+- 0x1.5da9f415ff23fp-2,
2939329393+- 0x1.65d4b75b00471p-2,
2939429394+- 0x1.6df50a8dff772p-2,
2939529395+- 0x1.760aba57a76bfp-2,
2939629396+- 0x1.7e15944d9d3e4p-2,
2939729397+- 0x1.861566f5fd3c0p-2,
2939829398+- 0x1.8e0a01cab516bp-2,
2939929399+- 0x1.95f3353cbb146p-2,
2940029400+- 0x1.9dd0d2b721f39p-2,
2940129401+- 0x1.a5a2aca209394p-2,
2940229402+- 0x1.ad68966569a87p-2,
2940329403+- 0x1.b522646bbda68p-2,
2940429404+- 0x1.bccfec24855b8p-2,
2940529405+- 0x1.c4710406a65fcp-2,
2940629406+- 0x1.cc058392a6d2dp-2,
2940729407+- 0x1.d38d4354c3bd0p-2,
2940829408+- 0x1.db081ce6e2a48p-2,
2940929409+- 0x1.e275eaf25e458p-2,
2941029410+- 0x1.e9d68931ae650p-2,
2941129411+- 0x1.f129d471eabb1p-2,
2941229412+- 0x1.f86faa9428f9dp-2,
2941329413+- 0x1.ffa7ea8eb5fd0p-2,
2941429414+- 0x1.03693a371519cp-1,
2941529415+- 0x1.06f794ab2cae7p-1,
2941629416+- 0x1.0a7ef5c18edd2p-1,
2941729417+- 0x1.0dff4f247f6c6p-1,
2941829418+- 0x1.1178930ada115p-1,
2941929419+- 0x1.14eab43841b55p-1,
2942029420+- 0x1.1855a5fd3dd50p-1,
2942129421+- 0x1.1bb95c3746199p-1,
2942229422+- 0x1.1f15cb50bc4dep-1,
2942329423+- 0x1.226ae840d4d70p-1,
2942429424+- 0x1.25b8a88b6dd7fp-1,
2942529425+- 0x1.28ff0240d52cdp-1,
2942629426+- 0x1.2c3debfd7d6c1p-1,
2942729427+- 0x1.2f755ce9a21f4p-1,
2942829428+- 0x1.32a54cb8db67bp-1,
2942929429+- 0x1.35cdb3a9a144dp-1,
2943029430+- 0x1.38ee8a84beb71p-1,
2943129431+- 0x1.3c07ca9cb4f9ep-1,
2943229432+- 0x1.3f196dcd0f135p-1,
2943329433+- 0x1.42236e79a5fa6p-1,
2943429434+- 0x1.4525c78dd5966p-1,
2943529435+- 0x1.4820747ba2dc2p-1,
2943629436+- 0x1.4b13713ad3513p-1,
2943729437+- 0x1.4dfeba47f63ccp-1,
2943829438+- 0x1.50e24ca35fd2cp-1,
2943929439+- 0x1.53be25d016a4fp-1,
2944029440+- 0x1.569243d2b3a9bp-1,
2944129441+- 0x1.595ea53035283p-1,
2944229442+- 0x1.5c2348ecc4dc3p-1,
2944329443+- 0x1.5ee02e8a71a53p-1,
2944429444+- 0x1.61955607dd15dp-1,
2944529445+- 0x1.6442bfdedd397p-1,
2944629446+- 0x1.66e86d0312e82p-1,
2944729447+- 0x1.69865ee075011p-1,
2944829448+- 0x1.6c1c9759d0e5fp-1,
2944929449+- 0x1.6eab18c74091bp-1,
2945029450+- 0x1.7131e5f496a5ap-1,
2945129451+- 0x1.73b1021fc0cb8p-1,
2945229452+- 0x1.762870f720c6fp-1,
2945329453+- 0x1.78983697dc96fp-1,
2945429454+- 0x1.7b00578c26037p-1,
2945529455+- 0x1.7d60d8c979f7bp-1,
2945629456+- 0x1.7fb9bfaed8078p-1,
2945729457+- 0x1.820b1202f27fbp-1,
2945829458+- 0x1.8454d5f25760dp-1,
2945929459+- 0x1.8697120d92a4ap-1,
2946029460+- 0x1.88d1cd474a2e0p-1,
2946129461+- 0x1.8b050ef253c37p-1,
2946229462+- 0x1.8d30debfc572ep-1,
2946329463+- 0x1.8f5544bd00c04p-1,
2946429464+- 0x1.91724951b8fc6p-1,
2946529465+- 0x1.9387f53df5238p-1,
2946629466+- 0x1.959651980da31p-1,
2946729467+- 0x1.979d67caa6631p-1,
2946829468+- 0x1.999d4192a5715p-1,
2946929469+- 0x1.9b95e8fd26abap-1,
2947029470+- 0x1.9d8768656cc42p-1,
2947129471+- 0x1.9f71ca72cffb6p-1,
2947229472+- 0x1.a1551a16aaeafp-1,
2947329473+- 0x1.a331628a45b92p-1,
2947429474+- 0x1.a506af4cc00f4p-1,
2947529475+- 0x1.a6d50c20fa293p-1,
2947629476+- 0x1.a89c850b7d54dp-1,
2947729477+- 0x1.aa5d265064366p-1,
2947829478+- 0x1.ac16fc7143263p-1,
2947929479+- 0x1.adca142b10f98p-1,
2948029480+- 0x1.af767a741088bp-1,
2948129481+- 0x1.b11c3c79bb424p-1,
2948229482+- 0x1.b2bb679ead19cp-1,
2948329483+- 0x1.b4540978921eep-1,
2948429484+- 0x1.b5e62fce16095p-1,
2948529485+- 0x1.b771e894d602ep-1,
2948629486+- 0x1.b8f741ef54f83p-1,
2948729487+- 0x1.ba764a2af2b78p-1,
2948829488+- 0x1.bbef0fbde6221p-1,
2948929489+- 0x1.bd61a1453ab44p-1,
2949029490+- 0x1.bece0d82d1a5cp-1,
2949129491+- 0x1.c034635b66e23p-1,
2949229492+- 0x1.c194b1d49a184p-1,
2949329493+- 0x1.c2ef0812fc1bdp-1,
2949429494+- 0x1.c443755820d64p-1,
2949529495+- 0x1.c5920900b5fd1p-1,
2949629496+- 0x1.c6dad2829ec62p-1,
2949729497+- 0x1.c81de16b14cefp-1,
2949829498+- 0x1.c95b455cce69dp-1,
2949929499+- 0x1.ca930e0e2a825p-1,
2950029500+- 0x1.cbc54b476248dp-1,
2950129501+- 0x1.ccf20ce0c0d27p-1,
2950229502+- 0x1.ce1962c0e0d8bp-1,
2950329503+- 0x1.cf3b5cdaf0c39p-1,
2950429504+- 0x1.d0580b2cfd249p-1,
2950529505+- 0x1.d16f7dbe41ca0p-1,
2950629506+- 0x1.d281c49d818d0p-1,
2950729507+- 0x1.d38eefdf64fddp-1,
2950829508+- 0x1.d4970f9ce00d9p-1,
2950929509+- 0x1.d59a33f19ed42p-1,
2951029510+- 0x1.d6986cfa798e7p-1,
2951129511+- 0x1.d791cad3eff01p-1,
2951229512+- 0x1.d8865d98abe01p-1,
2951329513+- 0x1.d97635600bb89p-1,
2951429514+- 0x1.da61623cb41e0p-1,
2951529515+- 0x1.db47f43b2980dp-1,
2951629516+- 0x1.dc29fb60715afp-1,
2951729517+- 0x1.dd0787a8bb39dp-1,
2951829518+- 0x1.dde0a90611a0dp-1,
2951929519+- 0x1.deb56f5f12d28p-1,
2952029520+- 0x1.df85ea8db188ep-1,
2952129521+- 0x1.e0522a5dfda73p-1,
2952229522+- 0x1.e11a3e8cf4eb8p-1,
2952329523+- 0x1.e1de36c75ba58p-1,
2952429524+- 0x1.e29e22a89d766p-1,
2952529525+- 0x1.e35a11b9b61cep-1,
2952629526+- 0x1.e4121370224ccp-1,
2952729527+- 0x1.e4c6372cd8927p-1,
2952829528+- 0x1.e5768c3b4a3fcp-1,
2952929529+- 0x1.e62321d06c5e0p-1,
2953029530+- 0x1.e6cc0709c8a0dp-1,
2953129531+- 0x1.e7714aec96534p-1,
2953229532+- 0x1.e812fc64db369p-1,
2953329533+- 0x1.e8b12a44944a8p-1,
2953429534+- 0x1.e94be342e6743p-1,
2953529535+- 0x1.e9e335fb56f87p-1,
2953629536+- 0x1.ea7730ed0bbb9p-1,
2953729537+- 0x1.eb07e27a133aap-1,
2953829538+- 0x1.eb9558e6b42cep-1,
2953929539+- 0x1.ec1fa258c4beap-1,
2954029540+- 0x1.eca6ccd709544p-1,
2954129541+- 0x1.ed2ae6489ac1ep-1,
2954229542+- 0x1.edabfc7453e63p-1,
2954329543+- 0x1.ee2a1d004692cp-1,
2954429544+- 0x1.eea5557137ae0p-1,
2954529545+- 0x1.ef1db32a2277cp-1,
2954629546+- 0x1.ef93436bc2daap-1,
2954729547+- 0x1.f006135426b26p-1,
2954829548+- 0x1.f0762fde45ee6p-1,
2954929549+- 0x1.f0e3a5e1a1788p-1,
2955029550+- 0x1.f14e8211e8c55p-1,
2955129551+- 0x1.f1b6d0fea5f4dp-1,
2955229552+- 0x1.f21c9f12f0677p-1,
2955329553+- 0x1.f27ff89525acfp-1,
2955429554+- 0x1.f2e0e9a6a8b09p-1,
2955529555+- 0x1.f33f7e43a706bp-1,
2955629556+- 0x1.f39bc242e43e6p-1,
2955729557+- 0x1.f3f5c1558b19ep-1,
2955829558+- 0x1.f44d870704911p-1,
2955929559+- 0x1.f4a31ebcd47dfp-1,
2956029560+- 0x1.f4f693b67bd77p-1,
2956129561+- 0x1.f547f10d60597p-1,
2956229562+- 0x1.f59741b4b97cfp-1,
2956329563+- 0x1.f5e4907982a07p-1,
2956429564+- 0x1.f62fe80272419p-1,
2956529565+- 0x1.f67952cff6282p-1,
2956629566+- 0x1.f6c0db3c34641p-1,
2956729567+- 0x1.f7068b7b10fd9p-1,
2956829568+- 0x1.f74a6d9a38383p-1,
2956929569+- 0x1.f78c8b812d498p-1,
2957029570+- 0x1.f7cceef15d631p-1,
2957129571+- 0x1.f80ba18636f07p-1,
2957229572+- 0x1.f848acb544e95p-1,
2957329573+- 0x1.f88419ce4e184p-1,
2957429574+- 0x1.f8bdf1fb78370p-1,
2957529575+- 0x1.f8f63e416ebffp-1,
2957629576+- 0x1.f92d077f8d56dp-1,
2957729577+- 0x1.f96256700da8ep-1,
2957829578+- 0x1.f99633a838a57p-1,
2957929579+- 0x1.f9c8a7989af0dp-1,
2958029580+- 0x1.f9f9ba8d3c733p-1,
2958129581+- 0x1.fa2974addae45p-1,
2958229582+- 0x1.fa57ddfe27376p-1,
2958329583+- 0x1.fa84fe5e05c8dp-1,
2958429584+- 0x1.fab0dd89d1309p-1,
2958529585+- 0x1.fadb831a9f9c3p-1,
2958629586+- 0x1.fb04f6868a944p-1,
2958729587+- 0x1.fb2d3f20f9101p-1,
2958829588+- 0x1.fb54641aebbc9p-1,
2958929589+- 0x1.fb7a6c834b5a2p-1,
2959029590+- 0x1.fb9f5f4739170p-1,
2959129591+- 0x1.fbc3433260ca5p-1,
2959229592+- 0x1.fbe61eef4cf6ap-1,
2959329593+- 0x1.fc07f907bc794p-1,
2959429594+- 0x1.fc28d7e4f9cd0p-1,
2959529595+- 0x1.fc48c1d033c7ap-1,
2959629596+- 0x1.fc67bcf2d7b8fp-1,
2959729597+- 0x1.fc85cf56ecd38p-1,
2959829598+- 0x1.fca2fee770c79p-1,
2959929599+- 0x1.fcbf5170b578bp-1,
2960029600+- 0x1.fcdacca0bfb73p-1,
2960129601+- 0x1.fcf57607a6e7cp-1,
2960229602+- 0x1.fd0f5317f582fp-1,
2960329603+- 0x1.fd2869270a56fp-1,
2960429604+- 0x1.fd40bd6d7a785p-1,
2960529605+- 0x1.fd58550773cb5p-1,
2960629606+- 0x1.fd6f34f52013ap-1,
2960729607+- 0x1.fd85621b0876dp-1,
2960829608+- 0x1.fd9ae142795e3p-1,
2960929609+- 0x1.fdafb719e6a69p-1,
2961029610+- 0x1.fdc3e835500b3p-1,
2961129611+- 0x1.fdd7790ea5bc0p-1,
2961229612+- 0x1.fdea6e062d0c9p-1,
2961329613+- 0x1.fdfccb62e52d3p-1,
2961429614+- 0x1.fe0e9552ebdd6p-1,
2961529615+- 0x1.fe1fcfebe2083p-1,
2961629616+- 0x1.fe307f2b503d0p-1,
2961729617+- 0x1.fe40a6f70af4bp-1,
2961829618+- 0x1.fe504b1d9696cp-1,
2961929619+- 0x1.fe5f6f568b301p-1,
2962029620+- 0x1.fe6e1742f7cf6p-1,
2962129621+- 0x1.fe7c466dc57a1p-1,
2962229622+- 0x1.fe8a004c19ae6p-1,
2962329623+- 0x1.fe97483db8670p-1,
2962429624+- 0x1.fea4218d6594ap-1,
2962529625+- 0x1.feb08f7146046p-1,
2962629626+- 0x1.febc950b3fa75p-1,
2962729627+- 0x1.fec835695932ep-1,
2962829628+- 0x1.fed37386190fbp-1,
2962929629+- 0x1.fede5248e38f4p-1,
2963029630+- 0x1.fee8d486585eep-1,
2963129631+- 0x1.fef2fd00af31ap-1,
2963229632+- 0x1.fefcce6813974p-1,
2963329633+- 0x1.ff064b5afffbep-1,
2963429634+- 0x1.ff0f766697c76p-1,
2963529635+- 0x1.ff18520700971p-1,
2963629636+- 0x1.ff20e0a7ba8c2p-1,
2963729637+- 0x1.ff2924a3f7a83p-1,
2963829638+- 0x1.ff312046f2339p-1,
2963929639+- 0x1.ff38d5cc4227fp-1,
2964029640+- 0x1.ff404760319b4p-1,
2964129641+- 0x1.ff47772010262p-1,
2964229642+- 0x1.ff4e671a85425p-1,
2964329643+- 0x1.ff55194fe19dfp-1,
2964429644+- 0x1.ff5b8fb26f5f6p-1,
2964529645+- 0x1.ff61cc26c1578p-1,
2964629646+- 0x1.ff67d08401202p-1,
2964729647+- 0x1.ff6d9e943c231p-1,
2964829648+- 0x1.ff733814af88cp-1,
2964929649+- 0x1.ff789eb6130c9p-1,
2965029650+- 0x1.ff7dd41ce2b4dp-1,
2965129651+- 0x1.ff82d9e1a76d8p-1,
2965229652+- 0x1.ff87b1913e853p-1,
2965329653+- 0x1.ff8c5cad200a5p-1,
2965429654+- 0x1.ff90dcaba4096p-1,
2965529655+- 0x1.ff9532f846ab0p-1,
2965629656+- 0x1.ff9960f3eb327p-1,
2965729657+- 0x1.ff9d67f51ddbap-1,
2965829658+- 0x1.ffa14948549a7p-1,
2965929659+- 0x1.ffa506302ebaep-1,
2966029660+- 0x1.ffa89fe5b3625p-1,
2966129661+- 0x1.ffac17988ef4bp-1,
2966229662+- 0x1.ffaf6e6f4f5c0p-1,
2966329663+- 0x1.ffb2a5879f35ep-1,
2966429664+- 0x1.ffb5bdf67fe6fp-1,
2966529665+- 0x1.ffb8b8c88295fp-1,
2966629666+- 0x1.ffbb970200110p-1,
2966729667+- 0x1.ffbe599f4f9d9p-1,
2966829668+- 0x1.ffc10194fcb64p-1,
2966929669+- 0x1.ffc38fcffbb7cp-1,
2967029670+- 0x1.ffc60535dd7f5p-1,
2967129671+- 0x1.ffc862a501fd7p-1,
2967229672+- 0x1.ffcaa8f4c9beap-1,
2967329673+- 0x1.ffccd8f5c66d1p-1,
2967429674+- 0x1.ffcef371ea4d7p-1,
2967529675+- 0x1.ffd0f92cb6ba7p-1,
2967629676+- 0x1.ffd2eae369a07p-1,
2967729677+- 0x1.ffd4c94d29fdbp-1,
2967829678+- 0x1.ffd6951b33686p-1,
2967929679+- 0x1.ffd84ef9009eep-1,
2968029680+- 0x1.ffd9f78c7524ap-1,
2968129681+- 0x1.ffdb8f7605ee7p-1,
2968229682+- 0x1.ffdd1750e1220p-1,
2968329683+- 0x1.ffde8fb314ebfp-1,
2968429684+- 0x1.ffdff92db56e5p-1,
2968529685+- 0x1.ffe1544d01ccbp-1,
2968629686+- 0x1.ffe2a1988857cp-1,
2968729687+- 0x1.ffe3e19349dc7p-1,
2968829688+- 0x1.ffe514bbdc197p-1,
2968929689+- 0x1.ffe63b8c8b5f7p-1,
2969029690+- 0x1.ffe7567b7b5e1p-1,
2969129691+- 0x1.ffe865fac722bp-1,
2969229692+- 0x1.ffe96a78a04a9p-1,
2969329693+- 0x1.ffea645f6d6dap-1,
2969429694+- 0x1.ffeb5415e7c44p-1,
2969529695+- 0x1.ffec39ff380b9p-1,
2969629696+- 0x1.ffed167b12ac2p-1,
2969729697+- 0x1.ffede9e5d3262p-1,
2969829698+- 0x1.ffeeb49896c6dp-1,
2969929699+- 0x1.ffef76e956a9fp-1,
2970029700+- 0x1.fff0312b010b5p-1,
2970129701+- 0x1.fff0e3ad91ec2p-1,
2970229702+- 0x1.fff18ebe2b0e1p-1,
2970329703+- 0x1.fff232a72b48ep-1,
2970429704+- 0x1.fff2cfb0453d9p-1,
2970529705+- 0x1.fff3661e9569dp-1,
2970629706+- 0x1.fff3f634b79f9p-1,
2970729707+- 0x1.fff48032dbe40p-1,
2970829708+- 0x1.fff50456dab8cp-1,
2970929709+- 0x1.fff582dc48d30p-1,
2971029710+- 0x1.fff5fbfc8a439p-1,
2971129711+- 0x1.fff66feee5129p-1,
2971229712+- 0x1.fff6dee89352ep-1,
2971329713+- 0x1.fff7491cd4af6p-1,
2971429714+- 0x1.fff7aebcff755p-1,
2971529715+- 0x1.fff80ff8911fdp-1,
2971629716+- 0x1.fff86cfd3e657p-1,
2971729717+- 0x1.fff8c5f702ccfp-1,
2971829718+- 0x1.fff91b102fca8p-1,
2971929719+- 0x1.fff96c717b695p-1,
2972029720+- 0x1.fff9ba420e834p-1,
2972129721+- 0x1.fffa04a7928b1p-1,
2972229722+- 0x1.fffa4bc63ee9ap-1,
2972329723+- 0x1.fffa8fc0e5f33p-1,
2972429724+- 0x1.fffad0b901755p-1,
2972529725+- 0x1.fffb0ecebee1bp-1,
2972629726+- 0x1.fffb4a210b172p-1,
2972729727+- 0x1.fffb82cd9dcbfp-1,
2972829728+- 0x1.fffbb8f1049c6p-1,
2972929729+- 0x1.fffbeca6adbe9p-1,
2973029730+- 0x1.fffc1e08f25f5p-1,
2973129731+- 0x1.fffc4d3120aa1p-1,
2973229732+- 0x1.fffc7a37857d2p-1,
2973329733+- 0x1.fffca53375ce3p-1,
2973429734+- 0x1.fffcce3b57bffp-1,
2973529735+- 0x1.fffcf564ab6b7p-1,
2973629736+- 0x1.fffd1ac4135f9p-1,
2973729737+- 0x1.fffd3e6d5cd87p-1,
2973829738+- 0x1.fffd607387b07p-1,
2973929739+- 0x1.fffd80e8ce0dap-1,
2974029740+- 0x1.fffd9fdeabccep-1,
2974129741+- 0x1.fffdbd65e5ad0p-1,
2974229742+- 0x1.fffdd98e903b2p-1,
2974329743+- 0x1.fffdf46816833p-1,
2974429744+- 0x1.fffe0e0140857p-1,
2974529745+- 0x1.fffe26683972ap-1,
2974629746+- 0x1.fffe3daa95b18p-1,
2974729747+- 0x1.fffe53d558ae9p-1,
2974829748+- 0x1.fffe68f4fa777p-1,
2974929749+- 0x1.fffe7d156d244p-1,
2975029750+- 0x1.fffe904222101p-1,
2975129751+- 0x1.fffea2860ee1ep-1,
2975229752+- 0x1.fffeb3ebb267bp-1,
2975329753+- 0x1.fffec47d19457p-1,
2975429754+- 0x1.fffed443e2787p-1,
2975529755+- 0x1.fffee34943b15p-1,
2975629756+- 0x1.fffef1960d85dp-1,
2975729757+- 0x1.fffeff32af7afp-1,
2975829758+- 0x1.ffff0c273bea2p-1,
2975929759+- 0x1.ffff187b6bc0ep-1,
2976029760+- 0x1.ffff2436a21dcp-1,
2976129761+- 0x1.ffff2f5fefcaap-1,
2976229762+- 0x1.ffff39fe16963p-1,
2976329763+- 0x1.ffff44178c8d2p-1,
2976429764+- 0x1.ffff4db27f146p-1,
2976529765+- 0x1.ffff56d4d5e5ep-1,
2976629766+- 0x1.ffff5f8435efcp-1,
2976729767+- 0x1.ffff67c604180p-1,
2976829768+- 0x1.ffff6f9f67e55p-1,
2976929769+- 0x1.ffff77154e0d6p-1,
2977029770+- 0x1.ffff7e2c6aea2p-1,
2977129771+- 0x1.ffff84e93cd75p-1,
2977229772+- 0x1.ffff8b500e77cp-1,
2977329773+- 0x1.ffff9164f8e46p-1,
2977429774+- 0x1.ffff972be5c59p-1,
2977529775+- 0x1.ffff9ca891572p-1,
2977629776+- 0x1.ffffa1de8c582p-1,
2977729777+- 0x1.ffffa6d13de73p-1,
2977829778+- 0x1.ffffab83e54b8p-1,
2977929779+- 0x1.ffffaff99bac4p-1,
2978029780+- 0x1.ffffb43555b5fp-1,
2978129781+- 0x1.ffffb839e52f3p-1,
2978229782+- 0x1.ffffbc09fa7cdp-1,
2978329783+- 0x1.ffffbfa82616bp-1,
2978429784+- 0x1.ffffc316d9ed0p-1,
2978529785+- 0x1.ffffc6586abf6p-1,
2978629786+- 0x1.ffffc96f1165ep-1,
2978729787+- 0x1.ffffcc5cec0c1p-1,
2978829788+- 0x1.ffffcf23ff5fcp-1,
2978929789+- 0x1.ffffd1c637b2bp-1,
2979029790+- 0x1.ffffd4456a10dp-1,
2979129791+- 0x1.ffffd6a3554a1p-1,
2979229792+- 0x1.ffffd8e1a2f22p-1,
2979329793+- 0x1.ffffdb01e8546p-1,
2979429794+- 0x1.ffffdd05a75eap-1,
2979529795+- 0x1.ffffdeee4f810p-1,
2979629796+- 0x1.ffffe0bd3e852p-1,
2979729797+- 0x1.ffffe273c15b7p-1,
2979829798+- 0x1.ffffe41314e06p-1,
2979929799+- 0x1.ffffe59c6698bp-1,
2980029800+- 0x1.ffffe710d565ep-1,
2980129801+- 0x1.ffffe8717232dp-1,
2980229802+- 0x1.ffffe9bf4098cp-1,
2980329803+- 0x1.ffffeafb377d5p-1,
2980429804+- 0x1.ffffec2641a9ep-1,
2980529805+- 0x1.ffffed413e5b7p-1,
2980629806+- 0x1.ffffee4d01cd6p-1,
2980729807+- 0x1.ffffef4a55bd4p-1,
2980829808+- 0x1.fffff039f9e8fp-1,
2980929809+- 0x1.fffff11ca4876p-1,
2981029810+- 0x1.fffff1f302bc1p-1,
2981129811+- 0x1.fffff2bdb904dp-1,
2981229812+- 0x1.fffff37d63a36p-1,
2981329813+- 0x1.fffff43297019p-1,
2981429814+- 0x1.fffff4dde0118p-1,
2981529815+- 0x1.fffff57fc4a95p-1,
2981629816+- 0x1.fffff618c3da6p-1,
2981729817+- 0x1.fffff6a956450p-1,
2981829818+- 0x1.fffff731ee681p-1,
2981929819+- 0x1.fffff7b2f8ed6p-1,
2982029820+- 0x1.fffff82cdcf1bp-1,
2982129821+- 0x1.fffff89ffc4aap-1,
2982229822+- 0x1.fffff90cb3c81p-1,
2982329823+- 0x1.fffff9735b73bp-1,
2982429824+- 0x1.fffff9d446cccp-1,
2982529825+- 0x1.fffffa2fc5015p-1,
2982629826+- 0x1.fffffa8621251p-1,
2982729827+- 0x1.fffffad7a2652p-1,
2982829828+- 0x1.fffffb248c39dp-1,
2982929829+- 0x1.fffffb6d1e95dp-1,
2983029830+- 0x1.fffffbb196132p-1,
2983129831+- 0x1.fffffbf22c1e2p-1,
2983229832+- 0x1.fffffc2f171e3p-1,
2983329833+- 0x1.fffffc688a9cfp-1,
2983429834+- 0x1.fffffc9eb76acp-1,
2983529835+- 0x1.fffffcd1cbc28p-1,
2983629836+- 0x1.fffffd01f36afp-1,
2983729837+- 0x1.fffffd2f57d68p-1,
2983829838+- 0x1.fffffd5a2041fp-1,
2983929839+- 0x1.fffffd8271d12p-1,
2984029840+- 0x1.fffffda86faa9p-1,
2984129841+- 0x1.fffffdcc3b117p-1,
2984229842+- 0x1.fffffdedf37edp-1,
2984329843+- 0x1.fffffe0db6b91p-1,
2984429844+- 0x1.fffffe2ba0ea5p-1,
2984529845+- 0x1.fffffe47ccb60p-1,
2984629846+- 0x1.fffffe62534d4p-1,
2984729847+- 0x1.fffffe7b4c81ep-1,
2984829848+- 0x1.fffffe92ced93p-1,
2984929849+- 0x1.fffffea8ef9cfp-1,
2985029850+- 0x1.fffffebdc2ec6p-1,
2985129851+- 0x1.fffffed15bcbap-1,
2985229852+- 0x1.fffffee3cc32cp-1,
2985329853+- 0x1.fffffef5251c2p-1,
2985429854+- 0x1.ffffff0576917p-1,
2985529855+- 0x1.ffffff14cfb92p-1,
2985629856+- 0x1.ffffff233ee1dp-1,
2985729857+- 0x1.ffffff30d18e8p-1,
2985829858+- 0x1.ffffff3d9480fp-1,
2985929859+- 0x1.ffffff4993c46p-1,
2986029860+- 0x1.ffffff54dab72p-1,
2986129861+- 0x1.ffffff5f74141p-1,
2986229862+- 0x1.ffffff6969fb8p-1,
2986329863+- 0x1.ffffff72c5fb6p-1,
2986429864+- 0x1.ffffff7b91176p-1,
2986529865+- 0x1.ffffff83d3d07p-1,
2986629866+- 0x1.ffffff8b962bep-1,
2986729867+- 0x1.ffffff92dfba2p-1,
2986829868+- 0x1.ffffff99b79d2p-1,
2986929869+- 0x1.ffffffa0248e8p-1,
2987029870+- 0x1.ffffffa62ce54p-1,
2987129871+- 0x1.ffffffabd69b4p-1,
2987229872+- 0x1.ffffffb127525p-1,
2987329873+- 0x1.ffffffb624592p-1,
2987429874+- 0x1.ffffffbad2affp-1,
2987529875+- 0x1.ffffffbf370cdp-1,
2987629876+- 0x1.ffffffc355dfdp-1,
2987729877+- 0x1.ffffffc733572p-1,
2987829878+- 0x1.ffffffcad3626p-1,
2987929879+- 0x1.ffffffce39b67p-1,
2988029880+- 0x1.ffffffd169d0cp-1,
2988129881+- 0x1.ffffffd466fa5p-1,
2988229882+- 0x1.ffffffd7344aap-1,
2988329883+- 0x1.ffffffd9d4aabp-1,
2988429884+- 0x1.ffffffdc4ad7ap-1,
2988529885+- 0x1.ffffffde9964ep-1,
2988629886+- 0x1.ffffffe0c2bf0p-1,
2988729887+- 0x1.ffffffe2c92dbp-1,
2988829888+- 0x1.ffffffe4aed5ep-1,
2988929889+- 0x1.ffffffe675bbdp-1,
2989029890+- 0x1.ffffffe81fc4ep-1,
2989129891+- 0x1.ffffffe9aeb97p-1,
2989229892+- 0x1.ffffffeb24467p-1,
2989329893+- 0x1.ffffffec81ff2p-1,
2989429894+- 0x1.ffffffedc95e7p-1,
2989529895+- 0x1.ffffffeefbc85p-1,
2989629896+- 0x1.fffffff01a8b6p-1,
2989729897+- 0x1.fffffff126e1ep-1,
2989829898+- 0x1.fffffff221f30p-1,
2989929899+- 0x1.fffffff30cd3fp-1,
2990029900+- 0x1.fffffff3e8892p-1,
2990129901+- 0x1.fffffff4b606fp-1,
2990229902+- 0x1.fffffff57632dp-1,
2990329903+- 0x1.fffffff629e44p-1,
2990429904+- 0x1.fffffff6d1e56p-1,
2990529905+- 0x1.fffffff76ef3fp-1,
2990629906+- 0x1.fffffff801c1fp-1,
2990729907+- 0x1.fffffff88af67p-1,
2990829908+- 0x1.fffffff90b2e3p-1,
2990929909+- 0x1.fffffff982fc1p-1,
2991029910+- 0x1.fffffff9f2e9fp-1,
2991129911+- 0x1.fffffffa5b790p-1,
2991229912+- 0x1.fffffffabd229p-1,
2991329913+- 0x1.fffffffb18582p-1,
2991429914+- 0x1.fffffffb6d844p-1,
2991529915+- 0x1.fffffffbbd0aap-1,
2991629916+- 0x1.fffffffc0748fp-1,
2991729917+- 0x1.fffffffc4c96cp-1,
2991829918+- 0x1.fffffffc8d462p-1,
2991929919+- 0x1.fffffffcc9a41p-1,
2992029920+- 0x1.fffffffd01f89p-1,
2992129921+- 0x1.fffffffd36871p-1,
2992229922+- 0x1.fffffffd678edp-1,
2992329923+- 0x1.fffffffd954aep-1,
2992429924+- 0x1.fffffffdbff2ap-1,
2992529925+- 0x1.fffffffde7ba0p-1,
2992629926+- 0x1.fffffffe0cd16p-1,
2992729927+- 0x1.fffffffe2f664p-1,
2992829928+- 0x1.fffffffe4fa30p-1,
2992929929+- 0x1.fffffffe6daf7p-1,
2993029930+- 0x1.fffffffe89b0cp-1,
2993129931+- 0x1.fffffffea3c9ap-1,
2993229932+- 0x1.fffffffebc1a9p-1,
2993329933+- 0x1.fffffffed2c21p-1,
2993429934+- 0x1.fffffffee7dc8p-1,
2993529935+- 0x1.fffffffefb847p-1,
2993629936+- 0x1.ffffffff0dd2bp-1,
2993729937+- 0x1.ffffffff1ede9p-1,
2993829938+- 0x1.ffffffff2ebdap-1,
2993929939+- 0x1.ffffffff3d843p-1,
2994029940+- 0x1.ffffffff4b453p-1,
2994129941+- 0x1.ffffffff58126p-1,
2994229942+- 0x1.ffffffff63fc3p-1,
2994329943+- 0x1.ffffffff6f121p-1,
2994429944+- 0x1.ffffffff79626p-1,
2994529945+- 0x1.ffffffff82fabp-1,
2994629946+- 0x1.ffffffff8be77p-1,
2994729947+- 0x1.ffffffff94346p-1,
2994829948+- 0x1.ffffffff9bec8p-1,
2994929949+- 0x1.ffffffffa319fp-1,
2995029950+- 0x1.ffffffffa9c63p-1,
2995129951+- 0x1.ffffffffaffa4p-1,
2995229952+- 0x1.ffffffffb5be5p-1,
2995329953+- 0x1.ffffffffbb1a2p-1,
2995429954+- 0x1.ffffffffc014ep-1,
2995529955+- 0x1.ffffffffc4b56p-1,
2995629956+- 0x1.ffffffffc901cp-1,
2995729957+- 0x1.ffffffffccfffp-1,
2995829958+- 0x1.ffffffffd0b56p-1,
2995929959+- 0x1.ffffffffd4271p-1,
2996029960+- 0x1.ffffffffd759dp-1,
2996129961+- 0x1.ffffffffda520p-1,
2996229962+- 0x1.ffffffffdd13cp-1,
2996329963+- 0x1.ffffffffdfa2dp-1,
2996429964+- 0x1.ffffffffe202dp-1,
2996529965+- 0x1.ffffffffe4371p-1,
2996629966+- 0x1.ffffffffe642ap-1,
2996729967+- 0x1.ffffffffe8286p-1,
2996829968+- 0x1.ffffffffe9eb0p-1,
2996929969+- 0x1.ffffffffeb8d0p-1,
2997029970+- 0x1.ffffffffed10ap-1,
2997129971+- 0x1.ffffffffee782p-1,
2997229972+- 0x1.ffffffffefc57p-1,
2997329973+- 0x1.fffffffff0fa7p-1,
2997429974+- 0x1.fffffffff218fp-1,
2997529975+- 0x1.fffffffff3227p-1,
2997629976+- 0x1.fffffffff4188p-1,
2997729977+- 0x1.fffffffff4fc9p-1,
2997829978+- 0x1.fffffffff5cfdp-1,
2997929979+- 0x1.fffffffff6939p-1,
2998029980+- 0x1.fffffffff748ep-1,
2998129981+- 0x1.fffffffff7f0dp-1,
2998229982+- 0x1.fffffffff88c5p-1,
2998329983+- 0x1.fffffffff91c6p-1,
2998429984+- 0x1.fffffffff9a1bp-1,
2998529985+- 0x1.fffffffffa1d2p-1,
2998629986+- 0x1.fffffffffa8f6p-1,
2998729987+- 0x1.fffffffffaf92p-1,
2998829988+- 0x1.fffffffffb5b0p-1,
2998929989+- 0x1.fffffffffbb58p-1,
2999029990+- 0x1.fffffffffc095p-1,
2999129991+- 0x1.fffffffffc56dp-1,
2999229992+- 0x1.fffffffffc9e8p-1,
2999329993+- 0x1.fffffffffce0dp-1,
2999429994+- 0x1.fffffffffd1e1p-1,
2999529995+- 0x1.fffffffffd56cp-1,
2999629996+- 0x1.fffffffffd8b3p-1,
2999729997+- 0x1.fffffffffdbbap-1,
2999829998+- 0x1.fffffffffde86p-1,
2999929999+- 0x1.fffffffffe11dp-1,
3000030000+- 0x1.fffffffffe380p-1,
3000130001+- 0x1.fffffffffe5b6p-1,
3000230002+- 0x1.fffffffffe7c0p-1,
3000330003+- 0x1.fffffffffe9a2p-1,
3000430004+- 0x1.fffffffffeb60p-1,
3000530005+- 0x1.fffffffffecfbp-1,
3000630006+- 0x1.fffffffffee77p-1,
3000730007+- 0x1.fffffffffefd6p-1,
3000830008+- 0x1.ffffffffff11ap-1,
3000930009+- 0x1.ffffffffff245p-1,
3001030010+- 0x1.ffffffffff359p-1,
3001130011+- 0x1.ffffffffff457p-1,
3001230012+- 0x1.ffffffffff542p-1,
3001330013+- 0x1.ffffffffff61bp-1,
3001430014+- 0x1.ffffffffff6e3p-1,
3001530015+- 0x1.ffffffffff79bp-1,
3001630016+- 0x1.ffffffffff845p-1,
3001730017+- 0x1.ffffffffff8e2p-1,
3001830018+- 0x1.ffffffffff973p-1,
3001930019+- 0x1.ffffffffff9f8p-1,
3002030020+- 0x1.ffffffffffa73p-1,
3002130021+- 0x1.ffffffffffae4p-1,
3002230022+- 0x1.ffffffffffb4cp-1,
3002330023+- 0x1.ffffffffffbadp-1,
3002430024+- 0x1.ffffffffffc05p-1,
3002530025+- 0x1.ffffffffffc57p-1,
3002630026+- 0x1.ffffffffffca2p-1,
3002730027+- 0x1.ffffffffffce7p-1,
3002830028+- 0x1.ffffffffffd27p-1,
3002930029+- 0x1.ffffffffffd62p-1,
3003030030+- 0x1.ffffffffffd98p-1,
3003130031+- 0x1.ffffffffffdcap-1,
3003230032+- 0x1.ffffffffffdf8p-1,
3003330033+- 0x1.ffffffffffe22p-1,
3003430034+- 0x1.ffffffffffe49p-1,
3003530035+- 0x1.ffffffffffe6cp-1,
3003630036+- 0x1.ffffffffffe8dp-1,
3003730037+- 0x1.ffffffffffeabp-1,
3003830038+- 0x1.ffffffffffec7p-1,
3003930039+- 0x1.ffffffffffee1p-1,
3004030040+- 0x1.ffffffffffef8p-1,
3004130041+- 0x1.fffffffffff0ep-1,
3004230042+- 0x1.fffffffffff22p-1,
3004330043+- 0x1.fffffffffff34p-1,
3004430044+- 0x1.fffffffffff45p-1,
3004530045+- 0x1.fffffffffff54p-1,
3004630046+- 0x1.fffffffffff62p-1,
3004730047+- 0x1.fffffffffff6fp-1,
3004830048+- 0x1.fffffffffff7bp-1,
3004930049+- 0x1.fffffffffff86p-1,
3005030050+- 0x1.fffffffffff90p-1,
3005130051+- 0x1.fffffffffff9ap-1,
3005230052+- 0x1.fffffffffffa2p-1,
3005330053+- 0x1.fffffffffffaap-1,
3005430054+- 0x1.fffffffffffb1p-1,
3005530055+- 0x1.fffffffffffb8p-1,
3005630056+- 0x1.fffffffffffbep-1,
3005730057+- 0x1.fffffffffffc3p-1,
3005830058+- 0x1.fffffffffffc8p-1,
3005930059+- 0x1.fffffffffffcdp-1,
3006030060+- 0x1.fffffffffffd1p-1,
3006130061+- 0x1.fffffffffffd5p-1,
3006230062+- 0x1.fffffffffffd9p-1,
3006330063+- 0x1.fffffffffffdcp-1,
3006430064+- 0x1.fffffffffffdfp-1,
3006530065+- 0x1.fffffffffffe2p-1,
3006630066+- 0x1.fffffffffffe4p-1,
3006730067+- 0x1.fffffffffffe7p-1,
3006830068+- 0x1.fffffffffffe9p-1,
3006930069+- 0x1.fffffffffffebp-1,
3007030070+- 0x1.fffffffffffedp-1,
3007130071+- 0x1.fffffffffffeep-1,
3007230072+- 0x1.ffffffffffff0p-1,
3007330073+- 0x1.ffffffffffff1p-1,
3007430074+- 0x1.ffffffffffff3p-1,
3007530075+- 0x1.ffffffffffff4p-1,
3007630076+- 0x1.ffffffffffff5p-1,
3007730077+- 0x1.ffffffffffff6p-1,
3007830078+- 0x1.ffffffffffff7p-1,
3007930079+- 0x1.ffffffffffff7p-1,
3008030080+- 0x1.ffffffffffff8p-1,
3008130081+- 0x1.ffffffffffff9p-1,
3008230082+- 0x1.ffffffffffff9p-1,
3008330083+- 0x1.ffffffffffffap-1,
3008430084+- 0x1.ffffffffffffbp-1,
3008530085+- 0x1.ffffffffffffbp-1,
3008630086+- 0x1.ffffffffffffbp-1,
3008730087+- 0x1.ffffffffffffcp-1,
3008830088+- 0x1.ffffffffffffcp-1,
3008930089+- 0x1.ffffffffffffdp-1,
3009030090+- 0x1.ffffffffffffdp-1,
3009130091+- 0x1.ffffffffffffdp-1,
3009230092+- 0x1.ffffffffffffdp-1,
3009330093+- 0x1.ffffffffffffep-1,
3009430094+- 0x1.ffffffffffffep-1,
3009530095+- 0x1.ffffffffffffep-1,
3009630096+- 0x1.ffffffffffffep-1,
3009730097+- 0x1.ffffffffffffep-1,
3009830098+- 0x1.ffffffffffffep-1,
3009930099+- 0x1.fffffffffffffp-1,
3010030100+- 0x1.fffffffffffffp-1,
3010130101+- 0x1.fffffffffffffp-1,
3010230102+- 0x1.fffffffffffffp-1,
3010330103+- 0x1.fffffffffffffp-1,
3010430104+- 0x1.fffffffffffffp-1,
3010530105+- 0x1.fffffffffffffp-1,
3010630106+- 0x1.fffffffffffffp-1,
3010730107+- 0x1.fffffffffffffp-1,
3010830108+- 0x1.fffffffffffffp-1,
3010930109+- 0x1.fffffffffffffp-1,
3011030110+- 0x1.0000000000000p+0,
3011130111+- 0x1.0000000000000p+0,
3011230112+- 0x1.0000000000000p+0,
3011330113+- 0x1.0000000000000p+0,
3011430114+- 0x1.0000000000000p+0,
3011530115+- 0x1.0000000000000p+0,
3011630116+- 0x1.0000000000000p+0,
3011730117+- 0x1.0000000000000p+0,
3011830118+- 0x1.0000000000000p+0,
3011930119+- 0x1.0000000000000p+0,
3012030120+- 0x1.0000000000000p+0,
3012130121+- },
3012230122+- .scale = { 0x1.20dd750429b6dp+0,
3012330123+- 0x1.20d8f1975c85dp+0,
3012430124+- 0x1.20cb67bd452c7p+0,
3012530125+- 0x1.20b4d8bac36c1p+0,
3012630126+- 0x1.209546ad13ccfp+0,
3012730127+- 0x1.206cb4897b148p+0,
3012830128+- 0x1.203b261cd0052p+0,
3012930129+- 0x1.2000a00ae3804p+0,
3013030130+- 0x1.1fbd27cdc72d3p+0,
3013130131+- 0x1.1f70c3b4f2cc7p+0,
3013230132+- 0x1.1f1b7ae44867fp+0,
3013330133+- 0x1.1ebd5552f795bp+0,
3013430134+- 0x1.1e565bca400d4p+0,
3013530135+- 0x1.1de697e413d28p+0,
3013630136+- 0x1.1d6e14099944ap+0,
3013730137+- 0x1.1cecdb718d61cp+0,
3013830138+- 0x1.1c62fa1e869b6p+0,
3013930139+- 0x1.1bd07cdd189acp+0,
3014030140+- 0x1.1b357141d95d5p+0,
3014130141+- 0x1.1a91e5a748165p+0,
3014230142+- 0x1.19e5e92b964abp+0,
3014330143+- 0x1.19318bae53a04p+0,
3014430144+- 0x1.1874ddcdfce24p+0,
3014530145+- 0x1.17aff0e56ec10p+0,
3014630146+- 0x1.16e2d7093cd8cp+0,
3014730147+- 0x1.160da304ed92fp+0,
3014830148+- 0x1.153068581b781p+0,
3014930149+- 0x1.144b3b337c90cp+0,
3015030150+- 0x1.135e3075d076bp+0,
3015130151+- 0x1.12695da8b5bdep+0,
3015230152+- 0x1.116cd8fd67618p+0,
3015330153+- 0x1.1068b94962e5ep+0,
3015430154+- 0x1.0f5d1602f7e41p+0,
3015530155+- 0x1.0e4a073dc1b91p+0,
3015630156+- 0x1.0d2fa5a70c168p+0,
3015730157+- 0x1.0c0e0a8223359p+0,
3015830158+- 0x1.0ae54fa490722p+0,
3015930159+- 0x1.09b58f724416bp+0,
3016030160+- 0x1.087ee4d9ad247p+0,
3016130161+- 0x1.07416b4fbfe7cp+0,
3016230162+- 0x1.05fd3ecbec297p+0,
3016330163+- 0x1.04b27bc403d30p+0,
3016430164+- 0x1.03613f2812dafp+0,
3016530165+- 0x1.0209a65e29545p+0,
3016630166+- 0x1.00abcf3e187a9p+0,
3016730167+- 0x1.fe8fb01a47307p-1,
3016830168+- 0x1.fbbbbef34b4b2p-1,
3016930169+- 0x1.f8dc092d58ff8p-1,
3017030170+- 0x1.f5f0cdaf15313p-1,
3017130171+- 0x1.f2fa4c16c0019p-1,
3017230172+- 0x1.eff8c4b1375dbp-1,
3017330173+- 0x1.ecec7870ebca7p-1,
3017430174+- 0x1.e9d5a8e4c934ep-1,
3017530175+- 0x1.e6b4982f158b9p-1,
3017630176+- 0x1.e38988fc46e72p-1,
3017730177+- 0x1.e054be79d3042p-1,
3017830178+- 0x1.dd167c4cf9d2ap-1,
3017930179+- 0x1.d9cf06898cdafp-1,
3018030180+- 0x1.d67ea1a8b5368p-1,
3018130181+- 0x1.d325927fb9d89p-1,
3018230182+- 0x1.cfc41e36c7df9p-1,
3018330183+- 0x1.cc5a8a3fbea40p-1,
3018430184+- 0x1.c8e91c4d01368p-1,
3018530185+- 0x1.c5701a484ef9dp-1,
3018630186+- 0x1.c1efca49a5011p-1,
3018730187+- 0x1.be68728e29d5dp-1,
3018830188+- 0x1.bada596f25436p-1,
3018930189+- 0x1.b745c55905bf8p-1,
3019030190+- 0x1.b3aafcc27502ep-1,
3019130191+- 0x1.b00a46237d5bep-1,
3019230192+- 0x1.ac63e7ecc1411p-1,
3019330193+- 0x1.a8b8287ec6a09p-1,
3019430194+- 0x1.a5074e2157620p-1,
3019530195+- 0x1.a1519efaf889ep-1,
3019630196+- 0x1.9d97610879642p-1,
3019730197+- 0x1.99d8da149c13fp-1,
3019830198+- 0x1.96164fafd8de3p-1,
3019930199+- 0x1.925007283d7aap-1,
3020030200+- 0x1.8e86458169af8p-1,
3020130201+- 0x1.8ab94f6caa71dp-1,
3020230202+- 0x1.86e9694134b9ep-1,
3020330203+- 0x1.8316d6f48133dp-1,
3020430204+- 0x1.7f41dc12c9e89p-1,
3020530205+- 0x1.7b6abbb7aaf19p-1,
3020630206+- 0x1.7791b886e7403p-1,
3020730207+- 0x1.73b714a552763p-1,
3020830208+- 0x1.6fdb11b1e0c34p-1,
3020930209+- 0x1.6bfdf0beddaf5p-1,
3021030210+- 0x1.681ff24b4ab04p-1,
3021130211+- 0x1.6441563c665d4p-1,
3021230212+- 0x1.60625bd75d07bp-1,
3021330213+- 0x1.5c8341bb23767p-1,
3021430214+- 0x1.58a445da7c74cp-1,
3021530215+- 0x1.54c5a57629db0p-1,
3021630216+- 0x1.50e79d1749ac9p-1,
3021730217+- 0x1.4d0a6889dfd9fp-1,
3021830218+- 0x1.492e42d78d2c5p-1,
3021930219+- 0x1.4553664273d24p-1,
3022030220+- 0x1.417a0c4049fd0p-1,
3022130221+- 0x1.3da26d759aef5p-1,
3022230222+- 0x1.39ccc1b136d5ap-1,
3022330223+- 0x1.35f93fe7d1b3dp-1,
3022430224+- 0x1.32281e2fd1a92p-1,
3022530225+- 0x1.2e5991bd4cbfcp-1,
3022630226+- 0x1.2a8dcede3673bp-1,
3022730227+- 0x1.26c508f6bd0ffp-1,
3022830228+- 0x1.22ff727dd6f7bp-1,
3022930229+- 0x1.1f3d3cf9ffe5ap-1,
3023030230+- 0x1.1b7e98fe26217p-1,
3023130231+- 0x1.17c3b626c7a11p-1,
3023230232+- 0x1.140cc3173f007p-1,
3023330233+- 0x1.1059ed7740313p-1,
3023430234+- 0x1.0cab61f084b93p-1,
3023530235+- 0x1.09014c2ca74dap-1,
3023630236+- 0x1.055bd6d32e8d7p-1,
3023730237+- 0x1.01bb2b87c6968p-1,
3023830238+- 0x1.fc3ee5d1524b0p-2,
3023930239+- 0x1.f511a91a67d2ap-2,
3024030240+- 0x1.edeeee0959518p-2,
3024130241+- 0x1.e6d6ffaa65a25p-2,
3024230242+- 0x1.dfca26f5bbf88p-2,
3024330243+- 0x1.d8c8aace11e63p-2,
3024430244+- 0x1.d1d2cfff91594p-2,
3024530245+- 0x1.cae8d93f1d7b6p-2,
3024630246+- 0x1.c40b0729ed547p-2,
3024730247+- 0x1.bd3998457afdap-2,
3024830248+- 0x1.b674c8ffc6283p-2,
3024930249+- 0x1.afbcd3afe8ab6p-2,
3025030250+- 0x1.a911f096fbc26p-2,
3025130251+- 0x1.a27455e14c93cp-2,
3025230252+- 0x1.9be437a7de946p-2,
3025330253+- 0x1.9561c7f23a47bp-2,
3025430254+- 0x1.8eed36b886d93p-2,
3025530255+- 0x1.8886b1e5ecfd1p-2,
3025630256+- 0x1.822e655b417e6p-2,
3025730257+- 0x1.7be47af1f5d89p-2,
3025830258+- 0x1.75a91a7f4d2edp-2,
3025930259+- 0x1.6f7c69d7d3ef8p-2,
3026030260+- 0x1.695e8cd31867ep-2,
3026130261+- 0x1.634fa54fa285fp-2,
3026230262+- 0x1.5d4fd33729015p-2,
3026330263+- 0x1.575f3483021c3p-2,
3026430264+- 0x1.517de540ce2a3p-2,
3026530265+- 0x1.4babff975a04cp-2,
3026630266+- 0x1.45e99bcbb7915p-2,
3026730267+- 0x1.4036d0468a7a2p-2,
3026830268+- 0x1.3a93b1998736cp-2,
3026930269+- 0x1.35005285227f1p-2,
3027030270+- 0x1.2f7cc3fe6f423p-2,
3027130271+- 0x1.2a09153529381p-2,
3027230272+- 0x1.24a55399ea239p-2,
3027330273+- 0x1.1f518ae487dc8p-2,
3027430274+- 0x1.1a0dc51a9934dp-2,
3027530275+- 0x1.14da0a961fd14p-2,
3027630276+- 0x1.0fb6620c550afp-2,
3027730277+- 0x1.0aa2d09497f2bp-2,
3027830278+- 0x1.059f59af7a906p-2,
3027930279+- 0x1.00abff4dec7a3p-2,
3028030280+- 0x1.f79183b101c5bp-3,
3028130281+- 0x1.edeb406d9c824p-3,
3028230282+- 0x1.e4652fadcb6b2p-3,
3028330283+- 0x1.daff4969c0b04p-3,
3028430284+- 0x1.d1b982c501370p-3,
3028530285+- 0x1.c893ce1dcbef7p-3,
3028630286+- 0x1.bf8e1b1ca2279p-3,
3028730287+- 0x1.b6a856c3ed54fp-3,
3028830288+- 0x1.ade26b7fbed95p-3,
3028930289+- 0x1.a53c4135a6526p-3,
3029030290+- 0x1.9cb5bd549b111p-3,
3029130291+- 0x1.944ec2e4f5630p-3,
3029230292+- 0x1.8c07329874652p-3,
3029330293+- 0x1.83deeada4d25ap-3,
3029430294+- 0x1.7bd5c7df3fe9cp-3,
3029530295+- 0x1.73eba3b5b07b7p-3,
3029630296+- 0x1.6c205655be71fp-3,
3029730297+- 0x1.6473b5b15a7a1p-3,
3029830298+- 0x1.5ce595c455b0ap-3,
3029930299+- 0x1.5575c8a468361p-3,
3030030300+- 0x1.4e241e912c305p-3,
3030130301+- 0x1.46f066040a832p-3,
3030230302+- 0x1.3fda6bc016994p-3,
3030330303+- 0x1.38e1fae1d6a9dp-3,
3030430304+- 0x1.3206dceef5f87p-3,
3030530305+- 0x1.2b48d9e5dea1cp-3,
3030630306+- 0x1.24a7b84d38971p-3,
3030730307+- 0x1.1e233d434b813p-3,
3030830308+- 0x1.17bb2c8d41535p-3,
3030930309+- 0x1.116f48a6476ccp-3,
3031030310+- 0x1.0b3f52ce8c383p-3,
3031130311+- 0x1.052b0b1a174eap-3,
3031230312+- 0x1.fe6460fef4680p-4,
3031330313+- 0x1.f2a901ccafb37p-4,
3031430314+- 0x1.e723726b824a9p-4,
3031530315+- 0x1.dbd32ac4c99b0p-4,
3031630316+- 0x1.d0b7a0f921e7cp-4,
3031730317+- 0x1.c5d0497c09e74p-4,
3031830318+- 0x1.bb1c972f23e50p-4,
3031930319+- 0x1.b09bfb7d11a83p-4,
3032030320+- 0x1.a64de673e8837p-4,
3032130321+- 0x1.9c31c6df3b1b8p-4,
3032230322+- 0x1.92470a61b6965p-4,
3032330323+- 0x1.888d1d8e510a3p-4,
3032430324+- 0x1.7f036c0107294p-4,
3032530325+- 0x1.75a96077274bap-4,
3032630326+- 0x1.6c7e64e7281cbp-4,
3032730327+- 0x1.6381e2980956bp-4,
3032830328+- 0x1.5ab342383d177p-4,
3032930329+- 0x1.5211ebf41880bp-4,
3033030330+- 0x1.499d478bca735p-4,
3033130331+- 0x1.4154bc68d75c3p-4,
3033230332+- 0x1.3937b1b319259p-4,
3033330333+- 0x1.31458e6542847p-4,
3033430334+- 0x1.297db960e4f63p-4,
3033530335+- 0x1.21df9981f8e53p-4,
3033630336+- 0x1.1a6a95b1e786fp-4,
3033730337+- 0x1.131e14fa1625dp-4,
3033830338+- 0x1.0bf97e95f2a64p-4,
3033930339+- 0x1.04fc3a0481321p-4,
3034030340+- 0x1.fc4b5e32d6259p-5,
3034130341+- 0x1.eeea8c1b1db93p-5,
3034230342+- 0x1.e1d4cf1e2450ap-5,
3034330343+- 0x1.d508f9a1ea64ep-5,
3034430344+- 0x1.c885df3451a07p-5,
3034530345+- 0x1.bc4a54a84e834p-5,
3034630346+- 0x1.b055303221015p-5,
3034730347+- 0x1.a4a549829587ep-5,
3034830348+- 0x1.993979e14fffdp-5,
3034930349+- 0x1.8e109c4622913p-5,
3035030350+- 0x1.83298d717210ep-5,
3035130351+- 0x1.78832c03aa2b1p-5,
3035230352+- 0x1.6e1c5893c380bp-5,
3035330353+- 0x1.63f3f5c4de13bp-5,
3035430354+- 0x1.5a08e85af27e0p-5,
3035530355+- 0x1.505a174e9c929p-5,
3035630356+- 0x1.46e66be002240p-5,
3035730357+- 0x1.3dacd1a8d8ccdp-5,
3035830358+- 0x1.34ac36ad8dafep-5,
3035930359+- 0x1.2be38b6d92415p-5,
3036030360+- 0x1.2351c2f2d1449p-5,
3036130361+- 0x1.1af5d2e04f3f6p-5,
3036230362+- 0x1.12ceb37ff9bc3p-5,
3036330363+- 0x1.0adb5fcfa8c75p-5,
3036430364+- 0x1.031ad58d56279p-5,
3036530365+- 0x1.f7182a851bca2p-6,
3036630366+- 0x1.e85c449e377f2p-6,
3036730367+- 0x1.da0005e5f28dfp-6,
3036830368+- 0x1.cc0180af00a8bp-6,
3036930369+- 0x1.be5ecd2fcb5f9p-6,
3037030370+- 0x1.b1160991ff737p-6,
3037130371+- 0x1.a4255a00b9f03p-6,
3037230372+- 0x1.978ae8b55ce1bp-6,
3037330373+- 0x1.8b44e6031383ep-6,
3037430374+- 0x1.7f5188610ddc8p-6,
3037530375+- 0x1.73af0c737bb45p-6,
3037630376+- 0x1.685bb5134ef13p-6,
3037730377+- 0x1.5d55cb54cd53ap-6,
3037830378+- 0x1.529b9e8cf9a1ep-6,
3037930379+- 0x1.482b8455dc491p-6,
3038030380+- 0x1.3e03d891b37dep-6,
3038130381+- 0x1.3422fd6d12e2bp-6,
3038230382+- 0x1.2a875b5ffab56p-6,
3038330383+- 0x1.212f612dee7fbp-6,
3038430384+- 0x1.181983e5133ddp-6,
3038530385+- 0x1.0f443edc5ce49p-6,
3038630386+- 0x1.06ae13b0d3255p-6,
3038730387+- 0x1.fcab1483ea7fcp-7,
3038830388+- 0x1.ec72615a894c4p-7,
3038930389+- 0x1.dcaf3691fc448p-7,
3039030390+- 0x1.cd5ec93c12431p-7,
3039130391+- 0x1.be7e5ac24963bp-7,
3039230392+- 0x1.b00b38d6b3575p-7,
3039330393+- 0x1.a202bd6372dcep-7,
3039430394+- 0x1.94624e78e0fafp-7,
3039530395+- 0x1.87275e3a6869dp-7,
3039630396+- 0x1.7a4f6aca256cbp-7,
3039730397+- 0x1.6dd7fe3358230p-7,
3039830398+- 0x1.61beae53b72b7p-7,
3039930399+- 0x1.56011cc3b036dp-7,
3040030400+- 0x1.4a9cf6bda3f4cp-7,
3040130401+- 0x1.3f8ff5042a88ep-7,
3040230402+- 0x1.34d7dbc76d7e5p-7,
3040330403+- 0x1.2a727a89a3f14p-7,
3040430404+- 0x1.205dac02bd6b9p-7,
3040530405+- 0x1.1697560347b25p-7,
3040630406+- 0x1.0d1d69569b82dp-7,
3040730407+- 0x1.03ede1a45bfeep-7,
3040830408+- 0x1.f60d8aa2a88f2p-8,
3040930409+- 0x1.e4cc4abf7d065p-8,
3041030410+- 0x1.d4143a9dfe965p-8,
3041130411+- 0x1.c3e1a5f5c077cp-8,
3041230412+- 0x1.b430ecf4a83a8p-8,
3041330413+- 0x1.a4fe83fb9db25p-8,
3041430414+- 0x1.9646f35a76623p-8,
3041530415+- 0x1.8806d70b2fc36p-8,
3041630416+- 0x1.7a3ade6c8b3e4p-8,
3041730417+- 0x1.6cdfcbfc1e263p-8,
3041830418+- 0x1.5ff2750fe7820p-8,
3041930419+- 0x1.536fc18f7ce5cp-8,
3042030420+- 0x1.4754abacdf1dcp-8,
3042130421+- 0x1.3b9e3f9d06e3fp-8,
3042230422+- 0x1.30499b503957fp-8,
3042330423+- 0x1.2553ee2a336bfp-8,
3042430424+- 0x1.1aba78ba3af89p-8,
3042530425+- 0x1.107a8c7323a6ep-8,
3042630426+- 0x1.06918b6355624p-8,
3042730427+- 0x1.f9f9cfd9c3035p-9,
3042830428+- 0x1.e77448fb66bb9p-9,
3042930429+- 0x1.d58da68fd1170p-9,
3043030430+- 0x1.c4412bf4b8f0bp-9,
3043130431+- 0x1.b38a3af2e55b4p-9,
3043230432+- 0x1.a3645330550ffp-9,
3043330433+- 0x1.93cb11a30d765p-9,
3043430434+- 0x1.84ba3004a50d0p-9,
3043530435+- 0x1.762d84469c18fp-9,
3043630436+- 0x1.6821000795a03p-9,
3043730437+- 0x1.5a90b00981d93p-9,
3043830438+- 0x1.4d78bba8ca5fdp-9,
3043930439+- 0x1.40d564548fad7p-9,
3044030440+- 0x1.34a305080681fp-9,
3044130441+- 0x1.28de11c5031ebp-9,
3044230442+- 0x1.1d83170fbf6fbp-9,
3044330443+- 0x1.128eb96be8798p-9,
3044430444+- 0x1.07fdb4dafea5fp-9,
3044530445+- 0x1.fb99b8b8279e1p-10,
3044630446+- 0x1.e7f232d9e2630p-10,
3044730447+- 0x1.d4fed7195d7e8p-10,
3044830448+- 0x1.c2b9cf7f893bfp-10,
3044930449+- 0x1.b11d702b3deb1p-10,
3045030450+- 0x1.a024365f771bdp-10,
3045130451+- 0x1.8fc8c794b03b5p-10,
3045230452+- 0x1.8005f08d6f1efp-10,
3045330453+- 0x1.70d6a46e07ddap-10,
3045430454+- 0x1.6235fbd7a4345p-10,
3045530455+- 0x1.541f340697987p-10,
3045630456+- 0x1.468dadf4080abp-10,
3045730457+- 0x1.397ced7af2b15p-10,
3045830458+- 0x1.2ce898809244ep-10,
3045930459+- 0x1.20cc76202c5fap-10,
3046030460+- 0x1.15246dda49d47p-10,
3046130461+- 0x1.09ec86c75d497p-10,
3046230462+- 0x1.fe41cd9bb4eeep-11,
3046330463+- 0x1.e97ba3b77f306p-11,
3046430464+- 0x1.d57f524723822p-11,
3046530465+- 0x1.c245d4b998479p-11,
3046630466+- 0x1.afc85e0f82e12p-11,
3046730467+- 0x1.9e005769dbc1dp-11,
3046830468+- 0x1.8ce75e9f6f8a0p-11,
3046930469+- 0x1.7c7744d9378f7p-11,
3047030470+- 0x1.6caa0d3582fe9p-11,
3047130471+- 0x1.5d79eb71e893bp-11,
3047230472+- 0x1.4ee1429bf7cc0p-11,
3047330473+- 0x1.40daa3c89f5b6p-11,
3047430474+- 0x1.3360ccd23db3ap-11,
3047530475+- 0x1.266ea71d4f71ap-11,
3047630476+- 0x1.19ff4663ae9dfp-11,
3047730477+- 0x1.0e0de78654d1ep-11,
3047830478+- 0x1.0295ef6591848p-11,
3047930479+- 0x1.ef25d37f49fe1p-12,
3048030480+- 0x1.da01102b5f851p-12,
3048130481+- 0x1.c5b5412dcafadp-12,
3048230482+- 0x1.b23a5a23e4210p-12,
3048330483+- 0x1.9f8893d8fd1c1p-12,
3048430484+- 0x1.8d986a4187285p-12,
3048530485+- 0x1.7c629a822bc9ep-12,
3048630486+- 0x1.6be02102b3520p-12,
3048730487+- 0x1.5c0a378c90bcap-12,
3048830488+- 0x1.4cda5374ea275p-12,
3048930489+- 0x1.3e4a23d1f4702p-12,
3049030490+- 0x1.30538fbb77ecdp-12,
3049130491+- 0x1.22f0b496539bdp-12,
3049230492+- 0x1.161be46ad3b50p-12,
3049330493+- 0x1.09cfa445b00ffp-12,
3049430494+- 0x1.fc0d55470cf51p-13,
3049530495+- 0x1.e577bbcd49935p-13,
3049630496+- 0x1.cfd4a5adec5bfp-13,
3049730497+- 0x1.bb1a9657ce465p-13,
3049830498+- 0x1.a740684026555p-13,
3049930499+- 0x1.943d4a1d1ed39p-13,
3050030500+- 0x1.8208bc334a6a5p-13,
3050130501+- 0x1.709a8db59f25cp-13,
3050230502+- 0x1.5feada379d8b7p-13,
3050330503+- 0x1.4ff207314a102p-13,
3050430504+- 0x1.40a8c1949f75ep-13,
3050530505+- 0x1.3207fb7420eb9p-13,
3050630506+- 0x1.2408e9ba3327fp-13,
3050730507+- 0x1.16a501f0e42cap-13,
3050830508+- 0x1.09d5f819c9e29p-13,
3050930509+- 0x1.fb2b792b40a22p-14,
3051030510+- 0x1.e3bcf436a1a95p-14,
3051130511+- 0x1.cd55277c18d05p-14,
3051230512+- 0x1.b7e94604479dcp-14,
3051330513+- 0x1.a36eec00926ddp-14,
3051430514+- 0x1.8fdc1b2dcf7b9p-14,
3051530515+- 0x1.7d2737527c3f9p-14,
3051630516+- 0x1.6b4702d7d5849p-14,
3051730517+- 0x1.5a329b7d30748p-14,
3051830518+- 0x1.49e17724f4d41p-14,
3051930519+- 0x1.3a4b60ba9aa4dp-14,
3052030520+- 0x1.2b6875310f785p-14,
3052130521+- 0x1.1d312098e9dbap-14,
3052230522+- 0x1.0f9e1b4dd36dfp-14,
3052330523+- 0x1.02a8673a94691p-14,
3052430524+- 0x1.ec929a665b449p-15,
3052530525+- 0x1.d4f4b4c8e09edp-15,
3052630526+- 0x1.be6abbb10a5aap-15,
3052730527+- 0x1.a8e8cc1fadef6p-15,
3052830528+- 0x1.94637d5bacfdbp-15,
3052930529+- 0x1.80cfdc72220cfp-15,
3053030530+- 0x1.6e2367dc27f95p-15,
3053130531+- 0x1.5c540b4936fd2p-15,
3053230532+- 0x1.4b581b8d170fcp-15,
3053330533+- 0x1.3b2652b06c2b2p-15,
3053430534+- 0x1.2bb5cc22e5db6p-15,
3053530535+- 0x1.1cfe010e2052dp-15,
3053630536+- 0x1.0ef6c4c84a0fep-15,
3053730537+- 0x1.01984165a5f36p-15,
3053830538+- 0x1.e9b5e8d00ce76p-16,
3053930539+- 0x1.d16f5716c6c1ap-16,
3054030540+- 0x1.ba4f035d60e02p-16,
3054130541+- 0x1.a447b7b03f045p-16,
3054230542+- 0x1.8f4ccca7fc90dp-16,
3054330543+- 0x1.7b5223dac7336p-16,
3054430544+- 0x1.684c227fcacefp-16,
3054530545+- 0x1.562fac4329b48p-16,
3054630546+- 0x1.44f21e49054f2p-16,
3054730547+- 0x1.34894a5e24657p-16,
3054830548+- 0x1.24eb7254ccf83p-16,
3054930549+- 0x1.160f438c70913p-16,
3055030550+- 0x1.07ebd2a2d2844p-16,
3055130551+- 0x1.f4f12e9ab070ap-17,
3055230552+- 0x1.db5ad0b27805cp-17,
3055330553+- 0x1.c304efa2c6f4ep-17,
3055430554+- 0x1.abe09e9144b5ep-17,
3055530555+- 0x1.95df988e76644p-17,
3055630556+- 0x1.80f439b4ee04bp-17,
3055730557+- 0x1.6d11788a69c64p-17,
3055830558+- 0x1.5a2adfa0b4bc4p-17,
3055930559+- 0x1.4834877429b8fp-17,
3056030560+- 0x1.37231085c7d9ap-17,
3056130561+- 0x1.26eb9daed6f7ep-17,
3056230562+- 0x1.1783ceac28910p-17,
3056330563+- 0x1.08e1badf0fcedp-17,
3056430564+- 0x1.f5f7d88472604p-18,
3056530565+- 0x1.db92b5212fb8dp-18,
3056630566+- 0x1.c282cd3957edap-18,
3056730567+- 0x1.aab7abace48dcp-18,
3056830568+- 0x1.94219bfcb4928p-18,
3056930569+- 0x1.7eb1a2075864dp-18,
3057030570+- 0x1.6a597219a93d9p-18,
3057130571+- 0x1.570b69502f313p-18,
3057230572+- 0x1.44ba864670882p-18,
3057330573+- 0x1.335a62115bce2p-18,
3057430574+- 0x1.22df298214423p-18,
3057530575+- 0x1.133d96ae7e0ddp-18,
3057630576+- 0x1.046aeabcfcdecp-18,
3057730577+- 0x1.ecb9cfe1d8642p-19,
3057830578+- 0x1.d21397ead99cbp-19,
3057930579+- 0x1.b8d094c86d374p-19,
3058030580+- 0x1.a0df0f0c626dcp-19,
3058130581+- 0x1.8a2e269750a39p-19,
3058230582+- 0x1.74adc8f4064d3p-19,
3058330583+- 0x1.604ea819f007cp-19,
3058430584+- 0x1.4d0231928c6f9p-19,
3058530585+- 0x1.3aba85fe22e1fp-19,
3058630586+- 0x1.296a70f414053p-19,
3058730587+- 0x1.1905613b3abf2p-19,
3058830588+- 0x1.097f6156f32c5p-19,
3058930589+- 0x1.f59a20caf6695p-20,
3059030590+- 0x1.d9c73698fb1dcp-20,
3059130591+- 0x1.bf716c6168baep-20,
3059230592+- 0x1.a6852c6b58392p-20,
3059330593+- 0x1.8eefd70594a88p-20,
3059430594+- 0x1.789fb715aae95p-20,
3059530595+- 0x1.6383f726a8e04p-20,
3059630596+- 0x1.4f8c96f26a26ap-20,
3059730597+- 0x1.3caa61607f920p-20,
3059830598+- 0x1.2acee2f5ecdb8p-20,
3059930599+- 0x1.19ec60b1242edp-20,
3060030600+- 0x1.09f5cf4dd2877p-20,
3060130601+- 0x1.f5bd95d8730d8p-21,
3060230602+- 0x1.d9371e2ff7c35p-21,
3060330603+- 0x1.be41de54d155ap-21,
3060430604+- 0x1.a4c89e08ef4f3p-21,
3060530605+- 0x1.8cb738399b12cp-21,
3060630606+- 0x1.75fa8dbc84becp-21,
3060730607+- 0x1.608078a70dcbcp-21,
3060830608+- 0x1.4c37c0394d094p-21,
3060930609+- 0x1.39100d5687bfep-21,
3061030610+- 0x1.26f9df8519bd6p-21,
3061130611+- 0x1.15e6827001f18p-21,
3061230612+- 0x1.05c803e4831c1p-21,
3061330613+- 0x1.ed22548cffd35p-22,
3061430614+- 0x1.d06ad6ecdf971p-22,
3061530615+- 0x1.b551c847fbc96p-22,
3061630616+- 0x1.9bc09f112b494p-22,
3061730617+- 0x1.83a1ff0aa239dp-22,
3061830618+- 0x1.6ce1aa3fd7bddp-22,
3061930619+- 0x1.576c72b514859p-22,
3062030620+- 0x1.43302cc4a0da8p-22,
3062130621+- 0x1.301ba221dc9bbp-22,
3062230622+- 0x1.1e1e857adc568p-22,
3062330623+- 0x1.0d2966b1746f7p-22,
3062430624+- 0x1.fa5b4f49cc6b2p-23,
3062530625+- 0x1.dc3ae30b55c16p-23,
3062630626+- 0x1.bfd7555a3bd68p-23,
3062730627+- 0x1.a517d9e61628ap-23,
3062830628+- 0x1.8be4f8f6c951fp-23,
3062930629+- 0x1.74287ded49339p-23,
3063030630+- 0x1.5dcd669f2cd34p-23,
3063130631+- 0x1.48bfd38302870p-23,
3063230632+- 0x1.34ecf8a3c124ap-23,
3063330633+- 0x1.22430f521cbcfp-23,
3063430634+- 0x1.10b1488aeb235p-23,
3063530635+- 0x1.0027c00a263a6p-23,
3063630636+- 0x1.e12ee004efc37p-24,
3063730637+- 0x1.c3e44ae32b16bp-24,
3063830638+- 0x1.a854ea14102a8p-24,
3063930639+- 0x1.8e6761569f45dp-24,
3064030640+- 0x1.7603bac345f65p-24,
3064130641+- 0x1.5f1353cdad001p-24,
3064230642+- 0x1.4980cb3c80949p-24,
3064330643+- 0x1.3537f00b6ad4dp-24,
3064430644+- 0x1.2225b12bffc68p-24,
3064530645+- 0x1.10380e1adb7e9p-24,
3064630646+- 0x1.febc107d5efaap-25,
3064730647+- 0x1.df0f2a0ee6946p-25,
3064830648+- 0x1.c14b2188bcee4p-25,
3064930649+- 0x1.a553644f7f07dp-25,
3065030650+- 0x1.8b0cfce0579dfp-25,
3065130651+- 0x1.725e7c5dd20f7p-25,
3065230652+- 0x1.5b2fe547a1340p-25,
3065330653+- 0x1.456a974e92e93p-25,
3065430654+- 0x1.30f93c3699078p-25,
3065530655+- 0x1.1dc7b5b978cf8p-25,
3065630656+- 0x1.0bc30c5d52f15p-25,
3065730657+- 0x1.f5b2be65a0c7fp-26,
3065830658+- 0x1.d5f3a8dea7357p-26,
3065930659+- 0x1.b82915b03515bp-26,
3066030660+- 0x1.9c3517e789488p-26,
3066130661+- 0x1.81fb7df06136ep-26,
3066230662+- 0x1.6961b8d641d06p-26,
3066330663+- 0x1.524ec4d916caep-26,
3066430664+- 0x1.3cab1343d18d1p-26,
3066530665+- 0x1.2860757487a01p-26,
3066630666+- 0x1.155a09065d4f7p-26,
3066730667+- 0x1.0384250e4c9fcp-26,
3066830668+- 0x1.e59890b926c78p-27,
3066930669+- 0x1.c642116a8a9e3p-27,
3067030670+- 0x1.a8e405e651ab6p-27,
3067130671+- 0x1.8d5f98114f872p-27,
3067230672+- 0x1.7397c5a66e307p-27,
3067330673+- 0x1.5b71456c5a4c4p-27,
3067430674+- 0x1.44d26de513197p-27,
3067530675+- 0x1.2fa31d6371537p-27,
3067630676+- 0x1.1bcca373b7b43p-27,
3067730677+- 0x1.0939ab853339fp-27,
3067830678+- 0x1.efac5187b2863p-28,
3067930679+- 0x1.cf1e86235d0e6p-28,
3068030680+- 0x1.b0a68a2128babp-28,
3068130681+- 0x1.9423165bc4444p-28,
3068230682+- 0x1.7974e743dea3cp-28,
3068330683+- 0x1.607e9eacd1050p-28,
3068430684+- 0x1.4924a74dec728p-28,
3068530685+- 0x1.334d19e0c2160p-28,
3068630686+- 0x1.1edfa3c5f5ccap-28,
3068730687+- 0x1.0bc56f1b54701p-28,
3068830688+- 0x1.f3d2185e047d9p-29,
3068930689+- 0x1.d26cb87945e87p-29,
3069030690+- 0x1.b334fac4b9f99p-29,
3069130691+- 0x1.96076f7918d1cp-29,
3069230692+- 0x1.7ac2d72fc2c63p-29,
3069330693+- 0x1.614801550319ep-29,
3069430694+- 0x1.4979ac8b28926p-29,
3069530695+- 0x1.333c68e2d0548p-29,
3069630696+- 0x1.1e767bce37dd7p-29,
3069730697+- 0x1.0b0fc5b6d05a0p-29,
3069830698+- 0x1.f1e3523b41d7dp-30,
3069930699+- 0x1.d00de6608effep-30,
3070030700+- 0x1.b0778b7b3301ap-30,
3070130701+- 0x1.92fb04ec0f6cfp-30,
3070230702+- 0x1.77756ec9f78fap-30,
3070330703+- 0x1.5dc61922d5a06p-30,
3070430704+- 0x1.45ce65699ff6dp-30,
3070530705+- 0x1.2f71a5f159970p-30,
3070630706+- 0x1.1a94ff571654fp-30,
3070730707+- 0x1.071f4bbea09ecp-30,
3070830708+- 0x1.e9f1ff8ddd774p-31,
3070930709+- 0x1.c818223a202c7p-31,
3071030710+- 0x1.a887bd2b4404dp-31,
3071130711+- 0x1.8b1a336c5eb6bp-31,
3071230712+- 0x1.6fab63324088ap-31,
3071330713+- 0x1.56197e30205bap-31,
3071430714+- 0x1.3e44e45301b92p-31,
3071530715+- 0x1.281000bfe4c3fp-31,
3071630716+- 0x1.135f28f2d50b4p-31,
3071730717+- 0x1.00187dded5975p-31,
3071830718+- 0x1.dc479de0ef001p-32,
3071930719+- 0x1.bad4fdad3caa1p-32,
3072030720+- 0x1.9baed3ed27ab8p-32,
3072130721+- 0x1.7ead9ce4285bbp-32,
3072230722+- 0x1.63ac6b4edc88ep-32,
3072330723+- 0x1.4a88be2a6390cp-32,
3072430724+- 0x1.332259185f1a0p-32,
3072530725+- 0x1.1d5b1f3793044p-32,
3072630726+- 0x1.0916f04b6e18bp-32,
3072730727+- 0x1.ec77101de6926p-33,
3072830728+- 0x1.c960bf23153e0p-33,
3072930729+- 0x1.a8bd20fc65ef7p-33,
3073030730+- 0x1.8a61745ec7d1dp-33,
3073130731+- 0x1.6e25d0e756261p-33,
3073230732+- 0x1.53e4f7d1666cbp-33,
3073330733+- 0x1.3b7c27a7ddb0ep-33,
3073430734+- 0x1.24caf2c32af14p-33,
3073530735+- 0x1.0fb3186804d0fp-33,
3073630736+- 0x1.f830c0bb41fd7p-34,
3073730737+- 0x1.d3c0f1a91c846p-34,
3073830738+- 0x1.b1e5acf351d87p-34,
3073930739+- 0x1.92712d259ce66p-34,
3074030740+- 0x1.7538c60a04476p-34,
3074130741+- 0x1.5a14b04b47879p-34,
3074230742+- 0x1.40dfd87456f4cp-34,
3074330743+- 0x1.2977b1172b9d5p-34,
3074430744+- 0x1.13bc07e891491p-34,
3074530745+- 0x1.ff1dbb4300811p-35,
3074630746+- 0x1.d9a880f306bd8p-35,
3074730747+- 0x1.b6e45220b55e0p-35,
3074830748+- 0x1.96a0b33f2c4dap-35,
3074930749+- 0x1.78b07e9e924acp-35,
3075030750+- 0x1.5ce9ab1670dd2p-35,
3075130751+- 0x1.4325167006bb0p-35,
3075230752+- 0x1.2b3e53538ff3fp-35,
3075330753+- 0x1.15137a7f44864p-35,
3075430754+- 0x1.0084ff125639dp-35,
3075530755+- 0x1.daeb0b7311ec7p-36,
3075630756+- 0x1.b7937d1c40c52p-36,
3075730757+- 0x1.96d082f59ab06p-36,
3075830758+- 0x1.7872d9fa10aadp-36,
3075930759+- 0x1.5c4e8e37bc7d0p-36,
3076030760+- 0x1.423ac0df49a40p-36,
3076130761+- 0x1.2a117230ad284p-36,
3076230762+- 0x1.13af4f04f9998p-36,
3076330763+- 0x1.fde703724e560p-37,
3076430764+- 0x1.d77f0c82e7641p-37,
3076530765+- 0x1.b3ee02611d7ddp-37,
3076630766+- 0x1.92ff33023d5bdp-37,
3076730767+- 0x1.7481a9e69f53fp-37,
3076830768+- 0x1.5847eda620959p-37,
3076930769+- 0x1.3e27c1fcc74bdp-37,
3077030770+- 0x1.25f9ee0b923dcp-37,
3077130771+- 0x1.0f9a0686531ffp-37,
3077230772+- 0x1.f5cc7718082afp-38,
3077330773+- 0x1.cf7e53d6a2ca5p-38,
3077430774+- 0x1.ac0f5f3229372p-38,
3077530775+- 0x1.8b498644847eap-38,
3077630776+- 0x1.6cfa9bcca59dcp-38,
3077730777+- 0x1.50f411d4fd2cdp-38,
3077830778+- 0x1.370ab8327af5ep-38,
3077930779+- 0x1.1f167f88c6b6ep-38,
3078030780+- 0x1.08f24085d4597p-38,
3078130781+- 0x1.e8f70e181d619p-39,
3078230782+- 0x1.c324c20e337dcp-39,
3078330783+- 0x1.a03261574b54ep-39,
3078430784+- 0x1.7fe903cdf5855p-39,
3078530785+- 0x1.6215c58da3450p-39,
3078630786+- 0x1.46897d4b69fc6p-39,
3078730787+- 0x1.2d1877d731b7bp-39,
3078830788+- 0x1.159a386b11517p-39,
3078930789+- 0x1.ffd27ae9393cep-40,
3079030790+- 0x1.d7c593130dd0bp-40,
3079130791+- 0x1.b2cd607c79bcfp-40,
3079230792+- 0x1.90ae4d3405651p-40,
3079330793+- 0x1.71312dd1759e2p-40,
3079430794+- 0x1.5422ef5d8949dp-40,
3079530795+- 0x1.39544b0ecc957p-40,
3079630796+- 0x1.20997f73e73ddp-40,
3079730797+- 0x1.09ca0eaacd277p-40,
3079830798+- 0x1.e9810295890ecp-41,
3079930799+- 0x1.c2b45b5aa4a1dp-41,
3080030800+- 0x1.9eee068fa7596p-41,
3080130801+- 0x1.7df2b399c10a8p-41,
3080230802+- 0x1.5f8b87a31bd85p-41,
3080330803+- 0x1.4385c96e9a2d9p-41,
3080430804+- 0x1.29b2933ef4cbcp-41,
3080530805+- 0x1.11e68a6378f8ap-41,
3080630806+- 0x1.f7f338086a86bp-42,
3080730807+- 0x1.cf8d7d9ce040ap-42,
3080830808+- 0x1.aa577251ae484p-42,
3080930809+- 0x1.8811d739efb5ep-42,
3081030810+- 0x1.68823e52970bep-42,
3081130811+- 0x1.4b72ae68e8b4cp-42,
3081230812+- 0x1.30b14dbe876bcp-42,
3081330813+- 0x1.181012ef86610p-42,
3081430814+- 0x1.01647ba798744p-42,
3081530815+- 0x1.d90e917701675p-43,
3081630816+- 0x1.b2a87e86d0c8ap-43,
3081730817+- 0x1.8f53dcb377293p-43,
3081830818+- 0x1.6ed2f2515e933p-43,
3081930819+- 0x1.50ecc9ed47f19p-43,
3082030820+- 0x1.356cd5ce7799ep-43,
3082130821+- 0x1.1c229a587ab78p-43,
3082230822+- 0x1.04e15ecc7f3f6p-43,
3082330823+- 0x1.deffc7e6a6017p-44,
3082430824+- 0x1.b7b040832f310p-44,
3082530825+- 0x1.938e021f36d76p-44,
3082630826+- 0x1.7258610b3b233p-44,
3082730827+- 0x1.53d3bfc82a909p-44,
3082830828+- 0x1.37c92babdc2fdp-44,
3082930829+- 0x1.1e06010120f6ap-44,
3083030830+- 0x1.065b9616170d4p-44,
3083130831+- 0x1.e13dd96b3753ap-45,
3083230832+- 0x1.b950d32467392p-45,
3083330833+- 0x1.94a72263259a5p-45,
3083430834+- 0x1.72fd93e036cdcp-45,
3083530835+- 0x1.54164576929abp-45,
3083630836+- 0x1.37b83c521fe96p-45,
3083730837+- 0x1.1daf033182e96p-45,
3083830838+- 0x1.05ca50205d26ap-45,
3083930839+- 0x1.dfbb6235639fap-46,
3084030840+- 0x1.b7807e294781fp-46,
3084130841+- 0x1.9298add70a734p-46,
3084230842+- 0x1.70beaf9c7ffb6p-46,
3084330843+- 0x1.51b2cd6709222p-46,
3084430844+- 0x1.353a6cf7f7fffp-46,
3084530845+- 0x1.1b1fa8cbe84a7p-46,
3084630846+- 0x1.0330f0fd69921p-46,
3084730847+- 0x1.da81670f96f9bp-47,
3084830848+- 0x1.b24a16b4d09aap-47,
3084930849+- 0x1.8d6eeb6efdbd6p-47,
3085030850+- 0x1.6ba91ac734785p-47,
3085130851+- 0x1.4cb7966770ab5p-47,
3085230852+- 0x1.305e9721d0981p-47,
3085330853+- 0x1.1667311fff70ap-47,
3085430854+- 0x1.fd3de10d62855p-48,
3085530855+- 0x1.d1aefbcd48d0cp-48,
3085630856+- 0x1.a9cc93c25aca9p-48,
3085730857+- 0x1.85487ee3ea735p-48,
3085830858+- 0x1.63daf8b4b1e0cp-48,
3085930859+- 0x1.45421e69a6ca1p-48,
3086030860+- 0x1.294175802d99ap-48,
3086130861+- 0x1.0fa17bf41068fp-48,
3086230862+- 0x1.f05e82aae2bb9p-49,
3086330863+- 0x1.c578101b29058p-49,
3086430864+- 0x1.9e39dc5dd2f7cp-49,
3086530865+- 0x1.7a553a728bbf2p-49,
3086630866+- 0x1.5982008db1304p-49,
3086730867+- 0x1.3b7e00422e51bp-49,
3086830868+- 0x1.200c898d9ee3ep-49,
3086930869+- 0x1.06f5f7eb65a56p-49,
3087030870+- 0x1.e00e9148a1d25p-50,
3087130871+- 0x1.b623734024e92p-50,
3087230872+- 0x1.8fd4e01891bf8p-50,
3087330873+- 0x1.6cd44c7470d89p-50,
3087430874+- 0x1.4cd9c04158cd7p-50,
3087530875+- 0x1.2fa34bf5c8344p-50,
3087630876+- 0x1.14f4890ff2461p-50,
3087730877+- 0x1.f92c49dfa4df5p-51,
3087830878+- 0x1.ccaaea71ab0dfp-51,
3087930879+- 0x1.a40829f001197p-51,
3088030880+- 0x1.7eef13b59e96cp-51,
3088130881+- 0x1.5d11e1a252bf5p-51,
3088230882+- 0x1.3e296303b2297p-51,
3088330883+- 0x1.21f47009f43cep-51,
3088430884+- 0x1.083768c5e4541p-51,
3088530885+- 0x1.e1777d831265ep-52,
3088630886+- 0x1.b69f10b0191b5p-52,
3088730887+- 0x1.8f8a3a05b5b52p-52,
3088830888+- 0x1.6be573c40c8e7p-52,
3088930889+- 0x1.4b645ba991fdbp-52,
3089030890+- 0x1.2dc119095729fp-52,
3089130891+- },
3089230892+-};
3089330893+diff --git a/sysdeps/aarch64/fpu/sv_erff_data.c b/sysdeps/aarch64/fpu/sv_erff_data.c
3089430894+deleted file mode 100644
3089530895+index 6dcd72af69..0000000000
3089630896+--- a/sysdeps/aarch64/fpu/sv_erff_data.c
3089730897++++ /dev/null
3089830898+@@ -1,1058 +0,0 @@
3089930899+-/* Table for SVE erff approximation
3090030900+-
3090130901+- Copyright (C) 2024 Free Software Foundation, Inc.
3090230902+- This file is part of the GNU C Library.
3090330903+-
3090430904+- The GNU C Library is free software; you can redistribute it and/or
3090530905+- modify it under the terms of the GNU Lesser General Public
3090630906+- License as published by the Free Software Foundation; either
3090730907+- version 2.1 of the License, or (at your option) any later version.
3090830908+-
3090930909+- The GNU C Library is distributed in the hope that it will be useful,
3091030910+- but WITHOUT ANY WARRANTY; without even the implied warranty of
3091130911+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3091230912+- Lesser General Public License for more details.
3091330913+-
3091430914+- You should have received a copy of the GNU Lesser General Public
3091530915+- License along with the GNU C Library; if not, see
3091630916+- <https://www.gnu.org/licenses/>. */
3091730917+-
3091830918+-#include "vecmath_config.h"
3091930919+-
3092030920+-/* Lookup table used in SVE erff.
3092130921+- For each possible rounded input r (multiples of 1/128), between
3092230922+- r = 0.0 and r = 4.0 (513 values):
3092330923+- - __erff_data.erf contains the values of erf(r),
3092430924+- - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2).
3092530925+- Note that indices 0 and 1 are never hit by the algorithm, since lookup is
3092630926+- performed only for x >= 1/64-1/512. */
3092730927+-const struct sv_erff_data __sv_erff_data = {
3092830928+- .erf = { 0x0.000000p+0,
3092930929+- 0x1.20dbf4p-7,
3093030930+- 0x1.20d770p-6,
3093130931+- 0x1.b137e0p-6,
3093230932+- 0x1.20c564p-5,
3093330933+- 0x1.68e5d4p-5,
3093430934+- 0x1.b0fafep-5,
3093530935+- 0x1.f902a8p-5,
3093630936+- 0x1.207d48p-4,
3093730937+- 0x1.44703ep-4,
3093830938+- 0x1.68591ap-4,
3093930939+- 0x1.8c36bep-4,
3094030940+- 0x1.b00812p-4,
3094130941+- 0x1.d3cbf8p-4,
3094230942+- 0x1.f7815ap-4,
3094330943+- 0x1.0d9390p-3,
3094430944+- 0x1.1f5e1ap-3,
3094530945+- 0x1.311fc2p-3,
3094630946+- 0x1.42d7fcp-3,
3094730947+- 0x1.548642p-3,
3094830948+- 0x1.662a0cp-3,
3094930949+- 0x1.77c2d2p-3,
3095030950+- 0x1.895010p-3,
3095130951+- 0x1.9ad142p-3,
3095230952+- 0x1.ac45e4p-3,
3095330953+- 0x1.bdad72p-3,
3095430954+- 0x1.cf076ep-3,
3095530955+- 0x1.e05354p-3,
3095630956+- 0x1.f190aap-3,
3095730957+- 0x1.015f78p-2,
3095830958+- 0x1.09eed6p-2,
3095930959+- 0x1.127632p-2,
3096030960+- 0x1.1af54ep-2,
3096130961+- 0x1.236bf0p-2,
3096230962+- 0x1.2bd9dcp-2,
3096330963+- 0x1.343ed6p-2,
3096430964+- 0x1.3c9aa8p-2,
3096530965+- 0x1.44ed18p-2,
3096630966+- 0x1.4d35f0p-2,
3096730967+- 0x1.5574f4p-2,
3096830968+- 0x1.5da9f4p-2,
3096930969+- 0x1.65d4b8p-2,
3097030970+- 0x1.6df50ap-2,
3097130971+- 0x1.760abap-2,
3097230972+- 0x1.7e1594p-2,
3097330973+- 0x1.861566p-2,
3097430974+- 0x1.8e0a02p-2,
3097530975+- 0x1.95f336p-2,
3097630976+- 0x1.9dd0d2p-2,
3097730977+- 0x1.a5a2acp-2,
3097830978+- 0x1.ad6896p-2,
3097930979+- 0x1.b52264p-2,
3098030980+- 0x1.bccfecp-2,
3098130981+- 0x1.c47104p-2,
3098230982+- 0x1.cc0584p-2,
3098330983+- 0x1.d38d44p-2,
3098430984+- 0x1.db081cp-2,
3098530985+- 0x1.e275eap-2,
3098630986+- 0x1.e9d68ap-2,
3098730987+- 0x1.f129d4p-2,
3098830988+- 0x1.f86faap-2,
3098930989+- 0x1.ffa7eap-2,
3099030990+- 0x1.03693ap-1,
3099130991+- 0x1.06f794p-1,
3099230992+- 0x1.0a7ef6p-1,
3099330993+- 0x1.0dff50p-1,
3099430994+- 0x1.117894p-1,
3099530995+- 0x1.14eab4p-1,
3099630996+- 0x1.1855a6p-1,
3099730997+- 0x1.1bb95cp-1,
3099830998+- 0x1.1f15ccp-1,
3099930999+- 0x1.226ae8p-1,
3100031000+- 0x1.25b8a8p-1,
3100131001+- 0x1.28ff02p-1,
3100231002+- 0x1.2c3decp-1,
3100331003+- 0x1.2f755cp-1,
3100431004+- 0x1.32a54cp-1,
3100531005+- 0x1.35cdb4p-1,
3100631006+- 0x1.38ee8ap-1,
3100731007+- 0x1.3c07cap-1,
3100831008+- 0x1.3f196ep-1,
3100931009+- 0x1.42236ep-1,
3101031010+- 0x1.4525c8p-1,
3101131011+- 0x1.482074p-1,
3101231012+- 0x1.4b1372p-1,
3101331013+- 0x1.4dfebap-1,
3101431014+- 0x1.50e24cp-1,
3101531015+- 0x1.53be26p-1,
3101631016+- 0x1.569244p-1,
3101731017+- 0x1.595ea6p-1,
3101831018+- 0x1.5c2348p-1,
3101931019+- 0x1.5ee02ep-1,
3102031020+- 0x1.619556p-1,
3102131021+- 0x1.6442c0p-1,
3102231022+- 0x1.66e86ep-1,
3102331023+- 0x1.69865ep-1,
3102431024+- 0x1.6c1c98p-1,
3102531025+- 0x1.6eab18p-1,
3102631026+- 0x1.7131e6p-1,
3102731027+- 0x1.73b102p-1,
3102831028+- 0x1.762870p-1,
3102931029+- 0x1.789836p-1,
3103031030+- 0x1.7b0058p-1,
3103131031+- 0x1.7d60d8p-1,
3103231032+- 0x1.7fb9c0p-1,
3103331033+- 0x1.820b12p-1,
3103431034+- 0x1.8454d6p-1,
3103531035+- 0x1.869712p-1,
3103631036+- 0x1.88d1cep-1,
3103731037+- 0x1.8b050ep-1,
3103831038+- 0x1.8d30dep-1,
3103931039+- 0x1.8f5544p-1,
3104031040+- 0x1.91724ap-1,
3104131041+- 0x1.9387f6p-1,
3104231042+- 0x1.959652p-1,
3104331043+- 0x1.979d68p-1,
3104431044+- 0x1.999d42p-1,
3104531045+- 0x1.9b95e8p-1,
3104631046+- 0x1.9d8768p-1,
3104731047+- 0x1.9f71cap-1,
3104831048+- 0x1.a1551ap-1,
3104931049+- 0x1.a33162p-1,
3105031050+- 0x1.a506b0p-1,
3105131051+- 0x1.a6d50cp-1,
3105231052+- 0x1.a89c86p-1,
3105331053+- 0x1.aa5d26p-1,
3105431054+- 0x1.ac16fcp-1,
3105531055+- 0x1.adca14p-1,
3105631056+- 0x1.af767ap-1,
3105731057+- 0x1.b11c3cp-1,
3105831058+- 0x1.b2bb68p-1,
3105931059+- 0x1.b4540ap-1,
3106031060+- 0x1.b5e630p-1,
3106131061+- 0x1.b771e8p-1,
3106231062+- 0x1.b8f742p-1,
3106331063+- 0x1.ba764ap-1,
3106431064+- 0x1.bbef10p-1,
3106531065+- 0x1.bd61a2p-1,
3106631066+- 0x1.bece0ep-1,
3106731067+- 0x1.c03464p-1,
3106831068+- 0x1.c194b2p-1,
3106931069+- 0x1.c2ef08p-1,
3107031070+- 0x1.c44376p-1,
3107131071+- 0x1.c5920ap-1,
3107231072+- 0x1.c6dad2p-1,
3107331073+- 0x1.c81de2p-1,
3107431074+- 0x1.c95b46p-1,
3107531075+- 0x1.ca930ep-1,
3107631076+- 0x1.cbc54cp-1,
3107731077+- 0x1.ccf20cp-1,
3107831078+- 0x1.ce1962p-1,
3107931079+- 0x1.cf3b5cp-1,
3108031080+- 0x1.d0580cp-1,
3108131081+- 0x1.d16f7ep-1,
3108231082+- 0x1.d281c4p-1,
3108331083+- 0x1.d38ef0p-1,
3108431084+- 0x1.d49710p-1,
3108531085+- 0x1.d59a34p-1,
3108631086+- 0x1.d6986cp-1,
3108731087+- 0x1.d791cap-1,
3108831088+- 0x1.d8865ep-1,
3108931089+- 0x1.d97636p-1,
3109031090+- 0x1.da6162p-1,
3109131091+- 0x1.db47f4p-1,
3109231092+- 0x1.dc29fcp-1,
3109331093+- 0x1.dd0788p-1,
3109431094+- 0x1.dde0aap-1,
3109531095+- 0x1.deb570p-1,
3109631096+- 0x1.df85eap-1,
3109731097+- 0x1.e0522ap-1,
3109831098+- 0x1.e11a3ep-1,
3109931099+- 0x1.e1de36p-1,
3110031100+- 0x1.e29e22p-1,
3110131101+- 0x1.e35a12p-1,
3110231102+- 0x1.e41214p-1,
3110331103+- 0x1.e4c638p-1,
3110431104+- 0x1.e5768cp-1,
3110531105+- 0x1.e62322p-1,
3110631106+- 0x1.e6cc08p-1,
3110731107+- 0x1.e7714ap-1,
3110831108+- 0x1.e812fcp-1,
3110931109+- 0x1.e8b12ap-1,
3111031110+- 0x1.e94be4p-1,
3111131111+- 0x1.e9e336p-1,
3111231112+- 0x1.ea7730p-1,
3111331113+- 0x1.eb07e2p-1,
3111431114+- 0x1.eb9558p-1,
3111531115+- 0x1.ec1fa2p-1,
3111631116+- 0x1.eca6ccp-1,
3111731117+- 0x1.ed2ae6p-1,
3111831118+- 0x1.edabfcp-1,
3111931119+- 0x1.ee2a1ep-1,
3112031120+- 0x1.eea556p-1,
3112131121+- 0x1.ef1db4p-1,
3112231122+- 0x1.ef9344p-1,
3112331123+- 0x1.f00614p-1,
3112431124+- 0x1.f07630p-1,
3112531125+- 0x1.f0e3a6p-1,
3112631126+- 0x1.f14e82p-1,
3112731127+- 0x1.f1b6d0p-1,
3112831128+- 0x1.f21ca0p-1,
3112931129+- 0x1.f27ff8p-1,
3113031130+- 0x1.f2e0eap-1,
3113131131+- 0x1.f33f7ep-1,
3113231132+- 0x1.f39bc2p-1,
3113331133+- 0x1.f3f5c2p-1,
3113431134+- 0x1.f44d88p-1,
3113531135+- 0x1.f4a31ep-1,
3113631136+- 0x1.f4f694p-1,
3113731137+- 0x1.f547f2p-1,
3113831138+- 0x1.f59742p-1,
3113931139+- 0x1.f5e490p-1,
3114031140+- 0x1.f62fe8p-1,
3114131141+- 0x1.f67952p-1,
3114231142+- 0x1.f6c0dcp-1,
3114331143+- 0x1.f7068cp-1,
3114431144+- 0x1.f74a6ep-1,
3114531145+- 0x1.f78c8cp-1,
3114631146+- 0x1.f7cceep-1,
3114731147+- 0x1.f80ba2p-1,
3114831148+- 0x1.f848acp-1,
3114931149+- 0x1.f8841ap-1,
3115031150+- 0x1.f8bdf2p-1,
3115131151+- 0x1.f8f63ep-1,
3115231152+- 0x1.f92d08p-1,
3115331153+- 0x1.f96256p-1,
3115431154+- 0x1.f99634p-1,
3115531155+- 0x1.f9c8a8p-1,
3115631156+- 0x1.f9f9bap-1,
3115731157+- 0x1.fa2974p-1,
3115831158+- 0x1.fa57dep-1,
3115931159+- 0x1.fa84fep-1,
3116031160+- 0x1.fab0dep-1,
3116131161+- 0x1.fadb84p-1,
3116231162+- 0x1.fb04f6p-1,
3116331163+- 0x1.fb2d40p-1,
3116431164+- 0x1.fb5464p-1,
3116531165+- 0x1.fb7a6cp-1,
3116631166+- 0x1.fb9f60p-1,
3116731167+- 0x1.fbc344p-1,
3116831168+- 0x1.fbe61ep-1,
3116931169+- 0x1.fc07fap-1,
3117031170+- 0x1.fc28d8p-1,
3117131171+- 0x1.fc48c2p-1,
3117231172+- 0x1.fc67bcp-1,
3117331173+- 0x1.fc85d0p-1,
3117431174+- 0x1.fca2fep-1,
3117531175+- 0x1.fcbf52p-1,
3117631176+- 0x1.fcdaccp-1,
3117731177+- 0x1.fcf576p-1,
3117831178+- 0x1.fd0f54p-1,
3117931179+- 0x1.fd286ap-1,
3118031180+- 0x1.fd40bep-1,
3118131181+- 0x1.fd5856p-1,
3118231182+- 0x1.fd6f34p-1,
3118331183+- 0x1.fd8562p-1,
3118431184+- 0x1.fd9ae2p-1,
3118531185+- 0x1.fdafb8p-1,
3118631186+- 0x1.fdc3e8p-1,
3118731187+- 0x1.fdd77ap-1,
3118831188+- 0x1.fdea6ep-1,
3118931189+- 0x1.fdfcccp-1,
3119031190+- 0x1.fe0e96p-1,
3119131191+- 0x1.fe1fd0p-1,
3119231192+- 0x1.fe3080p-1,
3119331193+- 0x1.fe40a6p-1,
3119431194+- 0x1.fe504cp-1,
3119531195+- 0x1.fe5f70p-1,
3119631196+- 0x1.fe6e18p-1,
3119731197+- 0x1.fe7c46p-1,
3119831198+- 0x1.fe8a00p-1,
3119931199+- 0x1.fe9748p-1,
3120031200+- 0x1.fea422p-1,
3120131201+- 0x1.feb090p-1,
3120231202+- 0x1.febc96p-1,
3120331203+- 0x1.fec836p-1,
3120431204+- 0x1.fed374p-1,
3120531205+- 0x1.fede52p-1,
3120631206+- 0x1.fee8d4p-1,
3120731207+- 0x1.fef2fep-1,
3120831208+- 0x1.fefccep-1,
3120931209+- 0x1.ff064cp-1,
3121031210+- 0x1.ff0f76p-1,
3121131211+- 0x1.ff1852p-1,
3121231212+- 0x1.ff20e0p-1,
3121331213+- 0x1.ff2924p-1,
3121431214+- 0x1.ff3120p-1,
3121531215+- 0x1.ff38d6p-1,
3121631216+- 0x1.ff4048p-1,
3121731217+- 0x1.ff4778p-1,
3121831218+- 0x1.ff4e68p-1,
3121931219+- 0x1.ff551ap-1,
3122031220+- 0x1.ff5b90p-1,
3122131221+- 0x1.ff61ccp-1,
3122231222+- 0x1.ff67d0p-1,
3122331223+- 0x1.ff6d9ep-1,
3122431224+- 0x1.ff7338p-1,
3122531225+- 0x1.ff789ep-1,
3122631226+- 0x1.ff7dd4p-1,
3122731227+- 0x1.ff82dap-1,
3122831228+- 0x1.ff87b2p-1,
3122931229+- 0x1.ff8c5cp-1,
3123031230+- 0x1.ff90dcp-1,
3123131231+- 0x1.ff9532p-1,
3123231232+- 0x1.ff9960p-1,
3123331233+- 0x1.ff9d68p-1,
3123431234+- 0x1.ffa14ap-1,
3123531235+- 0x1.ffa506p-1,
3123631236+- 0x1.ffa8a0p-1,
3123731237+- 0x1.ffac18p-1,
3123831238+- 0x1.ffaf6ep-1,
3123931239+- 0x1.ffb2a6p-1,
3124031240+- 0x1.ffb5bep-1,
3124131241+- 0x1.ffb8b8p-1,
3124231242+- 0x1.ffbb98p-1,
3124331243+- 0x1.ffbe5ap-1,
3124431244+- 0x1.ffc102p-1,
3124531245+- 0x1.ffc390p-1,
3124631246+- 0x1.ffc606p-1,
3124731247+- 0x1.ffc862p-1,
3124831248+- 0x1.ffcaa8p-1,
3124931249+- 0x1.ffccd8p-1,
3125031250+- 0x1.ffcef4p-1,
3125131251+- 0x1.ffd0fap-1,
3125231252+- 0x1.ffd2eap-1,
3125331253+- 0x1.ffd4cap-1,
3125431254+- 0x1.ffd696p-1,
3125531255+- 0x1.ffd84ep-1,
3125631256+- 0x1.ffd9f8p-1,
3125731257+- 0x1.ffdb90p-1,
3125831258+- 0x1.ffdd18p-1,
3125931259+- 0x1.ffde90p-1,
3126031260+- 0x1.ffdffap-1,
3126131261+- 0x1.ffe154p-1,
3126231262+- 0x1.ffe2a2p-1,
3126331263+- 0x1.ffe3e2p-1,
3126431264+- 0x1.ffe514p-1,
3126531265+- 0x1.ffe63cp-1,
3126631266+- 0x1.ffe756p-1,
3126731267+- 0x1.ffe866p-1,
3126831268+- 0x1.ffe96ap-1,
3126931269+- 0x1.ffea64p-1,
3127031270+- 0x1.ffeb54p-1,
3127131271+- 0x1.ffec3ap-1,
3127231272+- 0x1.ffed16p-1,
3127331273+- 0x1.ffedeap-1,
3127431274+- 0x1.ffeeb4p-1,
3127531275+- 0x1.ffef76p-1,
3127631276+- 0x1.fff032p-1,
3127731277+- 0x1.fff0e4p-1,
3127831278+- 0x1.fff18ep-1,
3127931279+- 0x1.fff232p-1,
3128031280+- 0x1.fff2d0p-1,
3128131281+- 0x1.fff366p-1,
3128231282+- 0x1.fff3f6p-1,
3128331283+- 0x1.fff480p-1,
3128431284+- 0x1.fff504p-1,
3128531285+- 0x1.fff582p-1,
3128631286+- 0x1.fff5fcp-1,
3128731287+- 0x1.fff670p-1,
3128831288+- 0x1.fff6dep-1,
3128931289+- 0x1.fff74ap-1,
3129031290+- 0x1.fff7aep-1,
3129131291+- 0x1.fff810p-1,
3129231292+- 0x1.fff86cp-1,
3129331293+- 0x1.fff8c6p-1,
3129431294+- 0x1.fff91cp-1,
3129531295+- 0x1.fff96cp-1,
3129631296+- 0x1.fff9bap-1,
3129731297+- 0x1.fffa04p-1,
3129831298+- 0x1.fffa4cp-1,
3129931299+- 0x1.fffa90p-1,
3130031300+- 0x1.fffad0p-1,
3130131301+- 0x1.fffb0ep-1,
3130231302+- 0x1.fffb4ap-1,
3130331303+- 0x1.fffb82p-1,
3130431304+- 0x1.fffbb8p-1,
3130531305+- 0x1.fffbecp-1,
3130631306+- 0x1.fffc1ep-1,
3130731307+- 0x1.fffc4ep-1,
3130831308+- 0x1.fffc7ap-1,
3130931309+- 0x1.fffca6p-1,
3131031310+- 0x1.fffccep-1,
3131131311+- 0x1.fffcf6p-1,
3131231312+- 0x1.fffd1ap-1,
3131331313+- 0x1.fffd3ep-1,
3131431314+- 0x1.fffd60p-1,
3131531315+- 0x1.fffd80p-1,
3131631316+- 0x1.fffda0p-1,
3131731317+- 0x1.fffdbep-1,
3131831318+- 0x1.fffddap-1,
3131931319+- 0x1.fffdf4p-1,
3132031320+- 0x1.fffe0ep-1,
3132131321+- 0x1.fffe26p-1,
3132231322+- 0x1.fffe3ep-1,
3132331323+- 0x1.fffe54p-1,
3132431324+- 0x1.fffe68p-1,
3132531325+- 0x1.fffe7ep-1,
3132631326+- 0x1.fffe90p-1,
3132731327+- 0x1.fffea2p-1,
3132831328+- 0x1.fffeb4p-1,
3132931329+- 0x1.fffec4p-1,
3133031330+- 0x1.fffed4p-1,
3133131331+- 0x1.fffee4p-1,
3133231332+- 0x1.fffef2p-1,
3133331333+- 0x1.ffff00p-1,
3133431334+- 0x1.ffff0cp-1,
3133531335+- 0x1.ffff18p-1,
3133631336+- 0x1.ffff24p-1,
3133731337+- 0x1.ffff30p-1,
3133831338+- 0x1.ffff3ap-1,
3133931339+- 0x1.ffff44p-1,
3134031340+- 0x1.ffff4ep-1,
3134131341+- 0x1.ffff56p-1,
3134231342+- 0x1.ffff60p-1,
3134331343+- 0x1.ffff68p-1,
3134431344+- 0x1.ffff70p-1,
3134531345+- 0x1.ffff78p-1,
3134631346+- 0x1.ffff7ep-1,
3134731347+- 0x1.ffff84p-1,
3134831348+- 0x1.ffff8cp-1,
3134931349+- 0x1.ffff92p-1,
3135031350+- 0x1.ffff98p-1,
3135131351+- 0x1.ffff9cp-1,
3135231352+- 0x1.ffffa2p-1,
3135331353+- 0x1.ffffa6p-1,
3135431354+- 0x1.ffffacp-1,
3135531355+- 0x1.ffffb0p-1,
3135631356+- 0x1.ffffb4p-1,
3135731357+- 0x1.ffffb8p-1,
3135831358+- 0x1.ffffbcp-1,
3135931359+- 0x1.ffffc0p-1,
3136031360+- 0x1.ffffc4p-1,
3136131361+- 0x1.ffffc6p-1,
3136231362+- 0x1.ffffcap-1,
3136331363+- 0x1.ffffccp-1,
3136431364+- 0x1.ffffd0p-1,
3136531365+- 0x1.ffffd2p-1,
3136631366+- 0x1.ffffd4p-1,
3136731367+- 0x1.ffffd6p-1,
3136831368+- 0x1.ffffd8p-1,
3136931369+- 0x1.ffffdcp-1,
3137031370+- 0x1.ffffdep-1,
3137131371+- 0x1.ffffdep-1,
3137231372+- 0x1.ffffe0p-1,
3137331373+- 0x1.ffffe2p-1,
3137431374+- 0x1.ffffe4p-1,
3137531375+- 0x1.ffffe6p-1,
3137631376+- 0x1.ffffe8p-1,
3137731377+- 0x1.ffffe8p-1,
3137831378+- 0x1.ffffeap-1,
3137931379+- 0x1.ffffeap-1,
3138031380+- 0x1.ffffecp-1,
3138131381+- 0x1.ffffeep-1,
3138231382+- 0x1.ffffeep-1,
3138331383+- 0x1.fffff0p-1,
3138431384+- 0x1.fffff0p-1,
3138531385+- 0x1.fffff2p-1,
3138631386+- 0x1.fffff2p-1,
3138731387+- 0x1.fffff2p-1,
3138831388+- 0x1.fffff4p-1,
3138931389+- 0x1.fffff4p-1,
3139031390+- 0x1.fffff4p-1,
3139131391+- 0x1.fffff6p-1,
3139231392+- 0x1.fffff6p-1,
3139331393+- 0x1.fffff6p-1,
3139431394+- 0x1.fffff8p-1,
3139531395+- 0x1.fffff8p-1,
3139631396+- 0x1.fffff8p-1,
3139731397+- 0x1.fffff8p-1,
3139831398+- 0x1.fffffap-1,
3139931399+- 0x1.fffffap-1,
3140031400+- 0x1.fffffap-1,
3140131401+- 0x1.fffffap-1,
3140231402+- 0x1.fffffap-1,
3140331403+- 0x1.fffffap-1,
3140431404+- 0x1.fffffcp-1,
3140531405+- 0x1.fffffcp-1,
3140631406+- 0x1.fffffcp-1,
3140731407+- 0x1.fffffcp-1,
3140831408+- 0x1.fffffcp-1,
3140931409+- 0x1.fffffcp-1,
3141031410+- 0x1.fffffcp-1,
3141131411+- 0x1.fffffcp-1,
3141231412+- 0x1.fffffep-1,
3141331413+- 0x1.fffffep-1,
3141431414+- 0x1.fffffep-1,
3141531415+- 0x1.fffffep-1,
3141631416+- 0x1.fffffep-1,
3141731417+- 0x1.fffffep-1,
3141831418+- 0x1.fffffep-1,
3141931419+- 0x1.fffffep-1,
3142031420+- 0x1.fffffep-1,
3142131421+- 0x1.fffffep-1,
3142231422+- 0x1.fffffep-1,
3142331423+- 0x1.fffffep-1,
3142431424+- 0x1.fffffep-1,
3142531425+- 0x1.fffffep-1,
3142631426+- 0x1.fffffep-1,
3142731427+- 0x1.fffffep-1,
3142831428+- 0x1.fffffep-1,
3142931429+- 0x1.fffffep-1,
3143031430+- 0x1.000000p+0,
3143131431+- 0x1.000000p+0,
3143231432+- 0x1.000000p+0,
3143331433+- 0x1.000000p+0,
3143431434+- 0x1.000000p+0,
3143531435+- 0x1.000000p+0,
3143631436+- 0x1.000000p+0,
3143731437+- 0x1.000000p+0,
3143831438+- 0x1.000000p+0,
3143931439+- 0x1.000000p+0,
3144031440+- 0x1.000000p+0,
3144131441+- },
3144231442+- .scale = { 0x1.20dd76p+0,
3144331443+- 0x1.20d8f2p+0,
3144431444+- 0x1.20cb68p+0,
3144531445+- 0x1.20b4d8p+0,
3144631446+- 0x1.209546p+0,
3144731447+- 0x1.206cb4p+0,
3144831448+- 0x1.203b26p+0,
3144931449+- 0x1.2000a0p+0,
3145031450+- 0x1.1fbd28p+0,
3145131451+- 0x1.1f70c4p+0,
3145231452+- 0x1.1f1b7ap+0,
3145331453+- 0x1.1ebd56p+0,
3145431454+- 0x1.1e565cp+0,
3145531455+- 0x1.1de698p+0,
3145631456+- 0x1.1d6e14p+0,
3145731457+- 0x1.1cecdcp+0,
3145831458+- 0x1.1c62fap+0,
3145931459+- 0x1.1bd07cp+0,
3146031460+- 0x1.1b3572p+0,
3146131461+- 0x1.1a91e6p+0,
3146231462+- 0x1.19e5eap+0,
3146331463+- 0x1.19318cp+0,
3146431464+- 0x1.1874dep+0,
3146531465+- 0x1.17aff0p+0,
3146631466+- 0x1.16e2d8p+0,
3146731467+- 0x1.160da4p+0,
3146831468+- 0x1.153068p+0,
3146931469+- 0x1.144b3cp+0,
3147031470+- 0x1.135e30p+0,
3147131471+- 0x1.12695ep+0,
3147231472+- 0x1.116cd8p+0,
3147331473+- 0x1.1068bap+0,
3147431474+- 0x1.0f5d16p+0,
3147531475+- 0x1.0e4a08p+0,
3147631476+- 0x1.0d2fa6p+0,
3147731477+- 0x1.0c0e0ap+0,
3147831478+- 0x1.0ae550p+0,
3147931479+- 0x1.09b590p+0,
3148031480+- 0x1.087ee4p+0,
3148131481+- 0x1.07416cp+0,
3148231482+- 0x1.05fd3ep+0,
3148331483+- 0x1.04b27cp+0,
3148431484+- 0x1.036140p+0,
3148531485+- 0x1.0209a6p+0,
3148631486+- 0x1.00abd0p+0,
3148731487+- 0x1.fe8fb0p-1,
3148831488+- 0x1.fbbbbep-1,
3148931489+- 0x1.f8dc0ap-1,
3149031490+- 0x1.f5f0cep-1,
3149131491+- 0x1.f2fa4cp-1,
3149231492+- 0x1.eff8c4p-1,
3149331493+- 0x1.ecec78p-1,
3149431494+- 0x1.e9d5a8p-1,
3149531495+- 0x1.e6b498p-1,
3149631496+- 0x1.e38988p-1,
3149731497+- 0x1.e054bep-1,
3149831498+- 0x1.dd167cp-1,
3149931499+- 0x1.d9cf06p-1,
3150031500+- 0x1.d67ea2p-1,
3150131501+- 0x1.d32592p-1,
3150231502+- 0x1.cfc41ep-1,
3150331503+- 0x1.cc5a8ap-1,
3150431504+- 0x1.c8e91cp-1,
3150531505+- 0x1.c5701ap-1,
3150631506+- 0x1.c1efcap-1,
3150731507+- 0x1.be6872p-1,
3150831508+- 0x1.bada5ap-1,
3150931509+- 0x1.b745c6p-1,
3151031510+- 0x1.b3aafcp-1,
3151131511+- 0x1.b00a46p-1,
3151231512+- 0x1.ac63e8p-1,
3151331513+- 0x1.a8b828p-1,
3151431514+- 0x1.a5074ep-1,
3151531515+- 0x1.a1519ep-1,
3151631516+- 0x1.9d9762p-1,
3151731517+- 0x1.99d8dap-1,
3151831518+- 0x1.961650p-1,
3151931519+- 0x1.925008p-1,
3152031520+- 0x1.8e8646p-1,
3152131521+- 0x1.8ab950p-1,
3152231522+- 0x1.86e96ap-1,
3152331523+- 0x1.8316d6p-1,
3152431524+- 0x1.7f41dcp-1,
3152531525+- 0x1.7b6abcp-1,
3152631526+- 0x1.7791b8p-1,
3152731527+- 0x1.73b714p-1,
3152831528+- 0x1.6fdb12p-1,
3152931529+- 0x1.6bfdf0p-1,
3153031530+- 0x1.681ff2p-1,
3153131531+- 0x1.644156p-1,
3153231532+- 0x1.60625cp-1,
3153331533+- 0x1.5c8342p-1,
3153431534+- 0x1.58a446p-1,
3153531535+- 0x1.54c5a6p-1,
3153631536+- 0x1.50e79ep-1,
3153731537+- 0x1.4d0a68p-1,
3153831538+- 0x1.492e42p-1,
3153931539+- 0x1.455366p-1,
3154031540+- 0x1.417a0cp-1,
3154131541+- 0x1.3da26ep-1,
3154231542+- 0x1.39ccc2p-1,
3154331543+- 0x1.35f940p-1,
3154431544+- 0x1.32281ep-1,
3154531545+- 0x1.2e5992p-1,
3154631546+- 0x1.2a8dcep-1,
3154731547+- 0x1.26c508p-1,
3154831548+- 0x1.22ff72p-1,
3154931549+- 0x1.1f3d3cp-1,
3155031550+- 0x1.1b7e98p-1,
3155131551+- 0x1.17c3b6p-1,
3155231552+- 0x1.140cc4p-1,
3155331553+- 0x1.1059eep-1,
3155431554+- 0x1.0cab62p-1,
3155531555+- 0x1.09014cp-1,
3155631556+- 0x1.055bd6p-1,
3155731557+- 0x1.01bb2cp-1,
3155831558+- 0x1.fc3ee6p-2,
3155931559+- 0x1.f511aap-2,
3156031560+- 0x1.edeeeep-2,
3156131561+- 0x1.e6d700p-2,
3156231562+- 0x1.dfca26p-2,
3156331563+- 0x1.d8c8aap-2,
3156431564+- 0x1.d1d2d0p-2,
3156531565+- 0x1.cae8dap-2,
3156631566+- 0x1.c40b08p-2,
3156731567+- 0x1.bd3998p-2,
3156831568+- 0x1.b674c8p-2,
3156931569+- 0x1.afbcd4p-2,
3157031570+- 0x1.a911f0p-2,
3157131571+- 0x1.a27456p-2,
3157231572+- 0x1.9be438p-2,
3157331573+- 0x1.9561c8p-2,
3157431574+- 0x1.8eed36p-2,
3157531575+- 0x1.8886b2p-2,
3157631576+- 0x1.822e66p-2,
3157731577+- 0x1.7be47ap-2,
3157831578+- 0x1.75a91ap-2,
3157931579+- 0x1.6f7c6ap-2,
3158031580+- 0x1.695e8cp-2,
3158131581+- 0x1.634fa6p-2,
3158231582+- 0x1.5d4fd4p-2,
3158331583+- 0x1.575f34p-2,
3158431584+- 0x1.517de6p-2,
3158531585+- 0x1.4bac00p-2,
3158631586+- 0x1.45e99cp-2,
3158731587+- 0x1.4036d0p-2,
3158831588+- 0x1.3a93b2p-2,
3158931589+- 0x1.350052p-2,
3159031590+- 0x1.2f7cc4p-2,
3159131591+- 0x1.2a0916p-2,
3159231592+- 0x1.24a554p-2,
3159331593+- 0x1.1f518ap-2,
3159431594+- 0x1.1a0dc6p-2,
3159531595+- 0x1.14da0ap-2,
3159631596+- 0x1.0fb662p-2,
3159731597+- 0x1.0aa2d0p-2,
3159831598+- 0x1.059f5ap-2,
3159931599+- 0x1.00ac00p-2,
3160031600+- 0x1.f79184p-3,
3160131601+- 0x1.edeb40p-3,
3160231602+- 0x1.e46530p-3,
3160331603+- 0x1.daff4ap-3,
3160431604+- 0x1.d1b982p-3,
3160531605+- 0x1.c893cep-3,
3160631606+- 0x1.bf8e1cp-3,
3160731607+- 0x1.b6a856p-3,
3160831608+- 0x1.ade26cp-3,
3160931609+- 0x1.a53c42p-3,
3161031610+- 0x1.9cb5bep-3,
3161131611+- 0x1.944ec2p-3,
3161231612+- 0x1.8c0732p-3,
3161331613+- 0x1.83deeap-3,
3161431614+- 0x1.7bd5c8p-3,
3161531615+- 0x1.73eba4p-3,
3161631616+- 0x1.6c2056p-3,
3161731617+- 0x1.6473b6p-3,
3161831618+- 0x1.5ce596p-3,
3161931619+- 0x1.5575c8p-3,
3162031620+- 0x1.4e241ep-3,
3162131621+- 0x1.46f066p-3,
3162231622+- 0x1.3fda6cp-3,
3162331623+- 0x1.38e1fap-3,
3162431624+- 0x1.3206dcp-3,
3162531625+- 0x1.2b48dap-3,
3162631626+- 0x1.24a7b8p-3,
3162731627+- 0x1.1e233ep-3,
3162831628+- 0x1.17bb2cp-3,
3162931629+- 0x1.116f48p-3,
3163031630+- 0x1.0b3f52p-3,
3163131631+- 0x1.052b0cp-3,
3163231632+- 0x1.fe6460p-4,
3163331633+- 0x1.f2a902p-4,
3163431634+- 0x1.e72372p-4,
3163531635+- 0x1.dbd32ap-4,
3163631636+- 0x1.d0b7a0p-4,
3163731637+- 0x1.c5d04ap-4,
3163831638+- 0x1.bb1c98p-4,
3163931639+- 0x1.b09bfcp-4,
3164031640+- 0x1.a64de6p-4,
3164131641+- 0x1.9c31c6p-4,
3164231642+- 0x1.92470ap-4,
3164331643+- 0x1.888d1ep-4,
3164431644+- 0x1.7f036cp-4,
3164531645+- 0x1.75a960p-4,
3164631646+- 0x1.6c7e64p-4,
3164731647+- 0x1.6381e2p-4,
3164831648+- 0x1.5ab342p-4,
3164931649+- 0x1.5211ecp-4,
3165031650+- 0x1.499d48p-4,
3165131651+- 0x1.4154bcp-4,
3165231652+- 0x1.3937b2p-4,
3165331653+- 0x1.31458ep-4,
3165431654+- 0x1.297dbap-4,
3165531655+- 0x1.21df9ap-4,
3165631656+- 0x1.1a6a96p-4,
3165731657+- 0x1.131e14p-4,
3165831658+- 0x1.0bf97ep-4,
3165931659+- 0x1.04fc3ap-4,
3166031660+- 0x1.fc4b5ep-5,
3166131661+- 0x1.eeea8cp-5,
3166231662+- 0x1.e1d4d0p-5,
3166331663+- 0x1.d508fap-5,
3166431664+- 0x1.c885e0p-5,
3166531665+- 0x1.bc4a54p-5,
3166631666+- 0x1.b05530p-5,
3166731667+- 0x1.a4a54ap-5,
3166831668+- 0x1.99397ap-5,
3166931669+- 0x1.8e109cp-5,
3167031670+- 0x1.83298ep-5,
3167131671+- 0x1.78832cp-5,
3167231672+- 0x1.6e1c58p-5,
3167331673+- 0x1.63f3f6p-5,
3167431674+- 0x1.5a08e8p-5,
3167531675+- 0x1.505a18p-5,
3167631676+- 0x1.46e66cp-5,
3167731677+- 0x1.3dacd2p-5,
3167831678+- 0x1.34ac36p-5,
3167931679+- 0x1.2be38cp-5,
3168031680+- 0x1.2351c2p-5,
3168131681+- 0x1.1af5d2p-5,
3168231682+- 0x1.12ceb4p-5,
3168331683+- 0x1.0adb60p-5,
3168431684+- 0x1.031ad6p-5,
3168531685+- 0x1.f7182ap-6,
3168631686+- 0x1.e85c44p-6,
3168731687+- 0x1.da0006p-6,
3168831688+- 0x1.cc0180p-6,
3168931689+- 0x1.be5ecep-6,
3169031690+- 0x1.b1160ap-6,
3169131691+- 0x1.a4255ap-6,
3169231692+- 0x1.978ae8p-6,
3169331693+- 0x1.8b44e6p-6,
3169431694+- 0x1.7f5188p-6,
3169531695+- 0x1.73af0cp-6,
3169631696+- 0x1.685bb6p-6,
3169731697+- 0x1.5d55ccp-6,
3169831698+- 0x1.529b9ep-6,
3169931699+- 0x1.482b84p-6,
3170031700+- 0x1.3e03d8p-6,
3170131701+- 0x1.3422fep-6,
3170231702+- 0x1.2a875cp-6,
3170331703+- 0x1.212f62p-6,
3170431704+- 0x1.181984p-6,
3170531705+- 0x1.0f443ep-6,
3170631706+- 0x1.06ae14p-6,
3170731707+- 0x1.fcab14p-7,
3170831708+- 0x1.ec7262p-7,
3170931709+- 0x1.dcaf36p-7,
3171031710+- 0x1.cd5ecap-7,
3171131711+- 0x1.be7e5ap-7,
3171231712+- 0x1.b00b38p-7,
3171331713+- 0x1.a202bep-7,
3171431714+- 0x1.94624ep-7,
3171531715+- 0x1.87275ep-7,
3171631716+- 0x1.7a4f6ap-7,
3171731717+- 0x1.6dd7fep-7,
3171831718+- 0x1.61beaep-7,
3171931719+- 0x1.56011cp-7,
3172031720+- 0x1.4a9cf6p-7,
3172131721+- 0x1.3f8ff6p-7,
3172231722+- 0x1.34d7dcp-7,
3172331723+- 0x1.2a727ap-7,
3172431724+- 0x1.205dacp-7,
3172531725+- 0x1.169756p-7,
3172631726+- 0x1.0d1d6ap-7,
3172731727+- 0x1.03ede2p-7,
3172831728+- 0x1.f60d8ap-8,
3172931729+- 0x1.e4cc4ap-8,
3173031730+- 0x1.d4143ap-8,
3173131731+- 0x1.c3e1a6p-8,
3173231732+- 0x1.b430ecp-8,
3173331733+- 0x1.a4fe84p-8,
3173431734+- 0x1.9646f4p-8,
3173531735+- 0x1.8806d8p-8,
3173631736+- 0x1.7a3adep-8,
3173731737+- 0x1.6cdfccp-8,
3173831738+- 0x1.5ff276p-8,
3173931739+- 0x1.536fc2p-8,
3174031740+- 0x1.4754acp-8,
3174131741+- 0x1.3b9e40p-8,
3174231742+- 0x1.30499cp-8,
3174331743+- 0x1.2553eep-8,
3174431744+- 0x1.1aba78p-8,
3174531745+- 0x1.107a8cp-8,
3174631746+- 0x1.06918cp-8,
3174731747+- 0x1.f9f9d0p-9,
3174831748+- 0x1.e77448p-9,
3174931749+- 0x1.d58da6p-9,
3175031750+- 0x1.c4412cp-9,
3175131751+- 0x1.b38a3ap-9,
3175231752+- 0x1.a36454p-9,
3175331753+- 0x1.93cb12p-9,
3175431754+- 0x1.84ba30p-9,
3175531755+- 0x1.762d84p-9,
3175631756+- 0x1.682100p-9,
3175731757+- 0x1.5a90b0p-9,
3175831758+- 0x1.4d78bcp-9,
3175931759+- 0x1.40d564p-9,
3176031760+- 0x1.34a306p-9,
3176131761+- 0x1.28de12p-9,
3176231762+- 0x1.1d8318p-9,
3176331763+- 0x1.128ebap-9,
3176431764+- 0x1.07fdb4p-9,
3176531765+- 0x1.fb99b8p-10,
3176631766+- 0x1.e7f232p-10,
3176731767+- 0x1.d4fed8p-10,
3176831768+- 0x1.c2b9d0p-10,
3176931769+- 0x1.b11d70p-10,
3177031770+- 0x1.a02436p-10,
3177131771+- 0x1.8fc8c8p-10,
3177231772+- 0x1.8005f0p-10,
3177331773+- 0x1.70d6a4p-10,
3177431774+- 0x1.6235fcp-10,
3177531775+- 0x1.541f34p-10,
3177631776+- 0x1.468daep-10,
3177731777+- 0x1.397ceep-10,
3177831778+- 0x1.2ce898p-10,
3177931779+- 0x1.20cc76p-10,
3178031780+- 0x1.15246ep-10,
3178131781+- 0x1.09ec86p-10,
3178231782+- 0x1.fe41cep-11,
3178331783+- 0x1.e97ba4p-11,
3178431784+- 0x1.d57f52p-11,
3178531785+- 0x1.c245d4p-11,
3178631786+- 0x1.afc85ep-11,
3178731787+- 0x1.9e0058p-11,
3178831788+- 0x1.8ce75ep-11,
3178931789+- 0x1.7c7744p-11,
3179031790+- 0x1.6caa0ep-11,
3179131791+- 0x1.5d79ecp-11,
3179231792+- 0x1.4ee142p-11,
3179331793+- 0x1.40daa4p-11,
3179431794+- 0x1.3360ccp-11,
3179531795+- 0x1.266ea8p-11,
3179631796+- 0x1.19ff46p-11,
3179731797+- 0x1.0e0de8p-11,
3179831798+- 0x1.0295f0p-11,
3179931799+- 0x1.ef25d4p-12,
3180031800+- 0x1.da0110p-12,
3180131801+- 0x1.c5b542p-12,
3180231802+- 0x1.b23a5ap-12,
3180331803+- 0x1.9f8894p-12,
3180431804+- 0x1.8d986ap-12,
3180531805+- 0x1.7c629ap-12,
3180631806+- 0x1.6be022p-12,
3180731807+- 0x1.5c0a38p-12,
3180831808+- 0x1.4cda54p-12,
3180931809+- 0x1.3e4a24p-12,
3181031810+- 0x1.305390p-12,
3181131811+- 0x1.22f0b4p-12,
3181231812+- 0x1.161be4p-12,
3181331813+- 0x1.09cfa4p-12,
3181431814+- 0x1.fc0d56p-13,
3181531815+- 0x1.e577bcp-13,
3181631816+- 0x1.cfd4a6p-13,
3181731817+- 0x1.bb1a96p-13,
3181831818+- 0x1.a74068p-13,
3181931819+- 0x1.943d4ap-13,
3182031820+- 0x1.8208bcp-13,
3182131821+- 0x1.709a8ep-13,
3182231822+- 0x1.5feadap-13,
3182331823+- 0x1.4ff208p-13,
3182431824+- 0x1.40a8c2p-13,
3182531825+- 0x1.3207fcp-13,
3182631826+- 0x1.2408eap-13,
3182731827+- 0x1.16a502p-13,
3182831828+- 0x1.09d5f8p-13,
3182931829+- 0x1.fb2b7ap-14,
3183031830+- 0x1.e3bcf4p-14,
3183131831+- 0x1.cd5528p-14,
3183231832+- 0x1.b7e946p-14,
3183331833+- 0x1.a36eecp-14,
3183431834+- 0x1.8fdc1cp-14,
3183531835+- 0x1.7d2738p-14,
3183631836+- 0x1.6b4702p-14,
3183731837+- 0x1.5a329cp-14,
3183831838+- 0x1.49e178p-14,
3183931839+- 0x1.3a4b60p-14,
3184031840+- 0x1.2b6876p-14,
3184131841+- 0x1.1d3120p-14,
3184231842+- 0x1.0f9e1cp-14,
3184331843+- 0x1.02a868p-14,
3184431844+- 0x1.ec929ap-15,
3184531845+- 0x1.d4f4b4p-15,
3184631846+- 0x1.be6abcp-15,
3184731847+- 0x1.a8e8ccp-15,
3184831848+- 0x1.94637ep-15,
3184931849+- 0x1.80cfdcp-15,
3185031850+- 0x1.6e2368p-15,
3185131851+- 0x1.5c540cp-15,
3185231852+- 0x1.4b581cp-15,
3185331853+- 0x1.3b2652p-15,
3185431854+- 0x1.2bb5ccp-15,
3185531855+- 0x1.1cfe02p-15,
3185631856+- 0x1.0ef6c4p-15,
3185731857+- 0x1.019842p-15,
3185831858+- 0x1.e9b5e8p-16,
3185931859+- 0x1.d16f58p-16,
3186031860+- 0x1.ba4f04p-16,
3186131861+- 0x1.a447b8p-16,
3186231862+- 0x1.8f4cccp-16,
3186331863+- 0x1.7b5224p-16,
3186431864+- 0x1.684c22p-16,
3186531865+- 0x1.562facp-16,
3186631866+- 0x1.44f21ep-16,
3186731867+- 0x1.34894ap-16,
3186831868+- 0x1.24eb72p-16,
3186931869+- 0x1.160f44p-16,
3187031870+- 0x1.07ebd2p-16,
3187131871+- 0x1.f4f12ep-17,
3187231872+- 0x1.db5ad0p-17,
3187331873+- 0x1.c304f0p-17,
3187431874+- 0x1.abe09ep-17,
3187531875+- 0x1.95df98p-17,
3187631876+- 0x1.80f43ap-17,
3187731877+- 0x1.6d1178p-17,
3187831878+- 0x1.5a2ae0p-17,
3187931879+- 0x1.483488p-17,
3188031880+- 0x1.372310p-17,
3188131881+- 0x1.26eb9ep-17,
3188231882+- 0x1.1783cep-17,
3188331883+- 0x1.08e1bap-17,
3188431884+- 0x1.f5f7d8p-18,
3188531885+- 0x1.db92b6p-18,
3188631886+- 0x1.c282cep-18,
3188731887+- 0x1.aab7acp-18,
3188831888+- 0x1.94219cp-18,
3188931889+- 0x1.7eb1a2p-18,
3189031890+- 0x1.6a5972p-18,
3189131891+- 0x1.570b6ap-18,
3189231892+- 0x1.44ba86p-18,
3189331893+- 0x1.335a62p-18,
3189431894+- 0x1.22df2ap-18,
3189531895+- 0x1.133d96p-18,
3189631896+- 0x1.046aeap-18,
3189731897+- 0x1.ecb9d0p-19,
3189831898+- 0x1.d21398p-19,
3189931899+- 0x1.b8d094p-19,
3190031900+- 0x1.a0df10p-19,
3190131901+- 0x1.8a2e26p-19,
3190231902+- 0x1.74adc8p-19,
3190331903+- 0x1.604ea8p-19,
3190431904+- 0x1.4d0232p-19,
3190531905+- 0x1.3aba86p-19,
3190631906+- 0x1.296a70p-19,
3190731907+- 0x1.190562p-19,
3190831908+- 0x1.097f62p-19,
3190931909+- 0x1.f59a20p-20,
3191031910+- 0x1.d9c736p-20,
3191131911+- 0x1.bf716cp-20,
3191231912+- 0x1.a6852cp-20,
3191331913+- 0x1.8eefd8p-20,
3191431914+- 0x1.789fb8p-20,
3191531915+- 0x1.6383f8p-20,
3191631916+- 0x1.4f8c96p-20,
3191731917+- 0x1.3caa62p-20,
3191831918+- 0x1.2acee2p-20,
3191931919+- 0x1.19ec60p-20,
3192031920+- 0x1.09f5d0p-20,
3192131921+- 0x1.f5bd96p-21,
3192231922+- 0x1.d9371ep-21,
3192331923+- 0x1.be41dep-21,
3192431924+- 0x1.a4c89ep-21,
3192531925+- 0x1.8cb738p-21,
3192631926+- 0x1.75fa8ep-21,
3192731927+- 0x1.608078p-21,
3192831928+- 0x1.4c37c0p-21,
3192931929+- 0x1.39100ep-21,
3193031930+- 0x1.26f9e0p-21,
3193131931+- 0x1.15e682p-21,
3193231932+- 0x1.05c804p-21,
3193331933+- 0x1.ed2254p-22,
3193431934+- 0x1.d06ad6p-22,
3193531935+- 0x1.b551c8p-22,
3193631936+- 0x1.9bc0a0p-22,
3193731937+- 0x1.83a200p-22,
3193831938+- 0x1.6ce1aap-22,
3193931939+- 0x1.576c72p-22,
3194031940+- 0x1.43302cp-22,
3194131941+- 0x1.301ba2p-22,
3194231942+- 0x1.1e1e86p-22,
3194331943+- 0x1.0d2966p-22,
3194431944+- 0x1.fa5b50p-23,
3194531945+- 0x1.dc3ae4p-23,
3194631946+- 0x1.bfd756p-23,
3194731947+- 0x1.a517dap-23,
3194831948+- 0x1.8be4f8p-23,
3194931949+- 0x1.74287ep-23,
3195031950+- 0x1.5dcd66p-23,
3195131951+- 0x1.48bfd4p-23,
3195231952+- 0x1.34ecf8p-23,
3195331953+- 0x1.224310p-23,
3195431954+- 0x1.10b148p-23,
3195531955+- },
3195631956+-};
3195731957+diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
3195831958+index 7f0a8aa5f2..862eefaf8f 100644
3195931959+--- a/sysdeps/aarch64/fpu/vecmath_config.h
3196031960++++ b/sysdeps/aarch64/fpu/vecmath_config.h
3196131961+@@ -75,49 +75,37 @@ extern const struct v_log10_data
3196231962+ } table[1 << V_LOG10_TABLE_BITS];
3196331963+ } __v_log10_data attribute_hidden;
3196431964+3196531965+-extern const struct erff_data
3196631966++extern const struct v_erff_data
3196731967+ {
3196831968+ struct
3196931969+ {
3197031970+ float erf, scale;
3197131971+ } tab[513];
3197231972+-} __erff_data attribute_hidden;
3197331973++} __v_erff_data attribute_hidden;
3197431974+3197531975+-extern const struct sv_erff_data
3197631976+-{
3197731977+- float erf[513];
3197831978+- float scale[513];
3197931979+-} __sv_erff_data attribute_hidden;
3198031980+-
3198131981+-extern const struct erf_data
3198231982++extern const struct v_erf_data
3198331983+ {
3198431984+ struct
3198531985+ {
3198631986+ double erf, scale;
3198731987+ } tab[769];
3198831988+-} __erf_data attribute_hidden;
3198931989+-
3199031990+-extern const struct sv_erf_data
3199131991+-{
3199231992+- double erf[769];
3199331993+- double scale[769];
3199431994+-} __sv_erf_data attribute_hidden;
3199531995++} __v_erf_data attribute_hidden;
3199631996+3199731997+-extern const struct erfc_data
3199831998++extern const struct v_erfc_data
3199931999+ {
3200032000+ struct
3200132001+ {
3200232002+ double erfc, scale;
3200332003+ } tab[3488];
3200432004+-} __erfc_data attribute_hidden;
3200532005++} __v_erfc_data attribute_hidden;
3200632006+3200732007+-extern const struct erfcf_data
3200832008++extern const struct v_erfcf_data
3200932009+ {
3201032010+ struct
3201132011+ {
3201232012+ float erfc, scale;
3201332013+ } tab[645];
3201432014+-} __erfcf_data attribute_hidden;
3201532015++} __v_erfcf_data attribute_hidden;
3201632016+3201732017+ /* Some data for AdvSIMD and SVE pow's internal exp and log. */
3201832018+ #define V_POW_EXP_TABLE_BITS 8
3201932019+3202032020+commit 4148940836eee07d1138da6f1805280eeb8217e3
3202132021+Author: Pierre Blanchard <pierre.blanchard@arm.com>
3202232022+Date: Mon Dec 9 15:53:04 2024 +0000
3202332023+3202432024+ AArch64: Improve codegen in AdvSIMD pow
3202532025+3202632026+ Remove spurious ADRP. Improve memory access by shuffling constants and
3202732027+ using more indexed MLAs.
3202932029+ A few more optimisations with no impact on accuracy:
3203032030+ - force fmas contraction
3203132031+ - switch from shift-aided rint to rint instruction
3203232032+3203332033+ Between 1 and 5% throughput improvement on Neoverse
3203432034+ V1 depending on benchmark.
3203532035+3203632036+ (cherry picked from commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea)
3203732037+3203832038+diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c
3203932039+index 3c91e3e183..81e134ac2f 100644
3204032040+--- a/sysdeps/aarch64/fpu/pow_advsimd.c
3204132041++++ b/sysdeps/aarch64/fpu/pow_advsimd.c
3204232042+@@ -22,9 +22,6 @@
3204332043+ /* Defines parameters of the approximation and scalar fallback. */
3204432044+ #include "finite_pow.h"
3204532045+3204632046+-#define VecSmallExp v_u64 (SmallExp)
3204732047+-#define VecThresExp v_u64 (ThresExp)
3204832048+-
3204932049+ #define VecSmallPowX v_u64 (SmallPowX)
3205032050+ #define VecThresPowX v_u64 (ThresPowX)
3205132051+ #define VecSmallPowY v_u64 (SmallPowY)
3205232052+@@ -32,36 +29,48 @@
3205332053+3205432054+ static const struct data
3205532055+ {
3205632056+- float64x2_t log_poly[6];
3205732057+- float64x2_t exp_poly[3];
3205832058+- float64x2_t ln2_hi, ln2_lo;
3205932059+- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx;
3206032060+ uint64x2_t inf;
3206132061++ float64x2_t small_powx;
3206232062++ uint64x2_t offset, mask;
3206332063++ uint64x2_t mask_sub_0, mask_sub_1;
3206432064++ float64x2_t log_c0, log_c2, log_c4, log_c5;
3206532065++ double log_c1, log_c3;
3206632066++ double ln2_lo, ln2_hi;
3206732067++ uint64x2_t small_exp, thres_exp;
3206832068++ double ln2_lo_n, ln2_hi_n;
3206932069++ double inv_ln2_n, exp_c2;
3207032070++ float64x2_t exp_c0, exp_c1;
3207132071+ } data = {
3207232072++ /* Power threshold. */
3207332073++ .inf = V2 (0x7ff0000000000000),
3207432074++ .small_powx = V2 (0x1p-126),
3207532075++ .offset = V2 (Off),
3207632076++ .mask = V2 (0xfffULL << 52),
3207732077++ .mask_sub_0 = V2 (1ULL << 52),
3207832078++ .mask_sub_1 = V2 (52ULL << 52),
3207932079+ /* Coefficients copied from v_pow_log_data.c
3208032080+ relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
3208132081+ Coefficients are scaled to match the scaling during evaluation. */
3208232082+- .log_poly
3208332083+- = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2),
3208432084+- V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4),
3208532085+- V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) },
3208632086+- .ln2_hi = V2 (0x1.62e42fefa3800p-1),
3208732087+- .ln2_lo = V2 (0x1.ef35793c76730p-45),
3208832088++ .log_c0 = V2 (0x1.555555555556p-2 * -2),
3208932089++ .log_c1 = -0x1.0000000000006p-2 * -2,
3209032090++ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
3209132091++ .log_c3 = -0x1.555555529a47ap-3 * 4,
3209232092++ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
3209332093++ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
3209432094++ .ln2_hi = 0x1.62e42fefa3800p-1,
3209532095++ .ln2_lo = 0x1.ef35793c76730p-45,
3209632096+ /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
3209732097+ (0.550 without fma) if |x| < ln2/512. */
3209832098+- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
3209932099+- V2 (0x1.5555576a5adcep-5) },
3210032100+- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
3210132101+- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
3210232102+- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
3210332103+- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
3210432104+- .small_powx = V2 (0x1p-126),
3210532105+- .inf = V2 (0x7ff0000000000000)
3210632106++ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
3210732107++ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
3210832108++ .exp_c2 = 0x1.5555576a5adcep-5,
3210932109++ .small_exp = V2 (0x3c90000000000000),
3211032110++ .thres_exp = V2 (0x03f0000000000000),
3211132111++ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
3211232112++ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
3211332113++ .ln2_lo_n = -0x1.c610ca86c3899p-45,
3211432114+ };
3211532115+3211632116+-#define A(i) data.log_poly[i]
3211732117+-#define C(i) data.exp_poly[i]
3211832118+-
3211932119+ /* This version implements an algorithm close to scalar pow but
3212032120+ - does not implement the trick in the exp's specialcase subroutine to avoid
3212132121+ double-rounding,
3212232122+@@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
3212332123+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
3212432124+ The range is split into N subintervals.
3212532125+ The ith subinterval contains z and c is near its center. */
3212632126+- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
3212732127+- int64x2_t k
3212832128+- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
3212932129+- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
3213032130++ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
3213132131++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
3213232132++ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
3213332133+ float64x2_t z = vreinterpretq_f64_u64 (iz);
3213432134+ float64x2_t kd = vcvtq_f64_s64 (k);
3213532135+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
3213632136+@@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
3213732137+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
3213832138+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
3213932139+ /* k*Ln2 + log(c) + r. */
3214032140+- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
3214132141++ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
3214232142++ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
3214332143+ float64x2_t t2 = vaddq_f64 (t1, r);
3214432144+- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
3214532145++ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
3214632146+ float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
3214732147+ /* Evaluation is optimized assuming superscalar pipelined execution. */
3214832148+ float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
3214932149+@@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
3215032150+ float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
3215132151+ float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
3215232152+ /* p = log1p(r) - r - A[0]*r*r. */
3215332153+- float64x2_t a56 = vfmaq_f64 (A (4), r, A (5));
3215432154+- float64x2_t a34 = vfmaq_f64 (A (2), r, A (3));
3215532155+- float64x2_t a12 = vfmaq_f64 (A (0), r, A (1));
3215632156++ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
3215732157++ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
3215832158++ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
3215932159++ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
3216032160+ float64x2_t p = vfmaq_f64 (a34, ar2, a56);
3216132161+ p = vfmaq_f64 (a12, ar2, p);
3216232162+ p = vmulq_f64 (ar3, p);
3216332163+@@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail)
3216432164+3216532165+ /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
3216632166+ static inline float64x2_t
3216732167+-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
3216832168++v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
3216932169+ {
3217032170+ /* Fallback to scalar exp_inline for all lanes if any lane
3217132171+ contains value of x s.t. |x| <= 2^-54 or >= 512. */
3217232172+- uint64x2_t abstop
3217332173+- = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52);
3217432174+- uint64x2_t uoflowx
3217532175+- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
3217632176++ uint64x2_t uoflowx = vcgeq_u64 (
3217732177++ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
3217832178++ d->thres_exp);
3217932179+ if (__glibc_unlikely (v_any_u64 (uoflowx)))
3218032180+- return exp_special_case (x, xtail);
3218132181++ return exp_special_case (x, vnegq_f64 (neg_xtail));
3218232182+3218332183+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
3218432184+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
3218532185+- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
3218632186+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
3218732187+- float64x2_t kd = vaddq_f64 (z, d->shift);
3218832188+- uint64x2_t ki = vreinterpretq_u64_f64 (kd);
3218932189+- kd = vsubq_f64 (kd, d->shift);
3219032190+- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
3219132191+- r = vfmsq_f64 (r, kd, d->ln2_lo_n);
3219232192++ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
3219332193++ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
3219432194++ float64x2_t kd = vrndnq_f64 (z);
3219532195++ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
3219632196++ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
3219732197++ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
3219832198++ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
3219932199+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
3220032200+- r = vaddq_f64 (r, xtail);
3220132201++ r = vsubq_f64 (r, neg_xtail);
3220232202+ /* 2^(k/N) ~= scale. */
3220332203+ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
3220432204+ uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
3220532205+@@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
3220632206+ sbits = vaddq_u64 (sbits, top);
3220732207+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
3220832208+ float64x2_t r2 = vmulq_f64 (r, r);
3220932209+- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
3221032210+- tmp = vfmaq_f64 (C (0), r, tmp);
3221132211++ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
3221232212++ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
3221332213+ tmp = vfmaq_f64 (r, r2, tmp);
3221432214+ float64x2_t scale = vreinterpretq_f64_u64 (sbits);
3221532215+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
3221632216+@@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
3221732217+ {
3221832218+ /* Normalize subnormal x so exponent becomes negative. */
3221932219+ uint64x2_t vix_norm = vreinterpretq_u64_f64 (
3222032220+- vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52)))));
3222132221+- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
3222232222++ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
3222332223++ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
3222432224+ vix = vbslq_u64 (sub_x, vix_norm, vix);
3222532225+ }
3222632226+ }
3222732227+@@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
3222832228+3222932229+ /* Vector Exp(y_loghi, y_loglo). */
3223032230+ float64x2_t vehi = vmulq_f64 (y, vhi);
3223132231+- float64x2_t velo = vmulq_f64 (y, vlo);
3223232232+ float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
3223332233+- velo = vsubq_f64 (velo, vemi);
3223432234+- return v_exp_inline (vehi, velo, d);
3223532235++ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
3223632236++ return v_exp_inline (vehi, neg_velo, d);
3223732237+ }
3223832238+3223932239+commit ae04f63087415eba9060143608b03db693854bb7
3224032240+Author: Pierre Blanchard <pierre.blanchard@arm.com>
3224132241+Date: Mon Dec 9 15:54:34 2024 +0000
3224232242+3224332243+ AArch64: Improve codegen in AdvSIMD logs
3224432244+3224532245+ Remove spurious ADRP and a few MOVs.
3224632246+ Reduce memory access by using more indexed MLAs in polynomial.
3224732247+ Align notation so that algorithms are easier to compare.
3224832248+ Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%).
3224932249+ Update error threshold in AdvSIMD log (now matches SVE log).
3225032250+3225132251+ (cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7)
3225232252+3225332253+diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
3225432254+index c065aaebae..f69ed21c39 100644
3225532255+--- a/sysdeps/aarch64/fpu/log10_advsimd.c
3225632256++++ b/sysdeps/aarch64/fpu/log10_advsimd.c
3225732257+@@ -18,36 +18,36 @@
3225832258+ <https://www.gnu.org/licenses/>. */
3225932259+3226032260+ #include "v_math.h"
3226132261+-#include "poly_advsimd_f64.h"
3226232262+-
3226332263+-#define N (1 << V_LOG10_TABLE_BITS)
3226432264+3226532265+ static const struct data
3226632266+ {
3226732267+- uint64x2_t min_norm;
3226832268++ uint64x2_t off, sign_exp_mask, offset_lower_bound;
3226932269+ uint32x4_t special_bound;
3227032270+- float64x2_t poly[5];
3227132271+- float64x2_t invln10, log10_2, ln2;
3227232272+- uint64x2_t sign_exp_mask;
3227332273++ double invln10, log10_2;
3227432274++ double c1, c3;
3227532275++ float64x2_t c0, c2, c4;
3227632276+ } data = {
3227732277+ /* Computed from log coefficients divided by log(10) then rounded to double
3227832278+ precision. */
3227932279+- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3),
3228032280+- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4),
3228132281+- V2 (-0x1.287461742fee4p-4) },
3228232282+- .ln2 = V2 (0x1.62e42fefa39efp-1),
3228332283+- .invln10 = V2 (0x1.bcb7b1526e50ep-2),
3228432284+- .log10_2 = V2 (0x1.34413509f79ffp-2),
3228532285+- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
3228632286+- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
3228732287++ .c0 = V2 (-0x1.bcb7b1526e506p-3),
3228832288++ .c1 = 0x1.287a7636be1d1p-3,
3228932289++ .c2 = V2 (-0x1.bcb7b158af938p-4),
3229032290++ .c3 = 0x1.63c78734e6d07p-4,
3229132291++ .c4 = V2 (-0x1.287461742fee4p-4),
3229232292++ .invln10 = 0x1.bcb7b1526e50ep-2,
3229332293++ .log10_2 = 0x1.34413509f79ffp-2,
3229432294++ .off = V2 (0x3fe6900900000000),
3229532295+ .sign_exp_mask = V2 (0xfff0000000000000),
3229632296++ /* Lower bound is 0x0010000000000000. For
3229732297++ optimised register use subnormals are detected after offset has been
3229832298++ subtracted, so lower bound - offset (which wraps around). */
3229932299++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
3230032300++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
3230132301+ };
3230232302+3230332303+-#define Off v_u64 (0x3fe6900900000000)
3230432304++#define N (1 << V_LOG10_TABLE_BITS)
3230532305+ #define IndexMask (N - 1)
3230632306+3230732307+-#define T(s, i) __v_log10_data.s[i]
3230832308+-
3230932309+ struct entry
3231032310+ {
3231132311+ float64x2_t invc;
3231232312+@@ -70,10 +70,11 @@ lookup (uint64x2_t i)
3231332313+ }
3231432314+3231532315+ static float64x2_t VPCS_ATTR NOINLINE
3231632316+-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
3231732317+- uint32x2_t special)
3231832318++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
3231932319++ uint32x2_t special, const struct data *d)
3232032320+ {
3232132321+- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special));
3232232322++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
3232332323++ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
3232432324+ }
3232532325+3232632326+ /* Fast implementation of double-precision vector log10
3232732327+@@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
3232832328+ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
3232932329+ {
3233032330+ const struct data *d = ptr_barrier (&data);
3233132331+- uint64x2_t ix = vreinterpretq_u64_f64 (x);
3233232332+- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
3233332333+- vget_low_u32 (d->special_bound));
3233432334++
3233532335++ /* To avoid having to mov x out of the way, keep u after offset has been
3233632336++ applied, and recover x by adding the offset back in the special-case
3233732337++ handler. */
3233832338++ uint64x2_t u = vreinterpretq_u64_f64 (x);
3233932339++ uint64x2_t u_off = vsubq_u64 (u, d->off);
3234032340+3234132341+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
3234232342+ The range is split into N subintervals.
3234332343+ The ith subinterval contains z and c is near its center. */
3234432344+- uint64x2_t tmp = vsubq_u64 (ix, Off);
3234532345+- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
3234632346+- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
3234732347++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
3234832348++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
3234932349+ float64x2_t z = vreinterpretq_f64_u64 (iz);
3235032350+3235132351+- struct entry e = lookup (tmp);
3235232352++ struct entry e = lookup (u_off);
3235332353++
3235432354++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
3235532355++ vget_low_u32 (d->special_bound));
3235632356+3235732357+ /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
3235832358+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
3235932359+@@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
3236032360+3236132361+ /* hi = r / log(10) + log10(c) + k*log10(2).
3236232362+ Constants in v_log10_data.c are computed (in extended precision) as
3236332363+- e.log10c := e.logc * ivln10. */
3236432364+- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10);
3236532365++ e.log10c := e.logc * invln10. */
3236632366++ float64x2_t cte = vld1q_f64 (&d->invln10);
3236732367++ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
3236832368+3236932369+ /* y = log10(1+r) + n * log10(2). */
3237032370+- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2);
3237132371++ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
3237232372+3237332373+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
3237432374+ float64x2_t r2 = vmulq_f64 (r, r);
3237532375+- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
3237632376++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
3237732377++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
3237832378++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
3237932379++ y = vfmaq_f64 (y, d->c4, r2);
3238032380++ y = vfmaq_f64 (p, y, r2);
3238132381+3238232382+ if (__glibc_unlikely (v_any_u32h (special)))
3238332383+- return special_case (x, y, hi, r2, special);
3238432384+- return vfmaq_f64 (hi, r2, y);
3238532385++ return special_case (hi, u_off, y, r2, special, d);
3238632386++ return vfmaq_f64 (hi, y, r2);
3238732387+ }
3238832388+diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c
3238932389+index 4057c552d8..1eea1f86eb 100644
3239032390+--- a/sysdeps/aarch64/fpu/log2_advsimd.c
3239132391++++ b/sysdeps/aarch64/fpu/log2_advsimd.c
3239232392+@@ -18,31 +18,33 @@
3239332393+ <https://www.gnu.org/licenses/>. */
3239432394+3239532395+ #include "v_math.h"
3239632396+-#include "poly_advsimd_f64.h"
3239732397+-
3239832398+-#define N (1 << V_LOG2_TABLE_BITS)
3239932399+3240032400+ static const struct data
3240132401+ {
3240232402+- uint64x2_t min_norm;
3240332403++ uint64x2_t off, sign_exp_mask, offset_lower_bound;
3240432404+ uint32x4_t special_bound;
3240532405+- float64x2_t poly[5];
3240632406+- float64x2_t invln2;
3240732407+- uint64x2_t sign_exp_mask;
3240832408++ float64x2_t c0, c2;
3240932409++ double c1, c3, invln2, c4;
3241032410+ } data = {
3241132411+ /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
3241232412+ and N = 128, then scaled by log2(e) in extended precision and rounded back
3241332413+ to double precision. */
3241432414+- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2),
3241532415+- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2),
3241632416+- V2 (-0x1.ec738d616fe26p-3) },
3241732417+- .invln2 = V2 (0x1.71547652b82fep0),
3241832418+- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
3241932419+- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
3242032420++ .c0 = V2 (-0x1.71547652b8300p-1),
3242132421++ .c1 = 0x1.ec709dc340953p-2,
3242232422++ .c2 = V2 (-0x1.71547651c8f35p-2),
3242332423++ .c3 = 0x1.2777ebe12dda5p-2,
3242432424++ .c4 = -0x1.ec738d616fe26p-3,
3242532425++ .invln2 = 0x1.71547652b82fep0,
3242632426++ .off = V2 (0x3fe6900900000000),
3242732427+ .sign_exp_mask = V2 (0xfff0000000000000),
3242832428++ /* Lower bound is 0x0010000000000000. For
3242932429++ optimised register use subnormals are detected after offset has been
3243032430++ subtracted, so lower bound - offset (which wraps around). */
3243132431++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
3243232432++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
3243332433+ };
3243432434+3243532435+-#define Off v_u64 (0x3fe6900900000000)
3243632436++#define N (1 << V_LOG2_TABLE_BITS)
3243732437+ #define IndexMask (N - 1)
3243832438+3243932439+ struct entry
3244032440+@@ -67,10 +69,11 @@ lookup (uint64x2_t i)
3244132441+ }
3244232442+3244332443+ static float64x2_t VPCS_ATTR NOINLINE
3244432444+-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
3244532445+- uint32x2_t special)
3244632446++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
3244732447++ uint32x2_t special, const struct data *d)
3244832448+ {
3244932449+- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special));
3245032450++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
3245132451++ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
3245232452+ }
3245332453+3245432454+ /* Double-precision vector log2 routine. Implements the same algorithm as
3245532455+@@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
3245632456+ float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
3245732457+ {
3245832458+ const struct data *d = ptr_barrier (&data);
3245932459+- uint64x2_t ix = vreinterpretq_u64_f64 (x);
3246032460+- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
3246132461+- vget_low_u32 (d->special_bound));
3246232462++
3246332463++ /* To avoid having to mov x out of the way, keep u after offset has been
3246432464++ applied, and recover x by adding the offset back in the special-case
3246532465++ handler. */
3246632466++ uint64x2_t u = vreinterpretq_u64_f64 (x);
3246732467++ uint64x2_t u_off = vsubq_u64 (u, d->off);
3246832468+3246932469+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3247032470+ The range is split into N subintervals.
3247132471+ The ith subinterval contains z and c is near its center. */
3247232472+- uint64x2_t tmp = vsubq_u64 (ix, Off);
3247332473+- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
3247432474+- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
3247532475++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
3247632476++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
3247732477+ float64x2_t z = vreinterpretq_f64_u64 (iz);
3247832478+3247932479+- struct entry e = lookup (tmp);
3248032480++ struct entry e = lookup (u_off);
3248132481+3248232482+- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
3248332483++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
3248432484++ vget_low_u32 (d->special_bound));
3248532485+3248632486++ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
3248732487+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
3248832488+ float64x2_t kd = vcvtq_f64_s64 (k);
3248932489+- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2);
3249032490++
3249132491++ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
3249232492++ float64x2_t hi
3249332493++ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
3249432494+3249532495+ float64x2_t r2 = vmulq_f64 (r, r);
3249632496+- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
3249732497+- w = vaddq_f64 (kd, w);
3249832498++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
3249932499++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
3250032500++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
3250132501++ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
3250232502++ y = vfmaq_f64 (p, r2, y);
3250332503+3250432504+ if (__glibc_unlikely (v_any_u32h (special)))
3250532505+- return special_case (x, y, w, r2, special);
3250632506+- return vfmaq_f64 (w, r2, y);
3250732507++ return special_case (hi, u_off, y, r2, special, d);
3250832508++ return vfmaq_f64 (hi, y, r2);
3250932509+ }
3251032510+diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
3251132511+index 015a6da7d7..b1a27fbc29 100644
3251232512+--- a/sysdeps/aarch64/fpu/log_advsimd.c
3251332513++++ b/sysdeps/aarch64/fpu/log_advsimd.c
3251432514+@@ -21,27 +21,29 @@
3251532515+3251632516+ static const struct data
3251732517+ {
3251832518+- uint64x2_t min_norm;
3251932519++ uint64x2_t off, sign_exp_mask, offset_lower_bound;
3252032520+ uint32x4_t special_bound;
3252132521+- float64x2_t poly[5];
3252232522+- float64x2_t ln2;
3252332523+- uint64x2_t sign_exp_mask;
3252432524++ float64x2_t c0, c2;
3252532525++ double c1, c3, ln2, c4;
3252632526+ } data = {
3252732527+- /* Worst-case error: 1.17 + 0.5 ulp.
3252832528+- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
3252932529+- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
3253032530+- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
3253132531+- V2 (-0x1.554e550bd501ep-3) },
3253232532+- .ln2 = V2 (0x1.62e42fefa39efp-1),
3253332533+- .min_norm = V2 (0x0010000000000000),
3253432534+- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
3253532535+- .sign_exp_mask = V2 (0xfff0000000000000)
3253632536++ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
3253732537++ .c0 = V2 (-0x1.ffffffffffff7p-2),
3253832538++ .c1 = 0x1.55555555170d4p-2,
3253932539++ .c2 = V2 (-0x1.0000000399c27p-2),
3254032540++ .c3 = 0x1.999b2e90e94cap-3,
3254132541++ .c4 = -0x1.554e550bd501ep-3,
3254232542++ .ln2 = 0x1.62e42fefa39efp-1,
3254332543++ .sign_exp_mask = V2 (0xfff0000000000000),
3254432544++ .off = V2 (0x3fe6900900000000),
3254532545++ /* Lower bound is 0x0010000000000000. For
3254632546++ optimised register use subnormals are detected after offset has been
3254732547++ subtracted, so lower bound - offset (which wraps around). */
3254832548++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
3254932549+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
3255032550+ };
3255132551+3255232552+-#define A(i) d->poly[i]
3255332553+ #define N (1 << V_LOG_TABLE_BITS)
3255432554+ #define IndexMask (N - 1)
3255532555+-#define Off v_u64 (0x3fe6900900000000)
3255632556+3255732557+ struct entry
3255832558+ {
3255932559+@@ -64,48 +66,56 @@ lookup (uint64x2_t i)
3256032560+ }
3256132561+3256232562+ static float64x2_t VPCS_ATTR NOINLINE
3256332563+-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
3256432564+- uint32x2_t cmp)
3256532565++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
3256632566++ uint32x2_t special, const struct data *d)
3256732567+ {
3256832568+- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
3256932569++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
3257032570++ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
3257132571+ }
3257232572+3257332573++/* Double-precision vector log routine.
3257432574++ The maximum observed error is 2.17 ULP:
3257532575++ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
3257632576++ want 0x1.ffffff1cca045p-2. */
3257732577+ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
3257832578+ {
3257932579+ const struct data *d = ptr_barrier (&data);
3258032580+- float64x2_t z, r, r2, p, y, kd, hi;
3258132581+- uint64x2_t ix, iz, tmp;
3258232582+- uint32x2_t cmp;
3258332583+- int64x2_t k;
3258432584+- struct entry e;
3258532585+3258632586+- ix = vreinterpretq_u64_f64 (x);
3258732587+- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
3258832588+- vget_low_u32 (d->special_bound));
3258932589++ /* To avoid having to mov x out of the way, keep u after offset has been
3259032590++ applied, and recover x by adding the offset back in the special-case
3259132591++ handler. */
3259232592++ uint64x2_t u = vreinterpretq_u64_f64 (x);
3259332593++ uint64x2_t u_off = vsubq_u64 (u, d->off);
3259432594+3259532595+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3259632596+ The range is split into N subintervals.
3259732597+ The ith subinterval contains z and c is near its center. */
3259832598+- tmp = vsubq_u64 (ix, Off);
3259932599+- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
3260032600+- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
3260132601+- z = vreinterpretq_f64_u64 (iz);
3260232602+- e = lookup (tmp);
3260332603++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
3260432604++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
3260532605++ float64x2_t z = vreinterpretq_f64_u64 (iz);
3260632606++
3260732607++ struct entry e = lookup (u_off);
3260832608++
3260932609++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
3261032610++ vget_low_u32 (d->special_bound));
3261132611+3261232612+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
3261332613+- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
3261432614+- kd = vcvtq_f64_s64 (k);
3261532615++ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
3261632616++ float64x2_t kd = vcvtq_f64_s64 (k);
3261732617+3261832618+ /* hi = r + log(c) + k*Ln2. */
3261932619+- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
3262032620++ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
3262132621++ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
3262232622++
3262332623+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
3262432624+- r2 = vmulq_f64 (r, r);
3262532625+- y = vfmaq_f64 (A (2), A (3), r);
3262632626+- p = vfmaq_f64 (A (0), A (1), r);
3262732627+- y = vfmaq_f64 (y, A (4), r2);
3262832628+- y = vfmaq_f64 (p, y, r2);
3262932629+-
3263032630+- if (__glibc_unlikely (v_any_u32h (cmp)))
3263132631+- return special_case (x, y, hi, r2, cmp);
3263232632++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
3263332633++ float64x2_t r2 = vmulq_f64 (r, r);
3263432634++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
3263532635++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
3263632636++ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
3263732637++ y = vfmaq_f64 (p, r2, y);
3263832638++
3263932639++ if (__glibc_unlikely (v_any_u32h (special)))
3264032640++ return special_case (hi, u_off, y, r2, special, d);
3264132641+ return vfmaq_f64 (hi, y, r2);
3264232642+ }
3264332643+3264432644+commit 2aed9796bfb17b257e63b12cefdb7ff60be09626
3264532645+Author: Pierre Blanchard <pierre.blanchard@arm.com>
3264632646+Date: Mon Dec 9 15:55:39 2024 +0000
3264732647+3264832648+ AArch64: Improve codegen in users of ADVSIMD log1p helper
3264932649+3265032650+ Add inline helper for log1p and rearrange operations so MOV
3265132651+ is not necessary in reduction or around the special-case handler.
3265232652+ Reduce memory access by using more indexed MLAs in polynomial.
3265332653+ Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%).
3265432654+3265532655+ (cherry picked from commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8)
3265632656+3265732657+diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
3265832658+index c88283cf11..a98f4a2e4d 100644
3265932659+--- a/sysdeps/aarch64/fpu/acosh_advsimd.c
3266032660++++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
3266132661+@@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
3266232662+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
3266332663+ #endif
3266432664+3266532665+- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
3266632666+- float64x2_t y;
3266732667+- y = vaddq_f64 (x, v_f64 (1));
3266832668++ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
3266932669++ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
3267032670+ y = vmulq_f64 (y, xm1);
3267132671+ y = vsqrtq_f64 (y);
3267232672+ y = vaddq_f64 (xm1, y);
3267332673+diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
3267432674+index 3c3d0bd6ad..eb9769aeac 100644
3267532675+--- a/sysdeps/aarch64/fpu/atanh_advsimd.c
3267632676++++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
3267732677+@@ -23,15 +23,19 @@
3267832678+ const static struct data
3267932679+ {
3268032680+ struct v_log1p_data log1p_consts;
3268132681+- uint64x2_t one, half;
3268232682++ uint64x2_t one;
3268332683++ uint64x2_t sign_mask;
3268432684+ } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
3268532685+ .one = V2 (0x3ff0000000000000),
3268632686+- .half = V2 (0x3fe0000000000000) };
3268732687++ .sign_mask = V2 (0x8000000000000000) };
3268832688+3268932689+ static float64x2_t VPCS_ATTR NOINLINE
3269032690+-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
3269132691++special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
3269232692++ uint64x2_t special, const struct data *d)
3269332693+ {
3269432694+- return v_call_f64 (atanh, x, y, special);
3269532695++ y = log1p_inline (y, &d->log1p_consts);
3269632696++ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
3269732697++ vmulq_f64 (halfsign, y), special);
3269832698+ }
3269932699+3270032700+ /* Approximation for vector double-precision atanh(x) using modified log1p.
3270132701+@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
3270232702+ {
3270332703+ const struct data *d = ptr_barrier (&data);
3270432704+3270532705++ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
3270632706+ float64x2_t ax = vabsq_f64 (x);
3270732707+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
3270832708+- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
3270932709+ uint64x2_t special = vcgeq_u64 (ia, d->one);
3271032710+- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
3271132711+3271232712+ #if WANT_SIMD_EXCEPT
3271332713+ ax = v_zerofy_f64 (ax, special);
3271432714+@@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
3271532715+3271632716+ float64x2_t y;
3271732717+ y = vaddq_f64 (ax, ax);
3271832718+- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
3271932719+- y = log1p_inline (y, &d->log1p_consts);
3272032720++ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
3272132721+3272232722+ if (__glibc_unlikely (v_any_u64 (special)))
3272332723+- return special_case (x, vmulq_f64 (y, halfsign), special);
3272432724++#if WANT_SIMD_EXCEPT
3272532725++ return special_case (x, halfsign, y, special, d);
3272632726++#else
3272732727++ return special_case (ax, halfsign, y, special, d);
3272832728++#endif
3272932729++
3273032730++ y = log1p_inline (y, &d->log1p_consts);
3273132731+ return vmulq_f64 (y, halfsign);
3273232732+ }
3273332733+diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
3273432734+index 114064c696..1263587201 100644
3273532735+--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
3273632736++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
3273732737+@@ -17,43 +17,26 @@
3273832738+ License along with the GNU C Library; if not, see
3273932739+ <https://www.gnu.org/licenses/>. */
3274032740+3274132741+-#include "v_math.h"
3274232742+-#include "poly_advsimd_f64.h"
3274332743++#define WANT_V_LOG1P_K0_SHORTCUT 0
3274432744++#include "v_log1p_inline.h"
3274532745+3274632746+ const static struct data
3274732747+ {
3274832748+- float64x2_t poly[19], ln2[2];
3274932749+- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
3275032750+- int64x2_t one_top;
3275132751+-} data = {
3275232752+- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
3275332753+- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
3275432754+- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
3275532755+- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
3275632756+- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
3275732757+- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
3275832758+- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
3275932759+- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
3276032760+- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
3276132761+- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
3276232762+- V2 (-0x1.cfa7385bdb37ep-6) },
3276332763+- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
3276432764+- /* top32(asuint64(sqrt(2)/2)) << 32. */
3276532765+- .hf_rt2_top = V2 (0x3fe6a09e00000000),
3276632766+- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
3276732767+- .one_m_hf_rt2_top = V2 (0x00095f6200000000),
3276832768+- .umask = V2 (0x000fffff00000000),
3276932769+- .one_top = V2 (0x3ff),
3277032770+- .inf = V2 (0x7ff0000000000000),
3277132771+- .minus_one = V2 (0xbff0000000000000)
3277232772+-};
3277332773++ struct v_log1p_data d;
3277432774++ uint64x2_t inf, minus_one;
3277532775++} data = { .d = V_LOG1P_CONSTANTS_TABLE,
3277632776++ .inf = V2 (0x7ff0000000000000),
3277732777++ .minus_one = V2 (0xbff0000000000000) };
3277832778+3277932779+ #define BottomMask v_u64 (0xffffffff)
3278032780+3278132781+-static float64x2_t VPCS_ATTR NOINLINE
3278232782+-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
3278332783++static float64x2_t NOINLINE VPCS_ATTR
3278432784++special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
3278532785+ {
3278632786+- return v_call_f64 (log1p, x, y, special);
3278732787++ /* Side-step special lanes so fenv exceptions are not triggered
3278832788++ inadvertently. */
3278932789++ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
3279032790++ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
3279132791+ }
3279232792+3279332793+ /* Vector log1p approximation using polynomial on reduced interval. Routine is
3279432794+@@ -66,66 +49,14 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
3279532795+ const struct data *d = ptr_barrier (&data);
3279632796+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
3279732797+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
3279832798+- uint64x2_t special = vcgeq_u64 (ia, d->inf);
3279932799+3280032800+-#if WANT_SIMD_EXCEPT
3280132801+- special = vorrq_u64 (special,
3280232802+- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
3280332803+- if (__glibc_unlikely (v_any_u64 (special)))
3280432804+- x = v_zerofy_f64 (x, special);
3280532805+-#else
3280632806+- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
3280732807+-#endif
3280832808++ uint64x2_t special_cases
3280932809++ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
3281032810+3281132811+- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
3281232812+- is in [sqrt(2)/2, sqrt(2)]):
3281332813+- log1p(x) = k*log(2) + log1p(f).
3281432814++ if (__glibc_unlikely (v_any_u64 (special_cases)))
3281532815++ return special_case (x, special_cases, d);
3281632816+3281732817+- f may not be representable exactly, so we need a correction term:
3281832818+- let m = round(1 + x), c = (1 + x) - m.
3281932819+- c << m: at very small x, log1p(x) ~ x, hence:
3282032820+- log(1+x) - log(m) ~ c/m.
3282132821+-
3282232822+- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
3282332823+-
3282432824+- /* Obtain correctly scaled k by manipulation in the exponent.
3282532825+- The scalar algorithm casts down to 32-bit at this point to calculate k and
3282632826+- u_red. We stay in double-width to obtain f and k, using the same constants
3282732827+- as the scalar algorithm but shifted left by 32. */
3282832828+- float64x2_t m = vaddq_f64 (x, v_f64 (1));
3282932829+- uint64x2_t mi = vreinterpretq_u64_f64 (m);
3283032830+- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
3283132831+-
3283232832+- int64x2_t ki
3283332833+- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
3283432834+- float64x2_t k = vcvtq_f64_s64 (ki);
3283532835+-
3283632836+- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
3283732837+- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
3283832838+- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
3283932839+- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
3284032840+-
3284132841+- /* Correction term c/m. */
3284232842+- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
3284332843+-
3284432844+- /* Approximate log1p(x) on the reduced input using a polynomial. Because
3284532845+- log1p(0)=0 we choose an approximation of the form:
3284632846+- x + C0*x^2 + C1*x^3 + C2x^4 + ...
3284732847+- Hence approximation has the form f + f^2 * P(f)
3284832848+- where P(x) = C0 + C1*x + C2x^2 + ...
3284932849+- Assembling this all correctly is dealt with at the final step. */
3285032850+- float64x2_t f2 = vmulq_f64 (f, f);
3285132851+- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
3285232852+-
3285332853+- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
3285432854+- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
3285532855+- float64x2_t y = vaddq_f64 (ylo, yhi);
3285632856+-
3285732857+- if (__glibc_unlikely (v_any_u64 (special)))
3285832858+- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
3285932859+- special);
3286032860+-
3286132861+- return vfmaq_f64 (y, f2, p);
3286232862++ return log1p_inline (x, &d->d);
3286332863+ }
3286432864+3286532865+ strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1))
3286632866+diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
3286732867+index 242e43b6ee..834ff65adf 100644
3286832868+--- a/sysdeps/aarch64/fpu/v_log1p_inline.h
3286932869++++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
3287032870+@@ -21,29 +21,30 @@
3287132871+ #define AARCH64_FPU_V_LOG1P_INLINE_H
3287232872+3287332873+ #include "v_math.h"
3287432874+-#include "poly_advsimd_f64.h"
3287532875+3287632876+ struct v_log1p_data
3287732877+ {
3287832878+- float64x2_t poly[19], ln2[2];
3287932879++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
3288032880+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
3288132881+ int64x2_t one_top;
3288232882++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
3288332883++ double ln2[2];
3288432884+ };
3288532885+3288632886+ /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
3288732887+ #define V_LOG1P_CONSTANTS_TABLE \
3288832888+ { \
3288932889+- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
3289032890+- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
3289132891+- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
3289232892+- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
3289332893+- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
3289432894+- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
3289532895+- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
3289632896+- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
3289732897+- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
3289832898+- V2 (-0x1.cfa7385bdb37ep-6) }, \
3289932899+- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
3290032900++ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
3290132901++ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
3290232902++ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
3290332903++ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
3290432904++ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
3290532905++ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
3290632906++ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
3290732907++ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
3290832908++ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
3290932909++ .c18 = -0x1.cfa7385bdb37ep-6, \
3291032910++ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
3291132911+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
3291232912+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
3291332913+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
3291432914+@@ -51,19 +52,45 @@ struct v_log1p_data
3291532915+3291632916+ #define BottomMask v_u64 (0xffffffff)
3291732917+3291832918++static inline float64x2_t
3291932919++eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
3292032920++{
3292132921++ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
3292232922++ float64x2_t c13 = vld1q_f64 (&d->c1);
3292332923++ float64x2_t c57 = vld1q_f64 (&d->c5);
3292432924++ float64x2_t c911 = vld1q_f64 (&d->c9);
3292532925++ float64x2_t c1315 = vld1q_f64 (&d->c13);
3292632926++ float64x2_t c1718 = vld1q_f64 (&d->c17);
3292732927++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
3292832928++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
3292932929++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
3293032930++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
3293132931++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
3293232932++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
3293332933++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
3293432934++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
3293532935++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
3293632936++ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
3293732937++ p = vfmaq_f64 (p1415, m2, p);
3293832938++ p = vfmaq_f64 (p1213, m2, p);
3293932939++ p = vfmaq_f64 (p1011, m2, p);
3294032940++ p = vfmaq_f64 (p89, m2, p);
3294132941++ p = vfmaq_f64 (p67, m2, p);
3294232942++ p = vfmaq_f64 (p45, m2, p);
3294332943++ p = vfmaq_f64 (p23, m2, p);
3294432944++ return vfmaq_f64 (p01, m2, p);
3294532945++}
3294632946++
3294732947+ static inline float64x2_t
3294832948+ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
3294932949+ {
3295032950+- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
3295132951+- modifications:
3295232952++ /* Helper for calculating log(x + 1):
3295332953+ - No special-case handling - this should be dealt with by the caller.
3295432954+- - Pairwise Horner polynomial evaluation for improved accuracy.
3295532955+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
3295632956+- using v_sel, for improved accuracy when the argument to log1p is close to
3295732957+- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
3295832958+- the source of the caller before including this file.
3295932959+- See v_log1pf_2u1.c for details of the algorithm. */
3296032960+- float64x2_t m = vaddq_f64 (x, v_f64 (1));
3296132961++ using v_sel, for improved accuracy when the argument to log1p is close
3296232962++ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
3296332963++ in the source of the caller before including this file. */
3296432964++ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
3296532965+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
3296632966+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
3296732967+3296832968+@@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
3296932969+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
3297032970+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
3297132971+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
3297232972+- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
3297332973++ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
3297432974+3297532975+ /* Correction term c/m. */
3297632976+- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
3297732977++ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
3297832978+3297932979+ #ifndef WANT_V_LOG1P_K0_SHORTCUT
3298032980+-#error \
3298132981+- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
3298232982++# error \
3298332983++ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
3298432984+ #elif WANT_V_LOG1P_K0_SHORTCUT
3298532985+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
3298632986+ that the approximation is solely the polynomial. */
3298732987+@@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
3298832988+3298932989+ /* Approximate log1p(f) on the reduced input using a polynomial. */
3299032990+ float64x2_t f2 = vmulq_f64 (f, f);
3299132991+- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
3299232992++ float64x2_t p = eval_poly (f, f2, d);
3299332993+3299432994+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
3299532995+- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
3299632996+- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
3299732997++ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
3299832998++ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
3299932999++ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
3300033000+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
3300133001+ }
3300233002+3300333003+3300433004+commit 9170b921fa49d2ef37141506837baaae92c7d3f8
3300533005+Author: Joana Cruz <Joana.Cruz@arm.com>
3300633006+Date: Tue Dec 17 14:47:31 2024 +0000
3300733007+3300833008+ AArch64: Improve codegen of AdvSIMD logf function family
3300933009+3301033010+ Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
3301133011+ 8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log,
3301233012+ and 2% for log10.
3301333013+3301433014+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3301533015+ (cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3)
3301633016+3301733017+diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
3301833018+index 82228b599a..0d792c3df9 100644
3301933019+--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
3302033020++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
3302133021+@@ -18,21 +18,25 @@
3302233022+ <https://www.gnu.org/licenses/>. */
3302333023+3302433024+ #include "v_math.h"
3302533025+-#include "poly_advsimd_f32.h"
3302633026+3302733027+ static const struct data
3302833028+ {
3302933029++ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
3303033030+ uint32x4_t off, offset_lower_bound;
3303133031+ uint16x8_t special_bound;
3303233032+ uint32x4_t mantissa_mask;
3303333033+- float32x4_t poly[8];
3303433034+- float32x4_t inv_ln10, ln2;
3303533035++ float c1, c3, c5, c7;
3303633036+ } data = {
3303733037+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
3303833038+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
3303933039+- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
3304033040+- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
3304133041+- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
3304233042++ .c0 = V4 (-0x1.bcb79cp-3f),
3304333043++ .c1 = 0x1.2879c8p-3f,
3304433044++ .c2 = V4 (-0x1.bcd472p-4f),
3304533045++ .c3 = 0x1.6408f8p-4f,
3304633046++ .c4 = V4 (-0x1.246f8p-4f),
3304733047++ .c5 = 0x1.f0e514p-5f,
3304833048++ .c6 = V4 (-0x1.0fc92cp-4f),
3304933049++ .c7 = 0x1.f5f76ap-5f,
3305033050+ .ln2 = V4 (0x1.62e43p-1f),
3305133051+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
3305233052+ /* Lower bound is the smallest positive normal float 0x00800000. For
3305333053+@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
3305433054+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
3305533055+ {
3305633056+ const struct data *d = ptr_barrier (&data);
3305733057+-
3305833058++ float32x4_t c1357 = vld1q_f32 (&d->c1);
3305933059+ /* To avoid having to mov x out of the way, keep u after offset has been
3306033060+ applied, and recover x by adding the offset back in the special-case
3306133061+ handler. */
3306233062+@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
3306333063+3306433064+ /* y = log10(1+r) + n * log10(2). */
3306533065+ float32x4_t r2 = vmulq_f32 (r, r);
3306633066+- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
3306733067++
3306833068++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
3306933069++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
3307033070++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
3307133071++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
3307233072++
3307333073++ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
3307433074++ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
3307533075++ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
3307633076++
3307733077+ /* y = Log10(2) * n + poly * InvLn(10). */
3307833078+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
3307933079+ y = vmulq_f32 (y, d->inv_ln10);
3308033080+diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
3308133081+index 84effe4fe9..116c36c8e2 100644
3308233082+--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
3308333083++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
3308433084+@@ -18,22 +18,27 @@
3308533085+ <https://www.gnu.org/licenses/>. */
3308633086+3308733087+ #include "v_math.h"
3308833088+-#include "poly_advsimd_f32.h"
3308933089+3309033090+ static const struct data
3309133091+ {
3309233092++ float32x4_t c0, c2, c4, c6, c8;
3309333093+ uint32x4_t off, offset_lower_bound;
3309433094+ uint16x8_t special_bound;
3309533095+ uint32x4_t mantissa_mask;
3309633096+- float32x4_t poly[9];
3309733097++ float c1, c3, c5, c7;
3309833098+ } data = {
3309933099+ /* Coefficients generated using Remez algorithm approximate
3310033100+ log2(1+r)/r for r in [ -1/3, 1/3 ].
3310133101+ rel error: 0x1.c4c4b0cp-26. */
3310233102+- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
3310333103+- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
3310433104+- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
3310533105+- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
3310633106++ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
3310733107++ .c1 = -0x1.715458p-1f,
3310833108++ .c2 = V4 (0x1.ec701cp-2f),
3310933109++ .c3 = -0x1.7171a4p-2f,
3311033110++ .c4 = V4 (0x1.27a0b8p-2f),
3311133111++ .c5 = -0x1.e5143ep-3f,
3311233112++ .c6 = V4 (0x1.9d8ecap-3f),
3311333113++ .c7 = -0x1.c675bp-3f,
3311433114++ .c8 = V4 (0x1.9e495p-3f),
3311533115+ /* Lower bound is the smallest positive normal float 0x00800000. For
3311633116+ optimised register use subnormals are detected after offset has been
3311733117+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
3311833118+@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
3311933119+3312033120+ /* y = log2(1+r) + n. */
3312133121+ float32x4_t r2 = vmulq_f32 (r, r);
3312233122+- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
3312333123++
3312433124++ float32x4_t c1357 = vld1q_f32 (&d->c1);
3312533125++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
3312633126++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
3312733127++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
3312833128++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
3312933129++ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
3313033130++ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
3313133131++ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
3313233132++ float32x4_t p = vfmaq_f32 (c01, r2, p28);
3313333133+3313433134+ if (__glibc_unlikely (v_any_u16h (special)))
3313533135+ return special_case (n, u_off, p, r, special, d);
3313633136+ return vfmaq_f32 (n, p, r);
3313733137+ }
3313833138++
3313933139+ libmvec_hidden_def (V_NAME_F1 (log2))
3314033140+ HALF_WIDTH_ALIAS_F1 (log2)
3314133141+diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
3314233142+index c20dbfd6c0..d9e64c732d 100644
3314333143+--- a/sysdeps/aarch64/fpu/logf_advsimd.c
3314433144++++ b/sysdeps/aarch64/fpu/logf_advsimd.c
3314533145+@@ -21,16 +21,19 @@
3314633146+3314733147+ static const struct data
3314833148+ {
3314933149+- uint32x4_t off, offset_lower_bound;
3315033150++ float32x4_t c2, c4, c6, ln2;
3315133151++ uint32x4_t off, offset_lower_bound, mantissa_mask;
3315233152+ uint16x8_t special_bound;
3315333153+- uint32x4_t mantissa_mask;
3315433154+- float32x4_t poly[7];
3315533155+- float32x4_t ln2;
3315633156++ float c1, c3, c5, c0;
3315733157+ } data = {
3315833158+ /* 3.34 ulp error. */
3315933159+- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
3316033160+- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
3316133161+- V4 (-0x1.ffffc8p-2f) },
3316233162++ .c0 = -0x1.3e737cp-3f,
3316333163++ .c1 = 0x1.5a9aa2p-3f,
3316433164++ .c2 = V4 (-0x1.4f9934p-3f),
3316533165++ .c3 = 0x1.961348p-3f,
3316633166++ .c4 = V4 (-0x1.00187cp-2f),
3316733167++ .c5 = 0x1.555d7cp-2f,
3316833168++ .c6 = V4 (-0x1.ffffc8p-2f),
3316933169+ .ln2 = V4 (0x1.62e43p-1f),
3317033170+ /* Lower bound is the smallest positive normal float 0x00800000. For
3317133171+ optimised register use subnormals are detected after offset has been
3317233172+@@ -41,8 +44,6 @@ static const struct data
3317333173+ .mantissa_mask = V4 (0x007fffff)
3317433174+ };
3317533175+3317633176+-#define P(i) d->poly[7 - i]
3317733177+-
3317833178+ static float32x4_t VPCS_ATTR NOINLINE
3317933179+ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
3318033180+ uint16x4_t cmp, const struct data *d)
3318133181+@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
3318233182+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
3318333183+ {
3318433184+ const struct data *d = ptr_barrier (&data);
3318533185+- float32x4_t n, p, q, r, r2, y;
3318633186+- uint32x4_t u, u_off;
3318733187+- uint16x4_t cmp;
3318833188++ float32x4_t c1350 = vld1q_f32 (&d->c1);
3318933189+3319033190+ /* To avoid having to mov x out of the way, keep u after offset has been
3319133191+ applied, and recover x by adding the offset back in the special-case
3319233192+ handler. */
3319333193+- u_off = vreinterpretq_u32_f32 (x);
3319433194++ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
3319533195+3319633196+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
3319733197+- u_off = vsubq_u32 (u_off, d->off);
3319833198+- n = vcvtq_f32_s32 (
3319933199++ float32x4_t n = vcvtq_f32_s32 (
3320033200+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
3320133201+- u = vandq_u32 (u_off, d->mantissa_mask);
3320233202+- u = vaddq_u32 (u, d->off);
3320333203+- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
3320433204++ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
3320533205++ vget_low_u16 (d->special_bound));
3320633206+3320733207+- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
3320833208+- vget_low_u16 (d->special_bound));
3320933209++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
3321033210++ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
3321133211+3321233212+ /* y = log(1+r) + n*ln2. */
3321333213+- r2 = vmulq_f32 (r, r);
3321433214++ float32x4_t r2 = vmulq_f32 (r, r);
3321533215+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
3321633216+- p = vfmaq_f32 (P (5), P (6), r);
3321733217+- q = vfmaq_f32 (P (3), P (4), r);
3321833218+- y = vfmaq_f32 (P (1), P (2), r);
3321933219+- p = vfmaq_f32 (p, P (7), r2);
3322033220++ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
3322133221++ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
3322233222++ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
3322333223++ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
3322433224++
3322533225+ q = vfmaq_f32 (q, p, r2);
3322633226+ y = vfmaq_f32 (y, q, r2);
3322733227+ p = vfmaq_f32 (r, d->ln2, n);
3322833228+3322933229+commit 41dc9e7c2d80bc5e886950b8a7bd21f77c9793b3
3323033230+Author: Joana Cruz <Joana.Cruz@arm.com>
3323133231+Date: Tue Dec 17 14:49:30 2024 +0000
3323233232+3323333233+ AArch64: Improve codegen of AdvSIMD atan(2)(f)
3323433234+3323533235+ Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
3323633236+ 8% improvement in throughput microbenchmark on Neoverse V1.
3323733237+3323833238+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3323933239+ (cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e)
3324033240+3324133241+diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
3324233242+index b1e7a9b8fc..1a8f02109f 100644
3324333243+--- a/sysdeps/aarch64/fpu/atan2_advsimd.c
3324433244++++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
3324533245+@@ -23,40 +23,57 @@
3324633246+3324733247+ static const struct data
3324833248+ {
3324933249++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
3325033250+ float64x2_t pi_over_2;
3325133251+- float64x2_t poly[20];
3325233252++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
3325333253++ uint64x2_t zeroinfnan, minustwo;
3325433254+ } data = {
3325533255+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
3325633256+- the interval [2**-1022, 1.0]. */
3325733257+- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
3325833258+- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
3325933259+- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
3326033260+- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
3326133261+- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
3326233262+- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
3326333263+- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
3326433264+- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
3326533265+- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
3326633266+- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
3326733267++ [2**-1022, 1.0]. */
3326833268++ .c0 = V2 (-0x1.5555555555555p-2),
3326933269++ .c1 = 0x1.99999999996c1p-3,
3327033270++ .c2 = V2 (-0x1.2492492478f88p-3),
3327133271++ .c3 = 0x1.c71c71bc3951cp-4,
3327233272++ .c4 = V2 (-0x1.745d160a7e368p-4),
3327333273++ .c5 = 0x1.3b139b6a88ba1p-4,
3327433274++ .c6 = V2 (-0x1.11100ee084227p-4),
3327533275++ .c7 = 0x1.e1d0f9696f63bp-5,
3327633276++ .c8 = V2 (-0x1.aebfe7b418581p-5),
3327733277++ .c9 = 0x1.842dbe9b0d916p-5,
3327833278++ .c10 = V2 (-0x1.5d30140ae5e99p-5),
3327933279++ .c11 = 0x1.338e31eb2fbbcp-5,
3328033280++ .c12 = V2 (-0x1.00e6eece7de8p-5),
3328133281++ .c13 = 0x1.860897b29e5efp-6,
3328233282++ .c14 = V2 (-0x1.0051381722a59p-6),
3328333283++ .c15 = 0x1.14e9dc19a4a4ep-7,
3328433284++ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
3328533285++ .c17 = 0x1.17739e210171ap-10,
3328633286++ .c18 = V2 (-0x1.ab24da7be7402p-13),
3328733287++ .c19 = 0x1.358851160a528p-16,
3328833288+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
3328933289++ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
3329033290++ .minustwo = V2 (0xc000000000000000),
3329133291+ };
3329233292+3329333293+ #define SignMask v_u64 (0x8000000000000000)
3329433294+3329533295+ /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
3329633296+ static float64x2_t VPCS_ATTR NOINLINE
3329733297+-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
3329833298++special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
3329933299++ uint64x2_t sign_xy, uint64x2_t cmp)
3330033300+ {
3330133301++ /* Account for the sign of x and y. */
3330233302++ ret = vreinterpretq_f64_u64 (
3330333303++ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
3330433304+ return v_call2_f64 (atan2, y, x, ret, cmp);
3330533305+ }
3330633306+3330733307+ /* Returns 1 if input is the bit representation of 0, infinity or nan. */
3330833308+ static inline uint64x2_t
3330933309+-zeroinfnan (uint64x2_t i)
3331033310++zeroinfnan (uint64x2_t i, const struct data *d)
3331133311+ {
3331233312+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
3331333313+- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
3331433314+- v_u64 (2 * asuint64 (INFINITY) - 1));
3331533315++ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
3331633316+ }
3331733317+3331833318+ /* Fast implementation of vector atan2.
3331933319+@@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i)
3332033320+ want 0x1.92d628ab678cfp-1. */
3332133321+ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
3332233322+ {
3332333323+- const struct data *data_ptr = ptr_barrier (&data);
3332433324++ const struct data *d = ptr_barrier (&data);
3332533325+3332633326+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
3332733327+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
3332833328+3332933329+- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
3333033330++ uint64x2_t special_cases
3333133331++ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
3333233332+3333333333+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
3333433334+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
3333533335+@@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
3333633336+ float64x2_t ay = vabsq_f64 (y);
3333733337+3333833338+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
3333933339+- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
3334033340++ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
3334133341+3334233342+ /* Set up z for call to atan. */
3334333343+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
3334433344+- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
3334533345+- float64x2_t z = vdivq_f64 (n, d);
3334633346++ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
3334733347++ float64x2_t z = vdivq_f64 (n, q);
3334833348+3334933349+ /* Work out the correct shift. */
3335033350+- float64x2_t shift = vreinterpretq_f64_u64 (
3335133351+- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
3335233352++ float64x2_t shift
3335333353++ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
3335433354+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
3335533355+- shift = vmulq_f64 (shift, data_ptr->pi_over_2);
3335633356++ shift = vmulq_f64 (shift, d->pi_over_2);
3335733357+3335833358+ /* Calculate the polynomial approximation.
3335933359+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
3336033360+@@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
3336133361+ float64x2_t x2 = vmulq_f64 (z2, z2);
3336233362+ float64x2_t x4 = vmulq_f64 (x2, x2);
3336333363+ float64x2_t x8 = vmulq_f64 (x4, x4);
3336433364+- float64x2_t ret
3336533365+- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
3336633366+- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
3336733367++
3336833368++ float64x2_t c13 = vld1q_f64 (&d->c1);
3336933369++ float64x2_t c57 = vld1q_f64 (&d->c5);
3337033370++ float64x2_t c911 = vld1q_f64 (&d->c9);
3337133371++ float64x2_t c1315 = vld1q_f64 (&d->c13);
3337233372++ float64x2_t c1719 = vld1q_f64 (&d->c17);
3337333373++
3337433374++ /* estrin_7. */
3337533375++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
3337633376++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
3337733377++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
3337833378++
3337933379++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
3338033380++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
3338133381++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
3338233382++
3338333383++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
3338433384++
3338533385++ /* estrin_11. */
3338633386++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
3338733387++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
3338833388++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
3338933389++
3339033390++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
3339133391++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
3339233392++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
3339333393++
3339433394++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
3339533395++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
3339633396++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
3339733397++
3339833398++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
3339933399++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
3340033400++
3340133401++ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
3340233402+3340333403+ /* Finalize. y = shift + z + z^3 * P(z^2). */
3340433404+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
3340533405+ ret = vaddq_f64 (ret, shift);
3340633406+3340733407++ if (__glibc_unlikely (v_any_u64 (special_cases)))
3340833408++ return special_case (y, x, ret, sign_xy, special_cases);
3340933409++
3341033410+ /* Account for the sign of x and y. */
3341133411+ ret = vreinterpretq_f64_u64 (
3341233412+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
3341333413+3341433414+- if (__glibc_unlikely (v_any_u64 (special_cases)))
3341533415+- return special_case (y, x, ret, special_cases);
3341633416+-
3341733417+ return ret;
3341833418+ }
3341933419+diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
3342033420+index 56e610caf1..88daacd76c 100644
3342133421+--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
3342233422++++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
3342333423+@@ -22,34 +22,39 @@
3342433424+3342533425+ static const struct data
3342633426+ {
3342733427+- float32x4_t poly[8];
3342833428+- float32x4_t pi_over_2;
3342933429++ float32x4_t c0, pi_over_2, c4, c6, c2;
3343033430++ float c1, c3, c5, c7;
3343133431++ uint32x4_t comp_const;
3343233432+ } data = {
3343333433+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
3343433434+ [2**-128, 1.0].
3343533435+ Generated using fpminimax between FLT_MIN and 1. */
3343633436+- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
3343733437+- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
3343833438+- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
3343933439+- .pi_over_2 = V4 (0x1.921fb6p+0f),
3344033440++ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
3344133441++ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
3344233442++ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
3344333443++ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
3344433444++ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
3344533445+ };
3344633446+3344733447+ #define SignMask v_u32 (0x80000000)
3344833448+3344933449+ /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
3345033450+ static float32x4_t VPCS_ATTR NOINLINE
3345133451+-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
3345233452++special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
3345333453++ uint32x4_t sign_xy, uint32x4_t cmp)
3345433454+ {
3345533455++ /* Account for the sign of y. */
3345633456++ ret = vreinterpretq_f32_u32 (
3345733457++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
3345833458+ return v_call2_f32 (atan2f, y, x, ret, cmp);
3345933459+ }
3346033460+3346133461+ /* Returns 1 if input is the bit representation of 0, infinity or nan. */
3346233462+ static inline uint32x4_t
3346333463+-zeroinfnan (uint32x4_t i)
3346433464++zeroinfnan (uint32x4_t i, const struct data *d)
3346533465+ {
3346633466+ /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
3346733467+- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
3346833468+- v_u32 (2 * 0x7f800000lu - 1));
3346933469++ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
3347033470+ }
3347133471+3347233472+ /* Fast implementation of vector atan2f. Maximum observed error is
3347333473+@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i)
3347433474+ want 0x1.967f00p-1. */
3347533475+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
3347633476+ {
3347733477+- const struct data *data_ptr = ptr_barrier (&data);
3347833478++ const struct data *d = ptr_barrier (&data);
3347933479+3348033480+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
3348133481+ uint32x4_t iy = vreinterpretq_u32_f32 (y);
3348233482+3348333483+- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
3348433484++ uint32x4_t special_cases
3348533485++ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
3348633486+3348733487+ uint32x4_t sign_x = vandq_u32 (ix, SignMask);
3348833488+ uint32x4_t sign_y = vandq_u32 (iy, SignMask);
3348933489+@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
3349033490+3349133491+ /* Set up z for call to atanf. */
3349233492+ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
3349333493+- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
3349433494+- float32x4_t z = vdivq_f32 (n, d);
3349533495++ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
3349633496++ float32x4_t z = vdivq_f32 (n, q);
3349733497+3349833498+ /* Work out the correct shift. */
3349933499+ float32x4_t shift = vreinterpretq_f32_u32 (
3350033500+ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
3350133501+ shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
3350233502+- shift = vmulq_f32 (shift, data_ptr->pi_over_2);
3350333503++ shift = vmulq_f32 (shift, d->pi_over_2);
3350433504+3350533505+ /* Calculate the polynomial approximation.
3350633506+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
3350733507+@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
3350833508+ float32x4_t z2 = vmulq_f32 (z, z);
3350933509+ float32x4_t z4 = vmulq_f32 (z2, z2);
3351033510+3351133511+- float32x4_t ret = vfmaq_f32 (
3351233512+- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
3351333513+- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
3351433514++ float32x4_t c1357 = vld1q_f32 (&d->c1);
3351533515++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
3351633516++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
3351733517++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
3351833518++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
3351933519++ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
3352033520++ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
3352133521++
3352233522++ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
3352333523+3352433524+ /* y = shift + z * P(z^2). */
3352533525+ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
3352633526+3352733527+- /* Account for the sign of y. */
3352833528+- ret = vreinterpretq_f32_u32 (
3352933529+- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
3353033530+-
3353133531+ if (__glibc_unlikely (v_any_u32 (special_cases)))
3353233532+ {
3353333533+- return special_case (y, x, ret, special_cases);
3353433534++ return special_case (y, x, ret, sign_xy, special_cases);
3353533535+ }
3353633536+3353733537+- return ret;
3353833538++ /* Account for the sign of y. */
3353933539++ return vreinterpretq_f32_u32 (
3354033540++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
3354133541+ }
3354233542+ libmvec_hidden_def (V_NAME_F2 (atan2))
3354333543+ HALF_WIDTH_ALIAS_F2(atan2)
3354433544+diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
3354533545+index a962be0f78..14f1809796 100644
3354633546+--- a/sysdeps/aarch64/fpu/atan_advsimd.c
3354733547++++ b/sysdeps/aarch64/fpu/atan_advsimd.c
3354833548+@@ -22,21 +22,22 @@
3354933549+3355033550+ static const struct data
3355133551+ {
3355233552++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
3355333553+ float64x2_t pi_over_2;
3355433554+- float64x2_t poly[20];
3355533555++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
3355633556+ } data = {
3355733557+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
3355833558+ [2**-1022, 1.0]. */
3355933559+- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
3356033560+- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
3356133561+- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
3356233562+- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
3356333563+- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
3356433564+- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
3356533565+- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
3356633566+- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
3356733567+- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
3356833568+- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
3356933569++ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
3357033570++ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
3357133571++ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
3357233572++ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
3357333573++ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
3357433574++ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
3357533575++ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
3357633576++ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
3357733577++ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
3357833578++ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
3357933579+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
3358033580+ };
3358133581+3358233582+@@ -52,6 +53,11 @@ static const struct data
3358333583+ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
3358433584+ {
3358533585+ const struct data *d = ptr_barrier (&data);
3358633586++ float64x2_t c13 = vld1q_f64 (&d->c1);
3358733587++ float64x2_t c57 = vld1q_f64 (&d->c5);
3358833588++ float64x2_t c911 = vld1q_f64 (&d->c9);
3358933589++ float64x2_t c1315 = vld1q_f64 (&d->c13);
3359033590++ float64x2_t c1719 = vld1q_f64 (&d->c17);
3359133591+3359233592+ /* Small cases, infs and nans are supported by our approximation technique,
3359333593+ but do not set fenv flags correctly. Only trigger special case if we need
3359433594+@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
3359533595+ float64x2_t x2 = vmulq_f64 (z2, z2);
3359633596+ float64x2_t x4 = vmulq_f64 (x2, x2);
3359733597+ float64x2_t x8 = vmulq_f64 (x4, x4);
3359833598+- float64x2_t y
3359933599+- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
3360033600+- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
3360133601++
3360233602++ /* estrin_7. */
3360333603++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
3360433604++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
3360533605++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
3360633606++
3360733607++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
3360833608++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
3360933609++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
3361033610++
3361133611++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
3361233612++
3361333613++ /* estrin_11. */
3361433614++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
3361533615++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
3361633616++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
3361733617++
3361833618++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
3361933619++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
3362033620++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
3362133621++
3362233622++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
3362333623++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
3362433624++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
3362533625++
3362633626++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
3362733627++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
3362833628++
3362933629++ float64x2_t y = vfmaq_f64 (p07, p819, x8);
3363033630+3363133631+ /* Finalize. y = shift + z + z^3 * P(z^2). */
3363233632+ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
3363333633+3363433634+commit bf2b60a56036c951a798845223a2e04cc48507e4
3363533635+Author: Joana Cruz <Joana.Cruz@arm.com>
3363633636+Date: Tue Dec 17 14:50:33 2024 +0000
3363733637+3363833638+ AArch64: Improve codegen of AdvSIMD expf family
3363933639+3364033640+ Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
3364133641+ Also use intrinsics instead of native operations.
3364233642+ expf: 3% improvement in throughput microbenchmark on Neoverse V1, exp2f: 5%,
3364333643+ exp10f: 13%, coshf: 14%.
3364433644+3364533645+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3364633646+ (cherry picked from commit cff9648d0b50d19cdaf685f6767add040d4e1a8e)
3364733647+3364833648+diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
3364933649+index c1ab4923b8..cd5c866521 100644
3365033650+--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
3365133651++++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
3365233652+@@ -23,19 +23,27 @@
3365333653+ static const struct data
3365433654+ {
3365533655+ struct v_expf_data expf_consts;
3365633656+- uint32x4_t tiny_bound, special_bound;
3365733657++ uint32x4_t tiny_bound;
3365833658++ float32x4_t bound;
3365933659++#if WANT_SIMD_EXCEPT
3366033660++ uint32x4_t special_bound;
3366133661++#endif
3366233662+ } data = {
3366333663+ .expf_consts = V_EXPF_DATA,
3366433664+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
3366533665+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
3366633666++ .bound = V4 (0x1.5a92d8p+6),
3366733667++#if WANT_SIMD_EXCEPT
3366833668+ .special_bound = V4 (0x42ad496c),
3366933669++#endif
3367033670+ };
3367133671+3367233672+ #if !WANT_SIMD_EXCEPT
3367333673+ static float32x4_t NOINLINE VPCS_ATTR
3367433674+-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
3367533675++special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
3367633676++ uint32x4_t special)
3367733677+ {
3367833678+- return v_call_f32 (coshf, x, y, special);
3367933679++ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
3368033680+ }
3368133681+ #endif
3368233682+3368333683+@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
3368433684+ {
3368533685+ const struct data *d = ptr_barrier (&data);
3368633686+3368733687+- float32x4_t ax = vabsq_f32 (x);
3368833688+- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
3368933689+- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
3369033690+-
3369133691+ #if WANT_SIMD_EXCEPT
3369233692+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
3369333693+ variant for all inputs if any input is a special value or above the bound
3369433694+ at which expf overflows. */
3369533695++ float32x4_t ax = vabsq_f32 (x);
3369633696++ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
3369733697++ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
3369833698+ if (__glibc_unlikely (v_any_u32 (special)))
3369933699+ return v_call_f32 (coshf, x, x, v_u32 (-1));
3370033700+3370133701+@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
3370233702+ input to 0, which will generate no exceptions. */
3370333703+ if (__glibc_unlikely (v_any_u32 (tiny)))
3370433704+ ax = v_zerofy_f32 (ax, tiny);
3370533705++ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
3370633706++#else
3370733707++ uint32x4_t special = vcageq_f32 (x, d->bound);
3370833708++ float32x4_t t = v_expf_inline (x, &d->expf_consts);
3370933709+ #endif
3371033710+3371133711+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
3371233712+- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
3371333713+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
3371433714+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
3371533715+3371633716+@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
3371733717+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
3371833718+ #else
3371933719+ if (__glibc_unlikely (v_any_u32 (special)))
3372033720+- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
3372133721++ return special_case (x, half_t, half_over_t, special);
3372233722+ #endif
3372333723+3372433724+ return vaddq_f32 (half_t, half_over_t);
3372533725+diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
3372633726+index cf53e73290..55d9cd83f2 100644
3372733727+--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
3372833728++++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
3372933729+@@ -18,16 +18,15 @@
3373033730+ <https://www.gnu.org/licenses/>. */
3373133731+3373233732+ #include "v_math.h"
3373333733+-#include "poly_advsimd_f32.h"
3373433734+3373533735+ #define ScaleBound 192.0f
3373633736+3373733737+ static const struct data
3373833738+ {
3373933739+- float32x4_t poly[5];
3374033740+- float log10_2_and_inv[4];
3374133741+- float32x4_t shift;
3374233742+-
3374333743++ float32x4_t c0, c1, c3;
3374433744++ float log10_2_high, log10_2_low, c2, c4;
3374533745++ float32x4_t inv_log10_2, special_bound;
3374633746++ uint32x4_t exponent_bias, special_offset, special_bias;
3374733747+ #if !WANT_SIMD_EXCEPT
3374833748+ float32x4_t scale_thresh;
3374933749+ #endif
3375033750+@@ -37,19 +36,24 @@ static const struct data
3375133751+ rel error: 0x1.89dafa3p-24
3375233752+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
3375333753+ maxerr: 1.85943 +0.5 ulp. */
3375433754+- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
3375533755+- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
3375633756+- .shift = V4 (0x1.8p23f),
3375733757+-
3375833758+- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
3375933759+- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
3376033760++ .c0 = V4 (0x1.26bb16p+1f),
3376133761++ .c1 = V4 (0x1.5350d2p+1f),
3376233762++ .c2 = 0x1.04744ap+1f,
3376333763++ .c3 = V4 (0x1.2d8176p+0f),
3376433764++ .c4 = 0x1.12b41ap-1f,
3376533765++ .inv_log10_2 = V4 (0x1.a934fp+1),
3376633766++ .log10_2_high = 0x1.344136p-2,
3376733767++ .log10_2_low = 0x1.ec10cp-27,
3376833768++ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
3376933769++ .special_bound = V4 (126.0f),
3377033770++ .exponent_bias = V4 (0x3f800000),
3377133771++ .special_offset = V4 (0x82000000),
3377233772++ .special_bias = V4 (0x7f000000),
3377333773+ #if !WANT_SIMD_EXCEPT
3377433774+ .scale_thresh = V4 (ScaleBound)
3377533775+ #endif
3377633776+ };
3377733777+3377833778+-#define ExponentBias v_u32 (0x3f800000)
3377933779+-
3378033780+ #if WANT_SIMD_EXCEPT
3378133781+3378233782+ # define SpecialBound 38.0f /* rint(log10(2^127)). */
3378333783+@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
3378433784+3378533785+ #else
3378633786+3378733787+-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
3378833788+-# define SpecialOffset v_u32 (0x82000000)
3378933789+-# define SpecialBias v_u32 (0x7f000000)
3379033790++# define SpecialBound 126.0f
3379133791+3379233792+ static float32x4_t VPCS_ATTR NOINLINE
3379333793+ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
3379433794+ float32x4_t scale, const struct data *d)
3379533795+ {
3379633796+ /* 2^n may overflow, break it up into s1*s2. */
3379733797+- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
3379833798+- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
3379933799++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
3380033800++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
3380133801+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
3380233802+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
3380333803+ float32x4_t r2 = vmulq_f32 (s1, s1);
3380433804+@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
3380533805+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
3380633806+ with poly(r) in [1/sqrt(2), sqrt(2)] and
3380733807+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
3380833808+- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
3380933809+- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
3381033810+- float32x4_t n = vsubq_f32 (z, d->shift);
3381133811+- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
3381233812+- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
3381333813+- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
3381433814++ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
3381533815++ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
3381633816++ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
3381733817++ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
3381833818++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
3381933819+3382033820+- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
3382133821++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3382233822+3382333823+ #if !WANT_SIMD_EXCEPT
3382433824+- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
3382533825++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
3382633826+ #endif
3382733827+3382833828+ float32x4_t r2 = vmulq_f32 (r, r);
3382933829+- float32x4_t poly
3383033830+- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
3383133831+- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
3383233832++ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
3383333833++ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
3383433834++ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
3383533835++ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
3383633836+3383733837+ if (__glibc_unlikely (v_any_u32 (cmp)))
3383833838+ #if WANT_SIMD_EXCEPT
3383933839+diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
3384033840+index 69e0b193a1..a4220da63c 100644
3384133841+--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
3384233842++++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
3384333843+@@ -21,24 +21,28 @@
3384433844+3384533845+ static const struct data
3384633846+ {
3384733847+- float32x4_t poly[5];
3384833848+- uint32x4_t exponent_bias;
3384933849++ float32x4_t c1, c3;
3385033850++ uint32x4_t exponent_bias, special_offset, special_bias;
3385133851+ #if !WANT_SIMD_EXCEPT
3385233852+- float32x4_t special_bound, scale_thresh;
3385333853++ float32x4_t scale_thresh, special_bound;
3385433854+ #endif
3385533855++ float c0, c2, c4, zero;
3385633856+ } data = {
3385733857+ /* maxerr: 1.962 ulp. */
3385833858+- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
3385933859+- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
3386033860++ .c0 = 0x1.59977ap-10f,
3386133861++ .c1 = V4 (0x1.3ce9e4p-7f),
3386233862++ .c2 = 0x1.c6bd32p-5f,
3386333863++ .c3 = V4 (0x1.ebf9bcp-3f),
3386433864++ .c4 = 0x1.62e422p-1f,
3386533865+ .exponent_bias = V4 (0x3f800000),
3386633866++ .special_offset = V4 (0x82000000),
3386733867++ .special_bias = V4 (0x7f000000),
3386833868+ #if !WANT_SIMD_EXCEPT
3386933869+ .special_bound = V4 (126.0f),
3387033870+ .scale_thresh = V4 (192.0f),
3387133871+ #endif
3387233872+ };
3387333873+3387433874+-#define C(i) d->poly[i]
3387533875+-
3387633876+ #if WANT_SIMD_EXCEPT
3387733877+3387833878+ # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
3387933879+@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
3388033880+3388133881+ #else
3388233882+3388333883+-# define SpecialOffset v_u32 (0x82000000)
3388433884+-# define SpecialBias v_u32 (0x7f000000)
3388533885+-
3388633886+ static float32x4_t VPCS_ATTR NOINLINE
3388733887+ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
3388833888+ float32x4_t scale, const struct data *d)
3388933889+ {
3389033890+ /* 2^n may overflow, break it up into s1*s2. */
3389133891+- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
3389233892+- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
3389333893++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
3389433894++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
3389533895+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
3389633896+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
3389733897+ float32x4_t r2 = vmulq_f32 (s1, s1);
3389833898+@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
3389933899+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
3390033900+ {
3390133901+ const struct data *d = ptr_barrier (&data);
3390233902+- float32x4_t n, r, r2, scale, p, q, poly;
3390333903+- uint32x4_t cmp, e;
3390433904+3390533905+ #if WANT_SIMD_EXCEPT
3390633906+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
3390733907+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
3390833908+- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
3390933909++ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
3391033910+ float32x4_t xm = x;
3391133911+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
3391233912+ special_case to fix special lanes later. This is only necessary if fenv
3391333913+@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
3391433914+ x = vbslq_f32 (cmp, v_f32 (1), x);
3391533915+ #endif
3391633916+3391733917+- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
3391833918+- x = n + r, with r in [-1/2, 1/2]. */
3391933919+- n = vrndaq_f32 (x);
3392033920+- r = vsubq_f32 (x, n);
3392133921+- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
3392233922+- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3392333923++ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
3392433924++ x = n + r, with r in [-1/2, 1/2]. */
3392533925++ float32x4_t n = vrndaq_f32 (x);
3392633926++ float32x4_t r = vsubq_f32 (x, n);
3392733927++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
3392833928++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3392933929+3393033930+ #if !WANT_SIMD_EXCEPT
3393133931+- cmp = vcagtq_f32 (n, d->special_bound);
3393233932++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
3393333933+ #endif
3393433934+3393533935+- r2 = vmulq_f32 (r, r);
3393633936+- p = vfmaq_f32 (C (1), C (0), r);
3393733937+- q = vfmaq_f32 (C (3), C (2), r);
3393833938++ float32x4_t c024 = vld1q_f32 (&d->c0);
3393933939++ float32x4_t r2 = vmulq_f32 (r, r);
3394033940++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
3394133941++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
3394233942+ q = vfmaq_f32 (q, p, r2);
3394333943+- p = vmulq_f32 (C (4), r);
3394433944+- poly = vfmaq_f32 (p, q, r2);
3394533945++ p = vmulq_laneq_f32 (r, c024, 2);
3394633946++ float32x4_t poly = vfmaq_f32 (p, q, r2);
3394733947+3394833948+ if (__glibc_unlikely (v_any_u32 (cmp)))
3394933949+ #if WANT_SIMD_EXCEPT
3395033950+diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
3395133951+index 5c9cb72620..70f137e2e5 100644
3395233952+--- a/sysdeps/aarch64/fpu/expf_advsimd.c
3395333953++++ b/sysdeps/aarch64/fpu/expf_advsimd.c
3395433954+@@ -21,20 +21,25 @@
3395533955+3395633956+ static const struct data
3395733957+ {
3395833958+- float32x4_t poly[5];
3395933959+- float32x4_t inv_ln2, ln2_hi, ln2_lo;
3396033960+- uint32x4_t exponent_bias;
3396133961++ float32x4_t c1, c3, c4, inv_ln2;
3396233962++ float ln2_hi, ln2_lo, c0, c2;
3396333963++ uint32x4_t exponent_bias, special_offset, special_bias;
3396433964+ #if !WANT_SIMD_EXCEPT
3396533965+ float32x4_t special_bound, scale_thresh;
3396633966+ #endif
3396733967+ } data = {
3396833968+ /* maxerr: 1.45358 +0.5 ulp. */
3396933969+- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
3397033970+- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
3397133971++ .c0 = 0x1.0e4020p-7f,
3397233972++ .c1 = V4 (0x1.573e2ep-5f),
3397333973++ .c2 = 0x1.555e66p-3f,
3397433974++ .c3 = V4 (0x1.fffdb6p-2f),
3397533975++ .c4 = V4 (0x1.ffffecp-1f),
3397633976+ .inv_ln2 = V4 (0x1.715476p+0f),
3397733977+- .ln2_hi = V4 (0x1.62e4p-1f),
3397833978+- .ln2_lo = V4 (0x1.7f7d1cp-20f),
3397933979++ .ln2_hi = 0x1.62e4p-1f,
3398033980++ .ln2_lo = 0x1.7f7d1cp-20f,
3398133981+ .exponent_bias = V4 (0x3f800000),
3398233982++ .special_offset = V4 (0x82000000),
3398333983++ .special_bias = V4 (0x7f000000),
3398433984+ #if !WANT_SIMD_EXCEPT
3398533985+ .special_bound = V4 (126.0f),
3398633986+ .scale_thresh = V4 (192.0f),
3398733987+@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
3398833988+3398933989+ #else
3399033990+3399133991+-# define SpecialOffset v_u32 (0x82000000)
3399233992+-# define SpecialBias v_u32 (0x7f000000)
3399333993+-
3399433994+ static float32x4_t VPCS_ATTR NOINLINE
3399533995+ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
3399633996+ float32x4_t scale, const struct data *d)
3399733997+ {
3399833998+ /* 2^n may overflow, break it up into s1*s2. */
3399933999+- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
3400034000+- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
3400134001++ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
3400234002++ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
3400334003+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
3400434004+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
3400534005+ float32x4_t r2 = vmulq_f32 (s1, s1);
3400634006++ // (s2 + p*s2)*s1 = s2(p+1)s1
3400734007+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
3400834008+ /* Similar to r1 but avoids double rounding in the subnormal range. */
3400934009+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
3401034010+@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
3401134011+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
3401234012+ {
3401334013+ const struct data *d = ptr_barrier (&data);
3401434014+- float32x4_t n, r, r2, scale, p, q, poly;
3401534015+- uint32x4_t cmp, e;
3401634016++ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
3401734017+3401834018+ #if WANT_SIMD_EXCEPT
3401934019+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
3402034020+- cmp = vcgeq_u32 (
3402134021++ uint32x4_t cmp = vcgeq_u32 (
3402234022+ vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
3402334023+ TinyBound),
3402434024+ SpecialBound);
3402534025+@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
3402634026+3402734027+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
3402834028+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
3402934029+- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
3403034030+- r = vfmsq_f32 (x, n, d->ln2_hi);
3403134031+- r = vfmsq_f32 (r, n, d->ln2_lo);
3403234032+- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
3403334033+- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3403434034++ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
3403534035++ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
3403634036++ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
3403734037++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
3403834038++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3403934039+3404034040+ #if !WANT_SIMD_EXCEPT
3404134041+- cmp = vcagtq_f32 (n, d->special_bound);
3404234042++ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
3404334043+ #endif
3404434044+3404534045+- r2 = vmulq_f32 (r, r);
3404634046+- p = vfmaq_f32 (C (1), C (0), r);
3404734047+- q = vfmaq_f32 (C (3), C (2), r);
3404834048++ float32x4_t r2 = vmulq_f32 (r, r);
3404934049++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
3405034050++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
3405134051+ q = vfmaq_f32 (q, p, r2);
3405234052+- p = vmulq_f32 (C (4), r);
3405334053+- poly = vfmaq_f32 (p, q, r2);
3405434054++ p = vmulq_f32 (d->c4, r);
3405534055++ float32x4_t poly = vfmaq_f32 (p, q, r2);
3405634056+3405734057+ if (__glibc_unlikely (v_any_u32 (cmp)))
3405834058+ #if WANT_SIMD_EXCEPT
3405934059+diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
3406034060+index 08b06e0a6b..eacd2af241 100644
3406134061+--- a/sysdeps/aarch64/fpu/v_expf_inline.h
3406234062++++ b/sysdeps/aarch64/fpu/v_expf_inline.h
3406334063+@@ -24,50 +24,45 @@
3406434064+3406534065+ struct v_expf_data
3406634066+ {
3406734067+- float32x4_t poly[5];
3406834068+- float32x4_t shift;
3406934069+- float invln2_and_ln2[4];
3407034070++ float ln2_hi, ln2_lo, c0, c2;
3407134071++ float32x4_t inv_ln2, c1, c3, c4;
3407234072++ /* asuint(1.0f). */
3407334073++ uint32x4_t exponent_bias;
3407434074+ };
3407534075+3407634076+ /* maxerr: 1.45358 +0.5 ulp. */
3407734077+ #define V_EXPF_DATA \
3407834078+ { \
3407934079+- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
3408034080+- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
3408134081+- .shift = V4 (0x1.8p23f), \
3408234082+- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
3408334083++ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
3408434084++ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
3408534085++ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
3408634086++ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
3408734087+ }
3408834088+3408934089+-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
3409034090+-#define C(i) d->poly[i]
3409134091+-
3409234092+ static inline float32x4_t
3409334093+ v_expf_inline (float32x4_t x, const struct v_expf_data *d)
3409434094+ {
3409534095+- /* Helper routine for calculating exp(x).
3409634096++ /* Helper routine for calculating exp(ax).
3409734097+ Copied from v_expf.c, with all special-case handling removed - the
3409834098+ calling routine should handle special values if required. */
3409934099+3410034100+- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
3410134101+- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
3410234102+- float32x4_t n, r, z;
3410334103+- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
3410434104+- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
3410534105+- n = vsubq_f32 (z, d->shift);
3410634106+- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
3410734107+- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
3410834108+- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
3410934109+- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
3411034110++ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
3411134111++ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
3411234112++ float32x4_t ax = vabsq_f32 (x);
3411334113++ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
3411434114++ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
3411534115++ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
3411634116++ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
3411734117++ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
3411834118++ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
3411934119+3412034120+ /* Custom order-4 Estrin avoids building high order monomial. */
3412134121+ float32x4_t r2 = vmulq_f32 (r, r);
3412234122+- float32x4_t p, q, poly;
3412334123+- p = vfmaq_f32 (C (1), C (0), r);
3412434124+- q = vfmaq_f32 (C (3), C (2), r);
3412534125++ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
3412634126++ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
3412734127+ q = vfmaq_f32 (q, p, r2);
3412834128+- p = vmulq_f32 (C (4), r);
3412934129+- poly = vfmaq_f32 (p, q, r2);
3413034130++ p = vmulq_f32 (d->c4, r);
3413134131++ float32x4_t poly = vfmaq_f32 (p, q, r2);
3413234132+ return vfmaq_f32 (scale, poly, scale);
3413334133+ }
3413434134+-
3413534135+ #endif
3413634136+3413734137+commit abfd20ebbd2883f2c6e5f16709f7b9781c3c8068
3413834138+Author: Luna Lamb <luna.lamb@arm.com>
3413934139+Date: Fri Jan 3 19:00:12 2025 +0000
3414034140+3414134141+ AArch64: Improve codegen in AdvSIMD asinh
3414234142+3414334143+ Improves memory access and removes spills.
3414434144+ Load the polynomial evaluation coefficients into 2 vectors and use lanewise
3414534145+ MLAs. Reduces MOVs 6->3, LDR 11->5, STR/STP 2->0, ADRP 3->2.
3414634146+3414734147+ (cherry picked from commit 140b985e5a2071000122b3cb63ebfe88cf21dd29)
3414834148+3414934149+diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
3415034150+index 6207e7da95..2739f98b39 100644
3415134151+--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
3415234152++++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
3415334153+@@ -20,41 +20,71 @@
3415434154+ #include "v_math.h"
3415534155+ #include "poly_advsimd_f64.h"
3415634156+3415734157+-#define A(i) v_f64 (__v_log_data.poly[i])
3415834158+-#define N (1 << V_LOG_TABLE_BITS)
3415934159+-#define IndexMask (N - 1)
3416034160+-
3416134161+ const static struct data
3416234162+ {
3416334163+- float64x2_t poly[18];
3416434164+- uint64x2_t off, huge_bound, abs_mask;
3416534165+- float64x2_t ln2, tiny_bound;
3416634166++ uint64x2_t huge_bound, abs_mask, off, mask;
3416734167++#if WANT_SIMD_EXCEPT
3416834168++ float64x2_t tiny_bound;
3416934169++#endif
3417034170++ float64x2_t lc0, lc2;
3417134171++ double lc1, lc3, ln2, lc4;
3417234172++
3417334173++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
3417434174++ double c1, c3, c5, c7, c9, c11, c13, c15;
3417534175++
3417634176+ } data = {
3417734177+- .off = V2 (0x3fe6900900000000),
3417834178+- .ln2 = V2 (0x1.62e42fefa39efp-1),
3417934179+- .huge_bound = V2 (0x5fe0000000000000),
3418034180++
3418134181++#if WANT_SIMD_EXCEPT
3418234182+ .tiny_bound = V2 (0x1p-26),
3418334183+- .abs_mask = V2 (0x7fffffffffffffff),
3418434184++#endif
3418534185+ /* Even terms of polynomial s.t. asinh(x) is approximated by
3418634186+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
3418734187+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
3418834188+- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
3418934189+- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
3419034190+- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
3419134191+- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
3419234192+- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
3419334193+- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
3419434194+- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
3419534195+- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
3419634196+- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
3419734197++
3419834198++ .c0 = V2 (-0x1.55555555554a7p-3),
3419934199++ .c1 = 0x1.3333333326c7p-4,
3420034200++ .c2 = V2 (-0x1.6db6db68332e6p-5),
3420134201++ .c3 = 0x1.f1c71b26fb40dp-6,
3420234202++ .c4 = V2 (-0x1.6e8b8b654a621p-6),
3420334203++ .c5 = 0x1.1c4daa9e67871p-6,
3420434204++ .c6 = V2 (-0x1.c9871d10885afp-7),
3420534205++ .c7 = 0x1.7a16e8d9d2ecfp-7,
3420634206++ .c8 = V2 (-0x1.3ddca533e9f54p-7),
3420734207++ .c9 = 0x1.0becef748dafcp-7,
3420834208++ .c10 = V2 (-0x1.b90c7099dd397p-8),
3420934209++ .c11 = 0x1.541f2bb1ffe51p-8,
3421034210++ .c12 = V2 (-0x1.d217026a669ecp-9),
3421134211++ .c13 = 0x1.0b5c7977aaf7p-9,
3421234212++ .c14 = V2 (-0x1.e0f37daef9127p-11),
3421334213++ .c15 = 0x1.388b5fe542a6p-12,
3421434214++ .c16 = V2 (-0x1.021a48685e287p-14),
3421534215++ .c17 = V2 (0x1.93d4ba83d34dap-18),
3421634216++
3421734217++ .lc0 = V2 (-0x1.ffffffffffff7p-2),
3421834218++ .lc1 = 0x1.55555555170d4p-2,
3421934219++ .lc2 = V2 (-0x1.0000000399c27p-2),
3422034220++ .lc3 = 0x1.999b2e90e94cap-3,
3422134221++ .lc4 = -0x1.554e550bd501ep-3,
3422234222++ .ln2 = 0x1.62e42fefa39efp-1,
3422334223++
3422434224++ .off = V2 (0x3fe6900900000000),
3422534225++ .huge_bound = V2 (0x5fe0000000000000),
3422634226++ .abs_mask = V2 (0x7fffffffffffffff),
3422734227++ .mask = V2 (0xfffULL << 52),
3422834228+ };
3422934229+3423034230+ static float64x2_t NOINLINE VPCS_ATTR
3423134231+-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
3423234232++special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
3423334233++ uint64x2_t special)
3423434234+ {
3423534235++ /* Copy sign. */
3423634236++ y = vbslq_f64 (abs_mask, y, x);
3423734237+ return v_call_f64 (asinh, x, y, special);
3423834238+ }
3423934239+3424034240++#define N (1 << V_LOG_TABLE_BITS)
3424134241++#define IndexMask (N - 1)
3424234242++
3424334243+ struct entry
3424434244+ {
3424534245+ float64x2_t invc;
3424634246+@@ -76,27 +106,34 @@ lookup (uint64x2_t i)
3424734247+ }
3424834248+3424934249+ static inline float64x2_t
3425034250+-log_inline (float64x2_t x, const struct data *d)
3425134251++log_inline (float64x2_t xm, const struct data *d)
3425234252+ {
3425334253+- /* Double-precision vector log, copied from ordinary vector log with some
3425434254+- cosmetic modification and special-cases removed. */
3425534255+- uint64x2_t ix = vreinterpretq_u64_f64 (x);
3425634256+- uint64x2_t tmp = vsubq_u64 (ix, d->off);
3425734257+- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
3425834258+- uint64x2_t iz
3425934259+- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
3426034260++
3426134261++ uint64x2_t u = vreinterpretq_u64_f64 (xm);
3426234262++ uint64x2_t u_off = vsubq_u64 (u, d->off);
3426334263++
3426434264++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
3426534265++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
3426634266+ float64x2_t z = vreinterpretq_f64_u64 (iz);
3426734267+- struct entry e = lookup (tmp);
3426834268++
3426934269++ struct entry e = lookup (u_off);
3427034270++
3427134271++ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
3427234272+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
3427334273+ float64x2_t kd = vcvtq_f64_s64 (k);
3427434274+- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
3427534275++
3427634276++ /* hi = r + log(c) + k*Ln2. */
3427734277++ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
3427834278++ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
3427934279++
3428034280++ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
3428134281++ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
3428234282+ float64x2_t r2 = vmulq_f64 (r, r);
3428334283+- float64x2_t y = vfmaq_f64 (A (2), A (3), r);
3428434284+- float64x2_t p = vfmaq_f64 (A (0), A (1), r);
3428534285+- y = vfmaq_f64 (y, A (4), r2);
3428634286+- y = vfmaq_f64 (p, y, r2);
3428734287+- y = vfmaq_f64 (hi, y, r2);
3428834288+- return y;
3428934289++ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
3429034290++ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
3429134291++ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
3429234292++ y = vfmaq_f64 (p, r2, y);
3429334293++ return vfmaq_f64 (hi, y, r2);
3429434294+ }
3429534295+3429634296+ /* Double-precision implementation of vector asinh(x).
3429734297+@@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d)
3429834298+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
3429934299+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
3430034300+ where log(x) is an optimized log approximation, and P(x) is a polynomial
3430134301+- shared with the scalar routine. The greatest observed error 3.29 ULP, in
3430234302++ shared with the scalar routine. The greatest observed error 2.79 ULP, in
3430334303+ |x| >= 1:
3430434304+- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
3430534305+- want 0x1.ffffcfd0e2352p-1. */
3430634306++ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
3430734307++ want 0x1.ffffd003219ddp-1. */
3430834308+ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
3430934309+ {
3431034310+ const struct data *d = ptr_barrier (&data);
3431134311+-
3431234312+ float64x2_t ax = vabsq_f64 (x);
3431334313+- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
3431434314+3431534315+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
3431634316+- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
3431734317+3431834318+ #if WANT_SIMD_EXCEPT
3431934319++ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
3432034320++ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
3432134321+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
3432234322+ special = vorrq_u64 (special, tiny);
3432334323++#else
3432434324++ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
3432534325+ #endif
3432634326+3432734327+ /* Option 1: |x| >= 1.
3432834328+@@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
3432934329+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
3433034330+ will be fixed later, either by selecting x or falling back to the scalar
3433134331+ special-case. The largest observed error in this region is 1.47 ULPs:
3433234332+- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
3433334333+- want 0x1.c1d6bf874019cp-1. */
3433434334++ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
3433534335++ want 0x1.c1d6bf874019cp-1. */
3433634336+ float64x2_t option_2 = v_f64 (0);
3433734337++
3433834338+ if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
3433934339+ {
3434034340++
3434134341+ #if WANT_SIMD_EXCEPT
3434234342+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
3434334343+ #endif
3434434344+- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
3434534345+- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
3434634346+- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
3434734347+- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
3434834348+- option_2 = vfmaq_f64 (ax, p, x3);
3434934349++ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
3435034350++ /* Order-17 Pairwise Horner scheme. */
3435134351++ float64x2_t c13 = vld1q_f64 (&d->c1);
3435234352++ float64x2_t c57 = vld1q_f64 (&d->c5);
3435334353++ float64x2_t c911 = vld1q_f64 (&d->c9);
3435434354++ float64x2_t c1315 = vld1q_f64 (&d->c13);
3435534355++
3435634356++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
3435734357++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
3435834358++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
3435934359++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
3436034360++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
3436134361++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
3436234362++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
3436334363++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
3436434364++ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
3436534365++
3436634366++ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
3436734367++ p = vfmaq_f64 (p1213, z2, p);
3436834368++ p = vfmaq_f64 (p1011, z2, p);
3436934369++ p = vfmaq_f64 (p89, z2, p);
3437034370++
3437134371++ p = vfmaq_f64 (p67, z2, p);
3437234372++ p = vfmaq_f64 (p45, z2, p);
3437334373++
3437434374++ p = vfmaq_f64 (p23, z2, p);
3437534375++
3437634376++ p = vfmaq_f64 (p01, z2, p);
3437734377++ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
3437834378+ #if WANT_SIMD_EXCEPT
3437934379+ option_2 = vbslq_f64 (tiny, x, option_2);
3438034380+ #endif
3438134381+@@ -167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
3438234382+3438334383+ /* Choose the right option for each lane. */
3438434384+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
3438534385+- /* Copy sign. */
3438634386+- y = vbslq_f64 (d->abs_mask, y, x);
3438734387+-
3438834388+ if (__glibc_unlikely (v_any_u64 (special)))
3438934389+- return special_case (x, y, special);
3439034390+- return y;
3439134391++ {
3439234392++ return special_case (x, y, d->abs_mask, special);
3439334393++ }
3439434394++ /* Copy sign. */
3439534395++ return vbslq_f64 (d->abs_mask, y, x);
3439634396+ }
3439734397+3439834398+commit 5f45c0f91eae99b7d49f5c63b900441eb3491213
3439934399+Author: Luna Lamb <luna.lamb@arm.com>
3440034400+Date: Fri Jan 3 19:02:52 2025 +0000
3440134401+3440234402+ AArch64: Improve codegen in SVE tans
3440334403+3440434404+ Improves memory access.
3440534405+ Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return.
3440634406+ Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move mov away from return.
3440734407+3440834408+ (cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc)
3440934409+3441034410+diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c
3441134411+index b2e4447316..a7318fd417 100644
3441234412+--- a/sysdeps/aarch64/fpu/tan_sve.c
3441334413++++ b/sysdeps/aarch64/fpu/tan_sve.c
3441434414+@@ -22,24 +22,38 @@
3441534415+3441634416+ static const struct data
3441734417+ {
3441834418+- double poly[9];
3441934419+- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
3442034420++ double c2, c4, c6, c8;
3442134421++ double poly_1357[4];
3442234422++ double c0, inv_half_pi;
3442334423++ double half_pi_hi, half_pi_lo, range_val;
3442434424+ } data = {
3442534425+ /* Polynomial generated with FPMinimax. */
3442634426+- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
3442734427+- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
3442834428+- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
3442934429+- 0x1.4e4fd14147622p-12, },
3443034430++ .c2 = 0x1.ba1ba1bb46414p-5,
3443134431++ .c4 = 0x1.226e5e5ecdfa3p-7,
3443234432++ .c6 = 0x1.7ea75d05b583ep-10,
3443334433++ .c8 = 0x1.4e4fd14147622p-12,
3443434434++ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6,
3443534435++ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 },
3443634436++ .c0 = 0x1.5555555555556p-2,
3443734437++ .inv_half_pi = 0x1.45f306dc9c883p-1,
3443834438+ .half_pi_hi = 0x1.921fb54442d18p0,
3443934439+ .half_pi_lo = 0x1.1a62633145c07p-54,
3444034440+- .inv_half_pi = 0x1.45f306dc9c883p-1,
3444134441+ .range_val = 0x1p23,
3444234442+- .shift = 0x1.8p52,
3444334443+ };
3444434444+3444534445+ static svfloat64_t NOINLINE
3444634446+-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
3444734447++special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg,
3444834448++ svbool_t special)
3444934449+ {
3445034450++ svbool_t use_recip = svcmpeq (
3445134451++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
3445234452++
3445334453++ svfloat64_t n = svmad_x (pg, p, p, -1);
3445434454++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
3445534455++ svfloat64_t swap = n;
3445634456++ n = svneg_m (n, use_recip, d);
3445734457++ d = svsel (use_recip, swap, d);
3445834458++ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d);
3445934459+ return sv_call_f64 (tan, x, y, special);
3446034460+ }
3446134461+3446234462+@@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
3446334463+ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
3446434464+ {
3446534465+ const struct data *dat = ptr_barrier (&data);
3446634466+-
3446734467+- /* Invert condition to catch NaNs and Infs as well as large values. */
3446834468+- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
3446934469+-
3447034470++ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0);
3447134471+ /* q = nearest integer to 2 * x / pi. */
3447234472+- svfloat64_t shift = sv_f64 (dat->shift);
3447334473+- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
3447434474+- q = svsub_x (pg, q, shift);
3447534475+- svint64_t qi = svcvt_s64_x (pg, q);
3447634476++ svfloat64_t q = svmul_lane (x, half_pi_c0, 1);
3447734477++ q = svrinta_x (pg, q);
3447834478+3447934479+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
3448034480+ r = x - q * pi/2, in extended precision. */
3448134481+@@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
3448234482+ r = svmls_lane (r, q, half_pi, 1);
3448334483+ /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
3448434484+ formula. */
3448534485+- r = svmul_x (pg, r, 0.5);
3448634486++ r = svmul_x (svptrue_b64 (), r, 0.5);
3448734487+3448834488+ /* Approximate tan(r) using order 8 polynomial.
3448934489+ tan(x) is odd, so polynomial has the form:
3449034490+@@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
3449134491+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
3449234492+ Then compute the approximation by:
3449334493+ tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
3449434494+- svfloat64_t r2 = svmul_x (pg, r, r);
3449534495+- svfloat64_t r4 = svmul_x (pg, r2, r2);
3449634496+- svfloat64_t r8 = svmul_x (pg, r4, r4);
3449734497++
3449834498++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3449934499++ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2);
3450034500++ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4);
3450134501+ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */
3450234502+- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1);
3450334503+- p = svmad_x (pg, p, r2, dat->poly[0]);
3450434504+- p = svmla_x (pg, r, r2, svmul_x (pg, p, r));
3450534505++ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2);
3450634506++ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6);
3450734507++
3450834508++ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */
3450934509++ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0);
3451034510++ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1);
3451134511++ svfloat64_t p03 = svmla_x (pg, p01, p23, r4);
3451234512++
3451334513++ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0);
3451434514++ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1);
3451534515++ svfloat64_t p47 = svmla_x (pg, p45, p67, r4);
3451634516++
3451734517++ svfloat64_t p = svmla_x (pg, p03, p47, r8);
3451834518++
3451934519++ svfloat64_t z = svmul_x (svptrue_b64 (), p, r);
3452034520++ z = svmul_x (svptrue_b64 (), r2, z);
3452134521++ z = svmla_lane (z, r, half_pi_c0, 0);
3452234522++ p = svmla_x (pg, r, r2, z);
3452334523+3452434524+ /* Recombination uses double-angle formula:
3452534525+ tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
3452634526+ and reciprocity around pi/2:
3452734527+ tan(x) = 1 / (tan(pi/2 - x))
3452834528+ to assemble result using change-of-sign and conditional selection of
3452934529+- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */
3453034530+- svbool_t use_recip
3453134531+- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0);
3453234532++ numerator/denominator dependent on odd/even-ness of q (quadrant). */
3453334533++
3453434534++ /* Invert condition to catch NaNs and Infs as well as large values. */
3453534535++ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
3453634536++
3453734537++ if (__glibc_unlikely (svptest_any (pg, special)))
3453834538++ {
3453934539++ return special_case (x, p, q, pg, special);
3454034540++ }
3454134541++ svbool_t use_recip = svcmpeq (
3454234542++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
3454334543+3454434544+ svfloat64_t n = svmad_x (pg, p, p, -1);
3454534545+- svfloat64_t d = svmul_x (pg, p, 2);
3454634546++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
3454734547+ svfloat64_t swap = n;
3454834548+ n = svneg_m (n, use_recip, d);
3454934549+ d = svsel (use_recip, swap, d);
3455034550+- if (__glibc_unlikely (svptest_any (pg, special)))
3455134551+- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special);
3455234552+ return svdiv_x (pg, n, d);
3455334553+ }
3455434554+diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c
3455534555+index f342583241..e850fb4882 100644
3455634556+--- a/sysdeps/aarch64/fpu/tanf_sve.c
3455734557++++ b/sysdeps/aarch64/fpu/tanf_sve.c
3455834558+@@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
3455934559+ {
3456034560+ const struct data *d = ptr_barrier (&data);
3456134561+3456234562+- /* Determine whether input is too large to perform fast regression. */
3456334563+- svbool_t cmp = svacge (pg, x, d->range_val);
3456434564+-
3456534565+ svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
3456634566+ svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
3456734567+3456834568+ /* n = rint(x/(pi/2)). */
3456934569+- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3);
3457034570+- svfloat32_t n = svsub_x (pg, q, d->shift);
3457134571++ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3));
3457234572+ /* n is already a signed integer, simply convert it. */
3457334573+ svint32_t in = svcvt_s32_x (pg, n);
3457434574+ /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
3457534575+ svint32_t alt = svand_x (pg, in, 1);
3457634576+ svbool_t pred_alt = svcmpne (pg, alt, 0);
3457734577+-
3457834578+ /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */
3457934579+ svfloat32_t r;
3458034580+ r = svmls_lane (x, n, pi_vals, 0);
3458134581+@@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
3458234582+3458334583+ /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
3458434584+ using Estrin on z^2. */
3458534585+- svfloat32_t z2 = svmul_x (pg, z, z);
3458634586++ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r);
3458734587+ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
3458834588+ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
3458934589+ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
3459034590+@@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
3459134591+3459234592+ svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
3459334593+3459434594+- /* Transform result back, if necessary. */
3459534595+- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
3459634596+-
3459734597+ /* No need to pass pg to specialcase here since cmp is a strict subset,
3459834598+ guaranteed by the cmpge above. */
3459934599++
3460034600++ /* Determine whether input is too large to perform fast regression. */
3460134601++ svbool_t cmp = svacge (pg, x, d->range_val);
3460234602+ if (__glibc_unlikely (svptest_any (pg, cmp)))
3460334603+- return special_case (x, svsel (pred_alt, inv_y, y), cmp);
3460434604++ return special_case (x, svdivr_x (pg, y, 1.0f), cmp);
3460534605+3460634606++ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
3460734607+ return svsel (pred_alt, inv_y, y);
3460834608+ }
3460934609+3461034610+commit ab5ba6c188159bb5e12be95cd90458924c2fe592
3461134611+Author: Yat Long Poon <yatlong.poon@arm.com>
3461234612+Date: Fri Jan 3 19:07:30 2025 +0000
3461334613+3461434614+ AArch64: Improve codegen for SVE logs
3461534615+3461634616+ Reduce memory access by using lanewise MLA and moving constants to struct
3461734617+ and reduce number of MOVPRFXs.
3461834618+ Update maximum ULP error for double log_sve from 1 to 2.
3461934619+ Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%).
3462034620+3462134621+ (cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6)
3462234622+3462334623+diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c
3462434624+index ab7362128d..f1cad2759a 100644
3462534625+--- a/sysdeps/aarch64/fpu/log10_sve.c
3462634626++++ b/sysdeps/aarch64/fpu/log10_sve.c
3462734627+@@ -23,28 +23,49 @@
3462834628+ #define Min 0x0010000000000000
3462934629+ #define Max 0x7ff0000000000000
3463034630+ #define Thres 0x7fe0000000000000 /* Max - Min. */
3463134631+-#define Off 0x3fe6900900000000
3463234632+ #define N (1 << V_LOG10_TABLE_BITS)
3463334633+3463434634++static const struct data
3463534635++{
3463634636++ double c0, c2;
3463734637++ double c1, c3;
3463834638++ double invln10, log10_2;
3463934639++ double c4;
3464034640++ uint64_t off;
3464134641++} data = {
3464234642++ .c0 = -0x1.bcb7b1526e506p-3,
3464334643++ .c1 = 0x1.287a7636be1d1p-3,
3464434644++ .c2 = -0x1.bcb7b158af938p-4,
3464534645++ .c3 = 0x1.63c78734e6d07p-4,
3464634646++ .c4 = -0x1.287461742fee4p-4,
3464734647++ .invln10 = 0x1.bcb7b1526e50ep-2,
3464834648++ .log10_2 = 0x1.34413509f79ffp-2,
3464934649++ .off = 0x3fe6900900000000,
3465034650++};
3465134651++
3465234652+ static svfloat64_t NOINLINE
3465334653+-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
3465434654++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
3465534655++ svbool_t special, const struct data *d)
3465634656+ {
3465734657+- return sv_call_f64 (log10, x, y, special);
3465834658++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
3465934659++ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
3466034660+ }
3466134661+3466234662+-/* SVE log10 algorithm.
3466334663++/* Double-precision SVE log10 routine.
3466434664+ Maximum measured error is 2.46 ulps.
3466534665+ SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
3466634666+ want 0x1.fffbdf6eaa667p-6. */
3466734667+ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
3466834668+ {
3466934669++ const struct data *d = ptr_barrier (&data);
3467034670++
3467134671+ svuint64_t ix = svreinterpret_u64 (x);
3467234672+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
3467334673+3467434674+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3467534675+ The range is split into N subintervals.
3467634676+ The ith subinterval contains z and c is near its center. */
3467734677+- svuint64_t tmp = svsub_x (pg, ix, Off);
3467834678++ svuint64_t tmp = svsub_x (pg, ix, d->off);
3467934679+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
3468034680+ i = svand_x (pg, i, (N - 1) << 1);
3468134681+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
3468234682+@@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
3468334683+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
3468434684+3468534685+ /* hi = log(c) + k*log(2). */
3468634686+- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10);
3468734687+- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2);
3468834688++ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10);
3468934689++ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0);
3469034690++ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1);
3469134691+3469234692+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
3469334693+- svfloat64_t r2 = svmul_x (pg, r, r);
3469434694+- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly);
3469534695++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
3469634696++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3469734697++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
3469834698++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
3469934699++ y = svmla_x (pg, y, r2, d->c4);
3470034700++ y = svmla_x (pg, p, r2, y);
3470134701+3470234702+ if (__glibc_unlikely (svptest_any (pg, special)))
3470334703+- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
3470434704+- special);
3470534705++ return special_case (hi, tmp, y, r2, special, d);
3470634706+ return svmla_x (pg, hi, r2, y);
3470734707+ }
3470834708+diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c
3470934709+index 743fa2a913..908e638246 100644
3471034710+--- a/sysdeps/aarch64/fpu/log2_sve.c
3471134711++++ b/sysdeps/aarch64/fpu/log2_sve.c
3471234712+@@ -21,15 +21,32 @@
3471334713+ #include "poly_sve_f64.h"
3471434714+3471534715+ #define N (1 << V_LOG2_TABLE_BITS)
3471634716+-#define Off 0x3fe6900900000000
3471734717+ #define Max (0x7ff0000000000000)
3471834718+ #define Min (0x0010000000000000)
3471934719+ #define Thresh (0x7fe0000000000000) /* Max - Min. */
3472034720+3472134721++static const struct data
3472234722++{
3472334723++ double c0, c2;
3472434724++ double c1, c3;
3472534725++ double invln2, c4;
3472634726++ uint64_t off;
3472734727++} data = {
3472834728++ .c0 = -0x1.71547652b83p-1,
3472934729++ .c1 = 0x1.ec709dc340953p-2,
3473034730++ .c2 = -0x1.71547651c8f35p-2,
3473134731++ .c3 = 0x1.2777ebe12dda5p-2,
3473234732++ .c4 = -0x1.ec738d616fe26p-3,
3473334733++ .invln2 = 0x1.71547652b82fep0,
3473434734++ .off = 0x3fe6900900000000,
3473534735++};
3473634736++
3473734737+ static svfloat64_t NOINLINE
3473834738+-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
3473934739++special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
3474034740++ svbool_t special, const struct data *d)
3474134741+ {
3474234742+- return sv_call_f64 (log2, x, y, cmp);
3474334743++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
3474434744++ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special);
3474534745+ }
3474634746+3474734747+ /* Double-precision SVE log2 routine.
3474834748+@@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
3474934749+ want 0x1.fffb34198d9ddp-5. */
3475034750+ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
3475134751+ {
3475234752++ const struct data *d = ptr_barrier (&data);
3475334753++
3475434754+ svuint64_t ix = svreinterpret_u64 (x);
3475534755+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
3475634756+3475734757+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3475834758+ The range is split into N subintervals.
3475934759+ The ith subinterval contains z and c is near its center. */
3476034760+- svuint64_t tmp = svsub_x (pg, ix, Off);
3476134761++ svuint64_t tmp = svsub_x (pg, ix, d->off);
3476234762+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
3476334763+ i = svand_x (pg, i, (N - 1) << 1);
3476434764+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
3476534765+@@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
3476634766+3476734767+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
3476834768+3476934769++ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2);
3477034770+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
3477134771+- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2);
3477234772+-
3477334773+- svfloat64_t r2 = svmul_x (pg, r, r);
3477434774+- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly);
3477534775++ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
3477634776+ w = svadd_x (pg, k, w);
3477734777+3477834778++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
3477934779++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3478034780++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
3478134781++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
3478234782++ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
3478334783++ y = svmla_x (pg, p, r2, y);
3478434784++
3478534785+ if (__glibc_unlikely (svptest_any (pg, special)))
3478634786+- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y),
3478734787+- special);
3478834788++ return special_case (w, tmp, y, r2, special, d);
3478934789+ return svmla_x (pg, w, r2, y);
3479034790+ }
3479134791+diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
3479234792+index 9b689f2ec7..044223400b 100644
3479334793+--- a/sysdeps/aarch64/fpu/log_sve.c
3479434794++++ b/sysdeps/aarch64/fpu/log_sve.c
3479534795+@@ -19,39 +19,54 @@
3479634796+3479734797+ #include "sv_math.h"
3479834798+3479934799+-#define P(i) sv_f64 (__v_log_data.poly[i])
3480034800+ #define N (1 << V_LOG_TABLE_BITS)
3480134801+-#define Off (0x3fe6900900000000)
3480234802+-#define MaxTop (0x7ff)
3480334803+-#define MinTop (0x001)
3480434804+-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */
3480534805++#define Max (0x7ff0000000000000)
3480634806++#define Min (0x0010000000000000)
3480734807++#define Thresh (0x7fe0000000000000) /* Max - Min. */
3480834808++
3480934809++static const struct data
3481034810++{
3481134811++ double c0, c2;
3481234812++ double c1, c3;
3481334813++ double ln2, c4;
3481434814++ uint64_t off;
3481534815++} data = {
3481634816++ .c0 = -0x1.ffffffffffff7p-2,
3481734817++ .c1 = 0x1.55555555170d4p-2,
3481834818++ .c2 = -0x1.0000000399c27p-2,
3481934819++ .c3 = 0x1.999b2e90e94cap-3,
3482034820++ .c4 = -0x1.554e550bd501ep-3,
3482134821++ .ln2 = 0x1.62e42fefa39efp-1,
3482234822++ .off = 0x3fe6900900000000,
3482334823++};
3482434824+3482534825+ static svfloat64_t NOINLINE
3482634826+-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
3482734827++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
3482834828++ svbool_t special, const struct data *d)
3482934829+ {
3483034830+- return sv_call_f64 (log, x, y, cmp);
3483134831++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
3483234832++ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
3483334833+ }
3483434834+3483534835+-/* SVE port of AdvSIMD log algorithm.
3483634836+- Maximum measured error is 2.17 ulp:
3483734837+- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
3483834838+- want 0x1.ffffff1cca045p-2. */
3483934839++/* Double-precision SVE log routine.
3484034840++ Maximum measured error is 2.64 ulp:
3484134841++ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
3484234842++ want 0x1.fffffffe88cafp+6. */
3484334843+ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
3484434844+ {
3484534845++ const struct data *d = ptr_barrier (&data);
3484634846++
3484734847+ svuint64_t ix = svreinterpret_u64 (x);
3484834848+- svuint64_t top = svlsr_x (pg, ix, 52);
3484934849+- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
3485034850++ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
3485134851+3485234852+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3485334853+ The range is split into N subintervals.
3485434854+ The ith subinterval contains z and c is near its center. */
3485534855+- svuint64_t tmp = svsub_x (pg, ix, Off);
3485634856++ svuint64_t tmp = svsub_x (pg, ix, d->off);
3485734857+ /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
3485834858+ The actual value of i is double this due to table layout. */
3485934859+ svuint64_t i
3486034860+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
3486134861+- svint64_t k
3486234862+- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
3486334863+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
3486434864+ svfloat64_t z = svreinterpret_f64 (iz);
3486534865+ /* Lookup in 2 global lists (length N). */
3486634866+@@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
3486734867+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
3486834868+3486934869+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
3487034870+- svfloat64_t r = svmad_x (pg, invc, z, -1);
3487134871+- svfloat64_t kd = svcvt_f64_x (pg, k);
3487234872++ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
3487334873+ /* hi = r + log(c) + k*Ln2. */
3487434874+- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
3487534875++ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2);
3487634876++ svfloat64_t r = svmad_x (pg, invc, z, -1);
3487734877++ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0);
3487834878++ hi = svadd_x (pg, r, hi);
3487934879++
3488034880+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
3488134881+- svfloat64_t r2 = svmul_x (pg, r, r);
3488234882+- svfloat64_t y = svmla_x (pg, P (2), r, P (3));
3488334883+- svfloat64_t p = svmla_x (pg, P (0), r, P (1));
3488434884+- y = svmla_x (pg, y, r2, P (4));
3488534885++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
3488634886++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3488734887++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
3488834888++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
3488934889++ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1);
3489034890+ y = svmla_x (pg, p, r2, y);
3489134891+3489234892+- if (__glibc_unlikely (svptest_any (pg, cmp)))
3489334893+- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
3489434894++ if (__glibc_unlikely (svptest_any (pg, special)))
3489534895++ return special_case (hi, tmp, y, r2, special, d);
3489634896+ return svmla_x (pg, hi, r2, y);
3489734897+ }
3489834898+diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
3489934899+index 6c96304611..b76c38dac2 100644
3490034900+--- a/sysdeps/aarch64/libm-test-ulps
3490134901++++ b/sysdeps/aarch64/libm-test-ulps
3490234902+@@ -1460,7 +1460,7 @@ float: 2
3490334903+ ldouble: 1
3490434904+3490534905+ Function: "log_sve":
3490634906+-double: 1
3490734907++double: 2
3490834908+ float: 3
3490934909+3491034910+ Function: "log_towardzero":
3491134911+3491234912+commit aa7c61ea15e27ae14717e065a5d4c50baa472851
3491334913+Author: Yat Long Poon <yatlong.poon@arm.com>
3491434914+Date: Fri Jan 3 19:09:05 2025 +0000
3491534915+3491634916+ AArch64: Improve codegen for SVE log1pf users
3491734917+3491834918+ Reduce memory access by using lanewise MLA and reduce number of MOVPRFXs.
3491934919+ Move log1pf implementation to inline helper function.
3492034920+ Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%).
3492134921+3492234922+ (cherry picked from commit 91c1fadba338752bf514cd4cca057b27b1b10eed)
3492334923+3492434924+diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
3492534925+index 2110894e62..491365e24d 100644
3492634926+--- a/sysdeps/aarch64/fpu/acoshf_sve.c
3492734927++++ b/sysdeps/aarch64/fpu/acoshf_sve.c
3492834928+@@ -17,23 +17,26 @@
3492934929+ License along with the GNU C Library; if not, see
3493034930+ <https://www.gnu.org/licenses/>. */
3493134931+3493234932++#include "sv_math.h"
3493334933++#include "sv_log1pf_inline.h"
3493434934++
3493534935+ #define One 0x3f800000
3493634936+ #define Thres 0x20000000 /* asuint(0x1p64) - One. */
3493734937+3493834938+-#include "sv_log1pf_inline.h"
3493934939+-
3494034940+ static svfloat32_t NOINLINE
3494134941+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3494234942++special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
3494334943+ {
3494434944++ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
3494534945++ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
3494634946+ return sv_call_f32 (acoshf, x, y, special);
3494734947+ }
3494834948+3494934949+ /* Single-precision SVE acosh(x) routine. Implements the same algorithm as
3495034950+ vector acoshf and log1p.
3495134951+3495234952+- Maximum error is 2.78 ULPs:
3495334953+- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
3495434954+- want 0x1.f45b3cp-4. */
3495534955++ Maximum error is 2.47 ULPs:
3495634956++ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
3495734957++ want 0x1.e435a2p-4. */
3495834958+ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
3495934959+ {
3496034960+ svuint32_t ix = svreinterpret_u32 (x);
3496134961+@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
3496234962+3496334963+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
3496434964+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
3496534965+- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
3496634966++ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));
3496734967+3496834968+ if (__glibc_unlikely (svptest_any (pg, special)))
3496934969+- return special_case (x, y, special);
3497034970+- return y;
3497134971++ return special_case (xm1, tmp, special);
3497234972++ return sv_log1pf_inline (tmp, pg);
3497334973+ }
3497434974+diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
3497534975+index d85c3a685c..b7f253bf32 100644
3497634976+--- a/sysdeps/aarch64/fpu/asinhf_sve.c
3497734977++++ b/sysdeps/aarch64/fpu/asinhf_sve.c
3497834978+@@ -20,20 +20,23 @@
3497934979+ #include "sv_math.h"
3498034980+ #include "sv_log1pf_inline.h"
3498134981+3498234982+-#define BigBound (0x5f800000) /* asuint(0x1p64). */
3498334983++#define BigBound 0x5f800000 /* asuint(0x1p64). */
3498434984+3498534985+ static svfloat32_t NOINLINE
3498634986+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3498734987++special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
3498834988+ {
3498934989++ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
3499034990++ y = svreinterpret_f32 (
3499134991++ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
3499234992+ return sv_call_f32 (asinhf, x, y, special);
3499334993+ }
3499434994+3499534995+ /* Single-precision SVE asinh(x) routine. Implements the same algorithm as
3499634996+ vector asinhf and log1p.
3499734997+3499834998+- Maximum error is 2.48 ULPs:
3499934999+- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
3500035000+- want 0x1.ffbbb8p-4. */
3500135001++ Maximum error is 1.92 ULPs:
3500235002++ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
3500335003++ want -0x1.fd0bc8p-2. */
3500435004+ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
3500535005+ {
3500635006+ svfloat32_t ax = svabs_x (pg, x);
3500735007+@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
3500835008+ = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
3500935009+3501035010+ if (__glibc_unlikely (svptest_any (pg, special)))
3501135011+- return special_case (
3501235012+- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
3501335013+- special);
3501435014++ return special_case (iax, sign, y, special);
3501535015+ return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
3501635016+ }
3501735017+diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
3501835018+index dae83041ef..2d3005bbc8 100644
3501935019+--- a/sysdeps/aarch64/fpu/atanhf_sve.c
3502035020++++ b/sysdeps/aarch64/fpu/atanhf_sve.c
3502135021+@@ -17,21 +17,25 @@
3502235022+ License along with the GNU C Library; if not, see
3502335023+ <https://www.gnu.org/licenses/>. */
3502435024+3502535025++#include "sv_math.h"
3502635026+ #include "sv_log1pf_inline.h"
3502735027+3502835028+ #define One (0x3f800000)
3502935029+ #define Half (0x3f000000)
3503035030+3503135031+ static svfloat32_t NOINLINE
3503235032+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3503335033++special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
3503435034++ svfloat32_t y, svbool_t special)
3503535035+ {
3503635036++ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
3503735037++ y = svmul_x (svptrue_b32 (), halfsign, y);
3503835038+ return sv_call_f32 (atanhf, x, y, special);
3503935039+ }
3504035040+3504135041+ /* Approximation for vector single-precision atanh(x) using modified log1p.
3504235042+- The maximum error is 2.28 ULP:
3504335043+- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
3504435044+- want 0x1.ffbbb6p-5. */
3504535045++ The maximum error is 1.99 ULP:
3504635046++ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
3504735047++ want 0x1.f1f4f6p-5. */
3504835048+ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
3504935049+ {
3505035050+ svfloat32_t ax = svabs_x (pg, x);
3505135051+@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
3505235052+ y = sv_log1pf_inline (y, pg);
3505335053+3505435054+ if (__glibc_unlikely (svptest_any (pg, special)))
3505535055+- return special_case (x, svmul_x (pg, halfsign, y), special);
3505635056++ return special_case (iax, sign, halfsign, y, special);
3505735057+3505835058+ return svmul_x (pg, halfsign, y);
3505935059+ }
3506035060+diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
3506135061+index 5256d5e94c..18a185c838 100644
3506235062+--- a/sysdeps/aarch64/fpu/log1pf_sve.c
3506335063++++ b/sysdeps/aarch64/fpu/log1pf_sve.c
3506435064+@@ -18,30 +18,13 @@
3506535065+ <https://www.gnu.org/licenses/>. */
3506635066+3506735067+ #include "sv_math.h"
3506835068+-#include "poly_sve_f32.h"
3506935069+-
3507035070+-static const struct data
3507135071+-{
3507235072+- float poly[8];
3507335073+- float ln2, exp_bias;
3507435074+- uint32_t four, three_quarters;
3507535075+-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
3507635076+- this can be fmov-ed directly instead of including it in
3507735077+- the main load-and-mla polynomial schedule. */
3507835078+- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
3507935079+- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
3508035080+- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
3508135081+- .ln2 = 0x1.62e43p-1f,
3508235082+- .exp_bias = 0x1p-23f,
3508335083+- .four = 0x40800000,
3508435084+- .three_quarters = 0x3f400000};
3508535085+-
3508635086+-#define SignExponentMask 0xff800000
3508735087++#include "sv_log1pf_inline.h"
3508835088+3508935089+ static svfloat32_t NOINLINE
3509035090+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3509135091++special_case (svfloat32_t x, svbool_t special)
3509235092+ {
3509335093+- return sv_call_f32 (log1pf, x, y, special);
3509435094++ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
3509535095++ special);
3509635096+ }
3509735097+3509835098+ /* Vector log1pf approximation using polynomial on reduced interval. Worst-case
3509935099+@@ -50,53 +33,14 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3510035100+ want 0x1.9f323ep-2. */
3510135101+ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
3510235102+ {
3510335103+- const struct data *d = ptr_barrier (&data);
3510435104+ /* x < -1, Inf/Nan. */
3510535105+ svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
3510635106+ special = svorn_z (pg, special, svcmpge (pg, x, -1));
3510735107+3510835108+- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
3510935109+- is in [-0.25, 0.5]):
3511035110+- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
3511135111+-
3511235112+- We approximate log1p(m) with a polynomial, then scale by
3511335113+- k*log(2). Instead of doing this directly, we use an intermediate
3511435114+- scale factor s = 4*k*log(2) to ensure the scale is representable
3511535115+- as a normalised fp32 number. */
3511635116+- svfloat32_t m = svadd_x (pg, x, 1);
3511735117+-
3511835118+- /* Choose k to scale x to the range [-1/4, 1/2]. */
3511935119+- svint32_t k
3512035120+- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
3512135121+- sv_s32 (SignExponentMask));
3512235122+-
3512335123+- /* Scale x by exponent manipulation. */
3512435124+- svfloat32_t m_scale = svreinterpret_f32 (
3512535125+- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
3512635126+-
3512735127+- /* Scale up to ensure that the scale factor is representable as normalised
3512835128+- fp32 number, and scale m down accordingly. */
3512935129+- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
3513035130+- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
3513135131+-
3513235132+- /* Evaluate polynomial on reduced interval. */
3513335133+- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
3513435134+- ms4 = svmul_x (pg, ms2, ms2);
3513535135+- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
3513635136+- p = svmad_x (pg, m_scale, p, -0.5);
3513735137+- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
3513835138+-
3513935139+- /* The scale factor to be applied back at the end - by multiplying float(k)
3514035140+- by 2^-23 we get the unbiased exponent of k. */
3514135141+- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
3514235142+-
3514335143+- /* Apply the scaling back. */
3514435144+- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
3514535145+-
3514635146+ if (__glibc_unlikely (svptest_any (pg, special)))
3514735147+- return special_case (x, y, special);
3514835148++ return special_case (x, special);
3514935149+3515035150+- return y;
3515135151++ return sv_log1pf_inline (x, pg);
3515235152+ }
3515335153+3515435154+ strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1))
3515535155+diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
3515635156+index b94b2da055..850297d615 100644
3515735157+--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h
3515835158++++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
3515935159+@@ -22,55 +22,76 @@
3516035160+3516135161+ #include "sv_math.h"
3516235162+ #include "vecmath_config.h"
3516335163+-#include "poly_sve_f32.h"
3516435164++
3516535165++#define SignExponentMask 0xff800000
3516635166+3516735167+ static const struct sv_log1pf_data
3516835168+ {
3516935169+- float32_t poly[9];
3517035170+- float32_t ln2;
3517135171+- float32_t scale_back;
3517235172++ float c0, c2, c4, c6;
3517335173++ float c1, c3, c5, c7;
3517435174++ float ln2, exp_bias, quarter;
3517535175++ uint32_t four, three_quarters;
3517635176+ } sv_log1pf_data = {
3517735177+- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
3517835178+- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
3517935179+- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
3518035180+- -0x1.6f0d5ep-5f },
3518135181+- .scale_back = 0x1.0p-23f,
3518235182+- .ln2 = 0x1.62e43p-1f,
3518335183++ /* Do not store first term of polynomial, which is -0.5, as
3518435184++ this can be fmov-ed directly instead of including it in
3518535185++ the main load-and-mla polynomial schedule. */
3518635186++ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
3518735187++ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
3518835188++ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
3518935189++ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
3519035190++ .three_quarters = 0x3f400000,
3519135191+ };
3519235192+3519335193+-static inline svfloat32_t
3519435194+-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
3519535195+-{
3519635196+- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
3519735197+- svfloat32_t m2 = svmul_x (pg, m, m);
3519835198+- svfloat32_t q = svmla_x (pg, m, m2, p_12);
3519935199+- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
3520035200+- p = svmul_x (pg, m2, p);
3520135201+-
3520235202+- return svmla_x (pg, q, m2, p);
3520335203+-}
3520435204+-
3520535205+ static inline svfloat32_t
3520635206+ sv_log1pf_inline (svfloat32_t x, svbool_t pg)
3520735207+ {
3520835208+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
3520935209+3521035210+- svfloat32_t m = svadd_x (pg, x, 1.0f);
3521135211+-
3521235212+- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
3521335213+- svreinterpret_s32 (svdup_f32 (0.75f)));
3521435214+- ks = svand_x (pg, ks, 0xff800000);
3521535215+- svuint32_t k = svreinterpret_u32 (ks);
3521635216+- svfloat32_t s = svreinterpret_f32 (
3521735217+- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
3521835218+-
3521935219+- svfloat32_t m_scale
3522035220+- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
3522135221+- m_scale
3522235222+- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
3522335223+- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
3522435224+- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
3522535225+- return svmla_x (pg, p, scale_back, d->ln2);
3522635226++ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
3522735227++ is in [-0.25, 0.5]):
3522835228++ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
3522935229++
3523035230++ We approximate log1p(m) with a polynomial, then scale by
3523135231++ k*log(2). Instead of doing this directly, we use an intermediate
3523235232++ scale factor s = 4*k*log(2) to ensure the scale is representable
3523335233++ as a normalised fp32 number. */
3523435234++ svfloat32_t m = svadd_x (pg, x, 1);
3523535235++
3523635236++ /* Choose k to scale x to the range [-1/4, 1/2]. */
3523735237++ svint32_t k
3523835238++ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
3523935239++ sv_s32 (SignExponentMask));
3524035240++
3524135241++ /* Scale x by exponent manipulation. */
3524235242++ svfloat32_t m_scale = svreinterpret_f32 (
3524335243++ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
3524435244++
3524535245++ /* Scale up to ensure that the scale factor is representable as normalised
3524635246++ fp32 number, and scale m down accordingly. */
3524735247++ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
3524835248++ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
3524935249++ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
3525035250++
3525135251++ /* Evaluate polynomial on reduced interval. */
3525235252++ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
3525335253++
3525435254++ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
3525535255++ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
3525635256++ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
3525735257++ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
3525835258++ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
3525935259++
3526035260++ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
3526135261++ p = svmla_x (pg, p23, p, ms2);
3526235262++ p = svmla_x (pg, p01, p, ms2);
3526335263++
3526435264++ p = svmad_x (pg, m_scale, p, -0.5);
3526535265++ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
3526635266++
3526735267++ /* The scale factor to be applied back at the end - by multiplying float(k)
3526835268++ by 2^-23 we get the unbiased exponent of k. */
3526935269++ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
3527035270++ return svmla_lane_f32 (p, scale_back, fconst, 0);
3527135271+ }
3527235272+3527335273+ #endif
3527435274+3527535275+commit d983f14c304df2d880c7b01e904e4a889064b9b3
3527635276+Author: Luna Lamb <luna.lamb@arm.com>
3527735277+Date: Fri Jan 3 20:15:17 2025 +0000
3527835278+3527935279+ AArch64: Improve codegen in SVE expm1f and users
3528035280+3528135281+ Use unpredicated muls, use absolute compare and improve memory access.
3528235282+ Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput
3528335283+ microbenchmark on Neoverse V1.
3528435284+3528535285+ (cherry picked from commit f86b4cf87581cf1e45702b07880679ffa0b1f47a)
3528635286+3528735287+diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
3528835288+index 7c852125cd..05a66400d4 100644
3528935289+--- a/sysdeps/aarch64/fpu/expm1f_sve.c
3529035290++++ b/sysdeps/aarch64/fpu/expm1f_sve.c
3529135291+@@ -18,7 +18,6 @@
3529235292+ <https://www.gnu.org/licenses/>. */
3529335293+3529435294+ #include "sv_math.h"
3529535295+-#include "poly_sve_f32.h"
3529635296+3529735297+ /* Largest value of x for which expm1(x) should round to -1. */
3529835298+ #define SpecialBound 0x1.5ebc4p+6f
3529935299+@@ -28,20 +27,17 @@ static const struct data
3530035300+ /* These 4 are grouped together so they can be loaded as one quadword, then
3530135301+ used with _lane forms of svmla/svmls. */
3530235302+ float c2, c4, ln2_hi, ln2_lo;
3530335303+- float c0, c1, c3, inv_ln2, special_bound, shift;
3530435304++ float c0, inv_ln2, c1, c3, special_bound;
3530535305+ } data = {
3530635306+ /* Generated using fpminimax. */
3530735307+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
3530835308+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
3530935309+- .c4 = 0x1.6b55a2p-10,
3531035310++ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f,
3531135311++ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f,
3531235312++ .ln2_hi = 0x1.62e4p-1f,
3531335313+3531435314+- .special_bound = SpecialBound, .shift = 0x1.8p23f,
3531535315+- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
3531635316+- .ln2_lo = 0x1.7f7d1cp-20f,
3531735317+ };
3531835318+3531935319+-#define C(i) sv_f32 (d->c##i)
3532035320+-
3532135321+ static svfloat32_t NOINLINE
3532235322+ special_case (svfloat32_t x, svbool_t pg)
3532335323+ {
3532435324+@@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
3532535325+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
3532635326+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
3532735327+ where 2^i is exact because i is an integer. */
3532835328+- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
3532935329+- j = svsub_x (pg, j, d->shift);
3533035330+- svint32_t i = svcvt_s32_x (pg, j);
3533135331++ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
3533235332++ j = svrinta_x (pg, j);
3533335333+3533435334+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
3533535335+ f = svmls_lane (f, j, lane_constants, 3);
3533635336+@@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
3533735337+ x + ax^2 + bx^3 + cx^4 ....
3533835338+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
3533935339+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
3534035340+- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
3534135341+- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
3534235342+- svfloat32_t f2 = svmul_x (pg, f, f);
3534335343++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
3534435344++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
3534535345++ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
3534635346+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
3534735347+- p = svmla_x (pg, C (0), f, p);
3534835348++
3534935349++ p = svmla_x (pg, sv_f32 (d->c0), f, p);
3535035350+ p = svmla_x (pg, f, f2, p);
3535135351+3535235352+ /* Assemble the result.
3535335353+ expm1(x) ~= 2^i * (p + 1) - 1
3535435354+ Let t = 2^i. */
3535535355+- svfloat32_t t = svreinterpret_f32 (
3535635356+- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
3535735357+- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
3535835358++ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
3535935359++ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
3536035360+ }
3536135361+diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
3536235362+index 6c204b57a2..50dd386774 100644
3536335363+--- a/sysdeps/aarch64/fpu/sinhf_sve.c
3536435364++++ b/sysdeps/aarch64/fpu/sinhf_sve.c
3536535365+@@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
3536635366+ if (__glibc_unlikely (svptest_any (pg, special)))
3536735367+ return special_case (x, svmul_x (pg, t, halfsign), special);
3536835368+3536935369+- return svmul_x (pg, t, halfsign);
3537035370++ return svmul_x (svptrue_b32 (), t, halfsign);
3537135371+ }
3537235372+diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
3537335373+index 5b72451222..e46ddda543 100644
3537435374+--- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h
3537535375++++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
3537635376+@@ -27,21 +27,18 @@ struct sv_expm1f_data
3537735377+ /* These 4 are grouped together so they can be loaded as one quadword, then
3537835378+ used with _lane forms of svmla/svmls. */
3537935379+ float32_t c2, c4, ln2_hi, ln2_lo;
3538035380+- float32_t c0, c1, c3, inv_ln2, shift;
3538135381++ float c0, inv_ln2, c1, c3, special_bound;
3538235382+ };
3538335383+3538435384+ /* Coefficients generated using fpminimax. */
3538535385+ #define SV_EXPM1F_DATA \
3538635386+ { \
3538735387+- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
3538835388+- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
3538935389++ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \
3539035390++ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \
3539135391+ \
3539235392+- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
3539335393+- .ln2_lo = 0x1.7f7d1cp-20f, \
3539435394++ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \
3539535395+ }
3539635396+3539735397+-#define C(i) sv_f32 (d->c##i)
3539835398+-
3539935399+ static inline svfloat32_t
3540035400+ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
3540135401+ {
3540235402+@@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
3540335403+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
3540435404+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
3540535405+ where 2^i is exact because i is an integer. */
3540635406+- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
3540735407+- j = svsub_x (pg, j, d->shift);
3540835408+- svint32_t i = svcvt_s32_x (pg, j);
3540935409++ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
3541035410++ j = svrinta_x (pg, j);
3541135411+3541235412+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
3541335413+ f = svmls_lane (f, j, lane_constants, 3);
3541435414+@@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
3541535415+ x + ax^2 + bx^3 + cx^4 ....
3541635416+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
3541735417+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
3541835418+- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
3541935419+- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
3542035420+- svfloat32_t f2 = svmul_x (pg, f, f);
3542135421++ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
3542235422++ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
3542335423++ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
3542435424+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
3542535425+- p = svmla_x (pg, C (0), f, p);
3542635426++ p = svmla_x (pg, sv_f32 (d->c0), f, p);
3542735427+ p = svmla_x (pg, f, f2, p);
3542835428+3542935429+ /* Assemble the result.
3543035430+ expm1(x) ~= 2^i * (p + 1) - 1
3543135431+ Let t = 2^i. */
3543235432+- svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
3543335433+- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
3543435434++ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
3543535435++ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
3543635436+ }
3543735437+3543835438+ #endif
3543935439+diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
3544035440+index 0b94523cf5..80dd679346 100644
3544135441+--- a/sysdeps/aarch64/fpu/tanhf_sve.c
3544235442++++ b/sysdeps/aarch64/fpu/tanhf_sve.c
3544335443+@@ -19,20 +19,27 @@
3544435444+3544535445+ #include "sv_expm1f_inline.h"
3544635446+3544735447++/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
3544835448++#define BoringBound 0x1.205966p+3f
3544935449++
3545035450+ static const struct data
3545135451+ {
3545235452+ struct sv_expm1f_data expm1f_consts;
3545335453+- uint32_t boring_bound, onef;
3545435454++ uint32_t onef, special_bound;
3545535455++ float boring_bound;
3545635456+ } data = {
3545735457+ .expm1f_consts = SV_EXPM1F_DATA,
3545835458+- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
3545935459+- .boring_bound = 0x41102cb3,
3546035460+ .onef = 0x3f800000,
3546135461++ .special_bound = 0x7f800000,
3546235462++ .boring_bound = BoringBound,
3546335463+ };
3546435464+3546535465+ static svfloat32_t NOINLINE
3546635466+-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
3546735467++special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
3546835468++ svfloat32_t boring, svfloat32_t q, svbool_t special)
3546935469+ {
3547035470++ svfloat32_t y
3547135471++ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
3547235472+ return sv_call_f32 (tanhf, x, y, special);
3547335473+ }
3547435474+3547535475+@@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
3547635476+ svfloat32_t ax = svabs_x (pg, x);
3547735477+ svuint32_t iax = svreinterpret_u32 (ax);
3547835478+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
3547935479+- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
3548035480+ svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
3548135481+-
3548235482+- svbool_t special = svcmpgt (pg, iax, 0x7f800000);
3548335483++ svbool_t special = svcmpgt (pg, iax, d->special_bound);
3548435484++ svbool_t is_boring = svacgt (pg, x, d->boring_bound);
3548535485+3548635486+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
3548735487+- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
3548835488+- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
3548935489++ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
3549035490++ &d->expm1f_consts);
3549135491++
3549235492+ if (__glibc_unlikely (svptest_any (pg, special)))
3549335493+- return special_case (x, svsel_f32 (is_boring, boring, y), special);
3549435494++ return special_case (x, pg, is_boring, boring, q, special);
3549535495++ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
3549635496+ return svsel_f32 (is_boring, boring, y);
3549735497+ }
3549835498+3549935499+commit 0ff6a9ff79bca9384ce4ba20e8942d39cc377a14
3550035500+Author: Luna Lamb <luna.lamb@arm.com>
3550135501+Date: Thu Feb 13 17:52:09 2025 +0000
3550235502+3550335503+ Aarch64: Improve codegen in SVE asinh
3550435504+3550535505+ Use unpredicated muls, use lanewise mla's and improve memory access.
3550635506+ 1% regression in throughput microbenchmark on Neoverse V1.
3550735507+3550835508+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3550935509+ (cherry picked from commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4)
3551035510+3551135511+diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
3551235512+index 28dc5c4587..fe8715e06c 100644
3551335513+--- a/sysdeps/aarch64/fpu/asinh_sve.c
3551435514++++ b/sysdeps/aarch64/fpu/asinh_sve.c
3551535515+@@ -18,36 +18,49 @@
3551635516+ <https://www.gnu.org/licenses/>. */
3551735517+3551835518+ #include "sv_math.h"
3551935519+-#include "poly_sve_f64.h"
3552035520+3552135521+ #define SignMask (0x8000000000000000)
3552235522+ #define One (0x3ff0000000000000)
3552335523+ #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
3552435524++#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)
3552535525+3552635526+ static const struct data
3552735527+ {
3552835528+- double poly[18];
3552935529+- double ln2, p3, p1, p4, p0, p2;
3553035530+- uint64_t n;
3553135531+- uint64_t off;
3553235532++ double even_coeffs[9];
3553335533++ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
3553435534++ uint64_t off, mask;
3553535535+3553635536+ } data = {
3553735537+- /* Polynomial generated using Remez on [2^-26, 1]. */
3553835538+- .poly
3553935539+- = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
3554035540+- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
3554135541+- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
3554235542+- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
3554335543+- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
3554435544+- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
3554535545++ /* Polynomial generated using Remez on [2^-26, 1]. */
3554635546++ .even_coeffs ={
3554735547++ -0x1.55555555554a7p-3,
3554835548++ -0x1.6db6db68332e6p-5,
3554935549++ -0x1.6e8b8b654a621p-6,
3555035550++ -0x1.c9871d10885afp-7,
3555135551++ -0x1.3ddca533e9f54p-7,
3555235552++ -0x1.b90c7099dd397p-8,
3555335553++ -0x1.d217026a669ecp-9,
3555435554++ -0x1.e0f37daef9127p-11,
3555535555++ -0x1.021a48685e287p-14, },
3555635556++
3555735557++ .c1 = 0x1.3333333326c7p-4,
3555835558++ .c3 = 0x1.f1c71b26fb40dp-6,
3555935559++ .c5 = 0x1.1c4daa9e67871p-6,
3556035560++ .c7 = 0x1.7a16e8d9d2ecfp-7,
3556135561++ .c9 = 0x1.0becef748dafcp-7,
3556235562++ .c11 = 0x1.541f2bb1ffe51p-8,
3556335563++ .c13 = 0x1.0b5c7977aaf7p-9,
3556435564++ .c15 = 0x1.388b5fe542a6p-12,
3556535565++ .c17 = 0x1.93d4ba83d34dap-18,
3556635566++
3556735567+ .ln2 = 0x1.62e42fefa39efp-1,
3556835568+ .p0 = -0x1.ffffffffffff7p-2,
3556935569+ .p1 = 0x1.55555555170d4p-2,
3557035570+ .p2 = -0x1.0000000399c27p-2,
3557135571+ .p3 = 0x1.999b2e90e94cap-3,
3557235572+ .p4 = -0x1.554e550bd501ep-3,
3557335573+- .n = 1 << V_LOG_TABLE_BITS,
3557435574+- .off = 0x3fe6900900000000
3557535575++ .off = 0x3fe6900900000000,
3557635576++ .mask = 0xfffULL << 52,
3557735577+ };
3557835578+3557935579+ static svfloat64_t NOINLINE
3558035580+@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
3558135581+ of the algorithm used. */
3558235582+3558335583+ svuint64_t ix = svreinterpret_u64 (x);
3558435584+- svuint64_t tmp = svsub_x (pg, ix, d->off);
3558535585+- svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
3558635586+- (d->n - 1) << 1);
3558735587+- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
3558835588+- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
3558935589++ svuint64_t i_off = svsub_x (pg, ix, d->off);
3559035590++ svuint64_t i
3559135591++ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
3559235592++ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
3559335593+ svfloat64_t z = svreinterpret_f64 (iz);
3559435594+3559535595+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
3559635596+@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
3559735597+ svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
3559835598+3559935599+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
3560035600+- svfloat64_t kd = svcvt_f64_x (pg, k);
3560135601++ svfloat64_t kd
3560235602++ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
3560335603+3560435604+ svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
3560535605+- svfloat64_t r2 = svmul_x (pg, r, r);
3560635606+-
3560735607++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3560835608+ svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
3560935609+-
3561035610+ svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
3561135611++
3561235612+ y = svmla_lane (y, r2, p1_p4, 1);
3561335613+ y = svmla_x (pg, p, r2, y);
3561435614+ y = svmla_x (pg, hi, r2, y);
3561535615+@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
3561635616+ svuint64_t iax = svbic_x (pg, ix, SignMask);
3561735617+ svuint64_t sign = svand_x (pg, ix, SignMask);
3561835618+ svfloat64_t ax = svreinterpret_f64 (iax);
3561935619+-
3562035620+ svbool_t ge1 = svcmpge (pg, iax, One);
3562135621+ svbool_t special = svcmpge (pg, iax, Thres);
3562235622+3562335623+@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
3562435624+ svfloat64_t option_1 = sv_f64 (0);
3562535625+ if (__glibc_likely (svptest_any (pg, ge1)))
3562635626+ {
3562735627+- svfloat64_t x2 = svmul_x (pg, ax, ax);
3562835628++ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
3562935629+ option_1 = __sv_log_inline (
3563035630+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
3563135631+ }
3563235632+@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
3563335633+ The largest observed error in this region is 1.51 ULPs:
3563435634+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
3563535635+ want 0x1.c1e649ee2681dp-1. */
3563635636++
3563735637+ svfloat64_t option_2 = sv_f64 (0);
3563835638+ if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
3563935639+ {
3564035640+- svfloat64_t x2 = svmul_x (pg, ax, ax);
3564135641+- svfloat64_t x4 = svmul_x (pg, x2, x2);
3564235642+- svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
3564335643+- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
3564435644++ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
3564535645++ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
3564635646++ /* Order-17 Pairwise Horner scheme. */
3564735647++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
3564835648++ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
3564935649++ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
3565035650++ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
3565135651++
3565235652++ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
3565335653++ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
3565435654++ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
3565535655++ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
3565635656++ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
3565735657++ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
3565835658++ svfloat64_t p1213
3565935659++ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
3566035660++ svfloat64_t p1415
3566135661++ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
3566235662++ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
3566335663++
3566435664++ svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
3566535665++ p = svmla_x (pg, p1213, x4, p);
3566635666++ p = svmla_x (pg, p1011, x4, p);
3566735667++ p = svmla_x (pg, p89, x4, p);
3566835668++
3566935669++ p = svmla_x (pg, p67, x4, p);
3567035670++ p = svmla_x (pg, p45, x4, p);
3567135671++
3567235672++ p = svmla_x (pg, p23, x4, p);
3567335673++
3567435674++ p = svmla_x (pg, p01, x4, p);
3567535675++
3567635676++ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
3567735677+ }
3567835678+3567935679+- /* Choose the right option for each lane. */
3568035680+- svfloat64_t y = svsel (ge1, option_1, option_2);
3568135681+-
3568235682+ if (__glibc_unlikely (svptest_any (pg, special)))
3568335683+ return special_case (
3568435684+- x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
3568535685++ x,
3568635686++ svreinterpret_f64 (sveor_x (
3568735687++ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
3568835688+ special);
3568935689++
3569035690++ /* Choose the right option for each lane. */
3569135691++ svfloat64_t y = svsel (ge1, option_1, option_2);
3569235692+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
3569335693+ }
3569435694+3569535695+commit 4b0bb84eb7e52a135c873fd9d0fc6c30599aedf4
3569635696+Author: Luna Lamb <luna.lamb@arm.com>
3569735697+Date: Thu Feb 13 17:54:46 2025 +0000
3569835698+3569935699+ Aarch64: Improve codegen in SVE exp and users, and update expf_inline
3570135701+ Use unpredicated muls, and improve memory access.
3570235702+ 7%, 3% and 1% improvement in throughput microbenchmark on Neoverse V1,
3570335703+ for exp, exp2 and cosh respectively.
3570435704+3570535705+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3570635706+ (cherry picked from commit c0ff447edf19bd4630fe79adf5e8b896405b059f)
3570735707+3570835708+diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
3570935709+index 919f34604a..e375dd8a34 100644
3571035710+--- a/sysdeps/aarch64/fpu/cosh_sve.c
3571135711++++ b/sysdeps/aarch64/fpu/cosh_sve.c
3571235712+@@ -23,7 +23,7 @@ static const struct data
3571335713+ {
3571435714+ float64_t poly[3];
3571535715+ float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
3571635716+- uint64_t index_mask, special_bound;
3571735717++ uint64_t special_bound;
3571835718+ } data = {
3571935719+ .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
3572035720+ 0x1.5555576a59599p-5, },
3572135721+@@ -35,14 +35,16 @@ static const struct data
3572235722+ .shift = 0x1.8p+52,
3572335723+ .thres = 704.0,
3572435724+3572535725+- .index_mask = 0xff,
3572635726+ /* 0x1.6p9, above which exp overflows. */
3572735727+ .special_bound = 0x4086000000000000,
3572835728+ };
3572935729+3573035730+ static svfloat64_t NOINLINE
3573135731+-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
3573235732++special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
3573335733+ {
3573435734++ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
3573535735++ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
3573635736++ svfloat64_t y = svadd_x (pg, half_t, half_over_t);
3573735737+ return sv_call_f64 (cosh, x, y, special);
3573835738+ }
3573935739+3574035740+@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
3574135741+3574235742+ svuint64_t u = svreinterpret_u64 (z);
3574335743+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
3574435744+- svuint64_t i = svand_x (pg, u, d->index_mask);
3574535745++ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
3574635746+3574735747+ svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
3574835748+ y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
3574935749+ y = svmla_x (pg, sv_f64 (1.0), r, y);
3575035750+- y = svmul_x (pg, r, y);
3575135751++ y = svmul_x (svptrue_b64 (), r, y);
3575235752+3575335753+ /* s = 2^(n/N). */
3575435754+ u = svld1_gather_index (pg, __v_exp_tail_data, i);
3575535755+@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
3575635756+ /* Up to the point that exp overflows, we can use it to calculate cosh by
3575735757+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
3575835758+ svfloat64_t t = exp_inline (ax, pg, d);
3575935759+- svfloat64_t half_t = svmul_x (pg, t, 0.5);
3576035760+- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
3576135761+3576235762+ /* Fall back to scalar for any special cases. */
3576335763+ if (__glibc_unlikely (svptest_any (pg, special)))
3576435764+- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
3576535765++ return special_case (x, pg, t, special);
3576635766+3576735767++ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
3576835768++ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
3576935769+ return svadd_x (pg, half_t, half_over_t);
3577035770+ }
3577135771+diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c
3577235772+index ddf64708cb..bfd3fb9e19 100644
3577335773+--- a/sysdeps/aarch64/fpu/exp10_sve.c
3577435774++++ b/sysdeps/aarch64/fpu/exp10_sve.c
3577535775+@@ -18,21 +18,23 @@
3577635776+ <https://www.gnu.org/licenses/>. */
3577735777+3577835778+ #include "sv_math.h"
3577935779+-#include "poly_sve_f64.h"
3578035780+3578135781+ #define SpecialBound 307.0 /* floor (log10 (2^1023)). */
3578235782+3578335783+ static const struct data
3578435784+ {
3578535785+- double poly[5];
3578635786++ double c1, c3, c2, c4, c0;
3578735787+ double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
3578835788+ } data = {
3578935789+ /* Coefficients generated using Remez algorithm.
3579035790+ rel error: 0x1.9fcb9b3p-60
3579135791+ abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
3579235792+ max ulp err 0.52 +0.5. */
3579335793+- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
3579435794+- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
3579535795++ .c0 = 0x1.26bb1bbb55516p1,
3579635796++ .c1 = 0x1.53524c73cd32ap1,
3579735797++ .c2 = 0x1.0470591daeafbp1,
3579835798++ .c3 = 0x1.2bd77b1361ef6p0,
3579935799++ .c4 = 0x1.142b5d54e9621p-1,
3580035800+ /* 1.5*2^46+1023. This value is further explained below. */
3580135801+ .shift = 0x1.800000000ffc0p+46,
3580235802+ .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
3580335803+@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
3580435804+ /* |n| > 1280 => 2^(n) overflows. */
3580535805+ svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
3580635806+3580735807+- svfloat64_t r1 = svmul_x (pg, s1, s1);
3580835808++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
3580935809+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
3581035810+- svfloat64_t r0 = svmul_x (pg, r2, s1);
3581135811++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
3581235812+3581335813+ return svsel (p_cmp, r1, r0);
3581435814+ }
3581535815+@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
3581635816+ comes at significant performance cost. */
3581735817+ svuint64_t u = svreinterpret_u64 (z);
3581835818+ svfloat64_t scale = svexpa (u);
3581935819+-
3582035820++ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
3582135821+ /* Approximate exp10(r) using polynomial. */
3582235822+- svfloat64_t r2 = svmul_x (pg, r, r);
3582335823+- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
3582435824+- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
3582535825++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3582635826++ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
3582735827++ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
3582835828++ svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
3582935829++
3583035830++ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);
3583135831+3583235832+ /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
3583335833+ multiplication may overflow, so use special case routine. */
3583435834+diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
3583535835+index 22848ebfa5..5dfb77cdbc 100644
3583635836+--- a/sysdeps/aarch64/fpu/exp2_sve.c
3583735837++++ b/sysdeps/aarch64/fpu/exp2_sve.c
3583835838+@@ -18,7 +18,6 @@
3583935839+ <https://www.gnu.org/licenses/>. */
3584035840+3584135841+ #include "sv_math.h"
3584235842+-#include "poly_sve_f64.h"
3584335843+3584435844+ #define N (1 << V_EXP_TABLE_BITS)
3584535845+3584635846+@@ -27,15 +26,15 @@
3584735847+3584835848+ static const struct data
3584935849+ {
3585035850+- double poly[4];
3585135851++ double c0, c2;
3585235852++ double c1, c3;
3585335853+ double shift, big_bound, uoflow_bound;
3585435854+ } data = {
3585535855+ /* Coefficients are computed using Remez algorithm with
3585635856+ minimisation of the absolute error. */
3585735857+- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
3585835858+- 0x1.3b2abf5571ad8p-7 },
3585935859+- .shift = 0x1.8p52 / N,
3586035860+- .uoflow_bound = UOFlowBound,
3586135861++ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
3586235862++ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
3586335863++ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
3586435864+ .big_bound = BigBound,
3586535865+ };
3586635866+3586735867+@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
3586835868+ /* |n| > 1280 => 2^(n) overflows. */
3586935869+ svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
3587035870+3587135871+- svfloat64_t r1 = svmul_x (pg, s1, s1);
3587235872++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
3587335873+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
3587435874+- svfloat64_t r0 = svmul_x (pg, r2, s1);
3587535875++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
3587635876+3587735877+ return svsel (p_cmp, r1, r0);
3587835878+ }
3587935879+@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
3588035880+ svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
3588135881+ svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
3588235882+3588335883++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
3588435884+ /* Approximate exp2(r) using polynomial. */
3588535885+- svfloat64_t r2 = svmul_x (pg, r, r);
3588635886+- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
3588735887+- svfloat64_t y = svmul_x (pg, r, p);
3588835888+-
3588935889++ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
3589035890++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3589135891++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
3589235892++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
3589335893++ svfloat64_t p = svmla_x (pg, p01, p23, r2);
3589435894++ svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
3589535895+ /* Assemble exp2(x) = exp2(r) * scale. */
3589635896+ if (__glibc_unlikely (svptest_any (pg, special)))
3589735897+ return special_case (pg, scale, y, kd, d);
3589835898+diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
3589935899+index aabaaa1d61..b2421d493f 100644
3590035900+--- a/sysdeps/aarch64/fpu/exp_sve.c
3590135901++++ b/sysdeps/aarch64/fpu/exp_sve.c
3590235902+@@ -21,12 +21,15 @@
3590335903+3590435904+ static const struct data
3590535905+ {
3590635906+- double poly[4];
3590735907++ double c0, c2;
3590835908++ double c1, c3;
3590935909+ double ln2_hi, ln2_lo, inv_ln2, shift, thres;
3591035910++
3591135911+ } data = {
3591235912+- .poly = { /* ulp error: 0.53. */
3591335913+- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
3591435914+- 0x1.1111266d28935p-7 },
3591535915++ .c0 = 0x1.fffffffffdbcdp-2,
3591635916++ .c1 = 0x1.555555555444cp-3,
3591735917++ .c2 = 0x1.555573c6a9f7dp-5,
3591835918++ .c3 = 0x1.1111266d28935p-7,
3591935919+ .ln2_hi = 0x1.62e42fefa3800p-1,
3592035920+ .ln2_lo = 0x1.ef35793c76730p-45,
3592135921+ /* 1/ln2. */
3592235922+@@ -36,7 +39,6 @@ static const struct data
3592335923+ .thres = 704.0,
3592435924+ };
3592535925+3592635926+-#define C(i) sv_f64 (d->poly[i])
3592735927+ #define SpecialOffset 0x6000000000000000 /* 0x1p513. */
3592835928+ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */
3592935929+ #define SpecialBias1 0x7000000000000000 /* 0x1p769. */
3593035930+@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
3593135931+ svuint64_t b
3593235932+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
3593335933+3593435934+- /* Set s1 to generate overflow depending on sign of exponent n. */
3593535935+- svfloat64_t s1 = svreinterpret_f64 (
3593635936+- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
3593735937+- /* Offset s to avoid overflow in final result if n is below threshold. */
3593835938++ /* Set s1 to generate overflow depending on sign of exponent n,
3593935939++ ie. s1 = 0x70...0 - b. */
3594035940++ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
3594135941++ /* Offset s to avoid overflow in final result if n is below threshold.
3594235942++ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
3594335943+ svfloat64_t s2 = svreinterpret_f64 (
3594435944+- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
3594535945+- b)); /* as_u64 (s) - 0x3010...0 + b. */
3594635946++ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
3594735947+3594835948+ /* |n| > 1280 => 2^(n) overflows. */
3594935949+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
3595035950+3595135951+- svfloat64_t r1 = svmul_x (pg, s1, s1);
3595235952++ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
3595335953+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
3595435954+- svfloat64_t r0 = svmul_x (pg, r2, s1);
3595535955++ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
3595635956+3595735957+ return svsel (p_cmp, r1, r0);
3595835958+ }
3595935959+@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
3596035960+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
3596135961+ svuint64_t u = svreinterpret_u64 (z);
3596235962+ svfloat64_t n = svsub_x (pg, z, d->shift);
3596335963+-
3596435964++ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
3596535965+ /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
3596635966+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
3596735967+ svfloat64_t r = svmls_lane (x, n, ln2, 0);
3596835968+ r = svmls_lane (r, n, ln2, 1);
3596935969+3597035970+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
3597135971+- svfloat64_t r2 = svmul_x (pg, r, r);
3597235972+- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
3597335973+- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
3597435974++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3597535975++ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
3597635976++ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
3597735977+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
3597835978+ svfloat64_t y = svmla_x (pg, r, p04, r2);
3597935979+3598035980+diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
3598135981+index 6166df6553..75781fb4dd 100644
3598235982+--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
3598335983++++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
3598435984+@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
3598535985+ /* scale = 2^(n/N). */
3598635986+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
3598735987+3598835988+- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
3598935989++ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
3599035990+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
3599135991+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
3599235992+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
3599335993+@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
3599435994+3599535995+ return svmla_x (pg, scale, scale, poly);
3599635996+ }
3599735997+-
3599835998+ #endif
3599935999+3600036000+commit 194185c28954dfa11a6ded8b32f34fee680d3218
3600136001+Author: Yat Long Poon <yatlong.poon@arm.com>
3600236002+Date: Thu Feb 13 18:00:50 2025 +0000
3600336003+3600436004+ AArch64: Improve codegen for SVE erfcf
3600536005+3600636006+ Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
3600736007+ Replace MUL with LSL. Speedup on Neoverse V1: 6%.
3600836008+3600936009+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3601036010+ (cherry picked from commit f5ff34cb3c75ec1061c75bb9188b3c1176426947)
3601136011+3601236012+diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
3601336013+index ecacb933ac..e4869263e3 100644
3601436014+--- a/sysdeps/aarch64/fpu/erfcf_sve.c
3601536015++++ b/sysdeps/aarch64/fpu/erfcf_sve.c
3601636016+@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
3601736017+ svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
3601836018+3601936019+ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
3602036020+- i = svmul_x (pg, i, 2);
3602136021++ i = svlsl_x (svptrue_b32 (), i, 1);
3602236022+ const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
3602336023+ svfloat32_t erfcr = svld1_gather_index (pg, p, i);
3602436024+ svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
3602536025+@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
3602636026+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
3602736027+ svfloat32_t r = svsub_x (pg, z, shift);
3602836028+ svfloat32_t d = svsub_x (pg, a, r);
3602936029+- svfloat32_t d2 = svmul_x (pg, d, d);
3603036030+- svfloat32_t r2 = svmul_x (pg, r, r);
3603136031++ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
3603236032++ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
3603336033+3603436034+ svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
3603536035+- svfloat32_t third = svdup_lane (coeffs, 0);
3603636036+3603736037+ svfloat32_t p1 = r;
3603836038+- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
3603936039+- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
3604036040++ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
3604136041++ svfloat32_t p3
3604236042++ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
3604336043+ svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
3604436044+ p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
3604536045+3604636046+3604736047+commit 7dc549c5a4af3c32689147550144397116404d22
3604836048+Author: Yat Long Poon <yatlong.poon@arm.com>
3604936049+Date: Thu Feb 13 18:02:01 2025 +0000
3605036050+3605136051+ AArch64: Improve codegen for SVE pow
3605236052+3605336053+ Move constants to struct. Improve memory access with indexed/unpredicated
3605436054+ instructions. Eliminate register spills. Speedup on Neoverse V1: 24%.
3605536055+3605636056+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3605736057+ (cherry picked from commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6)
3605836058+3605936059+diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
3606036060+index 4c0bf8956c..4242d22a49 100644
3606136061+--- a/sysdeps/aarch64/fpu/pow_sve.c
3606236062++++ b/sysdeps/aarch64/fpu/pow_sve.c
3606336063+@@ -44,19 +44,18 @@
3606436064+3606536065+ /* Data is defined in v_pow_log_data.c. */
3606636066+ #define N_LOG (1 << V_POW_LOG_TABLE_BITS)
3606736067+-#define A __v_pow_log_data.poly
3606836068+ #define Off 0x3fe6955500000000
3606936069+3607036070+ /* Data is defined in v_pow_exp_data.c. */
3607136071+ #define N_EXP (1 << V_POW_EXP_TABLE_BITS)
3607236072+ #define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
3607336073+-#define C __v_pow_exp_data.poly
3607436074+ #define SmallExp 0x3c9 /* top12(0x1p-54). */
3607536075+ #define BigExp 0x408 /* top12(512.). */
3607636076+ #define ThresExp 0x03f /* BigExp - SmallExp. */
3607736077+ #define HugeExp 0x409 /* top12(1024.). */
3607836078+3607936079+ /* Constants associated with pow. */
3608036080++#define SmallBoundX 0x1p-126
3608136081+ #define SmallPowX 0x001 /* top12(0x1p-126). */
3608236082+ #define BigPowX 0x7ff /* top12(INFINITY). */
3608336083+ #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
3608436084+@@ -64,6 +63,31 @@
3608536085+ #define BigPowY 0x43e /* top12(0x1.749p62). */
3608636086+ #define ThresPowY 0x080 /* BigPowY - SmallPowY. */
3608736087+3608836088++static const struct data
3608936089++{
3609036090++ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
3609136091++ double log_c1, log_c3, log_c5, off;
3609236092++ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
3609336093++ double exp_c0, exp_c1;
3609436094++} data = {
3609536095++ .log_c0 = -0x1p-1,
3609636096++ .log_c1 = -0x1.555555555556p-1,
3609736097++ .log_c2 = 0x1.0000000000006p-1,
3609836098++ .log_c3 = 0x1.999999959554ep-1,
3609936099++ .log_c4 = -0x1.555555529a47ap-1,
3610036100++ .log_c5 = -0x1.2495b9b4845e9p0,
3610136101++ .log_c6 = 0x1.0002b8b263fc3p0,
3610236102++ .off = Off,
3610336103++ .exp_c0 = 0x1.fffffffffffd4p-2,
3610436104++ .exp_c1 = 0x1.5555571d6ef9p-3,
3610536105++ .exp_c2 = 0x1.5555576a5adcep-5,
3610636106++ .ln2_hi = 0x1.62e42fefa3800p-1,
3610736107++ .ln2_lo = 0x1.ef35793c76730p-45,
3610836108++ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
3610936109++ .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
3611036110++ .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
3611136111++};
3611236112++
3611336113+ /* Check if x is an integer. */
3611436114+ static inline svbool_t
3611536115+ sv_isint (svbool_t pg, svfloat64_t x)
3611636116+@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
3611736117+ static inline svbool_t
3611836118+ sv_isodd (svbool_t pg, svfloat64_t x)
3611936119+ {
3612036120+- svfloat64_t y = svmul_x (pg, x, 0.5);
3612136121++ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
3612236122+ return sv_isnotint (pg, y);
3612336123+ }
3612436124+3612536125+@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i)
3612636126+ static inline svbool_t
3612736127+ sv_zeroinfnan (svbool_t pg, svuint64_t i)
3612836128+ {
3612936129+- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
3613036130++ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
3613136131+ 2 * asuint64 (INFINITY) - 1);
3613236132+ }
3613336133+3613436134+@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
3613536135+ additional 15 bits precision. IX is the bit representation of x, but
3613636136+ normalized in the subnormal range using the sign bit for the exponent. */
3613736137+ static inline svfloat64_t
3613836138+-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
3613936139++sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
3614036140++ const struct data *d)
3614136141+ {
3614236142+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
3614336143+ The range is split into N subintervals.
3614436144+ The ith subinterval contains z and c is near its center. */
3614536145+- svuint64_t tmp = svsub_x (pg, ix, Off);
3614636146++ svuint64_t tmp = svsub_x (pg, ix, d->off);
3614736147+ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
3614836148+ sv_u64 (N_LOG - 1));
3614936149+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
3615036150+- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
3615136151++ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
3615236152+ svfloat64_t z = svreinterpret_f64 (iz);
3615336153+ svfloat64_t kd = svcvt_f64_x (pg, k);
3615436154+3615536155+@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
3615636156+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
3615736157+ svfloat64_t r = svmad_x (pg, z, invc, -1.0);
3615836158+ /* k*Ln2 + log(c) + r. */
3615936159+- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
3616036160++
3616136161++ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
3616236162++ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
3616336163+ svfloat64_t t2 = svadd_x (pg, t1, r);
3616436164+- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
3616536165++ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
3616636166+ svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
3616736167+3616836168+ /* Evaluation is optimized assuming superscalar pipelined execution. */
3616936169+- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
3617036170+- svfloat64_t ar2 = svmul_x (pg, r, ar);
3617136171+- svfloat64_t ar3 = svmul_x (pg, r, ar2);
3617236172++
3617336173++ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
3617436174++ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
3617536175++ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
3617636176++ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
3617736177+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
3617836178+ svfloat64_t hi = svadd_x (pg, t2, ar2);
3617936179+- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
3618036180++ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
3618136181+ svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
3618236182+ /* p = log1p(r) - r - A[0]*r*r. */
3618336183+ /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
3618436184+ A[6])))). */
3618536185+- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
3618636186+- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
3618736187+- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
3618836188++
3618936189++ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
3619036190++ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
3619136191++ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
3619236192++ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
3619336193+ svfloat64_t p = svmla_x (pg, a34, ar2, a56);
3619436194+ p = svmla_x (pg, a12, ar2, p);
3619536195+- p = svmul_x (pg, ar3, p);
3619636196++ p = svmul_x (svptrue_b64 (), ar3, p);
3619736197+ svfloat64_t lo = svadd_x (
3619836198+- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
3619936199++ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
3620036200+ svfloat64_t y = svadd_x (pg, hi, lo);
3620136201+ *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
3620236202+ return y;
3620336203+ }
3620436204+3620536205++static inline svfloat64_t
3620636206++sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
3620736207++ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
3620836208++ svuint64_t *ki, const struct data *d)
3620936209++{
3621036210++ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
3621136211++ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
3621236212++ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
3621336213++ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
3621436214++ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
3621536215++ svfloat64_t kd = svrinta_x (pg, z);
3621636216++ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
3621736217++
3621836218++ svfloat64_t ln2_over_n_hilo
3621936219++ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
3622036220++ svfloat64_t r = x;
3622136221++ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
3622236222++ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
3622336223++ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
3622436224++ r = svadd_x (pg, r, xtail);
3622536225++ /* 2^(k/N) ~= scale. */
3622636226++ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
3622736227++ svuint64_t top
3622836228++ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
3622936229++ /* This is only a valid scale when -1023*N < k < 1024*N. */
3623036230++ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
3623136231++ *sbits = svadd_x (pg, *sbits, top);
3623236232++ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
3623336233++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
3623436234++ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
3623536235++ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
3623636236++ *tmp = svmla_x (pg, r, r2, *tmp);
3623736237++ svfloat64_t scale = svreinterpret_f64 (*sbits);
3623836238++ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
3623936239++ is no spurious underflow here even without fma. */
3624036240++ z = svmla_x (pg, scale, scale, *tmp);
3624136241++ return z;
3624236242++}
3624336243++
3624436244+ /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
3624536245+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
3624636246+ static inline svfloat64_t
3624736247+ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
3624836248+- svuint64_t sign_bias)
3624936249++ svuint64_t sign_bias, const struct data *d)
3625036250+ {
3625136251+ /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
3625236252+ and other cases of large values of x (scale * (1 + TMP) oflow). */
3625336253+@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
3625436254+ /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
3625536255+ svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
3625636256+3625736257+- /* Conditions special, uflow and oflow are all expressed as uoflow &&
3625836258+- something, hence do not bother computing anything if no lane in uoflow is
3625936259+- true. */
3626036260+- svbool_t special = svpfalse_b ();
3626136261+- svbool_t uflow = svpfalse_b ();
3626236262+- svbool_t oflow = svpfalse_b ();
3626336263++ svfloat64_t tmp;
3626436264++ svuint64_t sbits, ki;
3626536265+ if (__glibc_unlikely (svptest_any (pg, uoflow)))
3626636266+ {
3626736267++ svfloat64_t z
3626836268++ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
3626936269++
3627036270+ /* |x| is tiny (|x| <= 0x1p-54). */
3627136271+- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
3627236272++ svbool_t uflow
3627336273++ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
3627436274+ uflow = svand_z (pg, uoflow, uflow);
3627536275+ /* |x| is huge (|x| >= 1024). */
3627636276+- oflow = svcmpge (pg, abstop, HugeExp);
3627736277++ svbool_t oflow = svcmpge (pg, abstop, HugeExp);
3627836278+ oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
3627936279++
3628036280+ /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
3628136281+- or underflow. */
3628236282+- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
3628336283++ or underflow. */
3628436284++ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
3628536285++
3628636286++ /* Update result with special and large cases. */
3628736287++ z = sv_call_specialcase (tmp, sbits, ki, z, special);
3628836288++
3628936289++ /* Handle underflow and overflow. */
3629036290++ svbool_t x_is_neg = svcmplt (pg, x, 0);
3629136291++ svuint64_t sign_mask
3629236292++ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
3629336293++ svfloat64_t res_uoflow
3629436294++ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
3629536295++ res_uoflow = svreinterpret_f64 (
3629636296++ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
3629736297++ /* Avoid spurious underflow for tiny x. */
3629836298++ svfloat64_t res_spurious_uflow
3629936299++ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
3630036300++
3630136301++ z = svsel (oflow, res_uoflow, z);
3630236302++ z = svsel (uflow, res_spurious_uflow, z);
3630336303++ return z;
3630436304+ }
3630536305+3630636306+- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
3630736307+- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
3630836308+- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
3630936309+- /* z - kd is in [-1, 1] in non-nearest rounding modes. */
3631036310+- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
3631136311+- svfloat64_t kd = svadd_x (pg, z, shift);
3631236312+- svuint64_t ki = svreinterpret_u64 (kd);
3631336313+- kd = svsub_x (pg, kd, shift);
3631436314+- svfloat64_t r = x;
3631536315+- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
3631636316+- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
3631736317+- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
3631836318+- r = svadd_x (pg, r, xtail);
3631936319+- /* 2^(k/N) ~= scale. */
3632036320+- svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
3632136321+- svuint64_t top
3632236322+- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
3632336323+- /* This is only a valid scale when -1023*N < k < 1024*N. */
3632436324+- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
3632536325+- sbits = svadd_x (pg, sbits, top);
3632636326+- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
3632736327+- svfloat64_t r2 = svmul_x (pg, r, r);
3632836328+- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
3632936329+- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
3633036330+- tmp = svmla_x (pg, r, r2, tmp);
3633136331+- svfloat64_t scale = svreinterpret_f64 (sbits);
3633236332+- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
3633336333+- is no spurious underflow here even without fma. */
3633436334+- z = svmla_x (pg, scale, scale, tmp);
3633536335+-
3633636336+- /* Update result with special and large cases. */
3633736337+- if (__glibc_unlikely (svptest_any (pg, special)))
3633836338+- z = sv_call_specialcase (tmp, sbits, ki, z, special);
3633936339+-
3634036340+- /* Handle underflow and overflow. */
3634136341+- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
3634236342+- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
3634336343+- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
3634436344+- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
3634536345+- res_uoflow = svreinterpret_f64 (
3634636346+- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
3634736347+- z = svsel (oflow, res_uoflow, z);
3634836348+- /* Avoid spurious underflow for tiny x. */
3634936349+- svfloat64_t res_spurious_uflow
3635036350+- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
3635136351+- z = svsel (uflow, res_spurious_uflow, z);
3635236352+-
3635336353+- return z;
3635436354++ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
3635536355+ }
3635636356+3635736357+ static inline double
3635836358+@@ -341,47 +384,39 @@ pow_sc (double x, double y)
3635936359+3636036360+ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
3636136361+ {
3636236362++ const struct data *d = ptr_barrier (&data);
3636336363++
3636436364+ /* This preamble handles special case conditions used in the final scalar
3636536365+ fallbacks. It also updates ix and sign_bias, that are used in the core
3636636366+ computation too, i.e., exp( y * log (x) ). */
3636736367+ svuint64_t vix0 = svreinterpret_u64 (x);
3636836368+ svuint64_t viy0 = svreinterpret_u64 (y);
3636936369+- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
3637036370+3637136371+ /* Negative x cases. */
3637236372+- svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
3637336373+- svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
3637436374++ svbool_t xisneg = svcmplt (pg, x, 0);
3637536375+3637636376+ /* Set sign_bias and ix depending on sign of x and nature of y. */
3637736377+- svbool_t yisnotint_xisneg = svpfalse_b ();
3637836378++ svbool_t yint_or_xpos = pg;
3637936379+ svuint64_t sign_bias = sv_u64 (0);
3638036380+ svuint64_t vix = vix0;
3638136381+- svuint64_t vtopx1 = vtopx0;
3638236382+ if (__glibc_unlikely (svptest_any (pg, xisneg)))
3638336383+ {
3638436384+ /* Determine nature of y. */
3638536385+- yisnotint_xisneg = sv_isnotint (xisneg, y);
3638636386+- svbool_t yisint_xisneg = sv_isint (xisneg, y);
3638736387++ yint_or_xpos = sv_isint (xisneg, y);
3638836388+ svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
3638936389+ /* ix set to abs(ix) if y is integer. */
3639036390+- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
3639136391+- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
3639236392++ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
3639336393+ /* Set to SignBias if x is negative and y is odd. */
3639436394+ sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
3639536395+ }
3639636396+3639736397+- /* Special cases of x or y: zero, inf and nan. */
3639836398+- svbool_t xspecial = sv_zeroinfnan (pg, vix0);
3639936399+- svbool_t yspecial = sv_zeroinfnan (pg, viy0);
3640036400+- svbool_t special = svorr_z (pg, xspecial, yspecial);
3640136401+-
3640236402+ /* Small cases of x: |x| < 0x1p-126. */
3640336403+- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
3640436404+- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
3640536405+- if (__glibc_unlikely (svptest_any (pg, xsmall)))
3640636406++ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
3640736407++ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
3640836408+ {
3640936409+ /* Normalize subnormal x so exponent becomes negative. */
3641036410+- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
3641136411++ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
3641236412++ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
3641336413+3641436414+ svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
3641536415+ vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
3641636416+@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
3641736417+3641836418+ /* y_hi = log(ix, &y_lo). */
3641936419+ svfloat64_t vlo;
3642036420+- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
3642136421++ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
3642236422+3642336423+ /* z = exp(y_hi, y_lo, sign_bias). */
3642436424+- svfloat64_t vehi = svmul_x (pg, y, vhi);
3642536425+- svfloat64_t velo = svmul_x (pg, y, vlo);
3642636426+- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
3642736427+- velo = svsub_x (pg, velo, vemi);
3642836428+- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
3642936429++ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
3643036430++ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
3643136431++ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
3643236432++ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
3643336433+3643436434+ /* Cases of finite y and finite negative x. */
3643536435+- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
3643636436++ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
3643736437++
3643836438++ /* Special cases of x or y: zero, inf and nan. */
3643936439++ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
3644036440++ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
3644136441++ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
3644236442+3644336443+ /* Cases of zero/inf/nan x or y. */
3644436444+- if (__glibc_unlikely (svptest_any (pg, special)))
3644536445++ if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
3644636446+ vz = sv_call2_f64 (pow_sc, x, y, vz, special);
3644736447+3644836448+ return vz;
3644936449+3645036450+commit 06fd8ad78f35a6cc65dc7c6c08ce55faf6ad079d
3645136451+Author: Yat Long Poon <yatlong.poon@arm.com>
3645236452+Date: Thu Feb 13 18:03:04 2025 +0000
3645336453+3645436454+ AArch64: Improve codegen for SVE powf
3645536455+3645636456+ Improve memory access with indexed/unpredicated instructions.
3645736457+ Eliminate register spills. Speedup on Neoverse V1: 3%.
3645836458+3645936459+ Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
3646036460+ (cherry picked from commit 95e807209b680257a9afe81a507754f1565dbb4d)
3646136461+3646236462+diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
3646336463+index 4f6a142325..08d7019a18 100644
3646436464+--- a/sysdeps/aarch64/fpu/powf_sve.c
3646536465++++ b/sysdeps/aarch64/fpu/powf_sve.c
3646636466+@@ -26,7 +26,6 @@
3646736467+ #define Tlogc __v_powf_data.logc
3646836468+ #define Texp __v_powf_data.scale
3646936469+ #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
3647036470+-#define Shift 0x1.8p52
3647136471+ #define Norm 0x1p23f /* 0x4b000000. */
3647236472+3647336473+ /* Overall ULP error bound for pow is 2.6 ulp
3647436474+@@ -36,7 +35,7 @@ static const struct data
3647536475+ double log_poly[4];
3647636476+ double exp_poly[3];
3647736477+ float uflow_bound, oflow_bound, small_bound;
3647836478+- uint32_t sign_bias, sign_mask, subnormal_bias, off;
3647936479++ uint32_t sign_bias, subnormal_bias, off;
3648036480+ } data = {
3648136481+ /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
3648236482+ V_POWF_EXP2_N. */
3648336483+@@ -53,7 +52,6 @@ static const struct data
3648436484+ .small_bound = 0x1p-126f,
3648536485+ .off = 0x3f35d000,
3648636486+ .sign_bias = SignBias,
3648736487+- .sign_mask = 0x80000000,
3648836488+ .subnormal_bias = 0x0b800000, /* 23 << 23. */
3648936489+ };
3649036490+3649136491+@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
3649236492+ static inline svbool_t
3649336493+ sv_zeroinfnan (svbool_t pg, svuint32_t i)
3649436494+ {
3649536495+- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
3649636496++ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
3649736497+ 2u * 0x7f800000 - 1);
3649836498+ }
3649936499+3650036500+@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
3650136501+ }
3650236502+3650336503+ /* Scalar fallback for special case routines with custom signature. */
3650436504+-static inline svfloat32_t
3650536505+-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
3650636506++static svfloat32_t NOINLINE
3650736507++sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
3650836508+ {
3650936509++ /* Special cases of x or y: zero, inf and nan. */
3651036510++ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
3651136511++ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
3651236512++ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
3651336513++
3651436514+ svbool_t p = svpfirst (cmp, svpfalse ());
3651536515+ while (svptest_any (cmp, p))
3651636516+ {
3651736517+@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
3651836518+3651936519+ /* Polynomial to approximate log1p(r)/ln2. */
3652036520+ svfloat64_t logx = A (0);
3652136521+- logx = svmla_x (pg, A (1), r, logx);
3652236522+- logx = svmla_x (pg, A (2), r, logx);
3652336523+- logx = svmla_x (pg, A (3), r, logx);
3652436524+- logx = svmla_x (pg, y0, r, logx);
3652536525++ logx = svmad_x (pg, r, logx, A (1));
3652636526++ logx = svmad_x (pg, r, logx, A (2));
3652736527++ logx = svmad_x (pg, r, logx, A (3));
3652836528++ logx = svmad_x (pg, r, logx, y0);
3652936529+ *pylogx = svmul_x (pg, y, logx);
3653036530+3653136531+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
3653236532+- svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
3653336533+- svuint64_t ki = svreinterpret_u64 (kd);
3653436534+- kd = svsub_x (pg, kd, Shift);
3653536535++ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
3653636536++ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
3653736537+3653836538+ r = svsub_x (pg, *pylogx, kd);
3653936539+3654036540+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
3654136541+- svuint64_t t
3654236542+- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
3654336543+- svuint64_t ski = svadd_x (pg, ki, sign_bias);
3654436544+- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
3654536545++ svuint64_t t = svld1_gather_index (
3654636546++ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
3654736547++ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
3654836548++ t = svadd_x (svptrue_b64 (), t,
3654936549++ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
3655036550+ svfloat64_t s = svreinterpret_f64 (t);
3655136551+3655236552+ svfloat64_t p = C (0);
3655336553+ p = svmla_x (pg, C (1), p, r);
3655436554+ p = svmla_x (pg, C (2), p, r);
3655536555+- p = svmla_x (pg, s, p, svmul_x (pg, s, r));
3655636556++ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
3655736557+3655836558+ return p;
3655936559+ }
3656036560+@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
3656136561+ {
3656236562+ const svbool_t ptrue = svptrue_b64 ();
3656336563+3656436564+- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
3656536565+- order to perform core computation in double precision. */
3656636566++ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
3656736567++ * in order to perform core computation in double precision. */
3656836568+ const svbool_t pg_lo = svunpklo (pg);
3656936569+ const svbool_t pg_hi = svunpkhi (pg);
3657036570+- svfloat64_t y_lo = svcvt_f64_x (
3657136571+- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
3657236572+- svfloat64_t y_hi = svcvt_f64_x (
3657336573+- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
3657436574+- svfloat32_t z = svreinterpret_f32 (iz);
3657536575+- svfloat64_t z_lo = svcvt_f64_x (
3657636576+- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
3657736577+- svfloat64_t z_hi = svcvt_f64_x (
3657836578+- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
3657936579++ svfloat64_t y_lo
3658036580++ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
3658136581++ svfloat64_t y_hi
3658236582++ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
3658336583++ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
3658436584++ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
3658536585+ svuint64_t i_lo = svunpklo (i);
3658636586+ svuint64_t i_hi = svunpkhi (i);
3658736587+ svint64_t k_lo = svunpklo (k);
3658836588+@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
3658936589+ /* Implementation of SVE powf.
3659036590+ Provides the same accuracy as AdvSIMD powf, since it relies on the same
3659136591+ algorithm. The theoretical maximum error is under 2.60 ULPs.
3659236592+- Maximum measured error is 2.56 ULPs:
3659336593+- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
3659436594+- want 0x1.fd4b06p+127. */
3659536595++ Maximum measured error is 2.57 ULPs:
3659636596++ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
3659736597++ want 0x1.fff862p+127. */
3659836598+ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
3659936599+ {
3660036600+ const struct data *d = ptr_barrier (&data);
3660136601+@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
3660236602+ svuint32_t viy0 = svreinterpret_u32 (y);
3660336603+3660436604+ /* Negative x cases. */
3660536605+- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
3660636606+- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
3660736607++ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
3660836608+3660936609+ /* Set sign_bias and ix depending on sign of x and nature of y. */
3661036610+- svbool_t yisnotint_xisneg = svpfalse_b ();
3661136611++ svbool_t yint_or_xpos = pg;
3661236612+ svuint32_t sign_bias = sv_u32 (0);
3661336613+ svuint32_t vix = vix0;
3661436614+ if (__glibc_unlikely (svptest_any (pg, xisneg)))
3661536615+ {
3661636616+ /* Determine nature of y. */
3661736617+- yisnotint_xisneg = svisnotint (xisneg, y);
3661836618+- svbool_t yisint_xisneg = svisint (xisneg, y);
3661936619++ yint_or_xpos = svisint (xisneg, y);
3662036620+ svbool_t yisodd_xisneg = svisodd (xisneg, y);
3662136621+ /* ix set to abs(ix) if y is integer. */
3662236622+- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
3662336623++ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
3662436624+ /* Set to SignBias if x is negative and y is odd. */
3662536625+ sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
3662636626+ }
3662736627+@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
3662836628+ svbool_t cmp = svorr_z (pg, xspecial, yspecial);
3662936629+3663036630+ /* Small cases of x: |x| < 0x1p-126. */
3663136631+- svbool_t xsmall = svaclt (pg, x, d->small_bound);
3663236632+- if (__glibc_unlikely (svptest_any (pg, xsmall)))
3663336633++ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
3663436634++ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
3663536635+ {
3663636636+ /* Normalize subnormal x so exponent becomes negative. */
3663736637+ svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
3663836638+@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
3663936639+ vix = svsel (xsmall, vix_norm, vix);
3664036640+ }
3664136641+ /* Part of core computation carried in working precision. */
3664236642+- svuint32_t tmp = svsub_x (pg, vix, d->off);
3664336643+- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
3664436644+- V_POWF_LOG2_N - 1);
3664536645+- svuint32_t top = svand_x (pg, tmp, 0xff800000);
3664636646+- svuint32_t iz = svsub_x (pg, vix, top);
3664736647+- svint32_t k
3664836648+- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
3664936649+-
3665036650+- /* Compute core in extended precision and return intermediate ylogx results to
3665136651+- handle cases of underflow and underflow in exp. */
3665236652++ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
3665336653++ svuint32_t i = svand_x (
3665436654++ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
3665536655++ V_POWF_LOG2_N - 1);
3665636656++ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
3665736657++ svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
3665836658++ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
3665936659++ (23 - V_POWF_EXP2_TABLE_BITS));
3666036660++
3666136661++ /* Compute core in extended precision and return intermediate ylogx results
3666236262++ * to handle cases of underflow and overflow in exp. */
3666336663+ svfloat32_t ylogx;
3666436664+- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
3666536665++ svfloat32_t ret
3666636666++ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
3666736667+3666836668+ /* Handle exp special cases of underflow and overflow. */
3666936669+- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
3667036670++ svuint32_t sign
3667136671++ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
3667236672+ svfloat32_t ret_oflow
3667336673+- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
3667436674++ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
3667536675+ svfloat32_t ret_uflow = svreinterpret_f32 (sign);
3667636676+- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
3667736677+- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
3667836678++ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
3667936679++ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
3668036680+3668136681+ /* Cases of finite y and finite negative x. */
3668236682+- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
3668336683++ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
3668436684+3668536685+- if (__glibc_unlikely (svptest_any (pg, cmp)))
3668636686+- return sv_call_powf_sc (x, y, ret, cmp);
3668736687++ if (__glibc_unlikely (svptest_any (cmp, cmp)))
3668836688++ return sv_call_powf_sc (x, y, ret);
3668936689+3669036690+ return ret;
3669136691+ }
3669236692+3669336693+commit fd9a3a36fdcf14d1678c469e8b9033a46aa6c6fb
3669436694+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3669536695+Date: Thu Feb 27 20:34:34 2025 +0000
3669636696+3669736697+ Revert "AArch64: Add vector logp1 alias for log1p"
3669836698+3669936699+ This reverts commit a991a0fc7c051d7ef2ea7778e0a699f22d4e53d7.
3670036700+3670136701+diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
3670236702+index 5019e8e25c..08a41c46ad 100644
3670336703+--- a/bits/libm-simd-decl-stubs.h
3670436704++++ b/bits/libm-simd-decl-stubs.h
3670536705+@@ -253,17 +253,6 @@
3670636706+ #define __DECL_SIMD_log1pf64x
3670736707+ #define __DECL_SIMD_log1pf128x
3670836708+3670936709+-#define __DECL_SIMD_logp1
3671036710+-#define __DECL_SIMD_logp1f
3671136711+-#define __DECL_SIMD_logp1l
3671236712+-#define __DECL_SIMD_logp1f16
3671336713+-#define __DECL_SIMD_logp1f32
3671436714+-#define __DECL_SIMD_logp1f64
3671536715+-#define __DECL_SIMD_logp1f128
3671636716+-#define __DECL_SIMD_logp1f32x
3671736717+-#define __DECL_SIMD_logp1f64x
3671836718+-#define __DECL_SIMD_logp1f128x
3671936719+-
3672036720+ #define __DECL_SIMD_atanh
3672136721+ #define __DECL_SIMD_atanhf
3672236722+ #define __DECL_SIMD_atanhl
3672336723+diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
3672436724+index 92856becc4..6cb594b6ff 100644
3672536725+--- a/math/bits/mathcalls.h
3672636726++++ b/math/bits/mathcalls.h
3672736727+@@ -126,7 +126,7 @@ __MATHCALL (log2p1,, (_Mdouble_ __x));
3672836728+ __MATHCALL (log10p1,, (_Mdouble_ __x));
3672936729+3673036730+ /* Return log(1 + X). */
3673136731+-__MATHCALL_VEC (logp1,, (_Mdouble_ __x));
3673236732++__MATHCALL (logp1,, (_Mdouble_ __x));
3673336733+ #endif
3673436734+3673536735+ #if defined __USE_XOPEN_EXTENDED || defined __USE_ISOC99
3673636736+diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
3673736737+index 015211f5f4..cc15ce2d1e 100644
3673836738+--- a/sysdeps/aarch64/fpu/Versions
3673936739++++ b/sysdeps/aarch64/fpu/Versions
3674036740+@@ -135,11 +135,4 @@ libmvec {
3674136741+ _ZGVsMxv_tanh;
3674236742+ _ZGVsMxv_tanhf;
3674336743+ }
3674436744+- GLIBC_2.41 {
3674536745+- _ZGVnN2v_logp1;
3674636746+- _ZGVnN2v_logp1f;
3674736747+- _ZGVnN4v_logp1f;
3674836748+- _ZGVsMxv_logp1;
3674936749+- _ZGVsMxv_logp1f;
3675036750+- }
3675136751+ }
3675236752+diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
3675336753+index 5909bb4ce9..097d403ffe 100644
3675436754+--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
3675536755++++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
3675636756+@@ -36,7 +36,6 @@ libmvec_hidden_proto (V_NAME_F2(hypot));
3675736757+ libmvec_hidden_proto (V_NAME_F1(log10));
3675836758+ libmvec_hidden_proto (V_NAME_F1(log1p));
3675936759+ libmvec_hidden_proto (V_NAME_F1(log2));
3676036760+-libmvec_hidden_proto (V_NAME_F1(logp1));
3676136761+ libmvec_hidden_proto (V_NAME_F1(log));
3676236762+ libmvec_hidden_proto (V_NAME_F2(pow));
3676336763+ libmvec_hidden_proto (V_NAME_F1(sin));
3676436764+diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
3676536765+index f295fe185d..7484150131 100644
3676636766+--- a/sysdeps/aarch64/fpu/bits/math-vector.h
3676736767++++ b/sysdeps/aarch64/fpu/bits/math-vector.h
3676836768+@@ -113,10 +113,6 @@
3676936769+ # define __DECL_SIMD_log2 __DECL_SIMD_aarch64
3677036770+ # undef __DECL_SIMD_log2f
3677136771+ # define __DECL_SIMD_log2f __DECL_SIMD_aarch64
3677236772+-# undef __DECL_SIMD_logp1
3677336773+-# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64
3677436774+-# undef __DECL_SIMD_logp1f
3677536775+-# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64
3677636776+ # undef __DECL_SIMD_pow
3677736777+ # define __DECL_SIMD_pow __DECL_SIMD_aarch64
3677836778+ # undef __DECL_SIMD_powf
3677936779+@@ -184,7 +180,6 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
3678036780+ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
3678136781+ __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
3678236782+ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
3678336783+-__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
3678436784+ __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
3678536785+ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
3678636786+ __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
3678736787+@@ -212,7 +207,6 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
3678836788+ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
3678936789+ __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
3679036790+ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
3679136791+-__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
3679236792+ __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
3679336793+ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
3679436794+ __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
3679536795+@@ -245,7 +239,6 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
3679636796+ __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
3679736797+ __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
3679836798+ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
3679936799+-__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
3680036800+ __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
3680136801+ __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
3680236802+ __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
3680336803+@@ -273,7 +266,6 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
3680436804+ __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
3680536805+ __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
3680636806+ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
3680736807+-__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
3680836808+ __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
3680936809+ __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
3681036810+ __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
3681136811+diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
3681236812+index 1263587201..9d18578ce6 100644
3681336813+--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
3681436814++++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
3681536815+@@ -58,5 +58,3 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
3681636816+3681736817+ return log1p_inline (x, &d->d);
3681836818+ }
3681936819+-
3682036820+-strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1))
3682136821+diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
3682236822+index b21cfb2c90..04f7e5720e 100644
3682336823+--- a/sysdeps/aarch64/fpu/log1p_sve.c
3682436824++++ b/sysdeps/aarch64/fpu/log1p_sve.c
3682536825+@@ -116,5 +116,3 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
3682636826+3682736827+ return y;
3682836828+ }
3682936829+-
3683036830+-strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1))
3683136831+diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
3683236832+index 00006fc703..f2d47962fe 100644
3683336833+--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
3683436834++++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
3683536835+@@ -93,6 +93,3 @@ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
3683636836+3683736837+ libmvec_hidden_def (V_NAME_F1 (log1p))
3683836838+ HALF_WIDTH_ALIAS_F1 (log1p)
3683936839+-strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
3684036840+-libmvec_hidden_def (V_NAME_F1 (logp1))
3684136841+-HALF_WIDTH_ALIAS_F1 (logp1)
3684236842+diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
3684336843+index 18a185c838..4f17c44e2d 100644
3684436844+--- a/sysdeps/aarch64/fpu/log1pf_sve.c
3684536845++++ b/sysdeps/aarch64/fpu/log1pf_sve.c
3684636846+@@ -42,5 +42,3 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
3684736847+3684836848+ return sv_log1pf_inline (x, pg);
3684936849+ }
3685036850+-
3685136851+-strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1))
3685236852+diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
3685336853+index 98687cae0d..b685106954 100644
3685436854+--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
3685536855++++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
3685636856+@@ -128,8 +128,3 @@ GLIBC_2.40 _ZGVsMxvv_hypot F
3685736857+ GLIBC_2.40 _ZGVsMxvv_hypotf F
3685836858+ GLIBC_2.40 _ZGVsMxvv_pow F
3685936859+ GLIBC_2.40 _ZGVsMxvv_powf F
3686036860+-GLIBC_2.41 _ZGVnN2v_logp1 F
3686136861+-GLIBC_2.41 _ZGVnN2v_logp1f F
3686236862+-GLIBC_2.41 _ZGVnN4v_logp1f F
3686336863+-GLIBC_2.41 _ZGVsMxv_logp1 F
3686436864+-GLIBC_2.41 _ZGVsMxv_logp1f F
3686536865+3686636866+commit 64896b7d329809127035fde42768a6f7eeffed75
3686736867+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3686836868+Date: Wed Aug 7 14:43:47 2024 +0100
3686936869+3687036870+ AArch64: Improve generic strlen
3687136871+3687236872+ Improve performance by handling another 16 bytes before entering the loop.
3687336873+ Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
3687436874+ size computation to avoid increasing latency. On Neoverse V1 performance
3687536875+ of the random strlen benchmark improves by 4.6%.
3687636876+3687736877+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
3687836878+ (cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
3687936879+3688036880+diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
3688136881+index ab2a576cdb..352fb40d3a 100644
3688236882+--- a/sysdeps/aarch64/strlen.S
3688336883++++ b/sysdeps/aarch64/strlen.S
3688436884+@@ -1,4 +1,5 @@
3688536885+-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
3688636886++/* Generic optimized strlen using SIMD.
3688736887++ Copyright (C) 2012-2024 Free Software Foundation, Inc.
3688836888+3688936889+ This file is part of the GNU C Library.
3689036890+3689136891+@@ -56,36 +57,50 @@ ENTRY (STRLEN)
3689236892+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
3689336893+ fmov synd, dend
3689436894+ lsr synd, synd, shift
3689536895+- cbz synd, L(loop)
3689636896++ cbz synd, L(next16)
3689736897+3689836898+ rbit synd, synd
3689936899+ clz result, synd
3690036900+ lsr result, result, 2
3690136901+ ret
3690236902+3690336903++L(next16):
3690436904++ ldr data, [src, 16]
3690536905++ cmeq vhas_nul.16b, vdata.16b, 0
3690636906++ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
3690736907++ fmov synd, dend
3690836908++ cbz synd, L(loop)
3690936909++ add src, src, 16
3691036910++#ifndef __AARCH64EB__
3691136911++ rbit synd, synd
3691236912++#endif
3691336913++ sub result, src, srcin
3691436914++ clz tmp, synd
3691536915++ add result, result, tmp, lsr 2
3691636916++ ret
3691736917++
3691836918+ .p2align 5
3691936919+ L(loop):
3692036920+- ldr data, [src, 16]
3692136921++ ldr data, [src, 32]!
3692236922+ cmeq vhas_nul.16b, vdata.16b, 0
3692336923+- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
3692436924++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
3692536925+ fmov synd, dend
3692636926+ cbnz synd, L(loop_end)
3692736927+- ldr data, [src, 32]!
3692836928++ ldr data, [src, 16]
3692936929+ cmeq vhas_nul.16b, vdata.16b, 0
3693036930+- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
3693136931++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
3693236932+ fmov synd, dend
3693336933+ cbz synd, L(loop)
3693436934+- sub src, src, 16
3693536935++ add src, src, 16
3693636936+ L(loop_end):
3693736937+- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
3693836938+- sub result, src, srcin
3693936939+- fmov synd, dend
3694036940++ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
3694136941+ #ifndef __AARCH64EB__
3694236942+ rbit synd, synd
3694336943++ sub result, result, 3
3694436944+ #endif
3694536945+- add result, result, 16
3694636946+ clz tmp, synd
3694736947+- add result, result, tmp, lsr 2
3694836948++ sub result, tmp, result
3694936949++ lsr result, result, 2
3695036950+ ret
3695136951+3695236952+ END (STRLEN)
3695336953+3695436954+commit 544fb349d35efd5f86ed7e482759ff21496a32fd
3695536955+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3695636956+Date: Mon Sep 9 15:26:47 2024 +0100
3695736957+3695836958+ AArch64: Optimize memset
3695936959+3696036960+ Improve small memsets by avoiding branches and use overlapping stores.
3696136961+ Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes
3696236962+ other than 64 and 128. Performance of random memset benchmark improves by 24%
3696336963+ on Neoverse N1.
3696436964+3696536965+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
3696636966+ (cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
3696736967+3696836968+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
3696936969+index 7ef77ee8c9..caafb019e2 100644
3697036970+--- a/sysdeps/aarch64/memset.S
3697136971++++ b/sysdeps/aarch64/memset.S
3697236972+@@ -1,4 +1,5 @@
3697336973+-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
3697436974++/* Generic optimized memset using SIMD.
3697536975++ Copyright (C) 2012-2024 Free Software Foundation, Inc.
3697636976+3697736977+ This file is part of the GNU C Library.
3697836978+3697936979+@@ -17,7 +18,6 @@
3698036980+ <https://www.gnu.org/licenses/>. */
3698136981+3698236982+ #include <sysdep.h>
3698336983+-#include "memset-reg.h"
3698436984+3698536985+ #ifndef MEMSET
3698636986+ # define MEMSET memset
3698736987+@@ -25,130 +25,132 @@
3698836988+3698936989+ /* Assumptions:
3699036990+ *
3699136991+- * ARMv8-a, AArch64, unaligned accesses
3699236992++ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
3699336993+ *
3699436994+ */
3699536995+3699636996+-ENTRY (MEMSET)
3699736997++#define dstin x0
3699836998++#define val x1
3699936999++#define valw w1
3700037000++#define count x2
3700137001++#define dst x3
3700237002++#define dstend x4
3700337003++#define zva_val x5
3700437004++#define off x3
3700537005++#define dstend2 x5
3700637006+3700737007++ENTRY (MEMSET)
3700837008+ PTR_ARG (0)
3700937009+ SIZE_ARG (2)
3701037010+3701137011+ dup v0.16B, valw
3701237012++ cmp count, 16
3701337013++ b.lo L(set_small)
3701437014++
3701537015+ add dstend, dstin, count
3701637016++ cmp count, 64
3701737017++ b.hs L(set_128)
3701837018+3701937019+- cmp count, 96
3702037020+- b.hi L(set_long)
3702137021+- cmp count, 16
3702237022+- b.hs L(set_medium)
3702337023+- mov val, v0.D[0]
3702437024++ /* Set 16..63 bytes. */
3702537025++ mov off, 16
3702637026++ and off, off, count, lsr 1
3702737027++ sub dstend2, dstend, off
3702837028++ str q0, [dstin]
3702937029++ str q0, [dstin, off]
3703037030++ str q0, [dstend2, -16]
3703137031++ str q0, [dstend, -16]
3703237032++ ret
3703337033+3703437034++ .p2align 4
3703537035+ /* Set 0..15 bytes. */
3703637036+- tbz count, 3, 1f
3703737037+- str val, [dstin]
3703837038+- str val, [dstend, -8]
3703937039+- ret
3704037040+- nop
3704137041+-1: tbz count, 2, 2f
3704237042+- str valw, [dstin]
3704337043+- str valw, [dstend, -4]
3704437044++L(set_small):
3704537045++ add dstend, dstin, count
3704637046++ cmp count, 4
3704737047++ b.lo 2f
3704837048++ lsr off, count, 3
3704937049++ sub dstend2, dstend, off, lsl 2
3705037050++ str s0, [dstin]
3705137051++ str s0, [dstin, off, lsl 2]
3705237052++ str s0, [dstend2, -4]
3705337053++ str s0, [dstend, -4]
3705437054+ ret
3705537055++
3705637056++ /* Set 0..3 bytes. */
3705737057+ 2: cbz count, 3f
3705837058++ lsr off, count, 1
3705937059+ strb valw, [dstin]
3706037060+- tbz count, 1, 3f
3706137061+- strh valw, [dstend, -2]
3706237062++ strb valw, [dstin, off]
3706337063++ strb valw, [dstend, -1]
3706437064+ 3: ret
3706537065+3706637066+- /* Set 17..96 bytes. */
3706737067+-L(set_medium):
3706837068+- str q0, [dstin]
3706937069+- tbnz count, 6, L(set96)
3707037070+- str q0, [dstend, -16]
3707137071+- tbz count, 5, 1f
3707237072+- str q0, [dstin, 16]
3707337073+- str q0, [dstend, -32]
3707437074+-1: ret
3707537075+-
3707637076+ .p2align 4
3707737077+- /* Set 64..96 bytes. Write 64 bytes from the start and
3707837078+- 32 bytes from the end. */
3707937079+-L(set96):
3708037080+- str q0, [dstin, 16]
3708137081++L(set_128):
3708237082++ bic dst, dstin, 15
3708337083++ cmp count, 128
3708437084++ b.hi L(set_long)
3708537085++ stp q0, q0, [dstin]
3708637086+ stp q0, q0, [dstin, 32]
3708737087++ stp q0, q0, [dstend, -64]
3708837088+ stp q0, q0, [dstend, -32]
3708937089+ ret
3709037090+3709137091+- .p2align 3
3709237092+- nop
3709337093++ .p2align 4
3709437094+ L(set_long):
3709537095+- and valw, valw, 255
3709637096+- bic dst, dstin, 15
3709737097+ str q0, [dstin]
3709837098+- cmp count, 256
3709937099+- ccmp valw, 0, 0, cs
3710037100+- b.eq L(try_zva)
3710137101+-L(no_zva):
3710237102+- sub count, dstend, dst /* Count is 16 too large. */
3710337103+- sub dst, dst, 16 /* Dst is biased by -32. */
3710437104+- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
3710537105+-1: stp q0, q0, [dst, 32]
3710637106+- stp q0, q0, [dst, 64]!
3710737107+-L(tail64):
3710837108+- subs count, count, 64
3710937109+- b.hi 1b
3711037110+-2: stp q0, q0, [dstend, -64]
3711137111++ str q0, [dst, 16]
3711237112++ tst valw, 255
3711337113++ b.ne L(no_zva)
3711437114++#ifndef ZVA64_ONLY
3711537115++ mrs zva_val, dczid_el0
3711637116++ and zva_val, zva_val, 31
3711737117++ cmp zva_val, 4 /* ZVA size is 64 bytes. */
3711837118++ b.ne L(zva_128)
3711937119++#endif
3712037120++ stp q0, q0, [dst, 32]
3712137121++ bic dst, dstin, 63
3712237122++ sub count, dstend, dst /* Count is now 64 too large. */
3712337123++ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
3712437124++
3712537125++ /* Write last bytes before ZVA loop. */
3712637126++ stp q0, q0, [dstend, -64]
3712737127+ stp q0, q0, [dstend, -32]
3712837128++
3712937129++ .p2align 4
3713037130++L(zva64_loop):
3713137131++ add dst, dst, 64
3713237132++ dc zva, dst
3713337133++ subs count, count, 64
3713437134++ b.hi L(zva64_loop)
3713537135+ ret
3713637136+3713737137+-L(try_zva):
3713837138+-#ifndef ZVA64_ONLY
3713937139+ .p2align 3
3714037140+- mrs tmp1, dczid_el0
3714137141+- tbnz tmp1w, 4, L(no_zva)
3714237142+- and tmp1w, tmp1w, 15
3714337143+- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
3714437144+- b.ne L(zva_128)
3714537145+- nop
3714637146+-#endif
3714737147+- /* Write the first and last 64 byte aligned block using stp rather
3714837148+- than using DC ZVA. This is faster on some cores.
3714937149+- */
3715037150+- .p2align 4
3715137151+-L(zva_64):
3715237152+- str q0, [dst, 16]
3715337153++L(no_zva):
3715437154++ sub count, dstend, dst /* Count is 32 too large. */
3715537155++ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
3715637156++L(no_zva_loop):
3715737157+ stp q0, q0, [dst, 32]
3715837158+- bic dst, dst, 63
3715937159+ stp q0, q0, [dst, 64]
3716037160+- stp q0, q0, [dst, 96]
3716137161+- sub count, dstend, dst /* Count is now 128 too large. */
3716237162+- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
3716337163+- add dst, dst, 128
3716437164+-1: dc zva, dst
3716537165+ add dst, dst, 64
3716637166+ subs count, count, 64
3716737167+- b.hi 1b
3716837168+- stp q0, q0, [dst, 0]
3716937169+- stp q0, q0, [dst, 32]
3717037170++ b.hi L(no_zva_loop)
3717137171+ stp q0, q0, [dstend, -64]
3717237172+ stp q0, q0, [dstend, -32]
3717337173+ ret
3717437174+3717537175+ #ifndef ZVA64_ONLY
3717637176+- .p2align 3
3717737177++ .p2align 4
3717837178+ L(zva_128):
3717937179+- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
3718037180+- b.ne L(zva_other)
3718137181++ cmp zva_val, 5 /* ZVA size is 128 bytes. */
3718237182++ b.ne L(no_zva)
3718337183+3718437184+- str q0, [dst, 16]
3718537185+ stp q0, q0, [dst, 32]
3718637186+ stp q0, q0, [dst, 64]
3718737187+ stp q0, q0, [dst, 96]
3718837188+ bic dst, dst, 127
3718937189+ sub count, dstend, dst /* Count is now 128 too large. */
3719037190+- sub count, count, 128+128 /* Adjust count and bias for loop. */
3719137191+- add dst, dst, 128
3719237192+-1: dc zva, dst
3719337193+- add dst, dst, 128
3719437194++ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
3719537195++1: add dst, dst, 128
3719637196++ dc zva, dst
3719737197+ subs count, count, 128
3719837198+ b.hi 1b
3719937199+ stp q0, q0, [dstend, -128]
3720037200+@@ -156,35 +158,6 @@ L(zva_128):
3720137201+ stp q0, q0, [dstend, -64]
3720237202+ stp q0, q0, [dstend, -32]
3720337203+ ret
3720437204+-
3720537205+-L(zva_other):
3720637206+- mov tmp2w, 4
3720737207+- lsl zva_lenw, tmp2w, tmp1w
3720837208+- add tmp1, zva_len, 64 /* Max alignment bytes written. */
3720937209+- cmp count, tmp1
3721037210+- blo L(no_zva)
3721137211+-
3721237212+- sub tmp2, zva_len, 1
3721337213+- add tmp1, dst, zva_len
3721437214+- add dst, dst, 16
3721537215+- subs count, tmp1, dst /* Actual alignment bytes to write. */
3721637216+- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
3721737217+- beq 2f
3721837218+-1: stp q0, q0, [dst], 64
3721937219+- stp q0, q0, [dst, -32]
3722037220+- subs count, count, 64
3722137221+- b.hi 1b
3722237222+-2: mov dst, tmp1
3722337223+- sub count, dstend, tmp1 /* Remaining bytes to write. */
3722437224+- subs count, count, zva_len
3722537225+- b.lo 4f
3722637226+-3: dc zva, dst
3722737227+- add dst, dst, zva_len
3722837228+- subs count, count, zva_len
3722937229+- b.hs 3b
3723037230+-4: add count, count, zva_len
3723137231+- sub dst, dst, 32 /* Bias dst for tail loop. */
3723237232+- b L(tail64)
3723337233+ #endif
3723437234+3723537235+ END (MEMSET)
3723637236+3723737237+commit 41eb2f8b5847079caca90a74659456adbb80ec29
3723837238+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3723937239+Date: Mon Nov 25 18:43:08 2024 +0000
3724037240+3724137241+ AArch64: Remove zva_128 from memset
3724237242+3724337243+ Remove ZVA 128 support from memset - the new memset no longer
3724437244+ guarantees count >= 256, which can result in underflow and a
3724537245+ crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
3724637246+ size of 128 and its memcpy implementation was removed in commit
3724737247+ e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
3724837248+ case too.
3724937249+3725037250+ [1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
3725137251+3725237252+ Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
3725337253+ (cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
3725437254+3725537255+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
3725637256+index caafb019e2..71814d0b2f 100644
3725737257+--- a/sysdeps/aarch64/memset.S
3725837258++++ b/sysdeps/aarch64/memset.S
3725937259+@@ -104,7 +104,7 @@ L(set_long):
3726037260+ mrs zva_val, dczid_el0
3726137261+ and zva_val, zva_val, 31
3726237262+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
3726337263+- b.ne L(zva_128)
3726437264++ b.ne L(no_zva)
3726537265+ #endif
3726637266+ stp q0, q0, [dst, 32]
3726737267+ bic dst, dstin, 63
3726837268+@@ -137,28 +137,5 @@ L(no_zva_loop):
3726937269+ stp q0, q0, [dstend, -32]
3727037270+ ret
3727137271+3727237272+-#ifndef ZVA64_ONLY
3727337273+- .p2align 4
3727437274+-L(zva_128):
3727537275+- cmp zva_val, 5 /* ZVA size is 128 bytes. */
3727637276+- b.ne L(no_zva)
3727737277+-
3727837278+- stp q0, q0, [dst, 32]
3727937279+- stp q0, q0, [dst, 64]
3728037280+- stp q0, q0, [dst, 96]
3728137281+- bic dst, dst, 127
3728237282+- sub count, dstend, dst /* Count is now 128 too large. */
3728337283+- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
3728437284+-1: add dst, dst, 128
3728537285+- dc zva, dst
3728637286+- subs count, count, 128
3728737287+- b.hi 1b
3728837288+- stp q0, q0, [dstend, -128]
3728937289+- stp q0, q0, [dstend, -96]
3729037290+- stp q0, q0, [dstend, -64]
3729137291+- stp q0, q0, [dstend, -32]
3729237292+- ret
3729337293+-#endif
3729437294+-
3729537295+ END (MEMSET)
3729637296+ libc_hidden_builtin_def (MEMSET)
3729737297+3729837298+commit 27fa0268ead054810a5e2669d0b5bb88ceb05b05
3729937299+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3730037300+Date: Wed Jul 24 15:17:47 2024 +0100
3730137301+3730237302+ math: Improve layout of expf data
3730337303+3730437304+ GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
3730537305+ changes the exp2f_data struct slightly so that the fields are better aligned.
3730637306+ As a result on targets that support them, load-pair instructions accessing
3730737307+ poly_scaled and invln2_scaled are now 16-byte aligned.
3730837308+3730937309+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
3731037310+ (cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
3731137311+3731237312+diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
3731337313+index 729f22cd4f..dc07ebd459 100644
3731437314+--- a/sysdeps/ieee754/flt-32/math_config.h
3731537315++++ b/sysdeps/ieee754/flt-32/math_config.h
3731637316+@@ -166,9 +166,9 @@ extern const struct exp2f_data
3731737317+ uint64_t tab[1 << EXP2F_TABLE_BITS];
3731837318+ double shift_scaled;
3731937319+ double poly[EXP2F_POLY_ORDER];
3732037320+- double shift;
3732137321+ double invln2_scaled;
3732237322+ double poly_scaled[EXP2F_POLY_ORDER];
3732337323++ double shift;
3732437324+ } __exp2f_data attribute_hidden;
3732537325+3732637326+ #define LOGF_TABLE_BITS 4
3732737327+3732837328+commit 7038970f1f485fb660606f0c596f432fdef250f6
3732937329+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3733037330+Date: Tue Dec 24 18:01:59 2024 +0000
3733137331+3733237332+ AArch64: Add SVE memset
3733337333+ Add SVE memset based on the generic memset with predicated store for sizes < 16.
3733537335+ Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned
3733637336+ stores for the last 64 bytes. Performance of random memset benchmark improves
3733737337+ by ~2% on Neoverse V1.
3733837338+3733937339+ Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
3734037340+ (cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
3734137341+3734237342+diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
3734337343+index 3e251cc234..6880ebc035 100644
3734437344+--- a/sysdeps/aarch64/multiarch/Makefile
3734537345++++ b/sysdeps/aarch64/multiarch/Makefile
3734637346+@@ -16,6 +16,7 @@ sysdep_routines += \
3734737347+ memset_kunpeng \
3734837348+ memset_mops \
3734937349+ memset_oryon1 \
3735037350++ memset_sve_zva64 \
3735137351+ memset_zva64 \
3735237352+ strlen_asimd \
3735337353+ strlen_generic \
3735437354+diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
3735537355+index b2fda541f9..1f101a719b 100644
3735637356+--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
3735737357++++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
3735837358+@@ -61,6 +61,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
3735937359+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
3736037360+ #if HAVE_AARCH64_SVE_ASM
3736137361+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
3736237362++ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
3736337363+ #endif
3736437364+ IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
3736537365+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
3736637366+diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
3736737367+index bd063c16c9..4f65295e77 100644
3736837368+--- a/sysdeps/aarch64/multiarch/memset.c
3736937369++++ b/sysdeps/aarch64/multiarch/memset.c
3737037370+@@ -36,6 +36,7 @@ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
3737137371+ extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
3737237372+ extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
3737337373+ extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden;
3737437374++extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
3737537375+3737637376+ static inline __typeof (__redirect_memset) *
3737737377+ select_memset_ifunc (void)
3737837378+@@ -49,6 +50,9 @@ select_memset_ifunc (void)
3737937379+ {
3738037380+ if (IS_A64FX (midr) && zva_size == 256)
3738137381+ return __memset_a64fx;
3738237382++
3738337383++ if (zva_size == 64)
3738437384++ return __memset_sve_zva64;
3738537385+ }
3738637386+3738737387+ if (IS_ORYON1 (midr) && zva_size == 64)
3738837388+diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
3738937389+new file mode 100644
3739037390+index 0000000000..7fb40fdd9e
3739137391+--- /dev/null
3739237392++++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
3739337393+@@ -0,0 +1,123 @@
3739437394++/* Optimized memset for SVE.
3739537395++ Copyright (C) 2025 Free Software Foundation, Inc.
3739637396++
3739737397++ This file is part of the GNU C Library.
3739837398++
3739937399++ The GNU C Library is free software; you can redistribute it and/or
3740037400++ modify it under the terms of the GNU Lesser General Public
3740137401++ License as published by the Free Software Foundation; either
3740237402++ version 2.1 of the License, or (at your option) any later version.
3740337403++
3740437404++ The GNU C Library is distributed in the hope that it will be useful,
3740537405++ but WITHOUT ANY WARRANTY; without even the implied warranty of
3740637406++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3740737407++ Lesser General Public License for more details.
3740837408++
3740937409++ You should have received a copy of the GNU Lesser General Public
3741037410++ License along with the GNU C Library. If not, see
3741137411++ <https://www.gnu.org/licenses/>. */
3741237412++
3741337413++#include <sysdep.h>
3741437414++
3741537415++/* Assumptions:
3741637416++ *
3741737417++ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
3741837418++ * ZVA size is 64.
3741937419++ */
3742037420++
3742137421++#if HAVE_AARCH64_SVE_ASM
3742237422++
3742337423++.arch armv8.2-a+sve
3742437424++
3742537425++#define dstin x0
3742637426++#define val x1
3742737427++#define valw w1
3742837428++#define count x2
3742937429++#define dst x3
3743037430++#define dstend x4
3743137431++#define zva_val x5
3743237432++#define vlen x5
3743337433++#define off x3
3743437434++#define dstend2 x5
3743537435++
3743637436++ENTRY (__memset_sve_zva64)
3743737437++ dup v0.16B, valw
3743837438++ cmp count, 16
3743937439++ b.lo L(set_16)
3744037440++
3744137441++ add dstend, dstin, count
3744237442++ cmp count, 64
3744337443++ b.hs L(set_128)
3744437444++
3744537445++ /* Set 16..63 bytes. */
3744637446++ mov off, 16
3744737447++ and off, off, count, lsr 1
3744837448++ sub dstend2, dstend, off
3744937449++ str q0, [dstin]
3745037450++ str q0, [dstin, off]
3745137451++ str q0, [dstend2, -16]
3745237452++ str q0, [dstend, -16]
3745337453++ ret
3745437454++
3745537455++ .p2align 4
3745637456++L(set_16):
3745737457++ whilelo p0.b, xzr, count
3745837458++ st1b z0.b, p0, [dstin]
3745937459++ ret
3746037460++
3746137461++ .p2align 4
3746237462++L(set_128):
3746337463++ bic dst, dstin, 15
3746437464++ cmp count, 128
3746537465++ b.hi L(set_long)
3746637466++ stp q0, q0, [dstin]
3746737467++ stp q0, q0, [dstin, 32]
3746837468++ stp q0, q0, [dstend, -64]
3746937469++ stp q0, q0, [dstend, -32]
3747037470++ ret
3747137471++
3747237472++ .p2align 4
3747337473++L(set_long):
3747437474++ cmp count, 256
3747537475++ b.lo L(no_zva)
3747637476++ tst valw, 255
3747737477++ b.ne L(no_zva)
3747837478++
3747937479++ str q0, [dstin]
3748037480++ str q0, [dst, 16]
3748137481++ bic dst, dstin, 31
3748237482++ stp q0, q0, [dst, 32]
3748337483++ bic dst, dstin, 63
3748437484++ sub count, dstend, dst /* Count is now 64 too large. */
3748537485++ sub count, count, 128 /* Adjust count and bias for loop. */
3748637486++
3748737487++ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
3748837488++ bic x8, x8, 15
3748937489++ stp q0, q0, [x8, -48]
3749037490++ str q0, [x8, -16]
3749137491++ str q0, [dstend, -16]
3749237492++
3749337493++ .p2align 4
3749437494++L(zva64_loop):
3749537495++ add dst, dst, 64
3749637496++ dc zva, dst
3749737497++ subs count, count, 64
3749837498++ b.hi L(zva64_loop)
3749937499++ ret
3750037500++
3750137501++L(no_zva):
3750237502++ str q0, [dstin]
3750337503++ sub count, dstend, dst /* Count is 16 too large. */
3750437504++ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
3750537505++L(no_zva_loop):
3750637506++ stp q0, q0, [dst, 16]
3750737507++ stp q0, q0, [dst, 48]
3750837508++ add dst, dst, 64
3750937509++ subs count, count, 64
3751037510++ b.hi L(no_zva_loop)
3751137511++ stp q0, q0, [dstend, -64]
3751237512++ stp q0, q0, [dstend, -32]
3751337513++ ret
3751437514++
3751537515++END (__memset_sve_zva64)
3751637516++#endif
3751737517+3751837518+commit d6175a44e95fe443d0fbfed37a9ff7424f1e2661
3751937519+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3752037520+Date: Thu Feb 27 16:28:52 2025 +0000
3752137521+3752237522+ AArch64: Use prefer_sve_ifuncs for SVE memset
3752337523+3752437524+ Use prefer_sve_ifuncs for SVE memset just like memcpy.
3752537525+3752637526+ Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
3752737527+ (cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
3752837528+3752937529+diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
3753037530+index 4f65295e77..bb1e865c97 100644
3753137531+--- a/sysdeps/aarch64/multiarch/memset.c
3753237532++++ b/sysdeps/aarch64/multiarch/memset.c
3753337533+@@ -51,7 +51,7 @@ select_memset_ifunc (void)
3753437534+ if (IS_A64FX (midr) && zva_size == 256)
3753537535+ return __memset_a64fx;
3753637536+3753737537+- if (zva_size == 64)
3753837538++ if (prefer_sve_ifuncs && zva_size == 64)
3753937539+ return __memset_sve_zva64;
3754037540+ }
3754137541+3754237542+3754337543+commit d8e8342369831808b00324790c8809ba33408ee7
3754437544+Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
3754537545+Date: Fri Dec 13 15:43:07 2024 +0000
3754637546+3754737547+ math: Improve layout of exp/exp10 data
3754837548+3754937549+ GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
3755037550+ changes the exp_data struct slightly so that the fields are better aligned
3755137551+ and without gaps. As a result on targets that support them, more load-pair
3755237552+ instructions are used in exp. Exp10 is improved by moving invlog10_2N later
3755337553+ so that neglog10_2hiN and neglog10_2loN can be loaded using load-pair.
3755437554+3755537555+ The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
3755637556+ Neoverse V2. Exp10 improves by 1.5%.
3755737557+3755837558+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
3755937559+ (cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
3756037560+3756137561+diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
3756237562+index ef87cfa6be..05515fd95a 100644
3756337563+--- a/sysdeps/ieee754/dbl-64/math_config.h
3756437564++++ b/sysdeps/ieee754/dbl-64/math_config.h
3756537565+@@ -195,16 +195,18 @@ check_uflow (double x)
3756637566+ extern const struct exp_data
3756737567+ {
3756837568+ double invln2N;
3756937569+- double shift;
3757037570+ double negln2hiN;
3757137571+ double negln2loN;
3757237572+ double poly[4]; /* Last four coefficients. */
3757337573++ double shift;
3757437574++
3757537575+ double exp2_shift;
3757637576+ double exp2_poly[EXP2_POLY_ORDER];
3757737577+- double invlog10_2N;
3757837578++
3757937579+ double neglog10_2hiN;
3758037580+ double neglog10_2loN;
3758137581+ double exp10_poly[5];
3758237582++ double invlog10_2N;
3758337583+ uint64_t tab[2*(1 << EXP_TABLE_BITS)];
3758437584+ } __exp_data attribute_hidden;
3758537585+3758637586+3758737587+commit 3e820e17a8cef84645d83b67abcbc3f88c7fd268
3758837588+Author: Michael Jeanson <mjeanson@efficios.com>
3758937589+Date: Fri Feb 14 13:54:22 2025 -0500
3759037590+3759137591+ nptl: clear the whole rseq area before registration
3759337593+ Due to the extensible nature of the rseq area we can't explicitly
3759437594+ initialize fields that are not part of the ABI yet. It was agreed with
3759537595+ upstream that all new fields will be documented as zero initialized by
3759637596+ userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will
3759737597+ validate the content of all fields during registration.
3759837598+3759937599+ Replace the explicit field initialization with a memset of the whole
3760037600+ rseq area which will cover fields as they are added to future kernels.
3760137601+3760237602+ Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
3760337603+ Reviewed-by: Florian Weimer <fweimer@redhat.com>
3760437604+ (cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4)
3760537605+3760637606+diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
3760737607+index 7803e19fd1..ed10185e37 100644
3760837608+--- a/sysdeps/nptl/dl-tls_init_tp.c
3760937609++++ b/sysdeps/nptl/dl-tls_init_tp.c
3761037610+@@ -23,6 +23,7 @@
3761137611+ #include <tls.h>
3761237612+ #include <rseq-internal.h>
3761337613+ #include <thread_pointer.h>
3761437614++#include <dl-symbol-redir-ifunc.h>
3761537615+3761637616+ #define TUNABLE_NAMESPACE pthread
3761737617+ #include <dl-tunables.h>
3761837618+diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
3761937619+index ef3eab1fef..76de2b7ff0 100644
3762037620+--- a/sysdeps/unix/sysv/linux/rseq-internal.h
3762137621++++ b/sysdeps/unix/sysv/linux/rseq-internal.h
3762237622+@@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq)
3762337623+ but still expected size 32. */
3762437624+ size = RSEQ_AREA_SIZE_INITIAL;
3762537625+3762637626+- /* Initialize the rseq fields that are read by the kernel on
3762737627+- registration, there is no guarantee that struct pthread is
3762837628+- cleared on all architectures. */
3762937629++ /* Initialize the whole rseq area to zero prior to registration. */
3763037630++ memset (&self->rseq_area, 0, size);
3763137631++
3763237632++ /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by
3763337633++ the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */
3763437634+ THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
3763537635+- THREAD_SETMEM (self, rseq_area.cpu_id_start, 0);
3763637636+- THREAD_SETMEM (self, rseq_area.rseq_cs, 0);
3763737637+- THREAD_SETMEM (self, rseq_area.flags, 0);
3763837638+3763937639+ int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area,
3764037640+ size, 0, RSEQ_SIG);
3764137641+3764237642+commit ee1ab9302363066b49cf8862b96664ed35eda81c
3764337643+Author: Sunil K Pandey <skpgkp2@gmail.com>
3764437644+Date: Mon Mar 10 10:24:07 2025 -0700
3764537645+3764637646+ x86_64: Add tanh with FMA
3764737647+3764837648+ On Skylake, it improves tanh bench performance by:
3764937649+3765037650+ Before After Improvement
3765137651+ max 110.89 95.826 14%
3765237652+ min 20.966 20.157 4%
3765337653+ mean 30.9601 29.8431 4%
3765437654+3765537655+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3765637656+ (cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85)
3765737657+3765837658+diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c
3765937659+index 673a97102d..13063db04e 100644
3766037660+--- a/sysdeps/ieee754/dbl-64/s_tanh.c
3766137661++++ b/sysdeps/ieee754/dbl-64/s_tanh.c
3766237662+@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $";
3766337663+3766437664+ static const double one = 1.0, two = 2.0, tiny = 1.0e-300;
3766537665+3766637666++#ifndef SECTION
3766737667++# define SECTION
3766837668++#endif
3766937669++
3767037670++SECTION
3767137671+ double
3767237672+ __tanh (double x)
3767337673+ {
3767437674+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
3767537675+index cbe09d49f4..0f69f7089c 100644
3767637676+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
3767737677++++ b/sysdeps/x86_64/fpu/multiarch/Makefile
3767837678+@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2
3767937679+ CFLAGS-s_log1p-fma.c = -mfma -mavx2
3768037680+ CFLAGS-s_sin-fma.c = -mfma -mavx2
3768137681+ CFLAGS-s_tan-fma.c = -mfma -mavx2
3768237682++CFLAGS-s_tanh-fma.c = -mfma -mavx2
3768337683+ CFLAGS-s_sincos-fma.c = -mfma -mavx2
3768437684+3768537685+ CFLAGS-e_exp2f-fma.c = -mfma -mavx2
3768637686+@@ -92,6 +93,7 @@ libm-sysdep_routines += \
3768737687+ s_sinf-sse2 \
3768837688+ s_tan-avx \
3768937689+ s_tan-fma \
3769037690++ s_tanh-fma \
3769137691+ s_trunc-sse4_1 \
3769237692+ s_truncf-sse4_1 \
3769337693+ # libm-sysdep_routines
3769437694+diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
3769537695+new file mode 100644
3769637696+index 0000000000..1b808b1227
3769737697+--- /dev/null
3769837698++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
3769937699+@@ -0,0 +1,11 @@
3770037700++#define __tanh __tanh_fma
3770137701++#define __expm1 __expm1_fma
3770237702++
3770337703++/* NB: __expm1 may be expanded to __expm1_fma in the following
3770437704++ prototypes. */
3770537705++extern long double __expm1l (long double);
3770637706++extern long double __expm1f128 (long double);
3770737707++
3770837708++#define SECTION __attribute__ ((section (".text.fma")))
3770937709++
3771037710++#include <sysdeps/ieee754/dbl-64/s_tanh.c>
3771137711+diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
3771237712+new file mode 100644
3771337713+index 0000000000..5539b6c61c
3771437714+--- /dev/null
3771537715++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
3771637716+@@ -0,0 +1,31 @@
3771737717++/* Multiple versions of tanh.
3771837718++ Copyright (C) 2025 Free Software Foundation, Inc.
3771937719++ This file is part of the GNU C Library.
3772037720++
3772137721++ The GNU C Library is free software; you can redistribute it and/or
3772237722++ modify it under the terms of the GNU Lesser General Public
3772337723++ License as published by the Free Software Foundation; either
3772437724++ version 2.1 of the License, or (at your option) any later version.
3772537725++
3772637726++ The GNU C Library is distributed in the hope that it will be useful,
3772737727++ but WITHOUT ANY WARRANTY; without even the implied warranty of
3772837728++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3772937729++ Lesser General Public License for more details.
3773037730++
3773137731++ You should have received a copy of the GNU Lesser General Public
3773237732++ License along with the GNU C Library; if not, see
3773337733++ <https://www.gnu.org/licenses/>. */
3773437734++
3773537735++#include <sysdeps/x86/isa-level.h>
3773637736++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
3773737737++
3773837738++extern double __redirect_tanh (double);
3773937739++
3774037740++# define SYMBOL_NAME tanh
3774137741++# include "ifunc-fma.h"
3774237742++
3774337743++libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ());
3774437744++
3774537745++# define __tanh __tanh_sse2
3774637746++#endif
3774737747++#include <sysdeps/ieee754/dbl-64/s_tanh.c>
3774837748+3774937749+commit e854f6d37cbeabb9130fed74b587befad8b4ba08
3775037750+Author: Sunil K Pandey <skpgkp2@gmail.com>
3775137751+Date: Sat Mar 8 08:51:10 2025 -0800
3775237752+3775337753+ x86_64: Add sinh with FMA
3775437754+3775537755+ On SPR, it improves sinh bench performance by:
3775637756+3775737757+ Before After Improvement
3775837758+ reciprocal-throughput 14.2017 11.815 17%
3775937759+ latency 36.4917 35.2114 4%
3776037760+3776137761+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3776237762+ (cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe)
3776337763+3776437764+diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs
3776537765+index 7b1ac46a39..2fcb2fabf8 100644
3776637766+--- a/benchtests/sinh-inputs
3776737767++++ b/benchtests/sinh-inputs
3776837768+@@ -1,6 +1,7 @@
3776937769+ ## args: double
3777037770+ ## ret: double
3777137771+ ## includes: math.h
3777237772++## name: workload-random
3777337773+ 0x1.bcb6129b5ff2bp8
3777437774+ -0x1.63057386325ebp9
3777537775+ 0x1.62f1d7dc4e8bfp9
3777637776+diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c
3777737777+index b4b5857ddd..3f787967f9 100644
3777837778+--- a/sysdeps/ieee754/dbl-64/e_sinh.c
3777937779++++ b/sysdeps/ieee754/dbl-64/e_sinh.c
3778037780+@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $";
3778137781+3778237782+ static const double one = 1.0, shuge = 1.0e307;
3778337783+3778437784++#ifndef SECTION
3778537785++# define SECTION
3778637786++#endif
3778737787++
3778837788++SECTION
3778937789+ double
3779037790+ __ieee754_sinh (double x)
3779137791+ {
3779237792+@@ -90,4 +95,7 @@ __ieee754_sinh (double x)
3779337793+ /* |x| > overflowthresold, sinh(x) overflow */
3779437794+ return math_narrow_eval (x * shuge);
3779537795+ }
3779637796++
3779737797++#ifndef __ieee754_sinh
3779837798+ libm_alias_finite (__ieee754_sinh, __sinh)
3779937799++#endif
3780037800+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
3780137801+index 0f69f7089c..b527cab8d1 100644
3780237802+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
3780337803++++ b/sysdeps/x86_64/fpu/multiarch/Makefile
3780437804+@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2
3780537805+ CFLAGS-e_log-fma.c = -mfma -mavx2
3780637806+ CFLAGS-e_log2-fma.c = -mfma -mavx2
3780737807+ CFLAGS-e_pow-fma.c = -mfma -mavx2
3780837808++CFLAGS-e_sinh-fma.c = -mfma -mavx2
3780937809+ CFLAGS-s_atan-fma.c = -mfma -mavx2
3781037810+ CFLAGS-s_expm1-fma.c = -mfma -mavx2
3781137811+ CFLAGS-s_log1p-fma.c = -mfma -mavx2
3781237812+@@ -67,6 +68,7 @@ libm-sysdep_routines += \
3781337813+ e_logf-fma \
3781437814+ e_pow-fma \
3781537815+ e_powf-fma \
3781637816++ e_sinh-fma \
3781737817+ s_atan-avx \
3781837818+ s_atan-fma \
3781937819+ s_ceil-sse4_1 \
3782037820+diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
3782137821+new file mode 100644
3782237822+index 0000000000..e0e1e39a7a
3782337823+--- /dev/null
3782437824++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
3782537825+@@ -0,0 +1,12 @@
3782637826++#define __ieee754_sinh __ieee754_sinh_fma
3782737827++#define __ieee754_exp __ieee754_exp_fma
3782837828++#define __expm1 __expm1_fma
3782937829++
3783037830++/* NB: __expm1 may be expanded to __expm1_fma in the following
3783137831++ prototypes. */
3783237832++extern long double __expm1l (long double);
3783337833++extern long double __expm1f128 (long double);
3783437834++
3783537835++#define SECTION __attribute__ ((section (".text.fma")))
3783637836++
3783737837++#include <sysdeps/ieee754/dbl-64/e_sinh.c>
3783837838+diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
3783937839+new file mode 100644
3784037840+index 0000000000..3d3c18ccdf
3784137841+--- /dev/null
3784237842++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
3784337843+@@ -0,0 +1,35 @@
3784437844++/* Multiple versions of sinh.
3784537845++ Copyright (C) 2025 Free Software Foundation, Inc.
3784637846++ This file is part of the GNU C Library.
3784737847++
3784837848++ The GNU C Library is free software; you can redistribute it and/or
3784937849++ modify it under the terms of the GNU Lesser General Public
3785037850++ License as published by the Free Software Foundation; either
3785137851++ version 2.1 of the License, or (at your option) any later version.
3785237852++
3785337853++ The GNU C Library is distributed in the hope that it will be useful,
3785437854++ but WITHOUT ANY WARRANTY; without even the implied warranty of
3785537855++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3785637856++ Lesser General Public License for more details.
3785737857++
3785837858++ You should have received a copy of the GNU Lesser General Public
3785937859++ License along with the GNU C Library; if not, see
3786037860++ <https://www.gnu.org/licenses/>. */
3786137861++
3786237862++#include <sysdeps/x86/isa-level.h>
3786337863++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
3786437864++# include <libm-alias-finite.h>
3786537865++
3786637866++extern double __redirect_ieee754_sinh (double);
3786737867++
3786837868++# define SYMBOL_NAME ieee754_sinh
3786937869++# include "ifunc-fma.h"
3787037870++
3787137871++libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh,
3787237872++ IFUNC_SELECTOR ());
3787337873++
3787437874++libm_alias_finite (__ieee754_sinh, __sinh)
3787537875++
3787637876++# define __ieee754_sinh __ieee754_sinh_sse2
3787737877++#endif
3787837878++#include <sysdeps/ieee754/dbl-64/e_sinh.c>
3787937879+3788037880+commit e5f5dfdda28def8362896bdb1748bb27dfc8be73
3788137881+Author: Sunil K Pandey <skpgkp2@gmail.com>
3788237882+Date: Wed Mar 5 16:13:38 2025 -0800
3788337883+3788437884+ x86_64: Add atanh with FMA
3788537885+3788637886+ On SPR, it improves atanh bench performance by:
3788737887+3788837888+ Before After Improvement
3788937889+ reciprocal-throughput 15.1715 14.8628 2%
3789037890+ latency 57.1941 56.1883 2%
3789137891+3789237892+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3789337893+ (cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8)
3789437894+3789537895+diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs
3789637896+index 455aa65b65..4985293254 100644
3789737897+--- a/benchtests/atanh-inputs
3789837898++++ b/benchtests/atanh-inputs
3789937899+@@ -1,6 +1,7 @@
3790037900+ ## args: double
3790137901+ ## ret: double
3790237902+ ## includes: math.h
3790337903++## name: workload-random
3790437904+ 0x1.5a2730bacd94ap-1
3790537905+ -0x1.b57eb40fc048ep-21
3790637906+ -0x1.c0b185fb450e2p-17
3790737907+diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c
3790837908+index 11a2a45799..05ac0a1b30 100644
3790937909+--- a/sysdeps/ieee754/dbl-64/e_atanh.c
3791037910++++ b/sysdeps/ieee754/dbl-64/e_atanh.c
3791137911+@@ -44,6 +44,11 @@
3791237912+3791337913+ static const double huge = 1e300;
3791437914+3791537915++#ifndef SECTION
3791637916++# define SECTION
3791737917++#endif
3791837918++
3791937919++SECTION
3792037920+ double
3792137921+ __ieee754_atanh (double x)
3792237922+ {
3792337923+@@ -73,4 +78,7 @@ __ieee754_atanh (double x)
3792437924+3792537925+ return copysign (t, x);
3792637926+ }
3792737927++
3792837928++#ifndef __ieee754_atanh
3792937929+ libm_alias_finite (__ieee754_atanh, __atanh)
3793037930++#endif
3793137931+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
3793237932+index b527cab8d1..bc479b42d2 100644
3793337933+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
3793437934++++ b/sysdeps/x86_64/fpu/multiarch/Makefile
3793537935+@@ -1,6 +1,7 @@
3793637936+ ifeq ($(subdir),math)
3793737937+ CFLAGS-e_asin-fma.c = -mfma -mavx2
3793837938+ CFLAGS-e_atan2-fma.c = -mfma -mavx2
3793937939++CFLAGS-e_atanh-fma.c = -mfma -mavx2
3794037940+ CFLAGS-e_exp-fma.c = -mfma -mavx2
3794137941+ CFLAGS-e_log-fma.c = -mfma -mavx2
3794237942+ CFLAGS-e_log2-fma.c = -mfma -mavx2
3794337943+@@ -57,6 +58,7 @@ libm-sysdep_routines += \
3794437944+ e_asin-fma \
3794537945+ e_atan2-avx \
3794637946+ e_atan2-fma \
3794737947++ e_atanh-fma \
3794837948+ e_exp-avx \
3794937949+ e_exp-fma \
3795037950+ e_exp2f-fma \
3795137951+diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
3795237952+new file mode 100644
3795337953+index 0000000000..c3f2f9e550
3795437954+--- /dev/null
3795537955++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
3795637956+@@ -0,0 +1,6 @@
3795737957++#define __ieee754_atanh __ieee754_atanh_fma
3795837958++#define __log1p __log1p_fma
3795937959++
3796037960++#define SECTION __attribute__ ((section (".text.fma")))
3796137961++
3796237962++#include <sysdeps/ieee754/dbl-64/e_atanh.c>
3796337963+diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
3796437964+new file mode 100644
3796537965+index 0000000000..d2b785dfc0
3796637966+--- /dev/null
3796737967++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
3796837968+@@ -0,0 +1,34 @@
3796937969++/* Multiple versions of atanh.
3797037970++ Copyright (C) 2025 Free Software Foundation, Inc.
3797137971++ This file is part of the GNU C Library.
3797237972++
3797337973++ The GNU C Library is free software; you can redistribute it and/or
3797437974++ modify it under the terms of the GNU Lesser General Public
3797537975++ License as published by the Free Software Foundation; either
3797637976++ version 2.1 of the License, or (at your option) any later version.
3797737977++
3797837978++ The GNU C Library is distributed in the hope that it will be useful,
3797937979++ but WITHOUT ANY WARRANTY; without even the implied warranty of
3798037980++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3798137981++ Lesser General Public License for more details.
3798237982++
3798337983++ You should have received a copy of the GNU Lesser General Public
3798437984++ License along with the GNU C Library; if not, see
3798537985++ <https://www.gnu.org/licenses/>. */
3798637986++
3798737987++#include <sysdeps/x86/isa-level.h>
3798837988++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
3798937989++# include <libm-alias-finite.h>
3799037990++
3799137991++extern double __redirect_ieee754_atanh (double);
3799237992++
3799337993++# define SYMBOL_NAME ieee754_atanh
3799437994++# include "ifunc-fma.h"
3799537995++
3799637996++libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ());
3799737997++
3799837998++libm_alias_finite (__ieee754_atanh, __atanh)
3799937999++
3800038000++# define __ieee754_atanh __ieee754_atanh_sse2
3800138001++#endif
3800238002++#include <sysdeps/ieee754/dbl-64/e_atanh.c>
3800338003+3800438004+commit 8fc492bb4234edc1a5e8c3b7f76ba345ea7109ec
3800538005+Author: Florian Weimer <fweimer@redhat.com>
3800638006+Date: Fri Mar 28 09:26:06 2025 +0100
3800738007+3800838008+ x86: Skip XSAVE state size reset if ISA level requires XSAVE
3800938009+3801038010+ If we have to use XSAVE or XSAVEC trampolines, do not adjust the size
3801138011+ information they need. Technically, it is an operator error to try to
3801238012+ run with -XSAVE,-XSAVEC on such builds, but this change here disables
3801338013+ some unnecessary code with higher ISA levels and simplifies testing.
3801438014+3801538015+ Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922
3801638016+ ("x86-64: Don't use SSE resolvers for ISA level 3 or above").
3801738017+3801838018+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3801938019+ (cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02)
3802038020+3802138021+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3802238022+index c096dd390a..b5b264db7f 100644
3802338023+--- a/sysdeps/x86/cpu-features.c
3802438024++++ b/sysdeps/x86/cpu-features.c
3802538025+@@ -24,6 +24,7 @@
3802638026+ #include <dl-cacheinfo.h>
3802738027+ #include <dl-minsigstacksize.h>
3802838028+ #include <dl-hwcap2.h>
3802938029++#include <gcc-macros.h>
3803038030+3803138031+ extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
3803238032+ attribute_hidden;
3803338033+@@ -1119,6 +1120,9 @@ no_cpuid:
3803438034+ TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
3803538035+ #endif
3803638036+3803738037++ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build
3803838038++ requires AVX and therefore XSAVE or XSAVEC support. */
3803938039++#ifndef GCCMACRO__AVX__
3804038040+ bool disable_xsave_features = false;
3804138041+3804238042+ if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
3804338043+@@ -1172,6 +1176,7 @@ no_cpuid:
3804438044+3804538045+ CPU_FEATURE_UNSET (cpu_features, FMA4);
3804638046+ }
3804738047++#endif
3804838048+3804938049+ #ifdef __x86_64__
3805038050+ GLRO(dl_hwcap) = HWCAP_X86_64;
3805138051+3805238052+commit df22af58f66e6815c054b1c56249356c2994935a
3805338053+Author: Florian Weimer <fweimer@redhat.com>
3805438054+Date: Fri Mar 28 09:26:59 2025 +0100
3805538055+3805638056+ x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)
3805738057+3805838058+ Previously, the initialization code reused the xsave_state_full_size
3805938059+ member of struct cpu_features for the TLSDESC state size. However,
3806038060+ the tunable processing code assumes that this member has the
3806138061+ original XSAVE (non-compact) state size, so that it can use its
3806238062+ value if XSAVEC is disabled via tunable.
3806338063+3806438064+ This change uses a separate variable and not a struct member because
3806538065+ the value is only needed in ld.so and the static libc, but not in
3806638066+ libc.so. As a result, struct cpu_features layout does not change,
3806738067+ helping a future backport of this change.
3806838068+3806938069+ Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64:
3807038070+ Update _dl_tlsdesc_dynamic to preserve AMX registers").
3807138071+3807238072+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3807338073+ (cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf)
3807438074+3807538075+diff --git a/NEWS b/NEWS
3807638076+index 57feba81cd..7a6985f5dd 100644
3807738077+--- a/NEWS
3807838078++++ b/NEWS
3807938079+@@ -22,6 +22,7 @@ The following bugs are resolved with this release:
3808038080+ [32231] elf: Change ldconfig auxcache magic number
3808138081+ [32245] glibc -Wstringop-overflow= build failure on hppa
3808238082+ [32470] x86: Avoid integer truncation with large cache sizes
3808338083++ [32810] Crash on x86-64 if XSAVEC is disabled via tunable
3808438084+3808538085+ Version 2.40
3808638086+3808738087+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
3808838088+index 5311b594af..8819fba1b7 100644
3808938089+--- a/sysdeps/x86/Makefile
3809038090++++ b/sysdeps/x86/Makefile
3809138091+@@ -21,6 +21,9 @@ tests += \
3809238092+ tst-cpu-features-supports-static \
3809338093+ tst-get-cpu-features \
3809438094+ tst-get-cpu-features-static \
3809538095++ tst-gnu2-tls2-x86-noxsave \
3809638096++ tst-gnu2-tls2-x86-noxsavec \
3809738097++ tst-gnu2-tls2-x86-noxsavexsavec \
3809838098+ tst-hwcap-tunables \
3809938099+ # tests
3810038100+ tests-static += \
3810138101+@@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse
3810238102+ CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
3810338103+ CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
3810438104+ CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
3810538105++
3810638106++LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy
3810738107++LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy
3810838108++LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
3810938109++
3811038110++# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled
3811138111++# via tunable.
3811238112++tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
3811338113++tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
3811438114++tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
3811538115++$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
3811638116++$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
3811738117++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
3811838118++ $(objpfx)tst-gnu2-tls2mod0.so \
3811938119++ $(objpfx)tst-gnu2-tls2mod1.so \
3812038120++ $(objpfx)tst-gnu2-tls2mod2.so
3812138121+ endif
3812238122+3812338123+ ifeq ($(subdir),math)
3812438124+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3812538125+index b5b264db7f..ec27337337 100644
3812638126+--- a/sysdeps/x86/cpu-features.c
3812738127++++ b/sysdeps/x86/cpu-features.c
3812838128+@@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
3812938129+ # include <dl-cet.h>
3813038130+ #endif
3813138131+3813238132++unsigned long int _dl_x86_features_tlsdesc_state_size;
3813338133++
3813438134+ static void
3813538135+ update_active (struct cpu_features *cpu_features)
3813638136+ {
3813738137+@@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features)
3813838138+ = xsave_state_full_size;
3813938139+ cpu_features->xsave_state_full_size
3814038140+ = xsave_state_full_size;
3814138141++ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size;
3814238142+3814338143+ /* Check if XSAVEC is available. */
3814438144+ if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
3814538145+@@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features)
3814638146+ = ALIGN_UP ((amx_size
3814738147+ + TLSDESC_CALL_REGISTER_SAVE_AREA),
3814838148+ 64);
3814938149+- /* Set xsave_state_full_size to the compact AMX
3815038150+- state size for XSAVEC. NB: xsave_state_full_size
3815138151+- is only used in _dl_tlsdesc_dynamic_xsave and
3815238152+- _dl_tlsdesc_dynamic_xsavec. */
3815338153+- cpu_features->xsave_state_full_size = amx_size;
3815438154++ /* Set TLSDESC state size to the compact AMX
3815538155++ state size for XSAVEC. */
3815638156++ _dl_x86_features_tlsdesc_state_size = amx_size;
3815738157+ #endif
3815838158+ cpu_features->xsave_state_size
3815938159+ = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
3816038160+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
3816138161+index ccc6b64dc2..a0b31d80f6 100644
3816238162+--- a/sysdeps/x86/cpu-tunables.c
3816338163++++ b/sysdeps/x86/cpu-tunables.c
3816438164+@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
3816538165+ /* Update xsave_state_size to XSAVE state size. */
3816638166+ cpu_features->xsave_state_size
3816738167+ = cpu_features->xsave_state_full_size;
3816838168++ _dl_x86_features_tlsdesc_state_size
3816938169++ = cpu_features->xsave_state_full_size;
3817038170+ CPU_FEATURE_UNSET (cpu_features, XSAVEC);
3817138171+ }
3817238172+ }
3817338173+diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
3817438174+index 49eeb5f70a..41100a908a 100644
3817538175+--- a/sysdeps/x86/dl-diagnostics-cpu.c
3817638176++++ b/sysdeps/x86/dl-diagnostics-cpu.c
3817738177+@@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void)
3817838178+ cpu_features->xsave_state_size);
3817938179+ print_cpu_features_value ("xsave_state_full_size",
3818038180+ cpu_features->xsave_state_full_size);
3818138181++ print_cpu_features_value ("tlsdesc_state_full_size",
3818238182++ _dl_x86_features_tlsdesc_state_size);
3818338183+ print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
3818438184+ print_cpu_features_value ("shared_cache_size",
3818538185+ cpu_features->shared_cache_size);
3818638186+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
3818738187+index aaae44f0e1..03c71387dd 100644
3818838188+--- a/sysdeps/x86/include/cpu-features.h
3818938189++++ b/sysdeps/x86/include/cpu-features.h
3819038190+@@ -934,8 +934,6 @@ struct cpu_features
3819138191+ /* The full state size for XSAVE when XSAVEC is disabled by
3819238192+3819338193+ GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
3819438194+-
3819538195+- and the AMX state size when XSAVEC is available.
3819638196+ */
3819738197+ unsigned int xsave_state_full_size;
3819838198+ /* Data cache size for use in memory and string routines, typically
3819938199+@@ -989,6 +987,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void)
3820038200+3820138201+ #define __get_cpu_features() _dl_x86_get_cpu_features()
3820238202+3820338203++#if IS_IN (rtld) || IS_IN (libc)
3820438204++/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to
3820538205++ xsave_state_size from struct cpu_features, this includes additional
3820638206++ registers. */
3820738207++extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden;
3820838208++#endif
3820938209++
3821038210+ #if defined (_LIBC) && !IS_IN (nonlib)
3821138211+ /* Unused for x86. */
3821238212+ # define INIT_ARCH()
3821338213+diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
3821438214+new file mode 100644
3821538215+index 0000000000..f0024c143d
3821638216+--- /dev/null
3821738217++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
3821838218+@@ -0,0 +1 @@
3821938219++#include <elf/tst-gnu2-tls2.c>
3822038220+diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
3822138221+new file mode 100644
3822238222+index 0000000000..f0024c143d
3822338223+--- /dev/null
3822438224++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
3822538225+@@ -0,0 +1 @@
3822638226++#include <elf/tst-gnu2-tls2.c>
3822738227+diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
3822838228+new file mode 100644
3822938229+index 0000000000..f0024c143d
3823038230+--- /dev/null
3823138231++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
3823238232+@@ -0,0 +1 @@
3823338233++#include <elf/tst-gnu2-tls2.c>
3823438234+diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
3823538235+index 9f02cfc3eb..44d948696f 100644
3823638236+--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
3823738237++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
3823838238+@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
3823938239+ # endif
3824038240+ #else
3824138241+ /* Allocate stack space of the required size to save the state. */
3824238242+- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
3824338243++ sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP
3824438244+ #endif
3824538245+ /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
3824638246+ r10 and r11. */
3824738247+3824838248+commit a87d9a2c2cc17a3b22fd3be8d106336f4dcf2042
3824938249+Author: Florian Weimer <fweimer@redhat.com>
3825038250+Date: Mon Mar 31 21:33:18 2025 +0200
3825138251+3825238252+ x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread
3825338253+3825438254+ This fixes a test build failure on Hurd.
3825538255+3825638256+ Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate
3825738257+ variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)").
3825838258+3825938259+ Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
3826038260+ (cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c)
3826138261+3826238262+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
3826338263+index 8819fba1b7..01b0192ddf 100644
3826438264+--- a/sysdeps/x86/Makefile
3826538265++++ b/sysdeps/x86/Makefile
3826638266+@@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
3826738267+ tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
3826838268+ tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
3826938269+ tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
3827038270++$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library)
3827138271++$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library)
3827238272++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library)
3827338273+ $(objpfx)tst-gnu2-tls2-x86-noxsave.out \
3827438274+ $(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
3827538275+ $(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
3827638276+3827738277+commit 8fe27af20c8b25b84e12bcd52353862a95044aa2
3827838278+Author: Noah Goldstein <goldstein.w.n@gmail.com>
3827938279+Date: Wed Aug 14 14:37:30 2024 +0800
3828038280+3828138281+ x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
3828238282+3828338283+ This is just a refactor and there should be no behavioral change from
3828438284+ this commit.
3828538285+3828638286+ The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
3828738287+ for controlling whether we use non-temporal memset rather than having
3828838288+ extra logic based on vendor.
3828938289+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3829038290+3829138291+ (cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577)
3829238292+3829338293+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3829438294+index ec27337337..8841020b36 100644
3829538295+--- a/sysdeps/x86/cpu-features.c
3829638296++++ b/sysdeps/x86/cpu-features.c
3829738297+@@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features)
3829838298+ unsigned int stepping = 0;
3829938299+ enum cpu_features_kind kind;
3830038300+3830138301++ /* Default is to avoid non-temporal memset for non Intel/AMD hardware.
3830238302++ This is because, as of writing, we only have benchmarks indicating its
3830338303++ profitability on Intel/AMD. */
3830438304++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3830538305++ |= bit_arch_Avoid_Non_Temporal_Memset;
3830638306++
3830738307+ cpu_features->cachesize_non_temporal_divisor = 4;
3830838308+ #if !HAS_CPUID
3830938309+ if (__get_cpuid_max (0, 0) == 0)
3831038310+@@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features)
3831138311+3831238312+ update_active (cpu_features);
3831338313+3831438314++ /* Benchmarks indicate non-temporal memset can be profitable on Intel
3831538315++ hardware. */
3831638316++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3831738317++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
3831838318++
3831938319+ if (family == 0x06)
3832038320+ {
3832138321+ model += extended_model;
3832238322+@@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
3832338323+3832438324+ ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
3832538325+3832638326++ /* Benchmarks indicate non-temporal memset can be profitable on AMD
3832738327++ hardware. */
3832838328++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3832938329++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
3833038330++
3833138331+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
3833238332+ {
3833338333+ /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
3833438334+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
3833538335+index ac97414b5b..7b1b61c096 100644
3833638336+--- a/sysdeps/x86/dl-cacheinfo.h
3833738337++++ b/sysdeps/x86/dl-cacheinfo.h
3833838338+@@ -988,14 +988,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
3833938339+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
3834038340+ rep_movsb_threshold = 2112;
3834138341+3834238342+- /* Non-temporal stores are more performant on Intel and AMD hardware above
3834338343+- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
3834438344+- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
3834538345+- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
3834638346+- && (cpu_features->basic.kind == arch_kind_intel
3834738347+- || cpu_features->basic.kind == arch_kind_amd))
3834838348+- memset_non_temporal_threshold = non_temporal_threshold;
3834938349+-
3835038350+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
3835138351+ cases slower than the vectorized path (and for some alignments,
3835238352+ it is really slow, check BZ #30994). */
3835338353+@@ -1017,6 +1009,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
3835438354+ if (tunable_size != 0)
3835538355+ shared = tunable_size;
3835638356+3835738357++ /* Non-temporal stores are more performant on some hardware above
3835838358++ non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared
3835938359++ for both Intel and AMD hardware. */
3836038360++ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
3836138361++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
3836238362++ memset_non_temporal_threshold = non_temporal_threshold;
3836338363++
3836438364+ tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
3836538365+ if (tunable_size > minimum_non_temporal_threshold
3836638366+ && tunable_size <= maximum_non_temporal_threshold)
3836738367+3836838368+commit 7c6bd71b4dbdadab34e4fd21ec09b86b32daf443
3836938369+Author: Sunil K Pandey <skpgkp2@gmail.com>
3837038370+Date: Thu Apr 3 13:00:45 2025 -0700
3837138371+3837238372+ x86: Optimize xstate size calculation
3837338373+3837438374+ Scan xstate IDs up to the maximum supported xstate ID. Remove the
3837538375+ separate AMX xstate calculation. Instead, exclude the AMX space from
3837638376+ the start of TILECFG to the end of TILEDATA in xsave_state_size.
3837738377+3837838378+ Completed validation on SKL/SKX/SPR/SDE and compared xsave state size
3837938379+ with "ld.so --list-diagnostics" option, no regression.
3838038380+3838138381+ Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
3838238382+ Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
3838338383+ (cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d)
3838438384+3838538385+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3838638386+index 8841020b36..1d5e2a0072 100644
3838738387+--- a/sysdeps/x86/cpu-features.c
3838838388++++ b/sysdeps/x86/cpu-features.c
3838938389+@@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features)
3839038390+ /* Check if XSAVEC is available. */
3839138391+ if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
3839238392+ {
3839338393+- unsigned int xstate_comp_offsets[32];
3839438394+- unsigned int xstate_comp_sizes[32];
3839538395+-#ifdef __x86_64__
3839638396+- unsigned int xstate_amx_comp_offsets[32];
3839738397+- unsigned int xstate_amx_comp_sizes[32];
3839838398+- unsigned int amx_ecx;
3839938399+-#endif
3840038400++ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1];
3840138401++ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1];
3840238402+ unsigned int i;
3840338403+3840438404+ xstate_comp_offsets[0] = 0;
3840538405+@@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features)
3840638406+ xstate_comp_offsets[2] = 576;
3840738407+ xstate_comp_sizes[0] = 160;
3840838408+ xstate_comp_sizes[1] = 256;
3840938409+-#ifdef __x86_64__
3841038410+- xstate_amx_comp_offsets[0] = 0;
3841138411+- xstate_amx_comp_offsets[1] = 160;
3841238412+- xstate_amx_comp_offsets[2] = 576;
3841338413+- xstate_amx_comp_sizes[0] = 160;
3841438414+- xstate_amx_comp_sizes[1] = 256;
3841538415+-#endif
3841638416+3841738417+- for (i = 2; i < 32; i++)
3841838418++ for (i = 2; i <= X86_XSTATE_MAX_ID; i++)
3841938419+ {
3842038420+ if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
3842138421+ {
3842238422+ __cpuid_count (0xd, i, eax, ebx, ecx, edx);
3842338423+-#ifdef __x86_64__
3842438424+- /* Include this in xsave_state_full_size. */
3842538425+- amx_ecx = ecx;
3842638426+- xstate_amx_comp_sizes[i] = eax;
3842738427+- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
3842838428+- {
3842938429+- /* Exclude this from xsave_state_size. */
3843038430+- ecx = 0;
3843138431+- xstate_comp_sizes[i] = 0;
3843238432+- }
3843338433+- else
3843438434+-#endif
3843538435+- xstate_comp_sizes[i] = eax;
3843638436++ xstate_comp_sizes[i] = eax;
3843738437+ }
3843838438+ else
3843938439+ {
3844038440+-#ifdef __x86_64__
3844138441+- amx_ecx = 0;
3844238442+- xstate_amx_comp_sizes[i] = 0;
3844338443+-#endif
3844438444+ ecx = 0;
3844538445+ xstate_comp_sizes[i] = 0;
3844638446+ }
3844738447+@@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features)
3844838448+ {
3844938449+ xstate_comp_offsets[i]
3845038450+ = (xstate_comp_offsets[i - 1]
3845138451+- + xstate_comp_sizes[i -1]);
3845238452++ + xstate_comp_sizes[i - 1]);
3845338453+ if ((ecx & (1 << 1)) != 0)
3845438454+ xstate_comp_offsets[i]
3845538455+ = ALIGN_UP (xstate_comp_offsets[i], 64);
3845638456+-#ifdef __x86_64__
3845738457+- xstate_amx_comp_offsets[i]
3845838458+- = (xstate_amx_comp_offsets[i - 1]
3845938459+- + xstate_amx_comp_sizes[i - 1]);
3846038460+- if ((amx_ecx & (1 << 1)) != 0)
3846138461+- xstate_amx_comp_offsets[i]
3846238462+- = ALIGN_UP (xstate_amx_comp_offsets[i],
3846338463+- 64);
3846438464+-#endif
3846538465+ }
3846638466+ }
3846738467+3846838468+ /* Use XSAVEC. */
3846938469+ unsigned int size
3847038470+- = xstate_comp_offsets[31] + xstate_comp_sizes[31];
3847138471++ = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
3847238472++ + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
3847338473+ if (size)
3847438474+ {
3847538475++ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
3847638476++ 64);
3847738477+ #ifdef __x86_64__
3847838478+- unsigned int amx_size
3847938479+- = (xstate_amx_comp_offsets[31]
3848038480+- + xstate_amx_comp_sizes[31]);
3848138481+- amx_size
3848238482+- = ALIGN_UP ((amx_size
3848338483+- + TLSDESC_CALL_REGISTER_SAVE_AREA),
3848438484+- 64);
3848538485+- /* Set TLSDESC state size to the compact AMX
3848638486+- state size for XSAVEC. */
3848738487+- _dl_x86_features_tlsdesc_state_size = amx_size;
3848838488++ _dl_x86_features_tlsdesc_state_size = size;
3848938489++ /* Exclude the AMX space from the start of TILECFG
3849038490++ space to the end of TILEDATA space. If CPU
3849138491++ doesn't support AMX, TILECFG offset is the same
3849238492++ as TILEDATA + 1 offset. Otherwise, they are
3849338493++ multiples of 64. */
3849438494++ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
3849538495++ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
3849638496+ #endif
3849738497+- cpu_features->xsave_state_size
3849838498+- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
3849938499+- 64);
3850038500++ cpu_features->xsave_state_size = size;
3850138501+ CPU_FEATURE_SET (cpu_features, XSAVEC);
3850238502+ }
3850338503+ }
3850438504+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
3850538505+index 7359149e17..1d6cabd816 100644
3850638506+--- a/sysdeps/x86/sysdep.h
3850738507++++ b/sysdeps/x86/sysdep.h
3850838508+@@ -102,6 +102,9 @@
3850938509+ | (1 << X86_XSTATE_ZMM_ID) \
3851038510+ | (1 << X86_XSTATE_APX_F_ID))
3851138511+3851238512++/* The maximum supported xstate ID. */
3851338513++# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID
3851438514++
3851538515+ /* AMX state mask. */
3851638516+ # define AMX_STATE_SAVE_MASK \
3851738517+ ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
3851838518+@@ -123,6 +126,9 @@
3851938519+ | (1 << X86_XSTATE_K_ID) \
3852038520+ | (1 << X86_XSTATE_ZMM_H_ID))
3852138521+3852238522++/* The maximum supported xstate ID. */
3852338523++# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID
3852438524++
3852538525+ /* States to be included in xsave_state_size. */
3852638526+ # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK
3852738527+ #endif
3852838528+3852938529+commit 44f92df8007d57f82b1518e219a0dbb60389ef2c
3853038530+Author: Sunil K Pandey <skpgkp2@gmail.com>
3853138531+Date: Thu Apr 3 18:14:20 2025 -0700
3853238532+3853338533+ x86: Add ARL/PTL/CWF model detection support
3853438534+3853538535+ - Add ARROWLAKE model detection.
3853638536+ - Add PANTHERLAKE model detection.
3853738537+ - Add CLEARWATERFOREST model detection.
3853838538+3853938539+ Intel® Architecture Instruction Set Extensions Programming Reference
3854038540+ https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2.
3854138541+3854238542+ No regression, validated model detection on SDE.
3854338543+3854438544+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3854538545+ (cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096)
3854638546+3854738547+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3854838548+index 1d5e2a0072..7f21a8227e 100644
3854938549+--- a/sysdeps/x86/cpu-features.c
3855038550++++ b/sysdeps/x86/cpu-features.c
3855138551+@@ -512,6 +512,7 @@ enum
3855238552+ INTEL_ATOM_GOLDMONT,
3855338553+ INTEL_ATOM_GOLDMONT_PLUS,
3855438554+ INTEL_ATOM_SIERRAFOREST,
3855538555++ INTEL_ATOM_CLEARWATERFOREST,
3855638556+ INTEL_ATOM_GRANDRIDGE,
3855738557+ INTEL_ATOM_TREMONT,
3855838558+3855938559+@@ -539,6 +540,7 @@ enum
3856038560+ INTEL_BIGCORE_METEORLAKE,
3856138561+ INTEL_BIGCORE_LUNARLAKE,
3856238562+ INTEL_BIGCORE_ARROWLAKE,
3856338563++ INTEL_BIGCORE_PANTHERLAKE,
3856438564+ INTEL_BIGCORE_GRANITERAPIDS,
3856538565+3856638566+ /* Mixed (bigcore + atom SOC). */
3856738567+@@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model,
3856838568+ return INTEL_ATOM_GOLDMONT_PLUS;
3856938569+ case 0xAF:
3857038570+ return INTEL_ATOM_SIERRAFOREST;
3857138571++ case 0xDD:
3857238572++ return INTEL_ATOM_CLEARWATERFOREST;
3857338573+ case 0xB6:
3857438574+ return INTEL_ATOM_GRANDRIDGE;
3857538575+ case 0x86:
3857638576+@@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model,
3857738577+ return INTEL_BIGCORE_METEORLAKE;
3857838578+ case 0xbd:
3857938579+ return INTEL_BIGCORE_LUNARLAKE;
3858038580++ case 0xb5:
3858138581++ case 0xc5:
3858238582+ case 0xc6:
3858338583+ return INTEL_BIGCORE_ARROWLAKE;
3858438584++ case 0xCC:
3858538585++ return INTEL_BIGCORE_PANTHERLAKE;
3858638586+ case 0xAD:
3858738587+ case 0xAE:
3858838588+ return INTEL_BIGCORE_GRANITERAPIDS;
3858938589+@@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features)
3859038590+ Default tuned atom microarch.
3859138591+ case INTEL_ATOM_SIERRAFOREST:
3859238592+ case INTEL_ATOM_GRANDRIDGE:
3859338593++ case INTEL_ATOM_CLEARWATERFOREST:
3859438594+ */
3859538595+3859638596+ /* Bigcore/Default Tuning. */
3859738597+@@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features)
3859838598+ case INTEL_BIGCORE_METEORLAKE:
3859938599+ case INTEL_BIGCORE_LUNARLAKE:
3860038600+ case INTEL_BIGCORE_ARROWLAKE:
3860138601++ case INTEL_BIGCORE_PANTHERLAKE:
3860238602+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
3860338603+ case INTEL_BIGCORE_EMERALDRAPIDS:
3860438604+ case INTEL_BIGCORE_GRANITERAPIDS:
3860538605+3860638606+commit 9ee8083c4edbe5e92af7aabb23261309f03ef05c
3860738607+Author: Sunil K Pandey <sunil.k.pandey@intel.com>
3860838608+Date: Fri Apr 11 08:52:52 2025 -0700
3860938609+3861038610+ x86: Handle unknown Intel processor with default tuning
3861138611+3861238612+ Enable default tuning for unknown Intel processor.
3861338613+3861438614+ Tested on x86, no regression.
3861538615+3861638616+ Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
3861738617+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
3861838618+ (cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c)
3861938619+3862038620+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3862138621+index 7f21a8227e..1a6e694abf 100644
3862238622+--- a/sysdeps/x86/cpu-features.c
3862338623++++ b/sysdeps/x86/cpu-features.c
3862438624+@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
3862538625+ "Incorrect index_arch_Fast_Unaligned_Load");
3862638626+3862738627+3862838628+-/* Intel Family-6 microarch list. */
3862938629+-enum
3863038630++/* Intel microarch list. */
3863138631++enum intel_microarch
3863238632+ {
3863338633+ /* Atom processors. */
3863438634+ INTEL_ATOM_BONNELL,
3863538635+@@ -555,7 +555,7 @@ enum
3863638636+ INTEL_UNKNOWN,
3863738637+ };
3863838638+3863938639+-static unsigned int
3864038640++static enum intel_microarch
3864138641+ intel_get_fam6_microarch (unsigned int model,
3864238642+ __attribute__ ((unused)) unsigned int stepping)
3864338643+ {
3864438644+@@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features)
3864538645+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3864638646+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
3864738647+3864838648++ enum intel_microarch microarch = INTEL_UNKNOWN;
3864938649+ if (family == 0x06)
3865038650+ {
3865138651+ model += extended_model;
3865238652+- unsigned int microarch
3865338653+- = intel_get_fam6_microarch (model, stepping);
3865438654++ microarch = intel_get_fam6_microarch (model, stepping);
3865538655+3865638656++ /* Disable TSX on some processors to avoid TSX on kernels that
3865738657++ weren't updated with the latest microcode package (which
3865838658++ disables broken feature by default). */
3865938659+ switch (microarch)
3866038660+ {
3866138661+- /* Atom / KNL tuning. */
3866238662+- case INTEL_ATOM_BONNELL:
3866338663+- /* BSF is slow on Bonnell. */
3866438664+- cpu_features->preferred[index_arch_Slow_BSF]
3866538665+- |= bit_arch_Slow_BSF;
3866638666+- break;
3866738667+-
3866838668+- /* Unaligned load versions are faster than SSSE3
3866938669+- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
3867038670+- case INTEL_ATOM_AIRMONT:
3867138671+- case INTEL_ATOM_SILVERMONT:
3867238672+- case INTEL_ATOM_GOLDMONT:
3867338673+- case INTEL_ATOM_GOLDMONT_PLUS:
3867438674+-
3867538675+- /* Knights Landing. Enable Silvermont optimizations. */
3867638676+- case INTEL_KNIGHTS_LANDING:
3867738677+-
3867838678+- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
3867938679+- |= (bit_arch_Fast_Unaligned_Load
3868038680+- | bit_arch_Fast_Unaligned_Copy
3868138681+- | bit_arch_Prefer_PMINUB_for_stringop
3868238682+- | bit_arch_Slow_SSE4_2);
3868338683+- break;
3868438684+-
3868538685+- case INTEL_ATOM_TREMONT:
3868638686+- /* Enable rep string instructions, unaligned load, unaligned
3868738687+- copy, pminub and avoid SSE 4.2 on Tremont. */
3868838688+- cpu_features->preferred[index_arch_Fast_Rep_String]
3868938689+- |= (bit_arch_Fast_Rep_String
3869038690+- | bit_arch_Fast_Unaligned_Load
3869138691+- | bit_arch_Fast_Unaligned_Copy
3869238692+- | bit_arch_Prefer_PMINUB_for_stringop
3869338693+- | bit_arch_Slow_SSE4_2);
3869438694+- break;
3869538695+-
3869638696+- /*
3869738697+- Default tuned Knights microarch.
3869838698+- case INTEL_KNIGHTS_MILL:
3869938699+- */
3870038700+-
3870138701+- /*
3870238702+- Default tuned atom microarch.
3870338703+- case INTEL_ATOM_SIERRAFOREST:
3870438704+- case INTEL_ATOM_GRANDRIDGE:
3870538705+- case INTEL_ATOM_CLEARWATERFOREST:
3870638706+- */
3870738707+-
3870838708+- /* Bigcore/Default Tuning. */
3870938709+ default:
3871038710+- default_tuning:
3871138711+- /* Unknown family 0x06 processors. Assuming this is one
3871238712+- of Core i3/i5/i7 processors if AVX is available. */
3871338713+- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
3871438714+- break;
3871538715+-
3871638716+- enable_modern_features:
3871738717+- /* Rep string instructions, unaligned load, unaligned copy,
3871838718+- and pminub are fast on Intel Core i3, i5 and i7. */
3871938719+- cpu_features->preferred[index_arch_Fast_Rep_String]
3872038720+- |= (bit_arch_Fast_Rep_String
3872138721+- | bit_arch_Fast_Unaligned_Load
3872238722+- | bit_arch_Fast_Unaligned_Copy
3872338723+- | bit_arch_Prefer_PMINUB_for_stringop);
3872438724+ break;
3872538725+3872638726+- case INTEL_BIGCORE_NEHALEM:
3872738727+- case INTEL_BIGCORE_WESTMERE:
3872838728+- /* Older CPUs prefer non-temporal stores at lower threshold. */
3872938729+- cpu_features->cachesize_non_temporal_divisor = 8;
3873038730+- goto enable_modern_features;
3873138731+-
3873238732+- /* Older Bigcore microarch (smaller non-temporal store
3873338733+- threshold). */
3873438734+- case INTEL_BIGCORE_SANDYBRIDGE:
3873538735+- case INTEL_BIGCORE_IVYBRIDGE:
3873638736+- case INTEL_BIGCORE_HASWELL:
3873738737+- case INTEL_BIGCORE_BROADWELL:
3873838738+- cpu_features->cachesize_non_temporal_divisor = 8;
3873938739+- goto default_tuning;
3874038740+-
3874138741+- /* Newer Bigcore microarch (larger non-temporal store
3874238742+- threshold). */
3874338743+- case INTEL_BIGCORE_SKYLAKE_AVX512:
3874438744+- case INTEL_BIGCORE_CANNONLAKE:
3874538745+- /* Benchmarks indicate non-temporal memset is not
3874638746+- necessarily profitable on SKX (and in some cases much
3874738747+- worse). This is likely unique to SKX due its it unique
3874838748+- mesh interconnect (not present on ICX or BWD). Disable
3874938749+- non-temporal on all Skylake servers. */
3875038750+- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3875138751+- |= bit_arch_Avoid_Non_Temporal_Memset;
3875238752+- case INTEL_BIGCORE_COMETLAKE:
3875338753+- case INTEL_BIGCORE_SKYLAKE:
3875438754+- case INTEL_BIGCORE_KABYLAKE:
3875538755+- case INTEL_BIGCORE_ICELAKE:
3875638756+- case INTEL_BIGCORE_TIGERLAKE:
3875738757+- case INTEL_BIGCORE_ROCKETLAKE:
3875838758+- case INTEL_BIGCORE_RAPTORLAKE:
3875938759+- case INTEL_BIGCORE_METEORLAKE:
3876038760+- case INTEL_BIGCORE_LUNARLAKE:
3876138761+- case INTEL_BIGCORE_ARROWLAKE:
3876238762+- case INTEL_BIGCORE_PANTHERLAKE:
3876338763+- case INTEL_BIGCORE_SAPPHIRERAPIDS:
3876438764+- case INTEL_BIGCORE_EMERALDRAPIDS:
3876538765+- case INTEL_BIGCORE_GRANITERAPIDS:
3876638766+- cpu_features->cachesize_non_temporal_divisor = 2;
3876738767+- goto default_tuning;
3876838768+-
3876938769+- /* Default tuned Mixed (bigcore + atom SOC). */
3877038770+- case INTEL_MIXED_LAKEFIELD:
3877138771+- case INTEL_MIXED_ALDERLAKE:
3877238772+- cpu_features->cachesize_non_temporal_divisor = 2;
3877338773+- goto default_tuning;
3877438774+- }
3877538775+-
3877638776+- /* Disable TSX on some processors to avoid TSX on kernels that
3877738777+- weren't updated with the latest microcode package (which
3877838778+- disables broken feature by default). */
3877938779+- switch (microarch)
3878038780+- {
3878138781+ case INTEL_BIGCORE_SKYLAKE_AVX512:
3878238782+ /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
3878338783+ if (stepping <= 5)
3878438784+@@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features)
3878538785+3878638786+ case INTEL_BIGCORE_KABYLAKE:
3878738787+ /* NB: Although the errata documents that for model == 0x8e
3878838788+- (kabylake skylake client), only 0xb stepping or lower are
3878938789+- impacted, the intention of the errata was to disable TSX on
3879038790+- all client processors on all steppings. Include 0xc
3879138791+- stepping which is an Intel Core i7-8665U, a client mobile
3879238792+- processor. */
3879338793++ (kabylake skylake client), only 0xb stepping or lower are
3879438794++ impacted, the intention of the errata was to disable TSX on
3879538795++ all client processors on all steppings. Include 0xc
3879638796++ stepping which is an Intel Core i7-8665U, a client mobile
3879738797++ processor. */
3879838798+ if (stepping > 0xc)
3879938799+ break;
3880038800+ /* Fall through. */
3880138801+ case INTEL_BIGCORE_SKYLAKE:
3880238802+- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
3880338803+- processors listed in:
3880438804+-
3880538805+-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
3880638806+- */
3880738807+- disable_tsx:
3880838808+- CPU_FEATURE_UNSET (cpu_features, HLE);
3880938809+- CPU_FEATURE_UNSET (cpu_features, RTM);
3881038810+- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
3881138811+- break;
3881238812++ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
3881338813++ processors listed in:
3881438814++
3881538815++ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
3881638816++ */
3881738817++disable_tsx:
3881838818++ CPU_FEATURE_UNSET (cpu_features, HLE);
3881938819++ CPU_FEATURE_UNSET (cpu_features, RTM);
3882038820++ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
3882138821++ break;
3882238822+3882338823+ case INTEL_BIGCORE_HASWELL:
3882438824+- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
3882538825+- TSX. Haswell also include other model numbers that have
3882638826+- working TSX. */
3882738827+- if (model == 0x3f && stepping >= 4)
3882838828++ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
3882938829++ TSX. Haswell also includes other model numbers that have
3883038830++ working TSX. */
3883138831++ if (model == 0x3f && stepping >= 4)
3883238832+ break;
3883338833+3883438834+- CPU_FEATURE_UNSET (cpu_features, RTM);
3883538835+- break;
3883638836++ CPU_FEATURE_UNSET (cpu_features, RTM);
3883738837++ break;
3883838838+ }
3883938839+ }
3884038840+3884138841++ switch (microarch)
3884238842++ {
3884338843++ /* Atom / KNL tuning. */
3884438844++ case INTEL_ATOM_BONNELL:
3884538845++ /* BSF is slow on Bonnell. */
3884638846++ cpu_features->preferred[index_arch_Slow_BSF]
3884738847++ |= bit_arch_Slow_BSF;
3884838848++ break;
3884938849++
3885038850++ /* Unaligned load versions are faster than SSSE3
3885138851++ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
3885238852++ case INTEL_ATOM_AIRMONT:
3885338853++ case INTEL_ATOM_SILVERMONT:
3885438854++ case INTEL_ATOM_GOLDMONT:
3885538855++ case INTEL_ATOM_GOLDMONT_PLUS:
3885638856++
3885738857++ /* Knights Landing. Enable Silvermont optimizations. */
3885838858++ case INTEL_KNIGHTS_LANDING:
3885938859++
3886038860++ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
3886138861++ |= (bit_arch_Fast_Unaligned_Load
3886238862++ | bit_arch_Fast_Unaligned_Copy
3886338863++ | bit_arch_Prefer_PMINUB_for_stringop
3886438864++ | bit_arch_Slow_SSE4_2);
3886538865++ break;
3886638866++
3886738867++ case INTEL_ATOM_TREMONT:
3886838868++ /* Enable rep string instructions, unaligned load, unaligned
3886938869++ copy, pminub and avoid SSE 4.2 on Tremont. */
3887038870++ cpu_features->preferred[index_arch_Fast_Rep_String]
3887138871++ |= (bit_arch_Fast_Rep_String
3887238872++ | bit_arch_Fast_Unaligned_Load
3887338873++ | bit_arch_Fast_Unaligned_Copy
3887438874++ | bit_arch_Prefer_PMINUB_for_stringop
3887538875++ | bit_arch_Slow_SSE4_2);
3887638876++ break;
3887738877++
3887838878++ /*
3887938879++ Default tuned Knights microarch.
3888038880++ case INTEL_KNIGHTS_MILL:
3888138881++ */
3888238882++
3888338883++ /*
3888438884++ Default tuned atom microarch.
3888538885++ case INTEL_ATOM_SIERRAFOREST:
3888638886++ case INTEL_ATOM_GRANDRIDGE:
3888738887++ case INTEL_ATOM_CLEARWATERFOREST:
3888838888++ */
3888938889++
3889038890++ /* Bigcore/Default Tuning. */
3889138891++ default:
3889238892++ default_tuning:
3889338893++ /* Unknown Intel processors. Assuming this is one of Core
3889438894++ i3/i5/i7 processors if AVX is available. */
3889538895++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
3889638896++ break;
3889738897++
3889838898++ enable_modern_features:
3889938899++ /* Rep string instructions, unaligned load, unaligned copy,
3890038900++ and pminub are fast on Intel Core i3, i5 and i7. */
3890138901++ cpu_features->preferred[index_arch_Fast_Rep_String]
3890238902++ |= (bit_arch_Fast_Rep_String
3890338903++ | bit_arch_Fast_Unaligned_Load
3890438904++ | bit_arch_Fast_Unaligned_Copy
3890538905++ | bit_arch_Prefer_PMINUB_for_stringop);
3890638906++ break;
3890738907++
3890838908++ case INTEL_BIGCORE_NEHALEM:
3890938909++ case INTEL_BIGCORE_WESTMERE:
3891038910++ /* Older CPUs prefer non-temporal stores at lower threshold. */
3891138911++ cpu_features->cachesize_non_temporal_divisor = 8;
3891238912++ goto enable_modern_features;
3891338913++
3891438914++ /* Older Bigcore microarch (smaller non-temporal store
3891538915++ threshold). */
3891638916++ case INTEL_BIGCORE_SANDYBRIDGE:
3891738917++ case INTEL_BIGCORE_IVYBRIDGE:
3891838918++ case INTEL_BIGCORE_HASWELL:
3891938919++ case INTEL_BIGCORE_BROADWELL:
3892038920++ cpu_features->cachesize_non_temporal_divisor = 8;
3892138921++ goto default_tuning;
3892238922++
3892338923++ /* Newer Bigcore microarch (larger non-temporal store
3892438924++ threshold). */
3892538925++ case INTEL_BIGCORE_SKYLAKE_AVX512:
3892638926++ case INTEL_BIGCORE_CANNONLAKE:
3892738927++ /* Benchmarks indicate non-temporal memset is not
3892838928++ necessarily profitable on SKX (and in some cases much
3892938929++ worse). This is likely unique to SKX due to its unique
3893038930++ mesh interconnect (not present on ICX or BWD). Disable
3893138931++ non-temporal on all Skylake servers. */
3893238932++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
3893338933++ |= bit_arch_Avoid_Non_Temporal_Memset;
3893438934++ /* fallthrough */
3893538935++ case INTEL_BIGCORE_COMETLAKE:
3893638936++ case INTEL_BIGCORE_SKYLAKE:
3893738937++ case INTEL_BIGCORE_KABYLAKE:
3893838938++ case INTEL_BIGCORE_ICELAKE:
3893938939++ case INTEL_BIGCORE_TIGERLAKE:
3894038940++ case INTEL_BIGCORE_ROCKETLAKE:
3894138941++ case INTEL_BIGCORE_RAPTORLAKE:
3894238942++ case INTEL_BIGCORE_METEORLAKE:
3894338943++ case INTEL_BIGCORE_LUNARLAKE:
3894438944++ case INTEL_BIGCORE_ARROWLAKE:
3894538945++ case INTEL_BIGCORE_PANTHERLAKE:
3894638946++ case INTEL_BIGCORE_SAPPHIRERAPIDS:
3894738947++ case INTEL_BIGCORE_EMERALDRAPIDS:
3894838948++ case INTEL_BIGCORE_GRANITERAPIDS:
3894938949++ /* Default tuned Mixed (bigcore + atom SOC). */
3895038950++ case INTEL_MIXED_LAKEFIELD:
3895138951++ case INTEL_MIXED_ALDERLAKE:
3895238952++ cpu_features->cachesize_non_temporal_divisor = 2;
3895338953++ goto default_tuning;
3895438954++ }
3895538955+3895638956+ /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
3895738957+ if AVX512ER is available. Don't use AVX512 to avoid lower CPU
3895838958+3895938959+commit d8a1a1aef7a58b991505b9a1349a40736dec3abf
3896038960+Author: H.J. Lu <hjl.tools@gmail.com>
3896138961+Date: Sat Apr 12 08:37:29 2025 -0700
3896238962+3896338963+ x86: Detect Intel Diamond Rapids
3896438964+3896538965+ Detect Intel Diamond Rapids and tune it similar to Intel Granite Rapids.
3896638966+3896738967+ Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
3896838968+ Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
3896938969+ (cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0)
3897038970+3897138971+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
3897238972+index 1a6e694abf..52a2f03bdd 100644
3897338973+--- a/sysdeps/x86/cpu-features.c
3897438974++++ b/sysdeps/x86/cpu-features.c
3897538975+@@ -542,6 +542,7 @@ enum intel_microarch
3897638976+ INTEL_BIGCORE_ARROWLAKE,
3897738977+ INTEL_BIGCORE_PANTHERLAKE,
3897838978+ INTEL_BIGCORE_GRANITERAPIDS,
3897938979++ INTEL_BIGCORE_DIAMONDRAPIDS,
3898038980+3898138981+ /* Mixed (bigcore + atom SOC). */
3898238982+ INTEL_MIXED_LAKEFIELD,
3898338983+@@ -817,6 +818,16 @@ disable_tsx:
3898438984+ break;
3898538985+ }
3898638986+ }
3898738987++ else if (family == 19)
3898838988++ switch (model)
3898938989++ {
3899038990++ case 0x01:
3899138991++ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
3899238992++ break;
3899338993++
3899438994++ default:
3899538995++ break;
3899638996++ }
3899738997+3899838998+ switch (microarch)
3899938999+ {
3900039000+@@ -926,6 +937,7 @@ disable_tsx:
3900139001+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
3900239002+ case INTEL_BIGCORE_EMERALDRAPIDS:
3900339003+ case INTEL_BIGCORE_GRANITERAPIDS:
3900439004++ case INTEL_BIGCORE_DIAMONDRAPIDS:
3900539005+ /* Default tuned Mixed (bigcore + atom SOC). */
3900639006+ case INTEL_MIXED_LAKEFIELD:
3900739007+ case INTEL_MIXED_ALDERLAKE:
3900839008+3900939009+commit 736e6735053f12181d3d287898dd5fdb9e8baf59
3901039010+Author: Frank Barrus <frankbarrus_sw@shaggy.cc>
3901139011+Date: Wed Dec 4 07:55:02 2024 -0500
3901239012+3901339013+ pthreads NPTL: lost wakeup fix 2
3901439014+3901539015+ This fixes the lost wakeup (from a bug in signal stealing) with a change
3901639016+ in the usage of g_signals[] in the condition variable internal state.
3901739017+ It also completely eliminates the concept and handling of signal stealing,
3901839018+ as well as the need for signalers to block to wait for waiters to wake
3901939019+ up every time there is a G1/G2 switch. This greatly reduces the average
3902039020+ and maximum latency for pthread_cond_signal.
3902139021+3902239022+ The g_signals[] field now contains a signal count that is relative to
3902339023+ the current g1_start value. Since it is a 32-bit field, and the LSB is
3902439024+ still reserved (though not currently used anymore), it has a 31-bit value
3902539025+ that corresponds to the low 31 bits of the sequence number in g1_start.
3902639026+ (since g1_start also has an LSB flag, this means bits 31:1 in g_signals
3902739027+ correspond to bits 31:1 in g1_start, plus the current signal count)
3902839028+3902939029+ By making the signal count relative to g1_start, there is no longer
3903039030+ any ambiguity or A/B/A issue, and thus any checks before blocking,
3903139031+ including the futex call itself, are guaranteed not to block if the G1/G2
3903239032+ switch occurs, even if the signal count remains the same. This allows
3903339033+ initially safely blocking in G2 until the switch to G1 occurs, and
3903439034+ then transitioning from G1 to a new G1 or G2, and always being able to
3903539035+ distinguish the state change. This removes the race condition and A/B/A
3903639036+ problems that otherwise occurred if a late (pre-empted) waiter were to
3903739037+ resume just as the futex call attempted to block on g_signal since
3903839038+ otherwise there was no last opportunity to re-check things like whether
3903939039+ the current G1 group was already closed.
3904039040+3904139041+ By fixing these issues, the signal stealing code can be eliminated,
3904239042+ since there is no concept of signal stealing anymore. The code to block
3904339043+ for all waiters to exit g_refs can also be removed, since any waiters
3904439044+ that are still in the g_refs region can be guaranteed to safely wake
3904539045+ up and exit. If there are still any left at this time, they are all
3904639046+ sent one final futex wakeup to ensure that they are not blocked any
3904739047+ longer, but there is no need for the signaller to block and wait for
3904839048+ them to wake up and exit the g_refs region.
3904939049+3905039050+ The signal count is then effectively "zeroed" but since it is now
3905139051+ relative to g1_start, this is done by advancing it to a new value that
3905239052+ can be observed by any pending blocking waiters. Any late waiters can
3905339053+ always tell the difference, and can thus just cleanly exit if they are
3905439054+ in a stale G1 or G2. They can never steal a signal from the current
3905539055+ G1 if they are not in the current G1, since the signal value that has
3905639056+ to match in the cmpxchg has the low 31 bits of the g1_start value
3905739057+ contained in it, and that's first checked, and then it won't match if
3905839058+ there's a G1/G2 change.
3905939059+3906039060+ Note: the 31-bit sequence number used in g_signals is designed to
3906139061+ handle wrap-around when checking the signal count, but if the entire
3906239062+ 31-bit wraparound (2 billion signals) occurs while there is still a
3906339063+ late waiter that has not yet resumed, and it happens to then match
3906439064+ the current g1_start low bits, and the pre-emption occurs after the
3906539065+ normal "closed group" checks (which are 64-bit) but then hits the
3906639066+ futex syscall and signal consuming code, then an A/B/A issue could
3906739067+ still result and cause an incorrect assumption about whether it
3906839068+ should block. This particular scenario seems unlikely in practice.
3906939069+ Note that once awake from the futex, the waiter would notice the
3907039070+ closed group before consuming the signal (since that's still a 64-bit
3907139071+ check that would not be aliased in the wrap-around in g_signals),
3907239072+ so the biggest impact would be blocking on the futex until the next
3907339073+ full wakeup from a G1/G2 switch.
3907439074+3907539075+ Signed-off-by: Frank Barrus <frankbarrus_sw@shaggy.cc>
3907639076+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3907739077+ (cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a)
3907839078+3907939079+diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
3908039080+index 3487557bb8..4855b8899f 100644
3908139081+--- a/nptl/pthread_cond_common.c
3908239082++++ b/nptl/pthread_cond_common.c
3908339083+@@ -201,7 +201,6 @@ static bool __attribute__ ((unused))
3908439084+ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3908539085+ unsigned int *g1index, int private)
3908639086+ {
3908739087+- const unsigned int maxspin = 0;
3908839088+ unsigned int g1 = *g1index;
3908939089+3909039090+ /* If there is no waiter in G2, we don't do anything. The expression may
3909139091+@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3909239092+ * New waiters arriving concurrently with the group switching will all go
3909339093+ into G2 until we atomically make the switch. Waiters existing in G2
3909439094+ are not affected.
3909539095+- * Waiters in G1 will be closed out immediately by setting a flag in
3909639096+- __g_signals, which will prevent waiters from blocking using a futex on
3909739097+- __g_signals and also notifies them that the group is closed. As a
3909839098+- result, they will eventually remove their group reference, allowing us
3909939099+- to close switch group roles. */
3910039100+-
3910139101+- /* First, set the closed flag on __g_signals. This tells waiters that are
3910239102+- about to wait that they shouldn't do that anymore. This basically
3910339103+- serves as an advance notification of the upcoming change to __g1_start;
3910439104+- waiters interpret it as if __g1_start was larger than their waiter
3910539105+- sequence position. This allows us to change __g1_start after waiting
3910639106+- for all existing waiters with group references to leave, which in turn
3910739107+- makes recovery after stealing a signal simpler because it then can be
3910839108+- skipped if __g1_start indicates that the group is closed (otherwise,
3910939109+- we would have to recover always because waiters don't know how big their
3911039110+- groups are). Relaxed MO is fine. */
3911139111+- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
3911239112+-
3911339113+- /* Wait until there are no group references anymore. The fetch-or operation
3911439114+- injects us into the modification order of __g_refs; release MO ensures
3911539115+- that waiters incrementing __g_refs after our fetch-or see the previous
3911639116+- changes to __g_signals and to __g1_start that had to happen before we can
3911739117+- switch this G1 and alias with an older group (we have two groups, so
3911839118+- aliasing requires switching group roles twice). Note that nobody else
3911939119+- can have set the wake-request flag, so we do not have to act upon it.
3912039120+-
3912139121+- Also note that it is harmless if older waiters or waiters from this G1
3912239122+- get a group reference after we have quiesced the group because it will
3912339123+- remain closed for them either because of the closed flag in __g_signals
3912439124+- or the later update to __g1_start. New waiters will never arrive here
3912539125+- but instead continue to go into the still current G2. */
3912639126+- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
3912739127+- while ((r >> 1) > 0)
3912839128+- {
3912939129+- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
3913039130+- {
3913139131+- /* TODO Back off. */
3913239132+- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
3913339133+- }
3913439134+- if ((r >> 1) > 0)
3913539135+- {
3913639136+- /* There is still a waiter after spinning. Set the wake-request
3913739137+- flag and block. Relaxed MO is fine because this is just about
3913839138+- this futex word.
3913939139+-
3914039140+- Update r to include the set wake-request flag so that the upcoming
3914139141+- futex_wait only blocks if the flag is still set (otherwise, we'd
3914239142+- violate the basic client-side futex protocol). */
3914339143+- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1;
3914439144+-
3914539145+- if ((r >> 1) > 0)
3914639146+- futex_wait_simple (cond->__data.__g_refs + g1, r, private);
3914739147+- /* Reload here so we eventually see the most recent value even if we
3914839148+- do not spin. */
3914939149+- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
3915039150+- }
3915139151+- }
3915239152+- /* Acquire MO so that we synchronize with the release operation that waiters
3915339153+- use to decrement __g_refs and thus happen after the waiters we waited
3915439154+- for. */
3915539155+- atomic_thread_fence_acquire ();
3915639156++ * Waiters in G1 will be closed out immediately by the advancing of
3915739157++ __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
3915839158++ which will prevent waiters from blocking using a futex on
3915939159++ __g_signals since it provides enough signals for all possible
3916039160++ remaining waiters. As a result, they can each consume a signal
3916139161++ and they will eventually remove their group reference. */
3916239162+3916339163+ /* Update __g1_start, which finishes closing this group. The value we add
3916439164+ will never be negative because old_orig_size can only be zero when we
3916539165+ switch groups the first time after a condvar was initialized, in which
3916639166+- case G1 will be at index 1 and we will add a value of 1. See above for
3916739167+- why this takes place after waiting for quiescence of the group.
3916839168++ case G1 will be at index 1 and we will add a value of 1.
3916939169+ Relaxed MO is fine because the change comes with no additional
3917039170+ constraints that others would have to observe. */
3917139171+ __condvar_add_g1_start_relaxed (cond,
3917239172+ (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
3917339173+3917439174+- /* Now reopen the group, thus enabling waiters to again block using the
3917539175+- futex controlled by __g_signals. Release MO so that observers that see
3917639176+- no signals (and thus can block) also see the write __g1_start and thus
3917739177+- that this is now a new group (see __pthread_cond_wait_common for the
3917839178+- matching acquire MO loads). */
3917939179+- atomic_store_release (cond->__data.__g_signals + g1, 0);
3918039180++ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
3918139181++
3918239182++ /* If any waiters still hold group references (and thus could be blocked),
3918339183++ then wake them all up now and prevent any running ones from blocking.
3918439184++ This is effectively a catch-all for any possible current or future
3918539185++ bugs that can allow the group size to reach 0 before all G1 waiters
3918639186++ have been awakened or at least given signals to consume, or any
3918739187++ other case that can leave blocked (or about to block) older waiters.. */
3918839188++ if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
3918939189++ {
3919039190++ /* First advance signals to the end of the group (i.e. enough signals
3919139191++ for the entire G1 group) to ensure that waiters which have not
3919239192++ yet blocked in the futex will not block.
3919339193++ Note that in the vast majority of cases, this should never
3919439194++ actually be necessary, since __g_signals will have enough
3919539195++ signals for the remaining g_refs waiters. As an optimization,
3919639196++ we could check this first before proceeding, although that
3919739197++ could still leave the potential for futex lost wakeup bugs
3919839198++ if the signal count was non-zero but the futex wakeup
3919939199++ was somehow lost. */
3920039200++ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
3920139201++
3920239202++ futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
3920339203++ }
3920439204+3920539205+ /* At this point, the old G1 is now a valid new G2 (but not in use yet).
3920639206+ No old waiter can neither grab a signal nor acquire a reference without
3920739207+@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3920839208+ g1 ^= 1;
3920939209+ *g1index ^= 1;
3921039210+3921139211++ /* Now advance the new G1 g_signals to the new lowseq, giving it
3921239212++ an effective signal count of 0 to start. */
3921339213++ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
3921439214++
3921539215+ /* These values are just observed by signalers, and thus protected by the
3921639216+ lock. */
3921739217+ unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
3921839218+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
3921939219+index 66786c7b90..3d290e39c8 100644
3922039220+--- a/nptl/pthread_cond_wait.c
3922139221++++ b/nptl/pthread_cond_wait.c
3922239222+@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg)
3922339223+ signaled), and a reference count.
3922439224+3922539225+ The group reference count is used to maintain the number of waiters that
3922639226+- are using the group's futex. Before a group can change its role, the
3922739227+- reference count must show that no waiters are using the futex anymore; this
3922839228+- prevents ABA issues on the futex word.
3922939229++ are using the group's futex.
3923039230+3923139231+ To represent which intervals in the waiter sequence the groups cover (and
3923239232+ thus also which group slot contains G1 or G2), we use a 64b counter to
3923339233+@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg)
3923439234+ last reference.
3923539235+ * Reference count used by waiters concurrently with signalers that have
3923639236+ acquired the condvar-internal lock.
3923739237+- __g_signals: The number of signals that can still be consumed.
3923839238++ __g_signals: The number of signals that can still be consumed, relative to
3923939239++ the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
3924039240++ 31 to 1 of g1_start with the signal count added)
3924139241+ * Used as a futex word by waiters. Used concurrently by waiters and
3924239242+ signalers.
3924339243+- * LSB is true iff this group has been completely signaled (i.e., it is
3924439244+- closed).
3924539245++ * LSB is currently reserved and 0.
3924639246+ __g_size: Waiters remaining in this group (i.e., which have not been
3924739247+ signaled yet.
3924839248+ * Accessed by signalers and waiters that cancel waiting (both do so only
3924939249+@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg)
3925039250+ sufficient because if a waiter can see a sufficiently large value, it could
3925139251+ have also consume a signal in the waiters group.
3925239252+3925339253+- Waiters try to grab a signal from __g_signals without holding a reference
3925439254+- count, which can lead to stealing a signal from a more recent group after
3925539255+- their own group was already closed. They cannot always detect whether they
3925639256+- in fact did because they do not know when they stole, but they can
3925739257+- conservatively add a signal back to the group they stole from; if they
3925839258+- did so unnecessarily, all that happens is a spurious wake-up. To make this
3925939259+- even less likely, __g1_start contains the index of the current g2 too,
3926039260+- which allows waiters to check if there aliasing on the group slots; if
3926139261+- there wasn't, they didn't steal from the current G1, which means that the
3926239262+- G1 they stole from must have been already closed and they do not need to
3926339263+- fix anything.
3926439264+-
3926539265+ It is essential that the last field in pthread_cond_t is __g_signals[1]:
3926639266+ The previous condvar used a pointer-sized field in pthread_cond_t, so a
3926739267+ PTHREAD_COND_INITIALIZER from that condvar implementation might only
3926839268+@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3926939269+ {
3927039270+ while (1)
3927139271+ {
3927239272++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3927339273++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3927439274++
3927539275+ /* Spin-wait first.
3927639276+ Note that spinning first without checking whether a timeout
3927739277+ passed might lead to what looks like a spurious wake-up even
3927839278+@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3927939279+ having to compare against the current time seems to be the right
3928039280+ choice from a performance perspective for most use cases. */
3928139281+ unsigned int spin = maxspin;
3928239282+- while (signals == 0 && spin > 0)
3928339283++ while (spin > 0 && ((int)(signals - lowseq) < 2))
3928439284+ {
3928539285+ /* Check that we are not spinning on a group that's already
3928639286+ closed. */
3928739287+- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
3928839288+- goto done;
3928939289++ if (seq < (g1_start >> 1))
3929039290++ break;
3929139291+3929239292+ /* TODO Back off. */
3929339293+3929439294+ /* Reload signals. See above for MO. */
3929539295+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3929639296++ g1_start = __condvar_load_g1_start_relaxed (cond);
3929739297++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3929839298+ spin--;
3929939299+ }
3930039300+3930139301+- /* If our group will be closed as indicated by the flag on signals,
3930239302+- don't bother grabbing a signal. */
3930339303+- if (signals & 1)
3930439304+- goto done;
3930539305+-
3930639306+- /* If there is an available signal, don't block. */
3930739307+- if (signals != 0)
3930839308++ if (seq < (g1_start >> 1))
3930939309++ {
3931039310++ /* If the group is closed already,
3931139311++ then this waiter originally had enough extra signals to
3931239312++ consume, up until the time its group was closed. */
3931339313++ goto done;
3931439314++ }
3931539315++
3931639316++ /* If there is an available signal, don't block.
3931739317++ If __g1_start has advanced at all, then we must be in G1
3931839318++ by now, perhaps in the process of switching back to an older
3931939319++ G2, but in either case we're allowed to consume the available
3932039320++ signal and should not block anymore. */
3932139321++ if ((int)(signals - lowseq) >= 2)
3932239322+ break;
3932339323+3932439324+ /* No signals available after spinning, so prepare to block.
3932539325+ We first acquire a group reference and use acquire MO for that so
3932639326+ that we synchronize with the dummy read-modify-write in
3932739327+ __condvar_quiesce_and_switch_g1 if we read from that. In turn,
3932839328+- in this case this will make us see the closed flag on __g_signals
3932939329+- that designates a concurrent attempt to reuse the group's slot.
3933039330++ in this case this will make us see the advancement of __g_signals
3933139331++ to the upcoming new g1_start that occurs with a concurrent
3933239332++ attempt to reuse the group's slot.
3933339333+ We use acquire MO for the __g_signals check to make the
3933439334+ __g1_start check work (see spinning above).
3933539335+ Note that the group reference acquisition will not mask the
3933639336+@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3933739337+ an atomic read-modify-write operation and thus extend the release
3933839338+ sequence. */
3933939339+ atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
3934039340+- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
3934139341+- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
3934239342++ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3934339343++ g1_start = __condvar_load_g1_start_relaxed (cond);
3934439344++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3934539345++
3934639346++ if (seq < (g1_start >> 1))
3934739347+ {
3934839348+- /* Our group is closed. Wake up any signalers that might be
3934939349+- waiting. */
3935039350++ /* group is closed already, so don't block */
3935139351+ __condvar_dec_grefs (cond, g, private);
3935239352+ goto done;
3935339353+ }
3935439354+3935539355++ if ((int)(signals - lowseq) >= 2)
3935639356++ {
3935739357++ /* a signal showed up or G1/G2 switched after we grabbed the refcount */
3935839358++ __condvar_dec_grefs (cond, g, private);
3935939359++ break;
3936039360++ }
3936139361++
3936239362+ // Now block.
3936339363+ struct _pthread_cleanup_buffer buffer;
3936439364+ struct _condvar_cleanup_buffer cbuffer;
3936539365+@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3936639366+ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
3936739367+3936839368+ err = __futex_abstimed_wait_cancelable64 (
3936939369+- cond->__data.__g_signals + g, 0, clockid, abstime, private);
3937039370++ cond->__data.__g_signals + g, signals, clockid, abstime, private);
3937139371+3937239372+ __pthread_cleanup_pop (&buffer, 0);
3937339373+3937439374+@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3937539375+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3937639376+ }
3937739377+3937839378++ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
3937939379++ goto done;
3938039380+ }
3938139381+ /* Try to grab a signal. Use acquire MO so that we see an up-to-date value
3938239382+ of __g1_start below (see spinning above for a similar case). In
3938339383+@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3938439384+ while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
3938539385+ &signals, signals - 2));
3938639386+3938739387+- /* We consumed a signal but we could have consumed from a more recent group
3938839388+- that aliased with ours due to being in the same group slot. If this
3938939389+- might be the case our group must be closed as visible through
3939039390+- __g1_start. */
3939139391+- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3939239392+- if (seq < (g1_start >> 1))
3939339393+- {
3939439394+- /* We potentially stole a signal from a more recent group but we do not
3939539395+- know which group we really consumed from.
3939639396+- We do not care about groups older than current G1 because they are
3939739397+- closed; we could have stolen from these, but then we just add a
3939839398+- spurious wake-up for the current groups.
3939939399+- We will never steal a signal from current G2 that was really intended
3940039400+- for G2 because G2 never receives signals (until it becomes G1). We
3940139401+- could have stolen a signal from G2 that was conservatively added by a
3940239402+- previous waiter that also thought it stole a signal -- but given that
3940339403+- that signal was added unnecessarily, it's not a problem if we steal
3940439404+- it.
3940539405+- Thus, the remaining case is that we could have stolen from the current
3940639406+- G1, where "current" means the __g1_start value we observed. However,
3940739407+- if the current G1 does not have the same slot index as we do, we did
3940839408+- not steal from it and do not need to undo that. This is the reason
3940939409+- for putting a bit with G2's index into__g1_start as well. */
3941039410+- if (((g1_start & 1) ^ 1) == g)
3941139411+- {
3941239412+- /* We have to conservatively undo our potential mistake of stealing
3941339413+- a signal. We can stop trying to do that when the current G1
3941439414+- changes because other spinning waiters will notice this too and
3941539415+- __condvar_quiesce_and_switch_g1 has checked that there are no
3941639416+- futex waiters anymore before switching G1.
3941739417+- Relaxed MO is fine for the __g1_start load because we need to
3941839418+- merely be able to observe this fact and not have to observe
3941939419+- something else as well.
3942039420+- ??? Would it help to spin for a little while to see whether the
3942139421+- current G1 gets closed? This might be worthwhile if the group is
3942239422+- small or close to being closed. */
3942339423+- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
3942439424+- while (__condvar_load_g1_start_relaxed (cond) == g1_start)
3942539425+- {
3942639426+- /* Try to add a signal. We don't need to acquire the lock
3942739427+- because at worst we can cause a spurious wake-up. If the
3942839428+- group is in the process of being closed (LSB is true), this
3942939429+- has an effect similar to us adding a signal. */
3943039430+- if (((s & 1) != 0)
3943139431+- || atomic_compare_exchange_weak_relaxed
3943239432+- (cond->__data.__g_signals + g, &s, s + 2))
3943339433+- {
3943439434+- /* If we added a signal, we also need to add a wake-up on
3943539435+- the futex. We also need to do that if we skipped adding
3943639436+- a signal because the group is being closed because
3943739437+- while __condvar_quiesce_and_switch_g1 could have closed
3943839438+- the group, it might still be waiting for futex waiters to
3943939439+- leave (and one of those waiters might be the one we stole
3944039440+- the signal from, which cause it to block using the
3944139441+- futex). */
3944239442+- futex_wake (cond->__data.__g_signals + g, 1, private);
3944339443+- break;
3944439444+- }
3944539445+- /* TODO Back off. */
3944639446+- }
3944739447+- }
3944839448+- }
3944939449+-
3945039450+ done:
3945139451+3945239452+ /* Confirm that we have been woken. We do that before acquiring the mutex
3945339453+3945439454+commit 88d999d840e77c9917f08870094a23ce42294848
3945539455+Author: Malte Skarupke <malteskarupke@fastmail.fm>
3945639456+Date: Wed Dec 4 07:55:22 2024 -0500
3945739457+3945839458+ nptl: Update comments and indentation for new condvar implementation
3945939459+3946039460+ Some comments were wrong after the most recent commit. This fixes that.
3946139461+3946239462+ Also fixing indentation where it was using spaces instead of tabs.
3946339463+3946439464+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
3946539465+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3946639466+ (cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3)
3946739467+3946839468+diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
3946939469+index 4855b8899f..3475d15123 100644
3947039470+--- a/nptl/pthread_cond_common.c
3947139471++++ b/nptl/pthread_cond_common.c
3947239472+@@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3947339473+ * New waiters arriving concurrently with the group switching will all go
3947439474+ into G2 until we atomically make the switch. Waiters existing in G2
3947539475+ are not affected.
3947639476+- * Waiters in G1 will be closed out immediately by the advancing of
3947739477+- __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
3947839478++ * Waiters in G1 have already received a signal and been woken. If they
3947939479++ haven't woken yet, they will be closed out immediately by the advancing
3948039480++ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
3948139481+ which will prevent waiters from blocking using a futex on
3948239482+ __g_signals since it provides enough signals for all possible
3948339483+ remaining waiters. As a result, they can each consume a signal
3948439484+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
3948539485+index 3d290e39c8..ad2cee7d59 100644
3948639486+--- a/nptl/pthread_cond_wait.c
3948739487++++ b/nptl/pthread_cond_wait.c
3948839488+@@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg)
3948939489+ figure out whether they are in a group that has already been completely
3949039490+ signaled (i.e., if the current G1 starts at a later position that the
3949139491+ waiter's position). Waiters cannot determine whether they are currently
3949239492+- in G2 or G1 -- but they do not have too because all they are interested in
3949339493++ in G2 or G1 -- but they do not have to because all they are interested in
3949439494+ is whether there are available signals, and they always start in G2 (whose
3949539495+ group slot they know because of the bit in the waiter sequence. Signalers
3949639496+ will simply fill the right group until it is completely signaled and can
3949739497+@@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3949839498+ }
3949939499+3950039500+ /* Now wait until a signal is available in our group or it is closed.
3950139501+- Acquire MO so that if we observe a value of zero written after group
3950239502++ Acquire MO so that if we observe (signals == lowseq) after group
3950339503+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
3950439504+ store and will see the prior update of __g1_start done while switching
3950539505+ groups too. */
3950639506+@@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3950739507+ {
3950839508+ while (1)
3950939509+ {
3951039510+- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3951139511+- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3951239512++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3951339513++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3951439514+3951539515+ /* Spin-wait first.
3951639516+ Note that spinning first without checking whether a timeout
3951739517+@@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3951839518+3951939519+ /* Reload signals. See above for MO. */
3952039520+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3952139521+- g1_start = __condvar_load_g1_start_relaxed (cond);
3952239522+- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3952339523++ g1_start = __condvar_load_g1_start_relaxed (cond);
3952439524++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3952539525+ spin--;
3952639526+ }
3952739527+3952839528+- if (seq < (g1_start >> 1))
3952939529++ if (seq < (g1_start >> 1))
3953039530+ {
3953139531+- /* If the group is closed already,
3953239532++ /* If the group is closed already,
3953339533+ then this waiter originally had enough extra signals to
3953439534+ consume, up until the time its group was closed. */
3953539535+ goto done;
3953639536+- }
3953739537++ }
3953839538+3953939539+ /* If there is an available signal, don't block.
3954039540+- If __g1_start has advanced at all, then we must be in G1
3954139541++ If __g1_start has advanced at all, then we must be in G1
3954239542+ by now, perhaps in the process of switching back to an older
3954339543+ G2, but in either case we're allowed to consume the available
3954439544+ signal and should not block anymore. */
3954539545+@@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3954639546+ sequence. */
3954739547+ atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
3954839548+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3954939549+- g1_start = __condvar_load_g1_start_relaxed (cond);
3955039550+- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3955139551++ g1_start = __condvar_load_g1_start_relaxed (cond);
3955239552++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3955339553+3955439554+- if (seq < (g1_start >> 1))
3955539555++ if (seq < (g1_start >> 1))
3955639556+ {
3955739557+- /* group is closed already, so don't block */
3955839558++ /* group is closed already, so don't block */
3955939559+ __condvar_dec_grefs (cond, g, private);
3956039560+ goto done;
3956139561+ }
3956239562+3956339563+ if ((int)(signals - lowseq) >= 2)
3956439564+ {
3956539565+- /* a signal showed up or G1/G2 switched after we grabbed the refcount */
3956639566++ /* a signal showed up or G1/G2 switched after we grabbed the
3956739567++ refcount */
3956839568+ __condvar_dec_grefs (cond, g, private);
3956939569+ break;
3957039570+- }
3957139571++ }
3957239572+3957339573+ // Now block.
3957439574+ struct _pthread_cleanup_buffer buffer;
3957539575+@@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3957639576+ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
3957739577+ goto done;
3957839578+ }
3957939579+- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value
3958039580+- of __g1_start below (see spinning above for a similar case). In
3958139581+- particular, if we steal from a more recent group, we will also see a
3958239582+- more recent __g1_start below. */
3958339583++ /* Try to grab a signal. See above for MO. (if we do another loop
3958439584++ iteration we need to see the correct value of g1_start) */
3958539585+ while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
3958639586+ &signals, signals - 2));
3958739587+3958839588+3958939589+commit 136a29f9d0a3924828d5a16be82d054637517c95
3959039590+Author: Malte Skarupke <malteskarupke@fastmail.fm>
3959139591+Date: Wed Dec 4 07:55:50 2024 -0500
3959239592+3959339593+ nptl: Remove unnecessary catch-all-wake in condvar group switch
3959439594+3959539595+ This wake is unnecessary. We only switch groups after every sleeper in a group
3959639596+ has been woken. Sure, they may take a while to actually wake up and may still
3959739597+ hold a reference, but waking them a second time doesn't speed that up. Instead
3959839598+ this just makes the code more complicated and may hide problems.
3959939599+3960039600+ In particular this safety wake wouldn't even have helped with the bug that was
3960139601+ fixed by Barrus' patch: The bug there was that pthread_cond_signal would not
3960239602+ switch g1 when it should, so we wouldn't even have entered this code path.
3960339603+3960439604+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
3960539605+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3960639606+ (cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e)
3960739607+3960839608+diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
3960939609+index 3475d15123..30b8eee149 100644
3961039610+--- a/nptl/pthread_cond_common.c
3961139611++++ b/nptl/pthread_cond_common.c
3961239612+@@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3961339613+ * New waiters arriving concurrently with the group switching will all go
3961439614+ into G2 until we atomically make the switch. Waiters existing in G2
3961539615+ are not affected.
3961639616+- * Waiters in G1 have already received a signal and been woken. If they
3961739617+- haven't woken yet, they will be closed out immediately by the advancing
3961839618+- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
3961939619+- which will prevent waiters from blocking using a futex on
3962039620+- __g_signals since it provides enough signals for all possible
3962139621+- remaining waiters. As a result, they can each consume a signal
3962239622+- and they will eventually remove their group reference. */
3962339623++ * Waiters in G1 have already received a signal and been woken. */
3962439624+3962539625+ /* Update __g1_start, which finishes closing this group. The value we add
3962639626+ will never be negative because old_orig_size can only be zero when we
3962739627+@@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
3962839628+3962939629+ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
3963039630+3963139631+- /* If any waiters still hold group references (and thus could be blocked),
3963239632+- then wake them all up now and prevent any running ones from blocking.
3963339633+- This is effectively a catch-all for any possible current or future
3963439634+- bugs that can allow the group size to reach 0 before all G1 waiters
3963539635+- have been awakened or at least given signals to consume, or any
3963639636+- other case that can leave blocked (or about to block) older waiters.. */
3963739637+- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
3963839638+- {
3963939639+- /* First advance signals to the end of the group (i.e. enough signals
3964039640+- for the entire G1 group) to ensure that waiters which have not
3964139641+- yet blocked in the futex will not block.
3964239642+- Note that in the vast majority of cases, this should never
3964339643+- actually be necessary, since __g_signals will have enough
3964439644+- signals for the remaining g_refs waiters. As an optimization,
3964539645+- we could check this first before proceeding, although that
3964639646+- could still leave the potential for futex lost wakeup bugs
3964739647+- if the signal count was non-zero but the futex wakeup
3964839648+- was somehow lost. */
3964939649+- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
3965039650+-
3965139651+- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
3965239652+- }
3965339653+-
3965439654+ /* At this point, the old G1 is now a valid new G2 (but not in use yet).
3965539655+ No old waiter can neither grab a signal nor acquire a reference without
3965639656+ noticing that __g1_start is larger.
3965739657+3965839658+commit 2a259b6d77dc5bdab5c8f4ee0e69572d5699d4bf
3965939659+Author: Malte Skarupke <malteskarupke@fastmail.fm>
3966039660+Date: Wed Dec 4 07:56:13 2024 -0500
3966139661+3966239662+ nptl: Remove unnecessary quadruple check in pthread_cond_wait
3966339663+3966439664+ pthread_cond_wait was checking whether it was in a closed group no less than
3966539665+ four times. Checking once is enough. Here are the four checks:
3966639666+3966739667+ 1. While spin-waiting. This was dead code: maxspin is set to 0 and has been
3966839668+ for years.
3966939669+ 2. Before deciding to go to sleep, and before incrementing grefs: I kept this.
3967039670+ 3. After incrementing grefs. There is no reason to think that the group would
3967139671+ close while we do an atomic increment. Obviously it could close at any
3967239672+ point, but that doesn't mean we have to recheck after every step. This
3967339673+ check was equally good as check 2, except it has to do more work.
3967439674+ 4. When we find ourselves in a group that has a signal. We only get here after
3967539675+ we check that we're not in a closed group. There is no need to check again.
3967639676+ The check would only have helped in cases where the compare_exchange in the
3967739677+ next line would also have failed. Relying on the compare_exchange is fine.
3967839678+3967939679+ Removing the duplicate checks clarifies the code.
3968039680+3968139681+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
3968239682+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3968339683+ (cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1)
3968439684+3968539685+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
3968639686+index ad2cee7d59..cfdd13bb87 100644
3968739687+--- a/nptl/pthread_cond_wait.c
3968839688++++ b/nptl/pthread_cond_wait.c
3968939689+@@ -366,7 +366,6 @@ static __always_inline int
3969039690+ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3969139691+ clockid_t clockid, const struct __timespec64 *abstime)
3969239692+ {
3969339693+- const int maxspin = 0;
3969439694+ int err;
3969539695+ int result = 0;
3969639696+3969739697+@@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3969839698+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3969939699+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3970039700+3970139701+- /* Spin-wait first.
3970239702+- Note that spinning first without checking whether a timeout
3970339703+- passed might lead to what looks like a spurious wake-up even
3970439704+- though we should return ETIMEDOUT (e.g., if the caller provides
3970539705+- an absolute timeout that is clearly in the past). However,
3970639706+- (1) spurious wake-ups are allowed, (2) it seems unlikely that a
3970739707+- user will (ab)use pthread_cond_wait as a check for whether a
3970839708+- point in time is in the past, and (3) spinning first without
3970939709+- having to compare against the current time seems to be the right
3971039710+- choice from a performance perspective for most use cases. */
3971139711+- unsigned int spin = maxspin;
3971239712+- while (spin > 0 && ((int)(signals - lowseq) < 2))
3971339713+- {
3971439714+- /* Check that we are not spinning on a group that's already
3971539715+- closed. */
3971639716+- if (seq < (g1_start >> 1))
3971739717+- break;
3971839718+-
3971939719+- /* TODO Back off. */
3972039720+-
3972139721+- /* Reload signals. See above for MO. */
3972239722+- signals = atomic_load_acquire (cond->__data.__g_signals + g);
3972339723+- g1_start = __condvar_load_g1_start_relaxed (cond);
3972439724+- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3972539725+- spin--;
3972639726+- }
3972739727+-
3972839728+ if (seq < (g1_start >> 1))
3972939729+ {
3973039730+ /* If the group is closed already,
3973139731+@@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3973239732+ an atomic read-modify-write operation and thus extend the release
3973339733+ sequence. */
3973439734+ atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
3973539735+- signals = atomic_load_acquire (cond->__data.__g_signals + g);
3973639736+- g1_start = __condvar_load_g1_start_relaxed (cond);
3973739737+- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3973839738+-
3973939739+- if (seq < (g1_start >> 1))
3974039740+- {
3974139741+- /* group is closed already, so don't block */
3974239742+- __condvar_dec_grefs (cond, g, private);
3974339743+- goto done;
3974439744+- }
3974539745+-
3974639746+- if ((int)(signals - lowseq) >= 2)
3974739747+- {
3974839748+- /* a signal showed up or G1/G2 switched after we grabbed the
3974939749+- refcount */
3975039750+- __condvar_dec_grefs (cond, g, private);
3975139751+- break;
3975239752+- }
3975339753+3975439754+ // Now block.
3975539755+ struct _pthread_cleanup_buffer buffer;
3975639756+@@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3975739757+ /* Reload signals. See above for MO. */
3975839758+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3975939759+ }
3976039760+-
3976139761+- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
3976239762+- goto done;
3976339763+ }
3976439764+ /* Try to grab a signal. See above for MO. (if we do another loop
3976539765+ iteration we need to see the correct value of g1_start) */
3976639766+3976739767+commit a2465f4293ecc37ac4650fbd02e517bc6fd801c6
3976839768+Author: Malte Skarupke <malteskarupke@fastmail.fm>
3976939769+Date: Wed Dec 4 07:56:38 2024 -0500
3977039770+3977139771+ nptl: Remove g_refs from condition variables
3977239772+3977339773+ This variable used to be needed to wait in group switching until all sleepers
3977439774+ have confirmed that they have woken. This is no longer needed. Nothing waits
3977539775+ on this variable so there is no need to track how many threads are currently
3977639776+ asleep in each group.
3977739777+3977839778+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
3977939779+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3978039780+ (cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c)
3978139781+3978239782+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
3978339783+index cfdd13bb87..411fc0380b 100644
3978439784+--- a/nptl/pthread_cond_wait.c
3978539785++++ b/nptl/pthread_cond_wait.c
3978639786+@@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
3978739787+ }
3978839788+ }
3978939789+3979039790+-/* Wake up any signalers that might be waiting. */
3979139791+-static void
3979239792+-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
3979339793+-{
3979439794+- /* Release MO to synchronize-with the acquire load in
3979539795+- __condvar_quiesce_and_switch_g1. */
3979639796+- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
3979739797+- {
3979839798+- /* Clear the wake-up request flag before waking up. We do not need more
3979939799+- than relaxed MO and it doesn't matter if we apply this for an aliased
3980039800+- group because we wake all futex waiters right after clearing the
3980139801+- flag. */
3980239802+- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
3980339803+- futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
3980439804+- }
3980539805+-}
3980639806+-
3980739807+ /* Clean-up for cancellation of waiters waiting for normal signals. We cancel
3980839808+ our registration as a waiter, confirm we have woken up, and re-acquire the
3980939809+ mutex. */
3981039810+@@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg)
3981139811+ pthread_cond_t *cond = cbuffer->cond;
3981239812+ unsigned g = cbuffer->wseq & 1;
3981339813+3981439814+- __condvar_dec_grefs (cond, g, cbuffer->private);
3981539815+-
3981639816+ __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
3981739817+ /* FIXME With the current cancellation implementation, it is possible that
3981839818+ a thread is cancelled after it has returned from a syscall. This could
3981939819+@@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg)
3982039820+ sufficient because if a waiter can see a sufficiently large value, it could
3982139821+ have also consume a signal in the waiters group.
3982239822+3982339823+- It is essential that the last field in pthread_cond_t is __g_signals[1]:
3982439824+- The previous condvar used a pointer-sized field in pthread_cond_t, so a
3982539825+- PTHREAD_COND_INITIALIZER from that condvar implementation might only
3982639826+- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
3982739827+- in total instead of the 48 we need). __g_signals[1] is not accessed before
3982839828+- the first group switch (G2 starts at index 0), which will set its value to
3982939829+- zero after a harmless fetch-or whose return value is ignored. This
3983039830+- effectively completes initialization.
3983139831+-
3983239832+3983339833+ Limitations:
3983439834+ * This condvar isn't designed to allow for more than
3983539835+@@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3983639836+ if ((int)(signals - lowseq) >= 2)
3983739837+ break;
3983839838+3983939839+- /* No signals available after spinning, so prepare to block.
3984039840+- We first acquire a group reference and use acquire MO for that so
3984139841+- that we synchronize with the dummy read-modify-write in
3984239842+- __condvar_quiesce_and_switch_g1 if we read from that. In turn,
3984339843+- in this case this will make us see the advancement of __g_signals
3984439844+- to the upcoming new g1_start that occurs with a concurrent
3984539845+- attempt to reuse the group's slot.
3984639846+- We use acquire MO for the __g_signals check to make the
3984739847+- __g1_start check work (see spinning above).
3984839848+- Note that the group reference acquisition will not mask the
3984939849+- release MO when decrementing the reference count because we use
3985039850+- an atomic read-modify-write operation and thus extend the release
3985139851+- sequence. */
3985239852+- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
3985339853+-
3985439854+ // Now block.
3985539855+ struct _pthread_cleanup_buffer buffer;
3985639856+ struct _condvar_cleanup_buffer cbuffer;
3985739857+@@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3985839858+3985939859+ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
3986039860+ {
3986139861+- __condvar_dec_grefs (cond, g, private);
3986239862+- /* If we timed out, we effectively cancel waiting. Note that
3986339863+- we have decremented __g_refs before cancellation, so that a
3986439864+- deadlock between waiting for quiescence of our group in
3986539865+- __condvar_quiesce_and_switch_g1 and us trying to acquire
3986639866+- the lock during cancellation is not possible. */
3986739867++ /* If we timed out, we effectively cancel waiting. */
3986839868+ __condvar_cancel_waiting (cond, seq, g, private);
3986939869+ result = err;
3987039870+ goto done;
3987139871+ }
3987239872+- else
3987339873+- __condvar_dec_grefs (cond, g, private);
3987439874+3987539875+ /* Reload signals. See above for MO. */
3987639876+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
3987739877+diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
3987839878+index 1336e9c79d..bdcb45c536 100644
3987939879+--- a/nptl/tst-cond22.c
3988039880++++ b/nptl/tst-cond22.c
3988139881+@@ -106,13 +106,13 @@ do_test (void)
3988239882+ status = 1;
3988339883+ }
3988439884+3988539885+- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
3988639886++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
3988739887+ c.__data.__wseq.__value32.__high,
3988839888+ c.__data.__wseq.__value32.__low,
3988939889+ c.__data.__g1_start.__value32.__high,
3989039890+ c.__data.__g1_start.__value32.__low,
3989139891+- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
3989239892+- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
3989339893++ c.__data.__g_signals[0], c.__data.__g_size[0],
3989439894++ c.__data.__g_signals[1], c.__data.__g_size[1],
3989539895+ c.__data.__g1_orig_size, c.__data.__wrefs);
3989639896+3989739897+ if (pthread_create (&th, NULL, tf, (void *) 1l) != 0)
3989839898+@@ -152,13 +152,13 @@ do_test (void)
3989939899+ status = 1;
3990039900+ }
3990139901+3990239902+- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
3990339903++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
3990439904+ c.__data.__wseq.__value32.__high,
3990539905+ c.__data.__wseq.__value32.__low,
3990639906+ c.__data.__g1_start.__value32.__high,
3990739907+ c.__data.__g1_start.__value32.__low,
3990839908+- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
3990939909+- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
3991039910++ c.__data.__g_signals[0], c.__data.__g_size[0],
3991139911++ c.__data.__g_signals[1], c.__data.__g_size[1],
3991239912+ c.__data.__g1_orig_size, c.__data.__wrefs);
3991339913+3991439914+ return status;
3991539915+diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
3991639916+index df54eef6f7..a3d482f80f 100644
3991739917+--- a/sysdeps/nptl/bits/thread-shared-types.h
3991839918++++ b/sysdeps/nptl/bits/thread-shared-types.h
3991939919+@@ -95,8 +95,7 @@ struct __pthread_cond_s
3992039920+ {
3992139921+ __atomic_wide_counter __wseq;
3992239922+ __atomic_wide_counter __g1_start;
3992339923+- unsigned int __g_refs[2] __LOCK_ALIGNMENT;
3992439924+- unsigned int __g_size[2];
3992539925++ unsigned int __g_size[2] __LOCK_ALIGNMENT;
3992639926+ unsigned int __g1_orig_size;
3992739927+ unsigned int __wrefs;
3992839928+ unsigned int __g_signals[2];
3992939929+diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
3993039930+index 3d4f4a756c..9af75d6eae 100644
3993139931+--- a/sysdeps/nptl/pthread.h
3993239932++++ b/sysdeps/nptl/pthread.h
3993339933+@@ -152,7 +152,7 @@ enum
3993439934+3993539935+3993639936+ /* Conditional variable handling. */
3993739937+-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } }
3993839938++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
3993939939+3994039940+3994139941+ /* Cleanup buffers */
3994239942+3994339943+commit fa110993a6390ae5c97dff613ef02b59ec78c5da
3994439944+Author: Malte Skarupke <malteskarupke@fastmail.fm>
3994539945+Date: Wed Dec 4 08:03:44 2024 -0500
3994639946+3994739947+ nptl: Use a single loop in pthread_cond_wait instaed of a nested loop
3994839948+3994939949+ The loop was a little more complicated than necessary. There was only one
3995039950+ break statement out of the inner loop, and the outer loop was nearly empty.
3995139951+ So just remove the outer loop, moving its code to the one break statement in
3995239952+ the inner loop. This allows us to replace all gotos with break statements.
3995339953+3995439954+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
3995539955+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
3995639956+ (cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674)
3995739957+3995839958+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
3995939959+index 411fc0380b..683cb2b133 100644
3996039960+--- a/nptl/pthread_cond_wait.c
3996139961++++ b/nptl/pthread_cond_wait.c
3996239962+@@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3996339963+ return err;
3996439964+ }
3996539965+3996639966+- /* Now wait until a signal is available in our group or it is closed.
3996739967+- Acquire MO so that if we observe (signals == lowseq) after group
3996839968+- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
3996939969+- store and will see the prior update of __g1_start done while switching
3997039970+- groups too. */
3997139971+- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
3997239972+-
3997339973+- do
3997439974+- {
3997539975++
3997639976+ while (1)
3997739977+ {
3997839978++ /* Now wait until a signal is available in our group or it is closed.
3997939979++ Acquire MO so that if we observe (signals == lowseq) after group
3998039980++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
3998139981++ store and will see the prior update of __g1_start done while switching
3998239982++ groups too. */
3998339983++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
3998439984+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
3998539985+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
3998639986+3998739987+@@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3998839988+ /* If the group is closed already,
3998939989+ then this waiter originally had enough extra signals to
3999039990+ consume, up until the time its group was closed. */
3999139991+- goto done;
3999239992++ break;
3999339993+ }
3999439994+3999539995+ /* If there is an available signal, don't block.
3999639996+@@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
3999739997+ G2, but in either case we're allowed to consume the available
3999839998+ signal and should not block anymore. */
3999939999+ if ((int)(signals - lowseq) >= 2)
4000040000+- break;
4000140001++ {
4000240002++ /* Try to grab a signal. See above for MO. (if we do another loop
4000340003++ iteration we need to see the correct value of g1_start) */
4000440004++ if (atomic_compare_exchange_weak_acquire (
4000540005++ cond->__data.__g_signals + g,
4000640006++ &signals, signals - 2))
4000740007++ break;
4000840008++ else
4000940009++ continue;
4001040010++ }
4001140011+4001240012+ // Now block.
4001340013+ struct _pthread_cleanup_buffer buffer;
4001440014+@@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4001540015+ /* If we timed out, we effectively cancel waiting. */
4001640016+ __condvar_cancel_waiting (cond, seq, g, private);
4001740017+ result = err;
4001840018+- goto done;
4001940019++ break;
4002040020+ }
4002140021+-
4002240022+- /* Reload signals. See above for MO. */
4002340023+- signals = atomic_load_acquire (cond->__data.__g_signals + g);
4002440024+ }
4002540025+- }
4002640026+- /* Try to grab a signal. See above for MO. (if we do another loop
4002740027+- iteration we need to see the correct value of g1_start) */
4002840028+- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
4002940029+- &signals, signals - 2));
4003040030+-
4003140031+- done:
4003240032+4003340033+ /* Confirm that we have been woken. We do that before acquiring the mutex
4003440034+ to allow for execution of pthread_cond_destroy while having acquired the
4003540035+4003640036+commit afbf0d46850dcd1b626d892ad8fde2162067ddc7
4003740037+Author: Malte Skarupke <malteskarupke@fastmail.fm>
4003840038+Date: Wed Dec 4 08:04:10 2024 -0500
4003940039+4004040040+ nptl: Fix indentation
4004140041+4004240042+ In my previous change I turned a nested loop into a simple loop. I'm doing
4004340043+ the resulting indentation changes in a separate commit to make the diff on
4004440044+ the previous commit easier to review.
4004540045+4004640046+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
4004740047+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4004840048+ (cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da)
4004940049+4005040050+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
4005140051+index 683cb2b133..7fc9dadf15 100644
4005240052+--- a/nptl/pthread_cond_wait.c
4005340053++++ b/nptl/pthread_cond_wait.c
4005440054+@@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4005540055+ }
4005640056+4005740057+4005840058+- while (1)
4005940059+- {
4006040060+- /* Now wait until a signal is available in our group or it is closed.
4006140061+- Acquire MO so that if we observe (signals == lowseq) after group
4006240062+- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
4006340063+- store and will see the prior update of __g1_start done while switching
4006440064+- groups too. */
4006540065+- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
4006640066+- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
4006740067+- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
4006840068+-
4006940069+- if (seq < (g1_start >> 1))
4007040070+- {
4007140071+- /* If the group is closed already,
4007240072+- then this waiter originally had enough extra signals to
4007340073+- consume, up until the time its group was closed. */
4007440074+- break;
4007540075+- }
4007640076+-
4007740077+- /* If there is an available signal, don't block.
4007840078+- If __g1_start has advanced at all, then we must be in G1
4007940079+- by now, perhaps in the process of switching back to an older
4008040080+- G2, but in either case we're allowed to consume the available
4008140081+- signal and should not block anymore. */
4008240082+- if ((int)(signals - lowseq) >= 2)
4008340083+- {
4008440084+- /* Try to grab a signal. See above for MO. (if we do another loop
4008540085+- iteration we need to see the correct value of g1_start) */
4008640086+- if (atomic_compare_exchange_weak_acquire (
4008740087+- cond->__data.__g_signals + g,
4008840088++ while (1)
4008940089++ {
4009040090++ /* Now wait until a signal is available in our group or it is closed.
4009140091++ Acquire MO so that if we observe (signals == lowseq) after group
4009240092++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
4009340093++ store and will see the prior update of __g1_start done while switching
4009440094++ groups too. */
4009540095++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
4009640096++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
4009740097++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
4009840098++
4009940099++ if (seq < (g1_start >> 1))
4010040100++ {
4010140101++ /* If the group is closed already,
4010240102++ then this waiter originally had enough extra signals to
4010340103++ consume, up until the time its group was closed. */
4010440104++ break;
4010540105++ }
4010640106++
4010740107++ /* If there is an available signal, don't block.
4010840108++ If __g1_start has advanced at all, then we must be in G1
4010940109++ by now, perhaps in the process of switching back to an older
4011040110++ G2, but in either case we're allowed to consume the available
4011140111++ signal and should not block anymore. */
4011240112++ if ((int)(signals - lowseq) >= 2)
4011340113++ {
4011440114++ /* Try to grab a signal. See above for MO. (if we do another loop
4011540115++ iteration we need to see the correct value of g1_start) */
4011640116++ if (atomic_compare_exchange_weak_acquire (
4011740117++ cond->__data.__g_signals + g,
4011840118+ &signals, signals - 2))
4011940119+- break;
4012040120+- else
4012140121+- continue;
4012240122+- }
4012340123+-
4012440124+- // Now block.
4012540125+- struct _pthread_cleanup_buffer buffer;
4012640126+- struct _condvar_cleanup_buffer cbuffer;
4012740127+- cbuffer.wseq = wseq;
4012840128+- cbuffer.cond = cond;
4012940129+- cbuffer.mutex = mutex;
4013040130+- cbuffer.private = private;
4013140131+- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
4013240132+-
4013340133+- err = __futex_abstimed_wait_cancelable64 (
4013440134+- cond->__data.__g_signals + g, signals, clockid, abstime, private);
4013540135+-
4013640136+- __pthread_cleanup_pop (&buffer, 0);
4013740137+-
4013840138+- if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
4013940139+- {
4014040140+- /* If we timed out, we effectively cancel waiting. */
4014140141+- __condvar_cancel_waiting (cond, seq, g, private);
4014240142+- result = err;
4014340143+ break;
4014440144+- }
4014540145++ else
4014640146++ continue;
4014740147+ }
4014840148+4014940149++ // Now block.
4015040150++ struct _pthread_cleanup_buffer buffer;
4015140151++ struct _condvar_cleanup_buffer cbuffer;
4015240152++ cbuffer.wseq = wseq;
4015340153++ cbuffer.cond = cond;
4015440154++ cbuffer.mutex = mutex;
4015540155++ cbuffer.private = private;
4015640156++ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
4015740157++
4015840158++ err = __futex_abstimed_wait_cancelable64 (
4015940159++ cond->__data.__g_signals + g, signals, clockid, abstime, private);
4016040160++
4016140161++ __pthread_cleanup_pop (&buffer, 0);
4016240162++
4016340163++ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
4016440164++ {
4016540165++ /* If we timed out, we effectively cancel waiting. */
4016640166++ __condvar_cancel_waiting (cond, seq, g, private);
4016740167++ result = err;
4016840168++ break;
4016940169++ }
4017040170++ }
4017140171++
4017240172+ /* Confirm that we have been woken. We do that before acquiring the mutex
4017340173+ to allow for execution of pthread_cond_destroy while having acquired the
4017440174+ mutex. */
4017540175+4017640176+commit 2ad69497346cc20ef4d568108f1de49b2f451c55
4017740177+Author: Malte Skarupke <malteskarupke@fastmail.fm>
4017840178+Date: Wed Dec 4 08:04:54 2024 -0500
4017940179+4018040180+ nptl: rename __condvar_quiesce_and_switch_g1
4018140181+4018240182+ This function no longer waits for threads to leave g1, so rename it to
4018340183+ __condvar_switch_g1
4018440184+4018540185+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
4018640186+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4018740187+ (cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867)
4018840188+4018940189+diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
4019040190+index aada91639a..38bba17bfc 100644
4019140191+--- a/nptl/pthread_cond_broadcast.c
4019240192++++ b/nptl/pthread_cond_broadcast.c
4019340193+@@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
4019440194+ cond->__data.__g_size[g1] << 1);
4019540195+ cond->__data.__g_size[g1] = 0;
4019640196+4019740197+- /* We need to wake G1 waiters before we quiesce G1 below. */
4019840198++ /* We need to wake G1 waiters before we switch G1 below. */
4019940199+ /* TODO Only set it if there are indeed futex waiters. We could
4020040200+ also try to move this out of the critical section in cases when
4020140201+ G2 is empty (and we don't need to quiesce). */
4020240202+@@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
4020340203+4020440204+ /* G1 is complete. Step (2) is next unless there are no waiters in G2, in
4020540205+ which case we can stop. */
4020640206+- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
4020740207++ if (__condvar_switch_g1 (cond, wseq, &g1, private))
4020840208+ {
4020940209+ /* Step (3): Send signals to all waiters in the old G2 / new G1. */
4021040210+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
4021140211+diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
4021240212+index 30b8eee149..5044273cc2 100644
4021340213+--- a/nptl/pthread_cond_common.c
4021440214++++ b/nptl/pthread_cond_common.c
4021540215+@@ -189,16 +189,15 @@ __condvar_get_private (int flags)
4021640216+ return FUTEX_SHARED;
4021740217+ }
4021840218+4021940219+-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
4022040220+- leave G1, converts G1 into a fresh G2, and then switches group roles so that
4022140221+- the former G2 becomes the new G1 ending at the current __wseq value when we
4022240222+- eventually make the switch (WSEQ is just an observation of __wseq by the
4022340223+- signaler).
4022440224++/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2,
4022540225++ and then switches group roles so that the former G2 becomes the new G1
4022640226++ ending at the current __wseq value when we eventually make the switch
4022740227++ (WSEQ is just an observation of __wseq by the signaler).
4022840228+ If G2 is empty, it will not switch groups because then it would create an
4022940229+ empty G1 which would require switching groups again on the next signal.
4023040230+ Returns false iff groups were not switched because G2 was empty. */
4023140231+ static bool __attribute__ ((unused))
4023240232+-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4023340233++__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4023440234+ unsigned int *g1index, int private)
4023540235+ {
4023640236+ unsigned int g1 = *g1index;
4023740237+@@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4023840238+ + cond->__data.__g_size[g1 ^ 1]) == 0)
4023940239+ return false;
4024040240+4024140241+- /* Now try to close and quiesce G1. We have to consider the following kinds
4024240242+- of waiters:
4024340243++ /* We have to consider the following kinds of waiters:
4024440244+ * Waiters from less recent groups than G1 are not affected because
4024540245+ nothing will change for them apart from __g1_start getting larger.
4024640246+ * New waiters arriving concurrently with the group switching will all go
4024740247+@@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4024840248+ are not affected.
4024940249+ * Waiters in G1 have already received a signal and been woken. */
4025040250+4025140251+- /* Update __g1_start, which finishes closing this group. The value we add
4025240252+- will never be negative because old_orig_size can only be zero when we
4025340253+- switch groups the first time after a condvar was initialized, in which
4025440254+- case G1 will be at index 1 and we will add a value of 1.
4025540255+- Relaxed MO is fine because the change comes with no additional
4025640256+- constraints that others would have to observe. */
4025740257++ /* Update __g1_start, which closes this group. The value we add will never
4025840258++ be negative because old_orig_size can only be zero when we switch groups
4025940259++ the first time after a condvar was initialized, in which case G1 will be
4026040260++ at index 1 and we will add a value of 1. Relaxed MO is fine because the
4026140261++ change comes with no additional constraints that others would have to
4026240262++ observe. */
4026340263+ __condvar_add_g1_start_relaxed (cond,
4026440264+ (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
4026540265+4026640266+diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
4026740267+index 43d6286ecd..f095497142 100644
4026840268+--- a/nptl/pthread_cond_signal.c
4026940269++++ b/nptl/pthread_cond_signal.c
4027040270+@@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond)
4027140271+ bool do_futex_wake = false;
4027240272+4027340273+ /* If G1 is still receiving signals, we put the signal there. If not, we
4027440274+- check if G2 has waiters, and if so, quiesce and switch G1 to the former
4027540275+- G2; if this results in a new G1 with waiters (G2 might have cancellations
4027640276+- already, see __condvar_quiesce_and_switch_g1), we put the signal in the
4027740277+- new G1. */
4027840278++ check if G2 has waiters, and if so, switch G1 to the former G2; if this
4027940279++ results in a new G1 with waiters (G2 might have cancellations already,
4028040280++ see __condvar_switch_g1), we put the signal in the new G1. */
4028140281+ if ((cond->__data.__g_size[g1] != 0)
4028240282+- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
4028340283++ || __condvar_switch_g1 (cond, wseq, &g1, private))
4028440284+ {
4028540285+ /* Add a signal. Relaxed MO is fine because signaling does not need to
4028640286+- establish a happens-before relation (see above). We do not mask the
4028740287+- release-MO store when initializing a group in
4028840288+- __condvar_quiesce_and_switch_g1 because we use an atomic
4028940289+- read-modify-write and thus extend that store's release sequence. */
4029040290++ establish a happens-before relation (see above). We do not mask the
4029140291++ release-MO store when initializing a group in __condvar_switch_g1
4029240292++ because we use an atomic read-modify-write and thus extend that
4029340293++ store's release sequence. */
4029440294+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
4029540295+ cond->__data.__g_size[g1]--;
4029640296+ /* TODO Only set it if there are indeed futex waiters. */
4029740297+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
4029840298+index 7fc9dadf15..80bb728211 100644
4029940299+--- a/nptl/pthread_cond_wait.c
4030040300++++ b/nptl/pthread_cond_wait.c
4030140301+@@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4030240302+ because we do not need to establish any happens-before relation with
4030340303+ signalers (see __pthread_cond_signal); modification order alone
4030440304+ establishes a total order of waiters/signals. We do need acquire MO
4030540305+- to synchronize with group reinitialization in
4030640306+- __condvar_quiesce_and_switch_g1. */
4030740307++ to synchronize with group reinitialization in __condvar_switch_g1. */
4030840308+ uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
4030940309+ /* Find our group's index. We always go into what was G2 when we acquired
4031040310+ our position. */
4031140311+@@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4031240312+ {
4031340313+ /* Now wait until a signal is available in our group or it is closed.
4031440314+ Acquire MO so that if we observe (signals == lowseq) after group
4031540315+- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
4031640316+- store and will see the prior update of __g1_start done while switching
4031740317+- groups too. */
4031840318++ switching in __condvar_switch_g1, we synchronize with that store and
4031940319++ will see the prior update of __g1_start done while switching groups
4032040320++ too. */
4032140321+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
4032240322+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
4032340323+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
4032440324+4032540325+commit 7f71824b8039b8afc150dd5c881b61faf10675ef
4032640326+Author: Malte Skarupke <malteskarupke@fastmail.fm>
4032740327+Date: Wed Dec 4 08:05:40 2024 -0500
4032840328+4032940329+ nptl: Use all of g1_start and g_signals
4033040330+4033140331+ The LSB of g_signals was unused. The LSB of g1_start was used to indicate
4033240332+ which group is G2. This was used to always go to sleep in pthread_cond_wait
4033340333+ if a waiter is in G2. A comment earlier in the file says that this is not
4033440334+ correct to do:
4033540335+4033640336+ "Waiters cannot determine whether they are currently in G2 or G1 -- but they
4033740337+ do not have to because all they are interested in is whether there are
4033840338+ available signals"
4033940339+4034040340+ I either would have had to update the comment, or get rid of the check. I
4034140341+ chose to get rid of the check. In fact I don't quite know why it was there.
4034240342+ There will never be available signals for group G2, so we didn't need the
4034340343+ special case. Even if there were, this would just be a spurious wake. This
4034440344+ might have caught some cases where the count has wrapped around, but it
4034540345+ wouldn't reliably do that (and even if it did, why would you want to force a
4034640346+ sleep in that case?) and we don't support that many concurrent waiters
4034740347+ anyway. Getting rid of it allows us to use one more bit, making us more
4034840348+ robust to wraparound.
4034940349+4035040350+ Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
4035140351+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4035240352+ (cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706)
4035340353+4035440354+diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
4035540355+index 38bba17bfc..51afa62adf 100644
4035640356+--- a/nptl/pthread_cond_broadcast.c
4035740357++++ b/nptl/pthread_cond_broadcast.c
4035840358+@@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
4035940359+ {
4036040360+ /* Add as many signals as the remaining size of the group. */
4036140361+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
4036240362+- cond->__data.__g_size[g1] << 1);
4036340363++ cond->__data.__g_size[g1]);
4036440364+ cond->__data.__g_size[g1] = 0;
4036540365+4036640366+ /* We need to wake G1 waiters before we switch G1 below. */
4036740367+@@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
4036840368+ {
4036940369+ /* Step (3): Send signals to all waiters in the old G2 / new G1. */
4037040370+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
4037140371+- cond->__data.__g_size[g1] << 1);
4037240372++ cond->__data.__g_size[g1]);
4037340373+ cond->__data.__g_size[g1] = 0;
4037440374+ /* TODO Only set it if there are indeed futex waiters. */
4037540375+ do_futex_wake = true;
4037640376+diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
4037740377+index 5044273cc2..389402913c 100644
4037840378+--- a/nptl/pthread_cond_common.c
4037940379++++ b/nptl/pthread_cond_common.c
4038040380+@@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4038140381+ behavior.
4038240382+ Note that this works correctly for a zero-initialized condvar too. */
4038340383+ unsigned int old_orig_size = __condvar_get_orig_size (cond);
4038440384+- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
4038540385+- if (((unsigned) (wseq - old_g1_start - old_orig_size)
4038640386+- + cond->__data.__g_size[g1 ^ 1]) == 0)
4038740387++ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond);
4038840388++ uint64_t new_g1_start = old_g1_start + old_orig_size;
4038940389++ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0)
4039040390+ return false;
4039140391+4039240392+ /* We have to consider the following kinds of waiters:
4039340393+@@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4039440394+ are not affected.
4039540395+ * Waiters in G1 have already received a signal and been woken. */
4039640396+4039740397+- /* Update __g1_start, which closes this group. The value we add will never
4039840398+- be negative because old_orig_size can only be zero when we switch groups
4039940399+- the first time after a condvar was initialized, in which case G1 will be
4040040400+- at index 1 and we will add a value of 1. Relaxed MO is fine because the
4040140401+- change comes with no additional constraints that others would have to
4040240402+- observe. */
4040340403+- __condvar_add_g1_start_relaxed (cond,
4040440404+- (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
4040540405+-
4040640406+- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
4040740407++ /* Update __g1_start, which closes this group. Relaxed MO is fine because
4040840408++ the change comes with no additional constraints that others would have
4040940409++ to observe. */
4041040410++ __condvar_add_g1_start_relaxed (cond, old_orig_size);
4041140411+4041240412+ /* At this point, the old G1 is now a valid new G2 (but not in use yet).
4041340413+ No old waiter can neither grab a signal nor acquire a reference without
4041440414+@@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
4041540415+ g1 ^= 1;
4041640416+ *g1index ^= 1;
4041740417+4041840418+- /* Now advance the new G1 g_signals to the new lowseq, giving it
4041940419++ /* Now advance the new G1 g_signals to the new g1_start, giving it
4042040420+ an effective signal count of 0 to start. */
4042140421+- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
4042240422++ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start);
4042340423+4042440424+ /* These values are just observed by signalers, and thus protected by the
4042540425+ lock. */
4042640426+- unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
4042740427++ unsigned int orig_size = wseq - new_g1_start;
4042840428+ __condvar_set_orig_size (cond, orig_size);
4042940429+ /* Use and addition to not loose track of cancellations in what was
4043040430+ previously G2. */
4043140431+diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
4043240432+index f095497142..fa3a5c3d8f 100644
4043340433+--- a/nptl/pthread_cond_signal.c
4043440434++++ b/nptl/pthread_cond_signal.c
4043540435+@@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond)
4043640436+ release-MO store when initializing a group in __condvar_switch_g1
4043740437+ because we use an atomic read-modify-write and thus extend that
4043840438+ store's release sequence. */
4043940439+- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
4044040440++ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1);
4044140441+ cond->__data.__g_size[g1]--;
4044240442+ /* TODO Only set it if there are indeed futex waiters. */
4044340443+ do_futex_wake = true;
4044440444+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
4044540445+index 80bb728211..0f1dfcb595 100644
4044640446+--- a/nptl/pthread_cond_wait.c
4044740447++++ b/nptl/pthread_cond_wait.c
4044840448+@@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
4044940449+ not hold a reference on the group. */
4045040450+ __condvar_acquire_lock (cond, private);
4045140451+4045240452+- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
4045340453++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
4045440454+ if (g1_start > seq)
4045540455+ {
4045640456+ /* Our group is closed, so someone provided enough signals for it.
4045740457+@@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg)
4045840458+ * Waiters fetch-add while having acquire the mutex associated with the
4045940459+ condvar. Signalers load it and fetch-xor it concurrently.
4046040460+ __g1_start: Starting position of G1 (inclusive)
4046140461+- * LSB is index of current G2.
4046240462+ * Modified by signalers while having acquired the condvar-internal lock
4046340463+ and observed concurrently by waiters.
4046440464+ __g1_orig_size: Initial size of G1
4046540465+@@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg)
4046640466+ * Reference count used by waiters concurrently with signalers that have
4046740467+ acquired the condvar-internal lock.
4046840468+ __g_signals: The number of signals that can still be consumed, relative to
4046940469+- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
4047040470+- 31 to 1 of g1_start with the signal count added)
4047140471++ the current g1_start. (i.e. g1_start with the signal count added)
4047240472+ * Used as a futex word by waiters. Used concurrently by waiters and
4047340473+ signalers.
4047440474+- * LSB is currently reserved and 0.
4047540475+ __g_size: Waiters remaining in this group (i.e., which have not been
4047640476+ signaled yet.
4047740477+ * Accessed by signalers and waiters that cancel waiting (both do so only
4047840478+@@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4047940479+ too. */
4048040480+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
4048140481+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
4048240482+- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
4048340483+4048440484+- if (seq < (g1_start >> 1))
4048540485++ if (seq < g1_start)
4048640486+ {
4048740487+ /* If the group is closed already,
4048840488+ then this waiter originally had enough extra signals to
4048940489+@@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
4049040490+ by now, perhaps in the process of switching back to an older
4049140491+ G2, but in either case we're allowed to consume the available
4049240492+ signal and should not block anymore. */
4049340493+- if ((int)(signals - lowseq) >= 2)
4049440494++ if ((int)(signals - (unsigned int)g1_start) > 0)
4049540495+ {
4049640496+ /* Try to grab a signal. See above for MO. (if we do another loop
4049740497+ iteration we need to see the correct value of g1_start) */
4049840498+ if (atomic_compare_exchange_weak_acquire (
4049940499+ cond->__data.__g_signals + g,
4050040500+- &signals, signals - 2))
4050140501++ &signals, signals - 1))
4050240502+ break;
4050340503+ else
4050440504+ continue;
4050540505+4050640506+commit 8d3dd23e3de8b4c6e4b94f8bbfab971c3b8a55be
4050740507+Author: Florian Weimer <fweimer@redhat.com>
4050840508+Date: Thu Mar 13 06:07:07 2025 +0100
4050940509+4051040510+ nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786)
4051240512+ The new initializer and struct layout do not initialize the
4051340513+ __g_signals field in the old struct layout before the change in
4051440514+ commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove
4051540515+ g_refs from condition variables"). Bring back fields at the end
4051640516+ of struct __pthread_cond_s, so that they are again zero-initialized.
4051740517+4051840518+ Reviewed-by: Sam James <sam@gentoo.org>
4051940519+4052040520+diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
4052140521+index a3d482f80f..bccc2003ec 100644
4052240522+--- a/sysdeps/nptl/bits/thread-shared-types.h
4052340523++++ b/sysdeps/nptl/bits/thread-shared-types.h
4052440524+@@ -99,6 +99,8 @@ struct __pthread_cond_s
4052540525+ unsigned int __g1_orig_size;
4052640526+ unsigned int __wrefs;
4052740527+ unsigned int __g_signals[2];
4052840528++ unsigned int __unused_initialized_1;
4052940529++ unsigned int __unused_initialized_2;
4053040530+ };
4053140531+4053240532+ typedef unsigned int __tss_t;
4053340533+diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
4053440534+index 9af75d6eae..e0f24418fe 100644
4053540535+--- a/sysdeps/nptl/pthread.h
4053640536++++ b/sysdeps/nptl/pthread.h
4053740537+@@ -152,7 +152,7 @@ enum
4053840538+4053940539+4054040540+ /* Conditional variable handling. */
4054140541+-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
4054240542++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } }
4054340543+4054440544+4054540545+ /* Cleanup buffers */
4054640546+4054740547+commit 33b33e9dd0ff26158b1b83cc4347a39c073e490e
4054840548+Author: Arjun Shankar <arjun@redhat.com>
4054940549+Date: Fri Oct 18 16:03:25 2024 +0200
4055040550+4055140551+ libio: Fix a deadlock after fork in popen
4055240552+4055340553+ popen modifies its file handler book-keeping under a lock that wasn't
4055440554+ being taken during fork. This meant that a concurrent popen and fork
4055540555+ could end up copying the lock in a "locked" state into the fork child,
4055640556+ where subsequently calling popen would lead to a deadlock due to the
4055740557+ already (spuriously) held lock.
4055840558+4055940559+ This commit fixes the deadlock by appropriately taking the lock before
4056040560+ fork, and releasing/resetting it in the parent/child after the fork.
4056140561+4056240562+ A new test for concurrent popen and fork is also added. It consistently
4056340563+ hangs (and therefore fails via timeout) without the fix applied.
4056440564+ Reviewed-by: Florian Weimer <fweimer@redhat.com>
4056540565+4056640566+ (cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f)
4056740567+4056840568+diff --git a/libio/Makefile b/libio/Makefile
4056940569+index 5292baa4e0..7faba230ac 100644
4057040570+--- a/libio/Makefile
4057140571++++ b/libio/Makefile
4057240572+@@ -117,6 +117,7 @@ tests = \
4057340573+ tst-mmap-offend \
4057440574+ tst-mmap-setvbuf \
4057540575+ tst-mmap2-eofsync \
4057640576++ tst-popen-fork \
4057740577+ tst-popen1 \
4057840578+ tst-setvbuf1 \
4057940579+ tst-sprintf-chk-ub \
4058040580+diff --git a/libio/iopopen.c b/libio/iopopen.c
4058140581+index d01cb0648e..352513a291 100644
4058240582+--- a/libio/iopopen.c
4058340583++++ b/libio/iopopen.c
4058440584+@@ -57,6 +57,26 @@ unlock (void *not_used)
4058540585+ }
4058640586+ #endif
4058740587+4058840588++/* These lock/unlock/resetlock functions are used during fork. */
4058940589++
4059040590++void
4059140591++_IO_proc_file_chain_lock (void)
4059240592++{
4059340593++ _IO_lock_lock (proc_file_chain_lock);
4059440594++}
4059540595++
4059640596++void
4059740597++_IO_proc_file_chain_unlock (void)
4059840598++{
4059940599++ _IO_lock_unlock (proc_file_chain_lock);
4060040600++}
4060140601++
4060240602++void
4060340603++_IO_proc_file_chain_resetlock (void)
4060440604++{
4060540605++ _IO_lock_init (proc_file_chain_lock);
4060640606++}
4060740607++
4060840608+ /* POSIX states popen shall ensure that any streams from previous popen()
4060940609+ calls that remain open in the parent process should be closed in the new
4061040610+ child process.
4061140611+diff --git a/libio/libioP.h b/libio/libioP.h
4061240612+index 616253fcd0..a83a411fdf 100644
4061340613+--- a/libio/libioP.h
4061440614++++ b/libio/libioP.h
4061540615+@@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock)
4061640616+ extern void _IO_enable_locks (void) __THROW;
4061740617+ libc_hidden_proto (_IO_enable_locks)
4061840618+4061940619++/* Functions for operating popen's proc_file_chain_lock during fork. */
4062040620++
4062140621++extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden;
4062240622++extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden;
4062340623++extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden;
4062440624++
4062540625+ /* Default jumptable functions. */
4062640626+4062740627+ extern int _IO_default_underflow (FILE *) __THROW;
4062840628+diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c
4062940629+new file mode 100644
4063040630+index 0000000000..1df30fc6c0
4063140631+--- /dev/null
4063240632++++ b/libio/tst-popen-fork.c
4063340633+@@ -0,0 +1,80 @@
4063440634++/* Test concurrent popen and fork.
4063540635++ Copyright (C) 2024 Free Software Foundation, Inc.
4063640636++ This file is part of the GNU C Library.
4063740637++
4063840638++ The GNU C Library is free software; you can redistribute it and/or
4063940639++ modify it under the terms of the GNU Lesser General Public
4064040640++ License as published by the Free Software Foundation; either
4064140641++ version 2.1 of the License, or (at your option) any later version.
4064240642++
4064340643++ The GNU C Library is distributed in the hope that it will be useful,
4064440644++ but WITHOUT ANY WARRANTY; without even the implied warranty of
4064540645++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4064640646++ Lesser General Public License for more details.
4064740647++
4064840648++ You should have received a copy of the GNU Lesser General Public
4064940649++ License along with the GNU C Library; if not, see
4065040650++ <https://www.gnu.org/licenses/>. */
4065140651++
4065240652++#include <stdio.h>
4065340653++#include <stdatomic.h>
4065440654++#include <pthread.h>
4065540655++#include <unistd.h>
4065640656++#include <sys/wait.h>
4065740657++
4065840658++#include <support/check.h>
4065940659++#include <support/xthread.h>
4066040660++#include <support/xunistd.h>
4066140661++
4066240662++static void
4066340663++popen_and_pclose (void)
4066440664++{
4066540665++ FILE *f = popen ("true", "r");
4066640666++ TEST_VERIFY_EXIT (f != NULL);
4066740667++ pclose (f);
4066840668++ return;
4066940669++}
4067040670++
4067140671++static atomic_bool done = ATOMIC_VAR_INIT (0);
4067240672++
4067340673++static void *
4067440674++popen_and_pclose_forever (__attribute__ ((unused))
4067540675++ void *arg)
4067640676++{
4067740677++ while (!atomic_load_explicit (&done, memory_order_acquire))
4067840678++ popen_and_pclose ();
4067940679++ return NULL;
4068040680++}
4068140681++
4068240682++static int
4068340683++do_test (void)
4068440684++{
4068540685++
4068640686++ /* Repeatedly call popen in a loop during the entire test. */
4068740687++ pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL);
4068840688++
4068940689++ /* Repeatedly fork off and reap child processes one-by-one.
4069040690++ Each child calls popen once, then exits, leading to the possibility
4069140691++ that a child forks *during* our own popen call, thus inheriting any
4069240692++ intermediate popen state, possibly including lock state(s). */
4069340693++ for (int i = 0; i < 100; i++)
4069440694++ {
4069540695++ int cpid = xfork ();
4069640696++
4069740697++ if (cpid == 0)
4069840698++ {
4069940699++ popen_and_pclose ();
4070040700++ _exit (0);
4070140701++ }
4070240702++ else
4070340703++ xwaitpid (cpid, NULL, 0);
4070440704++ }
4070540705++
4070640706++ /* Stop calling popen. */
4070740707++ atomic_store_explicit (&done, 1, memory_order_release);
4070840708++ xpthread_join (t);
4070940709++
4071040710++ return 0;
4071140711++}
4071240712++
4071340713++#include <support/test-driver.c>
4071440714+diff --git a/posix/fork.c b/posix/fork.c
4071540715+index 298765a1ff..cf9b80e7c0 100644
4071640716+--- a/posix/fork.c
4071740717++++ b/posix/fork.c
4071840718+@@ -62,6 +62,7 @@ __libc_fork (void)
4071940719+ call_function_static_weak (__nss_database_fork_prepare_parent,
4072040720+ &nss_database_data);
4072140721+4072240722++ _IO_proc_file_chain_lock ();
4072340723+ _IO_list_lock ();
4072440724+4072540725+ /* Acquire malloc locks. This needs to come last because fork
4072640726+@@ -92,6 +93,7 @@ __libc_fork (void)
4072740727+4072840728+ /* Reset locks in the I/O code. */
4072940729+ _IO_list_resetlock ();
4073040730++ _IO_proc_file_chain_resetlock ();
4073140731+4073240732+ call_function_static_weak (__nss_database_fork_subprocess,
4073340733+ &nss_database_data);
4073440734+@@ -121,6 +123,7 @@ __libc_fork (void)
4073540735+4073640736+ /* We execute this even if the 'fork' call failed. */
4073740737+ _IO_list_unlock ();
4073840738++ _IO_proc_file_chain_unlock ();
4073940739+ }
4074040740+4074140741+ /* Run the handlers registered for the parent. */
4074240742+4074340743+commit 7c3c9ae28685a9142a8cfa3521bbca74c1007d0b
4074440744+Author: Arjun Shankar <arjun@redhat.com>
4074540745+Date: Fri Oct 25 09:33:45 2024 +0200
4074640746+4074740747+ libio: Correctly link tst-popen-fork against libpthread
4074840748+4074940749+ tst-popen-fork failed to build for Hurd due to not being linked with
4075040750+ libpthread. This commit fixes that.
4075140751+4075240752+ Tested with build-many-glibcs.py for i686-gnu.
4075340753+4075440754+ Reviewed-by: Florian Weimer <fweimer@redhat.com>
4075540755+ (cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad)
4075640756+4075740757+diff --git a/libio/Makefile b/libio/Makefile
4075840758+index 7faba230ac..f2e98f96eb 100644
4075940759+--- a/libio/Makefile
4076040760++++ b/libio/Makefile
4076140761+@@ -142,6 +142,8 @@ tests = \
4076240762+ tst_wscanf \
4076340763+ # tests
4076440764+4076540765++$(objpfx)tst-popen-fork: $(shared-thread-library)
4076640766++
4076740767+ tests-internal = tst-vtables tst-vtables-interposed
4076840768+4076940769+ ifeq (yes,$(build-shared))
4077040770+4077140771+commit 8667345b83c8ca528a093d4db53f57a1bb1688e4
4077240772+Author: Florian Weimer <fweimer@redhat.com>
4077340773+Date: Thu Feb 13 21:56:52 2025 +0100
4077440774+4077540775+ elf: Keep using minimal malloc after early DTV resize (bug 32412)
4077640776+4077740777+ If an auditor loads many TLS-using modules during startup, it is
4077840778+ possible to trigger DTV resizing. Previously, the DTV was marked
4077940779+ as allocated by the main malloc afterwards, even if the minimal
4078040780+ malloc was still in use. With this change, _dl_resize_dtv marks
4078140781+ the resized DTV as allocated with the minimal malloc.
4078240782+4078340783+ The new test reuses TLS-using modules from other auditing tests.
4078440784+4078540785+ Reviewed-by: DJ Delorie <dj@redhat.com>
4078640786+ (cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb)
4078740787+4078840788+diff --git a/elf/Makefile b/elf/Makefile
4078940789+index dc686c3bff..be64c59887 100644
4079040790+--- a/elf/Makefile
4079140791++++ b/elf/Makefile
4079240792+@@ -378,6 +378,7 @@ tests += \
4079340793+ tst-align3 \
4079440794+ tst-audit-tlsdesc \
4079540795+ tst-audit-tlsdesc-dlopen \
4079640796++ tst-audit-tlsdesc-dlopen2 \
4079740797+ tst-audit1 \
4079840798+ tst-audit2 \
4079940799+ tst-audit8 \
4080040800+@@ -817,6 +818,7 @@ modules-names += \
4080140801+ tst-auditmanymod8 \
4080240802+ tst-auditmanymod9 \
4080340803+ tst-auditmod-tlsdesc \
4080440804++ tst-auditmod-tlsdesc2 \
4080540805+ tst-auditmod1 \
4080640806+ tst-auditmod11 \
4080740807+ tst-auditmod12 \
4080840808+@@ -3040,6 +3042,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
4080940809+ tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
4081040810+ $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
4081140811+ tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
4081240812++$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
4081340813++ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
4081440814++tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
4081540815+4081640816+ $(objpfx)tst-dlmopen-twice.out: \
4081740817+ $(objpfx)tst-dlmopen-twice-mod1.so \
4081840818+diff --git a/elf/dl-tls.c b/elf/dl-tls.c
4081940819+index 3d529b722c..b13e752358 100644
4082040820+--- a/elf/dl-tls.c
4082140821++++ b/elf/dl-tls.c
4082240822+@@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
4082340823+ if (newp == NULL)
4082440824+ oom ();
4082540825+ memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
4082640826++#ifdef SHARED
4082740827++ /* Auditors can trigger a DTV resize event while the full malloc
4082840828++ is not yet in use. Mark the new DTV allocation as the
4082940829++ initial allocation. */
4083040830++ if (!__rtld_malloc_is_complete ())
4083140831++ GL(dl_initial_dtv) = &newp[1];
4083240832++#endif
4083340833+ }
4083440834+ else
4083540835+ {
4083640836+diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
4083740837+new file mode 100644
4083840838+index 0000000000..7ba2c4129a
4083940839+--- /dev/null
4084040840++++ b/elf/tst-audit-tlsdesc-dlopen2.c
4084140841+@@ -0,0 +1,46 @@
4084240842++/* Loading TLS-using modules from auditors (bug 32412). Main program.
4084340843++ Copyright (C) 2021-2025 Free Software Foundation, Inc.
4084440844++ This file is part of the GNU C Library.
4084540845++
4084640846++ The GNU C Library is free software; you can redistribute it and/or
4084740847++ modify it under the terms of the GNU Lesser General Public
4084840848++ License as published by the Free Software Foundation; either
4084940849++ version 2.1 of the License, or (at your option) any later version.
4085040850++
4085140851++ The GNU C Library is distributed in the hope that it will be useful,
4085240852++ but WITHOUT ANY WARRANTY; without even the implied warranty of
4085340853++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4085440854++ Lesser General Public License for more details.
4085540855++
4085640856++ You should have received a copy of the GNU Lesser General Public
4085740857++ License along with the GNU C Library; if not, see
4085840858++ <https://www.gnu.org/licenses/>. */
4085940859++
4086040860++#include <support/xdlfcn.h>
4086140861++#include <stdio.h>
4086240862++
4086340863++static int
4086440864++do_test (void)
4086540865++{
4086640866++ puts ("info: start of main program");
4086740867++
4086840868++ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
4086940869++ linker will load them again (requiring their own TLS) because the
4087040870++ dlopen calls from the auditor were in the auditing namespace. */
4087140871++ for (int i = 1; i <= 19; ++i)
4087240872++ {
4087340873++ char dso[30];
4087440874++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
4087540875++ char sym[30];
4087640876++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
4087740877++
4087840878++ void *handle = xdlopen (dso, RTLD_LAZY);
4087940879++ int (*func) (void) = xdlsym (handle, sym);
4088040880++ /* Trigger TLS allocation. */
4088140881++ func ();
4088240882++ }
4088340883++
4088440884++ return 0;
4088540885++}
4088640886++
4088740887++#include <support/test-driver.c>
4088840888+diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
4088940889+new file mode 100644
4089040890+index 0000000000..50275cd34d
4089140891+--- /dev/null
4089240892++++ b/elf/tst-auditmod-tlsdesc2.c
4089340893+@@ -0,0 +1,59 @@
4089440894++/* Loading TLS-using modules from auditors (bug 32412). Audit module.
4089540895++ Copyright (C) 2021-2025 Free Software Foundation, Inc.
4089640896++ This file is part of the GNU C Library.
4089740897++
4089840898++ The GNU C Library is free software; you can redistribute it and/or
4089940899++ modify it under the terms of the GNU Lesser General Public
4090040900++ License as published by the Free Software Foundation; either
4090140901++ version 2.1 of the License, or (at your option) any later version.
4090240902++
4090340903++ The GNU C Library is distributed in the hope that it will be useful,
4090440904++ but WITHOUT ANY WARRANTY; without even the implied warranty of
4090540905++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4090640906++ Lesser General Public License for more details.
4090740907++
4090840908++ You should have received a copy of the GNU Lesser General Public
4090940909++ License along with the GNU C Library; if not, see
4091040910++ <https://www.gnu.org/licenses/>. */
4091140911++
4091240912++#include <dlfcn.h>
4091340913++#include <link.h>
4091440914++#include <stdbool.h>
4091540915++#include <stdio.h>
4091640916++#include <unistd.h>
4091740917++
4091840918++unsigned int
4091940919++la_version (unsigned int version)
4092040920++{
4092140921++ /* Open some modules, to trigger DTV resizing before the switch to
4092240922++ the main malloc. */
4092340923++ for (int i = 1; i <= 19; ++i)
4092440924++ {
4092540925++ char dso[30];
4092640926++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
4092740927++ char sym[30];
4092840928++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
4092940929++
4093040930++ void *handle = dlopen (dso, RTLD_LAZY);
4093140931++ if (handle == NULL)
4093240932++ {
4093340933++ printf ("error: dlmopen from auditor: %s\n", dlerror ());
4093440934++ fflush (stdout);
4093540935++ _exit (1);
4093640936++ }
4093740937++ int (*func) (void) = dlsym (handle, sym);
4093840938++ if (func == NULL)
4093940939++ {
4094040940++ printf ("error: dlsym from auditor: %s\n", dlerror ());
4094140941++ fflush (stdout);
4094240942++ _exit (1);
4094340943++ }
4094440944++ /* Trigger TLS allocation. */
4094540945++ func ();
4094640946++ }
4094740947++
4094840948++ puts ("info: TLS-using modules loaded from auditor");
4094940949++ fflush (stdout);
4095040950++
4095140951++ return LAV_CURRENT;
4095240952++}
4095340953+4095440954+commit b3002f303cedb8262cbc1ec22999ea36482efa0e
4095540955+Author: Florian Weimer <fweimer@redhat.com>
4095640956+Date: Tue May 20 19:36:02 2025 +0200
4095740957+4095840958+ support: Use const char * argument in support_capture_subprogram_self_sgid
4095940959+4096040960+ The function does not modify the passed-in string, so make this clear
4096140961+ via the prototype.
4096240962+4096340963+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4096440964+ (cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a)
4096540965+4096640966+diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
4096740967+index 93b7245d2a..5406d9f6c0 100644
4096840968+--- a/support/capture_subprocess.h
4096940969++++ b/support/capture_subprocess.h
4097040970+@@ -45,8 +45,7 @@ struct support_capture_subprocess support_capture_subprogram
4097140971+ /* Copy the running program into a setgid binary and run it with CHILD_ID
4097240972+ argument. If execution is successful, return the exit status of the child
4097340973+ program, otherwise return a non-zero failure exit code. */
4097440974+-int support_capture_subprogram_self_sgid
4097540975+- (char *child_id);
4097640976++int support_capture_subprogram_self_sgid (const char *child_id);
4097740977+4097840978+ /* Deallocate the subprocess data captured by
4097940979+ support_capture_subprocess. */
4098040980+diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
4098140981+index 53847194cb..2383481911 100644
4098240982+--- a/support/support_capture_subprocess.c
4098340983++++ b/support/support_capture_subprocess.c
4098440984+@@ -110,7 +110,7 @@ support_capture_subprogram (const char *file, char *const argv[],
4098540985+ safely make it SGID with the TARGET group ID. Then runs the
4098640986+ executable. */
4098740987+ static int
4098840988+-copy_and_spawn_sgid (char *child_id, gid_t gid)
4098940989++copy_and_spawn_sgid (const char *child_id, gid_t gid)
4099040990+ {
4099140991+ char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
4099240992+ test_dir, (intmax_t) getpid ());
4099340993+@@ -182,7 +182,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid)
4099440994+ ret = 0;
4099540995+ infd = outfd = -1;
4099640996+4099740997+- char * const args[] = {execname, child_id, NULL};
4099840998++ char * const args[] = {execname, (char *) child_id, NULL};
4099940999+4100041000+ status = support_subprogram_wait (args[0], args);
4100141001+4100241002+@@ -211,7 +211,7 @@ err:
4100341003+ }
4100441004+4100541005+ int
4100641006+-support_capture_subprogram_self_sgid (char *child_id)
4100741007++support_capture_subprogram_self_sgid (const char *child_id)
4100841008+ {
4100941009+ gid_t target = 0;
4101041010+ const int count = 64;
4101141011+4101241012+commit 61dcce21e06834f7248a8d516c9ec20788fc728c
4101341013+Author: Florian Weimer <fweimer@redhat.com>
4101441014+Date: Mon Dec 23 13:57:55 2024 +0100
4101541015+4101641016+ support: Add support_record_failure_barrier
4101741017+4101841018+ This can be used to stop execution after a TEST_COMPARE_BLOB
4101941019+ failure, for example.
4102041020+4102141021+ (cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e)
4102241022+4102341023+diff --git a/support/check.h b/support/check.h
4102441024+index 7ea22c7a2c..8f41e5b99f 100644
4102541025+--- a/support/check.h
4102641026++++ b/support/check.h
4102741027+@@ -207,6 +207,9 @@ void support_record_failure_reset (void);
4102841028+ failures or not. */
4102941029+ int support_record_failure_is_failed (void);
4103041030+4103141031++/* Terminate the process if any failures have been encountered so far. */
4103241032++void support_record_failure_barrier (void);
4103341033++
4103441034+ __END_DECLS
4103541035+4103641036+ #endif /* SUPPORT_CHECK_H */
4103741037+diff --git a/support/support_record_failure.c b/support/support_record_failure.c
4103841038+index 978123701d..72ee2b232f 100644
4103941039+--- a/support/support_record_failure.c
4104041040++++ b/support/support_record_failure.c
4104141041+@@ -112,3 +112,13 @@ support_record_failure_is_failed (void)
4104241042+ synchronization for reliable test error reporting anyway. */
4104341043+ return __atomic_load_n (&state->failed, __ATOMIC_RELAXED);
4104441044+ }
4104541045++
4104641046++void
4104741047++support_record_failure_barrier (void)
4104841048++{
4104941049++ if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED))
4105041050++ {
4105141051++ puts ("error: exiting due to previous errors");
4105241052++ exit (1);
4105341053++ }
4105441054++}
4105541055+4105641056+commit 079ac4a172a8f6ba37acf1e80e57f5042d2c7561
4105741057+Author: Florian Weimer <fweimer@redhat.com>
4105841058+Date: Tue May 20 19:45:06 2025 +0200
4105941059+4106041060+ elf: Test case for bug 32976 (CVE-2025-4802)
4106141061+4106241062+ Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically
4106341063+ linked binaries, using support_capture_subprogram_self_sgid.
4106441064+4106541065+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4106641066+ (cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2)
4106741067+4106841068+diff --git a/elf/Makefile b/elf/Makefile
4106941069+index be64c59887..afd4eb6fdd 100644
4107041070+--- a/elf/Makefile
4107141071++++ b/elf/Makefile
4107241072+@@ -266,6 +266,7 @@ tests-static-normal := \
4107341073+ tst-array1-static \
4107441074+ tst-array5-static \
4107541075+ tst-dl-iter-static \
4107641076++ tst-dlopen-sgid \
4107741077+ tst-dst-static \
4107841078+ tst-env-setuid-static \
4107941079+ tst-getauxval-static \
4108041080+@@ -859,6 +860,7 @@ modules-names += \
4108141081+ tst-dlmopen-twice-mod1 \
4108241082+ tst-dlmopen-twice-mod2 \
4108341083+ tst-dlmopen1mod \
4108441084++ tst-dlopen-sgid-mod \
4108541085+ tst-dlopen-tlsreinitmod1 \
4108641086+ tst-dlopen-tlsreinitmod2 \
4108741087+ tst-dlopen-tlsreinitmod3 \
4108841088+@@ -3153,3 +3155,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so
4108941089+ tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
4109041090+ $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so
4109141091+ tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
4109241092++
4109341093++$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so
4109441094+diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c
4109541095+new file mode 100644
4109641096+index 0000000000..5eb79eef48
4109741097+--- /dev/null
4109841098++++ b/elf/tst-dlopen-sgid-mod.c
4109941099+@@ -0,0 +1 @@
4110041100++/* Opening this object should not succeed. */
4110141101+diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
4110241102+new file mode 100644
4110341103+index 0000000000..47829a405e
4110441104+--- /dev/null
4110541105++++ b/elf/tst-dlopen-sgid.c
4110641106+@@ -0,0 +1,104 @@
4110741107++/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976).
4110841108++ Copyright (C) 2025 Free Software Foundation, Inc.
4110941109++ This file is part of the GNU C Library.
4111041110++
4111141111++ The GNU C Library is free software; you can redistribute it and/or
4111241112++ modify it under the terms of the GNU Lesser General Public
4111341113++ License as published by the Free Software Foundation; either
4111441114++ version 2.1 of the License, or (at your option) any later version.
4111541115++
4111641116++ The GNU C Library is distributed in the hope that it will be useful,
4111741117++ but WITHOUT ANY WARRANTY; without even the implied warranty of
4111841118++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4111941119++ Lesser General Public License for more details.
4112041120++
4112141121++ You should have received a copy of the GNU Lesser General Public
4112241122++ License along with the GNU C Library; if not, see
4112341123++ <https://www.gnu.org/licenses/>. */
4112441124++
4112541125++#include <dlfcn.h>
4112641126++#include <gnu/lib-names.h>
4112741127++#include <stddef.h>
4112841128++#include <stdint.h>
4112941129++#include <stdlib.h>
4113041130++#include <string.h>
4113141131++#include <support/capture_subprocess.h>
4113241132++#include <support/check.h>
4113341133++#include <support/support.h>
4113441134++#include <support/temp_file.h>
4113541135++#include <unistd.h>
4113641136++
4113741137++/* This is the name of our test object. Use a custom module for
4113841138++ testing, so that this object does not get picked up from the system
4113941139++ path. */
4114041140++static const char dso_name[] = "tst-dlopen-sgid-mod.so";
4114141141++
4114241142++/* Used to mark the recursive invocation. */
4114341143++static const char magic_argument[] = "run-actual-test";
4114441144++
4114541145++static int
4114641146++do_test (void)
4114741147++{
4114841148++/* Pathname of the directory that receives the shared objects this
4114941149++ test attempts to load. */
4115041150++ char *libdir = support_create_temp_directory ("tst-dlopen-sgid-");
4115141151++
4115241152++ /* This is supposed to be ignored and stripped. */
4115341153++ TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0);
4115441154++
4115541155++ /* Copy of libc.so.6. */
4115641156++ {
4115741157++ char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO);
4115841158++ char *to = xasprintf ("%s/%s", libdir, LIBC_SO);
4115941159++ add_temp_file (to);
4116041160++ support_copy_file (from, to);
4116141161++ free (to);
4116241162++ free (from);
4116341163++ }
4116441164++
4116541165++ /* Copy of the test object. */
4116641166++ {
4116741167++ char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name);
4116841168++ char *to = xasprintf ("%s/%s", libdir, dso_name);
4116941169++ add_temp_file (to);
4117041170++ support_copy_file (from, to);
4117141171++ free (to);
4117241172++ free (from);
4117341173++ }
4117441174++
4117541175++ TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
4117641176++
4117741177++ free (libdir);
4117841178++
4117941179++ return 0;
4118041180++}
4118141181++
4118241182++static void
4118341183++alternative_main (int argc, char **argv)
4118441184++{
4118541185++ if (argc == 2 && strcmp (argv[1], magic_argument) == 0)
4118641186++ {
4118741187++ if (getgid () == getegid ())
4118841188++ /* This can happen if the file system is mounted nosuid. */
4118941189++ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
4119041190++ (intmax_t) getgid ());
4119141191++
4119241192++ /* Should be removed due to SGID. */
4119341193++ TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL);
4119441194++
4119541195++ TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL);
4119641196++ {
4119741197++ const char *message = dlerror ();
4119841198++ TEST_COMPARE_STRING (message,
4119941199++ "tst-dlopen-sgid-mod.so:"
4120041200++ " cannot open shared object file:"
4120141201++ " No such file or directory");
4120241202++ }
4120341203++
4120441204++ support_record_failure_barrier ();
4120541205++ exit (EXIT_SUCCESS);
4120641206++ }
4120741207++}
4120841208++
4120941209++#define PREPARE alternative_main
4121041210++#include <support/test-driver.c>
4121141211+4121241212+commit 56e75b810ac39b0e390be5b66397dca0cdfa4d80
4121341213+Author: Sunil K Pandey <sunil.k.pandey@intel.com>
4121441214+Date: Tue May 20 10:07:27 2025 -0700
4121541215+4121641216+ x86_64: Fix typo in ifunc-impl-list.c.
4121741217+4121841218+ Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c.
4121941219+4122041220+ Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
4122141221+ (cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c)
4122241222+4122341223+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
4122441224+index 0bbb71bbbf..3db45db39b 100644
4122541225+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
4122641226++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
4122741227+@@ -922,7 +922,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
4122841228+ (CPU_FEATURE_USABLE (AVX2)
4122941229+ && CPU_FEATURE_USABLE (BMI2)),
4123041230+ __wcsncpy_avx2)
4123141231+- X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
4123241232++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
4123341233+ 1,
4123441234+ __wcsncpy_generic))
4123541235+4123641236+@@ -952,7 +952,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
4123741237+ (CPU_FEATURE_USABLE (AVX2)
4123841238+ && CPU_FEATURE_USABLE (BMI2)),
4123941239+ __wcpncpy_avx2)
4124041240+- X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
4124141241++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
4124241242+ 1,
4124341243+ __wcpncpy_generic))
4124441244+4124541245+4124641246+commit c8e10f14328518954072df64aafd574e67cfdde5
4124741247+Author: Florian Weimer <fweimer@redhat.com>
4124841248+Date: Wed May 21 08:43:32 2025 +0200
4124941249+4125041250+ elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)
4125141251+4125241252+ This should really move into support_capture_subprogram_self_sgid.
4125341253+4125441254+ Reviewed-by: Sam James <sam@gentoo.org>
4125541255+ (cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2)
4125641256+4125741257+diff --git a/NEWS b/NEWS
4125841258+index 7a6985f5dd..4b290ad4bf 100644
4125941259+--- a/NEWS
4126041260++++ b/NEWS
4126141261+@@ -23,6 +23,7 @@ The following bugs are resolved with this release:
4126241262+ [32245] glibc -Wstringop-overflow= build failure on hppa
4126341263+ [32470] x86: Avoid integer truncation with large cache sizes
4126441264+ [32810] Crash on x86-64 if XSAVEC disable via tunable
4126541265++ [32987] elf: Fix subprocess status handling for tst-dlopen-sgid
4126641266+4126741267+ Version 2.40
4126841268+4126941269+diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
4127041270+index 47829a405e..5688b79f2e 100644
4127141271+--- a/elf/tst-dlopen-sgid.c
4127241272++++ b/elf/tst-dlopen-sgid.c
4127341273+@@ -26,6 +26,8 @@
4127441274+ #include <support/check.h>
4127541275+ #include <support/support.h>
4127641276+ #include <support/temp_file.h>
4127741277++#include <support/test-driver.h>
4127841278++#include <sys/wait.h>
4127941279+ #include <unistd.h>
4128041280+4128141281+ /* This is the name of our test object. Use a custom module for
4128241282+@@ -66,10 +68,16 @@ do_test (void)
4128341283+ free (from);
4128441284+ }
4128541285+4128641286+- TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
4128741287+-
4128841288+ free (libdir);
4128941289+4129041290++ int status = support_capture_subprogram_self_sgid (magic_argument);
4129141291++
4129241292++ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
4129341293++ return EXIT_UNSUPPORTED;
4129441294++
4129541295++ if (!WIFEXITED (status))
4129641296++ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
4129741297++
4129841298+ return 0;
4129941299+ }
4130041300+4130141301+4130241302+commit 42a5a940c974d02540c8da26d6374c744d148cb9
4130341303+Author: Carlos O'Donell <carlos@redhat.com>
4130441304+Date: Wed Jun 11 09:19:17 2025 -0400
4130541305+4130641306+ ppc64le: Revert "powerpc: Optimized strncmp for power10" (CVE-2025-5745)
4130741307+4130841308+ This reverts commit 23f0d81608d0ca6379894ef81670cf30af7fd081
4130941309+4131041310+ Reason for revert: Power10 strncmp clobbers non-volatile vector
4131141311+ registers (Bug 33060)
4131241312+4131341313+ Tested on ppc64le with no regressions.
4131441314+4131541315+ (cherry picked from commit 63c60101ce7c5eac42be90f698ba02099b41b965)
4131641316+4131741317+diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S
4131841318+deleted file mode 100644
4131941319+index d4ba76acae..0000000000
4132041320+--- a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S
4132141321++++ /dev/null
4132241322+@@ -1,271 +0,0 @@
4132341323+-/* Optimized strncmp implementation for PowerPC64/POWER10.
4132441324+- Copyright (C) 2024 Free Software Foundation, Inc.
4132541325+- This file is part of the GNU C Library.
4132641326+-
4132741327+- The GNU C Library is free software; you can redistribute it and/or
4132841328+- modify it under the terms of the GNU Lesser General Public
4132941329+- License as published by the Free Software Foundation; either
4133041330+- version 2.1 of the License, or (at your option) any later version.
4133141331+-
4133241332+- The GNU C Library is distributed in the hope that it will be useful,
4133341333+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4133441334+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4133541335+- Lesser General Public License for more details.
4133641336+-
4133741337+- You should have received a copy of the GNU Lesser General Public
4133841338+- License along with the GNU C Library; if not, see
4133941339+- <https://www.gnu.org/licenses/>. */
4134041340+-
4134141341+-#include <sysdep.h>
4134241342+-
4134341343+-/* Implements the function
4134441344+-
4134541345+- int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
4134641346+-
4134741347+- The implementation uses unaligned doubleword access to avoid specialized
4134841348+- code paths depending of data alignment for first 32 bytes and uses
4134941349+- vectorised loops after that. */
4135041350+-
4135141351+-#ifndef STRNCMP
4135241352+-# define STRNCMP strncmp
4135341353+-#endif
4135441354+-
4135541355+-/* TODO: Change this to actual instructions when minimum binutils is upgraded
4135641356+- to 2.27. Macros are defined below for these newer instructions in order
4135741357+- to maintain compatibility. */
4135841358+-
4135941359+-#define LXVP(xtp,dq,ra) \
4136041360+- .long(((6)<<(32-6)) \
4136141361+- | ((((xtp)-32)>>1)<<(32-10)) \
4136241362+- | ((1)<<(32-11)) \
4136341363+- | ((ra)<<(32-16)) \
4136441364+- | dq)
4136541365+-
4136641366+-#define COMPARE_16(vreg1,vreg2,offset) \
4136741367+- lxv vreg1+32,offset(r3); \
4136841368+- lxv vreg2+32,offset(r4); \
4136941369+- vcmpnezb. v7,vreg1,vreg2; \
4137041370+- bne cr6,L(different); \
4137141371+- cmpldi cr7,r5,16; \
4137241372+- ble cr7,L(ret0); \
4137341373+- addi r5,r5,-16;
4137441374+-
4137541375+-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
4137641376+- LXVP(vreg1+32,offset,r3); \
4137741377+- LXVP(vreg2+32,offset,r4); \
4137841378+- vcmpnezb. v7,vreg1+1,vreg2+1; \
4137941379+- bne cr6,L(label1); \
4138041380+- vcmpnezb. v7,vreg1,vreg2; \
4138141381+- bne cr6,L(label2); \
4138241382+- cmpldi cr7,r5,32; \
4138341383+- ble cr7,L(ret0); \
4138441384+- addi r5,r5,-32;
4138541385+-
4138641386+-#define TAIL_FIRST_16B(vreg1,vreg2) \
4138741387+- vctzlsbb r6,v7; \
4138841388+- cmpld cr7,r5,r6; \
4138941389+- ble cr7,L(ret0); \
4139041390+- vextubrx r5,r6,vreg1; \
4139141391+- vextubrx r4,r6,vreg2; \
4139241392+- subf r3,r4,r5; \
4139341393+- blr;
4139441394+-
4139541395+-#define TAIL_SECOND_16B(vreg1,vreg2) \
4139641396+- vctzlsbb r6,v7; \
4139741397+- addi r0,r6,16; \
4139841398+- cmpld cr7,r5,r0; \
4139941399+- ble cr7,L(ret0); \
4140041400+- vextubrx r5,r6,vreg1; \
4140141401+- vextubrx r4,r6,vreg2; \
4140241402+- subf r3,r4,r5; \
4140341403+- blr;
4140441404+-
4140541405+-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
4140641406+- sldi r6,len_reg,56; \
4140741407+- lxvl 32+v4,reg1,r6; \
4140841408+- lxvl 32+v5,reg2,r6; \
4140941409+- add reg1,reg1,len_reg; \
4141041410+- add reg2,reg2,len_reg; \
4141141411+- vcmpnezb v7,v4,v5; \
4141241412+- vctzlsbb r6,v7; \
4141341413+- cmpld cr7,r6,len_reg; \
4141441414+- blt cr7,L(different); \
4141541415+- cmpld cr7,r5,len_reg; \
4141641416+- ble cr7,L(ret0); \
4141741417+- sub r5,r5,len_reg; \
4141841418+-
4141941419+- /* TODO: change this to .machine power10 when the minimum required
4142041420+- binutils allows it. */
4142141421+- .machine power9
4142241422+-ENTRY_TOCLESS (STRNCMP, 4)
4142341423+- /* Check if size is 0. */
4142441424+- cmpdi cr0,r5,0
4142541425+- beq cr0,L(ret0)
4142641426+- andi. r7,r3,4095
4142741427+- andi. r8,r4,4095
4142841428+- cmpldi cr0,r7,4096-16
4142941429+- cmpldi cr1,r8,4096-16
4143041430+- bgt cr0,L(crosses)
4143141431+- bgt cr1,L(crosses)
4143241432+- COMPARE_16(v4,v5,0)
4143341433+- addi r3,r3,16
4143441434+- addi r4,r4,16
4143541435+-
4143641436+-L(crosses):
4143741437+- andi. r7,r3,15
4143841438+- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
4143941439+- andi. r9,r4,15
4144041440+- subfic r8,r9,16 /* r8(nalign2) = 16 - (str2 & 15). */
4144141441+- cmpld cr7,r7,r8
4144241442+- beq cr7,L(same_aligned)
4144341443+- blt cr7,L(nalign1_min)
4144441444+-
4144541445+- /* nalign2 is minimum and s2 pointer is aligned. */
4144641446+- CHECK_N_BYTES(r3,r4,r8)
4144741447+- /* Are we on the 64B hunk which crosses a page? */
4144841448+- andi. r10,r3,63 /* Determine offset into 64B hunk. */
4144941449+- andi. r8,r3,15 /* The offset into the 16B hunk. */
4145041450+- neg r7,r3
4145141451+- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
4145241452+- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
4145341453+- beq L(compare_64_pagecross)
4145441454+- mtctr r7
4145541455+- b L(compare_64B_unaligned)
4145641456+-
4145741457+- /* nalign1 is minimum and s1 pointer is aligned. */
4145841458+-L(nalign1_min):
4145941459+- CHECK_N_BYTES(r3,r4,r7)
4146041460+- /* Are we on the 64B hunk which crosses a page? */
4146141461+- andi. r10,r4,63 /* Determine offset into 64B hunk. */
4146241462+- andi. r8,r4,15 /* The offset into the 16B hunk. */
4146341463+- neg r7,r4
4146441464+- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
4146541465+- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
4146641466+- beq L(compare_64_pagecross)
4146741467+- mtctr r7
4146841468+-
4146941469+- .p2align 5
4147041470+-L(compare_64B_unaligned):
4147141471+- COMPARE_16(v4,v5,0)
4147241472+- COMPARE_16(v4,v5,16)
4147341473+- COMPARE_16(v4,v5,32)
4147441474+- COMPARE_16(v4,v5,48)
4147541475+- addi r3,r3,64
4147641476+- addi r4,r4,64
4147741477+- bdnz L(compare_64B_unaligned)
4147841478+-
4147941479+- /* Cross the page boundary of s2, carefully. Only for first
4148041480+- iteration we have to get the count of 64B blocks to be checked.
4148141481+- From second iteration and beyond, loop counter is always 63. */
4148241482+-L(compare_64_pagecross):
4148341483+- li r11, 63
4148441484+- mtctr r11
4148541485+- cmpldi r10,16
4148641486+- ble L(cross_4)
4148741487+- cmpldi r10,32
4148841488+- ble L(cross_3)
4148941489+- cmpldi r10,48
4149041490+- ble L(cross_2)
4149141491+-L(cross_1):
4149241492+- CHECK_N_BYTES(r3,r4,r9)
4149341493+- CHECK_N_BYTES(r3,r4,r8)
4149441494+- COMPARE_16(v4,v5,0)
4149541495+- COMPARE_16(v4,v5,16)
4149641496+- COMPARE_16(v4,v5,32)
4149741497+- addi r3,r3,48
4149841498+- addi r4,r4,48
4149941499+- b L(compare_64B_unaligned)
4150041500+-L(cross_2):
4150141501+- COMPARE_16(v4,v5,0)
4150241502+- addi r3,r3,16
4150341503+- addi r4,r4,16
4150441504+- CHECK_N_BYTES(r3,r4,r9)
4150541505+- CHECK_N_BYTES(r3,r4,r8)
4150641506+- COMPARE_16(v4,v5,0)
4150741507+- COMPARE_16(v4,v5,16)
4150841508+- addi r3,r3,32
4150941509+- addi r4,r4,32
4151041510+- b L(compare_64B_unaligned)
4151141511+-L(cross_3):
4151241512+- COMPARE_16(v4,v5,0)
4151341513+- COMPARE_16(v4,v5,16)
4151441514+- addi r3,r3,32
4151541515+- addi r4,r4,32
4151641516+- CHECK_N_BYTES(r3,r4,r9)
4151741517+- CHECK_N_BYTES(r3,r4,r8)
4151841518+- COMPARE_16(v4,v5,0)
4151941519+- addi r3,r3,16
4152041520+- addi r4,r4,16
4152141521+- b L(compare_64B_unaligned)
4152241522+-L(cross_4):
4152341523+- COMPARE_16(v4,v5,0)
4152441524+- COMPARE_16(v4,v5,16)
4152541525+- COMPARE_16(v4,v5,32)
4152641526+- addi r3,r3,48
4152741527+- addi r4,r4,48
4152841528+- CHECK_N_BYTES(r3,r4,r9)
4152941529+- CHECK_N_BYTES(r3,r4,r8)
4153041530+- b L(compare_64B_unaligned)
4153141531+-
4153241532+-L(same_aligned):
4153341533+- CHECK_N_BYTES(r3,r4,r7)
4153441534+- /* Align s1 to 32B and adjust s2 address.
4153541535+- Use lxvp only if both s1 and s2 are 32B aligned. */
4153641536+- COMPARE_16(v4,v5,0)
4153741537+- COMPARE_16(v4,v5,16)
4153841538+- COMPARE_16(v4,v5,32)
4153941539+- COMPARE_16(v4,v5,48)
4154041540+- addi r3,r3,64
4154141541+- addi r4,r4,64
4154241542+- COMPARE_16(v4,v5,0)
4154341543+- COMPARE_16(v4,v5,16)
4154441544+- addi r5,r5,32
4154541545+-
4154641546+- clrldi r6,r3,59
4154741547+- subfic r7,r6,32
4154841548+- add r3,r3,r7
4154941549+- add r4,r4,r7
4155041550+- subf r5,r7,r5
4155141551+- andi. r7,r4,0x1F
4155241552+- beq cr0,L(32B_aligned_loop)
4155341553+-
4155441554+- .p2align 5
4155541555+-L(16B_aligned_loop):
4155641556+- COMPARE_16(v4,v5,0)
4155741557+- COMPARE_16(v4,v5,16)
4155841558+- COMPARE_16(v4,v5,32)
4155941559+- COMPARE_16(v4,v5,48)
4156041560+- addi r3,r3,64
4156141561+- addi r4,r4,64
4156241562+- b L(16B_aligned_loop)
4156341563+-
4156441564+- /* Calculate and return the difference. */
4156541565+-L(different):
4156641566+- TAIL_FIRST_16B(v4,v5)
4156741567+-
4156841568+- .p2align 5
4156941569+-L(32B_aligned_loop):
4157041570+- COMPARE_32(v14,v16,0,tail1,tail2)
4157141571+- COMPARE_32(v18,v20,32,tail3,tail4)
4157241572+- COMPARE_32(v22,v24,64,tail5,tail6)
4157341573+- COMPARE_32(v26,v28,96,tail7,tail8)
4157441574+- addi r3,r3,128
4157541575+- addi r4,r4,128
4157641576+- b L(32B_aligned_loop)
4157741577+-
4157841578+-L(tail1): TAIL_FIRST_16B(v15,v17)
4157941579+-L(tail2): TAIL_SECOND_16B(v14,v16)
4158041580+-L(tail3): TAIL_FIRST_16B(v19,v21)
4158141581+-L(tail4): TAIL_SECOND_16B(v18,v20)
4158241582+-L(tail5): TAIL_FIRST_16B(v23,v25)
4158341583+-L(tail6): TAIL_SECOND_16B(v22,v24)
4158441584+-L(tail7): TAIL_FIRST_16B(v27,v29)
4158541585+-L(tail8): TAIL_SECOND_16B(v26,v28)
4158641586+-
4158741587+- .p2align 5
4158841588+-L(ret0):
4158941589+- li r3,0
4159041590+- blr
4159141591+-
4159241592+-END(STRNCMP)
4159341593+-libc_hidden_builtin_def(strncmp)
4159441594+diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4159541595+index b847c19049..a38ff46448 100644
4159641596+--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
4159741597++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4159841598+@@ -34,7 +34,7 @@ ifneq (,$(filter %le,$(config-machine)))
4159941599+ sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
4160041600+ memmove-power10 memset-power10 rawmemchr-power9 \
4160141601+ rawmemchr-power10 strcmp-power9 strcmp-power10 \
4160241602+- strncmp-power9 strncmp-power10 strcpy-power9 stpcpy-power9 \
4160341603++ strncmp-power9 strcpy-power9 stpcpy-power9 \
4160441604+ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
4160541605+ endif
4160641606+ CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
4160741607+diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4160841608+index 2bb47d3527..30fd89e109 100644
4160941609+--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4161041610++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4161141611+@@ -164,9 +164,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
4161241612+ /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */
4161341613+ IFUNC_IMPL (i, name, strncmp,
4161441614+ #ifdef __LITTLE_ENDIAN__
4161541615+- IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1
4161641616+- && hwcap & PPC_FEATURE_HAS_VSX,
4161741617+- __strncmp_power10)
4161841618+ IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00
4161941619+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
4162041620+ __strncmp_power9)
4162141621+diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S
4162241622+deleted file mode 100644
4162341623+index d7026c12e2..0000000000
4162441624+--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S
4162541625++++ /dev/null
4162641626+@@ -1,25 +0,0 @@
4162741627+-/* Copyright (C) 2024 Free Software Foundation, Inc.
4162841628+- This file is part of the GNU C Library.
4162941629+-
4163041630+- The GNU C Library is free software; you can redistribute it and/or
4163141631+- modify it under the terms of the GNU Lesser General Public
4163241632+- License as published by the Free Software Foundation; either
4163341633+- version 2.1 of the License, or (at your option) any later version.
4163441634+-
4163541635+- The GNU C Library is distributed in the hope that it will be useful,
4163641636+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4163741637+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4163841638+- Lesser General Public License for more details.
4163941639+-
4164041640+- You should have received a copy of the GNU Lesser General Public
4164141641+- License along with the GNU C Library; if not, see
4164241642+- <https://www.gnu.org/licenses/>. */
4164341643+-
4164441644+-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
4164541645+-#define STRNCMP __strncmp_power10
4164641646+-
4164741647+-#undef libc_hidden_builtin_def
4164841648+-#define libc_hidden_builtin_def(name)
4164941649+-
4165041650+-#include <sysdeps/powerpc/powerpc64/le/power10/strncmp.S>
4165141651+-#endif
4165241652+diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
4165341653+index a5ed67f766..6178f4a432 100644
4165441654+--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
4165541655++++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
4165641656+@@ -29,7 +29,6 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
4165741657+ extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
4165841658+ # ifdef __LITTLE_ENDIAN__
4165941659+ extern __typeof (strncmp) __strncmp_power9 attribute_hidden;
4166041660+-extern __typeof (strncmp) __strncmp_power10 attribute_hidden;
4166141661+ # endif
4166241662+ # undef strncmp
4166341663+4166441664+@@ -37,9 +36,6 @@ extern __typeof (strncmp) __strncmp_power10 attribute_hidden;
4166541665+ ifunc symbol properly. */
4166641666+ libc_ifunc_redirected (__redirect_strncmp, strncmp,
4166741667+ # ifdef __LITTLE_ENDIAN__
4166841668+- (hwcap2 & PPC_FEATURE2_ARCH_3_1
4166941669+- && hwcap & PPC_FEATURE_HAS_VSX)
4167041670+- ? __strncmp_power10 :
4167141671+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
4167241672+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
4167341673+ ? __strncmp_power9 :
4167441674+4167541675+commit 2ad6e55ea5cb23af5af7af35d5f80cd93032f96a
4167641676+Author: Carlos O'Donell <carlos@redhat.com>
4167741677+Date: Wed Jun 11 09:43:50 2025 -0400
4167841678+4167941679+ ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702)
4168041680+4168141681+ This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56
4168241682+4168341683+ This change is in the chain of the final revert that fixes the CVE
4168441684+ i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f
4168541685+4168641686+ Reason for revert: Power10 strcmp clobbers non-volatile vector
4168741687+ registers (Bug 33056)
4168841688+4168941689+ Tested on ppc64le with no regressions.
4169041690+4169141691+ (cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201)
4169241692+4169341693+diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
4169441694+index f0d6732a25..00f1e9c170 100644
4169541695+--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
4169641696++++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
4169741697+@@ -62,7 +62,7 @@
4169841698+ lxvl 32+v5,reg2,r0; \
4169941699+ add reg1,reg1,len_reg; \
4170041700+ add reg2,reg2,len_reg; \
4170141701+- vcmpnezb v7,v4,v5; \
4170241702++ vcmpnezb. v7,v4,v5; \
4170341703+ vctzlsbb r6,v7; \
4170441704+ cmpld cr7,r6,len_reg; \
4170541705+ blt cr7,L(different); \
4170641706+@@ -72,110 +72,70 @@
4170741707+4170841708+ .machine power9
4170941709+ ENTRY_TOCLESS (STRCMP, 4)
4171041710+- andi. r7,r3,4095
4171141711+- andi. r8,r4,4095
4171241712+- cmpldi cr0,r7,4096-16
4171341713+- cmpldi cr1,r8,4096-16
4171441714+- bgt cr0,L(crosses)
4171541715+- bgt cr1,L(crosses)
4171641716+- COMPARE_16(v4,v5,0)
4171741717+-
4171841718+-L(crosses):
4171941719+- andi. r7,r3,15
4172041720+- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
4172141721+- andi. r9,r4,15
4172241722+- subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */
4172341723+- cmpld cr7,r7,r5
4172441724+- beq cr7,L(same_aligned)
4172541725+- blt cr7,L(nalign1_min)
4172641726++ li r11,16
4172741727++ /* eq bit of cr1 used as swap status flag to indicate if
4172841728++ source pointers were swapped. */
4172941729++ crclr 4*cr1+eq
4173041730++ vspltisb v19,-1
4173141731++ andi. r7,r3,15
4173241732++ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
4173341733++ andi. r9,r4,15
4173441734++ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
4173541735++ cmpld cr7,r7,r5
4173641736++ beq cr7,L(same_aligned)
4173741737++ blt cr7,L(nalign1_min)
4173841738++ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
4173941739++ pointer which is closer to the next 16B boundary so that only
4174041740++ one CHECK_N_BYTES is needed before entering the loop below. */
4174141741++ mr r8,r4
4174241742++ mr r4,r3
4174341743++ mr r3,r8
4174441744++ mr r12,r7
4174541745++ mr r7,r5
4174641746++ mr r5,r12
4174741747++ crset 4*cr1+eq /* Set bit on swapping source pointers. */
4174841748+4174941749+- /* nalign2 is minimum and s2 pointer is aligned. */
4175041750+- CHECK_N_BYTES(r3,r4,r5)
4175141751+- /* Are we on the 64B hunk which crosses a page? */
4175241752+- andi. r10,r3,63 /* Determine offset into 64B hunk. */
4175341753+- andi. r8,r3,15 /* The offset into the 16B hunk. */
4175441754+- neg r7,r3
4175541755+- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
4175641756+- rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */
4175741757+- beq L(compare_64_pagecross)
4175841758+- mtctr r7
4175941759+- b L(compare_64B_unaligned)
4176041760+-
4176141761+- /* nalign1 is minimum and s1 pointer is aligned. */
4176241762++ .p2align 5
4176341763+ L(nalign1_min):
4176441764+ CHECK_N_BYTES(r3,r4,r7)
4176541765+- /* Are we on the 64B hunk which crosses a page? */
4176641766+- andi. r10,r4,63 /* Determine offset into 64B hunk. */
4176741767+- andi. r8,r4,15 /* The offset into the 16B hunk. */
4176841768+- neg r7,r4
4176941769+- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
4177041770+- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
4177141771+- beq L(compare_64_pagecross)
4177241772+- mtctr r7
4177341773+4177441774+ .p2align 5
4177541775+-L(compare_64B_unaligned):
4177641776+- COMPARE_16(v4,v5,0)
4177741777+- COMPARE_16(v4,v5,16)
4177841778+- COMPARE_16(v4,v5,32)
4177941779+- COMPARE_16(v4,v5,48)
4178041780+- addi r3,r3,64
4178141781+- addi r4,r4,64
4178241782+- bdnz L(compare_64B_unaligned)
4178341783++L(s1_aligned):
4178441784++ /* r9 and r5 is number of bytes to be read after and before
4178541785++ page boundary correspondingly. */
4178641786++ sub r5,r5,r7
4178741787++ subfic r9,r5,16
4178841788++ /* Now let r7 hold the count of quadwords which can be
4178941789++ checked without crossing a page boundary. quadword offset is
4179041790++ (str2>>4)&0xFF. */
4179141791++ rlwinm r7,r4,28,0xFF
4179241792++ /* Below check is required only for first iteration. For second
4179341793++ iteration and beyond, the new loop counter is always 255. */
4179441794++ cmpldi r7,255
4179541795++ beq L(L3)
4179641796++ /* Get the initial loop count by 255-((str2>>4)&0xFF). */
4179741797++ subfic r11,r7,255
4179841798+4179941799+- /* Cross the page boundary of s2, carefully. Only for first
4180041800+- iteration we have to get the count of 64B blocks to be checked.
4180141801+- From second iteration and beyond, loop counter is always 63. */
4180241802+-L(compare_64_pagecross):
4180341803+- li r11, 63
4180441804++ .p2align 5
4180541805++L(L1):
4180641806+ mtctr r11
4180741807+- cmpldi r10,16
4180841808+- ble L(cross_4)
4180941809+- cmpldi r10,32
4181041810+- ble L(cross_3)
4181141811+- cmpldi r10,48
4181241812+- ble L(cross_2)
4181341813+-L(cross_1):
4181441814+- CHECK_N_BYTES(r3,r4,r9)
4181541815+- CHECK_N_BYTES(r3,r4,r8)
4181641816+- COMPARE_16(v4,v5,0)
4181741817+- COMPARE_16(v4,v5,16)
4181841818+- COMPARE_16(v4,v5,32)
4181941819+- addi r3,r3,48
4182041820+- addi r4,r4,48
4182141821+- b L(compare_64B_unaligned)
4182241822+-L(cross_2):
4182341823+- COMPARE_16(v4,v5,0)
4182441824+- addi r3,r3,16
4182541825+- addi r4,r4,16
4182641826+- CHECK_N_BYTES(r3,r4,r9)
4182741827+- CHECK_N_BYTES(r3,r4,r8)
4182841828+- COMPARE_16(v4,v5,0)
4182941829+- COMPARE_16(v4,v5,16)
4183041830+- addi r3,r3,32
4183141831+- addi r4,r4,32
4183241832+- b L(compare_64B_unaligned)
4183341833+-L(cross_3):
4183441834+- COMPARE_16(v4,v5,0)
4183541835+- COMPARE_16(v4,v5,16)
4183641836+- addi r3,r3,32
4183741837+- addi r4,r4,32
4183841838+- CHECK_N_BYTES(r3,r4,r9)
4183941839+- CHECK_N_BYTES(r3,r4,r8)
4184041840+- COMPARE_16(v4,v5,0)
4184141841++
4184241842++ .p2align 5
4184341843++L(L2):
4184441844++ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
4184541845+ addi r3,r3,16
4184641846+ addi r4,r4,16
4184741847+- b L(compare_64B_unaligned)
4184841848+-L(cross_4):
4184941849+- COMPARE_16(v4,v5,0)
4185041850+- COMPARE_16(v4,v5,16)
4185141851+- COMPARE_16(v4,v5,32)
4185241852+- addi r3,r3,48
4185341853+- addi r4,r4,48
4185441854++ bdnz L(L2)
4185541855++ /* Cross the page boundary of s2, carefully. */
4185641856++
4185741857++ .p2align 5
4185841858++L(L3):
4185941859++ CHECK_N_BYTES(r3,r4,r5)
4186041860+ CHECK_N_BYTES(r3,r4,r9)
4186141861+- CHECK_N_BYTES(r3,r4,r8)
4186241862+- b L(compare_64B_unaligned)
4186341863++ li r11,255 /* Load the new loop counter. */
4186441864++ b L(L1)
4186541865+4186641866++ .p2align 5
4186741867+ L(same_aligned):
4186841868+ CHECK_N_BYTES(r3,r4,r7)
4186941869+ /* Align s1 to 32B and adjust s2 address.
4187041870+@@ -208,7 +168,18 @@ L(16B_aligned_loop):
4187141871+4187241872+ /* Calculate and return the difference. */
4187341873+ L(different):
4187441874+- TAIL(v4,v5)
4187541875++ vctzlsbb r6,v7
4187641876++ vextubrx r5,r6,v4
4187741877++ vextubrx r4,r6,v5
4187841878++ bt 4*cr1+eq,L(swapped)
4187941879++ subf r3,r4,r5
4188041880++ blr
4188141881++
4188241882++ /* If src pointers were swapped, then swap the
4188341883++ indices and calculate the return value. */
4188441884++L(swapped):
4188541885++ subf r3,r5,r4
4188641886++ blr
4188741887+4188841888+ .p2align 5
4188941889+ L(32B_aligned_loop):
4189041890+4189141891+commit 672f31b90e501b4ba10ba12ab4c6051f77589912
4189241892+Author: Carlos O'Donell <carlos@redhat.com>
4189341893+Date: Wed Jun 11 09:33:45 2025 -0400
4189441894+4189541895+ ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059)
4189641896+4189741897+ This reverts commit b9182c793caa05df5d697427c0538936e6396d4b
4189841898+4189941899+ Reason for revert: Power10 memchr clobbers v20 vector register
4190041900+ (Bug 33059)
4190141901+4190241902+ This is not a security issue, unlike CVE-2025-5745 and
4190341903+ CVE-2025-5702.
4190441904+4190541905+ Tested on ppc64le without regression.
4190641906+4190741907+ (cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8)
4190841908+4190941909+diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
4191041910+deleted file mode 100644
4191141911+index 53e5716d72..0000000000
4191241912+--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S
4191341913++++ /dev/null
4191441914+@@ -1,315 +0,0 @@
4191541915+-/* Optimized memchr implementation for POWER10 LE.
4191641916+- Copyright (C) 2021-2024 Free Software Foundation, Inc.
4191741917+- This file is part of the GNU C Library.
4191841918+-
4191941919+- The GNU C Library is free software; you can redistribute it and/or
4192041920+- modify it under the terms of the GNU Lesser General Public
4192141921+- License as published by the Free Software Foundation; either
4192241922+- version 2.1 of the License, or (at your option) any later version.
4192341923+-
4192441924+- The GNU C Library is distributed in the hope that it will be useful,
4192541925+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4192641926+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4192741927+- Lesser General Public License for more details.
4192841928+-
4192941929+- You should have received a copy of the GNU Lesser General Public
4193041930+- License along with the GNU C Library; if not, see
4193141931+- <https://www.gnu.org/licenses/>. */
4193241932+-
4193341933+-#include <sysdep.h>
4193441934+-
4193541935+-# ifndef MEMCHR
4193641936+-# define MEMCHR __memchr
4193741937+-# endif
4193841938+-# define M_VREG_ZERO v20
4193941939+-# define M_OFF_START_LOOP 256
4194041940+-# define MEMCHR_SUBTRACT_VECTORS \
4194141941+- vsububm v4,v4,v18; \
4194241942+- vsububm v5,v5,v18; \
4194341943+- vsububm v6,v6,v18; \
4194441944+- vsububm v7,v7,v18;
4194541945+-# define M_TAIL(vreg,increment) \
4194641946+- vctzlsbb r4,vreg; \
4194741947+- cmpld r5,r4; \
4194841948+- ble L(null); \
4194941949+- addi r4,r4,increment; \
4195041950+- add r3,r6,r4; \
4195141951+- blr
4195241952+-
4195341953+-/* TODO: Replace macros by the actual instructions when minimum binutils becomes
4195441954+- >= 2.35. This is used to keep compatibility with older versions. */
4195541955+-#define M_VEXTRACTBM(rt,vrb) \
4195641956+- .long(((4)<<(32-6)) \
4195741957+- | ((rt)<<(32-11)) \
4195841958+- | ((8)<<(32-16)) \
4195941959+- | ((vrb)<<(32-21)) \
4196041960+- | 1602)
4196141961+-
4196241962+-#define M_LXVP(xtp,dq,ra) \
4196341963+- .long(((6)<<(32-6)) \
4196441964+- | ((((xtp)-32)>>1)<<(32-10)) \
4196541965+- | ((1)<<(32-11)) \
4196641966+- | ((ra)<<(32-16)) \
4196741967+- | dq)
4196841968+-
4196941969+-#define CHECK16B(vreg,offset,addr,label) \
4197041970+- lxv vreg+32,offset(addr); \
4197141971+- vcmpequb. vreg,vreg,v18; \
4197241972+- bne cr6,L(label); \
4197341973+- cmpldi r5,16; \
4197441974+- ble L(null); \
4197541975+- addi r5,r5,-16;
4197641976+-
4197741977+-/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has #
4197841978+- of bytes already checked. */
4197941979+-#define CHECK64B(offset,addr,label) \
4198041980+- M_LXVP(v4+32,offset,addr); \
4198141981+- M_LXVP(v6+32,offset+32,addr); \
4198241982+- MEMCHR_SUBTRACT_VECTORS; \
4198341983+- vminub v14,v4,v5; \
4198441984+- vminub v15,v6,v7; \
4198541985+- vminub v16,v14,v15; \
4198641986+- vcmpequb. v0,v16,M_VREG_ZERO; \
4198741987+- beq cr6,$+12; \
4198841988+- li r7,offset; \
4198941989+- b L(label); \
4199041990+- cmpldi r5,64; \
4199141991+- ble L(null); \
4199241992+- addi r5,r5,-64
4199341993+-
4199441994+-/* Implements the function
4199541995+- void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */
4199641996+-
4199741997+- .machine power9
4199841998+-
4199941999+-ENTRY_TOCLESS (MEMCHR)
4200042000+- CALL_MCOUNT 3
4200142001+-
4200242002+- cmpldi r5,0
4200342003+- beq L(null)
4200442004+- mr r0,r5
4200542005+- xori r6,r4,0xff
4200642006+-
4200742007+- mtvsrd v18+32,r4 /* matching char in v18 */
4200842008+- mtvsrd v19+32,r6 /* non matching char in v19 */
4200942009+-
4201042010+- vspltb v18,v18,7 /* replicate */
4201142011+- vspltb v19,v19,7 /* replicate */
4201242012+- vspltisb M_VREG_ZERO,0
4201342013+-
4201442014+- /* Next 16B-aligned address. Prepare address for L(aligned). */
4201542015+- addi r6,r3,16
4201642016+- clrrdi r6,r6,4
4201742017+-
4201842018+- /* Align data and fill bytes not loaded with non matching char. */
4201942019+- lvx v0,0,r3
4202042020+- lvsr v1,0,r3
4202142021+- vperm v0,v19,v0,v1
4202242022+-
4202342023+- vcmpequb. v6,v0,v18
4202442024+- bne cr6,L(found)
4202542025+- sub r4,r6,r3
4202642026+- cmpld r5,r4
4202742027+- ble L(null)
4202842028+- sub r5,r5,r4
4202942029+-
4203042030+- /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
4203142031+- optimized for longer strings, so checking the first bytes in 16B
4203242032+- chunks benefits a lot small strings. */
4203342033+- .p2align 5
4203442034+-L(aligned):
4203542035+- cmpldi r5,0
4203642036+- beq L(null)
4203742037+-
4203842038+- CHECK16B(v0,0,r6,tail1)
4203942039+- CHECK16B(v1,16,r6,tail2)
4204042040+- CHECK16B(v2,32,r6,tail3)
4204142041+- CHECK16B(v3,48,r6,tail4)
4204242042+- CHECK16B(v4,64,r6,tail5)
4204342043+- CHECK16B(v5,80,r6,tail6)
4204442044+- CHECK16B(v6,96,r6,tail7)
4204542045+- CHECK16B(v7,112,r6,tail8)
4204642046+- CHECK16B(v8,128,r6,tail9)
4204742047+- CHECK16B(v9,144,r6,tail10)
4204842048+- CHECK16B(v10,160,r6,tail11)
4204942049+- CHECK16B(v0,176,r6,tail12)
4205042050+- CHECK16B(v1,192,r6,tail13)
4205142051+- CHECK16B(v2,208,r6,tail14)
4205242052+- CHECK16B(v3,224,r6,tail15)
4205342053+-
4205442054+- cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
4205542055+- choose how we will perform the main loop. */
4205642056+-
4205742057+- /* Prepare address for the loop. */
4205842058+- addi r4,r3,M_OFF_START_LOOP
4205942059+- clrrdi r4,r4,6
4206042060+- sub r6,r4,r3
4206142061+- sub r5,r0,r6
4206242062+- addi r6,r4,128
4206342063+-
4206442064+- /* If c == 0, use the loop without the vsububm. */
4206542065+- beq cr5,L(loop)
4206642066+-
4206742067+- /* This is very similar to the block after L(loop), the difference is
4206842068+- that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
4206942069+- each byte loaded by the char we are looking for, this way we can keep
4207042070+- using vminub to merge the results and checking for nulls. */
4207142071+- .p2align 5
4207242072+-L(memchr_loop):
4207342073+- CHECK64B(0,r4,pre_tail_64b)
4207442074+- CHECK64B(64,r4,pre_tail_64b)
4207542075+- addi r4,r4,256
4207642076+-
4207742077+- CHECK64B(0,r6,tail_64b)
4207842078+- CHECK64B(64,r6,tail_64b)
4207942079+- addi r6,r6,256
4208042080+-
4208142081+- CHECK64B(0,r4,pre_tail_64b)
4208242082+- CHECK64B(64,r4,pre_tail_64b)
4208342083+- addi r4,r4,256
4208442084+-
4208542085+- CHECK64B(0,r6,tail_64b)
4208642086+- CHECK64B(64,r6,tail_64b)
4208742087+- addi r6,r6,256
4208842088+-
4208942089+- b L(memchr_loop)
4209042090+- /* Switch to a more aggressive approach checking 64B each time. Use 2
4209142091+- pointers 128B apart and unroll the loop once to make the pointer
4209242092+- updates and usages separated enough to avoid stalls waiting for
4209342093+- address calculation. */
4209442094+- .p2align 5
4209542095+-L(loop):
4209642096+-#undef MEMCHR_SUBTRACT_VECTORS
4209742097+-#define MEMCHR_SUBTRACT_VECTORS /* nothing */
4209842098+- CHECK64B(0,r4,pre_tail_64b)
4209942099+- CHECK64B(64,r4,pre_tail_64b)
4210042100+- addi r4,r4,256
4210142101+-
4210242102+- CHECK64B(0,r6,tail_64b)
4210342103+- CHECK64B(64,r6,tail_64b)
4210442104+- addi r6,r6,256
4210542105+-
4210642106+- CHECK64B(0,r4,pre_tail_64b)
4210742107+- CHECK64B(64,r4,pre_tail_64b)
4210842108+- addi r4,r4,256
4210942109+-
4211042110+- CHECK64B(0,r6,tail_64b)
4211142111+- CHECK64B(64,r6,tail_64b)
4211242112+- addi r6,r6,256
4211342113+-
4211442114+- b L(loop)
4211542115+-
4211642116+- .p2align 5
4211742117+-L(pre_tail_64b):
4211842118+- mr r6,r4
4211942119+-L(tail_64b):
4212042120+- /* OK, we found a null byte. Let's look for it in the current 64-byte
4212142121+- block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
4212242122+- low 16B bytes into vx+1, and the high into vx, so the order here is
4212342123+- v5, v4, v7, v6. */
4212442124+- vcmpequb v1,v5,M_VREG_ZERO
4212542125+- vcmpequb v2,v4,M_VREG_ZERO
4212642126+- vcmpequb v3,v7,M_VREG_ZERO
4212742127+- vcmpequb v4,v6,M_VREG_ZERO
4212842128+-
4212942129+- /* Take into account the other 64B blocks we had already checked. */
4213042130+- add r6,r6,r7
4213142131+- /* Extract first bit of each byte. */
4213242132+- M_VEXTRACTBM(r8,v1)
4213342133+- M_VEXTRACTBM(r9,v2)
4213442134+- M_VEXTRACTBM(r10,v3)
4213542135+- M_VEXTRACTBM(r11,v4)
4213642136+-
4213742137+- /* Shift each value into their corresponding position. */
4213842138+- sldi r9,r9,16
4213942139+- sldi r10,r10,32
4214042140+- sldi r11,r11,48
4214142141+-
4214242142+- /* Merge the results. */
4214342143+- or r8,r8,r9
4214442144+- or r9,r10,r11
4214542145+- or r11,r9,r8
4214642146+-
4214742147+- cnttzd r0,r11 /* Count trailing zeros before the match. */
4214842148+- cmpld r5,r0
4214942149+- ble L(null)
4215042150+- add r3,r6,r0 /* Compute final address. */
4215142151+- blr
4215242152+-
4215342153+- .p2align 5
4215442154+-L(tail1):
4215542155+- M_TAIL(v0,0)
4215642156+-
4215742157+- .p2align 5
4215842158+-L(tail2):
4215942159+- M_TAIL(v1,16)
4216042160+-
4216142161+- .p2align 5
4216242162+-L(tail3):
4216342163+- M_TAIL(v2,32)
4216442164+-
4216542165+- .p2align 5
4216642166+-L(tail4):
4216742167+- M_TAIL(v3,48)
4216842168+-
4216942169+- .p2align 5
4217042170+-L(tail5):
4217142171+- M_TAIL(v4,64)
4217242172+-
4217342173+- .p2align 5
4217442174+-L(tail6):
4217542175+- M_TAIL(v5,80)
4217642176+-
4217742177+- .p2align 5
4217842178+-L(tail7):
4217942179+- M_TAIL(v6,96)
4218042180+-
4218142181+- .p2align 5
4218242182+-L(tail8):
4218342183+- M_TAIL(v7,112)
4218442184+-
4218542185+- .p2align 5
4218642186+-L(tail9):
4218742187+- M_TAIL(v8,128)
4218842188+-
4218942189+- .p2align 5
4219042190+-L(tail10):
4219142191+- M_TAIL(v9,144)
4219242192+-
4219342193+- .p2align 5
4219442194+-L(tail11):
4219542195+- M_TAIL(v10,160)
4219642196+-
4219742197+- .p2align 5
4219842198+-L(tail12):
4219942199+- M_TAIL(v0,176)
4220042200+-
4220142201+- .p2align 5
4220242202+-L(tail13):
4220342203+- M_TAIL(v1,192)
4220442204+-
4220542205+- .p2align 5
4220642206+-L(tail14):
4220742207+- M_TAIL(v2,208)
4220842208+-
4220942209+- .p2align 5
4221042210+-L(tail15):
4221142211+- M_TAIL(v3,224)
4221242212+-
4221342213+- .p2align 5
4221442214+-L(found):
4221542215+- vctzlsbb r7,v6
4221642216+- cmpld r5,r7
4221742217+- ble L(null)
4221842218+- add r3,r3,r7
4221942219+- blr
4222042220+-
4222142221+- .p2align 5
4222242222+-L(null):
4222342223+- li r3,0
4222442224+- blr
4222542225+-
4222642226+-END (MEMCHR)
4222742227+-
4222842228+-weak_alias (__memchr, memchr)
4222942229+-libc_hidden_builtin_def (memchr)
4223042230+diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4223142231+index a38ff46448..fa1107dfd9 100644
4223242232+--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
4223342233++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4223442234+@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
4223542235+ strncase-power8
4223642236+4223742237+ ifneq (,$(filter %le,$(config-machine)))
4223842238+-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
4223942239+- memmove-power10 memset-power10 rawmemchr-power9 \
4224042240+- rawmemchr-power10 strcmp-power9 strcmp-power10 \
4224142241+- strncmp-power9 strcpy-power9 stpcpy-power9 \
4224242242++sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
4224342243++ rawmemchr-power9 rawmemchr-power10 \
4224442244++ strcmp-power9 strcmp-power10 strncmp-power9 \
4224542245++ strcpy-power9 stpcpy-power9 \
4224642246+ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
4224742247+ endif
4224842248+ CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
4224942249+diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4225042250+index 30fd89e109..9b3e617306 100644
4225142251+--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4225242252++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4225342253+@@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
4225442254+4225542255+ /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */
4225642256+ IFUNC_IMPL (i, name, memchr,
4225742257+-#ifdef __LITTLE_ENDIAN__
4225842258+- IFUNC_IMPL_ADD (array, i, memchr,
4225942259+- hwcap2 & PPC_FEATURE2_ARCH_3_1
4226042260+- && hwcap & PPC_FEATURE_HAS_VSX,
4226142261+- __memchr_power10)
4226242262+-#endif
4226342263+ IFUNC_IMPL_ADD (array, i, memchr,
4226442264+ hwcap2 & PPC_FEATURE2_ARCH_2_07
4226542265+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
4226642266+diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
4226742267+deleted file mode 100644
4226842268+index 7d35ef28a9..0000000000
4226942269+--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
4227042270++++ /dev/null
4227142271+@@ -1,28 +0,0 @@
4227242272+-/* Optimized memchr implementation for POWER10/PPC64.
4227342273+- Copyright (C) 2016-2024 Free Software Foundation, Inc.
4227442274+- This file is part of the GNU C Library.
4227542275+-
4227642276+- The GNU C Library is free software; you can redistribute it and/or
4227742277+- modify it under the terms of the GNU Lesser General Public
4227842278+- License as published by the Free Software Foundation; either
4227942279+- version 2.1 of the License, or (at your option) any later version.
4228042280+-
4228142281+- The GNU C Library is distributed in the hope that it will be useful,
4228242282+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4228342283+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4228442284+- Lesser General Public License for more details.
4228542285+-
4228642286+- You should have received a copy of the GNU Lesser General Public
4228742287+- License along with the GNU C Library; if not, see
4228842288+- <https://www.gnu.org/licenses/>. */
4228942289+-
4229042290+-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
4229142291+-#define MEMCHR __memchr_power10
4229242292+-
4229342293+-#undef libc_hidden_builtin_def
4229442294+-#define libc_hidden_builtin_def(name)
4229542295+-#undef weak_alias
4229642296+-#define weak_alias(name,alias)
4229742297+-
4229842298+-#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S>
4229942299+-#endif
4230042300+diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
4230142301+index 57d23e7b18..b4655dfcaa 100644
4230242302+--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
4230342303++++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
4230442304+@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden;
4230542305+ extern __typeof (__memchr) __memchr_power7 attribute_hidden;
4230642306+ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
4230742307+4230842308+-# ifdef __LITTLE_ENDIAN__
4230942309+-extern __typeof (__memchr) __memchr_power10 attribute_hidden;
4231042310+-# endif
4231142311+ /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
4231242312+ ifunc symbol properly. */
4231342313+ libc_ifunc (__memchr,
4231442314+-# ifdef __LITTLE_ENDIAN__
4231542315+- (hwcap2 & PPC_FEATURE2_ARCH_3_1
4231642316+- && hwcap & PPC_FEATURE_HAS_VSX)
4231742317+- ? __memchr_power10 :
4231842318+-# endif
4231942319+- (hwcap2 & PPC_FEATURE2_ARCH_2_07
4232042320+- && hwcap & PPC_FEATURE_HAS_ALTIVEC)
4232142321+- ? __memchr_power8 :
4232242322+- (hwcap & PPC_FEATURE_ARCH_2_06)
4232342323+- ? __memchr_power7
4232442324+- : __memchr_ppc);
4232542325++ (hwcap2 & PPC_FEATURE2_ARCH_2_07
4232642326++ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
4232742327++ ? __memchr_power8 :
4232842328++ (hwcap & PPC_FEATURE_ARCH_2_06)
4232942329++ ? __memchr_power7
4233042330++ : __memchr_ppc);
4233142331+4233242332+ weak_alias (__memchr, memchr)
4233342333+ libc_hidden_builtin_def (memchr)
4233442334+4233542335+commit 7e12550b8e3a11764a4a9090ce6bd3fc23fc8a8e
4233642336+Author: Carlos O'Donell <carlos@redhat.com>
4233742337+Date: Mon Jun 16 13:09:57 2025 -0400
4233842338+4233942339+ ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702)
4234042340+4234142341+ This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f
4234242342+4234342343+ Reason for revert: Power10 strcmp clobbers non-volatile vector
4234442344+ registers (Bug 33056)
4234542345+4234642346+ Tested on ppc64le without regression.
4234742347+4234842348+ (cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc)
4234942349+4235042350+diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
4235142351+deleted file mode 100644
4235242352+index 00f1e9c170..0000000000
4235342353+--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
4235442354++++ /dev/null
4235542355+@@ -1,204 +0,0 @@
4235642356+-/* Optimized strcmp implementation for PowerPC64/POWER10.
4235742357+- Copyright (C) 2021-2024 Free Software Foundation, Inc.
4235842358+- This file is part of the GNU C Library.
4235942359+-
4236042360+- The GNU C Library is free software; you can redistribute it and/or
4236142361+- modify it under the terms of the GNU Lesser General Public
4236242362+- License as published by the Free Software Foundation; either
4236342363+- version 2.1 of the License, or (at your option) any later version.
4236442364+-
4236542365+- The GNU C Library is distributed in the hope that it will be useful,
4236642366+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4236742367+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4236842368+- Lesser General Public License for more details.
4236942369+-
4237042370+- You should have received a copy of the GNU Lesser General Public
4237142371+- License along with the GNU C Library; if not, see
4237242372+- <https://www.gnu.org/licenses/>. */
4237342373+-#include <sysdep.h>
4237442374+-
4237542375+-#ifndef STRCMP
4237642376+-# define STRCMP strcmp
4237742377+-#endif
4237842378+-
4237942379+-/* Implements the function
4238042380+- int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */
4238142381+-
4238242382+-/* TODO: Change this to actual instructions when minimum binutils is upgraded
4238342383+- to 2.27. Macros are defined below for these newer instructions in order
4238442384+- to maintain compatibility. */
4238542385+-
4238642386+-#define LXVP(xtp,dq,ra) \
4238742387+- .long(((6)<<(32-6)) \
4238842388+- | ((((xtp)-32)>>1)<<(32-10)) \
4238942389+- | ((1)<<(32-11)) \
4239042390+- | ((ra)<<(32-16)) \
4239142391+- | dq)
4239242392+-
4239342393+-#define COMPARE_16(vreg1,vreg2,offset) \
4239442394+- lxv vreg1+32,offset(r3); \
4239542395+- lxv vreg2+32,offset(r4); \
4239642396+- vcmpnezb. v7,vreg1,vreg2; \
4239742397+- bne cr6,L(different); \
4239842398+-
4239942399+-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
4240042400+- LXVP(vreg1+32,offset,r3); \
4240142401+- LXVP(vreg2+32,offset,r4); \
4240242402+- vcmpnezb. v7,vreg1+1,vreg2+1; \
4240342403+- bne cr6,L(label1); \
4240442404+- vcmpnezb. v7,vreg1,vreg2; \
4240542405+- bne cr6,L(label2); \
4240642406+-
4240742407+-#define TAIL(vreg1,vreg2) \
4240842408+- vctzlsbb r6,v7; \
4240942409+- vextubrx r5,r6,vreg1; \
4241042410+- vextubrx r4,r6,vreg2; \
4241142411+- subf r3,r4,r5; \
4241242412+- blr; \
4241342413+-
4241442414+-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
4241542415+- sldi r0,len_reg,56; \
4241642416+- lxvl 32+v4,reg1,r0; \
4241742417+- lxvl 32+v5,reg2,r0; \
4241842418+- add reg1,reg1,len_reg; \
4241942419+- add reg2,reg2,len_reg; \
4242042420+- vcmpnezb. v7,v4,v5; \
4242142421+- vctzlsbb r6,v7; \
4242242422+- cmpld cr7,r6,len_reg; \
4242342423+- blt cr7,L(different); \
4242442424+-
4242542425+- /* TODO: change this to .machine power10 when the minimum required
4242642426+- binutils allows it. */
4242742427+-
4242842428+- .machine power9
4242942429+-ENTRY_TOCLESS (STRCMP, 4)
4243042430+- li r11,16
4243142431+- /* eq bit of cr1 used as swap status flag to indicate if
4243242432+- source pointers were swapped. */
4243342433+- crclr 4*cr1+eq
4243442434+- vspltisb v19,-1
4243542435+- andi. r7,r3,15
4243642436+- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
4243742437+- andi. r9,r4,15
4243842438+- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
4243942439+- cmpld cr7,r7,r5
4244042440+- beq cr7,L(same_aligned)
4244142441+- blt cr7,L(nalign1_min)
4244242442+- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
4244342443+- pointer which is closer to the next 16B boundary so that only
4244442444+- one CHECK_N_BYTES is needed before entering the loop below. */
4244542445+- mr r8,r4
4244642446+- mr r4,r3
4244742447+- mr r3,r8
4244842448+- mr r12,r7
4244942449+- mr r7,r5
4245042450+- mr r5,r12
4245142451+- crset 4*cr1+eq /* Set bit on swapping source pointers. */
4245242452+-
4245342453+- .p2align 5
4245442454+-L(nalign1_min):
4245542455+- CHECK_N_BYTES(r3,r4,r7)
4245642456+-
4245742457+- .p2align 5
4245842458+-L(s1_aligned):
4245942459+- /* r9 and r5 is number of bytes to be read after and before
4246042460+- page boundary correspondingly. */
4246142461+- sub r5,r5,r7
4246242462+- subfic r9,r5,16
4246342463+- /* Now let r7 hold the count of quadwords which can be
4246442464+- checked without crossing a page boundary. quadword offset is
4246542465+- (str2>>4)&0xFF. */
4246642466+- rlwinm r7,r4,28,0xFF
4246742467+- /* Below check is required only for first iteration. For second
4246842468+- iteration and beyond, the new loop counter is always 255. */
4246942469+- cmpldi r7,255
4247042470+- beq L(L3)
4247142471+- /* Get the initial loop count by 255-((str2>>4)&0xFF). */
4247242472+- subfic r11,r7,255
4247342473+-
4247442474+- .p2align 5
4247542475+-L(L1):
4247642476+- mtctr r11
4247742477+-
4247842478+- .p2align 5
4247942479+-L(L2):
4248042480+- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
4248142481+- addi r3,r3,16
4248242482+- addi r4,r4,16
4248342483+- bdnz L(L2)
4248442484+- /* Cross the page boundary of s2, carefully. */
4248542485+-
4248642486+- .p2align 5
4248742487+-L(L3):
4248842488+- CHECK_N_BYTES(r3,r4,r5)
4248942489+- CHECK_N_BYTES(r3,r4,r9)
4249042490+- li r11,255 /* Load the new loop counter. */
4249142491+- b L(L1)
4249242492+-
4249342493+- .p2align 5
4249442494+-L(same_aligned):
4249542495+- CHECK_N_BYTES(r3,r4,r7)
4249642496+- /* Align s1 to 32B and adjust s2 address.
4249742497+- Use lxvp only if both s1 and s2 are 32B aligned. */
4249842498+- COMPARE_16(v4,v5,0)
4249942499+- COMPARE_16(v4,v5,16)
4250042500+- COMPARE_16(v4,v5,32)
4250142501+- COMPARE_16(v4,v5,48)
4250242502+- addi r3,r3,64
4250342503+- addi r4,r4,64
4250442504+- COMPARE_16(v4,v5,0)
4250542505+- COMPARE_16(v4,v5,16)
4250642506+-
4250742507+- clrldi r6,r3,59
4250842508+- subfic r5,r6,32
4250942509+- add r3,r3,r5
4251042510+- add r4,r4,r5
4251142511+- andi. r5,r4,0x1F
4251242512+- beq cr0,L(32B_aligned_loop)
4251342513+-
4251442514+- .p2align 5
4251542515+-L(16B_aligned_loop):
4251642516+- COMPARE_16(v4,v5,0)
4251742517+- COMPARE_16(v4,v5,16)
4251842518+- COMPARE_16(v4,v5,32)
4251942519+- COMPARE_16(v4,v5,48)
4252042520+- addi r3,r3,64
4252142521+- addi r4,r4,64
4252242522+- b L(16B_aligned_loop)
4252342523+-
4252442524+- /* Calculate and return the difference. */
4252542525+-L(different):
4252642526+- vctzlsbb r6,v7
4252742527+- vextubrx r5,r6,v4
4252842528+- vextubrx r4,r6,v5
4252942529+- bt 4*cr1+eq,L(swapped)
4253042530+- subf r3,r4,r5
4253142531+- blr
4253242532+-
4253342533+- /* If src pointers were swapped, then swap the
4253442534+- indices and calculate the return value. */
4253542535+-L(swapped):
4253642536+- subf r3,r5,r4
4253742537+- blr
4253842538+-
4253942539+- .p2align 5
4254042540+-L(32B_aligned_loop):
4254142541+- COMPARE_32(v14,v16,0,tail1,tail2)
4254242542+- COMPARE_32(v18,v20,32,tail3,tail4)
4254342543+- COMPARE_32(v22,v24,64,tail5,tail6)
4254442544+- COMPARE_32(v26,v28,96,tail7,tail8)
4254542545+- addi r3,r3,128
4254642546+- addi r4,r4,128
4254742547+- b L(32B_aligned_loop)
4254842548+-
4254942549+-L(tail1): TAIL(v15,v17)
4255042550+-L(tail2): TAIL(v14,v16)
4255142551+-L(tail3): TAIL(v19,v21)
4255242552+-L(tail4): TAIL(v18,v20)
4255342553+-L(tail5): TAIL(v23,v25)
4255442554+-L(tail6): TAIL(v22,v24)
4255542555+-L(tail7): TAIL(v27,v29)
4255642556+-L(tail8): TAIL(v26,v28)
4255742557+-
4255842558+-END (STRCMP)
4255942559+-libc_hidden_builtin_def (strcmp)
4256042560+diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4256142561+index fa1107dfd9..9f15f3207f 100644
4256242562+--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
4256342563++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
4256442564+@@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
4256542565+ ifneq (,$(filter %le,$(config-machine)))
4256642566+ sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
4256742567+ rawmemchr-power9 rawmemchr-power10 \
4256842568+- strcmp-power9 strcmp-power10 strncmp-power9 \
4256942569+- strcpy-power9 stpcpy-power9 \
4257042570++ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
4257142571+ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
4257242572+ endif
4257342573+ CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
4257442574+diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4257542575+index 9b3e617306..78443b7f34 100644
4257642576+--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4257742577++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
4257842578+@@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
4257942579+ /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
4258042580+ IFUNC_IMPL (i, name, strcmp,
4258142581+ #ifdef __LITTLE_ENDIAN__
4258242582+- IFUNC_IMPL_ADD (array, i, strcmp,
4258342583+- (hwcap2 & PPC_FEATURE2_ARCH_3_1)
4258442584+- && (hwcap & PPC_FEATURE_HAS_VSX),
4258542585+- __strcmp_power10)
4258642586+ IFUNC_IMPL_ADD (array, i, strcmp,
4258742587+ hwcap2 & PPC_FEATURE2_ARCH_3_00
4258842588+ && hwcap & PPC_FEATURE_HAS_ALTIVEC,
4258942589+diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
4259042590+deleted file mode 100644
4259142591+index 1a9f6069f5..0000000000
4259242592+--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
4259342593++++ /dev/null
4259442594+@@ -1,26 +0,0 @@
4259542595+-/* Optimized strcmp implementation for POWER10/PPC64.
4259642596+- Copyright (C) 2021-2024 Free Software Foundation, Inc.
4259742597+- This file is part of the GNU C Library.
4259842598+-
4259942599+- The GNU C Library is free software; you can redistribute it and/or
4260042600+- modify it under the terms of the GNU Lesser General Public
4260142601+- License as published by the Free Software Foundation; either
4260242602+- version 2.1 of the License, or (at your option) any later version.
4260342603+-
4260442604+- The GNU C Library is distributed in the hope that it will be useful,
4260542605+- but WITHOUT ANY WARRANTY; without even the implied warranty of
4260642606+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4260742607+- Lesser General Public License for more details.
4260842608+-
4260942609+- You should have received a copy of the GNU Lesser General Public
4261042610+- License along with the GNU C Library; if not, see
4261142611+- <https://www.gnu.org/licenses/>. */
4261242612+-
4261342613+-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
4261442614+-#define STRCMP __strcmp_power10
4261542615+-
4261642616+-#undef libc_hidden_builtin_def
4261742617+-#define libc_hidden_builtin_def(name)
4261842618+-
4261942619+-#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
4262042620+-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
4262142621+diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
4262242622+index ff32496fab..06b9b4090f 100644
4262342623+--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
4262442624++++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
4262542625+@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
4262642626+ extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
4262742627+ # ifdef __LITTLE_ENDIAN__
4262842628+ extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
4262942629+-extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
4263042630+ # endif
4263142631+4263242632+ # undef strcmp
4263342633+4263442634+ libc_ifunc_redirected (__redirect_strcmp, strcmp,
4263542635+ # ifdef __LITTLE_ENDIAN__
4263642636+- (hwcap2 & PPC_FEATURE2_ARCH_3_1
4263742637+- && hwcap & PPC_FEATURE_HAS_VSX)
4263842638+- ? __strcmp_power10 :
4263942639+ (hwcap2 & PPC_FEATURE2_ARCH_3_00
4264042640+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
4264142641+ ? __strcmp_power9 :
4264242642+4264342643+commit 23a02e382c8ffebfed00a082d8898f1aa468b5da
4264442644+Author: Florian Weimer <fweimer@redhat.com>
4264542645+Date: Wed May 21 16:47:34 2025 +0200
4264642646+4264742647+ support: Pick group in support_capture_subprogram_self_sgid if UID == 0
4264842648+4264942649+ When running as root, it is likely that we can run under any group.
4265042650+ Pick a harmless group from /etc/group in this case.
4265142651+4265242652+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4265342653+ (cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6)
4265442654+4265542655+diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
4265642656+index 2383481911..1cb344eb04 100644
4265742657+--- a/support/support_capture_subprocess.c
4265842658++++ b/support/support_capture_subprocess.c
4265942659+@@ -21,7 +21,11 @@
4266042660+4266142661+ #include <errno.h>
4266242662+ #include <fcntl.h>
4266342663++#include <grp.h>
4266442664++#include <scratch_buffer.h>
4266542665++#include <stdio_ext.h>
4266642666+ #include <stdlib.h>
4266742667++#include <string.h>
4266842668+ #include <support/check.h>
4266942669+ #include <support/xunistd.h>
4267042670+ #include <support/xsocket.h>
4267142671+@@ -210,10 +214,48 @@ err:
4267242672+ return status;
4267342673+ }
4267442674+4267542675++/* Returns true if a group with NAME has been found, and writes its
4267642676++ GID to *TARGET. */
4267742677++static bool
4267842678++find_sgid_group (gid_t *target, const char *name)
4267942679++{
4268042680++ /* Do not use getgrname_r because it does not work in statically
4268142681++ linked binaries if the system libc is different. */
4268242682++ FILE *fp = fopen ("/etc/group", "rce");
4268342683++ if (fp == NULL)
4268442684++ return false;
4268542685++ __fsetlocking (fp, FSETLOCKING_BYCALLER);
4268642686++
4268742687++ bool ok = false;
4268842688++ struct scratch_buffer buf;
4268942689++ scratch_buffer_init (&buf);
4269042690++ while (true)
4269142691++ {
4269242692++ struct group grp;
4269342693++ struct group *result = NULL;
4269442694++ int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result);
4269542695++ if (status == 0 && result != NULL)
4269642696++ {
4269742697++ if (strcmp (result->gr_name, name) == 0)
4269842698++ {
4269942699++ *target = result->gr_gid;
4270042700++ ok = true;
4270142701++ break;
4270242702++ }
4270342703++ }
4270442704++ else if (errno != ERANGE)
4270542705++ break;
4270642706++ else if (!scratch_buffer_grow (&buf))
4270742707++ break;
4270842708++ }
4270942709++ scratch_buffer_free (&buf);
4271042710++ fclose (fp);
4271142711++ return ok;
4271242712++}
4271342713++
4271442714+ int
4271542715+ support_capture_subprogram_self_sgid (const char *child_id)
4271642716+ {
4271742717+- gid_t target = 0;
4271842718+ const int count = 64;
4271942719+ gid_t groups[count];
4272042720+4272142721+@@ -225,6 +267,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
4272242722+ (intmax_t) getuid ());
4272342723+4272442724+ gid_t current = getgid ();
4272542725++ gid_t target = current;
4272642726+ for (int i = 0; i < ret; ++i)
4272742727+ {
4272842728+ if (groups[i] != current)
4272942729+@@ -234,9 +277,16 @@ support_capture_subprogram_self_sgid (const char *child_id)
4273042730+ }
4273142731+ }
4273242732+4273342733+- if (target == 0)
4273442734+- FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
4273542735+- (intmax_t) getuid ());
4273642736++ if (target == current)
4273742737++ {
4273842738++ /* If running as root, try to find a harmless group for SGID. */
4273942739++ if (getuid () != 0
4274042740++ || (!find_sgid_group (&target, "nogroup")
4274142741++ && !find_sgid_group (&target, "bin")
4274242742++ && !find_sgid_group (&target, "daemon")))
4274342743++ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
4274442744++ (intmax_t) getuid ());
4274542745++ }
4274642746+4274742747+ return copy_and_spawn_sgid (child_id, target);
4274842748+ }
4274942749+4275042750+commit dbc83657e290bdad3245259be80fb84cbe10304c
4275142751+Author: Florian Weimer <fweimer@redhat.com>
4275242752+Date: Thu May 22 14:36:37 2025 +0200
4275342753+4275442754+ Fix error reporting (false negatives) in SGID tests
4275542755+4275642756+ And simplify the interface of support_capture_subprogram_self_sgid.
4275742757+4275842758+ Use the existing framework for temporary directories (now with
4275942759+ mode 0700) and directory/file deletion. Handle all execution
4276042760+ errors within support_capture_subprogram_self_sgid. In particular,
4276142761+ this includes test failures because the invoked program did not
4276242762+ exit with exit status zero. Existing tests that expect exit
4276342763+ status 42 are adjusted to use zero instead.
4276442764+4276542765+ In addition, fix callers not to call exit (0) with test failures
4276642766+ pending (which may mask them, especially when running with --direct).
4276742767+4276842768+ Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2
4276942769+ ("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)").
4277042770+4277142771+ Reviewed-by: Carlos O'Donell <carlos@redhat.com>
4277242772+ (cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7)
4277342773+4277442774+diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
4277542775+index 5688b79f2e..8aec52e19f 100644
4277642776+--- a/elf/tst-dlopen-sgid.c
4277742777++++ b/elf/tst-dlopen-sgid.c
4277842778+@@ -70,13 +70,7 @@ do_test (void)
4277942779+4278042780+ free (libdir);
4278142781+4278242782+- int status = support_capture_subprogram_self_sgid (magic_argument);
4278342783+-
4278442784+- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
4278542785+- return EXIT_UNSUPPORTED;
4278642786+-
4278742787+- if (!WIFEXITED (status))
4278842788+- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
4278942789++ support_capture_subprogram_self_sgid (magic_argument);
4279042790+4279142791+ return 0;
4279242792+ }
4279342793+diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
4279442794+index a47219047f..233eec7631 100644
4279542795+--- a/elf/tst-env-setuid-tunables.c
4279642796++++ b/elf/tst-env-setuid-tunables.c
4279742797+@@ -105,10 +105,7 @@ do_test (int argc, char **argv)
4279842798+4279942799+ if (ret != 0)
4280042800+ exit (1);
4280142801+-
4280242802+- /* Special return code to make sure that the child executed all the way
4280342803+- through. */
4280442804+- exit (42);
4280542805++ return 0;
4280642806+ }
4280742807+ else
4280842808+ {
4280942809+@@ -127,18 +124,7 @@ do_test (int argc, char **argv)
4281042810+ continue;
4281142811+ }
4281242812+4281342813+- int status = support_capture_subprogram_self_sgid (buf);
4281442814+-
4281542815+- /* Bail out early if unsupported. */
4281642816+- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
4281742817+- return EXIT_UNSUPPORTED;
4281842818+-
4281942819+- if (WEXITSTATUS (status) != 42)
4282042820+- {
4282142821+- printf (" [%d] child failed with status %d\n", i,
4282242822+- WEXITSTATUS (status));
4282342823+- support_record_failure ();
4282442824+- }
4282542825++ support_capture_subprogram_self_sgid (buf);
4282642826+ }
4282742827+ return 0;
4282842828+ }
4282942829+diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
4283042830+index 59f2ffeb88..ee3f058468 100644
4283142831+--- a/elf/tst-env-setuid.c
4283242832++++ b/elf/tst-env-setuid.c
4283342833+@@ -147,10 +147,7 @@ do_test (int argc, char **argv)
4283442834+4283542835+ if (ret != 0)
4283642836+ exit (1);
4283742837+-
4283842838+- /* Special return code to make sure that the child executed all the way
4283942839+- through. */
4284042840+- exit (42);
4284142841++ return 0;
4284242842+ }
4284342843+ else
4284442844+ {
4284542845+@@ -174,17 +171,7 @@ do_test (int argc, char **argv)
4284642846+ free (profilepath);
4284742847+ }
4284842848+4284942849+- int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
4285042850+-
4285142851+- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
4285242852+- exit (EXIT_UNSUPPORTED);
4285342853+-
4285442854+- if (WEXITSTATUS (status) != 42)
4285542855+- {
4285642856+- printf (" child failed with status %d\n",
4285742857+- WEXITSTATUS (status));
4285842858+- support_record_failure ();
4285942859+- }
4286042860++ support_capture_subprogram_self_sgid (SETGID_CHILD);
4286142861+4286242862+ return 0;
4286342863+ }
4286442864+diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
4286542865+index cc26ed6d15..cefee58d46 100644
4286642866+--- a/stdlib/tst-secure-getenv.c
4286742867++++ b/stdlib/tst-secure-getenv.c
4286842868+@@ -57,13 +57,7 @@ do_test (void)
4286942869+ exit (1);
4287042870+ }
4287142871+4287242872+- int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
4287342873+-
4287442874+- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
4287542875+- return EXIT_UNSUPPORTED;
4287642876+-
4287742877+- if (!WIFEXITED (status))
4287842878+- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
4287942879++ support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
4288042880+4288142881+ return 0;
4288242882+ }
4288342883+@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv)
4288442884+ if (secure_getenv ("PATH") != NULL)
4288542885+ FAIL_EXIT (4, "PATH variable not filtered out\n");
4288642886+4288742887++ support_record_failure_barrier ();
4288842888+ exit (EXIT_SUCCESS);
4288942889+ }
4289042890+ }
4289142891+diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
4289242892+index 5406d9f6c0..57bb941e7d 100644
4289342893+--- a/support/capture_subprocess.h
4289442894++++ b/support/capture_subprocess.h
4289542895+@@ -42,10 +42,12 @@ struct support_capture_subprocess support_capture_subprocess
4289642896+ struct support_capture_subprocess support_capture_subprogram
4289742897+ (const char *file, char *const argv[], char *const envp[]);
4289842898+4289942899+-/* Copy the running program into a setgid binary and run it with CHILD_ID
4290042900+- argument. If execution is successful, return the exit status of the child
4290142901+- program, otherwise return a non-zero failure exit code. */
4290242902+-int support_capture_subprogram_self_sgid (const char *child_id);
4290342903++/* Copy the running program into a setgid binary and run it with
4290442904++ CHILD_ID argument. If the program exits with a non-zero status,
4290542905++ exit with that exit status (or status 1 if the program did not exit
4290642906++ normally). If the test cannot be performed, exit with
4290742907++ EXIT_UNSUPPORTED. */
4290842908++void support_capture_subprogram_self_sgid (const char *child_id);
4290942909+4291042910+ /* Deallocate the subprocess data captured by
4291142911+ support_capture_subprocess. */
4291242912+diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
4291342913+index 1cb344eb04..cbc6951064 100644
4291442914+--- a/support/support_capture_subprocess.c
4291542915++++ b/support/support_capture_subprocess.c
4291642916+@@ -31,6 +31,7 @@
4291742917+ #include <support/xsocket.h>
4291842918+ #include <support/xspawn.h>
4291942919+ #include <support/support.h>
4292042920++#include <support/temp_file.h>
4292142921+ #include <support/test-driver.h>
4292242922+4292342923+ static void
4292442924+@@ -113,105 +114,44 @@ support_capture_subprogram (const char *file, char *const argv[],
4292542925+ /* Copies the executable into a restricted directory, so that we can
4292642926+ safely make it SGID with the TARGET group ID. Then runs the
4292742927+ executable. */
4292842928+-static int
4292942929++static void
4293042930+ copy_and_spawn_sgid (const char *child_id, gid_t gid)
4293142931+ {
4293242932+- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
4293342933+- test_dir, (intmax_t) getpid ());
4293442934++ char *dirname = support_create_temp_directory ("tst-glibc-sgid-");
4293542935+ char *execname = xasprintf ("%s/bin", dirname);
4293642936+- int infd = -1;
4293742937+- int outfd = -1;
4293842938+- int ret = 1, status = 1;
4293942939+-
4294042940+- TEST_VERIFY (mkdir (dirname, 0700) == 0);
4294142941+- if (support_record_failure_is_failed ())
4294242942+- goto err;
4294342943++ add_temp_file (execname);
4294442944+4294542945+- infd = open ("/proc/self/exe", O_RDONLY);
4294642946+- if (infd < 0)
4294742947++ if (access ("/proc/self/exe", R_OK) != 0)
4294842948+ FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
4294942949+4295042950+- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
4295142951+- TEST_VERIFY (outfd >= 0);
4295242952+- if (support_record_failure_is_failed ())
4295342953+- goto err;
4295442954+-
4295542955+- char buf[4096];
4295642956+- for (;;)
4295742957+- {
4295842958+- ssize_t rdcount = read (infd, buf, sizeof (buf));
4295942959+- TEST_VERIFY (rdcount >= 0);
4296042960+- if (support_record_failure_is_failed ())
4296142961+- goto err;
4296242962+- if (rdcount == 0)
4296342963+- break;
4296442964+- char *p = buf;
4296542965+- char *end = buf + rdcount;
4296642966+- while (p != end)
4296742967+- {
4296842968+- ssize_t wrcount = write (outfd, buf, end - p);
4296942969+- if (wrcount == 0)
4297042970+- errno = ENOSPC;
4297142971+- TEST_VERIFY (wrcount > 0);
4297242972+- if (support_record_failure_is_failed ())
4297342973+- goto err;
4297442974+- p += wrcount;
4297542975+- }
4297642976+- }
4297742977++ support_copy_file ("/proc/self/exe", execname);
4297842978+4297942979+- bool chowned = false;
4298042980+- TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0)
4298142981+- || errno == EPERM);
4298242982+- if (support_record_failure_is_failed ())
4298342983+- goto err;
4298442984+- else if (!chowned)
4298542985+- {
4298642986+- ret = 77;
4298742987+- goto err;
4298842988+- }
4298942989++ if (chown (execname, getuid (), gid) != 0)
4299042990++ FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m",
4299142991++ execname, (intmax_t) gid);
4299242992+4299342993+- TEST_VERIFY (fchmod (outfd, 02750) == 0);
4299442994+- if (support_record_failure_is_failed ())
4299542995+- goto err;
4299642996+- TEST_VERIFY (close (outfd) == 0);
4299742997+- if (support_record_failure_is_failed ())
4299842998+- goto err;
4299942999+- TEST_VERIFY (close (infd) == 0);
4300043000+- if (support_record_failure_is_failed ())
4300143001+- goto err;
4300243002++ if (chmod (execname, 02750) != 0)
4300343003++ FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname);
4300443004+4300543005+ /* We have the binary, now spawn the subprocess. Avoid using
4300643006+ support_subprogram because we only want the program exit status, not the
4300743007+ contents. */
4300843008+- ret = 0;
4300943009+- infd = outfd = -1;
4301043010+4301143011+ char * const args[] = {execname, (char *) child_id, NULL};
4301243012++ int status = support_subprogram_wait (args[0], args);
4301343013+4301443014+- status = support_subprogram_wait (args[0], args);
4301543015++ free (execname);
4301643016++ free (dirname);
4301743017+4301843018+-err:
4301943019+- if (outfd >= 0)
4302043020+- close (outfd);
4302143021+- if (infd >= 0)
4302243022+- close (infd);
4302343023+- if (execname != NULL)
4302443024+- {
4302543025+- unlink (execname);
4302643026+- free (execname);
4302743027+- }
4302843028+- if (dirname != NULL)
4302943029++ if (WIFEXITED (status))
4303043030+ {
4303143031+- rmdir (dirname);
4303243032+- free (dirname);
4303343033++ if (WEXITSTATUS (status) == 0)
4303443034++ return;
4303543035++ else
4303643036++ exit (WEXITSTATUS (status));
4303743037+ }
4303843038+-
4303943039+- if (ret == 77)
4304043040+- FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n");
4304143041+- if (ret != 0)
4304243042+- FAIL_EXIT1 ("Failed to make sgid executable for test\n");
4304343043+-
4304443044+- return status;
4304543045++ else
4304643046++ FAIL_EXIT1 ("subprogram failed with status %d", status);
4304743047+ }
4304843048+4304943049+ /* Returns true if a group with NAME has been found, and writes its
4305043050+@@ -253,7 +193,7 @@ find_sgid_group (gid_t *target, const char *name)
4305143051+ return ok;
4305243052+ }
4305343053+4305443054+-int
4305543055++void
4305643056+ support_capture_subprogram_self_sgid (const char *child_id)
4305743057+ {
4305843058+ const int count = 64;
4305943059+@@ -288,7 +228,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
4306043060+ (intmax_t) getuid ());
4306143061+ }
4306243062+4306343063+- return copy_and_spawn_sgid (child_id, target);
4306443064++ copy_and_spawn_sgid (child_id, target);
4306543065+ }
4306643066+4306743067+ void
4306843068+4306943069+commit 2eb180377b96771b8368b0915669c8c7b267e739
4307043070+Author: Florian Weimer <fweimer@redhat.com>
4307143071+Date: Mon Jul 21 21:43:49 2025 +0200
4307243072+4307343073+ posix: Fix double-free after allocation failure in regcomp (bug 33185)
4307443074+4307543075+ If a memory allocation failure occurs during bracket expression
4307643076+ parsing in regcomp, a double-free error may result.
4307743077+4307843078+ Reported-by: Anastasia Belova <abelova@astralinux.ru>
4307943079+ Co-authored-by: Paul Eggert <eggert@cs.ucla.edu>
4308043080+ Reviewed-by: Andreas K. Huettel <dilfridge@gentoo.org>
4308143081+ (cherry picked from commit 7ea06e994093fa0bcca0d0ee2c1db271d8d7885d)
4308243082+4308343083+diff --git a/NEWS b/NEWS
4308443084+index 4b290ad4bf..253b07ae99 100644
4308543085+--- a/NEWS
4308643086++++ b/NEWS
4308743087+@@ -24,6 +24,7 @@ The following bugs are resolved with this release:
4308843088+ [32470] x86: Avoid integer truncation with large cache sizes
4308943089+ [32810] Crash on x86-64 if XSAVEC disable via tunable
4309043090+ [32987] elf: Fix subprocess status handling for tst-dlopen-sgid
4309143091++ [33185] Fix double-free after allocation failure in regcomp
4309243092+4309343093+ Version 2.40
4309443094+4309543095+diff --git a/posix/Makefile b/posix/Makefile
4309643096+index 2c598cd20a..830278a423 100644
4309743097+--- a/posix/Makefile
4309843098++++ b/posix/Makefile
4309943099+@@ -303,6 +303,7 @@ tests := \
4310043100+ tst-posix_spawn-setsid \
4310143101+ tst-preadwrite \
4310243102+ tst-preadwrite64 \
4310343103++ tst-regcomp-bracket-free \
4310443104+ tst-regcomp-truncated \
4310543105+ tst-regex \
4310643106+ tst-regex2 \
4310743107+diff --git a/posix/regcomp.c b/posix/regcomp.c
4310843108+index 5380d3c7b9..6595bb3c0d 100644
4310943109+--- a/posix/regcomp.c
4311043110++++ b/posix/regcomp.c
4311143111+@@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
4311243112+ {
4311343113+ #ifdef RE_ENABLE_I18N
4311443114+ free_charset (mbcset);
4311543115++ mbcset = NULL;
4311643116+ #endif
4311743117+ /* Build a tree for simple bracket. */
4311843118+ br_token.type = SIMPLE_BRACKET;
4311943119+@@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
4312043120+ parse_bracket_exp_free_return:
4312143121+ re_free (sbcset);
4312243122+ #ifdef RE_ENABLE_I18N
4312343123+- free_charset (mbcset);
4312443124++ if (__glibc_likely (mbcset != NULL))
4312543125++ free_charset (mbcset);
4312643126+ #endif /* RE_ENABLE_I18N */
4312743127+ return NULL;
4312843128+ }
4312943129+diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
4313043130+new file mode 100644
4313143131+index 0000000000..3c091d8c44
4313243132+--- /dev/null
4313343133++++ b/posix/tst-regcomp-bracket-free.c
4313443134+@@ -0,0 +1,176 @@
4313543135++/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
4313643136++ Copyright (C) 2025 Free Software Foundation, Inc.
4313743137++ This file is part of the GNU C Library.
4313843138++
4313943139++ The GNU C Library is free software; you can redistribute it and/or
4314043140++ modify it under the terms of the GNU Lesser General Public
4314143141++ License as published by the Free Software Foundation; either
4314243142++ version 2.1 of the License, or (at your option) any later version.
4314343143++
4314443144++ The GNU C Library is distributed in the hope that it will be useful,
4314543145++ but WITHOUT ANY WARRANTY; without even the implied warranty of
4314643146++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4314743147++ Lesser General Public License for more details.
4314843148++
4314943149++ You should have received a copy of the GNU Lesser General Public
4315043150++ License along with the GNU C Library; if not, see
4315143151++ <https://www.gnu.org/licenses/>. */
4315243152++
4315343153++/* This test invokes regcomp multiple times, failing one memory
4315443154++ allocation in each call. The function call should fail with
4315543155++ REG_ESPACE (or succeed if it can recover from the allocation
4315643156++ failure). Previously, there was double-free bug. */
4315743157++
4315843158++#include <errno.h>
4315943159++#include <regex.h>
4316043160++#include <stdio.h>
4316143161++#include <string.h>
4316243162++#include <support/check.h>
4316343163++#include <support/namespace.h>
4316443164++#include <support/support.h>
4316543165++
4316643166++/* Data structure allocated via MAP_SHARED, so that writes from the
4316743167++ subprocess are visible. */
4316843168++struct shared_data
4316943169++{
4317043170++ /* Number of tracked allocations performed so far. */
4317143171++ volatile unsigned int allocation_count;
4317243172++
4317343173++ /* If this number is reached, one allocation fails. */
4317443174++ volatile unsigned int failing_allocation;
4317543175++
4317643176++ /* The subprocess stores the expected name here. */
4317743177++ char name[100];
4317843178++};
4317943179++
4318043180++/* Allocation count in shared mapping. */
4318143181++static struct shared_data *shared;
4318243182++
4318343183++/* Returns true if a failure should be injected for this allocation. */
4318443184++static bool
4318543185++fail_this_allocation (void)
4318643186++{
4318743187++ if (shared != NULL)
4318843188++ {
4318943189++ unsigned int count = shared->allocation_count;
4319043190++ shared->allocation_count = count + 1;
4319143191++ return count == shared->failing_allocation;
4319243192++ }
4319343193++ else
4319443194++ return false;
4319543195++}
4319643196++
4319743197++/* Failure-injecting wrappers for allocation functions used by glibc. */
4319843198++
4319943199++void *
4320043200++malloc (size_t size)
4320143201++{
4320243202++ if (fail_this_allocation ())
4320343203++ {
4320443204++ errno = ENOMEM;
4320543205++ return NULL;
4320643206++ }
4320743207++ extern __typeof (malloc) __libc_malloc;
4320843208++ return __libc_malloc (size);
4320943209++}
4321043210++
4321143211++void *
4321243212++calloc (size_t a, size_t b)
4321343213++{
4321443214++ if (fail_this_allocation ())
4321543215++ {
4321643216++ errno = ENOMEM;
4321743217++ return NULL;
4321843218++ }
4321943219++ extern __typeof (calloc) __libc_calloc;
4322043220++ return __libc_calloc (a, b);
4322143221++}
4322243222++
4322343223++void *
4322443224++realloc (void *ptr, size_t size)
4322543225++{
4322643226++ if (fail_this_allocation ())
4322743227++ {
4322843228++ errno = ENOMEM;
4322943229++ return NULL;
4323043230++ }
4323143231++ extern __typeof (realloc) __libc_realloc;
4323243232++ return __libc_realloc (ptr, size);
4323343233++}
4323443234++
4323543235++/* No-op subprocess to verify that support_isolate_in_subprocess does
4323643236++ not perform any heap allocations. */
4323743237++static void
4323843238++no_op (void *ignored)
4323943239++{
4324043240++}
4324143241++
4324243242++/* Perform a regcomp call in a subprocess. Used to count its
4324343243++ allocations. */
4324443244++static void
4324543245++initialize (void *regexp1)
4324643246++{
4324743247++ const char *regexp = regexp1;
4324843248++
4324943249++ shared->allocation_count = 0;
4325043250++
4325143251++ regex_t reg;
4325243252++ TEST_COMPARE (regcomp (®, regexp, 0), 0);
4325343253++}
4325443254++
4325543255++/* Perform regcomp in a subprocess with fault injection. */
4325643256++static void
4325743257++test_in_subprocess (void *regexp1)
4325843258++{
4325943259++ const char *regexp = regexp1;
4326043260++ unsigned int inject_at = shared->failing_allocation;
4326143261++
4326243262++ regex_t reg;
4326343263++ int ret = regcomp (®, regexp, 0);
4326443264++
4326543265++ if (ret != 0)
4326643266++ {
4326743267++ TEST_COMPARE (ret, REG_ESPACE);
4326843268++ printf ("info: allocation %u failure results in return value %d,"
4326943269++ " error %s (%d)\n",
4327043270++ inject_at, ret, strerrorname_np (errno), errno);
4327143271++ }
4327243272++}
4327343273++
4327443274++static int
4327543275++do_test (void)
4327643276++{
4327743277++ char regexp[] = "[:alpha:]";
4327843278++
4327943279++ shared = support_shared_allocate (sizeof (*shared));
4328043280++
4328143281++ /* Disable fault injection. */
4328243282++ shared->failing_allocation = ~0U;
4328343283++
4328443284++ support_isolate_in_subprocess (no_op, NULL);
4328543285++ TEST_COMPARE (shared->allocation_count, 0);
4328643286++
4328743287++ support_isolate_in_subprocess (initialize, regexp);
4328843288++
4328943289++ /* The number of allocations in the successful case, plus some
4329043290++ slack. Once the number of expected allocations is exceeded,
4329143291++ injecting further failures does not make a difference. */
4329243292++ unsigned int maximum_allocation_count = shared->allocation_count;
4329343293++ printf ("info: successful call performs %u allocations\n",
4329443294++ maximum_allocation_count);
4329543295++ maximum_allocation_count += 10;
4329643296++
4329743297++ for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
4329843298++ ++inject_at)
4329943299++ {
4330043300++ shared->allocation_count = 0;
4330143301++ shared->failing_allocation = inject_at;
4330243302++ support_isolate_in_subprocess (test_in_subprocess, regexp);
4330343303++ }
4330443304++
4330543305++ support_shared_free (shared);
4330643306++
4330743307++ return 0;
4330843308++}
4330943309++
4331043310++#include <support/test-driver.c>
+1-1
pkgs/development/libraries/glibc/common.nix
···6868 /*
6969 No tarballs for stable upstream branch, only https://sourceware.org/git/glibc.git and using git would complicate bootstrapping.
7070 $ git fetch --all -p && git checkout origin/release/2.40/master && git describe
7171- glibc-2.40-66-g7d4b6bcae9
7171+ glibc-2.40-142-g2eb180377b
7272 $ git show --minimal --reverse glibc-2.40.. ':!ADVISORIES' > 2.40-master.patch
73737474 To compare the archive contents zdiff can be used.