python3Packages.torch: fix on aarch64-linux (#439489)

Authored by Sandro, committed by GitHub (6f415c1e, 9df58940)

+54 additions in 2 files

pkgs/development/python-modules/torch/source/default.nix (+5)
@@ -306,6 +306,11 @@
       url = "https://github.com/pytorch/pytorch/commit/231c72240d80091f099c95e326d3600cba866eee.patch";
       hash = "sha256-BBCjxzz2TUkx4nXRyRILA82kMwyb/4+C3eOtYqf5dhk=";
     })
+
+    # Fixes GCC-14 compatibility on ARM
+    # Adapted from https://github.com/pytorch/pytorch/pull/157867
+    # TODO: remove at the next release
+    ./gcc-14-arm-compat.path
   ]
   ++ lib.optionals cudaSupport [
     ./fix-cmake-cuda-toolkit.patch
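Until this change reaches a channel, the same workaround can be carried out-of-tree. Below is a minimal sketch of a nixpkgs overlay that appends the patch to torch, assuming the patch has been saved locally as ./gcc-14-arm-compat.patch; the argument names (pyFinal, pyPrev) are illustrative, not part of this PR.

  # Sketch only: out-of-tree overlay carrying the GCC-14 ARM workaround,
  # assuming ./gcc-14-arm-compat.patch is a local copy of the patch below.
  final: prev: {
    python3 = prev.python3.override {
      packageOverrides = pyFinal: pyPrev: {
        torch = pyPrev.torch.overrideAttrs (old: {
          # Append the workaround to whatever patches the derivation already applies
          patches = (old.patches or [ ]) ++ [ ./gcc-14-arm-compat.patch ];
        });
      };
    };
    # Keep python3Packages pointing at the overridden interpreter's package set
    python3Packages = final.python3.pkgs;
  }

Rebuilding python3Packages.torch with such an overlay applied should then pick up the extra patch; once this PR is in the channel being tracked, the overlay can simply be dropped.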
pkgs/development/python-modules/torch/source/gcc-14-arm-compat.path (+49, new file)
diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
index 7f05c2ad166..1632b595c4c 100644
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 };

-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
-    const Vectorized<c10::BFloat16>& a) {
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
   static_assert(
       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index 52d5383e60f..00c9f4eb253 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -26,6 +26,10 @@ namespace at::native {

 namespace {

+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
   if (at::isReducedFloatingType(input.scalar_type())) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77a..8c94decfff0 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(

 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,