cudaPackages.saxpy: init at unstable-2023-07-11

+132
+2
pkgs/development/compilers/cudatoolkit/extension.nix
··· 71 cudaFlags 72 markForCudatoolkitRootHook 73 setupCudaHook; 74 }
··· 71 cudaFlags 72 markForCudatoolkitRootHook 73 setupCudaHook; 74 + 75 + saxpy = final.callPackage ./saxpy { }; 76 }
+12
pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt
···
# Build script for the `saxpy` example: a single CUDA translation unit linked
# against the CUDA runtime and cuBLAS through the imported targets provided by
# CMake's FindCUDAToolkit module.
cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)

# Defines the CUDA::cudart and CUDA::cublas imported targets used below.
find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)

add_executable(saxpy saxpy.cu)

target_compile_features(saxpy PRIVATE cxx_std_14)

# The flag is only passed when compiling CUDA sources. The generator
# expression is kept as one quoted argument so it is not split on whitespace.
target_compile_options(saxpy PRIVATE
  "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")

target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)

install(TARGETS saxpy)
+50
pkgs/development/compilers/cudatoolkit/saxpy/default.nix
···
# Derivation for the `saxpy` example program built from the CMake project that
# lives next to this file.
{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:

let
  inherit (cudaFlags) cudaCapabilities dropDot;

  # Semicolon-separated list handed to CMAKE_CUDA_ARCHITECTURES: every entry
  # of cudaCapabilities passed through dropDot.
  cudaArchitectures = lib.concatMapStringsSep ";" dropDot cudaCapabilities;
in
backendStdenv.mkDerivation {
  pname = "saxpy";
  version = "unstable-2023-07-11";

  # The sources (CMakeLists.txt, saxpy.cu) are this directory itself.
  src = ./.;

  buildInputs = [
    libcublas
    cuda_cudart
    cuda_cccl
  ];

  nativeBuildInputs = [
    cmake

    # NOTE: for cross-compilation this has to be
    # pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc. That should happen
    # automatically once we move to spliced scopes; delete this comment when
    # it does.
    cuda_nvcc

    # Alternative: drop the propagated hook from cuda_nvcc and list
    # setupCudaHook here directly.
    autoAddOpenGLRunpathHook
  ];

  cmakeFlags = [
    "-DCMAKE_VERBOSE_MAKEFILE=ON"
    "-DCMAKE_CUDA_ARCHITECTURES=${cudaArchitectures}"
  ];

  meta = {
    description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
    license = lib.licenses.mit;
    maintainers = lib.teams.cuda.members;
    platforms = lib.platforms.unix;
  };
}
+68
pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu
···
// SAXPY (y = a * x + y) smoke test for the CUDA toolchain: queries the
// runtime/driver versions, runs one small kernel and verifies the result on
// the host. All progress messages go to stderr.

// NOTE(review): no cuBLAS function is called in this file; the header is
// presumably included so the build also exercises locating cuBLAS -- confirm
// before removing.
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <vector>

#include <stdio.h>

// Terminates the process with a diagnostic on stderr when `err` is not
// cudaSuccess. `context` is printed verbatim to identify the failing call.
static inline void check(cudaError_t err, const char *context) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

// Evaluates a CUDA runtime call once and aborts on failure, using the call's
// source text as the error context.
#define CHECK(x) check((x), #x)

// Computes y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks, one element per thread; the grid must
// provide at least n threads in total. Threads with an index >= n do nothing.
// No shared memory is used. `x` and `y` must not alias.
__global__ void saxpy(int n, float a, const float *__restrict__ x,
                      float *__restrict__ y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

// Runs the smoke test. Returns EXIT_SUCCESS when every element of the result
// equals the expected value (within a small tolerance), EXIT_FAILURE when the
// result is wrong; any failing CUDA call terminates the process via CHECK.
int main(void) {
  // Unbuffered stderr so progress is visible even if a later call hangs.
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;
  constexpr int threadsPerBlock = 256;
  // Ceiling division so the grid covers all N elements.
  constexpr int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

  // With x = 1, y = 2 and a = 2 every output element is 2 * 1 + 2 = 4.
  constexpr float a = 2.0f;
  constexpr float expected = 4.0f;
  constexpr float tolerance = 1e-5f;

  std::vector<float> xHost(N, 1.0f), yHost(N, 2.0f);

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice, *yDevice;
  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));

  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  saxpy<<<blocks, threadsPerBlock>>>(N, a, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  // Launch-configuration errors are reported here ...
  CHECK(cudaGetLastError());
  // ... while faults raised during kernel execution surface at the next
  // synchronizing call; sync explicitly so they are not blamed on the copy.
  CHECK(cudaDeviceSynchronize());

  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
                   cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  for (int i = 0; i < N; i++) {
    const float error = std::fabs(yHost[i] - expected);
    // Written as a negated <= so that a NaN result is kept as the maximum
    // instead of being silently discarded.
    if (!(error <= maxError))
      maxError = error;
  }
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));

  // Fail the run when the device produced a wrong (or NaN) result.
  if (!(maxError <= tolerance)) {
    fprintf(stderr, "Result mismatch: max error exceeds %g\n", tolerance);
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}