pyrox.dev/nixpkgs
cudaPackages.saxpy: init at unstable-2023-07-11
Someone Serge · 2 years ago
251d3166 · 4df8614c · +132 · 4 changed files:
pkgs/development/compilers/cudatoolkit/extension.nix           +2
pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt   +12
pkgs/development/compilers/cudatoolkit/saxpy/default.nix      +50
pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu         +68
pkgs/development/compilers/cudatoolkit/extension.nix (+2)

@@ -71,4 +71,6 @@
     cudaFlags
     markForCudatoolkitRootHook
     setupCudaHook;
+
+  saxpy = final.callPackage ./saxpy { };
 }
pkgs/development/compilers/cudatoolkit/saxpy/CMakeLists.txt (new file, +12)

cmake_minimum_required(VERSION 3.25)
project(saxpy LANGUAGES CXX CUDA)

find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)

add_executable(saxpy saxpy.cu)
target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
target_compile_features(saxpy PRIVATE cxx_std_14)
target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
  --expt-relaxed-constexpr>)

install(TARGETS saxpy)
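
The only CUDA-specific compile option above is --expt-relaxed-constexpr, which lets device code call plain host constexpr functions. A minimal sketch of the kind of code the flag permits (hypothetical, not part of this commit; the scale() helper and demo kernel are made up for illustration):

#include <cuda_runtime.h>
#include <cstdio>

// Ordinary host-side constexpr helper, no __device__ annotation.
constexpr float scale(float a) { return 2.0f * a; }

// With --expt-relaxed-constexpr nvcc accepts this call from device code;
// without the flag it is rejected at compile time.
__global__ void demo(float *out) { out[threadIdx.x] = scale(1.0f); }

int main() {
  float *out = nullptr;
  cudaMalloc(&out, sizeof(float));
  demo<<<1, 1>>>(out);

  float host = 0.0f;
  cudaMemcpy(&host, out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("demo wrote %f\n", host);

  cudaFree(out);
  return 0;
}

Built with something like nvcc --expt-relaxed-constexpr demo.cu; dropping the flag turns the scale() call into a compile error.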
pkgs/development/compilers/cudatoolkit/saxpy/default.nix (new file, +50)

{ autoAddOpenGLRunpathHook
, backendStdenv
, cmake
, cuda_cccl
, cuda_cudart
, cudaFlags
, cuda_nvcc
, lib
, libcublas
, setupCudaHook
, stdenv
}:

backendStdenv.mkDerivation {
  pname = "saxpy";
  version = "unstable-2023-07-11";

  src = ./.;

  buildInputs = [
    libcublas
    cuda_cudart
    cuda_cccl
  ];
  nativeBuildInputs = [
    cmake

    # NOTE: this needs to be pkgs.buildPackages.cudaPackages_XX_Y.cuda_nvcc for
    # cross-compilation to work. This should work automatically once we move to
    # spliced scopes. Delete this comment once that happens
    cuda_nvcc

    # Alternatively, we could remove the propagated hook from cuda_nvcc and add
    # directly:
    # setupCudaHook
    autoAddOpenGLRunpathHook
  ];

  cmakeFlags = [
    "-DCMAKE_VERBOSE_MAKEFILE=ON"
    "-DCMAKE_CUDA_ARCHITECTURES=${with cudaFlags; builtins.concatStringsSep ";" (map dropDot cudaCapabilities)}"
  ];

  meta = {
    description = "A simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
    license = lib.licenses.mit;
    maintainers = lib.teams.cuda.members;
    platforms = lib.platforms.unix;
  };
}
pkgs/development/compilers/cudatoolkit/saxpy/saxpy.cu (new file, +68)

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <vector>

#include <stdio.h>

static inline void check(cudaError_t err, const char *context) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

#define CHECK(x) check(x, #x)

__global__ void saxpy(int n, float a, float *x, float *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

int main(void) {
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;

  std::vector<float> xHost(N), yHost(N);
  for (int i = 0; i < N; i++) {
    xHost[i] = 1.0f;
    yHost[i] = 2.0f;
  }

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice, *yDevice;
  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));

  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
                   cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  CHECK(cudaGetLastError());

  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
                   cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = max(maxError, abs(yHost[i] - 4.0f));
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));
}
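
CMakeLists.txt links CUDA::cublas, but the program above only exercises the hand-written kernel and the runtime API. For comparison, here is a minimal sketch (not part of this commit, error handling omitted) of the same y = a*x + y update done through cuBLAS, reusing the sizes and initial values from saxpy.cu:

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  constexpr int N = 1 << 10;
  std::vector<float> xHost(N, 1.0f), yHost(N, 2.0f);

  float *xDevice, *yDevice;
  cudaMalloc(&xDevice, N * sizeof(float));
  cudaMalloc(&yDevice, N * sizeof(float));
  cudaMemcpy(xDevice, xHost.data(), N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(yDevice, yHost.data(), N * sizeof(float), cudaMemcpyHostToDevice);

  // y = alpha * x + y, computed by cuBLAS instead of the hand-written kernel.
  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 2.0f;
  cublasSaxpy(handle, N, &alpha, xDevice, 1, yDevice, 1);
  cublasDestroy(handle);

  cudaMemcpy(yHost.data(), yDevice, N * sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("y[0] = %f (expected 4.0)\n", yHost[0]);

  cudaFree(xDevice);
  cudaFree(yDevice);
  return 0;
}

With x filled with 1.0f, y with 2.0f, and alpha = 2.0f, every element ends up at 4.0f, matching the max-error check in saxpy.cu.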