nixpkgs mirror (for testing)
github.com/NixOS/nixpkgs
nix
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <vector>

#include <stdio.h>
6
// Terminates the program when a CUDA runtime call did not succeed.
//
// err     - status code returned by a CUDA runtime call.
// context - text identifying the call; echoed in the error message.
//
// On cudaSuccess this returns immediately. On any other status it prints the
// context together with the runtime's description of the error to stderr and
// exits the process with EXIT_FAILURE.
static inline void check(cudaError_t err, const char *context) {
  if (err == cudaSuccess)
    return;

  const char *description = cudaGetErrorString(err);
  fprintf(stderr, "CUDA error at %s: %s\n", context, description);
  std::exit(EXIT_FAILURE);
}
13
// Evaluates a CUDA runtime call and passes its status to check(), using the
// call's own source text (via stringification) as the context string.
#define CHECK(call) check(call, #call)
15
// SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Each thread handles at most one element, chosen by its flat 1D global index
// (blockIdx.x * blockDim.x + threadIdx.x), so the launch has to provide at
// least n threads along x. Threads whose index is >= n do nothing.
//
// n - number of elements to process.
// a - scalar multiplier applied to x.
// x - input vector; only read.
// y - vector that is read and overwritten in place.
__global__ void saxpy(int n, float a, float *x, float *y) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n)
    return;

  y[idx] = a * x[idx] + y[idx];
}
21
// Smoke test for the CUDA toolchain: reports the runtime and driver versions,
// runs a 1024-element SAXPY (y = 2 * x + y with x = 1 and y = 2) on the
// device, and compares every output element against the expected value 4.
//
// Progress is logged to unbuffered stderr so that the last message printed
// before a crash or hang identifies the step that failed. Any failing CUDA
// call terminates the process through CHECK.
//
// Returns EXIT_SUCCESS when every element matches the expected value and
// EXIT_FAILURE otherwise.
int main(void) {
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;
  constexpr size_t bytes = N * sizeof(float);
  constexpr int threadsPerBlock = 256;
  // Ceil-division so the grid covers N even when it is not a multiple of the
  // block size.
  constexpr int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

  constexpr float scale = 2.0f;
  constexpr float xInit = 1.0f;
  constexpr float yInit = 2.0f;
  constexpr float expected = scale * xInit + yInit;
  // The expected value is exactly representable; the tolerance only guards
  // against harmless rounding differences.
  constexpr float tolerance = 1e-5f;

  std::vector<float> xHost(N, xInit), yHost(N, yInit);

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice, *yDevice;
  CHECK(cudaMalloc(&xDevice, bytes));
  CHECK(cudaMalloc(&yDevice, bytes));

  CHECK(cudaMemcpy(xDevice, xHost.data(), bytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), bytes, cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  saxpy<<<blocks, threadsPerBlock>>>(N, scale, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  // Launch-configuration errors are reported here ...
  CHECK(cudaGetLastError());
  // ... while faults raised during kernel execution surface at the next
  // synchronizing call; synchronize explicitly so they are attributed to the
  // kernel rather than to the copy below.
  CHECK(cudaDeviceSynchronize());

  CHECK(cudaMemcpy(yHost.data(), yDevice, bytes, cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  bool resultOk = true;
  for (float value : yHost) {
    const float error = std::fabs(value - expected);
    maxError = std::max(maxError, error);
    // Written as a negated <= so that a NaN result also counts as a failure.
    if (!(error <= tolerance))
      resultOk = false;
  }
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));

  if (!resultOk) {
    fprintf(stderr, "SAXPY result does not match the expected value %f\n",
            expected);
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}