pkgs/development/cuda-modules/packages/saxpy/src/saxpy.cu at python-updates

tjh.dev / nixpkgs

fork atom

nixpkgs mirror (for testing) github.com/NixOS/nixpkgs

nix

fork atom

nixpkgs / pkgs / development / cuda-modules / packages / saxpy / src / saxpy.cu

at python-updates 68 lines 1.9 kB view raw

wrap content

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <vector>

#include <stdio.h>
 6
 7static inline void check(cudaError_t err, const char *context) {
 8  if (err != cudaSuccess) {
 9    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
10    std::exit(EXIT_FAILURE);
11  }
12}
13
// Evaluates a CUDA runtime call and hands its status to check(), using the
// source text of the call itself as the context string.
#define CHECK(call) check((call), #call)
15
// Computes y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so any launch with at
// least one thread covers all n elements; threads whose starting index is
// >= n do nothing.
//
// n - number of elements to process; values <= 0 make the kernel a no-op.
// a - scalar multiplier applied to x.
// x - device-accessible pointer to n input floats (only read).
// y - device-accessible pointer to n floats, updated in place.
__global__ void saxpy(int n, float a, const float *x, float *y) {
  // Index math is done in 64 bits: the unsigned 32-bit product
  // blockIdx.x * blockDim.x can wrap, or become negative once narrowed to
  // int, which would let an out-of-range thread pass the bounds check.
  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
  const long long first =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (long long i = first; i < n; i += stride)
    y[i] = a * x[i] + y[i];
}
21
// GPU smoke test: prints the CUDA runtime and driver versions, runs the saxpy
// kernel on two small constant vectors, and reports the largest deviation
// from the expected result on stderr.
//
// Returns EXIT_SUCCESS when every output element matches the expected value
// within a small tolerance, EXIT_FAILURE otherwise. A failing CUDA call
// terminates the process through CHECK.
int main(void) {
  // Unbuffered stderr: every progress message is written out immediately.
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion = 0, driverVersion = 0;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;
  constexpr int kThreadsPerBlock = 256;
  constexpr size_t kBytes = N * sizeof(float);

  // Inputs are constant vectors, so the expected output is one known value.
  constexpr float kA = 2.0f;
  constexpr float kX = 1.0f;
  constexpr float kY = 2.0f;
  constexpr float kExpected = kA * kX + kY;
  constexpr float kTolerance = 1e-5f;

  std::vector<float> xHost(N, kX), yHost(N, kY);

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice = nullptr, *yDevice = nullptr;
  CHECK(cudaMalloc(&xDevice, kBytes));
  CHECK(cudaMalloc(&yDevice, kBytes));

  CHECK(cudaMemcpy(xDevice, xHost.data(), kBytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), kBytes, cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  // Ceil-divide so the grid covers N even when it is not a multiple of the
  // block size.
  constexpr int kBlocks = (N + kThreadsPerBlock - 1) / kThreadsPerBlock;
  saxpy<<<kBlocks, kThreadsPerBlock>>>(N, kA, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  // Kernel launches return no status; launch errors are picked up here.
  CHECK(cudaGetLastError());

  // Blocking copy: it completes only after the kernel has finished, and
  // reports any error the kernel raised while running.
  CHECK(cudaMemcpy(yHost.data(), yDevice, kBytes, cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  bool resultOk = true;
  for (float y : yHost) {
    const float err = std::fabs(y - kExpected);
    maxError = std::fmax(maxError, err);
    // Negated comparison so that a NaN result is treated as a failure too.
    if (!(err <= kTolerance))
      resultOk = false;
  }
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));

  // Surface a wrong result through the exit status so callers that only
  // check the return code notice the failure.
  if (!resultOk) {
    fprintf(stderr, "Result verification failed\n");
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}