nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at python-updates 68 lines 1.9 kB view raw
// Minimal SAXPY smoke test: prints the CUDA runtime/driver versions, runs
// y = a * x + y on the device for N elements, copies the result back and
// checks it against the expected value on the host.
//
// NOTE(review): no cuBLAS symbol is referenced in this file; the include is
// kept as-is — presumably it exercises the header/toolchain, confirm before
// removing.
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <vector>

#include <stdio.h>

// Aborts the process with a diagnostic on stderr if `err` is not cudaSuccess.
// `context` is the text printed to identify the failing call.
static inline void check(cudaError_t err, const char *context) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
    std::exit(EXIT_FAILURE);
  }
}

// Wraps a CUDA runtime call, using the stringified expression as the context.
#define CHECK(x) check(x, #x)

// Computes y[i] = a * x[i] + y[i] for 0 <= i < n.
//
// Expects a 1D launch in which each thread handles at most one element, so
// gridDim.x * blockDim.x must be >= n; threads whose index falls past the end
// do nothing. Uses no shared memory. x is only read, y is read and written.
__global__ void saxpy(int n, float a, const float *x, float *y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    y[i] = a * x[i] + y[i];
}

// Returns EXIT_SUCCESS when every output element matches the expected value,
// EXIT_FAILURE otherwise (including on any CUDA API error, via CHECK).
int main(void) {
  setbuf(stderr, NULL);
  fprintf(stderr, "Start\n");

  int rtVersion, driverVersion;
  CHECK(cudaRuntimeGetVersion(&rtVersion));
  CHECK(cudaDriverGetVersion(&driverVersion));

  fprintf(stderr, "Runtime version: %d\n", rtVersion);
  fprintf(stderr, "Driver version: %d\n", driverVersion);

  constexpr int N = 1 << 10;
  constexpr size_t bytes = N * sizeof(float);

  // Problem parameters kept in one place so the expected result is derived
  // from them instead of being a separate magic number.
  constexpr float a = 2.0f;
  constexpr float xInit = 1.0f;
  constexpr float yInit = 2.0f;
  constexpr float expected = a * xInit + yInit;
  // The expected value is exactly representable; the tolerance only guards
  // against harmless rounding differences.
  constexpr float tolerance = 1e-5f;

  std::vector<float> xHost(N, xInit), yHost(N, yInit);

  fprintf(stderr, "Host memory initialized, copying to the device\n");
  fflush(stderr);

  float *xDevice = nullptr, *yDevice = nullptr;
  CHECK(cudaMalloc(&xDevice, bytes));
  CHECK(cudaMalloc(&yDevice, bytes));

  CHECK(cudaMemcpy(xDevice, xHost.data(), bytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(yDevice, yHost.data(), bytes, cudaMemcpyHostToDevice));
  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");

  constexpr int threadsPerBlock = 256;
  // Ceil-division so the grid covers N even when it is not a multiple of the
  // block size.
  constexpr int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

  saxpy<<<blocks, threadsPerBlock>>>(N, a, xDevice, yDevice);
  fprintf(stderr, "Scheduled a kernel call\n");
  // Launch-configuration errors are reported here ...
  CHECK(cudaGetLastError());
  // ... while faults raised during execution only surface at a synchronizing
  // call; sync explicitly so they are attributed to the kernel, not to the
  // copy below.
  CHECK(cudaDeviceSynchronize());

  CHECK(cudaMemcpy(yHost.data(), yDevice, bytes, cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  int mismatches = 0;
  for (float y : yHost) {
    const float err = std::fabs(y - expected);
    if (err > maxError)
      maxError = err;
    // Negated comparison so that a NaN result is counted as a mismatch.
    if (!(err <= tolerance))
      ++mismatches;
  }
  fprintf(stderr, "Max error: %f\n", maxError);

  CHECK(cudaFree(xDevice));
  CHECK(cudaFree(yDevice));

  if (mismatches != 0) {
    fprintf(stderr, "Verification failed: %d of %d elements differ from %f\n",
            mismatches, N, expected);
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}