// CUDA implementation of the flood simulation (see flood.h for the shared types).
1#include <cstdlib>
2#include <cuda.h>
3#include <float.h>
4#include <stdio.h>
5#include <utility>
6
7#include "flood.h"
8
/* Error-checking macros for CUDA API calls (CCF) and kernel launches (CCK).
 * Wrapped in do { ... } while (0) so each macro expands to exactly one
 * statement: the previous bare-block form breaks inside an unbraced
 * if/else ("if (cond) CCF(x); else ..." would not compile). */
#define CCF(call) \
    do { \
        cudaError_t check = call; \
        if (check != cudaSuccess) \
            fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \
    } while (0)

/* Checks the last launch error; in-kernel faults only surface at the next
 * synchronising call (e.g. the cudaMemcpy of the statistics). */
#define CCK() \
    do { \
        cudaError_t check = cudaGetLastError(); \
        if (check != cudaSuccess) \
            fprintf( \
                stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \
            ); \
    } while (0)
25
/* Row-major 2D indexing helpers; `index` is an int2 (x = column, y = row).
 * Arguments are parenthesized so that expression arguments expand safely. */
#define get_global(buffer, index) (buffer)[(index).y * sd.x_count + (index).x]
#define get_shared(buffer, index) (buffer)[(index).y * SHARED_X_COUNT + (index).x]

/* Conversion between scenario coordinates ([0, SCENARIO_SIZE), presumably km
 * — see flood.h) and matrix cell indices. Parenthesized for the same reason. */
#define COORD_SCEN2MAT_X(x) ((x) * sd.x_count / SCENARIO_SIZE)
#define COORD_SCEN2MAT_Y(y) ((y) * sd.y_count / SCENARIO_SIZE)
#define COORD_MAT2SCEN_X(x) ((x) * SCENARIO_SIZE / sd.x_count)
#define COORD_MAT2SCEN_Y(y) ((y) * SCENARIO_SIZE / sd.y_count)
33
/* Per-scenario constants, uploaded once into constant memory (`sd` below). */
typedef struct static_data {
    int x_count;      // matrix columns
    int y_count;      // matrix rows
    float ex_factor;  // rain exchange factor (from struct parameters)
    int cloud_count;  // number of clouds in the scenario
} StaticData;

/* Simulation statistics, kept in device global memory and copied back to the
 * host once per minute. Totals are in the fixed-point units of FIXED(). */
typedef struct statistics {
    double max_spillage_iter;      // largest spillage seen during the current minute
    int max_spillage_minute;       // minute in which the scenario maximum occurred
    double max_spillage_scenario;  // scenario-wide maximum spillage
    double max_water_scenario;     // highest water level observed in any cell

    int minute;  // current simulation minute (advanced by next_iteration)

    unsigned long long total_rain;        // fixed-point sum of all rain fallen
    unsigned long long total_water;       // fixed-point sum of remaining water
    unsigned long long total_water_loss;  // fixed-point water drained off the grid edges
} Statistics;

// Broadcast read-only constants to all kernels via constant memory.
__constant__ StaticData sd;
55
/* Atomic max on a double, built from the 64-bit unsigned atomicMax.
 * For non-negative doubles the IEEE-754 bit pattern preserves numeric order,
 * so comparing the raw bits is equivalent to comparing the values.
 * NOTE(review): this ordering trick breaks for negative inputs — every call
 * site in this file passes values >= 0. Returns the previous value. */
__device__ __forceinline__ double atomicMaxDouble(double *address, double value) {
    unsigned long long *raw_address = (unsigned long long *)address;
    unsigned long long previous_bits = atomicMax(raw_address, __double_as_longlong(value));
    return __longlong_as_double(previous_bits);
}
61
/* Butterfly-style max reduction across the 32 lanes of the calling warp.
 * The final maximum is valid in lane 0 only; all 32 lanes must be active
 * (the shuffles use the full 0xffffffff participation mask).
 * `warpSize` is a runtime built-in, not a compile-time constant, so using it
 * as the loop bound prevented #pragma unroll from fully unrolling; NVIDIA
 * warps are architecturally 32 lanes wide, hence the literal. */
template <typename T> __device__ __forceinline__ T warpReduceMax(T value) {
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset >>= 1)
        value = max(value, __shfl_down_sync(0xffffffff, value, offset));

    return value;
}
69
/* Butterfly-style sum reduction across the 32 lanes of the calling warp.
 * The final sum is valid in lane 0 only; all 32 lanes must be active
 * (the shuffles use the full 0xffffffff participation mask).
 * Uses the literal warp width instead of the runtime `warpSize` so that
 * #pragma unroll can fully unroll the loop at compile time. */
template <typename T> __device__ __forceinline__ T warpReduceSum(T value) {
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset >>= 1)
        value = value + __shfl_down_sync(0xffffffff, value, offset);

    return value;
}
77
/* True when `index` addresses a valid cell of the sd.x_count-by-sd.y_count matrix. */
__device__ __forceinline__ bool is_in_bounds(int2 index) {
    bool x_valid = 0 <= index.x && index.x < sd.x_count;
    bool y_valid = 0 <= index.y && index.y < sd.y_count;
    return x_valid && y_valid;
}
81
/* Component-wise addition of two int2 cell indices. */
__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) {
    int2 sum;
    sum.x = lhs.x + rhs.x;
    sum.y = lhs.y + rhs.y;
    return sum;
}
85
/* Single-thread kernel run once per simulated minute: folds the per-minute
 * spillage maximum into the scenario-wide maximum (recording when it
 * happened) and advances the simulation clock. */
__global__ void next_iteration(Statistics *stats) {
    double iteration_maximum = stats->max_spillage_iter;

    if (iteration_maximum > stats->max_spillage_scenario) {
        stats->max_spillage_scenario = iteration_maximum;
        stats->max_spillage_minute = stats->minute;
    }

    stats->minute++;
}
94
/* Step 1: rain. Launched with one block per cloud (see do_compute); the
 * block's threads cooperatively add rain to every matrix cell under the
 * cloud. Block 0 also resets the per-minute spillage maximum. */
__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) {
    int cloud_index = blockIdx.x;
    int thread_index = threadIdx.x;

    // Instead of the full Cloud_t, only the relevant fields are loaded into shared memory.
    __shared__ float cloud_x;
    __shared__ float cloud_y;
    __shared__ float cloud_intensity;
    __shared__ float cloud_radius;

    if (thread_index == 0) {
        if (cloud_index == 0) {
            // New minute: clear the iteration-wide spillage maximum
            // (folded into the scenario stats later by next_iteration).
            stats->max_spillage_iter = 0;
        }

        /* Step 1.1: Clouds movement */
        Cloud_t cloud = clouds[cloud_index];

        // Advance the cloud one minute along its heading (angle in degrees;
        // speed presumably distance per hour, hence /60 — TODO confirm units).
        float km_minute = cloud.speed / 60;
        cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0);
        cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0);

        clouds[cloud_index] = cloud;

        // Load into shared memory.
        cloud_x = cloud.x;
        cloud_y = cloud.y;
        cloud_intensity = cloud.intensity;
        cloud_radius = cloud.radius;
    }

    __syncthreads();

    // Cloud bounding box, clamped to the scenario and converted from scenario
    // coordinates to fractional matrix indices.
    float lhs_x_index = COORD_SCEN2MAT_X(max(0.0f, cloud_x - cloud_radius));
    float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0f, cloud_y - cloud_radius));

    float rhs_x_index = COORD_SCEN2MAT_X(min(cloud_x + cloud_radius, (float)SCENARIO_SIZE));
    float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud_y + cloud_radius, (float)SCENARIO_SIZE));

    int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index));
    int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index));
    // x_count/y_count derive only from block-uniform shared values, so this
    // early return is uniform: no thread is left behind at the shuffles below.
    if (x_count == 0 || y_count == 0) return;

    // Thread t starts at cell t of the box (row-major) and advances by
    // blockDim.x cells per iteration.
    int x_offset = thread_index % x_count;
    int y_offset = thread_index / x_count;
    int x_stride = blockDim.x % x_count;
    int y_stride = blockDim.x / x_count;

    int cell_count = x_count * y_count;
    float distance_scale = sqrt(cloud_intensity) / cloud_radius;

    // Per-thread rain sum, in the fixed-point units of FIXED() (see flood.h).
    unsigned long long total_rain_thread = 0;

    for (int i = thread_index; i < cell_count; i += blockDim.x) {
        float float_x_index = lhs_x_index + x_offset;
        float float_y_index = lhs_y_index + y_offset;

        // Manually updating offset and stride is faster
        // than both a division and modulo every iteration.
        x_offset += x_stride;
        y_offset += y_stride;
        if (x_offset >= x_count) {
            x_offset -= x_count;
            y_offset += 1;
        }

        int2 int_index = make_int2((int)float_x_index, (int)float_y_index);
        if (!is_in_bounds(int_index)) continue;

        // Distance from the cell back to the cloud centre, in scenario units.
        float dx = COORD_MAT2SCEN_X(float_x_index) - cloud_x;
        float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud_y;

        float distance = sqrt(dx * dx + dy * dy);
        if (distance >= cloud_radius) continue;

        // Rainfall decreases linearly with distance from the cloud centre.
        float rain = sd.ex_factor * max(0.0f, cloud_intensity - distance * distance_scale);
        // Presumably mm/h -> m per minute before fixed-point conversion
        // (/1000, /60) — confirm against flood.h / the sequential reference.
        int meters_per_minute = FIXED(rain / 1000 / 60);

        total_rain_thread += meters_per_minute;
        atomicAdd(&get_global(water_level, int_index), meters_per_minute);
    }

    // One atomic per warp instead of one per thread; every thread of the
    // block reaches this point, so the reductions see full warps.
    unsigned long long total_rain_warp = warpReduceSum(total_rain_thread);

    int warp_lane_index = thread_index % warpSize;
    if (warp_lane_index == 0 && total_rain_warp > 0) {
        atomicAdd(&stats->total_rain, total_rain_warp);
    }
}
184
// Shared-memory tile for step2_3: a 32x8 thread block plus 2 halo cells on
// both sides of each axis (the spillage stencil reads neighbours of
// neighbours, i.e. cells up to distance 2 away).
constexpr int SHARED_X_COUNT = 32 + 4;
constexpr int SHARED_Y_COUNT = 8 + 4;
constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT;
189
/* Computes one cell's water-level update for the current minute.
 *
 * The owning thread evaluates spillage for its own cell and its 4 direct
 * neighbours (the "lhs" cells); each lhs cell in turn spills towards its own
 * 4 neighbours ("rhs") — this distance-2 stencil is why the shared tile
 * carries a 2-cell halo. When is_fully_in_bounds is true all bounds checks
 * compile away; otherwise out-of-bounds rhs neighbours are modelled so that
 * water drains freely off the grid edge.
 * Writes the updated level to water_level_write and folds this thread's
 * largest spillage into *max_spillage_iter_thread. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,

    float *max_spillage_iter_thread
) {
    // Position of this thread's cell inside the haloed shared tile.
    int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2);

    // Net fixed-point change to this cell's water level this minute.
    int cell_water_level_delta_int = 0;
    unsigned long long total_water_loss_thread = 0;

    float cell_height = get_shared(height_shared, cell_index_shared);

    // Index 0 is the cell itself; 1..4 are N, S, W, E neighbours.
    constexpr int2 offsets[5] = {{0, 0}, {0, -1}, {0, +1}, {-1, 0}, {+1, 0}};

#pragma unroll
    for (int outer_index = 0; outer_index < 5; outer_index += 1) {
        int2 lhs_index = cell_index + offsets[outer_index];

        if constexpr (!is_fully_in_bounds) {
            if (!is_in_bounds(lhs_index)) continue;
        }

        int2 lhs_index_shared = cell_index_shared + offsets[outer_index];

        // A dry lhs cell spills nothing.
        float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared);
        if (lhs_water_level <= 0.0) continue;

        float height_difference_sum = 0;
        float height_difference_max = 0;
        float height_difference_oob = 0;  // share flowing off the grid edge

        float lhs_height = get_shared(height_shared, lhs_index_shared);

#pragma unroll
        for (int inner_index = 0; inner_index < 4; inner_index += 1) {
            int2 rhs_index = lhs_index + offsets[1 + inner_index];
            int2 rhs_index_shared = lhs_index_shared + offsets[1 + inner_index];

            float rhs_height;

            if constexpr (is_fully_in_bounds) {
                rhs_height = get_shared(height_shared, rhs_index_shared);
            } else {
                if (is_in_bounds(rhs_index)) {
                    rhs_height = get_shared(height_shared, rhs_index_shared);
                } else {
                    // Virtual off-grid neighbour sits at the lhs ground
                    // level, so all of lhs's water can drain towards it.
                    rhs_height = lhs_height - lhs_water_level;
                }
            }

            // Compute level differences
            float height_difference = max(0.0f, lhs_height - rhs_height);

            height_difference_sum += height_difference;
            height_difference_max = max(height_difference_max, height_difference);

            if constexpr (!is_fully_in_bounds) {
                if (!is_in_bounds(rhs_index)) {
                    height_difference_oob += height_difference;
                }
            }
        }

        // A cell cannot spill more water than it holds.
        float lhs_spillage_level = min(lhs_water_level, height_difference_max);

        // Guard against division by a (near-)zero difference sum.
        if (lhs_spillage_level <= 1e-8f * height_difference_sum) continue;
        float proportion = lhs_spillage_level / height_difference_sum;

        // Water arriving at this cell from lhs, proportional to the height gap.
        // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235.
        int incoming = FIXED(proportion * max(0.0f, lhs_height - cell_height) / SPILLAGE_FACTOR);

        cell_water_level_delta_int += incoming;

        // Only the cell itself (outer_index 0) accounts for outgoing water and
        // edge loss; neighbours do that in their own threads.
        // Identical logic to: flood_seq.c:167 and flood_seq.c:218.
        if (outer_index != 0) continue;

        int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR);
        cell_water_level_delta_int -= outgoing;

        if constexpr (!is_fully_in_bounds) {
            // Presumably halved to match the sequential reference's edge
            // accounting — confirm against flood_seq.c.
            total_water_loss_thread += FIXED(proportion * height_difference_oob / 2);
        }

        *max_spillage_iter_thread =
            max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR);
    }

    // Publish the new level into the write buffer (double buffering: the
    // read buffer stays untouched for the other threads this minute).
    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + cell_water_level_delta_int;

    if constexpr (is_fully_in_bounds) return;

    if (total_water_loss_thread > 0) {
        atomicAdd(&stats->total_water_loss, total_water_loss_thread);
    }
}
292
/* Loads one cell of the block's haloed tile into shared memory: the cell's
 * water level (converted from fixed point) and its total height
 * (ground + water). Out-of-bounds halo cells become dry, zero-height cells.
 * `shared_index` is the slot inside the tile; the matching global cell is
 * offset by the 2-cell halo. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void load_cell(
    int *water_level_read,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 shared_index
) {
    int2 global_index = make_int2(
        blockIdx.x * blockDim.x + shared_index.x - 2,
        blockIdx.y * blockDim.y + shared_index.y - 2
    );

    if constexpr (!is_fully_in_bounds) {
        if (!is_in_bounds(global_index)) {
            get_shared(height_shared, shared_index) = 0.0;
            get_shared(water_level_read_shared, shared_index) = 0.0;

            return;
        }
    }

    float water_level_value = FLOATING(get_global(water_level_read, global_index));
    get_shared(water_level_read_shared, shared_index) = water_level_value;

    get_shared(height_shared, shared_index) =
        get_global(ground, global_index) + water_level_value;
}
323
/* Fills the 36x12 shared tile (32x8 block + 2-cell halo) cooperatively, then
 * runs inner_inner_kernel for this thread's cell and warp-reduces the
 * per-minute spillage maximum into stats. Assumes blockDim == (32, 8). */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT]
) {
// Local shorthand for the loads below; #undef'd again immediately after.
#define load_cell(x, y) \
    load_cell<is_fully_in_bounds>( \
        water_level_read, ground, height_shared, water_level_read_shared, make_int2(x, y) \
    )

    // 32 x 08 - top left
    load_cell(threadIdx.x, threadIdx.y);

    // 32 x 04 - bottom left
    if (threadIdx.y < 4) {
        load_cell(threadIdx.x, threadIdx.y + 8);
    }

    // 04 x 08 - top right
    if (threadIdx.x < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y);
    }

    // 04 x 04 - bottom right
    if (threadIdx.x < 4 && threadIdx.y < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y + 8);
    }

// The shorthand must not leak past this function: left defined, it would
// silently rewrite any later use of the load_cell function template in the
// rest of the translation unit.
#undef load_cell

    // All tile loads must be visible before any thread reads its stencil.
    __syncthreads();

    int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x;
    int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y;
    int2 cell_index = make_int2(cell_x_index, cell_y_index);

    float max_spillage_iter_thread = 0.0;

    if constexpr (is_fully_in_bounds) {
        inner_inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            height_shared,
            water_level_read_shared,
            cell_index,
            &max_spillage_iter_thread
        );
    } else {
        // Out-of-bounds threads skip the update but still reach the warp
        // reduction below with their neutral 0.0 contribution.
        if (is_in_bounds(cell_index)) {
            inner_inner_kernel<false>(
                stats,
                water_level_read,
                water_level_write,
                height_shared,
                water_level_read_shared,
                cell_index,
                &max_spillage_iter_thread
            );
        }
    }

    // One atomic per warp: reduce first, then lane 0 publishes.
    float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread);

    int thread_index = blockDim.x * threadIdx.y + threadIdx.x;
    int warp_lane_index = thread_index % warpSize;

    if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) {
        atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp);
    }
}
397
/* Steps 2/3: spillage between neighbouring cells.
 * Chooses the bounds-check-free variant of inner_kernel whenever the whole
 * tile — including its 2-cell halo — lies inside the matrix; edge blocks fall
 * back to the checked variant. */
__global__ void step2_3(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground
) {
    __shared__ float height_shared[SHARED_COUNT];
    __shared__ float water_level_read_shared[SHARED_COUNT];

    int tile_base_x = blockIdx.x * blockDim.x;
    int tile_base_y = blockIdx.y * blockDim.y;

    // The halo spans [base - 2, base + blockDim + 1] on each axis.
    bool halo_inside_x =
        tile_base_x - 2 >= 0 && tile_base_x + (int)blockDim.x + 1 < sd.x_count;
    bool halo_inside_y =
        tile_base_y - 2 >= 0 && tile_base_y + (int)blockDim.y + 1 < sd.y_count;

    if (halo_inside_x && halo_inside_y) {
        inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    } else {
        inner_kernel<false>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    }
}
435
/* Final reduction over the whole matrix: accumulates the remaining water into
 * stats->total_water and the wettest single cell into
 * stats->max_water_scenario. One thread per cell; out-of-grid threads
 * contribute a neutral zero. */
__global__ void calculate_total(Statistics *stats, int *water_level) {
    int2 cell = make_int2(
        blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y
    );

    int cell_water = is_in_bounds(cell) ? get_global(water_level, cell) : 0;

    int warp_water_sum = warpReduceSum(cell_water);
    float warp_water_max = warpReduceMax(FLOATING(cell_water));

    // One atomic per warp: only lane 0 publishes the reduced values.
    auto flat_thread_index = threadIdx.x + threadIdx.y * blockDim.x;
    if (flat_thread_index % warpSize == 0) {
        atomicAdd(&stats->total_water, warp_water_sum);
        atomicMaxDouble(&stats->max_water_scenario, warp_water_max);
    }
}
457
/* Host driver: uploads the scenario, runs the minute-by-minute simulation
 * (step1 rain, step2_3 spillage, next_iteration bookkeeping), gathers the
 * final statistics into `r`, and releases all resources. All kernels run in
 * the default stream, so consecutive launches are implicitly ordered and the
 * per-minute stats memcpy synchronises host and device. */
extern "C" void do_compute(struct parameters *p, struct results *r) {
    /* 2. Start global timer */
    CCF(cudaSetDevice(0));

    /* Memory allocation */
    size_t cell_count = (size_t)p->rows * (size_t)p->columns;

    size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds;
    size_t ground_size = sizeof(float) * cell_count;
    size_t water_level_size = sizeof(int) * cell_count;

    StaticData sd_host = {
        .x_count = p->columns,
        .y_count = p->rows,
        .ex_factor = p->ex_factor,
        .cloud_count = p->num_clouds,
    };

    // max_spillage_iter starts at DBL_MAX so the while-condition below always
    // admits the first minute; step1 resets it to 0 at the start of each one.
    Statistics stats_host = {
        .max_spillage_iter = DBL_MAX,
        .max_spillage_minute = 0,
        .max_spillage_scenario = 0.0,
        .max_water_scenario = 0.0,
        .minute = 0,
        .total_rain = 0,
        .total_water = 0,
        .total_water_loss = 0,
    };

    Cloud_t *clouds;
    CCF(cudaMalloc(&clouds, clouds_size));
    CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice));

#ifdef DEBUG
    Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size);
    if (clouds_debug == NULL) {
        printf("Failed to allocate clouds_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    float *ground;
    CCF(cudaMalloc(&ground, ground_size));
    CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice));

    CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice));

    struct statistics *stats_device;
    CCF(cudaMalloc(&stats_device, sizeof(Statistics)));
    CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice));

    // Double-buffered water levels: step2_3 reads one buffer and writes the
    // other; the pointers are swapped at the end of each minute.
    int *water_level_read;
    CCF(cudaMalloc(&water_level_read, water_level_size));
    CCF(cudaMemset(water_level_read, 0, water_level_size));

    int *water_level_write;
    CCF(cudaMalloc(&water_level_write, water_level_size));
    CCF(cudaMemset(water_level_write, 0, water_level_size));

#ifdef DEBUG
    int *water_level_debug = (int *)malloc(water_level_size);
    if (water_level_debug == NULL) {
        printf("Failed to allocate water_level_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    /* CUDA grid: 32x8 blocks matching the shared-memory tile of step2_3. */
    dim3 block(32, 8);
    auto columns_ceil = (p->columns + block.x - 1) / block.x;
    auto rows_ceil = (p->rows + block.y - 1) / block.y;

    dim3 grid(columns_ceil, rows_ceil);

#ifdef DEBUG
    print_matrix(PRECISION_FLOAT, p->rows, p->columns, p->ground, "Ground heights");
#ifndef ANIMATION
    print_clouds(p->num_clouds, p->clouds);
#endif
#endif

    /* Prepare to measure runtime */
    r->runtime = get_time();

    /* Flood simulation */
    while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) {
        // NOTE(review): with num_clouds == 0 this is a zero-block launch,
        // which fails and also leaves max_spillage_iter at DBL_MAX — confirm
        // whether cloudless scenarios are a supported input.
        step1<<<p->num_clouds, 768>>>(stats_device, clouds, water_level_read);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost));

        print_clouds(p->num_clouds, clouds_debug);
#endif
#endif

#ifdef DEBUG
        CCF(cudaMemcpy(
            water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain");
#endif

        step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(
            water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(
            PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage"
        );
#endif
#endif

        next_iteration<<<1, 1>>>(stats_device);
        CCK();

        // Copies minute and max_spillage_iter as these are required for the loop.
        // Also implicitly synchronises the CPU and GPU.
        CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

        std::swap(water_level_read, water_level_write);
    } // End of simulation.

    r->runtime = get_time() - r->runtime;

    /* Statistics: Total remaining water and maximum amount of water in a cell */
    calculate_total<<<grid, block>>>(stats_device, water_level_read);
    CCK();

    // Copies max_water_scenario and total_water.
    // Also implicitly synchronises the CPU and GPU.
    CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

    if (p->final_matrix) {
        // BUG FIX: water_level_read is a device pointer, but print_matrix runs
        // on the host (the DEBUG paths above always copy into a host buffer
        // first). Copy the final matrix back before printing it.
        int *water_level_host = (int *)malloc(water_level_size);
        if (water_level_host == NULL) {
            fprintf(stderr, "Failed to allocate host buffer for the final matrix\n");
        } else {
            CCF(cudaMemcpy(
                water_level_host, water_level_read, water_level_size, cudaMemcpyDeviceToHost
            ));
            print_matrix(
                PRECISION_FIXED, p->rows, p->columns, water_level_host, "Water after spillage"
            );
            free(water_level_host);
        }
    }

    r->max_water_scenario = stats_host.max_water_scenario;
    r->total_water = stats_host.total_water;

    r->minute = stats_host.minute;
    r->total_rain = stats_host.total_rain;
    r->total_water_loss = stats_host.total_water_loss;

    r->max_spillage_minute = stats_host.max_spillage_minute;
    r->max_spillage_scenario = stats_host.max_spillage_scenario;

    /* Free resources */
    CCF(cudaFree(clouds));
    CCF(cudaFree(ground));
    CCF(cudaFree(stats_device));
    CCF(cudaFree(water_level_read));
    CCF(cudaFree(water_level_write));

#ifdef DEBUG
    free(clouds_debug);
    free(water_level_debug);
#endif
}
625}