// CUDA implementation of the flood simulation (see flood.h for the shared types).
1#include <cstdlib>
2#include <cuda.h>
3#include <float.h>
4#include <stdio.h>
5#include <utility>
6
7#include "flood.h"
8
/* Error-checking macros for CUDA API calls (CCF) and kernel launches (CCK).
 * Wrapped in do { ... } while (0) so each macro expands to exactly one
 * statement: the previous bare-block form breaks inside an unbraced
 * if/else ("if (cond) CCF(x); else ..." would not compile). */
#define CCF(call) \
    do { \
        cudaError_t check = call; \
        if (check != cudaSuccess) \
            fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \
    } while (0)

/* Checks the last launch error; in-kernel faults only surface at the next
 * synchronising call (e.g. the cudaMemcpy of the statistics). */
#define CCK() \
    do { \
        cudaError_t check = cudaGetLastError(); \
        if (check != cudaSuccess) \
            fprintf( \
                stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \
            ); \
    } while (0)
25
/* Row-major 2D indexing helpers; `index` is an int2 (x = column, y = row).
 * Arguments are parenthesized so that expression arguments expand safely. */
#define get_global(buffer, index) (buffer)[(index).y * sd.x_count + (index).x]
#define get_shared(buffer, index) (buffer)[(index).y * SHARED_X_COUNT + (index).x]

/* Conversion between scenario coordinates ([0, SCENARIO_SIZE), presumably km
 * — see flood.h) and matrix cell indices. Parenthesized for the same reason. */
#define COORD_SCEN2MAT_X(x) ((x) * sd.x_count / SCENARIO_SIZE)
#define COORD_SCEN2MAT_Y(y) ((y) * sd.y_count / SCENARIO_SIZE)
#define COORD_MAT2SCEN_X(x) ((x) * SCENARIO_SIZE / sd.x_count)
#define COORD_MAT2SCEN_Y(y) ((y) * SCENARIO_SIZE / sd.y_count)
33
/* Per-scenario constants, uploaded once into constant memory (`sd` below). */
typedef struct static_data {
    int x_count;      // matrix columns
    int y_count;      // matrix rows
    float ex_factor;  // rain exchange factor (from struct parameters)
    int cloud_count;  // number of clouds in the scenario
} StaticData;

/* Simulation statistics, kept in device global memory and copied back to the
 * host once per minute. Totals are in the fixed-point units of FIXED(). */
typedef struct statistics {
    double max_spillage_iter;      // largest spillage seen during the current minute
    int max_spillage_minute;       // minute in which the scenario maximum occurred
    double max_spillage_scenario;  // scenario-wide maximum spillage
    double max_water_scenario;     // highest water level observed in any cell

    int minute;  // current simulation minute (advanced by next_iteration)

    unsigned long long total_rain;        // fixed-point sum of all rain fallen
    unsigned long long total_water;       // fixed-point sum of remaining water
    unsigned long long total_water_loss;  // fixed-point water drained off the grid edges
} Statistics;

// Broadcast read-only constants to all kernels via constant memory.
__constant__ StaticData sd;
55
/* Atomic max on a double, built from the 64-bit unsigned atomicMax.
 * For non-negative doubles the IEEE-754 bit pattern preserves numeric order,
 * so comparing the raw bits is equivalent to comparing the values.
 * NOTE(review): this ordering trick breaks for negative inputs — every call
 * site in this file passes values >= 0. Returns the previous value. */
__device__ __forceinline__ double atomicMaxDouble(double *address, double value) {
    unsigned long long *raw_address = (unsigned long long *)address;
    unsigned long long previous_bits = atomicMax(raw_address, __double_as_longlong(value));
    return __longlong_as_double(previous_bits);
}
61
/* Butterfly-style max reduction across the 32 lanes of the calling warp.
 * The final maximum is valid in lane 0 only; all 32 lanes must be active
 * (the shuffles use the full 0xffffffff participation mask).
 * `warpSize` is a runtime built-in, not a compile-time constant, so using it
 * as the loop bound prevented #pragma unroll from fully unrolling; NVIDIA
 * warps are architecturally 32 lanes wide, hence the literal. */
template <typename T> __device__ __forceinline__ T warpReduceMax(T value) {
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset >>= 1)
        value = max(value, __shfl_down_sync(0xffffffff, value, offset));

    return value;
}
69
/* Butterfly-style sum reduction across the 32 lanes of the calling warp.
 * The final sum is valid in lane 0 only; all 32 lanes must be active
 * (the shuffles use the full 0xffffffff participation mask).
 * Uses the literal warp width instead of the runtime `warpSize` so that
 * #pragma unroll can fully unroll the loop at compile time. */
template <typename T> __device__ __forceinline__ T warpReduceSum(T value) {
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset >>= 1)
        value = value + __shfl_down_sync(0xffffffff, value, offset);

    return value;
}
77
/* True when `index` addresses a valid cell of the sd.x_count-by-sd.y_count matrix. */
__device__ __forceinline__ bool is_in_bounds(int2 index) {
    bool x_valid = 0 <= index.x && index.x < sd.x_count;
    bool y_valid = 0 <= index.y && index.y < sd.y_count;
    return x_valid && y_valid;
}
81
/* Component-wise addition of two int2 cell indices. */
__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) {
    int2 sum;
    sum.x = lhs.x + rhs.x;
    sum.y = lhs.y + rhs.y;
    return sum;
}
85
/* Single-thread kernel run once per simulated minute: folds the per-minute
 * spillage maximum into the scenario-wide maximum (recording when it
 * happened) and advances the simulation clock. */
__global__ void next_iteration(Statistics *stats) {
    double iteration_maximum = stats->max_spillage_iter;

    if (iteration_maximum > stats->max_spillage_scenario) {
        stats->max_spillage_scenario = iteration_maximum;
        stats->max_spillage_minute = stats->minute;
    }

    stats->minute++;
}
94
/* Step 1: rain. Launched with one block per cloud (see do_compute); the
 * block's threads cooperatively add rain to every matrix cell under the
 * cloud. Block 0 also resets the per-minute spillage maximum. */
__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) {
    int cloud_index = blockIdx.x;
    int thread_index = threadIdx.x;

    // Instead of the full Cloud_t, only the relevant fields are loaded into shared memory.
    __shared__ float cloud_x;
    __shared__ float cloud_y;
    __shared__ float cloud_intensity;
    __shared__ float cloud_radius;

    if (thread_index == 0) {
        if (cloud_index == 0) {
            // New minute: clear the iteration-wide spillage maximum
            // (folded into the scenario stats later by next_iteration).
            stats->max_spillage_iter = 0;
        }

        /* Step 1.1: Clouds movement */
        Cloud_t cloud = clouds[cloud_index];

        // Advance the cloud one minute along its heading (angle in degrees;
        // speed presumably distance per hour, hence /60 — TODO confirm units).
        float km_minute = cloud.speed / 60;
        cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0);
        cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0);

        clouds[cloud_index] = cloud;

        // Load into shared memory.
        cloud_x = cloud.x;
        cloud_y = cloud.y;
        cloud_intensity = cloud.intensity;
        cloud_radius = cloud.radius;
    }

    __syncthreads();

    // Cloud bounding box, clamped to the scenario and converted from scenario
    // coordinates to fractional matrix indices.
    float lhs_x_index = COORD_SCEN2MAT_X(max(0.0f, cloud_x - cloud_radius));
    float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0f, cloud_y - cloud_radius));

    float rhs_x_index = COORD_SCEN2MAT_X(min(cloud_x + cloud_radius, (float)SCENARIO_SIZE));
    float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud_y + cloud_radius, (float)SCENARIO_SIZE));

    int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index));
    int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index));
    // x_count/y_count derive only from block-uniform shared values, so this
    // early return is uniform: no thread is left behind at the shuffles below.
    if (x_count == 0 || y_count == 0) return;

    // Thread t starts at cell t of the box (row-major) and advances by
    // blockDim.x cells per iteration.
    int x_offset = thread_index % x_count;
    int y_offset = thread_index / x_count;
    int x_stride = blockDim.x % x_count;
    int y_stride = blockDim.x / x_count;

    int cell_count = x_count * y_count;
    float distance_scale = sqrt(cloud_intensity) / cloud_radius;

    // Per-thread rain sum, in the fixed-point units of FIXED() (see flood.h).
    unsigned long long total_rain_thread = 0;

    for (int i = thread_index; i < cell_count; i += blockDim.x) {
        float float_x_index = lhs_x_index + x_offset;
        float float_y_index = lhs_y_index + y_offset;

        // Manually updating offset and stride is faster
        // than both a division and modulo every iteration.
        x_offset += x_stride;
        y_offset += y_stride;
        if (x_offset >= x_count) {
            x_offset -= x_count;
            y_offset += 1;
        }

        int2 int_index = make_int2((int)float_x_index, (int)float_y_index);
        if (!is_in_bounds(int_index)) continue;

        // Distance from the cell back to the cloud centre, in scenario units.
        float dx = COORD_MAT2SCEN_X(float_x_index) - cloud_x;
        float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud_y;

        float distance = sqrt(dx * dx + dy * dy);
        if (distance >= cloud_radius) continue;

        // Rainfall decreases linearly with distance from the cloud centre.
        float rain = sd.ex_factor * max(0.0f, cloud_intensity - distance * distance_scale);
        // Presumably mm/h -> m per minute before fixed-point conversion
        // (/1000, /60) — confirm against flood.h / the sequential reference.
        int meters_per_minute = FIXED(rain / 1000 / 60);

        total_rain_thread += meters_per_minute;
        atomicAdd(&get_global(water_level, int_index), meters_per_minute);
    }

    // One atomic per warp instead of one per thread; every thread of the
    // block reaches this point, so the reductions see full warps.
    unsigned long long total_rain_warp = warpReduceSum(total_rain_thread);

    int warp_lane_index = thread_index % warpSize;
    if (warp_lane_index == 0 && total_rain_warp > 0) {
        atomicAdd(&stats->total_rain, total_rain_warp);
    }
}
184
// Shared-memory tile for step2_3: a 32x8 thread block plus 2 halo cells on
// both sides of each axis (the spillage stencil reads neighbours of
// neighbours, i.e. cells up to distance 2 away).
constexpr int SHARED_X_COUNT = 32 + 4;
constexpr int SHARED_Y_COUNT = 8 + 4;
constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT;
189
/* Computes one cell's water-level update for the current minute.
 *
 * The owning thread evaluates spillage for its own cell and its 4 direct
 * neighbours (the "lhs" cells); each lhs cell in turn spills towards its own
 * 4 neighbours ("rhs") — this distance-2 stencil is why the shared tile
 * carries a 2-cell halo. When is_fully_in_bounds is true all bounds checks
 * compile away; otherwise out-of-bounds rhs neighbours are modelled so that
 * water drains freely off the grid edge.
 * Writes the updated level to water_level_write and folds this thread's
 * largest spillage into *max_spillage_iter_thread. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,

    float *max_spillage_iter_thread
) {
    // Position of this thread's cell inside the haloed shared tile.
    int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2);

    // Net fixed-point change to this cell's water level this minute.
    int cell_water_level_delta_int = 0;
    unsigned long long total_water_loss_thread = 0;

    float cell_height = get_shared(height_shared, cell_index_shared);

    // Index 0 is the cell itself; 1..4 are N, S, W, E neighbours.
    constexpr int2 offsets[5] = {{0, 0}, {0, -1}, {0, +1}, {-1, 0}, {+1, 0}};

#pragma unroll
    for (int outer_index = 0; outer_index < 5; outer_index += 1) {
        int2 lhs_index = cell_index + offsets[outer_index];

        if constexpr (!is_fully_in_bounds) {
            if (!is_in_bounds(lhs_index)) continue;
        }

        int2 lhs_index_shared = cell_index_shared + offsets[outer_index];

        // A dry lhs cell spills nothing.
        float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared);
        if (lhs_water_level <= 0.0) continue;

        float height_difference_sum = 0;
        float height_difference_max = 0;
        float height_difference_oob = 0;  // share flowing off the grid edge

        float lhs_height = get_shared(height_shared, lhs_index_shared);

#pragma unroll
        for (int inner_index = 0; inner_index < 4; inner_index += 1) {
            int2 rhs_index = lhs_index + offsets[1 + inner_index];
            int2 rhs_index_shared = lhs_index_shared + offsets[1 + inner_index];

            float rhs_height;

            if constexpr (is_fully_in_bounds) {
                rhs_height = get_shared(height_shared, rhs_index_shared);
            } else {
                if (is_in_bounds(rhs_index)) {
                    rhs_height = get_shared(height_shared, rhs_index_shared);
                } else {
                    // Virtual off-grid neighbour sits at the lhs ground
                    // level, so all of lhs's water can drain towards it.
                    rhs_height = lhs_height - lhs_water_level;
                }
            }

            // Compute level differences
            float height_difference = max(0.0f, lhs_height - rhs_height);

            height_difference_sum += height_difference;
            height_difference_max = max(height_difference_max, height_difference);

            if constexpr (!is_fully_in_bounds) {
                if (!is_in_bounds(rhs_index)) {
                    height_difference_oob += height_difference;
                }
            }
        }

        // A cell cannot spill more water than it holds.
        float lhs_spillage_level = min(lhs_water_level, height_difference_max);

        // Guard against division by a (near-)zero difference sum.
        if (lhs_spillage_level <= 1e-8f * height_difference_sum) continue;
        float proportion = lhs_spillage_level / height_difference_sum;

        // Water arriving at this cell from lhs, proportional to the height gap.
        // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235.
        int incoming = FIXED(proportion * max(0.0f, lhs_height - cell_height) / SPILLAGE_FACTOR);

        cell_water_level_delta_int += incoming;

        // Only the cell itself (outer_index 0) accounts for outgoing water and
        // edge loss; neighbours do that in their own threads.
        // Identical logic to: flood_seq.c:167 and flood_seq.c:218.
        if (outer_index != 0) continue;

        int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR);
        cell_water_level_delta_int -= outgoing;

        if constexpr (!is_fully_in_bounds) {
            // Presumably halved to match the sequential reference's edge
            // accounting — confirm against flood_seq.c.
            total_water_loss_thread += FIXED(proportion * height_difference_oob / 2);
        }

        *max_spillage_iter_thread =
            max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR);
    }

    // Publish the new level into the write buffer (double buffering: the
    // read buffer stays untouched for the other threads this minute).
    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + cell_water_level_delta_int;

    if constexpr (is_fully_in_bounds) return;

    if (total_water_loss_thread > 0) {
        atomicAdd(&stats->total_water_loss, total_water_loss_thread);
    }
}
292
/* Loads one cell of the block's haloed tile into shared memory: the cell's
 * water level (converted from fixed point) and its total height
 * (ground + water). Out-of-bounds halo cells become dry, zero-height cells.
 * `shared_index` is the slot inside the tile; the matching global cell is
 * offset by the 2-cell halo. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void load_cell(
    int *water_level_read,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 shared_index
) {
    int2 global_index = make_int2(
        blockIdx.x * blockDim.x + shared_index.x - 2,
        blockIdx.y * blockDim.y + shared_index.y - 2
    );

    if constexpr (!is_fully_in_bounds) {
        if (!is_in_bounds(global_index)) {
            get_shared(height_shared, shared_index) = 0.0;
            get_shared(water_level_read_shared, shared_index) = 0.0;

            return;
        }
    }

    float water_level_value = FLOATING(get_global(water_level_read, global_index));
    get_shared(water_level_read_shared, shared_index) = water_level_value;

    get_shared(height_shared, shared_index) =
        get_global(ground, global_index) + water_level_value;
}
323
/* Fills the 36x12 shared tile (32x8 block + 2-cell halo) cooperatively, then
 * runs inner_inner_kernel for this thread's cell and warp-reduces the
 * per-minute spillage maximum into stats. Assumes blockDim == (32, 8). */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT]
) {
// Local shorthand for the loads below; #undef'd again immediately after.
#define load_cell(x, y) \
    load_cell<is_fully_in_bounds>( \
        water_level_read, ground, height_shared, water_level_read_shared, make_int2(x, y) \
    )

    // 32 x 08 - top left
    load_cell(threadIdx.x, threadIdx.y);

    // 32 x 04 - bottom left
    if (threadIdx.y < 4) {
        load_cell(threadIdx.x, threadIdx.y + 8);
    }

    // 04 x 08 - top right
    if (threadIdx.x < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y);
    }

    // 04 x 04 - bottom right
    if (threadIdx.x < 4 && threadIdx.y < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y + 8);
    }

// The shorthand must not leak past this function: left defined, it would
// silently rewrite any later use of the load_cell function template in the
// rest of the translation unit.
#undef load_cell

    // All tile loads must be visible before any thread reads its stencil.
    __syncthreads();

    int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x;
    int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y;
    int2 cell_index = make_int2(cell_x_index, cell_y_index);

    float max_spillage_iter_thread = 0.0;

    if constexpr (is_fully_in_bounds) {
        inner_inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            height_shared,
            water_level_read_shared,
            cell_index,
            &max_spillage_iter_thread
        );
    } else {
        // Out-of-bounds threads skip the update but still reach the warp
        // reduction below with their neutral 0.0 contribution.
        if (is_in_bounds(cell_index)) {
            inner_inner_kernel<false>(
                stats,
                water_level_read,
                water_level_write,
                height_shared,
                water_level_read_shared,
                cell_index,
                &max_spillage_iter_thread
            );
        }
    }

    // One atomic per warp: reduce first, then lane 0 publishes.
    float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread);

    int thread_index = blockDim.x * threadIdx.y + threadIdx.x;
    int warp_lane_index = thread_index % warpSize;

    if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) {
        atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp);
    }
}
397
/* Steps 2/3: spillage between neighbouring cells.
 * Chooses the bounds-check-free variant of inner_kernel whenever the whole
 * tile — including its 2-cell halo — lies inside the matrix; edge blocks fall
 * back to the checked variant. */
__global__ void step2_3(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground
) {
    __shared__ float height_shared[SHARED_COUNT];
    __shared__ float water_level_read_shared[SHARED_COUNT];

    int tile_base_x = blockIdx.x * blockDim.x;
    int tile_base_y = blockIdx.y * blockDim.y;

    // The halo spans [base - 2, base + blockDim + 1] on each axis.
    bool halo_inside_x =
        tile_base_x - 2 >= 0 && tile_base_x + (int)blockDim.x + 1 < sd.x_count;
    bool halo_inside_y =
        tile_base_y - 2 >= 0 && tile_base_y + (int)blockDim.y + 1 < sd.y_count;

    if (halo_inside_x && halo_inside_y) {
        inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    } else {
        inner_kernel<false>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    }
}
435
/* Final reduction over the whole matrix: accumulates the remaining water into
 * stats->total_water and the wettest single cell into
 * stats->max_water_scenario. One thread per cell; out-of-grid threads
 * contribute a neutral zero. */
__global__ void calculate_total(Statistics *stats, int *water_level) {
    int2 cell = make_int2(
        blockIdx.x * blockDim.x + threadIdx.x,
        blockIdx.y * blockDim.y + threadIdx.y
    );

    int cell_water = is_in_bounds(cell) ? get_global(water_level, cell) : 0;

    int warp_water_sum = warpReduceSum(cell_water);
    float warp_water_max = warpReduceMax(FLOATING(cell_water));

    // One atomic per warp: only lane 0 publishes the reduced values.
    auto flat_thread_index = threadIdx.x + threadIdx.y * blockDim.x;
    if (flat_thread_index % warpSize == 0) {
        atomicAdd(&stats->total_water, warp_water_sum);
        atomicMaxDouble(&stats->max_water_scenario, warp_water_max);
    }
}
457
/* Host driver: uploads the scenario, runs the minute-by-minute simulation
 * (step1 rain, step2_3 spillage, next_iteration bookkeeping), gathers the
 * final statistics into `r`, and releases all resources. All kernels run in
 * the default stream, so consecutive launches are implicitly ordered and the
 * per-minute stats memcpy synchronises host and device. */
extern "C" void do_compute(struct parameters *p, struct results *r) {
    /* 2. Start global timer */
    CCF(cudaSetDevice(0));

    /* Memory allocation */
    size_t cell_count = (size_t)p->rows * (size_t)p->columns;

    size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds;
    size_t ground_size = sizeof(float) * cell_count;
    size_t water_level_size = sizeof(int) * cell_count;

    StaticData sd_host = {
        .x_count = p->columns,
        .y_count = p->rows,
        .ex_factor = p->ex_factor,
        .cloud_count = p->num_clouds,
    };

    // max_spillage_iter starts at DBL_MAX so the while-condition below always
    // admits the first minute; step1 resets it to 0 at the start of each one.
    Statistics stats_host = {
        .max_spillage_iter = DBL_MAX,
        .max_spillage_minute = 0,
        .max_spillage_scenario = 0.0,
        .max_water_scenario = 0.0,
        .minute = 0,
        .total_rain = 0,
        .total_water = 0,
        .total_water_loss = 0,
    };

    Cloud_t *clouds;
    CCF(cudaMalloc(&clouds, clouds_size));
    CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice));

#ifdef DEBUG
    Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size);
    if (clouds_debug == NULL) {
        printf("Failed to allocate clouds_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    float *ground;
    CCF(cudaMalloc(&ground, ground_size));
    CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice));

    CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice));

    struct statistics *stats_device;
    CCF(cudaMalloc(&stats_device, sizeof(Statistics)));
    CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice));

    // Double-buffered water levels: step2_3 reads one buffer and writes the
    // other; the pointers are swapped at the end of each minute.
    int *water_level_read;
    CCF(cudaMalloc(&water_level_read, water_level_size));
    CCF(cudaMemset(water_level_read, 0, water_level_size));

    int *water_level_write;
    CCF(cudaMalloc(&water_level_write, water_level_size));
    CCF(cudaMemset(water_level_write, 0, water_level_size));

#ifdef DEBUG
    int *water_level_debug = (int *)malloc(water_level_size);
    if (water_level_debug == NULL) {
        printf("Failed to allocate water_level_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    /* CUDA grid: 32x8 blocks matching the shared-memory tile of step2_3. */
    dim3 block(32, 8);
    auto columns_ceil = (p->columns + block.x - 1) / block.x;
    auto rows_ceil = (p->rows + block.y - 1) / block.y;

    dim3 grid(columns_ceil, rows_ceil);

#ifdef DEBUG
    print_matrix(PRECISION_FLOAT, p->rows, p->columns, p->ground, "Ground heights");
#ifndef ANIMATION
    print_clouds(p->num_clouds, p->clouds);
#endif
#endif

    /* Prepare to measure runtime */
    r->runtime = get_time();

    /* Flood simulation */
    while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) {
        // NOTE(review): with num_clouds == 0 this is a zero-block launch,
        // which fails and also leaves max_spillage_iter at DBL_MAX — confirm
        // whether cloudless scenarios are a supported input.
        step1<<<p->num_clouds, 768>>>(stats_device, clouds, water_level_read);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost));

        print_clouds(p->num_clouds, clouds_debug);
#endif
#endif

#ifdef DEBUG
        CCF(cudaMemcpy(
            water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain");
#endif

        step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(
            water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(
            PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage"
        );
#endif
#endif

        next_iteration<<<1, 1>>>(stats_device);
        CCK();

        // Copies minute and max_spillage_iter as these are required for the loop.
        // Also implicitly synchronises the CPU and GPU.
        CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

        std::swap(water_level_read, water_level_write);
    } // End of simulation.

    r->runtime = get_time() - r->runtime;

    /* Statistics: Total remaining water and maximum amount of water in a cell */
    calculate_total<<<grid, block>>>(stats_device, water_level_read);
    CCK();

    // Copies max_water_scenario and total_water.
    // Also implicitly synchronises the CPU and GPU.
    CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

    if (p->final_matrix) {
        // BUG FIX: water_level_read is a device pointer, but print_matrix runs
        // on the host (the DEBUG paths above always copy into a host buffer
        // first). Copy the final matrix back before printing it.
        int *water_level_host = (int *)malloc(water_level_size);
        if (water_level_host == NULL) {
            fprintf(stderr, "Failed to allocate host buffer for the final matrix\n");
        } else {
            CCF(cudaMemcpy(
                water_level_host, water_level_read, water_level_size, cudaMemcpyDeviceToHost
            ));
            print_matrix(
                PRECISION_FIXED, p->rows, p->columns, water_level_host, "Water after spillage"
            );
            free(water_level_host);
        }
    }

    r->max_water_scenario = stats_host.max_water_scenario;
    r->total_water = stats_host.total_water;

    r->minute = stats_host.minute;
    r->total_rain = stats_host.total_rain;
    r->total_water_loss = stats_host.total_water_loss;

    r->max_spillage_minute = stats_host.max_spillage_minute;
    r->max_spillage_scenario = stats_host.max_spillage_scenario;

    /* Free resources */
    CCF(cudaFree(clouds));
    CCF(cudaFree(ground));
    CCF(cudaFree(stats_device));
    CCF(cudaFree(water_level_read));
    CCF(cudaFree(water_level_write));

#ifdef DEBUG
    free(clouds_debug);
    free(water_level_debug);
#endif
}
625}