this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at trunk 625 lines 21 kB view raw
#include <cstdlib>
#include <cuda.h>
#include <float.h>
#include <stdio.h>
#include <utility>

#include "flood.h"

/* Example of macros for error checking in CUDA.
 * Wrapped in do { ... } while (0) so each expands to a single statement and is
 * safe inside an unbraced if/else at the call site. */
#define CCF(call) \
    do { \
        cudaError_t check = call; \
        if (check != cudaSuccess) \
            fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \
    } while (0)

#define CCK() \
    do { \
        cudaError_t check = cudaGetLastError(); \
        if (check != cudaSuccess) \
            fprintf( \
                stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \
            ); \
    } while (0)

// Row-major indexing helpers; `index` is an int2 (x = column, y = row).
// Arguments are parenthesized for macro hygiene.
#define get_global(buffer, index) (buffer)[(index).y * sd.x_count + (index).x]
#define get_shared(buffer, index) (buffer)[(index).y * SHARED_X_COUNT + (index).x]

// Conversions between scenario coordinates ([0, SCENARIO_SIZE], from flood.h)
// and matrix indices ([0, sd.x_count) / [0, sd.y_count)).
#define COORD_SCEN2MAT_X(x) ((x) * sd.x_count / SCENARIO_SIZE)
#define COORD_SCEN2MAT_Y(y) ((y) * sd.y_count / SCENARIO_SIZE)
#define COORD_MAT2SCEN_X(x) ((x) * SCENARIO_SIZE / sd.x_count)
#define COORD_MAT2SCEN_Y(y) ((y) * SCENARIO_SIZE / sd.y_count)

/* Read-only per-run parameters; one copy lives in constant memory (`sd`). */
typedef struct static_data {
    int x_count;     // number of columns in the matrix
    int y_count;     // number of rows in the matrix
    float ex_factor; // rain scaling factor (from struct parameters)
    int cloud_count;
} StaticData;

/* Mutable simulation statistics; a single instance in device global memory,
 * copied back to the host once per minute to drive the simulation loop. */
typedef struct statistics {
    double max_spillage_iter;     // maximum spillage during the current minute
    int max_spillage_minute;      // minute in which the scenario maximum occurred
    double max_spillage_scenario; // maximum spillage over the whole scenario
    double max_water_scenario;    // maximum water level in any cell (set at the end)

    int minute; // current simulated minute

    unsigned long long total_rain;       // fixed-point accumulators
    unsigned long long total_water;
    unsigned long long total_water_loss; // water spilled off the map edges
} Statistics;

__constant__ StaticData sd;

/* Atomic max for doubles via their bit patterns. Correct only while every
 * value involved is non-negative (then the unsigned integer order of the bit
 * patterns matches the floating-point order), which holds for all uses here. */
__device__ __forceinline__ double atomicMaxDouble(double *address, double value) {
    return __longlong_as_double(
        atomicMax((unsigned long long *)address, __double_as_longlong(value))
    );
}

/* Tree reduction (max) over a warp using shuffles. Assumes all 32 lanes
 * participate; the result is only meaningful in lane 0. */
template <typename T> __device__ __forceinline__ T warpReduceMax(T value) {
#pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
        value = max(value, __shfl_down_sync(0xffffffff, value, offset));

    return value;
}

/* Tree reduction (sum) over a warp using shuffles. Assumes all 32 lanes
 * participate; the result is only meaningful in lane 0. */
template <typename T> __device__ __forceinline__ T warpReduceSum(T value) {
#pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
        value = value + __shfl_down_sync(0xffffffff, value, offset);

    return value;
}

// True when `index` addresses a cell inside the matrix.
__device__ __forceinline__ bool is_in_bounds(int2 index) {
    return index.x >= 0 && index.x < sd.x_count && index.y >= 0 && index.y < sd.y_count;
}

// Component-wise addition for int2 indices.
__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) {
    return make_int2(lhs.x + rhs.x, lhs.y + rhs.y);
}

/* Runs once per simulated minute (<<<1, 1>>>): folds the per-minute spillage
 * maximum into the scenario-wide maximum and advances the simulated clock. */
__global__ void next_iteration(Statistics *stats) {
    if (stats->max_spillage_iter > stats->max_spillage_scenario) {
        stats->max_spillage_scenario = stats->max_spillage_iter;
        stats->max_spillage_minute = stats->minute;
    }

    stats->minute += 1;
}

/* Step 1: cloud movement and rainfall. One block per cloud; thread 0 advances
 * the cloud, then all threads of the block distribute rain over the matrix
 * cells inside the cloud's radius. Launched as step1<<<cloud_count, 768>>>. */
__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) {
    int cloud_index = blockIdx.x;
    int thread_index = threadIdx.x;

    // Instead of the full Cloud_t, only the relevant fields are loaded into shared memory.
    __shared__ float cloud_x;
    __shared__ float cloud_y;
    __shared__ float cloud_intensity;
    __shared__ float cloud_radius;

    if (thread_index == 0) {
        if (cloud_index == 0) {
            // Exactly one thread resets the per-minute maximum before step 2/3.
            stats->max_spillage_iter = 0;
        }

        /* Step 1.1: Clouds movement */
        Cloud_t cloud = clouds[cloud_index];

        float km_minute = cloud.speed / 60;
        // NOTE(review): cos/sin promote to double here; presumably intentional
        // to match the sequential reference (flood_seq) — confirm before
        // switching to the float variants.
        cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0);
        cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0);

        clouds[cloud_index] = cloud;

        // Load into shared memory.
        cloud_x = cloud.x;
        cloud_y = cloud.y;
        cloud_intensity = cloud.intensity;
        cloud_radius = cloud.radius;
    }

    __syncthreads();

    // Bounding box of the cloud in (fractional) matrix coordinates, clamped
    // to the scenario area.
    float lhs_x_index = COORD_SCEN2MAT_X(max(0.0f, cloud_x - cloud_radius));
    float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0f, cloud_y - cloud_radius));

    float rhs_x_index = COORD_SCEN2MAT_X(min(cloud_x + cloud_radius, (float)SCENARIO_SIZE));
    float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud_y + cloud_radius, (float)SCENARIO_SIZE));

    int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index));
    int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index));
    // Uniform across the block (all inputs come from shared memory), so this
    // early return cannot split a warp before the reductions below.
    if (x_count == 0 || y_count == 0) return;

    int x_offset = thread_index % x_count;
    int y_offset = thread_index / x_count;
    int x_stride = blockDim.x % x_count;
    int y_stride = blockDim.x / x_count;

    int cell_count = x_count * y_count;
    float distance_scale = sqrt(cloud_intensity) / cloud_radius;

    unsigned long long total_rain_thread = 0;

    // Threads stride over the bounding box; (x_offset, y_offset) tracks
    // i % x_count and i / x_count incrementally.
    for (int i = thread_index; i < cell_count; i += blockDim.x) {
        float float_x_index = lhs_x_index + x_offset;
        float float_y_index = lhs_y_index + y_offset;

        // Manually updating offset and stride is faster
        // than both a division and modulo every iteration.
        x_offset += x_stride;
        y_offset += y_stride;
        if (x_offset >= x_count) {
            x_offset -= x_count;
            y_offset += 1;
        }

        int2 int_index = make_int2((int)float_x_index, (int)float_y_index);
        if (!is_in_bounds(int_index)) continue;

        // Distance from the cell to the cloud centre, in scenario coordinates.
        float dx = COORD_MAT2SCEN_X(float_x_index) - cloud_x;
        float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud_y;

        float distance = sqrt(dx * dx + dy * dy);
        if (distance >= cloud_radius) continue;

        // Rain decays linearly with distance; converted to fixed point per minute.
        float rain = sd.ex_factor * max(0.0f, cloud_intensity - distance * distance_scale);
        int meters_per_minute = FIXED(rain / 1000 / 60);

        total_rain_thread += meters_per_minute;
        atomicAdd(&get_global(water_level, int_index), meters_per_minute);
    }

    // One atomic per warp instead of one per thread.
    unsigned long long total_rain_warp = warpReduceSum(total_rain_thread);

    int warp_lane_index = thread_index % warpSize;
    if (warp_lane_index == 0 && total_rain_warp > 0) {
        atomicAdd(&stats->total_rain, total_rain_warp);
    }
}

// 2 halo cells on both sides.
// Shared-memory tile for step 2/3: a 32 x 8 block of cells plus a 2-cell halo
// on every side (two halo cells are needed because spillage for a cell reads
// its neighbours' neighbours).
constexpr int SHARED_X_COUNT = 32 + 4;
constexpr int SHARED_Y_COUNT = 8 + 4;
constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT;

/* Computes the new water level of one cell (steps 2 and 3 of the simulation).
 * For the centre cell and each of its four neighbours (lhs), it distributes
 * that lhs cell's spillage over the lhs neighbourhood and accumulates the
 * share flowing into `cell_index` (plus the outgoing share when lhs is the
 * centre cell itself). The template parameter selects a branch-free fast path
 * for tiles whose whole halo lies inside the matrix. Writes the updated level
 * to global memory and accumulates loss/spillage statistics. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,

    float *max_spillage_iter_thread
) {
    // Position of this thread's cell inside the tile (+2 skips the halo).
    int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2);

    int cell_water_level_delta_int = 0;
    unsigned long long total_water_loss_thread = 0;

    float cell_height = get_shared(height_shared, cell_index_shared);

    // Centre cell followed by its four von Neumann neighbours.
    constexpr int2 offsets[5] = {{0, 0}, {0, -1}, {0, +1}, {-1, 0}, {+1, 0}};

#pragma unroll
    for (int outer_index = 0; outer_index < 5; outer_index += 1) {
        int2 lhs_index = cell_index + offsets[outer_index];

        if constexpr (!is_fully_in_bounds) {
            if (!is_in_bounds(lhs_index)) continue;
        }

        int2 lhs_index_shared = cell_index_shared + offsets[outer_index];

        float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared);
        if (lhs_water_level <= 0.0) continue; // dry cells cannot spill

        float height_difference_sum = 0;
        float height_difference_max = 0;
        float height_difference_oob = 0; // share flowing off the map edge

        float lhs_height = get_shared(height_shared, lhs_index_shared);

#pragma unroll
        for (int inner_index = 0; inner_index < 4; inner_index += 1) {
            int2 rhs_index = lhs_index + offsets[1 + inner_index];
            int2 rhs_index_shared = lhs_index_shared + offsets[1 + inner_index];

            float rhs_height;

            if constexpr (is_fully_in_bounds) {
                rhs_height = get_shared(height_shared, rhs_index_shared);
            } else {
                if (is_in_bounds(rhs_index)) {
                    rhs_height = get_shared(height_shared, rhs_index_shared);
                } else {
                    // An out-of-bounds neighbour behaves like dry ground at
                    // lhs's ground level, so water can flow off the map.
                    rhs_height = lhs_height - lhs_water_level;
                }
            }

            // Compute level differences
            float height_difference = max(0.0f, lhs_height - rhs_height);

            height_difference_sum += height_difference;
            height_difference_max = max(height_difference_max, height_difference);

            if constexpr (!is_fully_in_bounds) {
                if (!is_in_bounds(rhs_index)) {
                    height_difference_oob += height_difference;
                }
            }
        }

        float lhs_spillage_level = min(lhs_water_level, height_difference_max);

        // Skip negligible spillage; also guards the division below when
        // height_difference_sum == 0 (then lhs_spillage_level is 0 too).
        if (lhs_spillage_level <= 1e-8f * height_difference_sum) continue;
        float proportion = lhs_spillage_level / height_difference_sum;

        // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235.
        int incoming = FIXED(proportion * max(0.0f, lhs_height - cell_height) / SPILLAGE_FACTOR);

        cell_water_level_delta_int += incoming;

        // Identical logic to: flood_seq.c:167 and flood_seq.c:218.
        // Only the centre cell (outer_index == 0) contributes outgoing water,
        // water loss and the spillage statistic.
        if (outer_index != 0) continue;

        int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR);
        cell_water_level_delta_int -= outgoing;

        if constexpr (!is_fully_in_bounds) {
            // Halved to match the sequential reference (flood_seq) — the
            // rationale for the factor is not visible in this file.
            total_water_loss_thread += FIXED(proportion * height_difference_oob / 2);
        }

        *max_spillage_iter_thread =
            max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR);
    }

    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + cell_water_level_delta_int;

    if constexpr (is_fully_in_bounds) return;

    if (total_water_loss_thread > 0) {
        atomicAdd(&stats->total_water_loss, total_water_loss_thread);
    }
}

/* Loads one cell (total height = ground + water, and the water level alone)
 * into the shared-memory tile. `shared_index` is the position inside the tile
 * including the 2-cell halo; cells outside the matrix are zero-filled (only
 * possible when the tile is not fully in bounds). */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void load_cell(
    int *water_level_read,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 shared_index
) {
    // The tile's top-left halo cell maps to global index (block origin - 2).
    int global_x_index = blockIdx.x * blockDim.x + shared_index.x - 2;
    int global_y_index = blockIdx.y * blockDim.y + shared_index.y - 2;
    int2 global_index = make_int2(global_x_index, global_y_index);

    if constexpr (!is_fully_in_bounds) {
        if (!is_in_bounds(global_index)) {
            get_shared(height_shared, shared_index) = 0.0;
            get_shared(water_level_read_shared, shared_index) = 0.0;

            return;
        }
    }

    float ground_value = get_global(ground, global_index);
    int water_level_int_value = get_global(water_level_read, global_index);

    float water_level_float_value = FLOATING(water_level_int_value);
    get_shared(water_level_read_shared, shared_index) = water_level_float_value;

    float height_value = ground_value + water_level_float_value;
    get_shared(height_shared, shared_index) = height_value;
}

/* Body of the step 2/3 kernel for one 32 x 8 block: cooperatively stages the
 * 36 x 12 tile (block + halo) in shared memory, runs the per-cell spillage
 * update and reduces the per-thread spillage maxima into the statistics. */
template <bool is_fully_in_bounds>
__device__ __forceinline__ void inner_kernel(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT]
) {
// Local shorthand so the four staging loads below stay readable.
#define load_cell(x, y) \
    load_cell<is_fully_in_bounds>( \
        water_level_read, ground, height_shared, water_level_read_shared, make_int2(x, y) \
    )

    // 32 x 08 - top left
    load_cell(threadIdx.x, threadIdx.y);

    // 32 x 04 - bottom left
    if (threadIdx.y < 4) {
        load_cell(threadIdx.x, threadIdx.y + 8);
    }

    // 04 x 08 - top right
    if (threadIdx.x < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y);
    }

    // 04 x 04 - bottom right
    if (threadIdx.x < 4 && threadIdx.y < 4) {
        load_cell(threadIdx.x + 32, threadIdx.y + 8);
    }

// Keep the shorthand local to this function; otherwise the macro would shadow
// the load_cell template for the rest of the translation unit.
#undef load_cell

    __syncthreads();

    int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x;
    int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y;
    int2 cell_index = make_int2(cell_x_index, cell_y_index);

    float max_spillage_iter_thread = 0.0;

    if constexpr (is_fully_in_bounds) {
        inner_inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            height_shared,
            water_level_read_shared,
            cell_index,
            &max_spillage_iter_thread
        );
    } else {
        if (is_in_bounds(cell_index)) {
            inner_inner_kernel<false>(
                stats,
                water_level_read,
                water_level_write,
                height_shared,
                water_level_read_shared,
                cell_index,
                &max_spillage_iter_thread
            );
        }
    }

    // Out-of-bounds threads keep 0.0 and still participate, so the whole warp
    // reaches the shuffle reduction.
    float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread);

    int thread_index = blockDim.x * threadIdx.y + threadIdx.x;
    int warp_lane_index = thread_index % warpSize;

    if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) {
        atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp);
    }
}

/* Steps 2 and 3: spillage and water-level update. Launched on a 2D grid of
 * 32 x 8 blocks covering the matrix; reads water_level_read, writes
 * water_level_write (the host swaps the buffers each minute). Dispatches to
 * the fast path when the block's tile (including halo) is fully in bounds. */
__global__ void step2_3(
    Statistics *stats,
    int *water_level_read,
    int *water_level_write,
    float *ground
) {
    int tile_min_x = blockIdx.x * blockDim.x - 2;
    int tile_min_y = blockIdx.y * blockDim.y - 2;
    int tile_max_x = blockIdx.x * blockDim.x + blockDim.x + 1;
    int tile_max_y = blockIdx.y * blockDim.y + blockDim.y + 1;

    bool is_fully_in_bounds =
        tile_min_x >= 0 && tile_min_y >= 0 && tile_max_x < sd.x_count && tile_max_y < sd.y_count;

    __shared__ float height_shared[SHARED_COUNT];
    __shared__ float water_level_read_shared[SHARED_COUNT];

    if (is_fully_in_bounds) {
        inner_kernel<true>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    } else {
        inner_kernel<false>(
            stats,
            water_level_read,
            water_level_write,
            ground,
            height_shared,
            water_level_read_shared
        );
    }
}

/* Final reduction over the whole matrix: total remaining water (fixed point)
 * and the maximum water level found in any single cell. */
__global__ void calculate_total(Statistics *stats, int *water_level) {
    int x_index = blockIdx.x * blockDim.x + threadIdx.x;
    int y_index = blockIdx.y * blockDim.y + threadIdx.y;
    int2 index = make_int2(x_index, y_index);

    // Out-of-bounds threads contribute 0 and still join the warp reductions.
    int water_level_thread = 0;
    if (is_in_bounds(index)) {
        water_level_thread = get_global(water_level, index);
    }

    int water_level_warp = warpReduceSum(water_level_thread);
    float max_water_scenario_warp = warpReduceMax(FLOATING(water_level_thread));

    auto thread_index = threadIdx.x + threadIdx.y * blockDim.x;
    int warp_lane_index = thread_index % warpSize;

    if (warp_lane_index == 0) {
        atomicAdd(&stats->total_water, water_level_warp);
        atomicMaxDouble(&stats->max_water_scenario, max_water_scenario_warp);
    }
}

/* Host driver: uploads the scenario, runs the per-minute simulation loop
 * (step1 rain -> step2_3 spillage -> next_iteration bookkeeping) until the
 * minute budget is spent or spillage drops below the threshold, then gathers
 * the statistics into `r`. */
extern "C" void do_compute(struct parameters *p, struct results *r) {
    /* 2. Start global timer */
    CCF(cudaSetDevice(0));

    /* Memory allocation */
    size_t cell_count = (size_t)p->rows * (size_t)p->columns;

    size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds;
    size_t ground_size = sizeof(float) * cell_count;
    size_t water_level_size = sizeof(int) * cell_count;

    StaticData sd_host = {
        .x_count = p->columns,
        .y_count = p->rows,
        .ex_factor = p->ex_factor,
        .cloud_count = p->num_clouds,
    };

    Statistics stats_host = {
        // DBL_MAX makes the first `while` check below pass unconditionally;
        // step1 resets this field on the device at the start of every minute.
        .max_spillage_iter = DBL_MAX,
        .max_spillage_minute = 0,
        .max_spillage_scenario = 0.0,
        .max_water_scenario = 0.0,
        .minute = 0,
        .total_rain = 0,
        .total_water = 0,
        .total_water_loss = 0,
    };

    Cloud_t *clouds;
    CCF(cudaMalloc(&clouds, clouds_size));
    CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice));

#ifdef DEBUG
    Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size);
    if (clouds_debug == NULL) {
        printf("Failed to allocate clouds_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    float *ground;
    CCF(cudaMalloc(&ground, ground_size));
    CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice));

    CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice));

    struct statistics *stats_device;
    CCF(cudaMalloc(&stats_device, sizeof(Statistics)));
    CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice));

    int *water_level_read;
    CCF(cudaMalloc(&water_level_read, water_level_size));
    CCF(cudaMemset(water_level_read, 0, water_level_size));

    int *water_level_write;
    CCF(cudaMalloc(&water_level_write, water_level_size));
    CCF(cudaMemset(water_level_write, 0, water_level_size));

#ifdef DEBUG
    int *water_level_debug = (int *)malloc(water_level_size);
    if (water_level_debug == NULL) {
        printf("Failed to allocate water_level_debug\n");
        exit(EXIT_FAILURE);
    }
#endif

    /* CUDA grid. Block shape must match the tile constants (32 x 8). */
    dim3 block(32, 8);
    auto columns_ceil = (p->columns + block.x - 1) / block.x;
    auto rows_ceil = (p->rows + block.y - 1) / block.y;

    dim3 grid(columns_ceil, rows_ceil);

#ifdef DEBUG
    print_matrix(PRECISION_FLOAT, p->rows, p->columns, p->ground, "Ground heights");
#ifndef ANIMATION
    print_clouds(p->num_clouds, p->clouds);
#endif
#endif

    /* Prepare to measure runtime */
    r->runtime = get_time();

    /* Flood simulation */
    while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) {
        step1<<<p->num_clouds, 768>>>(stats_device, clouds, water_level_read);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost));

        print_clouds(p->num_clouds, clouds_debug);
#endif
#endif

#ifdef DEBUG
        CCF(cudaMemcpy(
            water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain");
#endif

        step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground);
        CCK();

#ifdef DEBUG
#ifndef ANIMATION
        CCF(cudaMemcpy(
            water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(
            PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage"
        );
#endif
#endif

        next_iteration<<<1, 1>>>(stats_device);
        CCK();

        // Copies minute and max_spillage_iter as these are required for the loop.
        // Also implicitly synchronises the CPU and GPU.
        CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

        std::swap(water_level_read, water_level_write);
    } // End of simulation.

    r->runtime = get_time() - r->runtime;

    /* Statistics: Total remaining water and maximum amount of water in a cell */
    calculate_total<<<grid, block>>>(stats_device, water_level_read);
    CCK();

    // Copies max_water_scenario and total_water.
    // Also implicitly synchronises the CPU and GPU.
    CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));

    if (p->final_matrix) {
        // water_level_read is device memory; it must be copied to the host
        // before print_matrix (a host function) may read it. Passing the
        // device pointer directly was a bug — every DEBUG path above copies
        // into a host buffer first for exactly this reason.
        int *water_level_host = (int *)malloc(water_level_size);
        if (water_level_host == NULL) {
            printf("Failed to allocate water_level_host\n");
            exit(EXIT_FAILURE);
        }

        CCF(cudaMemcpy(
            water_level_host, water_level_read, water_level_size, cudaMemcpyDeviceToHost
        ));

        print_matrix(
            PRECISION_FIXED, p->rows, p->columns, water_level_host, "Water after spillage"
        );

        free(water_level_host);
    }

    r->max_water_scenario = stats_host.max_water_scenario;
    r->total_water = stats_host.total_water;

    r->minute = stats_host.minute;
    r->total_rain = stats_host.total_rain;
    r->total_water_loss = stats_host.total_water_loss;

    /* Free resources */
    CCF(cudaFree(clouds));
    CCF(cudaFree(ground));
    CCF(cudaFree(stats_device));
    CCF(cudaFree(water_level_read));
    CCF(cudaFree(water_level_write));

#ifdef DEBUG
    free(clouds_debug);
    free(water_level_debug);
#endif
}