linux/dim: Implement RDMA adaptive moderation (DIM)

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

RDMA DIM implements a different algorithm from net DIM. It is based on
completions, which is how interrupt moderation can be implemented in RDMA.

The algorithm optimizes for the number of completions and the ratio between
completions and events. In order to avoid long latencies, the
implementation performs a fast reduction of the moderation level when the
traffic changes.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

authored by

Yamin Friedman and committed by

Jason Gunthorpe 6 years ago f4915455 2ef38e38

+146 -4

3 changed files

expand all

include

linux

dim.h

lib

dim

Makefile

rdma_dim.c

+36

include/linux/dim.h

··· 82 82 * @prev_stats: Measured rates from previous iteration (for comparison) 83 83 * @start_sample: Sampled data at start of current iteration 84 84 * @work: Work to perform on action required 85 + * @priv: A pointer to the struct that points to dim 85 86 * @profile_ix: Current moderation profile 86 87 * @mode: CQ period count mode 87 88 * @tune_state: Algorithm tuning state (see below) ··· 96 95 struct dim_sample start_sample; 97 96 struct dim_sample measuring_sample; 98 97 struct work_struct work; 98 + void *priv; 99 99 u8 profile_ix; 100 100 u8 mode; 101 101 u8 tune_state; ··· 364 362 * required action. 365 363 */ 366 364 void net_dim(struct dim *dim, struct dim_sample end_sample); 365 + 366 + /* RDMA DIM */ 367 + 368 + /* 369 + * RDMA DIM profile: 370 + * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES. 371 + */ 372 + #define RDMA_DIM_PARAMS_NUM_PROFILES 9 373 + #define RDMA_DIM_START_PROFILE 0 374 + 375 + static const struct dim_cq_moder 376 + rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { 377 + {1, 0, 1, 0}, 378 + {1, 0, 4, 0}, 379 + {2, 0, 4, 0}, 380 + {2, 0, 8, 0}, 381 + {4, 0, 8, 0}, 382 + {16, 0, 8, 0}, 383 + {16, 0, 16, 0}, 384 + {32, 0, 16, 0}, 385 + {32, 0, 32, 0}, 386 + }; 387 + 388 + /** 389 + * rdma_dim - Runs the adaptive moderation. 390 + * @dim: The moderation struct. 391 + * @completions: The number of completions collected in this round. 392 + * 393 + * Each call to rdma_dim takes the latest amount of completions that 394 + * have been collected and counts them as a new event. 395 + * Once enough events have been collected the algorithm decides a new 396 + * moderation level. 397 + */ 398 + void rdma_dim(struct dim *dim, u64 completions); 367 399 368 400 #endif /* DIM_H */

+2 -4

lib/dim/Makefile

··· 2 2 # DIM Dynamic Interrupt Moderation library 3 3 # 4 4 5 - obj-$(CONFIG_DIMLIB) = net_dim.o 5 + obj-$(CONFIG_DIMLIB) += dim.o 6 6 7 - net_dim-y = \ 8 - dim.o \ 9 - net_dim.o 7 + dim-y := dim.o net_dim.o rdma_dim.o

+108

lib/dim/rdma_dim.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 + /* 3 + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 4 + */ 5 + 6 + #include <linux/dim.h> 7 + 8 + static int rdma_dim_step(struct dim *dim) 9 + { 10 + if (dim->tune_state == DIM_GOING_RIGHT) { 11 + if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1)) 12 + return DIM_ON_EDGE; 13 + dim->profile_ix++; 14 + dim->steps_right++; 15 + } 16 + if (dim->tune_state == DIM_GOING_LEFT) { 17 + if (dim->profile_ix == 0) 18 + return DIM_ON_EDGE; 19 + dim->profile_ix--; 20 + dim->steps_left++; 21 + } 22 + 23 + return DIM_STEPPED; 24 + } 25 + 26 + static int rdma_dim_stats_compare(struct dim_stats *curr, 27 + struct dim_stats *prev) 28 + { 29 + /* first stat */ 30 + if (!prev->cpms) 31 + return DIM_STATS_SAME; 32 + 33 + if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms)) 34 + return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER : 35 + DIM_STATS_WORSE; 36 + 37 + if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio)) 38 + return (curr->cpe_ratio > prev->cpe_ratio) ? 
DIM_STATS_BETTER : 39 + DIM_STATS_WORSE; 40 + 41 + return DIM_STATS_SAME; 42 + } 43 + 44 + static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim) 45 + { 46 + int prev_ix = dim->profile_ix; 47 + u8 state = dim->tune_state; 48 + int stats_res; 49 + int step_res; 50 + 51 + if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) { 52 + stats_res = rdma_dim_stats_compare(curr_stats, 53 + &dim->prev_stats); 54 + 55 + switch (stats_res) { 56 + case DIM_STATS_SAME: 57 + if (curr_stats->cpe_ratio <= 50 * prev_ix) 58 + dim->profile_ix = 0; 59 + break; 60 + case DIM_STATS_WORSE: 61 + dim_turn(dim); 62 + /* fall through */ 63 + case DIM_STATS_BETTER: 64 + step_res = rdma_dim_step(dim); 65 + if (step_res == DIM_ON_EDGE) 66 + dim_turn(dim); 67 + break; 68 + } 69 + } 70 + 71 + dim->prev_stats = *curr_stats; 72 + 73 + return dim->profile_ix != prev_ix; 74 + } 75 + 76 + void rdma_dim(struct dim *dim, u64 completions) 77 + { 78 + struct dim_sample *curr_sample = &dim->measuring_sample; 79 + struct dim_stats curr_stats; 80 + u32 nevents; 81 + 82 + dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, 83 + curr_sample->comp_ctr + completions, 84 + &dim->measuring_sample); 85 + 86 + switch (dim->state) { 87 + case DIM_MEASURE_IN_PROGRESS: 88 + nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; 89 + if (nevents < DIM_NEVENTS) 90 + break; 91 + dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); 92 + if (rdma_dim_decision(&curr_stats, dim)) { 93 + dim->state = DIM_APPLY_NEW_PROFILE; 94 + schedule_work(&dim->work); 95 + break; 96 + } 97 + /* fall through */ 98 + case DIM_START_MEASURE: 99 + dim->state = DIM_MEASURE_IN_PROGRESS; 100 + dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, 101 + curr_sample->comp_ctr, 102 + &dim->start_sample); 103 + break; 104 + case DIM_APPLY_NEW_PROFILE: 105 + break; 106 + } 107 + } 108 + EXPORT_SYMBOL(rdma_dim);