
IB/mad: Add flow control for solicited MADs

Currently, MADs sent via an agent are forwarded directly to the
corresponding MAD QP layer.

MADs with a timeout value set that require a response (solicited MADs)
are resent if the timeout expires without a response arriving.
In a congested subnet, flooding the MAD QP layer with additional
solicited send requests from the agent only worsens the situation by
triggering more timeouts and therefore more retries.

Thus, add flow control for non-user solicited MADs to block agents from
issuing new solicited MAD requests to the MAD QP until outstanding
requests complete and the MAD QP is ready to process additional
requests. While at it, keep track of the total number of outstanding
solicited MAD work requests in the send or wait list. The number of
outstanding send WRs is limited to a fraction of the RQ size, and any
new send WR that would exceed that limit is held in a backlog list.
Backlog MADs are forwarded to the agent's send list only once the total
number of outstanding send WRs falls below the limit, as sketched below.
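
For reference, the gate reduces to a small predicate; this is a minimal
standalone sketch of the mad_is_for_backlog() check added by this patch
(the helper name and free-standing form are illustrative):

/*
 * Queue a new solicited send when older MADs are already backlogged
 * (preserving their order) or when the combined number of MADs in the
 * send and wait lists has reached the per-agent limit. A limit of 0
 * means flow control is disabled for the agent's management class.
 */
static bool should_backlog(unsigned int send_count, unsigned int wait_count,
			   unsigned int max, bool backlog_nonempty)
{
	if (!max)
		return false;
	if (backlog_nonempty)
		return true;
	return send_count + wait_count >= max;
}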

Unsolicited MADs, RMPP MADs, and MADs of classes other than SA, SMP,
or CM are not subject to this flow control mechanism and are unaffected
by this change.
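
The per-class limits are set by get_sol_fc_max_outstanding() in the
diff below. For concreteness, assuming the default mad_recvq_size of
IB_MAD_QP_RECV_SIZE (512), so the exact figures here are illustrative:

/* Illustrative limits with mad_recvq_size == 512 (the default):
 *   CM and send-only agents:  512 / 4  = 128 outstanding solicited MADs
 *   SA (SUBN_ADM):            512 / 32 =  16
 *   SMP (LID/directed route): min(512, IB_MAD_QP_RECV_SIZE) / 4 = 128
 *   all other classes:        0 (flow control disabled)
 */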

For this purpose, a new state is introduced:
- 'IB_MAD_STATE_QUEUED': MAD is in backlog list
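
For orientation, the transitions into and out of the new state, as
implemented by change_mad_state() in this patch (a summary, not an
exhaustive state machine):

/*
 * INIT      -> QUEUED      ib_post_send_mad() finds the agent at its limit
 * WAIT_RESP -> QUEUED      a retry fires while the backlog is non-empty
 * QUEUED    -> SEND_START  process_backlog_mads() drains the backlog
 * QUEUED    -> DONE        a response arrives or the agent unregisters
 * QUEUED    -> CANCELED    ib_cancel_mad() on a backlogged request
 */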

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Link: https://patch.msgid.link/c0ecaa1821badee124cd13f3bf860f67ce453beb.1751278420.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Or Har-Toov, committed by Leon Romanovsky
314cb74c 1cf0d899

+214 -10
drivers/infiniband/core/mad.c (+196 -10)
···
 }
 EXPORT_SYMBOL(ib_response_mad);

+#define SOL_FC_MAX_DEFAULT_FRAC 4
+#define SOL_FC_MAX_SA_FRAC 32
+
+static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req)
+{
+	if (!mad_reg_req)
+		/* Send only agent */
+		return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+
+	switch (mad_reg_req->mgmt_class) {
+	case IB_MGMT_CLASS_CM:
+		return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC;
+	case IB_MGMT_CLASS_SUBN_ADM:
+		return mad_recvq_size / SOL_FC_MAX_SA_FRAC;
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+		return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) /
+		       SOL_FC_MAX_DEFAULT_FRAC;
+	default:
+		return 0;
+	}
+}
+
 /*
  * ib_register_mad_agent - Register to send/receive MADs
  *
···
 	INIT_LIST_HEAD(&mad_agent_priv->send_list);
 	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
 	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+	INIT_LIST_HEAD(&mad_agent_priv->backlog_list);
 	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
 	INIT_LIST_HEAD(&mad_agent_priv->local_list);
 	INIT_WORK(&mad_agent_priv->local_work, local_completions);
 	refcount_set(&mad_agent_priv->refcount, 1);
 	init_completion(&mad_agent_priv->comp);
-
+	mad_agent_priv->sol_fc_send_count = 0;
+	mad_agent_priv->sol_fc_wait_count = 0;
+	mad_agent_priv->sol_fc_max =
+		get_sol_fc_max_outstanding(mad_reg_req);
 	ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
 	if (ret2) {
 		ret = ERR_PTR(ret2);
···
 	return ret;
 }

+static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr,
+				struct ib_mad_agent_private *mad_agent_priv)
+{
+	if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) {
+		mad_agent_priv->sol_fc_wait_count--;
+		list_move_tail(&mad_send_wr->agent_list,
+			       &mad_agent_priv->backlog_list);
+	} else {
+		expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT);
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_agent_priv->backlog_list);
+	}
+}
+
 static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr,
 			      struct ib_mad_agent_private *mad_agent_priv)
 {
···
 		list_add_tail(&mad_send_wr->agent_list,
 			      &mad_agent_priv->send_list);
 	} else {
-		expect_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP);
+		expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP,
+				  IB_MAD_STATE_QUEUED);
 		list_move_tail(&mad_send_wr->agent_list,
 			       &mad_agent_priv->send_list);
+	}
+
+	if (mad_send_wr->is_solicited_fc) {
+		if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+			mad_agent_priv->sol_fc_wait_count--;
+		mad_agent_priv->sol_fc_send_count++;
 	}
 }
···

 	expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START,
 			  IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED);
-	list_del_init(&mad_send_wr->agent_list);
+	if (mad_send_wr->state == IB_MAD_STATE_SEND_START &&
+	    mad_send_wr->is_solicited_fc) {
+		mad_agent_priv->sol_fc_send_count--;
+		mad_agent_priv->sol_fc_wait_count++;
+	}

+	list_del_init(&mad_send_wr->agent_list);
 	delay = mad_send_wr->timeout;
 	mad_send_wr->timeout += jiffies;

···
 				   struct ib_mad_agent_private *mad_agent_priv)
 {
 	expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+	mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc;
 }

 static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr,
 				  struct ib_mad_agent_private *mad_agent_priv)
 {
 	not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+	if (mad_send_wr->is_solicited_fc) {
+		if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+			mad_agent_priv->sol_fc_send_count--;
+		else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+			mad_agent_priv->sol_fc_wait_count--;
+	}
 }

 static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr,
 			      struct ib_mad_agent_private *mad_agent_priv)
 {
+	if (mad_send_wr->is_solicited_fc) {
+		if (mad_send_wr->state == IB_MAD_STATE_SEND_START)
+			mad_agent_priv->sol_fc_send_count--;
+		else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+			mad_agent_priv->sol_fc_wait_count--;
+	}
+
 	list_del_init(&mad_send_wr->agent_list);
 }
···

 	switch (new_state) {
 	case IB_MAD_STATE_INIT:
+		break;
+	case IB_MAD_STATE_QUEUED:
+		handle_queued_state(mad_send_wr, mad_agent_priv);
 		break;
 	case IB_MAD_STATE_SEND_START:
 		handle_send_state(mad_send_wr, mad_agent_priv);
···
 	}

 	mad_send_wr->state = new_state;
+}
+
+static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	u8 mgmt_class;
+
+	if (!mad_send_wr->timeout)
+		return 0;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (mad_send_wr->mad_agent_priv->agent.rmpp_version &&
+	    (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return 0;
+
+	mgmt_class =
+		((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class;
+	return mgmt_class == IB_MGMT_CLASS_CM ||
+	       mgmt_class == IB_MGMT_CLASS_SUBN_ADM ||
+	       mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	       mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+}
+
+static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *mad_agent_priv =
+		mad_send_wr->mad_agent_priv;
+
+	if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max)
+		return false;
+
+	if (!list_empty(&mad_agent_priv->backlog_list))
+		return true;
+
+	return mad_agent_priv->sol_fc_send_count +
+		       mad_agent_priv->sol_fc_wait_count >=
+	       mad_agent_priv->sol_fc_max;
 }

 /*
···
 	/* Reference MAD agent until send completes */
 	refcount_inc(&mad_agent_priv->refcount);
 	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr);
+	if (mad_is_for_backlog(mad_send_wr)) {
+		change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		return 0;
+	}
+
 	change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
···
 		return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
 	}

+	list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) {
+		if ((wr->tid == mad_hdr->tid) &&
+		    rcv_has_same_class(wr, wc) &&
+		    /*
+		     * Don't check GID for direct routed MADs.
+		     * These might have permissive LIDs.
+		     */
+		    (is_direct(mad_hdr->mgmt_class) ||
+		     rcv_has_same_gid(mad_agent_priv, wr, wc)))
+			return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL;
+	}
+
 	/*
 	 * It's possible to receive the response before we've
 	 * been notified that the send has completed
···
 	return NULL;
 }

+static void
+process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc = {};
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->backlog_list) &&
+	       (mad_agent_priv->sol_fc_send_count +
+			mad_agent_priv->sol_fc_wait_count <
+		mad_agent_priv->sol_fc_max)) {
+		mad_send_wr = list_entry(mad_agent_priv->backlog_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+		change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		ret = ib_send_mad(mad_send_wr);
+		if (ret) {
+			spin_lock_irqsave(&mad_agent_priv->lock, flags);
+			deref_mad_agent(mad_agent_priv);
+			change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			mad_send_wc.send_buf = &mad_send_wr->send_buf;
+			mad_send_wc.status = IB_WC_LOC_QP_OP_ERR;
+			mad_agent_priv->agent.send_handler(
+				&mad_agent_priv->agent, &mad_send_wc);
+		}
+
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
 void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	mad_send_wr->timeout = 0;
-	if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP)
+	if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP ||
+	    mad_send_wr->state == IB_MAD_STATE_QUEUED)
 		change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
 	else
 		change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP);
···
 	adjust_timeout(mad_agent_priv);
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);

-	if (ret == IB_RMPP_RESULT_INTERNAL)
+	if (ret == IB_RMPP_RESULT_INTERNAL) {
 		ib_rmpp_send_handler(mad_send_wc);
-	else
+	} else {
+		if (mad_send_wr->is_solicited_fc)
+			process_backlog_mads(mad_agent_priv);
 		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
 						   mad_send_wc);
+	}

 	/* Release reference on agent taken when sending */
 	deref_mad_agent(mad_agent_priv);
···
 			 &mad_agent_priv->send_list, agent_list)
 		change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);

-	/* Empty wait list to prevent receives from finding a request */
+	/* Empty wait & backlog list to prevent receives from finding request */
 	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
 				 &mad_agent_priv->wait_list, agent_list) {
 		change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
 		list_add_tail(&mad_send_wr->agent_list, &cancel_list);
 	}
-	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);

+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &mad_agent_priv->backlog_list, agent_list) {
+		change_mad_state(mad_send_wr, IB_MAD_STATE_DONE);
+		list_add_tail(&mad_send_wr->agent_list, &cancel_list);
+	}
+
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 	/* Report all cancelled requests */
 	clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
 }
···
 		    &mad_send_wr->send_buf == send_buf)
 			return mad_send_wr;
 	}
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list,
+			    agent_list) {
+		if (&mad_send_wr->send_buf == send_buf)
+			return mad_send_wr;
+	}
+
 	return NULL;
 }
···
 		return -EINVAL;
 	}

-	active = (mad_send_wr->state == IB_MAD_STATE_SEND_START ||
-		  mad_send_wr->state == IB_MAD_STATE_EARLY_RESP);
+	active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) ||
+		  (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) ||
+		  (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms));
 	if (!timeout_ms)
 		change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED);
···
 	mad_send_wr->send_buf.retries++;

 	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+	if (mad_send_wr->is_solicited_fc &&
+	    !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) {
+		change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED);
+		return 0;
+	}

 	if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
 		ret = ib_retry_rmpp(mad_send_wr);
···
 	}

 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	process_backlog_mads(mad_agent_priv);
 	clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR,
 			     mad_agent_priv);
 	clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv);
drivers/infiniband/core/mad_priv.h (+18)
···

 	spinlock_t lock;
 	struct list_head send_list;
+	unsigned int sol_fc_send_count;
 	struct list_head wait_list;
+	unsigned int sol_fc_wait_count;
 	struct delayed_work timed_work;
 	unsigned long timeout;
 	struct list_head local_list;
 	struct work_struct local_work;
 	struct list_head rmpp_list;
+	unsigned int sol_fc_max;
+	struct list_head backlog_list;

 	refcount_t refcount;
 	union {
···
 enum ib_mad_state {
 	/* MAD is in the making and is not yet in any list */
 	IB_MAD_STATE_INIT,
+	/* MAD is in backlog list */
+	IB_MAD_STATE_QUEUED,
 	/*
 	 * MAD was sent to the QP and is waiting for completion
 	 * notification in send list.
···
 	int pad;

 	enum ib_mad_state state;
+
+	/* Solicited MAD flow control */
+	bool is_solicited_fc;
 };

 static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr,
···
 {
 	if (IS_ENABLED(CONFIG_LOCKDEP))
 		WARN_ON(mad_send_wr->state != expected_state);
+}
+
+static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr,
+				     enum ib_mad_state expected_state1,
+				     enum ib_mad_state expected_state2)
+{
+	if (IS_ENABLED(CONFIG_LOCKDEP))
+		WARN_ON(mad_send_wr->state != expected_state1 &&
+			mad_send_wr->state != expected_state2);
 }

 static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr,
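
Note that the flow control is transparent to kernel consumers: a
solicited MAD is still built and posted the same way, and a backlogged
MAD simply completes later. A schematic sketch of the unchanged caller
side (agent, ah, remote_qpn, and pkey_index are assumed to be set up
elsewhere; error handling trimmed):

struct ib_mad_send_buf *msg;
int ret;

msg = ib_create_send_mad(agent, remote_qpn, pkey_index,
			 0 /* rmpp_active: RMPP sends are exempt */,
			 IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
			 GFP_KERNEL, IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
	return PTR_ERR(msg);

msg->ah = ah;
msg->timeout_ms = 100;	/* non-zero timeout makes this MAD solicited */
msg->retries = 3;
/* ... fill msg->mad with the SA/SMP/CM request ... */

ret = ib_post_send_mad(msg, NULL);	/* may park the MAD in the backlog */
if (ret)
	ib_free_send_mad(msg);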