Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/{hfi1, qib, rdmavt}: Move copy SGE logic into rdmavt

This patch moves hfi1_copy_sge() into rdmavt for sharing with qib.
This patch also moves all the wss_*() functions into rdmavt, as
several wss_*() functions are called from hfi1_copy_sge().

When SGE copy mode is adaptive, a cacheless copy may be done in some cases
for performance reasons. In those cases, the X86 cacheless copy function
is called, since the drivers that use rdmavt and may set the SGE copy mode
to adaptive are X86 only. For this reason, this patch adds
"depends on X86_64" to rdmavt/Kconfig.

Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Brian Welty <brian.welty@intel.com>
Signed-off-by: Harish Chegondi <harish.chegondi@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

authored by

Brian Welty and committed by
Jason Gunthorpe
019f118b b56511c1

+344 -314
-6
drivers/infiniband/hw/hfi1/init.c
··· 1504 1504 idr_init(&hfi1_unit_table); 1505 1505 1506 1506 hfi1_dbg_init(); 1507 - ret = hfi1_wss_init(); 1508 - if (ret < 0) 1509 - goto bail_wss; 1510 1507 ret = pci_register_driver(&hfi1_pci_driver); 1511 1508 if (ret < 0) { 1512 1509 pr_err("Unable to register driver: error %d\n", -ret); ··· 1512 1515 goto bail; /* all OK */ 1513 1516 1514 1517 bail_dev: 1515 - hfi1_wss_exit(); 1516 - bail_wss: 1517 1518 hfi1_dbg_exit(); 1518 1519 idr_destroy(&hfi1_unit_table); 1519 1520 dev_cleanup(); ··· 1528 1533 { 1529 1534 pci_unregister_driver(&hfi1_pci_driver); 1530 1535 node_affinity_destroy_all(); 1531 - hfi1_wss_exit(); 1532 1536 hfi1_dbg_exit(); 1533 1537 1534 1538 idr_destroy(&hfi1_unit_table);
+6 -4
drivers/infiniband/hw/hfi1/rc.c
··· 1644 1644 qp->s_rdma_read_len -= pmtu; 1645 1645 update_last_psn(qp, psn); 1646 1646 spin_unlock_irqrestore(&qp->s_lock, flags); 1647 - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); 1647 + rvt_copy_sge(qp, &qp->s_rdma_read_sge, 1648 + data, pmtu, false, false); 1648 1649 goto bail; 1649 1650 1650 1651 case OP(RDMA_READ_RESPONSE_ONLY): ··· 1685 1684 if (unlikely(tlen != qp->s_rdma_read_len)) 1686 1685 goto ack_len_err; 1687 1686 aeth = be32_to_cpu(ohdr->u.aeth); 1688 - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); 1687 + rvt_copy_sge(qp, &qp->s_rdma_read_sge, 1688 + data, tlen, false, false); 1689 1689 WARN_ON(qp->s_rdma_read_sge.num_sge); 1690 1690 (void)do_rc_ack(qp, aeth, psn, 1691 1691 OP(RDMA_READ_RESPONSE_LAST), 0, rcd); ··· 2146 2144 qp->r_rcv_len += pmtu; 2147 2145 if (unlikely(qp->r_rcv_len > qp->r_len)) 2148 2146 goto nack_inv; 2149 - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); 2147 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 2150 2148 break; 2151 2149 2152 2150 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): ··· 2202 2200 wc.byte_len = tlen + qp->r_rcv_len; 2203 2201 if (unlikely(wc.byte_len > qp->r_len)) 2204 2202 goto nack_inv; 2205 - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); 2203 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); 2206 2204 rvt_put_ss(&qp->r_sge); 2207 2205 qp->r_msn++; 2208 2206 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+2 -1
drivers/infiniband/hw/hfi1/ruc.c
··· 361 361 if (len > sge->sge_length) 362 362 len = sge->sge_length; 363 363 WARN_ON_ONCE(len == 0); 364 - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); 364 + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, 365 + len, release, copy_last); 365 366 sge->vaddr += len; 366 367 sge->length -= len; 367 368 sge->sge_length -= len;
+5 -5
drivers/infiniband/hw/hfi1/uc.c
··· 426 426 qp->r_rcv_len += pmtu; 427 427 if (unlikely(qp->r_rcv_len > qp->r_len)) 428 428 goto rewind; 429 - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); 429 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); 430 430 break; 431 431 432 432 case OP(SEND_LAST_WITH_IMMEDIATE): ··· 449 449 if (unlikely(wc.byte_len > qp->r_len)) 450 450 goto rewind; 451 451 wc.opcode = IB_WC_RECV; 452 - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); 452 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); 453 453 rvt_put_ss(&qp->s_rdma_read_sge); 454 454 last_imm: 455 455 wc.wr_id = qp->r_wr_id; ··· 523 523 qp->r_rcv_len += pmtu; 524 524 if (unlikely(qp->r_rcv_len > qp->r_len)) 525 525 goto drop; 526 - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); 526 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 527 527 break; 528 528 529 529 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): ··· 550 550 } 551 551 wc.byte_len = qp->r_len; 552 552 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; 553 - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); 553 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); 554 554 rvt_put_ss(&qp->r_sge); 555 555 goto last_imm; 556 556 ··· 564 564 tlen -= (hdrsize + extra_bytes); 565 565 if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) 566 566 goto drop; 567 - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); 567 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); 568 568 rvt_put_ss(&qp->r_sge); 569 569 break; 570 570
+9 -9
drivers/infiniband/hw/hfi1/ud.c
··· 210 210 } 211 211 212 212 hfi1_make_grh(ibp, &grh, &grd, 0, 0); 213 - hfi1_copy_sge(&qp->r_sge, &grh, 214 - sizeof(grh), true, false); 213 + rvt_copy_sge(qp, &qp->r_sge, &grh, 214 + sizeof(grh), true, false); 215 215 wc.wc_flags |= IB_WC_GRH; 216 216 } else { 217 217 rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); ··· 228 228 if (len > sge->sge_length) 229 229 len = sge->sge_length; 230 230 WARN_ON_ONCE(len == 0); 231 - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); 231 + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); 232 232 sge->vaddr += len; 233 233 sge->length -= len; 234 234 sge->sge_length -= len; ··· 1019 1019 goto drop; 1020 1020 } 1021 1021 if (packet->grh) { 1022 - hfi1_copy_sge(&qp->r_sge, packet->grh, 1023 - sizeof(struct ib_grh), true, false); 1022 + rvt_copy_sge(qp, &qp->r_sge, packet->grh, 1023 + sizeof(struct ib_grh), true, false); 1024 1024 wc.wc_flags |= IB_WC_GRH; 1025 1025 } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { 1026 1026 struct ib_grh grh; ··· 1030 1030 * out when creating 16B, add back the GRH here. 1031 1031 */ 1032 1032 hfi1_make_ext_grh(packet, &grh, slid, dlid); 1033 - hfi1_copy_sge(&qp->r_sge, &grh, 1034 - sizeof(struct ib_grh), true, false); 1033 + rvt_copy_sge(qp, &qp->r_sge, &grh, 1034 + sizeof(struct ib_grh), true, false); 1035 1035 wc.wc_flags |= IB_WC_GRH; 1036 1036 } else { 1037 1037 rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); 1038 1038 } 1039 - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1040 - true, false); 1039 + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1040 + true, false); 1041 1041 rvt_put_ss(&qp->r_sge); 1042 1042 if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) 1043 1043 return;
+4 -222
drivers/infiniband/hw/hfi1/verbs.c
··· 129 129 module_param(piothreshold, ushort, S_IRUGO); 130 130 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); 131 131 132 - #define COPY_CACHELESS 1 133 - #define COPY_ADAPTIVE 2 134 132 static unsigned int sge_copy_mode; 135 133 module_param(sge_copy_mode, uint, S_IRUGO); 136 134 MODULE_PARM_DESC(sge_copy_mode, ··· 149 151 /* 16B trailing buffer */ 150 152 static const u8 trail_buf[MAX_16B_PADDING]; 151 153 152 - static uint wss_threshold; 154 + static uint wss_threshold = 80; 153 155 module_param(wss_threshold, uint, S_IRUGO); 154 156 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); 155 157 static uint wss_clean_period = 256; 156 158 module_param(wss_clean_period, uint, S_IRUGO); 157 159 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); 158 - 159 - /* memory working set size */ 160 - struct hfi1_wss { 161 - unsigned long *entries; 162 - atomic_t total_count; 163 - atomic_t clean_counter; 164 - atomic_t clean_entry; 165 - 166 - int threshold; 167 - int num_entries; 168 - long pages_mask; 169 - }; 170 - 171 - static struct hfi1_wss wss; 172 - 173 - int hfi1_wss_init(void) 174 - { 175 - long llc_size; 176 - long llc_bits; 177 - long table_size; 178 - long table_bits; 179 - 180 - /* check for a valid percent range - default to 80 if none or invalid */ 181 - if (wss_threshold < 1 || wss_threshold > 100) 182 - wss_threshold = 80; 183 - /* reject a wildly large period */ 184 - if (wss_clean_period > 1000000) 185 - wss_clean_period = 256; 186 - /* reject a zero period */ 187 - if (wss_clean_period == 0) 188 - wss_clean_period = 1; 189 - 190 - /* 191 - * Calculate the table size - the next power of 2 larger than the 192 - * LLC size. LLC size is in KiB. 
193 - */ 194 - llc_size = wss_llc_size() * 1024; 195 - table_size = roundup_pow_of_two(llc_size); 196 - 197 - /* one bit per page in rounded up table */ 198 - llc_bits = llc_size / PAGE_SIZE; 199 - table_bits = table_size / PAGE_SIZE; 200 - wss.pages_mask = table_bits - 1; 201 - wss.num_entries = table_bits / BITS_PER_LONG; 202 - 203 - wss.threshold = (llc_bits * wss_threshold) / 100; 204 - if (wss.threshold == 0) 205 - wss.threshold = 1; 206 - 207 - atomic_set(&wss.clean_counter, wss_clean_period); 208 - 209 - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), 210 - GFP_KERNEL); 211 - if (!wss.entries) { 212 - hfi1_wss_exit(); 213 - return -ENOMEM; 214 - } 215 - 216 - return 0; 217 - } 218 - 219 - void hfi1_wss_exit(void) 220 - { 221 - /* coded to handle partially initialized and repeat callers */ 222 - kfree(wss.entries); 223 - wss.entries = NULL; 224 - } 225 - 226 - /* 227 - * Advance the clean counter. When the clean period has expired, 228 - * clean an entry. 229 - * 230 - * This is implemented in atomics to avoid locking. Because multiple 231 - * variables are involved, it can be racy which can lead to slightly 232 - * inaccurate information. Since this is only a heuristic, this is 233 - * OK. Any innaccuracies will clean themselves out as the counter 234 - * advances. That said, it is unlikely the entry clean operation will 235 - * race - the next possible racer will not start until the next clean 236 - * period. 237 - * 238 - * The clean counter is implemented as a decrement to zero. When zero 239 - * is reached an entry is cleaned. 240 - */ 241 - static void wss_advance_clean_counter(void) 242 - { 243 - int entry; 244 - int weight; 245 - unsigned long bits; 246 - 247 - /* become the cleaner if we decrement the counter to zero */ 248 - if (atomic_dec_and_test(&wss.clean_counter)) { 249 - /* 250 - * Set, not add, the clean period. This avoids an issue 251 - * where the counter could decrement below the clean period. 
252 - * Doing a set can result in lost decrements, slowing the 253 - * clean advance. Since this a heuristic, this possible 254 - * slowdown is OK. 255 - * 256 - * An alternative is to loop, advancing the counter by a 257 - * clean period until the result is > 0. However, this could 258 - * lead to several threads keeping another in the clean loop. 259 - * This could be mitigated by limiting the number of times 260 - * we stay in the loop. 261 - */ 262 - atomic_set(&wss.clean_counter, wss_clean_period); 263 - 264 - /* 265 - * Uniquely grab the entry to clean and move to next. 266 - * The current entry is always the lower bits of 267 - * wss.clean_entry. The table size, wss.num_entries, 268 - * is always a power-of-2. 269 - */ 270 - entry = (atomic_inc_return(&wss.clean_entry) - 1) 271 - & (wss.num_entries - 1); 272 - 273 - /* clear the entry and count the bits */ 274 - bits = xchg(&wss.entries[entry], 0); 275 - weight = hweight64((u64)bits); 276 - /* only adjust the contended total count if needed */ 277 - if (weight) 278 - atomic_sub(weight, &wss.total_count); 279 - } 280 - } 281 - 282 - /* 283 - * Insert the given address into the working set array. 284 - */ 285 - static void wss_insert(void *address) 286 - { 287 - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; 288 - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ 289 - u32 nr = page & (BITS_PER_LONG - 1); 290 - 291 - if (!test_and_set_bit(nr, &wss.entries[entry])) 292 - atomic_inc(&wss.total_count); 293 - 294 - wss_advance_clean_counter(); 295 - } 296 - 297 - /* 298 - * Is the working set larger than the threshold? 299 - */ 300 - static inline bool wss_exceeds_threshold(void) 301 - { 302 - return atomic_read(&wss.total_count) >= wss.threshold; 303 - } 304 160 305 161 /* 306 162 * Translate ib_wr_opcode into ib_wc_opcode. ··· 289 437 * System image GUID. 
290 438 */ 291 439 __be64 ib_hfi1_sys_image_guid; 292 - 293 - /** 294 - * hfi1_copy_sge - copy data to SGE memory 295 - * @ss: the SGE state 296 - * @data: the data to copy 297 - * @length: the length of the data 298 - * @release: boolean to release MR 299 - * @copy_last: do a separate copy of the last 8 bytes 300 - */ 301 - void hfi1_copy_sge( 302 - struct rvt_sge_state *ss, 303 - void *data, u32 length, 304 - bool release, 305 - bool copy_last) 306 - { 307 - struct rvt_sge *sge = &ss->sge; 308 - int i; 309 - bool in_last = false; 310 - bool cacheless_copy = false; 311 - 312 - if (sge_copy_mode == COPY_CACHELESS) { 313 - cacheless_copy = length >= PAGE_SIZE; 314 - } else if (sge_copy_mode == COPY_ADAPTIVE) { 315 - if (length >= PAGE_SIZE) { 316 - /* 317 - * NOTE: this *assumes*: 318 - * o The first vaddr is the dest. 319 - * o If multiple pages, then vaddr is sequential. 320 - */ 321 - wss_insert(sge->vaddr); 322 - if (length >= (2 * PAGE_SIZE)) 323 - wss_insert(sge->vaddr + PAGE_SIZE); 324 - 325 - cacheless_copy = wss_exceeds_threshold(); 326 - } else { 327 - wss_advance_clean_counter(); 328 - } 329 - } 330 - if (copy_last) { 331 - if (length > 8) { 332 - length -= 8; 333 - } else { 334 - copy_last = false; 335 - in_last = true; 336 - } 337 - } 338 - 339 - again: 340 - while (length) { 341 - u32 len = rvt_get_sge_length(sge, length); 342 - 343 - WARN_ON_ONCE(len == 0); 344 - if (unlikely(in_last)) { 345 - /* enforce byte transfer ordering */ 346 - for (i = 0; i < len; i++) 347 - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; 348 - } else if (cacheless_copy) { 349 - cacheless_memcpy(sge->vaddr, data, len); 350 - } else { 351 - memcpy(sge->vaddr, data, len); 352 - } 353 - rvt_update_sge(ss, len, release); 354 - data += len; 355 - length -= len; 356 - } 357 - 358 - if (copy_last) { 359 - copy_last = false; 360 - in_last = true; 361 - length = 8; 362 - goto again; 363 - } 364 - } 365 440 366 441 /* 367 442 * Make sure the QP is ready and able to accept the given opcode. 
··· 1728 1949 dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; 1729 1950 dd->verbs_dev.rdi.dparms.nports = dd->num_pports; 1730 1951 dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); 1952 + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; 1953 + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; 1954 + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; 1731 1955 1732 1956 /* post send table */ 1733 1957 dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
-25
drivers/infiniband/hw/hfi1/verbs.h
··· 315 315 316 316 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); 317 317 318 - void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, 319 - bool release, bool copy_last); 320 - 321 318 void hfi1_cnp_rcv(struct hfi1_packet *packet); 322 319 323 320 void hfi1_uc_rcv(struct hfi1_packet *packet); ··· 389 392 390 393 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, 391 394 u64 pbc); 392 - 393 - int hfi1_wss_init(void); 394 - void hfi1_wss_exit(void); 395 - 396 - /* platform specific: return the lowest level cache (llc) size, in KiB */ 397 - static inline int wss_llc_size(void) 398 - { 399 - /* assume that the boot CPU value is universal for all CPUs */ 400 - return boot_cpu_data.x86_cache_size; 401 - } 402 - 403 - /* platform specific: cacheless copy */ 404 - static inline void cacheless_memcpy(void *dst, void *src, size_t n) 405 - { 406 - /* 407 - * Use the only available X64 cacheless copy. Add a __user cast 408 - * to quiet sparse. The src agument is already in the kernel so 409 - * there are no security issues. The extra fault recovery machinery 410 - * is not invoked. 411 - */ 412 - __copy_user_nocache(dst, (void __user *)src, n, 0); 413 - } 414 395 415 396 static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) 416 397 {
+6 -4
drivers/infiniband/hw/qib/qib_rc.c
··· 1425 1425 qp->s_rdma_read_len -= pmtu; 1426 1426 update_last_psn(qp, psn); 1427 1427 spin_unlock_irqrestore(&qp->s_lock, flags); 1428 - qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); 1428 + rvt_copy_sge(qp, &qp->s_rdma_read_sge, 1429 + data, pmtu, false, false); 1429 1430 goto bail; 1430 1431 1431 1432 case OP(RDMA_READ_RESPONSE_ONLY): ··· 1472 1471 if (unlikely(tlen != qp->s_rdma_read_len)) 1473 1472 goto ack_len_err; 1474 1473 aeth = be32_to_cpu(ohdr->u.aeth); 1475 - qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); 1474 + rvt_copy_sge(qp, &qp->s_rdma_read_sge, 1475 + data, tlen, false, false); 1476 1476 WARN_ON(qp->s_rdma_read_sge.num_sge); 1477 1477 (void) do_rc_ack(qp, aeth, psn, 1478 1478 OP(RDMA_READ_RESPONSE_LAST), 0, rcd); ··· 1846 1844 qp->r_rcv_len += pmtu; 1847 1845 if (unlikely(qp->r_rcv_len > qp->r_len)) 1848 1846 goto nack_inv; 1849 - qib_copy_sge(&qp->r_sge, data, pmtu, 1); 1847 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 1850 1848 break; 1851 1849 1852 1850 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): ··· 1892 1890 wc.byte_len = tlen + qp->r_rcv_len; 1893 1891 if (unlikely(wc.byte_len > qp->r_len)) 1894 1892 goto nack_inv; 1895 - qib_copy_sge(&qp->r_sge, data, tlen, 1); 1893 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); 1896 1894 rvt_put_ss(&qp->r_sge); 1897 1895 qp->r_msn++; 1898 1896 if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+1 -1
drivers/infiniband/hw/qib/qib_ruc.c
··· 354 354 if (len > sge->sge_length) 355 355 len = sge->sge_length; 356 356 BUG_ON(len == 0); 357 - qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); 357 + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); 358 358 sge->vaddr += len; 359 359 sge->length -= len; 360 360 sge->sge_length -= len;
+5 -5
drivers/infiniband/hw/qib/qib_uc.c
··· 359 359 qp->r_rcv_len += pmtu; 360 360 if (unlikely(qp->r_rcv_len > qp->r_len)) 361 361 goto rewind; 362 - qib_copy_sge(&qp->r_sge, data, pmtu, 0); 362 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); 363 363 break; 364 364 365 365 case OP(SEND_LAST_WITH_IMMEDIATE): ··· 385 385 if (unlikely(wc.byte_len > qp->r_len)) 386 386 goto rewind; 387 387 wc.opcode = IB_WC_RECV; 388 - qib_copy_sge(&qp->r_sge, data, tlen, 0); 388 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); 389 389 rvt_put_ss(&qp->s_rdma_read_sge); 390 390 last_imm: 391 391 wc.wr_id = qp->r_wr_id; ··· 449 449 qp->r_rcv_len += pmtu; 450 450 if (unlikely(qp->r_rcv_len > qp->r_len)) 451 451 goto drop; 452 - qib_copy_sge(&qp->r_sge, data, pmtu, 1); 452 + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 453 453 break; 454 454 455 455 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): ··· 479 479 } 480 480 wc.byte_len = qp->r_len; 481 481 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; 482 - qib_copy_sge(&qp->r_sge, data, tlen, 1); 482 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); 483 483 rvt_put_ss(&qp->r_sge); 484 484 goto last_imm; 485 485 ··· 495 495 tlen -= (hdrsize + pad + 4); 496 496 if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) 497 497 goto drop; 498 - qib_copy_sge(&qp->r_sge, data, tlen, 1); 498 + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); 499 499 rvt_put_ss(&qp->r_sge); 500 500 break; 501 501
+7 -6
drivers/infiniband/hw/qib/qib_ud.c
··· 162 162 const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); 163 163 164 164 qib_make_grh(ibp, &grh, grd, 0, 0); 165 - qib_copy_sge(&qp->r_sge, &grh, 166 - sizeof(grh), 1); 165 + rvt_copy_sge(qp, &qp->r_sge, &grh, 166 + sizeof(grh), true, false); 167 167 wc.wc_flags |= IB_WC_GRH; 168 168 } else 169 169 rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); ··· 179 179 if (len > sge->sge_length) 180 180 len = sge->sge_length; 181 181 BUG_ON(len == 0); 182 - qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); 182 + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); 183 183 sge->vaddr += len; 184 184 sge->length -= len; 185 185 sge->sge_length -= len; ··· 551 551 goto drop; 552 552 } 553 553 if (has_grh) { 554 - qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, 555 - sizeof(struct ib_grh), 1); 554 + rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh, 555 + sizeof(struct ib_grh), true, false); 556 556 wc.wc_flags |= IB_WC_GRH; 557 557 } else 558 558 rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); 559 - qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); 559 + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 560 + true, false); 560 561 rvt_put_ss(&qp->r_sge); 561 562 if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) 562 563 return;
+1 -21
drivers/infiniband/hw/qib/qib_verbs.c
··· 131 131 */ 132 132 __be64 ib_qib_sys_image_guid; 133 133 134 - /** 135 - * qib_copy_sge - copy data to SGE memory 136 - * @ss: the SGE state 137 - * @data: the data to copy 138 - * @length: the length of the data 139 - */ 140 - void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) 141 - { 142 - struct rvt_sge *sge = &ss->sge; 143 - 144 - while (length) { 145 - u32 len = rvt_get_sge_length(sge, length); 146 - 147 - WARN_ON_ONCE(len == 0); 148 - memcpy(sge->vaddr, data, len); 149 - rvt_update_sge(ss, len, release); 150 - data += len; 151 - length -= len; 152 - } 153 - } 154 - 155 134 /* 156 135 * Count the number of DMA descriptors needed to send length bytes of data. 157 136 * Don't modify the qib_sge_state to get the count. ··· 1610 1631 dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; 1611 1632 dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; 1612 1633 dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; 1634 + dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY; 1613 1635 1614 1636 qib_fill_device_attr(dd); 1615 1637
-3
drivers/infiniband/hw/qib/qib_verbs.h
··· 292 292 int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, 293 293 u32 hdrwords, struct rvt_sge_state *ss, u32 len); 294 294 295 - void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, 296 - int release); 297 - 298 295 void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, 299 296 int has_grh, void *data, u32 tlen, struct rvt_qp *qp); 300 297
+1 -1
drivers/infiniband/sw/rdmavt/Kconfig
··· 1 1 config INFINIBAND_RDMAVT 2 2 tristate "RDMA verbs transport library" 3 - depends on 64BIT && ARCH_DMA_ADDR_T_64BIT 3 + depends on X86_64 && ARCH_DMA_ADDR_T_64BIT 4 4 depends on PCI 5 5 select DMA_VIRT_OPS 6 6 ---help---
+258
drivers/infiniband/sw/rdmavt/qp.c
··· 118 118 }; 119 119 EXPORT_SYMBOL(ib_rvt_state_ops); 120 120 121 + /* platform specific: return the last level cache (llc) size, in KiB */ 122 + static int rvt_wss_llc_size(void) 123 + { 124 + /* assume that the boot CPU value is universal for all CPUs */ 125 + return boot_cpu_data.x86_cache_size; 126 + } 127 + 128 + /* platform specific: cacheless copy */ 129 + static void cacheless_memcpy(void *dst, void *src, size_t n) 130 + { 131 + /* 132 + * Use the only available X64 cacheless copy. Add a __user cast 133 + * to quiet sparse. The src agument is already in the kernel so 134 + * there are no security issues. The extra fault recovery machinery 135 + * is not invoked. 136 + */ 137 + __copy_user_nocache(dst, (void __user *)src, n, 0); 138 + } 139 + 140 + void rvt_wss_exit(struct rvt_dev_info *rdi) 141 + { 142 + struct rvt_wss *wss = rdi->wss; 143 + 144 + if (!wss) 145 + return; 146 + 147 + /* coded to handle partially initialized and repeat callers */ 148 + kfree(wss->entries); 149 + wss->entries = NULL; 150 + kfree(rdi->wss); 151 + rdi->wss = NULL; 152 + } 153 + 154 + /** 155 + * rvt_wss_init - Init wss data structures 156 + * 157 + * Return: 0 on success 158 + */ 159 + int rvt_wss_init(struct rvt_dev_info *rdi) 160 + { 161 + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; 162 + unsigned int wss_threshold = rdi->dparms.wss_threshold; 163 + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; 164 + long llc_size; 165 + long llc_bits; 166 + long table_size; 167 + long table_bits; 168 + struct rvt_wss *wss; 169 + int node = rdi->dparms.node; 170 + 171 + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { 172 + rdi->wss = NULL; 173 + return 0; 174 + } 175 + 176 + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); 177 + if (!rdi->wss) 178 + return -ENOMEM; 179 + wss = rdi->wss; 180 + 181 + /* check for a valid percent range - default to 80 if none or invalid */ 182 + if (wss_threshold < 1 || wss_threshold > 100) 183 + wss_threshold = 80; 184 + 
185 + /* reject a wildly large period */ 186 + if (wss_clean_period > 1000000) 187 + wss_clean_period = 256; 188 + 189 + /* reject a zero period */ 190 + if (wss_clean_period == 0) 191 + wss_clean_period = 1; 192 + 193 + /* 194 + * Calculate the table size - the next power of 2 larger than the 195 + * LLC size. LLC size is in KiB. 196 + */ 197 + llc_size = rvt_wss_llc_size() * 1024; 198 + table_size = roundup_pow_of_two(llc_size); 199 + 200 + /* one bit per page in rounded up table */ 201 + llc_bits = llc_size / PAGE_SIZE; 202 + table_bits = table_size / PAGE_SIZE; 203 + wss->pages_mask = table_bits - 1; 204 + wss->num_entries = table_bits / BITS_PER_LONG; 205 + 206 + wss->threshold = (llc_bits * wss_threshold) / 100; 207 + if (wss->threshold == 0) 208 + wss->threshold = 1; 209 + 210 + wss->clean_period = wss_clean_period; 211 + atomic_set(&wss->clean_counter, wss_clean_period); 212 + 213 + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), 214 + GFP_KERNEL, node); 215 + if (!wss->entries) { 216 + rvt_wss_exit(rdi); 217 + return -ENOMEM; 218 + } 219 + 220 + return 0; 221 + } 222 + 223 + /* 224 + * Advance the clean counter. When the clean period has expired, 225 + * clean an entry. 226 + * 227 + * This is implemented in atomics to avoid locking. Because multiple 228 + * variables are involved, it can be racy which can lead to slightly 229 + * inaccurate information. Since this is only a heuristic, this is 230 + * OK. Any innaccuracies will clean themselves out as the counter 231 + * advances. That said, it is unlikely the entry clean operation will 232 + * race - the next possible racer will not start until the next clean 233 + * period. 234 + * 235 + * The clean counter is implemented as a decrement to zero. When zero 236 + * is reached an entry is cleaned. 
237 + */ 238 + static void wss_advance_clean_counter(struct rvt_wss *wss) 239 + { 240 + int entry; 241 + int weight; 242 + unsigned long bits; 243 + 244 + /* become the cleaner if we decrement the counter to zero */ 245 + if (atomic_dec_and_test(&wss->clean_counter)) { 246 + /* 247 + * Set, not add, the clean period. This avoids an issue 248 + * where the counter could decrement below the clean period. 249 + * Doing a set can result in lost decrements, slowing the 250 + * clean advance. Since this a heuristic, this possible 251 + * slowdown is OK. 252 + * 253 + * An alternative is to loop, advancing the counter by a 254 + * clean period until the result is > 0. However, this could 255 + * lead to several threads keeping another in the clean loop. 256 + * This could be mitigated by limiting the number of times 257 + * we stay in the loop. 258 + */ 259 + atomic_set(&wss->clean_counter, wss->clean_period); 260 + 261 + /* 262 + * Uniquely grab the entry to clean and move to next. 263 + * The current entry is always the lower bits of 264 + * wss.clean_entry. The table size, wss.num_entries, 265 + * is always a power-of-2. 266 + */ 267 + entry = (atomic_inc_return(&wss->clean_entry) - 1) 268 + & (wss->num_entries - 1); 269 + 270 + /* clear the entry and count the bits */ 271 + bits = xchg(&wss->entries[entry], 0); 272 + weight = hweight64((u64)bits); 273 + /* only adjust the contended total count if needed */ 274 + if (weight) 275 + atomic_sub(weight, &wss->total_count); 276 + } 277 + } 278 + 279 + /* 280 + * Insert the given address into the working set array. 
281 + */ 282 + static void wss_insert(struct rvt_wss *wss, void *address) 283 + { 284 + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; 285 + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ 286 + u32 nr = page & (BITS_PER_LONG - 1); 287 + 288 + if (!test_and_set_bit(nr, &wss->entries[entry])) 289 + atomic_inc(&wss->total_count); 290 + 291 + wss_advance_clean_counter(wss); 292 + } 293 + 294 + /* 295 + * Is the working set larger than the threshold? 296 + */ 297 + static inline bool wss_exceeds_threshold(struct rvt_wss *wss) 298 + { 299 + return atomic_read(&wss->total_count) >= wss->threshold; 300 + } 301 + 121 302 static void get_map_page(struct rvt_qpn_table *qpt, 122 303 struct rvt_qpn_map *map) 123 304 { ··· 2657 2476 rcu_read_unlock(); 2658 2477 } 2659 2478 EXPORT_SYMBOL(rvt_qp_iter); 2479 + 2480 + /** 2481 + * rvt_copy_sge - copy data to SGE memory 2482 + * @qp: associated QP 2483 + * @ss: the SGE state 2484 + * @data: the data to copy 2485 + * @length: the length of the data 2486 + * @release: boolean to release MR 2487 + * @copy_last: do a separate copy of the last 8 bytes 2488 + */ 2489 + void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, 2490 + void *data, u32 length, 2491 + bool release, bool copy_last) 2492 + { 2493 + struct rvt_sge *sge = &ss->sge; 2494 + int i; 2495 + bool in_last = false; 2496 + bool cacheless_copy = false; 2497 + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 2498 + struct rvt_wss *wss = rdi->wss; 2499 + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; 2500 + 2501 + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { 2502 + cacheless_copy = length >= PAGE_SIZE; 2503 + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { 2504 + if (length >= PAGE_SIZE) { 2505 + /* 2506 + * NOTE: this *assumes*: 2507 + * o The first vaddr is the dest. 2508 + * o If multiple pages, then vaddr is sequential. 
2509 + */ 2510 + wss_insert(wss, sge->vaddr); 2511 + if (length >= (2 * PAGE_SIZE)) 2512 + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); 2513 + 2514 + cacheless_copy = wss_exceeds_threshold(wss); 2515 + } else { 2516 + wss_advance_clean_counter(wss); 2517 + } 2518 + } 2519 + 2520 + if (copy_last) { 2521 + if (length > 8) { 2522 + length -= 8; 2523 + } else { 2524 + copy_last = false; 2525 + in_last = true; 2526 + } 2527 + } 2528 + 2529 + again: 2530 + while (length) { 2531 + u32 len = rvt_get_sge_length(sge, length); 2532 + 2533 + WARN_ON_ONCE(len == 0); 2534 + if (unlikely(in_last)) { 2535 + /* enforce byte transfer ordering */ 2536 + for (i = 0; i < len; i++) 2537 + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; 2538 + } else if (cacheless_copy) { 2539 + cacheless_memcpy(sge->vaddr, data, len); 2540 + } else { 2541 + memcpy(sge->vaddr, data, len); 2542 + } 2543 + rvt_update_sge(ss, len, release); 2544 + data += len; 2545 + length -= len; 2546 + } 2547 + 2548 + if (copy_last) { 2549 + copy_last = false; 2550 + in_last = true; 2551 + length = 8; 2552 + goto again; 2553 + } 2554 + } 2555 + EXPORT_SYMBOL(rvt_copy_sge);
+2
drivers/infiniband/sw/rdmavt/qp.h
··· 66 66 const struct ib_send_wr **bad_wr); 67 67 int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, 68 68 const struct ib_recv_wr **bad_wr); 69 + int rvt_wss_init(struct rvt_dev_info *rdi); 70 + void rvt_wss_exit(struct rvt_dev_info *rdi); 69 71 #endif /* DEF_RVTQP_H */
+11 -1
drivers/infiniband/sw/rdmavt/vt.c
··· 774 774 goto bail_no_mr; 775 775 } 776 776 777 + /* Memory Working Set Size */ 778 + ret = rvt_wss_init(rdi); 779 + if (ret) { 780 + rvt_pr_err(rdi, "Error in WSS init.\n"); 781 + goto bail_mr; 782 + } 783 + 777 784 /* Completion queues */ 778 785 spin_lock_init(&rdi->n_cqs_lock); 779 786 ··· 839 832 rdi->driver_f.port_callback); 840 833 if (ret) { 841 834 rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); 842 - goto bail_mr; 835 + goto bail_wss; 843 836 } 844 837 845 838 rvt_create_mad_agents(rdi); ··· 847 840 rvt_pr_info(rdi, "Registration with rdmavt done.\n"); 848 841 return ret; 849 842 843 + bail_wss: 844 + rvt_wss_exit(rdi); 850 845 bail_mr: 851 846 rvt_mr_exit(rdi); 852 847 ··· 872 863 rvt_free_mad_agents(rdi); 873 864 874 865 ib_unregister_device(&rdi->ibdev); 866 + rvt_wss_exit(rdi); 875 867 rvt_mr_exit(rdi); 876 868 rvt_qp_exit(rdi); 877 869 }
+22
include/rdma/rdma_vt.h
··· 149 149 150 150 #define RVT_CQN_MAX 16 /* maximum length of cq name */ 151 151 152 + #define RVT_SGE_COPY_MEMCPY 0 153 + #define RVT_SGE_COPY_CACHELESS 1 154 + #define RVT_SGE_COPY_ADAPTIVE 2 155 + 152 156 /* 153 157 * Things that are driver specific, module parameters in hfi1 and qib 154 158 */ ··· 165 161 */ 166 162 unsigned int lkey_table_size; 167 163 unsigned int qp_table_size; 164 + unsigned int sge_copy_mode; 165 + unsigned int wss_threshold; 166 + unsigned int wss_clean_period; 168 167 int qpn_start; 169 168 int qpn_inc; 170 169 int qpn_res_start; ··· 198 191 atomic_t refcount; 199 192 u8 vl; 200 193 u8 log_pmtu; 194 + }; 195 + 196 + /* memory working set size */ 197 + struct rvt_wss { 198 + unsigned long *entries; 199 + atomic_t total_count; 200 + atomic_t clean_counter; 201 + atomic_t clean_entry; 202 + 203 + int threshold; 204 + int num_entries; 205 + long pages_mask; 206 + unsigned int clean_period; 201 207 }; 202 208 203 209 struct rvt_dev_info; ··· 438 418 u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ 439 419 spinlock_t n_mcast_grps_lock; 440 420 421 + /* Memory Working Set Size */ 422 + struct rvt_wss *wss; 441 423 }; 442 424 443 425 /**
+4
include/rdma/rdmavt_qp.h
··· 678 678 void rvt_stop_rc_timers(struct rvt_qp *qp); 679 679 void rvt_add_retry_timer(struct rvt_qp *qp); 680 680 681 + void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, 682 + void *data, u32 length, 683 + bool release, bool copy_last); 684 + 681 685 /** 682 686 * struct rvt_qp_iter - the iterator for QPs 683 687 * @qp - the current QP