Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/mthca: Fix access to MTT and MPT tables on non-cache-coherent CPUs

We allocate the MTT table with alloc_pages() and then do pci_map_sg(),
so we must call pci_dma_sync_sg() after the CPU writes to the MTT
table. This is safe because the device never writes MTTs on mem-free
HCAs, once we get rid of the use of the WRITE_MTT firmware command.
This change is needed to make that conversion work, and is an
improvement even now, since it gives FMRs a chance of working.

For MPTs, both the device and the CPU may write to the table, so we
must allocate DMA-coherent memory for it.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by

Michael S. Tsirkin and committed by
Roland Dreier
391e4dea 1d1f19cf

+132 -51
+21 -15
drivers/infiniband/hw/mthca/mthca_main.c
··· 379 379 380 380 mdev->fw.arbel.fw_icm = 381 381 mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, 382 - GFP_HIGHUSER | __GFP_NOWARN); 382 + GFP_HIGHUSER | __GFP_NOWARN, 0); 383 383 if (!mdev->fw.arbel.fw_icm) { 384 384 mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); 385 385 return -ENOMEM; ··· 412 412 mthca_UNMAP_FA(mdev, &status); 413 413 414 414 err_free: 415 - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); 415 + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); 416 416 return err; 417 417 } 418 418 ··· 441 441 (unsigned long long) aux_pages << 2); 442 442 443 443 mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, 444 - GFP_HIGHUSER | __GFP_NOWARN); 444 + GFP_HIGHUSER | __GFP_NOWARN, 0); 445 445 if (!mdev->fw.arbel.aux_icm) { 446 446 mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); 447 447 return -ENOMEM; ··· 471 471 mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, 472 472 MTHCA_MTT_SEG_SIZE, 473 473 mdev->limits.num_mtt_segs, 474 - mdev->limits.reserved_mtts, 1); 474 + mdev->limits.reserved_mtts, 475 + 1, 0); 475 476 if (!mdev->mr_table.mtt_table) { 476 477 mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); 477 478 err = -ENOMEM; ··· 482 481 mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, 483 482 dev_lim->mpt_entry_sz, 484 483 mdev->limits.num_mpts, 485 - mdev->limits.reserved_mrws, 1); 484 + mdev->limits.reserved_mrws, 485 + 1, 1); 486 486 if (!mdev->mr_table.mpt_table) { 487 487 mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); 488 488 err = -ENOMEM; ··· 493 491 mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, 494 492 dev_lim->qpc_entry_sz, 495 493 mdev->limits.num_qps, 496 - mdev->limits.reserved_qps, 0); 494 + mdev->limits.reserved_qps, 495 + 0, 0); 497 496 if (!mdev->qp_table.qp_table) { 498 497 mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); 499 498 err = -ENOMEM; ··· 504 501 mdev->qp_table.eqp_table = 
mthca_alloc_icm_table(mdev, init_hca->eqpc_base, 505 502 dev_lim->eqpc_entry_sz, 506 503 mdev->limits.num_qps, 507 - mdev->limits.reserved_qps, 0); 504 + mdev->limits.reserved_qps, 505 + 0, 0); 508 506 if (!mdev->qp_table.eqp_table) { 509 507 mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); 510 508 err = -ENOMEM; ··· 515 511 mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, 516 512 MTHCA_RDB_ENTRY_SIZE, 517 513 mdev->limits.num_qps << 518 - mdev->qp_table.rdb_shift, 514 + mdev->qp_table.rdb_shift, 0, 519 515 0, 0); 520 516 if (!mdev->qp_table.rdb_table) { 521 517 mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); ··· 526 522 mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, 527 523 dev_lim->cqc_entry_sz, 528 524 mdev->limits.num_cqs, 529 - mdev->limits.reserved_cqs, 0); 525 + mdev->limits.reserved_cqs, 526 + 0, 0); 530 527 if (!mdev->cq_table.table) { 531 528 mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); 532 529 err = -ENOMEM; ··· 539 534 mthca_alloc_icm_table(mdev, init_hca->srqc_base, 540 535 dev_lim->srq_entry_sz, 541 536 mdev->limits.num_srqs, 542 - mdev->limits.reserved_srqs, 0); 537 + mdev->limits.reserved_srqs, 538 + 0, 0); 543 539 if (!mdev->srq_table.table) { 544 540 mthca_err(mdev, "Failed to map SRQ context memory, " 545 541 "aborting.\n"); ··· 560 554 mdev->limits.num_amgms, 561 555 mdev->limits.num_mgms + 562 556 mdev->limits.num_amgms, 563 - 0); 557 + 0, 0); 564 558 if (!mdev->mcg_table.table) { 565 559 mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); 566 560 err = -ENOMEM; ··· 598 592 mthca_UNMAP_ICM_AUX(mdev, &status); 599 593 600 594 err_free_aux: 601 - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); 595 + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); 602 596 603 597 return err; 604 598 } ··· 619 613 mthca_unmap_eq_icm(mdev); 620 614 621 615 mthca_UNMAP_ICM_AUX(mdev, &status); 622 - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); 616 + 
mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); 623 617 } 624 618 625 619 static int mthca_init_arbel(struct mthca_dev *mdev) ··· 703 697 704 698 err_stop_fw: 705 699 mthca_UNMAP_FA(mdev, &status); 706 - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); 700 + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); 707 701 708 702 err_disable: 709 703 if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) ··· 722 716 mthca_free_icms(mdev); 723 717 724 718 mthca_UNMAP_FA(mdev, &status); 725 - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); 719 + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); 726 720 727 721 if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) 728 722 mthca_DISABLE_LAM(mdev, &status);
+99 -30
drivers/infiniband/hw/mthca/mthca_memfree.c
··· 35 35 */ 36 36 37 37 #include <linux/mm.h> 38 + #include <linux/scatterlist.h> 39 + 40 + #include <asm/page.h> 38 41 39 42 #include "mthca_memfree.h" 40 43 #include "mthca_dev.h" ··· 61 58 } page[0]; 62 59 }; 63 60 64 - void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) 61 + static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) 62 + { 63 + int i; 64 + 65 + if (chunk->nsg > 0) 66 + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, 67 + PCI_DMA_BIDIRECTIONAL); 68 + 69 + for (i = 0; i < chunk->npages; ++i) 70 + __free_pages(chunk->mem[i].page, 71 + get_order(chunk->mem[i].length)); 72 + } 73 + 74 + static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) 75 + { 76 + int i; 77 + 78 + for (i = 0; i < chunk->npages; ++i) { 79 + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, 80 + lowmem_page_address(chunk->mem[i].page), 81 + sg_dma_address(&chunk->mem[i])); 82 + } 83 + } 84 + 85 + void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) 65 86 { 66 87 struct mthca_icm_chunk *chunk, *tmp; 67 - int i; 68 88 69 89 if (!icm) 70 90 return; 71 91 72 92 list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { 73 - if (chunk->nsg > 0) 74 - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, 75 - PCI_DMA_BIDIRECTIONAL); 76 - 77 - for (i = 0; i < chunk->npages; ++i) 78 - __free_pages(chunk->mem[i].page, 79 - get_order(chunk->mem[i].length)); 93 + if (coherent) 94 + mthca_free_icm_coherent(dev, chunk); 95 + else 96 + mthca_free_icm_pages(dev, chunk); 80 97 81 98 kfree(chunk); 82 99 } ··· 104 81 kfree(icm); 105 82 } 106 83 84 + static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) 85 + { 86 + mem->page = alloc_pages(gfp_mask, order); 87 + if (!mem->page) 88 + return -ENOMEM; 89 + 90 + mem->length = PAGE_SIZE << order; 91 + mem->offset = 0; 92 + return 0; 93 + } 94 + 95 + static int mthca_alloc_icm_coherent(struct device *dev, struct 
scatterlist *mem, 96 + int order, gfp_t gfp_mask) 97 + { 98 + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), 99 + gfp_mask); 100 + if (!buf) 101 + return -ENOMEM; 102 + 103 + sg_set_buf(mem, buf, PAGE_SIZE << order); 104 + BUG_ON(mem->offset); 105 + sg_dma_len(mem) = PAGE_SIZE << order; 106 + return 0; 107 + } 108 + 107 109 struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, 108 - gfp_t gfp_mask) 110 + gfp_t gfp_mask, int coherent) 109 111 { 110 112 struct mthca_icm *icm; 111 113 struct mthca_icm_chunk *chunk = NULL; 112 114 int cur_order; 115 + int ret; 116 + 117 + /* We use sg_set_buf for coherent allocs, which assumes low memory */ 118 + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); 113 119 114 120 icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); 115 121 if (!icm) ··· 164 112 while (1 << cur_order > npages) 165 113 --cur_order; 166 114 167 - chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order); 168 - if (chunk->mem[chunk->npages].page) { 169 - chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order; 170 - chunk->mem[chunk->npages].offset = 0; 115 + if (coherent) 116 + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, 117 + &chunk->mem[chunk->npages], 118 + cur_order, gfp_mask); 119 + else 120 + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], 121 + cur_order, gfp_mask); 171 122 172 - if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) { 123 + if (!ret) { 124 + ++chunk->npages; 125 + 126 + if (!coherent && chunk->npages == MTHCA_ICM_CHUNK_LEN) { 173 127 chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, 174 128 chunk->npages, 175 129 PCI_DMA_BIDIRECTIONAL); 176 130 177 131 if (chunk->nsg <= 0) 178 132 goto fail; 179 - 180 - chunk = NULL; 181 133 } 134 + 135 + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) 136 + chunk = NULL; 182 137 183 138 npages -= 1 << cur_order; 184 139 } else { ··· 195 136 } 196 137 } 197 138 198 - if (chunk) { 139 + if (!coherent && chunk) { 199 140 chunk->nsg = 
pci_map_sg(dev->pdev, chunk->mem, 200 141 chunk->npages, 201 142 PCI_DMA_BIDIRECTIONAL); ··· 207 148 return icm; 208 149 209 150 fail: 210 - mthca_free_icm(dev, icm); 151 + mthca_free_icm(dev, icm, coherent); 211 152 return NULL; 212 153 } 213 154 ··· 226 167 227 168 table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, 228 169 (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | 229 - __GFP_NOWARN); 170 + __GFP_NOWARN, table->coherent); 230 171 if (!table->icm[i]) { 231 172 ret = -ENOMEM; 232 173 goto out; ··· 234 175 235 176 if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE, 236 177 &status) || status) { 237 - mthca_free_icm(dev, table->icm[i]); 178 + mthca_free_icm(dev, table->icm[i], table->coherent); 238 179 table->icm[i] = NULL; 239 180 ret = -ENOMEM; 240 181 goto out; ··· 263 204 mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, 264 205 MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, 265 206 &status); 266 - mthca_free_icm(dev, table->icm[i]); 207 + mthca_free_icm(dev, table->icm[i], table->coherent); 267 208 table->icm[i] = NULL; 268 209 } 269 210 270 211 mutex_unlock(&table->mutex); 271 212 } 272 213 273 - void *mthca_table_find(struct mthca_icm_table *table, int obj) 214 + void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) 274 215 { 275 - int idx, offset, i; 216 + int idx, offset, dma_offset, i; 276 217 struct mthca_icm_chunk *chunk; 277 218 struct mthca_icm *icm; 278 219 struct page *page = NULL; ··· 284 225 285 226 idx = (obj & (table->num_obj - 1)) * table->obj_size; 286 227 icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; 287 - offset = idx % MTHCA_TABLE_CHUNK_SIZE; 228 + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; 288 229 289 230 if (!icm) 290 231 goto out; 291 232 292 233 list_for_each_entry(chunk, &icm->chunk_list, list) { 293 234 for (i = 0; i < chunk->npages; ++i) { 235 + if (dma_handle && dma_offset >= 0) { 236 + if (sg_dma_len(&chunk->mem[i]) > 
dma_offset) 237 + *dma_handle = sg_dma_address(&chunk->mem[i]) + 238 + dma_offset; 239 + dma_offset -= sg_dma_len(&chunk->mem[i]); 240 + } 241 + /* DMA mapping can merge pages but not split them, 242 + * so if we found the page, dma_handle has already 243 + * been assigned to. */ 294 244 if (chunk->mem[i].length > offset) { 295 245 page = chunk->mem[i].page; 296 246 goto out; ··· 351 283 struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, 352 284 u64 virt, int obj_size, 353 285 int nobj, int reserved, 354 - int use_lowmem) 286 + int use_lowmem, int use_coherent) 355 287 { 356 288 struct mthca_icm_table *table; 357 289 int num_icm; ··· 370 302 table->num_obj = nobj; 371 303 table->obj_size = obj_size; 372 304 table->lowmem = use_lowmem; 305 + table->coherent = use_coherent; 373 306 mutex_init(&table->mutex); 374 307 375 308 for (i = 0; i < num_icm; ++i) ··· 383 314 384 315 table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, 385 316 (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) | 386 - __GFP_NOWARN); 317 + __GFP_NOWARN, use_coherent); 387 318 if (!table->icm[i]) 388 319 goto err; 389 320 if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE, 390 321 &status) || status) { 391 - mthca_free_icm(dev, table->icm[i]); 322 + mthca_free_icm(dev, table->icm[i], table->coherent); 392 323 table->icm[i] = NULL; 393 324 goto err; 394 325 } ··· 408 339 mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, 409 340 MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, 410 341 &status); 411 - mthca_free_icm(dev, table->icm[i]); 342 + mthca_free_icm(dev, table->icm[i], table->coherent); 412 343 } 413 344 414 345 kfree(table); ··· 426 357 mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, 427 358 MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, 428 359 &status); 429 - mthca_free_icm(dev, table->icm[i]); 360 + mthca_free_icm(dev, table->icm[i], table->coherent); 430 361 } 431 362 432 363 kfree(table);
+5 -4
drivers/infiniband/hw/mthca/mthca_memfree.h
··· 69 69 int num_obj; 70 70 int obj_size; 71 71 int lowmem; 72 + int coherent; 72 73 struct mutex mutex; 73 74 struct mthca_icm *icm[0]; 74 75 }; ··· 83 82 struct mthca_dev; 84 83 85 84 struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, 86 - gfp_t gfp_mask); 87 - void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm); 85 + gfp_t gfp_mask, int coherent); 86 + void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); 88 87 89 88 struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, 90 89 u64 virt, int obj_size, 91 90 int nobj, int reserved, 92 - int use_lowmem); 91 + int use_lowmem, int use_coherent); 93 92 void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); 94 93 int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); 95 94 void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); 96 - void *mthca_table_find(struct mthca_icm_table *table, int obj); 95 + void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); 97 96 int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, 98 97 int start, int end); 99 98 void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+6 -2
drivers/infiniband/hw/mthca/mthca_mr.c
··· 524 524 if (err) 525 525 goto err_out_mpt_free; 526 526 527 - mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key); 527 + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); 528 528 BUG_ON(!mr->mem.arbel.mpt); 529 529 } else 530 530 mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + ··· 538 538 539 539 if (mthca_is_memfree(dev)) { 540 540 mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, 541 - mr->mtt->first_seg); 541 + mr->mtt->first_seg, 542 + &mr->mem.arbel.dma_handle); 542 543 BUG_ON(!mr->mem.arbel.mtts); 543 544 } else 544 545 mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; ··· 712 711 for (i = 0; i < list_len; ++i) 713 712 fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | 714 713 MTHCA_MTT_FLAG_PRESENT); 714 + 715 + dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle, 716 + list_len * sizeof(u64), DMA_TO_DEVICE); 715 717 716 718 fmr->mem.arbel.mpt->key = cpu_to_be32(key); 717 719 fmr->mem.arbel.mpt->lkey = cpu_to_be32(key);
+1
drivers/infiniband/hw/mthca/mthca_provider.h
··· 89 89 struct { 90 90 struct mthca_mpt_entry *mpt; 91 91 __be64 *mtts; 92 + dma_addr_t dma_handle; 92 93 } arbel; 93 94 } mem; 94 95 };