Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
drivers/iommu/intel-svm.c at v5.5 (668 lines, 17 kB)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2015 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 */

#include <linux/intel-iommu.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/intel-svm.h>
#include <linux/rculist.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
#include <linux/mm_types.h>
#include <asm/page.h>

#include "intel-pasid.h"

static irqreturn_t prq_event_thread(int irq, void *d);

int intel_svm_init(struct intel_iommu *iommu)
{
	if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
	    !cap_fl1gp_support(iommu->cap))
		return -EINVAL;

	if (cpu_feature_enabled(X86_FEATURE_LA57) &&
	    !cap_5lp_support(iommu->cap))
		return -EINVAL;

	return 0;
}

#define PRQ_ORDER 0

int intel_svm_enable_prq(struct intel_iommu *iommu)
{
	struct page *pages;
	int irq, ret;

	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
	if (!pages) {
		pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
			iommu->name);
		return -ENOMEM;
	}
	iommu->prq = page_address(pages);

	irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0) {
		pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
		       iommu->name);
		ret = -EINVAL;
	err:
		free_pages((unsigned long)iommu->prq, PRQ_ORDER);
		iommu->prq = NULL;
		return ret;
	}
	iommu->pr_irq = irq;

	snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);

	ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
				   iommu->prq_name, iommu);
	if (ret) {
		pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
		       iommu->name);
		dmar_free_hwirq(irq);
		iommu->pr_irq = 0;
		goto err;
	}
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);

	return 0;
}

int intel_svm_finish_prq(struct intel_iommu *iommu)
{
	dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
	dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);

	if (iommu->pr_irq) {
		free_irq(iommu->pr_irq, iommu);
		dmar_free_hwirq(iommu->pr_irq);
		iommu->pr_irq = 0;
	}

	free_pages((unsigned long)iommu->prq, PRQ_ORDER);
	iommu->prq = NULL;

	return 0;
}

static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev,
				       unsigned long address, unsigned long pages, int ih)
{
	struct qi_desc desc;

	if (pages == -1) {
		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = 0;
	} else {
		int mask = ilog2(__roundup_pow_of_two(pages));

		desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
			QI_EIOTLB_DID(sdev->did) |
			QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
			QI_EIOTLB_TYPE;
		desc.qw1 = QI_EIOTLB_ADDR(address) |
			QI_EIOTLB_IH(ih) |
			QI_EIOTLB_AM(mask);
	}
	desc.qw2 = 0;
	desc.qw3 = 0;
	qi_submit_sync(&desc, svm->iommu);

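	/*
	 * Editorial sketch, not part of the original file: the device-TLB
	 * invalidation below encodes a naturally aligned power-of-two range
	 * in the low address bits, as the comment inside the branch notes.
	 * Worked example (the address matches that comment; pages = 0x21 is
	 * a hypothetical input): last = 0x12345f000 + 0x20000 = 0x12347f000,
	 * address ^ last = 0x20000, so mask = 0x20000 and QI_DEV_EIOTLB_ADDR()
	 * is handed (address & ~mask) | (mask - 1) = 0x12345ffff. Its lowest
	 * clear bit (bit 17) asks the device to invalidate the aligned
	 * 256 KiB region 0x123440000-0x12347ffff.
	 */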
	if (sdev->dev_iotlb) {
		desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
			QI_DEV_EIOTLB_SID(sdev->sid) |
			QI_DEV_EIOTLB_QDEP(sdev->qdep) |
			QI_DEIOTLB_TYPE;
		if (pages == -1) {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
				QI_DEV_EIOTLB_SIZE;
		} else if (pages > 1) {
			/* The least significant zero bit indicates the size. So,
			 * for example, an "address" value of 0x12345f000 will
			 * flush from 0x123440000 to 0x12347ffff (256KiB). */
			unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
			unsigned long mask = __rounddown_pow_of_two(address ^ last);

			desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
					(mask - 1)) | QI_DEV_EIOTLB_SIZE;
		} else {
			desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
		}
		desc.qw2 = 0;
		desc.qw3 = 0;
		qi_submit_sync(&desc, svm->iommu);
	}
}

static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
				  unsigned long pages, int ih)
{
	struct intel_svm_dev *sdev;

	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list)
		intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
	rcu_read_unlock();
}

/* Pages have been freed at this point */
static void intel_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);

	intel_flush_svm_range(svm, start,
			      (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
}

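/*
 * Editorial note, not part of the original file: the conversion above
 * rounds the byte range [start, end) up to whole VT-d pages. For example,
 * with 4 KiB pages, start = 0x1000 and end = 0x3001 give
 * (0x2001 + 0xfff) >> 12 = 3, so the partially covered third page is
 * flushed too.
 */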
static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
	struct intel_svm_dev *sdev;

	/* This might end up being called from exit_mmap(), *before* the page
	 * tables are cleared. And __mmu_notifier_release() will delete us from
	 * the list of notifiers so that our invalidate_range() callback doesn't
	 * get called when the page tables are cleared. So we need to protect
	 * against hardware accessing those page tables.
	 *
	 * We do it by clearing the entry in the PASID table and then flushing
	 * the IOTLB and the PASID table caches. This might upset hardware;
	 * perhaps we'll want to point the PASID to a dummy PGD (like the zero
	 * page) so that we end up taking a fault that the hardware really
	 * *has* to handle gracefully without affecting other processes.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list) {
		intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid);
		intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
	}
	rcu_read_unlock();

}

static const struct mmu_notifier_ops intel_mmuops = {
	.release = intel_mm_release,
	.invalidate_range = intel_invalidate_range,
};

static DEFINE_MUTEX(pasid_mutex);
static LIST_HEAD(global_svm_list);

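/*
 * Editorial note, not part of the original file: pasid_mutex serializes
 * bind/unbind below and protects global_svm_list, while svm->devs is
 * additionally RCU-protected so the flush helpers and the page-request
 * thread can walk it without taking the mutex.
 */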
int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
{
	struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
	struct device_domain_info *info;
	struct intel_svm_dev *sdev;
	struct intel_svm *svm = NULL;
	struct mm_struct *mm = NULL;
	int pasid_max;
	int ret;

	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (dev_is_pci(dev)) {
		pasid_max = pci_max_pasids(to_pci_dev(dev));
		if (pasid_max < 0)
			return -EINVAL;
	} else
		pasid_max = 1 << 20;

	if (flags & SVM_FLAG_SUPERVISOR_MODE) {
		if (!ecap_srs(iommu->ecap))
			return -EINVAL;
	} else if (pasid) {
		mm = get_task_mm(current);
		BUG_ON(!mm);
	}

	mutex_lock(&pasid_mutex);
	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
		struct intel_svm *t;

		list_for_each_entry(t, &global_svm_list, list) {
			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
				continue;

			svm = t;
			if (svm->pasid >= pasid_max) {
				dev_warn(dev,
					 "Limited PASID width. Cannot use existing PASID %d\n",
					 svm->pasid);
				ret = -ENOSPC;
				goto out;
			}

			list_for_each_entry(sdev, &svm->devs, list) {
				if (dev == sdev->dev) {
					if (sdev->ops != ops) {
						ret = -EBUSY;
						goto out;
					}
					sdev->users++;
					goto success;
				}
			}

			break;
		}
	}

	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
	if (!sdev) {
		ret = -ENOMEM;
		goto out;
	}
	sdev->dev = dev;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret || !pasid) {
		/* If they don't actually want to assign a PASID, this is
		   just an enabling check/preparation. */
		kfree(sdev);
		goto out;
	}

	info = dev->archdata.iommu;
	if (!info || !info->pasid_supported) {
		kfree(sdev);
		goto out;
	}

	sdev->did = FLPT_DEFAULT_DID;
	sdev->sid = PCI_DEVID(info->bus, info->devfn);
	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}

	/* Finish the setup now we know we're keeping it */
	sdev->users = 1;
	sdev->ops = ops;
	init_rcu_head(&sdev->rcu);

	if (!svm) {
		svm = kzalloc(sizeof(*svm), GFP_KERNEL);
		if (!svm) {
			ret = -ENOMEM;
			kfree(sdev);
			goto out;
		}
		svm->iommu = iommu;

		if (pasid_max > intel_pasid_max_id)
			pasid_max = intel_pasid_max_id;

		/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
		ret = intel_pasid_alloc_id(svm,
					   !!cap_caching_mode(iommu->cap),
					   pasid_max - 1, GFP_KERNEL);
		if (ret < 0) {
			kfree(svm);
			kfree(sdev);
			goto out;
		}
		svm->pasid = ret;
		svm->notifier.ops = &intel_mmuops;
		svm->mm = mm;
		svm->flags = flags;
		INIT_LIST_HEAD_RCU(&svm->devs);
		INIT_LIST_HEAD(&svm->list);
		ret = -ENOMEM;
		if (mm) {
			ret = mmu_notifier_register(&svm->notifier, mm);
			if (ret) {
				intel_pasid_free_id(svm->pasid);
				kfree(svm);
				kfree(sdev);
				goto out;
			}
		}

		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
						    mm ? mm->pgd : init_mm.pgd,
						    svm->pasid, FLPT_DEFAULT_DID,
						    mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
		spin_unlock(&iommu->lock);
		if (ret) {
			if (mm)
				mmu_notifier_unregister(&svm->notifier, mm);
			intel_pasid_free_id(svm->pasid);
			kfree(svm);
			kfree(sdev);
			goto out;
		}

		list_add_tail(&svm->list, &global_svm_list);
	} else {
		/*
		 * Binding a new device with existing PASID, need to setup
		 * the PASID entry.
		 */
		spin_lock(&iommu->lock);
		ret = intel_pasid_setup_first_level(iommu, dev,
						    mm ? mm->pgd : init_mm.pgd,
						    svm->pasid, FLPT_DEFAULT_DID,
						    mm ? 0 : PASID_FLAG_SUPERVISOR_MODE);
		spin_unlock(&iommu->lock);
		if (ret) {
			kfree(sdev);
			goto out;
		}
	}
	list_add_rcu(&sdev->list, &svm->devs);

 success:
	*pasid = svm->pasid;
	ret = 0;
 out:
	mutex_unlock(&pasid_mutex);
	if (mm)
		mmput(mm);
	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_bind_mm);

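/*
 * Editorial sketch, not part of the original file: a minimal example of
 * how an SVM-capable driver might use the API exported above. The
 * function name and the "program the device" step are hypothetical;
 * only intel_svm_bind_mm() and intel_svm_unbind_mm() come from this file.
 */
static int __maybe_unused example_svm_use(struct device *dev)
{
	int pasid, ret;

	/* Bind current->mm and obtain a PASID the device may use. */
	ret = intel_svm_bind_mm(dev, &pasid, 0, NULL);
	if (ret)
		return ret;

	/* ... program the device to tag its DMA with this PASID ... */

	/* No page faults may be outstanding when unbinding. */
	return intel_svm_unbind_mm(dev, pasid);
}
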
int intel_svm_unbind_mm(struct device *dev, int pasid)
{
	struct intel_svm_dev *sdev;
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	mutex_lock(&pasid_mutex);
	iommu = intel_svm_device_to_iommu(dev);
	if (!iommu)
		goto out;

	svm = intel_pasid_lookup_id(pasid);
	if (!svm)
		goto out;

	list_for_each_entry(sdev, &svm->devs, list) {
		if (dev == sdev->dev) {
			ret = 0;
			sdev->users--;
			if (!sdev->users) {
				list_del_rcu(&sdev->list);
				/* Flush the PASID cache and IOTLB for this device.
				 * Note that we do depend on the hardware *not* using
				 * the PASID any more. Just as we depend on other
				 * devices never using PASIDs that they have no right
				 * to use. We have a *shared* PASID table, because it's
				 * large and has to be physically contiguous. So it's
				 * hard to be as defensive as we might like. */
				intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
				intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
				kfree_rcu(sdev, rcu);

				if (list_empty(&svm->devs)) {
					intel_pasid_free_id(svm->pasid);
					if (svm->mm)
						mmu_notifier_unregister(&svm->notifier, svm->mm);

					list_del(&svm->list);

					/* We mandate that no page faults may be outstanding
					 * for the PASID when intel_svm_unbind_mm() is called.
					 * If that is not obeyed, subtle errors will happen.
					 * Let's make them less subtle... */
					memset(svm, 0x6b, sizeof(*svm));
					kfree(svm);
				}
			}
			break;
		}
	}
 out:
	mutex_unlock(&pasid_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_unbind_mm);

int intel_svm_is_pasid_valid(struct device *dev, int pasid)
{
	struct intel_iommu *iommu;
	struct intel_svm *svm;
	int ret = -EINVAL;

	mutex_lock(&pasid_mutex);
	iommu = intel_svm_device_to_iommu(dev);
	if (!iommu)
		goto out;

	svm = intel_pasid_lookup_id(pasid);
	if (!svm)
		goto out;

	/* init_mm is used in this case */
	if (!svm->mm)
		ret = 1;
	else if (atomic_read(&svm->mm->mm_users) > 0)
		ret = 1;
	else
		ret = 0;

 out:
	mutex_unlock(&pasid_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);

/* Page request queue descriptor */
struct page_req_dsc {
	union {
		struct {
			u64 type:8;
			u64 pasid_present:1;
			u64 priv_data_present:1;
			u64 rsvd:6;
			u64 rid:16;
			u64 pasid:20;
			u64 exe_req:1;
			u64 pm_req:1;
			u64 rsvd2:10;
		};
		u64 qw_0;
	};
	union {
		struct {
			u64 rd_req:1;
			u64 wr_req:1;
			u64 lpig:1;
			u64 prg_index:9;
			u64 addr:52;
		};
		u64 qw_1;
	};
	u64 priv_data[2];
};

#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)

static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
{
	unsigned long requested = 0;

	if (req->exe_req)
		requested |= VM_EXEC;

	if (req->rd_req)
		requested |= VM_READ;

	if (req->wr_req)
		requested |= VM_WRITE;

	return (requested & ~vma->vm_flags) != 0;
}

static bool is_canonical_address(u64 addr)
{
	int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	long saddr = (long) addr;

	return (((saddr << shift) >> shift) == saddr);
}

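/*
 * Editorial note, not part of the original file: the check above requires
 * bits [63:__VIRTUAL_MASK_SHIFT] to be a sign extension of the top
 * implemented bit. With 4-level paging (__VIRTUAL_MASK_SHIFT == 47),
 * shift == 16: 0x00007fffffffffff and 0xffff800000000000 survive the
 * shift round trip and are canonical, while 0x0000800000000000 does not.
 */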
static irqreturn_t prq_event_thread(int irq, void *d)
{
	struct intel_iommu *iommu = d;
	struct intel_svm *svm = NULL;
	int head, tail, handled = 0;

	/* Clear PPR bit before reading head/tail registers, to
	   ensure that we get a new interrupt if needed. */
	writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);

	tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
	head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
	while (head != tail) {
		struct intel_svm_dev *sdev;
		struct vm_area_struct *vma;
		struct page_req_dsc *req;
		struct qi_desc resp;
		int result;
		vm_fault_t ret;
		u64 address;

		handled = 1;

		req = &iommu->prq[head / sizeof(*req)];

		result = QI_RESP_FAILURE;
		address = (u64)req->addr << VTD_PAGE_SHIFT;
		if (!req->pasid_present) {
			pr_err("%s: Page request without PASID: %08llx %08llx\n",
			       iommu->name, ((unsigned long long *)req)[0],
			       ((unsigned long long *)req)[1]);
			goto no_pasid;
		}

		if (!svm || svm->pasid != req->pasid) {
			rcu_read_lock();
			svm = intel_pasid_lookup_id(req->pasid);
			/* It *can't* go away, because the driver is not permitted
			   to unbind the mm while any page faults are outstanding.
			   So we only need RCU to protect the internal idr code. */
			rcu_read_unlock();

			if (!svm) {
				pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
				       iommu->name, req->pasid, ((unsigned long long *)req)[0],
				       ((unsigned long long *)req)[1]);
				goto no_pasid;
			}
		}

		result = QI_RESP_INVALID;
		/* Since we're using init_mm.pgd directly, we should never take
		   any faults on kernel addresses. */
		if (!svm->mm)
			goto bad_req;
		/* If the mm is already defunct, don't handle faults. */
		if (!mmget_not_zero(svm->mm))
			goto bad_req;

		/* If address is not canonical, return invalid response */
		if (!is_canonical_address(address))
			goto bad_req;

		down_read(&svm->mm->mmap_sem);
		vma = find_extend_vma(svm->mm, address);
		if (!vma || address < vma->vm_start)
			goto invalid;

		if (access_error(vma, req))
			goto invalid;

		ret = handle_mm_fault(vma, address,
				      req->wr_req ? FAULT_FLAG_WRITE : 0);
		if (ret & VM_FAULT_ERROR)
			goto invalid;

		result = QI_RESP_SUCCESS;
	invalid:
		up_read(&svm->mm->mmap_sem);
		mmput(svm->mm);
	bad_req:
		/* Accounting for major/minor faults? */
		rcu_read_lock();
		list_for_each_entry_rcu(sdev, &svm->devs, list) {
			if (sdev->sid == req->rid)
				break;
		}
		/* Other devices can go away, but the drivers are not permitted
		   to unbind while any page faults might be in flight. So it's
		   OK to drop the 'lock' here now we have it. */
		rcu_read_unlock();

		if (WARN_ON(&sdev->list == &svm->devs))
			sdev = NULL;

		if (sdev && sdev->ops && sdev->ops->fault_cb) {
			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
				(req->exe_req << 1) | (req->pm_req);
			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
					    req->priv_data, rwxp, result);
		}
		/* We get here in the error case where the PASID lookup failed,
		   and these can be NULL. Do not use them below this point! */
		sdev = NULL;
		svm = NULL;
	no_pasid:
		if (req->lpig || req->priv_data_present) {
			/*
			 * Per VT-d spec. v3.0 ch7.7, system software must
			 * respond with page group response if private data
			 * is present (PDP) or last page in group (LPIG) bit
			 * is set. This is an additional VT-d feature beyond
			 * PCI ATS spec.
			 */
			resp.qw0 = QI_PGRP_PASID(req->pasid) |
				QI_PGRP_DID(req->rid) |
				QI_PGRP_PASID_P(req->pasid_present) |
				QI_PGRP_PDP(req->pasid_present) |
				QI_PGRP_RESP_CODE(result) |
				QI_PGRP_RESP_TYPE;
			resp.qw1 = QI_PGRP_IDX(req->prg_index) |
				QI_PGRP_LPIG(req->lpig);

			if (req->priv_data_present)
				memcpy(&resp.qw2, req->priv_data,
				       sizeof(req->priv_data));
		}
		resp.qw2 = 0;
		resp.qw3 = 0;
		qi_submit_sync(&resp, iommu);

		head = (head + sizeof(*req)) & PRQ_RING_MASK;
	}

	dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);

	return IRQ_RETVAL(handled);
}
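For context, a sketch of the callback side of the API (not from this file): a driver that passes a struct svm_dev_ops to intel_svm_bind_mm() has its fault_cb invoked from prq_event_thread() above with the decoded rwxp bits and the QI_RESP_* result. The handler below is hypothetical, with its signature inferred from the call site in prq_event_thread().

static void example_fault_cb(struct device *dev, int pasid, u64 address,
			     void *private, int rwxp, int response)
{
	/* rwxp bits, as packed above: 8 = read, 4 = write, 2 = exec, 1 = priv */
	dev_dbg(dev, "PASID %d fault at %llx (%s%s%s%s) -> response %d\n",
		pasid, address,
		(rwxp & 8) ? "r" : "-", (rwxp & 4) ? "w" : "-",
		(rwxp & 2) ? "x" : "-", (rwxp & 1) ? "p" : "-", response);
}

static struct svm_dev_ops example_svm_ops = {
	.fault_cb = example_fault_cb,
};

A driver would pass &example_svm_ops as the ops argument of intel_svm_bind_mm() to receive these notifications alongside the page group responses the IOMMU sends on its behalf.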