Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method

Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices
with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy'
method by a 'release' method"), convert the historical XICS KVM device to
implement the 'release' method. This is needed to run nested guests with
an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE
during boot, which requires being able to destroy and re-create the
KVM device. Only the historical XICS KVM device is available under pseries
at the current time and it still uses the legacy 'destroy' method.

Switching to 'release' means that vCPUs might still be running when the
device is destroyed. In order to avoid potential use-after-free, the
kvmppc_xics structure is allocated on first usage and kept around until
the VM exits. The same pointer is used each time a KVM XICS device is
being created, but this is okay since we only have one per VM.

Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the
next time the vCPU resumes execution, it won't be going into the XICS
code anymore.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Cédric Le Goater <clg@kaod.org>
Tested-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

authored by

Greg Kurz and committed by
Paul Mackerras
5706d14d d012a719

+72 -19
+1
arch/powerpc/include/asm/kvm_host.h
··· 326 326 #endif 327 327 #ifdef CONFIG_KVM_XICS 328 328 struct kvmppc_xics *xics; 329 + struct kvmppc_xics *xics_device; 329 330 struct kvmppc_xive *xive; /* Current XIVE device in use */ 330 331 struct { 331 332 struct kvmppc_xive *native;
+3 -1
arch/powerpc/kvm/book3s.c
··· 879 879 880 880 #ifdef CONFIG_KVM_XICS 881 881 /* 882 - * Free the XIVE devices which are not directly freed by the 882 + * Free the XIVE and XICS devices which are not directly freed by the 883 883 * device 'release' method 884 884 */ 885 885 kfree(kvm->arch.xive_devices.native); 886 886 kvm->arch.xive_devices.native = NULL; 887 887 kfree(kvm->arch.xive_devices.xics_on_xive); 888 888 kvm->arch.xive_devices.xics_on_xive = NULL; 889 + kfree(kvm->arch.xics_device); 890 + kvm->arch.xics_device = NULL; 889 891 #endif /* CONFIG_KVM_XICS */ 890 892 } 891 893
+68 -18
arch/powerpc/kvm/book3s_xics.c
··· 1334 1334 return -ENXIO; 1335 1335 } 1336 1336 1337 - static void kvmppc_xics_free(struct kvm_device *dev) 1337 + /* 1338 + * Called when device fd is closed. kvm->lock is held. 1339 + */ 1340 + static void kvmppc_xics_release(struct kvm_device *dev) 1338 1341 { 1339 1342 struct kvmppc_xics *xics = dev->private; 1340 1343 int i; 1341 1344 struct kvm *kvm = xics->kvm; 1345 + struct kvm_vcpu *vcpu; 1346 + 1347 + pr_devel("Releasing xics device\n"); 1348 + 1349 + /* 1350 + * Since this is the device release function, we know that 1351 + * userspace does not have any open fd referring to the 1352 + * device. Therefore there can not be any of the device 1353 + * attribute set/get functions being executed concurrently, 1354 + * and similarly, the connect_vcpu and set/clr_mapped 1355 + * functions also cannot be being executed. 1356 + */ 1342 1357 1343 1358 debugfs_remove(xics->dentry); 1359 + 1360 + /* 1361 + * We should clean up the vCPU interrupt presenters first. 1362 + */ 1363 + kvm_for_each_vcpu(i, vcpu, kvm) { 1364 + /* 1365 + * Take vcpu->mutex to ensure that no one_reg get/set ioctl 1366 + * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently. 1367 + * Holding the vcpu->mutex also means that execution is 1368 + * excluded for the vcpu until the ICP was freed. When the vcpu 1369 + * can execute again, vcpu->arch.icp and vcpu->arch.irq_type 1370 + * have been cleared and the vcpu will not be going into the 1371 + * XICS code anymore. 1372 + */ 1373 + mutex_lock(&vcpu->mutex); 1374 + kvmppc_xics_free_icp(vcpu); 1375 + mutex_unlock(&vcpu->mutex); 1376 + } 1344 1377 1345 1378 if (kvm) 1346 1379 kvm->arch.xics = NULL; 1347 1380 1348 - for (i = 0; i <= xics->max_icsid; i++) 1381 + for (i = 0; i <= xics->max_icsid; i++) { 1349 1382 kfree(xics->ics[i]); 1350 - kfree(xics); 1383 + xics->ics[i] = NULL; 1384 + } 1385 + /* 1386 + * A reference of the kvmppc_xics pointer is now kept under 1387 + * the xics_device pointer of the machine for reuse. 
It is 1388 + * freed when the VM is destroyed for now until we fix all the 1389 + * execution paths. 1390 + */ 1351 1391 kfree(dev); 1392 + } 1393 + 1394 + static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm) 1395 + { 1396 + struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device; 1397 + struct kvmppc_xics *xics = *kvm_xics_device; 1398 + 1399 + if (!xics) { 1400 + xics = kzalloc(sizeof(*xics), GFP_KERNEL); 1401 + *kvm_xics_device = xics; 1402 + } else { 1403 + memset(xics, 0, sizeof(*xics)); 1404 + } 1405 + 1406 + return xics; 1352 1407 } 1353 1408 1354 1409 static int kvmppc_xics_create(struct kvm_device *dev, u32 type) 1355 1410 { 1356 1411 struct kvmppc_xics *xics; 1357 1412 struct kvm *kvm = dev->kvm; 1358 - int ret = 0; 1359 1413 1360 - xics = kzalloc(sizeof(*xics), GFP_KERNEL); 1414 + pr_devel("Creating xics for partition\n"); 1415 + 1416 + /* Already there ? */ 1417 + if (kvm->arch.xics) 1418 + return -EEXIST; 1419 + 1420 + xics = kvmppc_xics_get_device(kvm); 1361 1421 if (!xics) 1362 1422 return -ENOMEM; 1363 1423 1364 1424 dev->private = xics; 1365 1425 xics->dev = dev; 1366 1426 xics->kvm = kvm; 1367 - 1368 - /* Already there ? 
*/ 1369 - if (kvm->arch.xics) 1370 - ret = -EEXIST; 1371 - else 1372 - kvm->arch.xics = xics; 1373 - 1374 - if (ret) { 1375 - kfree(xics); 1376 - return ret; 1377 - } 1427 + kvm->arch.xics = xics; 1378 1428 1379 1429 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 1380 1430 if (cpu_has_feature(CPU_FTR_ARCH_206) && ··· 1449 1399 .name = "kvm-xics", 1450 1400 .create = kvmppc_xics_create, 1451 1401 .init = kvmppc_xics_init, 1452 - .destroy = kvmppc_xics_free, 1402 + .release = kvmppc_xics_release, 1453 1403 .set_attr = xics_set_attr, 1454 1404 .get_attr = xics_get_attr, 1455 1405 .has_attr = xics_has_attr, ··· 1465 1415 return -EPERM; 1466 1416 if (xics->kvm != vcpu->kvm) 1467 1417 return -EPERM; 1468 - if (vcpu->arch.irq_type) 1418 + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) 1469 1419 return -EBUSY; 1470 1420 1471 1421 r = kvmppc_xics_create_icp(vcpu, xcpu);