Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: s390: CMMA tracking, ESSA emulation, migration mode

* Add a migration state bitmap to keep track of which pages have dirty
CMMA information.
* Disable CMMA by default, so we can track if it's used or not. Enable
it on first use like we do for storage keys (unless we are doing a
migration).
* Create a VM attribute to enter and leave migration mode.
* In migration mode, CMMA is disabled in the SIE block, so ESSA is
always interpreted and emulated in software.
* Free the migration state on VM destroy.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>

authored by

Claudio Imbrenda and committed by
Christian Borntraeger
190df4a2 865279c5

+304 -6
+33
Documentation/virtual/kvm/devices/vm.txt
··· 222 222 223 223 Parameters: none 224 224 Returns: 0 225 + 226 + 5. GROUP: KVM_S390_VM_MIGRATION 227 + Architectures: s390 228 + 229 + 5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) 230 + 231 + Allows userspace to stop migration mode, needed for PGSTE migration. 232 + Setting this attribute when migration mode is not active will have no 233 + effect. 234 + 235 + Parameters: none 236 + Returns: 0 237 + 238 + 5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) 239 + 240 + Allows userspace to start migration mode, needed for PGSTE migration. 241 + Setting this attribute when migration mode is already active will have 242 + no effect. 243 + 244 + Parameters: none 245 + Returns: -ENOMEM if there is not enough free memory to start migration mode 246 + -EINVAL if the state of the VM is invalid (e.g. no memory defined) 247 + 0 in case of success. 248 + 249 + 5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) 250 + 251 + Allows userspace to query the status of migration mode. 252 + 253 + Parameters: address of a buffer in user space to store the data (u64) to; 254 + the data itself is either 0 if migration mode is disabled or 1 255 + if it is enabled 256 + Returns: -EFAULT if the given address is not accessible from kernel space 257 + 0 in case of success.
+9
arch/s390/include/asm/kvm_host.h
··· 45 45 #define KVM_REQ_ENABLE_IBS 8 46 46 #define KVM_REQ_DISABLE_IBS 9 47 47 #define KVM_REQ_ICPT_OPEREXC 10 48 + #define KVM_REQ_START_MIGRATION 11 49 + #define KVM_REQ_STOP_MIGRATION 12 48 50 49 51 #define SIGP_CTRL_C 0x80 50 52 #define SIGP_CTRL_SCN_MASK 0x3f ··· 693 691 struct page *pages[KVM_MAX_VCPUS]; 694 692 }; 695 693 694 + struct kvm_s390_migration_state { 695 + unsigned long bitmap_size; /* in bits (number of guest pages) */ 696 + atomic64_t dirty_pages; /* number of dirty pages */ 697 + unsigned long *pgste_bitmap; 698 + }; 699 + 696 700 struct kvm_arch{ 697 701 void *sca; 698 702 int use_esca; ··· 726 718 struct kvm_s390_crypto crypto; 727 719 struct kvm_s390_vsie vsie; 728 720 u64 epoch; 721 + struct kvm_s390_migration_state *migration_state; 729 722 /* subset of available cpu features enabled by user space */ 730 723 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); 731 724 };
+6
arch/s390/include/uapi/asm/kvm.h
··· 70 70 #define KVM_S390_VM_TOD 1 71 71 #define KVM_S390_VM_CRYPTO 2 72 72 #define KVM_S390_VM_CPU_MODEL 3 73 + #define KVM_S390_VM_MIGRATION 4 73 74 74 75 /* kvm attributes for mem_ctrl */ 75 76 #define KVM_S390_VM_MEM_ENABLE_CMMA 0 ··· 151 150 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 152 151 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 153 152 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 153 + 154 + /* kvm attributes for migration mode */ 155 + #define KVM_S390_VM_MIGRATION_STOP 0 156 + #define KVM_S390_VM_MIGRATION_START 1 157 + #define KVM_S390_VM_MIGRATION_STATUS 2 154 158 155 159 /* for KVM_GET_REGS and KVM_SET_REGS */ 156 160 struct kvm_regs {
+158 -1
arch/s390/kvm/kvm-s390.c
··· 31 31 #include <linux/bitmap.h> 32 32 #include <linux/sched/signal.h> 33 33 34 + #include <linux/string.h> 34 35 #include <asm/asm-offsets.h> 35 36 #include <asm/lowcore.h> 36 37 #include <asm/stp.h> ··· 751 750 return 0; 752 751 } 753 752 753 + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) 754 + { 755 + int cx; 756 + struct kvm_vcpu *vcpu; 757 + 758 + kvm_for_each_vcpu(cx, vcpu, kvm) 759 + kvm_s390_sync_request(req, vcpu); 760 + } 761 + 762 + /* 763 + * Must be called with kvm->srcu held to avoid races on memslots, and with 764 + * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. 765 + */ 766 + static int kvm_s390_vm_start_migration(struct kvm *kvm) 767 + { 768 + struct kvm_s390_migration_state *mgs; 769 + struct kvm_memory_slot *ms; 770 + /* should be the only one */ 771 + struct kvm_memslots *slots; 772 + unsigned long ram_pages; 773 + int slotnr; 774 + 775 + /* migration mode already enabled */ 776 + if (kvm->arch.migration_state) 777 + return 0; 778 + 779 + slots = kvm_memslots(kvm); 780 + if (!slots || !slots->used_slots) 781 + return -EINVAL; 782 + 783 + mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); 784 + if (!mgs) 785 + return -ENOMEM; 786 + kvm->arch.migration_state = mgs; 787 + 788 + if (kvm->arch.use_cmma) { 789 + /* 790 + * Get the last slot. They should be sorted by base_gfn, so the 791 + * last slot is also the one at the end of the address space. 792 + * We have verified above that at least one slot is present. 
793 + */ 794 + ms = slots->memslots + slots->used_slots - 1; 795 + /* round up so we only use full longs */ 796 + ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); 797 + /* allocate enough bytes to store all the bits */ 798 + mgs->pgste_bitmap = vmalloc(ram_pages / 8); 799 + if (!mgs->pgste_bitmap) { 800 + kfree(mgs); 801 + kvm->arch.migration_state = NULL; 802 + return -ENOMEM; 803 + } 804 + 805 + mgs->bitmap_size = ram_pages; 806 + atomic64_set(&mgs->dirty_pages, ram_pages); 807 + /* mark all the pages in active slots as dirty */ 808 + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { 809 + ms = slots->memslots + slotnr; 810 + bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); 811 + } 812 + 813 + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); 814 + } 815 + return 0; 816 + } 817 + 818 + /* 819 + * Must be called with kvm->lock to avoid races with ourselves and 820 + * kvm_s390_vm_start_migration. 821 + */ 822 + static int kvm_s390_vm_stop_migration(struct kvm *kvm) 823 + { 824 + struct kvm_s390_migration_state *mgs; 825 + 826 + /* migration mode already disabled */ 827 + if (!kvm->arch.migration_state) 828 + return 0; 829 + mgs = kvm->arch.migration_state; 830 + kvm->arch.migration_state = NULL; 831 + 832 + if (kvm->arch.use_cmma) { 833 + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); 834 + vfree(mgs->pgste_bitmap); 835 + } 836 + kfree(mgs); 837 + return 0; 838 + } 839 + 840 + static int kvm_s390_vm_set_migration(struct kvm *kvm, 841 + struct kvm_device_attr *attr) 842 + { 843 + int idx, res = -ENXIO; 844 + 845 + mutex_lock(&kvm->lock); 846 + switch (attr->attr) { 847 + case KVM_S390_VM_MIGRATION_START: 848 + idx = srcu_read_lock(&kvm->srcu); 849 + res = kvm_s390_vm_start_migration(kvm); 850 + srcu_read_unlock(&kvm->srcu, idx); 851 + break; 852 + case KVM_S390_VM_MIGRATION_STOP: 853 + res = kvm_s390_vm_stop_migration(kvm); 854 + break; 855 + default: 856 + break; 857 + } 858 + mutex_unlock(&kvm->lock); 859 + 
860 + return res; 861 + } 862 + 863 + static int kvm_s390_vm_get_migration(struct kvm *kvm, 864 + struct kvm_device_attr *attr) 865 + { 866 + u64 mig = (kvm->arch.migration_state != NULL); 867 + 868 + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) 869 + return -ENXIO; 870 + 871 + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) 872 + return -EFAULT; 873 + return 0; 874 + } 875 + 754 876 static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) 755 877 { 756 878 u8 gtod_high; ··· 1214 1090 case KVM_S390_VM_CRYPTO: 1215 1091 ret = kvm_s390_vm_set_crypto(kvm, attr); 1216 1092 break; 1093 + case KVM_S390_VM_MIGRATION: 1094 + ret = kvm_s390_vm_set_migration(kvm, attr); 1095 + break; 1217 1096 default: 1218 1097 ret = -ENXIO; 1219 1098 break; ··· 1238 1111 break; 1239 1112 case KVM_S390_VM_CPU_MODEL: 1240 1113 ret = kvm_s390_get_cpu_model(kvm, attr); 1114 + break; 1115 + case KVM_S390_VM_MIGRATION: 1116 + ret = kvm_s390_vm_get_migration(kvm, attr); 1241 1117 break; 1242 1118 default: 1243 1119 ret = -ENXIO; ··· 1308 1178 ret = -ENXIO; 1309 1179 break; 1310 1180 } 1181 + break; 1182 + case KVM_S390_VM_MIGRATION: 1183 + ret = 0; 1311 1184 break; 1312 1185 default: 1313 1186 ret = -ENXIO; ··· 1766 1633 kvm_s390_destroy_adapters(kvm); 1767 1634 kvm_s390_clear_float_irqs(kvm); 1768 1635 kvm_s390_vsie_destroy(kvm); 1636 + if (kvm->arch.migration_state) { 1637 + vfree(kvm->arch.migration_state->pgste_bitmap); 1638 + kfree(kvm->arch.migration_state); 1639 + } 1769 1640 KVM_EVENT(3, "vm 0x%pK destroyed", kvm); 1770 1641 } 1771 1642 ··· 2114 1977 if (!vcpu->arch.sie_block->cbrlo) 2115 1978 return -ENOMEM; 2116 1979 2117 - vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; 2118 1980 vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; 2119 1981 return 0; 2120 1982 } ··· 2622 2486 2623 2487 if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { 2624 2488 vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; 2489 + goto retry; 2490 + } 2491 + 2492 + if 
(kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { 2493 + /* 2494 + * Disable CMMA virtualization; we will emulate the ESSA 2495 + * instruction manually, in order to provide additional 2496 + * functionalities needed for live migration. 2497 + */ 2498 + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; 2499 + goto retry; 2500 + } 2501 + 2502 + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { 2503 + /* 2504 + * Re-enable CMMA virtualization if CMMA is available and 2505 + * was used. 2506 + */ 2507 + if ((vcpu->kvm->arch.use_cmma) && 2508 + (vcpu->kvm->mm->context.use_cmma)) 2509 + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; 2625 2510 goto retry; 2626 2511 } 2627 2512
+98 -5
arch/s390/kvm/priv.c
··· 24 24 #include <asm/ebcdic.h> 25 25 #include <asm/sysinfo.h> 26 26 #include <asm/pgtable.h> 27 + #include <asm/page-states.h> 27 28 #include <asm/pgalloc.h> 28 29 #include <asm/gmap.h> 29 30 #include <asm/io.h> ··· 950 949 return 0; 951 950 } 952 951 952 + static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) 953 + { 954 + struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; 955 + int r1, r2, nappended, entries; 956 + unsigned long gfn, hva, res, pgstev, ptev; 957 + unsigned long *cbrlo; 958 + 959 + /* 960 + * We don't need to set SD.FPF.SK to 1 here, because if we have a 961 + * machine check here we either handle it or crash 962 + */ 963 + 964 + kvm_s390_get_regs_rre(vcpu, &r1, &r2); 965 + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; 966 + hva = gfn_to_hva(vcpu->kvm, gfn); 967 + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; 968 + 969 + if (kvm_is_error_hva(hva)) 970 + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 971 + 972 + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); 973 + if (nappended < 0) { 974 + res = orc ? 0x10 : 0; 975 + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ 976 + return 0; 977 + } 978 + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; 979 + /* 980 + * Set the block-content state part of the result. 0 means resident, so 981 + * nothing to do if the page is valid. 2 is for preserved pages 982 + * (non-present and non-zero), and 3 for zero pages (non-present and 983 + * zero). 984 + */ 985 + if (ptev & _PAGE_INVALID) { 986 + res |= 2; 987 + if (pgstev & _PGSTE_GPS_ZERO) 988 + res |= 1; 989 + } 990 + vcpu->run->s.regs.gprs[r1] = res; 991 + /* 992 + * It is possible that all the normal 511 slots were full, in which case 993 + * we will now write in the 512th slot, which is reserved for host use. 994 + * In both cases we let the normal essa handling code process all the 995 + * slots, including the reserved one, if needed. 
996 + */ 997 + if (nappended > 0) { 998 + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); 999 + cbrlo[entries] = gfn << PAGE_SHIFT; 1000 + } 1001 + 1002 + if (orc) { 1003 + /* increment only if we are really flipping the bit to 1 */ 1004 + if (!test_and_set_bit(gfn, ms->pgste_bitmap)) 1005 + atomic64_inc(&ms->dirty_pages); 1006 + } 1007 + 1008 + return nappended; 1009 + } 1010 + 953 1011 static int handle_essa(struct kvm_vcpu *vcpu) 954 1012 { 955 1013 /* entries expected to be 1FF */ 956 1014 int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; 957 1015 unsigned long *cbrlo; 958 1016 struct gmap *gmap; 959 - int i; 1017 + int i, orc; 960 1018 961 1019 VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); 962 1020 gmap = vcpu->arch.gmap; ··· 1025 965 1026 966 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 1027 967 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 1028 - 1029 - if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) 968 + /* Check for invalid operation request code */ 969 + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 970 + if (orc > ESSA_MAX) 1030 971 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 1031 972 1032 - /* Retry the ESSA instruction */ 1033 - kvm_s390_retry_instr(vcpu); 973 + if (likely(!vcpu->kvm->arch.migration_state)) { 974 + /* 975 + * CMMA is enabled in the KVM settings, but is disabled in 976 + * the SIE block and in the mm_context, and we are not doing 977 + * a migration. Enable CMMA in the mm_context. 978 + * Since we need to take a write lock to write to the context 979 + * to avoid races with storage keys handling, we check if the 980 + * value really needs to be written to; if the value is 981 + * already correct, we do nothing and avoid the lock. 
982 + */ 983 + if (vcpu->kvm->mm->context.use_cmma == 0) { 984 + down_write(&vcpu->kvm->mm->mmap_sem); 985 + vcpu->kvm->mm->context.use_cmma = 1; 986 + up_write(&vcpu->kvm->mm->mmap_sem); 987 + } 988 + /* 989 + * If we are here, we are supposed to have CMMA enabled in 990 + * the SIE block. Enabling CMMA works on a per-CPU basis, 991 + * while the context use_cmma flag is per process. 992 + * It's possible that the context flag is enabled and the 993 + * SIE flag is not, so we set the flag always; if it was 994 + * already set, nothing changes, otherwise we enable it 995 + * on this CPU too. 996 + */ 997 + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; 998 + /* Retry the ESSA instruction */ 999 + kvm_s390_retry_instr(vcpu); 1000 + } else { 1001 + /* Account for the possible extra cbrl entry */ 1002 + i = do_essa(vcpu, orc); 1003 + if (i < 0) 1004 + return i; 1005 + entries += i; 1006 + } 1034 1007 vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ 1035 1008 cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); 1036 1009 down_read(&gmap->mm->mmap_sem);