From 2fa6e1e12a024b48b2c7ea39f50205246e027da7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Sun, 4 Jun 2017 14:43:52 +0200 Subject: KVM: add kvm_request_pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A first step in vcpu->requests encapsulation. Additionally, we now use READ_ONCE() when accessing vcpu->requests, which ensures we always load vcpu->requests when it's accessed. This is important as other threads can change it any time. Also, READ_ONCE() documents that vcpu->requests is used with other threads, likely requiring memory barriers, which it does. Signed-off-by: Radim Krčmář [ Documented the new use of READ_ONCE() and converted another check in arch/mips/kvm/vz.c ] Signed-off-by: Andrew Jones Acked-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..ad41e0fa3a21 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2440,7 +2440,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: kvm_s390_vcpu_request_handled(vcpu); - if (!vcpu->requests) + if (!kvm_request_pending(vcpu)) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the -- cgit v1.2.3 From 8bb3fdd6863c3b6b84bbab750d6b35e889c1399d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 3 Jun 2017 10:19:55 +0200 Subject: s390: rename psw_bits enums The address space enums that must be used when modifying the address space part of a psw with the psw_bits() macro can easily be confused with the psw defines that are used to mask and compare directly the mask part of a psw. We have e.g. PSW_AS_PRIMARY vs PSW_ASC_PRIMARY. To avoid confusion rename the PSW_AS_* enums to PSW_BITS_AS_*. In addition also rename the PSW_AMODE_* enums, so they also follow the same naming scheme: PSW_BITS_AMODE_*. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kvm/gaccess.c | 12 ++++++------ arch/s390/kvm/gaccess.h | 4 ++-- arch/s390/kvm/guestdbg.c | 6 +++--- arch/s390/kvm/priv.c | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9da243d94cc3..6ad4a9797de8 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -557,20 +557,20 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, return 0; } - if (mode == GACC_IFETCH) - psw.as = psw.as == PSW_AS_HOME ? 
PSW_AS_HOME : PSW_AS_PRIMARY; + if ((mode == GACC_IFETCH) && (psw.as != PSW_BITS_AS_HOME)) + psw.as = PSW_BITS_AS_PRIMARY; switch (psw.as) { - case PSW_AS_PRIMARY: + case PSW_BITS_AS_PRIMARY: asce->val = vcpu->arch.sie_block->gcr[1]; return 0; - case PSW_AS_SECONDARY: + case PSW_BITS_AS_SECONDARY: asce->val = vcpu->arch.sie_block->gcr[7]; return 0; - case PSW_AS_HOME: + case PSW_BITS_AS_HOME: asce->val = vcpu->arch.sie_block->gcr[13]; return 0; - case PSW_AS_ACCREG: + case PSW_BITS_AS_ACCREG: rc = ar_translation(vcpu, asce, ar, mode); if (rc > 0) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 7ce47fd36f28..bec42b852246 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -57,9 +57,9 @@ static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, { psw_t *psw = &vcpu->arch.sie_block->gpsw; - if (psw_bits(*psw).eaba == PSW_AMODE_64BIT) + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) return ga; - if (psw_bits(*psw).eaba == PSW_AMODE_31BIT) + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) return ga & ((1UL << 31) - 1); return ga & ((1UL << 24) - 1); } diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 23d9a4e12da1..c2e0ddc1356e 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -613,15 +613,15 @@ int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) * instruction. Check primary and home space-switch-event * controls. (theoretically home -> home produced no event) */ - if (((new_as == PSW_AS_HOME) ^ old_as_is_home(vcpu)) && - (pssec(vcpu) || hssec(vcpu))) + if (((new_as == PSW_BITS_AS_HOME) ^ old_as_is_home(vcpu)) && + (pssec(vcpu) || hssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; /* * PT, PTI, PR, PC instruction operate on primary AS only. Check * if the primary-space-switch-event control was or got set. 
*/ - if (new_as == PSW_AS_PRIMARY && !old_as_is_home(vcpu) && + if (new_as == PSW_BITS_AS_PRIMARY && !old_as_is_home(vcpu) && (pssec(vcpu) || old_ssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c03106c428cf..e9dd7efc57b8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -361,7 +361,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } } if (m3 & SSKE_MB) { - if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; else vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; @@ -901,7 +901,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) /* only support 2G frame size if EDAT2 is available and we are not in 24-bit addressing mode */ if (!test_kvm_facility(vcpu->kvm, 78) || - psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT) + psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); end = (start + (1UL << 31)) & ~((1UL << 31) - 1); break; @@ -938,7 +938,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) start += PAGE_SIZE; } if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { - if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) { vcpu->run->s.regs.gprs[reg2] = end; } else { vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; -- cgit v1.2.3 From a752598254016d2f9b4415d43a6402fe083f70b2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 3 Jun 2017 10:56:07 +0200 Subject: s390: rename struct psw_bits members Rename a couple of the struct psw_bits members so that it is more obvious what each of them is for. Initially I thought using the single character names from the PoP would be sufficient and obvious, but admittedly that is not true. The current implementation is not easy to use if one has to look into the source file to figure out which member represents the 'per' bit (which is the 'r' member).
Therefore rename the members to sane names that are identical to the uapi psw mask defines: r -> per i -> io e -> ext t -> dat m -> mcheck w -> wait p -> pstate Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kvm/gaccess.c | 10 +++++----- arch/s390/kvm/priv.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6ad4a9797de8..e0f7d5fc7efd 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -551,7 +551,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); - if (!psw.t) { + if (!psw.dat) { asce->val = 0; asce->r = 1; return 0; @@ -771,7 +771,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, if (!ctlreg0.lap) return 0; - if (psw_bits(*psw).t && asce.p) + if (psw_bits(*psw).dat && asce.p) return 0; return 1; } @@ -790,7 +790,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); ga &= PAGE_MASK; - if (psw_bits(*psw).t) { + if (psw_bits(*psw).dat) { rc = guest_translate(vcpu, ga, pages, asce, mode); if (rc < 0) return rc; @@ -831,7 +831,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, pages = vmalloc(nr_pages * sizeof(unsigned long)); if (!pages) return -ENOMEM; - need_ipte_lock = psw_bits(*psw).t && !asce.r; + need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); @@ -899,7 +899,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, mode, PROT_TYPE_LA); } - if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ + if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ rc = guest_translate(vcpu, gva, gpa, asce, mode); if (rc > 0) return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e9dd7efc57b8..e53292a89257 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -374,7 +374,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) static int handle_ipte_interlock(struct kvm_vcpu *vcpu) { vcpu->stat.instruction_ipte_interlock++; - if (psw_bits(vcpu->arch.sie_block->gpsw).p) + if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); kvm_s390_retry_instr(vcpu); -- cgit v1.2.3 From 190df4a212a708fdd18f6cabfdd82594c91fdf25 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 4 Aug 2016 17:54:42 +0200 Subject: KVM: s390: CMMA tracking, ESSA emulation, migration mode * Add a migration state bitmap to keep track of which pages have dirty CMMA information. * Disable CMMA by default, so we can track if it's used or not. Enable it on first use like we do for storage keys (unless we are doing a migration). * Creates a VM attribute to enter and leave migration mode. * In migration mode, CMMA is disabled in the SIE block, so ESSA is always interpreted and emulated in software. * Free the migration state on VM destroy. 
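Userspace drives the new migration mode through the VM device-attribute interface (group KVM_S390_VM_MIGRATION with the START, STOP and STATUS attributes introduced by this patch). The sketch below is a rough illustration of the userspace side and is not part of the patch; it assumes a VM file descriptor obtained via KVM_CREATE_VM and the uapi definitions added by this series, the helper names are invented, and error handling is left to the caller checking the ioctl return value.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: enter or leave migration mode on a VM fd. */
static int set_migration_mode(int vm_fd, int start)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_MIGRATION,
		.attr  = start ? KVM_S390_VM_MIGRATION_START
			       : KVM_S390_VM_MIGRATION_STOP,
	};

	/* START fails with errno set to ENOMEM if the PGSTE bitmap cannot
	 * be allocated, or to EINVAL if no memslots have been created yet. */
	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

/* Hypothetical helper: query whether migration mode is currently active. */
static int get_migration_mode(int vm_fd, uint64_t *enabled)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_MIGRATION,
		.attr  = KVM_S390_VM_MIGRATION_STATUS,
		.addr  = (uint64_t)(unsigned long)enabled,
	};

	return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}

A caller would normally probe support first with KVM_HAS_DEVICE_ATTR on the same attribute group before relying on it.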
Signed-off-by: Claudio Imbrenda Acked-by: Cornelia Huck Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++- arch/s390/kvm/priv.c | 103 ++++++++++++++++++++++++++++-- 2 files changed, 256 insertions(+), 6 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..c2b391499374 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -750,6 +751,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) +{ + int cx; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(cx, vcpu, kvm) + kvm_s390_sync_request(req, vcpu); +} + +/* + * Must be called with kvm->srcu held to avoid races on memslots, and with + * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + */ +static int kvm_s390_vm_start_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + struct kvm_memory_slot *ms; + /* should be the only one */ + struct kvm_memslots *slots; + unsigned long ram_pages; + int slotnr; + + /* migration mode already enabled */ + if (kvm->arch.migration_state) + return 0; + + slots = kvm_memslots(kvm); + if (!slots || !slots->used_slots) + return -EINVAL; + + mgs = kzalloc(sizeof(*mgs), GFP_KERNEL); + if (!mgs) + return -ENOMEM; + kvm->arch.migration_state = mgs; + + if (kvm->arch.use_cmma) { + /* + * Get the last slot. They should be sorted by base_gfn, so the + * last slot is also the one at the end of the address space. + * We have verified above that at least one slot is present. + */ + ms = slots->memslots + slots->used_slots - 1; + /* round up so we only use full longs */ + ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); + /* allocate enough bytes to store all the bits */ + mgs->pgste_bitmap = vmalloc(ram_pages / 8); + if (!mgs->pgste_bitmap) { + kfree(mgs); + kvm->arch.migration_state = NULL; + return -ENOMEM; + } + + mgs->bitmap_size = ram_pages; + atomic64_set(&mgs->dirty_pages, ram_pages); + /* mark all the pages in active slots as dirty */ + for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { + ms = slots->memslots + slotnr; + bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages); + } + + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + } + return 0; +} + +/* + * Must be called with kvm->lock to avoid races with ourselves and + * kvm_s390_vm_start_migration. 
+ */ +static int kvm_s390_vm_stop_migration(struct kvm *kvm) +{ + struct kvm_s390_migration_state *mgs; + + /* migration mode already disabled */ + if (!kvm->arch.migration_state) + return 0; + mgs = kvm->arch.migration_state; + kvm->arch.migration_state = NULL; + + if (kvm->arch.use_cmma) { + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + vfree(mgs->pgste_bitmap); + } + kfree(mgs); + return 0; +} + +static int kvm_s390_vm_set_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + int idx, res = -ENXIO; + + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_MIGRATION_START: + idx = srcu_read_lock(&kvm->srcu); + res = kvm_s390_vm_start_migration(kvm); + srcu_read_unlock(&kvm->srcu, idx); + break; + case KVM_S390_VM_MIGRATION_STOP: + res = kvm_s390_vm_stop_migration(kvm); + break; + default: + break; + } + mutex_unlock(&kvm->lock); + + return res; +} + +static int kvm_s390_vm_get_migration(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u64 mig = (kvm->arch.migration_state != NULL); + + if (attr->attr != KVM_S390_VM_MIGRATION_STATUS) + return -ENXIO; + + if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig))) + return -EFAULT; + return 0; +} + static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high; @@ -1090,6 +1214,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO: ret = kvm_s390_vm_set_crypto(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_set_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1112,6 +1239,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MODEL: ret = kvm_s390_get_cpu_model(kvm, attr); break; + case KVM_S390_VM_MIGRATION: + ret = kvm_s390_vm_get_migration(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1179,6 +1309,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) break; } break; + case KVM_S390_VM_MIGRATION: + ret = 0; + break; default: ret = -ENXIO; break; @@ -1633,6 +1766,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + if (kvm->arch.migration_state) { + vfree(kvm->arch.migration_state->pgste_bitmap); + kfree(kvm->arch.migration_state); + } KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } @@ -1977,7 +2114,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -2489,6 +2625,27 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { + /* + * Disable CMMA virtualization; we will emulate the ESSA + * instruction manually, in order to provide additional + * functionalities needed for live migration. + */ + vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA; + goto retry; + } + + if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { + /* + * Re-enable CMMA virtualization if CMMA is available and + * was used. 
+ */ + if ((vcpu->kvm->arch.use_cmma) && + (vcpu->kvm->mm->context.use_cmma)) + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + goto retry; + } + /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c03106c428cf..a226c459809b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return 0; } +static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) +{ + struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state; + int r1, r2, nappended, entries; + unsigned long gfn, hva, res, pgstev, ptev; + unsigned long *cbrlo; + + /* + * We don't need to set SD.FPF.SK to 1 here, because if we have a + * machine check here we either handle it or crash + */ + + kvm_s390_get_regs_rre(vcpu, &r1, &r2); + gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; + hva = gfn_to_hva(vcpu->kvm, gfn); + entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; + + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); + if (nappended < 0) { + res = orc ? 0x10 : 0; + vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + return 0; + } + res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; + /* + * Set the block-content state part of the result. 0 means resident, so + * nothing to do if the page is valid. 2 is for preserved pages + * (non-present and non-zero), and 3 for zero pages (non-present and + * zero). + */ + if (ptev & _PAGE_INVALID) { + res |= 2; + if (pgstev & _PGSTE_GPS_ZERO) + res |= 1; + } + vcpu->run->s.regs.gprs[r1] = res; + /* + * It is possible that all the normal 511 slots were full, in which case + * we will now write in the 512th slot, which is reserved for host use. + * In both cases we let the normal essa handling code process all the + * slots, including the reserved one, if needed. + */ + if (nappended > 0) { + cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK); + cbrlo[entries] = gfn << PAGE_SHIFT; + } + + if (orc) { + /* increment only if we are really flipping the bit to 1 */ + if (!test_and_set_bit(gfn, ms->pgste_bitmap)) + atomic64_inc(&ms->dirty_pages); + } + + return nappended; +} + static int handle_essa(struct kvm_vcpu *vcpu) { /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; struct gmap *gmap; - int i; + int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; @@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - - if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) + /* Check for invalid operation request code */ + orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; + if (orc > ESSA_MAX) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* Retry the ESSA instruction */ - kvm_s390_retry_instr(vcpu); + if (likely(!vcpu->kvm->arch.migration_state)) { + /* + * CMMA is enabled in the KVM settings, but is disabled in + * the SIE block and in the mm_context, and we are not doing + * a migration. Enable CMMA in the mm_context. 
+ * Since we need to take a write lock to write to the context + * to avoid races with storage keys handling, we check if the + * value really needs to be written to; if the value is + * already correct, we do nothing and avoid the lock. + */ + if (vcpu->kvm->mm->context.use_cmma == 0) { + down_write(&vcpu->kvm->mm->mmap_sem); + vcpu->kvm->mm->context.use_cmma = 1; + up_write(&vcpu->kvm->mm->mmap_sem); + } + /* + * If we are here, we are supposed to have CMMA enabled in + * the SIE block. Enabling CMMA works on a per-CPU basis, + * while the context use_cmma flag is per process. + * It's possible that the context flag is enabled and the + * SIE flag is not, so we set the flag always; if it was + * already set, nothing changes, otherwise we enable it + * on this CPU too. + */ + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + /* Retry the ESSA instruction */ + kvm_s390_retry_instr(vcpu); + } else { + /* Account for the possible extra cbrl entry */ + i = do_essa(vcpu, orc); + if (i < 0) + return i; + entries += i; + } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); down_read(&gmap->mm->mmap_sem); -- cgit v1.2.3 From 4036e3874a1ce41a4f7267289f9a0d8e5cd49408 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 4 Aug 2016 17:58:47 +0200 Subject: KVM: s390: ioctls to get and set guest storage attributes * Add the struct used in the ioctls to get and set CMMA attributes. * Add the two functions needed to get and set the CMMA attributes for guest pages. * Add the two ioctls that use the aforementioned functions. Signed-off-by: Claudio Imbrenda Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c2b391499374..e100a7ff35c7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -30,8 +30,8 @@ #include #include #include - #include + #include #include #include @@ -387,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: r = 1; break; @@ -1419,6 +1420,182 @@ out: return r; } +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* for consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +/* + * This function searches for the next page with dirty CMMA attributes, and + * saves the attributes in the buffer up to either the end of the buffer or + * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found; + * no trailing clean bytes are saved. + * In case no dirty bits were found, or if CMMA was not enabled or used, the + * output buffer will indicate 0 as length. 
+ */ +static int kvm_s390_get_cmma_bits(struct kvm *kvm, + struct kvm_s390_cmma_log *args) +{ + struct kvm_s390_migration_state *s = kvm->arch.migration_state; + unsigned long bufsize, hva, pgstev, i, next, cur; + int srcu_idx, peek, r = 0, rr; + u8 *res; + + cur = args->start_gfn; + i = next = pgstev = 0; + + if (unlikely(!kvm->arch.use_cmma)) + return -ENXIO; + /* Invalid/unsupported flags were specified */ + if (args->flags & ~KVM_S390_CMMA_PEEK) + return -EINVAL; + /* Migration mode query, and we are not doing a migration */ + peek = !!(args->flags & KVM_S390_CMMA_PEEK); + if (!peek && !s) + return -EINVAL; + /* CMMA is disabled or was not used, or the buffer has length zero */ + bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!bufsize || !kvm->mm->context.use_cmma) { + memset(args, 0, sizeof(*args)); + return 0; + } + + if (!peek) { + /* We are not peeking, and there are no dirty pages */ + if (!atomic64_read(&s->dirty_pages)) { + memset(args, 0, sizeof(*args)); + return 0; + } + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, + args->start_gfn); + if (cur >= s->bitmap_size) /* nothing found, loop back */ + cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0); + if (cur >= s->bitmap_size) { /* again! (very unlikely) */ + memset(args, 0, sizeof(*args)); + return 0; + } + next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1); + } + + res = vmalloc(bufsize); + if (!res) + return -ENOMEM; + + args->start_gfn = cur; + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + while (i < bufsize) { + hva = gfn_to_hva(kvm, cur); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + /* decrement only if we actually flipped the bit to 0 */ + if (!peek && test_and_clear_bit(cur, s->pgste_bitmap)) + atomic64_dec(&s->dirty_pages); + r = get_pgste(kvm->mm, hva, &pgstev); + if (r < 0) + pgstev = 0; + /* save the value */ + res[i++] = (pgstev >> 24) & 0x3; + /* + * if the next bit is too far away, stop. + * if we reached the previous "next", find the next one + */ + if (!peek) { + if (next > cur + KVM_S390_MAX_BIT_DISTANCE) + break; + if (cur == next) + next = find_next_bit(s->pgste_bitmap, + s->bitmap_size, cur + 1); + /* reached the end of the bitmap or of the buffer, stop */ + if ((next >= s->bitmap_size) || + (next >= args->start_gfn + bufsize)) + break; + } + cur++; + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + args->count = i; + args->remaining = s ? atomic64_read(&s->dirty_pages) : 0; + + rr = copy_to_user((void __user *)args->values, res, args->count); + if (rr) + r = -EFAULT; + + vfree(res); + return r; +} + +/* + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.use_cmma flag is set. 
+ */ +static int kvm_s390_set_cmma_bits(struct kvm *kvm, + const struct kvm_s390_cmma_log *args) +{ + unsigned long hva, mask, pgstev, i; + uint8_t *bits; + int srcu_idx, r = 0; + + mask = args->mask; + + if (!kvm->arch.use_cmma) + return -ENXIO; + /* invalid/unsupported flags */ + if (args->flags != 0) + return -EINVAL; + /* Enforce sane limit on memory allocation */ + if (args->count > KVM_S390_CMMA_SIZE_MAX) + return -EINVAL; + /* Nothing to do */ + if (args->count == 0) + return 0; + + bits = vmalloc(sizeof(*bits) * args->count); + if (!bits) + return -ENOMEM; + + r = copy_from_user(bits, (void __user *)args->values, args->count); + if (r) { + r = -EFAULT; + goto out; + } + + down_read(&kvm->mm->mmap_sem); + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < args->count; i++) { + hva = gfn_to_hva(kvm, args->start_gfn + i); + if (kvm_is_error_hva(hva)) { + r = -EFAULT; + break; + } + + pgstev = bits[i]; + pgstev = pgstev << 24; + mask &= _PGSTE_GPS_USAGE_MASK; + set_pgste_bits(kvm->mm, hva, mask, pgstev); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + up_read(&kvm->mm->mmap_sem); + + if (!kvm->mm->context.use_cmma) { + down_write(&kvm->mm->mmap_sem); + kvm->mm->context.use_cmma = 1; + up_write(&kvm->mm->mmap_sem); + } +out: + vfree(bits); + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1497,6 +1674,29 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_set_skeys(kvm, &args); break; } + case KVM_S390_GET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_get_cmma_bits(kvm, &args); + if (!r) { + r = copy_to_user(argp, &args, sizeof(args)); + if (r) + r = -EFAULT; + } + break; + } + case KVM_S390_SET_CMMA_BITS: { + struct kvm_s390_cmma_log args; + + r = -EFAULT; + if (copy_from_user(&args, argp, sizeof(args))) + break; + r = kvm_s390_set_cmma_bits(kvm, &args); + break; + } default: r = -ENOTTY; } -- cgit v1.2.3 From 6ae1574c2a24eec5efa8bac305a8f87c839acc64 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 7 Jun 2017 12:45:22 +0200 Subject: KVM: s390: implement instruction execution protection for emulated ifetch While currently only used to fetch the original instruction on failure for getting the instruction length code, we should make the page table walking code future proof. 
Suggested-by: Heiko Carstens Signed-off-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9da243d94cc3..6fda095f1a99 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -89,7 +89,7 @@ struct region3_table_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr : 1; /* Common-Region Bit */ @@ -131,7 +131,7 @@ struct segment_entry_fc1 { unsigned long f : 1; /* Fetch-Protection Bit */ unsigned long fc : 1; /* Format-Control */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long co : 1; /* Change-Recording Override */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ unsigned long : 2; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs : 1; /* Common-Segment Bit */ @@ -168,7 +168,8 @@ union page_table_entry { unsigned long z : 1; /* Zero Bit */ unsigned long i : 1; /* Page-Invalid Bit */ unsigned long p : 1; /* DAT-Protection Bit */ - unsigned long : 9; + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 8; }; }; @@ -485,6 +486,7 @@ enum prot_type { PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, + PROT_TYPE_IEP = 4, }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, @@ -500,6 +502,9 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_IEP: + tec->b61 = 1; + /* FALL THROUGH */ case PROT_TYPE_LA: tec->b56 = 1; break; @@ -591,6 +596,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * @gpa: points to where guest physical (absolute) address should be stored * @asce: effective asce * @mode: indicates the access mode to be used + * @prot: returns the type for protection exceptions * * Translate a guest virtual address into a guest absolute address by means * of dynamic address translation as specified by the architecture. 
@@ -606,19 +612,21 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) */ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, unsigned long *gpa, const union asce asce, - enum gacc_mode mode) + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; union page_table_entry pte; int dat_protection = 0; + int iep_protection = 0; union ctlreg0 ctlreg0; unsigned long ptr; - int edat1, edat2; + int edat1, edat2, iep; ctlreg0.val = vcpu->arch.sie_block->gcr[0]; edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); + iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; ptr = asce.origin * 4096; @@ -702,6 +710,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (rtte.fc && edat2) { dat_protection |= rtte.fc1.p; + iep_protection = rtte.fc1.iep; raddr.rfaa = rtte.fc1.rfaa; goto absolute_address; } @@ -729,6 +738,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_TRANSLATION_SPEC; if (ste.fc && edat1) { dat_protection |= ste.fc1.p; + iep_protection = ste.fc1.iep; raddr.sfaa = ste.fc1.sfaa; goto absolute_address; } @@ -745,12 +755,19 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, if (pte.z) return PGM_TRANSLATION_SPEC; dat_protection |= pte.p; + iep_protection = pte.iep; raddr.pfra = pte.pfra; real_address: raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr); absolute_address: - if (mode == GACC_STORE && dat_protection) + if (mode == GACC_STORE && dat_protection) { + *prot = PROT_TYPE_DAT; return PGM_PROTECTION; + } + if (mode == GACC_IFETCH && iep_protection && iep) { + *prot = PROT_TYPE_IEP; + return PGM_PROTECTION; + } if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) return PGM_ADDRESSING; *gpa = raddr.addr; @@ -782,6 +799,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, { psw_t *psw = &vcpu->arch.sie_block->gpsw; int lap_enabled, rc = 0; + enum prot_type prot; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { @@ -791,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).t) { - rc = guest_translate(vcpu, ga, pages, asce, mode); + rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); if (rc < 0) return rc; } else { @@ -800,7 +818,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, rc = PGM_ADDRESSING; } if (rc) - return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, ga, ar, mode, prot); ga += PAGE_SIZE; pages++; nr_pages--; @@ -886,6 +904,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + enum prot_type prot; union asce asce; int rc; @@ -900,9 +919,9 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, } if (psw_bits(*psw).t && !asce.r) { /* Use DAT? 
*/ - rc = guest_translate(vcpu, gva, gpa, asce, mode); + rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); + return trans_exc(vcpu, rc, gva, 0, mode, prot); } else { *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) -- cgit v1.2.3 From 2c1a48f2e5ed31b881eaa003a6276818a4794485 Mon Sep 17 00:00:00 2001 From: Yi Min Zhao Date: Wed, 7 Jun 2017 16:09:52 +0800 Subject: KVM: S390: add new group for flic In some cases, userspace needs to get or set all ais states for example migration. So we introduce a new group KVM_DEV_FLIC_AISM_ALL to provide interfaces to get or set the adapter-interruption-suppression mode for all ISCs. The corresponding documentation is updated. Signed-off-by: Yi Min Zhao Reviewed-by: Halil Pasic Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index caf15c8a8948..72f3aafad5b1 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1876,6 +1876,28 @@ out: return ret < 0 ? ret : n; } +static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (attr->attr < sizeof(ais)) + return -EINVAL; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + mutex_lock(&fi->ais_lock); + ais.simm = fi->simm; + ais.nimm = fi->nimm; + mutex_unlock(&fi->ais_lock); + + if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais))) + return -EFAULT; + + return 0; +} + static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; @@ -1885,6 +1907,9 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr, attr->attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_get_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2235,6 +2260,25 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) return kvm_s390_inject_airq(kvm, adapter); } +static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_all ais; + + if (!test_kvm_facility(kvm, 72)) + return -ENOTSUPP; + + if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) + return -EFAULT; + + mutex_lock(&fi->ais_lock); + fi->simm = ais.simm; + fi->nimm = ais.nimm; + mutex_unlock(&fi->ais_lock); + + return 0; +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2277,6 +2321,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_AIRQ_INJECT: r = flic_inject_airq(dev->kvm, attr); break; + case KVM_DEV_FLIC_AISM_ALL: + r = flic_ais_mode_set_all(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2298,6 +2345,7 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_CLEAR_IO_IRQ: case KVM_DEV_FLIC_AISM: case KVM_DEV_FLIC_AIRQ_INJECT: + case KVM_DEV_FLIC_AISM_ALL: return 0; } return -ENXIO; -- cgit v1.2.3 From 1cae025577f447fa69e55f194b1f3b078f5fbc25 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 21 Jun 2017 16:49:15 +0200 Subject: KVM: s390: avoid packed attribute For naturally aligned and sized data structures avoid 
superfluous packed and aligned attributes. Signed-off-by: Martin Schwidefsky Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 4 ++-- arch/s390/kvm/vsie.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6fda095f1a99..17e3a4e71bc9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -242,7 +242,7 @@ struct ale { unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */ unsigned long : 6; unsigned long astesn : 32; /* ASTE Sequence Number */ -} __packed; +}; struct aste { unsigned long i : 1; /* ASX-Invalid Bit */ @@ -258,7 +258,7 @@ struct aste { unsigned long ald : 32; unsigned long astesn : 32; /* .. more fields there */ -} __packed; +}; int ipte_lock_held(struct kvm_vcpu *vcpu) { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4719ecb9ab42..e947657d82ac 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -35,7 +35,7 @@ struct vsie_page { __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ -} __packed; +}; /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, -- cgit v1.2.3 From da72ca4d4090a8ab0e6b0a23682ef42d39d7ae00 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Wed, 7 Jun 2017 11:41:19 +0200 Subject: KVM: s390: Backup the guest's machine check info When a machine check happens in the guest, related mcck info (mcic, external damage code, ...) is stored in the vcpu's lowcore on the host. Then the machine check handler's low-level part is executed, followed by the high-level part. If the high-level part's execution is interrupted by a new machine check happening on the same vcpu on the host, the mcck info in the lowcore is overwritten with the new machine check's data. If the high-level part's execution is scheduled to a different cpu, the mcck info in the lowcore is uncertain. Therefore, for both cases, the further reinjection to the guest will use the wrong data. Let's backup the mcck info in the lowcore to the sie page for further reinjection, so that the right data will be used. Add new member into struct sie_page to store related machine check's info of mcic, failing storage address and external damage code. Signed-off-by: QingFeng Hao Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 689ac48361c6..0457e03199c5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2069,6 +2069,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) goto out; + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) goto out_free_cpu; -- cgit v1.2.3 From 4d62fcc0b692e3b4058d7d138114c27cd8b011f7 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Wed, 7 Jun 2017 12:03:05 +0200 Subject: KVM: s390: Inject machine check into the guest If the exit flag of SIE indicates that a machine check has happened during guest's running and needs to be injected, inject it to the guest accordingly. But some machine checks, e.g. 
Channel Report Pending (CRW), refer to host conditions only (the guest's channel devices are not managed by the kernel directly) and are therefore not injected into the guest. External Damage (ED) is also not reinjected into the guest because ETR conditions are gone in Linux and STP conditions are not enabled in the guest, and ED contains only these 8 ETR and STP conditions. In general, instruction-processing damage, system recovery, storage error, service-processor damage and channel subsystem damage will be reinjected into the guest, and the remain (System damage, timing-facility damage, warning, ED and CRW) will be handled on the host. Signed-off-by: QingFeng Hao Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 43 ++++++++++++++++++++++++++++++++++++++++++- arch/s390/kvm/kvm-s390.c | 12 ++++++++++++ arch/s390/kvm/kvm-s390.h | 2 ++ 3 files changed, 56 insertions(+), 1 deletion(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 72f3aafad5b1..f2c78fc1852d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -251,8 +251,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* + * Check both floating and local interrupt's cr14 because + * bit IRQ_PEND_MCHK_REP could be set in both cases. + */ if (!(vcpu->arch.sie_block->gcr[14] & - vcpu->kvm->arch.float_int.mchk.cr14)) + (vcpu->kvm->arch.float_int.mchk.cr14 | + vcpu->arch.local_int.irq.mchk.cr14))) __clear_bit(IRQ_PEND_MCHK_REP, &active_mask); /* @@ -2463,6 +2468,42 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } +/* + * Inject the machine check to the guest. + */ +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info) +{ + struct kvm_s390_interrupt_info inti; + struct kvm_s390_irq irq; + struct kvm_s390_mchk_info *mchk; + union mci mci; + __u64 cr14 = 0; /* upper bits are not used */ + + mci.val = mcck_info->mcic; + if (mci.sr) + cr14 |= MCCK_CR14_RECOVERY_SUB_MASK; + if (mci.dg) + cr14 |= MCCK_CR14_DEGRAD_SUB_MASK; + if (mci.w) + cr14 |= MCCK_CR14_WARN_SUB_MASK; + + mchk = mci.ck ? 
&inti.mchk : &irq.u.mchk; + mchk->cr14 = cr14; + mchk->mcic = mcck_info->mcic; + mchk->ext_damage_code = mcck_info->ext_damage_code; + mchk->failing_storage_address = mcck_info->failing_storage_address; + if (mci.ck) { + /* Inject the floating machine check */ + inti.type = KVM_S390_MCHK; + WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti)); + } else { + /* Inject the machine check to specified vcpu */ + irq.type = KVM_S390_MCHK; + WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); + } +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 90434760cda5..a0f6b599ce6b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3041,6 +3041,9 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) { + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; + VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); @@ -3051,6 +3054,15 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; + if (exit_reason == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(vcpu->arch.sie_block, + struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (vcpu->arch.sie_block->icptcode > 0) { int rc = kvm_handle_sie_intercept(vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 55f5c8457d6d..6fedc8bc7a37 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -397,4 +397,6 @@ static inline int kvm_s390_use_sca_entries(void) */ return sclp.has_sigpif; } +void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, + struct mcck_volatile_info *mcck_info); #endif -- cgit v1.2.3 From d52cd2076eb2ace9fe95dbf795f6d93587453914 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Wed, 7 Jun 2017 12:11:18 +0200 Subject: KVM: s390: Inject machine check into the nested guest With vsie feature enabled, kvm can support nested guests (guest-3). So inject machine check to the guest-2 if it happens when the nested guest is running. And guest-2 will detect the machine check belongs to guest-3 and reinject it into guest-3. The host (guest-1) tries to inject the machine check to the picked destination vcpu if it's a floating machine check. Signed-off-by: QingFeng Hao Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/s390/kvm') diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index e947657d82ac..715c19c45d9a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -26,13 +26,18 @@ struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* + * the backup info for machine check. ensure it's at + * the same offset as that in struct sie_page! 
+ */ + struct mcck_volatile_info mcck_info; /* 0x0200 */ /* the pinned originial scb */ - struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + struct kvm_s390_sie_block *scb_o; /* 0x0218 */ /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0208 */ + struct gmap *gmap; /* 0x0220 */ /* address of the last reported fault to guest2 */ - unsigned long fault_addr; /* 0x0210 */ - __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ + unsigned long fault_addr; /* 0x0228 */ + __u8 reserved[0x0700 - 0x0230]; /* 0x0230 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; @@ -801,6 +806,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct mcck_volatile_info *mcck_info; + struct sie_page *sie_page; int rc; handle_last_fault(vcpu, vsie_page); @@ -822,6 +829,14 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + if (rc == -EINTR) { + VCPU_EVENT(vcpu, 3, "%s", "machine check"); + sie_page = container_of(scb_s, struct sie_page, sie_block); + mcck_info = &sie_page->mcck_info; + kvm_s390_reinject_machine_check(vcpu, mcck_info); + return 0; + } + if (rc > 0) rc = 0; /* we could still have an icpt */ else if (rc == -EFAULT) -- cgit v1.2.3
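To round off the series from the userspace point of view: once migration mode has been started, the dirty CMMA values recorded by the ESSA emulation are meant to be drained through the KVM_S390_GET_CMMA_BITS ioctl added earlier in this series ("KVM: s390: ioctls to get and set guest storage attributes"). The sketch below is only an illustration under assumptions and is not taken from any of these patches; the chunk size, the emit() callback and the helper name are invented, and error handling is minimal.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define CMMA_CHUNK 4096		/* arbitrary buffer size for this sketch */

/* Hypothetical helper: drain the dirty CMMA values once, handing each
 * chunk to a caller-supplied emit() callback (e.g. for the migration
 * stream). Assumes migration mode was started beforehand. */
static int drain_cmma_log(int vm_fd,
			  void (*emit)(uint64_t start_gfn, const uint8_t *vals,
				       uint32_t count))
{
	uint8_t buf[CMMA_CHUNK];
	struct kvm_s390_cmma_log log = {
		.start_gfn = 0,
		.count = CMMA_CHUNK,
		.flags = 0,	/* consume dirty bits; KVM_S390_CMMA_PEEK would not clear them */
		.values = (uint64_t)(unsigned long)buf,
	};

	do {
		if (ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log) < 0)
			return -1;
		if (log.count)
			emit(log.start_gfn, buf, log.count);
		/* The kernel rewrites start_gfn, count and remaining on
		 * return; continue right after the block it reported. */
		log.start_gfn += log.count;
		log.count = CMMA_CHUNK;
	} while (log.remaining);

	return 0;
}

On the destination side the same buffer layout would be replayed with KVM_S390_SET_CMMA_BITS; in a real migration this loop would be interleaved with the normal dirty-page synchronization rather than run to completion in one pass.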