diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2025-05-26 16:19:46 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2025-05-26 16:19:46 -0400 |
commit | 4d526b02df59efb9b966f66e6cace836c4cfc5d8 (patch) | |
tree | 24e33e0f98913537ed67e16ce1caaa5cc884addf /arch/arm64/kvm/at.c | |
parent | 85502b2214d50ba0ddf2a5fb454e4d28a160d175 (diff) | |
parent | 1b85d923ba8c9e6afaf19e26708411adde94fba8 (diff) |
Merge tag 'kvmarm-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for 6.16
* New features:
- Add large stage-2 mapping support for non-protected pKVM guests,
clawing back some performance.
- Add UBSAN support to the standalone EL2 object used in nVHE/hVHE and
protected modes.
- Enable nested virtualisation support on systems that support it
(yes, it has been a long time coming), though it is disabled by
default.
* Improvements, fixes and cleanups:
- Large rework of the way KVM tracks architecture features and links
them with the effects of control bits. This ensures correctness of
emulation (the data is automatically extracted from the published
JSON files), and helps dealing with the evolution of the
architecture.
- Significant changes to the way pKVM tracks ownership of pages,
avoiding page table walks by storing the state in the hypervisor's
vmemmap. This in turn enables the THP support described above.
- New selftest checking the pKVM ownership transition rules
- Fixes for FEAT_MTE_ASYNC being accidentally advertised to guests
even if the host didn't have it.
- Fixes for the address translation emulation, which happened to be
rather buggy in some specific contexts.
- Fixes for the PMU emulation in NV contexts, decoupling PMCR_EL0.N
from the number of counters exposed to a guest and addressing a
number of issues in the process.
- Add a new selftest for the SVE host state being corrupted by a
guest.
- Keep HCR_EL2.xMO set at all times for systems running with the
kernel at EL2, ensuring that the window for interrupts is slightly
bigger, and avoiding a pretty bad erratum on the AmpereOne HW.
- Add workaround for AmpereOne's erratum AC04_CPU_23, which suffers
from a pretty bad case of TLB corruption unless accesses to HCR_EL2
are heavily synchronised.
- Add a per-VM, per-ITS debugfs entry to dump the state of the ITS
tables in a human-friendly fashion.
- and the usual random cleanups.
Diffstat (limited to 'arch/arm64/kvm/at.c')
-rw-r--r-- | arch/arm64/kvm/at.c | 186 |
1 files changed, 102 insertions, 84 deletions
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f74a66ce3064..a25be111cd8f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -10,61 +10,11 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> -enum trans_regime { - TR_EL10, - TR_EL20, - TR_EL2, -}; - -struct s1_walk_info { - u64 baddr; - enum trans_regime regime; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int txsz; - int sl; - bool hpd; - bool e0poe; - bool poe; - bool pan; - bool be; - bool s2; -}; - -struct s1_walk_result { - union { - struct { - u64 desc; - u64 pa; - s8 level; - u8 APTable; - bool UXNTable; - bool PXNTable; - bool uwxn; - bool uov; - bool ur; - bool uw; - bool ux; - bool pwxn; - bool pov; - bool pr; - bool pw; - bool px; - }; - struct { - u8 fst; - bool ptw; - bool s2; - }; - }; - bool failed; -}; - -static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { wr->fst = fst; - wr->ptw = ptw; - wr->s2 = s2; + wr->ptw = s1ptw; + wr->s2 = s1ptw; wr->failed = true; } @@ -145,20 +95,15 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) } } -static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; unsigned int stride, x; - bool va55, tbi, lva, as_el0; + bool va55, tbi, lva; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); - wi->regime = compute_translation_regime(vcpu, op); - as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); - wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && - (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); - va55 = va & BIT(55); if (wi->regime == TR_EL2 && va55) @@ -319,7 +264,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && - as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) goto transfault_l0; /* AArch64.S1StartLevel() */ @@ -345,11 +290,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, return 0; addrsz: /* Address Size Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); return -EFAULT; transfault_l0: /* Translation Fault level 0 */ - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false); return -EFAULT; } @@ -380,13 +325,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (ret) { fail_s1_walk(wr, (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, - true, true); + true); return ret; } if (!kvm_s2_trans_readable(&s2_trans)) { fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), - true, true); + true); return -EPERM; } @@ -396,8 +341,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); if (ret) { - fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), - true, false); + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); return ret; } @@ -457,6 +401,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (check_output_size(desc & GENMASK(47, va_bottom), wi)) goto addrsz; + if (!(desc & PTE_AF)) { + fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); + return -EACCES; + } + va_bottom += contiguous_bit_shift(desc, wi, level); wr->failed = false; @@ -465,13 +414,40 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->pa = desc & GENMASK(47, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); + if (wr->nG) { + u64 asid_ttbr, tcr; + + switch (wi->regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + asid_ttbr = ((tcr & TCR_A1) ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + wr->asid &= GENMASK(7, 0); + } + return 0; addrsz: - fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false); return -EINVAL; transfault: - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false); return -ENOENT; } @@ -488,7 +464,6 @@ struct mmu_config { u64 sctlr; u64 vttbr; u64 vtcr; - u64 hcr; }; static void __mmu_config_save(struct mmu_config *config) @@ -511,13 +486,10 @@ static void __mmu_config_save(struct mmu_config *config) config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); - config->hcr = read_sysreg(hcr_el2); } static void __mmu_config_restore(struct mmu_config *config) { - write_sysreg(config->hcr, hcr_el2); - /* * ARM errata 1165522 and 1530923 require TGE to be 1 before * we update the guest state. @@ -1155,7 +1127,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) bool perm_fail = false; int ret, idx; - ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + wi.regime = compute_translation_regime(vcpu, op); + wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); + + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr); if (ret) goto compute_par; @@ -1198,7 +1175,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } if (perm_fail) - fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: return compute_par_s1(vcpu, &wr, wi.regime); @@ -1210,7 +1187,8 @@ compute_par: * If the translation is unsuccessful, the value may only contain * PAR_EL1.F, and cannot be taken at face value. It isn't an * indication of the translation having failed, only that the fast - * path did not succeed, *unless* it indicates a S1 permission fault. + * path did not succeed, *unless* it indicates a S1 permission or + * access fault. */ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { @@ -1266,8 +1244,8 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) __load_stage2(mmu, mmu->arch); skip_mmu_switch: - /* Clear TGE, enable S2 translation, we're rolling */ - write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + /* Temporarily switch back to guest context */ + write_sysreg_hcr(vcpu->arch.hcr_el2); isb(); switch (op) { @@ -1299,6 +1277,8 @@ skip_mmu_switch: if (!fail) par = read_sysreg_par(); + write_sysreg_hcr(HCR_HOST_VHE_FLAGS); + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) __mmu_config_restore(&config); @@ -1313,19 +1293,29 @@ static bool par_check_s1_perm_fault(u64 par) !(par & SYS_PAR_EL1_S)); } +static bool par_check_s1_access_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS && + !(par & SYS_PAR_EL1_S)); +} + void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); /* - * If PAR_EL1 reports that AT failed on a S1 permission fault, we - * know for sure that the PTW was able to walk the S1 tables and - * there's nothing else to do. + * If PAR_EL1 reports that AT failed on a S1 permission or access + * fault, we know for sure that the PTW was able to walk the S1 + * tables and there's nothing else to do. * * If AT failed for any other reason, then we must walk the guest S1 * to emulate the instruction. */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + if ((par & SYS_PAR_EL1_F) && + !par_check_s1_perm_fault(par) && + !par_check_s1_access_fault(par)) par = handle_at_slow(vcpu, op, vaddr); vcpu_write_sys_reg(vcpu, par, PAR_EL1); @@ -1350,7 +1340,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!vcpu_el2_e2h_is_set(vcpu)) val |= HCR_NV | HCR_NV1; - write_sysreg(val, hcr_el2); + write_sysreg_hcr(val); isb(); par = SYS_PAR_EL1_F; @@ -1375,7 +1365,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) if (!fail) par = read_sysreg_par(); - write_sysreg(hcr, hcr_el2); + write_sysreg_hcr(hcr); isb(); } @@ -1444,3 +1434,31 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); } + +/* + * Translate a VA for a given EL in a given translation regime, with + * or without PAN. This requires wi->{regime, as_el0, pan} to be + * set. The rest of the wi and wr should be 0-initialised. + */ +int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + int ret; + + ret = setup_s1_walk(vcpu, wi, wr, va); + if (ret) + return ret; + + if (wr->level == S1_MMU_DISABLED) { + wr->ur = wr->uw = wr->ux = true; + wr->pr = wr->pw = wr->px = true; + } else { + ret = walk_s1(vcpu, wi, wr, va); + if (ret) + return ret; + + compute_s1_permissions(vcpu, wi, wr); + } + + return 0; +} |