63 files changed, 2133 insertions, 1191 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 07049eadb124..5237e1b2fd66 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi { __u32 pad; }; +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; @@ -1583,6 +1588,17 @@ struct kvm_lapic_state { Reads the Local APIC registers and copies them into the input argument. The data format and layout are the same as documented in the architecture manual. +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + 4.58 KVM_SET_LAPIC @@ -1600,6 +1616,10 @@ struct kvm_lapic_state { Copies the input argument into the Local APIC registers. The data format and layout are the same as documented in the architecture manual. +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + 4.59 KVM_IOEVENTFD @@ -2188,6 +2208,10 @@ The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide the device ID. If this capability is not set, userland cannot rely on the kernel to allow the KVM_MSI_VALID_DEVID flag being set. +On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is +enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the +destination id. Bits 7-0 of address_hi must be zero. + 4.71 KVM_CREATE_PIT2 @@ -3819,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor. Will return -EINVAL if the machine does not support runtime-instrumentation. Will return -EBUSY if a VCPU has already been created. +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. 
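As a rough illustration of how userspace would opt in to the new capability (this sketch is not part of the patch), both feature bits are passed in args[0] of the KVM_ENABLE_CAP vm ioctl; the vm_fd parameter is assumed to be a file descriptor obtained earlier from KVM_CREATE_VM:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>
	#include <stdio.h>

	/* Illustrative only: enable 32-bit APIC IDs and disable the 0xff
	 * broadcast quirk on an existing VM (vm_fd from KVM_CREATE_VM). */
	static int enable_x2apic_api(int vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_X2APIC_API,
			.args = { KVM_X2APIC_API_USE_32BIT_IDS |
				  KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK },
		};

		if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
			perror("KVM_ENABLE_CAP(KVM_CAP_X2APIC_API)");
			return -1;
		}
		return 0;
	}

Per the documentation added above, this must be done before x2APIC logical mode or more than 255 VCPUs can work.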
+ +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + 8. Other capabilities. ---------------------- diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index fb4661cf896e..abdb2ea19bf0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -616,7 +616,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - __kvm_guest_enter(); + guest_enter_irqoff(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); @@ -642,14 +642,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); /* - * We do local_irq_enable() before calling kvm_guest_exit() so + * We do local_irq_enable() before calling guest_exit() so * that if a timer interrupt hits while running the guest we * account that tick as being spent in the guest. We enable - * preemption after calling kvm_guest_exit() so that if we get + * preemption after calling guest_exit() so that if we get * preempted we make sure ticks after that is not counted as * guest time. */ - kvm_guest_exit(); + guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b0773c6d622f..b54bcadd8aec 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -400,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " or %0, %2 \n" " " __SC "%0, %1 \n" @@ -416,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " " __SC "%0, %1 \n" @@ -433,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " or %0, %3 \n" @@ -533,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -/* Trampoline ASM routine to start running in "Guest" context */ -extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); + +/* Building of entry/exception code */ +int kvm_mips_entry_setup(void); +void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_exception(void *addr, void *handler); +void *kvm_mips_build_exit(void *addr); /* FPU/MSA context management */ void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu); diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index b6ecfeee4dbe..f7929f65f7ca 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -104,8 +104,13 @@ Ip_u1s2(_bltz); Ip_u1s2(_bltzl); Ip_u1u2s3(_bne); Ip_u2s3u1(_cache); +Ip_u1u2(_cfc1); 
+Ip_u2u1(_cfcmsa); +Ip_u1u2(_ctc1); +Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); +Ip_u1(_di); Ip_u2u1msbu3(_dins); Ip_u2u1msbu3(_dinsm); Ip_u1u2(_divu); @@ -141,6 +146,8 @@ Ip_u1(_mfhi); Ip_u1(_mflo); Ip_u1u2u3(_mtc0); Ip_u1u2u3(_mthc0); +Ip_u1(_mthi); +Ip_u1(_mtlo); Ip_u3u1u2(_mul); Ip_u3u1u2(_or); Ip_u2u1u3(_ori); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index a1ebf973725c..77429d1622b3 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -21,20 +21,20 @@ enum major_op { spec_op, bcond_op, j_op, jal_op, beq_op, bne_op, blez_op, bgtz_op, - addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op, + addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op, andi_op, ori_op, xori_op, lui_op, cop0_op, cop1_op, cop2_op, cop1x_op, beql_op, bnel_op, blezl_op, bgtzl_op, - daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op, + daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op, spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op, lb_op, lh_op, lwl_op, lw_op, lbu_op, lhu_op, lwr_op, lwu_op, sb_op, sh_op, swl_op, sw_op, sdl_op, sdr_op, swr_op, cache_op, ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op, - lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op, + lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op, sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op, - scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op + scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op }; /* @@ -93,6 +93,50 @@ enum spec3_op { }; /* + * Bits 10-6 minor opcode for r6 spec mult/div encodings + */ +enum mult_op { + mult_mult_op = 0x0, + mult_mul_op = 0x2, + mult_muh_op = 0x3, +}; +enum multu_op { + multu_multu_op = 0x0, + multu_mulu_op = 0x2, + multu_muhu_op = 0x3, +}; +enum div_op { + div_div_op = 0x0, + div_div6_op = 0x2, + div_mod_op = 0x3, +}; +enum divu_op { + divu_divu_op = 0x0, + divu_divu6_op = 0x2, + divu_modu_op = 0x3, +}; +enum dmult_op { + dmult_dmult_op = 0x0, + dmult_dmul_op = 0x2, + dmult_dmuh_op = 0x3, +}; +enum dmultu_op { + dmultu_dmultu_op = 0x0, + dmultu_dmulu_op = 0x2, + dmultu_dmuhu_op = 0x3, +}; +enum ddiv_op { + ddiv_ddiv_op = 0x0, + ddiv_ddiv6_op = 0x2, + ddiv_dmod_op = 0x3, +}; +enum ddivu_op { + ddivu_ddivu_op = 0x0, + ddivu_ddivu6_op = 0x2, + ddivu_dmodu_op = 0x3, +}; + +/* * rt field of bcond opcodes. */ enum rt_op { @@ -238,6 +282,21 @@ enum bshfl_func { }; /* + * MSA minor opcodes. + */ +enum msa_func { + msa_elm_op = 0x19, +}; + +/* + * MSA ELM opcodes. + */ +enum msa_elm { + msa_ctc_op = 0x3e, + msa_cfc_op = 0x7e, +}; + +/* * func field for MSA MI10 format. */ enum msa_mi10_func { @@ -264,7 +323,7 @@ enum mm_major_op { mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op, mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op, mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op, - mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op, + mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op, mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op, mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op, mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op, @@ -360,7 +419,10 @@ enum mm_32axf_minor_op { mm_mflo32_op = 0x075, mm_jalrhb_op = 0x07c, mm_tlbwi_op = 0x08d, + mm_mthi32_op = 0x0b5, mm_tlbwr_op = 0x0cd, + mm_mtlo32_op = 0x0f5, + mm_di_op = 0x11d, mm_jalrs_op = 0x13c, mm_jalrshb_op = 0x17c, mm_sync_op = 0x1ad, @@ -479,6 +541,13 @@ enum mm_32f_73_minor_op { }; /* + * (microMIPS) POOL32S minor opcodes. 
+ */ +enum mm_32s_minor_op { + mm_32s_elm_op = 0x16, +}; + +/* * (microMIPS) POOL16C minor opcodes. */ enum mm_16c_minor_op { diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index a1263d188a5a..fae2f9447792 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -339,67 +339,9 @@ void output_pm_defines(void) } #endif -void output_cpuinfo_defines(void) -{ - COMMENT(" MIPS cpuinfo offsets. "); - DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips)); -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask); -#endif -} - void output_kvm_defines(void) { COMMENT(" KVM/MIPS Specfic offsets. "); - DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch)); - OFFSET(VCPU_RUN, kvm_vcpu, run); - OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch); - - OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase); - - OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack); - OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp); - - OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr); - OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause); - OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); - - OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); - OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); - OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]); - OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]); - OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]); - OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]); - OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]); - OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]); - OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]); - OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]); - OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]); - OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]); - OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]); - OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]); - OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]); - OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]); - OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]); - OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]); - OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]); - OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]); - OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]); - OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]); - OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]); - OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]); - OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]); - OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]); - OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]); - OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]); - OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]); - OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]); - OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]); - OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]); - OFFSET(VCPU_LO, kvm_vcpu_arch, lo); - OFFSET(VCPU_HI, kvm_vcpu_arch, hi); - OFFSET(VCPU_PC, kvm_vcpu_arch, pc); - BLANK(); OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]); OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]); @@ -437,14 +379,6 @@ void output_kvm_defines(void) OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31); OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr); BLANK(); - - OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0); - OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid); - OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid); - - OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]); - OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]); - BLANK(); } #ifdef CONFIG_MIPS_CPS diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 6dc3f1fdaccc..46c227fc98f5 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, epc += 4 + 
(insn.i_format.simmediate << 2); regs->cp0_epc = epc; break; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, /* Compact branch: BEQZC || JIC */ regs->cp0_epc += 8; break; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, regs->cp0_epc += 8; break; #endif - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: /* Only valid for MIPS R6 */ if (!cpu_has_mips_r6) { ret = -SIGILL; diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2ae12825529f..7c56d6b124d1 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -17,6 +17,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 0aabe40fcac9..847429de780d 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -7,7 +7,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o -kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \ +kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o kvm-objs += mmu.o diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 8a1833b9eb38..91ebd2b6034f 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -72,7 +72,10 @@ int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, synci_inst.i_format.opcode = bcond_op; synci_inst.i_format.rs = inst.i_format.rs; synci_inst.i_format.rt = synci_op; - synci_inst.i_format.simmediate = inst.i_format.simmediate; + if (cpu_has_mips_r6) + synci_inst.i_format.simmediate = inst.spec3_format.simmediate; + else + synci_inst.i_format.simmediate = inst.i_format.simmediate; return kvm_mips_trans_replace(vcpu, opc, synci_inst); } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 5f0354c80c8e..be18dfe9ecaa 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case blez_op: /* not really i_format */ - case blezl_op: - /* rt field assumed to be zero */ + case blez_op: /* POP06 */ +#ifndef CONFIG_CPU_MIPSR6 + case blezl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] <= 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case bgtz_op: - case bgtzl_op: - /* rt field assumed to be zero */ + case bgtz_op: /* POP07 */ +#ifndef CONFIG_CPU_MIPSR6 + case bgtzl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] > 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); break; + +#ifdef CONFIG_CPU_MIPSR6 + /* R6 added the following compact branches with forbidden slots */ + case blezl_op: /* POP26 */ + case bgtzl_op: /* POP27 */ + /* only rt == 0 isn't compact branch */ + if (insn.i_format.rt != 0) + goto compact_branch; + break; + case pop10_op: + case pop30_op: + /* 
only rs == rt == 0 is reserved, rest are compact branches */ + if (insn.i_format.rs != 0 || insn.i_format.rt != 0) + goto compact_branch; + break; + case pop66_op: + case pop76_op: + /* only rs == 0 isn't compact branch */ + if (insn.i_format.rs != 0) + goto compact_branch; + break; +compact_branch: + /* + * If we've hit an exception on the forbidden slot, then + * the branch must not have been taken. + */ + epc += 8; + nextpc = epc; + break; +#else +compact_branch: + /* Compact branches not supported before R6 */ + break; +#endif } return nextpc; @@ -1561,7 +1601,10 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, base = inst.i_format.rs; op_inst = inst.i_format.rt; - offset = inst.i_format.simmediate; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1724,11 +1767,27 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, er = kvm_mips_emulate_load(inst, cause, run, vcpu); break; +#ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); break; +#else + case spec3_op: + switch (inst.spec3_format.func) { + case cache6_op: + ++vcpu->stat.cache_exits; + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_mips_emulate_cache(inst, opc, cause, run, + vcpu); + break; + default: + goto unknown; + }; + break; +unknown: +#endif default: kvm_err("Instruction emulation not supported (%p/%#x)\n", opc, @@ -2298,7 +2357,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, } if (inst.r_format.opcode == spec3_op && - inst.r_format.func == rdhwr_op) { + inst.r_format.func == rdhwr_op && + inst.r_format.rs == 0 && + (inst.r_format.re >> 3) == 0) { int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); int rd = inst.r_format.rd; int rt = inst.r_format.rt; diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c new file mode 100644 index 000000000000..75ba7c2ecb3d --- /dev/null +++ b/arch/mips/kvm/entry.c @@ -0,0 +1,676 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Generation of main entry point for the guest, exception handling. + * + * Copyright (C) 2012 MIPS Technologies, Inc. + * Authors: Sanjay Lal <sanjayl@kymasys.com> + * + * Copyright (C) 2016 Imagination Technologies Ltd. 
+ */ + +#include <linux/kvm_host.h> +#include <asm/msa.h> +#include <asm/setup.h> +#include <asm/uasm.h> + +/* Register names */ +#define ZERO 0 +#define AT 1 +#define V0 2 +#define V1 3 +#define A0 4 +#define A1 5 + +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#define T0 8 +#define T1 9 +#define T2 10 +#define T3 11 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + +#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 +#define T0 12 +#define T1 13 +#define T2 14 +#define T3 15 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */ + +#define S0 16 +#define S1 17 +#define T9 25 +#define K0 26 +#define K1 27 +#define GP 28 +#define SP 29 +#define RA 31 + +/* Some CP0 registers */ +#define C0_HWRENA 7, 0 +#define C0_BADVADDR 8, 0 +#define C0_ENTRYHI 10, 0 +#define C0_STATUS 12, 0 +#define C0_CAUSE 13, 0 +#define C0_EPC 14, 0 +#define C0_EBASE 15, 1 +#define C0_CONFIG5 16, 5 +#define C0_DDATA_LO 28, 3 +#define C0_ERROREPC 30, 0 + +#define CALLFRAME_SIZ 32 + +static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; +static unsigned int scratch_tmp[2] = { C0_ERROREPC }; + +enum label_id { + label_fpu_1 = 1, + label_msa_1, + label_return_to_host, + label_kernel_asid, + label_exit_common, +}; + +UASM_L_LA(_fpu_1) +UASM_L_LA(_msa_1) +UASM_L_LA(_return_to_host) +UASM_L_LA(_kernel_asid) +UASM_L_LA(_exit_common) + +static void *kvm_mips_build_enter_guest(void *addr); +static void *kvm_mips_build_ret_from_exit(void *addr); +static void *kvm_mips_build_ret_to_guest(void *addr); +static void *kvm_mips_build_ret_to_host(void *addr); + +/** + * kvm_mips_entry_setup() - Perform global setup for entry code. + * + * Perform global setup for entry code, such as choosing a scratch register. + * + * Returns: 0 on success. + * -errno on failure. + */ +int kvm_mips_entry_setup(void) +{ + /* + * We prefer to use KScratchN registers if they are available over the + * defaults above, which may not work on all cores. + */ + unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + + /* Pick a scratch register for storing VCPU */ + if (kscratch_mask) { + scratch_vcpu[0] = 31; + scratch_vcpu[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_vcpu[1]); + } + + /* Pick a scratch register to use as a temp for saving state */ + if (kscratch_mask) { + scratch_tmp[0] = 31; + scratch_tmp[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_tmp[1]); + } + + return 0; +} + +static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* Save the VCPU scratch register value in cp0_epc of the stack frame */ + uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + + /* Save the temp scratch register value in cp0_cause of stack frame */ + if (scratch_tmp[0] == 31) { + uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + } +} + +static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* + * Restore host scratch register values saved by + * kvm_mips_build_save_scratch(). + */ + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + + if (scratch_tmp[0] == 31) { + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + } +} + +/** + * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. + * @addr: Address to start writing code. 
+ * + * Assemble the start of the vcpu_run function to run a guest VCPU. The function + * conforms to the following prototype: + * + * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * + * The exit from the guest and return to the caller is handled by the code + * generated by kvm_mips_build_ret_to_host(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_vcpu_run(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* + * A0: run + * A1: vcpu + */ + + /* k0/k1 not being used in host kernel context */ + uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs)); + for (i = 16; i < 32; ++i) { + if (i == 24) + i = 28; + UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Save host status */ + uasm_i_mfc0(&p, V0, C0_STATUS); + UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); + + /* Save scratch registers, will be used to store pointer to vcpu etc */ + kvm_mips_build_save_scratch(&p, V1, K1); + + /* VCPU scratch register has pointer to vcpu */ + uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Offset into vcpu->arch */ + uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + + /* + * Save the host stack to VCPU, used for exception processing + * when we exit from the Guest + */ + UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Save the kernel gp as well */ + UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* + * Setup status register for running the guest in UM, interrupts + * are disabled + */ + UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + /* load up the new EBASE */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + uasm_i_mtc0(&p, K0, C0_EBASE); + + /* + * Now that the new EBASE has been loaded, unset BEV, set + * interrupt mask as it was but make sure that timer interrupts + * are enabled + */ + uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE); + uasm_i_andi(&p, V0, V0, ST0_IM); + uasm_i_or(&p, K0, K0, V0); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_enter_guest() - Assemble code to resume guest execution. + * @addr: Address to start writing code. + * + * Assemble the code to resume guest execution. This code is common between the + * initial entry into the guest from the host, and returning from the exit + * handler back to the guest. + * + * Returns: Next address after end of written function. 
+ */ +static void *kvm_mips_build_enter_guest(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Set Guest EPC */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); + uasm_i_mtc0(&p, T0, C0_EPC); + + /* Set the ASID for the Guest Kernel */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); + UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), + T0); + uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); + uasm_i_xori(&p, T0, T0, KSU_USER); + uasm_il_bnez(&p, &r, T0, label_kernel_asid); + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + /* else user */ + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_user_asid)); + uasm_l_kernel_asid(&l, p); + + /* t1: contains the base of the ASID array, need to get the cpu id */ + /* smp_processor_id */ + UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP); + /* x4 */ + uasm_i_sll(&p, T2, T2, 2); + UASM_i_ADDU(&p, T3, T1, T2); + UASM_i_LW(&p, K0, 0, T3); +#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE + /* x sizeof(struct cpuinfo_mips)/4 */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + uasm_i_mul(&p, T2, T2, T3); + + UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); + UASM_i_ADDU(&p, AT, AT, T2); + UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT); + uasm_i_and(&p, K0, K0, T2); +#else + uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); +#endif + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); + + /* Disable RDHWR access */ + uasm_i_mtc0(&p, ZERO, C0_HWRENA); + + /* load the guest context from VCPU and return */ + for (i = 1; i < 32; ++i) { + /* Guest k0/k1 loaded later */ + if (i == K0 || i == K1) + continue; + UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* Restore hi/lo */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1); + uasm_i_mthi(&p, K0); + + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1); + uasm_i_mtlo(&p, K0); +#endif + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Jump to guest */ + uasm_i_eret(&p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exception() - Assemble first level guest exception handler. + * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). + * + * Assemble exception vector code for guest execution. The generated vector will + * branch to the common exception handler generated by kvm_mips_build_exit(). + * + * Returns: Next address after end of written function. 
+ */ +void *kvm_mips_build_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + + /* Branch to the common handler */ + uasm_il_b(&p, &r, label_exit_common); + uasm_i_nop(&p); + + uasm_l_exit_common(&l, handler); + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exit() - Assemble common guest exit handler. + * @addr: Address to start writing code. + * + * Assemble the generic guest exit handling code. This is called by the + * exception vectors (generated by kvm_mips_build_exception()), and calls + * kvm_mips_handle_exit(), then either resumes the guest or returns to the host + * depending on the return value. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exit(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[3]; + struct uasm_reloc relocs[3]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* + * Generic Guest exception handler. We end up here when the guest + * does something that causes a trap to kernel mode. + * + * Both k0/k1 registers will have already been saved (k0 into the vcpu + * structure, and k1 into the scratch_tmp register). + * + * The k1 register will already contain the kvm_vcpu_arch pointer. 
+ */ + + /* Start saving Guest context to VCPU */ + for (i = 0; i < 32; ++i) { + /* Guest k0/k1 saved later */ + if (i == K0 || i == K1) + continue; + UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + +#ifndef CONFIG_CPU_MIPSR6 + /* We need to save hi/lo and restore them on the way out */ + uasm_i_mfhi(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1); + + uasm_i_mflo(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); +#endif + + /* Finally save guest k1 to VCPU */ + uasm_i_ehb(&p); + uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Now that context has been saved, we can use other registers */ + + /* Restore vcpu */ + uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_move(&p, S1, A1); + + /* Restore run (vcpu->run) */ + UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); + /* Save pointer to run in s0, will be saved by the compiler */ + uasm_i_move(&p, S0, A0); + + /* + * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process + * the exception + */ + uasm_i_mfc0(&p, K0, C0_EPC); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1); + + uasm_i_mfc0(&p, K0, C0_BADVADDR); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr), + K1); + + uasm_i_mfc0(&p, K0, C0_CAUSE); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + + /* Now restore the host state just enough to run the handlers */ + + /* Switch EBASE to the one used by Linux */ + /* load up the host EBASE */ + uasm_i_mfc0(&p, V0, C0_STATUS); + + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V0, AT); + + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + UASM_i_LA_mostly(&p, K0, (long)&ebase); + UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); + uasm_i_mtc0(&p, K0, C0_EBASE); + + if (raw_cpu_has_fpu) { + /* + * If FPU is enabled, save FCR31 and clear it so that later + * ctc1's don't trigger FPE for pending exceptions. + */ + uasm_i_lui(&p, AT, ST0_CU1 >> 16); + uasm_i_and(&p, V1, V0, AT); + uasm_il_beqz(&p, &r, V1, label_fpu_1); + uasm_i_nop(&p); + uasm_i_cfc1(&p, T0, 31); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), + K1); + uasm_i_ctc1(&p, ZERO, 31); + uasm_l_fpu_1(&l, p); + } + + if (cpu_has_msa) { + /* + * If MSA is enabled, save MSACSR and clear it so that later + * instructions don't trigger MSAFPE for pending exceptions. 
+ */ + uasm_i_mfc0(&p, T0, C0_CONFIG5); + uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_cfcmsa(&p, T0, MSA_CSR); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), + K1); + uasm_i_ctcmsa(&p, MSA_CSR, ZERO); + uasm_l_msa_1(&l, p); + } + + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ + uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); + uasm_i_and(&p, V0, V0, AT); + uasm_i_lui(&p, AT, ST0_CU0 >> 16); + uasm_i_or(&p, V0, V0, AT); + uasm_i_mtc0(&p, V0, C0_STATUS); + uasm_i_ehb(&p); + + /* Load up host GP */ + UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* Need a stack before we can jump to "C" */ + UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Saved host state */ + uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs)); + + /* + * XXXKYMA do we need to load the host ASID, maybe not because the + * kernel entries are marked GLOBAL, need to verify + */ + + /* Restore host scratch registers, as we'll have clobbered them */ + kvm_mips_build_restore_scratch(&p, K0, SP); + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Jump to handler */ + /* + * XXXKYMA: not sure if this is safe, how large is the stack?? + * Now jump to the kvm_mips_handle_exit() to see if we can deal + * with this in the kernel + */ + UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); + uasm_i_jalr(&p, RA, T9); + uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ); + + uasm_resolve_relocs(relocs, labels); + + p = kvm_mips_build_ret_from_exit(p); + + return p; +} + +/** + * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler. + * @addr: Address to start writing code. + * + * Assemble the code to handle the return from kvm_mips_handle_exit(), either + * resuming the guest or returning to the host depending on the return value. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_from_exit(void *addr) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Return from handler Make sure interrupts are disabled */ + uasm_i_di(&p, ZERO); + uasm_i_ehb(&p); + + /* + * XXXKYMA: k0/k1 could have been blown away if we processed + * an exception while we were handling the exception from the + * guest, reload k1 + */ + + uasm_i_move(&p, K1, S1); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* + * Check return value, should tell us if we are returning to the + * host (handle I/O etc)or resuming the guest + */ + uasm_i_andi(&p, T0, V0, RESUME_HOST); + uasm_il_bnez(&p, &r, T0, label_return_to_host); + uasm_i_nop(&p); + + p = kvm_mips_build_ret_to_guest(p); + + uasm_l_return_to_host(&l, p); + p = kvm_mips_build_ret_to_host(p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the guest. + * + * Returns: Next address after end of written function. 
+ */ +static void *kvm_mips_build_ret_to_guest(void *addr) +{ + u32 *p = addr; + + /* Put the saved pointer to vcpu (s1) back into the scratch register */ + uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Load up the Guest EBASE to minimize the window where BEV is set */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + + /* Switch EBASE back to the one used by KVM */ + uasm_i_mfc0(&p, V1, C0_STATUS); + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V1, AT); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + uasm_i_mtc0(&p, T0, C0_EBASE); + + /* Setup status register for running guest in UM */ + uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE); + UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX)); + uasm_i_and(&p, V1, V1, AT); + uasm_i_mtc0(&p, V1, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_ret_to_host() - Assemble code to return to the host. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run + * function generated by kvm_mips_build_vcpu_run(). + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_host(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* EBASE is already pointing to Linux */ + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); + uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs)); + + /* + * r2/v0 is the return code, shift it down by 2 (arithmetic) + * to recover the err code + */ + uasm_i_sra(&p, K0, V0, 2); + uasm_i_move(&p, V0, K0); + + /* Load context saved on the host stack */ + for (i = 16; i < 31; ++i) { + if (i == 24) + i = 28; + UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Restore RA, which is the address we will return to */ + UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1); + uasm_i_jr(&p, RA); + uasm_i_nop(&p); + + return p; +} + diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S index 531fbf5131c0..16f17c6390dd 100644 --- a/arch/mips/kvm/fpu.S +++ b/arch/mips/kvm/fpu.S @@ -14,13 +14,16 @@ #include <asm/mipsregs.h> #include <asm/regdef.h> +/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */ +#undef fp + .set noreorder .set noat LEAF(__kvm_save_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles @@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu) LEAF(__kvm_restore_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? 
bgez t0, 1f # no: skip odd doubles diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index d661c100b219..fb118a2c8379 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -28,10 +28,6 @@ #define MIPS_EXC_MAX 12 /* XXXSL More to follow */ -extern char __kvm_mips_vcpu_run_end[]; -extern char mips32_exception[], mips32_exceptionEnd[]; -extern char mips32_GuestException[], mips32_GuestExceptionEnd[]; - #define C_TI (_ULCAST_(1) << 30) #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S deleted file mode 100644 index 698286c0f732..000000000000 --- a/arch/mips/kvm/locore.S +++ /dev/null @@ -1,602 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Main entry point for the guest, exception handling. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Sanjay Lal <sanjayl@kymasys.com> - */ - -#include <asm/asm.h> -#include <asm/asmmacro.h> -#include <asm/regdef.h> -#include <asm/mipsregs.h> -#include <asm/stackframe.h> -#include <asm/asm-offsets.h> - -#define _C_LABEL(x) x -#define MIPSX(name) mips32_ ## name -#define CALLFRAME_SIZ 32 - -/* - * VECTOR - * exception vector entrypoint - */ -#define VECTOR(x, regmask) \ - .ent _C_LABEL(x),0; \ - EXPORT(x); - -#define VECTOR_END(x) \ - EXPORT(x); - -/* Overload, Danger Will Robinson!! */ -#define PT_HOST_USERLOCAL PT_EPC - -#define CP0_DDATA_LO $28,3 - -/* Resume Flags */ -#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ - -#define RESUME_GUEST 0 -#define RESUME_HOST RESUME_FLAG_HOST - -/* - * __kvm_mips_vcpu_run: entry point to the guest - * a0: run - * a1: vcpu - */ - .set noreorder - -FEXPORT(__kvm_mips_vcpu_run) - /* k0/k1 not being used in host kernel context */ - INT_ADDIU k1, sp, -PT_SIZE - LONG_S $16, PT_R16(k1) - LONG_S $17, PT_R17(k1) - LONG_S $18, PT_R18(k1) - LONG_S $19, PT_R19(k1) - LONG_S $20, PT_R20(k1) - LONG_S $21, PT_R21(k1) - LONG_S $22, PT_R22(k1) - LONG_S $23, PT_R23(k1) - - LONG_S $28, PT_R28(k1) - LONG_S $29, PT_R29(k1) - LONG_S $30, PT_R30(k1) - LONG_S $31, PT_R31(k1) - - /* Save hi/lo */ - mflo v0 - LONG_S v0, PT_LO(k1) - mfhi v1 - LONG_S v1, PT_HI(k1) - - /* Save host status */ - mfc0 v0, CP0_STATUS - LONG_S v0, PT_STATUS(k1) - - /* Save DDATA_LO, will be used to store pointer to vcpu */ - mfc0 v1, CP0_DDATA_LO - LONG_S v1, PT_HOST_USERLOCAL(k1) - - /* DDATA_LO has pointer to vcpu */ - mtc0 a1, CP0_DDATA_LO - - /* Offset into vcpu->arch */ - INT_ADDIU k1, a1, VCPU_HOST_ARCH - - /* - * Save the host stack to VCPU, used for exception processing - * when we exit from the Guest - */ - LONG_S sp, VCPU_HOST_STACK(k1) - - /* Save the kernel gp as well */ - LONG_S gp, VCPU_HOST_GP(k1) - - /* - * Setup status register for running the guest in UM, interrupts - * are disabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_BEV) - mtc0 k0, CP0_STATUS - ehb - - /* load up the new EBASE */ - LONG_L k0, VCPU_GUEST_EBASE(k1) - mtc0 k0, CP0_EBASE - - /* - * Now that the new EBASE has been loaded, unset BEV, set - * interrupt mask as it was but make sure that timer interrupts - * are enabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_IE) - andi v0, v0, ST0_IM - or k0, k0, v0 - mtc0 k0, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - -FEXPORT(__kvm_mips_load_asid) - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, 
COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* Now load up the Guest Context from VCPU */ - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* k0/k1 loaded up later */ - - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - - /* Restore hi/lo */ - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_HI(k1) - mthi k0 - -FEXPORT(__kvm_mips_load_k0k1) - /* Restore the guest's k0/k1 registers */ - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - /* Jump to guest */ - eret -EXPORT(__kvm_mips_vcpu_run_end) - -VECTOR(MIPSX(exception), unknown) -/* Find out what mode we came from and jump to the proper handler. */ - mtc0 k0, CP0_ERROREPC #01: Save guest k0 - ehb #02: - - mfc0 k0, CP0_EBASE #02: Get EBASE - INT_SRL k0, k0, 10 #03: Get rid of CPUNum - INT_SLL k0, k0, 10 #04 - LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 - INT_ADDIU k0, k0, 0x2000 #06: Exception handler is - # installed @ offset 0x2000 - j k0 #07: jump to the function - nop #08: branch delay slot -VECTOR_END(MIPSX(exceptionEnd)) -.end MIPSX(exception) - -/* - * Generic Guest exception handler. We end up here when the guest - * does something that causes a trap to kernel mode. 
- */ -NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) - /* Get the VCPU pointer from DDTATA_LO */ - mfc0 k1, CP0_DDATA_LO - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* Start saving Guest context to VCPU */ - LONG_S $0, VCPU_R0(k1) - LONG_S $1, VCPU_R1(k1) - LONG_S $2, VCPU_R2(k1) - LONG_S $3, VCPU_R3(k1) - LONG_S $4, VCPU_R4(k1) - LONG_S $5, VCPU_R5(k1) - LONG_S $6, VCPU_R6(k1) - LONG_S $7, VCPU_R7(k1) - LONG_S $8, VCPU_R8(k1) - LONG_S $9, VCPU_R9(k1) - LONG_S $10, VCPU_R10(k1) - LONG_S $11, VCPU_R11(k1) - LONG_S $12, VCPU_R12(k1) - LONG_S $13, VCPU_R13(k1) - LONG_S $14, VCPU_R14(k1) - LONG_S $15, VCPU_R15(k1) - LONG_S $16, VCPU_R16(k1) - LONG_S $17, VCPU_R17(k1) - LONG_S $18, VCPU_R18(k1) - LONG_S $19, VCPU_R19(k1) - LONG_S $20, VCPU_R20(k1) - LONG_S $21, VCPU_R21(k1) - LONG_S $22, VCPU_R22(k1) - LONG_S $23, VCPU_R23(k1) - LONG_S $24, VCPU_R24(k1) - LONG_S $25, VCPU_R25(k1) - - /* Guest k0/k1 saved later */ - - LONG_S $28, VCPU_R28(k1) - LONG_S $29, VCPU_R29(k1) - LONG_S $30, VCPU_R30(k1) - LONG_S $31, VCPU_R31(k1) - - .set at - - /* We need to save hi/lo and restore them on the way out */ - mfhi t0 - LONG_S t0, VCPU_HI(k1) - - mflo t0 - LONG_S t0, VCPU_LO(k1) - - /* Finally save guest k0/k1 to VCPU */ - mfc0 t0, CP0_ERROREPC - LONG_S t0, VCPU_R26(k1) - - /* Get GUEST k1 and save it in VCPU */ - PTR_LI t1, ~0x2ff - mfc0 t0, CP0_EBASE - and t0, t0, t1 - LONG_L t0, 0x3000(t0) - LONG_S t0, VCPU_R27(k1) - - /* Now that context has been saved, we can use other registers */ - - /* Restore vcpu */ - mfc0 a1, CP0_DDATA_LO - move s1, a1 - - /* Restore run (vcpu->run) */ - LONG_L a0, VCPU_RUN(a1) - /* Save pointer to run in s0, will be saved by the compiler */ - move s0, a0 - - /* - * Save Host level EPC, BadVaddr and Cause to VCPU, useful to - * process the exception - */ - mfc0 k0,CP0_EPC - LONG_S k0, VCPU_PC(k1) - - mfc0 k0, CP0_BADVADDR - LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) - - mfc0 k0, CP0_CAUSE - sw k0, VCPU_HOST_CP0_CAUSE(k1) - - /* Now restore the host state just enough to run the handlers */ - - /* Switch EBASE to the one used by Linux */ - /* load up the host EBASE */ - mfc0 v0, CP0_STATUS - - or k0, v0, ST0_BEV - - mtc0 k0, CP0_STATUS - ehb - - LONG_L k0, ebase - mtc0 k0,CP0_EBASE - - /* - * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't - * trigger FPE for pending exceptions. - */ - and v1, v0, ST0_CU1 - beqz v1, 1f - nop - .set push - SET_HARDFLOAT - cfc1 t0, fcr31 - sw t0, VCPU_FCR31(k1) - ctc1 zero,fcr31 - .set pop -1: - -#ifdef CONFIG_CPU_HAS_MSA - /* - * If MSA is enabled, save MSACSR and clear it so that later - * instructions don't trigger MSAFPE for pending exceptions. 
- */ - mfc0 t0, CP0_CONFIG3 - ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */ - beqz t0, 1f - nop - mfc0 t0, CP0_CONFIG5 - ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */ - beqz t0, 1f - nop - _cfcmsa t0, MSA_CSR - sw t0, VCPU_MSA_CSR(k1) - _ctcmsa MSA_CSR, zero -1: -#endif - - /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ - and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) - or v0, v0, ST0_CU0 - mtc0 v0, CP0_STATUS - ehb - - /* Load up host GP */ - LONG_L gp, VCPU_HOST_GP(k1) - - /* Need a stack before we can jump to "C" */ - LONG_L sp, VCPU_HOST_STACK(k1) - - /* Saved host state */ - INT_ADDIU sp, sp, -PT_SIZE - - /* - * XXXKYMA do we need to load the host ASID, maybe not because the - * kernel entries are marked GLOBAL, need to verify - */ - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(sp) - mtc0 k0, CP0_DDATA_LO - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Jump to handler */ -FEXPORT(__kvm_mips_jump_to_handler) - /* - * XXXKYMA: not sure if this is safe, how large is the stack?? - * Now jump to the kvm_mips_handle_exit() to see if we can deal - * with this in the kernel - */ - PTR_LA t9, kvm_mips_handle_exit - jalr.hb t9 - INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */ - - /* Return from handler Make sure interrupts are disabled */ - di - ehb - - /* - * XXXKYMA: k0/k1 could have been blown away if we processed - * an exception while we were handling the exception from the - * guest, reload k1 - */ - - move k1, s1 - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* - * Check return value, should tell us if we are returning to the - * host (handle I/O etc)or resuming the guest - */ - andi t0, v0, RESUME_HOST - bnez t0, __kvm_mips_return_to_host - nop - -__kvm_mips_return_to_guest: - /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ - mtc0 s1, CP0_DDATA_LO - - /* Load up the Guest EBASE to minimize the window where BEV is set */ - LONG_L t0, VCPU_GUEST_EBASE(k1) - - /* Switch EBASE back to the one used by KVM */ - mfc0 v1, CP0_STATUS - or k0, v1, ST0_BEV - mtc0 k0, CP0_STATUS - ehb - mtc0 t0, CP0_EBASE - - /* Setup status register for running guest in UM */ - or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) - and v1, v1, ~(ST0_CU0 | ST0_MX) - mtc0 v1, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* load the guest context from VCPU and return */ - LONG_L $0, VCPU_R0(k1) - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, 
VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* $/k1 loaded later */ - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - -FEXPORT(__kvm_mips_skip_guest_restore) - LONG_L k0, VCPU_HI(k1) - mthi k0 - - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - eret - .set at - -__kvm_mips_return_to_host: - /* EBASE is already pointing to Linux */ - LONG_L k1, VCPU_HOST_STACK(k1) - INT_ADDIU k1,k1, -PT_SIZE - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(k1) - mtc0 k0, CP0_DDATA_LO - - /* - * r2/v0 is the return code, shift it down by 2 (arithmetic) - * to recover the err code - */ - INT_SRA k0, v0, 2 - move $2, k0 - - /* Load context saved on the host stack */ - LONG_L $16, PT_R16(k1) - LONG_L $17, PT_R17(k1) - LONG_L $18, PT_R18(k1) - LONG_L $19, PT_R19(k1) - LONG_L $20, PT_R20(k1) - LONG_L $21, PT_R21(k1) - LONG_L $22, PT_R22(k1) - LONG_L $23, PT_R23(k1) - - LONG_L $28, PT_R28(k1) - LONG_L $29, PT_R29(k1) - LONG_L $30, PT_R30(k1) - - LONG_L k0, PT_HI(k1) - mthi k0 - - LONG_L k0, PT_LO(k1) - mtlo k0 - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Restore RA, which is the address we will return to */ - LONG_L ra, PT_R31(k1) - j ra - nop - -VECTOR_END(MIPSX(GuestExceptionEnd)) -.end MIPSX(GuestException) - -MIPSX(exceptions): - #### - ##### The exception handlers. - ##### - .word _C_LABEL(MIPSX(GuestException)) # 0 - .word _C_LABEL(MIPSX(GuestException)) # 1 - .word _C_LABEL(MIPSX(GuestException)) # 2 - .word _C_LABEL(MIPSX(GuestException)) # 3 - .word _C_LABEL(MIPSX(GuestException)) # 4 - .word _C_LABEL(MIPSX(GuestException)) # 5 - .word _C_LABEL(MIPSX(GuestException)) # 6 - .word _C_LABEL(MIPSX(GuestException)) # 7 - .word _C_LABEL(MIPSX(GuestException)) # 8 - .word _C_LABEL(MIPSX(GuestException)) # 9 - .word _C_LABEL(MIPSX(GuestException)) # 10 - .word _C_LABEL(MIPSX(GuestException)) # 11 - .word _C_LABEL(MIPSX(GuestException)) # 12 - .word _C_LABEL(MIPSX(GuestException)) # 13 - .word _C_LABEL(MIPSX(GuestException)) # 14 - .word _C_LABEL(MIPSX(GuestException)) # 15 - .word _C_LABEL(MIPSX(GuestException)) # 16 - .word _C_LABEL(MIPSX(GuestException)) # 17 - .word _C_LABEL(MIPSX(GuestException)) # 18 - .word _C_LABEL(MIPSX(GuestException)) # 19 - .word _C_LABEL(MIPSX(GuestException)) # 20 - .word _C_LABEL(MIPSX(GuestException)) # 21 - .word _C_LABEL(MIPSX(GuestException)) # 22 - .word _C_LABEL(MIPSX(GuestException)) # 23 - .word _C_LABEL(MIPSX(GuestException)) # 24 - .word _C_LABEL(MIPSX(GuestException)) # 25 - .word _C_LABEL(MIPSX(GuestException)) # 26 - .word _C_LABEL(MIPSX(GuestException)) # 27 - .word _C_LABEL(MIPSX(GuestException)) # 28 - .word _C_LABEL(MIPSX(GuestException)) # 29 - .word _C_LABEL(MIPSX(GuestException)) # 30 - .word _C_LABEL(MIPSX(GuestException)) # 31 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5a2b9034a05c..414b00074e29 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -245,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, } } +static inline void dump_handler(const char *symbol, void *start, void *end) +{ + u32 *p; + + pr_debug("LEAF(%s)\n", symbol); + + pr_debug("\t.set push\n"); + 
pr_debug("\t.set noreorder\n"); + + for (p = start; p < (u32 *)end; ++p) + pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p); + + pr_debug("\t.set\tpop\n"); + + pr_debug("\tEND(%s)\n", symbol); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - int err, size, offset; - void *gebase; + int err, size; + void *gebase, *p, *handler; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -286,41 +303,38 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Save new ebase */ vcpu->arch.guest_ebase = gebase; - /* Copy L1 Guest Exception handler to correct offset */ + /* Build guest exception vectors dynamically in unmapped memory */ + handler = gebase + 0x2000; /* TLB Refill, EXL = 0 */ - memcpy(gebase, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase, handler); /* General Exception Entry point */ - memcpy(gebase + 0x180, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x180, handler); /* For vectored interrupts poke the exception code @ all offsets 0-7 */ for (i = 0; i < 8; i++) { kvm_debug("L1 Vectored handler @ %p\n", gebase + 0x200 + (i * VECTORSPACING)); - memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING, + handler); } - /* General handler, relocate to unmapped space for sanity's sake */ - offset = 0x2000; - kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n", - gebase + offset, - mips32_GuestExceptionEnd - mips32_GuestException); + /* General exit handler */ + p = handler; + p = kvm_mips_build_exit(p); - memcpy(gebase + offset, mips32_GuestException, - mips32_GuestExceptionEnd - mips32_GuestException); + /* Guest entry routine */ + vcpu->arch.vcpu_run = p; + p = kvm_mips_build_vcpu_run(p); -#ifdef MODULE - offset += mips32_GuestExceptionEnd - mips32_GuestException; - memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run, - __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run); - vcpu->arch.vcpu_run = gebase + offset; -#else - vcpu->arch.vcpu_run = __kvm_mips_vcpu_run; -#endif + /* Dump the generated code */ + pr_debug("#include <asm/asm.h>\n"); + pr_debug("#include <asm/regdef.h>\n"); + pr_debug("\n"); + dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); + dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); /* Invalidate the icache for these ranges */ local_flush_icache_range((unsigned long)gebase, @@ -406,7 +420,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - __kvm_guest_enter(); + guest_enter_irqoff(); /* Disable hardware page table walking while in guest */ htw_stop(); @@ -418,7 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Re-enable HTW before enabling interrupts */ htw_start(); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); if (vcpu->sigset_active) @@ -507,8 +521,10 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_R30, KVM_REG_MIPS_R31, +#ifndef CONFIG_CPU_MIPSR6 KVM_REG_MIPS_HI, KVM_REG_MIPS_LO, +#endif KVM_REG_MIPS_PC, KVM_REG_MIPS_CP0_INDEX, @@ -652,12 +668,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R0 ... 
KVM_REG_MIPS_R31: v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0]; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: v = (long)vcpu->arch.hi; break; case KVM_REG_MIPS_LO: v = (long)vcpu->arch.lo; break; +#endif case KVM_REG_MIPS_PC: v = (long)vcpu->arch.pc; break; @@ -873,12 +891,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31: vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: vcpu->arch.hi = v; break; case KVM_REG_MIPS_LO: vcpu->arch.lo = v; break; +#endif case KVM_REG_MIPS_PC: vcpu->arch.pc = v; break; @@ -1763,6 +1783,10 @@ static int __init kvm_mips_init(void) { int ret; + ret = kvm_mips_entry_setup(); + if (ret) + return ret; + ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index a38bdab68574..c858cf168078 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -20,50 +20,32 @@ /* * Tracepoints for VM enters */ -TRACE_EVENT(kvm_enter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), - - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), - - TP_printk("PC: 0x%08lx", - __entry->pc) -); - -TRACE_EVENT(kvm_reenter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), - - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), - - TP_printk("PC: 0x%08lx", - __entry->pc) +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) ); -TRACE_EVENT(kvm_out, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_printk("PC: 0x%08lx", - __entry->pc) -); +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); /* The first 32 exit reasons correspond to Cause.ExcCode */ #define KVM_TRACE_EXIT_INT 0 diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 00e8dc3d36cb..091553942bcb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -431,9 +431,15 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* * Arch specific stuff, set up config registers properly so that the - * guest will come up as expected, for now we simulate a MIPS 24kc + * guest will come up as expected */ +#ifndef CONFIG_CPU_MIPSR6 + /* r2-r5, simulate a MIPS 24kc */ kvm_write_c0_guest_prid(cop0, 0x00019300); +#else + /* r6+, simulate a generic QEMU machine */ + kvm_write_c0_guest_prid(cop0, 0x00010000); +#endif /* * Have config1, Cacheable, noncoherent, write-back, write allocate. 
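The KVM_REG_MIPS_* cases above back the generic one-reg interface, so the R6 #ifndef simply means KVM_REG_MIPS_HI/LO are no longer offered on MIPSr6 hosts. A minimal userspace sketch of reading one of these registers through KVM_GET_ONE_REG (vcpu_fd is assumed to be an already-created KVM vcpu file descriptor; the register IDs are the uapi KVM_REG_MIPS_* constants used above; error handling elided):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read the guest PC via the one-reg interface served by kvm_mips_get_reg(). */
static int read_guest_pc(int vcpu_fd, uint64_t *pc)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_MIPS_PC,
		.addr = (uintptr_t)pc,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

On an R6 host the same call with KVM_REG_MIPS_HI or KVM_REG_MIPS_LO should now fail (EINVAL), matching the #ifndef above.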
* Endianness, arch revision & virtually tagged icache should match diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d96e912b9d44..6dc07fba187f 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: if (!cpu_has_mips_r6) break; if (insn.i_format.rt && !insn.i_format.rs) @@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.next_pc_inc; return 1; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) break; *contpc = regs->cp0_epc + dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) break; if (!insn.i_format.rs) diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index d78178daea4b..277cf52d80e1 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = { { insn_bltzl, 0, 0 }, { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM }, { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM }, + { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS }, + { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE }, + { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS }, + { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, + { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, { insn_dmfc0, 0, 0 }, { insn_dmtc0, 0, 0 }, @@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = { { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS }, { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS }, { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD }, + { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS }, + { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS }, { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD }, { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD }, { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM }, @@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...) 
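For reference, the two PRId values written by kvm_trap_emul_vcpu_setup() above decompose along the usual company/implementation split (a sketch using the conventional asm/cpu.h constants; only the raw values appear in the patch itself):

/* PRId: bits 23:16 company ID, bits 15:8 implementation, bits 7:0 revision. */
#define GUEST_PRID_24KC		0x00019300	/* PRID_COMP_MIPS | PRID_IMP_24K */
#define GUEST_PRID_QEMU_GENERIC	0x00010000	/* PRID_COMP_MIPS, generic QEMU CPU */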
op = ip->match; va_start(ap, opc); if (ip->fields & RS) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rt(va_arg(ap, u32)); else op |= build_rs(va_arg(ap, u32)); } if (ip->fields & RT) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rs(va_arg(ap, u32)); else op |= build_rt(va_arg(ap, u32)); diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 9c2220a45189..cec524167822 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -67,9 +67,14 @@ static struct insn insn_table[] = { #else { insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 }, #endif + { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD }, + { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE }, + { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD }, + { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE }, { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, + { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT }, { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE }, { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT }, { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET}, @@ -114,7 +119,13 @@ static struct insn insn_table[] = { { insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD }, { insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET}, { insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, + { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS }, + { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS }, +#ifndef CONFIG_CPU_MIPSR6 { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, +#else + { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD}, +#endif { insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM }, { insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD }, #ifndef CONFIG_CPU_MIPSR6 diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index ad718debc35a..3e0282d301d6 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -49,18 +49,19 @@ enum opcode { insn_invalid, insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, - insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm, - insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, + insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, + insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu, + insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, - insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, - insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, - insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, - insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, - insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield, - insn_lddir, insn_ldpte, + insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori, + insn_pref, insn_rfe, 
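The table entries added above are what allow the new MIPS KVM entry-code builder to emit FPU/MSA control-register moves and HI/LO accesses at run time. A rough kernel-context sketch of how the generated helpers might be used (operand choices are illustrative and assume the usual uasm_i_<op>(&buf, ...) calling convention produced by the Ip_/I_ macros):

#include <asm/uasm.h>

static u32 *emit_uasm_examples(u32 *p, unsigned int tmp_gpr)
{
	uasm_i_cfc1(&p, tmp_gpr, 31);	/* cfc1 tmp, $31: read FCSR into a GPR */
	uasm_i_ctc1(&p, tmp_gpr, 31);	/* ctc1 tmp, $31: write it back */
	uasm_i_mthi(&p, tmp_gpr);	/* mthi/mtlo take a single GPR operand */
	uasm_i_mtlo(&p, tmp_gpr);
	return p;
}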
insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, + insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, + insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, + insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, + insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -268,10 +269,15 @@ I_u1s2(_bltz) I_u1s2(_bltzl) I_u1u2s3(_bne) I_u2s3u1(_cache) +I_u1u2(_cfc1) +I_u2u1(_cfcmsa) +I_u1u2(_ctc1) +I_u2u1(_ctcmsa) I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) I_u3u1u2(_daddu) +I_u1(_di); I_u1u2(_divu) I_u2u1u3(_dsll) I_u2u1u3(_dsll32) @@ -301,6 +307,8 @@ I_u1(_mfhi) I_u1(_mflo) I_u1u2u3(_mtc0) I_u1u2u3(_mthc0) +I_u1(_mthi) +I_u1(_mtlo) I_u3u1u2(_mul) I_u2u1u3(_ori) I_u3u1u2(_or) diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h new file mode 100644 index 000000000000..88b4901ac4ee --- /dev/null +++ b/arch/powerpc/include/asm/hmi.h @@ -0,0 +1,45 @@ +/* + * Hypervisor Maintenance Interrupt header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#ifndef __ASM_PPC64_HMI_H__ +#define __ASM_PPC64_HMI_H__ + +#ifdef CONFIG_PPC_BOOK3S_64 + +#define CORE_TB_RESYNC_REQ_BIT 63 +#define MAX_SUBCORE_PER_CORE 4 + +/* + * sibling_subcore_state structure is used to co-ordinate all threads + * during HMI to avoid TB corruption. This structure is allocated once + * per each core and shared by all threads on that core. + */ +struct sibling_subcore_state { + unsigned long flags; + u8 in_guest[MAX_SUBCORE_PER_CORE]; +}; + +extern void wait_for_subcore_guest_exit(void); +extern void wait_for_tb_resync(void); +#else +static inline void wait_for_subcore_guest_exit(void) { } +static inline void wait_for_tb_resync(void) { } +#endif +#endif /* __ASM_PPC64_HMI_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 546540b91095..4b17bd058e01 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -25,6 +25,7 @@ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include <asm/kvm_book3s_asm.h> #endif +#include <asm/hmi.h> register struct paca_struct *local_paca asm("r13"); @@ -181,6 +182,11 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + /* + * Bitmap for sibling subcore status. 
See kvm/book3s_hv_ras.c for + * more details + */ + struct sibling_subcore_state *sibling_subcore_state; #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2da380fcc34c..6972a23433d3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o -obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o obj64-$(CONFIG_RELOCATABLE) += reloc_64.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC64) += vdso64/ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4c9440629128..0eba47e074b9 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -680,6 +680,8 @@ _GLOBAL(__replay_interrupt) BEGIN_FTR_SECTION cmpwi r3,0xe80 beq h_doorbell_common + cmpwi r3,0xe60 + beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common @@ -1172,7 +1174,7 @@ fwnmi_data_area: .globl hmi_exception_early hmi_exception_early: - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60) + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c new file mode 100644 index 000000000000..e3f738eb1cac --- /dev/null +++ b/arch/powerpc/kernel/hmi.c @@ -0,0 +1,56 @@ +/* + * Hypervisor Maintenance Interrupt (HMI) handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/paca.h> +#include <asm/hmi.h> + +void wait_for_subcore_guest_exit(void) +{ + int i; + + /* + * NULL bitmap pointer indicates that KVM module hasn't + * been loaded yet and hence no guests are running. + * If no KVM is in use, no need to co-ordinate among threads + * as all of them will always be in host and no one is going + * to modify TB other than the opal hmi handler. + * Hence, just return from here. 
+ */ + if (!local_paca->sibling_subcore_state) + return; + + for (i = 0; i < MAX_SUBCORE_PER_CORE; i++) + while (local_paca->sibling_subcore_state->in_guest[i]) + cpu_relax(); +} + +void wait_for_tb_resync(void) +{ + if (!local_paca->sibling_subcore_state) + return; + + while (test_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + cpu_relax(); +} diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 470ceebd2d23..bb5112908fb2 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -270,8 +270,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ - bl opal_call_realmode; \ + li r3,0; /* NULL argument */ \ + bl hmi_exception_realmode; \ + nop; \ ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 20: nop; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9229ba63c370..9ec95daccad9 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/switch_to.h> #include <asm/tm.h> #include <asm/debug.h> +#include <asm/hmi.h> #include <sysdev/fsl_pci.h> #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) @@ -307,9 +308,13 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); + wait_for_subcore_guest_exit(); + if (ppc_md.hmi_exception_early) ppc_md.hmi_exception_early(regs); + wait_for_tb_resync(); + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae5ca7a..2fd5580c8f6e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -52,6 +52,7 @@ #include <asm/switch_to.h> #include <asm/smp.h> #include <asm/dbell.h> +#include <asm/hmi.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) spin_unlock(&pvc->lock); - kvm_guest_enter(); + guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - kvm_guest_exit(); + guest_exit(); for (sub = 0; sub < core_info.n_subcores; ++sub) list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], @@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = { .hcall_implemented = kvmppc_hcall_impl_hv, }; +static int kvm_init_subcore_bitmap(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + struct sibling_subcore_state *sibling_subcore_state; + + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + /* Ignore if it is already allocated. 
*/ + if (paca[first_cpu].sibling_subcore_state) + continue; + + sibling_subcore_state = + kmalloc_node(sizeof(struct sibling_subcore_state), + GFP_KERNEL, node); + if (!sibling_subcore_state) + return -ENOMEM; + + memset(sibling_subcore_state, 0, + sizeof(struct sibling_subcore_state)); + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].sibling_subcore_state = sibling_subcore_state; + } + } + return 0; +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvm_init_subcore_bitmap(); + if (r) + return r; + kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 93b5f5c9b445..0fa70a9618d7 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -13,6 +13,9 @@ #include <linux/kernel.h> #include <asm/opal.h> #include <asm/mce.h> +#include <asm/machdep.h> +#include <asm/cputhreads.h> +#include <asm/hmi.h> /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) @@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { return kvmppc_realmode_mc_power7(vcpu); } + +/* Check if dynamic split is in force and return subcore size accordingly. */ +static inline int kvmppc_cur_subcore_size(void) +{ + if (local_paca->kvm_hstate.kvm_split_mode) + return local_paca->kvm_hstate.kvm_split_mode->subcore_size; + + return threads_per_subcore; +} + +void kvmppc_subcore_enter_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; +} + +void kvmppc_subcore_exit_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; +} + +static bool kvmppc_tb_resync_required(void) +{ + if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + return false; + + return true; +} + +static void kvmppc_tb_resync_done(void) +{ + clear_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags); +} + +/* + * kvmppc_realmode_hmi_handler() is called only by primary thread during + * guest exit path. + * + * There are multiple reasons why HMI could occur, one of them is + * Timebase (TB) error. If this HMI is due to TB error, then TB would + * have been in stopped state. The opal hmi handler Will fix it and + * restore the TB value with host timebase value. For HMI caused due + * to non-TB errors, opal hmi handler will not touch/restore TB register + * and hence there won't be any change in TB value. + * + * Since we are not sure about the cause of this HMI, we can't be sure + * about the content of TB register whether it holds guest or host timebase + * value. Hence the idea is to resync the TB on every HMI, so that we + * know about the exact state of the TB value. Resync TB call will + * restore TB to host timebase. + * + * Things to consider: + * - On TB error, HMI interrupt is reported on all the threads of the core + * that has encountered TB error irrespective of split-core mode. + * - The very first thread on the core that get chance to fix TB error + * would rsync the TB with local chipTOD value. + * - The resync TB is a core level action i.e. 
it will sync all the TBs + * in that core independent of split-core mode. This means if we trigger + * TB sync from a thread from one subcore, it would affect TB values of + * sibling subcores of the same core. + * + * All threads need to co-ordinate before making opal hmi handler. + * All threads will use sibling_subcore_state->in_guest[] (shared by all + * threads in the core) in paca which holds information about whether + * sibling subcores are in Guest mode or host mode. The in_guest[] array + * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset + * subcore status. Only primary threads from each subcore is responsible + * to set/unset its designated array element while entering/exiting the + * guset. + * + * After invoking opal hmi handler call, one of the thread (of entire core) + * will need to resync the TB. Bit 63 from subcore state bitmap flags + * (sibling_subcore_state->flags) will be used to co-ordinate between + * primary threads to decide who takes up the responsibility. + * + * This is what we do: + * - Primary thread from each subcore tries to set resync required bit[63] + * of paca->sibling_subcore_state->flags. + * - The first primary thread that is able to set the flag takes the + * responsibility of TB resync. (Let us call it as thread leader) + * - All other threads which are in host will call + * wait_for_subcore_guest_exit() and wait for in_guest[0-3] from + * paca->sibling_subcore_state to get cleared. + * - All the primary thread will clear its subcore status from subcore + * state in_guest[] array respectively. + * - Once all primary threads clear in_guest[0-3], all of them will invoke + * opal hmi handler. + * - Now all threads will wait for TB resync to complete by invoking + * wait_for_tb_resync() except the thread leader. + * - Thread leader will do a TB resync by invoking opal_resync_timebase() + * call and the it will clear the resync required bit. + * - All other threads will now come out of resync wait loop and proceed + * with individual execution. + * - On return of this function, primary thread will signal all + * secondary threads to proceed. + * - All secondary threads will eventually call opal hmi handler on + * their exit path. + */ + +long kvmppc_realmode_hmi_handler(void) +{ + int ptid = local_paca->kvm_hstate.ptid; + bool resync_req; + + /* This is only called on primary thread. */ + BUG_ON(ptid != 0); + __this_cpu_inc(irq_stat.hmi_exceptions); + + /* + * By now primary thread has already completed guest->host + * partition switch but haven't signaled secondaries yet. + * All the secondary threads on this subcore is waiting + * for primary thread to signal them to go ahead. + * + * For threads from subcore which isn't in guest, they all will + * wait until all other subcores on this core exit the guest. + * + * Now set the resync required bit. If you are the first to + * set this bit then kvmppc_tb_resync_required() function will + * return true. For rest all other subcores + * kvmppc_tb_resync_required() will return false. + * + * If resync_req == true, then this thread is responsible to + * initiate TB resync after hmi handler has completed. + * All other threads on this core will wait until this thread + * clears the resync required bit flag. + */ + resync_req = kvmppc_tb_resync_required(); + + /* Reset the subcore status to indicate it has exited guest */ + kvmppc_subcore_exit_guest(); + + /* + * Wait for other subcores on this core to exit the guest. 
+ * All the primary threads and threads from subcore that are + * not in guest will wait here until all subcores are out + * of guest context. + */ + wait_for_subcore_guest_exit(); + + /* + * At this point we are sure that primary threads from each + * subcore on this core have completed guest->host partition + * switch. Now it is safe to call HMI handler. + */ + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(NULL); + + /* + * Check if this thread is responsible to resync TB. + * All other threads will wait until this thread completes the + * TB resync. + */ + if (resync_req) { + opal_resync_timebase(); + /* Reset TB resync req bit */ + kvmppc_tb_resync_done(); + } else { + wait_for_tb_resync(); + } + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e571ad277398..0d246fca157a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -29,6 +29,7 @@ #include <asm/kvm_book3s_asm.h> #include <asm/book3s/64/mmu-hash.h> #include <asm/tm.h> +#include <asm/opal.h> #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) @@ -373,6 +374,18 @@ kvm_secondary_got_guest: lwsync std r0, HSTATE_KVM_VCORE(r13) + /* + * All secondaries exiting guest will fall through this path. + * Before proceeding, just check for HMI interrupt and + * invoke opal hmi handler. By now we are sure that the + * primary thread on this core/subcore has already made partition + * switch/TB resync and we are good to call opal hmi handler. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne kvm_no_guest + + li r3,0 /* NULL argument */ + bl hmi_exception_realmode /* * At this point we have finished executing in the guest. * We need to wait for hwthread_req to become zero, since @@ -428,6 +441,22 @@ kvm_no_guest: */ kvm_unsplit_nap: /* + * When secondaries are napping in kvm_unsplit_nap() with + * hwthread_req = 1, HMI goes ignored even though subcores are + * already exited the guest. Hence HMI keeps waking up secondaries + * from nap in a loop and secondaries always go back to nap since + * no vcore is assigned to them. This makes impossible for primary + * thread to get hold of secondary threads resulting into a soft + * lockup in KVM path. + * + * Let us check if HMI is pending and handle it before we go to nap. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 55f + li r3, 0 /* NULL argument */ + bl hmi_exception_realmode +55: + /* * Ensure that secondary doesn't nap when it has * its vcore pointer set. */ @@ -601,6 +630,11 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Mark the subcore state as inside guest */ + bl kvmppc_subcore_enter_guest + nop + ld r5, HSTATE_KVM_VCORE(r13) + ld r4, HSTATE_KVM_VCPU(r13) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ @@ -1683,6 +1717,23 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* If HMI, call kvmppc_realmode_hmi_handler() */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 27f + bl kvmppc_realmode_hmi_handler + nop + li r12, BOOK3S_INTERRUPT_HMI + /* + * At this point kvmppc_realmode_hmi_handler would have resync-ed + * the TB. Hence it is not required to subtract guest timebase + * offset from timebase. So, skip it. + * + * Also, do not call kvmppc_subcore_exit_guest() because it has + * been invoked as part of kvmppc_realmode_hmi_handler(). 
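The coordination implemented by kvmppc_realmode_hmi_handler() and described in its comment block is, at heart, a leader election: the first primary thread to set the resync-required bit performs the timebase resync, everyone else waits for the bit to clear, and nobody calls the platform handler until all subcores have left the guest. A stand-alone userspace model of that pattern (plain C11 atomics; this is an illustration of the protocol, not the kernel code, and the OPAL/timebase details are reduced to stubs):

#include <stdatomic.h>
#include <stdbool.h>

#define NR_SUBCORES 4				/* MAX_SUBCORE_PER_CORE */

struct core_state {
	atomic_bool resync_req;			/* models CORE_TB_RESYNC_REQ_BIT */
	atomic_int in_guest[NR_SUBCORES];	/* models in_guest[] */
};

static void platform_hmi_handler(void) { }	/* stands in for ppc_md.hmi_exception_early() */
static void resync_timebase(void) { }		/* stands in for opal_resync_timebase() */

/* Executed by the primary thread of each subcore when an HMI is seen. */
static void handle_core_hmi(struct core_state *s, int my_subcore)
{
	/* First thread to flip the flag becomes the resync leader. */
	bool leader = !atomic_exchange(&s->resync_req, true);

	/* Mark this subcore as out of the guest ... */
	atomic_store(&s->in_guest[my_subcore], 0);

	/* ... and wait until every subcore has left guest context. */
	for (int i = 0; i < NR_SUBCORES; i++)
		while (atomic_load(&s->in_guest[i]))
			;			/* cpu_relax() in the kernel */

	platform_hmi_handler();

	if (leader) {
		resync_timebase();
		atomic_store(&s->resync_req, false);
	} else {
		while (atomic_load(&s->resync_req))
			;			/* wait_for_tb_resync() in the kernel */
	}
}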
+ */ + b 30f + +27: /* Subtract timebase offset from timebase */ ld r8,VCORE_TB_OFFSET(r5) cmpdi r8,0 @@ -1698,8 +1749,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 +17: bl kvmppc_subcore_exit_guest + nop +30: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + /* Reset PCR */ -17: ld r0, VCORE_PCR(r5) + ld r0, VCORE_PCR(r5) cmpdi r0, 0 beq 18f li r0, 0 @@ -2461,6 +2517,8 @@ BEGIN_FTR_SECTION cmpwi r6, 3 /* hypervisor doorbell? */ beq 3f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 0xa /* Hypervisor maintenance ? */ + beq 4f li r3, 1 /* anything else, return 1 */ 0: blr @@ -2482,6 +2540,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, -1 blr + /* Woken up due to Hypervisor maintenance interrupt */ +4: li r12, BOOK3S_INTERRUPT_HMI + li r3, 1 + blr + /* * Determine what sort of external interrupt is pending (if any). * Returns: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8e4f64f0b774..b6cd730c33ef 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + guest_exit(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, int emul; program_interrupt: - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + /* + * shadow_srr1 only contains valid flags if we came here via + * a program exception. The other exceptions (emulation assist, + * FP unavailable, etc.) do not provide flags in SRR1, so use + * an illegal-instruction exception when injecting a program + * interrupt into the guest. + */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul != EMULATE_DONE) { @@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_clear_debug(vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Make sure we save the guest FPU/Altivec/VSX state */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4afae695899a..02b4672f7347 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. 
*/ /* Switch back to user space debug context */ @@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } trace_kvm_exit(exit_nr, vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 5cc2e7af3a7b..b379146de55b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); - kvmppc_core_queue_program(vcpu, 0); } } diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 6249cdc834d1..ed38f8114118 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return 0; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fea7653..1ac036e45ed4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - __kvm_guest_enter(); + guest_enter_irqoff(); return 1; } diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index e45b88a5d7e0..df6ad949e35f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \ OPAL_BRANCH(opal_tracepoint_entry) \ mfcr r12; \ stw r12,8(r1); \ - std r1,PACAR1(r13); \ li r11,0; \ mfmsr r12; \ ori r11,r11,MSR_EE; \ @@ -127,7 +126,6 @@ opal_tracepoint_entry: mfcr r12 std r11,16(r1) stw r12,8(r1) - std r1,PACAR1(r13) li r11,0 mfmsr r12 ori r11,r11,MSR_EE diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 946fc86202fd..183b01727de4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -43,6 +43,7 @@ /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -666,6 +667,7 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 1e0849e20965..31a05330d11c 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -439,6 +439,23 @@ exit_required: #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_EVENT_IFETCH >> 24, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). 
+ */ + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + static void filter_guest_per_event(struct kvm_vcpu *vcpu) { u32 perc = vcpu->arch.sie_block->perc << 24; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9359f65c8634..7a2f1551bc39 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -359,11 +359,16 @@ static int handle_operexc(struct kvm_vcpu *vcpu) test_kvm_facility(vcpu->kvm, 74)) return handle_sthyi(vcpu); + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { + int rc, per_rc = 0; + if (kvm_is_ucontrol(vcpu->kvm)) return -EOPNOTSUPP; @@ -372,7 +377,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x18: return handle_noop(vcpu); case 0x04: - return handle_instruction(vcpu); + rc = handle_instruction(vcpu); + break; case 0x08: return handle_prog(vcpu); case 0x14: @@ -384,10 +390,18 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x28: return handle_stop(vcpu); case 0x2c: - return handle_operexc(vcpu); + rc = handle_operexc(vcpu); + break; case 0x38: - return handle_partial_execution(vcpu); + rc = handle_partial_execution(vcpu); + break; default: return -EOPNOTSUPP; } + + /* process PER, also if the instrution is processed in user space */ + if (vcpu->arch.sie_block->icptstatus & 0x02 && + (!rc || rc == -EOPNOTSUPP)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ca19627779db..24524c0f3ef8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2246,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int ret; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 03eeeb0ded24..63ac7c1641a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -364,6 +364,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -456,6 +457,16 @@ out: return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -507,6 +518,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -1836,6 +1853,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; /* make vcpu_load load the right gmap on the first trigger */ vcpu->arch.enabled_gmap = vcpu->arch.gmap; } @@ -1923,8 +1942,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } 
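The new user_instr0 flag above is toggled through the generic per-VM KVM_ENABLE_CAP path, so a userspace VMM opts in once per VM; operation exceptions for instruction code 0x0000 (ipa == 0) are then forwarded to it instead of being injected as PGM_OPERATION. A minimal sketch (assumes the KVM_CAP_S390_USER_INSTR0 constant added by this series is visible in the uapi headers):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Ask KVM to hand 0x0000 operation exceptions to user space. */
static int enable_user_instr0(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_S390_USER_INSTR0,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}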
vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (test_kvm_facility(vcpu->kvm, 74)) - vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); @@ -2369,6 +2386,11 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); @@ -2623,14 +2645,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 031f451bb2cf..b8432862a817 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -238,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) } static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) { + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } @@ -377,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c77ad2dc334f..46160388e996 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1185,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) return 0; } +static int handle_ptff(struct kvm_vcpu *vcpu) +{ + /* we don't emulate any control instructions yet */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + static const intercept_handler_t x01_handlers[256] = { + [0x04] = handle_ptff, [0x07] = handle_sckpf, }; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6895e7b3be12..c106488b4137 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -765,13 +765,13 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); local_irq_disable(); - kvm_guest_enter(); + guest_enter_irqoff(); local_irq_enable(); rc = sie64a(scb_s, vcpu->run->s.regs.gprs); local_irq_disable(); - kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 360c5171ea1a..9fcb197aa5ce 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -34,8 +34,9 @@ #include <asm/asm.h> #include <asm/kvm_page_track.h> -#define KVM_MAX_VCPUS 255 -#define KVM_SOFT_MAX_VCPUS 160 +#define KVM_MAX_VCPUS 288 +#define KVM_SOFT_MAX_VCPUS 240 +#define KVM_MAX_VCPU_ID 1023 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -598,6 +599,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 
mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -681,9 +683,12 @@ struct kvm_arch_memory_slot { struct kvm_apic_map { struct rcu_head rcu; u8 mode; - struct kvm_lapic *phys_map[256]; - /* first index is cluster id second is cpu id in a cluster */ - struct kvm_lapic *logical_map[16][16]; + u32 max_apic_id; + union { + struct kvm_lapic *xapic_flat_map[8]; + struct kvm_lapic *xapic_cluster_map[16][4]; + }; + struct kvm_lapic *phys_map[]; }; /* Hyper-V emulation context */ @@ -778,6 +783,9 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + + bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { @@ -1008,6 +1016,8 @@ struct kvm_x86_ops { int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1028,7 +1038,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1082,6 +1092,8 @@ extern u64 kvm_max_tsc_scaling_ratio; /* 1ull << kvm_tsc_scaling_ratio_frac_bits */ extern u64 kvm_default_tsc_scaling_ratio; +extern u64 kvm_mce_cap_supported; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ @@ -1356,7 +1368,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index dfb4c6476877..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? 
+ (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; @@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm->arch.nr_reserved_ioapic_pins); for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { - u32 dest_id, dest_mode; - bool level; + struct kvm_lapic_irq irq; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - dest_id = (entry->msi.address_lo >> 12) & 0xff; - dest_mode = (entry->msi.address_lo >> 2) & 0x1; - level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; - if (level && kvm_apic_match_dest(vcpu, NULL, 0, - dest_id, dest_mode)) { - u32 vector = entry->msi.data & 0xff; - - __set_bit(vector, - ioapic_handled_vectors); - } + + kvm_set_msi_irq(vcpu->kvm, entry, &irq); + + if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + irq.dest_id, irq.dest_mode)) + __set_bit(irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fdc05ae08bac..6895fd28aae9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -/* The logical map is definitely wrong if we have multiple - * modes at the same time. (Physical map is always right.) 
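Taken together, kvm_set_msi_irq() and kvm_msi_route_invalid() above define how a 32-bit destination ID is carried by an MSI route once kvm->arch.x2apic_format is set: bits 7-0 stay in the usual address_lo destination field (bits 19-12), bits 31-8 are taken from address_hi, and address_hi bits 7-0 must be zero or the route is rejected. A small VMM-side packing sketch mirroring that decoding (field positions follow the code above; the struct and helper are illustrative):

#include <stdint.h>

struct msi_route {
	uint32_t address_lo;
	uint32_t address_hi;
	uint32_t data;
};

/* Pack a 32-bit destination ID the way kvm_set_msi_irq() unpacks it. */
static void msi_set_dest(struct msi_route *m, uint32_t dest_id)
{
	m->address_lo &= ~(0xffu << 12);		/* destination ID field */
	m->address_lo |= (dest_id & 0xffu) << 12;	/* ID bits 7-0 */
	m->address_hi  = dest_id & 0xffffff00u;		/* ID bits 31-8; low byte stays 0 */
}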
- */ -static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) -{ - return !(map->mode & (map->mode - 1)); +static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, + u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { + switch (map->mode) { + case KVM_APIC_MODE_X2APIC: { + u32 offset = (dest_id >> 16) * 16; + u32 max_apic_id = map->max_apic_id; + + if (offset <= max_apic_id) { + u8 cluster_size = min(max_apic_id - offset + 1, 16U); + + *cluster = &map->phys_map[offset]; + *mask = dest_id & (0xffff >> (16 - cluster_size)); + } else { + *mask = 0; + } + + return true; + } + case KVM_APIC_MODE_XAPIC_FLAT: + *cluster = map->xapic_flat_map; + *mask = dest_id & 0xff; + return true; + case KVM_APIC_MODE_XAPIC_CLUSTER: + *cluster = map->xapic_cluster_map[dest_id >> 4]; + *mask = dest_id & 0xf; + return true; + default: + /* Not optimized. */ + return false; + } } -static inline void -apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) +static void kvm_apic_map_free(struct rcu_head *rcu) { - unsigned lid_bits; - - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); - BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); - lid_bits = map->mode; + struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - *cid = dest_id >> lid_bits; - *lid = dest_id & ((1 << lid_bits) - 1); + kvfree(map); } static void recalculate_apic_map(struct kvm *kvm) @@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - - new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + u32 max_id = 255; mutex_lock(&kvm->arch.apic_map_lock); + kvm_for_each_vcpu(i, vcpu, kvm) + if (kvm_apic_present(vcpu)) + max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + + new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + if (!new) goto out; + new->max_apic_id = max_id; + kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; + struct kvm_lapic **cluster; + u16 mask; u32 ldr, aid; if (!kvm_apic_present(vcpu)) @@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (aid < ARRAY_SIZE(new->phys_map)) + if (aid <= new->max_apic_id) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { @@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; } - if (!kvm_apic_logical_map_valid(new)) + if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) continue; - apic_logical_id(new, ldr, &cid, &lid); - - if (lid && cid < ARRAY_SIZE(new->logical_map)) - new->logical_map[cid][ffs(lid) - 1] = apic; + if (mask) + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, @@ -189,7 +213,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } @@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } } -static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); @@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } -static inline void 
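recalculate_apic_map() now sizes the map for the largest APIC ID actually present and frees the old map with call_rcu() plus a kvfree()-based callback instead of kfree_rcu(), since kvm_kvzalloc() can fall back to vmalloc for large ID spaces and plain kfree() would then be wrong. The allocation pattern in isolation (a sketch in kernel context; max_id is the highest APIC ID found among the vcpus):

static struct kvm_apic_map *alloc_apic_map(u32 max_id)
{
	struct kvm_apic_map *map;

	map = kvm_kvzalloc(sizeof(*map) +
			   sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
	if (map)
		map->max_apic_id = max_id;
	return map;	/* released later via call_rcu() -> kvm_apic_map_free() -> kvfree() */
}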
kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } @@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } -bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) { - struct kvm_apic_map *map; - unsigned long bitmap = 1; - struct kvm_lapic **dst; - int i; - bool ret, x2apic_ipi; + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } - *r = -1; + return false; +} - if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); - return true; - } +/* Return true if the interrupt can be handled by using *bitmap as index mask + * for valid destinations in *dst array. + * Return false if kvm_apic_map_get_dest_lapic did nothing useful. 
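kvm_apic_set_x2apic_id() and kvm_apic_map_get_logical_dest() above encode and decode the same x2APIC logical ID layout: a cluster number in bits 31-16 and a 16-CPU bitmask in bits 15-0, where cluster n covers physical APIC IDs n*16 .. n*16+15, which is why the lookup can index phys_map directly. A worked sketch of the decode side (illustrative helper; the kernel additionally clips the mask when a cluster extends past max_apic_id):

#include <stdint.h>

/* Example: dest_id 0x00030005 -> cluster base 48, mask 0x0005,
 * i.e. physical APIC IDs 48 and 50. */
static void x2apic_logical_decode(uint32_t dest_id,
				  uint32_t *cluster_base, uint16_t *mask)
{
	*cluster_base = (dest_id >> 16) * 16;
	*mask = dest_id & 0xffff;
}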
+ * Note: we may have zero kvm_lapic destinations when we return true, which + * means that the interrupt should be dropped. In this case, *bitmap would be + * zero and *dst undefined. + */ +static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, + struct kvm_lapic **src, struct kvm_lapic_irq *irq, + struct kvm_apic_map *map, struct kvm_lapic ***dst, + unsigned long *bitmap) +{ + int i, lowest; - if (irq->shorthand) + if (irq->shorthand == APIC_DEST_SELF && src) { + *dst = src; + *bitmap = 1; + return true; + } else if (irq->shorthand) return false; - x2apic_ipi = src && apic_x2apic_mode(src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; - ret = true; - rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); - - if (!map) { - ret = false; - goto out; + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id > map->max_apic_id) { + *bitmap = 0; + } else { + *dst = &map->phys_map[irq->dest_id]; + *bitmap = 1; + } + return true; } - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; + *bitmap = 0; + if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, + (u16 *)bitmap)) + return false; - dst = &map->phys_map[irq->dest_id]; - } else { - u16 cid; + if (!kvm_lowest_prio_delivery(irq)) + return true; - if (!kvm_apic_logical_map_valid(map)) { - ret = false; - goto out; + if (!kvm_vector_hashing_enabled()) { + lowest = -1; + for_each_set_bit(i, bitmap, 16) { + if (!(*dst)[i]) + continue; + if (lowest < 0) + lowest = i; + else if (kvm_apic_compare_prio((*dst)[i]->vcpu, + (*dst)[lowest]->vcpu) < 0) + lowest = i; } + } else { + if (!*bitmap) + return true; - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), + bitmap, 16); - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; + if (!(*dst)[lowest]) { + kvm_apic_disabled_lapic_found(kvm); + *bitmap = 0; + return true; + } + } - dst = map->logical_map[cid]; + *bitmap = (lowest >= 0) ? 1 << lowest : 0; - if (!kvm_lowest_prio_delivery(irq)) - goto set_irq; + return true; +} - if (!kvm_vector_hashing_enabled()) { - int l = -1; - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (l < 0) - l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, - dst[l]->vcpu) < 0) - l = i; - } - bitmap = (l >= 0) ? 1 << l : 0; - } else { - int idx; - unsigned int dest_vcpus; +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +{ + struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; + int i; + bool ret; - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; + *r = -1; - idx = kvm_vector_to_index(irq->vector, - dest_vcpus, &bitmap, 16); + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + return true; + } - if (!dst[idx]) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); - bitmap = (idx >= 0) ? 
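For lowest-priority delivery the helper above either scans the candidate mask for the lowest arbitration priority or, when vector hashing is enabled, derives a deterministic pick from the vector number via kvm_vector_to_index(). A compact model of the hashed choice (assuming the pick is the (vector % n)-th set bit of the mask, n being the number of candidates, which matches how the existing helper has behaved):

#include <stdint.h>

static int pick_hashed_dest(uint16_t mask, uint8_t vector)
{
	int n = __builtin_popcount(mask);
	int idx, i;

	if (!n)
		return -1;
	idx = vector % n;
	for (i = 0; i < 16; i++) {
		if (!(mask & (1u << i)))
			continue;
		if (idx-- == 0)
			return i;	/* this candidate handles the vector */
	}
	return -1;
}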
1 << idx : 0; + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); + if (ret) + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - } -set_irq: - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (*r < 0) - *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - } -out: rcu_read_unlock(); return ret; } @@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; bool ret = false; - struct kvm_lapic *dst = NULL; if (irq->shorthand) return false; @@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) - goto out; - - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id == 0xFF) - goto out; - - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = map->phys_map[irq->dest_id]; - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; - } else { - u16 cid; - unsigned long bitmap = 1; - int i, r = 0; - - if (!kvm_apic_logical_map_valid(map)) - goto out; - - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - if (kvm_vector_hashing_enabled() && - kvm_lowest_prio_delivery(irq)) { - int idx; - unsigned int dest_vcpus; + if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && + hweight16(bitmap) == 1) { + unsigned long i = find_first_bit(&bitmap, 16); - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, dest_vcpus, - &bitmap, 16); - - dst = map->logical_map[cid][idx]; - if (!dst) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - *dest_vcpu = dst->vcpu; - } else { - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) - goto out; - } - - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; + if (dst[i]) { + *dest_vcpu = dst[i]->vcpu; + ret = true; } } - ret = true; -out: rcu_read_unlock(); return ret; } @@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; case APIC_ARBPRI: apic_debug("Access APIC ARBPRI register which is for P6\n"); break; @@ -1349,41 +1352,52 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); WARN_ON(swait_active(&vcpu->wq)); - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); apic_timer_expired(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + 
cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic) && - !atomic_read(&apic->lapic_timer.pending)) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) { - apic->lapic_timer.hv_timer_in_use = false; - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - } - } - trace_kvm_hv_timer_state(vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1395,8 +1409,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); if (atomic_read(&apic->lapic_timer.pending)) return; @@ -1451,15 +1464,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (kvm_x86_ops->set_hv_timer && - !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } else + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) start_sw_tscdeadline(apic); } } @@ -1488,7 +1493,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - kvm_apic_set_id(apic, val >> 24); + kvm_apic_set_xapic_id(apic, val >> 24); else ret = 1; break; @@ -1749,9 +1754,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { - if (value & MSR_IA32_APICBASE_ENABLE) + if (value & MSR_IA32_APICBASE_ENABLE) { + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - else + } else static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } @@ -1791,8 +1797,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) { + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE); + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1931,9 +1940,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. 
*/ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - kvm_lapic_set_base(vcpu, - APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2013,17 +2019,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s, bool set) +{ + if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 *id = (u32 *)(s->regs + APIC_ID); + + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } + } + + return 0; +} + +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + return kvm_apic_state_fixup(vcpu, s, false); +} + +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; + int r; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + + r = kvm_apic_state_fixup(vcpu, s, true); + if (r) + return r; memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - /* call kvm_apic_set_id() to put apic into apic_map */ - kvm_apic_set_id(apic, kvm_apic_id(apic)); + + recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -2049,6 +2086,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; + + return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 336ba51bb16e..f60d01c29d51 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -81,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s); +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -200,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline int kvm_apic_id(struct kvm_lapic *apic) +static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. 
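
kvm_apic_state_fixup() above is what makes the layout of the APIC_ID register in struct kvm_lapic_state depend on both the vCPU's APIC mode and the new capability. Below is a sketch of the matching decode on the userspace side, assuming the VMM tracks the vCPU's x2APIC mode itself and knows whether it enabled KVM_X2APIC_API_USE_32BIT_IDS; the helper name and the locally defined register offset are illustrative.

    #include <stdint.h>
    #include <string.h>
    #include <linux/kvm.h>

    #define APIC_ID_REG     0x20    /* bytes 32-35 of the register page */

    /* Sketch: pull the APIC ID out of a KVM_GET_LAPIC snapshot. */
    static uint32_t lapic_state_apic_id(const struct kvm_lapic_state *s,
                                        int x2apic_mode, int ids_32bit)
    {
            uint32_t reg;

            memcpy(&reg, s->regs + APIC_ID_REG, sizeof(reg));

            if (x2apic_mode && ids_32bit)
                    return reg;             /* full 32-bit ID */
            return reg >> 24;               /* xAPIC layout: bits 31-24 */
    }

The set direction is symmetric: with the capability enabled, an x2APIC-mode ID that does not equal the vCPU id is rejected, as the -EINVAL path above shows.
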
+ */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 837bf23c5b06..3041902ec827 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,6 +175,7 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -282,13 +283,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_present_mask = p_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -304,7 +306,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); + return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -2245,10 +2247,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || - VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2515,13 +2516,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte; + u64 spte = 0; int ret = 0; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - spte = PT_PRESENT_MASK; + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; @@ -3189,7 +3196,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { + if (!(pdptr & PT_PRESENT_MASK)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } @@ -3914,9 +3921,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * clearer. 
*/ smap = cr4_smap && u && !uf && !ff; - } else - /* Not really needed: no U/S accesses on ept */ - u = 1; + } fault = (ff && !x) || (uf && !u) || (wf && !w) || (smapf && smap); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 66b33b96a31b..ddc56e91f2e4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bc019f70e0b6..a01105485315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT - return is_present_gpte(pte); + return pte & PT_PRESENT_MASK; #else return pte & 7; #endif @@ -181,13 +181,19 @@ no_present: return true; } +/* + * For PTTYPE_EPT, a page table can be executable but not readable + * on supported processors. Therefore, set_spte does not automatically + * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK + * to signify readability since it isn't used in the EPT case + */ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ACC_USER_MASK; + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5ff292778110..5bfdbbf1ce79 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4935,6 +4935,12 @@ out: static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e185649fb8b7..b61cdadf8623 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -428,7 +428,6 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -612,6 +611,14 @@ struct vcpu_vmx { bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. 
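
The valid-bits scheme described in the comment above is what the new vmx_feature_control_msr_valid() check enforces for both guest and host writes. As an illustration, here is roughly how host userspace might perform a host-initiated write of MSR_IA32_FEATURE_CONTROL through KVM_SET_MSRS; the helper, the vcpu_fd argument and the locally redefined constants (the kernel keeps them in msr-index.h, which is not exported) are assumptions of the sketch, and the VMXON bit is only accepted for a vCPU whose CPUID exposes VMX.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define MSR_IA32_FEATURE_CONTROL                  0x0000003a
    #define FEATURE_CONTROL_LOCKED                    (1ULL << 0)
    #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1ULL << 2)

    /* Sketch: lock the feature-control MSR with VMX enabled outside SMX. */
    static int lock_feature_control(int vcpu_fd)
    {
            struct {
                    struct kvm_msrs hdr;
                    struct kvm_msr_entry entry;
            } msrs = {
                    .hdr.nmsrs   = 1,
                    .entry.index = MSR_IA32_FEATURE_CONTROL,
                    .entry.data  = FEATURE_CONTROL_LOCKED |
                                   FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX,
            };

            /* KVM_SET_MSRS returns the number of MSRs written, 1 on success;
             * bits outside msr_ia32_feature_control_valid_bits are refused. */
            return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
    }
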
+ */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -1105,7 +1112,7 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; @@ -1114,9 +1121,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) static inline bool cpu_has_vmx_preemption_timer(void) { - if (cpu_has_broken_vmx_preemption_timer()) - return false; - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER; } @@ -1668,6 +1672,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1696,6 +1705,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -2780,6 +2794,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + vmx->nested.nested_vmx_ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific @@ -2928,6 +2945,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -2969,10 +2994,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; - case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3062,12 +3092,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -3362,12 +3400,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3379,9 +3417,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -6081,12 +6120,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* It is a write fault? */ - error_code = exit_qualification & PFERR_WRITE_MASK; + /* it is a read fault? */ + error_code = (exit_qualification << 2) & PFERR_USER_MASK; + /* it is a write fault? */ + error_code |= exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; + error_code |= (exit_qualification & 0x38) != 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6420,9 +6461,6 @@ static __init int hardware_setup(void) for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); - /* According SDM, in x2apic mode, the whole id reg is used. But in - * KVM, it only use the highest eight bits. Need to intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ @@ -6433,10 +6471,12 @@ static __init int hardware_setup(void) vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 
+ 0ull : VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else @@ -6471,6 +6511,8 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -6939,7 +6981,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; @@ -8012,6 +8054,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PCOMMIT: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8545,7 +8589,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8558,8 +8601,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) @@ -9056,6 +9098,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + return &vmx->vcpu; free_vmcs: @@ -9203,6 +9247,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_PCOMMIT; } + + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9759,9 +9810,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; + + /* Preemption timer setting is only taken from vmcs01. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Posted interrupts setting is only taken from vmcs12. 
*/ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -10680,8 +10736,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -10690,8 +10746,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10793,9 +10855,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(), delta_tsc; - - delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -11042,7 +11104,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. 
*/ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if @@ -11087,6 +11149,16 @@ out: return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11216,6 +11288,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_hv_timer = vmx_set_hv_timer, .cancel_hv_timer = vmx_cancel_hv_timer, #endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 299219630c94..a27b33033700 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -70,7 +70,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -89,6 +90,9 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); @@ -539,7 +543,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && + if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; @@ -984,6 +988,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -1163,7 +1168,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1186,13 +1191,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. 
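
kvm_set_msi_irq() growing a struct kvm argument is what lets the MSI path honor the per-VM x2APIC settings when it reassembles the destination from address_lo and address_hi. The sketch below shows a matching userspace encoding for KVM_SIGNAL_MSI aimed at a physical destination that may be above 255; signal_msi(), its arguments and the hard-coded fixed/edge data word are illustrative and assume KVM_X2APIC_API_USE_32BIT_IDS has been enabled on the VM.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: inject an MSI to a (possibly >255) physical APIC ID. */
    static int signal_msi(int vm_fd, uint32_t apic_id, uint8_t vector)
    {
            struct kvm_msi msi = {
                    /* ID bits 7-0 go into address bits 19-12 as usual... */
                    .address_lo = 0xfee00000 | ((apic_id & 0xff) << 12),
                    /* ...and ID bits 31-8 into address_hi bits 31-8,
                     * matching the decode in the kvm_msi trace event below. */
                    .address_hi = apic_id & 0xffffff00,
                    .data       = vector,   /* fixed delivery, edge trigger */
            };

            /* >0 if delivered, 0 if the guest blocked it, <0 on error. */
            return ioctl(vm_fd, KVM_SIGNAL_MSI, &msi);
    }
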
*/ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; @@ -2623,6 +2628,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -2685,11 +2693,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2779,15 +2785,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; + return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_apic_post_state_restore(vcpu, s); + int r; + + r = kvm_apic_set_state(vcpu, s); + if (r) + return r; update_cr8_intercept(vcpu); return 0; @@ -2872,7 +2880,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2882,6 +2890,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3794,6 +3805,18 @@ split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; + + r = 0; + break; default: r = -EINVAL; break; @@ -5875,8 +5898,8 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6655,7 +6678,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6706,16 +6729,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. 
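
With kvm_mce_cap_supported now exported and extended by VMX with MCG_LMCE_P, userspace is expected to query it before programming a vCPU's MCE banks. A sketch of that handshake follows; setup_vcpu_mce(), the descriptors and the locally redefined MCG_* bits (normally in the non-uapi asm/mce.h) are assumptions of the example.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define MCG_CTL_P       (1ULL << 8)
    #define MCG_SER_P       (1ULL << 24)
    #define MCG_LMCE_P      (1ULL << 27)

    /* Sketch: advertise 'banks' MCE banks (1-31) to a vCPU, opting into
     * LMCE only when the host reports support for it. */
    static int setup_vcpu_mce(int kvm_fd, int vcpu_fd, uint8_t banks)
    {
            uint64_t supported = 0, mcg_cap;

            if (ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &supported) < 0)
                    return -1;

            mcg_cap = banks |
                      (supported & (MCG_CTL_P | MCG_SER_P | MCG_LMCE_P));

            return ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
    }

When MCG_LMCE_P ends up in mcg_cap, the new setup_mce hook lets VMX add FEATURE_CONTROL_LMCE to the vCPU's valid feature-control bits.
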
- * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7911,7 +7927,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); } diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d259274238db..ff4a32d24d56 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -84,7 +84,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void guest_enter(void) +/* must be called with irqs disabled */ +static inline void guest_enter_irqoff(void) { if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); @@ -93,9 +94,19 @@ static inline void guest_enter(void) if (context_tracking_is_enabled()) __context_tracking_enter(CONTEXT_GUEST); + + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_cpu_is_enabled()) + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); @@ -107,7 +118,7 @@ static inline void guest_exit(void) } #else -static inline void guest_enter(void) +static inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe @@ -116,9 +127,10 @@ static inline void guest_enter(void) */ vtime_account_system(current); current->flags |= PF_VCPU; + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); @@ -126,4 +138,22 @@ static inline void guest_exit(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_enter_irqoff(); + local_irq_restore(flags); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 614a98137c5f..aafd702f3e21 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -877,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -/* must be called with irqs disabled */ -static inline void __kvm_guest_enter(void) -{ - guest_enter(); - /* KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. 
- */ - if (!context_tracking_cpu_is_enabled()) - rcu_virt_note_context_switch(smp_processor_id()); -} - -/* must be called with irqs disabled */ -static inline void __kvm_guest_exit(void) -{ - guest_exit(); -} - -static inline void kvm_guest_enter(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_enter(); - local_irq_restore(flags); -} - -static inline void kvm_guest_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_exit(); - local_irq_restore(flags); -} - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. @@ -1052,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index f28292d73ddb..8ade3eb6c640 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq, __entry->data = data; ), - TP_printk("dst %u vec %u (%s|%s|%s%s)", - (u8)(__entry->address >> 12), (u8)__entry->data, + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d8c4c324cfae..8f2756c263d4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,7 +866,9 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 -#define KVM_CAP_MSI_DEVID 129 +#define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 +#define KVM_CAP_MSI_DEVID 131 #ifdef KVM_CAP_IRQ_ROUTING @@ -1318,4 +1320,7 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #endif /* __LINUX_KVM_H */ @@ -720,6 +720,7 @@ retry: } return 0; } +EXPORT_SYMBOL_GPL(fixup_user_fault); static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 8db197bb6c7a..df99e9c3b64d 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm) free_irq_routing_table(rt); } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, +static int setup_routing_entry(struct kvm *kvm, + struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, e->gsi = ue->gsi; e->type = ue->type; - r = kvm_set_routing_entry(e, ue); + r = kvm_set_routing_entry(kvm, e, ue); if (r) goto out; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) @@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm, kfree(e); goto out; } - r = setup_routing_entry(new, e, ue); + r = setup_routing_entry(kvm, new, e, ue); if (r) { kfree(e); goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bd2eb92c5d0e..61b31a5f76c8 100644 
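
The context-tracking side of this series replaces the old kvm_guest_enter()/kvm_guest_exit() pair with guest_enter_irqoff()/guest_exit_irqoff() plus irq-safe guest_enter()/guest_exit() wrappers, as shown above. The schematic below only restates the calling convention; it is not any architecture's real run loop, and run_guest() is a placeholder for the arch-specific entry.

    #include <linux/context_tracking.h>
    #include <linux/irqflags.h>

    static void run_guest(void) { }         /* placeholder for the arch entry */

    static void vcpu_run_once(void)
    {
            local_irq_disable();
            guest_enter_irqoff();           /* must run with irqs off; also
                                             * notes an RCU quiescent state
                                             * when context tracking is
                                             * inactive */
            run_guest();

            guest_exit_irqoff();            /* likewise called with irqs off */
            local_irq_enable();
    }
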
--- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1442,6 +1442,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } +static int hva_to_pfn_remapped(struct vm_area_struct *vma, + unsigned long addr, bool *async, + bool write_fault, kvm_pfn_t *p_pfn) +{ + unsigned long pfn; + int r; + + r = follow_pfn(vma, addr, &pfn); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does + * not call the fault handler, so do it here. + */ + bool unlocked = false; + r = fixup_user_fault(current, current->mm, addr, + (write_fault ? FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + if (r) + return r; + + r = follow_pfn(vma, addr, &pfn); + if (r) + return r; + + } + + + /* + * Get a reference here because callers of *hva_to_pfn* and + * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the + * returned pfn. This is only needed if the VMA has VM_MIXEDMAP + * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * simply do nothing for reserved pfns. + * + * Whoever called remap_pfn_range is also going to call e.g. + * unmap_mapping_range before the underlying pages are freed, + * causing a call to our MMU notifier. + */ + kvm_get_pfn(pfn); + + *p_pfn = pfn; + return 0; +} + /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest @@ -1461,7 +1507,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; - int npages; + int npages, r; /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); @@ -1483,14 +1529,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, goto exit; } +retry: vma = find_vma_intersection(current->mm, addr, addr + 1); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; - else if ((vma->vm_flags & VM_PFNMAP)) { - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_reserved_pfn(pfn)); + else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + if (r == -EAGAIN) + goto retry; + if (r < 0) + pfn = KVM_PFN_ERR_FAULT; } else { if (async && vma_is_valid(vma, write_fault)) *async = true; |