diff options
238 files changed, 6000 insertions, 1794 deletions
diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 457d5ae16f23..3e17ac1d5d58 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -10,6 +10,7 @@ Required properties: Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on the Cadence GEM, or the generic form: "cdns,gem". Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs. + Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs. Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs. Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs. Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c664064f76fb..df98b6304769 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1922,6 +1922,7 @@ registers, find a list below: PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_PSSCR | 64 PPC | KVM_REG_PPC_DEC_EXPIRY | 64 + PPC | KVM_REG_PPC_PTCR | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 @@ -2269,6 +2270,10 @@ The supported flags are: The emulated MMU supports 1T segments in addition to the standard 256M ones. + - KVM_PPC_NO_HASH + This flag indicates that HPT guests are not supported by KVM, + thus all guests must use radix MMU mode. + The "slb_size" field indicates how many SLB entries are supported The "sps" array contains 8 entries indicating the supported base @@ -4510,7 +4515,8 @@ Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. Architectures: s390 Parameters: none Returns: 0 on success, -EINVAL if hpage module parameter was not set - or cmma is enabled + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL + flag set With this capability the KVM support for memory backing with 1m pages through hugetlbfs can be enabled for a VM. After the capability is @@ -4521,6 +4527,29 @@ hpage module parameter is not set to 1, -EINVAL is returned. While it is generally possible to create a huge page backed VM without this capability, the VM will not be able to run. +7.14 KVM_CAP_MSR_PLATFORM_INFO + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, +a #GP would be raised when the guest tries to access. Currently, this +capability does not enable write permissions of this MSR for the guest. + +7.16 KVM_CAP_PPC_NESTED_HV + +Architectures: ppc +Parameters: none +Returns: 0 on success, -EINVAL when the implementation doesn't support + nested-HV virtualization. + +HV-KVM on POWER9 and later systems allows for "nested-HV" +virtualization, which provides a way for a guest VM to run guests that +can run using the CPU's supervisor mode (privileged non-hypervisor +state). Enabling this capability on a VM depends on the CPU having +the necessary functionality and on the facility being enabled with a +kvm-hv module parameter. + 8. Other capabilities. ---------------------- diff --git a/MAINTAINERS b/MAINTAINERS index a2e401fe8d4e..1610fb26bdac 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13461,9 +13461,8 @@ F: drivers/i2c/busses/i2c-synquacer.c F: Documentation/devicetree/bindings/i2c/i2c-synquacer.txt SOCIONEXT UNIPHIER SOUND DRIVER -M: Katsuhiro Suzuki <suzuki.katsuhiro@socionext.com> L: alsa-devel@alsa-project.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: sound/soc/uniphier/ SOEKRIS NET48XX LED SUPPORT @@ -299,19 +299,7 @@ KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null) KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION) export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION -# SUBARCH tells the usermode build what the underlying arch is. That is set -# first, and if a usermode build is happening, the "ARCH=um" on the command -# line overrides the setting of ARCH below. If a native build is happening, -# then ARCH is assigned, getting whatever value it gets normally, and -# SUBARCH is subsequently ignored. - -SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ - -e s/sun4u/sparc64/ \ - -e s/arm.*/arm/ -e s/sa110/arm/ \ - -e s/s390x/s390/ -e s/parisc64/parisc/ \ - -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ - -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ - -e s/riscv.*/riscv/) +include scripts/subarch.include # Cross compiling and selecting different set of gcc/bin-utils # --------------------------------------------------------------------------- diff --git a/arch/arm/boot/dts/sama5d3_emac.dtsi b/arch/arm/boot/dts/sama5d3_emac.dtsi index 7cb235ef0fb6..6e9e1c2f9def 100644 --- a/arch/arm/boot/dts/sama5d3_emac.dtsi +++ b/arch/arm/boot/dts/sama5d3_emac.dtsi @@ -41,7 +41,7 @@ }; macb1: ethernet@f802c000 { - compatible = "cdns,at91sam9260-macb", "cdns,macb"; + compatible = "atmel,sama5d3-macb", "cdns,at91sam9260-macb", "cdns,macb"; reg = <0xf802c000 0x100>; interrupts = <35 IRQ_TYPE_LEVEL_HIGH 3>; pinctrl-names = "default"; diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1f4691ce4126..c55ba3b4873b 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_count_cache; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); +void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); +#else +static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, + bool preserve_nv) { } +static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, + bool preserve_nv) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +void kvmhv_save_host_pmu(void); +void kvmhv_load_host_pmu(void); +void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); +void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); + +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); + +long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr); +long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr, + unsigned long dabrx); + #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index b3520b549cba..66db23e2f4dc 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +static inline unsigned int ap_to_shift(unsigned long ap) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (mmu_psize_defs[psize].ap == ap) + return mmu_psize_defs[psize].shift; + } + + return -1; +} + static inline unsigned long get_sllp_encoding(int psize) { unsigned long sllp; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 13a688fc8cd0..2fdc865ca374 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1051,7 +1051,6 @@ static inline void vmemmap_remove_mapping(unsigned long start, return hash__vmemmap_remove_mapping(start, page_size); } #endif -struct page *realmode_pfn_to_page(unsigned long pfn); static inline pte_t pmd_pte(pmd_t pmd) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 1154a6dc6d26..671316f9e95d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid, unsigned long addr, unsigned long page_size); extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a0b17f9f1ea4..45e8789bb770 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -322,6 +322,11 @@ #define H_GET_24X7_DATA 0xF07C #define H_GET_PERF_COUNTER_INFO 0xF080 +/* Platform-specific hcalls used for nested HV KVM */ +#define H_SET_PARTITION_TABLE 0xF800 +#define H_ENTER_NESTED 0xF804 +#define H_TLB_INVALIDATE 0xF808 + /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 #define H_SET_MODE_RESOURCE_SET_DAWR 2 @@ -461,6 +466,42 @@ struct h_cpu_char_result { u64 behaviour; }; +/* Register state for entering a nested guest with H_ENTER_NESTED */ +struct hv_guest_state { + u64 version; /* version of this structure layout */ + u32 lpid; + u32 vcpu_token; + /* These registers are hypervisor privileged (at least for writing) */ + u64 lpcr; + u64 pcr; + u64 amor; + u64 dpdes; + u64 hfscr; + s64 tb_offset; + u64 dawr0; + u64 dawrx0; + u64 ciabr; + u64 hdec_expiry; + u64 purr; + u64 spurr; + u64 ic; + u64 vtb; + u64 hdar; + u64 hdsisr; + u64 heir; + u64 asdr; + /* These are OS privileged but need to be set late in guest entry */ + u64 srr0; + u64 srr1; + u64 sprg[4]; + u64 pidr; + u64 cfar; + u64 ppr; +}; + +/* Latest version of hv_guest_state structure */ +#define HV_GUEST_STATE_VERSION 1 + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index ab3a4fba38e3..3d4b88cb8599 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -220,8 +220,6 @@ extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); -extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index a790d5cf6ea3..1f321914676d 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -84,7 +84,6 @@ #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 -#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501 #define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 #define BOOK3S_INTERRUPT_PROGRAM 0x700 @@ -134,8 +133,7 @@ #define BOOK3S_IRQPRIO_EXTERNAL 14 #define BOOK3S_IRQPRIO_DECREMENTER 15 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16 -#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 -#define BOOK3S_IRQPRIO_MAX 18 +#define BOOK3S_IRQPRIO_MAX 17 #define BOOK3S_HFLAG_DCBZ32 0x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 83a9aa3cf689..09f8e9ba69bc 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr); +extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p); +extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid); +extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, + bool writing, unsigned long gpa, + unsigned int lpid); +extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp); extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); +extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, + unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, + unsigned long gpa, unsigned int shift, + struct kvm_memory_slot *memslot, + unsigned int lpid); extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {} static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {} #endif +long kvmhv_nested_init(void); +void kvmhv_nested_exit(void); +void kvmhv_vm_nested_init(struct kvm *kvm); +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); +void kvmhv_release_all_nested(struct kvm *kvm); +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu); +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, + u64 time_limit, unsigned long lpcr); +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr); +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); + void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); extern int kvm_irq_bypass; @@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) @@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); /* TO = 31 for unconditional trap */ #define INS_TW 0x7fe00008 -/* LPIDs we support with this build -- runtime limit may be lower */ -#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) - #define SPLIT_HACK_MASK 0xff000000 #define SPLIT_HACK_OFFS 0xfb000000 diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index dc435a5af7d6..6d298145d564 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -23,6 +23,108 @@ #include <linux/string.h> #include <asm/bitops.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/cpu_has_feature.h> +#include <asm/ppc-opcode.h> + +#ifdef CONFIG_PPC_PSERIES +static inline bool kvmhv_on_pseries(void) +{ + return !cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool kvmhv_on_pseries(void) +{ + return false; +} +#endif + +/* + * Structure for a nested guest, that is, for a guest that is managed by + * one of our guests. + */ +struct kvm_nested_guest { + struct kvm *l1_host; /* L1 VM that owns this nested guest */ + int l1_lpid; /* lpid L1 guest thinks this guest is */ + int shadow_lpid; /* real lpid of this nested guest */ + pgd_t *shadow_pgtable; /* our page table for this guest */ + u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ + u64 process_table; /* process table entry for this guest */ + long refcnt; /* number of pointers to this struct */ + struct mutex tlb_lock; /* serialize page faults and tlbies */ + struct kvm_nested_guest *next; + cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; + short prev_cpu[NR_CPUS]; +}; + +/* + * We define a nested rmap entry as a single 64-bit quantity + * 0xFFF0000000000000 12-bit lpid field + * 0x000FFFFFFFFFF000 40-bit guest 4k page frame number + * 0x0000000000000001 1-bit single entry flag + */ +#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL +#define RMAP_NESTED_LPID_SHIFT (52) +#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL +#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL + +/* Structure for a nested guest rmap entry */ +struct rmap_nested { + struct llist_node list; + u64 rmap; +}; + +/* + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries + * safe against removal of the list entry or NULL list + * @pos: a (struct rmap_nested *) to use as a loop cursor + * @node: pointer to the first entry + * NOTE: this can be NULL + * @rmapp: an (unsigned long *) in which to return the rmap entries on each + * iteration + * NOTE: this must point to already allocated memory + * + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the + * rmap entry in the memslot. The list is always terminated by a "single entry" + * stored in the list element of the final entry of the llist. If there is ONLY + * a single entry then this is itself in the rmap entry of the memslot, not a + * llist head pointer. + * + * Note that the iterator below assumes that a nested rmap entry is always + * non-zero. This is true for our usage because the LPID field is always + * non-zero (zero is reserved for the host). + * + * This should be used to iterate over the list of rmap_nested entries with + * processing done on the u64 rmap value given by each iteration. This is safe + * against removal of list entries and it is always safe to call free on (pos). + * + * e.g. + * struct rmap_nested *cursor; + * struct llist_node *first; + * unsigned long rmap; + * for_each_nest_rmap_safe(cursor, first, &rmap) { + * do_something(rmap); + * free(cursor); + * } + */ +#define for_each_nest_rmap_safe(pos, node, rmapp) \ + for ((pos) = llist_entry((node), typeof(*(pos)), list); \ + (node) && \ + (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \ + ((u64) (node)) : ((pos)->rmap))) && \ + (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \ + ((struct llist_node *) ((pos) = NULL)) : \ + (pos)->list.next)), true); \ + (pos) = llist_entry((node), typeof(*(pos)), list)) + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create); +void kvmhv_put_nested(struct kvm_nested_guest *gp); +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid); + +/* Encoding of first parameter for H_TLB_INVALIDATE */ +#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \ + ___PPC_R(r)) /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ #define PPC_MIN_HPT_ORDER 18 @@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); +extern void kvmhv_radix_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); @@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; @@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.lr_tm = vcpu->arch.regs.link; vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; @@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap); +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap); +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index d978fdf698af..eb3ba6390108 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -25,6 +25,9 @@ #define XICS_MFRR 0xc #define XICS_IPI 2 /* interrupt source # for IPIs */ +/* LPIDs we support with this build -- runtime limit may be lower */ +#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) + /* Maximum number of threads per physical core */ #define MAX_SMT_THREADS 8 diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index d513e3ed1c65..f0cef625f17c 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 906bcbdfd2a1..fac6f631ed29 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -46,6 +46,7 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ #define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) +#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS #else #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS @@ -94,6 +95,7 @@ struct dtl_entry; struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; +struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; @@ -287,10 +289,12 @@ struct kvm_arch { u8 radix; u8 fwnmi_enabled; bool threads_indep; + bool nested_enable; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct dentry *radix_dentry; struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -311,6 +315,9 @@ struct kvm_arch { #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + u64 l1_ptcr; + int max_nested_lpid; + struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; /* This array can grow quite large, keep it at the end */ struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif @@ -360,7 +367,9 @@ struct kvmppc_pte { bool may_write : 1; bool may_execute : 1; unsigned long wimg; + unsigned long rc; u8 page_size; /* MMU_PAGE_xxx */ + u8 page_shift; }; struct kvmppc_mmu { @@ -537,8 +546,6 @@ struct kvm_vcpu_arch { ulong tar; #endif - u32 cr; - #ifdef CONFIG_PPC_BOOK3S ulong hflags; ulong guest_owned_ext; @@ -707,6 +714,7 @@ struct kvm_vcpu_arch { u8 hcall_needed; u8 epr_flags; /* KVMPPC_EPR_xxx */ u8 epr_needed; + u8 external_oneshot; /* clear external irq after delivery */ u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ @@ -781,6 +789,10 @@ struct kvm_vcpu_arch { u32 emul_inst; u32 online; + + /* For support of nested guests */ + struct kvm_nested_guest *nested; + u32 nested_vcpu_id; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e991821dd7fa..9b89b1918dfc 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table( (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ (stt)->size, (ioba), (npages)) ? \ H_PARAMETER : H_SUCCESS) -extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, - unsigned long tce); -extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap); extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, unsigned long idx, unsigned long tce); @@ -327,6 +325,7 @@ struct kvmppc_ops { int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, unsigned long flags); void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); + int (*enable_nested)(struct kvm *kvm); }; extern struct kvmppc_ops *kvmppc_hv_ops; @@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ /* @@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr); int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu); /* * Host-side operations we want to set up while running in real diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b2f89b621b15..b694d6af1150 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -38,6 +38,7 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 665af14850e4..6093bc8f74e5 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -104,6 +104,7 @@ #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MSGCLRP 174 +#define OP_31_XOP_TLBIE 306 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e5b314ed054e..c90698972f42 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -415,6 +415,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) +#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) @@ -766,6 +767,7 @@ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ #define HSRR1_DENORM 0x00100000 /* Denorm exception */ +#define HSRR1_HISI_WRITE 0x00010000 /* HISI bcs couldn't update mem */ #define SPRN_TBCTL 0x35f /* PA6T Timebase control register */ #define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 1b32b56a03d3..8c876c166ef2 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 89cf15566c4e..d0abcbbdc700 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -438,7 +438,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S OFFSET(VCPU_TAR, kvm_vcpu, arch.tar); #endif - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr); @@ -503,6 +503,7 @@ int main(void) OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); + OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif @@ -695,7 +696,7 @@ int main(void) #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer); OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link); OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 458b928dbd84..c317080db771 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -147,8 +147,8 @@ __init_hvmode_206: rldicl. r0,r3,4,63 bnelr ld r5,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) - xor r5,r5,r6 + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST) + andc r5,r5,r6 std r5,CPU_SPEC_FEATURES(r4) blr diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index af7a20dc6e09..19b4c628f3be 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1013,31 +1013,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); -#ifdef CONFIG_PPC_BOOK3S_64 -long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret; - - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); - - if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) { - struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); - - if (likely(pg)) { - SetPageDirty(pg); - } else { - tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); - ret = -EFAULT; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); -#endif - int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index f872c04bb5b1..e814f40ab836 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -75,7 +75,8 @@ kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o \ - book3s_64_mmu_radix.o + book3s_64_mmu_radix.o \ + book3s_hv_nested.o kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 87348e498c89..fd9893bc7aa1 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) { if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { ulong pc = kvmppc_get_pc(vcpu); + ulong lr = kvmppc_get_lr(vcpu); if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; } } @@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; - case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break; case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; @@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; - - if (irq->irq == KVM_INTERRUPT_SET_LEVEL) - vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; + /* + * This case (KVM_INTERRUPT_SET) should never actually arise for + * a pseries guest (because pseries guests expect their interrupt + * controllers to continue asserting an external interrupt request + * until it is acknowledged at the interrupt controller), but is + * included to avoid ABI breakage and potentially for other + * sorts of guest. + * + * There is a subtlety here: HV KVM does not test the + * external_oneshot flag in the code that synthesizes + * external interrupts for the guest just before entering + * the guest. That is OK even if userspace did do a + * KVM_INTERRUPT_SET on a pseries guest vcpu, because the + * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick() + * which ends up doing a smp_send_reschedule(), which will + * pull the guest all the way out to the host, meaning that + * we will call kvmppc_core_prepare_to_enter() before entering + * the guest again, and that will handle the external_oneshot + * flag correctly. + */ + if (irq->irq == KVM_INTERRUPT_SET) + vcpu->arch.external_oneshot = 1; - kvmppc_book3s_queue_irqprio(vcpu, vec); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); } void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); - kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); } void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, @@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; @@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_DECREMENTER: /* DEC interrupts get cleared by mtdec */ return false; - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: - /* External interrupts get cleared by userspace */ + case BOOK3S_IRQPRIO_EXTERNAL: + /* + * External interrupts get cleared by userspace + * except when set by the KVM_INTERRUPT ioctl with + * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL). + */ + if (vcpu->arch.external_oneshot) { + vcpu->arch.external_oneshot = 0; + return true; + } return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 68e14afecac8..c615617e78ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void) { unsigned long host_lpid, rsvd_lpid; - if (!cpu_has_feature(CPU_FTR_HVMODE)) - return -EINVAL; - if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ - host_lpid = mfspr(SPRN_LPID); + host_lpid = 0; + if (cpu_has_feature(CPU_FTR_HVMODE)) + host_lpid = mfspr(SPRN_LPID); rsvd_lpid = LPID_RSVD; kvmppc_init_lpid(rsvd_lpid + 1); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index fd6e8c13685f..43b21e88c716 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -10,6 +10,9 @@ #include <linux/string.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/debugfs.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -26,87 +29,74 @@ */ static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; -int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data, bool iswrite) +int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p) { struct kvm *kvm = vcpu->kvm; - u32 pid; int ret, level, ps; - __be64 prte, rpte; - unsigned long ptbl; - unsigned long root, pte, index; - unsigned long rts, bits, offset; - unsigned long gpa; - unsigned long proc_tbl_size; - - /* Work out effective PID */ - switch (eaddr >> 62) { - case 0: - pid = vcpu->arch.pid; - break; - case 3: - pid = 0; - break; - default: - return -EINVAL; - } - proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); - if (pid * 16 >= proc_tbl_size) - return -EINVAL; + unsigned long rts, bits, offset, index; + u64 pte, base, gpa; + __be64 rpte; - /* Read partition table to find root of tree for effective PID */ - ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16); - ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte)); - if (ret) - return ret; - - root = be64_to_cpu(prte); rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | ((root & RTS2_MASK) >> RTS2_SHIFT); bits = root & RPDS_MASK; - root = root & RPDB_MASK; + base = root & RPDB_MASK; offset = rts + 31; - /* current implementations only support 52-bit space */ + /* Current implementations only support 52-bit space */ if (offset != 52) return -EINVAL; + /* Walk each level of the radix tree */ for (level = 3; level >= 0; --level) { + u64 addr; + /* Check a valid size */ if (level && bits != p9_supported_radix_bits[level]) return -EINVAL; if (level == 0 && !(bits == 5 || bits == 9)) return -EINVAL; offset -= bits; index = (eaddr >> offset) & ((1UL << bits) - 1); - /* check that low bits of page table base are zero */ - if (root & ((1UL << (bits + 3)) - 1)) + /* Check that low bits of page table base are zero */ + if (base & ((1UL << (bits + 3)) - 1)) return -EINVAL; - ret = kvm_read_guest(kvm, root + index * 8, - &rpte, sizeof(rpte)); - if (ret) + /* Read the entry from guest memory */ + addr = base + (index * sizeof(rpte)); + ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); + if (ret) { + if (pte_ret_p) + *pte_ret_p = addr; return ret; + } pte = __be64_to_cpu(rpte); if (!(pte & _PAGE_PRESENT)) return -ENOENT; + /* Check if a leaf entry */ if (pte & _PAGE_PTE) break; - bits = pte & 0x1f; - root = pte & 0x0fffffffffffff00ul; + /* Get ready to walk the next level */ + base = pte & RPDB_MASK; + bits = pte & RPDS_MASK; } - /* need a leaf at lowest level; 512GB pages not supported */ + + /* Need a leaf at lowest level; 512GB pages not supported */ if (level < 0 || level == 3) return -EINVAL; - /* offset is now log base 2 of the page size */ + /* We found a valid leaf PTE */ + /* Offset is now log base 2 of the page size */ gpa = pte & 0x01fffffffffff000ul; if (gpa & ((1ul << offset) - 1)) return -EINVAL; - gpa += eaddr & ((1ul << offset) - 1); + gpa |= eaddr & ((1ul << offset) - 1); for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) if (offset == mmu_psize_defs[ps].shift) break; gpte->page_size = ps; + gpte->page_shift = offset; gpte->eaddr = eaddr; gpte->raddr = gpa; @@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = !!(pte & _PAGE_READ); gpte->may_write = !!(pte & _PAGE_WRITE); gpte->may_execute = !!(pte & _PAGE_EXEC); + + gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY); + + if (pte_ret_p) + *pte_ret_p = pte; + + return 0; +} + +/* + * Used to walk a partition or process table radix tree in guest memory + * Note: We exploit the fact that a partition table and a process + * table have the same layout, a partition-scoped page table and a + * process-scoped page table have the same layout, and the 2nd + * doubleword of a partition table entry has the same layout as + * the PTCR register. + */ +int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + unsigned long size, ptbl, root; + struct prtb_entry entry; + + if ((table & PRTS_MASK) > 24) + return -EINVAL; + size = 1ul << ((table & PRTS_MASK) + 12); + + /* Is the table big enough to contain this entry? */ + if ((table_index * sizeof(entry)) >= size) + return -EINVAL; + + /* Read the table to find the root of the radix tree */ + ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); + ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); + if (ret) + return ret; + + /* Root is stored in the first double word */ + root = be64_to_cpu(entry.prtb0); + + return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p); +} + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + u32 pid; + u64 pte; + int ret; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + + ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte, + vcpu->kvm->arch.process_table, pid, &pte); + if (ret) + return ret; + + /* Check privilege (applies only to process scoped translations) */ if (kvmppc_get_msr(vcpu) & MSR_PR) { if (pte & _PAGE_PRIVILEGED) { gpte->may_read = 0; @@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, } static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, - unsigned int pshift) + unsigned int pshift, unsigned int lpid) { unsigned long psize = PAGE_SIZE; + int psi; + long rc; + unsigned long rb; if (pshift) psize = 1UL << pshift; + else + pshift = PAGE_SHIFT; addr &= ~(psize - 1); - radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize); + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid_page(lpid, addr, psize); + return; + } + + psi = shift_to_mmu_psize(pshift); + rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58)); + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1), + lpid, rb); + if (rc) + pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc); } -static void kvmppc_radix_flush_pwc(struct kvm *kvm) +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid) { - radix__flush_pwc_lpid(kvm->arch.lpid); + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_pwc_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc); } static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, @@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp) kmem_cache_free(kvm_pmd_cache, pmdp); } -static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, - unsigned long gpa, unsigned int shift) +/* Called with kvm->mmu_lock held */ +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid) { - unsigned long page_size = 1ul << shift; unsigned long old; + unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long page_size = PAGE_SIZE; + unsigned long hpa; old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - unsigned long gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; + kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); + /* The following only applies to L1 entries */ + if (lpid != kvm->arch.lpid) + return; + + if (!memslot) { memslot = gfn_to_memslot(kvm, gfn); - if (memslot && memslot->dirty_bitmap) - kvmppc_update_dirty_map(memslot, gfn, page_size); + if (!memslot) + return; } + if (shift) + page_size = 1ul << shift; + + gpa &= ~(page_size - 1); + hpa = old & PTE_RPN_MASK; + kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size); + + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, page_size); } /* @@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, * and emit a warning if encountered, but there may already be data * corruption due to the unexpected mappings. */ -static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) +static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, + unsigned int lpid) { if (full) { memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); @@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, p, pte_pfn(*p) << PAGE_SHIFT, - PAGE_SHIFT); + PAGE_SHIFT, NULL, lpid); } } kvmppc_pte_free(pte); } -static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) +static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, + unsigned int lpid) { unsigned long im; pmd_t *p = pmd; @@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, (pte_t *)p, pte_pfn(*(pte_t *)p) << PAGE_SHIFT, - PMD_SHIFT); + PMD_SHIFT, NULL, lpid); } } else { pte_t *pte; pte = pte_offset_map(p, 0); - kvmppc_unmap_free_pte(kvm, pte, full); + kvmppc_unmap_free_pte(kvm, pte, full, lpid); pmd_clear(p); } } kvmppc_pmd_free(pmd); } -static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) +static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, + unsigned int lpid) { unsigned long iu; pud_t *p = pud; @@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) pmd_t *pmd; pmd = pmd_offset(p, 0); - kvmppc_unmap_free_pmd(kvm, pmd, true); + kvmppc_unmap_free_pmd(kvm, pmd, true, lpid); pud_clear(p); } } pud_free(kvm->mm, pud); } -void kvmppc_free_radix(struct kvm *kvm) +void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) { unsigned long ig; - pgd_t *pgd; - if (!kvm->arch.pgtable) - return; - pgd = kvm->arch.pgtable; for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { pud_t *pud; if (!pgd_present(*pgd)) continue; pud = pud_offset(pgd, 0); - kvmppc_unmap_free_pud(kvm, pud); + kvmppc_unmap_free_pud(kvm, pud, lpid); pgd_clear(pgd); } - pgd_free(kvm->mm, kvm->arch.pgtable); - kvm->arch.pgtable = NULL; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + if (kvm->arch.pgtable) { + kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable, + kvm->arch.lpid); + pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; + } } static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pte_t *pte = pte_offset_kernel(pmd, 0); @@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, * flushing the PWC again. */ pmd_clear(pmd); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pte(kvm, pte, false); + kvmppc_unmap_free_pte(kvm, pte, false, lpid); } static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pmd_t *pmd = pmd_offset(pud, 0); @@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, * so can be freed without flushing the PWC again. */ pud_clear(pud); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pmd(kvm, pmd, false); + kvmppc_unmap_free_pmd(kvm, pmd, false, lpid); } /* @@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, */ #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) -static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, - unsigned int level, unsigned long mmu_seq) +int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; pud_t *pud, *new_pud = NULL; @@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, int ret; /* Traverse the guest's 2nd-level tree, allocate new levels needed */ - pgd = kvm->arch.pgtable + pgd_index(gpa); + pgd = pgtable + pgd_index(gpa); pud = NULL; if (pgd_present(*pgd)) pud = pud_offset(pgd, gpa); @@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 1GB page here already, remove it */ - kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT); + kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL, + lpid); } if (level == 2) { if (!pud_none(*pud)) { @@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. */ - kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); + kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) & PTE_BITS_MUST_MATCH); kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), - 0, pte_val(pte), lgpa, PMD_SHIFT); + 0, pte_val(pte), lgpa, PMD_SHIFT); ret = 0; goto out_unlock; } @@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 2MB page here already, remove it */ - kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT); + kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL, + lpid); } if (level == 1) { if (!pmd_none(*pmd)) { @@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. */ - kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); + kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; out_unlock: @@ -521,21 +640,144 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, return ret; } -int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long ea, unsigned long dsisr) +bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, + unsigned long gpa, unsigned int lpid) +{ + unsigned long pgflags; + unsigned int shift; + pte_t *ptep; + + /* + * Need to set an R or C bit in the 2nd-level tables; + * since we are just helping out the hardware here, + * it is sufficient to do what the hardware does. + */ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + /* + * We are walking the secondary (partition-scoped) page table here. + * We can do this without disabling irq because the Linux MM + * subsystem doesn't do THP splits and collapses on this tree. + */ + ptep = __find_linux_pte(pgtable, gpa, NULL, &shift); + if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); + return true; + } + return false; +} + +int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq, pte_size; - unsigned long gpa, gfn, hva, pfn; - struct kvm_memory_slot *memslot; struct page *page = NULL; - long ret; - bool writing; + unsigned long mmu_seq; + unsigned long hva, gfn = gpa >> PAGE_SHIFT; bool upgrade_write = false; bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; - unsigned long pgflags; unsigned int shift, level; + int ret; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Do a fast check first, since __gfn_to_pfn_memslot doesn't + * do it with !atomic && !async, which is how we call it. + * We always ask for write permission since the common case + * is that the page is writable. + */ + hva = gfn_to_hva_memslot(memslot, gfn); + if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + upgrade_write = true; + } else { + unsigned long pfn; + + /* Call KVM generic code to do the slow-path check */ + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + writing, upgrade_p); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + page = NULL; + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; + } + } + + /* + * Read the PTE from the process' radix tree and use that + * so we get the shift and attribute bits. + */ + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + pte = *ptep; + local_irq_enable(); + + /* Get pte level from shift/size */ + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + } else { + level = 0; + if (shift > PAGE_SHIFT) { + /* + * If the pte maps more than one page, bring over + * bits from the virtual address to get the real + * address of the specific single page we want. + */ + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; + pte = __pte(pte_val(pte) | (hva & rpnmask)); + } + } + + pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); + if (writing || upgrade_write) { + if (pte_val(pte) & _PAGE_WRITE) + pte = __pte(pte_val(pte) | _PAGE_DIRTY); + } else { + pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); + } + + /* Allocate space in the tree and write the PTE */ + ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level, + mmu_seq, kvm->arch.lpid, NULL, NULL); + if (inserted_pte) + *inserted_pte = pte; + if (levelp) + *levelp = level; + + if (page) { + if (!ret && (pte_val(pte) & _PAGE_WRITE)) + set_page_dirty_lock(page); + put_page(page); + } + + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; /* Check for unusual errors */ if (dsisr & DSISR_UNSUPP_MMU) { @@ -549,12 +791,14 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; } - /* Translate the logical address and get the page */ + /* Translate the logical address */ gpa = vcpu->arch.fault_gpa & ~0xfffUL; gpa &= ~0xF000000000000000ul; gfn = gpa >> PAGE_SHIFT; if (!(dsisr & DSISR_PRTABLE_FAULT)) gpa |= ea & 0xfff; + + /* Get the corresponding memslot */ memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ @@ -568,142 +812,35 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_core_queue_data_storage(vcpu, ea, dsisr); return RESUME_GUEST; } - return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, - dsisr & DSISR_ISSTORE); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); } - writing = (dsisr & DSISR_ISSTORE) != 0; if (memslot->flags & KVM_MEM_READONLY) { if (writing) { /* give the guest a DSI */ - dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE | + DSISR_PROTFAULT); return RESUME_GUEST; } - upgrade_p = NULL; + kvm_ro = true; } + /* Failed to set the reference/change bits */ if (dsisr & DSISR_SET_RC) { - /* - * Need to set an R or C bit in the 2nd-level tables; - * since we are just helping out the hardware here, - * it is sufficient to do what the hardware does. - */ - pgflags = _PAGE_ACCESSED; - if (writing) - pgflags |= _PAGE_DIRTY; - /* - * We are walking the secondary page table here. We can do this - * without disabling irq. - */ spin_lock(&kvm->mmu_lock); - ptep = __find_linux_pte(kvm->arch.pgtable, - gpa, NULL, &shift); - if (ptep && pte_present(*ptep) && - (!writing || pte_write(*ptep))) { - kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, - gpa, shift); + if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, + writing, gpa, kvm->arch.lpid)) dsisr &= ~DSISR_SET_RC; - } spin_unlock(&kvm->mmu_lock); + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | DSISR_PROTFAULT | DSISR_SET_RC))) return RESUME_GUEST; } - /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ - hva = gfn_to_hva_memslot(memslot, gfn); - if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { - pfn = page_to_pfn(page); - upgrade_write = true; - } else { - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } - - /* See if we can insert a 1GB or 2MB large PTE here */ - level = 0; - if (page && PageCompound(page)) { - pte_size = PAGE_SIZE << compound_order(compound_head(page)); - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } - } - - /* - * Compute the PTE value that we need to insert. - */ - if (page) { - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | - _PAGE_ACCESSED; - if (writing || upgrade_write) - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); - } else { - /* - * Read the PTE from the process' radix tree and use that - * so we get the attribute bits. - */ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); - pte = *ptep; - local_irq_enable(); - if (shift == PUD_SHIFT && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - } else if (shift == PMD_SHIFT && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - } else if (shift && shift != PAGE_SHIFT) { - /* Adjust PFN */ - unsigned long mask = (1ul << shift) - PAGE_SIZE; - pte = __pte(pte_val(pte) | (hva & mask)); - } - pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); - if (writing || upgrade_write) { - if (pte_val(pte) & _PAGE_WRITE) - pte = __pte(pte_val(pte) | _PAGE_DIRTY); - } else { - pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); - } - } - - /* Allocate space in the tree and write the PTE */ - ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); - - if (page) { - if (!ret && (pte_val(pte) & _PAGE_WRITE)) - set_page_dirty_lock(page); - put_page(page); - } + /* Try to insert a pte */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, + kvm_ro, NULL, NULL); if (ret == 0 || ret == -EAGAIN) ret = RESUME_GUEST; @@ -717,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - unsigned long old; ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); - if (ptep && pte_present(*ptep)) { - old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, - gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { - unsigned long psize = PAGE_SIZE; - if (shift) - psize = 1ul << shift; - kvmppc_update_dirty_map(memslot, gfn, psize); - } - } + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); return 0; } @@ -785,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, ret = 1 << (shift - PAGE_SHIFT); kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); } return ret; } @@ -870,6 +998,215 @@ static void pmd_ctor(void *addr) memset(addr, 0, RADIX_PMD_TABLE_SIZE); } +struct debugfs_radix_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long gpa; + int lpid; + int chars_left; + int buf_index; + char buf[128]; + u8 hdr; +}; + +static int debugfs_radix_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_radix_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_radix_release(struct inode *inode, struct file *file) +{ + struct debugfs_radix_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_radix_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_radix_state *p = file->private_data; + ssize_t ret, r; + unsigned long n; + struct kvm *kvm; + unsigned long gpa; + pgd_t *pgt; + struct kvm_nested_guest *nested; + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ptep; + int shift; + unsigned long pte; + + kvm = p->kvm; + if (!kvm_is_radix(kvm)) + return 0; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + gpa = p->gpa; + nested = NULL; + pgt = NULL; + while (len != 0 && p->lpid >= 0) { + if (gpa >= RADIX_PGTABLE_RANGE) { + gpa = 0; + pgt = NULL; + if (nested) { + kvmhv_put_nested(nested); + nested = NULL; + } + p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid); + p->hdr = 0; + if (p->lpid < 0) + break; + } + if (!pgt) { + if (p->lpid == 0) { + pgt = kvm->arch.pgtable; + } else { + nested = kvmhv_get_nested(kvm, p->lpid, false); + if (!nested) { + gpa = RADIX_PGTABLE_RANGE; + continue; + } + pgt = nested->shadow_pgtable; + } + } + n = 0; + if (!p->hdr) { + if (p->lpid > 0) + n = scnprintf(p->buf, sizeof(p->buf), + "\nNested LPID %d: ", p->lpid); + n += scnprintf(p->buf + n, sizeof(p->buf) - n, + "pgdir: %lx\n", (unsigned long)pgt); + p->hdr = 1; + goto copy; + } + + pgdp = pgt + pgd_index(gpa); + pgd = READ_ONCE(*pgdp); + if (!(pgd_val(pgd) & _PAGE_PRESENT)) { + gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + continue; + } + + pudp = pud_offset(&pgd, gpa); + pud = READ_ONCE(*pudp); + if (!(pud_val(pud) & _PAGE_PRESENT)) { + gpa = (gpa & PUD_MASK) + PUD_SIZE; + continue; + } + if (pud_val(pud) & _PAGE_PTE) { + pte = pud_val(pud); + shift = PUD_SHIFT; + goto leaf; + } + + pmdp = pmd_offset(&pud, gpa); + pmd = READ_ONCE(*pmdp); + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + gpa = (gpa & PMD_MASK) + PMD_SIZE; + continue; + } + if (pmd_val(pmd) & _PAGE_PTE) { + pte = pmd_val(pmd); + shift = PMD_SHIFT; + goto leaf; + } + + ptep = pte_offset_kernel(&pmd, gpa); + pte = pte_val(READ_ONCE(*ptep)); + if (!(pte & _PAGE_PRESENT)) { + gpa += PAGE_SIZE; + continue; + } + shift = PAGE_SHIFT; + leaf: + n = scnprintf(p->buf, sizeof(p->buf), + " %lx: %lx %d\n", gpa, pte, shift); + gpa += 1ul << shift; + copy: + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + break; + } + } + p->gpa = gpa; + if (nested) + kvmhv_put_nested(nested); + + out: + mutex_unlock(&p->mutex); + return ret; +} + +static ssize_t debugfs_radix_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_radix_fops = { + .owner = THIS_MODULE, + .open = debugfs_radix_open, + .release = debugfs_radix_release, + .read = debugfs_radix_read, + .write = debugfs_radix_write, + .llseek = generic_file_llseek, +}; + +void kvmhv_radix_debugfs_init(struct kvm *kvm) +{ + kvm->arch.radix_dentry = debugfs_create_file("radix", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_radix_fops); +} + int kvmppc_radix_init(void) { unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 9a3f2646ecc7..c0c64d11cc71 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return ret; } +static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) +{ + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; + + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; + + if (iommu_tce_check_gpa(stt->page_shift, gpa)) + return H_TOO_HARD; + + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + + return H_SUCCESS; +} + static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) { unsigned long hpa = 0; @@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, long ret; if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) - return H_HARDWARE; + return H_TOO_HARD; if (dir == DMA_NONE) return H_SUCCESS; @@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, return H_TOO_HARD; if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (mm_iommu_mapped_inc(mem)) - return H_CLOSED; + return H_TOO_HARD; ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); - return H_HARDWARE; + return H_TOO_HARD; } if (dir != DMA_NONE) @@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) { + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; } @@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { ret = H_TOO_HARD; goto unlock_exit; } @@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + /* + * This looks unsafe, because we validate, then regrab + * the TCE from userspace which could have been changed by + * another thread. + * + * But it actually is safe, because the relevant checks will be + * re-executed in the following code. If userspace tries to + * change this dodgily it will result in a messier failure mode + * but won't threaten the host. + */ + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 506a4d400458..ec99363fdf54 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvmppc_find_table); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * Validates TCE address. * At the moment flags and page mask are validated. @@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); * to the table and user space is supposed to process them), we can skip * checking other things (such as TCE is a guest RAM address or the page * was actually allocated). - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM */ -long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) { unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; /* Allow userspace to poison TCE table */ if (dir == DMA_NONE) @@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_tce_validate); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ /* Note on the use of page_address() in real mode, * @@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, } EXPORT_SYMBOL_GPL(kvmppc_tce_put); -long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap) { - unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; memslot = search_memslots(kvm_memslots(kvm), gfn); @@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return -EINVAL; *ua = __gfn_to_hva_memslot(memslot, gfn) | - (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (prmap) @@ -184,15 +201,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return 0; } -EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); +EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) +{ + long ret; + + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) { + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + /* + * kvmppc_rm_tce_iommu_do_map() updates the UA cache after + * calling this so we still get here a valid UA. + */ + if (pua && *pua) + mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); + } + + return ret; +} + +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -224,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) /* * real mode xchg can fail if struct page crosses * a page boundary @@ -236,7 +276,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret) - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -277,12 +317,12 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) - return H_CLOSED; + return H_TOO_HARD; - ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { mm_iommu_mapped_dec(mem); /* @@ -345,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) return ret; dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; entry = ioba >> stt->page_shift; @@ -364,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); return ret; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -457,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ struct mm_iommu_table_group_mem_t *mem; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) return H_TOO_HARD; mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); @@ -473,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, * We do not require memory to be preregistered in this case * so lock rmap and do __find_linux_pte_or_hugepte(). */ - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); if (WARN_ON_ONCE_RM(!rmap)) - return H_HARDWARE; + return H_TOO_HARD; /* * Synchronize with the MMU notifier callbacks in @@ -498,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i) { unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -513,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, + entry); goto unlock_exit; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); @@ -571,7 +605,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); } } diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 36b11c5a0dbb..8c7e933e942e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -36,7 +36,6 @@ #define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 -#define OP_31_XOP_TLBIE 306 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ #define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 @@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; vcpu->arch.tar_tm = vcpu->arch.tar; vcpu->arch.lr_tm = vcpu->arch.regs.link; - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.vrsave_tm = vcpu->arch.vrsave; } @@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; vcpu->arch.tar = vcpu->arch.tar_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.vrsave = vcpu->arch.vrsave_tm; } @@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val) uint64_t texasr; /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) tm_abort(ra_val); /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!(kvmppc_get_msr(vcpu) & MSR_PR)) { preempt_disable(); - vcpu->arch.cr = (CR0_TBEGIN_FAILURE | - (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT))); + vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE | + (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT))); vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT | (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3e3a71594e63..788bc61bd08c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -50,6 +50,7 @@ #include <asm/reg.h> #include <asm/ppc-opcode.h> #include <asm/asm-prototypes.h> +#include <asm/archrandom.h> #include <asm/debug.h> #include <asm/disassemble.h> #include <asm/cputable.h> @@ -104,6 +105,10 @@ static bool indep_threads_mode = true; module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); +static bool one_vm_per_core; +module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, guests are allowed to create and control nested guests */ +static bool nested = true; +module_param(nested, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)"); + +static inline bool nesting_enabled(struct kvm *kvm) +{ + return kvm->arch.nested_enable && kvm_is_radix(kvm); +} + /* If set, the threads on each CPU core have to be in the same MMU mode */ static bool no_mixing_hpt_and_radix; @@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* If we're a nested hypervisor, fall back to ordinary IPIs for now */ + if (kvmhv_on_pseries()) + return false; + /* On POWER9 we can use msgsnd to IPI any cpu */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); - pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", - vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); + pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n", + vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); pr_err("fault dar = %.16lx dsisr = %.8x\n", vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); @@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the - * lwsync in book3s_hv_rmhandlers.S just before the - * fast_guest_return label. + * smb_wmb() in kvmppc_guest_entry_inject(). */ smp_rmb(); vc = vcpu->arch.vcore; @@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; } return RESUME_HOST; + case H_SET_DABR: + ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_SET_XDABR: + ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + case H_GET_TCE: + ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; case H_PUT_TCE: ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), @@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; + case H_RANDOM: + if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + ret = H_HARDWARE; + break; + + case H_SET_PARTITION_TABLE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_set_partition_table(vcpu); + break; + case H_ENTER_NESTED: + ret = H_FUNCTION; + if (!nesting_enabled(vcpu->kvm)) + break; + ret = kvmhv_enter_nested_guest(vcpu); + if (ret == H_INTERRUPT) { + kvmppc_set_gpr(vcpu, 3, 0); + return -EINTR; + } + break; + case H_TLB_INVALIDATE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_do_nested_tlbie(vcpu); + break; + default: return RESUME_HOST; } @@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* + * Handle H_CEDE in the nested virtualization case where we haven't + * called the real-mode hcall handlers in book3s_hv_rmhandlers.S. + * This has to be done early, not in kvmppc_pseries_do_hcall(), so + * that the cede logic in kvmppc_run_single_vcpu() works properly. + */ +static void kvmppc_nested_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shregs.msr |= MSR_EE; + vcpu->arch.ceded = 1; + smp_mb(); + if (vcpu->arch.prodded) { + vcpu->arch.prodded = 0; + smp_mb(); + vcpu->arch.ceded = 0; + } +} + static int kvmppc_hcall_impl_hv(unsigned long cmd) { switch (cmd) { @@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -/* Called with vcpu->arch.vcore->lock held */ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_H_INST_STORAGE: vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); - vcpu->arch.fault_dsisr = 0; + vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; r = RESUME_PAGE_FAULT; break; /* @@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, swab32(vcpu->arch.emul_inst) : vcpu->arch.emul_inst; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); r = kvmppc_emulate_debug_inst(run, vcpu); - spin_lock(&vcpu->arch.vcore->lock); } else { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: r = EMULATE_FAIL; if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && - cpu_has_feature(CPU_FTR_ARCH_300)) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); + cpu_has_feature(CPU_FTR_ARCH_300)) r = kvmppc_emulate_doorbell_instr(vcpu); - spin_lock(&vcpu->arch.vcore->lock); - } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) +{ + int r; + int srcu_idx; + + vcpu->stat.sum_exits++; + + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + pr_emerg("KVM trap in HV mode while nested!\n"); + pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + return RESUME_HOST; + } + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ + case BOOK3S_INTERRUPT_HMI: + case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Pass the machine check to the L1 guest */ + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); + break; + /* + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + break; +#endif + + case BOOK3S_INTERRUPT_HV_RM_HARD: + vcpu->arch.trap = 0; + r = RESUME_GUEST; + if (!xive_enabled()) + kvmppc_xics_rm_complete(vcpu, 0); + break; + default: + r = RESUME_HOST; + break; + } + + return r; +} + static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ONLINE: *val = get_reg_val(id, vcpu->arch.online); break; + case KVM_REG_PPC_PTCR: + *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr); + break; default: r = -EINVAL; break; @@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, atomic_dec(&vcpu->arch.vcore->online_count); vcpu->arch.online = i; break; + case KVM_REG_PPC_PTCR: + vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val); + break; default: r = -EINVAL; break; @@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, * Set the default HFSCR for the guest from the host value. * This value is only used on POWER9. * On POWER9, we want to virtualize the doorbell facility, so we - * turn off the HFSCR bit, which causes those instructions to trap. + * don't set the HFSCR_MSGP bit, and that causes those instructions + * to trap and then we emulate them. */ - vcpu->arch.hfscr = mfspr(SPRN_HFSCR); - if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB | + HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + vcpu->arch.hfscr &= mfspr(SPRN_HFSCR); + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr |= HFSCR_TM; + } + if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; - else if (!cpu_has_feature(CPU_FTR_TM_COMP)) - vcpu->arch.hfscr &= ~HFSCR_TM; - if (cpu_has_feature(CPU_FTR_ARCH_300)) - vcpu->arch.hfscr &= ~HFSCR_MSGP; kvmppc_mmu_book3s_hv_init(vcpu); @@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu) static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; + cpumask_t *cpu_in_guest; int i; cpu = cpu_first_thread_sibling(cpu); - cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + if (nested) { + cpumask_set_cpu(cpu, &nested->need_tlb_flush); + cpu_in_guest = &nested->cpu_in_guest; + } else { + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + cpu_in_guest = &kvm->arch.cpu_in_guest; + } /* * Make sure setting of bit in need_tlb_flush precedes * testing of cpu_in_guest bits. The matching barrier on @@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) */ smp_mb(); for (i = 0; i < threads_per_core; ++i) - if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + if (cpumask_test_cpu(cpu + i, cpu_in_guest)) smp_call_function_single(cpu + i, do_nothing, NULL, 1); } static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvm *kvm = vcpu->kvm; + int prev_cpu; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (nested) + prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id]; + else + prev_cpu = vcpu->arch.prev_cpu; /* * With radix, the guest can do TLB invalidations itself, @@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) * ran to flush the TLB. The TLB is shared between threads, * so we use a single bit in .need_tlb_flush for all 4 threads. */ - if (vcpu->arch.prev_cpu != pcpu) { - if (vcpu->arch.prev_cpu >= 0 && - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + if (prev_cpu != pcpu) { + if (prev_cpu >= 0 && + cpu_first_thread_sibling(prev_cpu) != cpu_first_thread_sibling(pcpu)) - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); - vcpu->arch.prev_cpu = pcpu; + radix_flush_cpu(kvm, prev_cpu, vcpu); + if (nested) + nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu; + else + vcpu->arch.prev_cpu = pcpu; + } +} + +static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested) +{ + cpumask_t *need_tlb_flush; + int lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pcpu &= ~0x3UL; + + if (nested) { + lpid = nested->shadow_lpid; + need_tlb_flush = &nested->need_tlb_flush; + } else { + lpid = kvm->arch.lpid; + need_tlb_flush = &kvm->arch.need_tlb_flush; + } + + mtspr(SPRN_LPID, lpid); + isync(); + smp_mb(); + + if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + radix__local_flush_tlb_lpid_guest(lpid); + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); } } @@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* In one_vm_per_core mode, require all vcores to be from the same vm */ + if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm) + return false; + /* Some POWER9 chips require all threads to be in the same MMU mode */ if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) @@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) spin_lock(&vc->lock); now = get_tb(); for_each_runnable_thread(i, vcpu, vc) { + /* + * It's safe to unlock the vcore in the loop here, because + * for_each_runnable_thread() is safe against removal of + * the vcpu, and the vcore state is VCORE_EXITING here, + * so any vcpus becoming runnable will have their arch.trap + * set to zero and can't actually run in the guest. + */ + spin_unlock(&vc->lock); /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) @@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) vcpu->arch.ret = ret; vcpu->arch.trap = 0; + spin_lock(&vc->lock); if (is_kvmppc_resume_guest(vcpu->arch.ret)) { if (vcpu->arch.pending_exceptions) kvmppc_core_prepare_to_enter(vcpu); @@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) spin_unlock(&core_info.vc[sub]->lock); if (kvm_is_radix(vc->kvm)) { - int tmp = pcpu; - /* * Do we need to flush the process scoped TLB for the LPAR? * @@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * * Hash must be flushed in realmode in order to use tlbiel. */ - mtspr(SPRN_LPID, vc->kvm->arch.lpid); - isync(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - tmp &= ~0x3UL; - - if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) { - radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); - /* Clear the bit after the TLB flush */ - cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush); - } + kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL); } /* @@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* + * Load up hypervisor-mode registers on P9. + */ +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + s64 hdec; + u64 tb, purr, spurr; + int trap; + unsigned long host_hfscr = mfspr(SPRN_HFSCR); + unsigned long host_ciabr = mfspr(SPRN_CIABR); + unsigned long host_dawr = mfspr(SPRN_DAWR); + unsigned long host_dawrx = mfspr(SPRN_DAWRX); + unsigned long host_psscr = mfspr(SPRN_PSSCR); + unsigned long host_pidr = mfspr(SPRN_PID); + + hdec = time_limit - mftb(); + if (hdec < 0) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + mtspr(SPRN_HDEC, hdec); + + if (vc->tb_offset) { + u64 new_tb = mftb() + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = vc->tb_offset; + } + + if (vc->pcr) + mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_DPDES, vc->dpdes); + mtspr(SPRN_VTB, vc->vtb); + + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, vcpu->arch.purr); + mtspr(SPRN_SPURR, vcpu->arch.spurr); + + if (cpu_has_feature(CPU_FTR_DAWR)) { + mtspr(SPRN_DAWR, vcpu->arch.dawr); + mtspr(SPRN_DAWRX, vcpu->arch.dawrx); + } + mtspr(SPRN_CIABR, vcpu->arch.ciabr); + mtspr(SPRN_IC, vcpu->arch.ic); + mtspr(SPRN_PID, vcpu->arch.pid); + + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + + mtspr(SPRN_HFSCR, vcpu->arch.hfscr); + + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); + mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + + mtspr(SPRN_AMOR, ~0UL); + + mtspr(SPRN_LPCR, lpcr); + isync(); + + kvmppc_xive_push_vcpu(vcpu); + + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); + + trap = __kvmhv_vcpu_entry_p9(vcpu); + + /* Advance host PURR/SPURR by the amount used by guest */ + purr = mfspr(SPRN_PURR); + spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + + purr - vcpu->arch.purr); + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + + spurr - vcpu->arch.spurr); + vcpu->arch.purr = purr; + vcpu->arch.spurr = spurr; + + vcpu->arch.ic = mfspr(SPRN_IC); + vcpu->arch.pid = mfspr(SPRN_PID); + vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; + + vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); + vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); + vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); + vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + + mtspr(SPRN_PSSCR, host_psscr); + mtspr(SPRN_HFSCR, host_hfscr); + mtspr(SPRN_CIABR, host_ciabr); + mtspr(SPRN_DAWR, host_dawr); + mtspr(SPRN_DAWRX, host_dawrx); + mtspr(SPRN_PID, host_pidr); + + /* + * Since this is radix, do a eieio; tlbsync; ptesync sequence in + * case we interrupted the guest between a tlbie and a ptesync. + */ + asm volatile("eieio; tlbsync; ptesync"); + + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ + isync(); + + vc->dpdes = mfspr(SPRN_DPDES); + vc->vtb = mfspr(SPRN_VTB); + mtspr(SPRN_DPDES, 0); + if (vc->pcr) + mtspr(SPRN_PCR, 0); + + if (vc->tb_offset_applied) { + u64 new_tb = mftb() - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = 0; + } + + mtspr(SPRN_HDEC, 0x7fffffff); + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + + return trap; +} + +/* + * Virtual-mode guest entry for POWER9 and later when the host and + * guest are both using the radix MMU. The LPIDR has already been set. + */ +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long host_dscr = mfspr(SPRN_DSCR); + unsigned long host_tidr = mfspr(SPRN_TIDR); + unsigned long host_iamr = mfspr(SPRN_IAMR); + s64 dec; + u64 tb; + int trap, save_pmu; + + dec = mfspr(SPRN_DEC); + tb = mftb(); + if (dec < 512) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + local_paca->kvm_hstate.dec_expires = dec + tb; + if (local_paca->kvm_hstate.dec_expires < time_limit) + time_limit = local_paca->kvm_hstate.dec_expires; + + vcpu->arch.ceded = 0; + + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ + + kvmppc_subcore_enter_guest(); + + vc->entry_exit_map = 1; + vc->in_guest = 1; + + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + } + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + kvmhv_load_guest_pmu(vcpu); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + load_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + load_vr_state(&vcpu->arch.vr); +#endif + + mtspr(SPRN_DSCR, vcpu->arch.dscr); + mtspr(SPRN_IAMR, vcpu->arch.iamr); + mtspr(SPRN_PSPB, vcpu->arch.pspb); + mtspr(SPRN_FSCR, vcpu->arch.fscr); + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + mtspr(SPRN_WORT, vcpu->arch.wort); + mtspr(SPRN_TIDR, vcpu->arch.tid); + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + mtspr(SPRN_AMR, vcpu->arch.amr); + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + + mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); + + if (kvmhv_on_pseries()) { + /* call our hypervisor to load up HV regs and go */ + struct hv_guest_state hvregs; + + kvmhv_save_hv_regs(vcpu, &hvregs); + hvregs.lpcr = lpcr; + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + hvregs.version = HV_GUEST_STATE_VERSION; + if (vcpu->arch.nested) { + hvregs.lpid = vcpu->arch.nested->shadow_lpid; + hvregs.vcpu_token = vcpu->arch.nested_vcpu_id; + } else { + hvregs.lpid = vcpu->kvm->arch.lpid; + hvregs.vcpu_token = vcpu->vcpu_id; + } + hvregs.hdec_expiry = time_limit; + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), + __pa(&vcpu->arch.regs)); + kvmhv_restore_hv_return_state(vcpu, &hvregs); + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + + /* H_CEDE has to be handled now, not later */ + if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && + kvmppc_get_gpr(vcpu, 3) == H_CEDE) { + kvmppc_nested_cede(vcpu); + trap = 0; + } + } else { + trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); + } + + vcpu->arch.slb_max = 0; + dec = mfspr(SPRN_DEC); + tb = mftb(); + vcpu->arch.dec_expires = dec + tb; + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + vcpu->arch.wort = mfspr(SPRN_WORT); + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); + + mtspr(SPRN_PSPB, 0); + mtspr(SPRN_WORT, 0); + mtspr(SPRN_AMR, 0); + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_DSCR, host_dscr); + mtspr(SPRN_TIDR, host_tidr); + mtspr(SPRN_IAMR, host_iamr); + mtspr(SPRN_PSPB, 0); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + store_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + store_vr_state(&vcpu->arch.vr); +#endif + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + save_pmu = 1; + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + save_pmu = lp->pmcregs_in_use; + } + + kvmhv_save_guest_pmu(vcpu, save_pmu); + + vc->entry_exit_map = 0x101; + vc->in_guest = 0; + + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + + kvmhv_load_host_pmu(); + + kvmppc_subcore_exit_guest(); + + return trap; +} + +/* * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host. */ @@ -3256,6 +3780,11 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +/* + * This never fails for a radix guest, as none of the operations it does + * for a radix guest can fail or have a way to report failure. + * kvmhv_run_single_vcpu() relies on this fact. + */ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) { int r = 0; @@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, + struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + int trap, r, pcpu; + int srcu_idx; + struct kvmppc_vcore *vc; + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *nested = vcpu->arch.nested; + + trace_kvmppc_run_vcpu_enter(vcpu); + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + vc = vcpu->arch.vcore; + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; + vc->runnable_threads[0] = vcpu; + vc->n_runnable = 1; + vc->runner = vcpu; + + /* See if the MMU is ready to go */ + if (!kvm->arch.mmu_ready) + kvmhv_setup_mmu(vcpu); + + if (need_resched()) + cond_resched(); + + kvmppc_update_vpas(vcpu); + + init_vcore_to_run(vc); + vc->preempt_tb = TB_NIL; + + preempt_disable(); + pcpu = smp_processor_id(); + vc->pcpu = pcpu; + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + + local_irq_disable(); + hard_irq_disable(); + if (signal_pending(current)) + goto sigpend; + if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready) + goto out; + + if (!nested) { + kvmppc_core_prepare_to_enter(vcpu); + if (vcpu->arch.doorbell_request) { + vc->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions)) + lpcr |= LPCR_MER; + } else if (vcpu->arch.pending_exceptions || + vcpu->arch.doorbell_request || + xive_interrupt_pending(vcpu)) { + vcpu->arch.ret = RESUME_HOST; + goto out; + } + + kvmppc_clear_host_core(pcpu); + + local_paca->kvm_hstate.tid = 0; + local_paca->kvm_hstate.napping = 0; + local_paca->kvm_hstate.kvm_split_mode = NULL; + kvmppc_start_thread(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc); + trace_kvm_guest_enter(vcpu); + + vc->vcore_state = VCORE_RUNNING; + trace_kvmppc_run_core(vc, 0); + + if (cpu_has_feature(CPU_FTR_HVMODE)) + kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); + + trace_hardirqs_on(); + guest_enter_irqoff(); + + srcu_idx = srcu_read_lock(&kvm->srcu); + + this_cpu_disable_ftrace(); + + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); + vcpu->arch.trap = trap; + + this_cpu_enable_ftrace(); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + if (cpu_has_feature(CPU_FTR_HVMODE)) { + mtspr(SPRN_LPID, kvm->arch.host_lpid); + isync(); + } + + trace_hardirqs_off(); + set_irq_happened(trap); + + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + guest_exit(); + + cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); + + preempt_enable(); + + /* cancel pending decrementer exception if DEC is now positive */ + if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + r = RESUME_GUEST; + if (trap) { + if (!nested) + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + else + r = kvmppc_handle_nested_exit(vcpu); + } + vcpu->arch.ret = r; + + if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && + !kvmppc_vcpu_woken(vcpu)) { + kvmppc_set_timer(vcpu); + while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { + if (signal_pending(current)) { + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + break; + } + spin_lock(&vc->lock); + kvmppc_vcore_blocked(vc); + spin_unlock(&vc->lock); + } + } + vcpu->arch.ceded = 0; + + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_run_core(vc, 1); + + done: + kvmppc_remove_runnable(vc, vcpu); + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); + + return vcpu->arch.ret; + + sigpend: + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + out: + local_irq_enable(); + preempt_enable(); + goto done; +} + static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; @@ -3480,7 +4174,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { - r = kvmppc_run_vcpu(run, vcpu); + if (kvm->arch.threads_indep && kvm_is_radix(kvm)) + r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0, + vcpu->arch.vcore->lpcr); + else + r = kvmppc_run_vcpu(run, vcpu); if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { @@ -3559,6 +4257,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); + /* If running as a nested hypervisor, we don't support HPT guests */ + if (kvmhv_on_pseries()) + info->flags |= KVM_PPC_NO_HASH; + return 0; } @@ -3723,8 +4425,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm) __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; dw1 = PATB_GR | kvm->arch.process_table; } - - mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); + kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1); } /* @@ -3820,6 +4521,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); kvmppc_free_radix(kvm); kvmppc_update_lpcr(kvm, LPCR_VPM1, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); @@ -3841,6 +4544,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); kvm->arch.radix = 1; return 0; } @@ -3940,6 +4644,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvmppc_alloc_host_rm_ops(); + kvmhv_vm_nested_init(kvm); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3958,9 +4664,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); /* Init LPCR for virtual RMA mode */ - kvm->arch.host_lpid = mfspr(SPRN_LPID); - kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); - lpcr &= LPCR_PECE | LPCR_LPES; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + } else { + lpcr = 0; + } lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | LPCR_VPM0 | LPCR_VPM1; kvm->arch.vrma_slb_v = SLB_VSID_B_1T | @@ -4027,8 +4737,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * On POWER9, we only need to do this if the "indep_threads_mode" * module parameter has been set to N. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - kvm->arch.threads_indep = indep_threads_mode; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { + pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); + kvm->arch.threads_indep = true; + } else { + kvm->arch.threads_indep = indep_threads_mode; + } + } if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); @@ -4051,6 +4767,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) snprintf(buf, sizeof(buf), "vm%d", current->pid); kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); kvmppc_mmu_debugfs_init(kvm); + if (radix_enabled()) + kvmhv_radix_debugfs_init(kvm); return 0; } @@ -4073,13 +4791,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); - kvmppc_free_lpid(kvm->arch.lpid); if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else kvmppc_free_hpt(&kvm->arch.hpt); + /* Perform global invalidation and return lpid to the pool */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); + kvm->arch.process_table = 0; + kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0); + } + kvmppc_free_lpid(kvm->arch.lpid); + kvmppc_free_pimap(kvm); } @@ -4104,11 +4830,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, static int kvmppc_core_check_processor_compat_hv(void) { - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_206)) - return -EIO; + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_206)) + return 0; - return 0; + /* POWER9 in radix mode is capable of being a nested hypervisor. */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) + return 0; + + return -EIO; } #ifdef CONFIG_KVM_XICS @@ -4426,6 +5156,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (radix && !radix_enabled()) return -EINVAL; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries() && !radix) + return -EINVAL; + mutex_lock(&kvm->lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) { @@ -4458,6 +5192,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) return err; } +static int kvmhv_enable_nested(struct kvm *kvm) +{ + if (!nested) + return -EPERM; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + /* kvm == NULL means the caller is testing if the capability exists */ + if (kvm) + kvm->arch.nested_enable = true; + return 0; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -4497,6 +5244,7 @@ static struct kvmppc_ops kvm_ops_hv = { .configure_mmu = kvmhv_configure_mmu, .get_rmmu_info = kvmhv_get_rmmu_info, .set_smt_mode = kvmhv_set_smt_mode, + .enable_nested = kvmhv_enable_nested, }; static int kvm_init_subcore_bitmap(void) @@ -4547,6 +5295,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvmhv_nested_init(); + if (r) + return r; + r = kvm_init_subcore_bitmap(); if (r) return r; @@ -4557,7 +5309,8 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) { + if (!xive_enabled() && !kvmhv_on_pseries() && + !local_paca->kvm_hstate.xics_phys) { struct device_node *np; np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); @@ -4605,6 +5358,7 @@ static void kvmppc_book3s_exit_hv(void) if (kvmppc_radix_possible()) kvmppc_radix_exit(); kvmppc_hv_ops = NULL; + kvmhv_nested_exit(); } module_init(kvmppc_book3s_init_hv); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fc6bb9630a9c..a71e2fc00a4e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu) void __iomem *xics_phys; unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* For a nested hypervisor, use the XICS via hcall */ + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu), + IPI_PRIORITY); + return; + } + /* On POWER9 we can use msgsnd for any destination cpu. */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again) return 1; /* Now read the interrupt from the ICP */ - xics_phys = local_paca->kvm_hstate.xics_phys; - rc = 0; - if (!xics_phys) - rc = opal_int_get_xirr(&xirr, false); - else - xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF); + xirr = cpu_to_be32(retbuf[0]); + } else { + xics_phys = local_paca->kvm_hstate.xics_phys; + rc = 0; + if (!xics_phys) + rc = opal_int_get_xirr(&xirr, false); + else + xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + } if (rc < 0) return 1; @@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again) */ if (xisr == XICS_IPI) { rc = 0; - if (xics_phys) { + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), 0xff); + plpar_hcall_raw(H_EOI, retbuf, h_xirr); + } else if (xics_phys) { __raw_rm_writeb(0xff, xics_phys + XICS_MFRR); __raw_rm_writel(xirr, xics_phys + XICS_XIRR); } else { @@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), + IPI_PRIORITY); + } else if (xics_phys) __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); else @@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) smp_mb(); local_paca->kvm_hstate.kvm_split_mode = NULL; } + +/* + * Is there a PRIV_DOORBELL pending for the guest (on POWER9)? + * Can we inject a Decrementer or a External interrupt? + */ +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) +{ + int ext; + unsigned long vec = 0; + unsigned long lpcr; + + /* Insert EXTERNAL bit into LPCR at the MER bit position */ + ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1; + lpcr = mfspr(SPRN_LPCR); + lpcr |= ext << LPCR_MER_SH; + mtspr(SPRN_LPCR, lpcr); + isync(); + + if (vcpu->arch.shregs.msr & MSR_EE) { + if (ext) { + vec = BOOK3S_INTERRUPT_EXTERNAL; + } else { + long int dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) + dec = (int) dec; + if (dec < 0) + vec = BOOK3S_INTERRUPT_DECREMENTER; + } + } + if (vec) { + unsigned long msr, old_msr = vcpu->arch.shregs.msr; + + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, old_msr); + kvmppc_set_pc(vcpu, vec); + msr = vcpu->arch.intr_msr; + if (MSR_TM_ACTIVE(old_msr)) + msr |= MSR_TS_S; + vcpu->arch.shregs.msr = msr; + } + + if (vcpu->arch.doorbell_request) { + mtspr(SPRN_DPDES, 1); + vcpu->arch.vcore->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } +} diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 666b91c79eb4..a6d10010d9e8 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -64,52 +64,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* Save host PMU registers */ -BEGIN_FTR_SECTION - /* Work around P8 PMAE bug */ - li r3, -1 - clrrdi r3, r3, 10 - mfspr r8, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r7, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r5, 0 - mtspr SPRN_MMCRA, r5 - isync - lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r5, 0 - beq 31f /* skip if not */ - mfspr r5, SPRN_MMCR1 - mfspr r9, SPRN_SIAR - mfspr r10, SPRN_SDAR - std r7, HSTATE_MMCR0(r13) - std r5, HSTATE_MMCR1(r13) - std r6, HSTATE_MMCRA(r13) - std r9, HSTATE_SIAR(r13) - std r10, HSTATE_SDAR(r13) -BEGIN_FTR_SECTION - mfspr r9, SPRN_SIER - std r8, HSTATE_MMCR2(r13) - std r9, HSTATE_SIER(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mfspr r3, SPRN_PMC1 - mfspr r5, SPRN_PMC2 - mfspr r6, SPRN_PMC3 - mfspr r7, SPRN_PMC4 - mfspr r8, SPRN_PMC5 - mfspr r9, SPRN_PMC6 - stw r3, HSTATE_PMC1(r13) - stw r5, HSTATE_PMC2(r13) - stw r6, HSTATE_PMC3(r13) - stw r7, HSTATE_PMC4(r13) - stw r8, HSTATE_PMC5(r13) - stw r9, HSTATE_PMC6(r13) -31: + bl kvmhv_save_host_pmu /* * Put whatever is in the decrementer into the @@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr + +_GLOBAL(kvmhv_save_host_pmu) +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r7, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 + isync + lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */ + cmpwi r5, 0 + beq 31f /* skip if not */ + mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR + std r7, HSTATE_MMCR0(r13) + std r5, HSTATE_MMCR1(r13) + std r6, HSTATE_MMCRA(r13) + std r9, HSTATE_SIAR(r13) + std r10, HSTATE_SDAR(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR2(r13) + std r9, HSTATE_SIER(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r3, SPRN_PMC1 + mfspr r5, SPRN_PMC2 + mfspr r6, SPRN_PMC3 + mfspr r7, SPRN_PMC4 + mfspr r8, SPRN_PMC5 + mfspr r9, SPRN_PMC6 + stw r3, HSTATE_PMC1(r13) + stw r5, HSTATE_PMC2(r13) + stw r6, HSTATE_PMC3(r13) + stw r7, HSTATE_PMC4(r13) + stw r8, HSTATE_PMC5(r13) + stw r9, HSTATE_PMC6(r13) +31: blr diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c new file mode 100644 index 000000000000..401d2ecbebc5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -0,0 +1,1291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2018 + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com> + * Paul Mackerras <paulus@ozlabs.org> + * + * Description: KVM functions specific to running nested KVM-HV guests + * on Book3S processors (specifically POWER9 and later). + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/llist.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/pte-walk.h> +#include <asm/reg.h> + +static struct patb_entry *pseries_partition_tb; + +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp); +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free); + +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->pcr = vc->pcr; + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->tb_offset = vc->tb_offset; + hr->dawr0 = vcpu->arch.dawr; + hr->dawrx0 = vcpu->arch.dawrx; + hr->ciabr = vcpu->arch.ciabr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; +} + +static void byteswap_pt_regs(struct pt_regs *regs) +{ + unsigned long *addr = (unsigned long *) regs; + + for (; addr < ((unsigned long *) (regs + 1)); addr++) + *addr = swab64(*addr); +} + +static void byteswap_hv_regs(struct hv_guest_state *hr) +{ + hr->version = swab64(hr->version); + hr->lpid = swab32(hr->lpid); + hr->vcpu_token = swab32(hr->vcpu_token); + hr->lpcr = swab64(hr->lpcr); + hr->pcr = swab64(hr->pcr); + hr->amor = swab64(hr->amor); + hr->dpdes = swab64(hr->dpdes); + hr->hfscr = swab64(hr->hfscr); + hr->tb_offset = swab64(hr->tb_offset); + hr->dawr0 = swab64(hr->dawr0); + hr->dawrx0 = swab64(hr->dawrx0); + hr->ciabr = swab64(hr->ciabr); + hr->hdec_expiry = swab64(hr->hdec_expiry); + hr->purr = swab64(hr->purr); + hr->spurr = swab64(hr->spurr); + hr->ic = swab64(hr->ic); + hr->vtb = swab64(hr->vtb); + hr->hdar = swab64(hr->hdar); + hr->hdsisr = swab64(hr->hdsisr); + hr->heir = swab64(hr->heir); + hr->asdr = swab64(hr->asdr); + hr->srr0 = swab64(hr->srr0); + hr->srr1 = swab64(hr->srr1); + hr->sprg[0] = swab64(hr->sprg[0]); + hr->sprg[1] = swab64(hr->sprg[1]); + hr->sprg[2] = swab64(hr->sprg[2]); + hr->sprg[3] = swab64(hr->sprg[3]); + hr->pidr = swab64(hr->pidr); + hr->cfar = swab64(hr->cfar); + hr->ppr = swab64(hr->ppr); +} + +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; + switch (trap) { + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + hr->hdar = vcpu->arch.fault_dar; + hr->hdsisr = vcpu->arch.fault_dsisr; + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + hr->heir = vcpu->arch.emul_inst; + break; + } +} + +static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + /* + * Don't let L1 enable features for L2 which we've disabled for L1, + * but preserve the interrupt cause field. + */ + hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + + /* Don't let data address watchpoint match in hypervisor state */ + hr->dawrx0 &= ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + hr->ciabr &= ~CIABR_PRIV; +} + +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->pcr = hr->pcr; + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.dawr = hr->dawr0; + vcpu->arch.dawrx = hr->dawrx0; + vcpu->arch.ciabr = hr->ciabr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.fault_dar = hr->hdar; + vcpu->arch.fault_dsisr = hr->hdsisr; + vcpu->arch.fault_gpa = hr->asdr; + vcpu->arch.emul_inst = hr->heir; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) +{ + long int err, r; + struct kvm_nested_guest *l2; + struct pt_regs l2_regs, saved_l1_regs; + struct hv_guest_state l2_hv, saved_l1_hv; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 hv_ptr, regs_ptr; + u64 hdec_exp; + s64 delta_purr, delta_spurr, delta_ic, delta_vtb; + u64 mask; + unsigned long lpcr; + + if (vcpu->kvm->arch.l1_ptcr == 0) + return H_NOT_AVAILABLE; + + /* copy parameters in */ + hv_ptr = kvmppc_get_gpr(vcpu, 4); + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_hv_regs(&l2_hv); + if (l2_hv.version != HV_GUEST_STATE_VERSION) + return H_P2; + + regs_ptr = kvmppc_get_gpr(vcpu, 5); + err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_pt_regs(&l2_regs); + if (l2_hv.vcpu_token >= NR_CPUS) + return H_PARAMETER; + + /* translate lpid */ + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); + if (!l2) + return H_PARAMETER; + if (!l2->l1_gr_to_hr) { + mutex_lock(&l2->tlb_lock); + kvmhv_update_ptbl_cache(l2); + mutex_unlock(&l2->tlb_lock); + } + + /* save l1 values of things */ + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + saved_l1_regs = vcpu->arch.regs; + kvmhv_save_hv_regs(vcpu, &saved_l1_hv); + + /* convert TB values/offsets to host (L0) values */ + hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; + vc->tb_offset += l2_hv.tb_offset; + + /* set L1 state to L2 state */ + vcpu->arch.nested = l2; + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + vcpu->arch.regs = l2_regs; + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); + sanitise_hv_regs(vcpu, &l2_hv); + restore_hv_regs(vcpu, &l2_hv); + + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + do { + if (mftb() >= hdec_exp) { + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + r = RESUME_HOST; + break; + } + r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp, + lpcr); + } while (is_kvmppc_resume_guest(r)); + + /* save L2 state for return */ + l2_regs = vcpu->arch.regs; + l2_regs.msr = vcpu->arch.shregs.msr; + delta_purr = vcpu->arch.purr - l2_hv.purr; + delta_spurr = vcpu->arch.spurr - l2_hv.spurr; + delta_ic = vcpu->arch.ic - l2_hv.ic; + delta_vtb = vc->vtb - l2_hv.vtb; + save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + + /* restore L1 state */ + vcpu->arch.nested = NULL; + vcpu->arch.regs = saved_l1_regs; + vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK; + /* set L1 MSR TS field according to L2 transaction state */ + if (l2_regs.msr & MSR_TS_MASK) + vcpu->arch.shregs.msr |= MSR_TS_S; + vc->tb_offset = saved_l1_hv.tb_offset; + restore_hv_regs(vcpu, &saved_l1_hv); + vcpu->arch.purr += delta_purr; + vcpu->arch.spurr += delta_spurr; + vcpu->arch.ic += delta_ic; + vc->vtb += delta_vtb; + + kvmhv_put_nested(l2); + + /* copy l2_hv_state and regs back to guest */ + if (kvmppc_need_byteswap(vcpu)) { + byteswap_hv_regs(&l2_hv); + byteswap_pt_regs(&l2_regs); + } + err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_AUTHORITY; + err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_AUTHORITY; + + if (r == -EINTR) + return H_INTERRUPT; + + return vcpu->arch.trap; +} + +long kvmhv_nested_init(void) +{ + long int ptb_order; + unsigned long ptcr; + long rc; + + if (!kvmhv_on_pseries()) + return 0; + if (!radix_enabled()) + return -ENODEV; + + /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */ + ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1; + if (ptb_order < 8) + ptb_order = 8; + pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order, + GFP_KERNEL); + if (!pseries_partition_tb) { + pr_err("kvm-hv: failed to allocated nested partition table\n"); + return -ENOMEM; + } + + ptcr = __pa(pseries_partition_tb) | (ptb_order - 8); + rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr); + if (rc != H_SUCCESS) { + pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n", + rc); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + return -ENODEV; + } + + return 0; +} + +void kvmhv_nested_exit(void) +{ + /* + * N.B. the kvmhv_on_pseries() test is there because it enables + * the compiler to remove the call to plpar_hcall_norets() + * when CONFIG_PPC_PSERIES=n. + */ + if (kvmhv_on_pseries() && pseries_partition_tb) { + plpar_hcall_norets(H_SET_PARTITION_TABLE, 0); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + } +} + +static void kvmhv_flush_lpid(unsigned int lpid) +{ + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc); +} + +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) +{ + if (!kvmhv_on_pseries()) { + mmu_partition_table_set_entry(lpid, dw0, dw1); + return; + } + + pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0); + pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1); + /* L0 will do the necessary barriers */ + kvmhv_flush_lpid(lpid); +} + +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp) +{ + unsigned long dw0; + + dw0 = PATB_HR | radix__get_tree_size() | + __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE; + kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table); +} + +void kvmhv_vm_nested_init(struct kvm *kvm) +{ + kvm->arch.max_nested_lpid = -1; +} + +/* + * Handle the H_SET_PARTITION_TABLE hcall. + * r4 = guest real address of partition table + log_2(size) - 12 + * (formatted as for the PTCR). + */ +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long ptcr = kvmppc_get_gpr(vcpu, 4); + int srcu_idx; + long ret = H_SUCCESS; + + srcu_idx = srcu_read_lock(&kvm->srcu); + /* + * Limit the partition table to 4096 entries (because that's what + * hardware supports), and check the base address. + */ + if ((ptcr & PRTS_MASK) > 12 - 8 || + !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT)) + ret = H_PARAMETER; + srcu_read_unlock(&kvm->srcu, srcu_idx); + if (ret == H_SUCCESS) + kvm->arch.l1_ptcr = ptcr; + return ret; +} + +/* + * Reload the partition table entry for a guest. + * Caller must hold gp->tlb_lock. + */ +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) +{ + int ret; + struct patb_entry ptbl_entry; + unsigned long ptbl_addr; + struct kvm *kvm = gp->l1_host; + + ret = -EFAULT; + ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4); + if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) + ret = kvm_read_guest(kvm, ptbl_addr, + &ptbl_entry, sizeof(ptbl_entry)); + if (ret) { + gp->l1_gr_to_hr = 0; + gp->process_table = 0; + } else { + gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0); + gp->process_table = be64_to_cpu(ptbl_entry.patb1); + } + kvmhv_set_nested_ptbl(gp); +} + +struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +{ + struct kvm_nested_guest *gp; + long shadow_lpid; + + gp = kzalloc(sizeof(*gp), GFP_KERNEL); + if (!gp) + return NULL; + gp->l1_host = kvm; + gp->l1_lpid = lpid; + mutex_init(&gp->tlb_lock); + gp->shadow_pgtable = pgd_alloc(kvm->mm); + if (!gp->shadow_pgtable) + goto out_free; + shadow_lpid = kvmppc_alloc_lpid(); + if (shadow_lpid < 0) + goto out_free2; + gp->shadow_lpid = shadow_lpid; + + memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); + + return gp; + + out_free2: + pgd_free(kvm->mm, gp->shadow_pgtable); + out_free: + kfree(gp); + return NULL; +} + +/* + * Free up any resources allocated for a nested guest. + */ +static void kvmhv_release_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + if (gp->shadow_pgtable) { + /* + * No vcpu is using this struct and no call to + * kvmhv_get_nested can find this struct, + * so we don't need to hold kvm->mmu_lock. + */ + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + pgd_free(kvm->mm, gp->shadow_pgtable); + } + kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0); + kvmppc_free_lpid(gp->shadow_lpid); + kfree(gp); +} + +static void kvmhv_remove_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + int lpid = gp->l1_lpid; + long ref; + + spin_lock(&kvm->mmu_lock); + if (gp == kvm->arch.nested_guests[lpid]) { + kvm->arch.nested_guests[lpid] = NULL; + if (lpid == kvm->arch.max_nested_lpid) { + while (--lpid >= 0 && !kvm->arch.nested_guests[lpid]) + ; + kvm->arch.max_nested_lpid = lpid; + } + --gp->refcnt; + } + ref = gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +/* + * Free up all nested resources allocated for this guest. + * This is called with no vcpus of the guest running, when + * switching the guest to HPT mode or when destroying the + * guest. + */ +void kvmhv_release_all_nested(struct kvm *kvm) +{ + int i; + struct kvm_nested_guest *gp; + struct kvm_nested_guest *freelist = NULL; + struct kvm_memory_slot *memslot; + int srcu_idx; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (!gp) + continue; + kvm->arch.nested_guests[i] = NULL; + if (--gp->refcnt == 0) { + gp->next = freelist; + freelist = gp; + } + } + kvm->arch.max_nested_lpid = -1; + spin_unlock(&kvm->mmu_lock); + while ((gp = freelist) != NULL) { + freelist = gp->next; + kvmhv_release_nested(gp); + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmhv_free_memslot_nest_rmap(memslot); + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* caller must hold gp->tlb_lock */ +static void kvmhv_flush_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + kvmhv_flush_lpid(gp->shadow_lpid); + kvmhv_update_ptbl_cache(gp); + if (gp->l1_gr_to_hr == 0) + kvmhv_remove_nested(gp); +} + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create) +{ + struct kvm_nested_guest *gp, *newgp; + + if (l1_lpid >= KVM_MAX_NESTED_GUESTS || + l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) + return NULL; + + spin_lock(&kvm->mmu_lock); + gp = kvm->arch.nested_guests[l1_lpid]; + if (gp) + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (gp || !create) + return gp; + + newgp = kvmhv_alloc_nested(kvm, l1_lpid); + if (!newgp) + return NULL; + spin_lock(&kvm->mmu_lock); + if (kvm->arch.nested_guests[l1_lpid]) { + /* someone else beat us to it */ + gp = kvm->arch.nested_guests[l1_lpid]; + } else { + kvm->arch.nested_guests[l1_lpid] = newgp; + ++newgp->refcnt; + gp = newgp; + newgp = NULL; + if (l1_lpid > kvm->arch.max_nested_lpid) + kvm->arch.max_nested_lpid = l1_lpid; + } + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (newgp) + kvmhv_release_nested(newgp); + + return gp; +} + +void kvmhv_put_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + long ref; + + spin_lock(&kvm->mmu_lock); + ref = --gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) +{ + if (lpid > kvm->arch.max_nested_lpid) + return NULL; + return kvm->arch.nested_guests[lpid]; +} + +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2) +{ + return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK | + RMAP_NESTED_GPA_MASK)); +} + +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap) +{ + struct llist_node *entry = ((struct llist_head *) rmapp)->first; + struct rmap_nested *cursor; + u64 rmap, new_rmap = (*n_rmap)->rmap; + + /* Are there any existing entries? */ + if (!(*rmapp)) { + /* No -> use the rmap as a single entry */ + *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY; + return; + } + + /* Do any entries match what we're trying to insert? */ + for_each_nest_rmap_safe(cursor, entry, &rmap) { + if (kvmhv_n_rmap_is_equal(rmap, new_rmap)) + return; + } + + /* Do we need to create a list or just add the new entry? */ + rmap = *rmapp; + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + *rmapp = 0UL; + llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp); + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + (*n_rmap)->list.next = (struct llist_node *) rmap; + + /* Set NULL so not freed by caller */ + *n_rmap = NULL; +} + +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, + unsigned long hpa, unsigned long mask) +{ + struct kvm_nested_guest *gp; + unsigned long gpa; + unsigned int shift, lpid; + pte_t *ptep; + + gpa = n_rmap & RMAP_NESTED_GPA_MASK; + lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; + gp = kvmhv_find_nested(kvm, lpid); + if (!gp) + return; + + /* Find and invalidate the pte */ + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + /* Don't spuriously invalidate ptes if the pfn has changed */ + if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); +} + +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp, + unsigned long hpa, unsigned long mask) +{ + struct llist_node *entry = llist_del_all((struct llist_head *) rmapp); + struct rmap_nested *cursor; + unsigned long rmap; + + for_each_nest_rmap_safe(cursor, entry, &rmap) { + kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask); + kfree(cursor); + } +} + +/* called with kvm->mmu_lock held */ +void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes) +{ + unsigned long gfn, end_gfn; + unsigned long addr_mask; + + if (!memslot) + return; + gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn; + end_gfn = gfn + (nbytes >> PAGE_SHIFT); + + addr_mask = PTE_RPN_MASK & ~(nbytes - 1); + hpa &= addr_mask; + + for (; gfn < end_gfn; gfn++) { + unsigned long *rmap = &memslot->arch.rmap[gfn]; + kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask); + } +} + +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free) +{ + unsigned long page; + + for (page = 0; page < free->npages; page++) { + unsigned long rmap, *rmapp = &free->arch.rmap[page]; + struct rmap_nested *cursor; + struct llist_node *entry; + + entry = llist_del_all((struct llist_head *) rmapp); + for_each_nest_rmap_safe(cursor, entry, &rmap) + kfree(cursor); + } +} + +static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + long gpa, int *shift_ret) +{ + struct kvm *kvm = vcpu->kvm; + bool ret = false; + pte_t *ptep; + int shift; + + spin_lock(&kvm->mmu_lock); + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (ptep && pte_present(*ptep)) { + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); + ret = true; + } + spin_unlock(&kvm->mmu_lock); + + if (shift_ret) + *shift_ret = shift; + return ret; +} + +static inline int get_ric(unsigned int instr) +{ + return (instr >> 18) & 0x3; +} + +static inline int get_prs(unsigned int instr) +{ + return (instr >> 17) & 0x1; +} + +static inline int get_r(unsigned int instr) +{ + return (instr >> 16) & 0x1; +} + +static inline int get_lpid(unsigned long r_val) +{ + return r_val & 0xffffffff; +} + +static inline int get_is(unsigned long r_val) +{ + return (r_val >> 10) & 0x3; +} + +static inline int get_ap(unsigned long r_val) +{ + return (r_val >> 5) & 0x7; +} + +static inline long get_epn(unsigned long r_val) +{ + return r_val >> 12; +} + +static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid, + int ap, long epn) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + long npages; + int shift, shadow_shift; + unsigned long addr; + + shift = ap_to_shift(ap); + addr = epn << 12; + if (shift < 0) + /* Invalid ap encoding */ + return -EINVAL; + + addr &= ~((1UL << shift) - 1); + npages = 1UL << (shift - PAGE_SHIFT); + + gp = kvmhv_get_nested(kvm, lpid, false); + if (!gp) /* No such guest -> nothing to do */ + return 0; + mutex_lock(&gp->tlb_lock); + + /* There may be more than one host page backing this single guest pte */ + do { + kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift); + + npages -= 1UL << (shadow_shift - PAGE_SHIFT); + addr += 1UL << shadow_shift; + } while (npages > 0); + + mutex_unlock(&gp->tlb_lock); + kvmhv_put_nested(gp); + return 0; +} + +static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, int ric) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&gp->tlb_lock); + switch (ric) { + case 0: + /* Invalidate TLB */ + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + kvmhv_flush_lpid(gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + break; + case 1: + /* + * Invalidate PWC + * We don't cache this -> nothing to do + */ + break; + case 2: + /* Invalidate TLB, PWC and caching of partition table entries */ + kvmhv_flush_nested(gp); + break; + default: + break; + } + mutex_unlock(&gp->tlb_lock); +} + +static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int i; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (gp) { + spin_unlock(&kvm->mmu_lock); + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + spin_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); +} + +static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr, + unsigned long rsval, unsigned long rbval) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int r, ric, prs, is, ap; + int lpid; + long epn; + int ret = 0; + + ric = get_ric(instr); + prs = get_prs(instr); + r = get_r(instr); + lpid = get_lpid(rsval); + is = get_is(rbval); + + /* + * These cases are invalid and are not handled: + * r != 1 -> Only radix supported + * prs == 1 -> Not HV privileged + * ric == 3 -> No cluster bombs for radix + * is == 1 -> Partition scoped translations not associated with pid + * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA + */ + if ((!r) || (prs) || (ric == 3) || (is == 1) || + ((!is) && (ric == 1 || ric == 2))) + return -EINVAL; + + switch (is) { + case 0: + /* + * We know ric == 0 + * Invalidate TLB for a given target address + */ + epn = get_epn(rbval); + ap = get_ap(rbval); + ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn); + break; + case 2: + /* Invalidate matching LPID */ + gp = kvmhv_get_nested(kvm, lpid, false); + if (gp) { + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + kvmhv_put_nested(gp); + } + break; + case 3: + /* Invalidate ALL LPIDs */ + kvmhv_emulate_tlbie_all_lpid(vcpu, ric); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/* + * This handles the H_TLB_INVALIDATE hcall. + * Parameters are (r4) tlbie instruction code, (r5) rS contents, + * (r6) rB contents. + */ +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); + if (ret) + return H_PARAMETER; + return H_SUCCESS; +} + +/* Used to convert a nested guest real address to a L1 guest real address */ +static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, unsigned long dsisr, + struct kvmppc_pte *gpte_p) +{ + u64 fault_addr, flags = dsisr & DSISR_ISSTORE; + int ret; + + ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr, + &fault_addr); + + if (ret) { + /* We didn't find a pte */ + if (ret == -EINVAL) { + /* Unsupported mmu config */ + flags |= DSISR_UNSUPP_MMU; + } else if (ret == -ENOENT) { + /* No translation found */ + flags |= DSISR_NOHPTE; + } else if (ret == -EFAULT) { + /* Couldn't access L1 real address */ + flags |= DSISR_PRTABLE_FAULT; + vcpu->arch.fault_gpa = fault_addr; + } else { + /* Unknown error */ + return ret; + } + goto forward_to_l1; + } else { + /* We found a pte -> check permissions */ + if (dsisr & DSISR_ISSTORE) { + /* Can we write? */ + if (!gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + /* Can we execute? */ + if (!gpte_p->may_execute) { + flags |= SRR1_ISI_N_OR_G; + goto forward_to_l1; + } + } else { + /* Can we read? */ + if (!gpte_p->may_read && !gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } + } + + return 0; + +forward_to_l1: + vcpu->arch.fault_dsisr = flags; + if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + vcpu->arch.shregs.msr &= ~0x783f0000ul; + vcpu->arch.shregs.msr |= flags; + } + return RESUME_HOST; +} + +static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, + struct kvmppc_pte gpte, + unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + bool writing = !!(dsisr & DSISR_ISSTORE); + u64 pgflags; + bool ret; + + /* Are the rc bits set in the L1 partition scoped pte? */ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + if (pgflags & ~gpte.rc) + return RESUME_HOST; + + spin_lock(&kvm->mmu_lock); + /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ + ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, + gpte.raddr, kvm->arch.lpid); + spin_unlock(&kvm->mmu_lock); + if (!ret) + return -EINVAL; + + /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ + ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, + gp->shadow_lpid); + if (!ret) + return -EINVAL; + return 0; +} + +static inline int kvmppc_radix_level_to_shift(int level) +{ + switch (level) { + case 2: + return PUD_SHIFT; + case 1: + return PMD_SHIFT; + default: + return PAGE_SHIFT; + } +} + +static inline int kvmppc_radix_shift_to_level(int shift) +{ + if (shift == PUD_SHIFT) + return 2; + if (shift == PMD_SHIFT) + return 1; + if (shift == PAGE_SHIFT) + return 0; + WARN_ON_ONCE(1); + return 0; +} + +/* called with gp->tlb_lock held */ +static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + struct rmap_nested *n_rmap; + struct kvmppc_pte gpte; + pte_t pte, *pte_p; + unsigned long mmu_seq; + unsigned long dsisr = vcpu->arch.fault_dsisr; + unsigned long ea = vcpu->arch.fault_dar; + unsigned long *rmapp; + unsigned long n_gpa, gpa, gfn, perm = 0UL; + unsigned int shift, l1_shift, level; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; + long int ret; + + if (!gp->l1_gr_to_hr) { + kvmhv_update_ptbl_cache(gp); + if (!gp->l1_gr_to_hr) + return RESUME_HOST; + } + + /* Convert the nested guest real address into a L1 guest real address */ + + n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL; + if (!(dsisr & DSISR_PRTABLE_FAULT)) + n_gpa |= ea & 0xFFF; + ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte); + + /* + * If the hardware found a translation but we don't now have a usable + * translation in the l1 partition-scoped tree, remove the shadow pte + * and let the guest retry. + */ + if (ret == RESUME_HOST && + (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G | + DSISR_BAD_COPYPASTE))) + goto inval; + if (ret) + return ret; + + /* Failed to set the reference/change bits */ + if (dsisr & DSISR_SET_RC) { + ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr); + if (ret == RESUME_HOST) + return ret; + if (ret) + goto inval; + dsisr &= ~DSISR_SET_RC; + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT))) + return RESUME_GUEST; + } + + /* + * We took an HISI or HDSI while we were running a nested guest which + * means we have no partition scoped translation for that. This means + * we need to insert a pte for the mapping into our shadow_pgtable. + */ + + l1_shift = gpte.page_shift; + if (l1_shift < PAGE_SHIFT) { + /* We don't support l1 using a page size smaller than our own */ + pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n", + l1_shift, PAGE_SHIFT); + return -EINVAL; + } + gpa = gpte.raddr; + gfn = gpa >> PAGE_SHIFT; + + /* 1. Get the corresponding host memslot */ + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) { + /* unusual error -> reflect to the guest as a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + /* passthrough of emulated MMIO case... */ + pr_err("emulated MMIO passthrough?\n"); + return -EINVAL; + } + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* Give the guest a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, + DSISR_ISSTORE | DSISR_PROTFAULT); + return RESUME_GUEST; + } + kvm_ro = true; + } + + /* 2. Find the host pte for this L1 guest real address */ + + /* Used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* See if can find translation in our partition scoped tables for L1 */ + pte = __pte(0); + spin_lock(&kvm->mmu_lock); + pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (pte_p) + pte = *pte_p; + spin_unlock(&kvm->mmu_lock); + + if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { + /* No suitable pte found -> try to insert a mapping */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, + writing, kvm_ro, &pte, &level); + if (ret == -EAGAIN) + return RESUME_GUEST; + else if (ret) + return ret; + shift = kvmppc_radix_level_to_shift(level); + } + + /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ + + /* The permissions is the combination of the host and l1 guest ptes */ + perm |= gpte.may_read ? 0UL : _PAGE_READ; + perm |= gpte.may_write ? 0UL : _PAGE_WRITE; + perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; + pte = __pte(pte_val(pte) & ~perm); + + /* What size pte can we insert? */ + if (shift > l1_shift) { + u64 mask; + unsigned int actual_shift = PAGE_SHIFT; + if (PMD_SHIFT < l1_shift) + actual_shift = PMD_SHIFT; + mask = (1UL << shift) - (1UL << actual_shift); + pte = __pte(pte_val(pte) | (gpa & mask)); + shift = actual_shift; + } + level = kvmppc_radix_shift_to_level(shift); + n_gpa &= ~((1UL << shift) - 1); + + /* 4. Insert the pte into our shadow_pgtable */ + + n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL); + if (!n_rmap) + return RESUME_GUEST; /* Let the guest try again */ + n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) | + (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT); + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level, + mmu_seq, gp->shadow_lpid, rmapp, &n_rmap); + if (n_rmap) + kfree(n_rmap); + if (ret == -EAGAIN) + ret = RESUME_GUEST; /* Let the guest try again */ + + return ret; + + inval: + kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL); + return RESUME_GUEST; +} + +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) +{ + struct kvm_nested_guest *gp = vcpu->arch.nested; + long int ret; + + mutex_lock(&gp->tlb_lock); + ret = __kvmhv_nested_page_fault(vcpu, gp); + mutex_unlock(&gp->tlb_lock); + return ret; +} + +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid) +{ + int ret = -1; + + spin_lock(&kvm->mmu_lock); + while (++lpid <= kvm->arch.max_nested_lpid) { + if (kvm->arch.nested_guests[lpid]) { + ret = lpid; + break; + } + } + spin_unlock(&kvm->mmu_lock); + return ret; +} diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index b11043b23c18..0787f12c1a1b 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest); void kvmppc_subcore_exit_guest(void) { @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest); static bool kvmppc_tb_resync_required(void) { @@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void) } else { wait_for_tb_resync(); } + + /* + * Reset tb_offset_applied so the guest exit code won't try + * to subtract the previous timebase offset from the timebase. + */ + if (local_paca->kvm_hstate.kvm_vcore) + local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 758d1d23215e..b3f5786b20dc 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; - set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); /* Kick self ? Just set MER and return */ if (vcpu == this_vcpu) { @@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) { /* Note: Only called on self ! */ - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); + clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); } @@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) void __iomem *xics_phys; int64_t rc; + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + iosync(); + plpar_hcall_raw(H_EOI, retbuf, hwirq); + return; + } + rc = pnv_opal_pci_msi_eoi(c, hwirq); if (rc) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1d14046124a0..9b8d50a7cbaf 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -28,6 +28,7 @@ #include <asm/exception-64s.h> #include <asm/kvm_book3s_asm.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/export.h> #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> @@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_NOVCPU 2 /* Stack frame offsets for kvmppc_hv_entry */ -#define SFS 160 +#define SFS 208 #define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_SHORT_PATH (SFS-8) #define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_PSSCR (SFS-24) #define STACK_SLOT_PID (SFS-32) @@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +/* the following is used by the P9 short path */ +#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ /* * Call kvmppc_hv_entry in real mode. @@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_SPRG_VDSO_WRITE,r3 /* Reload the host's PMU registers */ - lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r4, 0 - beq 23f /* skip if not */ -BEGIN_FTR_SECTION - ld r3, HSTATE_MMCR0(r13) - andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r4, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, HSTATE_PMC1(r13) - lwz r4, HSTATE_PMC2(r13) - lwz r5, HSTATE_PMC3(r13) - lwz r6, HSTATE_PMC4(r13) - lwz r8, HSTATE_PMC5(r13) - lwz r9, HSTATE_PMC6(r13) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, HSTATE_MMCR0(r13) - ld r4, HSTATE_MMCR1(r13) - ld r5, HSTATE_MMCRA(r13) - ld r6, HSTATE_SIAR(r13) - ld r7, HSTATE_SDAR(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_SIAR, r6 - mtspr SPRN_SDAR, r7 -BEGIN_FTR_SECTION - ld r8, HSTATE_MMCR2(r13) - ld r9, HSTATE_SIER(r13) - mtspr SPRN_MMCR2, r8 - mtspr SPRN_SIER, r9 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync -23: + bl kvmhv_load_host_pmu /* * Reload DEC. HDEC interrupts were disabled when @@ -796,66 +762,23 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif - /* Load guest PMU registers */ - /* R4 is live here (vcpu pointer) */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync -BEGIN_FTR_SECTION - ld r3, VCPU_MMCR(r4) - andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r5, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ - lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ - lwz r6, VCPU_PMC + 8(r4) - lwz r7, VCPU_PMC + 12(r4) - lwz r8, VCPU_PMC + 16(r4) - lwz r9, VCPU_PMC + 20(r4) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r5 - mtspr SPRN_PMC3, r6 - mtspr SPRN_PMC4, r7 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, VCPU_MMCR(r4) - ld r5, VCPU_MMCR + 8(r4) - ld r6, VCPU_MMCR + 16(r4) - ld r7, VCPU_SIAR(r4) - ld r8, VCPU_SDAR(r4) - mtspr SPRN_MMCR1, r5 - mtspr SPRN_MMCRA, r6 - mtspr SPRN_SIAR, r7 - mtspr SPRN_SDAR, r8 -BEGIN_FTR_SECTION - ld r5, VCPU_MMCR + 24(r4) - ld r6, VCPU_SIER(r4) - mtspr SPRN_MMCR2, r5 - mtspr SPRN_SIER, r6 -BEGIN_FTR_SECTION_NESTED(96) - lwz r7, VCPU_PMC + 24(r4) - lwz r8, VCPU_PMC + 28(r4) - ld r9, VCPU_MMCR + 32(r4) - mtspr SPRN_SPMC1, r7 - mtspr SPRN_SPMC2, r8 - mtspr SPRN_MMCRS, r9 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync + /* Load guest PMU registers; r4 = vcpu pointer here */ + mr r3, r4 + bl kvmhv_load_guest_pmu /* Load up FP, VMX and VSX registers */ + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_load_fp ld r14, VCPU_GPR(R14)(r4) @@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) no_xive: #endif /* CONFIG_KVM_XICS */ -deliver_guest_interrupt: - ld r6, VCPU_CTR(r4) - ld r7, VCPU_XER(r4) - - mtctr r6 - mtxer r7 + li r0, 0 + stw r0, STACK_SLOT_SHORT_PATH(r1) -kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) +deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */ + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0, VCPU_PENDING_EXC(r4) +BEGIN_FTR_SECTION + /* On POWER9, also check for emulated doorbell interrupt */ + lbz r3, VCPU_DBELL_REQ(r4) + or r0, r0, r3 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpdi r0, 0 + beq 71f + mr r3, r4 + bl kvmppc_guest_entry_inject_int + ld r4, HSTATE_KVM_VCPU(r13) +71: ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 +fast_guest_entry_c: + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME - /* Check if we can deliver an external or decrementer interrupt now */ - ld r0, VCPU_PENDING_EXC(r4) - rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 - cmpdi cr1, r0, 0 - andi. r8, r11, MSR_EE - mfspr r8, SPRN_LPCR - /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ - rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH - mtspr SPRN_LPCR, r8 - isync - beq 5f - li r0, BOOK3S_INTERRUPT_EXTERNAL - bne cr1, 12f - mfspr r0, SPRN_DEC -BEGIN_FTR_SECTION - /* On POWER9 check whether the guest has large decrementer enabled */ - andis. r8, r8, LPCR_LD@h - bne 15f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - extsw r0, r0 -15: cmpdi r0, 0 - li r0, BOOK3S_INTERRUPT_DECREMENTER - bge 5f - -12: mtspr SPRN_SRR0, r10 - mr r10,r0 - mtspr SPRN_SRR1, r11 - mr r9, r4 - bl kvmppc_msr_interrupt -5: -BEGIN_FTR_SECTION - b fast_guest_return -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - /* On POWER9, check for pending doorbell requests */ - lbz r0, VCPU_DBELL_REQ(r4) - cmpwi r0, 0 - beq fast_guest_return - ld r5, HSTATE_KVM_VCORE(r13) - /* Set DPDES register so the CPU will take a doorbell interrupt */ - li r0, 1 - mtspr SPRN_DPDES, r0 - std r0, VCORE_DPDES(r5) - /* Make sure other cpus see vcore->dpdes set before dbell req clear */ - lwsync - /* Clear the pending doorbell request */ - li r0, 0 - stb r0, VCPU_DBELL_REQ(r4) + ld r6, VCPU_CTR(r4) + ld r7, VCPU_XER(r4) + mtctr r6 + mtxer r7 /* * Required state: @@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) - lwz r6, VCPU_CR(r4) + ld r6, VCPU_CR(r4) mtlr r5 mtcr r6 @@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) HRFI_TO_GUEST b . +/* + * Enter the guest on a P9 or later system where we have exactly + * one vcpu per vcore and we don't need to go to real mode + * (which implies that host and guest are both using radix MMU mode). + * r3 = vcpu pointer + * Most SPRs and all the VSRs have been loaded already. + */ +_GLOBAL(__kvmhv_vcpu_entry_p9) +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -SFS(r1) + + li r0, 1 + stw r0, STACK_SLOT_SHORT_PATH(r1) + + std r3, HSTATE_KVM_VCPU(r13) + mfcr r4 + stw r4, SFS+8(r1) + + std r1, HSTATE_HOST_R1(r13) + + reg = 14 + .rept 18 + std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, __VCPU_GPR(reg)(r3) + reg = reg + 1 + .endr + + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + + mr r4, r3 + b fast_guest_entry_c +guest_exit_short_path: + + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + reg = 14 + .rept 18 + std reg, __VCPU_GPR(reg)(r9) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + lwz r4, SFS+8(r1) + mtcr r4 + + mr r3, r12 /* trap number */ + + addi r1, r1, SFS + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + + /* If we are in real mode, do a rfid to get back to the caller */ + mfmsr r4 + andi. r5, r4, MSR_IR + bnelr + rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */ + mtspr SPRN_SRR0, r0 + ld r10, HSTATE_HOST_MSR(r13) + rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG + mtspr SPRN_SRR1, r10 + RFI_TO_KERNEL + b . + secondary_too_late: li r12, 0 stw r12, STACK_SLOT_TRAP(r1) @@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv: std r3, VCPU_GPR(R12)(r9) /* CR is in the high half of r12 */ srdi r4, r12, 32 - stw r4, VCPU_CR(r9) + std r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) std r3, VCPU_CFAR(r9) @@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif + /* Save more register state */ + mfdar r3 + mfdsisr r4 + std r3, VCPU_DAR(r9) + stw r4, VCPU_DSISR(r9) /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE beq kvmppc_hdsi + std r3, VCPU_FAULT_DAR(r9) + stw r4, VCPU_FAULT_DSISR(r9) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* For softpatch interrupt, go off and do TM instruction emulation */ + cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH + beq kvmppc_tm_emul +#endif + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION PPC_MSGSYNC lwsync + /* always exit if we're running a nested guest */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 - beq 4f + beq maybe_reenter_guest b guest_exit_cont 3: /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ @@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 14: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - bne+ guest_exit_cont - - /* External interrupt, first check for host_ipi. If this is - * set, we know the host wants us out so let's do it now - */ - bl kvmppc_read_intr - - /* - * Restore the active volatile registers after returning from - * a C function. - */ - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_EXTERNAL - - /* - * kvmppc_read_intr return codes: - * - * Exit to host (r3 > 0) - * 1 An interrupt is pending that needs to be handled by the host - * Exit guest and return to host by branching to guest_exit_cont - * - * 2 Passthrough that needs completion in the host - * Exit guest and return to host by branching to guest_exit_cont - * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD - * to indicate to the host to complete handling the interrupt - * - * Before returning to guest, we check if any CPU is heading out - * to the host and if so, we head out also. If no CPUs are heading - * check return values <= 0. - * - * Return to guest (r3 <= 0) - * 0 No external interrupt is pending - * -1 A guest wakeup IPI (which has now been cleared) - * In either case, we return to guest to deliver any pending - * guest interrupts. - * - * -2 A PCI passthrough external interrupt was handled - * (interrupt was delivered directly to guest) - * Return to guest to deliver any pending guest interrupts. - */ - - cmpdi r3, 1 - ble 1f - - /* Return code = 2 */ - li r12, BOOK3S_INTERRUPT_HV_RM_HARD - stw r12, VCPU_TRAP(r9) - b guest_exit_cont - -1: /* Return code <= 1 */ - cmpdi r3, 0 - bgt guest_exit_cont - - /* Return code <= 0 */ -4: ld r5, HSTATE_KVM_VCORE(r13) - lwz r0, VCORE_ENTRY_EXIT(r5) - cmpwi r0, 0x100 - mr r4, r9 - blt deliver_guest_interrupt - -guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - + beq kvmppc_guest_external /* See if it is a machine check */ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK beq machine_check_realmode -mc_cont: + /* Or a hypervisor maintenance interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + beq hmi_realmode + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r9, VCPU_TB_RMEXIT mr r4, r9 @@ -1552,6 +1465,11 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ + /* If we came in through the P9 short path, go back out to C now */ + lwz r0, STACK_SLOT_SHORT_PATH(r1) + cmpwi r0, 0 + bne guest_exit_short_path + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) @@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r9 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop ld r9, HSTATE_KVM_VCPU(r13) 91: #endif @@ -1802,83 +1722,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 25: /* Save PMU registers if requested */ /* r8 and cr0.eq are live here */ -BEGIN_FTR_SECTION - /* - * POWER8 seems to have a hardware bug where setting - * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] - * when some counters are already negative doesn't seem - * to cause a performance monitor alert (and hence interrupt). - * The effect of this is that when saving the PMU state, - * if there is no PMU alert pending when we read MMCR0 - * before freezing the counters, but one becomes pending - * before we read the counters, we lose it. - * To work around this, we need a way to freeze the counters - * before reading MMCR0. Normally, freezing the counters - * is done by writing MMCR0 (to set MMCR0[FC]) which - * unavoidably writes MMCR0[PMA0] as well. On POWER8, - * we can also freeze the counters using MMCR2, by writing - * 1s to all the counter freeze condition bits (there are - * 9 bits each for 6 counters). - */ - li r3, -1 /* set all freeze bits */ - clrrdi r3, r3, 10 - mfspr r10, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r4, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r7, 0 - mtspr SPRN_MMCRA, r7 - isync + mr r3, r9 + li r4, 1 beq 21f /* if no VPA, save PMU stuff anyway */ - lbz r7, LPPACA_PMCINUSE(r8) - cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ - bne 21f - std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ - b 22f -21: mfspr r5, SPRN_MMCR1 - mfspr r7, SPRN_SIAR - mfspr r8, SPRN_SDAR - std r4, VCPU_MMCR(r9) - std r5, VCPU_MMCR + 8(r9) - std r6, VCPU_MMCR + 16(r9) -BEGIN_FTR_SECTION - std r10, VCPU_MMCR + 24(r9) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - std r7, VCPU_SIAR(r9) - std r8, VCPU_SDAR(r9) - mfspr r3, SPRN_PMC1 - mfspr r4, SPRN_PMC2 - mfspr r5, SPRN_PMC3 - mfspr r6, SPRN_PMC4 - mfspr r7, SPRN_PMC5 - mfspr r8, SPRN_PMC6 - stw r3, VCPU_PMC(r9) - stw r4, VCPU_PMC + 4(r9) - stw r5, VCPU_PMC + 8(r9) - stw r6, VCPU_PMC + 12(r9) - stw r7, VCPU_PMC + 16(r9) - stw r8, VCPU_PMC + 20(r9) -BEGIN_FTR_SECTION - mfspr r5, SPRN_SIER - std r5, VCPU_SIER(r9) -BEGIN_FTR_SECTION_NESTED(96) - mfspr r6, SPRN_SPMC1 - mfspr r7, SPRN_SPMC2 - mfspr r8, SPRN_MMCRS - stw r6, VCPU_PMC + 24(r9) - stw r7, VCPU_PMC + 28(r9) - std r8, VCPU_MMCR + 32(r9) - lis r4, 0x8000 - mtspr SPRN_MMCRS, r4 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) -22: + lbz r4, LPPACA_PMCINUSE(r8) +21: bl kvmhv_save_guest_pmu + ld r9, HSTATE_KVM_VCPU(r13) /* Restore host values of some registers */ BEGIN_FTR_SECTION @@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - /* If HMI, call kvmppc_realmode_hmi_handler() */ - lwz r12, STACK_SLOT_TRAP(r1) - cmpwi r12, BOOK3S_INTERRUPT_HMI - bne 27f - bl kvmppc_realmode_hmi_handler - nop - cmpdi r3, 0 - /* - * At this point kvmppc_realmode_hmi_handler may have resync-ed - * the TB, and if it has, we must not subtract the guest timebase - * offset from the timebase. So, skip it. - * - * Also, do not call kvmppc_subcore_exit_guest() because it has - * been invoked as part of kvmppc_realmode_hmi_handler(). - */ - beq 30f - -27: /* Subtract timebase offset from timebase */ ld r8, VCORE_TB_OFFSET_APPL(r5) cmpdi r8,0 @@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 -17: bl kvmppc_subcore_exit_guest +17: + /* + * If this is an HMI, we called kvmppc_realmode_hmi_handler + * above, which may or may not have already called + * kvmppc_subcore_exit_guest. Fortunately, all that + * kvmppc_subcore_exit_guest does is clear a flag, so calling + * it again here is benign even if kvmppc_realmode_hmi_handler + * has already called it. + */ + bl kvmppc_subcore_exit_guest nop 30: ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ @@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +kvmppc_guest_external: + /* External interrupt, first check for host_ipi. If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + + /* + * Restore the active volatile registers after returning from + * a C function. + */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_EXTERNAL + + /* + * kvmppc_read_intr return codes: + * + * Exit to host (r3 > 0) + * 1 An interrupt is pending that needs to be handled by the host + * Exit guest and return to host by branching to guest_exit_cont + * + * 2 Passthrough that needs completion in the host + * Exit guest and return to host by branching to guest_exit_cont + * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD + * to indicate to the host to complete handling the interrupt + * + * Before returning to guest, we check if any CPU is heading out + * to the host and if so, we head out also. If no CPUs are heading + * check return values <= 0. + * + * Return to guest (r3 <= 0) + * 0 No external interrupt is pending + * -1 A guest wakeup IPI (which has now been cleared) + * In either case, we return to guest to deliver any pending + * guest interrupts. + * + * -2 A PCI passthrough external interrupt was handled + * (interrupt was delivered directly to guest) + * Return to guest to deliver any pending guest interrupts. + */ + + cmpdi r3, 1 + ble 1f + + /* Return code = 2 */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD + stw r12, VCPU_TRAP(r9) + b guest_exit_cont + +1: /* Return code <= 1 */ + cmpdi r3, 0 + bgt guest_exit_cont + + /* Return code <= 0 */ +maybe_reenter_guest: + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + mr r4, r9 + blt deliver_guest_interrupt + b guest_exit_cont + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Softpatch interrupt for transactional memory emulation cases @@ -2302,6 +2203,10 @@ hcall_try_real_mode: andi. r0,r11,MSR_PR /* sc 1 from userspace - reflect to guest syscall */ bne sc_1_fast_return + /* sc 1 from nested guest - give it to L1 to handle */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont @@ -2561,6 +2466,7 @@ hcall_real_table: hcall_real_table_end: _GLOBAL(kvmppc_h_set_xdabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI @@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr) blr _GLOBAL(kvmppc_h_set_dabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr) li r5, DABRX_USER | DABRX_KERNEL 3: BEGIN_FTR_SECTION @@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ ld r3, HSTATE_KVM_VCPU(r13) ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop 91: #endif @@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif @@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) mr r9, r4 cmpdi r3, 0 bgt guest_exit_cont - - /* see if any other thread is already exiting */ - lwz r0,VCORE_ENTRY_EXIT(r5) - cmpwi r0,0x100 - bge guest_exit_cont - - b kvmppc_cede_reentry /* if not go back to guest */ + b maybe_reenter_guest /* cede when already previously prodded case */ kvm_cede_prodded: @@ -2947,12 +2852,12 @@ machine_check_realmode: */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne mc_cont /* if so, exit to host */ + bne guest_exit_cont /* if so, exit to host */ /* Check if guest is capable of handling NMI exit */ ld r10, VCPU_KVM(r9) lbz r10, KVM_FWNMI(r10) cmpdi r10, 1 /* FWNMI capable? */ - beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ @@ -2966,6 +2871,21 @@ machine_check_realmode: 2: b fast_interrupt_c_return /* + * Call C code to handle a HMI in real mode. + * Only the primary thread does the call, secondary threads are handled + * by calling hmi_exception_realmode() after kvmppc_hv_entry returns. + * r9 points to the vcpu on entry + */ +hmi_realmode: + lbz r0, HSTATE_PTID(r13) + cmpwi r0, 0 + bne guest_exit_cont + bl kvmppc_realmode_hmi_handler + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_HMI + b guest_exit_cont + +/* * Check the reason we woke from nap, and take appropriate action. * Returns (in r3): * 0 if nothing needs to be done @@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * Save transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct and r4 containing * the guest MSR value. - * This can modify all checkpointed registers, but + * r5 is non-zero iff non-volatile register state needs to be maintained. + * If r5 == 0, this can modify all checkpointed registers, but * restores r1 and r2 before exit. */ -kvmppc_save_tm_hv: +_GLOBAL_TOC(kvmppc_save_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv) /* See if we need to handle fake suspend mode */ BEGIN_FTR_SECTION b __kvmppc_save_tm @@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop - std r1, HSTATE_HOST_R1(r13) - - /* Clear the MSR RI since r1, r13 may be foobar. */ - li r5, 0 - mtmsrd r5, 1 - /* We have to treclaim here because that's the only way to do S->N */ li r3, TM_CAUSE_KVM_RESCHED TRECLAIM(R3) @@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) * We were in fake suspend, so we are not going to save the * register state as the guest checkpointed state (since * we already have it), therefore we can now use any volatile GPR. + * In fact treclaim in fake suspend state doesn't modify + * any registers. */ - /* Reload PACA pointer, stack pointer and TOC. */ - GET_PACA(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - HMT_MEDIUM - ld r6, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r6 -BEGIN_FTR_SECTION_NESTED(96) +BEGIN_FTR_SECTION bl pnv_power9_force_smt4_release -END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) +END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop 4: @@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) * Restore transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct * and r4 containing the guest MSR value. + * r5 is non-zero iff non-volatile register state needs to be maintained. * This potentially modifies all checkpointed registers. * It restores r1 and r2 from the PACA. */ -kvmppc_restore_tm_hv: +_GLOBAL_TOC(kvmppc_restore_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv) /* * If we are doing TM emulation for the guest on a POWER9 DD2, * then we don't actually do a trechkpt -- we either set up @@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt: blr /* + * Load up guest PMU state. R3 points to the vcpu struct. + */ +_GLOBAL(kvmhv_load_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu) + mr r4, r3 + mflr r0 + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 +BEGIN_FTR_SECTION_NESTED(96) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 + blr + +/* + * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu. + */ +_GLOBAL(kvmhv_load_host_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu) + mflr r0 + lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR0(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC1(r13) + lwz r4, HSTATE_PMC2(r13) + lwz r5, HSTATE_PMC3(r13) + lwz r6, HSTATE_PMC4(r13) + lwz r8, HSTATE_PMC5(r13) + lwz r9, HSTATE_PMC6(r13) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, HSTATE_MMCR0(r13) + ld r4, HSTATE_MMCR1(r13) + ld r5, HSTATE_MMCRA(r13) + ld r6, HSTATE_SIAR(r13) + ld r7, HSTATE_SDAR(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR2(r13) + ld r9, HSTATE_SIER(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 +23: blr + +/* + * Save guest PMU state into the vcpu struct. + * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA) + */ +_GLOBAL(kvmhv_save_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu) + mr r9, r3 + mr r8, r4 +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMA0] as well. On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters). + */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 + isync + cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + std r5, VCPU_SIER(r9) +BEGIN_FTR_SECTION_NESTED(96) + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: blr + +/* * This works around a hardware bug on POWER8E processors, where * writing a 1 to the MMCR0[PMAO] bit doesn't generate a * performance monitor interrupt. Instead, when we need to have diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index 008285058f9b..888e2609e3f1 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) return RESUME_GUEST; } /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); /* L=1 => tresume, L=0 => tsuspend */ if (instr & (1 << 21)) { @@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_from_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; return RESUME_GUEST; @@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_to_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr = msr | MSR_TS_S; return RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c index b2c7c6fca4f9..3cf5863bc06e 100644 --- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c @@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) if (instr & (1 << 21)) vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; /* Set CR0 to 0b0010 */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | + 0x20000000; return 1; } @@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */ vcpu->arch.regs.nip = vcpu->arch.tfhar; copy_from_checkpoint(vcpu); - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 614ebb4261f7..4efd65d9e828 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu) svcpu->gpr[11] = vcpu->arch.regs.gpr[11]; svcpu->gpr[12] = vcpu->arch.regs.gpr[12]; svcpu->gpr[13] = vcpu->arch.regs.gpr[13]; - svcpu->cr = vcpu->arch.cr; + svcpu->cr = vcpu->arch.regs.ccr; svcpu->xer = vcpu->arch.regs.xer; svcpu->ctr = vcpu->arch.regs.ctr; svcpu->lr = vcpu->arch.regs.link; @@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu) vcpu->arch.regs.gpr[11] = svcpu->gpr[11]; vcpu->arch.regs.gpr[12] = svcpu->gpr[12]; vcpu->arch.regs.gpr[13] = svcpu->gpr[13]; - vcpu->arch.cr = svcpu->cr; + vcpu->arch.regs.ccr = svcpu->cr; vcpu->arch.regs.xer = svcpu->xer; vcpu->arch.regs.ctr = svcpu->ctr; vcpu->arch.regs.link = svcpu->lr; @@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: - case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: case BOOK3S_INTERRUPT_EXTERNAL_HV: case BOOK3S_INTERRUPT_H_VIRT: vcpu->stat.ext_intr_exits++; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index b8356cdc0c04..b0b2bfc2ff51 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, */ if (new.out_ee) { kvmppc_book3s_queue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + BOOK3S_INTERRUPT_EXTERNAL); if (!change_self) kvmppc_fast_vcpu_kick(icp->vcpu); } @@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) u32 xirr; /* First, remove EE from the processor */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * ICP State: Accept_Interrupt @@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * We can remove EE from the current processor, the update * transaction will set it again if needed */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); do { old_state = new_state = READ_ONCE(icp->state); @@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) * Deassert the CPU interrupt request. * icp_try_update will reassert it if necessary. */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * Note that if we displace an interrupt from old_state.xisr, @@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - if (cpu_has_feature(CPU_FTR_ARCH_206)) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + cpu_has_feature(CPU_FTR_HVMODE)) { /* Enable real mode support */ xics->real_mode = ENABLE_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE; diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 30c2eb766954..ad4a370703d3 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -62,6 +62,69 @@ #define XIVE_Q_GAP 2 /* + * Push a vcpu's context to the XIVE on guest entry. + * This assumes we are in virtual mode (MMU on) + */ +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) +{ + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; + u64 pq; + + if (!tima) + return; + eieio(); + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); + vcpu->arch.xive_pushed = 1; + eieio(); + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurious wakeup on the next H_CEDE, which is not an + * issue. + */ + vcpu->arch.irq_pending = 0; + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + if (vcpu->arch.xive_esc_on) { + pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + + XIVE_ESB_SET_PQ_01)); + mb(); + + /* + * We have a possible subtle race here: The escalation + * interrupt might have fired and be on its way to the + * host queue while we mask it, and if we unmask it + * early enough (re-cede right away), there is a + * theorical possibility that it fires again, thus + * landing in the target queue more than once which is + * a big no-no. + * + * Fortunately, solving this is rather easy. If the + * above load setting PQ to 01 returns a previous + * value where P is set, then we know the escalation + * interrupt is somewhere on its way to the host. In + * that case we simply don't clear the xive_esc_on + * flag below. It will be eventually cleared by the + * handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again + * before re-enabling the escalation interrupt, and if + * set, we abort the cede. + */ + if (!(pq & XIVE_ESB_VAL_P)) + /* Now P is 0, we can clear the flag */ + vcpu->arch.xive_esc_on = 0; + } +} +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); + +/* * This is a simple trigger for a generic XIVE IRQ. This must * only be called for interrupts that support a trigger page */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 4171ede8722b..033363d6e764 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) /* First collect pending bits from HW */ GLUE(X_PFX,ack_pending)(xc); - /* - * Cleanup the old-style bits if needed (they may have been - * set by pull or an escalation interrupts). - */ - if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); - pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", xc->pending, xc->hw_cppr, xc->cppr); diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 81bd8a07aa51..051af7d97327 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -182,7 +182,7 @@ */ PPC_LL r4, PACACURRENT(r13) PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) - stw r10, VCPU_CR(r4) + PPC_STL r10, VCPU_CR(r4) PPC_STL r11, VCPU_GPR(R4)(r4) PPC_STL r5, VCPU_GPR(R5)(r4) PPC_STL r6, VCPU_GPR(R6)(r4) @@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, THREAD_NORMSAVE(0)(r10) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r13, VCPU_CR(r11) + PPC_STL r13, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R10)(r11) PPC_LL r3, THREAD_NORMSAVE(2)(r10) @@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, GPR9(r8) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r9, VCPU_CR(r11) + PPC_STL r9, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R8)(r11) PPC_LL r3, GPR10(r8) @@ -643,7 +643,7 @@ lightweight_exit: PPC_LL r3, VCPU_LR(r4) PPC_LL r5, VCPU_XER(r4) PPC_LL r6, VCPU_CTR(r4) - lwz r7, VCPU_CR(r4) + PPC_LL r7, VCPU_CR(r4) PPC_LL r8, VCPU_PC(r4) PPC_LD(r9, VCPU_SHARED_MSR, r11) PPC_LL r0, VCPU_GPR(R0)(r4) diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 75dce1ef3bc8..f91b1309a0a8 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = EMULATE_FAIL; vcpu->arch.regs.msr = vcpu->arch.shared->msr; - vcpu->arch.regs.ccr = vcpu->arch.cr; if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { int type = op.type & INSTR_TYPE_MASK; int size = GETSIZE(op.type); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index eba5756d5b41..2869a299c4ed 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) && + cpu_has_feature(CPU_FTR_HVMODE)); + break; + case KVM_CAP_PPC_NESTED_HV: + r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && + !kvmppc_hv_ops->enable_nested(NULL)); break; #endif case KVM_CAP_SYNC_MMU: @@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); break; } + + case KVM_CAP_PPC_NESTED_HV: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(kvm) || + !kvm->arch.kvm_ops->enable_nested) + break; + r = kvm->arch.kvm_ops->enable_nested(kvm); + break; #endif default: r = -EINVAL; diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S index 90e330f21356..0531a1492fdf 100644 --- a/arch/powerpc/kvm/tm.S +++ b/arch/powerpc/kvm/tm.S @@ -28,17 +28,25 @@ * Save transactional state and TM-related registers. * Called with: * - r3 pointing to the vcpu struct - * - r4 points to the MSR with current TS bits: + * - r4 containing the MSR with current TS bits: * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR). - * This can modify all checkpointed registers, but - * restores r1, r2 before exit. + * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this can modify all checkpointed registers, but + * restores r1, r2 before exit. If r5 != 0, this restores the + * MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_save_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + stdu r1, -SWITCH_FRAME_SIZE(r1) + + mr r9, r3 + cmpdi cr7, r5, 0 /* Turn on TM. */ mfmsr r8 + mr r10, r8 li r0, 1 rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG ori r8, r8, MSR_FP @@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm) std r1, HSTATE_SCRATCH2(r13) std r3, HSTATE_SCRATCH1(r13) + /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */ + mfcr r6 + SAVE_GPR(6, r1) + + /* Save DSCR so we can restore it to avoid running with user value */ + mfspr r7, SPRN_DSCR + SAVE_GPR(7, r1) + + /* + * We are going to do treclaim., which will modify all checkpointed + * registers. Save the non-volatile registers on the stack if + * preservation of non-volatile state has been requested. + */ + beq cr7, 3f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */ + li r0, 0 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +3: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE BEGIN_FTR_SECTION /* Emulation of the treclaim instruction needs TEXASR before treclaim */ @@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) std r9, PACATMSCRATCH(r13) ld r9, HSTATE_SCRATCH1(r13) - /* Get a few more GPRs free. */ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR + /* Save away PPR soon so we don't run with user value. */ + std r0, VCPU_GPRS_TM(0)(r9) + mfspr r0, SPRN_PPR HMT_MEDIUM - mfspr r30, SPRN_DSCR -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif - /* Save all but r9, r13 & r29-r31 */ - reg = 0 + /* Reload stack pointer. */ + std r1, VCPU_GPRS_TM(1)(r9) + ld r1, HSTATE_SCRATCH2(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + std r2, VCPU_GPRS_TM(2)(r9) + li r2, MSR_RI + mtmsrd r2, 1 + + /* Reload TOC pointer. */ + ld r2, PACATOC(r13) + + /* Save all but r0-r2, r9 & r13 */ + reg = 3 .rept 29 .if (reg != 9) && (reg != 13) std reg, VCPU_GPRS_TM(reg)(r9) @@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ld r4, PACATMSCRATCH(r13) std r4, VCPU_GPRS_TM(9)(r9) - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 + /* Restore host DSCR and CR values, after saving guest values */ + mfcr r6 + mfspr r7, SPRN_DSCR + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_DSCR_TM(r9) + REST_GPR(6, r1) + REST_GPR(7, r1) + mtcr r6 + mtspr SPRN_DSCR, r7 - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) + /* Save away checkpointed SPRs. */ + std r0, VCPU_PPR_TM(r9) mflr r5 - mfcr r6 mfctr r7 mfspr r8, SPRN_AMR mfspr r10, SPRN_TAR mfxer r11 std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) std r7, VCPU_CTR_TM(r9) std r8, VCPU_AMR_TM(r9) std r10, VCPU_TAR_TM(r9) std r11, VCPU_XER_TM(r9) - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - /* Save FP/VSX. */ addi r3, r9, VCPU_FPRS_TM bl store_fp_state @@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) bl store_vr_state mfspr r6, SPRN_VRSAVE stw r6, VCPU_VRSAVE_TM(r9) + + /* Restore non-volatile registers if requested to */ + beq cr7, 1f + REST_NVGPRS(r1) + REST_GPR(10, r1) 1: /* * We need to save these SPRs after the treclaim so that the software @@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) */ mfspr r7, SPRN_TEXASR std r7, VCPU_TEXASR(r9) -11: mfspr r5, SPRN_TFHAR mfspr r6, SPRN_TFIAR std r5, VCPU_TFHAR(r9) std r6, VCPU_TFIAR(r9) + /* Restore MSR state if requested */ + beq cr7, 2f + mtmsrd r10, 0 +2: + addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr @@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) * be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_save_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR since TM/math bits might be impacted - * by __kvmppc_save_tm(). - */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 /* preserve non-volatile registers */ bl __kvmppc_save_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); @@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); * - r4 is the guest MSR with desired TS bits: * For HV KVM, it is VCPU_MSR * For PR KVM, it is provided by caller - * This potentially modifies all checkpointed registers. - * It restores r1, r2 from the PACA. + * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this potentially modifies all checkpointed registers, but + * restores r1, r2 from the PACA before exit. + * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_restore_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + cmpdi cr7, r5, 0 + /* Turn on TM/FP/VSX/VMX so we can restore them. */ mfmsr r5 + mr r10, r5 li r6, MSR_TM >> 32 sldi r6, r6, 32 or r5, r5, r6 @@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm) mr r5, r4 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beqlr /* TM not active in guest */ - std r1, HSTATE_SCRATCH2(r13) + beq 9f /* TM not active in guest */ /* Make sure the failure summary is set, otherwise we'll program check * when we trechkpt. It's possible that this might have been not set @@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm) mtspr SPRN_TEXASR, r7 /* + * Make a stack frame and save non-volatile registers if requested. + */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + std r1, HSTATE_SCRATCH2(r13) + + mfcr r6 + mfspr r7, SPRN_DSCR + SAVE_GPR(2, r1) + SAVE_GPR(6, r1) + SAVE_GPR(7, r1) + + beq cr7, 4f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 1 (suspended) once we do trechkpt */ + li r0, 1 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +4: + /* * We need to load up the checkpointed state for the guest. * We need to do this early as it will blow away any GPRs, VSRs and * some SPRs. @@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm) ld r29, VCPU_DSCR_TM(r3) ld r30, VCPU_PPR_TM(r3) - std r2, PACATMSCRATCH(r13) /* Save TOC */ - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ li r5, 0 mtmsrd r5, 1 @@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm) /* Now let's get back the state we need. */ HMT_MEDIUM GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATMSCRATCH(r13) + REST_GPR(7, r1) + mtspr SPRN_DSCR, r7 /* Set the MSR RI since we have our registers back. */ li r5, MSR_RI mtmsrd r5, 1 + + /* Restore TOC pointer and CR */ + REST_GPR(2, r1) + REST_GPR(6, r1) + mtcr r6 + + /* Restore non-volatile registers if requested to. */ + beq cr7, 5f + REST_GPR(10, r1) + REST_NVGPRS(r1) + +5: addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 + +9: /* Restore MSR bits if requested */ + beqlr cr7 + mtmsrd r10, 0 blr /* @@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm) * can be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_restore_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR to avoid TM/math bits change */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + /* save TAR so that it can be recovered later */ mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 bl __kvmppc_restore_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr); diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h index f3b23759e017..372a82fa2de3 100644 --- a/arch/powerpc/kvm/trace_book3s.h +++ b/arch/powerpc/kvm/trace_book3s.h @@ -14,7 +14,6 @@ {0x400, "INST_STORAGE"}, \ {0x480, "INST_SEGMENT"}, \ {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ {0x502, "EXTERNAL_HV"}, \ {0x600, "ALIGNMENT"}, \ {0x700, "PROGRAM"}, \ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 51ce091914f9..7a9886f98b0c 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -308,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr, { } -/* - * We do not have access to the sparsemem vmemmap, so we fallback to - * walking the list of sparsemem blocks which we already maintain for - * the sake of crashdump. In the long run, we might want to maintain - * a tree if performance of that linear walk becomes a problem. - * - * realmode_pfn_to_page functions can fail due to: - * 1) As real sparsemem blocks do not lay in RAM continously (they - * are in virtual address space which is not available in the real mode), - * the requested page struct can be split between blocks so get_page/put_page - * may fail. - * 2) When huge pages are used, the get_page/put_page API will fail - * in real mode as the linked addresses in the page struct are virtual - * too. - */ -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct vmemmap_backing *vmem_back; - struct page *page; - unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; - unsigned long pg_va = (unsigned long) pfn_to_page(pfn); - - for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { - if (pg_va < vmem_back->virt_addr) - continue; - - /* After vmemmap_list entry free is possible, need check all */ - if ((pg_va + sizeof(struct page)) <= - (vmem_back->virt_addr + page_size)) { - page = (struct page *) (vmem_back->phys + pg_va - - vmem_back->virt_addr); - return page; - } - } - - /* Probably that page struct is split between real pages */ - return NULL; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - -#else - -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - return page; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index c9ee9e23845f..56c2234cc6ae 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -18,11 +18,15 @@ #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/swap.h> +#include <linux/sizes.h> #include <asm/mmu_context.h> #include <asm/pte-walk.h> static DEFINE_MUTEX(mem_list_mutex); +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + struct mm_iommu_table_group_mem_t { struct list_head next; struct rcu_head rcu; @@ -263,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) if (!page) continue; + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + put_page(page); mem->hpas[i] = 0; } @@ -360,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries) @@ -390,7 +396,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, if (pageshift > mem->pageshift) return -EFAULT; - *hpa = *va | (ua & ~PAGE_MASK); + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } @@ -413,11 +419,31 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, if (!pa) return -EFAULT; - *hpa = *pa | (ua & ~PAGE_MASK); + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index fef3e1eb3a19..4c4dfc473800 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ +void radix__flush_tlb_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ void radix__local_flush_tlb_lpid(unsigned int lpid) { _tlbiel_lpid(lpid, RIC_FLUSH_ALL); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2ea3e798a0bd..fe24150ff666 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -482,7 +482,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_S390_HPAGE_1M: r = 0; - if (hpage) + if (hpage && !kvm_is_ucontrol(kvm)) r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -692,7 +692,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; - else if (!hpage || kvm->arch.use_cmma) + else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm)) r = -EINVAL; else { r = 0; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index d4fa0a4514e0..1e668b95e0c6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -708,11 +708,13 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) vmaddr |= gaddr & ~PMD_MASK; /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); + if (!vma) + continue; /* * We do not discard pages that are backed by * hugetlbfs, so we don't have to refault them. */ - if (vma && is_vm_hugetlb_page(vma)) + if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index acd11b3bf639..2a356b948720 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 2071c3d1ae07..dbe8bb980da1 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index b5f2a8fd5a71..8bebda2de92f 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || !boot_cpu_has(X86_FEATURE_AES) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 95cf857d2cbb..f40244eaf14d 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); static int __init crypto_morus1280_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 615fb7bc9a32..9afaf8f8565a 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); static int __init crypto_morus640_sse2_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5b0f613428c2..2c43e3055948 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -95,8 +95,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) { - struct ipi_arg_ex **arg; - struct ipi_arg_ex *ipi_arg; + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; int ret = 1; @@ -105,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) return false; local_irq_save(flags); - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); ipi_arg = *arg; if (unlikely(!ipi_arg)) @@ -135,7 +135,7 @@ ipi_mask_ex_done: static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; - struct ipi_arg_non_ex ipi_arg; + struct hv_send_ipi ipi_arg; int ret = 1; trace_hyperv_send_ipi_mask(mask, vector); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e977b6b3a538..00e01d215f74 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -726,19 +726,21 @@ struct hv_enlightened_vmcs { #define HV_STIMER_AUTOENABLE (1ULL << 3) #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) -struct ipi_arg_non_ex { - u32 vector; - u32 reserved; - u64 cpu_mask; -}; - struct hv_vpset { u64 format; u64 valid_bank_mask; u64 bank_contents[]; }; -struct ipi_arg_ex { +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { u32 vector; u32 reserved; struct hv_vpset vp_set; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e90488c3d56..09b2e3e2cf1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -869,6 +869,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; }; struct kvm_vm_stat { @@ -1022,6 +1024,7 @@ struct kvm_x86_ops { void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); @@ -1055,6 +1058,7 @@ struct kvm_x86_ops { bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1482,6 +1486,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..fd23d5778ea1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -377,6 +377,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 17c0472c5b34..fbb0e6df121b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1344,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return kvm_apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; + return addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, @@ -1358,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + memset(data, 0xff, len); + return 0; + } + kvm_lapic_reg_read(apic, offset, len, data); return 0; @@ -1917,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + return 0; + } + /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e24ea7067373..51b953ad9d4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + + static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | - shadow_nonpresent_or_rsvd_mask; - u64 gpa = spte & ~mask; + u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) & shadow_nonpresent_or_rsvd_mask; @@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static void kvm_mmu_reset_all_pte_masks(void) { + u8 low_phys_bits; + shadow_user_mask = 0; shadow_accessed_mask = 0; shadow_dirty_mask = 0; @@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void) * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. */ + low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_data.x86_phys_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) + 52 - shadow_nonpresent_or_rsvd_mask_len) { shadow_nonpresent_or_rsvd_mask = rsvd_bits(boot_cpu_data.x86_phys_bits - shadow_nonpresent_or_rsvd_mask_len, boot_cpu_data.x86_phys_bits - 1); + low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; + } + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } static int is_cpuid_PSE36(void) @@ -899,7 +915,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { /* * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_commit_zap_page() can see us + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); @@ -5417,7 +5433,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - kvm_init_mmu(vcpu, true); + /* + * kvm_mmu_setup() is called only on vCPU initialization. + * Therefore, no need to reset mmu roots as they are not yet + * initialized. + */ + kvm_init_mmu(vcpu, false); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 89c4c5aa15f1..d96092b35936 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1226,8 +1226,7 @@ static __init int sev_hardware_setup(void) min_sev_asid = cpuid_edx(0x8000001F); /* Initialize SEV ASID bitmap */ - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), - sizeof(unsigned long), GFP_KERNEL); + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; @@ -1405,7 +1404,7 @@ static __exit void svm_hardware_unsetup(void) int cpu; if (svm_sev_enabled()) - kfree(sev_asid_bitmap); + bitmap_free(sev_asid_bitmap); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -7149,6 +7148,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + .request_immediate_exit = __kvm_request_immediate_exit, + .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 533a327372c8..612fd17be635 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#define MSR_BITMAP_MODE_LM 4 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL @@ -397,6 +396,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -856,6 +856,7 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; u16 vpid02; u16 last_vpid; @@ -1019,6 +1020,8 @@ struct vcpu_vmx { int ple_window; bool ple_window_dirty; + bool req_immediate_exit; + /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -2864,6 +2867,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; + vmx->req_immediate_exit = false; + if (vmx->loaded_cpu_state) return; @@ -2894,8 +2899,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -2946,8 +2950,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -2975,24 +2978,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, - vmx->msr_guest_kernel_gs_base); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); return vmx->msr_guest_kernel_gs_base; } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif @@ -3528,9 +3526,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (kvm_mpx_supported()) - msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; @@ -3547,8 +3542,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_ENTRY_LOAD_IA32_PAT; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (kvm_mpx_supported()) - msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -3596,12 +3589,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high); msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. @@ -3658,6 +3651,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, @@ -5068,19 +5065,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!msr) return; - /* - * MSR_KERNEL_GS_BASE is not intercepted when the guest is in - * 64-bit mode as a 64-bit kernel may frequently access the - * MSR. This means we need to manually save/restore the MSR - * when switching between guest and host state, but only if - * the guest is in 64-bit mode. Sync our cached value if the - * guest is transitioning to 32-bit mode and the CPU contains - * guest state, i.e. the cache is stale. - */ -#ifdef CONFIG_X86_64 - if (!(efer & EFER_LMA)) - (void)vmx_read_guest_kernel_gs_base(vmx); -#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -5393,9 +5377,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * To use VMXON (and later other VMX instructions), a guest * must first be able to turn on cr4.VMXE (see handle_vmon()). * So basically the check on whether to allow nested VMX - * is here. + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. */ - if (!nested_vmx_allowed(vcpu)) + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) return 1; } @@ -6072,9 +6057,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } - if (is_long_mode(vcpu)) - mode |= MSR_BITMAP_MODE_LM; - return mode; } @@ -6115,9 +6097,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) if (!changed) return; - vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, - !(mode & MSR_BITMAP_MODE_LM)); - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); @@ -6183,6 +6162,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmx_get_rvi(); + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -7966,6 +7971,9 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { u64 vmx_msr; @@ -9208,7 +9216,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - kvm_lapic_expired_hv_timer(vcpu); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -10214,15 +10223,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -10344,6 +10354,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -10595,24 +10613,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->hv_deadline_tsc == -1) + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); return; + } - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -10672,7 +10709,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_arm_hv_timer(vcpu); + vmx_update_hv_timer(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -11214,6 +11251,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); + + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } + } +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11230,8 +11284,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (nested_vmx_allowed(vcpu)) + if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -11427,16 +11483,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. */ - if (preemption_timeout <= 1) { + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } + if (vcpu->arch.virtual_tsc_khz == 0) + return; + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); @@ -11646,11 +11704,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (!nested_cpu_has_vid(vmcs12) || !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -11993,8 +12055,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (vmx_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12076,11 +12143,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -12318,6 +12384,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -12537,15 +12606,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; - u32 vmcs01_cpu_exec_ctrl; + bool evaluate_pending_interrupts; int r = 0; - vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); @@ -12585,16 +12660,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) * to L1 or delivered directly to L2 (e.g. In case L1 don't * intercept EXTERNAL_INTERRUPT). * - * Usually this would be handled by L0 requesting a - * IRQ/NMI window by setting VMCS accordingly. However, - * this setting was done on VMCS01 and now VMCS02 is active - * instead. Thus, we force L0 to perform pending event - * evaluation by requesting a KVM_REQ_EVENT. - */ - if (vmcs01_cpu_exec_ctrl & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + */ + if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); - } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -12863,6 +12936,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = @@ -13253,12 +13331,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -13462,18 +13535,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif @@ -13954,6 +14021,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; @@ -14097,6 +14172,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, @@ -14130,6 +14206,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 542f6315444d..ca717737347e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu)) + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; @@ -2780,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated && + !vcpu->kvm->arch.guest_can_read_msr_platform_info) + return 1; msr_info->data = vcpu->arch.msr_platform_info; break; case MSR_MISC_FEATURES_ENABLES: @@ -2927,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_GET_MSR_FEATURES: + case KVM_CAP_MSR_PLATFORM_INFO: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4007,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + r = -EFAULT; if (get_user(user_data_size, &user_kvm_nested_state->size)) - return -EFAULT; + break; r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) - return r; + break; if (r > user_data_size) { if (put_user(r, &user_kvm_nested_state->size)) - return -EFAULT; - return -E2BIG; + r = -EFAULT; + else + r = -E2BIG; + break; } + r = 0; break; } @@ -4031,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!kvm_x86_ops->set_nested_state) break; + r = -EFAULT; if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) - return -EFAULT; + break; + r = -EINVAL; if (kvm_state.size < sizeof(kvm_state)) - return -EINVAL; + break; if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; + break; /* nested_run_pending implies guest_mode. */ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) - return -EINVAL; + break; r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); break; @@ -4350,6 +4359,10 @@ split_irqchip_unlock: kvm->arch.pause_in_guest = true; r = 0; break; + case KVM_CAP_MSR_PLATFORM_INFO: + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -4685,7 +4698,7 @@ static void kvm_init_msr_list(void) */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) continue; break; case MSR_TSC_AUX: @@ -7361,6 +7374,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + smp_send_reschedule(vcpu->cpu); +} +EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); + /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7565,7 +7584,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - smp_send_reschedule(vcpu->cpu); + kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); @@ -7829,6 +7848,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); + preempt_enable(); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. */ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -8177,7 +8219,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -8406,29 +8448,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -/* Swap (qemu) user FPU context for the guest FPU context. */ -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, - ~XFEATURE_MASK_PKRU); - preempt_enable(); - trace_kvm_fpu(1); -} - -/* When vcpu_run ends, restore user space FPU context. */ -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); - preempt_enable(); - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); -} - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; @@ -8852,6 +8871,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); + kvm->arch.guest_can_read_msr_platform_info = true; + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); @@ -9200,6 +9221,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_page_track_flush_slot(kvm, slot); } +static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + return (is_guest_mode(vcpu) && + kvm_x86_ops->guest_apic_has_interrupt && + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); +} + static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) @@ -9224,7 +9252,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) + (kvm_cpu_has_interrupt(vcpu) || + kvm_guest_apic_has_interrupt(vcpu))) return true; if (kvm_hv_has_stimer_pending(vcpu)) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 599e01bcdef2..a9dd4ea7467d 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5359,10 +5359,20 @@ void ata_qc_complete(struct ata_queued_cmd *qc) */ int ata_qc_complete_multiple(struct ata_port *ap, u64 qc_active) { + u64 done_mask, ap_qc_active = ap->qc_active; int nr_done = 0; - u64 done_mask; - done_mask = ap->qc_active ^ qc_active; + /* + * If the internal tag is set on ap->qc_active, then we care about + * bit0 on the passed in qc_active mask. Move that bit up to match + * the internal tag. + */ + if (ap_qc_active & (1ULL << ATA_TAG_INTERNAL)) { + qc_active |= (qc_active & 0x01) << ATA_TAG_INTERNAL; + qc_active ^= qc_active & 0x01; + } + + done_mask = ap_qc_active ^ qc_active; if (unlikely(done_mask & qc_active)) { ata_port_err(ap, "illegal qc_active transition (%08llx->%08llx)\n", diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 48f622728ce6..f2b6f4da1034 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3467,6 +3467,9 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int (struct floppy_struct **)&outparam); if (ret) return ret; + memcpy(&inparam.g, outparam, + offsetof(struct floppy_struct, name)); + outparam = &inparam.g; break; case FDMSGON: UDP->flags |= FTD_MSG; diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 963bb0309e25..ea6238ed5c0e 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -543,6 +543,8 @@ static void hci_uart_tty_close(struct tty_struct *tty) } clear_bit(HCI_UART_PROTO_SET, &hu->flags); + percpu_free_rwsem(&hu->proto_lock); + kfree(hu); } diff --git a/drivers/clk/x86/clk-pmc-atom.c b/drivers/clk/x86/clk-pmc-atom.c index 08ef69945ffb..d977193842df 100644 --- a/drivers/clk/x86/clk-pmc-atom.c +++ b/drivers/clk/x86/clk-pmc-atom.c @@ -55,6 +55,7 @@ struct clk_plt_data { u8 nparents; struct clk_plt *clks[PMC_CLK_NUM]; struct clk_lookup *mclk_lookup; + struct clk_lookup *ether_clk_lookup; }; /* Return an index in parent table */ @@ -186,13 +187,6 @@ static struct clk_plt *plt_clk_register(struct platform_device *pdev, int id, pclk->reg = base + PMC_CLK_CTL_OFFSET + id * PMC_CLK_CTL_SIZE; spin_lock_init(&pclk->lock); - /* - * If the clock was already enabled by the firmware mark it as critical - * to avoid it being gated by the clock framework if no driver owns it. - */ - if (plt_clk_is_enabled(&pclk->hw)) - init.flags |= CLK_IS_CRITICAL; - ret = devm_clk_hw_register(&pdev->dev, &pclk->hw); if (ret) { pclk = ERR_PTR(ret); @@ -351,11 +345,20 @@ static int plt_clk_probe(struct platform_device *pdev) goto err_unreg_clk_plt; } + data->ether_clk_lookup = clkdev_hw_create(&data->clks[4]->hw, + "ether_clk", NULL); + if (!data->ether_clk_lookup) { + err = -ENOMEM; + goto err_drop_mclk; + } + plt_clk_free_parent_names_loop(parent_names, data->nparents); platform_set_drvdata(pdev, data); return 0; +err_drop_mclk: + clkdev_drop(data->mclk_lookup); err_unreg_clk_plt: plt_clk_unregister_loop(data, i); plt_clk_unregister_parents(data); @@ -369,6 +372,7 @@ static int plt_clk_remove(struct platform_device *pdev) data = platform_get_drvdata(pdev); + clkdev_drop(data->ether_clk_lookup); clkdev_drop(data->mclk_lookup); plt_clk_unregister_loop(data, PMC_CLK_NUM); plt_clk_unregister_parents(data); diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 218739b961fe..72790d88236d 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -38,6 +38,17 @@ static DEFINE_MUTEX(sev_cmd_mutex); static struct sev_misc_dev *misc_dev; static struct psp_device *psp_master; +static int psp_cmd_timeout = 100; +module_param(psp_cmd_timeout, int, 0644); +MODULE_PARM_DESC(psp_cmd_timeout, " default timeout value, in seconds, for PSP commands"); + +static int psp_probe_timeout = 5; +module_param(psp_probe_timeout, int, 0644); +MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during PSP device probe"); + +static bool psp_dead; +static int psp_timeout; + static struct psp_device *psp_alloc_struct(struct sp_device *sp) { struct device *dev = sp->dev; @@ -82,10 +93,19 @@ done: return IRQ_HANDLED; } -static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg) +static int sev_wait_cmd_ioc(struct psp_device *psp, + unsigned int *reg, unsigned int timeout) { - wait_event(psp->sev_int_queue, psp->sev_int_rcvd); + int ret; + + ret = wait_event_timeout(psp->sev_int_queue, + psp->sev_int_rcvd, timeout * HZ); + if (!ret) + return -ETIMEDOUT; + *reg = ioread32(psp->io_regs + psp->vdata->cmdresp_reg); + + return 0; } static int sev_cmd_buffer_len(int cmd) @@ -133,12 +153,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) if (!psp) return -ENODEV; + if (psp_dead) + return -EBUSY; + /* Get the physical address of the command buffer */ phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0; - dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n", - cmd, phys_msb, phys_lsb); + dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + cmd, phys_msb, phys_lsb, psp_timeout); print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); @@ -154,7 +177,18 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) iowrite32(reg, psp->io_regs + psp->vdata->cmdresp_reg); /* wait for command completion */ - sev_wait_cmd_ioc(psp, ®); + ret = sev_wait_cmd_ioc(psp, ®, psp_timeout); + if (ret) { + if (psp_ret) + *psp_ret = 0; + + dev_err(psp->dev, "sev command %#x timed out, disabling PSP \n", cmd); + psp_dead = true; + + return ret; + } + + psp_timeout = psp_cmd_timeout; if (psp_ret) *psp_ret = reg & PSP_CMDRESP_ERR_MASK; @@ -888,6 +922,8 @@ void psp_pci_init(void) psp_master = sp->psp_data; + psp_timeout = psp_probe_timeout; + if (sev_get_api_version()) goto err; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f8bbbb3a9504..0c791e35acf0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -272,7 +272,7 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd) int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr) + void **cpu_ptr, bool mqd_gfx9) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_bo *bo = NULL; @@ -287,6 +287,10 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC; bp.type = ttm_bo_type_kernel; bp.resv = NULL; + + if (mqd_gfx9) + bp.flags |= AMDGPU_GEM_CREATE_MQD_GFX9; + r = amdgpu_bo_create(adev, &bp, &bo); if (r) { dev_err(adev->dev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 2f379c183ed2..cc9aeab5468c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -136,7 +136,7 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd); /* Shared API */ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr); + void **cpu_ptr, bool mqd_gfx9); void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); void get_local_mem_info(struct kgd_dev *kgd, struct kfd_local_mem_info *mem_info); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index ea3f698aef5e..9803b91f3e77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -685,7 +685,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) break; if (time_after(jiffies, end_jiffies)) return -ETIME; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c index 693ec5ea4950..8816c697b205 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c @@ -367,12 +367,14 @@ static int amdgpu_cgs_get_firmware_info(struct cgs_device *cgs_device, break; case CHIP_POLARIS10: if (type == CGS_UCODE_ID_SMU) { - if ((adev->pdev->device == 0x67df) && - ((adev->pdev->revision == 0xe0) || - (adev->pdev->revision == 0xe3) || - (adev->pdev->revision == 0xe4) || - (adev->pdev->revision == 0xe5) || - (adev->pdev->revision == 0xe7) || + if (((adev->pdev->device == 0x67df) && + ((adev->pdev->revision == 0xe0) || + (adev->pdev->revision == 0xe3) || + (adev->pdev->revision == 0xe4) || + (adev->pdev->revision == 0xe5) || + (adev->pdev->revision == 0xe7) || + (adev->pdev->revision == 0xef))) || + ((adev->pdev->device == 0x6fdf) && (adev->pdev->revision == 0xef))) { info->is_kicker = true; strcpy(fw_name, "amdgpu/polaris10_k_smc.bin"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8843a06360fa..0f41d8647376 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -740,6 +740,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x67CA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, {0x1002, 0x67CF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, + {0x1002, 0x6FDF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10}, /* Polaris12 */ {0x1002, 0x6980, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 1b048715ab8a..29ac74f40dce 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -457,7 +457,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd->kfd2kgd->init_gtt_mem_allocation( kfd->kgd, size, &kfd->gtt_mem, - &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, + false)) { dev_err(kfd_device, "Could not allocate %d bytes\n", size); goto out; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 7a61f38c09e6..01494752c36a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -62,9 +62,20 @@ int kfd_iommu_device_init(struct kfd_dev *kfd) struct amd_iommu_device_info iommu_info; unsigned int pasid_limit; int err; + struct kfd_topology_device *top_dev; - if (!kfd->device_info->needs_iommu_device) + top_dev = kfd_topology_device_by_id(kfd->id); + + /* + * Overwrite ATS capability according to needs_iommu_device to fix + * potential missing corresponding bit in CRAT of BIOS. + */ + if (!kfd->device_info->needs_iommu_device) { + top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; return 0; + } + + top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT; iommu_info.flags = 0; err = amd_iommu_device_info(kfd->pdev, &iommu_info); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index f5fc3675f21e..0cedb37cf513 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -88,7 +88,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), &((*mqd_mem_obj)->gtt_mem), &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr)); + (void *)&((*mqd_mem_obj)->cpu_ptr), true); } else retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), mqd_mem_obj); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index f971710f1c91..92b285ca73aa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -806,6 +806,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu); struct kfd_topology_device *kfd_topology_device_by_proximity_domain( uint32_t proximity_domain); +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index bc95d4dfee2e..80f5db4ef75f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -63,22 +63,33 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( return device; } -struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id) { - struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct kfd_topology_device *top_dev = NULL; + struct kfd_topology_device *ret = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) if (top_dev->gpu_id == gpu_id) { - device = top_dev->gpu; + ret = top_dev; break; } up_read(&topology_lock); - return device; + return ret; +} + +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev; + + top_dev = kfd_topology_device_by_id(gpu_id); + if (!top_dev) + return NULL; + + return top_dev->gpu; } struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 14391b06080c..43b82e14007e 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -292,7 +292,7 @@ struct tile_config { struct kfd2kgd_calls { int (*init_gtt_mem_allocation)(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr); + void **cpu_ptr, bool mqd_gfx9); void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 3eb061e11e2e..018fcdb353d2 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -2067,7 +2067,7 @@ static void __drm_state_dump(struct drm_device *dev, struct drm_printer *p, struct drm_connector *connector; struct drm_connector_list_iter conn_iter; - if (!drm_core_check_feature(dev, DRIVER_ATOMIC)) + if (!drm_drv_uses_atomic_modeset(dev)) return; list_for_each_entry(plane, &config->plane_list, head) { diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c index 6f28fe58f169..373bd4c2b698 100644 --- a/drivers/gpu/drm/drm_debugfs.c +++ b/drivers/gpu/drm/drm_debugfs.c @@ -151,7 +151,7 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id, return ret; } - if (drm_core_check_feature(dev, DRIVER_ATOMIC)) { + if (drm_drv_uses_atomic_modeset(dev)) { ret = drm_atomic_debugfs_init(minor); if (ret) { DRM_ERROR("Failed to create atomic debugfs files\n"); diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 4b0dd20bccb8..16ec93b75dbf 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -2370,7 +2370,6 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper, { int c, o; struct drm_connector *connector; - const struct drm_connector_helper_funcs *connector_funcs; int my_score, best_score, score; struct drm_fb_helper_crtc **crtcs, *crtc; struct drm_fb_helper_connector *fb_helper_conn; @@ -2399,8 +2398,6 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper, if (drm_has_preferred_mode(fb_helper_conn, width, height)) my_score++; - connector_funcs = connector->helper_private; - /* * select a crtc for this connector and then attempt to configure * remaining connectors diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 72afa518edd9..94c1089ecf59 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -3210,6 +3210,7 @@ static int init_bxt_mmio_info(struct intel_gvt *gvt) MMIO_D(BXT_DSI_PLL_ENABLE, D_BXT); MMIO_D(GEN9_CLKGATE_DIS_0, D_BXT); + MMIO_D(GEN9_CLKGATE_DIS_4, D_BXT); MMIO_D(HSW_TVIDEO_DIP_GCP(TRANSCODER_A), D_BXT); MMIO_D(HSW_TVIDEO_DIP_GCP(TRANSCODER_B), D_BXT); diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index c7afee37b2b8..9ad89e38f6c0 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1833,6 +1833,8 @@ static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn) { struct kvmgt_guest_info *info; struct kvm *kvm; + int idx; + bool ret; if (!handle_valid(handle)) return false; @@ -1840,8 +1842,11 @@ static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn) info = (struct kvmgt_guest_info *)handle; kvm = info->kvm; - return kvm_is_visible_gfn(kvm, gfn); + idx = srcu_read_lock(&kvm->srcu); + ret = kvm_is_visible_gfn(kvm, gfn); + srcu_read_unlock(&kvm->srcu, idx); + return ret; } struct intel_gvt_mpt kvmgt_mpt = { diff --git a/drivers/gpu/drm/i915/gvt/mmio.c b/drivers/gpu/drm/i915/gvt/mmio.c index 994366035364..9bb9a85c992c 100644 --- a/drivers/gpu/drm/i915/gvt/mmio.c +++ b/drivers/gpu/drm/i915/gvt/mmio.c @@ -244,6 +244,34 @@ void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr) /* set the bit 0:2(Core C-State ) to C0 */ vgpu_vreg_t(vgpu, GEN6_GT_CORE_STATUS) = 0; + + if (IS_BROXTON(vgpu->gvt->dev_priv)) { + vgpu_vreg_t(vgpu, BXT_P_CR_GT_DISP_PWRON) &= + ~(BIT(0) | BIT(1)); + vgpu_vreg_t(vgpu, BXT_PORT_CL1CM_DW0(DPIO_PHY0)) &= + ~PHY_POWER_GOOD; + vgpu_vreg_t(vgpu, BXT_PORT_CL1CM_DW0(DPIO_PHY1)) &= + ~PHY_POWER_GOOD; + vgpu_vreg_t(vgpu, BXT_PHY_CTL_FAMILY(DPIO_PHY0)) &= + ~BIT(30); + vgpu_vreg_t(vgpu, BXT_PHY_CTL_FAMILY(DPIO_PHY1)) &= + ~BIT(30); + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_A)) &= + ~BXT_PHY_LANE_ENABLED; + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_A)) |= + BXT_PHY_CMNLANE_POWERDOWN_ACK | + BXT_PHY_LANE_POWERDOWN_ACK; + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_B)) &= + ~BXT_PHY_LANE_ENABLED; + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_B)) |= + BXT_PHY_CMNLANE_POWERDOWN_ACK | + BXT_PHY_LANE_POWERDOWN_ACK; + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_C)) &= + ~BXT_PHY_LANE_ENABLED; + vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_C)) |= + BXT_PHY_CMNLANE_POWERDOWN_ACK | + BXT_PHY_LANE_POWERDOWN_ACK; + } } else { #define GVT_GEN8_MMIO_RESET_OFFSET (0x44200) /* only reset the engine related, so starting with 0x44200 diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index a4e8e3cf74fd..c628be05fbfe 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -281,6 +281,7 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) intel_vgpu_clean_submission(vgpu); intel_vgpu_clean_display(vgpu); intel_vgpu_clean_opregion(vgpu); + intel_vgpu_reset_ggtt(vgpu, true); intel_vgpu_clean_gtt(vgpu); intel_gvt_hypervisor_detach_vgpu(vgpu); intel_vgpu_free_resource(vgpu); diff --git a/drivers/gpu/drm/pl111/pl111_vexpress.c b/drivers/gpu/drm/pl111/pl111_vexpress.c index a534b225e31b..5fa0441bb6df 100644 --- a/drivers/gpu/drm/pl111/pl111_vexpress.c +++ b/drivers/gpu/drm/pl111/pl111_vexpress.c @@ -111,7 +111,8 @@ static int vexpress_muxfpga_probe(struct platform_device *pdev) } static const struct of_device_id vexpress_muxfpga_match[] = { - { .compatible = "arm,vexpress-muxfpga", } + { .compatible = "arm,vexpress-muxfpga", }, + {} }; static struct platform_driver vexpress_muxfpga_driver = { diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index dd19d674055c..8b0cd08034e0 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -418,7 +418,6 @@ static const struct of_device_id sun4i_drv_of_table[] = { { .compatible = "allwinner,sun8i-a33-display-engine" }, { .compatible = "allwinner,sun8i-a83t-display-engine" }, { .compatible = "allwinner,sun8i-h3-display-engine" }, - { .compatible = "allwinner,sun8i-r40-display-engine" }, { .compatible = "allwinner,sun8i-v3s-display-engine" }, { .compatible = "allwinner,sun9i-a80-display-engine" }, { } diff --git a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c index 82502b351aec..a564b5dfe082 100644 --- a/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c +++ b/drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c @@ -398,7 +398,6 @@ static struct regmap_config sun8i_hdmi_phy_regmap_config = { static const struct sun8i_hdmi_phy_variant sun50i_a64_hdmi_phy = { .has_phy_clk = true, - .has_second_pll = true, .phy_init = &sun8i_hdmi_phy_init_h3, .phy_disable = &sun8i_hdmi_phy_disable_h3, .phy_config = &sun8i_hdmi_phy_config_h3, diff --git a/drivers/gpu/drm/sun4i/sun8i_mixer.c b/drivers/gpu/drm/sun4i/sun8i_mixer.c index fc3713608f78..cb65b0ed53fd 100644 --- a/drivers/gpu/drm/sun4i/sun8i_mixer.c +++ b/drivers/gpu/drm/sun4i/sun8i_mixer.c @@ -545,22 +545,6 @@ static const struct sun8i_mixer_cfg sun8i_h3_mixer0_cfg = { .vi_num = 1, }; -static const struct sun8i_mixer_cfg sun8i_r40_mixer0_cfg = { - .ccsc = 0, - .mod_rate = 297000000, - .scaler_mask = 0xf, - .ui_num = 3, - .vi_num = 1, -}; - -static const struct sun8i_mixer_cfg sun8i_r40_mixer1_cfg = { - .ccsc = 1, - .mod_rate = 297000000, - .scaler_mask = 0x3, - .ui_num = 1, - .vi_num = 1, -}; - static const struct sun8i_mixer_cfg sun8i_v3s_mixer_cfg = { .vi_num = 2, .ui_num = 1, @@ -583,14 +567,6 @@ static const struct of_device_id sun8i_mixer_of_table[] = { .data = &sun8i_h3_mixer0_cfg, }, { - .compatible = "allwinner,sun8i-r40-de2-mixer-0", - .data = &sun8i_r40_mixer0_cfg, - }, - { - .compatible = "allwinner,sun8i-r40-de2-mixer-1", - .data = &sun8i_r40_mixer1_cfg, - }, - { .compatible = "allwinner,sun8i-v3s-de2-mixer", .data = &sun8i_v3s_mixer_cfg, }, diff --git a/drivers/gpu/drm/sun4i/sun8i_tcon_top.c b/drivers/gpu/drm/sun4i/sun8i_tcon_top.c index 55fe398d8290..d5240b777a8f 100644 --- a/drivers/gpu/drm/sun4i/sun8i_tcon_top.c +++ b/drivers/gpu/drm/sun4i/sun8i_tcon_top.c @@ -253,7 +253,6 @@ static int sun8i_tcon_top_remove(struct platform_device *pdev) /* sun4i_drv uses this list to check if a device node is a TCON TOP */ const struct of_device_id sun8i_tcon_top_of_table[] = { - { .compatible = "allwinner,sun8i-r40-tcon-top" }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, sun8i_tcon_top_of_table); diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index dbb62f6eb48a..dd9ffded223b 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c @@ -432,9 +432,11 @@ static void udl_fbdev_destroy(struct drm_device *dev, { drm_fb_helper_unregister_fbi(&ufbdev->helper); drm_fb_helper_fini(&ufbdev->helper); - drm_framebuffer_unregister_private(&ufbdev->ufb.base); - drm_framebuffer_cleanup(&ufbdev->ufb.base); - drm_gem_object_put_unlocked(&ufbdev->ufb.obj->base); + if (ufbdev->ufb.obj) { + drm_framebuffer_unregister_private(&ufbdev->ufb.base); + drm_framebuffer_cleanup(&ufbdev->ufb.base); + drm_gem_object_put_unlocked(&ufbdev->ufb.obj->base); + } } int udl_fbdev_init(struct drm_device *dev) diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c index cfb50fedfa2b..a3275fa66b7b 100644 --- a/drivers/gpu/drm/vc4/vc4_plane.c +++ b/drivers/gpu/drm/vc4/vc4_plane.c @@ -297,6 +297,9 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) vc4_state->y_scaling[0] = vc4_get_scaling_mode(vc4_state->src_h[0], vc4_state->crtc_h); + vc4_state->is_unity = (vc4_state->x_scaling[0] == VC4_SCALING_NONE && + vc4_state->y_scaling[0] == VC4_SCALING_NONE); + if (num_planes > 1) { vc4_state->is_yuv = true; @@ -312,24 +315,17 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state) vc4_get_scaling_mode(vc4_state->src_h[1], vc4_state->crtc_h); - /* YUV conversion requires that scaling be enabled, - * even on a plane that's otherwise 1:1. Choose TPZ - * for simplicity. + /* YUV conversion requires that horizontal scaling be enabled, + * even on a plane that's otherwise 1:1. Looks like only PPF + * works in that case, so let's pick that one. */ - if (vc4_state->x_scaling[0] == VC4_SCALING_NONE) - vc4_state->x_scaling[0] = VC4_SCALING_TPZ; - if (vc4_state->y_scaling[0] == VC4_SCALING_NONE) - vc4_state->y_scaling[0] = VC4_SCALING_TPZ; + if (vc4_state->is_unity) + vc4_state->x_scaling[0] = VC4_SCALING_PPF; } else { vc4_state->x_scaling[1] = VC4_SCALING_NONE; vc4_state->y_scaling[1] = VC4_SCALING_NONE; } - vc4_state->is_unity = (vc4_state->x_scaling[0] == VC4_SCALING_NONE && - vc4_state->y_scaling[0] == VC4_SCALING_NONE && - vc4_state->x_scaling[1] == VC4_SCALING_NONE && - vc4_state->y_scaling[1] == VC4_SCALING_NONE); - /* No configuring scaling on the cursor plane, since it gets non-vblank-synced updates, and scaling requires requires LBM changes which have to be vblank-synced. @@ -672,7 +668,10 @@ static int vc4_plane_mode_set(struct drm_plane *plane, vc4_dlist_write(vc4_state, SCALER_CSC2_ITR_R_601_5); } - if (!vc4_state->is_unity) { + if (vc4_state->x_scaling[0] != VC4_SCALING_NONE || + vc4_state->x_scaling[1] != VC4_SCALING_NONE || + vc4_state->y_scaling[0] != VC4_SCALING_NONE || + vc4_state->y_scaling[1] != VC4_SCALING_NONE) { /* LBM Base Address. */ if (vc4_state->y_scaling[0] != VC4_SCALING_NONE || vc4_state->y_scaling[1] != VC4_SCALING_NONE) { diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 1f134570b759..f0ab6b2313bb 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3729,7 +3729,7 @@ int vmw_validate_single_buffer(struct vmw_private *dev_priv, { struct vmw_buffer_object *vbo = container_of(bo, struct vmw_buffer_object, base); - struct ttm_operation_ctx ctx = { interruptible, true }; + struct ttm_operation_ctx ctx = { interruptible, false }; int ret; if (vbo->pin_count > 0) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 23beff5d8e3c..6a712a8d59e9 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1512,21 +1512,19 @@ static int vmw_kms_check_display_memory(struct drm_device *dev, struct drm_rect *rects) { struct vmw_private *dev_priv = vmw_priv(dev); - struct drm_mode_config *mode_config = &dev->mode_config; struct drm_rect bounding_box = {0}; u64 total_pixels = 0, pixel_mem, bb_mem; int i; for (i = 0; i < num_rects; i++) { /* - * Currently this check is limiting the topology within max - * texture/screentarget size. This should change in future when - * user-space support multiple fb with topology. + * For STDU only individual screen (screen target) is limited by + * SCREENTARGET_MAX_WIDTH/HEIGHT registers. */ - if (rects[i].x1 < 0 || rects[i].y1 < 0 || - rects[i].x2 > mode_config->max_width || - rects[i].y2 > mode_config->max_height) { - DRM_ERROR("Invalid GUI layout.\n"); + if (dev_priv->active_display_unit == vmw_du_screen_target && + (drm_rect_width(&rects[i]) > dev_priv->stdu_max_width || + drm_rect_height(&rects[i]) > dev_priv->stdu_max_height)) { + DRM_ERROR("Screen size not supported.\n"); return -EINVAL; } @@ -1615,7 +1613,7 @@ static int vmw_kms_check_topology(struct drm_device *dev, struct drm_connector_state *conn_state; struct vmw_connector_state *vmw_conn_state; - if (!new_crtc_state->enable && old_crtc_state->enable) { + if (!new_crtc_state->enable) { rects[i].x1 = 0; rects[i].y1 = 0; rects[i].x2 = 0; @@ -2216,12 +2214,16 @@ int vmw_du_connector_fill_modes(struct drm_connector *connector, if (dev_priv->assume_16bpp) assumed_bpp = 2; + max_width = min(max_width, dev_priv->texture_max_width); + max_height = min(max_height, dev_priv->texture_max_height); + + /* + * For STDU extra limit for a mode on SVGA_REG_SCREENTARGET_MAX_WIDTH/ + * HEIGHT registers. + */ if (dev_priv->active_display_unit == vmw_du_screen_target) { max_width = min(max_width, dev_priv->stdu_max_width); - max_width = min(max_width, dev_priv->texture_max_width); - max_height = min(max_height, dev_priv->stdu_max_height); - max_height = min(max_height, dev_priv->texture_max_height); } /* Add preferred mode */ @@ -2376,6 +2378,7 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct vmw_private *dev_priv = vmw_priv(dev); + struct drm_mode_config *mode_config = &dev->mode_config; struct drm_vmw_update_layout_arg *arg = (struct drm_vmw_update_layout_arg *)data; void __user *user_rects; @@ -2421,6 +2424,21 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data, drm_rects[i].y1 = curr_rect.y; drm_rects[i].x2 = curr_rect.x + curr_rect.w; drm_rects[i].y2 = curr_rect.y + curr_rect.h; + + /* + * Currently this check is limiting the topology within + * mode_config->max (which actually is max texture size + * supported by virtual device). This limit is here to address + * window managers that create a big framebuffer for whole + * topology. + */ + if (drm_rects[i].x1 < 0 || drm_rects[i].y1 < 0 || + drm_rects[i].x2 > mode_config->max_width || + drm_rects[i].y2 > mode_config->max_height) { + DRM_ERROR("Invalid GUI layout.\n"); + ret = -EINVAL; + goto out_free; + } } ret = vmw_kms_check_display_memory(dev, arg->num_outputs, drm_rects); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 93f6b96ca7bb..f30e839f7bfd 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -1600,31 +1600,6 @@ int vmw_kms_stdu_init_display(struct vmw_private *dev_priv) dev_priv->active_display_unit = vmw_du_screen_target; - if (dev_priv->capabilities & SVGA_CAP_3D) { - /* - * For 3D VMs, display (scanout) buffer size is the smaller of - * max texture and max STDU - */ - uint32_t max_width, max_height; - - max_width = min(dev_priv->texture_max_width, - dev_priv->stdu_max_width); - max_height = min(dev_priv->texture_max_height, - dev_priv->stdu_max_height); - - dev->mode_config.max_width = max_width; - dev->mode_config.max_height = max_height; - } else { - /* - * Given various display aspect ratios, there's no way to - * estimate these using prim_bb_mem. So just set these to - * something arbitrarily large and we will reject any layout - * that doesn't fit prim_bb_mem later - */ - dev->mode_config.max_width = 8192; - dev->mode_config.max_height = 8192; - } - vmw_kms_create_implicit_placement_property(dev_priv, false); for (i = 0; i < VMWGFX_NUM_DISPLAY_UNITS; ++i) { diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index e125233e074b..80a01cd4c051 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -1404,22 +1404,17 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, *srf_out = NULL; if (for_scanout) { - uint32_t max_width, max_height; - if (!svga3dsurface_is_screen_target_format(format)) { DRM_ERROR("Invalid Screen Target surface format."); return -EINVAL; } - max_width = min(dev_priv->texture_max_width, - dev_priv->stdu_max_width); - max_height = min(dev_priv->texture_max_height, - dev_priv->stdu_max_height); - - if (size.width > max_width || size.height > max_height) { + if (size.width > dev_priv->texture_max_width || + size.height > dev_priv->texture_max_height) { DRM_ERROR("%ux%u\n, exceeds max surface size %ux%u", size.width, size.height, - max_width, max_height); + dev_priv->texture_max_width, + dev_priv->texture_max_height); return -EINVAL; } } else { @@ -1495,8 +1490,17 @@ int vmw_surface_gb_priv_define(struct drm_device *dev, if (srf->flags & SVGA3D_SURFACE_BIND_STREAM_OUTPUT) srf->res.backup_size += sizeof(SVGA3dDXSOState); + /* + * Don't set SVGA3D_SURFACE_SCREENTARGET flag for a scanout surface with + * size greater than STDU max width/height. This is really a workaround + * to support creation of big framebuffer requested by some user-space + * for whole topology. That big framebuffer won't really be used for + * binding with screen target as during prepare_fb a separate surface is + * created so it's safe to ignore SVGA3D_SURFACE_SCREENTARGET flag. + */ if (dev_priv->active_display_unit == vmw_du_screen_target && - for_scanout) + for_scanout && size.width <= dev_priv->stdu_max_width && + size.height <= dev_priv->stdu_max_height) srf->flags |= SVGA3D_SURFACE_SCREENTARGET; /* diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index a96bf46bc483..cf2a18571d48 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -215,6 +215,8 @@ static void vga_switcheroo_enable(void) return; client->id = ret | ID_BIT_AUDIO; + if (client->ops->gpu_bound) + client->ops->gpu_bound(client->pdev, ret); } vga_switcheroo_debugfs_init(&vgasr_priv); diff --git a/drivers/hwmon/nct6775.c b/drivers/hwmon/nct6775.c index 944f5b63aecd..78603b78cf41 100644 --- a/drivers/hwmon/nct6775.c +++ b/drivers/hwmon/nct6775.c @@ -207,8 +207,6 @@ superio_exit(int ioreg) #define NUM_FAN 7 -#define TEMP_SOURCE_VIRTUAL 0x1f - /* Common and NCT6775 specific data */ /* Voltage min/max registers for nr=7..14 are in bank 5 */ @@ -299,8 +297,9 @@ static const u16 NCT6775_REG_PWM_READ[] = { static const u16 NCT6775_REG_FAN[] = { 0x630, 0x632, 0x634, 0x636, 0x638 }; static const u16 NCT6775_REG_FAN_MIN[] = { 0x3b, 0x3c, 0x3d }; -static const u16 NCT6775_REG_FAN_PULSES[] = { 0x641, 0x642, 0x643, 0x644, 0 }; -static const u16 NCT6775_FAN_PULSE_SHIFT[] = { 0, 0, 0, 0, 0, 0 }; +static const u16 NCT6775_REG_FAN_PULSES[NUM_FAN] = { + 0x641, 0x642, 0x643, 0x644 }; +static const u16 NCT6775_FAN_PULSE_SHIFT[NUM_FAN] = { }; static const u16 NCT6775_REG_TEMP[] = { 0x27, 0x150, 0x250, 0x62b, 0x62c, 0x62d }; @@ -373,6 +372,7 @@ static const char *const nct6775_temp_label[] = { }; #define NCT6775_TEMP_MASK 0x001ffffe +#define NCT6775_VIRT_TEMP_MASK 0x00000000 static const u16 NCT6775_REG_TEMP_ALTERNATE[32] = { [13] = 0x661, @@ -425,8 +425,8 @@ static const u8 NCT6776_PWM_MODE_MASK[] = { 0x01, 0, 0, 0, 0, 0 }; static const u16 NCT6776_REG_FAN_MIN[] = { 0x63a, 0x63c, 0x63e, 0x640, 0x642, 0x64a, 0x64c }; -static const u16 NCT6776_REG_FAN_PULSES[] = { - 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0 }; +static const u16 NCT6776_REG_FAN_PULSES[NUM_FAN] = { + 0x644, 0x645, 0x646, 0x647, 0x648, 0x649 }; static const u16 NCT6776_REG_WEIGHT_DUTY_BASE[] = { 0x13e, 0x23e, 0x33e, 0x83e, 0x93e, 0xa3e }; @@ -461,6 +461,7 @@ static const char *const nct6776_temp_label[] = { }; #define NCT6776_TEMP_MASK 0x007ffffe +#define NCT6776_VIRT_TEMP_MASK 0x00000000 static const u16 NCT6776_REG_TEMP_ALTERNATE[32] = { [14] = 0x401, @@ -501,9 +502,9 @@ static const s8 NCT6779_BEEP_BITS[] = { 30, 31 }; /* intrusion0, intrusion1 */ static const u16 NCT6779_REG_FAN[] = { - 0x4b0, 0x4b2, 0x4b4, 0x4b6, 0x4b8, 0x4ba, 0x660 }; -static const u16 NCT6779_REG_FAN_PULSES[] = { - 0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0 }; + 0x4c0, 0x4c2, 0x4c4, 0x4c6, 0x4c8, 0x4ca, 0x4ce }; +static const u16 NCT6779_REG_FAN_PULSES[NUM_FAN] = { + 0x644, 0x645, 0x646, 0x647, 0x648, 0x649 }; static const u16 NCT6779_REG_CRITICAL_PWM_ENABLE[] = { 0x136, 0x236, 0x336, 0x836, 0x936, 0xa36, 0xb36 }; @@ -559,7 +560,9 @@ static const char *const nct6779_temp_label[] = { }; #define NCT6779_TEMP_MASK 0x07ffff7e +#define NCT6779_VIRT_TEMP_MASK 0x00000000 #define NCT6791_TEMP_MASK 0x87ffff7e +#define NCT6791_VIRT_TEMP_MASK 0x80000000 static const u16 NCT6779_REG_TEMP_ALTERNATE[32] = { 0x490, 0x491, 0x492, 0x493, 0x494, 0x495, 0, 0, @@ -638,6 +641,7 @@ static const char *const nct6792_temp_label[] = { }; #define NCT6792_TEMP_MASK 0x9fffff7e +#define NCT6792_VIRT_TEMP_MASK 0x80000000 static const char *const nct6793_temp_label[] = { "", @@ -675,6 +679,7 @@ static const char *const nct6793_temp_label[] = { }; #define NCT6793_TEMP_MASK 0xbfff037e +#define NCT6793_VIRT_TEMP_MASK 0x80000000 static const char *const nct6795_temp_label[] = { "", @@ -712,6 +717,7 @@ static const char *const nct6795_temp_label[] = { }; #define NCT6795_TEMP_MASK 0xbfffff7e +#define NCT6795_VIRT_TEMP_MASK 0x80000000 static const char *const nct6796_temp_label[] = { "", @@ -724,8 +730,8 @@ static const char *const nct6796_temp_label[] = { "AUXTIN4", "SMBUSMASTER 0", "SMBUSMASTER 1", - "", - "", + "Virtual_TEMP", + "Virtual_TEMP", "", "", "", @@ -748,7 +754,8 @@ static const char *const nct6796_temp_label[] = { "Virtual_TEMP" }; -#define NCT6796_TEMP_MASK 0xbfff03fe +#define NCT6796_TEMP_MASK 0xbfff0ffe +#define NCT6796_VIRT_TEMP_MASK 0x80000c00 /* NCT6102D/NCT6106D specific data */ @@ -779,8 +786,8 @@ static const u16 NCT6106_REG_TEMP_CONFIG[] = { static const u16 NCT6106_REG_FAN[] = { 0x20, 0x22, 0x24 }; static const u16 NCT6106_REG_FAN_MIN[] = { 0xe0, 0xe2, 0xe4 }; -static const u16 NCT6106_REG_FAN_PULSES[] = { 0xf6, 0xf6, 0xf6, 0, 0 }; -static const u16 NCT6106_FAN_PULSE_SHIFT[] = { 0, 2, 4, 0, 0 }; +static const u16 NCT6106_REG_FAN_PULSES[] = { 0xf6, 0xf6, 0xf6 }; +static const u16 NCT6106_FAN_PULSE_SHIFT[] = { 0, 2, 4 }; static const u8 NCT6106_REG_PWM_MODE[] = { 0xf3, 0xf3, 0xf3 }; static const u8 NCT6106_PWM_MODE_MASK[] = { 0x01, 0x02, 0x04 }; @@ -917,6 +924,11 @@ static unsigned int fan_from_reg16(u16 reg, unsigned int divreg) return 1350000U / (reg << divreg); } +static unsigned int fan_from_reg_rpm(u16 reg, unsigned int divreg) +{ + return reg; +} + static u16 fan_to_reg(u32 fan, unsigned int divreg) { if (!fan) @@ -969,6 +981,7 @@ struct nct6775_data { u16 reg_temp_config[NUM_TEMP]; const char * const *temp_label; u32 temp_mask; + u32 virt_temp_mask; u16 REG_CONFIG; u16 REG_VBAT; @@ -1276,11 +1289,11 @@ static bool is_word_sized(struct nct6775_data *data, u16 reg) case nct6795: case nct6796: return reg == 0x150 || reg == 0x153 || reg == 0x155 || - ((reg & 0xfff0) == 0x4b0 && (reg & 0x000f) < 0x0b) || + (reg & 0xfff0) == 0x4c0 || reg == 0x402 || reg == 0x63a || reg == 0x63c || reg == 0x63e || reg == 0x640 || reg == 0x642 || reg == 0x64a || - reg == 0x64c || reg == 0x660 || + reg == 0x64c || reg == 0x73 || reg == 0x75 || reg == 0x77 || reg == 0x79 || reg == 0x7b || reg == 0x7d; } @@ -1558,7 +1571,7 @@ static void nct6775_update_pwm(struct device *dev) reg = nct6775_read_value(data, data->REG_WEIGHT_TEMP_SEL[i]); data->pwm_weight_temp_sel[i] = reg & 0x1f; /* If weight is disabled, report weight source as 0 */ - if (j == 1 && !(reg & 0x80)) + if (!(reg & 0x80)) data->pwm_weight_temp_sel[i] = 0; /* Weight temp data */ @@ -1682,9 +1695,13 @@ static struct nct6775_data *nct6775_update_device(struct device *dev) if (data->has_fan_min & BIT(i)) data->fan_min[i] = nct6775_read_value(data, data->REG_FAN_MIN[i]); - data->fan_pulses[i] = - (nct6775_read_value(data, data->REG_FAN_PULSES[i]) - >> data->FAN_PULSE_SHIFT[i]) & 0x03; + + if (data->REG_FAN_PULSES[i]) { + data->fan_pulses[i] = + (nct6775_read_value(data, + data->REG_FAN_PULSES[i]) + >> data->FAN_PULSE_SHIFT[i]) & 0x03; + } nct6775_select_fan_div(dev, data, i, reg); } @@ -3639,6 +3656,7 @@ static int nct6775_probe(struct platform_device *pdev) data->temp_label = nct6776_temp_label; data->temp_mask = NCT6776_TEMP_MASK; + data->virt_temp_mask = NCT6776_VIRT_TEMP_MASK; data->REG_VBAT = NCT6106_REG_VBAT; data->REG_DIODE = NCT6106_REG_DIODE; @@ -3717,6 +3735,7 @@ static int nct6775_probe(struct platform_device *pdev) data->temp_label = nct6775_temp_label; data->temp_mask = NCT6775_TEMP_MASK; + data->virt_temp_mask = NCT6775_VIRT_TEMP_MASK; data->REG_CONFIG = NCT6775_REG_CONFIG; data->REG_VBAT = NCT6775_REG_VBAT; @@ -3789,6 +3808,7 @@ static int nct6775_probe(struct platform_device *pdev) data->temp_label = nct6776_temp_label; data->temp_mask = NCT6776_TEMP_MASK; + data->virt_temp_mask = NCT6776_VIRT_TEMP_MASK; data->REG_CONFIG = NCT6775_REG_CONFIG; data->REG_VBAT = NCT6775_REG_VBAT; @@ -3853,7 +3873,7 @@ static int nct6775_probe(struct platform_device *pdev) data->ALARM_BITS = NCT6779_ALARM_BITS; data->BEEP_BITS = NCT6779_BEEP_BITS; - data->fan_from_reg = fan_from_reg13; + data->fan_from_reg = fan_from_reg_rpm; data->fan_from_reg_min = fan_from_reg13; data->target_temp_mask = 0xff; data->tolerance_mask = 0x07; @@ -3861,6 +3881,7 @@ static int nct6775_probe(struct platform_device *pdev) data->temp_label = nct6779_temp_label; data->temp_mask = NCT6779_TEMP_MASK; + data->virt_temp_mask = NCT6779_VIRT_TEMP_MASK; data->REG_CONFIG = NCT6775_REG_CONFIG; data->REG_VBAT = NCT6775_REG_VBAT; @@ -3933,7 +3954,7 @@ static int nct6775_probe(struct platform_device *pdev) data->ALARM_BITS = NCT6791_ALARM_BITS; data->BEEP_BITS = NCT6779_BEEP_BITS; - data->fan_from_reg = fan_from_reg13; + data->fan_from_reg = fan_from_reg_rpm; data->fan_from_reg_min = fan_from_reg13; data->target_temp_mask = 0xff; data->tolerance_mask = 0x07; @@ -3944,22 +3965,27 @@ static int nct6775_probe(struct platform_device *pdev) case nct6791: data->temp_label = nct6779_temp_label; data->temp_mask = NCT6791_TEMP_MASK; + data->virt_temp_mask = NCT6791_VIRT_TEMP_MASK; break; case nct6792: data->temp_label = nct6792_temp_label; data->temp_mask = NCT6792_TEMP_MASK; + data->virt_temp_mask = NCT6792_VIRT_TEMP_MASK; break; case nct6793: data->temp_label = nct6793_temp_label; data->temp_mask = NCT6793_TEMP_MASK; + data->virt_temp_mask = NCT6793_VIRT_TEMP_MASK; break; case nct6795: data->temp_label = nct6795_temp_label; data->temp_mask = NCT6795_TEMP_MASK; + data->virt_temp_mask = NCT6795_VIRT_TEMP_MASK; break; case nct6796: data->temp_label = nct6796_temp_label; data->temp_mask = NCT6796_TEMP_MASK; + data->virt_temp_mask = NCT6796_VIRT_TEMP_MASK; break; } @@ -4143,7 +4169,7 @@ static int nct6775_probe(struct platform_device *pdev) * for each fan reflects a different temperature, and there * are no duplicates. */ - if (src != TEMP_SOURCE_VIRTUAL) { + if (!(data->virt_temp_mask & BIT(src))) { if (mask & BIT(src)) continue; mask |= BIT(src); diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index cbfafc453274..270d3c9580c5 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -39,13 +39,23 @@ static int m25p80_read_reg(struct spi_nor *nor, u8 code, u8 *val, int len) struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(code, 1), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_IN(len, val, 1)); + SPI_MEM_OP_DATA_IN(len, NULL, 1)); + void *scratchbuf; int ret; + scratchbuf = kmalloc(len, GFP_KERNEL); + if (!scratchbuf) + return -ENOMEM; + + op.data.buf.in = scratchbuf; ret = spi_mem_exec_op(flash->spimem, &op); if (ret < 0) dev_err(&flash->spimem->spi->dev, "error %d reading %x\n", ret, code); + else + memcpy(val, scratchbuf, len); + + kfree(scratchbuf); return ret; } @@ -56,9 +66,19 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 1), SPI_MEM_OP_NO_ADDR, SPI_MEM_OP_NO_DUMMY, - SPI_MEM_OP_DATA_OUT(len, buf, 1)); + SPI_MEM_OP_DATA_OUT(len, NULL, 1)); + void *scratchbuf; + int ret; - return spi_mem_exec_op(flash->spimem, &op); + scratchbuf = kmemdup(buf, len, GFP_KERNEL); + if (!scratchbuf) + return -ENOMEM; + + op.data.buf.out = scratchbuf; + ret = spi_mem_exec_op(flash->spimem, &op); + kfree(scratchbuf); + + return ret; } static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len, diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 52e2cb35fc79..99c460facd5e 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -873,8 +873,11 @@ static int mtd_part_of_parse(struct mtd_info *master, int ret, err = 0; np = mtd_get_of_node(master); - if (!mtd_is_partition(master)) + if (mtd_is_partition(master)) + of_node_get(np); + else np = of_get_child_by_name(np, "partitions"); + of_property_for_each_string(np, "compatible", prop, compat) { parser = mtd_part_get_compatible_parser(compat); if (!parser) diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c index 67b2065e7a19..b864b93dd289 100644 --- a/drivers/mtd/nand/raw/denali.c +++ b/drivers/mtd/nand/raw/denali.c @@ -596,6 +596,12 @@ static int denali_dma_xfer(struct denali_nand_info *denali, void *buf, } iowrite32(DMA_ENABLE__FLAG, denali->reg + DMA_ENABLE); + /* + * The ->setup_dma() hook kicks DMA by using the data/command + * interface, which belongs to a different AXI port from the + * register interface. Read back the register to avoid a race. + */ + ioread32(denali->reg + DMA_ENABLE); denali_reset_irq(denali); denali->setup_dma(denali, dma_addr, page, write); diff --git a/drivers/mtd/nand/raw/marvell_nand.c b/drivers/mtd/nand/raw/marvell_nand.c index 7af4d6213ee5..bc2ef5209783 100644 --- a/drivers/mtd/nand/raw/marvell_nand.c +++ b/drivers/mtd/nand/raw/marvell_nand.c @@ -1547,7 +1547,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip, for (op_id = 0; op_id < subop->ninstrs; op_id++) { unsigned int offset, naddrs; const u8 *addrs; - int len = nand_subop_get_data_len(subop, op_id); + int len; instr = &subop->instrs[op_id]; @@ -1593,6 +1593,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip, nfc_op->ndcb[0] |= NDCB0_CMD_XTYPE(XTYPE_MONOLITHIC_RW) | NDCB0_LEN_OVRD; + len = nand_subop_get_data_len(subop, op_id); nfc_op->ndcb[3] |= round_up(len, FIFO_DEPTH); } nfc_op->data_delay_ns = instr->delay_ns; @@ -1606,6 +1607,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip, nfc_op->ndcb[0] |= NDCB0_CMD_XTYPE(XTYPE_MONOLITHIC_RW) | NDCB0_LEN_OVRD; + len = nand_subop_get_data_len(subop, op_id); nfc_op->ndcb[3] |= round_up(len, FIFO_DEPTH); } nfc_op->data_delay_ns = instr->delay_ns; diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 9375cef22420..3d27616d9c85 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -283,8 +283,12 @@ static int ipddp_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCFINDIPDDPRT: spin_lock_bh(&ipddp_route_lock); rp = __ipddp_find_route(&rcp); - if (rp) - memcpy(&rcp2, rp, sizeof(rcp2)); + if (rp) { + memset(&rcp2, 0, sizeof(rcp2)); + rcp2.ip = rp->ip; + rcp2.at = rp->at; + rcp2.flags = rp->flags; + } spin_unlock_bh(&ipddp_route_lock); if (rp) { diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h index 7c791c1da4b9..bef01331266f 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.h +++ b/drivers/net/dsa/mv88e6xxx/global1.h @@ -128,7 +128,7 @@ #define MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION 0x7000 #define MV88E6XXX_G1_ATU_OP_AGE_OUT_VIOLATION BIT(7) #define MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION BIT(6) -#define MV88E6XXX_G1_ATU_OP_MISS_VIOLTATION BIT(5) +#define MV88E6XXX_G1_ATU_OP_MISS_VIOLATION BIT(5) #define MV88E6XXX_G1_ATU_OP_FULL_VIOLATION BIT(4) /* Offset 0x0C: ATU Data Register */ diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index 307410898fc9..5200e4bdce93 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -349,7 +349,7 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) chip->ports[entry.portvec].atu_member_violation++; } - if (val & MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION) { + if (val & MV88E6XXX_G1_ATU_OP_MISS_VIOLATION) { dev_err_ratelimited(chip->dev, "ATU miss violation for %pM portvec %x\n", entry.mac, entry.portvec); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index cecbb1d1f587..177587f9c3f1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8027,7 +8027,7 @@ static int bnxt_change_mac_addr(struct net_device *dev, void *p) if (ether_addr_equal(addr->sa_data, dev->dev_addr)) return 0; - rc = bnxt_approve_mac(bp, addr->sa_data); + rc = bnxt_approve_mac(bp, addr->sa_data, true); if (rc) return rc; @@ -8827,14 +8827,19 @@ static int bnxt_init_mac_addr(struct bnxt *bp) } else { #ifdef CONFIG_BNXT_SRIOV struct bnxt_vf_info *vf = &bp->vf; + bool strict_approval = true; if (is_valid_ether_addr(vf->mac_addr)) { /* overwrite netdev dev_addr with admin VF MAC */ memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN); + /* Older PF driver or firmware may not approve this + * correctly. + */ + strict_approval = false; } else { eth_hw_addr_random(bp->dev); } - rc = bnxt_approve_mac(bp, bp->dev->dev_addr); + rc = bnxt_approve_mac(bp, bp->dev->dev_addr, strict_approval); #endif } return rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index fcd085a9853a..3962f6fd543c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -1104,7 +1104,7 @@ update_vf_mac_exit: mutex_unlock(&bp->hwrm_cmd_lock); } -int bnxt_approve_mac(struct bnxt *bp, u8 *mac) +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) { struct hwrm_func_vf_cfg_input req = {0}; int rc = 0; @@ -1122,12 +1122,13 @@ int bnxt_approve_mac(struct bnxt *bp, u8 *mac) memcpy(req.dflt_mac_addr, mac, ETH_ALEN); rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); mac_done: - if (rc) { + if (rc && strict) { rc = -EADDRNOTAVAIL; netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n", mac); + return rc; } - return rc; + return 0; } #else @@ -1144,7 +1145,7 @@ void bnxt_update_vf_mac(struct bnxt *bp) { } -int bnxt_approve_mac(struct bnxt *bp, u8 *mac) +int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict) { return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h index e9b20cd19881..2eed9eda1195 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h @@ -39,5 +39,5 @@ int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs); void bnxt_sriov_disable(struct bnxt *); void bnxt_hwrm_exec_fwd_req(struct bnxt *); void bnxt_update_vf_mac(struct bnxt *); -int bnxt_approve_mac(struct bnxt *, u8 *); +int bnxt_approve_mac(struct bnxt *, u8 *, bool); #endif diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 16e4ef7d7185..f1a86b422617 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3837,6 +3837,13 @@ static const struct macb_config at91sam9260_config = { .init = macb_init, }; +static const struct macb_config sama5d3macb_config = { + .caps = MACB_CAPS_SG_DISABLED + | MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII, + .clk_init = macb_clk_init, + .init = macb_init, +}; + static const struct macb_config pc302gem_config = { .caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE, .dma_burst_length = 16, @@ -3904,6 +3911,7 @@ static const struct of_device_id macb_dt_ids[] = { { .compatible = "cdns,gem", .data = &pc302gem_config }, { .compatible = "atmel,sama5d2-gem", .data = &sama5d2_config }, { .compatible = "atmel,sama5d3-gem", .data = &sama5d3_config }, + { .compatible = "atmel,sama5d3-macb", .data = &sama5d3macb_config }, { .compatible = "atmel,sama5d4-gem", .data = &sama5d4_config }, { .compatible = "cdns,at91rm9200-emac", .data = &emac_config }, { .compatible = "cdns,emac", .data = &emac_config }, diff --git a/drivers/net/ethernet/hp/hp100.c b/drivers/net/ethernet/hp/hp100.c index c8c7ad2eff77..9b5a68b65432 100644 --- a/drivers/net/ethernet/hp/hp100.c +++ b/drivers/net/ethernet/hp/hp100.c @@ -2634,7 +2634,7 @@ static int hp100_login_to_vg_hub(struct net_device *dev, u_short force_relogin) /* Wait for link to drop */ time = jiffies + (HZ / 10); do { - if (~(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST)) + if (!(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST)) break; if (!in_interrupt()) schedule_timeout_interruptible(1); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 28500417843e..702fec82d806 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -58,6 +58,8 @@ static struct { */ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode, const struct phylink_link_state *state); +static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode, + phy_interface_t interface, struct phy_device *phy); /* Queue modes */ #define MVPP2_QDIST_SINGLE_MODE 0 @@ -3142,6 +3144,7 @@ static void mvpp2_start_dev(struct mvpp2_port *port) mvpp22_mode_reconfigure(port); if (port->phylink) { + netif_carrier_off(port->dev); phylink_start(port->phylink); } else { /* Phylink isn't used as of now for ACPI, so the MAC has to be @@ -3150,9 +3153,10 @@ static void mvpp2_start_dev(struct mvpp2_port *port) */ struct phylink_link_state state = { .interface = port->phy_interface, - .link = 1, }; mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state); + mvpp2_mac_link_up(port->dev, MLO_AN_INBAND, port->phy_interface, + NULL); } netif_tx_start_all_queues(port->dev); @@ -4495,10 +4499,6 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode, return; } - netif_tx_stop_all_queues(port->dev); - if (!port->has_phy) - netif_carrier_off(port->dev); - /* Make sure the port is disabled when reconfiguring the mode */ mvpp2_port_disable(port); @@ -4523,16 +4523,7 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode, if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK) mvpp2_port_loopback_set(port, state); - /* If the port already was up, make sure it's still in the same state */ - if (state->link || !port->has_phy) { - mvpp2_port_enable(port); - - mvpp2_egress_enable(port); - mvpp2_ingress_enable(port); - if (!port->has_phy) - netif_carrier_on(dev); - netif_tx_wake_all_queues(dev); - } + mvpp2_port_enable(port); } static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode, diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index e7dce79ff2c9..001b5f714c1b 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -2850,7 +2850,7 @@ static void lan743x_pcidev_shutdown(struct pci_dev *pdev) lan743x_hardware_cleanup(adapter); } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static u16 lan743x_pm_wakeframe_crc16(const u8 *buf, int len) { return bitrev16(crc16(0xFFFF, buf, len)); @@ -3016,7 +3016,7 @@ static int lan743x_pm_resume(struct device *dev) static const struct dev_pm_ops lan743x_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(lan743x_pm_suspend, lan743x_pm_resume) }; -#endif /*CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ static const struct pci_device_id lan743x_pcidev_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7430) }, @@ -3028,7 +3028,7 @@ static struct pci_driver lan743x_pcidev_driver = { .id_table = lan743x_pcidev_tbl, .probe = lan743x_pcidev_probe, .remove = lan743x_pcidev_remove, -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP .driver.pm = &lan743x_pm_ops, #endif .shutdown = lan743x_pcidev_shutdown, diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 1d8631303b53..bb529ff2ca81 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -13,6 +13,7 @@ #include <linux/pci.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/clk.h> #include <linux/delay.h> #include <linux/ethtool.h> #include <linux/phy.h> @@ -665,6 +666,7 @@ struct rtl8169_private { u16 event_slow; const struct rtl_coalesce_info *coalesce_info; + struct clk *clk; struct mdio_ops { void (*write)(struct rtl8169_private *, int, int); @@ -4775,12 +4777,14 @@ static void rtl_pcie_state_l2l3_enable(struct rtl8169_private *tp, bool enable) static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) { if (enable) { - RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn); RTL_W8(tp, Config5, RTL_R8(tp, Config5) | ASPM_en); + RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn); } else { RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~ClkReqEn); RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~ASPM_en); } + + udelay(10); } static void rtl_hw_start_8168bb(struct rtl8169_private *tp) @@ -5625,6 +5629,8 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp) static void rtl_hw_start_8106(struct rtl8169_private *tp) { + rtl_hw_aspm_clkreq_enable(tp, false); + /* Force LAN exit from ASPM if Rx/Tx are not idle */ RTL_W32(tp, FuncEvent, RTL_R32(tp, FuncEvent) | 0x002800); @@ -5633,6 +5639,7 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp) RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN); rtl_pcie_state_l2l3_enable(tp, false); + rtl_hw_aspm_clkreq_enable(tp, true); } static void rtl_hw_start_8101(struct rtl8169_private *tp) @@ -7257,6 +7264,11 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) } } +static void rtl_disable_clk(void *data) +{ + clk_disable_unprepare(data); +} + static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data; @@ -7277,6 +7289,32 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->msg_enable = netif_msg_init(debug.msg_enable, R8169_MSG_DEFAULT); tp->supports_gmii = cfg->has_gmii; + /* Get the *optional* external "ether_clk" used on some boards */ + tp->clk = devm_clk_get(&pdev->dev, "ether_clk"); + if (IS_ERR(tp->clk)) { + rc = PTR_ERR(tp->clk); + if (rc == -ENOENT) { + /* clk-core allows NULL (for suspend / resume) */ + tp->clk = NULL; + } else if (rc == -EPROBE_DEFER) { + return rc; + } else { + dev_err(&pdev->dev, "failed to get clk: %d\n", rc); + return rc; + } + } else { + rc = clk_prepare_enable(tp->clk); + if (rc) { + dev_err(&pdev->dev, "failed to enable clk: %d\n", rc); + return rc; + } + + rc = devm_add_action_or_reset(&pdev->dev, rtl_disable_clk, + tp->clk); + if (rc) + return rc; + } + /* enable device (incl. PCI PM wakeup and hotplug setup) */ rc = pcim_enable_device(pdev); if (rc < 0) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 3609c7b696c7..2b800ce1d5bf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -67,7 +67,7 @@ static int dwmac1000_validate_mcast_bins(int mcast_bins) * Description: * This function validates the number of Unicast address entries supported * by a particular Synopsys 10/100/1000 controller. The Synopsys controller - * supports 1, 32, 64, or 128 Unicast filter entries for it's Unicast filter + * supports 1..32, 64, or 128 Unicast filter entries for it's Unicast filter * logic. This function validates a valid, supported configuration is * selected, and defaults to 1 Unicast address if an unsupported * configuration is selected. @@ -77,8 +77,7 @@ static int dwmac1000_validate_ucast_entries(int ucast_entries) int x = ucast_entries; switch (x) { - case 1: - case 32: + case 1 ... 32: case 64: case 128: break; diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 9263d638bd6d..f932923f7d56 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -41,6 +41,7 @@ config TI_DAVINCI_MDIO config TI_DAVINCI_CPDMA tristate "TI DaVinci CPDMA Support" depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST + select GENERIC_ALLOCATOR ---help--- This driver supports TI's DaVinci CPDMA dma engine. diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 31c3d77b4733..fe01e141c8f8 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_device *ndev, net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated; net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial; + netdev_info(ndev, "VF slot %u %s\n", + net_device_ctx->vf_serial, + net_device_ctx->vf_alloc ? "added" : "removed"); } static void netvsc_receive_inband(struct net_device *ndev, diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 70921bbe0e28..3af6d8d15233 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1894,20 +1894,6 @@ out_unlock: rtnl_unlock(); } -static struct net_device *get_netvsc_bymac(const u8 *mac) -{ - struct net_device_context *ndev_ctx; - - list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { - struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx); - - if (ether_addr_equal(mac, dev->perm_addr)) - return dev; - } - - return NULL; -} - static struct net_device *get_netvsc_byref(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; @@ -2036,26 +2022,48 @@ static void netvsc_vf_setup(struct work_struct *w) rtnl_unlock(); } +/* Find netvsc by VMBus serial number. + * The PCI hyperv controller records the serial number as the slot. + */ +static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) +{ + struct device *parent = vf_netdev->dev.parent; + struct net_device_context *ndev_ctx; + struct pci_dev *pdev; + + if (!parent || !dev_is_pci(parent)) + return NULL; /* not a PCI device */ + + pdev = to_pci_dev(parent); + if (!pdev->slot) { + netdev_notice(vf_netdev, "no PCI slot information\n"); + return NULL; + } + + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { + if (!ndev_ctx->vf_alloc) + continue; + + if (ndev_ctx->vf_serial == pdev->slot->number) + return hv_get_drvdata(ndev_ctx->device_ctx); + } + + netdev_notice(vf_netdev, + "no netdev found for slot %u\n", pdev->slot->number); + return NULL; +} + static int netvsc_register_vf(struct net_device *vf_netdev) { - struct net_device *ndev; struct net_device_context *net_device_ctx; - struct device *pdev = vf_netdev->dev.parent; struct netvsc_device *netvsc_dev; + struct net_device *ndev; int ret; if (vf_netdev->addr_len != ETH_ALEN) return NOTIFY_DONE; - if (!pdev || !dev_is_pci(pdev) || dev_is_pf(pdev)) - return NOTIFY_DONE; - - /* - * We will use the MAC address to locate the synthetic interface to - * associate with the VF interface. If we don't find a matching - * synthetic interface, move on. - */ - ndev = get_netvsc_bymac(vf_netdev->perm_addr); + ndev = get_netvsc_byslot(vf_netdev); if (!ndev) return NOTIFY_DONE; @@ -2272,17 +2280,15 @@ static int netvsc_remove(struct hv_device *dev) cancel_delayed_work_sync(&ndev_ctx->dwork); - rcu_read_lock(); - nvdev = rcu_dereference(ndev_ctx->nvdev); - - if (nvdev) + rtnl_lock(); + nvdev = rtnl_dereference(ndev_ctx->nvdev); + if (nvdev) cancel_work_sync(&nvdev->subchan_work); /* * Call to the vsc driver to let it know that the device is being * removed. Also blocks mtu and channel changes. */ - rtnl_lock(); vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); if (vf_netdev) netvsc_unregister_vf(vf_netdev); @@ -2294,7 +2300,6 @@ static int netvsc_remove(struct hv_device *dev) list_del(&ndev_ctx->list); rtnl_unlock(); - rcu_read_unlock(); hv_set_drvdata(dev, NULL); diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index ce61231e96ea..62dc564b251d 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -429,6 +429,9 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) goto out; + if (skb_mac_header_len(skb) < ETH_HLEN) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index e3270deecec2..533b6fb8d923 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1213,13 +1213,13 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9061, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9063, 8)}, /* Sierra Wireless EM7305 */ {QMI_FIXED_INTF(0x1199, 0x9063, 10)}, /* Sierra Wireless EM7305 */ - {QMI_FIXED_INTF(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */ - {QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx */ - {QMI_FIXED_INTF(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ - {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ - {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ - {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */ - {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ + {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 10)},/* Sierra Wireless MC74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 10)},/* Sierra Wireless EM74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 10)},/* Sierra Wireless EM74xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 8d679c8b7f25..41a00cd76955 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -463,6 +463,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, int mac_len, delta, off; struct xdp_buff xdp; + skb_orphan(skb); + rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (unlikely(!xdp_prog)) { @@ -508,8 +510,6 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, skb_copy_header(nskb, skb); head_off = skb_headroom(nskb) - skb_headroom(skb); skb_headers_offset_update(nskb, head_off); - if (skb->sk) - skb_set_owner_w(nskb, skb->sk); consume_skb(skb); skb = nskb; } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 9407acbd19a9..f17f602e6171 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -908,7 +908,11 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue, BUG_ON(pull_to <= skb_headlen(skb)); __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); } - BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); + if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) { + queue->rx.rsp_cons = ++cons; + kfree_skb(nskb); + return ~0U; + } skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, skb_frag_page(nfrag), @@ -1045,6 +1049,8 @@ err: skb->len += rx->status; i = xennet_fill_frags(queue, skb, &tmpq); + if (unlikely(i == ~0U)) + goto err; if (rx->flags & XEN_NETRXF_csum_blank) skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index a21caea1e080..2008fa62a373 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -245,6 +245,10 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) offset += len; ngrps++; } + for ( ; grpid <= NVMET_MAX_ANAGRPS; grpid++) { + if (nvmet_ana_group_enabled[grpid]) + ngrps++; + } hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index c00f82cc54aa..ee80e79db21a 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -89,6 +89,9 @@ static enum pci_protocol_version_t pci_protocol_version; #define STATUS_REVISION_MISMATCH 0xC0000059 +/* space for 32bit serial number as string */ +#define SLOT_NAME_SIZE 11 + /* * Message Types */ @@ -494,6 +497,7 @@ struct hv_pci_dev { struct list_head list_entry; refcount_t refs; enum hv_pcichild_state state; + struct pci_slot *pci_slot; struct pci_function_description desc; bool reported_missing; struct hv_pcibus_device *hbus; @@ -1457,6 +1461,34 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus) spin_unlock_irqrestore(&hbus->device_list_lock, flags); } +/* + * Assign entries in sysfs pci slot directory. + * + * Note that this function does not need to lock the children list + * because it is called from pci_devices_present_work which + * is serialized with hv_eject_device_work because they are on the + * same ordered workqueue. Therefore hbus->children list will not change + * even when pci_create_slot sleeps. + */ +static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) +{ + struct hv_pci_dev *hpdev; + char name[SLOT_NAME_SIZE]; + int slot_nr; + + list_for_each_entry(hpdev, &hbus->children, list_entry) { + if (hpdev->pci_slot) + continue; + + slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); + snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); + hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr, + name, NULL); + if (!hpdev->pci_slot) + pr_warn("pci_create slot %s failed\n", name); + } +} + /** * create_root_hv_pci_bus() - Expose a new root PCI bus * @hbus: Root PCI bus, as understood by this driver @@ -1480,6 +1512,7 @@ static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) pci_lock_rescan_remove(); pci_scan_child_bus(hbus->pci_bus); pci_bus_assign_resources(hbus->pci_bus); + hv_pci_assign_slots(hbus); pci_bus_add_devices(hbus->pci_bus); pci_unlock_rescan_remove(); hbus->state = hv_pcibus_installed; @@ -1742,6 +1775,7 @@ static void pci_devices_present_work(struct work_struct *work) */ pci_lock_rescan_remove(); pci_scan_child_bus(hbus->pci_bus); + hv_pci_assign_slots(hbus); pci_unlock_rescan_remove(); break; @@ -1858,6 +1892,9 @@ static void hv_eject_device_work(struct work_struct *work) list_del(&hpdev->list_entry); spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags); + if (hpdev->pci_slot) + pci_destroy_slot(hpdev->pci_slot); + memset(&ctxt, 0, sizeof(ctxt)); ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; diff --git a/drivers/platform/x86/alienware-wmi.c b/drivers/platform/x86/alienware-wmi.c index d975462a4c57..f10af5c383c5 100644 --- a/drivers/platform/x86/alienware-wmi.c +++ b/drivers/platform/x86/alienware-wmi.c @@ -536,6 +536,7 @@ static acpi_status alienware_wmax_command(struct wmax_basic_args *in_args, if (obj && obj->type == ACPI_TYPE_INTEGER) *out_data = (u32) obj->integer.value; } + kfree(output.pointer); return status; } diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c index 88afe5651d24..cf2229ece9ff 100644 --- a/drivers/platform/x86/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell-smbios-wmi.c @@ -78,6 +78,7 @@ static int run_smbios_call(struct wmi_device *wdev) dev_dbg(&wdev->dev, "result: [%08x,%08x,%08x,%08x]\n", priv->buf->std.output[0], priv->buf->std.output[1], priv->buf->std.output[2], priv->buf->std.output[3]); + kfree(output.pointer); return 0; } diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index fecf96f0225c..199d3ba1916d 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -374,8 +374,8 @@ struct atio_from_isp { static inline int fcpcmd_is_corrupted(struct atio *atio) { if (atio->entry_type == ATIO_TYPE7 && - (le16_to_cpu(atio->attr_n_length & FCP_CMD_LENGTH_MASK) < - FCP_CMD_LENGTH_MIN)) + ((le16_to_cpu(atio->attr_n_length) & FCP_CMD_LENGTH_MASK) < + FCP_CMD_LENGTH_MIN)) return 1; else return 0; diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 7cb3ab0a35a0..3082e72e4f6c 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -30,7 +30,11 @@ #define DRIVER_NAME "fsl-dspi" +#ifdef CONFIG_M5441x +#define DSPI_FIFO_SIZE 16 +#else #define DSPI_FIFO_SIZE 4 +#endif #define DSPI_DMA_BUFSIZE (DSPI_FIFO_SIZE * 1024) #define SPI_MCR 0x00 @@ -623,9 +627,11 @@ static void dspi_tcfq_read(struct fsl_dspi *dspi) static void dspi_eoq_write(struct fsl_dspi *dspi) { int fifo_size = DSPI_FIFO_SIZE; + u16 xfer_cmd = dspi->tx_cmd; /* Fill TX FIFO with as many transfers as possible */ while (dspi->len && fifo_size--) { + dspi->tx_cmd = xfer_cmd; /* Request EOQF for last transfer in FIFO */ if (dspi->len == dspi->bytes_per_word || fifo_size == 0) dspi->tx_cmd |= SPI_PUSHR_CMD_EOQ; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index ec395a6baf9c..9da0bc5a036c 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2143,8 +2143,17 @@ int spi_register_controller(struct spi_controller *ctlr) */ if (ctlr->num_chipselect == 0) return -EINVAL; - /* allocate dynamic bus number using Linux idr */ - if ((ctlr->bus_num < 0) && ctlr->dev.of_node) { + if (ctlr->bus_num >= 0) { + /* devices with a fixed bus num must check-in with the num */ + mutex_lock(&board_lock); + id = idr_alloc(&spi_master_idr, ctlr, ctlr->bus_num, + ctlr->bus_num + 1, GFP_KERNEL); + mutex_unlock(&board_lock); + if (WARN(id < 0, "couldn't get idr")) + return id == -ENOSPC ? -EBUSY : id; + ctlr->bus_num = id; + } else if (ctlr->dev.of_node) { + /* allocate dynamic bus number using Linux idr */ id = of_alias_get_id(ctlr->dev.of_node, "spi"); if (id >= 0) { ctlr->bus_num = id; diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 9cdfccbdd06f..cc756a123fd8 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -1416,7 +1416,8 @@ static void iscsit_do_crypto_hash_buf(struct ahash_request *hash, sg_init_table(sg, ARRAY_SIZE(sg)); sg_set_buf(sg, buf, payload_length); - sg_set_buf(sg + 1, pad_bytes, padding); + if (padding) + sg_set_buf(sg + 1, pad_bytes, padding); ahash_request_set_crypt(hash, sg, data_crc, payload_length + padding); @@ -3910,10 +3911,14 @@ static bool iscsi_target_check_conn_state(struct iscsi_conn *conn) static void iscsit_get_rx_pdu(struct iscsi_conn *conn) { int ret; - u8 buffer[ISCSI_HDR_LEN], opcode; + u8 *buffer, opcode; u32 checksum = 0, digest = 0; struct kvec iov; + buffer = kcalloc(ISCSI_HDR_LEN, sizeof(*buffer), GFP_KERNEL); + if (!buffer) + return; + while (!kthread_should_stop()) { /* * Ensure that both TX and RX per connection kthreads @@ -3921,7 +3926,6 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn) */ iscsit_thread_check_cpumask(conn, current, 0); - memset(buffer, 0, ISCSI_HDR_LEN); memset(&iov, 0, sizeof(struct kvec)); iov.iov_base = buffer; @@ -3930,7 +3934,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn) ret = rx_data(conn, &iov, 1, ISCSI_HDR_LEN); if (ret != ISCSI_HDR_LEN) { iscsit_rx_thread_wait_for_tcp(conn); - return; + break; } if (conn->conn_ops->HeaderDigest) { @@ -3940,7 +3944,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn) ret = rx_data(conn, &iov, 1, ISCSI_CRC_LEN); if (ret != ISCSI_CRC_LEN) { iscsit_rx_thread_wait_for_tcp(conn); - return; + break; } iscsit_do_crypto_hash_buf(conn->conn_rx_hash, buffer, @@ -3964,7 +3968,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn) } if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT) - return; + break; opcode = buffer[0] & ISCSI_OPCODE_MASK; @@ -3975,13 +3979,15 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn) " while in Discovery Session, rejecting.\n", opcode); iscsit_add_reject(conn, ISCSI_REASON_PROTOCOL_ERROR, buffer); - return; + break; } ret = iscsi_target_rx_opcode(conn, buffer); if (ret < 0) - return; + break; } + + kfree(buffer); } int iscsi_target_rx_thread(void *arg) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e2902d394f1b..f93f9881ec18 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -76,7 +76,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; else if (unlikely(((char *) de - buf) + rlen > size)) - error_msg = "directory entry across range"; + error_msg = "directory entry overrun"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -85,18 +85,16 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); else ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); + "bad entry in directory: %s - offset=%u, " + "inode=%u, rec_len=%d, name_len=%d, size=%d", + error_msg, offset, le32_to_cpu(de->inode), + rlen, de->name_len, size); return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0f0edd1cd0cd..caff935fbeb8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -43,6 +43,17 @@ #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) #include <linux/fscrypt.h> +#include <linux/compiler.h> + +/* Until this gets included into linux/compiler-gcc.h */ +#ifndef __nonstring +#if defined(GCC_VERSION) && (GCC_VERSION >= 80000) +#define __nonstring __attribute__((nonstring)) +#else +#define __nonstring +#endif +#endif + /* * The fourth extended filesystem constants/structures */ @@ -675,6 +686,9 @@ enum { /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF +/* Max logical block we can support */ +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF + /* * Structure of an inode on the disk */ @@ -1226,7 +1240,7 @@ struct ext4_super_block { __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[16]; /* volume name */ -/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only @@ -1277,13 +1291,13 @@ struct ext4_super_block { __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ - __u8 s_first_error_func[32]; /* function where the error happened */ + __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ - __u8 s_last_error_func[32]; /* function where the error happened */ + __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3543fe80a3c4..7b4736022761 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1753,6 +1753,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; + size_t inline_len; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; @@ -1780,8 +1781,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) goto out; } + inline_len = ext4_get_inline_size(dir); offset = EXT4_INLINE_DOTDOT_SIZE; - while (offset < dir->i_size) { + while (offset < inline_len) { de = ext4_get_inline_entry(dir, &iloc, offset, &inline_pos, &inline_size); if (ext4_check_dir_entry(dir, NULL, de, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d0dd585add6a..d767e993591d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3413,12 +3413,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; - unsigned long first_block = offset >> blkbits; - unsigned long last_block = (offset + length - 1) >> blkbits; + unsigned long first_block, last_block; struct ext4_map_blocks map; bool delalloc = false; int ret; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + first_block = offset >> blkbits; + last_block = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK); if (flags & IOMAP_REPORT) { if (ext4_has_inline_data(inode)) { @@ -3948,6 +3952,7 @@ static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = noop_set_page_dirty, + .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, }; @@ -4192,9 +4197,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return 0; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock) +static void ext4_wait_dax_page(struct ext4_inode_info *ei) { - *did_unlock = true; up_write(&ei->i_mmap_sem); schedule(); down_write(&ei->i_mmap_sem); @@ -4204,14 +4208,12 @@ int ext4_break_layouts(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; - bool retry; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) return -EINVAL; do { - retry = false; page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; @@ -4219,8 +4221,8 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei, &retry)); - } while (error == 0 && retry); + ext4_wait_dax_page(ei)); + } while (error == 0); return error; } @@ -4895,6 +4897,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext4_set_inode_flags(inode); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) @@ -5041,7 +5044,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; } brelse(iloc.bh); - ext4_set_inode_flags(inode); unlock_new_inode(inode); return inode; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 39b07c2d3384..2305b4374fd3 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -49,7 +49,6 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) */ sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); - mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 116ff68c5bd4..377d516c475f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3478,6 +3478,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if (new.inode && new.inode->i_nlink == 0) { + EXT4_ERROR_INODE(new.inode, + "target of rename is already freed"); + return -EFSCORRUPTED; + } + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e5fb38451a73..ebbc663d0798 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -19,6 +19,7 @@ int ext4_resize_begin(struct super_block *sb) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int ret = 0; if (!capable(CAP_SYS_RESOURCE)) @@ -29,7 +30,7 @@ int ext4_resize_begin(struct super_block *sb) * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != + if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); @@ -1986,6 +1987,26 @@ retry: } } + /* + * Make sure the last group has enough space so that it's + * guaranteed to have enough space for all metadata blocks + * that it might need to hold. (We might not need to store + * the inode table blocks in the last block group, but there + * will be cases where this might be needed.) + */ + if ((ext4_group_first_block_no(sb, n_group) + + ext4_group_overhead_blocks(sb, n_group) + 2 + + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { + n_blocks_count = ext4_group_first_block_no(sb, n_group); + n_group--; + n_blocks_count_retry = 0; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + goto retry; + } + /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5863fd22e90b..1145109968ef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2145,6 +2145,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); + if (DUMMY_ENCRYPTION_ENABLED(sbi)) + SEQ_OPTS_PUTS("test_dummy_encryption"); ext4_show_quota_options(seq, sb); return 0; @@ -4378,11 +4380,13 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d9ebe11c8990..1d098c3c00e0 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -342,6 +342,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, * for this bh as it's not marked locally * uptodate. */ status = -EIO; + clear_buffer_needs_validate(bh); put_bh(bh); bhs[i] = NULL; continue; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ad72261ee3fe..d297fe4472a9 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) ret = -EFAULT; goto out; } + m = NULL; /* skip the list anchor */ } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 23e7042666a7..bf000c8aeffb 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1912,7 +1912,9 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->bu_mutex); } - ubifs_assert(c, c->lst.taken_empty_lebs > 0); + if (!c->need_recovery) + ubifs_assert(c, c->lst.taken_empty_lebs > 0); + return 0; } @@ -1954,6 +1956,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) int dev, vol; char *endptr; + if (!name || !*name) + return ERR_PTR(-EINVAL); + /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 61afdfee4b28..f5ad1ede7990 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -152,12 +152,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); @@ -190,7 +184,6 @@ out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_names -= fname_len(nm); host_ui->flags &= ~UBIFS_CRYPT_FL; -out_noent: mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -242,12 +235,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -269,7 +256,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_size += CALC_XATTR_BYTES(old_size); -out_noent: mutex_unlock(&host_ui->ui_mutex); make_bad_inode(inode); out_free: @@ -496,12 +482,6 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - - if (!host->i_nlink) { - err = -ENOENT; - goto out_noent; - } - host->i_ctime = current_time(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); @@ -521,7 +501,6 @@ out_cancel: host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); host_ui->xattr_names += fname_len(nm); -out_noent: mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); @@ -561,9 +540,6 @@ static int ubifs_xattr_remove(struct inode *host, const char *name) ubifs_assert(c, inode_is_locked(host)); - if (!host->i_nlink) - return -ENOENT; - if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h index 46a8009784df..152b3055e9e1 100644 --- a/include/drm/drm_drv.h +++ b/include/drm/drm_drv.h @@ -675,7 +675,7 @@ static inline bool drm_core_check_feature(struct drm_device *dev, int feature) static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_ATOMIC) || - dev->mode_config.funcs->atomic_commit != NULL; + (dev->mode_config.funcs && dev->mode_config.funcs->atomic_commit != NULL); } diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 763bbad1e258..4d36b27214fd 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -79,20 +79,6 @@ #define __noretpoline __attribute__((indirect_branch("keep"))) #endif -/* - * it doesn't make sense on ARM (currently the only user of __naked) - * to trace naked functions because then mcount is called without - * stack and frame pointer being set up and there is no chance to - * restore the lr register to the value before mcount was called. - * - * The asm() bodies of naked functions often depend on standard calling - * conventions, therefore they must be noinline and noclone. - * - * GCC 4.[56] currently fail to enforce this, so we must do so ourselves. - * See GCC PR44290. - */ -#define __naked __attribute__((naked)) noinline __noclone notrace - #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) #define __optimize(level) __attribute__((__optimize__(level))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3525c179698c..db192becfec4 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -226,6 +226,14 @@ struct ftrace_likely_data { #define notrace __attribute__((no_instrument_function)) #endif +/* + * it doesn't make sense on ARM (currently the only user of __naked) + * to trace naked functions because then mcount is called without + * stack and frame pointer being set up and there is no chance to + * restore the lr register to the value before mcount was called. + */ +#define __naked __attribute__((naked)) notrace + #define __compiler_offsetof(a, b) __builtin_offsetof(a, b) /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0205aee44ded..c926698040e0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -733,8 +733,6 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index a34539b7f750..7e6ac0114d55 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -133,15 +133,18 @@ struct vga_switcheroo_handler { * @can_switch: check if the device is in a position to switch now. * Mandatory. The client should return false if a user space process * has one of its device files open + * @gpu_bound: notify the client id to audio client when the GPU is bound. * * Client callbacks. A client can be either a GPU or an audio device on a GPU. * The @set_gpu_state and @can_switch methods are mandatory, @reprobe may be * set to NULL. For audio clients, the @reprobe member is bogus. + * OTOH, @gpu_bound is only for audio clients, and not used for GPU clients. */ struct vga_switcheroo_client_ops { void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state); void (*reprobe)(struct pci_dev *dev); bool (*can_switch)(struct pci_dev *dev); + void (*gpu_bound)(struct pci_dev *dev, enum vga_switcheroo_client_id); }; #if defined(CONFIG_VGA_SWITCHEROO) diff --git a/include/net/tls.h b/include/net/tls.h index d5c683e8bb22..0a769cf2f5f3 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -171,15 +171,14 @@ struct cipher_context { char *rec_seq; }; +union tls_crypto_context { + struct tls_crypto_info info; + struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; +}; + struct tls_context { - union { - struct tls_crypto_info crypto_send; - struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; - }; - union { - struct tls_crypto_info crypto_recv; - struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; - }; + union tls_crypto_context crypto_send; + union tls_crypto_context crypto_recv; struct list_head list; struct net_device *netdev; @@ -367,8 +366,8 @@ static inline void tls_fill_prepend(struct tls_context *ctx, * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = record_type; - buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version); - buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version); + buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version); + buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version); /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h index 6f1e1f3b3063..cd1773d0e08f 100644 --- a/include/sound/hdaudio.h +++ b/include/sound/hdaudio.h @@ -412,6 +412,7 @@ void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus); void snd_hdac_bus_stop_cmd_io(struct hdac_bus *bus); void snd_hdac_bus_enter_link_reset(struct hdac_bus *bus); void snd_hdac_bus_exit_link_reset(struct hdac_bus *bus); +int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset); void snd_hdac_bus_update_rirb(struct hdac_bus *bus); int snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status, diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index af9ef16cc34d..fdaaafdc7a00 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -407,6 +407,7 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, int snd_soc_dapm_link_dai_widgets(struct snd_soc_card *card); void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card); int snd_soc_dapm_new_pcm(struct snd_soc_card *card, + struct snd_soc_pcm_runtime *rtd, const struct snd_soc_pcm_stream *params, unsigned int num_params, struct snd_soc_dapm_widget *source, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 07548de5c988..7f2ff3a76995 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size { #define KVM_PPC_PAGE_SIZES_REAL 0x00000001 #define KVM_PPC_1T_SEGMENTS 0x00000002 +#define KVM_PPC_NO_HASH 0x00000004 struct kvm_ppc_smmu_info { __u64 flags; @@ -952,6 +953,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 +#define KVM_CAP_MSR_PLATFORM_INFO 159 +#define KVM_CAP_PPC_NESTED_HV 160 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h index f58cafa42f18..f39352cef382 100644 --- a/include/uapi/sound/skl-tplg-interface.h +++ b/include/uapi/sound/skl-tplg-interface.h @@ -10,6 +10,8 @@ #ifndef __HDA_TPLG_INTERFACE_H__ #define __HDA_TPLG_INTERFACE_H__ +#include <linux/types.h> + /* * Default types range from 0~12. type can range from 0 to 0xff * SST types start at higher to avoid any overlapping in future @@ -143,10 +145,10 @@ enum skl_module_param_type { }; struct skl_dfw_algo_data { - u32 set_params:2; - u32 rsvd:30; - u32 param_id; - u32 max; + __u32 set_params:2; + __u32 rsvd:30; + __u32 param_id; + __u32 max; char params[0]; } __packed; @@ -163,68 +165,68 @@ enum skl_tuple_type { /* v4 configuration data */ struct skl_dfw_v4_module_pin { - u16 module_id; - u16 instance_id; + __u16 module_id; + __u16 instance_id; } __packed; struct skl_dfw_v4_module_fmt { - u32 channels; - u32 freq; - u32 bit_depth; - u32 valid_bit_depth; - u32 ch_cfg; - u32 interleaving_style; - u32 sample_type; - u32 ch_map; + __u32 channels; + __u32 freq; + __u32 bit_depth; + __u32 valid_bit_depth; + __u32 ch_cfg; + __u32 interleaving_style; + __u32 sample_type; + __u32 ch_map; } __packed; struct skl_dfw_v4_module_caps { - u32 set_params:2; - u32 rsvd:30; - u32 param_id; - u32 caps_size; - u32 caps[HDA_SST_CFG_MAX]; + __u32 set_params:2; + __u32 rsvd:30; + __u32 param_id; + __u32 caps_size; + __u32 caps[HDA_SST_CFG_MAX]; } __packed; struct skl_dfw_v4_pipe { - u8 pipe_id; - u8 pipe_priority; - u16 conn_type:4; - u16 rsvd:4; - u16 memory_pages:8; + __u8 pipe_id; + __u8 pipe_priority; + __u16 conn_type:4; + __u16 rsvd:4; + __u16 memory_pages:8; } __packed; struct skl_dfw_v4_module { char uuid[SKL_UUID_STR_SZ]; - u16 module_id; - u16 instance_id; - u32 max_mcps; - u32 mem_pages; - u32 obs; - u32 ibs; - u32 vbus_id; - - u32 max_in_queue:8; - u32 max_out_queue:8; - u32 time_slot:8; - u32 core_id:4; - u32 rsvd1:4; - - u32 module_type:8; - u32 conn_type:4; - u32 dev_type:4; - u32 hw_conn_type:4; - u32 rsvd2:12; - - u32 params_fixup:8; - u32 converter:8; - u32 input_pin_type:1; - u32 output_pin_type:1; - u32 is_dynamic_in_pin:1; - u32 is_dynamic_out_pin:1; - u32 is_loadable:1; - u32 rsvd3:11; + __u16 module_id; + __u16 instance_id; + __u32 max_mcps; + __u32 mem_pages; + __u32 obs; + __u32 ibs; + __u32 vbus_id; + + __u32 max_in_queue:8; + __u32 max_out_queue:8; + __u32 time_slot:8; + __u32 core_id:4; + __u32 rsvd1:4; + + __u32 module_type:8; + __u32 conn_type:4; + __u32 dev_type:4; + __u32 hw_conn_type:4; + __u32 rsvd2:12; + + __u32 params_fixup:8; + __u32 converter:8; + __u32 input_pin_type:1; + __u32 output_pin_type:1; + __u32 is_dynamic_in_pin:1; + __u32 is_dynamic_out_pin:1; + __u32 is_loadable:1; + __u32 rsvd3:11; struct skl_dfw_v4_pipe pipe; struct skl_dfw_v4_module_fmt in_fmt[MAX_IN_QUEUE]; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..bb07e74b34a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3163,7 +3163,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } diff --git a/kernel/pid.c b/kernel/pid.c index de1cfc4f75a2..cdf63e53a014 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) idr_preload_end(); if (nr < 0) { - retval = nr; + retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } diff --git a/kernel/sys.c b/kernel/sys.c index cf5c67533ff1..123bd73046ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -71,9 +71,6 @@ #include <asm/io.h> #include <asm/unistd.h> -/* Hardening for Spectre-v1 */ -#include <linux/nospec.h> - #include "uid16.h" #ifndef SET_UNALIGN_CTL diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d92d4a982fd..65bd4616220d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + cond_resched(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, &tmp_iter_page); diff --git a/mm/Kconfig b/mm/Kconfig index a550635ea5c3..de64ea658716 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -637,6 +637,7 @@ config DEFERRED_STRUCT_PAGE_INIT depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM + depends on 64BIT help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable diff --git a/mm/shmem.c b/mm/shmem.c index 0376c124b043..446942677cd4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2227,6 +2227,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode mpol_shared_policy_init(&info->policy, NULL); break; } + + lockdep_annotate_inode_mutex_key(inode); } else shmem_free_inode(sb); return inode; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e7d25504651..c7ce2c161225 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -476,6 +476,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); + + /* + * Make sure we apply some minimal pressure on default priority + * even on small cgroups. Stale objects are not only consuming memory + * by themselves, but can also hold a reference to a dying cgroup, + * preventing it from being reclaimed. A dying cgroup with all + * corresponding structures like per-cpu stats and kmem caches + * can be really big, so it may lead to a significant waste of memory. + */ + delta = max_t(unsigned long long, delta, min(freeable, batch_size)); + total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ae91e2d40056..3a7b0773536b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -83,6 +83,7 @@ enum { struct smp_dev { /* Secure Connections OOB data */ + bool local_oob; u8 local_pk[64]; u8 local_rand[16]; bool debug_key; @@ -599,6 +600,8 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) memcpy(rand, smp->local_rand, 16); + smp->local_oob = true; + return 0; } @@ -1785,7 +1788,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ - if (req->oob_flag == SMP_OOB_PRESENT) + if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); /* SMP over BR/EDR requires special treatment */ @@ -1967,7 +1970,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ - if (rsp->oob_flag == SMP_OOB_PRESENT) + if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); smp->prsp[0] = SMP_CMD_PAIRING_RSP; @@ -2697,7 +2700,13 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) * key was set/generated. */ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { - struct smp_dev *smp_dev = chan->data; + struct l2cap_chan *hchan = hdev->smp_data; + struct smp_dev *smp_dev; + + if (!hchan || !hchan->data) + return SMP_UNSPECIFIED; + + smp_dev = hchan->data; tfm_ecdh = smp_dev->tfm_ecdh; } else { @@ -3230,6 +3239,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) return ERR_CAST(tfm_ecdh); } + smp->local_oob = false; smp->tfm_aes = tfm_aes; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; diff --git a/net/core/filter.c b/net/core/filter.c index aecdeba052d3..5e00f2b85a56 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2344,7 +2344,8 @@ BPF_CALL_4(bpf_msg_pull_data, if (unlikely(bytes_sg_total > copy)) return -EINVAL; - page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy)); if (unlikely(!page)) return -ENOMEM; p = page_address(page); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index aa19d86937af..91592fceeaad 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1180,6 +1180,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } + /* Update confirmed timestamp for neighbour entry after we + * received ARP packet even if it doesn't change IP to MAC binding. + */ + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1201,15 +1207,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } - /* Update timestamps only once we know we will make a change to the + /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ - if (new != old || lladdr != neigh->ha) { - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; + if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; - } if (new != old) { neigh_del_timer(neigh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 60c928894a78..63ce2283a456 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2810,7 +2810,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, 0U); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 20fda8fb8ffd..1fbe2f815474 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1377,6 +1377,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; + skb_reset_mac_len(skb); } while ((skb = skb->next)); out: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f4e35b2ff8b8..7d69dd6fa7e8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2124,6 +2124,28 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, inet_compute_pseudo); } +/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and + * return code conversion for ip layer consumption + */ +static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, + struct udphdr *uh) +{ + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + inet_compute_pseudo); + + ret = udp_queue_rcv_skb(sk, skb); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; +} + /* * All we need to do is get the socket, and then do a checksum. */ @@ -2170,14 +2192,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp_sk_rx_dst_set(sk, dst); - ret = udp_queue_rcv_skb(sk, skb); + ret = udp_unicast_rcv_skb(sk, skb, uh); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; + return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) @@ -2185,22 +2202,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); - if (sk) { - int ret; - - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - inet_compute_pseudo); - - ret = udp_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ - if (ret > 0) - return -ret; - return 0; - } + if (sk) + return udp_unicast_rcv_skb(sk, skb, uh); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 37ff4805b20c..c7e495f12011 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -115,6 +115,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; + skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 16f200f06500..f9f8f554d141 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -219,12 +219,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -ENOBUFS; } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); consume_skb(skb); skb = skb2; - /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, - * it is safe to call in our context (socket lock not held) - */ - skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 18e00ce1719a..480a79f47c52 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -946,8 +946,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) { - rt->dst.flags |= fib6_info_dst_flags(ort); - if (ort->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, ort); return; @@ -4670,20 +4668,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rtmsg *rtm; + struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6key *rt6_dst, *rt6_src; + u32 *pmetrics, table, rt6_flags; struct nlmsghdr *nlh; + struct rtmsg *rtm; long expires = 0; - u32 *pmetrics; - u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; + if (rt6) { + rt6_dst = &rt6->rt6i_dst; + rt6_src = &rt6->rt6i_src; + rt6_flags = rt6->rt6i_flags; + } else { + rt6_dst = &rt->fib6_dst; + rt6_src = &rt->fib6_src; + rt6_flags = rt->fib6_flags; + } + rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; - rtm->rtm_dst_len = rt->fib6_dst.plen; - rtm->rtm_src_len = rt->fib6_src.plen; + rtm->rtm_dst_len = rt6_dst->plen; + rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; @@ -4698,7 +4707,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; - if (rt->fib6_flags & RTF_CACHE) + if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { @@ -4706,7 +4715,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { @@ -4714,12 +4723,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE - if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { + if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) @@ -4754,7 +4763,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ - if (rt->fib6_nsiblings) { + if (rt6) { + if (rt6_flags & RTF_GATEWAY && + nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) + goto nla_put_failure; + + if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) + goto nla_put_failure; + } else if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; struct nlattr *mp; @@ -4777,7 +4793,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } - if (rt->fib6_flags & RTF_EXPIRES) { + if (rt6_flags & RTF_EXPIRES) { expires = dst ? dst->expires : rt->expires; expires -= jiffies; } @@ -4785,7 +4801,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; - if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 83f4c77c79d8..28c4aa5078fc 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -752,6 +752,28 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) } } +/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and + * return code conversion for ip layer consumption + */ +static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, + struct udphdr *uh) +{ + int ret; + + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) + skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + ip6_compute_pseudo); + + ret = udpv6_queue_rcv_skb(sk, skb); + + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -803,13 +825,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(sk->sk_rx_dst != dst)) udp6_sk_rx_dst_set(sk, dst); - ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); + if (!uh->check && !udp_sk(sk)->no_check6_rx) { + sock_put(sk); + goto report_csum_error; + } - /* a return value > 0 means to resubmit the input */ - if (ret > 0) - return ret; - return 0; + ret = udp6_unicast_rcv_skb(sk, skb, uh); + sock_put(sk); + return ret; } /* @@ -822,30 +845,13 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* Unicast */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { - int ret; - - if (!uh->check && !udp_sk(sk)->no_check6_rx) { - udp6_csum_zero_error(skb); - goto csum_error; - } - - if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); - - ret = udpv6_queue_rcv_skb(sk, skb); - - /* a return value > 0 means to resubmit the input */ - if (ret > 0) - return ret; - - return 0; + if (!uh->check && !udp_sk(sk)->no_check6_rx) + goto report_csum_error; + return udp6_unicast_rcv_skb(sk, skb, uh); } - if (!uh->check) { - udp6_csum_zero_error(skb); - goto csum_error; - } + if (!uh->check) + goto report_csum_error; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; @@ -866,6 +872,9 @@ short_packet: ulen, skb->len, daddr, ntohs(uh->dest)); goto discard; + +report_csum_error: + udp6_csum_zero_error(skb); csum_error: __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 44e9c00657bc..6b67aa13d2dd 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_sample_ops, bind, false); + &act_sample_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1a67af8a6e8c..0a75cb2e5e7b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1902,6 +1902,8 @@ replay: RTM_NEWCHAIN, false); break; case RTM_DELCHAIN: + tfilter_notify_chain(net, skb, block, q, parent, n, + chain, RTM_DELTFILTER); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain); /* In case the chain was successfully deleted, put a reference diff --git a/net/socket.c b/net/socket.c index e6945e318f02..01f3f8f32d6f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -941,7 +941,8 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *)) EXPORT_SYMBOL(dlci_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg, + unsigned int ifreq_size) { int err; void __user *argp = (void __user *)arg; @@ -967,11 +968,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, } else { struct ifreq ifr; bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (copy_from_user(&ifr, argp, ifreq_size)) return -EFAULT; err = dev_ioctl(net, cmd, &ifr, &need_copyout); if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + if (copy_to_user(argp, &ifr, ifreq_size)) return -EFAULT; } return err; @@ -1070,7 +1071,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; default: - err = sock_do_ioctl(net, sock, cmd, arg); + err = sock_do_ioctl(net, sock, cmd, arg, + sizeof(struct ifreq)); break; } return err; @@ -2750,7 +2752,8 @@ static int do_siocgstamp(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv, + sizeof(struct compat_ifreq)); set_fs(old_fs); if (!err) err = compat_put_timeval(&ktv, up); @@ -2766,7 +2769,8 @@ static int do_siocgstampns(struct net *net, struct socket *sock, int err; set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); + err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts, + sizeof(struct compat_ifreq)); set_fs(old_fs); if (!err) err = compat_put_timespec(&kts, up); @@ -3072,7 +3076,8 @@ static int routing_ioctl(struct net *net, struct socket *sock, } set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); + ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r, + sizeof(struct compat_ifreq)); set_fs(old_fs); out: @@ -3185,7 +3190,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCGIFNAME: - return sock_do_ioctl(net, sock, cmd, arg); + return sock_do_ioctl(net, sock, cmd, arg, + sizeof(struct compat_ifreq)); } return -ENOIOCTLCMD; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 292742e50bfa..961b07d4d41c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -686,7 +686,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto free_marker_record; } - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -780,7 +780,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, - &ctx->crypto_send, + &ctx->crypto_send.info, tcp_sk(sk)->write_seq); if (rc) goto release_netdev; @@ -862,7 +862,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, - &ctx->crypto_recv, + &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); if (rc) { pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 6102169239d1..450a6dbc5a88 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -320,7 +320,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, goto free_req; iv = buf; - memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt, + memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 180b6640e531..523622dc74f8 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -241,6 +241,16 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } +static void tls_ctx_free(struct tls_context *ctx) +{ + if (!ctx) + return; + + memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); + memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); + kfree(ctx); +} + static void tls_sk_proto_close(struct sock *sk, long timeout) { struct tls_context *ctx = tls_get_ctx(sk); @@ -294,7 +304,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #else { #endif - kfree(ctx); + tls_ctx_free(ctx); ctx = NULL; } @@ -305,7 +315,7 @@ skip_tx_cleanup: * for sk->sk_prot->unhash [tls_hw_unhash] */ if (free_ctx) - kfree(ctx); + tls_ctx_free(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -330,7 +340,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } /* get user crypto info */ - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,9 +427,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } if (tx) - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; else - crypto_info = &ctx->crypto_recv; + crypto_info = &ctx->crypto_recv.info; /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { @@ -499,7 +509,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto out; err_crypto_info: - memset(crypto_info, 0, sizeof(*crypto_info)); + memzero_explicit(crypto_info, sizeof(union tls_crypto_context)); out: return rc; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e28a6ff25d96..b9c6ecfbcfea 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -931,7 +931,15 @@ int tls_sw_recvmsg(struct sock *sk, if (control != TLS_RECORD_TYPE_DATA) goto recv_end; } + } else { + /* MSG_PEEK right now cannot look beyond current skb + * from strparser, meaning we cannot advance skb here + * and thus unpause strparser since we'd loose original + * one. + */ + break; } + /* If we have a new message from strparser, continue now. */ if (copied >= target && !ctx->recv_pkt) break; @@ -1055,8 +1063,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } - if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || - header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) { ret = -EINVAL; goto read_failure; } @@ -1136,7 +1144,6 @@ void tls_sw_free_resources_rx(struct sock *sk) int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { - char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; @@ -1181,12 +1188,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (tx) { crypto_init_wait(&sw_ctx_tx->async_wait); - crypto_info = &ctx->crypto_send; + crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { crypto_init_wait(&sw_ctx_rx->async_wait); - crypto_info = &ctx->crypto_recv; + crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; } @@ -1265,9 +1272,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ctx->push_pending_record = tls_sw_push_pending_record; - memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - - rc = crypto_aead_setkey(*aead, keyval, + rc = crypto_aead_setkey(*aead, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; diff --git a/scripts/subarch.include b/scripts/subarch.include new file mode 100644 index 000000000000..650682821126 --- /dev/null +++ b/scripts/subarch.include @@ -0,0 +1,13 @@ +# SUBARCH tells the usermode build what the underlying arch is. That is set +# first, and if a usermode build is happening, the "ARCH=um" on the command +# line overrides the setting of ARCH below. If a native build is happening, +# then ARCH is assigned, getting whatever value it gets normally, and +# SUBARCH is subsequently ignored. + +SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ + -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ -e s/sa110/arm/ \ + -e s/s390x/s390/ -e s/parisc64/parisc/ \ + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ + -e s/riscv.*/riscv/) diff --git a/sound/firewire/bebob/bebob.c b/sound/firewire/bebob/bebob.c index 730ea91d9be8..93676354f87f 100644 --- a/sound/firewire/bebob/bebob.c +++ b/sound/firewire/bebob/bebob.c @@ -263,6 +263,8 @@ do_registration(struct work_struct *work) error: mutex_unlock(&devices_mutex); snd_bebob_stream_destroy_duplex(bebob); + kfree(bebob->maudio_special_quirk); + bebob->maudio_special_quirk = NULL; snd_card_free(bebob->card); dev_info(&bebob->unit->device, "Sound card registration failed: %d\n", err); diff --git a/sound/firewire/bebob/bebob_maudio.c b/sound/firewire/bebob/bebob_maudio.c index bd55620c6a47..c266997ad299 100644 --- a/sound/firewire/bebob/bebob_maudio.c +++ b/sound/firewire/bebob/bebob_maudio.c @@ -96,17 +96,13 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit) struct fw_device *device = fw_parent_device(unit); int err, rcode; u64 date; - __le32 cues[3] = { - cpu_to_le32(MAUDIO_BOOTLOADER_CUE1), - cpu_to_le32(MAUDIO_BOOTLOADER_CUE2), - cpu_to_le32(MAUDIO_BOOTLOADER_CUE3) - }; + __le32 *cues; /* check date of software used to build */ err = snd_bebob_read_block(unit, INFO_OFFSET_SW_DATE, &date, sizeof(u64)); if (err < 0) - goto end; + return err; /* * firmware version 5058 or later has date later than "20070401", but * 'date' is not null-terminated. @@ -114,20 +110,28 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit) if (date < 0x3230303730343031LL) { dev_err(&unit->device, "Use firmware version 5058 or later\n"); - err = -ENOSYS; - goto end; + return -ENXIO; } + cues = kmalloc_array(3, sizeof(*cues), GFP_KERNEL); + if (!cues) + return -ENOMEM; + + cues[0] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE1); + cues[1] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE2); + cues[2] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE3); + rcode = fw_run_transaction(device->card, TCODE_WRITE_BLOCK_REQUEST, device->node_id, device->generation, device->max_speed, BEBOB_ADDR_REG_REQ, - cues, sizeof(cues)); + cues, 3 * sizeof(*cues)); + kfree(cues); if (rcode != RCODE_COMPLETE) { dev_err(&unit->device, "Failed to send a cue to load firmware\n"); err = -EIO; } -end: + return err; } @@ -290,10 +294,6 @@ snd_bebob_maudio_special_discover(struct snd_bebob *bebob, bool is1814) bebob->midi_output_ports = 2; } end: - if (err < 0) { - kfree(params); - bebob->maudio_special_quirk = NULL; - } mutex_unlock(&bebob->mutex); return err; } diff --git a/sound/firewire/digi00x/digi00x.c b/sound/firewire/digi00x/digi00x.c index 1f5e1d23f31a..ef689997d6a5 100644 --- a/sound/firewire/digi00x/digi00x.c +++ b/sound/firewire/digi00x/digi00x.c @@ -49,6 +49,7 @@ static void dg00x_free(struct snd_dg00x *dg00x) fw_unit_put(dg00x->unit); mutex_destroy(&dg00x->mutex); + kfree(dg00x); } static void dg00x_card_free(struct snd_card *card) diff --git a/sound/firewire/fireface/ff-protocol-ff400.c b/sound/firewire/fireface/ff-protocol-ff400.c index ad7a0a32557d..64c3cb0fb926 100644 --- a/sound/firewire/fireface/ff-protocol-ff400.c +++ b/sound/firewire/fireface/ff-protocol-ff400.c @@ -146,6 +146,7 @@ static int ff400_switch_fetching_mode(struct snd_ff *ff, bool enable) { __le32 *reg; int i; + int err; reg = kcalloc(18, sizeof(__le32), GFP_KERNEL); if (reg == NULL) @@ -163,9 +164,11 @@ static int ff400_switch_fetching_mode(struct snd_ff *ff, bool enable) reg[i] = cpu_to_le32(0x00000001); } - return snd_fw_transaction(ff->unit, TCODE_WRITE_BLOCK_REQUEST, - FF400_FETCH_PCM_FRAMES, reg, - sizeof(__le32) * 18, 0); + err = snd_fw_transaction(ff->unit, TCODE_WRITE_BLOCK_REQUEST, + FF400_FETCH_PCM_FRAMES, reg, + sizeof(__le32) * 18, 0); + kfree(reg); + return err; } static void ff400_dump_sync_status(struct snd_ff *ff, diff --git a/sound/firewire/fireworks/fireworks.c b/sound/firewire/fireworks/fireworks.c index 71a0613d3da0..f2d073365cf6 100644 --- a/sound/firewire/fireworks/fireworks.c +++ b/sound/firewire/fireworks/fireworks.c @@ -301,6 +301,8 @@ error: snd_efw_transaction_remove_instance(efw); snd_efw_stream_destroy_duplex(efw); snd_card_free(efw->card); + kfree(efw->resp_buf); + efw->resp_buf = NULL; dev_info(&efw->unit->device, "Sound card registration failed: %d\n", err); } diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index 1e5b2c802635..2ea8be6c8584 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -130,6 +130,7 @@ static void oxfw_free(struct snd_oxfw *oxfw) kfree(oxfw->spec); mutex_destroy(&oxfw->mutex); + kfree(oxfw); } /* @@ -207,6 +208,7 @@ static int detect_quirks(struct snd_oxfw *oxfw) static void do_registration(struct work_struct *work) { struct snd_oxfw *oxfw = container_of(work, struct snd_oxfw, dwork.work); + int i; int err; if (oxfw->registered) @@ -269,7 +271,15 @@ error: snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->rx_stream); if (oxfw->has_output) snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->tx_stream); + for (i = 0; i < SND_OXFW_STREAM_FORMAT_ENTRIES; ++i) { + kfree(oxfw->tx_stream_formats[i]); + oxfw->tx_stream_formats[i] = NULL; + kfree(oxfw->rx_stream_formats[i]); + oxfw->rx_stream_formats[i] = NULL; + } snd_card_free(oxfw->card); + kfree(oxfw->spec); + oxfw->spec = NULL; dev_info(&oxfw->unit->device, "Sound card registration failed: %d\n", err); } diff --git a/sound/firewire/tascam/tascam.c b/sound/firewire/tascam/tascam.c index 44ad41fb7374..d3fdc463a884 100644 --- a/sound/firewire/tascam/tascam.c +++ b/sound/firewire/tascam/tascam.c @@ -93,6 +93,7 @@ static void tscm_free(struct snd_tscm *tscm) fw_unit_put(tscm->unit); mutex_destroy(&tscm->mutex); + kfree(tscm); } static void tscm_card_free(struct snd_card *card) diff --git a/sound/hda/hdac_controller.c b/sound/hda/hdac_controller.c index 560ec0986e1a..74244d8e2909 100644 --- a/sound/hda/hdac_controller.c +++ b/sound/hda/hdac_controller.c @@ -40,6 +40,8 @@ static void azx_clear_corbrp(struct hdac_bus *bus) */ void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus) { + WARN_ON_ONCE(!bus->rb.area); + spin_lock_irq(&bus->reg_lock); /* CORB set up */ bus->corb.addr = bus->rb.addr; @@ -383,7 +385,7 @@ void snd_hdac_bus_exit_link_reset(struct hdac_bus *bus) EXPORT_SYMBOL_GPL(snd_hdac_bus_exit_link_reset); /* reset codec link */ -static int azx_reset(struct hdac_bus *bus, bool full_reset) +int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset) { if (!full_reset) goto skip_reset; @@ -408,7 +410,7 @@ static int azx_reset(struct hdac_bus *bus, bool full_reset) skip_reset: /* check to see if controller is ready */ if (!snd_hdac_chip_readb(bus, GCTL)) { - dev_dbg(bus->dev, "azx_reset: controller not ready!\n"); + dev_dbg(bus->dev, "controller not ready!\n"); return -EBUSY; } @@ -423,6 +425,7 @@ static int azx_reset(struct hdac_bus *bus, bool full_reset) return 0; } +EXPORT_SYMBOL_GPL(snd_hdac_bus_reset_link); /* enable interrupts */ static void azx_int_enable(struct hdac_bus *bus) @@ -477,15 +480,17 @@ bool snd_hdac_bus_init_chip(struct hdac_bus *bus, bool full_reset) return false; /* reset controller */ - azx_reset(bus, full_reset); + snd_hdac_bus_reset_link(bus, full_reset); - /* initialize interrupts */ + /* clear interrupts */ azx_int_clear(bus); - azx_int_enable(bus); /* initialize the codec command I/O */ snd_hdac_bus_init_cmd_io(bus); + /* enable interrupts after CORB/RIRB buffers are initialized above */ + azx_int_enable(bus); + /* program the position buffer */ if (bus->use_posbuf && bus->posbuf.addr) { snd_hdac_chip_writel(bus, DPLBASE, (u32)bus->posbuf.addr); diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c index 90713741c2dc..6ebe817801ea 100644 --- a/sound/pci/emu10k1/emufx.c +++ b/sound/pci/emu10k1/emufx.c @@ -2540,7 +2540,7 @@ static int snd_emu10k1_fx8010_ioctl(struct snd_hwdep * hw, struct file *file, un emu->support_tlv = 1; return put_user(SNDRV_EMU10K1_VERSION, (int __user *)argp); case SNDRV_EMU10K1_IOCTL_INFO: - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; snd_emu10k1_fx8010_info(emu, info); diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 1b2ce304152a..aa4c672dbaf7 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -365,8 +365,10 @@ enum { */ #ifdef SUPPORT_VGA_SWITCHEROO #define use_vga_switcheroo(chip) ((chip)->use_vga_switcheroo) +#define needs_eld_notify_link(chip) ((chip)->need_eld_notify_link) #else #define use_vga_switcheroo(chip) 0 +#define needs_eld_notify_link(chip) false #endif #define CONTROLLER_IN_GPU(pci) (((pci)->device == 0x0a0c) || \ @@ -453,6 +455,7 @@ static inline void mark_runtime_wc(struct azx *chip, struct azx_dev *azx_dev, #endif static int azx_acquire_irq(struct azx *chip, int do_disconnect); +static void set_default_power_save(struct azx *chip); /* * initialize the PCI registers @@ -1201,6 +1204,10 @@ static int azx_runtime_idle(struct device *dev) azx_bus(chip)->codec_powered || !chip->running) return -EBUSY; + /* ELD notification gets broken when HD-audio bus is off */ + if (needs_eld_notify_link(hda)) + return -EBUSY; + return 0; } @@ -1298,6 +1305,36 @@ static bool azx_vs_can_switch(struct pci_dev *pci) return true; } +/* + * The discrete GPU cannot power down unless the HDA controller runtime + * suspends, so activate runtime PM on codecs even if power_save == 0. + */ +static void setup_vga_switcheroo_runtime_pm(struct azx *chip) +{ + struct hda_intel *hda = container_of(chip, struct hda_intel, chip); + struct hda_codec *codec; + + if (hda->use_vga_switcheroo && !hda->need_eld_notify_link) { + list_for_each_codec(codec, &chip->bus) + codec->auto_runtime_pm = 1; + /* reset the power save setup */ + if (chip->running) + set_default_power_save(chip); + } +} + +static void azx_vs_gpu_bound(struct pci_dev *pci, + enum vga_switcheroo_client_id client_id) +{ + struct snd_card *card = pci_get_drvdata(pci); + struct azx *chip = card->private_data; + struct hda_intel *hda = container_of(chip, struct hda_intel, chip); + + if (client_id == VGA_SWITCHEROO_DIS) + hda->need_eld_notify_link = 0; + setup_vga_switcheroo_runtime_pm(chip); +} + static void init_vga_switcheroo(struct azx *chip) { struct hda_intel *hda = container_of(chip, struct hda_intel, chip); @@ -1306,6 +1343,7 @@ static void init_vga_switcheroo(struct azx *chip) dev_info(chip->card->dev, "Handle vga_switcheroo audio client\n"); hda->use_vga_switcheroo = 1; + hda->need_eld_notify_link = 1; /* cleared in gpu_bound op */ chip->driver_caps |= AZX_DCAPS_PM_RUNTIME; pci_dev_put(p); } @@ -1314,6 +1352,7 @@ static void init_vga_switcheroo(struct azx *chip) static const struct vga_switcheroo_client_ops azx_vs_ops = { .set_gpu_state = azx_vs_set_state, .can_switch = azx_vs_can_switch, + .gpu_bound = azx_vs_gpu_bound, }; static int register_vga_switcheroo(struct azx *chip) @@ -1339,6 +1378,7 @@ static int register_vga_switcheroo(struct azx *chip) #define init_vga_switcheroo(chip) /* NOP */ #define register_vga_switcheroo(chip) 0 #define check_hdmi_disabled(pci) false +#define setup_vga_switcheroo_runtime_pm(chip) /* NOP */ #endif /* SUPPORT_VGA_SWITCHER */ /* @@ -1352,6 +1392,7 @@ static int azx_free(struct azx *chip) if (azx_has_pm_runtime(chip) && chip->running) pm_runtime_get_noresume(&pci->dev); + chip->running = 0; azx_del_card_list(chip); @@ -2230,6 +2271,25 @@ static struct snd_pci_quirk power_save_blacklist[] = { }; #endif /* CONFIG_PM */ +static void set_default_power_save(struct azx *chip) +{ + int val = power_save; + +#ifdef CONFIG_PM + if (pm_blacklist) { + const struct snd_pci_quirk *q; + + q = snd_pci_quirk_lookup(chip->pci, power_save_blacklist); + if (q && val) { + dev_info(chip->card->dev, "device %04x:%04x is on the power_save blacklist, forcing power_save to 0\n", + q->subvendor, q->subdevice); + val = 0; + } + } +#endif /* CONFIG_PM */ + snd_hda_set_power_save(&chip->bus, val * 1000); +} + /* number of codec slots for each chipset: 0 = default slots (i.e. 4) */ static unsigned int azx_max_codecs[AZX_NUM_DRIVERS] = { [AZX_DRIVER_NVIDIA] = 8, @@ -2241,9 +2301,7 @@ static int azx_probe_continue(struct azx *chip) struct hda_intel *hda = container_of(chip, struct hda_intel, chip); struct hdac_bus *bus = azx_bus(chip); struct pci_dev *pci = chip->pci; - struct hda_codec *codec; int dev = chip->dev_index; - int val; int err; hda->probe_continued = 1; @@ -2322,31 +2380,13 @@ static int azx_probe_continue(struct azx *chip) if (err < 0) goto out_free; + setup_vga_switcheroo_runtime_pm(chip); + chip->running = 1; azx_add_card_list(chip); - val = power_save; -#ifdef CONFIG_PM - if (pm_blacklist) { - const struct snd_pci_quirk *q; - - q = snd_pci_quirk_lookup(chip->pci, power_save_blacklist); - if (q && val) { - dev_info(chip->card->dev, "device %04x:%04x is on the power_save blacklist, forcing power_save to 0\n", - q->subvendor, q->subdevice); - val = 0; - } - } -#endif /* CONFIG_PM */ - /* - * The discrete GPU cannot power down unless the HDA controller runtime - * suspends, so activate runtime PM on codecs even if power_save == 0. - */ - if (use_vga_switcheroo(hda)) - list_for_each_codec(codec, &chip->bus) - codec->auto_runtime_pm = 1; + set_default_power_save(chip); - snd_hda_set_power_save(&chip->bus, val * 1000); if (azx_has_pm_runtime(chip)) pm_runtime_put_autosuspend(&pci->dev); diff --git a/sound/pci/hda/hda_intel.h b/sound/pci/hda/hda_intel.h index e3a3d318d2e5..f59719e06b91 100644 --- a/sound/pci/hda/hda_intel.h +++ b/sound/pci/hda/hda_intel.h @@ -37,6 +37,7 @@ struct hda_intel { /* vga_switcheroo setup */ unsigned int use_vga_switcheroo:1; + unsigned int need_eld_notify_link:1; unsigned int vga_switcheroo_registered:1; unsigned int init_failed:1; /* delayed init failed */ diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c index e359938e3d7e..77b265bd0505 100644 --- a/sound/soc/amd/acp-pcm-dma.c +++ b/sound/soc/amd/acp-pcm-dma.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/delay.h> #include <linux/io.h> +#include <linux/iopoll.h> #include <linux/sizes.h> #include <linux/pm_runtime.h> @@ -184,6 +185,24 @@ static void config_dma_descriptor_in_sram(void __iomem *acp_mmio, acp_reg_write(descr_info->xfer_val, acp_mmio, mmACP_SRBM_Targ_Idx_Data); } +static void pre_config_reset(void __iomem *acp_mmio, u16 ch_num) +{ + u32 dma_ctrl; + int ret; + + /* clear the reset bit */ + dma_ctrl = acp_reg_read(acp_mmio, mmACP_DMA_CNTL_0 + ch_num); + dma_ctrl &= ~ACP_DMA_CNTL_0__DMAChRst_MASK; + acp_reg_write(dma_ctrl, acp_mmio, mmACP_DMA_CNTL_0 + ch_num); + /* check the reset bit before programming configuration registers */ + ret = readl_poll_timeout(acp_mmio + ((mmACP_DMA_CNTL_0 + ch_num) * 4), + dma_ctrl, + !(dma_ctrl & ACP_DMA_CNTL_0__DMAChRst_MASK), + 100, ACP_DMA_RESET_TIME); + if (ret < 0) + pr_err("Failed to clear reset of channel : %d\n", ch_num); +} + /* * Initialize the DMA descriptor information for transfer between * system memory <-> ACP SRAM @@ -236,6 +255,7 @@ static void set_acp_sysmem_dma_descriptors(void __iomem *acp_mmio, config_dma_descriptor_in_sram(acp_mmio, dma_dscr_idx, &dmadscr[i]); } + pre_config_reset(acp_mmio, ch); config_acp_dma_channel(acp_mmio, ch, dma_dscr_idx - 1, NUM_DSCRS_PER_CHANNEL, @@ -275,6 +295,7 @@ static void set_acp_to_i2s_dma_descriptors(void __iomem *acp_mmio, u32 size, config_dma_descriptor_in_sram(acp_mmio, dma_dscr_idx, &dmadscr[i]); } + pre_config_reset(acp_mmio, ch); /* Configure the DMA channel with the above descriptore */ config_acp_dma_channel(acp_mmio, ch, dma_dscr_idx - 1, NUM_DSCRS_PER_CHANNEL, diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c index 275677de669f..407554175282 100644 --- a/sound/soc/codecs/cs4265.c +++ b/sound/soc/codecs/cs4265.c @@ -157,8 +157,8 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = { SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2, 3, 1, 0), SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum), - SOC_SINGLE("MMTLR Data Switch", 0, - 1, 1, 0), + SOC_SINGLE("MMTLR Data Switch", CS4265_SPDIF_CTL2, + 0, 1, 0), SOC_ENUM("Mono Channel Select", spdif_mono_select_enum), SND_SOC_BYTES("C Data Buffer", CS4265_C_DATA_BUFF, 24), }; diff --git a/sound/soc/codecs/max98373.c b/sound/soc/codecs/max98373.c index 92b7125ea169..1093f766d0d2 100644 --- a/sound/soc/codecs/max98373.c +++ b/sound/soc/codecs/max98373.c @@ -520,6 +520,7 @@ static bool max98373_volatile_reg(struct device *dev, unsigned int reg) { switch (reg) { case MAX98373_R2000_SW_RESET ... MAX98373_R2009_INT_FLAG3: + case MAX98373_R203E_AMP_PATH_GAIN: case MAX98373_R2054_MEAS_ADC_PVDD_CH_READBACK: case MAX98373_R2055_MEAS_ADC_THERM_CH_READBACK: case MAX98373_R20B6_BDE_CUR_STATE_READBACK: @@ -729,6 +730,7 @@ static int max98373_probe(struct snd_soc_component *component) /* Software Reset */ regmap_write(max98373->regmap, MAX98373_R2000_SW_RESET, MAX98373_SOFT_RESET); + usleep_range(10000, 11000); /* IV default slot configuration */ regmap_write(max98373->regmap, @@ -817,6 +819,7 @@ static int max98373_resume(struct device *dev) regmap_write(max98373->regmap, MAX98373_R2000_SW_RESET, MAX98373_SOFT_RESET); + usleep_range(10000, 11000); regcache_cache_only(max98373->regmap, false); regcache_sync(max98373->regmap); return 0; diff --git a/sound/soc/codecs/rt5514.c b/sound/soc/codecs/rt5514.c index dca82dd6e3bf..32fe76c3134a 100644 --- a/sound/soc/codecs/rt5514.c +++ b/sound/soc/codecs/rt5514.c @@ -64,8 +64,8 @@ static const struct reg_sequence rt5514_patch[] = { {RT5514_ANA_CTRL_LDO10, 0x00028604}, {RT5514_ANA_CTRL_ADCFED, 0x00000800}, {RT5514_ASRC_IN_CTRL1, 0x00000003}, - {RT5514_DOWNFILTER0_CTRL3, 0x10000352}, - {RT5514_DOWNFILTER1_CTRL3, 0x10000352}, + {RT5514_DOWNFILTER0_CTRL3, 0x10000342}, + {RT5514_DOWNFILTER1_CTRL3, 0x10000342}, }; static const struct reg_default rt5514_reg[] = { @@ -92,10 +92,10 @@ static const struct reg_default rt5514_reg[] = { {RT5514_ASRC_IN_CTRL1, 0x00000003}, {RT5514_DOWNFILTER0_CTRL1, 0x00020c2f}, {RT5514_DOWNFILTER0_CTRL2, 0x00020c2f}, - {RT5514_DOWNFILTER0_CTRL3, 0x10000352}, + {RT5514_DOWNFILTER0_CTRL3, 0x10000342}, {RT5514_DOWNFILTER1_CTRL1, 0x00020c2f}, {RT5514_DOWNFILTER1_CTRL2, 0x00020c2f}, - {RT5514_DOWNFILTER1_CTRL3, 0x10000352}, + {RT5514_DOWNFILTER1_CTRL3, 0x10000342}, {RT5514_ANA_CTRL_LDO10, 0x00028604}, {RT5514_ANA_CTRL_LDO18_16, 0x02000345}, {RT5514_ANA_CTRL_ADC12, 0x0000a2a8}, diff --git a/sound/soc/codecs/rt5682.c b/sound/soc/codecs/rt5682.c index 640d400ca013..afe7d5b19313 100644 --- a/sound/soc/codecs/rt5682.c +++ b/sound/soc/codecs/rt5682.c @@ -750,8 +750,8 @@ static bool rt5682_readable_register(struct device *dev, unsigned int reg) } static const DECLARE_TLV_DB_SCALE(hp_vol_tlv, -2250, 150, 0); -static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0); -static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0); +static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -6525, 75, 0); +static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -1725, 75, 0); static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0); /* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */ @@ -1114,7 +1114,7 @@ static const struct snd_kcontrol_new rt5682_snd_controls[] = { /* DAC Digital Volume */ SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5682_DAC1_DIG_VOL, - RT5682_L_VOL_SFT, RT5682_R_VOL_SFT, 175, 0, dac_vol_tlv), + RT5682_L_VOL_SFT + 1, RT5682_R_VOL_SFT + 1, 86, 0, dac_vol_tlv), /* IN Boost Volume */ SOC_SINGLE_TLV("CBJ Boost Volume", RT5682_CBJ_BST_CTRL, @@ -1124,7 +1124,7 @@ static const struct snd_kcontrol_new rt5682_snd_controls[] = { SOC_DOUBLE("STO1 ADC Capture Switch", RT5682_STO1_ADC_DIG_VOL, RT5682_L_MUTE_SFT, RT5682_R_MUTE_SFT, 1, 1), SOC_DOUBLE_TLV("STO1 ADC Capture Volume", RT5682_STO1_ADC_DIG_VOL, - RT5682_L_VOL_SFT, RT5682_R_VOL_SFT, 127, 0, adc_vol_tlv), + RT5682_L_VOL_SFT + 1, RT5682_R_VOL_SFT + 1, 63, 0, adc_vol_tlv), /* ADC Boost Volume Control */ SOC_DOUBLE_TLV("STO1 ADC Boost Gain Volume", RT5682_STO1_ADC_BOOST, diff --git a/sound/soc/codecs/sigmadsp.c b/sound/soc/codecs/sigmadsp.c index d53680ac78e4..6df158669420 100644 --- a/sound/soc/codecs/sigmadsp.c +++ b/sound/soc/codecs/sigmadsp.c @@ -117,8 +117,7 @@ static int sigmadsp_ctrl_write(struct sigmadsp *sigmadsp, struct sigmadsp_control *ctrl, void *data) { /* safeload loads up to 20 bytes in a atomic operation */ - if (ctrl->num_bytes > 4 && ctrl->num_bytes <= 20 && sigmadsp->ops && - sigmadsp->ops->safeload) + if (ctrl->num_bytes <= 20 && sigmadsp->ops && sigmadsp->ops->safeload) return sigmadsp->ops->safeload(sigmadsp, ctrl->addr, data, ctrl->num_bytes); else diff --git a/sound/soc/codecs/tas6424.c b/sound/soc/codecs/tas6424.c index 14999b999fd3..0d6145549a98 100644 --- a/sound/soc/codecs/tas6424.c +++ b/sound/soc/codecs/tas6424.c @@ -424,8 +424,10 @@ static void tas6424_fault_check_work(struct work_struct *work) TAS6424_FAULT_PVDD_UV | TAS6424_FAULT_VBAT_UV; - if (reg) + if (!reg) { + tas6424->last_fault1 = reg; goto check_global_fault2_reg; + } /* * Only flag errors once for a given occurrence. This is needed as @@ -461,8 +463,10 @@ check_global_fault2_reg: TAS6424_FAULT_OTSD_CH3 | TAS6424_FAULT_OTSD_CH4; - if (!reg) + if (!reg) { + tas6424->last_fault2 = reg; goto check_warn_reg; + } if ((reg & TAS6424_FAULT_OTSD) && !(tas6424->last_fault2 & TAS6424_FAULT_OTSD)) dev_crit(dev, "experienced a global overtemp shutdown\n"); @@ -497,8 +501,10 @@ check_warn_reg: TAS6424_WARN_VDD_OTW_CH3 | TAS6424_WARN_VDD_OTW_CH4; - if (!reg) + if (!reg) { + tas6424->last_warn = reg; goto out; + } if ((reg & TAS6424_WARN_VDD_UV) && !(tas6424->last_warn & TAS6424_WARN_VDD_UV)) dev_warn(dev, "experienced a VDD under voltage condition\n"); diff --git a/sound/soc/codecs/wm8804-i2c.c b/sound/soc/codecs/wm8804-i2c.c index f27464c2c5ba..79541960f45d 100644 --- a/sound/soc/codecs/wm8804-i2c.c +++ b/sound/soc/codecs/wm8804-i2c.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/i2c.h> +#include <linux/acpi.h> #include "wm8804.h" @@ -40,17 +41,29 @@ static const struct i2c_device_id wm8804_i2c_id[] = { }; MODULE_DEVICE_TABLE(i2c, wm8804_i2c_id); +#if defined(CONFIG_OF) static const struct of_device_id wm8804_of_match[] = { { .compatible = "wlf,wm8804", }, { } }; MODULE_DEVICE_TABLE(of, wm8804_of_match); +#endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id wm8804_acpi_match[] = { + { "1AEC8804", 0 }, /* Wolfson PCI ID + part ID */ + { "10138804", 0 }, /* Cirrus Logic PCI ID + part ID */ + { }, +}; +MODULE_DEVICE_TABLE(acpi, wm8804_acpi_match); +#endif static struct i2c_driver wm8804_i2c_driver = { .driver = { .name = "wm8804", .pm = &wm8804_pm, - .of_match_table = wm8804_of_match, + .of_match_table = of_match_ptr(wm8804_of_match), + .acpi_match_table = ACPI_PTR(wm8804_acpi_match), }, .probe = wm8804_i2c_probe, .remove = wm8804_i2c_remove, diff --git a/sound/soc/codecs/wm9712.c b/sound/soc/codecs/wm9712.c index 953d94d50586..ade34c26ad2f 100644 --- a/sound/soc/codecs/wm9712.c +++ b/sound/soc/codecs/wm9712.c @@ -719,7 +719,7 @@ static int wm9712_probe(struct platform_device *pdev) static struct platform_driver wm9712_component_driver = { .driver = { - .name = "wm9712-component", + .name = "wm9712-codec", }, .probe = wm9712_probe, diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index d32844f94d74..b6dc524830b2 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -575,6 +575,17 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = { BYT_RT5640_MONO_SPEAKER | BYT_RT5640_MCLK_EN), }, + { /* Linx Linx7 tablet */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LINX"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "LINX7"), + }, + .driver_data = (void *)(BYTCR_INPUT_DEFAULTS | + BYT_RT5640_MONO_SPEAKER | + BYT_RT5640_JD_NOT_INV | + BYT_RT5640_SSP0_AIF1 | + BYT_RT5640_MCLK_EN), + }, { /* MSI S100 tablet */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Micro-Star International Co., Ltd."), @@ -602,6 +613,21 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = { BYT_RT5640_SSP0_AIF1 | BYT_RT5640_MCLK_EN), }, + { /* Onda v975w */ + .matches = { + DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "Aptio CRB"), + /* The above are too generic, also match BIOS info */ + DMI_EXACT_MATCH(DMI_BIOS_VERSION, "5.6.5"), + DMI_EXACT_MATCH(DMI_BIOS_DATE, "07/25/2014"), + }, + .driver_data = (void *)(BYT_RT5640_IN1_MAP | + BYT_RT5640_JD_SRC_JD2_IN4N | + BYT_RT5640_OVCD_TH_2000UA | + BYT_RT5640_OVCD_SF_0P75 | + BYT_RT5640_DIFF_MIC | + BYT_RT5640_MCLK_EN), + }, { /* Pipo W4 */ .matches = { DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"), diff --git a/sound/soc/intel/skylake/skl.c b/sound/soc/intel/skylake/skl.c index dce649485649..1d17be0f78a0 100644 --- a/sound/soc/intel/skylake/skl.c +++ b/sound/soc/intel/skylake/skl.c @@ -834,7 +834,7 @@ static int skl_first_init(struct hdac_bus *bus) return -ENXIO; } - skl_init_chip(bus, true); + snd_hdac_bus_reset_link(bus, true); snd_hdac_bus_parse_capabilities(bus); diff --git a/sound/soc/qcom/qdsp6/q6routing.c b/sound/soc/qcom/qdsp6/q6routing.c index dc94c5c53788..c6b51571be94 100644 --- a/sound/soc/qcom/qdsp6/q6routing.c +++ b/sound/soc/qcom/qdsp6/q6routing.c @@ -960,8 +960,10 @@ static int msm_routing_probe(struct snd_soc_component *c) { int i; - for (i = 0; i < MAX_SESSIONS; i++) + for (i = 0; i < MAX_SESSIONS; i++) { routing_data->sessions[i].port_id = -1; + routing_data->sessions[i].fedai_id = -1; + } return 0; } diff --git a/sound/soc/sh/rcar/adg.c b/sound/soc/sh/rcar/adg.c index 3a3064dda57f..051f96405346 100644 --- a/sound/soc/sh/rcar/adg.c +++ b/sound/soc/sh/rcar/adg.c @@ -462,6 +462,11 @@ static void rsnd_adg_get_clkout(struct rsnd_priv *priv, goto rsnd_adg_get_clkout_end; req_size = prop->length / sizeof(u32); + if (req_size > REQ_SIZE) { + dev_err(dev, + "too many clock-frequency, use top %d\n", REQ_SIZE); + req_size = REQ_SIZE; + } of_property_read_u32_array(np, "clock-frequency", req_rate, req_size); req_48kHz_rate = 0; diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index f8425d8b44d2..d23c2bbff0cf 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -478,7 +478,7 @@ static int rsnd_status_update(u32 *status, (func_call && (mod)->ops->fn) ? #fn : ""); \ if (func_call && (mod)->ops->fn) \ tmp = (mod)->ops->fn(mod, io, param); \ - if (tmp) \ + if (tmp && (tmp != -EPROBE_DEFER)) \ dev_err(dev, "%s[%d] : %s error %d\n", \ rsnd_mod_name(mod), rsnd_mod_id(mod), \ #fn, tmp); \ @@ -958,12 +958,23 @@ static void rsnd_soc_dai_shutdown(struct snd_pcm_substream *substream, rsnd_dai_stream_quit(io); } +static int rsnd_soc_dai_prepare(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct rsnd_priv *priv = rsnd_dai_to_priv(dai); + struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai); + struct rsnd_dai_stream *io = rsnd_rdai_to_io(rdai, substream); + + return rsnd_dai_call(prepare, io, priv); +} + static const struct snd_soc_dai_ops rsnd_soc_dai_ops = { .startup = rsnd_soc_dai_startup, .shutdown = rsnd_soc_dai_shutdown, .trigger = rsnd_soc_dai_trigger, .set_fmt = rsnd_soc_dai_set_fmt, .set_tdm_slot = rsnd_soc_set_dai_tdm_slot, + .prepare = rsnd_soc_dai_prepare, }; void rsnd_parse_connect_common(struct rsnd_dai *rdai, @@ -1550,6 +1561,14 @@ exit_snd_probe: rsnd_dai_call(remove, &rdai->capture, priv); } + /* + * adg is very special mod which can't use rsnd_dai_call(remove), + * and it registers ADG clock on probe. + * It should be unregister if probe failed. + * Mainly it is assuming -EPROBE_DEFER case + */ + rsnd_adg_remove(priv); + return ret; } diff --git a/sound/soc/sh/rcar/dma.c b/sound/soc/sh/rcar/dma.c index fe63ef8600d0..d65ea7bc4dac 100644 --- a/sound/soc/sh/rcar/dma.c +++ b/sound/soc/sh/rcar/dma.c @@ -241,6 +241,10 @@ static int rsnd_dmaen_attach(struct rsnd_dai_stream *io, /* try to get DMAEngine channel */ chan = rsnd_dmaen_request_channel(io, mod_from, mod_to); if (IS_ERR_OR_NULL(chan)) { + /* Let's follow when -EPROBE_DEFER case */ + if (PTR_ERR(chan) == -EPROBE_DEFER) + return PTR_ERR(chan); + /* * DMA failed. try to PIO mode * see diff --git a/sound/soc/sh/rcar/rsnd.h b/sound/soc/sh/rcar/rsnd.h index 96d93330b1e1..8f7a0abfa751 100644 --- a/sound/soc/sh/rcar/rsnd.h +++ b/sound/soc/sh/rcar/rsnd.h @@ -280,6 +280,9 @@ struct rsnd_mod_ops { int (*nolock_stop)(struct rsnd_mod *mod, struct rsnd_dai_stream *io, struct rsnd_priv *priv); + int (*prepare)(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv); }; struct rsnd_dai_stream; @@ -309,6 +312,7 @@ struct rsnd_mod { * H 0: fallback * H 0: hw_params * H 0: pointer + * H 0: prepare */ #define __rsnd_mod_shift_nolock_start 0 #define __rsnd_mod_shift_nolock_stop 0 @@ -323,6 +327,7 @@ struct rsnd_mod { #define __rsnd_mod_shift_fallback 28 /* always called */ #define __rsnd_mod_shift_hw_params 28 /* always called */ #define __rsnd_mod_shift_pointer 28 /* always called */ +#define __rsnd_mod_shift_prepare 28 /* always called */ #define __rsnd_mod_add_probe 0 #define __rsnd_mod_add_remove 0 @@ -337,6 +342,7 @@ struct rsnd_mod { #define __rsnd_mod_add_fallback 0 #define __rsnd_mod_add_hw_params 0 #define __rsnd_mod_add_pointer 0 +#define __rsnd_mod_add_prepare 0 #define __rsnd_mod_call_probe 0 #define __rsnd_mod_call_remove 0 @@ -351,6 +357,7 @@ struct rsnd_mod { #define __rsnd_mod_call_pointer 0 #define __rsnd_mod_call_nolock_start 0 #define __rsnd_mod_call_nolock_stop 1 +#define __rsnd_mod_call_prepare 0 #define rsnd_mod_to_priv(mod) ((mod)->priv) #define rsnd_mod_name(mod) ((mod)->ops->name) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index 8304e4ec9242..3f880ec66459 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -283,7 +283,7 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, if (rsnd_ssi_is_multi_slave(mod, io)) return 0; - if (ssi->usrcnt > 1) { + if (ssi->rate) { if (ssi->rate != rate) { dev_err(dev, "SSI parent/child should use same rate\n"); return -EINVAL; @@ -434,7 +434,6 @@ static int rsnd_ssi_init(struct rsnd_mod *mod, struct rsnd_priv *priv) { struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod); - int ret; if (!rsnd_ssi_is_run_mods(mod, io)) return 0; @@ -443,10 +442,6 @@ static int rsnd_ssi_init(struct rsnd_mod *mod, rsnd_mod_power_on(mod); - ret = rsnd_ssi_master_clk_start(mod, io); - if (ret < 0) - return ret; - rsnd_ssi_config_init(mod, io); rsnd_ssi_register_setup(mod); @@ -852,6 +847,13 @@ static int rsnd_ssi_pio_pointer(struct rsnd_mod *mod, return 0; } +static int rsnd_ssi_prepare(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv) +{ + return rsnd_ssi_master_clk_start(mod, io); +} + static struct rsnd_mod_ops rsnd_ssi_pio_ops = { .name = SSI_NAME, .probe = rsnd_ssi_common_probe, @@ -864,6 +866,7 @@ static struct rsnd_mod_ops rsnd_ssi_pio_ops = { .pointer = rsnd_ssi_pio_pointer, .pcm_new = rsnd_ssi_pcm_new, .hw_params = rsnd_ssi_hw_params, + .prepare = rsnd_ssi_prepare, }; static int rsnd_ssi_dma_probe(struct rsnd_mod *mod, @@ -940,6 +943,7 @@ static struct rsnd_mod_ops rsnd_ssi_dma_ops = { .pcm_new = rsnd_ssi_pcm_new, .fallback = rsnd_ssi_fallback, .hw_params = rsnd_ssi_hw_params, + .prepare = rsnd_ssi_prepare, }; int rsnd_ssi_is_dma_mode(struct rsnd_mod *mod) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 9cfe10d8040c..473eefe8658e 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1447,7 +1447,7 @@ static int soc_link_dai_widgets(struct snd_soc_card *card, sink = codec_dai->playback_widget; source = cpu_dai->capture_widget; if (sink && source) { - ret = snd_soc_dapm_new_pcm(card, dai_link->params, + ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params, dai_link->num_params, source, sink); if (ret != 0) { @@ -1460,7 +1460,7 @@ static int soc_link_dai_widgets(struct snd_soc_card *card, sink = cpu_dai->playback_widget; source = codec_dai->capture_widget; if (sink && source) { - ret = snd_soc_dapm_new_pcm(card, dai_link->params, + ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params, dai_link->num_params, source, sink); if (ret != 0) { diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 7e96793050c9..461d951917c0 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3652,6 +3652,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, { struct snd_soc_dapm_path *source_p, *sink_p; struct snd_soc_dai *source, *sink; + struct snd_soc_pcm_runtime *rtd = w->priv; const struct snd_soc_pcm_stream *config = w->params + w->params_select; struct snd_pcm_substream substream; struct snd_pcm_hw_params *params = NULL; @@ -3711,6 +3712,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w, goto out; } substream.runtime = runtime; + substream.private_data = rtd; switch (event) { case SND_SOC_DAPM_PRE_PMU: @@ -3895,6 +3897,7 @@ outfree_w_param: } int snd_soc_dapm_new_pcm(struct snd_soc_card *card, + struct snd_soc_pcm_runtime *rtd, const struct snd_soc_pcm_stream *params, unsigned int num_params, struct snd_soc_dapm_widget *source, @@ -3963,6 +3966,7 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, w->params = params; w->num_params = num_params; + w->priv = rtd; ret = snd_soc_dapm_add_path(&card->dapm, source, w, NULL, NULL); if (ret) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 439b8a27488d..195ba486640f 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1325,7 +1325,7 @@ class Tui(object): msg = '' while True: self.screen.erase() - self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' % + self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' % DELAY_DEFAULT, curses.A_BOLD) self.screen.addstr(4, 0, msg) self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h index 853b95d1e139..2011376c7ab5 100644 --- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h +++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h @@ -15,7 +15,6 @@ {0x400, "INST_STORAGE"}, \ {0x480, "INST_SEGMENT"}, \ {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ {0x502, "EXTERNAL_HV"}, \ {0x600, "ALIGNMENT"}, \ {0x700, "PROGRAM"}, \ diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile index 72c25a3cb658..d9a725478375 100644 --- a/tools/testing/selftests/android/Makefile +++ b/tools/testing/selftests/android/Makefile @@ -6,7 +6,7 @@ TEST_PROGS := run.sh include ../lib.mk -all: +all: khdr @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ diff --git a/tools/testing/selftests/android/ion/config b/tools/testing/selftests/android/config index b4ad748a9dd9..b4ad748a9dd9 100644 --- a/tools/testing/selftests/android/ion/config +++ b/tools/testing/selftests/android/config diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile index e03695287f76..88cfe88e466f 100644 --- a/tools/testing/selftests/android/ion/Makefile +++ b/tools/testing/selftests/android/ion/Makefile @@ -10,6 +10,8 @@ $(TEST_GEN_FILES): ipcsocket.c ionutils.c TEST_PROGS := ion_test.sh +KSFT_KHDR_INSTALL := 1 +top_srcdir = ../../../../.. include ../../lib.mk $(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 95eb3a53c381..adacda50a4b2 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -1 +1,2 @@ test_memcontrol +test_core diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 1c5d2b2a583b..14c9fe284806 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -89,17 +89,28 @@ int cg_read(const char *cgroup, const char *control, char *buf, size_t len) int cg_read_strcmp(const char *cgroup, const char *control, const char *expected) { - size_t size = strlen(expected) + 1; + size_t size; char *buf; + int ret; + + /* Handle the case of comparing against empty string */ + if (!expected) + size = 32; + else + size = strlen(expected) + 1; buf = malloc(size); if (!buf) return -1; - if (cg_read(cgroup, control, buf, size)) + if (cg_read(cgroup, control, buf, size)) { + free(buf); return -1; + } - return strcmp(expected, buf); + ret = strcmp(expected, buf); + free(buf); + return ret; } int cg_read_strstr(const char *cgroup, const char *control, const char *needle) @@ -337,3 +348,24 @@ int is_swap_enabled(void) return cnt > 1; } + +int set_oom_adj_score(int pid, int score) +{ + char path[PATH_MAX]; + int fd, len; + + sprintf(path, "/proc/%d/oom_score_adj", pid); + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return fd; + + len = dprintf(fd, "%d", score); + if (len < 0) { + close(fd); + return len; + } + + close(fd); + return 0; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 1ff6f9f1abdc..9ac8b7958f83 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -40,3 +40,4 @@ extern int get_temp_fd(void); extern int alloc_pagecache(int fd, size_t size); extern int alloc_anon(const char *cgroup, void *arg); extern int is_swap_enabled(void); +extern int set_oom_adj_score(int pid, int score); diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index cf0bddc9d271..28d321ba311b 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include <linux/limits.h> +#include <linux/oom.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -202,6 +203,36 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) return 0; } +static int alloc_anon_noexit(const char *cgroup, void *arg) +{ + int ppid = getppid(); + + if (alloc_anon(cgroup, arg)) + return -1; + + while (getppid() == ppid) + sleep(1); + + return 0; +} + +/* + * Wait until processes are killed asynchronously by the OOM killer + * If we exceed a timeout, fail. + */ +static int cg_test_proc_killed(const char *cgroup) +{ + int limit; + + for (limit = 10; limit > 0; limit--) { + if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0) + return 0; + + usleep(100000); + } + return -1; +} + /* * First, this test creates the following hierarchy: * A memory.min = 50M, memory.max = 200M @@ -964,6 +995,177 @@ cleanup: return ret; } +/* + * This test disables swapping and tries to allocate anonymous memory + * up to OOM with memory.group.oom set. Then it checks that all + * processes in the leaf (but not the parent) were killed. + */ +static int test_memcg_oom_group_leaf_events(const char *root) +{ + int ret = KSFT_FAIL; + char *parent, *child; + + parent = cg_name(root, "memcg_test_0"); + child = cg_name(root, "memcg_test_0/memcg_test_1"); + + if (!parent || !child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_create(child)) + goto cleanup; + + if (cg_write(parent, "cgroup.subtree_control", "+memory")) + goto cleanup; + + if (cg_write(child, "memory.max", "50M")) + goto cleanup; + + if (cg_write(child, "memory.swap.max", "0")) + goto cleanup; + + if (cg_write(child, "memory.oom.group", "1")) + goto cleanup; + + cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60)); + cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); + cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); + if (!cg_run(child, alloc_anon, (void *)MB(100))) + goto cleanup; + + if (cg_test_proc_killed(child)) + goto cleanup; + + if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) + goto cleanup; + + if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + if (parent) + cg_destroy(parent); + free(child); + free(parent); + + return ret; +} + +/* + * This test disables swapping and tries to allocate anonymous memory + * up to OOM with memory.group.oom set. Then it checks that all + * processes in the parent and leaf were killed. + */ +static int test_memcg_oom_group_parent_events(const char *root) +{ + int ret = KSFT_FAIL; + char *parent, *child; + + parent = cg_name(root, "memcg_test_0"); + child = cg_name(root, "memcg_test_0/memcg_test_1"); + + if (!parent || !child) + goto cleanup; + + if (cg_create(parent)) + goto cleanup; + + if (cg_create(child)) + goto cleanup; + + if (cg_write(parent, "memory.max", "80M")) + goto cleanup; + + if (cg_write(parent, "memory.swap.max", "0")) + goto cleanup; + + if (cg_write(parent, "memory.oom.group", "1")) + goto cleanup; + + cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60)); + cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); + cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1)); + + if (!cg_run(child, alloc_anon, (void *)MB(100))) + goto cleanup; + + if (cg_test_proc_killed(child)) + goto cleanup; + if (cg_test_proc_killed(parent)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (child) + cg_destroy(child); + if (parent) + cg_destroy(parent); + free(child); + free(parent); + + return ret; +} + +/* + * This test disables swapping and tries to allocate anonymous memory + * up to OOM with memory.group.oom set. Then it checks that all + * processes were killed except those set with OOM_SCORE_ADJ_MIN + */ +static int test_memcg_oom_group_score_events(const char *root) +{ + int ret = KSFT_FAIL; + char *memcg; + int safe_pid; + + memcg = cg_name(root, "memcg_test_0"); + + if (!memcg) + goto cleanup; + + if (cg_create(memcg)) + goto cleanup; + + if (cg_write(memcg, "memory.max", "50M")) + goto cleanup; + + if (cg_write(memcg, "memory.swap.max", "0")) + goto cleanup; + + if (cg_write(memcg, "memory.oom.group", "1")) + goto cleanup; + + safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); + if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN)) + goto cleanup; + + cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1)); + if (!cg_run(memcg, alloc_anon, (void *)MB(100))) + goto cleanup; + + if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) + goto cleanup; + + if (kill(safe_pid, SIGKILL)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (memcg) + cg_destroy(memcg); + free(memcg); + + return ret; +} + + #define T(x) { x, #x } struct memcg_test { int (*fn)(const char *root); @@ -978,6 +1180,9 @@ struct memcg_test { T(test_memcg_oom_events), T(test_memcg_swap_max), T(test_memcg_sock), + T(test_memcg_oom_group_leaf_events), + T(test_memcg_oom_group_parent_events), + T(test_memcg_oom_group_score_events), }; #undef T diff --git a/tools/testing/selftests/efivarfs/config b/tools/testing/selftests/efivarfs/config new file mode 100644 index 000000000000..4e151f1005b2 --- /dev/null +++ b/tools/testing/selftests/efivarfs/config @@ -0,0 +1 @@ +CONFIG_EFIVAR_FS=y diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index ff8feca49746..ad1eeb14fda7 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -18,6 +18,7 @@ TEST_GEN_FILES := \ TEST_PROGS := run.sh +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_FILES): $(HEADERS) diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 1bbb47565c55..4665cdbf1a8d 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -21,11 +21,8 @@ endef CFLAGS += -O2 -g -std=gnu99 -Wall -I../../../../usr/include/ LDLIBS += -lmount -I/usr/include/libmount -$(BINARIES): ../../../gpio/gpio-utils.o ../../../../usr/include/linux/gpio.h +$(BINARIES):| khdr +$(BINARIES): ../../../gpio/gpio-utils.o ../../../gpio/gpio-utils.o: make ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C ../../../gpio - -../../../../usr/include/linux/gpio.h: - make -C ../../../.. headers_install INSTALL_HDR_PATH=$(shell pwd)/../../../../usr/ - diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 15e6b75fc3a5..a3edb2c8e43d 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -19,7 +19,6 @@ #define KSFT_FAIL 1 #define KSFT_XFAIL 2 #define KSFT_XPASS 3 -/* Treat skip as pass */ #define KSFT_SKIP 4 /* counters */ diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 4202139d81d9..5c34752e1cff 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ cr4_cpuid_sync_test +platform_info_test set_sregs_test sync_regs_test vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 03b0f551bedf..ec32dad3c3f0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,7 +6,8 @@ UNAME_M := $(shell uname -m) LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c LIBKVM_x86_64 = lib/x86.c lib/vmx.c -TEST_GEN_PROGS_x86_64 = set_sregs_test +TEST_GEN_PROGS_x86_64 = platform_info_test +TEST_GEN_PROGS_x86_64 += set_sregs_test TEST_GEN_PROGS_x86_64 += sync_regs_test TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test @@ -20,7 +21,7 @@ INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. -LDFLAGS += -lpthread +LDFLAGS += -pthread # After inclusion, $(OUTPUT) is defined and # $(TEST_GEN_PROGS) starts with $(OUTPUT)/ @@ -37,9 +38,6 @@ $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) $(AR) crs $@ $^ -$(LINUX_HDR_PATH): - make -C $(top_srcdir) headers_install - -all: $(STATIC_LIBS) $(LINUX_HDR_PATH) +all: $(STATIC_LIBS) $(TEST_GEN_PROGS): $(STATIC_LIBS) -$(TEST_GEN_PROGS) $(LIBKVM_OBJ): | $(LINUX_HDR_PATH) +$(STATIC_LIBS):| khdr diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index bb5a25fb82c6..3acf9a91704c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -50,6 +50,7 @@ enum vm_mem_backing_src_type { }; int kvm_check_cap(long cap); +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); @@ -108,6 +109,9 @@ void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value); const char *exit_reason_str(unsigned int exit_reason); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e9ba389c48db..6fd8c089cafc 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -63,6 +63,29 @@ int kvm_check_cap(long cap) return ret; } +/* VM Enable Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VM. + */ +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + static void vm_open(struct kvm_vm *vm, int perm) { vm->kvm_fd = open(KVM_DEV_PATH, perm); @@ -1220,6 +1243,72 @@ void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, ret, errno); } +/* VCPU Get MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + return buffer.entry.data; +} + +/* VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: On success, nothing. On failure a TEST_ASSERT is produced. + * + * Set value of MSR for VCPU. + */ +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); +} + /* VM VCPU Args Set * * Input Args: diff --git a/tools/testing/selftests/kvm/platform_info_test.c b/tools/testing/selftests/kvm/platform_info_test.c new file mode 100644 index 000000000000..3764e7121265 --- /dev/null +++ b/tools/testing/selftests/kvm/platform_info_test.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies expected behavior of controlling guest access to + * MSR_PLATFORM_INFO. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "x86.h" + +#define VCPU_ID 0 +#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 + +static void guest_code(void) +{ + uint64_t msr_platform_info; + + for (;;) { + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_SYNC(msr_platform_info); + asm volatile ("inc %r11"); + } +} + +static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) +{ + struct kvm_enable_cap cap = {}; + + cap.cap = KVM_CAP_MSR_PLATFORM_INFO; + cap.flags = 0; + cap.args[0] = (int)enable; + vm_enable_cap(vm, &cap); +} + +static void test_msr_platform_info_enabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct guest_args args; + + set_msr_platform_info_enabled(vm, true); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + guest_args_read(vm, VCPU_ID, &args); + TEST_ASSERT(args.port == GUEST_PORT_SYNC, + "Received IO from port other than PORT_HOST_SYNC: %u\n", + run->io.port); + TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == + MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); +} + +static void test_msr_platform_info_disabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + + set_msr_platform_info_enabled(vm, false); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, + "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *state; + int rv; + uint64_t msr_platform_info; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); + if (!rv) { + fprintf(stderr, + "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + test_msr_platform_info_disabled(vm); + test_msr_platform_info_enabled(vm); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 17ab36605a8e..0a8e75886224 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -16,8 +16,20 @@ TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) +top_srcdir ?= ../../../.. +include $(top_srcdir)/scripts/subarch.include +ARCH ?= $(SUBARCH) + all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) +.PHONY: khdr +khdr: + make ARCH=$(ARCH) -C $(top_srcdir) headers_install + +ifdef KSFT_KHDR_INSTALL +$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES):| khdr +endif + .ONESHELL: define RUN_TEST_PRINT_RESULT TEST_HDR_MSG="selftests: "`basename $$PWD`:" $$BASENAME_TEST"; \ diff --git a/tools/testing/selftests/memory-hotplug/config b/tools/testing/selftests/memory-hotplug/config index 2fde30191a47..a7e8cd5bb265 100644 --- a/tools/testing/selftests/memory-hotplug/config +++ b/tools/testing/selftests/memory-hotplug/config @@ -2,3 +2,4 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_NOTIFIER_ERROR_INJECTION=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m +CONFIG_MEMORY_HOTREMOVE=y diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 9cca68e440a0..919aa2ac00af 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -15,6 +15,7 @@ TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +KSFT_KHDR_INSTALL := 1 include ../lib.mk $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index b3ebf2646e52..8fdfeafaf8c0 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -502,6 +502,55 @@ TEST_F(tls, recv_peek_multiple) EXPECT_EQ(memcmp(test_str, buf, send_len), 0); } +TEST_F(tls, recv_peek_multiple_records) +{ + char const *test_str = "test_read_peek_mult_recs"; + char const *test_str_first = "test_read_peek"; + char const *test_str_second = "_mult_recs"; + int len; + char buf[64]; + + len = strlen(test_str_first); + EXPECT_EQ(send(self->fd, test_str_first, len, 0), len); + + len = strlen(test_str_second) + 1; + EXPECT_EQ(send(self->fd, test_str_second, len, 0), len); + + len = sizeof(buf); + memset(buf, 0, len); + EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1); + + /* MSG_PEEK can only peek into the current record. */ + len = strlen(test_str_first) + 1; + EXPECT_EQ(memcmp(test_str_first, buf, len), 0); + + len = sizeof(buf); + memset(buf, 0, len); + EXPECT_NE(recv(self->cfd, buf, len, 0), -1); + + /* Non-MSG_PEEK will advance strparser (and therefore record) + * however. + */ + len = strlen(test_str) + 1; + EXPECT_EQ(memcmp(test_str, buf, len), 0); + + /* MSG_MORE will hold current record open, so later MSG_PEEK + * will see everything. + */ + len = strlen(test_str_first); + EXPECT_EQ(send(self->fd, test_str_first, len, MSG_MORE), len); + + len = strlen(test_str_second) + 1; + EXPECT_EQ(send(self->fd, test_str_second, len, 0), len); + + len = sizeof(buf); + memset(buf, 0, len); + EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1); + + len = strlen(test_str) + 1; + EXPECT_EQ(memcmp(test_str, buf, len), 0); +} + TEST_F(tls, pollin) { char const *test_str = "test_poll"; diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile index a728040edbe1..14cfcf006936 100644 --- a/tools/testing/selftests/networking/timestamping/Makefile +++ b/tools/testing/selftests/networking/timestamping/Makefile @@ -5,6 +5,7 @@ TEST_PROGS := hwtstamp_config rxtimestamp timestamping txtimestamp all: $(TEST_PROGS) +top_srcdir = ../../../../.. include ../../lib.mk clean: diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 9881876d2aa0..e94b7b14bcb2 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -26,10 +26,6 @@ TEST_PROGS := run_vmtests include ../lib.mk -$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h $(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap - -../../../../usr/include/linux/kernel.h: - make -C ../../../.. headers_install |