From 5e25f5db6abb96ca8ee2aaedcb863daa6dfcc07a Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 1 Nov 2017 09:46:33 +0800 Subject: xen/time: do not decrease steal time after live migration on xen After guest live migration on xen, steal time in /proc/stat (cpustat[CPUTIME_STEAL]) might decrease because steal returned by xen_steal_lock() might be less than this_rq()->prev_steal_time which is derived from previous return value of xen_steal_clock(). For instance, steal time of each vcpu is 335 before live migration. cpu 198 0 368 200064 1962 0 0 1340 0 0 cpu0 38 0 81 50063 492 0 0 335 0 0 cpu1 65 0 97 49763 634 0 0 335 0 0 cpu2 38 0 81 50098 462 0 0 335 0 0 cpu3 56 0 107 50138 374 0 0 335 0 0 After live migration, steal time is reduced to 312. cpu 200 0 370 200330 1971 0 0 1248 0 0 cpu0 38 0 82 50123 500 0 0 312 0 0 cpu1 65 0 97 49832 634 0 0 312 0 0 cpu2 39 0 82 50167 462 0 0 312 0 0 cpu3 56 0 107 50207 374 0 0 312 0 0 Since runstate times are cumulative and cleared during xen live migration by xen hypervisor, the idea of this patch is to accumulate runstate times to global percpu variables before live migration suspend. Once guest VM is resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new runstate times and previously accumulated times stored in global percpu variables. Comment above HYPERVISOR_suspend() has been removed as it is inaccurate: the call can return an error code (e.g., possibly -EPERM in the future). Similar and more severe issue would impact prior linux 4.8-4.10 as discussed by Michael Las at https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest, which would overflow steal time and lead to 100% st usage in top command for linux 4.8-4.10. A backport of this patch would fix that issue. [boris: added linux/slab.h to driver/xen/time.c, slightly reformatted commit message] References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest Signed-off-by: Dongli Zhang Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/xen-ops.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/xen') diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 218e6aae5433..09072271f122 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb); bool xen_vcpu_stolen(int vcpu); void xen_setup_runstate_info(int cpu); void xen_time_setup_guest(void); +void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); -- cgit v1.2.1 From ec4001c3f29ebb3d4147aaec7be9c687ddadb7c8 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Fri, 3 Nov 2017 17:04:11 +0000 Subject: xen: support priv-mapping in an HVM tools domain If the domain has XENFEAT_auto_translated_physmap then use of the PV- specific HYPERVISOR_mmu_update hypercall is clearly incorrect. This patch adds checks in xen_remap_domain_gfn_array() and xen_unmap_domain_gfn_array() which call through to the approprate xlate_mmu function if the feature is present. A check is also added to xen_remap_domain_gfn_range() to fail with -EOPNOTSUPP since this should not be used in an HVM tools domain. Signed-off-by: Paul Durrant Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/xen-ops.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/xen') diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 09072271f122..c278b21cad39 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -104,6 +104,8 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma, struct page **pages); int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages); + +#ifdef CONFIG_XEN_AUTO_XLATE int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *gfn, int nr, @@ -112,6 +114,28 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, struct page **pages); int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages); +#else +/* + * These two functions are called from arch/x86/xen/mmu.c and so stubs + * are needed for a configuration not specifying CONFIG_XEN_AUTO_XLATE. + */ +static inline int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t *gfn, int nr, + int *err_ptr, pgprot_t prot, + unsigned int domid, + struct page **pages) +{ + return -EOPNOTSUPP; +} + +static inline int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, + int nr, struct page **pages) +{ + return -EOPNOTSUPP; +} +#endif + int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr, unsigned long nr_grant_frames); -- cgit v1.2.1 From b988b8ff072ab04abd62d10d3fe44ec544be8a7d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:17 +0100 Subject: xen: re-introduce support for grant v2 interface The grant v2 support was removed from the kernel with commit 438b33c7145ca8a5131a30c36d8f59bce119a19a ("xen/grant-table: remove support for V2 tables") as the higher memory footprint of v2 grants resulted in less grants being possible for a kernel compared to the v1 grant interface. As machines with more than 16TB of memory are expected to be more common in the near future support of grant v2 is mandatory in order to be able to run a Xen pv domain at any memory location. So re-add grant v2 support basically by reverting above commit. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/grant_table.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/xen') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 34b1379f9777..dd6c7a32ee32 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -84,6 +84,24 @@ int gnttab_resume(void); int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); +int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, + int flags, unsigned page_off, + unsigned length); +int gnttab_grant_foreign_access_trans(domid_t domid, int flags, + domid_t trans_domid, + grant_ref_t trans_gref); + +/* + * Are sub-page grants available on this version of Xen? Returns true if they + * are, and false if they're not. + */ +bool gnttab_subpage_grants_available(void); + +/* + * Are transitive grants available on this version of Xen? Returns true if they + * are, and false if they're not. + */ +bool gnttab_trans_grants_available(void); /* * End access through the given grant reference, iff the grant entry is no @@ -130,6 +148,13 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); +int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, + unsigned length); +int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref); /* Give access to the first 4K of the page */ static inline void gnttab_page_grant_foreign_access_ref_one( @@ -174,10 +199,13 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, unmap->dev_bus_addr = 0; } -int arch_gnttab_init(unsigned long nr_shared); +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status); int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, void **__shared); +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared); void arch_gnttab_unmap(void *shared, unsigned long nr_gframes); struct grant_frames { -- cgit v1.2.1 From 56c9c700c4399858e50971d98ac44b5842b06a87 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:18 +0100 Subject: xen: limit grant v2 interface to the v1 functionality As there is currently no user for sub-page grants or transient grants remove that functionality. This at once makes it possible to switch from grant v2 to grant v1 without restrictions, as there is no loss of functionality other than the limited frame number width related to the switch. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/grant_table.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/xen') diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index dd6c7a32ee32..2e37741f6b8d 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -84,24 +84,6 @@ int gnttab_resume(void); int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); -int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, - int flags, unsigned page_off, - unsigned length); -int gnttab_grant_foreign_access_trans(domid_t domid, int flags, - domid_t trans_domid, - grant_ref_t trans_gref); - -/* - * Are sub-page grants available on this version of Xen? Returns true if they - * are, and false if they're not. - */ -bool gnttab_subpage_grants_available(void); - -/* - * Are transitive grants available on this version of Xen? Returns true if they - * are, and false if they're not. - */ -bool gnttab_trans_grants_available(void); /* * End access through the given grant reference, iff the grant entry is no @@ -148,13 +130,6 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); -int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length); -int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref); /* Give access to the first 4K of the page */ static inline void gnttab_page_grant_foreign_access_ref_one( -- cgit v1.2.1 From 2229f70b5bbb025e1394b61007938a68060afbfb Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:57 +0000 Subject: x86/xen/time: setup vcpu 0 time info page In order to support pvclock vdso on xen we need to setup the time info page for vcpu 0 and register the page with Xen using the VCPUOP_register_vcpu_time_memory_area hypercall. This hypercall will also forcefully update the pvti which will set some of the necessary flags for vdso. Afterwards we check if it supports the PVCLOCK_TSC_STABLE_BIT flag which is mandatory for having vdso/vsyscall support. And if so, it will set the cpu 0 pvti that will be later on used when mapping the vdso image. The xen headers are also updated to include the new hypercall for registering the secondary vcpu_time_info struct. Signed-off-by: Joao Martins Reviewed-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/interface/vcpu.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/xen') diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h index 98188c87f5c1..504c71601511 100644 --- a/include/xen/interface/vcpu.h +++ b/include/xen/interface/vcpu.h @@ -178,4 +178,46 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); /* Send an NMI to the specified VCPU. @extra_arg == NULL. */ #define VCPUOP_send_nmi 11 + +/* + * Get the physical ID information for a pinned vcpu's underlying physical + * processor. The physical ID informmation is architecture-specific. + * On x86: id[31:0]=apic_id, id[63:32]=acpi_id. + * This command returns -EINVAL if it is not a valid operation for this VCPU. + */ +#define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */ +struct vcpu_get_physid { + uint64_t phys_id; +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_get_physid); +#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid)) +#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32)) + +/* + * Register a memory location to get a secondary copy of the vcpu time + * parameters. The master copy still exists as part of the vcpu shared + * memory area, and this secondary copy is updated whenever the master copy + * is updated (and using the same versioning scheme for synchronisation). + * + * The intent is that this copy may be mapped (RO) into userspace so + * that usermode can compute system time using the time info and the + * tsc. Usermode will see an array of vcpu_time_info structures, one + * for each vcpu, and choose the right one by an existing mechanism + * which allows it to get the current vcpu number (such as via a + * segment limit). It can then apply the normal algorithm to compute + * system time from the tsc. + * + * @extra_arg == pointer to vcpu_register_time_info_memory_area structure. + */ +#define VCPUOP_register_vcpu_time_memory_area 13 +DEFINE_GUEST_HANDLE_STRUCT(vcpu_time_info); +struct vcpu_register_time_memory_area { + union { + GUEST_HANDLE(vcpu_time_info) h; + struct pvclock_vcpu_time_info *v; + uint64_t p; + } addr; +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_time_memory_area); + #endif /* __XEN_PUBLIC_VCPU_H__ */ -- cgit v1.2.1