From 5e25f5db6abb96ca8ee2aaedcb863daa6dfcc07a Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Wed, 1 Nov 2017 09:46:33 +0800
Subject: xen/time: do not decrease steal time after live migration on xen

After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
xen_steal_lock() might be less than this_rq()->prev_steal_time which is
derived from previous return value of xen_steal_clock().

For instance, steal time of each vcpu is 335 before live migration.

cpu  198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0

After live migration, steal time is reduced to 312.

cpu  200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0

Since runstate times are cumulative and cleared during xen live migration
by xen hypervisor, the idea of this patch is to accumulate runstate times
to global percpu variables before live migration suspend. Once guest VM is
resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
runstate times and previously accumulated times stored in global percpu
variables.

Comment above HYPERVISOR_suspend() has been removed as it is inaccurate:
the call can return an error code (e.g., possibly -EPERM in the future).

Similar and more severe issue would impact prior linux 4.8-4.10 as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
which would overflow steal time and lead to 100% st usage in top command
for linux 4.8-4.10. A backport of this patch would fix that issue.

[boris: added linux/slab.h to driver/xen/time.c, slightly reformatted
        commit message]

References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/xen-ops.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/xen')

diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 218e6aae5433..09072271f122 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb);
 bool xen_vcpu_stolen(int vcpu);
 void xen_setup_runstate_info(int cpu);
 void xen_time_setup_guest(void);
+void xen_manage_runstate_time(int action);
 void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
 u64 xen_steal_clock(int cpu);
 
-- 
cgit v1.2.1


From ec4001c3f29ebb3d4147aaec7be9c687ddadb7c8 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Fri, 3 Nov 2017 17:04:11 +0000
Subject: xen: support priv-mapping in an HVM tools domain

If the domain has XENFEAT_auto_translated_physmap then use of the PV-
specific HYPERVISOR_mmu_update hypercall is clearly incorrect.

This patch adds checks in xen_remap_domain_gfn_array() and
xen_unmap_domain_gfn_array() which call through to the approprate
xlate_mmu function if the feature is present. A check is also added
to xen_remap_domain_gfn_range() to fail with -EOPNOTSUPP since this
should not be used in an HVM tools domain.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/xen-ops.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/xen')

diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 09072271f122..c278b21cad39 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -104,6 +104,8 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
 			       struct page **pages);
 int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages);
+
+#ifdef CONFIG_XEN_AUTO_XLATE
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      unsigned long addr,
 			      xen_pfn_t *gfn, int nr,
@@ -112,6 +114,28 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      struct page **pages);
 int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
 			      int nr, struct page **pages);
+#else
+/*
+ * These two functions are called from arch/x86/xen/mmu.c and so stubs
+ * are needed for a configuration not specifying CONFIG_XEN_AUTO_XLATE.
+ */
+static inline int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
+					    unsigned long addr,
+					    xen_pfn_t *gfn, int nr,
+					    int *err_ptr, pgprot_t prot,
+					    unsigned int domid,
+					    struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+					    int nr, struct page **pages)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr,
 				  unsigned long nr_grant_frames);
 
-- 
cgit v1.2.1


From b988b8ff072ab04abd62d10d3fe44ec544be8a7d Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Nov 2017 10:19:17 +0100
Subject: xen: re-introduce support for grant v2 interface

The grant v2 support was removed from the kernel with
commit 438b33c7145ca8a5131a30c36d8f59bce119a19a ("xen/grant-table:
remove support for V2 tables") as the higher memory footprint of v2
grants resulted in less grants being possible for a kernel compared
to the v1 grant interface.

As machines with more than 16TB of memory are expected to be more
common in the near future support of grant v2 is mandatory in order
to be able to run a Xen pv domain at any memory location.

So re-add grant v2 support basically by reverting above commit.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/grant_table.h | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/xen')

diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 34b1379f9777..dd6c7a32ee32 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -84,6 +84,24 @@ int gnttab_resume(void);
 
 int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 				int readonly);
+int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
+					int flags, unsigned page_off,
+					unsigned length);
+int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
+				      domid_t trans_domid,
+				      grant_ref_t trans_gref);
+
+/*
+ * Are sub-page grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_subpage_grants_available(void);
+
+/*
+ * Are transitive grants available on this version of Xen?  Returns true if they
+ * are, and false if they're not.
+ */
+bool gnttab_trans_grants_available(void);
 
 /*
  * End access through the given grant reference, iff the grant entry is no
@@ -130,6 +148,13 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
 
 void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 				     unsigned long frame, int readonly);
+int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
+					    unsigned long frame, int flags,
+					    unsigned page_off,
+					    unsigned length);
+int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
+					  int flags, domid_t trans_domid,
+					  grant_ref_t trans_gref);
 
 /* Give access to the first 4K of the page */
 static inline void gnttab_page_grant_foreign_access_ref_one(
@@ -174,10 +199,13 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
 	unmap->dev_bus_addr = 0;
 }
 
-int arch_gnttab_init(unsigned long nr_shared);
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status);
 int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
 			   void **__shared);
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared);
 void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
 
 struct grant_frames {
-- 
cgit v1.2.1


From 56c9c700c4399858e50971d98ac44b5842b06a87 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Nov 2017 10:19:18 +0100
Subject: xen: limit grant v2 interface to the v1 functionality

As there is currently no user for sub-page grants or transient grants
remove that functionality. This at once makes it possible to switch
from grant v2 to grant v1 without restrictions, as there is no loss of
functionality other than the limited frame number width related to
the switch.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/grant_table.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include/xen')

diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index dd6c7a32ee32..2e37741f6b8d 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -84,24 +84,6 @@ int gnttab_resume(void);
 
 int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 				int readonly);
-int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame,
-					int flags, unsigned page_off,
-					unsigned length);
-int gnttab_grant_foreign_access_trans(domid_t domid, int flags,
-				      domid_t trans_domid,
-				      grant_ref_t trans_gref);
-
-/*
- * Are sub-page grants available on this version of Xen?  Returns true if they
- * are, and false if they're not.
- */
-bool gnttab_subpage_grants_available(void);
-
-/*
- * Are transitive grants available on this version of Xen?  Returns true if they
- * are, and false if they're not.
- */
-bool gnttab_trans_grants_available(void);
 
 /*
  * End access through the given grant reference, iff the grant entry is no
@@ -148,13 +130,6 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
 
 void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 				     unsigned long frame, int readonly);
-int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid,
-					    unsigned long frame, int flags,
-					    unsigned page_off,
-					    unsigned length);
-int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid,
-					  int flags, domid_t trans_domid,
-					  grant_ref_t trans_gref);
 
 /* Give access to the first 4K of the page */
 static inline void gnttab_page_grant_foreign_access_ref_one(
-- 
cgit v1.2.1


From 2229f70b5bbb025e1394b61007938a68060afbfb Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Wed, 8 Nov 2017 17:19:57 +0000
Subject: x86/xen/time: setup vcpu 0 time info page

In order to support pvclock vdso on xen we need to setup the time
info page for vcpu 0 and register the page with Xen using the
VCPUOP_register_vcpu_time_memory_area hypercall. This hypercall
will also forcefully update the pvti which will set some of the
necessary flags for vdso. Afterwards we check if it supports the
PVCLOCK_TSC_STABLE_BIT flag which is mandatory for having
vdso/vsyscall support. And if so, it will set the cpu 0 pvti that
will be later on used when mapping the vdso image.

The xen headers are also updated to include the new hypercall for
registering the secondary vcpu_time_info struct.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 include/xen/interface/vcpu.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include/xen')

diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index 98188c87f5c1..504c71601511 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -178,4 +178,46 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
 
 /* Send an NMI to the specified VCPU. @extra_arg == NULL. */
 #define VCPUOP_send_nmi             11
+
+/*
+ * Get the physical ID information for a pinned vcpu's underlying physical
+ * processor.  The physical ID informmation is architecture-specific.
+ * On x86: id[31:0]=apic_id, id[63:32]=acpi_id.
+ * This command returns -EINVAL if it is not a valid operation for this VCPU.
+ */
+#define VCPUOP_get_physid           12 /* arg == vcpu_get_physid_t */
+struct vcpu_get_physid {
+	uint64_t phys_id;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_get_physid);
+#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid))
+#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32))
+
+/*
+ * Register a memory location to get a secondary copy of the vcpu time
+ * parameters.  The master copy still exists as part of the vcpu shared
+ * memory area, and this secondary copy is updated whenever the master copy
+ * is updated (and using the same versioning scheme for synchronisation).
+ *
+ * The intent is that this copy may be mapped (RO) into userspace so
+ * that usermode can compute system time using the time info and the
+ * tsc.  Usermode will see an array of vcpu_time_info structures, one
+ * for each vcpu, and choose the right one by an existing mechanism
+ * which allows it to get the current vcpu number (such as via a
+ * segment limit).  It can then apply the normal algorithm to compute
+ * system time from the tsc.
+ *
+ * @extra_arg == pointer to vcpu_register_time_info_memory_area structure.
+ */
+#define VCPUOP_register_vcpu_time_memory_area   13
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_time_info);
+struct vcpu_register_time_memory_area {
+	union {
+		GUEST_HANDLE(vcpu_time_info) h;
+		struct pvclock_vcpu_time_info *v;
+		uint64_t p;
+	} addr;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_time_memory_area);
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
-- 
cgit v1.2.1