author	Thomas Schwinge <thomas@codesourcery.com>	2023-03-30 10:08:12 +0200
committer	Thomas Schwinge <thomas@codesourcery.com>	2023-04-03 16:43:02 +0200
commit	43095690ea519205bf56fc148b346edaa43e0f0f (patch)
tree	ca117c97ba0831022c9f452fbab3d7ac041e7cf8
parent	c58b28cb650995a41e1ab0166169799f3991bdd6 (diff)
'-foffload-memory=pinned' using offloading device interfaces

Implemented for nvptx offloading via 'cuMemHostAlloc', 'cuMemHostRegister'.

gcc/
	* doc/invoke.texi (-foffload-memory=pinned): Document.

include/
	* cuda/cuda.h (CUresult): Add
	'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'.
	(CUdevice_attribute): Add
	'CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED'.
	(CU_MEMHOSTREGISTER_READ_ONLY): Add.
	(cuMemHostGetFlags, cuMemHostRegister, cuMemHostUnregister): Add.

libgomp/
	* libgomp-plugin.h (GOMP_OFFLOAD_page_locked_host_free): Add
	'struct goacc_asyncqueue *' formal parameter.
	(GOMP_OFFLOAD_page_locked_host_register)
	(GOMP_OFFLOAD_page_locked_host_unregister)
	(GOMP_OFFLOAD_page_locked_host_p): Add.
	* libgomp.h (always_pinned_mode)
	(gomp_page_locked_host_register_dev)
	(gomp_page_locked_host_unregister_dev): Add.
	(struct splay_tree_key_s): Add 'page_locked_host_p'.
	(struct gomp_device_descr): Add
	'GOMP_OFFLOAD_page_locked_host_register',
	'GOMP_OFFLOAD_page_locked_host_unregister',
	'GOMP_OFFLOAD_page_locked_host_p'.
	* libgomp.texi (-foffload-memory=pinned): Document.
	* plugin/cuda-lib.def (cuMemHostGetFlags, cuMemHostRegister_v2)
	(cuMemHostRegister, cuMemHostUnregister): Add.
	* plugin/plugin-nvptx.c (struct ptx_device): Add
	'read_only_host_register_supported'.
	(nvptx_open_device): Initialize it.
	(free_host_blocks, free_host_blocks_lock)
	(nvptx_run_deferred_page_locked_host_free)
	(nvptx_page_locked_host_free_callback, nvptx_page_locked_host_p)
	(GOMP_OFFLOAD_page_locked_host_register)
	(nvptx_page_locked_host_unregister_callback)
	(GOMP_OFFLOAD_page_locked_host_unregister)
	(GOMP_OFFLOAD_page_locked_host_p)
	(nvptx_run_deferred_page_locked_host_unregister)
	(nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback):
	Add.
	(GOMP_OFFLOAD_fini_device, GOMP_OFFLOAD_page_locked_host_alloc)
	(GOMP_OFFLOAD_run): Call
	'nvptx_run_deferred_page_locked_host_free'.
	(struct goacc_asyncqueue): Add
	'page_locked_host_unregister_blocks_lock',
	'page_locked_host_unregister_blocks'.
	(nvptx_goacc_asyncqueue_construct)
	(nvptx_goacc_asyncqueue_destruct): Handle those.
	(GOMP_OFFLOAD_page_locked_host_free): Handle
	'struct goacc_asyncqueue *' formal parameter.
	(GOMP_OFFLOAD_openacc_async_test)
	(nvptx_goacc_asyncqueue_synchronize): Call
	'nvptx_run_deferred_page_locked_host_unregister'.
	(GOMP_OFFLOAD_openacc_async_serialize): Call
	'nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback'.
	* config/linux/allocator.c (linux_memspace_alloc)
	(linux_memspace_calloc, linux_memspace_free)
	(linux_memspace_realloc): Remove 'always_pinned_mode' handling.
	(GOMP_enable_pinned_mode): Move...
	* target.c: ... here.
	(always_pinned_mode, verify_always_pinned_mode)
	(gomp_verify_always_pinned_mode, gomp_page_locked_host_alloc_dev)
	(gomp_page_locked_host_free_dev)
	(gomp_page_locked_host_aligned_alloc_dev)
	(gomp_page_locked_host_aligned_free_dev)
	(gomp_page_locked_host_register_dev)
	(gomp_page_locked_host_unregister_dev): Add.
	(gomp_copy_host2dev, gomp_map_vars_internal)
	(gomp_remove_var_internal, gomp_unmap_vars_internal)
	(get_gomp_offload_icvs, gomp_load_image_to_device)
	(gomp_target_rev, omp_target_memcpy_copy)
	(omp_target_memcpy_rect_worker): Handle 'always_pinned_mode'.
	(gomp_copy_host2dev, gomp_copy_dev2host): Handle
	'verify_always_pinned_mode'.
	(GOMP_target_ext): Add 'assert'.
	(gomp_page_locked_host_alloc): Use
	'gomp_page_locked_host_alloc_dev'.
	(gomp_page_locked_host_free): Use
	'gomp_page_locked_host_free_dev'.
	(omp_target_associate_ptr): Adjust.
	(gomp_load_plugin_for_device): Handle 'page_locked_host_register',
	'page_locked_host_unregister', 'page_locked_host_p'.
	* oacc-mem.c (memcpy_tofrom_device): Handle 'always_pinned_mode'.
	* libgomp_g.h (GOMP_enable_pinned_mode): Adjust.
	* testsuite/libgomp.c/alloc-pinned-7.c: Remove.
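
For illustration, a minimal standalone sketch (not part of this commit) of the registration sequence that '-foffload-memory=pinned' automates, using the CUDA Driver API entry points named above; the 'check' helper is hypothetical, and real code must inspect every CUresult:

#include <cuda.h>
#include <stdlib.h>

/* Hypothetical helper; real code reports the CUresult properly.  */
static void
check (CUresult r)
{
  if (r != CUDA_SUCCESS)
    abort ();
}

int
main (void)
{
  CUdevice dev;
  CUcontext ctx;
  check (cuInit (0));
  check (cuDeviceGet (&dev, 0));
  check (cuCtxCreate (&ctx, 0, dev));

  /* Ordinary, pageable host memory...  */
  size_t size = 1 << 20;
  void *h = malloc (size);

  /* ... is registered as page-locked, as 'always_pinned_mode' arranges
     before a host -> device transfer...  */
  check (cuMemHostRegister (h, size, 0));

  CUdeviceptr d;
  check (cuMemAlloc (&d, size));
  check (cuMemcpyHtoD (d, h, size));

  /* ... and unregistered again once no transfer depends on it.  */
  check (cuMemHostUnregister (h));

  check (cuMemFree (d));
  free (h);
  return 0;
}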
-rw-r--r--	gcc/ChangeLog.omp	4
-rw-r--r--	gcc/doc/invoke.texi	19
-rw-r--r--	include/ChangeLog.omp	9
-rw-r--r--	include/cuda/cuda.h	11
-rw-r--r--	libgomp/ChangeLog.omp	75
-rw-r--r--	libgomp/config/linux/allocator.c	26
-rw-r--r--	libgomp/libgomp-plugin.h	7
-rw-r--r--	libgomp/libgomp.h	15
-rw-r--r--	libgomp/libgomp.texi	35
-rw-r--r--	libgomp/libgomp_g.h	2
-rw-r--r--	libgomp/oacc-mem.c	16
-rw-r--r--	libgomp/plugin/cuda-lib.def	4
-rw-r--r--	libgomp/plugin/plugin-nvptx.c	435
-rw-r--r--	libgomp/target.c	771
-rw-r--r--	libgomp/testsuite/libgomp.c/alloc-pinned-7.c	63
15 files changed, 1339 insertions(+), 153 deletions(-)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 5e76158db06..d8aa0ab51bf 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,7 @@
+2023-04-03 Thomas Schwinge <thomas@codesourcery.com>
+
+ * doc/invoke.texi (-foffload-memory=pinned): Document.
+
2023-03-31 Frederik Harwath <frederik@codesourcery.com>
* omp-transform-loops.cc (walk_omp_for_loops): Handle
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 1fe047042ae..070b63030f8 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -2711,13 +2711,28 @@ Typical command lines are
@itemx -foffload-memory=unified
@itemx -foffload-memory=pinned
@opindex foffload-memory
+@cindex Offloading memory modes
@cindex OpenMP offloading memory modes
+
Enable a memory optimization mode to use with OpenMP. The default behavior,
@option{-foffload-memory=none}, is to do nothing special (unless enabled via
a requires directive in the code). @option{-foffload-memory=unified} is
equivalent to @code{#pragma omp requires unified_shared_memory}.
-@option{-foffload-memory=pinned} forces all host memory to be pinned (this
-mode may require the user to increase the ulimit setting for locked memory).
+
+@c The following paragraph is duplicated in
+@c '../../libgomp/libgomp.texi', '-foffload-memory=pinned'.
+If supported by the active offloading device,
+@option{-foffload-memory=pinned} enables automatic use of page-locked
+host memory for memory objects participating in host <-> device memory
+transfers, for both OpenACC and OpenMP offloading.
+Such memory is allocated or registered using the respective offloading
+device interfaces, which potentially helps optimization of host <->
+device data transfers.
+This option is experimental.
+Beware that heavy use of pinned memory may degrade overall system
+performance, as it reduces the amount of host memory available for
+paging.
+
All translation units must select the same setting to avoid undefined
behavior.
diff --git a/include/ChangeLog.omp b/include/ChangeLog.omp
index 244d67e6608..655377a6d0d 100644
--- a/include/ChangeLog.omp
+++ b/include/ChangeLog.omp
@@ -1,3 +1,12 @@
+2023-04-03 Thomas Schwinge <thomas@codesourcery.com>
+
+ * cuda/cuda.h (CUresult): Add
+ 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'.
+ (CUdevice_attribute): Add
+ 'CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED'.
+ (CU_MEMHOSTREGISTER_READ_ONLY): Add.
+ (cuMemHostGetFlags, cuMemHostRegister, cuMemHostUnregister): Add.
+
2023-02-20 Thomas Schwinge <thomas@codesourcery.com>
* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): Remove.
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 062d394b95f..f8f464607db 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -57,6 +57,7 @@ typedef enum {
CUDA_ERROR_INVALID_CONTEXT = 201,
CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600,
+ CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
CUDA_ERROR_LAUNCH_FAILED = 719,
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
CUDA_ERROR_NOT_PERMITTED = 800,
@@ -80,7 +81,8 @@ typedef enum {
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
- CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+ CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113
} CUdevice_attribute;
typedef enum {
@@ -124,8 +126,11 @@ enum {
#define CU_LAUNCH_PARAM_END ((void *) 0)
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
+
#define CU_MEMHOSTALLOC_DEVICEMAP 0x02U
+#define CU_MEMHOSTREGISTER_READ_ONLY 0x08
+
enum {
CU_STREAM_DEFAULT = 0,
CU_STREAM_NON_BLOCKING = 1
@@ -183,6 +188,10 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+CUresult cuMemHostGetFlags (unsigned int *, void *);
+#define cuMemHostRegister cuMemHostRegister_v2
+CUresult cuMemHostRegister(void *, size_t, unsigned int);
+CUresult cuMemHostUnregister(void *);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
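
For illustration, a hedged sketch (not part of this patch) of probing page-locked status with the newly declared 'cuMemHostGetFlags'; the plugin's 'nvptx_page_locked_host_p' below relies on the same CUDA_SUCCESS / CUDA_ERROR_INVALID_VALUE distinction:

/* Return 1 if PTR's base address is registered page-locked, 0 if not,
   -1 on an unexpected error.  Hypothetical helper, not plugin code.  */
static int
page_locked_p (void *ptr)
{
  unsigned int flags;
  CUresult r = cuMemHostGetFlags (&flags, ptr);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_INVALID_VALUE)
    return 0;
  return -1;
}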
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 7afb5f43c04..1b02c057562 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,5 +1,80 @@
2023-04-03 Thomas Schwinge <thomas@codesourcery.com>
+ * libgomp-plugin.h (GOMP_OFFLOAD_page_locked_host_free): Add
+ 'struct goacc_asyncqueue *' formal parameter.
+ (GOMP_OFFLOAD_page_locked_host_register)
+ (GOMP_OFFLOAD_page_locked_host_unregister)
+ (GOMP_OFFLOAD_page_locked_host_p): Add.
+ * libgomp.h (always_pinned_mode)
+ (gomp_page_locked_host_register_dev)
+ (gomp_page_locked_host_unregister_dev): Add.
+ (struct splay_tree_key_s): Add 'page_locked_host_p'.
+ (struct gomp_device_descr): Add
+ 'GOMP_OFFLOAD_page_locked_host_register',
+ 'GOMP_OFFLOAD_page_locked_host_unregister',
+ 'GOMP_OFFLOAD_page_locked_host_p'.
+ * libgomp.texi (-foffload-memory=pinned): Document.
+ * plugin/cuda-lib.def (cuMemHostGetFlags, cuMemHostRegister_v2)
+ (cuMemHostRegister, cuMemHostUnregister): Add.
+ * plugin/plugin-nvptx.c (struct ptx_device): Add
+ 'read_only_host_register_supported'.
+ (nvptx_open_device): Initialize it.
+ (free_host_blocks, free_host_blocks_lock)
+ (nvptx_run_deferred_page_locked_host_free)
+ (nvptx_page_locked_host_free_callback, nvptx_page_locked_host_p)
+ (GOMP_OFFLOAD_page_locked_host_register)
+ (nvptx_page_locked_host_unregister_callback)
+ (GOMP_OFFLOAD_page_locked_host_unregister)
+ (GOMP_OFFLOAD_page_locked_host_p)
+ (nvptx_run_deferred_page_locked_host_unregister)
+ (nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback):
+ Add.
+ (GOMP_OFFLOAD_fini_device, GOMP_OFFLOAD_page_locked_host_alloc)
+ (GOMP_OFFLOAD_run): Call
+ 'nvptx_run_deferred_page_locked_host_free'.
+ (struct goacc_asyncqueue): Add
+ 'page_locked_host_unregister_blocks_lock',
+ 'page_locked_host_unregister_blocks'.
+ (nvptx_goacc_asyncqueue_construct)
+ (nvptx_goacc_asyncqueue_destruct): Handle those.
+ (GOMP_OFFLOAD_page_locked_host_free): Handle
+ 'struct goacc_asyncqueue *' formal parameter.
+ (GOMP_OFFLOAD_openacc_async_test)
+ (nvptx_goacc_asyncqueue_synchronize): Call
+ 'nvptx_run_deferred_page_locked_host_unregister'.
+ (GOMP_OFFLOAD_openacc_async_serialize): Call
+ 'nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback'.
+ * config/linux/allocator.c (linux_memspace_alloc)
+ (linux_memspace_calloc, linux_memspace_free)
+ (linux_memspace_realloc): Remove 'always_pinned_mode' handling.
+ (GOMP_enable_pinned_mode): Move...
+ * target.c: ... here.
+ (always_pinned_mode, verify_always_pinned_mode)
+ (gomp_verify_always_pinned_mode, gomp_page_locked_host_alloc_dev)
+ (gomp_page_locked_host_free_dev)
+ (gomp_page_locked_host_aligned_alloc_dev)
+ (gomp_page_locked_host_aligned_free_dev)
+ (gomp_page_locked_host_register_dev)
+ (gomp_page_locked_host_unregister_dev): Add.
+ (gomp_copy_host2dev, gomp_map_vars_internal)
+ (gomp_remove_var_internal, gomp_unmap_vars_internal)
+ (get_gomp_offload_icvs, gomp_load_image_to_device)
+ (gomp_target_rev, omp_target_memcpy_copy)
+ (omp_target_memcpy_rect_worker): Handle 'always_pinned_mode'.
+ (gomp_copy_host2dev, gomp_copy_dev2host): Handle
+ 'verify_always_pinned_mode'.
+ (GOMP_target_ext): Add 'assert'.
+ (gomp_page_locked_host_alloc): Use
+ 'gomp_page_locked_host_alloc_dev'.
+ (gomp_page_locked_host_free): Use
+ 'gomp_page_locked_host_free_dev'.
+ (omp_target_associate_ptr): Adjust.
+ (gomp_load_plugin_for_device): Handle 'page_locked_host_register',
+ 'page_locked_host_unregister', 'page_locked_host_p'.
+ * oacc-mem.c (memcpy_tofrom_device): Handle 'always_pinned_mode'.
+ * libgomp_g.h (GOMP_enable_pinned_mode): Adjust.
+ * testsuite/libgomp.c/alloc-pinned-7.c: Remove.
+
PR other/76739
* target.c (gomp_map_vars_internal): Pass pre-allocated 'ptrblock'
to 'goacc_noncontig_array_create_ptrblock'.
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 3e1bd5a1285..62649f64221 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -45,20 +45,6 @@
#include <assert.h>
#include "libgomp.h"
-static bool always_pinned_mode = false;
-
-/* This function is called by the compiler when -foffload-memory=pinned
- is used. */
-
-void
-GOMP_enable_pinned_mode ()
-{
- if (mlockall (MCL_CURRENT | MCL_FUTURE) != 0)
- gomp_error ("failed to pin all memory (ulimit too low?)");
- else
- always_pinned_mode = true;
-}
-
static int using_device_for_page_locked
= /* uninitialized */ -1;
@@ -70,9 +56,6 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
__FUNCTION__, (unsigned long long) memspace,
(unsigned long long) size, pin, init0);
- /* Explicit pinning may not be required. */
- pin = pin && !always_pinned_mode;
-
void *addr;
if (memspace == ompx_unified_shared_mem_space)
@@ -137,9 +120,6 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
__FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
- /* Explicit pinning may not be required. */
- pin = pin && !always_pinned_mode;
-
if (memspace == ompx_unified_shared_mem_space)
{
void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
@@ -159,9 +139,6 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
gomp_debug (0, "%s: memspace=%llu, addr=%p, size=%llu, pin=%d\n",
__FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) size, pin);
- /* Explicit pinning may not be required. */
- pin = pin && !always_pinned_mode;
-
if (memspace == ompx_unified_shared_mem_space)
gomp_usm_free (addr, GOMP_DEVICE_ICV);
else if (pin)
@@ -188,9 +165,6 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
gomp_debug (0, "%s: memspace=%llu, addr=%p, oldsize=%llu, size=%llu, oldpin=%d, pin=%d\n",
__FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) oldsize, (unsigned long long) size, oldpin, pin);
- /* Explicit pinning may not be required. */
- pin = pin && !always_pinned_mode;
-
if (memspace == ompx_unified_shared_mem_space)
goto manual_realloc;
else if (oldpin && pin)
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index ca557a79380..7456b7d1026 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -141,7 +141,12 @@ extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
extern bool GOMP_OFFLOAD_usm_free (int, void *);
extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
-extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *,
+ struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_page_locked_host_register (int, void *, size_t, int);
+extern bool GOMP_OFFLOAD_page_locked_host_unregister (void *, size_t,
+ struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_page_locked_host_p (int, const void *, size_t);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 3b2b4aa9534..b7ac9d3da5b 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1123,6 +1123,8 @@ extern int gomp_pause_host (void);
/* target.c */
+extern bool always_pinned_mode;
+
extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
@@ -1130,6 +1132,11 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
int, struct goacc_asyncqueue *);
extern void * gomp_usm_alloc (size_t size, int device_num);
extern void gomp_usm_free (void *device_ptr, int device_num);
+extern int gomp_page_locked_host_register_dev (struct gomp_device_descr *,
+ void *, size_t, int);
+extern bool gomp_page_locked_host_unregister_dev (struct gomp_device_descr *,
+ void *, size_t,
+ struct goacc_asyncqueue *);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);
@@ -1232,6 +1239,9 @@ struct splay_tree_key_s {
uintptr_t *structelem_refcount_ptr;
};
struct splay_tree_aux *aux;
+ /* Whether we have registered page-locked host memory for
+ '[host_start, host_end)'. */
+ bool page_locked_host_p;
};
/* The comparison function. */
@@ -1393,6 +1403,11 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
+ __typeof (GOMP_OFFLOAD_page_locked_host_register)
+ *page_locked_host_register_func;
+ __typeof (GOMP_OFFLOAD_page_locked_host_unregister)
+ *page_locked_host_unregister_func;
+ __typeof (GOMP_OFFLOAD_page_locked_host_p) *page_locked_host_p_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 6355ce2a37b..df52fd3039c 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -4402,10 +4402,41 @@ creating memory allocators requesting
The following sections present notes on the offload-target specifics
@menu
+* @option{-foffload-memory=pinned}::
* AMD Radeon::
* nvptx::
@end menu
+@node @option{-foffload-memory=pinned}
+@section @option{-foffload-memory=pinned}
+
+@c The following paragraph is duplicated from
+@c '../gcc/doc/invoke.texi', '-foffload-memory=pinned'.
+If supported by the active offloading device,
+@option{-foffload-memory=pinned} enables automatic use of page-locked
+host memory for memory objects participating in host <-> device memory
+transfers, for both OpenACC and OpenMP offloading.
+Such memory is allocated or registered using the respective offloading
+device interfaces, which potentially helps optimization of host <->
+device data transfers.
+This option is experimental.
+Beware that heavy use of pinned memory may degrade overall system
+performance, as it reduces the amount of host memory available for
+paging.
+
+An OpenACC @emph{async} @code{enter data}-like operation may register
+a memory object as pinned.  After the corresponding @emph{async}
+@code{exit data}-like operation, this registration lasts until the
+next synchronization point (such as @code{acc_async_synchronize}).
+During this time, the user code must not touch the host-side memory
+allocation; this restriction corresponds to the @emph{async}
+semantics anyway.
+
+@option{-foffload-memory=pinned} is not considered for one-time
+internal data transfers, such as setup during device initialization.
+
+
+
@node AMD Radeon
@section AMD Radeon (GCN)
@@ -4459,6 +4490,8 @@ The implementation remark:
@item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
@code{ompx_pinned_mem_alloc}, for example)
is allocated via @code{mmap}, @code{mlock}.
+@item @option{-foffload-memory=pinned} is not supported,
+ @pxref{@option{-foffload-memory=pinned}}.
@end itemize
@@ -4526,6 +4559,8 @@ The implementation remark:
is allocated via @code{cuMemHostAlloc} (CUDA Driver API).
This potentially helps optimization of host <-> device data
transfers.
+@item @option{-foffload-memory=pinned} is supported,
+ @pxref{@option{-foffload-memory=pinned}}.
@end itemize
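
The paragraphs added to libgomp.texi above describe how long an @emph{async} registration persists. A hypothetical OpenACC usage sketch (not part of this patch), assuming compilation with '-fopenacc -foffload-memory=pinned':

void
scale (float *a, int n)
{
  /* May register 'a[0:n]' as page-locked host memory.  */
#pragma acc enter data copyin(a[0:n]) async(1)
#pragma acc parallel loop present(a[0:n]) async(1)
  for (int i = 0; i < n; i++)
    a[i] *= 2.0f;
#pragma acc exit data copyout(a[0:n]) async(1)
  /* The registration may persist here; the host must not touch
     'a[0:n]' until the synchronization point, per async semantics.  */
#pragma acc wait(1)
  /* Deferred unregistration has now run; 'a' may be used freely.  */
}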
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index fe66a53d94a..2a515ce7348 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -365,6 +365,7 @@ extern bool GOMP_teams4 (unsigned int, unsigned int, unsigned int, bool);
extern bool GOMP_evaluate_target_device (int, const char *, const char *,
const char *);
+extern void GOMP_enable_pinned_mode (void);
/* teams.c */
@@ -375,7 +376,6 @@ extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned,
extern void *GOMP_alloc (size_t, size_t, uintptr_t);
extern void GOMP_free (void *, uintptr_t);
-extern void GOMP_enable_pinned_mode (void);
/* error.c */
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index bd82beefcdb..75ec8958501 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -199,11 +199,27 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
}
goacc_aq aq = get_goacc_asyncqueue (async);
+
+ int h_page_locked_host_p = 0;
+
+ if (always_pinned_mode
+ && s != 0)
+ {
+ h_page_locked_host_p = gomp_page_locked_host_register_dev
+ (thr->dev, h, s, from ? GOMP_MAP_FROM : GOMP_MAP_TO);
+ if (h_page_locked_host_p < 0)
+ exit (EXIT_FAILURE);
+ }
+
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
+ if (h_page_locked_host_p
+ && !gomp_page_locked_host_unregister_dev (thr->dev, h, s, aq))
+ exit (EXIT_FAILURE);
+
if (profiling_p)
{
thr->prof_info = NULL;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 9b786c9f2f6..062a141053f 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -31,6 +31,10 @@ CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
CUDA_ONE_CALL (cuMemAllocManaged)
CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL (cuMemHostGetFlags)
+CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
+CUDA_ONE_CALL (cuMemHostRegister)
+CUDA_ONE_CALL (cuMemHostUnregister)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 23f89b6fb34..e57a2b30e66 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -78,11 +78,14 @@ extern CUresult cuGetErrorString (CUresult, const char **);
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+#undef cuMemHostRegister
+CUresult cuMemHostRegister (void *, size_t, unsigned int);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
#endif
@@ -218,6 +221,8 @@ static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct goacc_asyncqueue
{
CUstream cuda_stream;
+ pthread_mutex_t page_locked_host_unregister_blocks_lock;
+ struct ptx_free_block *page_locked_host_unregister_blocks;
};
struct nvptx_callback
@@ -314,6 +319,7 @@ struct ptx_device
int warp_size;
int max_threads_per_block;
int max_threads_per_multiprocessor;
+ bool read_only_host_register_supported;
int default_dims[GOMP_DIM_MAX];
int compute_major, compute_minor;
@@ -340,6 +346,33 @@ struct ptx_device
static struct ptx_device **ptx_devices;
+static struct ptx_free_block *free_host_blocks = NULL;
+static pthread_mutex_t free_host_blocks_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static bool
+nvptx_run_deferred_page_locked_host_free (void)
+{
+ GOMP_PLUGIN_debug (0, "%s\n",
+ __FUNCTION__);
+
+ pthread_mutex_lock (&free_host_blocks_lock);
+ struct ptx_free_block *b = free_host_blocks;
+ free_host_blocks = NULL;
+ pthread_mutex_unlock (&free_host_blocks_lock);
+
+ while (b)
+ {
+ GOMP_PLUGIN_debug (0, " b=%p: cuMemFreeHost(b->ptr=%p)\n",
+ b, b->ptr);
+
+ struct ptx_free_block *b_next = b->next;
+ CUDA_CALL (cuMemFreeHost, b->ptr);
+ free (b);
+ b = b_next;
+ }
+ return true;
+}
+
/* OpenMP kernels reserve a small amount of ".shared" space for use by
omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
default is set here. */
@@ -542,6 +575,19 @@ nvptx_open_device (int n)
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
assert (r == CUDA_SUCCESS && pi);
+ /* This is a CUDA 11.1 feature. */
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
+ dev);
+ if (r == CUDA_ERROR_INVALID_VALUE)
+ pi = false;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
+ return NULL;
+ }
+ ptx_dev->read_only_host_register_supported = pi;
+
for (int i = 0; i != GOMP_DIM_MAX; i++)
ptx_dev->default_dims[i] = 0;
@@ -1278,6 +1324,11 @@ GOMP_OFFLOAD_init_device (int n)
bool
GOMP_OFFLOAD_fini_device (int n)
{
+ /* This isn't related to this specific 'ptx_devices[n]', but is a convenient
+ place to clean up. */
+ if (!nvptx_run_deferred_page_locked_host_free ())
+ return false;
+
pthread_mutex_lock (&ptx_dev_lock);
if (ptx_devices[n] != NULL)
@@ -1711,6 +1762,12 @@ GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
__FUNCTION__, ptr, (unsigned long long) size);
+ /* TODO: Maybe running the deferred 'cuMemFreeHost's here is not the best
+ idea, given that we don't know what context we're called from? (See
+ 'GOMP_OFFLOAD_run' reverse offload handling.) But, where to do it? */
+ if (!nvptx_run_deferred_page_locked_host_free ())
+ return false;
+
CUresult r;
unsigned int flags = 0;
@@ -1729,16 +1786,243 @@ GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
return true;
}
+static void
+nvptx_page_locked_host_free_callback (CUstream stream, CUresult r, void *ptr)
+{
+ GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, ptr=%p\n",
+ __FUNCTION__, stream, (unsigned) r, ptr);
+
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+ /* We can't now call 'cuMemFreeHost': we're in a CUDA stream context,
+ where we "must not make any CUDA API calls".
+ And, in particular in an OpenMP 'target' reverse offload context,
+ this may even dead-lock?! */
+ /* See 'nvptx_free'. */
+ struct ptx_free_block *n
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+ GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
+ n->ptr = ptr;
+ pthread_mutex_lock (&free_host_blocks_lock);
+ n->next = free_host_blocks;
+ free_host_blocks = n;
+ pthread_mutex_unlock (&free_host_blocks_lock);
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr, struct goacc_asyncqueue *aq)
+{
+ GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, aq=%p\n",
+ __FUNCTION__, ptr, aq);
+
+ if (aq)
+ {
+ GOMP_PLUGIN_debug (0, " aq <-"
+ " nvptx_page_locked_host_free_callback(ptr)\n");
+ CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
+ nvptx_page_locked_host_free_callback, ptr, 0);
+ }
+ else
+ CUDA_CALL (cuMemFreeHost, ptr);
+ return true;
+}
+
+static int
+nvptx_page_locked_host_p (const void *ptr, size_t size)
+{
+ GOMP_PLUGIN_debug (0, "%s: ptr=%p, size=%llu\n",
+ __FUNCTION__, ptr, (unsigned long long) size);
+
+ int ret;
+
+ CUresult r;
+
+ /* Apparently, there exists no CUDA call to query 'PTR + [0, SIZE)'. Instead
+ of invoking 'cuMemHostGetFlags' SIZE times, we deem it sufficient to only
+ query the base PTR. */
+ unsigned int flags;
+ void *ptr_noconst = (void *) ptr;
+ r = CUDA_CALL_NOCHECK (cuMemHostGetFlags, &flags, ptr_noconst);
+ (void) flags;
+ if (r == CUDA_SUCCESS)
+ ret = 1;
+ else if (r == CUDA_ERROR_INVALID_VALUE)
+ ret = 0;
+ else
+ {
+ GOMP_PLUGIN_error ("cuMemHostGetFlags error: %s", cuda_error (r));
+ ret = -1;
+ }
+ GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
+ ret, (unsigned) r);
+ return ret;
+}
+
+int
+GOMP_OFFLOAD_page_locked_host_register (int ord,
+ void *ptr, size_t size, int kind)
+{
+ bool try_read_only;
+ /* Magic number: if the actual mapping kind is unknown... */
+ if (kind == -1)
+ /* ..., allow for trying read-only registration here. */
+ try_read_only = true;
+ else
+ try_read_only = !GOMP_MAP_COPY_FROM_P (kind);
+ GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu,"
+ " kind=%d (try_read_only=%d)\n",
+ __FUNCTION__, ord, ptr, (unsigned long long) size,
+ kind, try_read_only);
+ assert (size != 0);
+
+ if (!nvptx_attach_host_thread_to_device (ord))
+ return -1;
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+
+ int ret = -1;
+
+ CUresult r;
+
+ unsigned int flags = 0;
+ /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+ 'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here. */
+ cuMemHostRegister:
+ if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
+ r = CUDA_CALL_NOCHECK (cuMemHostRegister_v2, ptr, size, flags);
+ else
+ r = CUDA_CALL_NOCHECK (cuMemHostRegister, ptr, size, flags);
+ if (r == CUDA_SUCCESS)
+ ret = 1;
+ else if (r == CUDA_ERROR_INVALID_VALUE)
+ {
+ /* For example, for 'cuMemHostAlloc' (via the user code, for example)
+ followed by 'cuMemHostRegister' (via 'always_pinned_mode', for
+ example), we don't get 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' but
+ 'CUDA_ERROR_INVALID_VALUE'. */
+ if (nvptx_page_locked_host_p (ptr, size))
+ /* Accept the case that the region already is page-locked. */
+ ret = 0;
+ /* Depending on certain circumstances (see 'cuMemHostRegister'
+ documentation), for example, 'const' data that is placed in section
+ '.rodata' may need 'flags |= CU_MEMHOSTREGISTER_READ_ONLY;', to avoid
+ 'CUDA_ERROR_INVALID_VALUE'. If running into that, we now apply/re-try
+ lazily instead of actively setting it above, to avoid the following
+ problem. Supposedly/observably (but, not documented), if part of a
+ memory page has been registered without 'CU_MEMHOSTREGISTER_READ_ONLY'
+ and we then try to register another part with
+ 'CU_MEMHOSTREGISTER_READ_ONLY', we'll get 'CUDA_ERROR_INVALID_VALUE'.
+ In that case, we can solve the issue by re-trying with
+ 'CU_MEMHOSTREGISTER_READ_ONLY' masked out. However, if part of a
+ memory page has been registered with 'CU_MEMHOSTREGISTER_READ_ONLY'
+ and we then try to register another part without
+ 'CU_MEMHOSTREGISTER_READ_ONLY', that latter part apparently inherits
+ the former's 'CU_MEMHOSTREGISTER_READ_ONLY' (and any device to host
+ copy then fails). We can't easily resolve that situation
+ retroactively, that is, we can't easily re-register the first
+ 'CU_MEMHOSTREGISTER_READ_ONLY' part without that flag. */
+ else if (!(flags & CU_MEMHOSTREGISTER_READ_ONLY)
+ && try_read_only
+ && ptx_dev->read_only_host_register_supported)
+ {
+ GOMP_PLUGIN_debug (0, " flags |= CU_MEMHOSTREGISTER_READ_ONLY;\n");
+ flags |= CU_MEMHOSTREGISTER_READ_ONLY;
+ goto cuMemHostRegister;
+ }
+ /* We ought to use 'CU_MEMHOSTREGISTER_READ_ONLY', but it's not
+ available. */
+ else if (try_read_only
+ && !ptx_dev->read_only_host_register_supported)
+ {
+ assert (!(flags & CU_MEMHOSTREGISTER_READ_ONLY));
+ GOMP_PLUGIN_debug (0, " punt;"
+ " CU_MEMHOSTREGISTER_READ_ONLY not available\n");
+ /* Accept this (legacy) case; we can't (easily) register this
+ region of host memory as page-locked. */
+ ret = 0;
+ }
+ }
+ else if (r == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
+ {
+ /* 'cuMemHostRegister' (via the user code, for example) followed by
+ another (potentially partially overlapping) 'cuMemHostRegister'
+ (via 'always_pinned_mode', for example). */
+ /* Accept this case in good faith; do not verify further. */
+ ret = 0;
+ }
+ if (ret == -1)
+ GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
+ GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
+ ret, (unsigned) r);
+ return ret;
+}
+
+static void
+nvptx_page_locked_host_unregister_callback (CUstream stream, CUresult r,
+ void *b_)
+{
+ void **b = b_;
+ struct goacc_asyncqueue *aq = b[0];
+ void *ptr = b[1];
+ GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq=%p, ptr=%p)\n",
+ __FUNCTION__, stream, (unsigned) r, b_, aq, ptr);
+
+ free (b_);
+
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+ /* We can't now call 'cuMemHostUnregister': we're in a CUDA stream context,
+ where we "must not make any CUDA API calls". */
+ /* See 'nvptx_free'. */
+ struct ptx_free_block *n
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+ GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
+ n->ptr = ptr;
+ pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+ n->next = aq->page_locked_host_unregister_blocks;
+ aq->page_locked_host_unregister_blocks = n;
+ pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+}
+
bool
-GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+GOMP_OFFLOAD_page_locked_host_unregister (void *ptr, size_t size,
+ struct goacc_asyncqueue *aq)
{
- GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
- __FUNCTION__, ptr);
+ GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu, aq=%p\n",
+ __FUNCTION__, ptr, (unsigned long long) size, aq);
+ assert (size != 0);
- CUDA_CALL (cuMemFreeHost, ptr);
+ if (aq)
+ {
+ /* We don't unregister right away, as in-flight operations may still
+ benefit from the registration. */
+ void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
+ b[0] = aq;
+ b[1] = ptr;
+ GOMP_PLUGIN_debug (0, " aq <-"
+ " nvptx_page_locked_host_unregister_callback(b=%p)\n",
+ b);
+ CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
+ nvptx_page_locked_host_unregister_callback, b, 0);
+ }
+ else
+ CUDA_CALL (cuMemHostUnregister, ptr);
return true;
}
+int
+GOMP_OFFLOAD_page_locked_host_p (int ord, const void *ptr, size_t size)
+{
+ GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu\n",
+ __FUNCTION__, ord, ptr, (unsigned long long) size);
+
+ if (!nvptx_attach_host_thread_to_device (ord))
+ return -1;
+
+ return nvptx_page_locked_host_p (ptr, size);
+}
+
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
@@ -1841,12 +2125,19 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
static struct goacc_asyncqueue *
nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
+ GOMP_PLUGIN_debug (0, "%s: flags=%u\n",
+ __FUNCTION__, flags);
+
CUstream stream = NULL;
CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
struct goacc_asyncqueue *aq
= GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
aq->cuda_stream = stream;
+ pthread_mutex_init (&aq->page_locked_host_unregister_blocks_lock, NULL);
+ aq->page_locked_host_unregister_blocks = NULL;
+ GOMP_PLUGIN_debug (0, " -> aq=%p (with cuda_stream=%p)\n",
+ aq, aq->cuda_stream);
return aq;
}
@@ -1859,9 +2150,24 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
static bool
nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
+ GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
+ __FUNCTION__, aq);
+
CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+
+ bool ret = true;
+ pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+ if (aq->page_locked_host_unregister_blocks != NULL)
+ {
+ GOMP_PLUGIN_error ("aq->page_locked_host_unregister_blocks not empty");
+ ret = false;
+ }
+ pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+ pthread_mutex_destroy (&aq->page_locked_host_unregister_blocks_lock);
+
free (aq);
- return true;
+
+ return ret;
}
bool
@@ -1870,12 +2176,50 @@ GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
return nvptx_goacc_asyncqueue_destruct (aq);
}
+static bool
+nvptx_run_deferred_page_locked_host_unregister (struct goacc_asyncqueue *aq)
+{
+ GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
+ __FUNCTION__, aq);
+
+ bool ret = true;
+ pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+ for (struct ptx_free_block *b = aq->page_locked_host_unregister_blocks; b;)
+ {
+ GOMP_PLUGIN_debug (0, " b=%p: cuMemHostUnregister(b->ptr=%p)\n",
+ b, b->ptr);
+
+ struct ptx_free_block *b_next = b->next;
+ CUresult r = CUDA_CALL_NOCHECK (cuMemHostUnregister, b->ptr);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuMemHostUnregister error: %s", cuda_error (r));
+ ret = false;
+ }
+ free (b);
+ b = b_next;
+ }
+ aq->page_locked_host_unregister_blocks = NULL;
+ pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+ return ret;
+}
+
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
+ GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
+ __FUNCTION__, aq);
+
CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
if (r == CUDA_SUCCESS)
- return 1;
+ {
+ /* As a user may expect that they don't need to 'wait' if
+ 'acc_async_test' returns 'true', clean up here, too. */
+ if (!nvptx_run_deferred_page_locked_host_unregister (aq))
+ return -1;
+
+ return 1;
+ }
if (r == CUDA_ERROR_NOT_READY)
return 0;
@@ -1886,7 +2230,17 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
static bool
nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
+ GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
+ __FUNCTION__, aq);
+
CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
+
+ /* This is called from a user code (non-stream) context, and upon returning,
+ we must've given up on any page-locked memory registrations, so unregister
+ any pending ones now. */
+ if (!nvptx_run_deferred_page_locked_host_unregister (aq))
+ return false;
+
return true;
}
@@ -1896,14 +2250,70 @@ GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
return nvptx_goacc_asyncqueue_synchronize (aq);
}
+static void
+nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback
+(CUstream stream, CUresult r, void *b_)
+{
+ void **b = b_;
+ struct goacc_asyncqueue *aq1 = b[0];
+ struct goacc_asyncqueue *aq2 = b[1];
+ GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq1=%p, aq2=%p)\n",
+ __FUNCTION__, stream, (unsigned) r, b_, aq1, aq2);
+
+ free (b_);
+
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+ pthread_mutex_lock (&aq1->page_locked_host_unregister_blocks_lock);
+ if (aq1->page_locked_host_unregister_blocks)
+ {
+ pthread_mutex_lock (&aq2->page_locked_host_unregister_blocks_lock);
+ GOMP_PLUGIN_debug (0, " page_locked_host_unregister_blocks:"
+ " aq1 -> aq2\n");
+ if (aq2->page_locked_host_unregister_blocks == NULL)
+ aq2->page_locked_host_unregister_blocks
+ = aq1->page_locked_host_unregister_blocks;
+ else
+ {
+ struct ptx_free_block *b = aq2->page_locked_host_unregister_blocks;
+ while (b->next != NULL)
+ b = b->next;
+ b->next = aq1->page_locked_host_unregister_blocks;
+ }
+ pthread_mutex_unlock (&aq2->page_locked_host_unregister_blocks_lock);
+ aq1->page_locked_host_unregister_blocks = NULL;
+ }
+ pthread_mutex_unlock (&aq1->page_locked_host_unregister_blocks_lock);
+}
+
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
struct goacc_asyncqueue *aq2)
{
+ GOMP_PLUGIN_debug (0, "nvptx %s: aq1=%p, aq2=%p\n",
+ __FUNCTION__, aq1, aq2);
+
+ if (aq1 != aq2)
+ {
+ void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
+ b[0] = aq1;
+ b[1] = aq2;
+ /* Enqueue on 'aq1': move 'page_locked_host_unregister_blocks' of 'aq1'
+ to 'aq2'. */
+ GOMP_PLUGIN_debug (0, " aq1 <-"
+ " nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback"
+ "(b=%p)\n", b);
+ CUDA_CALL (cuStreamAddCallback, aq1->cuda_stream,
+ nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback,
+ b, 0);
+ }
+
CUevent e;
CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+
return true;
}
@@ -2238,6 +2648,19 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
exit (EXIT_FAILURE);
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+
+ /* Clean up here; otherwise we may run into the situation that
+ a following reverse offload does
+ 'GOMP_OFFLOAD_page_locked_host_alloc', and that then runs the
+ deferred 'cuMemFreeHost's -- which may dead-lock?!
+ TODO: This may need more considerations for the case that
+ different host threads do reverse offload? We could move
+ 'free_host_blocks' into 'aq' (which is separate per reverse
+ offload) instead of global, like
+ 'page_locked_host_unregister_blocks', but that doesn't seem the
+ right thing for OpenACC 'async' generally? */
+ if (!nvptx_run_deferred_page_locked_host_free ())
+ exit (EXIT_FAILURE);
}
usleep (1);
}
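
Several comments in the plugin above note that a CUDA stream callback "must not make any CUDA API calls"; freeing and unregistering are therefore deferred onto linked lists. A minimal standalone sketch (hypothetical names, not part of this patch) of that deferral pattern:

#include <cuda.h>
#include <pthread.h>
#include <stdlib.h>

struct deferred { void *ptr; struct deferred *next; };
static struct deferred *pending;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stream callback: only queue PTR; no CUDA API calls allowed here.  */
static void CUDA_CB
defer_free_cb (CUstream stream, CUresult r, void *ptr)
{
  struct deferred *d = malloc (sizeof *d);
  if (!d)
    abort ();
  d->ptr = ptr;
  pthread_mutex_lock (&pending_lock);
  d->next = pending;
  pending = d;
  pthread_mutex_unlock (&pending_lock);
}

/* Called later from an ordinary host context, where CUDA API calls are
   safe (compare 'nvptx_run_deferred_page_locked_host_free' above).  */
static void
drain_pending (void)
{
  pthread_mutex_lock (&pending_lock);
  struct deferred *d = pending;
  pending = NULL;
  pthread_mutex_unlock (&pending_lock);
  while (d)
    {
      struct deferred *next = d->next;
      cuMemFreeHost (d->ptr);
      free (d);
      d = next;
    }
}

A pointer would be enqueued with, for example, 'cuStreamAddCallback (stream, defer_free_cb, host_ptr, 0)', and 'drain_pending' invoked from a safe point such as device finalization.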
diff --git a/libgomp/target.c b/libgomp/target.c
index b88b1ebaa13..ed2fc09cf44 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -108,6 +108,74 @@ static int num_devices_openmp;
/* OpenMP requires mask. */
static int omp_requires_mask;
+
+static void *gomp_page_locked_host_alloc_dev (struct gomp_device_descr *,
+ size_t, bool);
+static bool gomp_page_locked_host_free_dev (struct gomp_device_descr *,
+ void *,
+ struct goacc_asyncqueue *);
+static void *gomp_page_locked_host_aligned_alloc_dev (struct gomp_device_descr *,
+ size_t, size_t);
+static bool gomp_page_locked_host_aligned_free_dev (struct gomp_device_descr *,
+ void *,
+ struct goacc_asyncqueue *);
+
+/* Use (that is, allocate or register) page-locked host memory for memory
+ objects participating in host <-> device memory transfers.
+
+ When this is enabled, there is no fallback to non-page-locked host
+ memory. */
+
+attribute_hidden
+bool always_pinned_mode = false;
+
+/* This function is called by the compiler when -foffload-memory=pinned
+ is used. */
+
+void
+GOMP_enable_pinned_mode ()
+{
+ always_pinned_mode = true;
+}
+
+/* Verify that page-locked host memory is used for memory objects participating
+ in host <-> device memory transfers. */
+
+static const bool verify_always_pinned_mode = false;
+
+static bool
+gomp_verify_always_pinned_mode (struct gomp_device_descr *device,
+ const void *ptr, size_t size)
+{
+ gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu\n",
+ __FUNCTION__,
+ device, device->name, ptr, (unsigned long long) size);
+
+ if (size == 0)
+ /* Skip zero-size requests; for those we've got no actual region of
+ page-locked host memory. */
+ ;
+ else if (device->page_locked_host_register_func)
+ {
+ int page_locked_host_p
+ = device->page_locked_host_p_func (device->target_id, ptr, size);
+ if (page_locked_host_p < 0)
+ {
+ gomp_error ("Failed to test page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+ return false;
+ }
+ if (!page_locked_host_p)
+ {
+ gomp_error ("Failed page-locked host memory test");
+ return false;
+ }
+ }
+ return true;
+}
+
+
/* Similar to gomp_realloc, but release register_lock before gomp_fatal. */
static void *
@@ -402,6 +470,9 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
if (__builtin_expect (aq != NULL, 0))
assert (ephemeral);
+ /* We're just filling the CBUF; 'always_pinned_mode' isn't
+ relevant. */
+
memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
h, sz);
return;
@@ -422,18 +493,92 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
stack local in a function that is no longer executing). As we've
not been able to use CBUF, make a copy of the data into a
temporary buffer. */
- h_buf = gomp_malloc (sz);
+ if (always_pinned_mode)
+ {
+ h_buf = gomp_page_locked_host_alloc_dev (devicep, sz, false);
+ if (!h_buf)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ h_buf = gomp_malloc (sz);
memcpy (h_buf, h, sz);
}
+
+ /* No 'gomp_verify_always_pinned_mode' for 'ephemeral'; have just
+ allocated. */
+ if (!ephemeral
+ && verify_always_pinned_mode
+ && always_pinned_mode)
+ if (!gomp_verify_always_pinned_mode (devicep, h_buf, sz))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+
goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
"dev", d, "host", h_buf, h, sz, aq);
+
if (ephemeral)
- /* Free once the transfer has completed. */
- devicep->openacc.async.queue_callback_func (aq, free, h_buf);
+ {
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_free_dev (devicep, h_buf, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ /* Free once the transfer has completed. */
+ devicep->openacc.async.queue_callback_func (aq, free, h_buf);
+ }
}
else
- gomp_device_copy (devicep, devicep->host2dev_func,
- "dev", d, "host", h, sz);
+ {
+ if (ephemeral
+ && always_pinned_mode)
+ {
+ /* TODO: Page-locking on the spot probably doesn't make a lot of
+ sense (performance-wise). Should we instead use a "page-locked
+ host memory bounce buffer" (per host thread, or per device,
+ or...)? */
+ void *ptr = (void *) h;
+ int page_locked_host_p
+ = gomp_page_locked_host_register_dev (devicep,
+ ptr, sz, GOMP_MAP_TO);
+ if (page_locked_host_p < 0)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ /* Ephemeral data isn't already page-locked host memory. */
+ assert (page_locked_host_p);
+ }
+ else if (verify_always_pinned_mode
+ && always_pinned_mode)
+ if (!gomp_verify_always_pinned_mode (devicep, h, sz))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+
+ gomp_device_copy (devicep, devicep->host2dev_func,
+ "dev", d, "host", h, sz);
+
+ if (ephemeral
+ && always_pinned_mode)
+ {
+ void *ptr = (void *) h;
+ if (!gomp_page_locked_host_unregister_dev (devicep, ptr, sz, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ }
}
attribute_hidden void
@@ -441,6 +586,14 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *h, const void *d, size_t sz)
{
+ if (verify_always_pinned_mode
+ && always_pinned_mode)
+ if (!gomp_verify_always_pinned_mode (devicep, h, sz))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+
if (__builtin_expect (aq != NULL, 0))
goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
"host", h, "dev", d, NULL, sz, aq);
@@ -1367,8 +1520,19 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
cbuf.chunk_cnt--;
if (cbuf.chunk_cnt > 0)
{
- cbuf.buf
- = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start);
+ size_t sz
+ = cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start;
+ if (always_pinned_mode)
+ {
+ cbuf.buf = gomp_page_locked_host_alloc_dev (devicep, sz, false);
+ if (!cbuf.buf)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ cbuf.buf = malloc (sz);
if (cbuf.buf)
{
cbuf.tgt = tgt;
@@ -1671,6 +1835,23 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
k->tgt = tgt;
k->refcount = 0;
k->dynamic_refcount = 0;
+ k->page_locked_host_p = false;
+ if (always_pinned_mode)
+ {
+ void *ptr = (void *) k->host_start;
+ size_t size = k->host_end - k->host_start;
+ int page_locked_host_p = 0;
+ if (size != 0)
+ page_locked_host_p = gomp_page_locked_host_register_dev
+ (devicep, ptr, size, kind & typemask);
+ if (page_locked_host_p < 0)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ if (page_locked_host_p)
+ k->page_locked_host_p = true;
+ }
if (field_tgt_clear != FIELD_TGT_EMPTY)
{
k->tgt_offset = k->host_start - field_tgt_base
@@ -1976,11 +2157,22 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
- cbuf.chunks[0].start),
cbuf.chunks[c].end - cbuf.chunks[c].start,
false, NULL);
- if (aq)
- /* Free once the transfer has completed. */
- devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_free_dev (devicep, cbuf.buf, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
else
- free (cbuf.buf);
+ {
+ if (aq)
+ /* Free once the transfer has completed. */
+ devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
+ else
+ free (cbuf.buf);
+ }
cbuf.buf = NULL;
cbufp = NULL;
}
@@ -2112,6 +2304,23 @@ gomp_remove_var_internal (struct gomp_device_descr *devicep, splay_tree_key k,
/* Starting from the _FIRST key, and continue for all following
sibling keys. */
gomp_remove_splay_tree_key (&devicep->mem_map, k);
+
+ if (always_pinned_mode)
+ {
+ if (k->page_locked_host_p)
+ {
+ void *ptr = (void *) k->host_start;
+ size_t size = k->host_end - k->host_start;
+ if (!gomp_page_locked_host_unregister_dev (devicep,
+ ptr, size, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ k->page_locked_host_p = false;
+ }
+ }
+
if (REFCOUNT_STRUCTELEM_LAST_P (k->refcount))
break;
else
@@ -2119,7 +2328,25 @@ gomp_remove_var_internal (struct gomp_device_descr *devicep, splay_tree_key k,
}
}
else
- gomp_remove_splay_tree_key (&devicep->mem_map, k);
+ {
+ gomp_remove_splay_tree_key (&devicep->mem_map, k);
+
+ if (always_pinned_mode)
+ {
+ if (k->page_locked_host_p)
+ {
+ void *ptr = (void *) k->host_start;
+ size_t size = k->host_end - k->host_start;
+ if (!gomp_page_locked_host_unregister_dev (devicep,
+ ptr, size, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ k->page_locked_host_p = false;
+ }
+ }
+ }
if (aq)
devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt_void,
@@ -2211,6 +2438,8 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
+ tgt->list[i].offset),
tgt->list[i].length);
/* Queue all removals together for processing below.
+ We may unregister page-locked host memory only after all device to
+ host memory transfers have completed.
See also 'gomp_exit_data'. */
if (do_remove)
remove_vars[nrmvars++] = k;
@@ -2392,8 +2621,17 @@ get_gomp_offload_icvs (int dev_num)
if (offload_icvs != NULL)
return &offload_icvs->icvs;
- struct gomp_offload_icv_list *new
- = (struct gomp_offload_icv_list *) gomp_malloc (sizeof (struct gomp_offload_icv_list));
+ struct gomp_offload_icv_list *new;
+ size_t size = sizeof (struct gomp_offload_icv_list);
+ if (always_pinned_mode)
+ {
+ struct gomp_device_descr *device = &devices[dev_num];
+ new = gomp_page_locked_host_alloc_dev (device, size, false);
+ if (!new)
+ exit (EXIT_FAILURE);
+ }
+ else
+ new = gomp_malloc (size);
new->device_num = dev_num;
new->icvs.device_num = dev_num;
@@ -2447,6 +2685,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
const void *host_table, const void *target_data,
bool is_register_lock)
{
+ gomp_debug (0, "%s: devicep=%p (%s)\n",
+ __FUNCTION__, devicep, devicep->name);
void **host_func_table = ((void ***) host_table)[0];
void **host_funcs_end = ((void ***) host_table)[1];
void **host_var_table = ((void ***) host_table)[2];
@@ -2511,6 +2751,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
k->refcount = REFCOUNT_INFINITY;
k->dynamic_refcount = 0;
k->aux = NULL;
+ k->page_locked_host_p = false;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -2556,6 +2797,34 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
k->refcount = is_link_var ? REFCOUNT_LINK : REFCOUNT_INFINITY;
k->dynamic_refcount = 0;
k->aux = NULL;
+ k->page_locked_host_p = false;
+ if (always_pinned_mode)
+ {
+ void *ptr = (void *) k->host_start;
+ size_t size = k->host_end - k->host_start;
+ gomp_debug (0, " var %d: ptr=%p, size=%llu, is_link_var=%d\n",
+ i, ptr, (unsigned long long) size, is_link_var);
+ if (!is_link_var)
+ {
+ /* '#pragma omp declare target' variables typically are
+ read/write, but in particular artificial ones, like Fortran
+ array constructors, may be placed in section '.rodata'.
+ We don't have the actual mapping kind available here, so we
+ use a magic number. */
+ const int kind = -1;
+ int page_locked_host_p = gomp_page_locked_host_register_dev
+ (devicep, ptr, size, kind);
+ if (page_locked_host_p < 0)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ if (is_register_lock)
+ gomp_mutex_unlock (&register_lock);
+ exit (EXIT_FAILURE);
+ }
+ if (page_locked_host_p)
+ k->page_locked_host_p = true;
+ }
+ }
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -2577,6 +2846,13 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
devicep->target_id. */
int dev_num = (int) (devicep - &devices[0]);
struct gomp_offload_icvs *icvs = get_gomp_offload_icvs (dev_num);
+ if (!icvs)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ if (is_register_lock)
+ gomp_mutex_unlock (&register_lock);
+ gomp_fatal ("'get_gomp_offload_icvs' failed");
+ }
size_t var_size = var->end - var->start;
if (var_size != sizeof (struct gomp_offload_icvs))
{
@@ -2599,6 +2875,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
k->refcount = REFCOUNT_INFINITY;
k->dynamic_refcount = 0;
k->aux = NULL;
+ /* 'always_pinned_mode' handled via 'get_gomp_offload_icvs'. */
+ k->page_locked_host_p = always_pinned_mode;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -3261,6 +3539,12 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
flags = clear_unsupported_flags (devicep, flags);
+ /* For 'nowait' we supposedly have to unregister/free page-locked host memory
+ via 'GOMP_PLUGIN_target_task_completion'. There is no current
+ configuration exercising this (and thus, infeasible to test). */
+ assert (!(flags & GOMP_TARGET_FLAG_NOWAIT)
+ || !(devicep && devicep->page_locked_host_register_func));
+
if (flags & GOMP_TARGET_FLAG_NOWAIT)
{
struct gomp_thread *thr = gomp_thread ();
@@ -3572,18 +3856,37 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
}
else
{
- devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
- sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
- kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
+ size_t devaddrs_size = mapnum * sizeof (uint64_t);
+ size_t sizes_size = mapnum * sizeof (uint64_t);
+ size_t kinds_size = mapnum * sizeof (unsigned short);
+ if (always_pinned_mode)
+ {
+ if (!(devaddrs = gomp_page_locked_host_alloc_dev (devicep,
+ devaddrs_size,
+ false))
+ || !(sizes = gomp_page_locked_host_alloc_dev (devicep,
+ sizes_size,
+ false))
+ || !(kinds = gomp_page_locked_host_alloc_dev (devicep,
+ kinds_size,
+ false)))
+ exit (EXIT_FAILURE);
+ }
+ else
+ {
+ devaddrs = gomp_malloc (devaddrs_size);
+ sizes = gomp_malloc (sizes_size);
+ kinds = gomp_malloc (kinds_size);
+ }
gomp_copy_dev2host (devicep, aq, devaddrs,
(const void *) (uintptr_t) devaddrs_ptr,
- mapnum * sizeof (uint64_t));
+ devaddrs_size);
gomp_copy_dev2host (devicep, aq, sizes,
(const void *) (uintptr_t) sizes_ptr,
- mapnum * sizeof (uint64_t));
+ sizes_size);
gomp_copy_dev2host (devicep, aq, kinds,
(const void *) (uintptr_t) kinds_ptr,
- mapnum * sizeof (unsigned short));
+ kinds_size);
if (aq && !devicep->openacc.async.synchronize_func (aq))
exit (EXIT_FAILURE);
}
@@ -3598,7 +3901,23 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
if (tgt_align)
{
- char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+ size_t tgt_alloc_size = tgt_size + tgt_align - 1;
+ char *tgt = gomp_alloca (tgt_alloc_size);
+ if (always_pinned_mode)
+ {
+ /* TODO: See 'gomp_copy_host2dev' re "page-locking on the spot".
+ On the other hand, performance isn't really a concern, here. */
+ int page_locked_host_p = 0;
+ if (tgt_alloc_size != 0)
+ {
+ page_locked_host_p = gomp_page_locked_host_register_dev
+ (devicep, tgt, tgt_alloc_size, GOMP_MAP_TOFROM);
+ if (page_locked_host_p < 0)
+ exit (EXIT_FAILURE);
+ /* 'gomp_alloca' isn't already page-locked host memory. */
+ assert (page_locked_host_p);
+ }
+ }
uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
if (al)
tgt += tgt_align - al;
@@ -3632,6 +3951,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
++i;
}
}
+ if (always_pinned_mode)
+ {
+ if (tgt_alloc_size != 0
+ && !gomp_page_locked_host_unregister_dev (devicep,
+ tgt, tgt_alloc_size,
+ NULL))
+ exit (EXIT_FAILURE);
+ }
}
if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) && mapnum > 0)
@@ -3718,9 +4045,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
{
cdata[i].aligned = true;
size_t align = (size_t) 1 << (kinds[i] >> 8);
- devaddrs[i]
- = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
- sizes[i]);
+ void *ptr;
+ if (always_pinned_mode)
+ {
+ ptr = gomp_page_locked_host_aligned_alloc_dev
+ (devicep, align, sizes[i]);
+ if (!ptr)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ ptr = gomp_aligned_alloc (align, sizes[i]);
+ devaddrs[i] = (uint64_t) (uintptr_t) ptr;
}
else if (n2 != NULL)
devaddrs[i] = (n2->host_start + cdata[i].devaddr
@@ -3770,7 +4108,23 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
}
}
if (!cdata[i].present)
- devaddrs[i] = (uintptr_t) gomp_malloc (sizeof (void*));
+ {
+ void *ptr;
+ size_t size = sizeof (void *);
+ if (always_pinned_mode)
+ {
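+ /* In 'always_pinned_mode', allocate this block as page-locked
+ host memory, too, as it may take part in device transfers. */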
+ ptr = gomp_page_locked_host_alloc_dev (devicep,
+ size, false);
+ if (!ptr)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ ptr = gomp_malloc (size);
+ devaddrs[i] = (uintptr_t) ptr;
+ }
/* Assume that when present, the pointer is already correct. */
if (!n2)
*(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[i])
@@ -3803,9 +4157,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
{
cdata[i].aligned = true;
size_t align = (size_t) 1 << (kinds[i] >> 8);
- devaddrs[i]
- = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
- sizes[i]);
+ void *ptr;
+ if (always_pinned_mode)
+ {
+ ptr = gomp_page_locked_host_aligned_alloc_dev
+ (devicep, align, sizes[i]);
+ if (!ptr)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ ptr = gomp_aligned_alloc (align, sizes[i]);
+ devaddrs[i] = (uint64_t) (uintptr_t) ptr;
gomp_copy_dev2host (devicep, aq,
(void *) (uintptr_t) devaddrs[i],
(void *) (uintptr_t) cdata[i].devaddr,
@@ -3881,7 +4246,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+ sizes[i + sizes[i]]);
size_t align = (size_t) 1 << (kinds[i] >> 8);
cdata[i].aligned = true;
- devaddrs[i] = (uintptr_t) gomp_aligned_alloc (align, sz);
+ void *ptr;
+ if (always_pinned_mode)
+ {
+ ptr = gomp_page_locked_host_aligned_alloc_dev
+ (devicep, align, sz);
+ if (!ptr)
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
+ }
+ else
+ ptr = gomp_aligned_alloc (align, sz);
+ devaddrs[i] = (uintptr_t) ptr;
devaddrs[i] -= devaddrs[i+1] - cdata[i].devaddr;
}
else
@@ -3945,9 +4323,29 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
struct_cpy = sizes[i];
}
else if (!cdata[i].present && cdata[i].aligned)
- gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+ {
+ void *ptr = (void *) (uintptr_t) devaddrs[i];
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_aligned_free_dev (devicep,
+ ptr,
+ aq))
+ exit (EXIT_FAILURE);
+ }
+ else
+ gomp_aligned_free (ptr);
+ }
else if (!cdata[i].present)
- free ((void *) (uintptr_t) devaddrs[i]);
+ {
+ void *ptr = (void *) (uintptr_t) devaddrs[i];
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_free_dev (devicep, ptr, aq))
+ exit (EXIT_FAILURE);
+ }
+ else
+ free (ptr);
+ }
}
if (clean_struct)
for (uint64_t i = 0; i < mapnum; i++)
@@ -3956,12 +4354,30 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
== GOMP_MAP_STRUCT))
{
devaddrs[i] += cdata[i+1].devaddr - cdata[i].devaddr;
- gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+ void *ptr = (void *) (uintptr_t) devaddrs[i];
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_aligned_free_dev (devicep,
+ ptr, aq))
+ exit (EXIT_FAILURE);
+ }
+ else
+ gomp_aligned_free (ptr);
}
- free (devaddrs);
- free (sizes);
- free (kinds);
+ if (always_pinned_mode)
+ {
+ if (!gomp_page_locked_host_free_dev (devicep, devaddrs, aq)
+ || !gomp_page_locked_host_free_dev (devicep, sizes, aq)
+ || !gomp_page_locked_host_free_dev (devicep, kinds, aq))
+ exit (EXIT_FAILURE);
+ }
+ else
+ {
+ free (devaddrs);
+ free (sizes);
+ free (kinds);
+ }
}
}
@@ -4585,6 +5001,160 @@ gomp_usm_free (void *device_ptr, int device_num)
}
+/* Allocate page-locked host memory via DEVICE.
+
+   If ALLOW_NULL, a NULL result is passed back to the caller without further
+   diagnostics; otherwise, it is diagnosed via 'gomp_error', and the caller
+   is expected to terminate. */
+
+static void *
+gomp_page_locked_host_alloc_dev (struct gomp_device_descr *device,
+ size_t size, bool allow_null)
+{
+ gomp_debug (0, "%s: device=%p (%s), size=%llu\n",
+ __FUNCTION__, device, device->name, (unsigned long long) size);
+
+ void *ret;
+ if (!device->page_locked_host_alloc_func (&ret, size))
+ {
+ const char *fmt
+ = "Failed to allocate page-locked host memory via %s libgomp plugin";
+ if (allow_null)
+ gomp_fatal (fmt, device->name);
+ else
+ gomp_error (fmt, device->name);
+ ret = NULL;
+ }
+ else if (ret == NULL && !allow_null)
+ gomp_error ("Out of memory allocating %lu bytes"
+ " page-locked host memory"
+ " via %s libgomp plugin",
+ (unsigned long) size, device->name);
+ else
+ gomp_debug (0, " -> ret=[%p, %p)\n",
+ ret, ret + size);
+ return ret;
+}
+
+/* Free page-locked host memory via DEVICE. */
+
+static bool
+gomp_page_locked_host_free_dev (struct gomp_device_descr *device,
+ void *ptr,
+ struct goacc_asyncqueue *aq)
+{
+ gomp_debug (0, "%s: device=%p (%s), ptr=%p, aq=%p\n",
+ __FUNCTION__, device, device->name, ptr, aq);
+
+ if (!device->page_locked_host_free_func (ptr, aq))
+ {
+ gomp_error ("Failed to free page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+ return false;
+ }
+ return true;
+}
+
+/* Allocate aligned page-locked host memory via DEVICE.
+
+ That is, 'gomp_aligned_alloc' (see 'alloc.c') for page-locked host
+ memory. */
+
+static void *
+gomp_page_locked_host_aligned_alloc_dev (struct gomp_device_descr *device,
+ size_t al, size_t size)
+{
+ gomp_debug (0, "%s: device=%p (%s), al=%llu, size=%llu\n",
+ __FUNCTION__, device, device->name,
+ (unsigned long long) al, (unsigned long long) size);
+
+ void *ret;
+ if (al < sizeof (void *))
+ al = sizeof (void *);
+ ret = NULL;
+ if ((al & (al - 1)) == 0 && size)
+ {
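+ /* Over-allocate by 'al' bytes, round up to the next 'al'-aligned
+ address, and stash the original pointer immediately before the
+ aligned block, for retrieval upon free. */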
+ void *p = gomp_page_locked_host_alloc_dev (device, size + al, true);
+ if (p)
+ {
+ void *ap = (void *) (((uintptr_t) p + al) & -al);
+ ((void **) ap)[-1] = p;
+ ret = ap;
+ }
+ }
+ if (ret == NULL)
+ gomp_error ("Out of memory allocating %lu bytes", (unsigned long) size);
+ else
+ gomp_debug (0, " -> ret=[%p, %p)\n",
+ ret, ret + size);
+ return ret;
+}
+
+/* Free aligned page-locked host memory via DEVICE.
+
+ That is, 'gomp_aligned_free' (see 'alloc.c') for page-locked host
+ memory. */
+
+static bool
+gomp_page_locked_host_aligned_free_dev (struct gomp_device_descr *device,
+ void *ptr,
+ struct goacc_asyncqueue *aq)
+{
+ gomp_debug (0, "%s: device=%p (%s), ptr=%p, aq=%p\n",
+ __FUNCTION__, device, device->name, ptr, aq);
+
+ if (ptr)
+ {
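+ /* Retrieve the original allocation address stashed by
+ 'gomp_page_locked_host_aligned_alloc_dev'. */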
+ ptr = ((void **) ptr)[-1];
+ gomp_debug (0, " ptr=%p\n",
+ ptr);
+
+ if (!gomp_page_locked_host_free_dev (device, ptr, aq))
+ return false;
+ }
+ return true;
+}
+
+/* Register page-locked host memory via DEVICE.
+
+   Returns a positive value if the memory has been page-locked by this call
+   (the caller is then responsible for unregistering it again), 0 if it
+   already was page-locked, and a negative value on error. */
+
+attribute_hidden int
+gomp_page_locked_host_register_dev (struct gomp_device_descr *device,
+ void *ptr, size_t size, int kind)
+{
+ gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu, kind=%d\n",
+ __FUNCTION__, device, device->name,
+ ptr, (unsigned long long) size, kind);
+ assert (size != 0);
+
+ int ret = device->page_locked_host_register_func (device->target_id,
+ ptr, size, kind);
+ if (ret < 0)
+ gomp_error ("Failed to register page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+ return ret;
+}
+
+/* Unregister page-locked host memory via DEVICE. */
+
+attribute_hidden bool
+gomp_page_locked_host_unregister_dev (struct gomp_device_descr *device,
+ void *ptr, size_t size,
+ struct goacc_asyncqueue *aq)
+{
+ gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu, aq=%p\n",
+ __FUNCTION__, device, device->name,
+ ptr, (unsigned long long) size, aq);
+ assert (size != 0);
+
+ if (!device->page_locked_host_unregister_func (ptr, size, aq))
+ {
+ gomp_error ("Failed to unregister page-locked host memory"
+ " via %s libgomp plugin",
+ device->name);
+ return false;
+ }
+ return true;
+}
+
+
/* Device (really: libgomp plugin) to use for page-locked memory. We
assume there is either none or exactly one such device for the lifetime of
the process. */
@@ -4681,10 +5251,7 @@ gomp_page_locked_host_alloc (void **ptr, size_t size)
}
gomp_mutex_unlock (&device->lock);
- if (!device->page_locked_host_alloc_func (ptr, size))
- gomp_fatal ("Failed to allocate page-locked host memory"
- " via %s libgomp plugin",
- device->name);
+ *ptr = gomp_page_locked_host_alloc_dev (device, size, true);
}
return device != NULL;
}
@@ -4713,10 +5280,8 @@ gomp_page_locked_host_free (void *ptr)
}
gomp_mutex_unlock (&device->lock);
- if (!device->page_locked_host_free_func (ptr))
- gomp_fatal ("Failed to free page-locked host memory"
- " via %s libgomp plugin",
- device->name);
+ if (!gomp_page_locked_host_free_dev (device, ptr, NULL))
+ exit (EXIT_FAILURE);
}
@@ -4792,30 +5357,84 @@ omp_target_memcpy_copy (void *dst, const void *src, size_t length,
bool ret;
if (src_devicep == NULL && dst_devicep == NULL)
{
+ /* No 'gomp_verify_always_pinned_mode' here. */
memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
return 0;
}
if (src_devicep == NULL)
{
gomp_mutex_lock (&dst_devicep->lock);
+
+ void *src_ptr = (void *) src + src_offset;
+ int src_ptr_page_locked_host_p = 0;
+
+ if (always_pinned_mode)
+ {
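+ /* Temporarily page-lock the host source buffer for the duration of
+ the transfer; unregister it again afterwards (only if this call
+ page-locked it). */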
+ if (length != 0)
+ src_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+ (dst_devicep, src_ptr, length, GOMP_MAP_TO);
+ if (src_ptr_page_locked_host_p < 0)
+ {
+ gomp_mutex_unlock (&dst_devicep->lock);
+ return ENOMEM;
+ }
+ }
+
+ /* No 'gomp_verify_always_pinned_mode' here; have just registered. */
ret = dst_devicep->host2dev_func (dst_devicep->target_id,
(char *) dst + dst_offset,
- (char *) src + src_offset, length);
+ src_ptr, length);
+
+ if (src_ptr_page_locked_host_p
+ && !gomp_page_locked_host_unregister_dev (dst_devicep,
+ src_ptr, length, NULL))
+ {
+ gomp_mutex_unlock (&dst_devicep->lock);
+ return ENOMEM;
+ }
+
gomp_mutex_unlock (&dst_devicep->lock);
return (ret ? 0 : EINVAL);
}
if (dst_devicep == NULL)
{
gomp_mutex_lock (&src_devicep->lock);
+
+ void *dst_ptr = (void *) dst + dst_offset;
+ int dst_ptr_page_locked_host_p = 0;
+
+ if (always_pinned_mode)
+ {
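+ /* Likewise, temporarily page-lock the host destination buffer. */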
+ if (length != 0)
+ dst_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+ (src_devicep, dst_ptr, length, GOMP_MAP_FROM);
+ if (dst_ptr_page_locked_host_p < 0)
+ {
+ gomp_mutex_unlock (&src_devicep->lock);
+ return ENOMEM;
+ }
+ }
+
+ /* No 'gomp_verify_always_pinned_mode' here; have just registered. */
ret = src_devicep->dev2host_func (src_devicep->target_id,
- (char *) dst + dst_offset,
+ dst_ptr,
(char *) src + src_offset, length);
+
+ if (dst_ptr_page_locked_host_p
+ && !gomp_page_locked_host_unregister_dev (src_devicep,
+ dst_ptr, length, NULL))
+ {
+ gomp_mutex_unlock (&src_devicep->lock);
+ return ENOMEM;
+ }
+
gomp_mutex_unlock (&src_devicep->lock);
return (ret ? 0 : EINVAL);
}
if (src_devicep == dst_devicep)
{
gomp_mutex_lock (&src_devicep->lock);
+ /* No 'gomp_verify_always_pinned_mode' here. */
ret = src_devicep->dev2dev_func (src_devicep->target_id,
(char *) dst + dst_offset,
(char *) src + src_offset, length);
@@ -4927,21 +5546,63 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
return EINVAL;
if (dst_devicep == NULL && src_devicep == NULL)
{
+ /* No 'gomp_verify_always_pinned_mode' here. */
memcpy ((char *) dst + dst_off, (const char *) src + src_off,
length);
ret = 1;
}
else if (src_devicep == NULL)
- ret = dst_devicep->host2dev_func (dst_devicep->target_id,
- (char *) dst + dst_off,
- (const char *) src + src_off,
- length);
+ {
+ void *src_ptr = (void *) src + src_off;
+ int src_ptr_page_locked_host_p = 0;
+
+ if (always_pinned_mode)
+ {
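+ /* As in 'omp_target_memcpy_copy': temporarily page-lock the host
+ buffer around the transfer. */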
+ if (length != 0)
+ src_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+ (dst_devicep, src_ptr, length, GOMP_MAP_TO);
+ if (src_ptr_page_locked_host_p < 0)
+ return ENOMEM;
+ }
+
+ /* No 'gomp_verify_always_pinned_mode' here; have just registered. */
+ ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+ (char *) dst + dst_off,
+ src_ptr,
+ length);
+
+ if (src_ptr_page_locked_host_p
+ && !gomp_page_locked_host_unregister_dev (dst_devicep,
+ src_ptr, length, NULL))
+ return ENOMEM;
+ }
else if (dst_devicep == NULL)
- ret = src_devicep->dev2host_func (src_devicep->target_id,
- (char *) dst + dst_off,
- (const char *) src + src_off,
- length);
+ {
+ void *dst_ptr = (void *) dst + dst_off;
+ int dst_ptr_page_locked_host_p = 0;
+
+ if (always_pinned_mode)
+ {
+ if (length != 0)
+ dst_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+ (src_devicep, dst_ptr, length, GOMP_MAP_FROM);
+ if (dst_ptr_page_locked_host_p < 0)
+ return ENOMEM;
+ }
+
+ /* No 'gomp_verify_always_pinned_mode' here; have just registered. */
+ ret = src_devicep->dev2host_func (src_devicep->target_id,
+ dst_ptr,
+ (const char *) src + src_off,
+ length);
+
+ if (dst_ptr_page_locked_host_p
+ && !gomp_page_locked_host_unregister_dev (src_devicep,
+ dst_ptr, length, NULL))
+ return ENOMEM;
+ }
else if (src_devicep == dst_devicep)
+ /* No 'gomp_verify_always_pinned_mode' here. */
ret = src_devicep->dev2dev_func (src_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
@@ -5184,6 +5845,7 @@ omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
k->refcount = REFCOUNT_INFINITY;
k->dynamic_refcount = 0;
k->aux = NULL;
+ k->page_locked_host_p = false;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -5406,6 +6068,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
+ DLSYM_OPT (page_locked_host_register, page_locked_host_register);
+ DLSYM_OPT (page_locked_host_unregister, page_locked_host_unregister);
+ DLSYM_OPT (page_locked_host_p, page_locked_host_p);
DLSYM (dev2host);
DLSYM (host2dev);
DLSYM (evaluate_device);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-7.c b/libgomp/testsuite/libgomp.c/alloc-pinned-7.c
deleted file mode 100644
index 8dc19055038..00000000000
--- a/libgomp/testsuite/libgomp.c/alloc-pinned-7.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/* { dg-do run } */
-/* { dg-additional-options "-foffload-memory=pinned" } */
-
-/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
-
-/* Test that pinned memory works. */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifdef __linux__
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <sys/mman.h>
-
-int
-get_pinned_mem ()
-{
- int pid = getpid ();
- char buf[100];
- sprintf (buf, "/proc/%d/status", pid);
-
- FILE *proc = fopen (buf, "r");
- if (!proc)
- abort ();
- while (fgets (buf, 100, proc))
- {
- int val;
- if (sscanf (buf, "VmLck: %d", &val))
- {
- fclose (proc);
- return val;
- }
- }
- abort ();
-}
-#else
-int
-get_pinned_mem ()
-{
- return 0;
-}
-
-#define mlockall(...) 0
-#endif
-
-#include <omp.h>
-
-int
-main ()
-{
- // Sanity check
- if (get_pinned_mem () == 0)
- {
- /* -foffload-memory=pinned has failed, but maybe that's because
- isufficient pinned memory was available. */
- if (mlockall (MCL_CURRENT | MCL_FUTURE) == 0)
- abort ();
- }
-
- return 0;
-}