diff options
author | cltang <cltang@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-05-26 09:58:56 +0000 |
---|---|---|
committer | cltang <cltang@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-05-26 09:58:56 +0000 |
commit | 9b50ad1d18e7c4d34bf7d1f42eb0998b88b0dc8a (patch) | |
tree | 1788f7eb15481be06ce674ae4ed2f5734bedeb2c /libgomp | |
parent | 0db93a5b9e943d919a4c37ecefba9c9317f48fad (diff) | |
download | gcc-9b50ad1d18e7c4d34bf7d1f42eb0998b88b0dc8a.tar.gz |
libgomp/
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
* target.c (gomp_device_copy): New function.
(gomp_copy_host2dev): Likewise.
(gomp_copy_dev2host): Likewise.
(gomp_free_device_memory): Likewise.
(gomp_map_vars_existing): Adjust to call gomp_copy_host2dev.
(gomp_map_pointer): Likewise.
(gomp_map_vars): Adjust to call gomp_copy_host2dev, handle
NULL value from alloc_func plugin hook.
(gomp_unmap_tgt): Adjust to call gomp_free_device_memory.
(gomp_copy_from_async): Adjust to call gomp_copy_dev2host.
(gomp_unmap_vars): Likewise.
(gomp_update): Adjust to call gomp_copy_dev2host and
gomp_copy_host2dev functions.
(gomp_unload_image_from_device): Handle false value from
unload_image_func plugin hook.
(gomp_init_device): Handle false value from init_device_func
plugin hook.
(gomp_exit_data): Adjust to call gomp_copy_dev2host.
(omp_target_free): Adjust to call gomp_free_device_memory.
(omp_target_memcpy): Handle return values from host2dev_func,
dev2host_func, and dev2dev_func plugin hooks.
(omp_target_memcpy_rect_worker): Likewise.
(gomp_target_fini): Handle false value from fini_device_func
plugin hook.
* libgomp.h (struct gomp_device_descr): Adjust return type of
init_device_func, fini_device_func, unload_image_func, free_func,
dev2host_func, host2dev_func, and dev2dev_func plugin hooks to 'bool'.
* oacc-init.c (acc_shutdown_1): Handle false value from
fini_device_func plugin hook.
* oacc-host.c (host_init_device): Change return type to bool.
(host_fini_device): Likewise.
(host_unload_image): Likewise.
(host_free): Likewise.
(host_dev2host): Likewise.
(host_host2dev): Likewise.
* oacc-mem.c (acc_free): Handle plugin hook fatal error case.
(acc_memcpy_to_device): Likewise.
(acc_memcpy_from_device): Likewise.
(delete_copyout): Add libfnname parameter, handle free_func
hook fatal error case.
(acc_delete): Adjust delete_copyout call.
(acc_copyout): Likewise.
(update_dev_host): Move gomp_mutex_unlock to after
host2dev/dev2host hook calls.
* plugin/plugin-hsa.c (hsa_warn): Adjust 'hsa_error' local variable
to 'hsa_error_msg', for clarity.
(hsa_fatal): Likewise.
(hsa_error): New function.
(init_hsa_context): Change return type to bool, adjust to return
false on error.
(GOMP_OFFLOAD_get_num_devices): Adjust to handle init_hsa_context
return value.
(GOMP_OFFLOAD_init_device): Change return type to bool, adjust to
return false on error.
(get_agent_info): Adjust to return NULL on error.
(destroy_hsa_program): Change return type to bool, adjust to
return false on error.
(GOMP_OFFLOAD_load_image): Adjust to return -1 on error.
(destroy_module): Change return type to bool, adjust to
return false on error.
(GOMP_OFFLOAD_unload_image): Likewise.
(GOMP_OFFLOAD_fini_device): Likewise.
(GOMP_OFFLOAD_alloc): Change to return NULL when called.
(GOMP_OFFLOAD_free): Change to return false when called.
(GOMP_OFFLOAD_dev2host): Likewise.
(GOMP_OFFLOAD_host2dev): Likewise.
(GOMP_OFFLOAD_dev2dev): Likewise.
* plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro.
(CUDA_CALL): Likewise.
(CUDA_CALL_ASSERT): Likewise.
(map_init): Change return type to bool, use CUDA_CALL* macros.
(map_fini): Likewise.
(init_streams_for_device): Change return type to bool, adjust
call to map_init.
(fini_streams_for_device): Change return type to bool, adjust
call to map_fini.
(select_stream_for_async): Release stream_lock before calls to
GOMP_PLUGIN_fatal, adjust call to map_init.
(nvptx_init): Use CUDA_CALL* macros.
(nvptx_attach_host_thread_to_device): Change return type to bool,
use CUDA_CALL* macros.
(nvptx_open_device): Use CUDA_CALL* macros.
(nvptx_close_device): Change return type to bool, use CUDA_CALL*
macros.
(nvptx_get_num_devices): Use CUDA_CALL* macros.
(link_ptx): Change return type to bool, use CUDA_CALL* macros.
(nvptx_exec): Use CUDA_CALL* macros.
(nvptx_alloc): Use CUDA_CALL* macros.
(nvptx_free): Change return type to bool, use CUDA_CALL* macros.
(nvptx_host2dev): Likewise.
(nvptx_dev2host): Likewise.
(nvptx_wait): Use CUDA_CALL* macros.
(nvptx_wait_async): Likewise.
(nvptx_wait_all): Likewise.
(nvptx_wait_all_async): Likewise.
(nvptx_set_cuda_stream): Adjust order of stream_lock acquire,
use CUDA_CALL* macros, adjust call to map_fini.
(GOMP_OFFLOAD_init_device): Change return type to bool,
adjust code accordingly.
(GOMP_OFFLOAD_fini_device): Likewise.
(GOMP_OFFLOAD_load_image): Adjust calls to
nvptx_attach_host_thread_to_device/link_ptx to handle errors,
use CUDA_CALL* macros.
(GOMP_OFFLOAD_unload_image): Change return type to bool, adjust
return code.
(GOMP_OFFLOAD_alloc): Adjust calls to code to handle error return.
(GOMP_OFFLOAD_free): Change return type to bool, adjust calls to
handle error return.
(GOMP_OFFLOAD_dev2host): Likewise.
(GOMP_OFFLOAD_host2dev): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL* macros.
(GOMP_OFFLOAD_openacc_create_thread_data): Likewise.
liboffloadmic/
2016-05-26 Chung-Lin Tang <cltang@codesourcery.com>
* plugin/libgomp-plugin-intelmic.cpp (offload): Change return type
to bool, adjust return code.
(GOMP_OFFLOAD_init_device): Likewise.
(GOMP_OFFLOAD_fini_device): Likewise.
(get_target_table): Likewise.
(offload_image): Likewise.
(GOMP_OFFLOAD_load_image): Adjust call to offload_image(), change
to return -1 on error.
(GOMP_OFFLOAD_unload_image): Change return type to bool, adjust return
code.
(GOMP_OFFLOAD_alloc): Likewise.
(GOMP_OFFLOAD_free): Likewise.
(GOMP_OFFLOAD_host2dev): Likewise.
(GOMP_OFFLOAD_dev2host): Likewise.
(GOMP_OFFLOAD_dev2dev): Likewise.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@236768 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgomp')
-rw-r--r-- | libgomp/ChangeLog | 117 | ||||
-rw-r--r-- | libgomp/libgomp.h | 14 | ||||
-rw-r--r-- | libgomp/oacc-host.c | 20 | ||||
-rw-r--r-- | libgomp/oacc-init.c | 7 | ||||
-rw-r--r-- | libgomp/oacc-mem.c | 25 | ||||
-rw-r--r-- | libgomp/plugin/plugin-hsa.c | 252 | ||||
-rw-r--r-- | libgomp/plugin/plugin-nvptx.c | 599 | ||||
-rw-r--r-- | libgomp/target.c | 252 |
8 files changed, 764 insertions, 522 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 7ad7ff49ff8..e2496ff5833 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,120 @@ +2016-05-26 Chung-Lin Tang <cltang@codesourcery.com> + + * target.c (gomp_device_copy): New function. + (gomp_copy_host2dev): Likewise. + (gomp_copy_dev2host): Likewise. + (gomp_free_device_memory): Likewise. + (gomp_map_vars_existing): Adjust to call gomp_copy_host2dev. + (gomp_map_pointer): Likewise. + (gomp_map_vars): Adjust to call gomp_copy_host2dev, handle + NULL value from alloc_func plugin hook. + (gomp_unmap_tgt): Adjust to call gomp_free_device_memory. + (gomp_copy_from_async): Adjust to call gomp_copy_dev2host. + (gomp_unmap_vars): Likewise. + (gomp_update): Adjust to call gomp_copy_dev2host and + gomp_copy_host2dev functions. + (gomp_unload_image_from_device): Handle false value from + unload_image_func plugin hook. + (gomp_init_device): Handle false value from init_device_func + plugin hook. + (gomp_exit_data): Adjust to call gomp_copy_dev2host. + (omp_target_free): Adjust to call gomp_free_device_memory. + (omp_target_memcpy): Handle return values from host2dev_func, + dev2host_func, and dev2dev_func plugin hooks. + (omp_target_memcpy_rect_worker): Likewise. + (gomp_target_fini): Handle false value from fini_device_func + plugin hook. + * libgomp.h (struct gomp_device_descr): Adjust return type of + init_device_func, fini_device_func, unload_image_func, free_func, + dev2host_func,host2dev_func, and dev2dev_func plugin hooks to 'bool'. + * oacc-init.c (acc_shutdown_1): Handle false value from + fini_device_func plugin hook. + * oacc-host.c (host_init_device): Change return type to bool. + (host_fini_device): Likewise. + (host_unload_image): Likewise. + (host_free): Likewise. + (host_dev2host): Likewise. + (host_host2dev): Likewise. + * oacc-mem.c (acc_free): Handle plugin hook fatal error case. + (acc_memcpy_to_device): Likewise. + (acc_memcpy_from_device): Likewise. 
+ (delete_copyout): Add libfnname parameter, handle free_func + hook fatal error case. + (acc_delete): Adjust delete_copyout call. + (acc_copyout): Likewise. + (update_dev_host): Move gomp_mutex_unlock to after + host2dev/dev2host hook calls. + + * plugin/plugin-hsa.c (hsa_warn): Adjust 'hsa_error' local variable + to 'hsa_error_msg', for clarity. + (hsa_fatal): Likewise. + (hsa_error): New function. + (init_hsa_context): Change return type to bool, adjust to return + false on error. + (GOMP_OFFLOAD_get_num_devices): Adjust to handle init_hsa_context + return value. + (GOMP_OFFLOAD_init_device): Change return type to bool, adjust to + return false on error. + (get_agent_info): Adjust to return NULL on error. + (destroy_hsa_program): Change return type to bool, adjust to + return false on error. + (GOMP_OFFLOAD_load_image): Adjust to return -1 on error. + (destroy_module): Change return type to bool, adjust to + return false on error. + (GOMP_OFFLOAD_unload_image): Likewise. + (GOMP_OFFLOAD_fini_device): Likewise. + (GOMP_OFFLOAD_alloc): Change to return NULL when called. + (GOMP_OFFLOAD_free): Change to return false when called. + (GOMP_OFFLOAD_dev2host): Likewise. + (GOMP_OFFLOAD_host2dev): Likewise. + (GOMP_OFFLOAD_dev2dev): Likewise. + + * plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro. + (CUDA_CALL): Likewise. + (CUDA_CALL_ASSERT): Likewise. + (map_init): Change return type to bool, use CUDA_CALL* macros. + (map_fini): Likewise. + (init_streams_for_device): Change return type to bool, adjust + call to map_init. + (fini_streams_for_device): Change return type to bool, adjust + call to map_fini. + (select_stream_for_async): Release stream_lock before calls to + GOMP_PLUGIN_fatal, adjust call to map_init. + (nvptx_init): Use CUDA_CALL* macros. + (nvptx_attach_host_thread_to_device): Change return type to bool, + use CUDA_CALL* macros. + (nvptx_open_device): Use CUDA_CALL* macros. 
+ (nvptx_close_device): Change return type to bool, use CUDA_CALL* + macros. + (nvptx_get_num_devices): Use CUDA_CALL* macros. + (link_ptx): Change return type to bool, use CUDA_CALL* macros. + (nvptx_exec): Use CUDA_CALL* macros. + (nvptx_alloc): Use CUDA_CALL* macros. + (nvptx_free): Change return type to bool, use CUDA_CALL* macros. + (nvptx_host2dev): Likewise. + (nvptx_dev2host): Likewise. + (nvptx_wait): Use CUDA_CALL* macros. + (nvptx_wait_async): Likewise. + (nvptx_wait_all): Likewise. + (nvptx_wait_all_async): Likewise. + (nvptx_set_cuda_stream): Adjust order of stream_lock acquire, + use CUDA_CALL* macros, adjust call to map_fini. + (GOMP_OFFLOAD_init_device): Change return type to bool, + adjust code accordingly. + (GOMP_OFFLOAD_fini_device): Likewise. + (GOMP_OFFLOAD_load_image): Adjust calls to + nvptx_attach_host_thread_to_device/link_ptx to handle errors, + use CUDA_CALL* macros. + (GOMP_OFFLOAD_unload_image): Change return type to bool, adjust + return code. + (GOMP_OFFLOAD_alloc): Adjust calls to code to handle error return. + (GOMP_OFFLOAD_free): Change return type to bool, adjust calls to + handle error return. + (GOMP_OFFLOAD_dev2host): Likewise. + (GOMP_OFFLOAD_host2dev): Likewise. + (GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL* macros. + (GOMP_OFFLOAD_openacc_create_thread_data): Likewise. + 2016-05-24 Cesar Philippidis <cesar@codesourcery.com> * oacc-mem.c (acc_malloc): Update handling of shared-memory targets. 
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 664e76b52d1..f0c048b151b 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -930,16 +930,16 @@ struct gomp_device_descr unsigned int (*get_caps_func) (void); int (*get_type_func) (void); int (*get_num_devices_func) (void); - void (*init_device_func) (int); - void (*fini_device_func) (int); + bool (*init_device_func) (int); + bool (*fini_device_func) (int); unsigned (*version_func) (void); int (*load_image_func) (int, unsigned, const void *, struct addr_pair **); - void (*unload_image_func) (int, unsigned, const void *); + bool (*unload_image_func) (int, unsigned, const void *); void *(*alloc_func) (int, size_t); - void (*free_func) (int, void *); - void *(*dev2host_func) (int, void *, const void *, size_t); - void *(*host2dev_func) (int, void *, const void *, size_t); - void *(*dev2dev_func) (int, void *, const void *, size_t); + bool (*free_func) (int, void *); + bool (*dev2host_func) (int, void *, const void *, size_t); + bool (*host2dev_func) (int, void *, const void *, size_t); + bool (*dev2dev_func) (int, void *, const void *, size_t); bool (*can_run_func) (void *); void (*run_func) (int, void *, void *, void **); void (*async_run_func) (int, void *, void *, void **, void *); diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index 1e760f6d83a..a24899c7f60 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -60,14 +60,16 @@ host_get_num_devices (void) return 1; } -static void +static bool host_init_device (int n __attribute__ ((unused))) { + return true; } -static void +static bool host_fini_device (int n __attribute__ ((unused))) { + return true; } static unsigned @@ -85,11 +87,12 @@ host_load_image (int n __attribute__ ((unused)), return 0; } -static void +static bool host_unload_image (int n __attribute__ ((unused)), unsigned v __attribute__ ((unused)), const void *t __attribute__ ((unused))) { + return true; } static void * @@ -98,28 +101,29 @@ host_alloc (int n __attribute__ 
((unused)), size_t s) return gomp_malloc (s); } -static void +static bool host_free (int n __attribute__ ((unused)), void *p) { free (p); + return true; } -static void * +static bool host_dev2host (int n __attribute__ ((unused)), void *h __attribute__ ((unused)), const void *d __attribute__ ((unused)), size_t s __attribute__ ((unused))) { - return NULL; + return true; } -static void * +static bool host_host2dev (int n __attribute__ ((unused)), void *d __attribute__ ((unused)), const void *h __attribute__ ((unused)), size_t s __attribute__ ((unused))) { - return NULL; + return true; } static void diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index f6176eccd3f..f2325adfb46 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -300,8 +300,8 @@ acc_shutdown_1 (acc_device_t d) gomp_mutex_unlock (&goacc_thread_lock); - /* Close all the devices of this type that have been opened. */ + bool ret = true; for (i = 0; i < ndevs; i++) { struct gomp_device_descr *acc_dev = &base_dev[i]; @@ -309,12 +309,15 @@ acc_shutdown_1 (acc_device_t d) if (acc_dev->state == GOMP_DEVICE_INITIALIZED) { devices_active = true; - acc_dev->fini_device_func (acc_dev->target_id); + ret &= acc_dev->fini_device_func (acc_dev->target_id); acc_dev->state = GOMP_DEVICE_UNINITIALIZED; } gomp_mutex_unlock (&acc_dev->lock); } + if (!ret) + gomp_fatal ("device finalization failed"); + if (!devices_active) gomp_fatal ("no device initialized"); } diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 665e208cd36..2aaa0d295cb 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -149,7 +149,8 @@ acc_free (void *d) else gomp_mutex_unlock (&acc_dev->lock); - acc_dev->free_func (acc_dev->target_id, d); + if (!acc_dev->free_func (acc_dev->target_id, d)) + gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); } void @@ -167,7 +168,8 @@ acc_memcpy_to_device (void *d, void *h, size_t s) return; } - thr->dev->host2dev_func (thr->dev->target_id, d, h, s); + if 
(!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) + gomp_fatal ("error in %s", __FUNCTION__); } void @@ -185,7 +187,8 @@ acc_memcpy_from_device (void *h, void *d, size_t s) return; } - thr->dev->dev2host_func (thr->dev->target_id, h, d, s); + if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) + gomp_fatal ("error in %s", __FUNCTION__); } /* Return the device pointer that corresponds to host data H. Or NULL @@ -520,7 +523,7 @@ acc_present_or_copyin (void *h, size_t s) #define FLAG_COPYOUT (1 << 0) static void -delete_copyout (unsigned f, void *h, size_t s) +delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) { size_t host_size; splay_tree_key n; @@ -563,18 +566,20 @@ delete_copyout (unsigned f, void *h, size_t s) acc_unmap_data (h); - acc_dev->free_func (acc_dev->target_id, d); + if (!acc_dev->free_func (acc_dev->target_id, d)) + gomp_fatal ("error in freeing device memory in %s", libfnname); } void acc_delete (void *h , size_t s) { - delete_copyout (0, h, s); + delete_copyout (0, h, s, __FUNCTION__); } -void acc_copyout (void *h, size_t s) +void +acc_copyout (void *h, size_t s) { - delete_copyout (FLAG_COPYOUT, h, s); + delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); } static void @@ -604,12 +609,12 @@ update_dev_host (int is_dev, void *h, size_t s) d = (void *) (n->tgt->tgt_start + n->tgt_offset + (uintptr_t) h - n->host_start); - gomp_mutex_unlock (&acc_dev->lock); - if (is_dev) acc_dev->host2dev_func (acc_dev->target_id, d, h, s); else acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + + gomp_mutex_unlock (&acc_dev->lock); } void diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c index 0b3b67ac01c..bed8555fb90 100644 --- a/libgomp/plugin/plugin-hsa.c +++ b/libgomp/plugin/plugin-hsa.c @@ -175,10 +175,10 @@ hsa_warn (const char *str, hsa_status_t status) if (!debug) return; - const char *hsa_error; - hsa_status_string (status, &hsa_error); + const char *hsa_error_msg; + hsa_status_string (status, 
&hsa_error_msg); - fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error); + fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error_msg); } /* Report a fatal error STR together with the HSA error corresponding to STATUS @@ -187,10 +187,23 @@ hsa_warn (const char *str, hsa_status_t status) static void hsa_fatal (const char *str, hsa_status_t status) { - const char *hsa_error; - hsa_status_string (status, &hsa_error); + const char *hsa_error_msg; + hsa_status_string (status, &hsa_error_msg); GOMP_PLUGIN_fatal ("HSA fatal error: %s\nRuntime message: %s", str, - hsa_error); + hsa_error_msg); +} + +/* Like hsa_fatal, except only report error message, and return FALSE + for propagating error processing to outside of plugin. */ + +static bool +hsa_error (const char *str, hsa_status_t status) +{ + const char *hsa_error_msg; + hsa_status_string (status, &hsa_error_msg); + GOMP_PLUGIN_error ("HSA fatal error: %s\nRuntime message: %s", str, + hsa_error_msg); + return false; } struct hsa_kernel_description @@ -418,24 +431,25 @@ assign_agent_ids (hsa_agent_t agent, void *data) return HSA_STATUS_SUCCESS; } -/* Initialize hsa_context if it has not already been done. */ +/* Initialize hsa_context if it has not already been done. + Return TRUE on success. 
*/ -static void +static bool init_hsa_context (void) { hsa_status_t status; int agent_index = 0; if (hsa_context.initialized) - return; + return true; init_enviroment_variables (); status = hsa_init (); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Run-time could not be initialized", status); + return hsa_error ("Run-time could not be initialized", status); HSA_DEBUG ("HSA run-time initialized\n"); status = hsa_iterate_agents (count_gpu_agents, NULL); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("HSA GPU devices could not be enumerated", status); + return hsa_error ("HSA GPU devices could not be enumerated", status); HSA_DEBUG ("There are %i HSA GPU devices.\n", hsa_context.agent_count); hsa_context.agents @@ -443,8 +457,12 @@ init_hsa_context (void) * sizeof (struct agent_info)); status = hsa_iterate_agents (assign_agent_ids, &agent_index); if (agent_index != hsa_context.agent_count) - GOMP_PLUGIN_fatal ("Failed to assign IDs to all HSA agents"); + { + GOMP_PLUGIN_error ("Failed to assign IDs to all HSA agents"); + return false; + } hsa_context.initialized = true; + return true; } /* Callback of dispatch queues to report errors. */ @@ -492,75 +510,100 @@ get_kernarg_memory_region (hsa_region_t region, void *data) int GOMP_OFFLOAD_get_num_devices (void) { - init_hsa_context (); + if (!init_hsa_context ()) + return 0; return hsa_context.agent_count; } /* Part of the libgomp plugin interface. Initialize agent number N so that it - can be used for computation. */ + can be used for computation. Return TRUE on success. 
*/ -void +bool GOMP_OFFLOAD_init_device (int n) { - init_hsa_context (); + if (!init_hsa_context ()) + return false; if (n >= hsa_context.agent_count) - GOMP_PLUGIN_fatal ("Request to initialize non-existing HSA device %i", n); + { + GOMP_PLUGIN_error ("Request to initialize non-existing HSA device %i", n); + return false; + } struct agent_info *agent = &hsa_context.agents[n]; if (agent->initialized) - return; + return true; if (pthread_rwlock_init (&agent->modules_rwlock, NULL)) - GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent rwlock"); + { + GOMP_PLUGIN_error ("Failed to initialize an HSA agent rwlock"); + return false; + } if (pthread_mutex_init (&agent->prog_mutex, NULL)) - GOMP_PLUGIN_fatal ("Failed to initialize an HSA agent program mutex"); + { + GOMP_PLUGIN_error ("Failed to initialize an HSA agent program mutex"); + return false; + } uint32_t queue_size; hsa_status_t status; status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error requesting maximum queue size of the HSA agent", status); + return hsa_error ("Error requesting maximum queue size of the HSA agent", + status); status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_ISA, &agent->isa); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error querying the ISA of the agent", status); + return hsa_error ("Error querying the ISA of the agent", status); status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI, queue_callback, NULL, UINT32_MAX, UINT32_MAX, &agent->command_q); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error creating command queue", status); + return hsa_error ("Error creating command queue", status); status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI, queue_callback, NULL, UINT32_MAX, UINT32_MAX, &agent->kernel_dispatch_command_q); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error creating kernel dispatch command queue", status); + return hsa_error ("Error 
creating kernel dispatch command queue", status); agent->kernarg_region.handle = (uint64_t) -1; status = hsa_agent_iterate_regions (agent->id, get_kernarg_memory_region, &agent->kernarg_region); if (agent->kernarg_region.handle == (uint64_t) -1) - GOMP_PLUGIN_fatal ("Could not find suitable memory region for kernel " - "arguments"); + { + GOMP_PLUGIN_error ("Could not find suitable memory region for kernel " + "arguments"); + return false; + } HSA_DEBUG ("HSA agent initialized, queue has id %llu\n", (long long unsigned) agent->command_q->id); HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n", (long long unsigned) agent->kernel_dispatch_command_q->id); agent->initialized = true; + return true; } /* Verify that hsa_context has already been initialized and return the - agent_info structure describing device number N. */ + agent_info structure describing device number N. Return NULL on error. */ static struct agent_info * get_agent_info (int n) { if (!hsa_context.initialized) - GOMP_PLUGIN_fatal ("Attempt to use uninitialized HSA context."); + { + GOMP_PLUGIN_error ("Attempt to use uninitialized HSA context."); + return NULL; + } if (n >= hsa_context.agent_count) - GOMP_PLUGIN_fatal ("Request to operate on anon-existing HSA device %i", n); + { + GOMP_PLUGIN_error ("Request to operate on anon-existing HSA device %i", n); + return NULL; + } if (!hsa_context.agents[n].initialized) - GOMP_PLUGIN_fatal ("Attempt to use an uninitialized HSA agent."); + { + GOMP_PLUGIN_error ("Attempt to use an uninitialized HSA agent."); + return NULL; + } return &hsa_context.agents[n]; } @@ -590,13 +633,14 @@ remove_module_from_agent (struct agent_info *agent, struct module_info *module) } /* Free the HSA program in agent and everything associated with it and set - agent->prog_finalized and the initialized flags of all kernels to false. */ + agent->prog_finalized and the initialized flags of all kernels to false. + Return TRUE on success. 
*/ -static void +static bool destroy_hsa_program (struct agent_info *agent) { if (!agent->prog_finalized || agent->prog_finalized_error) - return; + return true; hsa_status_t status; @@ -604,7 +648,7 @@ destroy_hsa_program (struct agent_info *agent) status = hsa_executable_destroy (agent->executable); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Could not destroy HSA executable", status); + return hsa_error ("Could not destroy HSA executable", status); struct module_info *module; for (module = agent->first_module; module; module = module->next) @@ -614,6 +658,7 @@ destroy_hsa_program (struct agent_info *agent) module->kernels[i].initialized = false; } agent->prog_finalized = false; + return true; } /* Part of the libgomp plugin interface. Load BRIG module described by struct @@ -625,9 +670,12 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data, struct addr_pair **target_table) { if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA) - GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin" - " (expected %u, received %u)", - GOMP_VERSION_HSA, GOMP_VERSION_DEV (version)); + { + GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin" + " (expected %u, received %u)", + GOMP_VERSION_HSA, GOMP_VERSION_DEV (version)); + return -1; + } struct brig_image_desc *image_desc = (struct brig_image_desc *) target_data; struct agent_info *agent; @@ -637,10 +685,17 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data, int kernel_count = image_desc->kernel_count; agent = get_agent_info (ord); + if (!agent) + return -1; + if (pthread_rwlock_wrlock (&agent->modules_rwlock)) - GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock"); - if (agent->prog_finalized) - destroy_hsa_program (agent); + { + GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock"); + return -1; + } + if (agent->prog_finalized + && !destroy_hsa_program (agent)) + return -1; HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count); pair = 
GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair)); @@ -668,7 +723,10 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data, kernel->dependencies_count = d->kernel_dependencies_count; kernel->dependencies = d->kernel_dependencies; if (pthread_mutex_init (&kernel->init_mutex, NULL)) - GOMP_PLUGIN_fatal ("Failed to initialize an HSA kernel mutex"); + { + GOMP_PLUGIN_error ("Failed to initialize an HSA kernel mutex"); + return -1; + } kernel++; pair++; @@ -676,7 +734,10 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data, add_module_to_agent (agent, module); if (pthread_rwlock_unlock (&agent->modules_rwlock)) - GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock"); + { + GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock"); + return -1; + } return kernel_count; } @@ -1373,34 +1434,47 @@ GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, } /* Deinitialize all information associated with MODULE and kernels within - it. */ + it. Return TRUE on success. */ -void +static bool destroy_module (struct module_info *module) { int i; for (i = 0; i < module->kernel_count; i++) if (pthread_mutex_destroy (&module->kernels[i].init_mutex)) - GOMP_PLUGIN_fatal ("Failed to destroy an HSA kernel initialization " - "mutex"); + { + GOMP_PLUGIN_error ("Failed to destroy an HSA kernel initialization " + "mutex"); + return false; + } + return true; } /* Part of the libgomp plugin interface. Unload BRIG module described by - struct brig_image_desc in TARGET_DATA from agent number N. */ + struct brig_image_desc in TARGET_DATA from agent number N. Return + TRUE on success. 
*/ -void +bool GOMP_OFFLOAD_unload_image (int n, unsigned version, void *target_data) { if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA) - GOMP_PLUGIN_fatal ("Offload data incompatible with HSA plugin" - " (expected %u, received %u)", - GOMP_VERSION_HSA, GOMP_VERSION_DEV (version)); + { + GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin" + " (expected %u, received %u)", + GOMP_VERSION_HSA, GOMP_VERSION_DEV (version)); + return false; + } struct agent_info *agent; agent = get_agent_info (n); - if (pthread_rwlock_wrlock (&agent->modules_rwlock)) - GOMP_PLUGIN_fatal ("Unable to write-lock an HSA agent rwlock"); + if (!agent) + return false; + if (pthread_rwlock_wrlock (&agent->modules_rwlock)) + { + GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock"); + return false; + } struct module_info *module = agent->first_module; while (module) { @@ -1409,54 +1483,75 @@ GOMP_OFFLOAD_unload_image (int n, unsigned version, void *target_data) module = module->next; } if (!module) - GOMP_PLUGIN_fatal ("Attempt to unload an image that has never been " - "loaded before"); + { + GOMP_PLUGIN_error ("Attempt to unload an image that has never been " + "loaded before"); + return false; + } remove_module_from_agent (agent, module); - destroy_module (module); + if (!destroy_module (module)) + return false; free (module); - destroy_hsa_program (agent); + if (!destroy_hsa_program (agent)) + return false; if (pthread_rwlock_unlock (&agent->modules_rwlock)) - GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock"); + { + GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock"); + return false; + } + return true; } /* Part of the libgomp plugin interface. Deinitialize all information and status associated with agent number N. We do not attempt any synchronization, assuming the user and libgomp will not attempt deinitialization of a device that is in any way being used at the same - time. */ + time. Return TRUE on success. 
*/ -void +bool GOMP_OFFLOAD_fini_device (int n) { struct agent_info *agent = get_agent_info (n); + if (!agent) + return false; + if (!agent->initialized) - return; + return true; struct module_info *next_module = agent->first_module; while (next_module) { struct module_info *module = next_module; next_module = module->next; - destroy_module (module); + if (!destroy_module (module)) + return false; free (module); } agent->first_module = NULL; - destroy_hsa_program (agent); + if (!destroy_hsa_program (agent)) + return false; release_agent_shared_libraries (agent); hsa_status_t status = hsa_queue_destroy (agent->command_q); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error destroying command queue", status); + return hsa_error ("Error destroying command queue", status); status = hsa_queue_destroy (agent->kernel_dispatch_command_q); if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Error destroying kernel dispatch command queue", status); + return hsa_error ("Error destroying kernel dispatch command queue", status); if (pthread_mutex_destroy (&agent->prog_mutex)) - GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent program mutex"); + { + GOMP_PLUGIN_error ("Failed to destroy an HSA agent program mutex"); + return false; + } if (pthread_rwlock_destroy (&agent->modules_rwlock)) - GOMP_PLUGIN_fatal ("Failed to destroy an HSA agent rwlock"); + { + GOMP_PLUGIN_error ("Failed to destroy an HSA agent rwlock"); + return false; + } agent->initialized = false; + return true; } /* Part of the libgomp plugin interface. Not implemented as it is not required @@ -1465,46 +1560,51 @@ GOMP_OFFLOAD_fini_device (int n) void * GOMP_OFFLOAD_alloc (int ord, size_t size) { - GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_alloc is not implemented because " + GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_alloc is not implemented because " "it should never be called"); + return NULL; } /* Part of the libgomp plugin interface. Not implemented as it is not required for HSA. 
*/ -void +bool GOMP_OFFLOAD_free (int ord, void *ptr) { - GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_free is not implemented because " + GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_free is not implemented because " "it should never be called"); + return false; } /* Part of the libgomp plugin interface. Not implemented as it is not required for HSA. */ -void * +bool GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) { - GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2host is not implemented because " + GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2host is not implemented because " "it should never be called"); + return false; } /* Part of the libgomp plugin interface. Not implemented as it is not required for HSA. */ -void * +bool GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) { - GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_host2dev is not implemented because " + GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_host2dev is not implemented because " "it should never be called"); + return false; } /* Part of the libgomp plugin interface. Not implemented as it is not required for HSA. */ -void * +bool GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) { - GOMP_PLUGIN_fatal ("HSA GOMP_OFFLOAD_dev2dev is not implemented because " + GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2dev is not implemented because " "it should never be called"); + return false; } diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 4b5783344fb..2b6a888cbd2 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -63,6 +63,34 @@ cuda_error (CUresult r) return desc; } +/* Convenience macros for the frequently used CUDA library call and + error handling sequence. This does not capture all the cases we + use in this file, but is common enough. */ + +#define CUDA_CALL_ERET(ERET, FN, ...) 
\ + do { \ + unsigned __r = FN (__VA_ARGS__); \ + if (__r != CUDA_SUCCESS) \ + { \ + GOMP_PLUGIN_error (#FN " error: %s", \ + cuda_error (__r)); \ + return ERET; \ + } \ + } while (0) + +#define CUDA_CALL(FN, ...) \ + CUDA_CALL_ERET (false, (FN), __VA_ARGS__) + +#define CUDA_CALL_ASSERT(FN, ...) \ + do { \ + unsigned __r = FN (__VA_ARGS__); \ + if (__r != CUDA_SUCCESS) \ + { \ + GOMP_PLUGIN_fatal (#FN " error: %s", \ + cuda_error (__r)); \ + } \ + } while (0) + static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; @@ -98,24 +126,17 @@ struct map char mappings[0]; }; -static void +static bool map_init (struct ptx_stream *s) { - CUresult r; - int size = getpagesize (); assert (s); assert (!s->d); assert (!s->h); - r = cuMemAllocHost (&s->h, size); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r)); - - r = cuMemHostGetDevicePointer (&s->d, s->h, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r)); + CUDA_CALL (cuMemAllocHost, &s->h, size); + CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); assert (s->h); @@ -125,16 +146,14 @@ map_init (struct ptx_stream *s) assert (s->h_next); assert (s->h_end); + return true; } -static void +static bool map_fini (struct ptx_stream *s) { - CUresult r; - - r = cuMemFreeHost (s->h); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_error ("cuMemFreeHost error: %s", cuda_error (r)); + CUDA_CALL (cuMemFreeHost, s->h); + return true; } static void @@ -325,7 +344,7 @@ nvptx_thread (void) return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); } -static void +static bool init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) { int i; @@ -337,9 +356,10 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) null_stream->multithreaded = true; null_stream->d = (CUdeviceptr) NULL; null_stream->h = NULL; - map_init (null_stream); - ptx_dev->null_stream = null_stream; + 
if (!map_init (null_stream)) + return false; + ptx_dev->null_stream = null_stream; ptx_dev->active_streams = NULL; pthread_mutex_init (&ptx_dev->stream_lock, NULL); @@ -355,25 +375,35 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) for (i = 0; i < concurrency; i++) ptx_dev->async_streams.arr[i] = NULL; + + return true; } -static void +static bool fini_streams_for_device (struct ptx_device *ptx_dev) { free (ptx_dev->async_streams.arr); + bool ret = true; while (ptx_dev->active_streams != NULL) { struct ptx_stream *s = ptx_dev->active_streams; ptx_dev->active_streams = ptx_dev->active_streams->next; - map_fini (s); - cuStreamDestroy (s->stream); + ret &= map_fini (s); + + CUresult r = cuStreamDestroy (s->stream); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r)); + ret = false; + } free (s); } - map_fini (ptx_dev->null_stream); + ret &= map_fini (ptx_dev->null_stream); free (ptx_dev->null_stream); + return ret; } /* Select a stream for (OpenACC-semantics) ASYNC argument for the current @@ -447,7 +477,11 @@ select_stream_for_async (int async, pthread_t thread, bool create, { r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r)); + { + pthread_mutex_unlock (&ptx_dev->stream_lock); + GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", + cuda_error (r)); + } } /* If CREATE is true, we're going to be queueing some work on this @@ -457,7 +491,11 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->d = (CUdeviceptr) NULL; s->h = NULL; - map_init (s); + if (!map_init (s)) + { + pthread_mutex_unlock (&ptx_dev->stream_lock); + GOMP_PLUGIN_fatal ("map_init fail"); + } s->next = ptx_dev->active_streams; ptx_dev->active_streams = s; @@ -467,7 +505,11 @@ select_stream_for_async (int async, pthread_t thread, bool create, stream = ptx_dev->async_streams.arr[async]; } else if (async < 0) - GOMP_PLUGIN_fatal ("bad 
async %d", async); + { + if (create) + pthread_mutex_unlock (&ptx_dev->stream_lock); + GOMP_PLUGIN_fatal ("bad async %d", async); + } if (create) { @@ -498,34 +540,25 @@ select_stream_for_async (int async, pthread_t thread, bool create, static bool nvptx_init (void) { - CUresult r; int ndevs; if (instantiated_devices != 0) return true; - r = cuInit (0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r)); - + CUDA_CALL (cuInit, 0); ptx_events = NULL; - pthread_mutex_init (&ptx_event_lock, NULL); - r = cuDeviceGetCount (&ndevs); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r)); - + CUDA_CALL (cuDeviceGetCount, &ndevs); ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *) * ndevs); - return true; } /* Select the N'th PTX device for the current host thread. The device must have been previously opened before calling this function. */ -static void +static bool nvptx_attach_host_thread_to_device (int n) { CUdevice dev; @@ -535,34 +568,34 @@ nvptx_attach_host_thread_to_device (int n) r = cuCtxGetDevice (&dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) - GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + { + GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); + return false; + } if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n) - return; + return true; else { CUcontext old_ctx; ptx_dev = ptx_devices[n]; - assert (ptx_dev); + if (!ptx_dev) + { + GOMP_PLUGIN_error ("device %d not found", n); + return false; + } - r = cuCtxGetCurrent (&thd_ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); + CUDA_CALL (cuCtxGetCurrent, &thd_ctx); /* We don't necessarily have a current context (e.g. if it has been destroyed. Pop it if we do though. 
*/ if (thd_ctx != NULL) - { - r = cuCtxPopCurrent (&old_ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); - } + CUDA_CALL (cuCtxPopCurrent, &old_ctx); - r = cuCtxPushCurrent (ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r)); + CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx); } + return true; } static struct ptx_device * @@ -573,9 +606,7 @@ nvptx_open_device (int n) CUresult r; int async_engines, pi; - r = cuDeviceGet (&dev, n); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r)); + CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n); ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device)); @@ -585,60 +616,44 @@ nvptx_open_device (int n) r = cuCtxGetDevice (&ctx_dev); if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT) - GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r)); + { + GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r)); + return NULL; + } if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev) { /* The current host thread has an active context for a different device. Detach it. 
*/ CUcontext old_ctx; - - r = cuCtxPopCurrent (&old_ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r)); + CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx); } - r = cuCtxGetCurrent (&ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); + CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx); if (!ptx_dev->ctx) - { - r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r)); - } + CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev); else ptx_dev->ctx_shared = true; - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); - + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); ptx_dev->overlap = pi; - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); - + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); ptx_dev->map = pi; - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); - + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); ptx_dev->concur = pi; - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r)); - + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); ptx_dev->mode = pi; - r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", 
cuda_error (r)); - + CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, + &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); ptx_dev->mkern = pi; r = cuDeviceGetAttribute (&async_engines, @@ -649,38 +664,34 @@ nvptx_open_device (int n) ptx_dev->images = NULL; pthread_mutex_init (&ptx_dev->image_lock, NULL); - init_streams_for_device (ptx_dev, async_engines); + if (!init_streams_for_device (ptx_dev, async_engines)) + return NULL; return ptx_dev; } -static void +static bool nvptx_close_device (struct ptx_device *ptx_dev) { - CUresult r; - if (!ptx_dev) - return; + return true; - fini_streams_for_device (ptx_dev); + if (!fini_streams_for_device (ptx_dev)) + return false; pthread_mutex_destroy (&ptx_dev->image_lock); if (!ptx_dev->ctx_shared) - { - r = cuCtxDestroy (ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r)); - } + CUDA_CALL (cuCtxDestroy, ptx_dev->ctx); free (ptx_dev); + return true; } static int nvptx_get_num_devices (void) { int n; - CUresult r; /* PR libgomp/65099: Currently, we only support offloading in 64-bit configurations. */ @@ -693,22 +704,19 @@ nvptx_get_num_devices (void) further initialization). */ if (instantiated_devices == 0) { - r = cuInit (0); + CUresult r = cuInit (0); /* This is not an error: e.g. we may have CUDA libraries installed but no devices available. 
*/ if (r != CUDA_SUCCESS) return 0; } - r = cuDeviceGetCount (&n); - if (r!= CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r)); - + CUDA_CALL_ERET (-1, cuDeviceGetCount, &n); return n; } -static void +static bool link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, unsigned num_objs) { @@ -742,9 +750,7 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, opts[5] = CU_JIT_LOG_VERBOSE; optvals[5] = (void *) 1; - r = cuLinkCreate (6, opts, optvals, &linkstate); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r)); + CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate); for (; num_objs--; ptx_objs++) { @@ -756,8 +762,9 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, if (r != CUDA_SUCCESS) { GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); - GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", + GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s", cuda_error (r)); + return false; } } @@ -768,15 +775,14 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]); if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r)); - - r = cuModuleLoadData (module, linkout); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r)); + { + GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r)); + return false; + } - r = cuLinkDestroy (linkstate); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLinkDestory error: %s", cuda_error (r)); + CUDA_CALL (cuModuleLoadData, module, linkout); + CUDA_CALL (cuLinkDestroy, linkstate); + return true; } static void @@ -923,10 +929,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). 
*/ - r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); - + CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, + mapnum * sizeof (void *)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, @@ -939,12 +943,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, // vector length ntid.x kargs[0] = &dp; - r = cuLaunchKernel (function, - dims[GOMP_DIM_GANG], 1, 1, - dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, - 0, dev_str->stream, kargs, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuLaunchKernel, function, + dims[GOMP_DIM_GANG], 1, 1, + dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, + 0, dev_str->stream, kargs, 0); #ifndef DISABLE_ASYNC if (async < acc_async_noval) @@ -971,9 +973,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, event_gc (true); - r = cuEventRecord (*e, dev_str->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); event_add (PTX_EVT_KNL, e, (void *)dev_str); } @@ -1001,163 +1001,139 @@ static void * nvptx_alloc (size_t s) { CUdeviceptr d; - CUresult r; - r = cuMemAlloc (&d, s); - if (r == CUDA_ERROR_OUT_OF_MEMORY) - return 0; - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r)); - return (void *)d; + CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s); + return (void *) d; } -static void +static bool nvptx_free (void *p) { - CUresult r; CUdeviceptr pb; size_t ps; - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); - - if ((CUdeviceptr)p != pb) - GOMP_PLUGIN_fatal ("invalid device address"); + CUDA_CALL 
(cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p); + if ((CUdeviceptr) p != pb) + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } - r = cuMemFree ((CUdeviceptr)p); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r)); + CUDA_CALL (cuMemFree, (CUdeviceptr) p); + return true; } -static void * + +static bool nvptx_host2dev (void *d, const void *h, size_t s) { - CUresult r; CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) - return 0; - + return true; if (!d) - GOMP_PLUGIN_fatal ("invalid device address"); + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); + CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); if (!pb) - GOMP_PLUGIN_fatal ("invalid device address"); - + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } if (!h) - GOMP_PLUGIN_fatal ("invalid host address"); - + { + GOMP_PLUGIN_error ("invalid host address"); + return false; + } if (d == h) - GOMP_PLUGIN_fatal ("invalid host or device address"); - + { + GOMP_PLUGIN_error ("invalid host or device address"); + return false; + } if ((void *)(d + s) > (void *)(pb + ps)) - GOMP_PLUGIN_fatal ("invalid size"); + { + GOMP_PLUGIN_error ("invalid size"); + return false; + } #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); - + CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (false); - - r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, - nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal 
("cuMemcpyHtoDAsync error: %s", cuda_error (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); - + CUDA_CALL (cuMemcpyHtoDAsync, + (CUdeviceptr) d, h, s, nvthd->current_stream->stream); + CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); event_add (PTX_EVT_MEM, e, (void *)h); } else #endif - { - r = cuMemcpyHtoD ((CUdeviceptr)d, h, s); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r)); - } + CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s); - return 0; + return true; } -static void * +static bool nvptx_dev2host (void *h, const void *d, size_t s) { - CUresult r; CUdeviceptr pb; size_t ps; struct nvptx_thread *nvthd = nvptx_thread (); if (!s) - return 0; - + return true; if (!d) - GOMP_PLUGIN_fatal ("invalid device address"); + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } - r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r)); + CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d); if (!pb) - GOMP_PLUGIN_fatal ("invalid device address"); - + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } if (!h) - GOMP_PLUGIN_fatal ("invalid host address"); - + { + GOMP_PLUGIN_error ("invalid host address"); + return false; + } if (d == h) - GOMP_PLUGIN_fatal ("invalid host or device address"); - + { + GOMP_PLUGIN_error ("invalid host or device address"); + return false; + } if ((void *)(d + s) > (void *)(pb + ps)) - GOMP_PLUGIN_fatal ("invalid size"); + { + GOMP_PLUGIN_error ("invalid size"); + return false; + } #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { - CUevent *e; - - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: 
%s\n", cuda_error (r)); - + CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); + CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); event_gc (false); - - r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, - nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); - + CUDA_CALL (cuMemcpyDtoHAsync, + h, (CUdeviceptr) d, s, nvthd->current_stream->stream); + CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream); event_add (PTX_EVT_MEM, e, (void *)h); } else #endif - { - r = cuMemcpyDtoH (h, (CUdeviceptr)d, s); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r)); - } + CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s); - return 0; + return true; } static void @@ -1227,17 +1203,13 @@ nvptx_async_test_all (void) static void nvptx_wait (int async) { - CUresult r; struct ptx_stream *s; s = select_stream_for_async (async, pthread_self (), false, NULL); - if (!s) GOMP_PLUGIN_fatal ("unknown async %d", async); - r = cuStreamSynchronize (s->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); event_gc (true); } @@ -1245,7 +1217,6 @@ nvptx_wait (int async) static void nvptx_wait_async (int async1, int async2) { - CUresult r; CUevent *e; struct ptx_stream *s1, *s2; pthread_t self = pthread_self (); @@ -1261,23 +1232,17 @@ nvptx_wait_async (int async1, int async2) if (s1 == s2) GOMP_PLUGIN_fatal ("identical parameters"); - e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); + e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuEventCreate, e, 
CU_EVENT_DISABLE_TIMING); event_gc (true); - r = cuEventRecord (*e, s1->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream); event_add (PTX_EVT_SYNC, e, NULL); - r = cuStreamWaitEvent (s2->stream, *e, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0); } static void @@ -1302,9 +1267,7 @@ nvptx_wait_all (void) else if (r != CUDA_ERROR_NOT_READY) GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); - r = cuStreamSynchronize (s->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); } } @@ -1316,7 +1279,6 @@ nvptx_wait_all (void) static void nvptx_wait_all_async (int async) { - CUresult r; struct ptx_stream *waiting_stream, *other_stream; CUevent *e; struct nvptx_thread *nvthd = nvptx_thread (); @@ -1346,20 +1308,14 @@ nvptx_wait_all_async (int async) e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); /* Record an event on the waited-for stream. 
*/ - r = cuEventRecord (*e, other_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream); event_add (PTX_EVT_SYNC, e, NULL); - r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0); } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); @@ -1408,11 +1364,11 @@ nvptx_set_cuda_stream (int async, void *stream) pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); - pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); - if (async < 0) GOMP_PLUGIN_fatal ("bad async %d", async); + pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); + /* We have a list of active streams and an array mapping async values to entries of that list. We need to take "ownership" of the passed-in stream, and add it to our list, removing the previous entry also (if there was one) @@ -1435,8 +1391,11 @@ nvptx_set_cuda_stream (int async, void *stream) s->next = s->next->next; } - cuStreamDestroy (oldstream->stream); - map_fini (oldstream); + CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream); + + if (!map_fini (oldstream)) + GOMP_PLUGIN_fatal ("error when freeing host memory"); + free (oldstream); } @@ -1473,37 +1432,50 @@ GOMP_OFFLOAD_get_num_devices (void) return nvptx_get_num_devices (); } -void +bool GOMP_OFFLOAD_init_device (int n) { + struct ptx_device *dev; + pthread_mutex_lock (&ptx_dev_lock); if (!nvptx_init () || ptx_devices[n] != NULL) { pthread_mutex_unlock (&ptx_dev_lock); - return; + return false; } - ptx_devices[n] = nvptx_open_device (n); - instantiated_devices++; + dev = nvptx_open_device (n); + if (dev) + { + ptx_devices[n] = dev; + instantiated_devices++; + } pthread_mutex_unlock (&ptx_dev_lock); + + return dev != NULL; } -void +bool GOMP_OFFLOAD_fini_device (int n) { 
pthread_mutex_lock (&ptx_dev_lock); if (ptx_devices[n] != NULL) { - nvptx_attach_host_thread_to_device (n); - nvptx_close_device (ptx_devices[n]); + if (!nvptx_attach_host_thread_to_device (n) + || !nvptx_close_device (ptx_devices[n])) + { + pthread_mutex_unlock (&ptx_dev_lock); + return false; + } ptx_devices[n] = NULL; instantiated_devices--; } pthread_mutex_unlock (&ptx_dev_lock); + return true; } /* Return the libgomp version number we're compatible with. There is @@ -1526,7 +1498,6 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, const char *const *var_names; const struct targ_fn_launch *fn_descs; unsigned int fn_entries, var_entries, i, j; - CUresult r; struct targ_fn_descriptor *targ_fns; struct addr_pair *targ_tbl; const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data; @@ -1534,17 +1505,18 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, struct ptx_device *dev; if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX) - GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin" - " (expected %u, received %u)", - GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); - - GOMP_OFFLOAD_init_device (ord); + { + GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin" + " (expected %u, received %u)", + GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); + return -1; + } - dev = ptx_devices[ord]; - - nvptx_attach_host_thread_to_device (ord); + if (!nvptx_attach_host_thread_to_device (ord) + || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num)) + return -1; - link_ptx (&module, img_header->ptx_objs, img_header->ptx_num); + dev = ptx_devices[ord]; /* The mkoffload utility emits a struct of pointers/integers at the start of each offload image. 
The array of kernel names and the @@ -1576,9 +1548,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, { CUfunction function; - r = cuModuleGetFunction (&function, module, fn_descs[i].fn); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r)); + CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, + fn_descs[i].fn); targ_fns->fn = function; targ_fns->launch = &fn_descs[i]; @@ -1592,9 +1563,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, CUdeviceptr var; size_t bytes; - r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r)); + CUDA_CALL_ERET (-1, cuModuleGetGlobal, + &var, &bytes, module, var_names[j]); targ_tbl->start = (uintptr_t) var; targ_tbl->end = targ_tbl->start + bytes; @@ -1606,54 +1576,63 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data, /* Unload the program described by TARGET_DATA. DEV_DATA is the function descriptors allocated by G_O_load_image. 
*/ -void +bool GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) { struct ptx_image_data *image, **prev_p; struct ptx_device *dev = ptx_devices[ord]; if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX) - return; - + { + GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin" + " (expected %u, received %u)", + GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version)); + return false; + } + + bool ret = true; pthread_mutex_lock (&dev->image_lock); for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next) if (image->target_data == target_data) { *prev_p = image->next; - cuModuleUnload (image->module); + if (cuModuleUnload (image->module) != CUDA_SUCCESS) + ret = false; free (image->fns); free (image); break; } pthread_mutex_unlock (&dev->image_lock); + return ret; } void * GOMP_OFFLOAD_alloc (int ord, size_t size) { - nvptx_attach_host_thread_to_device (ord); + if (!nvptx_attach_host_thread_to_device (ord)) + return NULL; return nvptx_alloc (size); } -void +bool GOMP_OFFLOAD_free (int ord, void *ptr) { - nvptx_attach_host_thread_to_device (ord); - nvptx_free (ptr); + return (nvptx_attach_host_thread_to_device (ord) + && nvptx_free (ptr)); } -void * +bool GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) { - nvptx_attach_host_thread_to_device (ord); - return nvptx_dev2host (dst, src, n); + return (nvptx_attach_host_thread_to_device (ord) + && nvptx_dev2host (dst, src, n)); } -void * +bool GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n) { - nvptx_attach_host_thread_to_device (ord); - return nvptx_host2dev (dst, src, n); + return (nvptx_attach_host_thread_to_device (ord) + && nvptx_host2dev (dst, src, n)); } void (*device_run) (int n, void *fn_ptr, void *vars) = NULL; @@ -1669,20 +1648,11 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum, void GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc) { - CUevent *e; - CUresult r; struct nvptx_thread 
*nvthd = nvptx_thread (); + CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); - - r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); - - r = cuEventRecord (*e, nvthd->current_stream->stream); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); - + CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING); + CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream); event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); } @@ -1734,25 +1704,18 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord) struct ptx_device *ptx_dev; struct nvptx_thread *nvthd = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread)); - CUresult r; CUcontext thd_ctx; ptx_dev = ptx_devices[ord]; assert (ptx_dev); - r = cuCtxGetCurrent (&thd_ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r)); + CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx); assert (ptx_dev->ctx); if (!thd_ctx) - { - r = cuCtxPushCurrent (ptx_dev->ctx); - if (r != CUDA_SUCCESS) - GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r)); - } + CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx); nvthd->current_stream = ptx_dev->null_stream; nvthd->ptx_dev = ptx_dev; diff --git a/libgomp/target.c b/libgomp/target.c index e2dd0e08997..5a86fc077e6 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -162,6 +162,45 @@ gomp_map_0len_lookup (splay_tree mem_map, splay_tree_key key) return n; } +static inline void +gomp_device_copy (struct gomp_device_descr *devicep, + bool (*copy_func) (int, void *, const void *, size_t), + const char *dst, void *dstaddr, + const char *src, const void *srcaddr, + size_t size) +{ + if (!copy_func (devicep->target_id, dstaddr, srcaddr, size)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) 
failed", + src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size); + } +} + +static void +gomp_copy_host2dev (struct gomp_device_descr *devicep, + void *d, const void *h, size_t sz) +{ + gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz); +} + +static void +gomp_copy_dev2host (struct gomp_device_descr *devicep, + void *h, const void *d, size_t sz) +{ + gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); +} + +static void +gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr) +{ + if (!devicep->free_func (devicep->target_id, devptr)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("error in freeing device memory block at %p", devptr); + } +} + /* Handle the case where gomp_map_lookup, splay_tree_lookup or gomp_map_0len_lookup found oldn for newn. Helper function of gomp_map_vars. */ @@ -189,11 +228,12 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn, } if (GOMP_MAP_ALWAYS_TO_P (kind)) - devicep->host2dev_func (devicep->target_id, - (void *) (oldn->tgt->tgt_start + oldn->tgt_offset - + newn->host_start - oldn->host_start), - (void *) newn->host_start, - newn->host_end - newn->host_start); + gomp_copy_host2dev (devicep, + (void *) (oldn->tgt->tgt_start + oldn->tgt_offset + + newn->host_start - oldn->host_start), + (void *) newn->host_start, + newn->host_end - newn->host_start); + if (oldn->refcount != REFCOUNT_INFINITY) oldn->refcount++; } @@ -218,10 +258,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, { cur_node.tgt_offset = (uintptr_t) NULL; /* FIXME: see comment about coalescing host/dev transfers below. */ - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start + target_offset), - (void *) &cur_node.tgt_offset, - sizeof (void *)); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + target_offset), + (void *) &cur_node.tgt_offset, + sizeof (void *)); return; } /* Add bias to the pointer value. 
*/ @@ -241,10 +281,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, to initialize the pointer with. */ cur_node.tgt_offset -= bias; /* FIXME: see comment about coalescing host/dev transfers below. */ - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start + target_offset), - (void *) &cur_node.tgt_offset, - sizeof (void *)); + gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset), + (void *) &cur_node.tgt_offset, sizeof (void *)); } static void @@ -515,6 +553,12 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, memory. */ tgt->to_free = devicep->alloc_func (devicep->target_id, tgt_size + tgt_align - 1); + if (!tgt->to_free) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("device memory allocation fail"); + } + tgt->tgt_start = (uintptr_t) tgt->to_free; tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1); tgt->tgt_end = tgt->tgt_start + tgt_size; @@ -554,9 +598,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, tgt_size = (tgt_size + align - 1) & ~(align - 1); tgt->list[i].offset = tgt_size; len = sizes[i]; - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start + tgt_size), - (void *) hostaddrs[i], len); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + tgt_size), + (void *) hostaddrs[i], len); tgt_size += len; continue; case GOMP_MAP_FIRSTPRIVATE_INT: @@ -608,13 +652,13 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1); if (cur_node.tgt_offset) cur_node.tgt_offset -= sizes[i]; - devicep->host2dev_func (devicep->target_id, - (void *) (n->tgt->tgt_start - + n->tgt_offset - + cur_node.host_start - - n->host_start), - (void *) &cur_node.tgt_offset, - sizeof (void *)); + gomp_copy_host2dev (devicep, + (void *) (n->tgt->tgt_start + + n->tgt_offset + + cur_node.host_start + - n->host_start), + (void *) &cur_node.tgt_offset, + sizeof (void *)); 
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset + cur_node.host_start - n->host_start; continue; @@ -685,11 +729,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, /* FIXME: Perhaps add some smarts, like if copying several adjacent fields from host to target, use some host buffer to avoid sending each var individually. */ - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start - + k->tgt_offset), - (void *) k->host_start, - k->host_end - k->host_start); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + + k->tgt_offset), + (void *) k->host_start, + k->host_end - k->host_start); break; case GOMP_MAP_POINTER: gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start, @@ -697,11 +741,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, break; case GOMP_MAP_TO_PSET: /* FIXME: see above FIXME comment. */ - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start - + k->tgt_offset), - (void *) k->host_start, - k->host_end - k->host_start); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + + k->tgt_offset), + (void *) k->host_start, + k->host_end - k->host_start); for (j = i + 1; j < mapnum; j++) if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds, @@ -748,12 +792,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, break; case GOMP_MAP_FORCE_DEVICEPTR: assert (k->host_end - k->host_start == sizeof (void *)); - - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start - + k->tgt_offset), - (void *) k->host_start, - sizeof (void *)); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + + k->tgt_offset), + (void *) k->host_start, + sizeof (void *)); break; default: gomp_mutex_unlock (&devicep->lock); @@ -781,11 +824,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, { cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i); /* FIXME: see above FIXME comment. 
*/ - devicep->host2dev_func (devicep->target_id, - (void *) (tgt->tgt_start - + i * sizeof (void *)), - (void *) &cur_node.tgt_offset, - sizeof (void *)); + gomp_copy_host2dev (devicep, + (void *) (tgt->tgt_start + i * sizeof (void *)), + (void *) &cur_node.tgt_offset, sizeof (void *)); } } @@ -807,7 +848,7 @@ gomp_unmap_tgt (struct target_mem_desc *tgt) { /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */ if (tgt->tgt_end) - tgt->device_descr->free_func (tgt->device_descr->target_id, tgt->to_free); + gomp_free_device_memory (tgt->device_descr, tgt->to_free); free (tgt->array); free (tgt); @@ -839,9 +880,9 @@ gomp_copy_from_async (struct target_mem_desc *tgt) { splay_tree_key k = tgt->list[i].key; if (tgt->list[i].copy_from) - devicep->dev2host_func (devicep->target_id, (void *) k->host_start, - (void *) (k->tgt->tgt_start + k->tgt_offset), - k->host_end - k->host_start); + gomp_copy_dev2host (devicep, (void *) k->host_start, + (void *) (k->tgt->tgt_start + k->tgt_offset), + k->host_end - k->host_start); } gomp_mutex_unlock (&devicep->lock); @@ -894,11 +935,11 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) if ((do_unmap && do_copyfrom && tgt->list[i].copy_from) || tgt->list[i].always_copy_from) - devicep->dev2host_func (devicep->target_id, - (void *) (k->host_start + tgt->list[i].offset), - (void *) (k->tgt->tgt_start + k->tgt_offset - + tgt->list[i].offset), - tgt->list[i].length); + gomp_copy_dev2host (devicep, + (void *) (k->host_start + tgt->list[i].offset), + (void *) (k->tgt->tgt_start + k->tgt_offset + + tgt->list[i].offset), + tgt->list[i].length); if (do_unmap) { splay_tree_remove (&devicep->mem_map, k); @@ -961,22 +1002,17 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, (void *) n->host_start, (void *) n->host_end); } + + + void *hostaddr = (void *) cur_node.host_start; + void *devaddr = (void *) (n->tgt->tgt_start + n->tgt_offset + + cur_node.host_start - n->host_start); + size_t 
size = cur_node.host_end - cur_node.host_start; + if (GOMP_MAP_COPY_TO_P (kind & typemask)) - devicep->host2dev_func (devicep->target_id, - (void *) (n->tgt->tgt_start - + n->tgt_offset - + cur_node.host_start - - n->host_start), - (void *) cur_node.host_start, - cur_node.host_end - cur_node.host_start); + gomp_copy_host2dev (devicep, devaddr, hostaddr, size); if (GOMP_MAP_COPY_FROM_P (kind & typemask)) - devicep->dev2host_func (devicep->target_id, - (void *) cur_node.host_start, - (void *) (n->tgt->tgt_start - + n->tgt_offset - + cur_node.host_start - - n->host_start), - cur_node.host_end - cur_node.host_start); + gomp_copy_dev2host (devicep, hostaddr, devaddr, size); } } gomp_mutex_unlock (&devicep->lock); @@ -1114,7 +1150,11 @@ gomp_unload_image_from_device (struct gomp_device_descr *devicep, node = splay_tree_lookup (&devicep->mem_map, &k); } - devicep->unload_image_func (devicep->target_id, version, target_data); + if (!devicep->unload_image_func (devicep->target_id, version, target_data)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("image unload fail"); + } /* Remove mappings from splay tree. */ int i; @@ -1261,7 +1301,11 @@ attribute_hidden void gomp_init_device (struct gomp_device_descr *devicep) { int i; - devicep->init_device_func (devicep->target_id); + if (!devicep->init_device_func (devicep->target_id)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("device initialization failed"); + } /* Load to device all images registered by the moment. 
*/ for (i = 0; i < num_offload_images; i++) @@ -1765,12 +1809,11 @@ gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum, if ((kind == GOMP_MAP_FROM && k->refcount == 0) || kind == GOMP_MAP_ALWAYS_FROM) - devicep->dev2host_func (devicep->target_id, - (void *) cur_node.host_start, - (void *) (k->tgt->tgt_start + k->tgt_offset - + cur_node.host_start - - k->host_start), - cur_node.host_end - cur_node.host_start); + gomp_copy_dev2host (devicep, (void *) cur_node.host_start, + (void *) (k->tgt->tgt_start + k->tgt_offset + + cur_node.host_start + - k->host_start), + cur_node.host_end - cur_node.host_start); if (k->refcount == 0) { splay_tree_remove (&devicep->mem_map, k); @@ -2001,7 +2044,7 @@ omp_target_free (void *device_ptr, int device_num) } gomp_mutex_lock (&devicep->lock); - devicep->free_func (devicep->target_id, device_ptr); + gomp_free_device_memory (devicep, device_ptr); gomp_mutex_unlock (&devicep->lock); } @@ -2042,6 +2085,7 @@ omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, size_t src_offset, int dst_device_num, int src_device_num) { struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL; + bool ret; if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK) { @@ -2077,29 +2121,29 @@ omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, if (src_devicep == NULL) { gomp_mutex_lock (&dst_devicep->lock); - dst_devicep->host2dev_func (dst_devicep->target_id, - (char *) dst + dst_offset, - (char *) src + src_offset, length); + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_offset, + (char *) src + src_offset, length); gomp_mutex_unlock (&dst_devicep->lock); - return 0; + return (ret ? 
0 : EINVAL); } if (dst_devicep == NULL) { gomp_mutex_lock (&src_devicep->lock); - src_devicep->dev2host_func (src_devicep->target_id, - (char *) dst + dst_offset, - (char *) src + src_offset, length); + ret = src_devicep->dev2host_func (src_devicep->target_id, + (char *) dst + dst_offset, + (char *) src + src_offset, length); gomp_mutex_unlock (&src_devicep->lock); - return 0; + return (ret ? 0 : EINVAL); } if (src_devicep == dst_devicep) { gomp_mutex_lock (&src_devicep->lock); - src_devicep->dev2dev_func (src_devicep->target_id, - (char *) dst + dst_offset, - (char *) src + src_offset, length); + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_offset, + (char *) src + src_offset, length); gomp_mutex_unlock (&src_devicep->lock); - return 0; + return (ret ? 0 : EINVAL); } return EINVAL; } @@ -2126,22 +2170,25 @@ omp_target_memcpy_rect_worker (void *dst, void *src, size_t element_size, || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; if (dst_devicep == NULL && src_devicep == NULL) - memcpy ((char *) dst + dst_off, (char *) src + src_off, length); + { + memcpy ((char *) dst + dst_off, (char *) src + src_off, length); + ret = 1; + } else if (src_devicep == NULL) - dst_devicep->host2dev_func (dst_devicep->target_id, - (char *) dst + dst_off, - (char *) src + src_off, length); + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, + (char *) src + src_off, length); else if (dst_devicep == NULL) - src_devicep->dev2host_func (src_devicep->target_id, - (char *) dst + dst_off, - (char *) src + src_off, length); + ret = src_devicep->dev2host_func (src_devicep->target_id, + (char *) dst + dst_off, + (char *) src + src_off, length); else if (src_devicep == dst_devicep) - src_devicep->dev2dev_func (src_devicep->target_id, - (char *) dst + dst_off, - (char *) src + src_off, length); + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_off, + (char *) src + 
src_off, length); else - return EINVAL; - return 0; + ret = 0; + return ret ? 0 : EINVAL; } /* FIXME: it would be nice to have some plugin function to handle @@ -2456,14 +2503,17 @@ gomp_target_fini (void) int i; for (i = 0; i < num_devices; i++) { + bool ret = true; struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); if (devicep->state == GOMP_DEVICE_INITIALIZED) { - devicep->fini_device_func (devicep->target_id); + ret = devicep->fini_device_func (devicep->target_id); devicep->state = GOMP_DEVICE_FINALIZED; } gomp_mutex_unlock (&devicep->lock); + if (!ret) + gomp_fatal ("device finalization failed"); } } |