author    Chenle Yu <chenle.yu@bsc.es>                  2023-05-15 09:56:48 -0500
committer Jose M Monsalve Diaz <jmonsalvediaz@anl.gov>  2023-05-15 10:00:55 -0500
commit    36d4e4c9b5f6cd0577b6029055b825caaec2dd11 (patch)
tree      a0ba2fed1ffe2a6bd4dc5da7321f4aef5461dfcf /openmp
parent    0f1fb626b3314386d3885f328cae36b830a04901 (diff)
[OpenMP] Implement task record and replay mechanism
This patch implements the "task record and replay" mechanism. The idea is to store tasks and their dependencies in the runtime so that subsequent executions do not pay the cost of task creation and dependency resolution again. The objective is to improve fine-grained task performance, for tasks coming from both "omp task" and "taskloop".

The entry point of the recording phase is __kmpc_start_record_task, and the end of the record is triggered by __kmpc_end_record_task. Tasks encapsulated between a record start and a record end are saved: the runtime stores their dependencies and structures, referred to as a TDG (task dependency graph), in order to replay them in subsequent executions. A replay starts by scheduling all root tasks (tasks that have no input dependencies); no hash table is involved in tracking the dependencies, and tasks do not need to be created again.

At the beginning of __kmpc_start_record_task, we check whether a TDG has already been recorded. If it has, the function returns 0 and replays the TDG by calling __kmp_exec_tdg; if not, recording starts and the function returns 1. A TDG is uniquely identified by an integer. Currently, this identifier must be incremented manually by the caller in the source code; depending on how this feature is eventually used in the library, the caller also needs a mechanism to skip the associated region according to the return value of __kmpc_start_record_task.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D146642
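For illustration, a minimal sketch of the caller pattern this interface expects, mirroring the tests added below (the ident_t emulation and the zero flags/tdg_id values come from those tests; this is not a public header):

// Emulation of compiler-generated code, as in the new tests.
typedef struct ident {
  void *dummy;
} ident_t;

extern "C" {
int __kmpc_global_thread_num(ident_t *);
int __kmpc_start_record_task(ident_t *, int gtid, int input_flags, int tdg_id);
void __kmpc_end_record_task(ident_t *, int gtid, int input_flags, int tdg_id);
}

void record_replay_region(int iterations) {
  for (int iter = 0; iter < iterations; ++iter) {
    int gtid = __kmpc_global_thread_num(nullptr);
    // Returns 1 on the first encounter (record), 0 afterwards (replay).
    int res = __kmpc_start_record_task(nullptr, gtid, /* flags */ 0, /* tdg_id */ 0);
    if (res) { // the caller must skip the region when the TDG is replayed
#pragma omp task
      { /* task body: created once while recording, reused on replay */ }
    }
    __kmpc_end_record_task(nullptr, gtid, /* flags */ 0, /* tdg_id */ 0);
  }
}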
Diffstat (limited to 'openmp')
 openmp/runtime/CMakeLists.txt                               |   5
 openmp/runtime/src/kmp.h                                    |  79
 openmp/runtime/src/kmp_config.h.cmake                       |   2
 openmp/runtime/src/kmp_global.cpp                           |  12
 openmp/runtime/src/kmp_settings.cpp                         |  16
 openmp/runtime/src/kmp_taskdeps.cpp                         | 123
 openmp/runtime/src/kmp_taskdeps.h                           |  25
 openmp/runtime/src/kmp_tasking.cpp                          | 420
 openmp/runtime/test/CMakeLists.txt                          |   1
 openmp/runtime/test/lit.cfg                                 |   3
 openmp/runtime/test/lit.site.cfg.in                         |   1
 openmp/runtime/test/tasking/omp_record_replay.cpp           |  48
 openmp/runtime/test/tasking/omp_record_replay_deps.cpp      |  63
 openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp |  76
 openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp  |  50
15 files changed, 917 insertions, 7 deletions
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 43299ddbad84..2b7a3eb5bfce 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -342,6 +342,10 @@ if(LIBOMP_OMPD_SUPPORT AND ((NOT LIBOMP_OMPT_SUPPORT) OR (NOT "${CMAKE_SYSTEM_NA
set(LIBOMP_OMPD_SUPPORT FALSE)
endif()
+# OMPX Taskgraph support
+# Whether to build with OMPX Taskgraph (e.g. task record & replay)
+set(LIBOMP_OMPX_TASKGRAPH FALSE CACHE BOOL "OMPX-taskgraph (task record & replay)?")
+
# Error check hwloc support after config-ix has run
if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC))
libomp_error_say("Hwloc requested but not available")
@@ -411,6 +415,7 @@ if(${OPENMP_STANDALONE_BUILD})
libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}")
libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}")
+ libomp_say("Use OMPX-taskgraph -- ${LIBOMP_OMPX_TASKGRAPH}")
endif()
add_subdirectory(src)
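The new option defaults to off; in a standalone runtime build it can be enabled at configure time with, e.g., cmake -DLIBOMP_OMPX_TASKGRAPH=TRUE <llvm-project>/openmp (an illustrative invocation), which in turn defines OMPX_TASKGRAPH for the sources via the kmp_config.h.cmake hunk below.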
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index bd50d58f0fdc..251a5f0ce76f 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2487,6 +2487,62 @@ typedef struct {
} ed;
} kmp_event_t;
+#if OMPX_TASKGRAPH
+// Initial number of allocated nodes while recording
+#define INIT_MAPSIZE 50
+
+typedef struct kmp_taskgraph_flags { /* This needs to be exactly 32 bits */
+ unsigned nowait : 1;
+ unsigned re_record : 1;
+ unsigned reserved : 30;
+} kmp_taskgraph_flags_t;
+
+/// Represents a TDG node
+typedef struct kmp_node_info {
+ kmp_task_t *task; // Pointer to the actual task
+ kmp_int32 *successors; // Array of successor ids
+ kmp_int32 nsuccessors; // Number of successors of the node
+ std::atomic<kmp_int32>
+ npredecessors_counter; // Number of predecessors on the fly
+ kmp_int32 npredecessors; // Total number of predecessors
+ kmp_int32 successors_size; // Number of allocated successor ids
+ kmp_taskdata_t *parent_task; // Parent implicit task
+} kmp_node_info_t;
+
+/// Represents a TDG's current status
+typedef enum kmp_tdg_status {
+ KMP_TDG_NONE = 0,
+ KMP_TDG_RECORDING = 1,
+ KMP_TDG_READY = 2
+} kmp_tdg_status_t;
+
+/// Structure that contains a TDG
+typedef struct kmp_tdg_info {
+ kmp_int32 tdg_id; // Unique identifier of the TDG
+ kmp_taskgraph_flags_t tdg_flags; // Flags related to a TDG
+ kmp_int32 map_size; // Number of allocated TDG nodes
+ kmp_int32 num_roots; // Number of root tasks in the TDG
+ kmp_int32 *root_tasks; // Array of tasks identifiers that are roots
+ kmp_node_info_t *record_map; // Array of TDG nodes
+ kmp_tdg_status_t tdg_status =
+ KMP_TDG_NONE; // Status of the TDG (recording, ready...)
+ std::atomic<kmp_int32> num_tasks; // Number of TDG nodes
+ kmp_bootstrap_lock_t
+ graph_lock; // Protect graph attributes when updated via taskloop_recur
+ // Taskloop reduction related
+ void *rec_taskred_data; // Data to pass to __kmpc_task_reduction_init or
+ // __kmpc_taskred_init
+ kmp_int32 rec_num_taskred;
+} kmp_tdg_info_t;
+
+extern kmp_int32 __kmp_max_tdgs;
+extern kmp_tdg_info_t **__kmp_global_tdgs;
+extern kmp_int32 __kmp_curr_tdg_idx;
+extern kmp_int32 __kmp_successors_size;
+extern std::atomic<kmp_int32> __kmp_tdg_task_id;
+extern kmp_int32 __kmp_num_tdg;
+#endif
+
#ifdef BUILD_TIED_TASK_STACK
/* Tied Task stack definitions */
@@ -2534,7 +2590,12 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
unsigned complete : 1; /* 1==complete, 0==not complete */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+#if OMPX_TASKGRAPH
+ unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
+ unsigned reserved31 : 6; /* reserved for library use */
+#else
unsigned reserved31 : 7; /* reserved for library use */
+#endif
} kmp_tasking_flags_t;
@@ -2584,6 +2645,10 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
#if OMPT_SUPPORT
ompt_task_info_t ompt_task_info;
#endif
+#if OMPX_TASKGRAPH
+ bool is_taskgraph = 0; // whether the task is within a TDG
+ kmp_tdg_info_t *tdg; // used to associate task with a TDG
+#endif
kmp_target_data_t td_target_data;
}; // struct kmp_taskdata
@@ -4124,6 +4189,20 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
void **user_lock,
uintptr_t hint);
+#if OMPX_TASKGRAPH
+// Taskgraph's Record & Replay mechanism
+// __kmp_tdg_is_recording: check whether a given TDG is recording
+// status: the tdg's current status
+static inline bool __kmp_tdg_is_recording(kmp_tdg_status_t status) {
+ return status == KMP_TDG_RECORDING;
+}
+
+KMP_EXPORT kmp_int32 __kmpc_start_record_task(ident_t *loc, kmp_int32 gtid,
+ kmp_int32 input_flags,
+ kmp_int32 tdg_id);
+KMP_EXPORT void __kmpc_end_record_task(ident_t *loc, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id);
+#endif
/* Interface to fast scalable reduce methods routines */
KMP_EXPORT kmp_int32 __kmpc_reduce_nowait(
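The entry points above take the flags as a plain kmp_int32, which the runtime reinterprets as kmp_taskgraph_flags_t (see the cast in __kmpc_start_record_task in kmp_tasking.cpp below). A caller could assemble that word as in this sketch; the standalone struct simply mirrors the layout added above, and the helper name is illustrative:

#include <cstdint>
#include <cstring>

struct taskgraph_flags { // mirrors kmp_taskgraph_flags_t: exactly 32 bits
  unsigned nowait : 1;
  unsigned re_record : 1;
  unsigned reserved : 30;
};
static_assert(sizeof(taskgraph_flags) == 4, "flags must fit in a kmp_int32");

int32_t encode_taskgraph_flags(bool nowait, bool re_record) {
  taskgraph_flags f{};
  f.nowait = nowait;
  f.re_record = re_record;
  int32_t word = 0;
  // well-defined equivalent of the pointer cast the runtime performs
  std::memcpy(&word, &f, sizeof(word));
  return word;
}

Both bits are placeholders for now: the runtime carries TODOs for re_record and for input_flags->nowait, and the new tests pass 0.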
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 91bb8a8312e0..58bf64112b1a 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -46,6 +46,8 @@
#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
#cmakedefine01 LIBOMP_OMPD_SUPPORT
#define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT
+#cmakedefine01 LIBOMP_OMPX_TASKGRAPH
+#define OMPX_TASKGRAPH LIBOMP_OMPX_TASKGRAPH
#cmakedefine01 LIBOMP_PROFILING_SUPPORT
#define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
#cmakedefine01 LIBOMP_OMPT_OPTIONAL
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 0163846b4454..9557c519f835 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -557,4 +557,16 @@ int __kmp_nesting_mode = 0;
int __kmp_nesting_mode_nlevels = 1;
int *__kmp_nesting_nth_level;
+#if OMPX_TASKGRAPH
+// TDG record & replay
+kmp_int32 __kmp_max_tdgs = 100;
+kmp_tdg_info_t **__kmp_global_tdgs = NULL;
+kmp_int32 __kmp_curr_tdg_idx =
+ 0; // Id of the current TDG being recorded or executed
+kmp_int32 __kmp_num_tdg = 0;
+kmp_int32 __kmp_successors_size = 10; // Initial size of the successors list
+                                      // used for recording
+std::atomic<kmp_int32> __kmp_tdg_task_id = 0;
+#endif
// end of file //
+
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 88eb150fc0c2..e6dd0ec94655 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -1238,6 +1238,18 @@ static void __kmp_stg_parse_num_threads(char const *name, char const *value,
K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
} // __kmp_stg_parse_num_threads
+#if OMPX_TASKGRAPH
+static void __kmp_stg_parse_max_tdgs(char const *name, char const *value,
+ void *data) {
+ __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_max_tdgs);
+} // __kmp_stg_parse_max_tdgs
+
+static void __kmp_std_print_max_tdgs(kmp_str_buf_t *buffer, char const *name,
+ void *data) {
+ __kmp_stg_print_int(buffer, name, __kmp_max_tdgs);
+} // __kmp_std_print_max_tdgs
+#endif
+
static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
char const *value,
void *data) {
@@ -5592,6 +5604,10 @@ static kmp_setting_t __kmp_stg_table[] = {
{"LIBOMP_NUM_HIDDEN_HELPER_THREADS",
__kmp_stg_parse_num_hidden_helper_threads,
__kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0},
+#if OMPX_TASKGRAPH
+ {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL,
+ 0, 0},
+#endif
#if OMPT_SUPPORT
{"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index ee4f96f7e5e2..22119a9d2d45 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -218,6 +218,44 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
kmp_depnode_t *sink,
kmp_task_t *sink_task) {
+#if OMPX_TASKGRAPH
+ kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+ kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
+ if (source->dn.task && sink_task) {
+ // Dependencies between two tasks where one belongs to a TDG and the
+ // other does not are not supported
+ KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph);
+ }
+ if (task_sink->is_taskgraph &&
+ __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) {
+ kmp_node_info_t *source_info =
+ &task_sink->tdg->record_map[task_source->td_task_id];
+ bool exists = false;
+ for (int i = 0; i < source_info->nsuccessors; i++) {
+ if (source_info->successors[i] == task_sink->td_task_id) {
+ exists = true;
+ break;
+ }
+ }
+ if (!exists) {
+ if (source_info->nsuccessors >= source_info->successors_size) {
+ source_info->successors_size = 2 * source_info->successors_size;
+ kmp_int32 *old_succ_ids = source_info->successors;
+ kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate(
+ source_info->successors_size * sizeof(kmp_int32));
+ source_info->successors = new_succ_ids;
+ __kmp_free(old_succ_ids);
+ }
+
+ source_info->successors[source_info->nsuccessors] = task_sink->td_task_id;
+ source_info->nsuccessors++;
+
+ kmp_node_info_t *sink_info =
+ &(task_sink->tdg->record_map[task_sink->td_task_id]);
+ sink_info->npredecessors++;
+ }
+ }
+#endif
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
// do not use sink->dn.task as that is only filled after the dependences
@@ -256,10 +294,23 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
// link node as successor of list elements
for (kmp_depnode_list_t *p = plist; p; p = p->next) {
kmp_depnode_t *dep = p->node;
+#if OMPX_TASKGRAPH
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ if (task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (td->is_taskgraph)
+ tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status))
+ __kmp_track_dependence(gtid, dep, node, task);
+ }
+#endif
if (dep->dn.task) {
KMP_ACQUIRE_DEPNODE(gtid, dep);
if (dep->dn.task) {
- __kmp_track_dependence(gtid, dep, node, task);
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, dep, node, task);
dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
@@ -281,16 +332,42 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
if (!sink)
return 0;
kmp_int32 npredecessors = 0;
+#if OMPX_TASKGRAPH
+ kmp_tdg_status tdg_status = KMP_TDG_NONE;
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ if (task) {
+ if (td->is_taskgraph)
+ tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
+ if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task)
+ __kmp_track_dependence(gtid, sink, source, task);
+ }
+#endif
if (sink->dn.task) {
// synchronously add source to sink's list of successors
KMP_ACQUIRE_DEPNODE(gtid, sink);
if (sink->dn.task) {
- __kmp_track_dependence(gtid, sink, source, task);
+#if OMPX_TASKGRAPH
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+#endif
+ __kmp_track_dependence(gtid, sink, source, task);
sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
KMP_TASK_TO_TASKDATA(task)));
+#if OMPX_TASKGRAPH
+ if (__kmp_tdg_is_recording(tdg_status)) {
+ kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
+ if (tdd->is_taskgraph) {
+ if (tdd->td_flags.onced)
+ // decrement npredecessors if sink->dn.task belongs to a taskgraph, and
+ // 1) the task is reset to its initial state (by kmp_free_task), or
+ // 2) the task is complete but not yet reset
+ npredecessors--;
+ }
+ }
+#endif
npredecessors++;
}
KMP_RELEASE_DEPNODE(gtid, sink);
@@ -595,6 +672,48 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task;
+#if OMPX_TASKGRAPH
+ // record TDG with deps
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend record_map if needed
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+ KMP_MEMCPY(new_record, tdg->record_map,
+ old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+ // update the size at the end, so that other threads do not use
+ // old_record while map_size is already updated
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+#endif
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (!current_task->ompt_task_info.frame.enter_frame.ptr)
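To make the recording logic above concrete, consider the three-task graph from omp_record_replay_deps.cpp below: add writes y, sub writes x, mult reads x and y. Assuming creation order add=0, sub=1, mult=2 (ids are assigned from __kmp_tdg_task_id), __kmp_track_dependence leaves record_map as:

  node 0 (add):  successors = {2}  nsuccessors = 1  npredecessors = 0  -> root
  node 1 (sub):  successors = {2}  nsuccessors = 1  npredecessors = 0  -> root
  node 2 (mult): successors = {}   nsuccessors = 0  npredecessors = 2

On replay, __kmp_exec_tdg schedules nodes 0 and 1 directly; node 2 is scheduled from __kmp_release_deps (kmp_taskdeps.h below) once both atomic decrements bring its npredecessors_counter to zero.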
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index ac6174afd3f5..d2ab51515801 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -92,6 +92,23 @@ static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start);
static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
+
+#if OMPX_TASKGRAPH
+ if (task->is_taskgraph && !(__kmp_tdg_is_recording(task->tdg->tdg_status))) {
+ kmp_node_info_t *TaskInfo = &(task->tdg->record_map[task->td_task_id]);
+
+ for (int i = 0; i < TaskInfo->nsuccessors; i++) {
+ kmp_int32 successorNumber = TaskInfo->successors[i];
+ kmp_node_info_t *successor = &(task->tdg->record_map[successorNumber]);
+ kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->npredecessors_counter) - 1;
+ if (successor->task != nullptr && npredecessors == 0) {
+ __kmp_omp_task(gtid, successor->task, false);
+ }
+ }
+ return;
+ }
+#endif
+
kmp_info_t *thread = __kmp_threads[gtid];
kmp_depnode_t *node = task->td_depnode;
@@ -120,8 +137,12 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
gtid, task));
KMP_ACQUIRE_DEPNODE(gtid, node);
- node->dn.task =
- NULL; // mark this task as finished, so no new dependencies are generated
+#if OMPX_TASKGRAPH
+ if (!task->is_taskgraph ||
+ (task->is_taskgraph && !__kmp_tdg_is_recording(task->tdg->tdg_status)))
+#endif
+ node->dn.task =
+ NULL; // mark this task as finished, so no new dependencies are generated
KMP_RELEASE_DEPNODE(gtid, node);
kmp_depnode_list_t *next;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 7e9147eeeebf..5b6c9edd77da 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -37,6 +37,10 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread,
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
+#if OMPX_TASKGRAPH
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
+int __kmp_taskloop_task(int gtid, void *ptask);
+#endif
#ifdef BUILD_TIED_TASK_STACK
@@ -281,7 +285,11 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
}
// Check mutexinoutset dependencies, acquire locks
kmp_depnode_t *node = tasknew->td_depnode;
+#if OMPX_TASKGRAPH
+ if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#else
if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
+#endif
for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
@@ -888,12 +896,34 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
task->data2.priority = 0;
taskdata->td_flags.freed = 1;
+#if OMPX_TASKGRAPH
+ // do not free tasks in taskgraph
+ if (!taskdata->is_taskgraph) {
+#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
__kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
__kmp_thread_free(thread, taskdata);
#endif
+#if OMPX_TASKGRAPH
+ } else {
+ taskdata->td_flags.complete = 0;
+ taskdata->td_flags.started = 0;
+ taskdata->td_flags.freed = 0;
+ taskdata->td_flags.executing = 0;
+ taskdata->td_flags.task_serial =
+ (taskdata->td_parent->td_flags.final ||
+ taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
+
+ // taskdata->td_allow_completion_event.pending_events_count = 1;
+ KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
+ KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+ // start at one because counts current task and children
+ KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
+ }
+#endif
+
KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
@@ -980,6 +1010,10 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
ret = ret ||
KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && taskdata->is_taskgraph)
+ ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
+#endif
return ret;
}
@@ -999,6 +1033,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
kmp_info_t *thread = __kmp_threads[gtid];
kmp_task_team_t *task_team =
thread->th.th_task_team; // might be NULL for serial teams...
+#if OMPX_TASKGRAPH
+ // cache is_taskgraph to avoid a segfault when accessing
+ // taskdata->td_flags after the task is freed (vanilla taskloop case)
+ bool is_taskgraph;
+#endif
#if KMP_DEBUG
kmp_int32 children = 0;
#endif
@@ -1008,6 +1046,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+#if OMPX_TASKGRAPH
+ is_taskgraph = taskdata->is_taskgraph;
+#endif
+
// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
if (taskdata->td_flags.tiedness == TASK_TIED) {
@@ -1114,6 +1156,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
if (completed) {
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1; // mark the task as ran once already
+#endif
#if OMPT_SUPPORT
// This is not a detached task, we are done here
@@ -1130,7 +1175,11 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
#endif
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
KMP_DEBUG_ASSERT(children >= 0);
+#if OMPX_TASKGRAPH
+ if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
+#else
if (taskdata->td_taskgroup)
+#endif
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
} else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
task_team->tt.tt_hidden_helper_task_encountered)) {
@@ -1169,6 +1218,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
resumed_task->td_flags.executing = 1; // resume previous task
+#if OMPX_TASKGRAPH
+ if (is_taskgraph && __kmp_track_children_task(taskdata) &&
+ taskdata->td_taskgroup) {
+ // TDG: we only release taskgroup barrier here because
+ // free_task_and_ancestors will call
+ // __kmp_free_task, which resets all task parameters such as
+ // taskdata->started, etc. If we release the barrier earlier, these
+ // parameters could be read before being reset. This is not an issue for
+ // non-TDG implementation because we never reuse a task(data) structure
+ KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+ }
+#endif
+
KA_TRACE(
10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
gtid, taskdata, resumed_task));
@@ -1285,6 +1347,9 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
task->td_flags.executing = 1;
task->td_flags.complete = 0;
task->td_flags.freed = 0;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 0;
+#endif
task->td_depnode = NULL;
task->td_last_tied = task;
@@ -1321,6 +1386,9 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) {
if (task->td_dephash) {
int children;
task->td_flags.complete = 1;
+#if OMPX_TASKGRAPH
+ task->td_flags.onced = 1;
+#endif
children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
kmp_tasking_flags_t flags_old = task->td_flags;
if (children == 0 && flags_old.complete == 1) {
@@ -1550,7 +1618,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
taskdata->td_flags.executing = 0;
taskdata->td_flags.complete = 0;
taskdata->td_flags.freed = 0;
-
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 0;
+#endif
KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
// start at one because counts current task and children
KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
@@ -1586,6 +1656,15 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
}
}
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
+ (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
+ taskdata->is_taskgraph = 1;
+ taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+ }
+#endif
KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
gtid, taskdata, taskdata->td_parent));
@@ -1929,6 +2008,53 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
bool serialize_immediate) {
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#if OMPX_TASKGRAPH
+ if (new_taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
+ kmp_tdg_info_t *tdg = new_taskdata->tdg;
+ // extend the record_map if needed
+ if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) {
+ __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
+ // map_size could have been updated by another thread if recursive
+ // taskloop
+ if (new_taskdata->td_task_id >= tdg->map_size) {
+ kmp_uint old_size = tdg->map_size;
+ kmp_uint new_size = old_size * 2;
+ kmp_node_info_t *old_record = tdg->record_map;
+ kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
+ new_size * sizeof(kmp_node_info_t));
+
+ KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
+ tdg->record_map = new_record;
+
+ __kmp_free(old_record);
+
+ for (kmp_int i = old_size; i < new_size; i++) {
+ kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
+ __kmp_successors_size * sizeof(kmp_int32));
+ new_record[i].task = nullptr;
+ new_record[i].successors = successorsList;
+ new_record[i].nsuccessors = 0;
+ new_record[i].npredecessors = 0;
+ new_record[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
+ }
+ // update the size at the end, so that other threads do not use
+ // old_record while map_size is already updated
+ tdg->map_size = new_size;
+ }
+ __kmp_release_bootstrap_lock(&tdg->graph_lock);
+ }
+ // record a task
+ if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) {
+ tdg->record_map[new_taskdata->td_task_id].task = new_task;
+ tdg->record_map[new_taskdata->td_task_id].parent_task =
+ new_taskdata->td_parent;
+ KMP_ATOMIC_INC(&tdg->num_tasks);
+ }
+ }
+#endif
+
/* Should we execute the new task or queue it? For now, let's just always try
to queue it. If the queue fills up, then we'll execute it. */
if (new_taskdata->td_flags.proxy == TASK_PROXY ||
@@ -2177,7 +2303,6 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
-
}
KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
@@ -2445,6 +2570,17 @@ the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}
@@ -2461,6 +2597,17 @@ Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
+#if OMPX_TASKGRAPH
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
+ if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
+ kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
+ this_tdg->rec_taskred_data =
+ __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
+ this_tdg->rec_num_taskred = num;
+ KMP_MEMCPY(this_tdg->rec_taskred_data, data,
+ sizeof(kmp_task_red_input_t) * num);
+ }
+#endif
return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
@@ -2507,6 +2654,18 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
kmp_int32 num = tg->reduce_num_data;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+#if OMPX_TASKGRAPH
+ if ((thread->th.th_current_task->is_taskgraph) &&
+ (!__kmp_tdg_is_recording(
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
+ tg = thread->th.th_current_task->td_taskgroup;
+ KMP_ASSERT(tg != NULL);
+ KMP_ASSERT(tg->reduce_data != NULL);
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
+ num = tg->reduce_num_data;
+ }
+#endif
+
KMP_ASSERT(data != NULL);
while (tg != NULL) {
for (int i = 0; i < num; ++i) {
@@ -4278,6 +4437,9 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
taskdata->td_flags.complete = 1; // mark the task as completed
+#if OMPX_TASKGRAPH
+ taskdata->td_flags.onced = 1;
+#endif
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
@@ -4476,8 +4638,14 @@ void __kmp_fulfill_event(kmp_event_t *event) {
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
+// taskloop_recur: used only when dealing with taskgraph,
+// indicating whether we need to update task->td_task_id
// returns: a pointer to the allocated kmp_task_t structure (task).
-kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
+#if OMPX_TASKGRAPH
+ , int taskloop_recur
+#endif
+) {
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
@@ -4505,7 +4673,15 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
task = KMP_TASKDATA_TO_TASK(taskdata);
// Initialize new task (only specific fields not affected by memcpy)
+#if OMPX_TASKGRAPH
+ if (!taskdata->is_taskgraph || taskloop_recur)
+ taskdata->td_task_id = KMP_GEN_TASK_ID();
+ else if (taskdata->is_taskgraph &&
+ __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
+ taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
+#else
taskdata->td_task_id = KMP_GEN_TASK_ID();
+#endif
if (task->shareds != NULL) { // need setup shareds pointer
shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
task->shareds = &((char *)taskdata)[shareds_offset];
@@ -4732,7 +4908,13 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
lastpriv = 1;
}
}
+
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+#endif
+
kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
kmp_taskloop_bounds_t next_task_bounds =
kmp_taskloop_bounds_t(next_task, task_bounds);
@@ -4929,7 +5111,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
lb1 = ub0 + st;
// create pattern task for 2nd half of the loop
+#if OMPX_TASKGRAPH
+ next_task = __kmp_task_dup_alloc(thread, task,
+ /* taskloop_recur */ 1);
+#else
next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+#endif
// adjust lower bound (upper bound is not changed) for the 2nd half
*(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
if (ptask_dup != NULL) // construct firstprivates, etc.
@@ -4962,6 +5149,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
p->codeptr_ra = codeptr_ra;
#endif
+#if OMPX_TASKGRAPH
+ kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
+ new_task_data->tdg = taskdata->tdg;
+ new_task_data->is_taskgraph = 0;
+#endif
+
#if OMPT_SUPPORT
// schedule new task with correct return address for OMPT events
__kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
@@ -5001,6 +5194,9 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
__kmpc_taskgroup(loc, gtid);
}
+#if OMPX_TASKGRAPH
+ KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
+#endif
// =========================================================================
// calculate loop parameters
kmp_taskloop_bounds_t task_bounds(task, lb, ub);
@@ -5248,3 +5444,221 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
return taskdata->td_task_team != NULL;
}
+
+#if OMPX_TASKGRAPH
+// __kmp_find_tdg: identify a TDG through its ID
+// tdg_id: ID of the TDG
+// returns: A pointer to the TDG if one with this ID exists and is not in
+// its initial state, otherwise nullptr
+static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
+ kmp_tdg_info_t *res = nullptr;
+ if (__kmp_max_tdgs == 0)
+ return res;
+
+ if (__kmp_global_tdgs == NULL)
+ __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
+ sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
+
+ if ((__kmp_global_tdgs[tdg_id]) &&
+ (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
+ res = __kmp_global_tdgs[tdg_id];
+ return res;
+}
+
+// __kmp_exec_tdg: launch the execution of a previously
+// recorded TDG
+// gtid: Global Thread ID
+// tdg: Pointer to the TDG to execute
+void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
+ KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 *this_root_tasks = tdg->root_tasks;
+ kmp_int32 this_num_roots = tdg->num_roots;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+
+ kmp_info_t *thread = __kmp_threads[gtid];
+ kmp_taskdata_t *parent_task = thread->th.th_current_task;
+
+ if (tdg->rec_taskred_data) {
+ __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
+ }
+
+ for (kmp_int32 j = 0; j < this_num_tasks; j++) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
+
+ td->td_parent = parent_task;
+ this_record_map[j].parent_task = parent_task;
+
+ kmp_taskgroup_t *parent_taskgroup =
+ this_record_map[j].parent_task->td_taskgroup;
+
+ KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
+ this_record_map[j].npredecessors);
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
+
+ if (parent_taskgroup) {
+ KMP_ATOMIC_INC(&parent_taskgroup->count);
+ // The taskgroup is different so we must update it
+ td->td_taskgroup = parent_taskgroup;
+ } else if (td->td_taskgroup != nullptr) {
+ // If the parent doesn't have a taskgroup, remove it from the task
+ td->td_taskgroup = nullptr;
+ }
+ if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
+ KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
+ }
+
+ for (kmp_int32 j = 0; j < this_num_roots; ++j) {
+ __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
+ }
+ KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
+ tdg->tdg_id, tdg->num_roots));
+}
+
+// __kmp_start_record: set up a TDG structure and set the
+// recording flag to true
+// gtid: Global Thread ID of the encountering thread
+// flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record
+static inline void __kmp_start_record(kmp_int32 gtid,
+ kmp_taskgraph_flags_t *flags,
+ kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg =
+ (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
+ __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
+ // Initializing the TDG structure
+ tdg->tdg_id = tdg_id;
+ tdg->map_size = INIT_MAPSIZE;
+ tdg->num_roots = -1;
+ tdg->root_tasks = nullptr;
+ tdg->tdg_status = KMP_TDG_RECORDING;
+ tdg->rec_num_taskred = 0;
+ tdg->rec_taskred_data = nullptr;
+ KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
+
+ // Initializing the list of nodes in this TDG
+ kmp_node_info_t *this_record_map =
+ (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
+ for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
+ kmp_int32 *successorsList =
+ (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
+ this_record_map[i].task = nullptr;
+ this_record_map[i].successors = successorsList;
+ this_record_map[i].nsuccessors = 0;
+ this_record_map[i].npredecessors = 0;
+ this_record_map[i].successors_size = __kmp_successors_size;
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
+ }
+
+ __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
+}
+
+// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
+// the beginning of the record process of a task region
+// loc_ref: Location of TDG, not used yet
+// gtid: Global Thread ID of the encountering thread
+// input_flags: Flags associated with the TDG
+// tdg_id: ID of the TDG to record; for now, an incremental integer
+// returns: 1 if we start recording, otherwise 0 (the TDG is replayed)
+kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+
+ kmp_int32 res;
+ kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
+ KA_TRACE(10,
+ ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
+ gtid, loc_ref, input_flags, tdg_id));
+
+ if (__kmp_max_tdgs == 0) {
+ KA_TRACE(
+ 10,
+ ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
+ "__kmp_max_tdgs = 0\n",
+ gtid, loc_ref, input_flags, tdg_id));
+ return 1;
+ }
+
+ __kmpc_taskgroup(loc_ref, gtid);
+ if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
+ // TODO: use re_record flag
+ __kmp_exec_tdg(gtid, tdg);
+ res = 0;
+ } else {
+ __kmp_curr_tdg_idx = tdg_id;
+ KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
+ __kmp_start_record(gtid, flags, tdg_id);
+ __kmp_num_tdg++;
+ res = 1;
+ }
+ KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
+ gtid, tdg_id, res ? "record" : "execute"));
+ return res;
+}
+
+// __kmp_end_record: finalize a TDG after it has been recorded
+// gtid: Global thread ID
+// tdg: Pointer to the TDG
+void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
+ // Store roots
+ kmp_node_info_t *this_record_map = tdg->record_map;
+ kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
+ kmp_int32 *this_root_tasks =
+ (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
+ kmp_int32 this_map_size = tdg->map_size;
+ kmp_int32 this_num_roots = 0;
+ kmp_info_t *thread = __kmp_threads[gtid];
+
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ if (this_record_map[i].npredecessors == 0) {
+ this_root_tasks[this_num_roots++] = i;
+ }
+ }
+
+ // Update with roots info and mapsize
+ tdg->map_size = this_map_size;
+ tdg->num_roots = this_num_roots;
+ tdg->root_tasks = this_root_tasks;
+ KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
+ tdg->tdg_status = KMP_TDG_READY;
+
+ if (thread->th.th_current_task->td_dephash) {
+ __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
+ thread->th.th_current_task->td_dephash = NULL;
+ }
+
+ // Reset predecessor counter
+ for (kmp_int32 i = 0; i < this_num_tasks; i++) {
+ KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
+ this_record_map[i].npredecessors);
+ }
+ KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
+}
+
+// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
+// the end of the recording phase
+//
+// loc_ref: Source location information
+// gtid: Global thread ID
+// input_flags: Flags attached to the graph
+// tdg_id: ID of the TDG just finished recording
+void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
+ kmp_int32 input_flags, kmp_int32 tdg_id) {
+ kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
+
+ KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
+ " tdg=%d with flags=%d\n",
+ gtid, loc_ref, tdg_id, input_flags));
+ if (__kmp_max_tdgs) {
+ // TODO: use input_flags->nowait
+ __kmpc_end_taskgroup(loc_ref, gtid);
+ if (__kmp_tdg_is_recording(tdg->tdg_status))
+ __kmp_end_record(gtid, tdg);
+ }
+ KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
+ " tdg=%d, its status is now READY\n",
+ gtid, loc_ref, tdg_id));
+}
+#endif
diff --git a/openmp/runtime/test/CMakeLists.txt b/openmp/runtime/test/CMakeLists.txt
index 05b517fb920f..a7790804542b 100644
--- a/openmp/runtime/test/CMakeLists.txt
+++ b/openmp/runtime/test/CMakeLists.txt
@@ -30,6 +30,7 @@ update_test_compiler_features()
pythonize_bool(LIBOMP_USE_HWLOC)
pythonize_bool(LIBOMP_OMPT_SUPPORT)
pythonize_bool(LIBOMP_OMPT_OPTIONAL)
+pythonize_bool(LIBOMP_OMPX_TASKGRAPH)
pythonize_bool(LIBOMP_HAVE_LIBM)
pythonize_bool(LIBOMP_HAVE_LIBATOMIC)
pythonize_bool(OPENMP_STANDALONE_BUILD)
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index b1a5b7a35109..a9399288e550 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -99,6 +99,9 @@ if config.has_ompt:
# for callback.h
config.test_flags += " -I " + config.test_source_root + "/ompt"
+if config.has_ompx_taskgraph:
+ config.available_features.add("ompx_taskgraph")
+
if 'Linux' in config.operating_system:
config.available_features.add("linux")
diff --git a/openmp/runtime/test/lit.site.cfg.in b/openmp/runtime/test/lit.site.cfg.in
index 45a18b480130..d6c259280619 100644
--- a/openmp/runtime/test/lit.site.cfg.in
+++ b/openmp/runtime/test/lit.site.cfg.in
@@ -15,6 +15,7 @@ config.operating_system = "@CMAKE_SYSTEM_NAME@"
config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@"
config.using_hwloc = @LIBOMP_USE_HWLOC@
config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@
+config.has_ompx_taskgraph = @LIBOMP_OMPX_TASKGRAPH@
config.has_libm = @LIBOMP_HAVE_LIBM@
config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@
config.is_standalone_build = @OPENMP_STANDALONE_BUILD@
diff --git a/openmp/runtime/test/tasking/omp_record_replay.cpp b/openmp/runtime/test/tasking/omp_record_replay.cpp
new file mode 100644
index 000000000000..69ad98003a0d
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_record_replay.cpp
@@ -0,0 +1,48 @@
+// REQUIRES: ompx_taskgraph
+// RUN: %libomp-cxx-compile-and-run
+#include <iostream>
+#include <cassert>
+#define NT 100
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+
+#ifdef __cplusplus
+extern "C" {
+ int __kmpc_global_thread_num(ident_t *);
+ int __kmpc_start_record_task(ident_t *, int, int, int);
+ void __kmpc_end_record_task(ident_t *, int, int , int);
+}
+#endif
+
+void func(int *num_exec) {
+ (*num_exec)++;
+}
+
+int main() {
+ int num_exec = 0;
+ int num_tasks = 0;
+ int x=0;
+ #pragma omp parallel
+ #pragma omp single
+ for (int iter = 0; iter < NT; ++iter) {
+ int gtid = __kmpc_global_thread_num(nullptr);
+ int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0);
+ if (res) {
+ num_tasks++;
+ #pragma omp task
+ func(&num_exec);
+ }
+ __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ }
+
+ assert(num_tasks==1);
+ assert(num_exec==NT);
+
+ std::cout << "Passed" << std::endl;
+ return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_deps.cpp b/openmp/runtime/test/tasking/omp_record_replay_deps.cpp
new file mode 100644
index 000000000000..9b6b370b30ef
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_record_replay_deps.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: ompx_taskgraph
+// RUN: %libomp-cxx-compile-and-run
+#include <iostream>
+#include <cassert>
+#define NT 100
+#define MULTIPLIER 100
+#define DECREMENT 5
+
+int val;
+// Compiler-generated code (emulation)
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+
+#ifdef __cplusplus
+extern "C" {
+ int __kmpc_global_thread_num(ident_t *);
+ int __kmpc_start_record_task(ident_t *, int, int, int);
+ void __kmpc_end_record_task(ident_t *, int, int, int);
+}
+#endif
+
+void sub() {
+ #pragma omp atomic
+ val -= DECREMENT;
+}
+
+void add() {
+ #pragma omp atomic
+ val += DECREMENT;
+}
+
+void mult() {
+ // no atomicity needed, can only be executed by 1 thread
+ // and no concurrency with other tasks possible
+ val *= MULTIPLIER;
+}
+
+int main() {
+ val = 0;
+ int *x, *y;
+ #pragma omp parallel
+ #pragma omp single
+ for (int iter = 0; iter < NT; ++iter) {
+ int gtid = __kmpc_global_thread_num(nullptr);
+ int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ if (res) {
+ #pragma omp task depend(out:y)
+ add();
+ #pragma omp task depend(out:x)
+ sub();
+ #pragma omp task depend(in:x,y)
+ mult();
+ }
+ __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ }
+ assert(val==0);
+
+ std::cout << "Passed" << std::endl;
+ return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp b/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp
new file mode 100644
index 000000000000..03252843689c
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp
@@ -0,0 +1,76 @@
+// REQUIRES: ompx_taskgraph
+// RUN: %libomp-cxx-compile-and-run
+#include <iostream>
+#include <cassert>
+#define NT 20
+#define MULTIPLIER 100
+#define DECREMENT 5
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+int val;
+#ifdef __cplusplus
+extern "C" {
+ int __kmpc_global_thread_num(ident_t *);
+ int __kmpc_start_record_task(ident_t *, int, int, int);
+ void __kmpc_end_record_task(ident_t *, int, int , int);
+}
+#endif
+
+void sub() {
+ #pragma omp atomic
+ val -= DECREMENT;
+}
+
+void add() {
+ #pragma omp atomic
+ val += DECREMENT;
+}
+
+void mult() {
+ // no atomicity needed, can only be executed by 1 thread
+ // and no concurrency with other tasks possible
+ val *= MULTIPLIER;
+}
+
+int main() {
+ int num_tasks = 0;
+ int *x, *y;
+ #pragma omp parallel
+ #pragma omp single
+ for (int iter = 0; iter < NT; ++iter) {
+ int gtid = __kmpc_global_thread_num(nullptr);
+ int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0);
+ if (res) {
+ num_tasks++;
+ #pragma omp task depend(out:y)
+ add();
+ #pragma omp task depend(out:x)
+ sub();
+ #pragma omp task depend(in:x,y)
+ mult();
+ }
+ __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */1);
+ if (res) {
+ num_tasks++;
+ #pragma omp task depend(out:y)
+ add();
+ #pragma omp task depend(out:x)
+ sub();
+ #pragma omp task depend(in:x,y)
+ mult();
+ }
+ __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */1);
+ }
+
+ assert(num_tasks==2);
+ assert(val==0);
+
+ std::cout << "Passed" << std::endl;
+ return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp b/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp
new file mode 100644
index 000000000000..3d88faeeb28e
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp
@@ -0,0 +1,50 @@
+// REQUIRES: ompx_taskgraph
+// RUN: %libomp-cxx-compile-and-run
+#include <iostream>
+#include <cassert>
+
+#define NT 20
+#define N 128*128
+
+typedef struct ident {
+ void* dummy;
+} ident_t;
+
+
+#ifdef __cplusplus
+extern "C" {
+ int __kmpc_global_thread_num(ident_t *);
+ int __kmpc_start_record_task(ident_t *, int, int, int);
+ void __kmpc_end_record_task(ident_t *, int, int , int);
+}
+#endif
+
+int main() {
+ int num_tasks = 0;
+
+ int array[N];
+ for (int i = 0; i < N; ++i)
+ array[i] = 1;
+
+ long sum = 0;
+ #pragma omp parallel
+ #pragma omp single
+ for (int iter = 0; iter < NT; ++iter) {
+ int gtid = __kmpc_global_thread_num(nullptr);
+ int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ if (res) {
+ num_tasks++;
+ #pragma omp taskloop reduction(+:sum) num_tasks(4096)
+ for (int i = 0; i < N; ++i) {
+ sum += array[i];
+ }
+ }
+ __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
+ }
+ assert(sum==N*NT);
+ assert(num_tasks==1);
+
+ std::cout << "Passed" << std::endl;
+ return 0;
+}
+// CHECK: Passed