1 files changed, 112 insertions, 0 deletions
diff --git a/pstl/include/pstl/internal/omp/parallel_transform_reduce.h b/pstl/include/pstl/internal/omp/parallel_transform_reduce.h
new file mode 100644
index 000000000000..72ea37f5faeb
--- /dev/null
+++ b/pstl/include/pstl/internal/omp/parallel_transform_reduce.h
@@ -0,0 +1,112 @@
+// -*- C++ -*-
+// -*-===----------------------------------------------------------------------===//
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
+#define _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H
+
+#include "util.h"
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+//------------------------------------------------------------------------
+// parallel_transform_reduce
+//
+// Notation:
+//      r(i,j,init) returns reduction of init with reduction over [i,j)
+//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
+//      c(x,y) combines values x and y that were the result of r or u
+//------------------------------------------------------------------------
+
+// Task-based transform-reduce over [__first, __last), sized by the OpenMP team of the calling thread.
+// __unary_op is called with an iterator and seeds one accumulator per thread; __reduction(first, last, acc)
+// folds a sub-range into acc; __combiner merges partial results (in thread order, not in range order).
+template <class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner, class _Reduction>
+auto
+__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init,
+                        _Combiner __combiner, _Reduction __reduction)
+{
+    const std::size_t __num_threads = omp_get_num_threads();
+    const std::size_t __size = __last - __first;
+    // With too few elements to seed one accumulator per thread, reduce serially. This is decided before
+    // forming __first + __num_threads, which would otherwise be an iterator past __last (undefined behavior).
+    if (__size <= __num_threads)
+    {
+        return __reduction(__first, __last, __init);
+    }
+
+    // Partition what remains after the per-thread seed elements; with fewer than two chunks reduce serially too.
+    auto __rest = __first + __num_threads;
+    auto __policy = __omp_backend::__chunk_partitioner(__rest, __last);
+    if (__policy.__n_chunks < 2)
+    {
+        return __reduction(__first, __last, __init);
+    }
+
+    // No OpenMP UDR here: __init must be applied only once, so each accumulator is seeded from an element instead.
+    std::vector<_Value> __accums;
+    __accums.reserve(__num_threads);
+    for (std::size_t __i = 0; __i < __num_threads; ++__i)
+    {
+        __accums.emplace_back(__unary_op(__first + __i));
+    }
+
+    // main loop: each task folds its chunk into the accumulator of the thread that executes it
+    _PSTL_PRAGMA(omp taskloop shared(__accums))
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(__policy, __rest, __chunk, [&](auto __chunk_first, auto __chunk_last) {
+            auto& __acc = __accums[omp_get_thread_num()];
+            __acc = __reduction(__chunk_first, __chunk_last, __acc);
+        });
+    }
+
+    // combine the per-thread accumulators with the initial value
+    for (std::size_t __i = 0; __i < __num_threads; ++__i)
+    {
+        __init = __combiner(__init, __accums[__i]);
+    }
+    return __init;
+}
+
+// OpenMP backend entry point for transform-reduce; the execution policy argument is not used.
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner,
+          class _Reduction>
+_Value
+__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last,
+                            _UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction)
+{
+    namespace __backend = __pstl::__omp_backend;
+    _Value __res = __init;
+
+    if (!omp_in_parallel())
+    {
+        // Not inside a parallel region yet: open one, and have a single
+        // thread create the tasks for the region.
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single nowait)
+        {
+            __res = __backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, __reduction);
+        }
+    }
+    else
+    {
+        // Already inside a parallel region: do not nest another one, just create the tasks here.
+        __res = __backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, __reduction);
+    }
+
+    return __res;
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
+#endif // _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H