diff options
author | Johannes Singler <singler@ira.uka.de> | 2007-11-22 10:13:08 +0000 |
---|---|---|
committer | Johannes Singler <singler@gcc.gnu.org> | 2007-11-22 10:13:08 +0000 |
commit | e683ee2a20b907e9b225dd83cd047331e87a7edf (patch) | |
tree | c5838738b1b4d3a360f76e4418890226a450834a | |
parent | 7861a5ce14376a46409d83eeebef942be4030fe4 (diff) | |
download | gcc-e683ee2a20b907e9b225dd83cd047331e87a7edf.tar.gz |
re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))
2007-11-22 Johannes Singler <singler@ira.uka.de>
PR libstdc++/33893
* include/parallel/multiway_merge.h: made omp_dynamic-safe
* include/parallel/workstealing.h: made omp_dynamic-safe
* include/parallel/base.h: infrastructure, cleanup
* include/parallel/par_loop.h: made omp_dynamic-safe
* include/parallel/features.h: activate loser tree variant
* include/parallel/quicksort.h: made omp_dynamic-safe
* include/parallel/compiletime_settings.h: settings overridable
* include/parallel/equally_split.h: made omp_dynamic-safe
* include/parallel/omp_loop_static.h: made omp_dynamic-safe
* include/parallel/random_shuffle.h: made omp_dynamic-safe
* include/parallel/balanced_quicksort.h: made omp_dynamic-safe
* include/parallel/set_operations.h: made omp_dynamic-safe
* include/parallel/unique_copy.h: made omp_dynamic-safe
* include/parallel/multiway_mergesort.h: made omp_dynamic-safe
* include/parallel/search.h: made omp_dynamic-safe
* include/parallel/partition.h: made omp_dynamic-safe
* include/parallel/partial_sum.h: made omp_dynamic-safe
* include/parallel/find.h: made omp_dynamic-safe
* include/parallel/omp_loop.h: made omp_dynamic-safe
* include/parallel/losertree.h: avoid default constructor
From-SVN: r130347
21 files changed, 3914 insertions, 3304 deletions
diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index dbb620cda10..7415d5584b8 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,27 @@ +2007-11-22 Johannes Singler <singler@ira.uka.de> + + PR libstdc++/33893 + * include/parallel/multiway_merge.h: made omp_dynamic-safe + * include/parallel/workstealing.h: made omp_dynamic-safe + * include/parallel/base.h: infrastructure, cleanup + * include/parallel/par_loop.h: made omp_dynamic-safe + * include/parallel/features.h: activate loser tree variant + * include/parallel/quicksort.h: made omp_dynamic-safe + * include/parallel/compiletime_settings.h: settings overridable + * include/parallel/equally_split.h: made omp_dynamic-safe + * include/parallel/omp_loop_static.h: made omp_dynamic-safe + * include/parallel/random_shuffle.h: made omp_dynamic-safe + * include/parallel/balanced_quicksort.h: made omp_dynamic-safe + * include/parallel/set_operations.h: made omp_dynamic-safe + * include/parallel/unique_copy.h: made omp_dynamic-safe + * include/parallel/multiway_mergesort.h: made omp_dynamic-safe + * include/parallel/search.h: made omp_dynamic-safe + * include/parallel/partition.h: made omp_dynamic-safe + * include/parallel/partial_sum.h: made omp_dynamic-safe + * include/parallel/find.h: made omp_dynamic-safe + * include/parallel/omp_loop.h: made omp_dynamic-safe + * include/parallel/losertree.h: avoid default constructor + 2007-11-21 Jonathan Wakely <jwakely.gcc@gmail.com> * docs/html/17_intro/C++STYLE: Fix typos. diff --git a/libstdc++-v3/include/parallel/balanced_quicksort.h b/libstdc++-v3/include/parallel/balanced_quicksort.h index c28277039c5..0d845ce67c8 100644 --- a/libstdc++-v3/include/parallel/balanced_quicksort.h +++ b/libstdc++-v3/include/parallel/balanced_quicksort.h @@ -63,15 +63,15 @@ namespace __gnu_parallel { - /** @brief Information local to one thread in the parallel quicksort run. */ - template<typename RandomAccessIterator> +/** @brief Information local to one thread in the parallel quicksort run. */ +template<typename RandomAccessIterator> struct QSBThreadLocal { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::difference_type difference_type; /** @brief Continuous part of the sequence, described by an - iterator pair. */ + iterator pair. */ typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; /** @brief Initial piece to work on. */ @@ -94,29 +94,17 @@ namespace __gnu_parallel QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { } }; - /** @brief Initialize the thread local storage. - * @param tls Array of thread-local storages. - * @param queue_size Size of the work-stealing queue. */ - template<typename RandomAccessIterator> - inline void - qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size) - { - int iam = omp_get_thread_num(); - tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size); - } - - - /** @brief Balanced quicksort divide step. - * @param begin Begin iterator of subsequence. - * @param end End iterator of subsequence. - * @param comp Comparator. - * @param num_threads Number of threads that are allowed to work on - * this part. - * @pre @c (end-begin)>=1 */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Balanced quicksort divide step. + * @param begin Begin iterator of subsequence. + * @param end End iterator of subsequence. + * @param comp Comparator. + * @param num_threads Number of threads that are allowed to work on + * this part. + * @pre @c (end-begin)>=1 */ +template<typename RandomAccessIterator, typename Comparator> inline typename std::iterator_traits<RandomAccessIterator>::difference_type qsb_divide(RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, int num_threads) + Comparator comp, thread_index_t num_threads) { _GLIBCXX_PARALLEL_ASSERT(num_threads > 0); @@ -124,18 +112,20 @@ namespace __gnu_parallel typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; - RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp); + RandomAccessIterator pivot_pos = median_of_three_iterators( + begin, begin + (end - begin) / 2, end - 1, comp); #if defined(_GLIBCXX_ASSERTIONS) // Must be in between somewhere. difference_type n = end - begin; - _GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) - || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos)) - || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) - || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos)) - || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos)) - || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos))); + _GLIBCXX_PARALLEL_ASSERT( + (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) + || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos)) + || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) + || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos)) + || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos)) + || (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos))); #endif // Swap pivot value to end. @@ -143,10 +133,12 @@ namespace __gnu_parallel std::swap(*pivot_pos, *(end - 1)); pivot_pos = end - 1; - __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); + __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> + pred(comp, *pivot_pos); // Divide, returning end - begin - 1 in the worst case. - difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads); + difference_type split_pos = parallel_partition( + begin, end - 1, pred, num_threads); // Swap back pivot to middle. std::swap(*(begin + split_pos), *pivot_pos); @@ -163,18 +155,21 @@ namespace __gnu_parallel return split_pos; } - /** @brief Quicksort conquer step. - * @param tls Array of thread-local storages. - * @param begin Begin iterator of subsequence. - * @param end End iterator of subsequence. - * @param comp Comparator. - * @param iam Number of the thread processing this function. - * @param num_threads Number of threads that are allowed to work on this part. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Quicksort conquer step. + * @param tls Array of thread-local storages. + * @param begin Begin iterator of subsequence. + * @param end End iterator of subsequence. + * @param comp Comparator. + * @param iam Number of the thread processing this function. + * @param num_threads + * Number of threads that are allowed to work on this part. */ +template<typename RandomAccessIterator, typename Comparator> inline void qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls, - RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, thread_index_t iam, thread_index_t num_threads) + RandomAccessIterator begin, RandomAccessIterator end, + Comparator comp, + thread_index_t iam, thread_index_t num_threads, + bool parent_wait) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -182,14 +177,14 @@ namespace __gnu_parallel difference_type n = end - begin; - if (num_threads <= 1 || n < 2) + if (num_threads <= 1 || n <= 1) { - tls[iam]->initial.first = begin; - tls[iam]->initial.second = end; + tls[iam]->initial.first = begin; + tls[iam]->initial.second = end; - qsb_local_sort_with_helping(tls, comp, iam); + qsb_local_sort_with_helping(tls, comp, iam, parent_wait); - return; + return; } // Divide step. @@ -199,33 +194,55 @@ namespace __gnu_parallel _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin)); #endif - thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n)); + thread_index_t num_threads_leftside = + std::max<thread_index_t>(1, std::min<thread_index_t>( + num_threads - 1, split_pos * num_threads / n)); -#pragma omp atomic +# pragma omp atomic *tls[iam]->elements_leftover -= (difference_type)1; // Conquer step. -#pragma omp parallel sections num_threads(2) +# pragma omp parallel num_threads(2) { -#pragma omp section - qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside); - // The pivot_pos is left in place, to ensure termination. -#pragma omp section - qsb_conquer(tls, begin + split_pos + 1, end, comp, - iam + num_threads_leftside, num_threads - num_threads_leftside); + bool wait; + if(omp_get_num_threads() < 2) + wait = false; + else + wait = parent_wait; + +# pragma omp sections + { +# pragma omp section + { + qsb_conquer(tls, begin, begin + split_pos, comp, + iam, + num_threads_leftside, + wait); + wait = parent_wait; + } + // The pivot_pos is left in place, to ensure termination. +# pragma omp section + { + qsb_conquer(tls, begin + split_pos + 1, end, comp, + iam + num_threads_leftside, + num_threads - num_threads_leftside, + wait); + wait = parent_wait; + } + } } } - /** - * @brief Quicksort step doing load-balanced local sort. - * @param tls Array of thread-local storages. - * @param comp Comparator. - * @param iam Number of the thread processing this function. - */ - template<typename RandomAccessIterator, typename Comparator> +/** + * @brief Quicksort step doing load-balanced local sort. + * @param tls Array of thread-local storages. + * @param comp Comparator. + * @param iam Number of the thread processing this function. + */ +template<typename RandomAccessIterator, typename Comparator> inline void qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls, - Comparator& comp, int iam) + Comparator& comp, int iam, bool wait) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -251,151 +268,162 @@ namespace __gnu_parallel for (;;) { - // Invariant: current must be a valid (maybe empty) range. - RandomAccessIterator begin = current.first, end = current.second; - difference_type n = end - begin; + // Invariant: current must be a valid (maybe empty) range. + RandomAccessIterator begin = current.first, end = current.second; + difference_type n = end - begin; - if (n > base_case_n) - { - // Divide. - RandomAccessIterator pivot_pos = begin + rng(n); + if (n > base_case_n) + { + // Divide. + RandomAccessIterator pivot_pos = begin + rng(n); - // Swap pivot_pos value to end. - if (pivot_pos != (end - 1)) - std::swap(*pivot_pos, *(end - 1)); - pivot_pos = end - 1; + // Swap pivot_pos value to end. + if (pivot_pos != (end - 1)) + std::swap(*pivot_pos, *(end - 1)); + pivot_pos = end - 1; - __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); + __gnu_parallel::binder2nd + <Comparator, value_type, value_type, bool> + pred(comp, *pivot_pos); - // Divide, leave pivot unchanged in last place. - RandomAccessIterator split_pos1, split_pos2; - split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); + // Divide, leave pivot unchanged in last place. + RandomAccessIterator split_pos1, split_pos2; + split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); - // Left side: < pivot_pos; right side: >= pivot_pos. + // Left side: < pivot_pos; right side: >= pivot_pos. #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); + _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); #endif - // Swap pivot back to middle. - if (split_pos1 != pivot_pos) - std::swap(*split_pos1, *pivot_pos); - pivot_pos = split_pos1; - - // In case all elements are equal, split_pos1 == 0. - if ((split_pos1 + 1 - begin) < (n >> 7) - || (end - split_pos1) < (n >> 7)) - { - // Very unequal split, one part smaller than one 128th - // elements not strictly larger than the pivot. - __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos)); - - // Find other end of pivot-equal range. - split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); - } - else - { - // Only skip the pivot. - split_pos2 = split_pos1 + 1; - } - - // Elements equal to pivot are done. - elements_done += (split_pos2 - split_pos1); + // Swap pivot back to middle. + if (split_pos1 != pivot_pos) + std::swap(*split_pos1, *pivot_pos); + pivot_pos = split_pos1; + + // In case all elements are equal, split_pos1 == 0. + if ((split_pos1 + 1 - begin) < (n >> 7) + || (end - split_pos1) < (n >> 7)) + { + // Very unequal split, one part smaller than one 128th + // elements not strictly larger than the pivot. + __gnu_parallel::unary_negate<__gnu_parallel::binder1st + <Comparator, value_type, value_type, bool>, value_type> + pred(__gnu_parallel::binder1st + <Comparator, value_type, value_type, bool>( + comp, *pivot_pos)); + + // Find other end of pivot-equal range. + split_pos2 = __gnu_sequential::partition( + split_pos1 + 1, end, pred); + } + else + // Only skip the pivot. + split_pos2 = split_pos1 + 1; + + // Elements equal to pivot are done. + elements_done += (split_pos2 - split_pos1); #if _GLIBCXX_ASSERTIONS - total_elements_done += (split_pos2 - split_pos1); + total_elements_done += (split_pos2 - split_pos1); #endif - // Always push larger part onto stack. - if (((split_pos1 + 1) - begin) < (end - (split_pos2))) - { - // Right side larger. - if ((split_pos2) != end) - tl.leftover_parts.push_front(std::make_pair(split_pos2, end)); - - //current.first = begin; //already set anyway - current.second = split_pos1; - continue; - } - else - { - // Left side larger. - if (begin != split_pos1) - tl.leftover_parts.push_front(std::make_pair(begin, split_pos1)); - - current.first = split_pos2; - //current.second = end; //already set anyway - continue; - } - } - else - { - __gnu_sequential::sort(begin, end, comp); - elements_done += n; + // Always push larger part onto stack. + if (((split_pos1 + 1) - begin) < (end - (split_pos2))) + { + // Right side larger. + if ((split_pos2) != end) + tl.leftover_parts.push_front(std::make_pair(split_pos2, end)); + + //current.first = begin; //already set anyway + current.second = split_pos1; + continue; + } + else + { + // Left side larger. + if (begin != split_pos1) + tl.leftover_parts.push_front( + std::make_pair(begin, split_pos1)); + + current.first = split_pos2; + //current.second = end; //already set anyway + continue; + } + } + else + { + __gnu_sequential::sort(begin, end, comp); + elements_done += n; #if _GLIBCXX_ASSERTIONS - total_elements_done += n; + total_elements_done += n; #endif - // Prefer own stack, small pieces. - if (tl.leftover_parts.pop_front(current)) - continue; + // Prefer own stack, small pieces. + if (tl.leftover_parts.pop_front(current)) + continue; -#pragma omp atomic - *tl.elements_leftover -= elements_done; - elements_done = 0; +# pragma omp atomic + *tl.elements_leftover -= elements_done; + + elements_done = 0; #if _GLIBCXX_ASSERTIONS - double search_start = omp_get_wtime(); + double search_start = omp_get_wtime(); #endif - // Look for new work. - bool success = false; - while (*tl.elements_leftover > 0 && !success + // Look for new work. + bool successfully_stolen = false; + while (wait && *tl.elements_leftover > 0 && !successfully_stolen #if _GLIBCXX_ASSERTIONS - // Possible dead-lock. - && (omp_get_wtime() < (search_start + 1.0)) + // Possible dead-lock. + && (omp_get_wtime() < (search_start + 1.0)) #endif - ) - { - thread_index_t victim; - victim = rng(num_threads); - - // Large pieces. - success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current); - if (!success) - yield(); + ) + { + thread_index_t victim; + victim = rng(num_threads); + + // Large pieces. + successfully_stolen = (victim != iam) + && tls[victim]->leftover_parts.pop_back(current); + if (!successfully_stolen) + yield(); #if !defined(__ICC) && !defined(__ECC) -#pragma omp flush +# pragma omp flush #endif - } + } #if _GLIBCXX_ASSERTIONS - if (omp_get_wtime() >= (search_start + 1.0)) - { - sleep(1); - _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0)); - } + if (omp_get_wtime() >= (search_start + 1.0)) + { + sleep(1); + _GLIBCXX_PARALLEL_ASSERT( + omp_get_wtime() < (search_start + 1.0)); + } #endif - if (!success) - { + if (!successfully_stolen) + { #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); + _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); #endif - return; - } - } + return; + } + } } } - /** @brief Top-level quicksort routine. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param comp Comparator. - * @param n Length of the sequence to sort. - * @param num_threads Number of threads that are allowed to work on - * this part. - */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Top-level quicksort routine. + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param comp Comparator. + * @param n Length of the sequence to sort. + * @param num_threads Number of threads that are allowed to work on + * this part. + */ +template<typename RandomAccessIterator, typename Comparator> inline void parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, - typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads) + Comparator comp, + typename std::iterator_traits<RandomAccessIterator> + ::difference_type n, + thread_index_t num_threads) { _GLIBCXX_CALL(end - begin) @@ -413,11 +441,11 @@ namespace __gnu_parallel if (num_threads > n) num_threads = static_cast<thread_index_t>(n); + // Initialize thread local storage tls_type** tls = new tls_type*[num_threads]; - -#pragma omp parallel num_threads(num_threads) - // Initialize variables per processor. - qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1)); + difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1); + for (thread_index_t t = 0; t < num_threads; ++t) + tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size); // There can never be more than ceil(log2(n)) ranges on the stack, because // 1. Only one processor pushes onto the stack @@ -426,22 +454,16 @@ namespace __gnu_parallel volatile difference_type elements_leftover = n; for (int i = 0; i < num_threads; i++) { - tls[i]->elements_leftover = &elements_leftover; - tls[i]->num_threads = num_threads; - tls[i]->global = std::make_pair(begin, end); + tls[i]->elements_leftover = &elements_leftover; + tls[i]->num_threads = num_threads; + tls[i]->global = std::make_pair(begin, end); - // Just in case nothing is left to assign. - tls[i]->initial = std::make_pair(end, end); + // Just in case nothing is left to assign. + tls[i]->initial = std::make_pair(end, end); } - // Initial splitting, recursively. - int old_nested = omp_get_nested(); - omp_set_nested(true); - // Main recursion call. - qsb_conquer(tls, begin, begin + n, comp, 0, num_threads); - - omp_set_nested(old_nested); + qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true); #if _GLIBCXX_ASSERTIONS // All stack must be empty. diff --git a/libstdc++-v3/include/parallel/base.h b/libstdc++-v3/include/parallel/base.h index 0a86d8351fe..5a756cd4be9 100644 --- a/libstdc++-v3/include/parallel/base.h +++ b/libstdc++-v3/include/parallel/base.h @@ -49,54 +49,70 @@ namespace __gnu_parallel // XXX remove std::duplicates from here if possible, // XXX but keep minimal dependencies. - /** @brief Calculates the rounded-down logarithm of @c n for base 2. - * @param n Argument. - * @return Returns 0 for argument 0. - */ - template<typename Size> - inline Size - log2(Size n) +/** @brief Calculates the rounded-down logarithm of @c n for base 2. + * @param n Argument. + * @return Returns 0 for argument 0. + */ +template<typename Size> + inline Size + log2(Size n) { Size k; for (k = 0; n != 1; n >>= 1) - ++k; + ++k; return k; } - /** @brief Encode two integers into one __gnu_parallel::lcas_t. - * @param a First integer, to be encoded in the most-significant @c - * lcas_t_bits/2 bits. - * @param b Second integer, to be encoded in the least-significant - * @c lcas_t_bits/2 bits. - * @return __gnu_parallel::lcas_t value encoding @c a and @c b. - * @see decode2 - */ - inline lcas_t - encode2(int a, int b) //must all be non-negative, actually +/** @brief Encode two integers into one __gnu_parallel::lcas_t. + * @param a First integer, to be encoded in the most-significant @c + * lcas_t_bits/2 bits. + * @param b Second integer, to be encoded in the least-significant + * @c lcas_t_bits/2 bits. + * @return __gnu_parallel::lcas_t value encoding @c a and @c b. + * @see decode2 + */ +inline lcas_t +encode2(int a, int b) //must all be non-negative, actually +{ + return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0); +} + +/** @brief Decode two integers from one __gnu_parallel::lcas_t. + * @param x __gnu_parallel::lcas_t to decode integers from. + * @param a First integer, to be decoded from the most-significant + * @c lcas_t_bits/2 bits of @c x. + * @param b Second integer, to be encoded in the least-significant + * @c lcas_t_bits/2 bits of @c x. + * @see encode2 + */ +inline void +decode2(lcas_t x, int& a, int& b) +{ + a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask); + b = (int)((x >> 0 ) & lcas_t_mask); +} + +/** @brief Equivalent to std::min. */ +template<typename T> + const T& + min(const T& a, const T& b) { - return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0); - } + return (a < b) ? a : b; + }; - /** @brief Decode two integers from one __gnu_parallel::lcas_t. - * @param x __gnu_parallel::lcas_t to decode integers from. - * @param a First integer, to be decoded from the most-significant - * @c lcas_t_bits/2 bits of @c x. - * @param b Second integer, to be encoded in the least-significant - * @c lcas_t_bits/2 bits of @c x. - * @see encode2 - */ - inline void - decode2(lcas_t x, int& a, int& b) +/** @brief Equivalent to std::max. */ +template<typename T> + const T& + max(const T& a, const T& b) { - a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask); - b = (int)((x >> 0 ) & lcas_t_mask); - } + return (a > b) ? a : b; + }; - /** @brief Constructs predicate for equality from strict weak - * ordering predicate - */ - // XXX comparator at the end, as per others - template<typename Comparator, typename T1, typename T2> +/** @brief Constructs predicate for equality from strict weak + * ordering predicate + */ +// XXX comparator at the end, as per others +template<typename Comparator, typename T1, typename T2> class equal_from_less : public std::binary_function<T1, T2, bool> { private: @@ -112,162 +128,176 @@ namespace __gnu_parallel }; - /** @brief Similar to std::binder1st, but giving the argument types explicitly. */ - template<typename _Predicate, typename argument_type> - class unary_negate - : public std::unary_function<argument_type, bool> - { - protected: - _Predicate _M_pred; - - public: - explicit - unary_negate(const _Predicate& __x) : _M_pred(__x) { } - - bool - operator()(const argument_type& __x) - { return !_M_pred(__x); } - }; - - /** @brief Similar to std::binder1st, but giving the argument types explicitly. */ - template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type> - class binder1st - : public std::unary_function<second_argument_type, result_type> - { - protected: - _Operation op; - first_argument_type value; - - public: - binder1st(const _Operation& __x, - const first_argument_type& __y) - : op(__x), value(__y) { } - - result_type - operator()(const second_argument_type& __x) - { return op(value, __x); } - - // _GLIBCXX_RESOLVE_LIB_DEFECTS - // 109. Missing binders for non-const sequence elements - result_type - operator()(second_argument_type& __x) const - { return op(value, __x); } - }; - - /** - * @brief Similar to std::binder2nd, but giving the argument types - * explicitly. - */ - template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type> - class binder2nd - : public std::unary_function<first_argument_type, result_type> - { - protected: - _Operation op; - second_argument_type value; - - public: - binder2nd(const _Operation& __x, - const second_argument_type& __y) - : op(__x), value(__y) { } - - result_type - operator()(const first_argument_type& __x) const - { return op(__x, value); } - - // _GLIBCXX_RESOLVE_LIB_DEFECTS - // 109. Missing binders for non-const sequence elements - result_type - operator()(first_argument_type& __x) - { return op(__x, value); } - }; - - /** @brief Similar to std::equal_to, but allows two different types. */ - template<typename T1, typename T2> +/** @brief Similar to std::binder1st, + * but giving the argument types explicitly. */ +template<typename _Predicate, typename argument_type> + class unary_negate + : public std::unary_function<argument_type, bool> + { + protected: + _Predicate _M_pred; + + public: + explicit + unary_negate(const _Predicate& __x) : _M_pred(__x) { } + + bool + operator()(const argument_type& __x) + { return !_M_pred(__x); } + }; + +/** @brief Similar to std::binder1st, + * but giving the argument types explicitly. */ +template< + typename _Operation, + typename first_argument_type, + typename second_argument_type, + typename result_type> + class binder1st + : public std::unary_function<second_argument_type, result_type> + { + protected: + _Operation op; + first_argument_type value; + + public: + binder1st(const _Operation& __x, + const first_argument_type& __y) + : op(__x), value(__y) { } + + result_type + operator()(const second_argument_type& __x) + { return op(value, __x); } + + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 109. Missing binders for non-const sequence elements + result_type + operator()(second_argument_type& __x) const + { return op(value, __x); } + }; + +/** + * @brief Similar to std::binder2nd, but giving the argument types + * explicitly. + */ +template< + typename _Operation, + typename first_argument_type, + typename second_argument_type, + typename result_type> + class binder2nd + : public std::unary_function<first_argument_type, result_type> + { + protected: + _Operation op; + second_argument_type value; + + public: + binder2nd(const _Operation& __x, + const second_argument_type& __y) + : op(__x), value(__y) { } + + result_type + operator()(const first_argument_type& __x) const + { return op(__x, value); } + + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 109. Missing binders for non-const sequence elements + result_type + operator()(first_argument_type& __x) + { return op(__x, value); } + }; + +/** @brief Similar to std::equal_to, but allows two different types. */ +template<typename T1, typename T2> struct equal_to : std::binary_function<T1, T2, bool> { bool operator()(const T1& t1, const T2& t2) const { return t1 == t2; } }; - /** @brief Similar to std::less, but allows two different types. */ - template<typename T1, typename T2> +/** @brief Similar to std::less, but allows two different types. */ +template<typename T1, typename T2> struct less : std::binary_function<T1, T2, bool> { - bool + bool operator()(const T1& t1, const T2& t2) const { return t1 < t2; } - bool + bool operator()(const T2& t2, const T1& t1) const { return t2 < t1; } }; - // Partial specialization for one type. Same as std::less. - template<typename _Tp> - struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool> - { - bool - operator()(const _Tp& __x, const _Tp& __y) const - { return __x < __y; } - }; +// Partial specialization for one type. Same as std::less. +template<typename _Tp> +struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool> + { + bool + operator()(const _Tp& __x, const _Tp& __y) const + { return __x < __y; } + }; - /** @brief Similar to std::plus, but allows two different types. */ - template<typename _Tp1, typename _Tp2> - struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1> - { - typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result; + /** @brief Similar to std::plus, but allows two different types. */ +template<typename _Tp1, typename _Tp2> + struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1> + { + typedef typeof(*static_cast<_Tp1*>(NULL) + + *static_cast<_Tp2*>(NULL)) result; - result - operator()(const _Tp1& __x, const _Tp2& __y) const - { return __x + __y; } - }; + result + operator()(const _Tp1& __x, const _Tp2& __y) const + { return __x + __y; } + }; - // Partial specialization for one type. Same as std::plus. - template<typename _Tp> - struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp> - { - typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result; +// Partial specialization for one type. Same as std::plus. +template<typename _Tp> + struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp> + { + typedef typeof(*static_cast<_Tp*>(NULL) + + *static_cast<_Tp*>(NULL)) result; - result - operator()(const _Tp& __x, const _Tp& __y) const - { return __x + __y; } - }; + result + operator()(const _Tp& __x, const _Tp& __y) const + { return __x + __y; } + }; - /** @brief Similar to std::multiplies, but allows two different types. */ - template<typename _Tp1, typename _Tp2> - struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1> - { - typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result; +/** @brief Similar to std::multiplies, but allows two different types. */ +template<typename _Tp1, typename _Tp2> + struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1> + { + typedef typeof(*static_cast<_Tp1*>(NULL) + * *static_cast<_Tp2*>(NULL)) result; - result - operator()(const _Tp1& __x, const _Tp2& __y) const - { return __x * __y; } - }; + result + operator()(const _Tp1& __x, const _Tp2& __y) const + { return __x * __y; } + }; - // Partial specialization for one type. Same as std::multiplies. - template<typename _Tp> - struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp> - { - typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result; +// Partial specialization for one type. Same as std::multiplies. +template<typename _Tp> + struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp> + { + typedef typeof(*static_cast<_Tp*>(NULL) + * *static_cast<_Tp*>(NULL)) result; - result - operator()(const _Tp& __x, const _Tp& __y) const - { return __x * __y; } - }; + result + operator()(const _Tp& __x, const _Tp& __y) const + { return __x * __y; } + }; - template<typename T, typename _DifferenceTp> +template<typename T, typename _DifferenceTp> class pseudo_sequence; - /** @brief Iterator associated with __gnu_parallel::pseudo_sequence. - * If features the usual random-access iterator functionality. - * @param T Sequence value type. - * @param difference_type Sequence difference type. - */ - template<typename T, typename _DifferenceTp> +/** @brief Iterator associated with __gnu_parallel::pseudo_sequence. + * If features the usual random-access iterator functionality. + * @param T Sequence value type. + * @param difference_type Sequence difference type. + */ +template<typename T, typename _DifferenceTp> class pseudo_sequence_iterator { public: @@ -296,34 +326,34 @@ namespace __gnu_parallel operator++(int) { return type(pos++); } - const T& + const T& operator*() const { return val; } - const T& + const T& operator[](difference_type) const { return val; } - bool + bool operator==(const type& i2) { return pos == i2.pos; } - difference_type + difference_type operator!=(const type& i2) { return pos != i2.pos; } - difference_type + difference_type operator-(const type& i2) { return pos - i2.pos; } }; - /** @brief Sequence that conceptually consists of multiple copies of - the same element. - * The copies are not stored explicitly, of course. - * @param T Sequence value type. - * @param difference_type Sequence difference type. - */ - template<typename T, typename _DifferenceTp> +/** @brief Sequence that conceptually consists of multiple copies of + the same element. + * The copies are not stored explicitly, of course. + * @param T Sequence value type. + * @param difference_type Sequence difference type. + */ +template<typename T, typename _DifferenceTp> class pseudo_sequence { typedef pseudo_sequence<T, _DifferenceTp> type; @@ -335,10 +365,10 @@ namespace __gnu_parallel typedef pseudo_sequence_iterator<T, uint64> iterator; /** @brief Constructor. - * @param val Element of the sequence. - * @param count Number of (virtual) copies. - */ - pseudo_sequence(const T& val, difference_type count) + * @param val Element of the sequence. + * @param count Number of (virtual) copies. + */ + pseudo_sequence(const T& val, difference_type count) : val(val), count(count) { } /** @brief Begin iterator. */ @@ -356,67 +386,66 @@ namespace __gnu_parallel difference_type count; }; - /** @brief Functor that does nothing */ - template<typename _ValueTp> +/** @brief Functor that does nothing */ +template<typename _ValueTp> class void_functor { - inline void + inline void operator()(const _ValueTp& v) const { } }; - /** @brief Compute the median of three referenced elements, - according to @c comp. - * @param a First iterator. - * @param b Second iterator. - * @param c Third iterator. - * @param comp Comparator. - */ - template<typename RandomAccessIterator, typename Comparator> - RandomAccessIterator - median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b, - RandomAccessIterator c, Comparator& comp) +/** @brief Compute the median of three referenced elements, + according to @c comp. + * @param a First iterator. + * @param b Second iterator. + * @param c Third iterator. + * @param comp Comparator. + */ +template<typename RandomAccessIterator, typename Comparator> +RandomAccessIterator + median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b, + RandomAccessIterator c, Comparator& comp) { if (comp(*a, *b)) if (comp(*b, *c)) - return b; + return b; else - if (comp(*a, *c)) - return c; - else - return a; + if (comp(*a, *c)) + return c; + else + return a; else { - // Just swap a and b. - if (comp(*a, *c)) - return a; - else - if (comp(*b, *c)) - return c; - else - return b; + // Just swap a and b. + if (comp(*a, *c)) + return a; + else + if (comp(*b, *c)) + return c; + else + return b; } } - // Avoid the use of assert, because we're trying to keep the <cassert> - // include out of the mix. (Same as debug mode). - inline void - __replacement_assert(const char* __file, int __line, - const char* __function, const char* __condition) - { - std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line, - __function, __condition); - __builtin_abort(); - } - +// Avoid the use of assert, because we're trying to keep the <cassert> +// include out of the mix. (Same as debug mode). +inline void +__replacement_assert(const char* __file, int __line, + const char* __function, const char* __condition) +{ + std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line, + __function, __condition); + __builtin_abort(); +} + #define _GLIBCXX_PARALLEL_ASSERT(_Condition) \ - do \ - { \ - if (!(_Condition)) \ - __gnu_parallel::__replacement_assert(__FILE__, __LINE__, \ - __PRETTY_FUNCTION__, #_Condition); \ - } while (false) - +do \ + { \ + if (!(_Condition)) \ + __gnu_parallel::__replacement_assert(__FILE__, __LINE__, \ + __PRETTY_FUNCTION__, #_Condition); \ + } while (false) + } //namespace __gnu_parallel #endif - diff --git a/libstdc++-v3/include/parallel/compiletime_settings.h b/libstdc++-v3/include/parallel/compiletime_settings.h index 717bda56f6f..edaea3856ad 100644 --- a/libstdc++-v3/include/parallel/compiletime_settings.h +++ b/libstdc++-v3/include/parallel/compiletime_settings.h @@ -39,7 +39,7 @@ #include <cstdio> /** @brief Determine verbosity level of the parallel mode. - * Level 1 prints a message each time when entering a parallel-mode function. */ + * Level 1 prints a message each time a parallel-mode function is entered. */ #define _GLIBCXX_VERBOSE_LEVEL 0 /** @def _GLIBCXX_CALL @@ -50,27 +50,40 @@ #define _GLIBCXX_CALL(n) #endif #if (_GLIBCXX_VERBOSE_LEVEL == 1) -#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads()); +#define _GLIBCXX_CALL(n) \ + printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \ + __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads()); #endif +#ifndef _GLIBCXX_SCALE_DOWN_FPU /** @brief Use floating-point scaling instead of modulo for mapping * random numbers to a range. This can be faster on certain CPUs. */ #define _GLIBCXX_SCALE_DOWN_FPU 0 +#endif +#ifndef _GLIBCXX_ASSERTIONS /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code. * Should be switched on only locally. */ #define _GLIBCXX_ASSERTIONS 0 +#endif +#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code. - * Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */ + * Consider the size of the L1 cache for + * __gnu_parallel::parallel_random_shuffle(). */ #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0 +#endif +#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code. - * Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */ + * Consider the size of the TLB for + * __gnu_parallel::parallel_random_shuffle(). */ #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0 +#endif +#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST /** @brief First copy the data, sort it locally, and merge it back * (0); or copy it back after everything is done (1). * * Recommendation: 0 */ #define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0 - +#endif diff --git a/libstdc++-v3/include/parallel/equally_split.h b/libstdc++-v3/include/parallel/equally_split.h index 730875d0d52..4c4167e3d25 100644 --- a/libstdc++-v3/include/parallel/equally_split.h +++ b/libstdc++-v3/include/parallel/equally_split.h @@ -39,30 +39,58 @@ namespace __gnu_parallel { - /** @brief Function to split a sequence into parts of almost equal size. - * - * The resulting sequence s of length p+1 contains the splitting - * positions when splitting the range [0,n) into parts of almost - * equal size (plus minus 1). The first entry is 0, the last one - * n. There may result empty parts. - * @param n Number of elements - * @param p Number of parts - * @param s Splitters - * @returns End of splitter sequence, i. e. @c s+p+1 */ - template<typename _DifferenceTp, typename OutputIterator> +/** @brief Function to split a sequence into parts of almost equal size. + * + * The resulting sequence s of length num_threads+1 contains the splitting + * positions when splitting the range [0,n) into parts of almost + * equal size (plus minus 1). The first entry is 0, the last one + * n. There may result empty parts. + * @param n Number of elements + * @param num_threads Number of parts + * @param s Splitters + * @returns End of splitter sequence, i. e. @c s+num_threads+1 */ +template<typename difference_type, typename OutputIterator> OutputIterator - equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s) + equally_split(difference_type n, + thread_index_t num_threads, + OutputIterator s) { - typedef _DifferenceTp difference_type; - difference_type chunk_length = n / p, split = n % p, start = 0; - for (int i = 0; i < p; i++) + difference_type chunk_length = n / num_threads, + num_longer_chunks = n % num_threads, + pos = 0; + for (thread_index_t i = 0; i < num_threads; ++i) { - *s++ = start; - start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length; + *s++ = pos; + pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length; } *s++ = n; return s; } + + +/** @brief Function to split a sequence into parts of almost equal size. + * + * Returns the position of the splitting point between + * thread number thread_no (included) and + * thread number thread_no+1 (excluded). + * @param n Number of elements + * @param num_threads Number of parts + * @returns Splitting point */ +template<typename difference_type> + difference_type + equally_split_point(difference_type n, + thread_index_t num_threads, + thread_index_t thread_no) + { + difference_type chunk_length = n / num_threads, + num_longer_chunks = n % num_threads; + + if(thread_no < num_longer_chunks) + return thread_no * (chunk_length + 1); + else + return num_longer_chunks * (chunk_length + 1) + + (thread_no - num_longer_chunks) * chunk_length; + } } #endif diff --git a/libstdc++-v3/include/parallel/features.h b/libstdc++-v3/include/parallel/features.h index c67ffb550e9..a78c16b8886 100644 --- a/libstdc++-v3/include/parallel/features.h +++ b/libstdc++-v3/include/parallel/features.h @@ -66,7 +66,7 @@ * @brief Include guarded (sequences may run empty) loser tree, * moving objects. * @see __gnu_parallel::Settings multiway_merge_algorithm */ -#define _GLIBCXX_LOSER_TREE 0 +#define _GLIBCXX_LOSER_TREE 1 #endif #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT diff --git a/libstdc++-v3/include/parallel/find.h b/libstdc++-v3/include/parallel/find.h index bc89fa570e5..2a5b22c629f 100644 --- a/libstdc++-v3/include/parallel/find.h +++ b/libstdc++-v3/include/parallel/find.h @@ -10,7 +10,7 @@ // This library is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURstartE. See the GNU // General Public License for more details. // You should have received a copy of the GNU General Public License @@ -48,50 +48,66 @@ namespace __gnu_parallel { - /** - * @brief Parallel std::find, switch for different algorithms. - * @param begin1 Begin iterator of first sequence. - * @param end1 End iterator of first sequence. - * @param begin2 Begin iterator of second sequence. Must have same - * length as first sequence. - * @param pred Find predicate. - * @param selector Functionality (e. g. std::find_if (), std::equal(),...) - * @return Place of finding in both sequences. - */ - template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> +/** + * @brief Parallel std::find, switch for different algorithms. + * @param begin1 Begin iterator of first sequence. + * @param end1 End iterator of first sequence. + * @param begin2 Begin iterator of second sequence. Must have same + * length as first sequence. + * @param pred Find predicate. + * @param selector Functionality (e. g. std::find_if (), std::equal(),...) + * @return Place of finding in both sequences. + */ +template< + typename RandomAccessIterator1, + typename RandomAccessIterator2, + typename Pred, + typename Selector> std::pair<RandomAccessIterator1, RandomAccessIterator2> find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, - RandomAccessIterator2 begin2, Pred pred, Selector selector) + RandomAccessIterator2 begin2, Pred pred, Selector selector) { switch (Settings::find_distribution) { case Settings::GROWING_BLOCKS: - return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag()); + return find_template(begin1, end1, begin2, pred, selector, + growing_blocks_tag()); case Settings::CONSTANT_SIZE_BLOCKS: - return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag()); + return find_template(begin1, end1, begin2, pred, selector, + constant_size_blocks_tag()); case Settings::EQUAL_SPLIT: - return find_template(begin1, end1, begin2, pred, selector, equal_split_tag()); + return find_template(begin1, end1, begin2, pred, selector, + equal_split_tag()); default: - _GLIBCXX_PARALLEL_ASSERT(false); - return std::make_pair(begin1, begin2); + _GLIBCXX_PARALLEL_ASSERT(false); + return std::make_pair(begin1, begin2); } } #if _GLIBCXX_FIND_EQUAL_SPLIT - /** - * @brief Parallel std::find, equal splitting variant. - * @param begin1 Begin iterator of first sequence. - * @param end1 End iterator of first sequence. - * @param begin2 Begin iterator of second sequence. Second sequence - * must have same length as first sequence. - * @param pred Find predicate. - * @param selector Functionality (e. g. std::find_if (), std::equal(),...) - * @return Place of finding in both sequences. - */ - template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> +/** + * @brief Parallel std::find, equal splitting variant. + * @param begin1 Begin iterator of first sequence. + * @param end1 End iterator of first sequence. + * @param begin2 Begin iterator of second sequence. Second sequence + * must have same length as first sequence. + * @param pred Find predicate. + * @param selector Functionality (e. g. std::find_if (), std::equal(),...) + * @return Place of finding in both sequences. + */ +template< + typename RandomAccessIterator1, + typename RandomAccessIterator2, + typename Pred, + typename Selector> std::pair<RandomAccessIterator1, RandomAccessIterator2> - find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag) + find_template(RandomAccessIterator1 begin1, + RandomAccessIterator1 end1, + RandomAccessIterator2 begin2, + Pred pred, + Selector selector, + equal_split_tag) { _GLIBCXX_CALL(end1 - begin1) @@ -100,79 +116,89 @@ namespace __gnu_parallel typedef typename traits_type::value_type value_type; difference_type length = end1 - begin1; - difference_type result = length; + difference_type* borders; - const thread_index_t num_threads = get_max_threads(); omp_lock_t result_lock; omp_init_lock(&result_lock); - difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1))); - - equally_split(length, num_threads, borders); - -#pragma omp parallel shared(result) num_threads(num_threads) - { - int iam = omp_get_thread_num(); - difference_type pos = borders[iam], limit = borders[iam + 1]; - - RandomAccessIterator1 i1 = begin1 + pos; - RandomAccessIterator2 i2 = begin2 + pos; - for (; pos < limit; pos++) - { -#pragma omp flush(result) - // Result has been set to something lower. - if (result < pos) - break; - - if (selector(i1, i2, pred)) - { - omp_set_lock(&result_lock); - if (result > pos) - result = pos; - omp_unset_lock(&result_lock); + thread_index_t num_threads = get_max_threads(); +# pragma omp parallel num_threads(num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + borders = new difference_type[num_threads + 1]; + equally_split(length, num_threads, borders); + } //single + + thread_index_t iam = omp_get_thread_num(); + difference_type start = borders[iam], stop = borders[iam + 1]; + + RandomAccessIterator1 i1 = begin1 + start; + RandomAccessIterator2 i2 = begin2 + start; + for (difference_type pos = start; pos < stop; ++pos) + { + #pragma omp flush(result) + // Result has been set to something lower. + if (result < pos) break; - } - i1++; - i2++; - } - } + + if (selector(i1, i2, pred)) + { + omp_set_lock(&result_lock); + if (pos < result) + result = pos; + omp_unset_lock(&result_lock); + break; + } + ++i1; + ++i2; + } + } //parallel omp_destroy_lock(&result_lock); - return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); + delete[] borders; + + return std::pair<RandomAccessIterator1, RandomAccessIterator2>( + begin1 + result, begin2 + result); } #endif #if _GLIBCXX_FIND_GROWING_BLOCKS - /** - * @brief Parallel std::find, growing block size variant. - * @param begin1 Begin iterator of first sequence. - * @param end1 End iterator of first sequence. - * @param begin2 Begin iterator of second sequence. Second sequence - * must have same length as first sequence. - * @param pred Find predicate. - * @param selector Functionality (e. g. std::find_if (), std::equal(),...) - * @return Place of finding in both sequences. - * @see __gnu_parallel::Settings::find_sequential_search_size - * @see __gnu_parallel::Settings::find_initial_block_size - * @see __gnu_parallel::Settings::find_maximum_block_size - * @see __gnu_parallel::Settings::find_increasing_factor - * - * There are two main differences between the growing blocks and - * the constant-size blocks variants. - * 1. For GB, the block size grows; for CSB, the block size is fixed. - - * 2. For GB, the blocks are allocated dynamically; - * for CSB, the blocks are allocated in a predetermined manner, - * namely spacial round-robin. - */ - template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> +/** + * @brief Parallel std::find, growing block size variant. + * @param begin1 Begin iterator of first sequence. + * @param end1 End iterator of first sequence. + * @param begin2 Begin iterator of second sequence. Second sequence + * must have same length as first sequence. + * @param pred Find predicate. + * @param selector Functionality (e. g. std::find_if (), std::equal(),...) + * @return Place of finding in both sequences. + * @see __gnu_parallel::Settings::find_sequential_search_size + * @see __gnu_parallel::Settings::find_initial_block_size + * @see __gnu_parallel::Settings::find_maximum_block_size + * @see __gnu_parallel::Settings::find_increasing_factor + * + * There are two main differences between the growing blocks and + * the constant-size blocks variants. + * 1. For GB, the block size grows; for CSB, the block size is fixed. + + * 2. For GB, the blocks are allocated dynamically; + * for CSB, the blocks are allocated in a predetermined manner, + * namely spacial round-robin. + */ +template< + typename RandomAccessIterator1, + typename RandomAccessIterator2, + typename Pred, + typename Selector> std::pair<RandomAccessIterator1, RandomAccessIterator2> find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, - RandomAccessIterator2 begin2, Pred pred, Selector selector, - growing_blocks_tag) + RandomAccessIterator2 begin2, Pred pred, Selector selector, + growing_blocks_tag) { _GLIBCXX_CALL(end1 - begin1) @@ -182,101 +208,118 @@ namespace __gnu_parallel difference_type length = end1 - begin1; - difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size); + difference_type sequential_search_size = std::min<difference_type>( + length, Settings::find_sequential_search_size); // Try it sequentially first. std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result = - selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred); + selector.sequential_algorithm( + begin1, begin1 + sequential_search_size, begin2, pred); if (find_seq_result.first != (begin1 + sequential_search_size)) return find_seq_result; // Index of beginning of next free block (after sequential find). - difference_type next_block_pos = sequential_search_size; + difference_type next_block_start = sequential_search_size; difference_type result = length; - const thread_index_t num_threads = get_max_threads(); omp_lock_t result_lock; omp_init_lock(&result_lock); -#pragma omp parallel shared(result) num_threads(num_threads) - { - // Not within first k elements -> start parallel. - thread_index_t iam = omp_get_thread_num(); - - difference_type block_size = Settings::find_initial_block_size; - difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size); - - // Get new block, update pointer to next block. - difference_type stop = std::min<difference_type>(length, start + block_size); - - std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; - - while (start < length) - { -#pragma omp flush(result) - // Get new value of result. - if (result < start) - { - // No chance to find first element. - break; - } - - local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred); - if (local_result.first != (begin1 + stop)) - { - omp_set_lock(&result_lock); - if ((local_result.first - begin1) < result) - { - result = local_result.first - begin1; - - // Result cannot be in future blocks, stop algorithm. - fetch_and_add<difference_type>(&next_block_pos, length); - } - omp_unset_lock(&result_lock); - } - - block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size); - - // Get new block, update pointer to next block. - start = fetch_and_add<difference_type>(&next_block_pos, block_size); - stop = (length < (start + block_size)) ? length : (start + block_size); - } - } + thread_index_t num_threads = get_max_threads(); +# pragma omp parallel shared(result) num_threads(num_threads) + { +# pragma omp single + num_threads = omp_get_num_threads(); + + // Not within first k elements -> start parallel. + thread_index_t iam = omp_get_thread_num(); + + difference_type block_size = Settings::find_initial_block_size; + difference_type start = + fetch_and_add<difference_type>(&next_block_start, block_size); + + // Get new block, update pointer to next block. + difference_type stop = + std::min<difference_type>(length, start + block_size); + + std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; + + while (start < length) + { +# pragma omp flush(result) + // Get new value of result. + if (result < start) + { + // No chance to find first element. + break; + } + + local_result = selector.sequential_algorithm( + begin1 + start, begin1 + stop, begin2 + start, pred); + if (local_result.first != (begin1 + stop)) + { + omp_set_lock(&result_lock); + if ((local_result.first - begin1) < result) + { + result = local_result.first - begin1; + + // Result cannot be in future blocks, stop algorithm. + fetch_and_add<difference_type>(&next_block_start, length); + } + omp_unset_lock(&result_lock); + } + + block_size = std::min<difference_type>( + block_size * Settings::find_increasing_factor, + Settings::find_maximum_block_size); + + // Get new block, update pointer to next block. + start = + fetch_and_add<difference_type>(&next_block_start, block_size); + stop = (length < (start + block_size)) ? + length : (start + block_size); + } + } //parallel omp_destroy_lock(&result_lock); // Return iterator on found element. - return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); + return std::pair<RandomAccessIterator1, RandomAccessIterator2>( + begin1 + result, begin2 + result); } #endif #if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS - /** - * @brief Parallel std::find, constant block size variant. - * @param begin1 Begin iterator of first sequence. - * @param end1 End iterator of first sequence. - * @param begin2 Begin iterator of second sequence. Second sequence - * must have same length as first sequence. - * @param pred Find predicate. - * @param selector Functionality (e. g. std::find_if (), std::equal(),...) - * @return Place of finding in both sequences. - * @see __gnu_parallel::Settings::find_sequential_search_size - * @see __gnu_parallel::Settings::find_block_size - * There are two main differences between the growing blocks and the - * constant-size blocks variants. - * 1. For GB, the block size grows; for CSB, the block size is fixed. - * 2. For GB, the blocks are allocated dynamically; for CSB, the - * blocks are allocated in a predetermined manner, namely spacial - * round-robin. - */ - template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> +/** + * @brief Parallel std::find, constant block size variant. + * @param begin1 Begin iterator of first sequence. + * @param end1 End iterator of first sequence. + * @param begin2 Begin iterator of second sequence. Second sequence + * must have same length as first sequence. + * @param pred Find predicate. + * @param selector Functionality (e. g. std::find_if (), std::equal(),...) + * @return Place of finding in both sequences. + * @see __gnu_parallel::Settings::find_sequential_search_size + * @see __gnu_parallel::Settings::find_block_size + * There are two main differences between the growing blocks and the + * constant-size blocks variants. + * 1. For GB, the block size grows; for CSB, the block size is fixed. + * 2. For GB, the blocks are allocated dynamically; for CSB, the + * blocks are allocated in a predetermined manner, namely spacial + * round-robin. + */ +template< + typename RandomAccessIterator1, + typename RandomAccessIterator2, + typename Pred, + typename Selector> std::pair<RandomAccessIterator1, RandomAccessIterator2> find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, - RandomAccessIterator2 begin2, Pred pred, Selector selector, - constant_size_blocks_tag) + RandomAccessIterator2 begin2, Pred pred, Selector selector, + constant_size_blocks_tag) { _GLIBCXX_CALL(end1 - begin1) typedef std::iterator_traits<RandomAccessIterator1> traits_type; @@ -285,72 +328,77 @@ namespace __gnu_parallel difference_type length = end1 - begin1; - difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size); + difference_type sequential_search_size = std::min<difference_type>( + length, Settings::find_sequential_search_size); // Try it sequentially first. std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result = - selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred); + selector.sequential_algorithm(begin1, begin1 + sequential_search_size, + begin2, pred); if (find_seq_result.first != (begin1 + sequential_search_size)) return find_seq_result; difference_type result = length; - const thread_index_t num_threads = get_max_threads(); - omp_lock_t result_lock; omp_init_lock(&result_lock); // Not within first sequential_search_size elements -> start parallel. -#pragma omp parallel shared(result) num_threads(num_threads) - { - thread_index_t iam = omp_get_thread_num(); - difference_type block_size = Settings::find_initial_block_size; - - difference_type start, stop; - - // First element of thread's current iteration. - difference_type iteration_start = sequential_search_size; - - // Where to work (initialization). - start = iteration_start + iam * block_size; - stop = std::min<difference_type>(length, start + block_size); - - std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; - - while (start < length) - { - // Get new value of result. -#pragma omp flush(result) - // No chance to find first element. - if (result < start) - break; - - local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred); - if (local_result.first != (begin1 + stop)) - { - omp_set_lock(&result_lock); - if ((local_result.first - begin1) < result) - result = local_result.first - begin1; - omp_unset_lock(&result_lock); - // Will not find better value in its interval. - break; - } - - iteration_start += num_threads * block_size; - - // Where to work. - start = iteration_start + iam * block_size; - stop = std::min<difference_type>(length, start + block_size); - } - } + + thread_index_t num_threads = get_max_threads(); +# pragma omp parallel shared(result) num_threads(num_threads) + { +# pragma omp single + num_threads = omp_get_num_threads(); + + thread_index_t iam = omp_get_thread_num(); + difference_type block_size = Settings::find_initial_block_size; + + // First element of thread's current iteration. + difference_type iteration_start = sequential_search_size; + + // Where to work (initialization). + difference_type start = iteration_start + iam * block_size; + difference_type stop = + std::min<difference_type>(length, start + block_size); + + std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; + + while (start < length) + { + // Get new value of result. +# pragma omp flush(result) + // No chance to find first element. + if (result < start) + break; + local_result = selector.sequential_algorithm( + begin1 + start, begin1 + stop, + begin2 + start, pred); + if (local_result.first != (begin1 + stop)) + { + omp_set_lock(&result_lock); + if ((local_result.first - begin1) < result) + result = local_result.first - begin1; + omp_unset_lock(&result_lock); + // Will not find better value in its interval. + break; + } + + iteration_start += num_threads * block_size; + + // Where to work. + start = iteration_start + iam * block_size; + stop = std::min<difference_type>(length, start + block_size); + } + } //parallel omp_destroy_lock(&result_lock); // Return iterator on found element. - return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); + return std::pair<RandomAccessIterator1, RandomAccessIterator2>( + begin1 + result, begin2 + result); } #endif } // end namespace #endif - diff --git a/libstdc++-v3/include/parallel/losertree.h b/libstdc++-v3/include/parallel/losertree.h index 5c7a8085e20..7b8b654baa1 100644 --- a/libstdc++-v3/include/parallel/losertree.h +++ b/libstdc++-v3/include/parallel/losertree.h @@ -29,9 +29,9 @@ // Public License. /** @file parallel/losertree.h - * @brief Many generic loser tree variants. - * This file is a GNU parallel extension to the Standard C++ Library. - */ +* @brief Many generic loser tree variants. +* This file is a GNU parallel extension to the Standard C++ Library. +*/ // Written by Johannes Singler. @@ -49,13 +49,13 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE_EXPLICIT - /** @brief Guarded loser tree, copying the whole element into the - * tree structure. - * - * Guarding is done explicitly through two flags per element, inf - * and sup This is a quite slow variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Guarded loser tree, copying the whole element into the +* tree structure. +* +* Guarding is done explicitly through two flags per element, inf +* and sup This is a quite slow variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTreeExplicit { private: @@ -76,26 +76,25 @@ namespace __gnu_parallel Comparator comp; public: - inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp) + inline + LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) + : comp(_comp) { size = _size; offset = size; losers = new Loser[size]; for (unsigned int l = 0; l < size; l++) - { - //losers[l].key = ... stays unset - losers[l].inf = true; - losers[l].sup = false; - //losers[l].source = -1; //sentinel - } + { + //losers[l].key = ... stays unset + losers[l].inf = true; + losers[l].sup = false; + //losers[l].source = -1; //sentinel + } } inline ~LoserTreeExplicit() { delete[] losers; } - inline void - print() { } - inline int get_min_source() { return losers[0].source; } @@ -105,16 +104,17 @@ namespace __gnu_parallel { bool inf = false; for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) - { - if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup) - { - // The other one is smaller. - std::swap(losers[pos].key, key); - std::swap(losers[pos].inf, inf); - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - } - } + { + if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup + && comp(losers[pos].key, key)) || losers[pos].inf || sup) + { + // The other one is smaller. + std::swap(losers[pos].key, key); + std::swap(losers[pos].inf, inf); + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + } + } losers[0].key = key; losers[0].inf = inf; @@ -131,19 +131,19 @@ namespace __gnu_parallel bool inf = false; int source = losers[0].source; for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup - && comp(losers[pos].key, key)) - || losers[pos].inf || sup) - { - // The other one is smaller. - std::swap(losers[pos].key, key); - std::swap(losers[pos].inf, inf); - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - } - } + { + // The smaller one gets promoted. + if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup + && comp(losers[pos].key, key)) + || losers[pos].inf || sup) + { + // The other one is smaller. + std::swap(losers[pos].key, key); + std::swap(losers[pos].inf, inf); + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + } + } losers[0].key = key; losers[0].inf = inf; @@ -156,19 +156,19 @@ namespace __gnu_parallel { bool inf = false; for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) - { - if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && - ((comp(losers[pos].key, key)) || - (!comp(key, losers[pos].key) && losers[pos].source < source))) - || losers[pos].inf || sup) - { - // Take next key. - std::swap(losers[pos].key, key); - std::swap(losers[pos].inf, inf); - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - } - } + { + if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && + ((comp(losers[pos].key, key)) || + (!comp(key, losers[pos].key) && losers[pos].source < source))) + || losers[pos].inf || sup) + { + // Take next key. + std::swap(losers[pos].key, key); + std::swap(losers[pos].inf, inf); + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + } + } losers[0].key = key; losers[0].inf = inf; @@ -185,18 +185,18 @@ namespace __gnu_parallel bool inf = false; int source = losers[0].source; for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) - { - if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup - && ((comp(losers[pos].key, key)) || - (!comp(key, losers[pos].key) && losers[pos].source < source))) - || losers[pos].inf || sup) - { - std::swap(losers[pos].key, key); - std::swap(losers[pos].inf, inf); - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - } - } + { + if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup + && ((comp(losers[pos].key, key)) || + (!comp(key, losers[pos].key) && losers[pos].source < source))) + || losers[pos].inf || sup) + { + std::swap(losers[pos].key, key); + std::swap(losers[pos].inf, inf); + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + } + } losers[0].key = key; losers[0].inf = inf; @@ -209,14 +209,14 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE - /** @brief Guarded loser tree, either copying the whole element into - * the tree structure, or looking up the element via the index. - * - * Guarding is done explicitly through one flag sup per element, - * inf is not needed due to a better initialization routine. This - * is a well-performing variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Guarded loser tree, either copying the whole element into +* the tree structure, or looking up the element via the index. +* +* Guarding is done explicitly through one flag sup per element, +* inf is not needed due to a better initialization routine. This +* is a well-performing variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTree { private: @@ -240,22 +240,14 @@ namespace __gnu_parallel // Next greater power of 2. k = 1 << (log2(ik - 1) + 1); offset = k; - losers = new Loser[k * 2]; + losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser))); for (unsigned int i = ik - 1; i < k; i++) - losers[i + k].sup = true; + losers[i + k].sup = true; } inline ~LoserTree() { delete[] losers; } - void - print() - { - for (unsigned int i = 0; i < (k * 2); i++) - printf("%d %d from %d, %d\n", i, losers[i].key, - losers[i].source, losers[i].sup); - } - inline int get_min_source() { return losers[0].source; } @@ -267,33 +259,34 @@ namespace __gnu_parallel losers[pos].sup = sup; losers[pos].source = source; - losers[pos].key = key; + new(&(losers[pos].key)) T(key); } unsigned int init_winner (unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if (losers[right].sup || - (!losers[left].sup && !comp(losers[right].key, losers[left].key))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if (losers[right].sup || + (!losers[left].sup + && !comp(losers[right].key, losers[left].key))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -306,16 +299,16 @@ namespace __gnu_parallel { int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if (sup || (!losers[pos].sup && comp(losers[pos].key, key))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - std::swap(losers[pos].key, key); - } - } + { + // The smaller one gets promoted. + if (sup || (!losers[pos].sup && comp(losers[pos].key, key))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + std::swap(losers[pos].key, key); + } + } losers[0].sup = sup; losers[0].source = source; @@ -330,27 +323,28 @@ namespace __gnu_parallel init_winner_stable (unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if ( losers[right].sup || - (!losers[left].sup && !comp(losers[right].key, losers[left].key))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if (losers[right].sup + || (!losers[left].sup + && !comp(losers[right].key, losers[left].key))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -363,19 +357,20 @@ namespace __gnu_parallel { int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted, ties are broken by source. - if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || - (!sup && !losers[pos].sup && - ((comp(losers[pos].key, key)) || - (!comp(key, losers[pos].key) && losers[pos].source < source)))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - std::swap(losers[pos].key, key); - } - } + { + // The smaller one gets promoted, ties are broken by source. + if ( (sup && (!losers[pos].sup || losers[pos].source < source)) + || (!sup && !losers[pos].sup + && ((comp(losers[pos].key, key)) + || (!comp(key, losers[pos].key) + && losers[pos].source < source)))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + std::swap(losers[pos].key, key); + } + } losers[0].sup = sup; losers[0].source = source; @@ -387,14 +382,14 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE_REFERENCE - /** @brief Guarded loser tree, either copying the whole element into - * the tree structure, or looking up the element via the index. - * - * Guarding is done explicitly through one flag sup per element, - * inf is not needed due to a better initialization routine. This - * is a well-performing variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Guarded loser tree, either copying the whole element into +* the tree structure, or looking up the element via the index. +* +* Guarding is done explicitly through one flag sup per element, +* inf is not needed due to a better initialization routine. This +* is a well-performing variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTreeReference { #undef COPY @@ -423,7 +418,9 @@ namespace __gnu_parallel Comparator comp; public: - inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) + inline + LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) + : comp(_comp) { ik = _k; @@ -435,7 +432,7 @@ namespace __gnu_parallel keys = new T[ik]; #endif for (unsigned int i = ik - 1; i < k; i++) - losers[i + k].sup = true; + losers[i + k].sup = true; } inline ~LoserTreeReference() @@ -446,13 +443,6 @@ namespace __gnu_parallel #endif } - void - print() - { - for (unsigned int i = 0; i < (k * 2); i++) - printf("%d %d from %d, %d\n", i, KEY(i), losers[i].source, losers[i].sup); - } - inline int get_min_source() { return losers[0].source; } @@ -471,27 +461,27 @@ namespace __gnu_parallel init_winner(unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if ( losers[right].sup || - (!losers[left].sup && !comp(KEY(right), KEY(left)))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if ( losers[right].sup || + (!losers[left].sup && !comp(KEY(right), KEY(left)))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -505,18 +495,18 @@ namespace __gnu_parallel { int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source)))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); + { + // The smaller one gets promoted. + if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source)))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); #ifdef COPY - std::swap(KEY(pos), KEY_SOURCE(source)); + std::swap(KEY(pos), KEY_SOURCE(source)); #endif - } - } + } + } losers[0].sup = sup; losers[0].source = source; @@ -533,27 +523,27 @@ namespace __gnu_parallel init_winner_stable(unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if (losers[right].sup - || (!losers[left].sup && !comp(KEY(right), KEY(left)))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if (losers[right].sup + || (!losers[left].sup && !comp(KEY(right), KEY(left)))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -565,21 +555,22 @@ namespace __gnu_parallel { int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted, ties are broken by source. - if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || - (!sup && !losers[pos].sup && - ((comp(KEY(pos), KEY_SOURCE(source))) || - (!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source)))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); + { + // The smaller one gets promoted, ties are broken by source. + if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || + (!sup && !losers[pos].sup && + ((comp(KEY(pos), KEY_SOURCE(source))) || + (!comp(KEY_SOURCE(source), KEY(pos)) + && losers[pos].source < source)))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); #ifdef COPY - std::swap(KEY(pos), KEY_SOURCE(source)); + std::swap(KEY(pos), KEY_SOURCE(source)); #endif - } - } + } + } losers[0].sup = sup; losers[0].source = source; @@ -595,13 +586,13 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE_POINTER - /** @brief Guarded loser tree, either copying the whole element into - the tree structure, or looking up the element via the index. - * Guarding is done explicitly through one flag sup per element, - * inf is not needed due to a better initialization routine. - * This is a well-performing variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Guarded loser tree, either copying the whole element into + the tree structure, or looking up the element via the index. +* Guarding is done explicitly through one flag sup per element, +* inf is not needed due to a better initialization routine. +* This is a well-performing variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTreePointer { private: @@ -617,7 +608,9 @@ namespace __gnu_parallel Comparator comp; public: - inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) + inline + LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) + : comp(_comp) { ik = _k; @@ -626,19 +619,12 @@ namespace __gnu_parallel offset = k; losers = new Loser[k * 2]; for (unsigned int i = ik - 1; i < k; i++) - losers[i + k].sup = true; + losers[i + k].sup = true; } inline ~LoserTreePointer() { delete[] losers; } - void - print() - { - for (unsigned int i = 0; i < (k * 2); i++) - printf("%d %d from %d, %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup); - } - inline int get_min_source() { return losers[0].source; } @@ -657,49 +643,50 @@ namespace __gnu_parallel init_winner(unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if ( losers[right].sup || - (!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if (losers[right].sup + || (!losers[left].sup + && !comp(*losers[right].keyp, *losers[left].keyp))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void init() { losers[0] = losers[init_winner(1)]; } - inline void + inline void delete_min_insert(const T& key, bool sup) { const T* keyp = &key; int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - std::swap(losers[pos].keyp, keyp); - } - } + { + // The smaller one gets promoted. + if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + std::swap(losers[pos].keyp, keyp); + } + } losers[0].sup = sup; losers[0].source = source; @@ -714,28 +701,28 @@ namespace __gnu_parallel init_winner_stable(unsigned int root) { if (root >= k) - { - return root; - } + { + return root; + } else - { - unsigned int left = init_winner (2 * root); - unsigned int right = init_winner (2 * root + 1); - if (losers[right].sup - || (!losers[left].sup && !comp(*losers[right].keyp, - *losers[left].keyp))) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + unsigned int left = init_winner (2 * root); + unsigned int right = init_winner (2 * root + 1); + if (losers[right].sup + || (!losers[left].sup && !comp(*losers[right].keyp, + *losers[left].keyp))) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -748,20 +735,20 @@ namespace __gnu_parallel const T* keyp = &key; int source = losers[0].source; for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted, ties are broken by source. - if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || - (!sup && !losers[pos].sup && - ((comp(*losers[pos].keyp, *keyp)) || - (!comp(*keyp, *losers[pos].keyp) - && losers[pos].source < source)))) - { - // The other one is smaller. - std::swap(losers[pos].sup, sup); - std::swap(losers[pos].source, source); - std::swap(losers[pos].keyp, keyp); - } - } + { + // The smaller one gets promoted, ties are broken by source. + if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || + (!sup && !losers[pos].sup && + ((comp(*losers[pos].keyp, *keyp)) || + (!comp(*keyp, *losers[pos].keyp) + && losers[pos].source < source)))) + { + // The other one is smaller. + std::swap(losers[pos].sup, sup); + std::swap(losers[pos].source, source); + std::swap(losers[pos].keyp, keyp); + } + } losers[0].sup = sup; losers[0].source = source; @@ -773,13 +760,13 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE_UNGUARDED - /** @brief Unguarded loser tree, copying the whole element into the - * tree structure. - * - * No guarding is done, therefore not a single input sequence must - * run empty. This is a very fast variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Unguarded loser tree, copying the whole element into the +* tree structure. +* +* No guarding is done, therefore not a single input sequence must +* run empty. This is a very fast variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTreeUnguarded { private: @@ -798,18 +785,20 @@ namespace __gnu_parallel map(unsigned int root, unsigned int begin, unsigned int end) { if (begin + 1 == end) - mapping[begin] = root; + mapping[begin] = root; else - { - // Next greater or equal power of 2. - unsigned int left = 1 << (log2(end - begin - 1)); - map(root * 2, begin, begin + left); - map(root * 2 + 1, begin + left, end); - } + { + // Next greater or equal power of 2. + unsigned int left = 1 << (log2(end - begin - 1)); + map(root * 2, begin, begin + left); + map(root * 2 + 1, begin + left, end); + } } public: - inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) + inline + LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) + : comp(_comp) { ik = _k; // Next greater or equal power of 2. @@ -826,13 +815,6 @@ namespace __gnu_parallel delete[] mapping; } - void - print() - { - for (unsigned int i = 0; i < k + ik; i++) - printf("%d %d from %d\n", i, losers[i].key, losers[i].source); - } - inline int get_min_source() { return losers[0].source; } @@ -849,26 +831,27 @@ namespace __gnu_parallel init_winner(unsigned int root, unsigned int begin, unsigned int end) { if (begin + 1 == end) - return mapping[begin]; + return mapping[begin]; else - { - // Next greater or equal power of 2. - unsigned int division = 1 << (log2(end - begin - 1)); - unsigned int left = init_winner(2 * root, begin, begin + division); - unsigned int right = init_winner(2 * root + 1, begin + division, end); - if (!comp(losers[right].key, losers[left].key)) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + // Next greater or equal power of 2. + unsigned int division = 1 << (log2(end - begin - 1)); + unsigned int left = init_winner(2 * root, begin, begin + division); + unsigned int right = + init_winner(2 * root + 1, begin + division, end); + if (!comp(losers[right].key, losers[left].key)) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -883,15 +866,15 @@ namespace __gnu_parallel T& keyr = losers[0].key; int& source = losers[0].source; for (int pos = mapping[source] / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if (comp(losers[pos].key, keyr)) - { - // The other one is smaller. - std::swap(losers[pos].source, source); - std::swap(losers[pos].key, keyr); - } - } + { + // The smaller one gets promoted. + if (comp(losers[pos].key, keyr)) + { + // The other one is smaller. + std::swap(losers[pos].source, source); + std::swap(losers[pos].key, keyr); + } + } } inline void @@ -909,16 +892,17 @@ namespace __gnu_parallel T& keyr = losers[0].key; int& source = losers[0].source; for (int pos = mapping[source] / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted, ties are broken by source. - if (comp(losers[pos].key, keyr) - || (!comp(keyr, losers[pos].key) && losers[pos].source < source)) - { - // The other one is smaller. - std::swap(losers[pos].source, source); - std::swap(losers[pos].key, keyr); - } - } + { + // The smaller one gets promoted, ties are broken by source. + if (comp(losers[pos].key, keyr) + || (!comp(keyr, losers[pos].key) + && losers[pos].source < source)) + { + // The other one is smaller. + std::swap(losers[pos].source, source); + std::swap(losers[pos].key, keyr); + } + } } }; @@ -926,13 +910,13 @@ namespace __gnu_parallel #if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED - /** @brief Unguarded loser tree, keeping only pointers to the - * elements in the tree structure. - * - * No guarding is done, therefore not a single input sequence must - * run empty. This is a very fast variant. - */ - template<typename T, typename Comparator = std::less<T> > +/** @brief Unguarded loser tree, keeping only pointers to the +* elements in the tree structure. +* +* No guarding is done, therefore not a single input sequence must +* run empty. This is a very fast variant. +*/ +template<typename T, typename Comparator = std::less<T> > class LoserTreePointerUnguarded { private: @@ -950,18 +934,21 @@ namespace __gnu_parallel void map(unsigned int root, unsigned int begin, unsigned int end) { if (begin + 1 == end) - mapping[begin] = root; + mapping[begin] = root; else - { - // Next greater or equal power of 2. - unsigned int left = 1 << (log2(end - begin - 1)); - map(root * 2, begin, begin + left); - map(root * 2 + 1, begin + left, end); - } + { + // Next greater or equal power of 2. + unsigned int left = 1 << (log2(end - begin - 1)); + map(root * 2, begin, begin + left); + map(root * 2 + 1, begin + left, end); + } } public: - inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) + inline + LoserTreePointerUnguarded(unsigned int _k, + Comparator _comp = std::less<T>()) + : comp(_comp) { ik = _k; @@ -979,13 +966,6 @@ namespace __gnu_parallel delete[] mapping; } - void - print() - { - for (unsigned int i = 0; i < k + ik; i++) - printf("%d %d from %d\n", i, *losers[i].keyp, losers[i].source); - } - inline int get_min_source() { return losers[0].source; } @@ -1002,26 +982,27 @@ namespace __gnu_parallel init_winner(unsigned int root, unsigned int begin, unsigned int end) { if (begin + 1 == end) - return mapping[begin]; + return mapping[begin]; else - { - // Next greater or equal power of 2. - unsigned int division = 1 << (log2(end - begin - 1)); - unsigned int left = init_winner(2 * root, begin, begin + division); - unsigned int right = init_winner(2 * root + 1, begin + division, end); - if (!comp(*losers[right].keyp, *losers[left].keyp)) - { - // Left one is less or equal. - losers[root] = losers[right]; - return left; - } - else - { - // Right one is less. - losers[root] = losers[left]; - return right; - } - } + { + // Next greater or equal power of 2. + unsigned int division = 1 << (log2(end - begin - 1)); + unsigned int left = init_winner(2 * root, begin, begin + division); + unsigned int right + = init_winner(2 * root + 1, begin + division, end); + if (!comp(*losers[right].keyp, *losers[left].keyp)) + { + // Left one is less or equal. + losers[root] = losers[right]; + return left; + } + else + { + // Right one is less. + losers[root] = losers[left]; + return right; + } + } } inline void @@ -1036,15 +1017,15 @@ namespace __gnu_parallel const T* keyp = &key; int& source = losers[0].source; for (int pos = mapping[source] / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted. - if (comp(*losers[pos].keyp, *keyp)) - { - // The other one is smaller. - std::swap(losers[pos].source, source); - std::swap(losers[pos].keyp, keyp); - } - } + { + // The smaller one gets promoted. + if (comp(*losers[pos].keyp, *keyp)) + { + // The other one is smaller. + std::swap(losers[pos].source, source); + std::swap(losers[pos].keyp, keyp); + } + } losers[0].keyp = keyp; } @@ -1063,23 +1044,23 @@ namespace __gnu_parallel int& source = losers[0].source; const T* keyp = &key; for (int pos = mapping[source] / 2; pos > 0; pos /= 2) - { - // The smaller one gets promoted, ties are broken by source. - if (comp(*losers[pos].keyp, *keyp) - || (!comp(*keyp, *losers[pos].keyp) - && losers[pos].source < source)) - { - // The other one is smaller. - std::swap(losers[pos].source, source); - std::swap(losers[pos].keyp, keyp); - } - } + { + // The smaller one gets promoted, ties are broken by source. + if (comp(*losers[pos].keyp, *keyp) + || (!comp(*keyp, *losers[pos].keyp) + && losers[pos].source < source)) + { + // The other one is smaller. + std::swap(losers[pos].source, source); + std::swap(losers[pos].keyp, keyp); + } + } losers[0].keyp = keyp; } }; #endif - template<typename _ValueTp, class Comparator> +template<typename _ValueTp, class Comparator> struct loser_tree_traits { #if _GLIBCXX_LOSER_TREE @@ -1093,7 +1074,7 @@ namespace __gnu_parallel #endif }; - template<typename _ValueTp, class Comparator> +template<typename _ValueTp, class Comparator> struct loser_tree_unguarded_traits { #if _GLIBCXX_LOSER_TREE_UNGUARDED diff --git a/libstdc++-v3/include/parallel/multiway_merge.h b/libstdc++-v3/include/parallel/multiway_merge.h index 2e3be7c4287..67f2f8cb1da 100644 --- a/libstdc++-v3/include/parallel/multiway_merge.h +++ b/libstdc++-v3/include/parallel/multiway_merge.h @@ -29,16 +29,16 @@ // Public License. /** @file parallel/multiway_merge.h - * @brief Implementation of sequential and parallel multiway merge. - * - * Explanations on the high-speed merging routines in the appendix of - * - * P. Sanders. - * Fast priority queues for cached memory. - * ACM Journal of Experimental Algorithmics, 5, 2000. - * - * This file is a GNU parallel extension to the Standard C++ Library. - */ +* @brief Implementation of sequential and parallel multiway merge. +* +* Explanations on the high-speed merging routines in the appendix of +* +* P. Sanders. +* Fast priority queues for cached memory. +* ACM Journal of Experimental Algorithmics, 5, 2000. +* +* This file is a GNU parallel extension to the Standard C++ Library. +*/ // Written by Johannes Singler. @@ -62,25 +62,25 @@ // XXX need iterator typedefs namespace __gnu_parallel { - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> class guarded_iterator; - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> inline bool operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1, - guarded_iterator<RandomAccessIterator, Comparator>& bi2); + guarded_iterator<RandomAccessIterator, Comparator>& bi2); - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> inline bool operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1, - guarded_iterator<RandomAccessIterator, Comparator>& bi2); + guarded_iterator<RandomAccessIterator, Comparator>& bi2); /** @brief Iterator wrapper supporting an implicit supremum at the end of the sequence, dominating all comparisons. * Deriving from RandomAccessIterator is not possible since * RandomAccessIterator need not be a class. */ - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> class guarded_iterator { private: @@ -95,17 +95,17 @@ namespace __gnu_parallel public: /** @brief Constructor. Sets iterator to beginning of sequence. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param comp Comparator provided for associated overloaded - * compare operators. */ - inline guarded_iterator(RandomAccessIterator begin, - RandomAccessIterator end, Comparator& comp) + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param comp Comparator provided for associated overloaded + * compare operators. */ + inline guarded_iterator(RandomAccessIterator begin, + RandomAccessIterator end, Comparator& comp) : current(begin), end(end), comp(comp) { } /** @brief Pre-increment operator. - * @return This. */ + * @return This. */ inline guarded_iterator<RandomAccessIterator, Comparator>& operator++() { @@ -114,31 +114,35 @@ namespace __gnu_parallel } /** @brief Dereference operator. - * @return Referenced element. */ + * @return Referenced element. */ inline typename std::iterator_traits<RandomAccessIterator>::value_type operator*() { return *current; } /** @brief Convert to wrapped iterator. - * @return Wrapped iterator. */ + * @return Wrapped iterator. */ inline operator RandomAccessIterator() { return current; } friend bool - operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2); + operator< <RandomAccessIterator, Comparator>( + guarded_iterator<RandomAccessIterator, Comparator>& bi1, + guarded_iterator<RandomAccessIterator, Comparator>& bi2); friend bool - operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2); + operator<= <RandomAccessIterator, Comparator>( + guarded_iterator<RandomAccessIterator, Comparator>& bi1, + guarded_iterator<RandomAccessIterator, Comparator>& bi2); }; - /** @brief Compare two elements referenced by guarded iterators. - * @param bi1 First iterator. - * @param bi2 Second iterator. - * @return @c True if less. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Compare two elements referenced by guarded iterators. + * @param bi1 First iterator. + * @param bi2 Second iterator. + * @return @c True if less. */ +template<typename RandomAccessIterator, typename Comparator> inline bool operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1, - guarded_iterator<RandomAccessIterator, Comparator>& bi2) + guarded_iterator<RandomAccessIterator, Comparator>& bi2) { if (bi1.current == bi1.end) //bi1 is sup return bi2.current == bi2.end; //bi2 is not sup @@ -147,14 +151,14 @@ namespace __gnu_parallel return (bi1.comp)(*bi1, *bi2); //normal compare } - /** @brief Compare two elements referenced by guarded iterators. - * @param bi1 First iterator. - * @param bi2 Second iterator. - * @return @c True if less equal. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Compare two elements referenced by guarded iterators. + * @param bi1 First iterator. + * @param bi2 Second iterator. + * @return @c True if less equal. */ +template<typename RandomAccessIterator, typename Comparator> inline bool operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1, - guarded_iterator<RandomAccessIterator, Comparator>& bi2) + guarded_iterator<RandomAccessIterator, Comparator>& bi2) { if (bi2.current == bi2.end) //bi1 is sup return bi1.current != bi1.end; //bi2 is not sup @@ -163,20 +167,20 @@ namespace __gnu_parallel return !(bi1.comp)(*bi2, *bi1); //normal compare } - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> class unguarded_iterator; - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> inline bool operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, - unguarded_iterator<RandomAccessIterator, Comparator>& bi2); + unguarded_iterator<RandomAccessIterator, Comparator>& bi2); - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> inline bool operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, - unguarded_iterator<RandomAccessIterator, Comparator>& bi2); + unguarded_iterator<RandomAccessIterator, Comparator>& bi2); - template<typename RandomAccessIterator, typename Comparator> +template<typename RandomAccessIterator, typename Comparator> class unguarded_iterator { private: @@ -187,16 +191,16 @@ namespace __gnu_parallel public: /** @brief Constructor. Sets iterator to beginning of sequence. - * @param begin Begin iterator of sequence. - * @param end Unused, only for compatibility. - * @param comp Unused, only for compatibility. */ - inline unguarded_iterator(RandomAccessIterator begin, - RandomAccessIterator end, Comparator& comp) + * @param begin Begin iterator of sequence. + * @param end Unused, only for compatibility. + * @param comp Unused, only for compatibility. */ + inline unguarded_iterator(RandomAccessIterator begin, + RandomAccessIterator end, Comparator& comp) : current(begin), comp(comp) { } /** @brief Pre-increment operator. - * @return This. */ + * @return This. */ inline unguarded_iterator<RandomAccessIterator, Comparator>& operator++() { @@ -205,77 +209,85 @@ namespace __gnu_parallel } /** @brief Dereference operator. - * @return Referenced element. */ - inline typename std::iterator_traits<RandomAccessIterator>::value_type + * @return Referenced element. */ + inline typename std::iterator_traits<RandomAccessIterator>::value_type operator*() { return *current; } /** @brief Convert to wrapped iterator. - * @return Wrapped iterator. */ + * @return Wrapped iterator. */ inline operator RandomAccessIterator() { return current; } friend bool - operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2); + operator< <RandomAccessIterator, Comparator>( + unguarded_iterator<RandomAccessIterator, Comparator>& bi1, + unguarded_iterator<RandomAccessIterator, Comparator>& bi2); friend bool - operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2); + operator<= <RandomAccessIterator, Comparator>( + unguarded_iterator<RandomAccessIterator, Comparator>& bi1, + unguarded_iterator<RandomAccessIterator, Comparator>& bi2); }; - /** @brief Compare two elements referenced by unguarded iterators. - * @param bi1 First iterator. - * @param bi2 Second iterator. - * @return @c True if less. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Compare two elements referenced by unguarded iterators. + * @param bi1 First iterator. + * @param bi2 Second iterator. + * @return @c True if less. */ +template<typename RandomAccessIterator, typename Comparator> inline bool operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, - unguarded_iterator<RandomAccessIterator, Comparator>& bi2) + unguarded_iterator<RandomAccessIterator, Comparator>& bi2) { // Normal compare. return (bi1.comp)(*bi1, *bi2); } - /** @brief Compare two elements referenced by unguarded iterators. - * @param bi1 First iterator. - * @param bi2 Second iterator. - * @return @c True if less equal. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Compare two elements referenced by unguarded iterators. + * @param bi1 First iterator. + * @param bi2 Second iterator. + * @return @c True if less equal. */ +template<typename RandomAccessIterator, typename Comparator> inline bool operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, - unguarded_iterator<RandomAccessIterator, Comparator>& bi2) + unguarded_iterator<RandomAccessIterator, Comparator>& bi2) { // Normal compare. return !(bi1.comp)(*bi2, *bi1); } - /** Prepare a set of sequences to be merged without a (end) guard - * @param seqs_begin - * @param seqs_end - * @param comp - * @param min_sequence - * @param stable - * @pre (seqs_end - seqs_begin > 0) */ - template<typename RandomAccessIteratorIterator, typename Comparator> - typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type +/** Prepare a set of sequences to be merged without a (end) guard + * @param seqs_begin + * @param seqs_end + * @param comp + * @param min_sequence + * @param stable + * @pre (seqs_end - seqs_begin > 0) */ +template<typename RandomAccessIteratorIterator, typename Comparator> + typename std::iterator_traits< + typename std::iterator_traits<RandomAccessIteratorIterator>::value_type + ::first_type>::difference_type prepare_unguarded(RandomAccessIteratorIterator seqs_begin, - RandomAccessIteratorIterator seqs_end, Comparator comp, - int& min_sequence, bool stable) + RandomAccessIteratorIterator seqs_end, Comparator comp, + int& min_sequence, bool stable) { _GLIBCXX_CALL(seqs_end - seqs_begin) - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; - typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type + typedef typename std::iterator_traits<RandomAccessIterator1> + ::difference_type difference_type; if ((*seqs_begin).first == (*seqs_begin).second) { - // Empty sequence found, it's the first one. - min_sequence = 0; - return -1; + // Empty sequence found, it's the first one. + min_sequence = 0; + return -1; } // Last element in sequence. @@ -283,20 +295,20 @@ namespace __gnu_parallel min_sequence = 0; for (RandomAccessIteratorIterator s = seqs_begin + 1; s != seqs_end; s++) { - if ((*s).first == (*s).second) - { - // Empty sequence found. - min_sequence = static_cast<int>(s - seqs_begin); - return -1; - } - - // Last element in sequence. - const value_type& v = *((*s).second - 1); - if (comp(v, min)) //strictly smaller - { - min = v; - min_sequence = static_cast<int>(s - seqs_begin); - } + if ((*s).first == (*s).second) + { + // Empty sequence found. + min_sequence = static_cast<int>(s - seqs_begin); + return -1; + } + + // Last element in sequence. + const value_type& v = *((*s).second - 1); + if (comp(v, min)) //strictly smaller + { + min = v; + min_sequence = static_cast<int>(s - seqs_begin); + } } difference_type overhang_size = 0; @@ -304,93 +316,108 @@ namespace __gnu_parallel int s = 0; for (s = 0; s <= min_sequence; s++) { - RandomAccessIterator1 split; - if (stable) - split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second, - min, comp); - else - split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, - min, comp); - - overhang_size += seqs_begin[s].second - split; + RandomAccessIterator1 split; + if (stable) + split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second, + min, comp); + else + split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, + min, comp); + + overhang_size += seqs_begin[s].second - split; } for (; s < (seqs_end - seqs_begin); s++) { - RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp); - overhang_size += seqs_begin[s].second - split; + RandomAccessIterator1 split = std::lower_bound( + seqs_begin[s].first, seqs_begin[s].second, min, comp); + overhang_size += seqs_begin[s].second - split; } // So many elements will be left over afterwards. return overhang_size; } - /** Prepare a set of sequences to be merged with a (end) guard (sentinel) - * @param seqs_begin - * @param seqs_end - * @param comp */ - template<typename RandomAccessIteratorIterator, typename Comparator> - typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type +/** Prepare a set of sequences to be merged with a (end) guard (sentinel) + * @param seqs_begin + * @param seqs_end + * @param comp */ +template<typename RandomAccessIteratorIterator, typename Comparator> + typename std::iterator_traits<typename std::iterator_traits< + RandomAccessIteratorIterator>::value_type::first_type>::difference_type prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin, - RandomAccessIteratorIterator seqs_end, - Comparator comp) + RandomAccessIteratorIterator seqs_end, + Comparator comp) { _GLIBCXX_CALL(seqs_end - seqs_begin) - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; - typedef typename std::iterator_traits<RandomAccessIterator1>::value_type + typedef typename std::iterator_traits<RandomAccessIterator1> + ::value_type value_type; - typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type + typedef typename std::iterator_traits<RandomAccessIterator1> + ::difference_type difference_type; // Last element in sequence. value_type* max = NULL; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) { - if ((*s).first == (*s).second) - continue; + if ((*s).first == (*s).second) + continue; - // Last element in sequence. - value_type& v = *((*s).second - 1); + // Last element in sequence. + value_type& v = *((*s).second - 1); - // Strictly greater. - if (!max || comp(*max, v)) - max = &v; + // Strictly greater. + if (!max || comp(*max, v)) + max = &v; } difference_type overhang_size = 0; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) { - RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second, - *max, comp); - overhang_size += (*s).second - split; + RandomAccessIterator1 split = + std::lower_bound((*s).first, (*s).second, *max, comp); + overhang_size += (*s).second - split; - // Set sentinel. - *((*s).second) = *max; + // Set sentinel. + *((*s).second) = *max; } // So many elements will be left over afterwards. return overhang_size; } - /** @brief Highly efficient 3-way merging procedure. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Unused, stable anyway. - * @return End iterator of output sequence. */ - template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Highly efficient 3-way merging procedure. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Unused, stable anyway. + * @return End iterator of output sequence. */ +template< + template<typename RAI, typename C> class iterator, + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_3_variant( + RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length); - + typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -405,25 +432,25 @@ namespace __gnu_parallel if (seq0 <= seq1) { - if (seq1 <= seq2) - goto s012; - else - if (seq2 < seq0) - goto s201; - else - goto s021; + if (seq1 <= seq2) + goto s012; + else + if (seq2 < seq0) + goto s201; + else + goto s021; } else { - if (seq1 <= seq2) - { - if (seq0 <= seq2) - goto s102; - else - goto s120; - } - else - goto s210; + if (seq1 <= seq2) + { + if (seq0 <= seq2) + goto s102; + else + goto s120; + } + else + goto s210; } #define Merge3Case(a,b,c,c0,c1) \ @@ -456,14 +483,23 @@ namespace __gnu_parallel return target; } - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length); - + typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -472,7 +508,8 @@ namespace __gnu_parallel RandomAccessIterator3 target_end; // Stable anyway. - difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); + difference_type overhang = + prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); difference_type total_length = 0; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s) @@ -480,16 +517,17 @@ namespace __gnu_parallel if (overhang != -1) { - difference_type unguarded_length = std::min(length, total_length - overhang); - target_end = multiway_merge_3_variant<unguarded_iterator> - (seqs_begin, seqs_end, target, comp, unguarded_length, stable); - overhang = length - unguarded_length; + difference_type unguarded_length = + std::min(length, total_length - overhang); + target_end = multiway_merge_3_variant<unguarded_iterator> + (seqs_begin, seqs_end, target, comp, unguarded_length, stable); + overhang = length - unguarded_length; } else { - // Empty sequence found. - overhang = length; - target_end = target; + // Empty sequence found. + overhang = length; + target_end = target; } #if _GLIBCXX_ASSERTIONS @@ -500,23 +538,23 @@ namespace __gnu_parallel switch (min_seq) { case 0: - // Iterators will be advanced accordingly. - target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second, - seqs_begin[2].first, seqs_begin[2].second, - target_end, overhang, comp); - break; + // Iterators will be advanced accordingly. + target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second, + seqs_begin[2].first, seqs_begin[2].second, + target_end, overhang, comp); + break; case 1: - target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, - seqs_begin[2].first, seqs_begin[2].second, - target_end, overhang, comp); - break; + target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, + seqs_begin[2].first, seqs_begin[2].second, + target_end, overhang, comp); + break; case 2: - target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, - seqs_begin[1].first, seqs_begin[1].second, - target_end, overhang, comp); - break; + target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, + seqs_begin[1].first, seqs_begin[1].second, + target_end, overhang, comp); + break; default: - _GLIBCXX_PARALLEL_ASSERT(false); + _GLIBCXX_PARALLEL_ASSERT(false); } #if _GLIBCXX_ASSERTIONS @@ -527,22 +565,31 @@ namespace __gnu_parallel return target_end; } - /** @brief Highly efficient 4-way merging procedure. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Unused, stable anyway. - * @return End iterator of output sequence. */ - template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Highly efficient 4-way merging procedure. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Unused, stable anyway. + * @return End iterator of output sequence. */ +template< + template<typename RAI, typename C> class iterator, + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length); typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -561,26 +608,26 @@ namespace __gnu_parallel if (seq0 <= seq1) { - if (seq1 <= seq2) - Decision(0,1,2,3) - else - if (seq2 < seq0) - Decision(2,0,1,3) - else - Decision(0,2,1,3) - } + if (seq1 <= seq2) + Decision(0,1,2,3) + else + if (seq2 < seq0) + Decision(2,0,1,3) + else + Decision(0,2,1,3) + } else { - if (seq1 <= seq2) - { - if (seq0 <= seq2) - Decision(1,0,2,3) - else - Decision(1,2,0,3) - } - else - Decision(2,1,0,3) - } + if (seq1 <= seq2) + { + if (seq0 <= seq2) + Decision(1,0,2,3) + else + Decision(1,2,0,3) + } + else + Decision(2,1,0,3) + } #define Merge4Case(a,b,c,d,c0,c1,c2) \ s ## a ## b ## c ## d: \ @@ -633,14 +680,23 @@ namespace __gnu_parallel return target; } - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length); typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -649,7 +705,8 @@ namespace __gnu_parallel RandomAccessIterator3 target_end; // Stable anyway. - difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); + difference_type overhang = + prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); difference_type total_length = 0; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s) @@ -657,16 +714,17 @@ namespace __gnu_parallel if (overhang != -1) { - difference_type unguarded_length = std::min(length, total_length - overhang); - target_end = multiway_merge_4_variant<unguarded_iterator> - (seqs_begin, seqs_end, target, comp, unguarded_length, stable); - overhang = length - unguarded_length; + difference_type unguarded_length = + std::min(length, total_length - overhang); + target_end = multiway_merge_4_variant<unguarded_iterator> + (seqs_begin, seqs_end, target, comp, unguarded_length, stable); + overhang = length - unguarded_length; } else { - // Empty sequence found. - overhang = length; - target_end = target; + // Empty sequence found. + overhang = length; + target_end = target; } #if _GLIBCXX_ASSERTIONS @@ -674,10 +732,13 @@ namespace __gnu_parallel _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp)); #endif - std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end); + std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > + one_missing(seqs_begin, seqs_end); one_missing.erase(one_missing.begin() + min_seq); //remove - target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable); + target_end = multiway_merge_3_variant<guarded_iterator>( + one_missing.begin(), one_missing.end(), + target_end, comp, overhang, stable); // Insert back again. one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]); @@ -692,26 +753,34 @@ namespace __gnu_parallel return target_end; } - /** @brief Basic multi-way merging procedure. - * - * The head elements are kept in a sorted array, new heads are - * inserted linearly. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @return End iterator of output sequence. - */ - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Basic multi-way merging procedure. + * + * The head elements are kept in a sorted array, new heads are + * inserted linearly. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @return End iterator of output sequence. + */ +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -719,7 +788,8 @@ namespace __gnu_parallel // Num remaining pieces. int k = static_cast<int>(seqs_end - seqs_begin), nrp; - value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k)); + value_type* pl = static_cast<value_type*>( + ::operator new(sizeof(value_type) * k)); int* source = new int[k]; difference_type total_length = 0; @@ -730,137 +800,138 @@ namespace __gnu_parallel nrp = 0; for (int pi = 0; pi < k; pi++) { - if (STOPS(pi) != POS(pi)) - { - pl[nrp] = *(POS(pi)); - source[nrp] = pi; - nrp++; - total_length += LENGTH(seqs_begin[pi]); - } + if (STOPS(pi) != POS(pi)) + { + pl[nrp] = *(POS(pi)); + source[nrp] = pi; + nrp++; + total_length += LENGTH(seqs_begin[pi]); + } } if (stable) { - for (int k = 0; k < nrp - 1; k++) - for (int pi = nrp - 1; pi > k; pi--) - if (comp(pl[pi], pl[pi - 1]) || - (!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1])) - { - std::swap(pl[pi - 1], pl[pi]); - std::swap(source[pi - 1], source[pi]); - } + for (int k = 0; k < nrp - 1; k++) + for (int pi = nrp - 1; pi > k; pi--) + if (comp(pl[pi], pl[pi - 1]) || + (!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1])) + { + std::swap(pl[pi - 1], pl[pi]); + std::swap(source[pi - 1], source[pi]); + } } else { - for (int k = 0; k < nrp - 1; k++) - for (int pi = nrp - 1; pi > k; pi--) - if (comp(pl[pi], pl[pi-1])) - { - std::swap(pl[pi-1], pl[pi]); - std::swap(source[pi-1], source[pi]); - } + for (int k = 0; k < nrp - 1; k++) + for (int pi = nrp - 1; pi > k; pi--) + if (comp(pl[pi], pl[pi-1])) + { + std::swap(pl[pi-1], pl[pi]); + std::swap(source[pi-1], source[pi]); + } } // Iterate. if (stable) { - int j; - while (nrp > 0 && length > 0) - { - if (source[0] < source[1]) - { - // pl[0] <= pl[1] - while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0) - { - *target = pl[0]; - ++target; - ++POS(source[0]); - length--; - if (POS(source[0]) == STOPS(source[0])) - { - // Move everything to the left. - for (int s = 0; s < nrp - 1; s++) - { - pl[s] = pl[s + 1]; - source[s] = source[s + 1]; - } - nrp--; - break; - } - else - pl[0] = *(POS(source[0])); - } - } - else - { - // pl[0] < pl[1] - while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0) - { - *target = pl[0]; - ++target; - ++POS(source[0]); - length--; - if (POS(source[0]) == STOPS(source[0])) - { - for (int s = 0; s < nrp - 1; s++) - { - pl[s] = pl[s + 1]; - source[s] = source[s + 1]; - } - nrp--; - break; - } - else - pl[0] = *(POS(source[0])); - } - } - - // Sink down. - j = 1; - while ((j < nrp) && (comp(pl[j], pl[j - 1]) || - (!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1])))) - { - std::swap(pl[j - 1], pl[j]); - std::swap(source[j - 1], source[j]); - j++; - } - } + int j; + while (nrp > 0 && length > 0) + { + if (source[0] < source[1]) + { + // pl[0] <= pl[1] + while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0) + { + *target = pl[0]; + ++target; + ++POS(source[0]); + length--; + if (POS(source[0]) == STOPS(source[0])) + { + // Move everything to the left. + for (int s = 0; s < nrp - 1; s++) + { + pl[s] = pl[s + 1]; + source[s] = source[s + 1]; + } + nrp--; + break; + } + else + pl[0] = *(POS(source[0])); + } + } + else + { + // pl[0] < pl[1] + while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0) + { + *target = pl[0]; + ++target; + ++POS(source[0]); + length--; + if (POS(source[0]) == STOPS(source[0])) + { + for (int s = 0; s < nrp - 1; s++) + { + pl[s] = pl[s + 1]; + source[s] = source[s + 1]; + } + nrp--; + break; + } + else + pl[0] = *(POS(source[0])); + } + } + + // Sink down. + j = 1; + while ((j < nrp) && (comp(pl[j], pl[j - 1]) || + (!comp(pl[j - 1], pl[j]) + && (source[j] < source[j - 1])))) + { + std::swap(pl[j - 1], pl[j]); + std::swap(source[j - 1], source[j]); + j++; + } + } } else { - int j; - while (nrp > 0 && length > 0) - { - // pl[0] <= pl[1] - while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0) - { - *target = pl[0]; - ++target; - ++POS(source[0]); - length--; - if (POS(source[0]) == STOPS(source[0])) - { - for (int s = 0; s < (nrp - 1); s++) - { - pl[s] = pl[s + 1]; - source[s] = source[s + 1]; - } - nrp--; - break; - } - else - pl[0] = *(POS(source[0])); - } - - // Sink down. - j = 1; - while ((j < nrp) && comp(pl[j], pl[j - 1])) - { - std::swap(pl[j - 1], pl[j]); - std::swap(source[j - 1], source[j]); - j++; - } - } + int j; + while (nrp > 0 && length > 0) + { + // pl[0] <= pl[1] + while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0) + { + *target = pl[0]; + ++target; + ++POS(source[0]); + length--; + if (POS(source[0]) == STOPS(source[0])) + { + for (int s = 0; s < (nrp - 1); s++) + { + pl[s] = pl[s + 1]; + source[s] = source[s + 1]; + } + nrp--; + break; + } + else + pl[0] = *(POS(source[0])); + } + + // Sink down. + j = 1; + while ((j < nrp) && comp(pl[j], pl[j - 1])) + { + std::swap(pl[j - 1], pl[j]); + std::swap(source[j - 1], source[j]); + j++; + } + } } delete[] pl; @@ -869,26 +940,36 @@ namespace __gnu_parallel return target; } - /** @brief Multi-way merging procedure for a high branching factor, - * guarded case. - * - * The head elements are kept in a loser tree. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @return End iterator of output sequence. - */ - template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Multi-way merging procedure for a high branching factor, + * guarded case. + * + * The head elements are kept in a loser tree. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @return End iterator of output sequence. + */ +template< + typename LT, + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -941,64 +1022,73 @@ namespace __gnu_parallel if (stable) { - for (difference_type i = 0; i < total_length; i++) - { - // Take out. - source = lt.get_min_source(); + for (difference_type i = 0; i < total_length; i++) + { + // Take out. + source = lt.get_min_source(); - *(target++) = *(seqs_begin[source].first++); + *(target++) = *(seqs_begin[source].first++); - // Feed. - if (seqs_begin[source].first == seqs_begin[source].second) - lt.delete_min_insert_stable(*arbitrary_element, true); - else - // Replace from same source. - lt.delete_min_insert_stable(*seqs_begin[source].first, false); + // Feed. + if (seqs_begin[source].first == seqs_begin[source].second) + lt.delete_min_insert_stable(*arbitrary_element, true); + else + // Replace from same source. + lt.delete_min_insert_stable(*seqs_begin[source].first, false); - } + } } else { - for (difference_type i = 0; i < total_length; i++) - { - //take out - source = lt.get_min_source(); - - *(target++) = *(seqs_begin[source].first++); - - // Feed. - if (seqs_begin[source].first == seqs_begin[source].second) - lt.delete_min_insert(*arbitrary_element, true); - else - // Replace from same source. - lt.delete_min_insert(*seqs_begin[source].first, false); - } + for (difference_type i = 0; i < total_length; i++) + { + //take out + source = lt.get_min_source(); + + *(target++) = *(seqs_begin[source].first++); + + // Feed. + if (seqs_begin[source].first == seqs_begin[source].second) + lt.delete_min_insert(*arbitrary_element, true); + else + // Replace from same source. + lt.delete_min_insert(*seqs_begin[source].first, false); + } } return target; } - /** @brief Multi-way merging procedure for a high branching factor, - * unguarded case. - * - * The head elements are kept in a loser tree. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @return End iterator of output sequence. - * @pre No input will run out of elements during the merge. - */ - template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Multi-way merging procedure for a high branching factor, + * unguarded case. + * + * The head elements are kept in a loser tree. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @return End iterator of output sequence. + * @pre No input will run out of elements during the merge. + */ +template< + typename LT, + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, typename Comparator> RandomAccessIterator3 - multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -1012,14 +1102,14 @@ namespace __gnu_parallel for (int t = 0; t < k; t++) { #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second); + _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second); #endif - if (stable) - lt.insert_start_stable(*seqs_begin[t].first, t, false); - else - lt.insert_start(*seqs_begin[t].first, t, false); + if (stable) + lt.insert_start_stable(*seqs_begin[t].first, t, false); + else + lt.insert_start(*seqs_begin[t].first, t, false); - total_length += LENGTH(seqs_begin[t]); + total_length += LENGTH(seqs_begin[t]); } if (stable) @@ -1038,68 +1128,84 @@ namespace __gnu_parallel if (stable) { - RandomAccessIterator3 target_end = target + length; - while (target < target_end) - { - // Take out. - source = lt.get_min_source(); + RandomAccessIterator3 target_end = target + length; + while (target < target_end) + { + // Take out. + source = lt.get_min_source(); #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1))); + _GLIBCXX_PARALLEL_ASSERT(i == 0 + || !comp(*(seqs_begin[source].first), *(target - 1))); #endif - *(target++) = *(seqs_begin[source].first++); + *(target++) = *(seqs_begin[source].first++); #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1)); - i++; + _GLIBCXX_PARALLEL_ASSERT( + (seqs_begin[source].first != seqs_begin[source].second) + || (i == length - 1)); + i++; #endif - // Feed. - // Replace from same source. - lt.delete_min_insert_stable(*seqs_begin[source].first, false); + // Feed. + // Replace from same source. + lt.delete_min_insert_stable(*seqs_begin[source].first, false); - } + } } else { - RandomAccessIterator3 target_end = target + length; - while (target < target_end) - { - // Take out. - source = lt.get_min_source(); + RandomAccessIterator3 target_end = target + length; + while (target < target_end) + { + // Take out. + source = lt.get_min_source(); #if _GLIBCXX_ASSERTIONS - if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1))) - printf(" %i %i %i\n", length, i, source); - _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1))); + if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1))) + printf(" %i %i %i\n", length, i, source); + _GLIBCXX_PARALLEL_ASSERT(i == 0 + || !comp(*(seqs_begin[source].first), *(target - 1))); #endif - *(target++) = *(seqs_begin[source].first++); + *(target++) = *(seqs_begin[source].first++); #if _GLIBCXX_ASSERTIONS - if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1))) - printf(" %i %i %i\n", length, i, source); - _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)); - i++; + if (!((seqs_begin[source].first != seqs_begin[source].second) + || (i >= length - 1))) + printf(" %i %i %i\n", length, i, source); + _GLIBCXX_PARALLEL_ASSERT( + (seqs_begin[source].first != seqs_begin[source].second) + || (i >= length - 1)); + i++; #endif - // Feed. - // Replace from same source. - lt.delete_min_insert(*seqs_begin[source].first, false); - } + // Feed. + // Replace from same source. + lt.delete_min_insert(*seqs_begin[source].first, false); + } } return target; } - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -1107,7 +1213,7 @@ namespace __gnu_parallel int min_seq; RandomAccessIterator3 target_end; difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, - comp, min_seq, stable); + comp, min_seq, stable); difference_type total_length = 0; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) @@ -1115,17 +1221,18 @@ namespace __gnu_parallel if (overhang != -1) { - difference_type unguarded_length = std::min(length, total_length - overhang); - target_end = multiway_merge_loser_tree_unguarded - <typename loser_tree_unguarded_traits<value_type, Comparator>::LT> - (seqs_begin, seqs_end, target, comp, unguarded_length, stable); - overhang = length - unguarded_length; + difference_type unguarded_length = + std::min(length, total_length - overhang); + target_end = multiway_merge_loser_tree_unguarded + <typename loser_tree_unguarded_traits<value_type, Comparator>::LT> + (seqs_begin, seqs_end, target, comp, unguarded_length, stable); + overhang = length - unguarded_length; } else { - // Empty sequence found. - overhang = length; - target_end = target; + // Empty sequence found. + overhang = length; + target_end = target; } #if _GLIBCXX_ASSERTIONS @@ -1145,34 +1252,43 @@ namespace __gnu_parallel return target_end; } - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) + multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type - RandomAccessIterator1; RandomAccessIterator3 target_end; - difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp); + difference_type overhang = + prepare_unguarded_sentinel(seqs_begin, seqs_end, comp); difference_type total_length = 0; for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) { - total_length += LENGTH(*s); + total_length += LENGTH(*s); - // Sentinel spot. - (*s).second++; + // Sentinel spot. + (*s).second++; } - difference_type unguarded_length = std::min(length, total_length - overhang); + difference_type unguarded_length = + std::min(length, total_length - overhang); target_end = multiway_merge_loser_tree_unguarded <typename loser_tree_unguarded_traits<value_type, Comparator>::LT> (seqs_begin, seqs_end, target, comp, unguarded_length, stable); @@ -1184,14 +1300,17 @@ namespace __gnu_parallel #endif // Copy rest stable. - for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++) + for (RandomAccessIteratorIterator s = seqs_begin; + s != seqs_end && overhang > 0; s++) { - // Restore. - (*s).second--; - difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s)); - target_end = std::copy((*s).first, (*s).first + local_length, target_end); - (*s).first += local_length; - overhang -= local_length; + // Restore. + (*s).second--; + difference_type local_length = + std::min<difference_type>(overhang, LENGTH(*s)); + target_end = std::copy((*s).first, (*s).first + local_length, + target_end); + (*s).first += local_length; + overhang -= local_length; } #if _GLIBCXX_ASSERTIONS @@ -1203,25 +1322,35 @@ namespace __gnu_parallel return target_end; } - /** @brief Sequential multi-way merging switch. - * - * The decision if based on the branching factor and runtime settings. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @param sentinel The sequences have a sentinel element. - * @return End iterator of output sequence. */ - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Sequential multi-way merging switch. + * + * The decision if based on the branching factor and runtime settings. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @param sentinel The sequences have a sentinel element. + * @return End iterator of output sequence. */ +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag) + multiway_merge(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, _DifferenceTp length, + bool stable, bool sentinel, + sequential_tag) { _GLIBCXX_CALL(length) typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type RandomAccessIterator1; typedef typename std::iterator_traits<RandomAccessIterator1>::value_type value_type; @@ -1234,7 +1363,8 @@ namespace __gnu_parallel RandomAccessIterator3 return_target = target; int k = static_cast<int>(seqs_end - seqs_begin); - Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm; + Settings::MultiwayMergeAlgorithm mwma = + Settings::multiway_merge_algorithm; if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL) mwma = Settings::LOSER_TREE_COMBINED; @@ -1242,75 +1372,126 @@ namespace __gnu_parallel switch (k) { case 0: - break; + break; case 1: - return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target); - seqs_begin[0].first += length; - break; + return_target = std::copy(seqs_begin[0].first, + seqs_begin[0].first + length, + target); + seqs_begin[0].first += length; + break; case 2: - return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp); - break; + return_target = merge_advance(seqs_begin[0].first, + seqs_begin[0].second, + seqs_begin[1].first, + seqs_begin[1].second, + target, length, comp); + break; case 3: - switch (mwma) - { - case Settings::LOSER_TREE_COMBINED: - return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable); - break; - case Settings::LOSER_TREE_SENTINEL: - return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); - break; - default: - return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); - break; - } - break; + switch (mwma) + { + case Settings::LOSER_TREE_COMBINED: + return_target = multiway_merge_3_combined(seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + case Settings::LOSER_TREE_SENTINEL: + return_target = multiway_merge_3_variant<unguarded_iterator>( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + default: + return_target = multiway_merge_3_variant<guarded_iterator>( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + } + break; case 4: - switch (mwma) - { - case Settings::LOSER_TREE_COMBINED: - return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable); - break; - case Settings::LOSER_TREE_SENTINEL: - return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); - break; - default: - return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); - break; - } - break; + switch (mwma) + { + case Settings::LOSER_TREE_COMBINED: + return_target = multiway_merge_4_combined( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + case Settings::LOSER_TREE_SENTINEL: + return_target = multiway_merge_4_variant<unguarded_iterator>( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + default: + return_target = multiway_merge_4_variant<guarded_iterator>( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; + } + break; default: - { - switch (mwma) - { - case Settings::BUBBLE: - return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable); - break; + { + switch (mwma) + { + case Settings::BUBBLE: + return_target = multiway_merge_bubble( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; #if _GLIBCXX_LOSER_TREE_EXPLICIT - case Settings::LOSER_TREE_EXPLICIT: - return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable); - break; + case Settings::LOSER_TREE_EXPLICIT: + return_target = multiway_merge_loser_tree< + LoserTreeExplicit<value_type, Comparator> >( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; #endif #if _GLIBCXX_LOSER_TREE - case Settings::LOSER_TREE: - return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable); - break; + case Settings::LOSER_TREE: + return_target = multiway_merge_loser_tree< + LoserTree<value_type, Comparator> >( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; #endif #if _GLIBCXX_LOSER_TREE_COMBINED - case Settings::LOSER_TREE_COMBINED: - return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable); - break; + case Settings::LOSER_TREE_COMBINED: + return_target = multiway_merge_loser_tree_combined( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; #endif #if _GLIBCXX_LOSER_TREE_SENTINEL - case Settings::LOSER_TREE_SENTINEL: - return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable); - break; + case Settings::LOSER_TREE_SENTINEL: + return_target = multiway_merge_loser_tree_sentinel( + seqs_begin, + seqs_end, + target, + comp, length, stable); + break; #endif - default: - // multiway_merge algorithm not implemented. - _GLIBCXX_PARALLEL_ASSERT(0); - break; - } - } + default: + // multiway_merge algorithm not implemented. + _GLIBCXX_PARALLEL_ASSERT(0); + break; + } + } } #if _GLIBCXX_ASSERTIONS _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp)); @@ -1319,214 +1500,268 @@ namespace __gnu_parallel return return_target; } - /** @brief Parallel multi-way merge routine. - * - * The decision if based on the branching factor and runtime settings. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @param sentinel Ignored. - * @return End iterator of output sequence. - */ - template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Parallel multi-way merge routine. + * + * The decision if based on the branching factor and runtime settings. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @param sentinel Ignored. + * @return End iterator of output sequence. + */ +template< + typename RandomAccessIteratorIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 - parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel) - { - _GLIBCXX_CALL(length) - - typedef _DifferenceTp difference_type; - typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type - RandomAccessIterator1; - typedef typename std::iterator_traits<RandomAccessIterator1>::value_type - value_type; - -#if _GLIBCXX_ASSERTIONS - for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++) - _GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp)); -#endif - - // k sequences. - int k = static_cast<int>(seqs_end - seqs_begin); - - difference_type total_length = 0; - for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++) - total_length += LENGTH(*raii); - - _GLIBCXX_CALL(total_length) - - if (total_length == 0 || k == 0) - return target; - - thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length)); - - bool tight = (total_length == length); - - // Thread t will have to merge pieces[iam][0..k - 1] - std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads]; - for (int s = 0; s < num_threads; s++) - pieces[s].resize(k); - - difference_type num_samples = Settings::merge_oversampling * num_threads; - - if (Settings::multiway_merge_splitting == Settings::SAMPLING) - { - value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples)); - // Sample. - for (int s = 0; s < k; s++) - for (int i = 0; (difference_type)i < num_samples; i++) - { - difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length)); - samples[s * num_samples + i] = seqs_begin[s].first[sample_index]; - } - - if (stable) - __gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp); - else - __gnu_sequential::sort(samples, samples + (num_samples * k), comp); - - for (int slab = 0; slab < num_threads; slab++) - // For each slab / processor. - for (int seq = 0; seq < k; seq++) - { - // For each sequence. - if (slab > 0) - pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first; - else - { - // Absolute beginning. - pieces[slab][seq].first = 0; - } - if ((slab + 1) < num_threads) - pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first; - else - pieces[slab][seq].second = LENGTH(seqs_begin[seq]); //absolute ending - } - delete[] samples; - } - else - { - // (Settings::multiway_merge_splitting == Settings::EXACT). - std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads]; - std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k); - - copy(seqs_begin, seqs_end, se.begin()); - - difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1))); - equally_split(length, num_threads, borders); - - for (int s = 0; s < (num_threads - 1); s++) - { - offsets[s].resize(k); - multiseq_partition(se.begin(), se.end(), borders[s + 1], - offsets[s].begin(), comp); - - // Last one also needed and available. - if (!tight) - { - offsets[num_threads - 1].resize(k); - multiseq_partition(se.begin(), se.end(), - difference_type(length), - offsets[num_threads - 1].begin(), comp); - } - } - - - for (int slab = 0; slab < num_threads; slab++) - { - // For each slab / processor. - for (int seq = 0; seq < k; seq++) - { - // For each sequence. - if (slab == 0) - { - // Absolute beginning. - pieces[slab][seq].first = 0; - } - else - pieces[slab][seq].first = pieces[slab - 1][seq].second; - if (!tight || slab < (num_threads - 1)) - pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first; - else - { - // slab == num_threads - 1 - pieces[slab][seq].second = LENGTH(seqs_begin[seq]); - } - } - } - delete[] offsets; - } - -# pragma omp parallel num_threads(num_threads) + parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, + RandomAccessIteratorIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, bool stable, bool sentinel) { - thread_index_t iam = omp_get_thread_num(); - - difference_type target_position = 0; - - for (int c = 0; c < k; c++) - target_position += pieces[iam][c].first; - - if (k > 2) - { - std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k]; - - difference_type local_length = 0; - for (int s = 0; s < k; s++) - { - chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second); - local_length += LENGTH(chunks[s]); - } - - multiway_merge(chunks, chunks + k, target + target_position, comp, - std::min(local_length, length - target_position), - stable, false, sequential_tag()); - - delete[] chunks; - } - else if (k == 2) - { - RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first; - merge_advance(begin0, - seqs_begin[0].first + pieces[iam][0].second, - begin1, - seqs_begin[1].first + pieces[iam][1].second, - target + target_position, - (pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first), - comp); - } - } + _GLIBCXX_CALL(length) + + typedef _DifferenceTp difference_type; + typedef typename std::iterator_traits<RandomAccessIteratorIterator> + ::value_type::first_type + RandomAccessIterator1; + typedef typename std::iterator_traits<RandomAccessIterator1>::value_type + value_type; + + // k sequences. + int k = static_cast<int>(seqs_end - seqs_begin); + + difference_type total_length = 0; + for (RandomAccessIteratorIterator raii = seqs_begin; + raii != seqs_end; raii++) + total_length += LENGTH(*raii); + + _GLIBCXX_CALL(total_length) + + if (total_length == 0 || k == 0) + return target; + + bool tight = (total_length == length); + + std::vector<std::pair<difference_type, difference_type> >* pieces; + + thread_index_t num_threads = static_cast<thread_index_t>( + std::min<difference_type>(get_max_threads(), total_length)); + +# pragma omp parallel num_threads (num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + // Thread t will have to merge pieces[iam][0..k - 1] + pieces = new std::vector< + std::pair<difference_type, difference_type> >[num_threads]; + for (int s = 0; s < num_threads; s++) + pieces[s].resize(k); + + difference_type num_samples = + Settings::merge_oversampling * num_threads; + + if (Settings::multiway_merge_splitting == Settings::SAMPLING) + { + value_type* samples = static_cast<value_type*>( + ::operator new(sizeof(value_type) * k * num_samples)); + // Sample. + for (int s = 0; s < k; s++) + for (int i = 0; (difference_type)i < num_samples; i++) + { + difference_type sample_index = + static_cast<difference_type>( + LENGTH(seqs_begin[s]) * (double(i + 1) / + (num_samples + 1)) * (double(length) + / total_length)); + samples[s * num_samples + i] = + seqs_begin[s].first[sample_index]; + } + + if (stable) + __gnu_sequential::stable_sort( + samples, samples + (num_samples * k), comp); + else + __gnu_sequential::sort( + samples, samples + (num_samples * k), comp); + + for (int slab = 0; slab < num_threads; slab++) + // For each slab / processor. + for (int seq = 0; seq < k; seq++) + { + // For each sequence. + if (slab > 0) + pieces[slab][seq].first = + std::upper_bound( + seqs_begin[seq].first, + seqs_begin[seq].second, + samples[num_samples * k * slab / num_threads], + comp) + - seqs_begin[seq].first; + else + { + // Absolute beginning. + pieces[slab][seq].first = 0; + } + if ((slab + 1) < num_threads) + pieces[slab][seq].second = + std::upper_bound( + seqs_begin[seq].first, + seqs_begin[seq].second, + samples[num_samples * k * (slab + 1) / + num_threads], comp) + - seqs_begin[seq].first; + else + pieces[slab][seq].second = LENGTH(seqs_begin[seq]); + } + delete[] samples; + } + else + { + // (Settings::multiway_merge_splitting == Settings::EXACT). + std::vector<RandomAccessIterator1>* offsets = + new std::vector<RandomAccessIterator1>[num_threads]; + std::vector< + std::pair<RandomAccessIterator1, RandomAccessIterator1> + > se(k); + + copy(seqs_begin, seqs_end, se.begin()); + + difference_type* borders = + new difference_type[num_threads + 1]; + equally_split(length, num_threads, borders); + + for (int s = 0; s < (num_threads - 1); s++) + { + offsets[s].resize(k); + multiseq_partition( + se.begin(), se.end(), borders[s + 1], + offsets[s].begin(), comp); + + // Last one also needed and available. + if (!tight) + { + offsets[num_threads - 1].resize(k); + multiseq_partition(se.begin(), se.end(), + difference_type(length), + offsets[num_threads - 1].begin(), comp); + } + } + + + for (int slab = 0; slab < num_threads; slab++) + { + // For each slab / processor. + for (int seq = 0; seq < k; seq++) + { + // For each sequence. + if (slab == 0) + { + // Absolute beginning. + pieces[slab][seq].first = 0; + } + else + pieces[slab][seq].first = + pieces[slab - 1][seq].second; + if (!tight || slab < (num_threads - 1)) + pieces[slab][seq].second = + offsets[slab][seq] - seqs_begin[seq].first; + else + { + // slab == num_threads - 1 + pieces[slab][seq].second = + LENGTH(seqs_begin[seq]); + } + } + } + delete[] offsets; + } + } //single + + thread_index_t iam = omp_get_thread_num(); + + difference_type target_position = 0; + + for (int c = 0; c < k; c++) + target_position += pieces[iam][c].first; + + if (k > 2) + { + std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks + = new + std::pair<RandomAccessIterator1, RandomAccessIterator1>[k]; + + difference_type local_length = 0; + for (int s = 0; s < k; s++) + { + chunks[s] = std::make_pair( + seqs_begin[s].first + pieces[iam][s].first, + seqs_begin[s].first + pieces[iam][s].second); + local_length += LENGTH(chunks[s]); + } + + multiway_merge( + chunks, chunks + k, target + target_position, comp, + std::min(local_length, length - target_position), + stable, false, sequential_tag()); + + delete[] chunks; + } + else if (k == 2) + { + RandomAccessIterator1 + begin0 = seqs_begin[0].first + pieces[iam][0].first, + begin1 = seqs_begin[1].first + pieces[iam][1].first; + merge_advance(begin0, + seqs_begin[0].first + pieces[iam][0].second, + begin1, + seqs_begin[1].first + pieces[iam][1].second, + target + target_position, + (pieces[iam][0].second - pieces[iam][0].first) + + (pieces[iam][1].second - pieces[iam][1].first), + comp); + } + } //parallel #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp)); + _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp)); #endif - // Update ends of sequences. - for (int s = 0; s < k; s++) - seqs_begin[s].first += pieces[num_threads - 1][s].second; + // Update ends of sequences. + for (int s = 0; s < k; s++) + seqs_begin[s].first += pieces[num_threads - 1][s].second; - delete[] pieces; + delete[] pieces; - return target + length; - } + return target + length; + } - /** - * @brief Multi-way merging front-end. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @return End iterator of output sequence. - */ - template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** + * @brief Multi-way merging front-end. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @return End iterator of output sequence. + */ +template< + typename RandomAccessIteratorPairIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 multiway_merge(RandomAccessIteratorPairIterator seqs_begin, - RandomAccessIteratorPairIterator seqs_end, - RandomAccessIterator3 target, Comparator comp, - _DifferenceTp length, bool stable) + RandomAccessIteratorPairIterator seqs_end, + RandomAccessIterator3 target, Comparator comp, + _DifferenceTp length, bool stable) { typedef _DifferenceTp difference_type; _GLIBCXX_CALL(seqs_end - seqs_begin) @@ -1535,33 +1770,43 @@ namespace __gnu_parallel return target; RandomAccessIterator3 target_end; - if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) - target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false); + if (_GLIBCXX_PARALLEL_CONDITION( + ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) + && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) + target_end = parallel_multiway_merge( + seqs_begin, seqs_end, + target, comp, static_cast<difference_type>(length), stable, false); else - target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag()); + target_end = multiway_merge( + seqs_begin, seqs_end, + target, comp, length, stable, false, sequential_tag()); return target_end; } - /** @brief Multi-way merging front-end. - * @param seqs_begin Begin iterator of iterator pair input sequence. - * @param seqs_end End iterator of iterator pair input sequence. - * @param target Begin iterator out output sequence. - * @param comp Comparator. - * @param length Maximum length to merge. - * @param stable Stable merging incurs a performance penalty. - * @return End iterator of output sequence. - * @pre For each @c i, @c seqs_begin[i].second must be the end - * marker of the sequence, but also reference the one more sentinel - * element. */ - template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> +/** @brief Multi-way merging front-end. + * @param seqs_begin Begin iterator of iterator pair input sequence. + * @param seqs_end End iterator of iterator pair input sequence. + * @param target Begin iterator out output sequence. + * @param comp Comparator. + * @param length Maximum length to merge. + * @param stable Stable merging incurs a performance penalty. + * @return End iterator of output sequence. + * @pre For each @c i, @c seqs_begin[i].second must be the end + * marker of the sequence, but also reference the one more sentinel + * element. */ +template< + typename RandomAccessIteratorPairIterator, + typename RandomAccessIterator3, + typename _DifferenceTp, + typename Comparator> RandomAccessIterator3 multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin, - RandomAccessIteratorPairIterator seqs_end, - RandomAccessIterator3 target, - Comparator comp, - _DifferenceTp length, - bool stable) + RandomAccessIteratorPairIterator seqs_end, + RandomAccessIterator3 target, + Comparator comp, + _DifferenceTp length, + bool stable) { typedef _DifferenceTp difference_type; @@ -1570,10 +1815,16 @@ namespace __gnu_parallel _GLIBCXX_CALL(seqs_end - seqs_begin) - if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) - return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true); + if (_GLIBCXX_PARALLEL_CONDITION( + ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) + && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) + return parallel_multiway_merge( + seqs_begin, seqs_end, + target, comp, static_cast<difference_type>(length), stable, true); else - return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag()); + return multiway_merge( + seqs_begin, seqs_end, + target, comp, length, stable, true, sequential_tag()); } } diff --git a/libstdc++-v3/include/parallel/multiway_mergesort.h b/libstdc++-v3/include/parallel/multiway_mergesort.h index 89285e16389..5dedd0035c4 100644 --- a/libstdc++-v3/include/parallel/multiway_mergesort.h +++ b/libstdc++-v3/include/parallel/multiway_mergesort.h @@ -48,8 +48,8 @@ namespace __gnu_parallel { - /** @brief Subsequence description. */ - template<typename _DifferenceTp> +/** @brief Subsequence description. */ +template<typename _DifferenceTp> struct Piece { typedef _DifferenceTp difference_type; @@ -61,16 +61,19 @@ namespace __gnu_parallel difference_type end; }; - /** @brief Data accessed by all threads. - * - * PMWMS = parallel multiway mergesort */ - template<typename RandomAccessIterator> +/** @brief Data accessed by all threads. + * + * PMWMS = parallel multiway mergesort */ +template<typename RandomAccessIterator> struct PMWMSSortingData { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; + /** @brief Number of threads involved. */ + thread_index_t num_threads; + /** @brief Input begin. */ RandomAccessIterator source; @@ -105,62 +108,55 @@ namespace __gnu_parallel /** @brief Pieces of data to merge @c [thread][sequence] */ std::vector<Piece<difference_type> >* pieces; - }; - /** @brief Thread local data for PMWMS. */ - template<typename RandomAccessIterator> - struct PMWMSSorterPU - { - /** @brief Total number of thread involved. */ - thread_index_t num_threads; - /** @brief Number of owning thread. */ - thread_index_t iam; /** @brief Stable sorting desired. */ bool stable; - /** @brief Pointer to global data. */ - PMWMSSortingData<RandomAccessIterator>* sd; - }; - - /** - * @brief Select samples from a sequence. - * @param d Pointer to thread-local data. Result will be placed in - * @c d->ds->samples. - * @param num_samples Number of samples to select. - */ - template<typename RandomAccessIterator, typename _DifferenceTp> +}; + +/** + * @brief Select samples from a sequence. + * @param sd Pointer to algorithm data. Result will be placed in + * @c sd->samples. + * @param num_samples Number of samples to select. + */ +template<typename RandomAccessIterator, typename _DifferenceTp> inline void - determine_samples(PMWMSSorterPU<RandomAccessIterator>* d, - _DifferenceTp& num_samples) + determine_samples(PMWMSSortingData<RandomAccessIterator>* sd, + _DifferenceTp& num_samples) { typedef _DifferenceTp difference_type; - PMWMSSortingData<RandomAccessIterator>* sd = d->sd; + thread_index_t iam = omp_get_thread_num(); - num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1; + num_samples = + Settings::sort_mwms_oversampling * sd->num_threads - 1; - difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2))); + difference_type* es = new difference_type[num_samples + 2]; - equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es); + equally_split(sd->starts[iam + 1] - sd->starts[iam], + num_samples + 1, es); for (difference_type i = 0; i < num_samples; i++) - sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]]; + sd->samples[iam * num_samples + i] = + sd->source[sd->starts[iam] + es[i + 1]]; + + delete[] es; } - /** @brief PMWMS code executed by each thread. - * @param d Pointer to thread-local data. - * @param comp Comparator. - */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief PMWMS code executed by each thread. + * @param sd Pointer to algorithm data. + * @param comp Comparator. + */ +template<typename RandomAccessIterator, typename Comparator> inline void - parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d, - Comparator& comp) + parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd, + Comparator& comp) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; - PMWMSSortingData<RandomAccessIterator>* sd = d->sd; - thread_index_t iam = d->iam; + thread_index_t iam = omp_get_thread_num(); // Length of this thread's chunk, before merging. difference_type length_local = sd->starts[iam + 1] - sd->starts[iam]; @@ -174,161 +170,168 @@ namespace __gnu_parallel typedef value_type* SortingPlacesIterator; // Sort in temporary storage, leave space for sentinel. - sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1))); + sd->sorting_places[iam] = sd->temporaries[iam] = + static_cast<value_type*>( + ::operator new(sizeof(value_type) * (length_local + 1))); // Copy there. - std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]); + std::uninitialized_copy(sd->source + sd->starts[iam], + sd->source + sd->starts[iam] + length_local, + sd->sorting_places[iam]); #endif // Sort locally. - if (d->stable) - __gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp); + if (sd->stable) + __gnu_sequential::stable_sort(sd->sorting_places[iam], + sd->sorting_places[iam] + length_local, + comp); else - __gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp); - -#if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp)); -#endif + __gnu_sequential::sort(sd->sorting_places[iam], + sd->sorting_places[iam] + length_local, + comp); // Invariant: locally sorted subsequence in sd->sorting_places[iam], // sd->sorting_places[iam] + length_local. if (Settings::sort_splitting == Settings::SAMPLING) { - difference_type num_samples; - determine_samples(d, num_samples); - -#pragma omp barrier - -#pragma omp single - __gnu_sequential::sort(sd->samples, - sd->samples + (num_samples * d->num_threads), - comp); - -#pragma omp barrier - - for (int s = 0; s < d->num_threads; s++) - { - // For each sequence. - if (num_samples * iam > 0) - sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s], - sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], - sd->samples[num_samples * iam], - comp) - - sd->sorting_places[s]; - else - // Absolute beginning. - sd->pieces[iam][s].begin = 0; - - if ((num_samples * (iam + 1)) < (num_samples * d->num_threads)) - sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s], - sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp) - - sd->sorting_places[s]; - else - // Absolute end. - sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s]; - } - + difference_type num_samples; + determine_samples(sd, num_samples); + +# pragma omp barrier + +# pragma omp single + __gnu_sequential::sort(sd->samples, + sd->samples + (num_samples * sd->num_threads), + comp); + +# pragma omp barrier + + for (int s = 0; s < sd->num_threads; s++) + { + // For each sequence. + if (num_samples * iam > 0) + sd->pieces[iam][s].begin = + std::lower_bound(sd->sorting_places[s], + sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], + sd->samples[num_samples * iam], + comp) + - sd->sorting_places[s]; + else + // Absolute beginning. + sd->pieces[iam][s].begin = 0; + + if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads)) + sd->pieces[iam][s].end = + std::lower_bound(sd->sorting_places[s], + sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], + sd->samples[num_samples * (iam + 1)], comp) + - sd->sorting_places[s]; + else + // Absolute end. + sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s]; + } } else if (Settings::sort_splitting == Settings::EXACT) { -#pragma omp barrier - - std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads); - for (int s = 0; s < d->num_threads; s++) - seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]); - - std::vector<SortingPlacesIterator> offsets(d->num_threads); - - // If not last thread. - if (iam < d->num_threads - 1) - multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp); - - for (int seq = 0; seq < d->num_threads; seq++) - { - // For each sequence. - if (iam < (d->num_threads - 1)) - sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first; - else - // Absolute end of this sequence. - sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq]; - } - -#pragma omp barrier - - for (int seq = 0; seq < d->num_threads; seq++) - { - // For each sequence. - if (iam > 0) - sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end; - else - // Absolute beginning. - sd->pieces[iam][seq].begin = 0; - } +# pragma omp barrier + + std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > + seqs(sd->num_threads); + for (int s = 0; s < sd->num_threads; s++) + seqs[s] = std::make_pair(sd->sorting_places[s], + sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]); + + std::vector<SortingPlacesIterator> offsets(sd->num_threads); + + // if not last thread + if (iam < sd->num_threads - 1) + multiseq_partition(seqs.begin(), seqs.end(), + sd->starts[iam + 1], offsets.begin(), comp); + + for (int seq = 0; seq < sd->num_threads; seq++) + { + // for each sequence + if (iam < (sd->num_threads - 1)) + sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first; + else + // very end of this sequence + sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq]; + } + +# pragma omp barrier + + for (int seq = 0; seq < sd->num_threads; seq++) + { + // For each sequence. + if (iam > 0) + sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end; + else + // Absolute beginning. + sd->pieces[iam][seq].begin = 0; + } } // Offset from target begin, length after merging. difference_type offset = 0, length_am = 0; - for (int s = 0; s < d->num_threads; s++) + for (int s = 0; s < sd->num_threads; s++) { - length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin; - offset += sd->pieces[iam][s].begin; + length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin; + offset += sd->pieces[iam][s].begin; } #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST // Merge to temporary storage, uninitialized creation not possible // since there is no multiway_merge calling the placement new // instead of the assignment operator. - sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am)); + sd->merging_places[iam] = sd->temporaries[iam] = + static_cast<value_type*>( + ::operator new(sizeof(value_type) * length_am)); #else // Merge directly to target. sd->merging_places[iam] = sd->source + offset; #endif - std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads); + std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > + seqs(sd->num_threads); - for (int s = 0; s < d->num_threads; s++) + for (int s = 0; s < sd->num_threads; s++) { - seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end); - -#if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp)); -#endif + seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, + sd->sorting_places[s] + sd->pieces[iam][s].end); } - multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag()); - -#if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp)); -#endif + multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag()); -# pragma omp barrier +# pragma omp barrier #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST // Write back. - std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am, - sd->source + offset); + std::copy(sd->merging_places[iam], + sd->merging_places[iam] + length_am, + sd->source + offset); #endif delete[] sd->temporaries[iam]; } - /** @brief PMWMS main call. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param comp Comparator. - * @param n Length of sequence. - * @param num_threads Number of threads to use. - * @param stable Stable sorting. - */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief PMWMS main call. + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param comp Comparator. + * @param n Length of sequence. + * @param num_threads Number of threads to use. + * @param stable Stable sorting. + */ +template<typename RandomAccessIterator, typename Comparator> inline void - parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, - typename std::iterator_traits<RandomAccessIterator>::difference_type n, - int num_threads, bool stable) + parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end, + Comparator comp, + typename std::iterator_traits<RandomAccessIterator>::difference_type n, + int num_threads, + bool stable) { _GLIBCXX_CALL(n) - + typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; @@ -336,75 +339,75 @@ namespace __gnu_parallel if (n <= 1) return; - // At least one element per thread. + // at least one element per thread if (num_threads > n) num_threads = static_cast<thread_index_t>(n); + // shared variables PMWMSSortingData<RandomAccessIterator> sd; + difference_type* starts; - sd.source = begin; - sd.temporaries = new value_type*[num_threads]; +# pragma omp parallel num_threads(num_threads) + { + num_threads = omp_get_num_threads(); //no more threads than requested + +# pragma omp single + { + sd.num_threads = num_threads; + sd.source = begin; + sd.temporaries = new value_type*[num_threads]; #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST - sd.sorting_places = new RandomAccessIterator[num_threads]; - sd.merging_places = new value_type*[num_threads]; + sd.sorting_places = new RandomAccessIterator[num_threads]; + sd.merging_places = new value_type*[num_threads]; #else - sd.sorting_places = new value_type*[num_threads]; - sd.merging_places = new RandomAccessIterator[num_threads]; + sd.sorting_places = new value_type*[num_threads]; + sd.merging_places = new RandomAccessIterator[num_threads]; #endif - if (Settings::sort_splitting == Settings::SAMPLING) - { - unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1; - sz *= num_threads; - - // Equivalent to value_type[sz], without need of default construction. - sz *= sizeof(value_type); - sd.samples = static_cast<value_type*>(::operator new(sz)); - } - else - sd.samples = NULL; - - sd.offsets = new difference_type[num_threads - 1]; - sd.pieces = new std::vector<Piece<difference_type> >[num_threads]; - for (int s = 0; s < num_threads; s++) - sd.pieces[s].resize(num_threads); - PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads]; - difference_type* starts = sd.starts = new difference_type[num_threads + 1]; - - difference_type chunk_length = n / num_threads; - difference_type split = n % num_threads; - difference_type start = 0; - for (int i = 0; i < num_threads; i++) - { - starts[i] = start; - start += (i < split) ? (chunk_length + 1) : chunk_length; - pus[i].num_threads = num_threads; - pus[i].iam = i; - pus[i].sd = &sd; - pus[i].stable = stable; - } - starts[num_threads] = start; - - // Now sort in parallel. -#pragma omp parallel num_threads(num_threads) - parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp); + if (Settings::sort_splitting == Settings::SAMPLING) + { + unsigned int size = + (Settings::sort_mwms_oversampling * num_threads - 1) * num_threads; + sd.samples = static_cast<value_type*>( + ::operator new(size * sizeof(value_type))); + } + else + sd.samples = NULL; + + sd.offsets = new difference_type[num_threads - 1]; + sd.pieces = new std::vector<Piece<difference_type> >[num_threads]; + for (int s = 0; s < num_threads; s++) + sd.pieces[s].resize(num_threads); + starts = sd.starts = new difference_type[num_threads + 1]; + sd.stable = stable; + + difference_type chunk_length = n / num_threads; + difference_type split = n % num_threads; + difference_type pos = 0; + for (int i = 0; i < num_threads; i++) + { + starts[i] = pos; + pos += (i < split) ? (chunk_length + 1) : chunk_length; + } + starts[num_threads] = pos; + } + + // Now sort in parallel. + parallel_sort_mwms_pu(&sd, comp); + } //parallel - // XXX sd as RAII delete[] starts; delete[] sd.temporaries; delete[] sd.sorting_places; delete[] sd.merging_places; if (Settings::sort_splitting == Settings::SAMPLING) - delete[] sd.samples; + delete[] sd.samples; delete[] sd.offsets; delete[] sd.pieces; - - delete[] pus; } - -} +} //namespace __gnu_parallel #endif diff --git a/libstdc++-v3/include/parallel/omp_loop.h b/libstdc++-v3/include/parallel/omp_loop.h index 23fe6f4a95f..0a992b063de 100644 --- a/libstdc++-v3/include/parallel/omp_loop.h +++ b/libstdc++-v3/include/parallel/omp_loop.h @@ -43,54 +43,71 @@ #include <parallel/settings.h> #include <parallel/basic_iterator.h> +#include <parallel/base.h> namespace __gnu_parallel { - /** @brief Embarrassingly parallel algorithm for random access - * iterators, using an OpenMP for loop. - * - * @param begin Begin iterator of element sequence. - * @param end End iterator of element sequence. - * @param o User-supplied functor (comparator, predicate, adding - * functor, etc.). - * @param f Functor to "process" an element with op (depends on - * desired functionality, e. g. for std::for_each(), ...). - * @param r Functor to "add" a single result to the already - * processed elements (depends on functionality). - * @param base Base value for reduction. - * @param output Pointer to position where final result is written to - * @param bound Maximum number of elements processed (e. g. for - * std::count_n()). - * @return User-supplied functor (that may contain a part of the result). - */ - template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> +/** @brief Embarrassingly parallel algorithm for random access + * iterators, using an OpenMP for loop. + * + * @param begin Begin iterator of element sequence. + * @param end End iterator of element sequence. + * @param o User-supplied functor (comparator, predicate, adding + * functor, etc.). + * @param f Functor to "process" an element with op (depends on + * desired functionality, e. g. for std::for_each(), ...). + * @param r Functor to "add" a single result to the already + * processed elements (depends on functionality). + * @param base Base value for reduction. + * @param output Pointer to position where final result is written to + * @param bound Maximum number of elements processed (e. g. for + * std::count_n()). + * @return User-supplied functor (that may contain a part of the result). + */ +template<typename RandomAccessIterator, + typename Op, + typename Fu, + typename Red, + typename Result> Op - for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound) + for_each_template_random_access_omp_loop( + RandomAccessIterator begin, + RandomAccessIterator end, + Op o, Fu& f, Red r, Result base, Result& output, + typename std::iterator_traits<RandomAccessIterator>:: + difference_type bound) { - typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type; + typedef typename + std::iterator_traits<RandomAccessIterator>::difference_type + difference_type; - thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin)); - Result *thread_results = new Result[num_threads]; difference_type length = end - begin; + thread_index_t num_threads = + __gnu_parallel::min<difference_type>(get_max_threads(), length); - for (thread_index_t i = 0; i < num_threads; i++) + Result *thread_results; + +# pragma omp parallel num_threads(num_threads) { - thread_results[i] = r(thread_results[i], f(o, begin+i)); - } - -#pragma omp parallel num_threads(num_threads) - { -#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size) - for (difference_type pos = 0; pos < length; pos++) - { - thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos)); - } - } +# pragma omp single + { + num_threads = omp_get_num_threads(); + thread_results = new Result[num_threads]; + + for (thread_index_t i = 0; i < num_threads; i++) + thread_results[i] = Result(); + } + + thread_index_t iam = omp_get_thread_num(); + +# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size) + for (difference_type pos = 0; pos < length; pos++) + thread_results[iam] = + r(thread_results[iam], f(o, begin+pos)); + } //parallel for (thread_index_t i = 0; i < num_threads; i++) - { - output = r(output, thread_results[i]); - } + output = r(output, thread_results[i]); delete [] thread_results; @@ -100,6 +117,7 @@ namespace __gnu_parallel return o; } + } // end namespace #endif diff --git a/libstdc++-v3/include/parallel/omp_loop_static.h b/libstdc++-v3/include/parallel/omp_loop_static.h index 22acb2de348..df2c3b5e6a7 100644 --- a/libstdc++-v3/include/parallel/omp_loop_static.h +++ b/libstdc++-v3/include/parallel/omp_loop_static.h @@ -64,39 +64,50 @@ namespace __gnu_parallel * std::count_n()). * @return User-supplied functor (that may contain a part of the result). */ - template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> +template<typename RandomAccessIterator, + typename Op, + typename Fu, + typename Red, + typename Result> Op - for_each_template_random_access_omp_loop_static(RandomAccessIterator begin, - RandomAccessIterator end, - Op o, Fu& f, Red r, - Result base, Result& output, - typename std::iterator_traits<RandomAccessIterator>::difference_type bound) + for_each_template_random_access_omp_loop_static( + RandomAccessIterator begin, + RandomAccessIterator end, + Op o, Fu& f, Red r, Result base, Result& output, + typename std::iterator_traits<RandomAccessIterator>:: + difference_type bound) { - typedef std::iterator_traits<RandomAccessIterator> traits_type; - typedef typename traits_type::difference_type difference_type; + typedef typename + std::iterator_traits<RandomAccessIterator>::difference_type + difference_type; - thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin); - Result *thread_results = new Result[num_threads]; difference_type length = end - begin; + thread_index_t num_threads = + std::min<difference_type>(get_max_threads(), length); - for (thread_index_t i = 0; i < num_threads; i++) + Result *thread_results; + +# pragma omp parallel num_threads(num_threads) { - thread_results[i] = r(thread_results[i], f(o, begin+i)); - } - -#pragma omp parallel num_threads(num_threads) - { -#pragma omp for schedule(static, Settings::workstealing_chunk_size) - for (difference_type pos = 0; pos < length; pos++) - { - thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos)); - } - } +# pragma omp single + { + num_threads = omp_get_num_threads(); + thread_results = new Result[num_threads]; + + for (thread_index_t i = 0; i < num_threads; i++) + thread_results[i] = Result(); + } + + thread_index_t iam = omp_get_thread_num(); + +# pragma omp for schedule(static, Settings::workstealing_chunk_size) + for (difference_type pos = 0; pos < length; pos++) + thread_results[iam] = + r(thread_results[iam], f(o, begin+pos)); + } //parallel for (thread_index_t i = 0; i < num_threads; i++) - { - output = r(output, thread_results[i]); - } + output = r(output, thread_results[i]); delete [] thread_results; @@ -106,6 +117,7 @@ namespace __gnu_parallel return o; } + } // end namespace #endif diff --git a/libstdc++-v3/include/parallel/par_loop.h b/libstdc++-v3/include/parallel/par_loop.h index 98604cf1da4..6954e745076 100644 --- a/libstdc++-v3/include/parallel/par_loop.h +++ b/libstdc++-v3/include/parallel/par_loop.h @@ -41,69 +41,80 @@ #include <omp.h> #include <parallel/settings.h> +#include <parallel/base.h> namespace __gnu_parallel { - /** @brief Embarrassingly parallel algorithm for random access - * iterators, using hand-crafted parallelization by equal splitting - * the work. - * - * @param begin Begin iterator of element sequence. - * @param end End iterator of element sequence. - * @param o User-supplied functor (comparator, predicate, adding - * functor, ...) - * @param f Functor to "process" an element with op (depends on - * desired functionality, e. g. for std::for_each(), ...). - * @param r Functor to "add" a single result to the already - * processed elements (depends on functionality). - * @param base Base value for reduction. - * @param output Pointer to position where final result is written to - * @param bound Maximum number of elements processed (e. g. for - * std::count_n()). - * @return User-supplied functor (that may contain a part of the result). - */ - template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> +/** @brief Embarrassingly parallel algorithm for random access + * iterators, using hand-crafted parallelization by equal splitting + * the work. + * + * @param begin Begin iterator of element sequence. + * @param end End iterator of element sequence. + * @param o User-supplied functor (comparator, predicate, adding + * functor, ...) + * @param f Functor to "process" an element with op (depends on + * desired functionality, e. g. for std::for_each(), ...). + * @param r Functor to "add" a single result to the already + * processed elements (depends on functionality). + * @param base Base value for reduction. + * @param output Pointer to position where final result is written to + * @param bound Maximum number of elements processed (e. g. for + * std::count_n()). + * @return User-supplied functor (that may contain a part of the result). + */ +template< + typename RandomAccessIterator, + typename Op, + typename Fu, + typename Red, + typename Result> Op - for_each_template_random_access_ed(RandomAccessIterator begin, - RandomAccessIterator end, Op o, Fu& f, - Red r, Result base, Result& output, - typename std::iterator_traits<RandomAccessIterator>::difference_type bound) + for_each_template_random_access_ed( + RandomAccessIterator begin, + RandomAccessIterator end, + Op o, Fu& f, Red r, Result base, Result& output, + typename std::iterator_traits<RandomAccessIterator>:: + difference_type bound) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::difference_type difference_type; const difference_type length = end - begin; - const difference_type settings_threads = static_cast<difference_type>(get_max_threads()); - const difference_type dmin = settings_threads < length ? settings_threads : length; - const difference_type dmax = dmin > 1 ? dmin : 1; + Result *thread_results; - thread_index_t num_threads = static_cast<thread_index_t>(dmax); + thread_index_t num_threads = + __gnu_parallel::min<difference_type>(get_max_threads(), length); +# pragma omp parallel num_threads(num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + thread_results = new Result[num_threads]; + } - Result *thread_results = new Result[num_threads]; + thread_index_t iam = omp_get_thread_num(); -#pragma omp parallel num_threads(num_threads) - { - // Neutral element. - Result reduct = Result(); + // Neutral element. + Result reduct = Result(); - thread_index_t p = num_threads; - thread_index_t iam = omp_get_thread_num(); - difference_type start = iam * length / p; - difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p; + difference_type + start = equally_split_point(length, num_threads, iam), + stop = equally_split_point(length, num_threads, iam + 1); - if (start < limit) - { - reduct = f(o, begin + start); - start++; - } + if (start < stop) + { + reduct = f(o, begin + start); + ++start; + } - for (; start < limit; start++) - reduct = r(reduct, f(o, begin + start)); + for (; start < stop; ++start) + reduct = r(reduct, f(o, begin + start)); - thread_results[iam] = reduct; - } + thread_results[iam] = reduct; + } //parallel for (thread_index_t i = 0; i < num_threads; i++) output = r(output, thread_results[i]); diff --git a/libstdc++-v3/include/parallel/partial_sum.h b/libstdc++-v3/include/parallel/partial_sum.h index e7652c07cd9..3dfce86e018 100644 --- a/libstdc++-v3/include/parallel/partial_sum.h +++ b/libstdc++-v3/include/parallel/partial_sum.h @@ -48,130 +48,156 @@ namespace __gnu_parallel { // Problem: there is no 0-element given. - /** @brief Base case prefix sum routine. - * @param begin Begin iterator of input sequence. - * @param end End iterator of input sequence. - * @param result Begin iterator of output sequence. - * @param bin_op Associative binary function. - * @param value Start value. Must be passed since the neutral - * element is unknown in general. - * @return End iterator of output sequence. */ - template<typename InputIterator, typename OutputIterator, typename BinaryOperation> +/** @brief Base case prefix sum routine. + * @param begin Begin iterator of input sequence. + * @param end End iterator of input sequence. + * @param result Begin iterator of output sequence. + * @param bin_op Associative binary function. + * @param value Start value. Must be passed since the neutral + * element is unknown in general. + * @return End iterator of output sequence. */ +template< + typename InputIterator, + typename OutputIterator, + typename BinaryOperation> inline OutputIterator - parallel_partial_sum_basecase(InputIterator begin, InputIterator end, - OutputIterator result, BinaryOperation bin_op, - typename std::iterator_traits<InputIterator>::value_type value) + parallel_partial_sum_basecase( + InputIterator begin, InputIterator end, + OutputIterator result, BinaryOperation bin_op, + typename std::iterator_traits<InputIterator>::value_type value) { if (begin == end) return result; while (begin != end) { - value = bin_op(value, *begin); - *result = value; - result++; - begin++; + value = bin_op(value, *begin); + *result = value; + result++; + begin++; } return result; } - /** @brief Parallel partial sum implementation, two-phase approach, - no recursion. - * @param begin Begin iterator of input sequence. - * @param end End iterator of input sequence. - * @param result Begin iterator of output sequence. - * @param bin_op Associative binary function. - * @param n Length of sequence. - * @param num_threads Number of threads to use. - * @return End iterator of output sequence. - */ - template<typename InputIterator, typename OutputIterator, typename BinaryOperation> +/** @brief Parallel partial sum implementation, two-phase approach, + no recursion. + * @param begin Begin iterator of input sequence. + * @param end End iterator of input sequence. + * @param result Begin iterator of output sequence. + * @param bin_op Associative binary function. + * @param n Length of sequence. + * @param num_threads Number of threads to use. + * @return End iterator of output sequence. + */ +template< + typename InputIterator, + typename OutputIterator, + typename BinaryOperation> OutputIterator - parallel_partial_sum_linear(InputIterator begin, InputIterator end, - OutputIterator result, BinaryOperation bin_op, - typename std::iterator_traits<InputIterator>::difference_type n, int num_threads) + parallel_partial_sum_linear( + InputIterator begin, InputIterator end, + OutputIterator result, BinaryOperation bin_op, + typename std::iterator_traits<InputIterator>::difference_type n) { typedef std::iterator_traits<InputIterator> traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; - if (num_threads > (n - 1)) - num_threads = static_cast<thread_index_t>(n - 1); + thread_index_t num_threads = + std::min<difference_type>(get_max_threads(), n - 1); + if (num_threads < 2) { - *result = *begin; - return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin); + *result = *begin; + return parallel_partial_sum_basecase( + begin + 1, end, result + 1, bin_op, *begin); } - difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2))); + difference_type* borders; + value_type* sums; - if (Settings::partial_sum_dilatation == 1.0f) - equally_split(n, num_threads + 1, borders); - else +# pragma omp parallel num_threads(num_threads) { - difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length; - borders[0] = 0; - for (int i = 1; i < (num_threads + 1); i++) - { - borders[i] = borderstart; - borderstart += chunk_length; - } - borders[num_threads + 1] = n; - } - - value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads)); - OutputIterator target_end; - -#pragma omp parallel num_threads(num_threads) - { - int id = omp_get_thread_num(); - if (id == 0) - { - *result = *begin; - parallel_partial_sum_basecase(begin + 1, begin + borders[1], - result + 1, bin_op, *begin); - sums[0] = *(result + borders[1] - 1); - } - else - { - sums[id] = std::accumulate(begin + borders[id] + 1, - begin + borders[id + 1], - *(begin + borders[id]), - bin_op, __gnu_parallel::sequential_tag()); - } - -#pragma omp barrier - -#pragma omp single - parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1, - bin_op, sums[0]); - -#pragma omp barrier - - // Still same team. - parallel_partial_sum_basecase(begin + borders[id + 1], - begin + borders[id + 2], - result + borders[id + 1], bin_op, - sums[id]); - } - - delete [] sums; +# pragma omp single + { + num_threads = omp_get_num_threads(); + + borders = new difference_type[num_threads + 2]; + + if (Settings::partial_sum_dilatation == 1.0f) + equally_split(n, num_threads + 1, borders); + else + { + difference_type chunk_length = + ((double)n / + ((double)num_threads + Settings::partial_sum_dilatation)), + borderstart = n - num_threads * chunk_length; + borders[0] = 0; + for (int i = 1; i < (num_threads + 1); i++) + { + borders[i] = borderstart; + borderstart += chunk_length; + } + borders[num_threads + 1] = n; + } + + sums = static_cast<value_type*>( + ::operator new(sizeof(value_type) * num_threads)); + OutputIterator target_end; + } //single + + int iam = omp_get_thread_num(); + if (iam == 0) + { + *result = *begin; + parallel_partial_sum_basecase(begin + 1, begin + borders[1], + result + 1, bin_op, *begin); + sums[0] = *(result + borders[1] - 1); + } + else + { + sums[iam] = std::accumulate(begin + borders[iam] + 1, + begin + borders[iam + 1], + *(begin + borders[iam]), + bin_op, __gnu_parallel::sequential_tag()); + } + +# pragma omp barrier + +# pragma omp single + parallel_partial_sum_basecase( + sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]); + +# pragma omp barrier + + // Still same team. + parallel_partial_sum_basecase(begin + borders[iam + 1], + begin + borders[iam + 2], + result + borders[iam + 1], bin_op, + sums[iam]); + } //parallel + + delete[] sums; + delete[] borders; return result + n; } - /** @brief Parallel partial sum front-end. - * @param begin Begin iterator of input sequence. - * @param end End iterator of input sequence. - * @param result Begin iterator of output sequence. - * @param bin_op Associative binary function. - * @return End iterator of output sequence. */ - template<typename InputIterator, typename OutputIterator, typename BinaryOperation> +/** @brief Parallel partial sum front-end. + * @param begin Begin iterator of input sequence. + * @param end End iterator of input sequence. + * @param result Begin iterator of output sequence. + * @param bin_op Associative binary function. + * @return End iterator of output sequence. */ +template< + typename InputIterator, + typename OutputIterator, + typename BinaryOperation> OutputIterator parallel_partial_sum(InputIterator begin, InputIterator end, - OutputIterator result, BinaryOperation bin_op) + OutputIterator result, BinaryOperation bin_op) { - _GLIBCXX_CALL(begin - end); + _GLIBCXX_CALL(begin - end) typedef std::iterator_traits<InputIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -179,18 +205,15 @@ namespace __gnu_parallel difference_type n = end - begin; - int num_threads = get_max_threads(); - switch (Settings::partial_sum_algorithm) { case Settings::LINEAR: - // Need an initial offset. - return parallel_partial_sum_linear(begin, end, result, bin_op, - n, num_threads); + // Need an initial offset. + return parallel_partial_sum_linear(begin, end, result, bin_op, n); default: - // Partial_sum algorithm not implemented. - _GLIBCXX_PARALLEL_ASSERT(0); - return result + n; + // Partial_sum algorithm not implemented. + _GLIBCXX_PARALLEL_ASSERT(0); + return result + n; } } } diff --git a/libstdc++-v3/include/parallel/partition.h b/libstdc++-v3/include/parallel/partition.h index 2b8631db73e..d6dac37d7c8 100644 --- a/libstdc++-v3/include/parallel/partition.h +++ b/libstdc++-v3/include/parallel/partition.h @@ -45,21 +45,21 @@ #include <bits/stl_algo.h> #include <parallel/parallel.h> -/** @brief Decide whether to declare certain variable volatile in this file. */ +/** @brief Decide whether to declare certain variables volatile. */ #define _GLIBCXX_VOLATILE volatile namespace __gnu_parallel { - /** @brief Parallel implementation of std::partition. - * @param begin Begin iterator of input sequence to split. - * @param end End iterator of input sequence to split. - * @param pred Partition predicate, possibly including some kind of pivot. - * @param max_num_threads Maximum number of threads to use for this task. - * @return Number of elements not fulfilling the predicate. */ - template<typename RandomAccessIterator, typename Predicate> - inline typename std::iterator_traits<RandomAccessIterator>::difference_type +/** @brief Parallel implementation of std::partition. + * @param begin Begin iterator of input sequence to split. + * @param end End iterator of input sequence to split. + * @param pred Partition predicate, possibly including some kind of pivot. + * @param num_threads Maximum number of threads to use for this task. + * @return Number of elements not fulfilling the predicate. */ +template<typename RandomAccessIterator, typename Predicate> + typename std::iterator_traits<RandomAccessIterator>::difference_type parallel_partition(RandomAccessIterator begin, RandomAccessIterator end, - Predicate pred, thread_index_t max_num_threads) + Predicate pred, thread_index_t num_threads) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -74,212 +74,238 @@ namespace __gnu_parallel _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right; _GLIBCXX_VOLATILE difference_type leftnew, rightnew; - bool* reserved_left, * reserved_right; - - reserved_left = new bool[max_num_threads]; - reserved_right = new bool[max_num_threads]; + bool* reserved_left = NULL, * reserved_right = NULL; difference_type chunk_size; - if (Settings::partition_chunk_share > 0.0) - chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads)); - else - chunk_size = Settings::partition_chunk_size; omp_lock_t result_lock; omp_init_lock(&result_lock); - // At least good for two processors. - while (right - left + 1 >= 2 * max_num_threads * chunk_size) + //at least two chunks per thread + if(right - left + 1 >= 2 * num_threads * chunk_size) +# pragma omp parallel num_threads(num_threads) { - difference_type num_chunks = (right - left + 1) / chunk_size; - thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2); - - for (int r = 0; r < num_threads; r++) - { - reserved_left[r] = false; - reserved_right[r] = false; - } - leftover_left = 0; - leftover_right = 0; - -#pragma omp parallel num_threads(num_threads) - { - // Private. - difference_type thread_left, thread_left_border, thread_right, thread_right_border; - thread_left = left + 1; - - // Just to satisfy the condition below. - thread_left_border = thread_left - 1; - thread_right = n - 1; - thread_right_border = thread_right + 1; - - bool iam_finished = false; - while (!iam_finished) - { - if (thread_left > thread_left_border) - { - omp_set_lock(&result_lock); - if (left + (chunk_size - 1) > right) - iam_finished = true; - else - { - thread_left = left; - thread_left_border = left + (chunk_size - 1); - left += chunk_size; - } - omp_unset_lock(&result_lock); - } - - if (thread_right < thread_right_border) - { - omp_set_lock(&result_lock); - if (left > right - (chunk_size - 1)) - iam_finished = true; - else - { - thread_right = right; - thread_right_border = right - (chunk_size - 1); - right -= chunk_size; - } - omp_unset_lock(&result_lock); - } - - if (iam_finished) - break; - - // Swap as usual. - while (thread_left < thread_right) - { - while (pred(begin[thread_left]) && thread_left <= thread_left_border) - thread_left++; - while (!pred(begin[thread_right]) && thread_right >= thread_right_border) - thread_right--; - - if (thread_left > thread_left_border || thread_right < thread_right_border) - // Fetch new chunk(s). - break; - - std::swap(begin[thread_left], begin[thread_right]); - thread_left++; - thread_right--; - } - } - - // Now swap the leftover chunks to the right places. - if (thread_left <= thread_left_border) -#pragma omp atomic - leftover_left++; - if (thread_right >= thread_right_border) -#pragma omp atomic - leftover_right++; - -#pragma omp barrier - -#pragma omp single - { - leftnew = left - leftover_left * chunk_size; - rightnew = right + leftover_right * chunk_size; - } - -#pragma omp barrier - - // <=> thread_left_border + (chunk_size - 1) >= leftnew - if (thread_left <= thread_left_border - && thread_left_border >= leftnew) - { - // Chunk already in place, reserve spot. - reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true; - } - - // <=> thread_right_border - (chunk_size - 1) <= rightnew - if (thread_right >= thread_right_border - && thread_right_border <= rightnew) - { - // Chunk already in place, reserve spot. - reserved_right[((thread_right_border - 1) - right) / chunk_size] = true; - } - -#pragma omp barrier - - if (thread_left <= thread_left_border && thread_left_border < leftnew) - { - // Find spot and swap. - difference_type swapstart = -1; - omp_set_lock(&result_lock); - for (int r = 0; r < leftover_left; r++) +# pragma omp single + { + num_threads = omp_get_num_threads(); + reserved_left = new bool[num_threads]; + reserved_right = new bool[num_threads]; + + if (Settings::partition_chunk_share > 0.0) + chunk_size = std::max<difference_type>( + Settings::partition_chunk_size, + (double)n * Settings::partition_chunk_share / + (double)num_threads); + else + chunk_size = Settings::partition_chunk_size; + } + + while (right - left + 1 >= 2 * num_threads * chunk_size) + { +# pragma omp single + { + difference_type num_chunks = (right - left + 1) / chunk_size; + + for (int r = 0; r < num_threads; r++) + { + reserved_left[r] = false; + reserved_right[r] = false; + } + leftover_left = 0; + leftover_right = 0; + } //implicit barrier + + // Private. + difference_type thread_left, thread_left_border, + thread_right, thread_right_border; + thread_left = left + 1; + + // Just to satisfy the condition below. + thread_left_border = thread_left - 1; + thread_right = n - 1; + thread_right_border = thread_right + 1; + + bool iam_finished = false; + while (!iam_finished) + { + if (thread_left > thread_left_border) + { + omp_set_lock(&result_lock); + if (left + (chunk_size - 1) > right) + iam_finished = true; + else + { + thread_left = left; + thread_left_border = left + (chunk_size - 1); + left += chunk_size; + } + omp_unset_lock(&result_lock); + } + + if (thread_right < thread_right_border) + { + omp_set_lock(&result_lock); + if (left > right - (chunk_size - 1)) + iam_finished = true; + else + { + thread_right = right; + thread_right_border = right - (chunk_size - 1); + right -= chunk_size; + } + omp_unset_lock(&result_lock); + } + + if (iam_finished) + break; + + // Swap as usual. + while (thread_left < thread_right) + { + while (pred(begin[thread_left]) + && thread_left <= thread_left_border) + thread_left++; + while (!pred(begin[thread_right]) + && thread_right >= thread_right_border) + thread_right--; + + if (thread_left > thread_left_border + || thread_right < thread_right_border) + // Fetch new chunk(s). + break; + + std::swap(begin[thread_left], begin[thread_right]); + thread_left++; + thread_right--; + } + } + + // Now swap the leftover chunks to the right places. + if (thread_left <= thread_left_border) +# pragma omp atomic + leftover_left++; + if (thread_right >= thread_right_border) +# pragma omp atomic + leftover_right++; + +# pragma omp barrier + +# pragma omp single + { + leftnew = left - leftover_left * chunk_size; + rightnew = right + leftover_right * chunk_size; + } + +# pragma omp barrier + + // <=> thread_left_border + (chunk_size - 1) >= leftnew + if (thread_left <= thread_left_border + && thread_left_border >= leftnew) + { + // Chunk already in place, reserve spot. + reserved_left[(left - (thread_left_border + 1)) / chunk_size] + = true; + } + + // <=> thread_right_border - (chunk_size - 1) <= rightnew + if (thread_right >= thread_right_border + && thread_right_border <= rightnew) + { + // Chunk already in place, reserve spot. + reserved_right + [((thread_right_border - 1) - right) / chunk_size] + = true; + } + +# pragma omp barrier + + if (thread_left <= thread_left_border + && thread_left_border < leftnew) + { + // Find spot and swap. + difference_type swapstart = -1; + omp_set_lock(&result_lock); + for (int r = 0; r < leftover_left; r++) if (!reserved_left[r]) { reserved_left[r] = true; swapstart = left - (r + 1) * chunk_size; break; } - omp_unset_lock(&result_lock); + omp_unset_lock(&result_lock); #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(swapstart != -1); + _GLIBCXX_PARALLEL_ASSERT(swapstart != -1); #endif - std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart); - } - - if (thread_right >= thread_right_border - && thread_right_border > rightnew) - { - // Find spot and swap - difference_type swapstart = -1; - omp_set_lock(&result_lock); - for (int r = 0; r < leftover_right; r++) - if (!reserved_right[r]) - { - reserved_right[r] = true; - swapstart = right + r * chunk_size + 1; - break; - } - omp_unset_lock(&result_lock); + std::swap_ranges( + begin + thread_left_border - (chunk_size - 1), + begin + thread_left_border + 1, + begin + swapstart); + } + + if (thread_right >= thread_right_border + && thread_right_border > rightnew) + { + // Find spot and swap + difference_type swapstart = -1; + omp_set_lock(&result_lock); + for (int r = 0; r < leftover_right; r++) + if (!reserved_right[r]) + { + reserved_right[r] = true; + swapstart = right + r * chunk_size + 1; + break; + } + omp_unset_lock(&result_lock); #if _GLIBCXX_ASSERTIONS - _GLIBCXX_PARALLEL_ASSERT(swapstart != -1); + _GLIBCXX_PARALLEL_ASSERT(swapstart != -1); #endif - std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart); - } + std::swap_ranges(begin + thread_right_border, + begin + thread_right_border + chunk_size, + begin + swapstart); + } #if _GLIBCXX_ASSERTIONS -#pragma omp barrier +# pragma omp barrier -#pragma omp single - { - for (int r = 0; r < leftover_left; r++) - _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]); - for (int r = 0; r < leftover_right; r++) - _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]); - } +# pragma omp single + { + for (int r = 0; r < leftover_left; r++) + _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]); + for (int r = 0; r < leftover_right; r++) + _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]); + } -#pragma omp barrier +# pragma omp barrier #endif -#pragma omp barrier - left = leftnew; - right = rightnew; - } - } // end "recursion" +# pragma omp barrier + + left = leftnew; + right = rightnew; + } +# pragma omp flush(left, right) + } // end "recursion" //parallel difference_type final_left = left, final_right = right; while (final_left < final_right) { - // Go right until key is geq than pivot. - while (pred(begin[final_left]) && final_left < final_right) - final_left++; - - // Go left until key is less than pivot. - while (!pred(begin[final_right]) && final_left < final_right) - final_right--; - - if (final_left == final_right) - break; - std::swap(begin[final_left], begin[final_right]); - final_left++; - final_right--; + // Go right until key is geq than pivot. + while (pred(begin[final_left]) && final_left < final_right) + final_left++; + + // Go left until key is less than pivot. + while (!pred(begin[final_right]) && final_left < final_right) + final_right--; + + if (final_left == final_right) + break; + std::swap(begin[final_left], begin[final_right]); + final_left++; + final_right--; } // All elements on the left side are < piv, all elements on the @@ -298,14 +324,14 @@ namespace __gnu_parallel return final_left + 1; } - /** - * @brief Parallel implementation of std::nth_element(). - * @param begin Begin iterator of input sequence. - * @param nth Iterator of element that must be in position afterwards. - * @param end End iterator of input sequence. - * @param comp Comparator. - */ - template<typename RandomAccessIterator, typename Comparator> +/** + * @brief Parallel implementation of std::nth_element(). + * @param begin Begin iterator of input sequence. + * @param nth Iterator of element that must be in position afterwards. + * @param end End iterator of input sequence. + * @param comp Comparator. + */ +template<typename RandomAccessIterator, typename Comparator> void parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth, RandomAccessIterator end, Comparator comp) @@ -324,65 +350,65 @@ namespace __gnu_parallel // Break if input range to small. while (static_cast<sequence_index_t>(end - begin) >= minimum_length) { - difference_type n = end - begin; - - RandomAccessIterator pivot_pos = begin + rng(n); - - // Swap pivot_pos value to end. - if (pivot_pos != (end - 1)) - std::swap(*pivot_pos, *(end - 1)); - pivot_pos = end - 1; - - // XXX Comparator must have first_value_type, second_value_type, result_type - // Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> > - // pivot_pos == std::pair<S, int>* - // XXX binder2nd only for RandomAccessIterators?? - __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); - - // Divide, leave pivot unchanged in last place. - RandomAccessIterator split_pos1, split_pos2; - split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads()); - - // Left side: < pivot_pos; right side: >= pivot_pos - - // Swap pivot back to middle. - if (split_pos1 != pivot_pos) - std::swap(*split_pos1, *pivot_pos); - pivot_pos = split_pos1; - - // In case all elements are equal, split_pos1 == 0 - if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7)) - { - // Very unequal split, one part smaller than one 128th - // elements not stricly larger than the pivot. - __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos)); - - // Find other end of pivot-equal range. - split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); - } - else - // Only skip the pivot. - split_pos2 = split_pos1 + 1; - - // Compare iterators. - if (split_pos2 <= nth) - begin = split_pos2; - else if (nth < split_pos1) - end = split_pos1; - else - break; + difference_type n = end - begin; + + RandomAccessIterator pivot_pos = begin + rng(n); + + // Swap pivot_pos value to end. + if (pivot_pos != (end - 1)) + std::swap(*pivot_pos, *(end - 1)); + pivot_pos = end - 1; + + // XXX Comparator must have first_value_type, second_value_type, result_type + // Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> > + // pivot_pos == std::pair<S, int>* + // XXX binder2nd only for RandomAccessIterators?? + __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); + + // Divide, leave pivot unchanged in last place. + RandomAccessIterator split_pos1, split_pos2; + split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads()); + + // Left side: < pivot_pos; right side: >= pivot_pos + + // Swap pivot back to middle. + if (split_pos1 != pivot_pos) + std::swap(*split_pos1, *pivot_pos); + pivot_pos = split_pos1; + + // In case all elements are equal, split_pos1 == 0 + if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7)) + { + // Very unequal split, one part smaller than one 128th + // elements not stricly larger than the pivot. + __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos)); + + // Find other end of pivot-equal range. + split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); + } + else + // Only skip the pivot. + split_pos2 = split_pos1 + 1; + + // Compare iterators. + if (split_pos2 <= nth) + begin = split_pos2; + else if (nth < split_pos1) + end = split_pos1; + else + break; } // Only at most Settings::partition_minimal_n elements left. __gnu_sequential::sort(begin, end, comp); } - /** @brief Parallel implementation of std::partial_sort(). - * @param begin Begin iterator of input sequence. - * @param middle Sort until this position. - * @param end End iterator of input sequence. - * @param comp Comparator. */ - template<typename RandomAccessIterator, typename Comparator> +/** @brief Parallel implementation of std::partial_sort(). +* @param begin Begin iterator of input sequence. +* @param middle Sort until this position. +* @param end End iterator of input sequence. +* @param comp Comparator. */ +template<typename RandomAccessIterator, typename Comparator> void parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp) { @@ -390,7 +416,7 @@ namespace __gnu_parallel std::sort(begin, middle, comp); } -} //namespace __gnu_parallel +} //namespace __gnu_parallel #undef _GLIBCXX_VOLATILE diff --git a/libstdc++-v3/include/parallel/quicksort.h b/libstdc++-v3/include/parallel/quicksort.h index a9ceab4a555..4eb357818cf 100644 --- a/libstdc++-v3/include/parallel/quicksort.h +++ b/libstdc++-v3/include/parallel/quicksort.h @@ -53,11 +53,17 @@ namespace __gnu_parallel * this part. */ template<typename RandomAccessIterator, typename Comparator> - inline typename std::iterator_traits<RandomAccessIterator>::difference_type - parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, - typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank, - typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads) + inline + typename std::iterator_traits<RandomAccessIterator>::difference_type + parallel_sort_qs_divide( + RandomAccessIterator begin, + RandomAccessIterator end, + Comparator comp, + typename std::iterator_traits<RandomAccessIterator>::difference_type + pivot_rank, + typename std::iterator_traits<RandomAccessIterator>::difference_type + num_samples, + thread_index_t num_threads) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -65,20 +71,24 @@ namespace __gnu_parallel difference_type n = end - begin; num_samples = std::min(num_samples, n); - value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples)); + + // Allocate uninitialized, to avoid default constructor. + value_type* samples = static_cast<value_type*>( + operator new(num_samples * sizeof(value_type))); for (difference_type s = 0; s < num_samples; s++) { - const unsigned long long index = static_cast<unsigned long long>(s) - * n / num_samples; - samples[s] = begin[index]; + const unsigned long long index = static_cast<unsigned long long>(s) + * n / num_samples; + new(samples + s) value_type(begin[index]); } __gnu_sequential::sort(samples, samples + num_samples, comp); value_type& pivot = samples[pivot_rank * num_samples / n]; - __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot); + __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> + pred(comp, pivot); difference_type split = parallel_partition(begin, end, pred, num_threads); return split; @@ -93,7 +103,10 @@ namespace __gnu_parallel */ template<typename RandomAccessIterator, typename Comparator> inline void - parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads) + parallel_sort_qs_conquer(RandomAccessIterator begin, + RandomAccessIterator end, + Comparator comp, + thread_index_t num_threads) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -101,8 +114,8 @@ namespace __gnu_parallel if (num_threads <= 1) { - __gnu_sequential::sort(begin, end, comp); - return; + __gnu_sequential::sort(begin, end, comp); + return; } difference_type n = end - begin, pivot_rank; @@ -110,24 +123,27 @@ namespace __gnu_parallel if (n <= 1) return; - thread_index_t num_processors_left; + thread_index_t num_threads_left; if ((num_threads % 2) == 1) - num_processors_left = num_threads / 2 + 1; + num_threads_left = num_threads / 2 + 1; else - num_processors_left = num_threads / 2; + num_threads_left = num_threads / 2; - pivot_rank = n * num_processors_left / num_threads; + pivot_rank = n * num_threads_left / num_threads; - difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank, -Settings::sort_qs_num_samples_preset, num_threads); + difference_type split = parallel_sort_qs_divide( + begin, end, comp, pivot_rank, + Settings::sort_qs_num_samples_preset, num_threads); #pragma omp parallel sections { #pragma omp section - parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left); + parallel_sort_qs_conquer(begin, begin + split, + comp, num_threads_left); #pragma omp section - parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left); + parallel_sort_qs_conquer(begin + split, end, + comp, num_threads - num_threads_left); } } @@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads); */ template<typename RandomAccessIterator, typename Comparator> inline void - parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end, - Comparator comp, - typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads) + parallel_sort_qs( + RandomAccessIterator begin, + RandomAccessIterator end, + Comparator comp, + typename std::iterator_traits<RandomAccessIterator>::difference_type n, + int num_threads) { _GLIBCXX_CALL(n) @@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads); // Hard to avoid. omp_set_num_threads(num_threads); - bool old_nested = (omp_get_nested() != 0); - omp_set_nested(true); parallel_sort_qs_conquer(begin, begin + n, comp, num_threads); - omp_set_nested(old_nested); } -} //namespace __gnu_parallel +} //namespace __gnu_parallel #endif diff --git a/libstdc++-v3/include/parallel/random_shuffle.h b/libstdc++-v3/include/parallel/random_shuffle.h index 933ab3e8a8e..d7a82fbc59e 100644 --- a/libstdc++-v3/include/parallel/random_shuffle.h +++ b/libstdc++-v3/include/parallel/random_shuffle.h @@ -45,16 +45,16 @@ namespace __gnu_parallel { - /** @brief Type to hold the index of a bin. - * - * Since many variables of this type are allocated, it should be - * chosen as small as possible. - */ - typedef unsigned short bin_index; - - /** @brief Data known to every thread participating in - __gnu_parallel::parallel_random_shuffle(). */ - template<typename RandomAccessIterator> +/** @brief Type to hold the index of a bin. + * + * Since many variables of this type are allocated, it should be + * chosen as small as possible. + */ +typedef unsigned short bin_index; + +/** @brief Data known to every thread participating in + __gnu_parallel::parallel_random_shuffle(). */ +template<typename RandomAccessIterator> struct DRandomShufflingGlobalData { typedef std::iterator_traits<RandomAccessIterator> traits_type; @@ -90,18 +90,15 @@ namespace __gnu_parallel : source(_source) { } }; - /** @brief Local data for a thread participating in - __gnu_parallel::parallel_random_shuffle(). - */ - template<typename RandomAccessIterator, typename RandomNumberGenerator> +/** @brief Local data for a thread participating in + __gnu_parallel::parallel_random_shuffle(). + */ +template<typename RandomAccessIterator, typename RandomNumberGenerator> struct DRSSorterPU { /** @brief Number of threads participating in total. */ int num_threads; - /** @brief Number of owning thread. */ - int iam; - /** @brief Begin index for bins taken care of by this thread. */ bin_index bins_begin; @@ -115,29 +112,29 @@ namespace __gnu_parallel DRandomShufflingGlobalData<RandomAccessIterator>* sd; }; - /** @brief Generate a random number in @c [0,2^logp). - * @param logp Logarithm (basis 2) of the upper range bound. - * @param rng Random number generator to use. - */ - template<typename RandomNumberGenerator> +/** @brief Generate a random number in @c [0,2^logp). + * @param logp Logarithm (basis 2) of the upper range bound. + * @param rng Random number generator to use. + */ +template<typename RandomNumberGenerator> inline int random_number_pow2(int logp, RandomNumberGenerator& rng) { return rng.genrand_bits(logp); } - /** @brief Random shuffle code executed by each thread. - * @param pus Array of thread-local data records. */ - template<typename RandomAccessIterator, typename RandomNumberGenerator> +/** @brief Random shuffle code executed by each thread. + * @param pus Array of thread-local data records. */ +template<typename RandomAccessIterator, typename RandomNumberGenerator> inline void - parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, - RandomNumberGenerator>* pus) + parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, + RandomNumberGenerator>* pus) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; typedef typename traits_type::difference_type difference_type; - DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()]; + thread_index_t iam = omp_get_thread_num(); + DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam]; DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd; - thread_index_t iam = d->iam; // Indexing: dist[bin][processor] difference_type length = sd->starts[iam + 1] - sd->starts[iam]; @@ -156,35 +153,35 @@ namespace __gnu_parallel // First main loop. for (difference_type i = 0; i < length; i++) { - bin_index oracle = random_number_pow2(num_bits, rng); - oracles[i] = oracle; + bin_index oracle = random_number_pow2(num_bits, rng); + oracles[i] = oracle; - // To allow prefix (partial) sum. - dist[oracle + 1]++; + // To allow prefix (partial) sum. + dist[oracle + 1]++; } for (bin_index b = 0; b < sd->num_bins + 1; b++) sd->dist[b][iam + 1] = dist[b]; -#pragma omp barrier +# pragma omp barrier -#pragma omp single +# pragma omp single { // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the // total number of items in bin s for (bin_index s = 0; s < sd->num_bins; s++) - __gnu_sequential::partial_sum(sd->dist[s + 1], - sd->dist[s + 1] + d->num_threads + 1, - sd->dist[s + 1]); + __gnu_sequential::partial_sum(sd->dist[s + 1], + sd->dist[s + 1] + d->num_threads + 1, + sd->dist[s + 1]); } -#pragma omp barrier +# pragma omp barrier sequence_index_t offset = 0, global_offset = 0; for (bin_index s = 0; s < d->bins_begin; s++) global_offset += sd->dist[s + 1][d->num_threads]; -#pragma omp barrier +# pragma omp barrier for (bin_index s = d->bins_begin; s < d->bins_end; s++) { @@ -193,9 +190,10 @@ namespace __gnu_parallel offset = sd->dist[s + 1][d->num_threads]; } - sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset)); + sd->temporaries[iam] = static_cast<value_type*>( + ::operator new(sizeof(value_type) * offset)); -#pragma omp barrier +# pragma omp barrier // Draw local copies to avoid false sharing. for (bin_index b = 0; b < sd->num_bins + 1; b++) @@ -211,11 +209,11 @@ namespace __gnu_parallel // Distribute according to oracles, second main loop. for (difference_type i = 0; i < length; i++) { - bin_index target_bin = oracles[i]; - thread_index_t target_p = bin_proc[target_bin]; + bin_index target_bin = oracles[i]; + thread_index_t target_p = bin_proc[target_bin]; - // Last column [d->num_threads] stays unchanged. - temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start); + // Last column [d->num_threads] stays unchanged. + temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start); } delete[] oracles; @@ -223,23 +221,27 @@ namespace __gnu_parallel delete[] bin_proc; delete[] temporaries; -#pragma omp barrier +# pragma omp barrier // Shuffle bins internally. for (bin_index b = d->bins_begin; b < d->bins_end; b++) { - value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), - * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; - sequential_random_shuffle(begin, end, rng); - std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); + value_type* begin = + sd->temporaries[iam] + + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), + * end = + sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; + sequential_random_shuffle(begin, end, rng); + std::copy(begin, end, sd->source + global_offset + + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); } delete[] sd->temporaries[iam]; } - /** @brief Round up to the next greater power of 2. - * @param x Integer to round up */ - template<typename T> +/** @brief Round up to the next greater power of 2. + * @param x Integer to round up */ +template<typename T> T round_up_to_pow2(T x) { @@ -249,16 +251,21 @@ namespace __gnu_parallel return (T)1 << (log2(x - 1) + 1); } - /** @brief Main parallel random shuffle step. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param n Length of sequence. - * @param num_threads Number of threads to use. - * @param rng Random number generator to use. - */ - template<typename RandomAccessIterator, typename RandomNumberGenerator> +/** @brief Main parallel random shuffle step. + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param n Length of sequence. + * @param num_threads Number of threads to use. + * @param rng Random number generator to use. + */ +template<typename RandomAccessIterator, typename RandomNumberGenerator> inline void - parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng) + parallel_random_shuffle_drs( + RandomAccessIterator begin, + RandomAccessIterator end, + typename std::iterator_traits<RandomAccessIterator>::difference_type n, + thread_index_t num_threads, + RandomNumberGenerator& rng) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -275,87 +282,99 @@ namespace __gnu_parallel // Try the L1 cache first. // Must fit into L1. - num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); + num_bins_cache = std::max<difference_type>( + 1, n / (Settings::L1_cache_size_lb / sizeof(value_type))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2 // Power of 2 and at least one element per bin, at most the TLB size. - num_bins = std::min(n, (difference_type)num_bins_cache); + num_bins = std::min<difference_type>(n, num_bins_cache); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB // 2 TLB entries needed per bin. - num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); + num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins); #endif num_bins = round_up_to_pow2(num_bins); if (num_bins < num_bins_cache) { #endif - // Now try the L2 cache - // Must fit into L2 - num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); - num_bins_cache = round_up_to_pow2(num_bins_cache); - - // No more buckets than TLB entries, power of 2. - num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); - // Power of 2 and at least one element per bin, at most the TLB size. + // Now try the L2 cache + // Must fit into L2 + num_bins_cache = static_cast<bin_index>(std::max<difference_type>( + 1, n / (Settings::L2_cache_size / sizeof(value_type)))); + num_bins_cache = round_up_to_pow2(num_bins_cache); + + // No more buckets than TLB entries, power of 2. + num_bins = static_cast<bin_index>( + std::min(n, static_cast<difference_type>(num_bins_cache))); + // Power of 2 and at least one element per bin, at most the TLB size. #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB - // 2 TLB entries needed per bin. - num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); + // 2 TLB entries needed per bin. + num_bins = std::min( + static_cast<difference_type>(Settings::TLB_size / 2), num_bins); #endif - num_bins = round_up_to_pow2(num_bins); + num_bins = round_up_to_pow2(num_bins); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 } #endif - num_threads = std::min((bin_index)num_threads, (bin_index)num_bins); + num_threads = std::min<bin_index>(num_threads, num_bins); if (num_threads <= 1) return sequential_random_shuffle(begin, end, rng); DRandomShufflingGlobalData<RandomAccessIterator> sd(begin); + DRSSorterPU<RandomAccessIterator, random_number >* pus; + difference_type* starts; - DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads]; - - sd.temporaries = new value_type*[num_threads]; - //sd.oracles = new bin_index[n]; - sd.dist = new difference_type*[num_bins + 1]; - sd.bin_proc = new thread_index_t[num_bins]; - for (bin_index b = 0; b < num_bins + 1; b++) - sd.dist[b] = new difference_type[num_threads + 1]; - for (bin_index b = 0; b < (num_bins + 1); b++) +# pragma omp parallel num_threads(num_threads) { - sd.dist[0][0] = 0; - sd.dist[b][0] = 0; - } - difference_type* starts = sd.starts = new difference_type[num_threads + 1]; - int bin_cursor = 0; - sd.num_bins = num_bins; - sd.num_bits = log2(num_bins); - - difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0; - int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads; - for (int i = 0; i < num_threads; i++) - { - starts[i] = start; - start += (i < split) ? (chunk_length + 1) : chunk_length; - int j = pus[i].bins_begin = bin_cursor; - - // Range of bins for this processor. - bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length; - pus[i].bins_end = bin_cursor; - for (; j < bin_cursor; j++) - sd.bin_proc[j] = i; - pus[i].num_threads = num_threads; - pus[i].iam = i; - pus[i].seed = rng(std::numeric_limits<uint32>::max()); - pus[i].sd = &sd; - } - starts[num_threads] = start; - - // Now shuffle in parallel. -#pragma omp parallel num_threads(num_threads) - parallel_random_shuffle_drs_pu(pus); +# pragma omp single + { + pus = new DRSSorterPU<RandomAccessIterator, random_number> + [num_threads]; + + sd.temporaries = new value_type*[num_threads]; + sd.dist = new difference_type*[num_bins + 1]; + sd.bin_proc = new thread_index_t[num_bins]; + for (bin_index b = 0; b < num_bins + 1; b++) + sd.dist[b] = new difference_type[num_threads + 1]; + for (bin_index b = 0; b < (num_bins + 1); b++) + { + sd.dist[0][0] = 0; + sd.dist[b][0] = 0; + } + starts = sd.starts = new difference_type[num_threads + 1]; + int bin_cursor = 0; + sd.num_bins = num_bins; + sd.num_bits = log2(num_bins); + + difference_type chunk_length = n / num_threads, + split = n % num_threads, start = 0; + difference_type bin_chunk_length = num_bins / num_threads, + bin_split = num_bins % num_threads; + for (thread_index_t i = 0; i < num_threads; i++) + { + starts[i] = start; + start += (i < split) ? (chunk_length + 1) : chunk_length; + int j = pus[i].bins_begin = bin_cursor; + + // Range of bins for this processor. + bin_cursor += (i < bin_split) ? + (bin_chunk_length + 1) : bin_chunk_length; + pus[i].bins_end = bin_cursor; + for (; j < bin_cursor; j++) + sd.bin_proc[j] = i; + pus[i].num_threads = num_threads; + pus[i].seed = rng(std::numeric_limits<uint32>::max()); + pus[i].sd = &sd; + } + starts[num_threads] = start; + } //single + // Now shuffle in parallel. + parallel_random_shuffle_drs_pu(pus); + } delete[] starts; delete[] sd.bin_proc; @@ -367,16 +386,16 @@ namespace __gnu_parallel delete[] pus; } - /** @brief Sequential cache-efficient random shuffle. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param rng Random number generator to use. - */ - template<typename RandomAccessIterator, typename RandomNumberGenerator> +/** @brief Sequential cache-efficient random shuffle. + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param rng Random number generator to use. + */ +template<typename RandomAccessIterator, typename RandomNumberGenerator> inline void sequential_random_shuffle(RandomAccessIterator begin, - RandomAccessIterator end, - RandomNumberGenerator& rng) + RandomAccessIterator end, + RandomNumberGenerator& rng) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::value_type value_type; @@ -388,7 +407,9 @@ namespace __gnu_parallel #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 // Try the L1 cache first, must fit into L1. - num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); + num_bins_cache = + std::max<difference_type> + (1, n / (Settings::L1_cache_size_lb / sizeof(value_type))); num_bins_cache = round_up_to_pow2(num_bins_cache); // No more buckets than TLB entries, power of 2 @@ -403,19 +424,23 @@ namespace __gnu_parallel if (num_bins < num_bins_cache) { #endif - // Now try the L2 cache, must fit into L2. - num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); - num_bins_cache = round_up_to_pow2(num_bins_cache); + // Now try the L2 cache, must fit into L2. + num_bins_cache = + static_cast<bin_index>(std::max<difference_type>( + 1, n / (Settings::L2_cache_size / sizeof(value_type)))); + num_bins_cache = round_up_to_pow2(num_bins_cache); - // No more buckets than TLB entries, power of 2 - // Power of 2 and at least one element per bin, at most the TLB size. - num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); + // No more buckets than TLB entries, power of 2 + // Power of 2 and at least one element per bin, at most the TLB size. + num_bins = static_cast<bin_index> + (std::min(n, static_cast<difference_type>(num_bins_cache))); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB - // 2 TLB entries needed per bin - num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); + // 2 TLB entries needed per bin + num_bins = + std::min<difference_type>(Settings::TLB_size / 2, num_bins); #endif - num_bins = round_up_to_pow2(num_bins); + num_bins = round_up_to_pow2(num_bins); #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 } #endif @@ -424,58 +449,62 @@ namespace __gnu_parallel if (num_bins > 1) { - value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n)); - bin_index* oracles = new bin_index[n]; - difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1]; - - for (int b = 0; b < num_bins + 1; b++) - dist0[b] = 0; - - random_number bitrng(rng(0xFFFFFFFF)); - - for (difference_type i = 0; i < n; i++) - { - bin_index oracle = random_number_pow2(num_bits, bitrng); - oracles[i] = oracle; - - // To allow prefix (partial) sum. - dist0[oracle + 1]++; - } - - // Sum up bins. - __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0); - - for (int b = 0; b < num_bins + 1; b++) - dist1[b] = dist0[b]; - - // Distribute according to oracles. - for (difference_type i = 0; i < n; i++) - target[(dist0[oracles[i]])++] = *(begin + i); - - for (int b = 0; b < num_bins; b++) - { - sequential_random_shuffle(target + dist1[b], target + dist1[b + 1], - rng); - } - - delete[] dist0; - delete[] dist1; - delete[] oracles; - delete[] target; + value_type* target = static_cast<value_type*>( + ::operator new(sizeof(value_type) * n)); + bin_index* oracles = new bin_index[n]; + difference_type* dist0 = new difference_type[num_bins + 1], + * dist1 = new difference_type[num_bins + 1]; + + for (int b = 0; b < num_bins + 1; b++) + dist0[b] = 0; + + random_number bitrng(rng(0xFFFFFFFF)); + + for (difference_type i = 0; i < n; i++) + { + bin_index oracle = random_number_pow2(num_bits, bitrng); + oracles[i] = oracle; + + // To allow prefix (partial) sum. + dist0[oracle + 1]++; + } + + // Sum up bins. + __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0); + + for (int b = 0; b < num_bins + 1; b++) + dist1[b] = dist0[b]; + + // Distribute according to oracles. + for (difference_type i = 0; i < n; i++) + target[(dist0[oracles[i]])++] = *(begin + i); + + for (int b = 0; b < num_bins; b++) + { + sequential_random_shuffle(target + dist1[b], + target + dist1[b + 1], + rng); + } + + delete[] dist0; + delete[] dist1; + delete[] oracles; + delete[] target; } else __gnu_sequential::random_shuffle(begin, end, rng); } - /** @brief Parallel random public call. - * @param begin Begin iterator of sequence. - * @param end End iterator of sequence. - * @param rng Random number generator to use. - */ - template<typename RandomAccessIterator, typename RandomNumberGenerator> +/** @brief Parallel random public call. + * @param begin Begin iterator of sequence. + * @param end End iterator of sequence. + * @param rng Random number generator to use. + */ +template<typename RandomAccessIterator, typename RandomNumberGenerator> inline void - parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, - RandomNumberGenerator rng = random_number()) + parallel_random_shuffle(RandomAccessIterator begin, + RandomAccessIterator end, + RandomNumberGenerator rng = random_number()) { typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename traits_type::difference_type difference_type; diff --git a/libstdc++-v3/include/parallel/search.h b/libstdc++-v3/include/parallel/search.h index ba00a8faf6e..d917b12693a 100644 --- a/libstdc++-v3/include/parallel/search.h +++ b/libstdc++-v3/include/parallel/search.h @@ -53,10 +53,10 @@ namespace __gnu_parallel * @param length Length of sequence to search for. * @param advances Returned offsets. */ - template<typename RandomAccessIterator, typename _DifferenceTp> +template<typename RandomAccessIterator, typename _DifferenceTp> void calc_borders(RandomAccessIterator elements, _DifferenceTp length, - _DifferenceTp* off) + _DifferenceTp* off) { typedef _DifferenceTp difference_type; @@ -66,9 +66,9 @@ namespace __gnu_parallel difference_type k = 0; for (difference_type j = 2; j <= length; j++) { - while ((k >= 0) && !(elements[k] == elements[j-1])) - k = off[k]; - off[j] = ++k; + while ((k >= 0) && !(elements[k] == elements[j-1])) + k = off[k]; + off[j] = ++k; } } @@ -81,11 +81,14 @@ namespace __gnu_parallel * @param end2 End iterator of second sequence. * @param pred Find predicate. * @return Place of finding in first sequences. */ - template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred> +template< + typename _RandomAccessIterator1, + typename _RandomAccessIterator2, + typename Pred> _RandomAccessIterator1 search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1, - _RandomAccessIterator2 begin2, _RandomAccessIterator2 end2, - Pred pred) + _RandomAccessIterator2 begin2, _RandomAccessIterator2 end2, + Pred pred) { typedef std::iterator_traits<_RandomAccessIterator1> traits_type; typedef typename traits_type::difference_type difference_type; @@ -103,60 +106,71 @@ namespace __gnu_parallel // Where is first occurrence of pattern? defaults to end. difference_type result = (end1 - begin1); + difference_type *splitters; // Pattern too long. if (input_length < 0) return end1; - thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads())); - omp_lock_t result_lock; omp_init_lock(&result_lock); - difference_type borders[num_threads + 1]; - __gnu_parallel::equally_split(input_length, num_threads, borders); + thread_index_t num_threads = + std::max<difference_type>(1, + std::min<difference_type>(input_length, get_max_threads())); difference_type advances[pattern_length]; calc_borders(begin2, pattern_length, advances); -#pragma omp parallel num_threads(num_threads) - { - thread_index_t iam = omp_get_thread_num(); - - difference_type start = borders[iam], stop = borders[iam + 1]; - - difference_type pos_in_pattern = 0; - bool found_pattern = false; - - while (start <= stop && !found_pattern) - { - // Get new value of result. -#pragma omp flush(result) - // No chance for this thread to find first occurrence. - if (result < start) - break; - while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern])) - { - ++pos_in_pattern; - if (pos_in_pattern == pattern_length) - { - // Found new candidate for result. - omp_set_lock(&result_lock); - result = std::min(result, start); - omp_unset_lock(&result_lock); - - found_pattern = true; - break; - } - } - // Make safe jump. - start += (pos_in_pattern - advances[pos_in_pattern]); - pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern]; - } - } +# pragma omp parallel num_threads(num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + splitters = new difference_type[num_threads + 1]; + equally_split(input_length, num_threads, splitters); + } + + thread_index_t iam = omp_get_thread_num(); + + difference_type start = splitters[iam], stop = splitters[iam + 1]; + + difference_type pos_in_pattern = 0; + bool found_pattern = false; + + while (start <= stop && !found_pattern) + { + // Get new value of result. + #pragma omp flush(result) + // No chance for this thread to find first occurrence. + if (result < start) + break; + while (pred(begin1[start + pos_in_pattern], + begin2[pos_in_pattern])) + { + ++pos_in_pattern; + if (pos_in_pattern == pattern_length) + { + // Found new candidate for result. + omp_set_lock(&result_lock); + result = std::min(result, start); + omp_unset_lock(&result_lock); + + found_pattern = true; + break; + } + } + // Make safe jump. + start += (pos_in_pattern - advances[pos_in_pattern]); + pos_in_pattern = + (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern]; + } + } //parallel omp_destroy_lock(&result_lock); + delete[] splitters; + // Return iterator on found element. return (begin1 + result); } diff --git a/libstdc++-v3/include/parallel/set_operations.h b/libstdc++-v3/include/parallel/set_operations.h index 2a16691459a..7be31277077 100644 --- a/libstdc++-v3/include/parallel/set_operations.h +++ b/libstdc++-v3/include/parallel/set_operations.h @@ -47,28 +47,31 @@ namespace __gnu_parallel { - template<typename InputIterator, typename OutputIterator> +template<typename InputIterator, typename OutputIterator> inline OutputIterator copy_tail(std::pair<InputIterator, InputIterator> b, - std::pair<InputIterator, InputIterator> e, OutputIterator r) + std::pair<InputIterator, InputIterator> e, OutputIterator r) { if (b.first != e.first) { - do - { - *r++ = *b.first++; - } - while (b.first != e.first); + do + { + *r++ = *b.first++; + } + while (b.first != e.first); } else { - while (b.second != e.second) - *r++ = *b.second++; + while (b.second != e.second) + *r++ = *b.second++; } return r; } - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> struct symmetric_difference_func { typedef std::iterator_traits<InputIterator> traits_type; @@ -80,55 +83,56 @@ namespace __gnu_parallel Comparator comp; inline OutputIterator invoke(InputIterator a, InputIterator b, - InputIterator c, InputIterator d, - OutputIterator r) const + InputIterator c, InputIterator d, + OutputIterator r) const { while (a != b && c != d) - { - if (comp(*a, *c)) - { - *r = *a; - ++a; - ++r; - } - else if (comp(*c, *a)) - { - *r = *c; - ++c; - ++r; - } - else - { - ++a; - ++c; - } - } + { + if (comp(*a, *c)) + { + *r = *a; + ++a; + ++r; + } + else if (comp(*c, *a)) + { + *r = *c; + ++c; + ++r; + } + else + { + ++a; + ++c; + } + } return std::copy(c, d, std::copy(a, b, r)); } inline difference_type - count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const + count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) + const { difference_type counter = 0; while (a != b && c != d) - { - if (comp(*a, *c)) - { - ++a; - ++counter; - } - else if (comp(*c, *a)) - { - ++c; - ++counter; - } - else - { - ++a; - ++c; - } - } + { + if (comp(*a, *c)) + { + ++a; + ++counter; + } + else if (comp(*c, *a)) + { + ++c; + ++counter; + } + else + { + ++a; + ++c; + } + } return counter + (b - a) + (d - c); } @@ -144,7 +148,10 @@ namespace __gnu_parallel }; - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> struct difference_func { typedef std::iterator_traits<InputIterator> traits_type; @@ -157,44 +164,45 @@ namespace __gnu_parallel inline OutputIterator invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d, - OutputIterator r) const + OutputIterator r) const { while (a != b && c != d) - { - if (comp(*a, *c)) - { - *r = *a; - ++a; - ++r; - } - else if (comp(*c, *a)) - { ++c; } - else - { - ++a; - ++c; - } - } + { + if (comp(*a, *c)) + { + *r = *a; + ++a; + ++r; + } + else if (comp(*c, *a)) + { ++c; } + else + { + ++a; + ++c; + } + } return std::copy(a, b, r); } inline difference_type - count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const + count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) + const { difference_type counter = 0; while (a != b && c != d) - { - if (comp(*a, *c)) - { - ++a; - ++counter; - } - else if (comp(*c, *a)) - { ++c; } - else - { ++a; ++c; } - } + { + if (comp(*a, *c)) + { + ++a; + ++counter; + } + else if (comp(*c, *a)) + { ++c; } + else + { ++a; ++c; } + } return counter + (b - a); } @@ -209,7 +217,10 @@ namespace __gnu_parallel }; - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> struct intersection_func { typedef std::iterator_traits<InputIterator> traits_type; @@ -222,44 +233,45 @@ namespace __gnu_parallel inline OutputIterator invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d, - OutputIterator r) const + OutputIterator r) const { while (a != b && c != d) - { - if (comp(*a, *c)) - { ++a; } - else if (comp(*c, *a)) - { ++c; } - else - { - *r = *a; - ++a; - ++c; - ++r; - } - } + { + if (comp(*a, *c)) + { ++a; } + else if (comp(*c, *a)) + { ++c; } + else + { + *r = *a; + ++a; + ++c; + ++r; + } + } return r; } inline difference_type - count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const + count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) + const { difference_type counter = 0; while (a != b && c != d) - { - if (comp(*a, *c)) - { ++a; } - else if (comp(*c, *a)) - { ++c; } - else - { - ++a; - ++c; - ++counter; - } - } + { + if (comp(*a, *c)) + { ++a; } + else if (comp(*c, *a)) + { ++c; } + else + { + ++a; + ++c; + ++counter; + } + } return counter; } @@ -273,10 +285,11 @@ namespace __gnu_parallel { return out; } }; - template<class InputIterator, class OutputIterator, class Comparator> +template<class InputIterator, class OutputIterator, class Comparator> struct union_func { - typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + typedef typename std::iterator_traits<InputIterator>::difference_type + difference_type; union_func(Comparator c) : comp(c) {} @@ -284,50 +297,50 @@ namespace __gnu_parallel inline OutputIterator invoke(InputIterator a, const InputIterator b, InputIterator c, - const InputIterator d, OutputIterator r) const + const InputIterator d, OutputIterator r) const { while (a != b && c != d) - { - if (comp(*a, *c)) - { - *r = *a; - ++a; - } - else if (comp(*c, *a)) - { - *r = *c; - ++c; - } - else - { - *r = *a; - ++a; - ++c; - } - ++r; - } + { + if (comp(*a, *c)) + { + *r = *a; + ++a; + } + else if (comp(*c, *a)) + { + *r = *c; + ++c; + } + else + { + *r = *a; + ++a; + ++c; + } + ++r; + } return std::copy(c, d, std::copy(a, b, r)); } inline difference_type - count(InputIterator a, const InputIterator b, InputIterator c, - const InputIterator d) const + count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) + const { difference_type counter = 0; while (a != b && c != d) - { - if (comp(*a, *c)) - { ++a; } - else if (comp(*c, *a)) - { ++c; } - else - { - ++a; - ++c; - } - ++counter; - } + { + if (comp(*a, *c)) + { ++a; } + else if (comp(*c, *a)) + { ++c; } + else + { + ++a; + ++c; + } + ++counter; + } counter += (b - a); counter += (d - c); @@ -343,11 +356,14 @@ namespace __gnu_parallel { return std::copy(a, b, out); } }; - template<typename InputIterator, typename OutputIterator, typename Operation> +template< + typename InputIterator, + typename OutputIterator, + typename Operation> OutputIterator parallel_set_operation(InputIterator begin1, InputIterator end1, - InputIterator begin2, InputIterator end2, - OutputIterator result, Operation op) + InputIterator begin2, InputIterator end2, + OutputIterator result, Operation op) { _GLIBCXX_CALL((end1 - begin1) + (end2 - begin2)) @@ -355,7 +371,6 @@ namespace __gnu_parallel typedef typename traits_type::difference_type difference_type; typedef typename std::pair<InputIterator, InputIterator> iterator_pair; - if (begin1 == end1) return op.first_empty(begin2, end2, result); @@ -364,152 +379,174 @@ namespace __gnu_parallel const difference_type size = (end1 - begin1) + (end2 - begin2); - thread_index_t num_threads = std::min<difference_type>(std::min(end1 - begin1, end2 - begin2), get_max_threads()); - - difference_type borders[num_threads + 2]; - equally_split(size, num_threads + 1, borders); - - const iterator_pair sequence[ 2 ] = { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ; - - iterator_pair block_begins[num_threads + 1]; - - // Very start. - block_begins[0] = std::make_pair(begin1, begin2); - difference_type length[num_threads]; - + const iterator_pair sequence[ 2 ] = + { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ; OutputIterator return_value = result; + difference_type *borders; + iterator_pair *block_begins; + difference_type* lengths; -#pragma omp parallel num_threads(num_threads) - { - // Result from multiseq_partition. - InputIterator offset[2]; - const int iam = omp_get_thread_num(); - - const difference_type rank = borders[iam + 1]; - - multiseq_partition(sequence, sequence + 2, rank, offset, op.comp); - - // allowed to read? - // together - // *(offset[ 0 ] - 1) == *offset[ 1 ] - if (offset[ 0 ] != begin1 && offset[ 1 ] != end2 - && !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ]) - && !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1))) - { - // Avoid split between globally equal elements: move one to - // front in first sequence. - --offset[ 0 ]; - } - - iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]); - - // Make sure all threads have their block_begin result written out. -#pragma omp barrier - - iterator_pair block_begin = block_begins[ iam ]; - - // Begin working for the first block, while the others except - // the last start to count. - if (iam == 0) - { - // The first thread can copy already. - length[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result; - } - else - { - length[ iam ] = op.count(block_begin.first, block_end.first, - block_begin.second, block_end.second); - } - - // Make sure everyone wrote their lengths. -#pragma omp barrier - - OutputIterator r = result; - - if (iam == 0) - { - // Do the last block. - for (int i = 0; i < num_threads; ++i) - r += length[i]; - - block_begin = block_begins[num_threads]; - - // Return the result iterator of the last block. - return_value = op.invoke(block_begin.first, end1, block_begin.second, end2, r); - - } - else - { - for (int i = 0; i < iam; ++i) - r += length[ i ]; - - // Reset begins for copy pass. - op.invoke(block_begin.first, block_end.first, - block_begin.second, block_end.second, r); - } - } + thread_index_t num_threads = + std::min<difference_type>(get_max_threads(), + std::min(end1 - begin1, end2 - begin2)); + +# pragma omp parallel num_threads(num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + + borders = new difference_type[num_threads + 2]; + equally_split(size, num_threads + 1, borders); + block_begins = new iterator_pair[num_threads + 1]; + // Very start. + block_begins[0] = std::make_pair(begin1, begin2); + lengths = new difference_type[num_threads]; + } //single + + thread_index_t iam = omp_get_thread_num(); + + // Result from multiseq_partition. + InputIterator offset[2]; + const difference_type rank = borders[iam + 1]; + + multiseq_partition(sequence, sequence + 2, rank, offset, op.comp); + + // allowed to read? + // together + // *(offset[ 0 ] - 1) == *offset[ 1 ] + if (offset[ 0 ] != begin1 && offset[ 1 ] != end2 + && !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ]) + && !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1))) + { + // Avoid split between globally equal elements: move one to + // front in first sequence. + --offset[ 0 ]; + } + + iterator_pair block_end = block_begins[ iam + 1 ] = + iterator_pair(offset[ 0 ], offset[ 1 ]); + + // Make sure all threads have their block_begin result written out. +# pragma omp barrier + + iterator_pair block_begin = block_begins[ iam ]; + + // Begin working for the first block, while the others except + // the last start to count. + if (iam == 0) + { + // The first thread can copy already. + lengths[ iam ] = op.invoke(block_begin.first, block_end.first, + block_begin.second, block_end.second, + result) + - result; + } + else + { + lengths[ iam ] = op.count(block_begin.first, block_end.first, + block_begin.second, block_end.second); + } + + // Make sure everyone wrote their lengths. +# pragma omp barrier + + OutputIterator r = result; + + if (iam == 0) + { + // Do the last block. + for (int i = 0; i < num_threads; ++i) + r += lengths[i]; + + block_begin = block_begins[num_threads]; + + // Return the result iterator of the last block. + return_value = op.invoke( + block_begin.first, end1, block_begin.second, end2, r); + + } + else + { + for (int i = 0; i < iam; ++i) + r += lengths[ i ]; + + // Reset begins for copy pass. + op.invoke(block_begin.first, block_end.first, + block_begin.second, block_end.second, r); + } + } return return_value; } - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> OutputIterator parallel_set_union(InputIterator begin1, InputIterator end1, - InputIterator begin2, InputIterator end2, - OutputIterator result, Comparator comp) + InputIterator begin2, InputIterator end2, + OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, - union_func< InputIterator, OutputIterator, Comparator>(comp)); + union_func< InputIterator, OutputIterator, Comparator>(comp)); } - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> OutputIterator parallel_set_intersection(InputIterator begin1, InputIterator end1, - InputIterator begin2, InputIterator end2, - OutputIterator result, Comparator comp) + InputIterator begin2, InputIterator end2, + OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, - intersection_func<InputIterator, OutputIterator, Comparator>(comp)); + intersection_func<InputIterator, OutputIterator, Comparator>(comp)); } - template<typename InputIterator, typename OutputIterator> +template<typename InputIterator, typename OutputIterator> OutputIterator - set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result) + set_intersection(InputIterator begin1, InputIterator end1, + InputIterator begin2, InputIterator end2, + OutputIterator result) { typedef std::iterator_traits<InputIterator> traits_type; typedef typename traits_type::value_type value_type; return set_intersection(begin1, end1, begin2, end2, result, - std::less<value_type>()); + std::less<value_type>()); } - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> OutputIterator parallel_set_difference(InputIterator begin1, InputIterator end1, - InputIterator begin2, InputIterator end2, - OutputIterator result, Comparator comp) + InputIterator begin2, InputIterator end2, + OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, - difference_func<InputIterator, OutputIterator, Comparator>(comp)); + difference_func<InputIterator, OutputIterator, Comparator>(comp)); } - template<typename InputIterator, typename OutputIterator, typename Comparator> +template< + typename InputIterator, + typename OutputIterator, + typename Comparator> OutputIterator - parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) + parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, + InputIterator begin2, InputIterator end2, + OutputIterator result, Comparator comp) { return parallel_set_operation(begin1, end1, begin2, end2, result, - symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp)); + symmetric_difference_func<InputIterator, OutputIterator, Comparator> + (comp)); } } #endif // _GLIBCXX_SET_ALGORITHM_ - - - - - - - - diff --git a/libstdc++-v3/include/parallel/unique_copy.h b/libstdc++-v3/include/parallel/unique_copy.h index 93a030429eb..bf305031ed3 100644 --- a/libstdc++-v3/include/parallel/unique_copy.h +++ b/libstdc++-v3/include/parallel/unique_copy.h @@ -44,16 +44,19 @@ namespace __gnu_parallel { - /** @brief Parallel std::unique_copy(), without explicit equality predicate. - * @param first Begin iterator of input sequence. - * @param last End iterator of input sequence. - * @param result Begin iterator of result sequence. - * @param binary_pred Equality predicate. - * @return End iterator of result sequence. */ - template<typename InputIterator, class OutputIterator, class BinaryPredicate> +/** @brief Parallel std::unique_copy(), w/o explicit equality predicate. + * @param first Begin iterator of input sequence. + * @param last End iterator of input sequence. + * @param result Begin iterator of result sequence. + * @param binary_pred Equality predicate. + * @return End iterator of result sequence. */ +template< + typename InputIterator, + class OutputIterator, + class BinaryPredicate> inline OutputIterator parallel_unique_copy(InputIterator first, InputIterator last, - OutputIterator result, BinaryPredicate binary_pred) + OutputIterator result, BinaryPredicate binary_pred) { _GLIBCXX_CALL(last - first) @@ -62,126 +65,136 @@ namespace __gnu_parallel typedef typename traits_type::difference_type difference_type; difference_type size = last - first; - int num_threads = __gnu_parallel::get_max_threads(); - difference_type counter[num_threads + 1]; if (size == 0) return result; // Let the first thread process two parts. - difference_type borders[num_threads + 2]; - __gnu_parallel::equally_split(size, num_threads + 1, borders); + difference_type *counter; + difference_type *borders; + thread_index_t num_threads = get_max_threads(); // First part contains at least one element. -#pragma omp parallel num_threads(num_threads) - { - int iam = omp_get_thread_num(); - - difference_type begin, end; - - // Check for length without duplicates - // Needed for position in output - difference_type i = 0; - OutputIterator out = result; - if (iam == 0) - { - begin = borders[0] + 1; // == 1 - end = borders[iam + 1]; - - i++; - new (static_cast<void *>(&*out)) value_type(*first); - out++; - - for (InputIterator iter = first + begin; iter < first + end; ++iter) - { - if (!binary_pred(*iter, *(iter-1))) - { - i++; - new (static_cast<void *>(&*out)) value_type(*iter); - out++; - } - } - } +# pragma omp parallel num_threads(num_threads) + { +# pragma omp single + { + num_threads = omp_get_num_threads(); + borders = new difference_type[num_threads + 2]; + equally_split(size, num_threads + 1, borders); + counter = new difference_type[num_threads + 1]; + } + + thread_index_t iam = omp_get_thread_num(); + + difference_type begin, end; + + // Check for length without duplicates + // Needed for position in output + difference_type i = 0; + OutputIterator out = result; + + if (iam == 0) + { + begin = borders[0] + 1; // == 1 + end = borders[iam + 1]; + + i++; + new (static_cast<void *>(&*out)) value_type(*first); + out++; + + for (InputIterator iter = first + begin; iter < first + end; ++iter) + { + if (!binary_pred(*iter, *(iter-1))) + { + i++; + new (static_cast<void *>(&*out)) value_type(*iter); + out++; + } + } + } else - { - begin = borders[iam]; //one part - end = borders[iam + 1]; - - for (InputIterator iter = first + begin; iter < first + end; ++iter) - { - if (!binary_pred(*iter, *(iter-1))) - { - i++; - } - } - } + { + begin = borders[iam]; //one part + end = borders[iam + 1]; + + for (InputIterator iter = first + begin; iter < first + end; ++iter) + { + if (!binary_pred(*iter, *(iter-1))) + { + i++; + } + } + } counter[iam] = i; // Last part still untouched. difference_type begin_output; -#pragma omp barrier +# pragma omp barrier // Store result in output on calculated positions. begin_output = 0; if (iam == 0) - { - for (int t = 0; t < num_threads; t++) - begin_output += counter[t]; + { + for (int t = 0; t < num_threads; t++) + begin_output += counter[t]; - i = 0; + i = 0; - OutputIterator iter_out = result + begin_output; + OutputIterator iter_out = result + begin_output; - begin = borders[num_threads]; - end = size; + begin = borders[num_threads]; + end = size; - for (InputIterator iter = first + begin; iter < first + end; ++iter) - { - if (iter == first || !binary_pred(*iter, *(iter-1))) - { - i++; - new (static_cast<void *>(&*iter_out)) value_type(*iter); - iter_out++; - } - } + for (InputIterator iter = first + begin; iter < first + end; ++iter) + { + if (iter == first || !binary_pred(*iter, *(iter-1))) + { + i++; + new (static_cast<void *>(&*iter_out)) value_type(*iter); + iter_out++; + } + } - counter[num_threads] = i; - } + counter[num_threads] = i; + } else - { - for (int t = 0; t < iam; t++) - begin_output += counter[t]; - - OutputIterator iter_out = result + begin_output; - for (InputIterator iter = first + begin; iter < first + end; ++iter) - { - if (!binary_pred(*iter, *(iter-1))) - { - new (static_cast<void *> (&*iter_out)) value_type(*iter); - iter_out++; - } - } - } + { + for (int t = 0; t < iam; t++) + begin_output += counter[t]; + + OutputIterator iter_out = result + begin_output; + for (InputIterator iter = first + begin; iter < first + end; ++iter) + { + if (!binary_pred(*iter, *(iter-1))) + { + new (static_cast<void *> (&*iter_out)) value_type(*iter); + iter_out++; + } + } + } } difference_type end_output = 0; for (int t = 0; t < num_threads + 1; t++) end_output += counter[t]; + delete[] borders; + return result + end_output; } - /** @brief Parallel std::unique_copy(), without explicit equality predicate - * @param first Begin iterator of input sequence. - * @param last End iterator of input sequence. - * @param result Begin iterator of result sequence. - * @return End iterator of result sequence. */ - template<typename InputIterator, class OutputIterator> +/** @brief Parallel std::unique_copy(), without explicit equality predicate + * @param first Begin iterator of input sequence. + * @param last End iterator of input sequence. + * @param result Begin iterator of result sequence. + * @return End iterator of result sequence. */ +template<typename InputIterator, class OutputIterator> inline OutputIterator parallel_unique_copy(InputIterator first, InputIterator last, - OutputIterator result) + OutputIterator result) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; diff --git a/libstdc++-v3/include/parallel/workstealing.h b/libstdc++-v3/include/parallel/workstealing.h index 7704245d307..60c873951e1 100644 --- a/libstdc++-v3/include/parallel/workstealing.h +++ b/libstdc++-v3/include/parallel/workstealing.h @@ -55,8 +55,8 @@ namespace __gnu_parallel #define _GLIBCXX_JOB_VOLATILE volatile - /** @brief One job for a certain thread. */ - template<typename _DifferenceTp> +/** @brief One job for a certain thread. */ +template<typename _DifferenceTp> struct Job { typedef _DifferenceTp difference_type; @@ -78,31 +78,38 @@ namespace __gnu_parallel _GLIBCXX_JOB_VOLATILE difference_type load; }; - /** @brief Work stealing algorithm for random access iterators. - * - * Uses O(1) additional memory. Synchronization at job lists is - * done with atomic operations. - * @param begin Begin iterator of element sequence. - * @param end End iterator of element sequence. - * @param op User-supplied functor (comparator, predicate, adding - * functor, ...). - * @param f Functor to "process" an element with op (depends on - * desired functionality, e. g. for std::for_each(), ...). - * @param r Functor to "add" a single result to the already - * processed elements (depends on functionality). - * @param base Base value for reduction. - * @param output Pointer to position where final result is written to - * @param bound Maximum number of elements processed (e. g. for - * std::count_n()). - * @return User-supplied functor (that may contain a part of the result). - */ - template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> +/** @brief Work stealing algorithm for random access iterators. + * + * Uses O(1) additional memory. Synchronization at job lists is + * done with atomic operations. + * @param begin Begin iterator of element sequence. + * @param end End iterator of element sequence. + * @param op User-supplied functor (comparator, predicate, adding + * functor, ...). + * @param f Functor to "process" an element with op (depends on + * desired functionality, e. g. for std::for_each(), ...). + * @param r Functor to "add" a single result to the already + * processed elements (depends on functionality). + * @param base Base value for reduction. + * @param output Pointer to position where final result is written to + * @param bound Maximum number of elements processed (e. g. for + * std::count_n()). + * @return User-supplied functor (that may contain a part of the result). + */ +template< + typename RandomAccessIterator, + typename Op, + typename Fu, + typename Red, + typename Result> Op - for_each_template_random_access_workstealing(RandomAccessIterator begin, - RandomAccessIterator end, - Op op, Fu& f, Red r, - Result base, Result& output, - typename std::iterator_traits<RandomAccessIterator>::difference_type bound) + for_each_template_random_access_workstealing( + RandomAccessIterator begin, + RandomAccessIterator end, + Op op, Fu& f, Red r, + Result base, Result& output, + typename std::iterator_traits<RandomAccessIterator>::difference_type + bound) { _GLIBCXX_CALL(end - begin) @@ -110,182 +117,187 @@ namespace __gnu_parallel typedef typename traits_type::difference_type difference_type; - difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size); + difference_type chunk_size = + static_cast<difference_type>(Settings::workstealing_chunk_size); // How many jobs? difference_type length = (bound < 0) ? (end - begin) : bound; // To avoid false sharing in a cache line. - const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1; + const int stride = + Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1; // Total number of threads currently working. thread_index_t busy = 0; - thread_index_t num_threads = get_max_threads(); - difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin; + + Job<difference_type> *job; omp_lock_t output_lock; omp_init_lock(&output_lock); - // No more threads than jobs, at least one thread. - difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1; - num_threads = static_cast<thread_index_t>(num_threads_max); - - // Create job description array. - Job<difference_type> *job = new Job<difference_type>[num_threads * stride]; - // Write base value to output. output = base; -#pragma omp parallel shared(busy) num_threads(num_threads) - { - // Initialization phase. - - // Flags for every thread if it is doing productive work. - bool iam_working = false; - - // Thread id. - thread_index_t iam = omp_get_thread_num(); - - // This job. - Job<difference_type>& my_job = job[iam * stride]; - - // Random number (for work stealing). - thread_index_t victim; - - // Local value for reduction. - Result result = Result(); - - // Number of elements to steal in one attempt. - difference_type steal; - - // Every thread has its own random number generator (modulo num_threads). - random_number rand_gen(iam, num_threads); - -#pragma omp atomic - // This thread is currently working. - busy++; - - iam_working = true; - - // How many jobs per thread? last thread gets the rest. - my_job.first = static_cast<difference_type>(iam * (length / num_threads)); - - my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1); - my_job.load = my_job.last - my_job.first + 1; - - // Init result with first value (to have a base value for reduction). - if (my_job.first <= my_job.last) - { - // Cannot use volatile variable directly. - difference_type my_first = my_job.first; - result = f(op, begin + my_first); - my_job.first++; - my_job.load--; - } - - RandomAccessIterator current; - -#pragma omp barrier - - // Actual work phase - // Work on own or stolen start - while (busy > 0) - { - // Work until no productive thread left. -#pragma omp flush(busy) - - // Thread has own work to do - while (my_job.first <= my_job.last) - { - // fetch-and-add call - // Reserve current job block (size chunk_size) in my queue. - difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size); - - // Update load, to make the three values consistent, - // first might have been changed in the meantime - my_job.load = my_job.last - my_job.first + 1; - for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++) - { - // Yes: process it! - current = begin + current_job; - current_job++; - - // Do actual work. - result = r(result, f(op, current)); - } - -#pragma omp flush(busy) - - } - - // After reaching this point, a thread's job list is empty. - if (iam_working) - { -#pragma omp atomic - // This thread no longer has work. - busy--; - - iam_working = false; - } - - difference_type supposed_first, supposed_last, supposed_load; - do - { - // Find random nonempty deque (not own) and do consistency check. - yield(); -#pragma omp flush(busy) - victim = rand_gen(); - supposed_first = job[victim * stride].first; - supposed_last = job[victim * stride].last; - supposed_load = job[victim * stride].load; - } - while (busy > 0 - && ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last))); - - if (busy == 0) - break; - - if (supposed_load > 0) - { - // Has work and work to do. - // Number of elements to steal (at least one). - steal = (supposed_load < 2) ? 1 : supposed_load / 2; - - // Protects against stealing threads - // omp_set_lock(&(job[victim * stride].lock)); - - // Push victim's start forward. - difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal); - difference_type stolen_try = stolen_first + steal - difference_type(1); - - // Protects against working thread - // omp_unset_lock(&(job[victim * stride].lock)); - - my_job.first = stolen_first; - - // Avoid std::min dependencies. - my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last; - - my_job.load = my_job.last - my_job.first + 1; - - //omp_unset_lock(&(my_job.lock)); - -#pragma omp atomic - // Has potential work again. - busy++; - iam_working = true; - -#pragma omp flush(busy) - } -#pragma omp flush(busy) - } // end while busy > 0 - // Add accumulated result to output. - omp_set_lock(&output_lock); - output = r(output, result); - omp_unset_lock(&output_lock); - - //omp_destroy_lock(&(my_job.lock)); - } + // No more threads than jobs, at least one thread. + thread_index_t num_threads = + __gnu_parallel::max<thread_index_t>(1, + __gnu_parallel::min<difference_type>(length, get_max_threads())); + +# pragma omp parallel shared(busy) num_threads(num_threads) + { + +# pragma omp single + { + num_threads = omp_get_num_threads(); + + // Create job description array. + job = new Job<difference_type>[num_threads * stride]; + } + + // Initialization phase. + + // Flags for every thread if it is doing productive work. + bool iam_working = false; + + // Thread id. + thread_index_t iam = omp_get_thread_num(); + + // This job. + Job<difference_type>& my_job = job[iam * stride]; + + // Random number (for work stealing). + thread_index_t victim; + + // Local value for reduction. + Result result = Result(); + + // Number of elements to steal in one attempt. + difference_type steal; + + // Every thread has its own random number generator + // (modulo num_threads). + random_number rand_gen(iam, num_threads); + + // This thread is currently working. +# pragma omp atomic + busy++; + + iam_working = true; + + // How many jobs per thread? last thread gets the rest. + my_job.first = + static_cast<difference_type>(iam * (length / num_threads)); + + my_job.last = (iam == (num_threads - 1)) ? + (length - 1) : ((iam + 1) * (length / num_threads) - 1); + my_job.load = my_job.last - my_job.first + 1; + + // Init result with first value (to have a base value for reduction). + if (my_job.first <= my_job.last) + { + // Cannot use volatile variable directly. + difference_type my_first = my_job.first; + result = f(op, begin + my_first); + my_job.first++; + my_job.load--; + } + + RandomAccessIterator current; + +# pragma omp barrier + + // Actual work phase + // Work on own or stolen start + while (busy > 0) + { + // Work until no productive thread left. +# pragma omp flush(busy) + + // Thread has own work to do + while (my_job.first <= my_job.last) + { + // fetch-and-add call + // Reserve current job block (size chunk_size) in my queue. + difference_type current_job = + fetch_and_add<difference_type>(&(my_job.first), chunk_size); + + // Update load, to make the three values consistent, + // first might have been changed in the meantime + my_job.load = my_job.last - my_job.first + 1; + for (difference_type job_counter = 0; + job_counter < chunk_size && current_job <= my_job.last; + job_counter++) + { + // Yes: process it! + current = begin + current_job; + current_job++; + + // Do actual work. + result = r(result, f(op, current)); + } + +# pragma omp flush(busy) + } + + // After reaching this point, a thread's job list is empty. + if (iam_working) + { + // This thread no longer has work. +# pragma omp atomic + busy--; + + iam_working = false; + } + + difference_type supposed_first, supposed_last, supposed_load; + do + { + // Find random nonempty deque (not own), do consistency check. + yield(); +# pragma omp flush(busy) + victim = rand_gen(); + supposed_first = job[victim * stride].first; + supposed_last = job[victim * stride].last; + supposed_load = job[victim * stride].load; + } + while (busy > 0 + && ((supposed_load <= 0) + || ((supposed_first + supposed_load - 1) != supposed_last))); + + if (busy == 0) + break; + + if (supposed_load > 0) + { + // Has work and work to do. + // Number of elements to steal (at least one). + steal = (supposed_load < 2) ? 1 : supposed_load / 2; + + // Push victim's start forward. + difference_type stolen_first = + fetch_and_add<difference_type>( + &(job[victim * stride].first), steal); + difference_type stolen_try = + stolen_first + steal - difference_type(1); + + my_job.first = stolen_first; + my_job.last = __gnu_parallel::min(stolen_try, supposed_last); + my_job.load = my_job.last - my_job.first + 1; + + // Has potential work again. +# pragma omp atomic + busy++; + iam_working = true; + +# pragma omp flush(busy) + } +# pragma omp flush(busy) + } // end while busy > 0 + // Add accumulated result to output. + omp_set_lock(&output_lock); + output = r(output, result); + omp_unset_lock(&output_lock); + } delete[] job; |