Commit e683ee2a, authored by Johannes Singler, committed by Johannes Singler

re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))

2007-11-22  Johannes Singler  <singler@ira.uka.de>

        PR libstdc++/33893
        * include/parallel/multiway_merge.h: made omp_dynamic-safe
        * include/parallel/workstealing.h: made omp_dynamic-safe
        * include/parallel/base.h: infrastructure, cleanup
        * include/parallel/par_loop.h: made omp_dynamic-safe
        * include/parallel/features.h: activate loser tree variant
        * include/parallel/quicksort.h: made omp_dynamic-safe
        * include/parallel/compiletime_settings.h: settings overridable
        * include/parallel/equally_split.h: made omp_dynamic-safe
        * include/parallel/omp_loop_static.h: made omp_dynamic-safe
        * include/parallel/random_shuffle.h: made omp_dynamic-safe
        * include/parallel/balanced_quicksort.h: made omp_dynamic-safe
        * include/parallel/set_operations.h: made omp_dynamic-safe
        * include/parallel/unique_copy.h: made omp_dynamic-safe
        * include/parallel/multiway_mergesort.h: made omp_dynamic-safe
        * include/parallel/search.h: made omp_dynamic-safe
        * include/parallel/partition.h: made omp_dynamic-safe
        * include/parallel/partial_sum.h: made omp_dynamic-safe
        * include/parallel/find.h: made omp_dynamic-safe
        * include/parallel/omp_loop.h: made omp_dynamic-safe
        * include/parallel/losertree.h: avoid default constructor

From-SVN: r130347
parent 7861a5ce
2007-11-22 Johannes Singler <singler@ira.uka.de>
PR libstdc++/33893
* include/parallel/multiway_merge.h: made omp_dynamic-safe
* include/parallel/workstealing.h: made omp_dynamic-safe
* include/parallel/base.h: infrastructure, cleanup
* include/parallel/par_loop.h: made omp_dynamic-safe
* include/parallel/features.h: activate loser tree variant
* include/parallel/quicksort.h: made omp_dynamic-safe
* include/parallel/compiletime_settings.h: settings overridable
* include/parallel/equally_split.h: made omp_dynamic-safe
* include/parallel/omp_loop_static.h: made omp_dynamic-safe
* include/parallel/random_shuffle.h: made omp_dynamic-safe
* include/parallel/balanced_quicksort.h: made omp_dynamic-safe
* include/parallel/set_operations.h: made omp_dynamic-safe
* include/parallel/unique_copy.h: made omp_dynamic-safe
* include/parallel/multiway_mergesort.h: made omp_dynamic-safe
* include/parallel/search.h: made omp_dynamic-safe
* include/parallel/partition.h: made omp_dynamic-safe
* include/parallel/partial_sum.h: made omp_dynamic-safe
* include/parallel/find.h: made omp_dynamic-safe
* include/parallel/omp_loop.h: made omp_dynamic-safe
* include/parallel/losertree.h: avoid default constructor
2007-11-21 Jonathan Wakely <jwakely.gcc@gmail.com> 2007-11-21 Jonathan Wakely <jwakely.gcc@gmail.com>
* docs/html/17_intro/C++STYLE: Fix typos. * docs/html/17_intro/C++STYLE: Fix typos.
......
...@@ -63,15 +63,15 @@ ...@@ -63,15 +63,15 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Information local to one thread in the parallel quicksort run. */ /** @brief Information local to one thread in the parallel quicksort run. */
template<typename RandomAccessIterator> template<typename RandomAccessIterator>
struct QSBThreadLocal struct QSBThreadLocal
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
/** @brief Continuous part of the sequence, described by an /** @brief Continuous part of the sequence, described by an
iterator pair. */ iterator pair. */
typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece;
/** @brief Initial piece to work on. */ /** @brief Initial piece to work on. */
...@@ -94,29 +94,17 @@ namespace __gnu_parallel ...@@ -94,29 +94,17 @@ namespace __gnu_parallel
QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { } QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { }
}; };
/** @brief Initialize the thread local storage. /** @brief Balanced quicksort divide step.
* @param tls Array of thread-local storages. * @param begin Begin iterator of subsequence.
* @param queue_size Size of the work-stealing queue. */ * @param end End iterator of subsequence.
template<typename RandomAccessIterator> * @param comp Comparator.
inline void * @param num_threads Number of threads that are allowed to work on
qsb_initialize(QSBThreadLocal<RandomAccessIterator>** tls, int queue_size) * this part.
{ * @pre @c (end-begin)>=1 */
int iam = omp_get_thread_num(); template<typename RandomAccessIterator, typename Comparator>
tls[iam] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
}
/** @brief Balanced quicksort divide step.
* @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence.
* @param comp Comparator.
* @param num_threads Number of threads that are allowed to work on
* this part.
* @pre @c (end-begin)>=1 */
template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type inline typename std::iterator_traits<RandomAccessIterator>::difference_type
qsb_divide(RandomAccessIterator begin, RandomAccessIterator end, qsb_divide(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, int num_threads) Comparator comp, thread_index_t num_threads)
{ {
_GLIBCXX_PARALLEL_ASSERT(num_threads > 0); _GLIBCXX_PARALLEL_ASSERT(num_threads > 0);
...@@ -124,18 +112,20 @@ namespace __gnu_parallel ...@@ -124,18 +112,20 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
RandomAccessIterator pivot_pos = median_of_three_iterators(begin, begin + (end - begin) / 2, end - 1, comp); RandomAccessIterator pivot_pos = median_of_three_iterators(
begin, begin + (end - begin) / 2, end - 1, comp);
#if defined(_GLIBCXX_ASSERTIONS) #if defined(_GLIBCXX_ASSERTIONS)
// Must be in between somewhere. // Must be in between somewhere.
difference_type n = end - begin; difference_type n = end - begin;
_GLIBCXX_PARALLEL_ASSERT((!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) _GLIBCXX_PARALLEL_ASSERT(
|| (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos)) (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) || (!comp(*pivot_pos, *begin) && !comp(*end, *pivot_pos))
|| (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos)) || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos))
|| (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos)) || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*end, *pivot_pos))
|| (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos))); || (!comp(*pivot_pos, *end) && !comp(*begin, *pivot_pos))
|| (!comp(*pivot_pos, *end) && !comp(*(begin + n / 2), *pivot_pos)));
#endif #endif
// Swap pivot value to end. // Swap pivot value to end.
...@@ -143,10 +133,12 @@ namespace __gnu_parallel ...@@ -143,10 +133,12 @@ namespace __gnu_parallel
std::swap(*pivot_pos, *(end - 1)); std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1; pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, returning end - begin - 1 in the worst case. // Divide, returning end - begin - 1 in the worst case.
difference_type split_pos = parallel_partition(begin, end - 1, pred, num_threads); difference_type split_pos = parallel_partition(
begin, end - 1, pred, num_threads);
// Swap back pivot to middle. // Swap back pivot to middle.
std::swap(*(begin + split_pos), *pivot_pos); std::swap(*(begin + split_pos), *pivot_pos);
...@@ -163,18 +155,21 @@ namespace __gnu_parallel ...@@ -163,18 +155,21 @@ namespace __gnu_parallel
return split_pos; return split_pos;
} }
/** @brief Quicksort conquer step. /** @brief Quicksort conquer step.
* @param tls Array of thread-local storages. * @param tls Array of thread-local storages.
* @param begin Begin iterator of subsequence. * @param begin Begin iterator of subsequence.
* @param end End iterator of subsequence. * @param end End iterator of subsequence.
* @param comp Comparator. * @param comp Comparator.
* @param iam Number of the thread processing this function. * @param iam Number of the thread processing this function.
* @param num_threads Number of threads that are allowed to work on this part. */ * @param num_threads
template<typename RandomAccessIterator, typename Comparator> * Number of threads that are allowed to work on this part. */
template<typename RandomAccessIterator, typename Comparator>
inline void inline void
qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls, qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls,
RandomAccessIterator begin, RandomAccessIterator end, RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, thread_index_t iam, thread_index_t num_threads) Comparator comp,
thread_index_t iam, thread_index_t num_threads,
bool parent_wait)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -182,14 +177,14 @@ namespace __gnu_parallel ...@@ -182,14 +177,14 @@ namespace __gnu_parallel
difference_type n = end - begin; difference_type n = end - begin;
if (num_threads <= 1 || n < 2) if (num_threads <= 1 || n <= 1)
{ {
tls[iam]->initial.first = begin; tls[iam]->initial.first = begin;
tls[iam]->initial.second = end; tls[iam]->initial.second = end;
qsb_local_sort_with_helping(tls, comp, iam); qsb_local_sort_with_helping(tls, comp, iam, parent_wait);
return; return;
} }
// Divide step. // Divide step.
...@@ -199,33 +194,55 @@ namespace __gnu_parallel ...@@ -199,33 +194,55 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin)); _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin));
#endif #endif
thread_index_t num_threads_leftside = std::max<thread_index_t>(1, std::min<thread_index_t>(num_threads - 1, split_pos * num_threads / n)); thread_index_t num_threads_leftside =
std::max<thread_index_t>(1, std::min<thread_index_t>(
num_threads - 1, split_pos * num_threads / n));
#pragma omp atomic # pragma omp atomic
*tls[iam]->elements_leftover -= (difference_type)1; *tls[iam]->elements_leftover -= (difference_type)1;
// Conquer step. // Conquer step.
#pragma omp parallel sections num_threads(2) # pragma omp parallel num_threads(2)
{ {
#pragma omp section bool wait;
qsb_conquer(tls, begin, begin + split_pos, comp, iam, num_threads_leftside); if(omp_get_num_threads() < 2)
// The pivot_pos is left in place, to ensure termination. wait = false;
#pragma omp section else
qsb_conquer(tls, begin + split_pos + 1, end, comp, wait = parent_wait;
iam + num_threads_leftside, num_threads - num_threads_leftside);
# pragma omp sections
{
# pragma omp section
{
qsb_conquer(tls, begin, begin + split_pos, comp,
iam,
num_threads_leftside,
wait);
wait = parent_wait;
}
// The pivot_pos is left in place, to ensure termination.
# pragma omp section
{
qsb_conquer(tls, begin + split_pos + 1, end, comp,
iam + num_threads_leftside,
num_threads - num_threads_leftside,
wait);
wait = parent_wait;
}
}
} }
} }
/** /**
* @brief Quicksort step doing load-balanced local sort. * @brief Quicksort step doing load-balanced local sort.
* @param tls Array of thread-local storages. * @param tls Array of thread-local storages.
* @param comp Comparator. * @param comp Comparator.
* @param iam Number of the thread processing this function. * @param iam Number of the thread processing this function.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls, qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls,
Comparator& comp, int iam) Comparator& comp, int iam, bool wait)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -251,151 +268,162 @@ namespace __gnu_parallel ...@@ -251,151 +268,162 @@ namespace __gnu_parallel
for (;;) for (;;)
{ {
// Invariant: current must be a valid (maybe empty) range. // Invariant: current must be a valid (maybe empty) range.
RandomAccessIterator begin = current.first, end = current.second; RandomAccessIterator begin = current.first, end = current.second;
difference_type n = end - begin; difference_type n = end - begin;
if (n > base_case_n) if (n > base_case_n)
{ {
// Divide. // Divide.
RandomAccessIterator pivot_pos = begin + rng(n); RandomAccessIterator pivot_pos = begin + rng(n);
// Swap pivot_pos value to end. // Swap pivot_pos value to end.
if (pivot_pos != (end - 1)) if (pivot_pos != (end - 1))
std::swap(*pivot_pos, *(end - 1)); std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1; pivot_pos = end - 1;
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); __gnu_parallel::binder2nd
<Comparator, value_type, value_type, bool>
pred(comp, *pivot_pos);
// Divide, leave pivot unchanged in last place. // Divide, leave pivot unchanged in last place.
RandomAccessIterator split_pos1, split_pos2; RandomAccessIterator split_pos1, split_pos2;
split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); split_pos1 = __gnu_sequential::partition(begin, end - 1, pred);
// Left side: < pivot_pos; right side: >= pivot_pos. // Left side: < pivot_pos; right side: >= pivot_pos.
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end);
#endif #endif
// Swap pivot back to middle. // Swap pivot back to middle.
if (split_pos1 != pivot_pos) if (split_pos1 != pivot_pos)
std::swap(*split_pos1, *pivot_pos); std::swap(*split_pos1, *pivot_pos);
pivot_pos = split_pos1; pivot_pos = split_pos1;
// In case all elements are equal, split_pos1 == 0. // In case all elements are equal, split_pos1 == 0.
if ((split_pos1 + 1 - begin) < (n >> 7) if ((split_pos1 + 1 - begin) < (n >> 7)
|| (end - split_pos1) < (n >> 7)) || (end - split_pos1) < (n >> 7))
{ {
// Very unequal split, one part smaller than one 128th // Very unequal split, one part smaller than one 128th
// elements not strictly larger than the pivot. // elements not strictly larger than the pivot.
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos)); __gnu_parallel::unary_negate<__gnu_parallel::binder1st
<Comparator, value_type, value_type, bool>, value_type>
// Find other end of pivot-equal range. pred(__gnu_parallel::binder1st
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); <Comparator, value_type, value_type, bool>(
} comp, *pivot_pos));
else
{ // Find other end of pivot-equal range.
// Only skip the pivot. split_pos2 = __gnu_sequential::partition(
split_pos2 = split_pos1 + 1; split_pos1 + 1, end, pred);
} }
else
// Elements equal to pivot are done. // Only skip the pivot.
elements_done += (split_pos2 - split_pos1); split_pos2 = split_pos1 + 1;
// Elements equal to pivot are done.
elements_done += (split_pos2 - split_pos1);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
total_elements_done += (split_pos2 - split_pos1); total_elements_done += (split_pos2 - split_pos1);
#endif #endif
// Always push larger part onto stack. // Always push larger part onto stack.
if (((split_pos1 + 1) - begin) < (end - (split_pos2))) if (((split_pos1 + 1) - begin) < (end - (split_pos2)))
{ {
// Right side larger. // Right side larger.
if ((split_pos2) != end) if ((split_pos2) != end)
tl.leftover_parts.push_front(std::make_pair(split_pos2, end)); tl.leftover_parts.push_front(std::make_pair(split_pos2, end));
//current.first = begin; //already set anyway //current.first = begin; //already set anyway
current.second = split_pos1; current.second = split_pos1;
continue; continue;
} }
else else
{ {
// Left side larger. // Left side larger.
if (begin != split_pos1) if (begin != split_pos1)
tl.leftover_parts.push_front(std::make_pair(begin, split_pos1)); tl.leftover_parts.push_front(
std::make_pair(begin, split_pos1));
current.first = split_pos2;
//current.second = end; //already set anyway current.first = split_pos2;
continue; //current.second = end; //already set anyway
} continue;
} }
else }
{ else
__gnu_sequential::sort(begin, end, comp); {
elements_done += n; __gnu_sequential::sort(begin, end, comp);
elements_done += n;
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
total_elements_done += n; total_elements_done += n;
#endif #endif
// Prefer own stack, small pieces. // Prefer own stack, small pieces.
if (tl.leftover_parts.pop_front(current)) if (tl.leftover_parts.pop_front(current))
continue; continue;
#pragma omp atomic # pragma omp atomic
*tl.elements_leftover -= elements_done; *tl.elements_leftover -= elements_done;
elements_done = 0;
elements_done = 0;
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
double search_start = omp_get_wtime(); double search_start = omp_get_wtime();
#endif #endif
// Look for new work. // Look for new work.
bool success = false; bool successfully_stolen = false;
while (*tl.elements_leftover > 0 && !success while (wait && *tl.elements_leftover > 0 && !successfully_stolen
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
// Possible dead-lock. // Possible dead-lock.
&& (omp_get_wtime() < (search_start + 1.0)) && (omp_get_wtime() < (search_start + 1.0))
#endif #endif
) )
{ {
thread_index_t victim; thread_index_t victim;
victim = rng(num_threads); victim = rng(num_threads);
// Large pieces. // Large pieces.
success = (victim != iam) && tls[victim]->leftover_parts.pop_back(current); successfully_stolen = (victim != iam)
if (!success) && tls[victim]->leftover_parts.pop_back(current);
yield(); if (!successfully_stolen)
yield();
#if !defined(__ICC) && !defined(__ECC) #if !defined(__ICC) && !defined(__ECC)
#pragma omp flush # pragma omp flush
#endif #endif
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
if (omp_get_wtime() >= (search_start + 1.0)) if (omp_get_wtime() >= (search_start + 1.0))
{ {
sleep(1); sleep(1);
_GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() < (search_start + 1.0)); _GLIBCXX_PARALLEL_ASSERT(
} omp_get_wtime() < (search_start + 1.0));
}
#endif #endif
if (!success) if (!successfully_stolen)
{ {
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0);
#endif #endif
return; return;
} }
} }
} }
} }
/** @brief Top-level quicksort routine. /** @brief Top-level quicksort routine.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param comp Comparator. * @param comp Comparator.
* @param n Length of the sequence to sort. * @param n Length of the sequence to sort.
* @param num_threads Number of threads that are allowed to work on * @param num_threads Number of threads that are allowed to work on
* this part. * this part.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end, parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads) typename std::iterator_traits<RandomAccessIterator>
::difference_type n,
thread_index_t num_threads)
{ {
_GLIBCXX_CALL(end - begin) _GLIBCXX_CALL(end - begin)
...@@ -413,11 +441,11 @@ namespace __gnu_parallel ...@@ -413,11 +441,11 @@ namespace __gnu_parallel
if (num_threads > n) if (num_threads > n)
num_threads = static_cast<thread_index_t>(n); num_threads = static_cast<thread_index_t>(n);
// Initialize thread local storage
tls_type** tls = new tls_type*[num_threads]; tls_type** tls = new tls_type*[num_threads];
difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1);
#pragma omp parallel num_threads(num_threads) for (thread_index_t t = 0; t < num_threads; ++t)
// Initialize variables per processor. tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size);
qsb_initialize(tls, num_threads * (thread_index_t)(log2(n) + 1));
// There can never be more than ceil(log2(n)) ranges on the stack, because // There can never be more than ceil(log2(n)) ranges on the stack, because
// 1. Only one processor pushes onto the stack // 1. Only one processor pushes onto the stack
...@@ -426,22 +454,16 @@ namespace __gnu_parallel ...@@ -426,22 +454,16 @@ namespace __gnu_parallel
volatile difference_type elements_leftover = n; volatile difference_type elements_leftover = n;
for (int i = 0; i < num_threads; i++) for (int i = 0; i < num_threads; i++)
{ {
tls[i]->elements_leftover = &elements_leftover; tls[i]->elements_leftover = &elements_leftover;
tls[i]->num_threads = num_threads; tls[i]->num_threads = num_threads;
tls[i]->global = std::make_pair(begin, end); tls[i]->global = std::make_pair(begin, end);
// Just in case nothing is left to assign. // Just in case nothing is left to assign.
tls[i]->initial = std::make_pair(end, end); tls[i]->initial = std::make_pair(end, end);
} }
// Initial splitting, recursively.
int old_nested = omp_get_nested();
omp_set_nested(true);
// Main recursion call. // Main recursion call.
qsb_conquer(tls, begin, begin + n, comp, 0, num_threads); qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true);
omp_set_nested(old_nested);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
// All stack must be empty. // All stack must be empty.
......
...@@ -49,54 +49,70 @@ namespace __gnu_parallel ...@@ -49,54 +49,70 @@ namespace __gnu_parallel
// XXX remove std::duplicates from here if possible, // XXX remove std::duplicates from here if possible,
// XXX but keep minimal dependencies. // XXX but keep minimal dependencies.
/** @brief Calculates the rounded-down logarithm of @c n for base 2. /** @brief Calculates the rounded-down logarithm of @c n for base 2.
* @param n Argument. * @param n Argument.
* @return Returns 0 for argument 0. * @return Returns 0 for argument 0.
*/ */
template<typename Size> template<typename Size>
inline Size inline Size
log2(Size n) log2(Size n)
{ {
Size k; Size k;
for (k = 0; n != 1; n >>= 1) for (k = 0; n != 1; n >>= 1)
++k; ++k;
return k; return k;
} }
/** @brief Encode two integers into one __gnu_parallel::lcas_t. /** @brief Encode two integers into one __gnu_parallel::lcas_t.
* @param a First integer, to be encoded in the most-significant @c * @param a First integer, to be encoded in the most-significant @c
* lcas_t_bits/2 bits. * lcas_t_bits/2 bits.
* @param b Second integer, to be encoded in the least-significant * @param b Second integer, to be encoded in the least-significant
* @c lcas_t_bits/2 bits. * @c lcas_t_bits/2 bits.
* @return __gnu_parallel::lcas_t value encoding @c a and @c b. * @return __gnu_parallel::lcas_t value encoding @c a and @c b.
* @see decode2 * @see decode2
*/ */
inline lcas_t inline lcas_t
encode2(int a, int b) //must all be non-negative, actually encode2(int a, int b) //must all be non-negative, actually
{
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
}
/** @brief Decode two integers from one __gnu_parallel::lcas_t.
* @param x __gnu_parallel::lcas_t to decode integers from.
* @param a First integer, to be decoded from the most-significant
* @c lcas_t_bits/2 bits of @c x.
* @param b Second integer, to be decoded from the least-significant
* @c lcas_t_bits/2 bits of @c x.
* @see encode2
*/
inline void
decode2(lcas_t x, int& a, int& b)
{
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
b = (int)((x >> 0 ) & lcas_t_mask);
}
/** @brief Equivalent to std::min. */
template<typename T>
const T&
min(const T& a, const T& b)
{ {
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0); return (a < b) ? a : b;
} };
/** @brief Decode two integers from one __gnu_parallel::lcas_t. /** @brief Equivalent to std::max. */
* @param x __gnu_parallel::lcas_t to decode integers from. template<typename T>
* @param a First integer, to be decoded from the most-significant const T&
* @c lcas_t_bits/2 bits of @c x. max(const T& a, const T& b)
* @param b Second integer, to be decoded from the least-significant max(const T& a, const T& b)
* @c lcas_t_bits/2 bits of @c x.
* @see encode2
*/
inline void
decode2(lcas_t x, int& a, int& b)
{ {
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask); return (a > b) ? a : b;
b = (int)((x >> 0 ) & lcas_t_mask); };
}
/** @brief Constructs predicate for equality from strict weak /** @brief Constructs predicate for equality from strict weak
* ordering predicate * ordering predicate
*/ */
// XXX comparator at the end, as per others // XXX comparator at the end, as per others
template<typename Comparator, typename T1, typename T2> template<typename Comparator, typename T1, typename T2>
class equal_from_less : public std::binary_function<T1, T2, bool> class equal_from_less : public std::binary_function<T1, T2, bool>
{ {
private: private:
...@@ -112,162 +128,176 @@ namespace __gnu_parallel ...@@ -112,162 +128,176 @@ namespace __gnu_parallel
}; };
/** @brief Similar to std::unary_negate, but giving the argument types explicitly. */ /** @brief Similar to std::unary_negate,
template<typename _Predicate, typename argument_type> * but giving the argument types explicitly. */
class unary_negate template<typename _Predicate, typename argument_type>
: public std::unary_function<argument_type, bool> class unary_negate
{ : public std::unary_function<argument_type, bool>
protected: {
_Predicate _M_pred; protected:
_Predicate _M_pred;
public:
explicit public:
unary_negate(const _Predicate& __x) : _M_pred(__x) { } explicit
unary_negate(const _Predicate& __x) : _M_pred(__x) { }
bool
operator()(const argument_type& __x) bool
{ return !_M_pred(__x); } operator()(const argument_type& __x)
}; { return !_M_pred(__x); }
};
/** @brief Similar to std::binder1st, but giving the argument types explicitly. */
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type> /** @brief Similar to std::binder1st,
class binder1st * but giving the argument types explicitly. */
: public std::unary_function<second_argument_type, result_type> template<
{ typename _Operation,
protected: typename first_argument_type,
_Operation op; typename second_argument_type,
first_argument_type value; typename result_type>
class binder1st
public: : public std::unary_function<second_argument_type, result_type>
binder1st(const _Operation& __x, {
const first_argument_type& __y) protected:
: op(__x), value(__y) { } _Operation op;
first_argument_type value;
result_type
operator()(const second_argument_type& __x) public:
{ return op(value, __x); } binder1st(const _Operation& __x,
const first_argument_type& __y)
// _GLIBCXX_RESOLVE_LIB_DEFECTS : op(__x), value(__y) { }
// 109. Missing binders for non-const sequence elements
result_type result_type
operator()(second_argument_type& __x) const operator()(const second_argument_type& __x)
{ return op(value, __x); } { return op(value, __x); }
};
// _GLIBCXX_RESOLVE_LIB_DEFECTS
/** // 109. Missing binders for non-const sequence elements
* @brief Similar to std::binder2nd, but giving the argument types result_type
* explicitly. operator()(second_argument_type& __x) const
*/ { return op(value, __x); }
template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type> };
class binder2nd
: public std::unary_function<first_argument_type, result_type> /**
{ * @brief Similar to std::binder2nd, but giving the argument types
protected: * explicitly.
_Operation op; */
second_argument_type value; template<
typename _Operation,
public: typename first_argument_type,
binder2nd(const _Operation& __x, typename second_argument_type,
const second_argument_type& __y) typename result_type>
: op(__x), value(__y) { } class binder2nd
: public std::unary_function<first_argument_type, result_type>
result_type {
operator()(const first_argument_type& __x) const protected:
{ return op(__x, value); } _Operation op;
second_argument_type value;
// _GLIBCXX_RESOLVE_LIB_DEFECTS
// 109. Missing binders for non-const sequence elements public:
result_type binder2nd(const _Operation& __x,
operator()(first_argument_type& __x) const second_argument_type& __y)
{ return op(__x, value); } : op(__x), value(__y) { }
};
result_type
/** @brief Similar to std::equal_to, but allows two different types. */ operator()(const first_argument_type& __x) const
template<typename T1, typename T2> { return op(__x, value); }
// _GLIBCXX_RESOLVE_LIB_DEFECTS
// 109. Missing binders for non-const sequence elements
result_type
operator()(first_argument_type& __x)
{ return op(__x, value); }
};
/** @brief Similar to std::equal_to, but allows two different types. */
template<typename T1, typename T2>
struct equal_to : std::binary_function<T1, T2, bool> struct equal_to : std::binary_function<T1, T2, bool>
{ {
bool operator()(const T1& t1, const T2& t2) const bool operator()(const T1& t1, const T2& t2) const
{ return t1 == t2; } { return t1 == t2; }
}; };
/** @brief Similar to std::less, but allows two different types. */ /** @brief Similar to std::less, but allows two different types. */
template<typename T1, typename T2> template<typename T1, typename T2>
struct less : std::binary_function<T1, T2, bool> struct less : std::binary_function<T1, T2, bool>
{ {
bool bool
operator()(const T1& t1, const T2& t2) const operator()(const T1& t1, const T2& t2) const
{ return t1 < t2; } { return t1 < t2; }
bool bool
operator()(const T2& t2, const T1& t1) const operator()(const T2& t2, const T1& t1) const
{ return t2 < t1; } { return t2 < t1; }
}; };
// Partial specialization for one type. Same as std::less. // Partial specialization for one type. Same as std::less.
template<typename _Tp> template<typename _Tp>
struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool> struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
{ {
bool bool
operator()(const _Tp& __x, const _Tp& __y) const operator()(const _Tp& __x, const _Tp& __y) const
{ return __x < __y; } { return __x < __y; }
}; };
/** @brief Similar to std::plus, but allows two different types.
 *
 *  The call operator returns the type of the expression
 *  <code>_Tp1 + _Tp2</code>, available as the nested typedef @c result.
 *  Note that the @c result_type inherited from std::binary_function is
 *  @c _Tp1, which may differ from @c result. */
template<typename _Tp1, typename _Tp2>
  struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
  {
    // Use __typeof__ rather than typeof: the latter is only a keyword in
    // the GNU dialects, so it would break users compiling in a strict
    // ISO mode.  The operands are never evaluated.
    typedef __typeof__(*static_cast<_Tp1*>(NULL)
                       + *static_cast<_Tp2*>(NULL)) result;

    /** @brief Add the two operands.
     *  @param __x Left operand.
     *  @param __y Right operand.
     *  @return <code>__x + __y</code>. */
    result
    operator()(const _Tp1& __x, const _Tp2& __y) const
    { return __x + __y; }
  };

// Partial specialization for one type. Same as std::plus.
template<typename _Tp>
  struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
  {
    // See the primary template for why __typeof__ is used.
    typedef __typeof__(*static_cast<_Tp*>(NULL)
                       + *static_cast<_Tp*>(NULL)) result;

    result
    operator()(const _Tp& __x, const _Tp& __y) const
    { return __x + __y; }
  };
/** @brief Similar to std::multiplies, but allows two different types.
 *
 *  The call operator returns the type of the expression
 *  <code>_Tp1 * _Tp2</code>, available as the nested typedef @c result.
 *  Note that the @c result_type inherited from std::binary_function is
 *  @c _Tp1, which may differ from @c result. */
template<typename _Tp1, typename _Tp2>
  struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
  {
    // Use __typeof__ rather than typeof: the latter is only a keyword in
    // the GNU dialects, so it would break users compiling in a strict
    // ISO mode.  The operands are never evaluated.
    typedef __typeof__(*static_cast<_Tp1*>(NULL)
                       * *static_cast<_Tp2*>(NULL)) result;

    /** @brief Multiply the two operands.
     *  @param __x Left operand.
     *  @param __y Right operand.
     *  @return <code>__x * __y</code>. */
    result
    operator()(const _Tp1& __x, const _Tp2& __y) const
    { return __x * __y; }
  };

// Partial specialization for one type. Same as std::multiplies.
template<typename _Tp>
  struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
  {
    // See the primary template for why __typeof__ is used.
    typedef __typeof__(*static_cast<_Tp*>(NULL)
                       * *static_cast<_Tp*>(NULL)) result;

    result
    operator()(const _Tp& __x, const _Tp& __y) const
    { return __x * __y; }
  };
template<typename T, typename _DifferenceTp> template<typename T, typename _DifferenceTp>
class pseudo_sequence; class pseudo_sequence;
/** @brief Iterator associated with __gnu_parallel::pseudo_sequence. /** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
* It features the usual random-access iterator functionality. * It features the usual random-access iterator functionality.
* @param T Sequence value type. * @param T Sequence value type.
* @param difference_type Sequence difference type. * @param difference_type Sequence difference type.
*/ */
template<typename T, typename _DifferenceTp> template<typename T, typename _DifferenceTp>
class pseudo_sequence_iterator class pseudo_sequence_iterator
{ {
public: public:
...@@ -296,34 +326,34 @@ namespace __gnu_parallel ...@@ -296,34 +326,34 @@ namespace __gnu_parallel
operator++(int) operator++(int)
{ return type(pos++); } { return type(pos++); }
const T& const T&
operator*() const operator*() const
{ return val; } { return val; }
const T& const T&
operator[](difference_type) const operator[](difference_type) const
{ return val; } { return val; }
bool bool
operator==(const type& i2) operator==(const type& i2)
{ return pos == i2.pos; } { return pos == i2.pos; }
difference_type difference_type
operator!=(const type& i2) operator!=(const type& i2)
{ return pos != i2.pos; } { return pos != i2.pos; }
difference_type difference_type
operator-(const type& i2) operator-(const type& i2)
{ return pos - i2.pos; } { return pos - i2.pos; }
}; };
/** @brief Sequence that conceptually consists of multiple copies of /** @brief Sequence that conceptually consists of multiple copies of
the same element. the same element.
* The copies are not stored explicitly, of course. * The copies are not stored explicitly, of course.
* @param T Sequence value type. * @param T Sequence value type.
* @param difference_type Sequence difference type. * @param difference_type Sequence difference type.
*/ */
template<typename T, typename _DifferenceTp> template<typename T, typename _DifferenceTp>
class pseudo_sequence class pseudo_sequence
{ {
typedef pseudo_sequence<T, _DifferenceTp> type; typedef pseudo_sequence<T, _DifferenceTp> type;
...@@ -335,10 +365,10 @@ namespace __gnu_parallel ...@@ -335,10 +365,10 @@ namespace __gnu_parallel
typedef pseudo_sequence_iterator<T, uint64> iterator; typedef pseudo_sequence_iterator<T, uint64> iterator;
/** @brief Constructor. /** @brief Constructor.
* @param val Element of the sequence. * @param val Element of the sequence.
* @param count Number of (virtual) copies. * @param count Number of (virtual) copies.
*/ */
pseudo_sequence(const T& val, difference_type count) pseudo_sequence(const T& val, difference_type count)
: val(val), count(count) { } : val(val), count(count) { }
/** @brief Begin iterator. */ /** @brief Begin iterator. */
...@@ -356,67 +386,66 @@ namespace __gnu_parallel ...@@ -356,67 +386,66 @@ namespace __gnu_parallel
difference_type count; difference_type count;
}; };
/** @brief Functor that does nothing.
 *
 *  Accepts one argument of type @c _ValueTp and ignores it. */
template<typename _ValueTp>
  class void_functor
  {
  public:
    // The call operator must be public: with the default (private) access
    // of a class, the functor could not be invoked at all.
    inline void
    operator()(const _ValueTp&) const { }
  };
/** @brief Compute the median of three referenced elements,
 *  according to @c comp.
 *
 *  At most three comparisons are performed.
 *  @param a First iterator.
 *  @param b Second iterator.
 *  @param c Third iterator.
 *  @param comp Comparator.
 *  @return The one of @c a, @c b, @c c that refers to the median element.
 */
template<typename RandomAccessIterator, typename Comparator>
  RandomAccessIterator
  median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
                            RandomAccessIterator c, Comparator& comp)
  {
    if (comp(*a, *b))
      {
        // *a < *b: the median is *b unless *c is not above it.
        if (comp(*b, *c))
          return b;
        return comp(*a, *c) ? c : a;
      }

    // Not *a < *b: same reasoning with the roles of a and b exchanged.
    if (comp(*a, *c))
      return a;
    return comp(*b, *c) ? c : b;
  }
// Avoid the use of assert, because we're trying to keep the <cassert>
// include out of the mix. (Same as debug mode).
/** @brief Report a failed assertion on standard output and abort.
 *  @param __file Name of the source file containing the assertion.
 *  @param __line Line number of the assertion.
 *  @param __function Name of the enclosing function.
 *  @param __condition Text of the condition that evaluated to false.
 *  This function does not return. */
inline void
__replacement_assert(const char* __file, int __line,
                     const char* __function, const char* __condition)
{
  std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
              __function, __condition);
  // Aborting is not required to flush buffered streams, so push the
  // message out explicitly; otherwise it can be lost when standard
  // output is redirected to a file or a pipe.
  std::fflush(stdout);
  __builtin_abort();
}
/** @def _GLIBCXX_PARALLEL_ASSERT
 *  @brief Assertion macro of the parallel mode.
 *
 *  Evaluates @c _Condition once; if it is false, calls
 *  __gnu_parallel::__replacement_assert() with the source location, the
 *  enclosing function and the text of the condition.  The do/while(false)
 *  wrapper makes the expansion a single statement that needs a trailing
 *  semicolon at the place of use. */
#define _GLIBCXX_PARALLEL_ASSERT(_Condition)                              \
  do                                                                      \
    {                                                                     \
      if (!(_Condition))                                                  \
        __gnu_parallel::__replacement_assert(__FILE__,                    \
                                             __LINE__,                    \
                                             __PRETTY_FUNCTION__,         \
                                             #_Condition);                \
    }                                                                     \
  while (false)
} //namespace __gnu_parallel } //namespace __gnu_parallel
#endif #endif
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
#include <cstdio> #include <cstdio>
#ifndef _GLIBCXX_VERBOSE_LEVEL
/** @brief Determine verbosity level of the parallel mode.
 *  Level 1 prints a message each time a parallel-mode function is entered.
 *
 *  May be predefined by the user before including this header; the
 *  default is 0. */
#define _GLIBCXX_VERBOSE_LEVEL 0
#endif
/** @def _GLIBCXX_CALL /** @def _GLIBCXX_CALL
...@@ -50,27 +50,40 @@ ...@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n) #define _GLIBCXX_CALL(n)
#endif #endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
/* Verbose variant of _GLIBCXX_CALL: prints the enclosing function, the
 * calling thread's number, the problem size n and the maximum number of
 * threads.  n is converted to long explicitly so that it matches the %ld
 * conversion whatever the difference type of the caller is.  The trailing
 * semicolon belongs to the macro; it is used without one at call sites. */
#define _GLIBCXX_CALL(n) \
  printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
         __PRETTY_FUNCTION__, omp_get_thread_num(), \
         static_cast<long>(n), get_max_threads());
#endif
// Each of the following settings can be overridden by defining the macro
// before this header is included.

#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
 *  random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
#endif

#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
 *  Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
#endif

#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Consider the size of the L1 cache for
 *  __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
#endif

#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Consider the size of the TLB for
 *  __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
#endif

#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
 *  (0); or copy it back after everything is done (1).
 *
 *  Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
#endif
...@@ -39,30 +39,58 @@ ...@@ -39,30 +39,58 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Function to split a sequence into parts of almost equal size. /** @brief Function to split a sequence into parts of almost equal size.
* *
* The resulting sequence s of length p+1 contains the splitting * The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost * positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one * equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts. * n. There may result empty parts.
* @param n Number of elements * @param n Number of elements
* @param p Number of parts * @param num_threads Number of parts
* @param s Splitters * @param s Splitters
* @returns End of splitter sequence, i. e. @c s+p+1 */ * @returns End of splitter sequence, i. e. @c s+num_threads+1 */
template<typename _DifferenceTp, typename OutputIterator> template<typename difference_type, typename OutputIterator>
OutputIterator OutputIterator
equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s) equally_split(difference_type n,
thread_index_t num_threads,
OutputIterator s)
{ {
typedef _DifferenceTp difference_type; difference_type chunk_length = n / num_threads,
difference_type chunk_length = n / p, split = n % p, start = 0; num_longer_chunks = n % num_threads,
for (int i = 0; i < p; i++) pos = 0;
for (thread_index_t i = 0; i < num_threads; ++i)
{ {
*s++ = start; *s++ = pos;
start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length; pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
} }
*s++ = n; *s++ = n;
return s; return s;
} }
/** @brief Function to split a sequence into parts of almost equal size.
 *
 *  Computes a single splitting position of the partition of [0,n) into
 *  num_threads parts whose sizes differ by at most 1, longer parts first:
 *  the result is the total length of the parts 0 .. thread_no-1, i. e.
 *  the position where the part of thread number thread_no begins.
 *  @param n Number of elements
 *  @param num_threads Number of parts
 *  @param thread_no Number of the part whose begin position is requested
 *  @returns Splitting point */
template<typename difference_type>
  difference_type
  equally_split_point(difference_type n,
                      thread_index_t num_threads,
                      thread_index_t thread_no)
  {
    // Minimum size of a part, and number of parts holding one extra element.
    difference_type base_length = n / num_threads;
    difference_type longer_parts = n % num_threads;

    // Only longer parts precede the requested one.
    if (thread_no < longer_parts)
      return thread_no * (base_length + 1);

    // All longer parts, followed by some parts of the base length.
    return longer_parts * (base_length + 1)
           + (thread_no - longer_parts) * base_length;
  }
} }
#endif #endif
...@@ -66,7 +66,7 @@ ...@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree, * @brief Include guarded (sequences may run empty) loser tree,
* moving objects. * moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */ * @see __gnu_parallel::Settings multiway_merge_algorithm */
#define _GLIBCXX_LOSER_TREE 0 #define _GLIBCXX_LOSER_TREE 1
#endif #endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
// This library is distributed in the hope that it will be useful, but // This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of // WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details. // General Public License for more details.
// You should have received a copy of the GNU General Public License // You should have received a copy of the GNU General Public License
...@@ -48,50 +48,66 @@ ...@@ -48,50 +48,66 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** /**
* @brief Parallel std::find, switch for different algorithms. * @brief Parallel std::find, switch for different algorithms.
* @param begin1 Begin iterator of first sequence. * @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence. * @param end1 End iterator of first sequence.
* @param begin2 Begin iterator of second sequence. Must have same * @param begin2 Begin iterator of second sequence. Must have same
* length as first sequence. * length as first sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences. * @return Place of finding in both sequences.
*/ */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2> std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector) RandomAccessIterator2 begin2, Pred pred, Selector selector)
{ {
switch (Settings::find_distribution) switch (Settings::find_distribution)
{ {
case Settings::GROWING_BLOCKS: case Settings::GROWING_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, growing_blocks_tag()); return find_template(begin1, end1, begin2, pred, selector,
growing_blocks_tag());
case Settings::CONSTANT_SIZE_BLOCKS: case Settings::CONSTANT_SIZE_BLOCKS:
return find_template(begin1, end1, begin2, pred, selector, constant_size_blocks_tag()); return find_template(begin1, end1, begin2, pred, selector,
constant_size_blocks_tag());
case Settings::EQUAL_SPLIT: case Settings::EQUAL_SPLIT:
return find_template(begin1, end1, begin2, pred, selector, equal_split_tag()); return find_template(begin1, end1, begin2, pred, selector,
equal_split_tag());
default: default:
_GLIBCXX_PARALLEL_ASSERT(false); _GLIBCXX_PARALLEL_ASSERT(false);
return std::make_pair(begin1, begin2); return std::make_pair(begin1, begin2);
} }
} }
#if _GLIBCXX_FIND_EQUAL_SPLIT #if _GLIBCXX_FIND_EQUAL_SPLIT
/** /**
* @brief Parallel std::find, equal splitting variant. * @brief Parallel std::find, equal splitting variant.
* @param begin1 Begin iterator of first sequence. * @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence. * @param end1 End iterator of first sequence.
* @param begin2 Begin iterator of second sequence. Second sequence * @param begin2 Begin iterator of second sequence. Second sequence
* must have same length as first sequence. * must have same length as first sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences. * @return Place of finding in both sequences.
*/ */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2> std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, RandomAccessIterator2 begin2, Pred pred, Selector selector, equal_split_tag) find_template(RandomAccessIterator1 begin1,
RandomAccessIterator1 end1,
RandomAccessIterator2 begin2,
Pred pred,
Selector selector,
equal_split_tag)
{ {
_GLIBCXX_CALL(end1 - begin1) _GLIBCXX_CALL(end1 - begin1)
...@@ -100,79 +116,89 @@ namespace __gnu_parallel ...@@ -100,79 +116,89 @@ namespace __gnu_parallel
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
difference_type length = end1 - begin1; difference_type length = end1 - begin1;
difference_type result = length; difference_type result = length;
difference_type* borders;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1))); thread_index_t num_threads = get_max_threads();
# pragma omp parallel num_threads(num_threads)
equally_split(length, num_threads, borders); {
# pragma omp single
#pragma omp parallel shared(result) num_threads(num_threads) {
{ num_threads = omp_get_num_threads();
int iam = omp_get_thread_num(); borders = new difference_type[num_threads + 1];
difference_type pos = borders[iam], limit = borders[iam + 1]; equally_split(length, num_threads, borders);
} //single
RandomAccessIterator1 i1 = begin1 + pos;
RandomAccessIterator2 i2 = begin2 + pos; thread_index_t iam = omp_get_thread_num();
for (; pos < limit; pos++) difference_type start = borders[iam], stop = borders[iam + 1];
{
#pragma omp flush(result) RandomAccessIterator1 i1 = begin1 + start;
// Result has been set to something lower. RandomAccessIterator2 i2 = begin2 + start;
if (result < pos) for (difference_type pos = start; pos < stop; ++pos)
break; {
#pragma omp flush(result)
if (selector(i1, i2, pred)) // Result has been set to something lower.
{ if (result < pos)
omp_set_lock(&result_lock);
if (result > pos)
result = pos;
omp_unset_lock(&result_lock);
break; break;
}
i1++; if (selector(i1, i2, pred))
i2++; {
} omp_set_lock(&result_lock);
} if (pos < result)
result = pos;
omp_unset_lock(&result_lock);
break;
}
++i1;
++i2;
}
} //parallel
omp_destroy_lock(&result_lock); omp_destroy_lock(&result_lock);
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); delete[] borders;
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
} }
#endif #endif
#if _GLIBCXX_FIND_GROWING_BLOCKS #if _GLIBCXX_FIND_GROWING_BLOCKS
/** /**
* @brief Parallel std::find, growing block size variant. * @brief Parallel std::find, growing block size variant.
* @param begin1 Begin iterator of first sequence. * @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence. * @param end1 End iterator of first sequence.
* @param begin2 Begin iterator of second sequence. Second sequence * @param begin2 Begin iterator of second sequence. Second sequence
* must have same length as first sequence. * must have same length as first sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences. * @return Place of finding in both sequences.
* @see __gnu_parallel::Settings::find_sequential_search_size * @see __gnu_parallel::Settings::find_sequential_search_size
* @see __gnu_parallel::Settings::find_initial_block_size * @see __gnu_parallel::Settings::find_initial_block_size
* @see __gnu_parallel::Settings::find_maximum_block_size * @see __gnu_parallel::Settings::find_maximum_block_size
* @see __gnu_parallel::Settings::find_increasing_factor * @see __gnu_parallel::Settings::find_increasing_factor
* *
* There are two main differences between the growing blocks and * There are two main differences between the growing blocks and
* the constant-size blocks variants. * the constant-size blocks variants.
* 1. For GB, the block size grows; for CSB, the block size is fixed. * 1. For GB, the block size grows; for CSB, the block size is fixed.
* 2. For GB, the blocks are allocated dynamically; * 2. For GB, the blocks are allocated dynamically;
* for CSB, the blocks are allocated in a predetermined manner, * for CSB, the blocks are allocated in a predetermined manner,
* namely spatial round-robin. * namely spatial round-robin.
*/ */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2> std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector, RandomAccessIterator2 begin2, Pred pred, Selector selector,
growing_blocks_tag) growing_blocks_tag)
{ {
_GLIBCXX_CALL(end1 - begin1) _GLIBCXX_CALL(end1 - begin1)
...@@ -182,101 +208,118 @@ namespace __gnu_parallel ...@@ -182,101 +208,118 @@ namespace __gnu_parallel
difference_type length = end1 - begin1; difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size); difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first. // Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result = std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred); selector.sequential_algorithm(
begin1, begin1 + sequential_search_size, begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size)) if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result; return find_seq_result;
// Index of beginning of next free block (after sequential find). // Index of beginning of next free block (after sequential find).
difference_type next_block_pos = sequential_search_size; difference_type next_block_start = sequential_search_size;
difference_type result = length; difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
#pragma omp parallel shared(result) num_threads(num_threads) thread_index_t num_threads = get_max_threads();
{ # pragma omp parallel shared(result) num_threads(num_threads)
// Not within first k elements -> start parallel. {
thread_index_t iam = omp_get_thread_num(); # pragma omp single
num_threads = omp_get_num_threads();
difference_type block_size = Settings::find_initial_block_size;
difference_type start = fetch_and_add<difference_type>(&next_block_pos, block_size); // Not within first k elements -> start parallel.
thread_index_t iam = omp_get_thread_num();
// Get new block, update pointer to next block.
difference_type stop = std::min<difference_type>(length, start + block_size); difference_type block_size = Settings::find_initial_block_size;
difference_type start =
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; fetch_and_add<difference_type>(&next_block_start, block_size);
while (start < length) // Get new block, update pointer to next block.
{ difference_type stop =
#pragma omp flush(result) std::min<difference_type>(length, start + block_size);
// Get new value of result.
if (result < start) std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
{
// No chance to find first element. while (start < length)
break; {
} # pragma omp flush(result)
// Get new value of result.
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred); if (result < start)
if (local_result.first != (begin1 + stop)) {
{ // No chance to find first element.
omp_set_lock(&result_lock); break;
if ((local_result.first - begin1) < result) }
{
result = local_result.first - begin1; local_result = selector.sequential_algorithm(
begin1 + start, begin1 + stop, begin2 + start, pred);
// Result cannot be in future blocks, stop algorithm. if (local_result.first != (begin1 + stop))
fetch_and_add<difference_type>(&next_block_pos, length); {
} omp_set_lock(&result_lock);
omp_unset_lock(&result_lock); if ((local_result.first - begin1) < result)
} {
result = local_result.first - begin1;
block_size = std::min<difference_type>(block_size * Settings::find_increasing_factor, Settings::find_maximum_block_size);
// Result cannot be in future blocks, stop algorithm.
// Get new block, update pointer to next block. fetch_and_add<difference_type>(&next_block_start, length);
start = fetch_and_add<difference_type>(&next_block_pos, block_size); }
stop = (length < (start + block_size)) ? length : (start + block_size); omp_unset_lock(&result_lock);
} }
}
block_size = std::min<difference_type>(
block_size * Settings::find_increasing_factor,
Settings::find_maximum_block_size);
// Get new block, update pointer to next block.
start =
fetch_and_add<difference_type>(&next_block_start, block_size);
stop = (length < (start + block_size)) ?
length : (start + block_size);
}
} //parallel
omp_destroy_lock(&result_lock); omp_destroy_lock(&result_lock);
// Return iterator on found element. // Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
} }
#endif #endif
#if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS #if _GLIBCXX_FIND_CONSTANT_SIZE_BLOCKS
/** /**
* @brief Parallel std::find, constant block size variant. * @brief Parallel std::find, constant block size variant.
* @param begin1 Begin iterator of first sequence. * @param begin1 Begin iterator of first sequence.
* @param end1 End iterator of first sequence. * @param end1 End iterator of first sequence.
* @param begin2 Begin iterator of second sequence. Second sequence * @param begin2 Begin iterator of second sequence. Second sequence
* must have same length as first sequence. * must have same length as first sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @param selector Functionality (e. g. std::find_if (), std::equal(),...) * @param selector Functionality (e. g. std::find_if (), std::equal(),...)
* @return Place of finding in both sequences. * @return Place of finding in both sequences.
* @see __gnu_parallel::Settings::find_sequential_search_size * @see __gnu_parallel::Settings::find_sequential_search_size
* @see __gnu_parallel::Settings::find_block_size * @see __gnu_parallel::Settings::find_block_size
* There are two main differences between the growing blocks and the * There are two main differences between the growing blocks and the
* constant-size blocks variants. * constant-size blocks variants.
* 1. For GB, the block size grows; for CSB, the block size is fixed. * 1. For GB, the block size grows; for CSB, the block size is fixed.
* 2. For GB, the blocks are allocated dynamically; for CSB, the * 2. For GB, the blocks are allocated dynamically; for CSB, the
* blocks are allocated in a predetermined manner, namely spatial * blocks are allocated in a predetermined manner, namely spatial
* round-robin. * round-robin.
*/ */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Pred, typename Selector> template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Pred,
typename Selector>
std::pair<RandomAccessIterator1, RandomAccessIterator2> std::pair<RandomAccessIterator1, RandomAccessIterator2>
find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1, find_template(RandomAccessIterator1 begin1, RandomAccessIterator1 end1,
RandomAccessIterator2 begin2, Pred pred, Selector selector, RandomAccessIterator2 begin2, Pred pred, Selector selector,
constant_size_blocks_tag) constant_size_blocks_tag)
{ {
_GLIBCXX_CALL(end1 - begin1) _GLIBCXX_CALL(end1 - begin1)
typedef std::iterator_traits<RandomAccessIterator1> traits_type; typedef std::iterator_traits<RandomAccessIterator1> traits_type;
...@@ -285,72 +328,77 @@ namespace __gnu_parallel ...@@ -285,72 +328,77 @@ namespace __gnu_parallel
difference_type length = end1 - begin1; difference_type length = end1 - begin1;
difference_type sequential_search_size = std::min<difference_type>(length, Settings::find_sequential_search_size); difference_type sequential_search_size = std::min<difference_type>(
length, Settings::find_sequential_search_size);
// Try it sequentially first. // Try it sequentially first.
std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result = std::pair<RandomAccessIterator1, RandomAccessIterator2> find_seq_result =
selector.sequential_algorithm(begin1, begin1 + sequential_search_size, begin2, pred); selector.sequential_algorithm(begin1, begin1 + sequential_search_size,
begin2, pred);
if (find_seq_result.first != (begin1 + sequential_search_size)) if (find_seq_result.first != (begin1 + sequential_search_size))
return find_seq_result; return find_seq_result;
difference_type result = length; difference_type result = length;
const thread_index_t num_threads = get_max_threads();
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
// Not within first sequential_search_size elements -> start parallel. // Not within first sequential_search_size elements -> start parallel.
#pragma omp parallel shared(result) num_threads(num_threads)
{ thread_index_t num_threads = get_max_threads();
thread_index_t iam = omp_get_thread_num(); # pragma omp parallel shared(result) num_threads(num_threads)
difference_type block_size = Settings::find_initial_block_size; {
# pragma omp single
difference_type start, stop; num_threads = omp_get_num_threads();
// First element of thread's current iteration. thread_index_t iam = omp_get_thread_num();
difference_type iteration_start = sequential_search_size; difference_type block_size = Settings::find_initial_block_size;
// Where to work (initialization). // First element of thread's current iteration.
start = iteration_start + iam * block_size; difference_type iteration_start = sequential_search_size;
stop = std::min<difference_type>(length, start + block_size);
// Where to work (initialization).
std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result; difference_type start = iteration_start + iam * block_size;
difference_type stop =
while (start < length) std::min<difference_type>(length, start + block_size);
{
// Get new value of result. std::pair<RandomAccessIterator1, RandomAccessIterator2> local_result;
#pragma omp flush(result)
// No chance to find first element. while (start < length)
if (result < start) {
break; // Get new value of result.
# pragma omp flush(result)
local_result = selector.sequential_algorithm(begin1 + start, begin1 + stop, begin2 + start, pred); // No chance to find first element.
if (local_result.first != (begin1 + stop)) if (result < start)
{ break;
omp_set_lock(&result_lock); local_result = selector.sequential_algorithm(
if ((local_result.first - begin1) < result) begin1 + start, begin1 + stop,
result = local_result.first - begin1; begin2 + start, pred);
omp_unset_lock(&result_lock); if (local_result.first != (begin1 + stop))
// Will not find better value in its interval. {
break; omp_set_lock(&result_lock);
} if ((local_result.first - begin1) < result)
result = local_result.first - begin1;
iteration_start += num_threads * block_size; omp_unset_lock(&result_lock);
// Will not find better value in its interval.
// Where to work. break;
start = iteration_start + iam * block_size; }
stop = std::min<difference_type>(length, start + block_size);
} iteration_start += num_threads * block_size;
}
// Where to work.
start = iteration_start + iam * block_size;
stop = std::min<difference_type>(length, start + block_size);
}
} //parallel
omp_destroy_lock(&result_lock); omp_destroy_lock(&result_lock);
// Return iterator on found element. // Return iterator on found element.
return std::pair<RandomAccessIterator1, RandomAccessIterator2>(begin1 + result, begin2 + result); return std::pair<RandomAccessIterator1, RandomAccessIterator2>(
begin1 + result, begin2 + result);
} }
#endif #endif
} // end namespace } // end namespace
#endif #endif
...@@ -29,9 +29,9 @@ ...@@ -29,9 +29,9 @@
// Public License. // Public License.
/** @file parallel/losertree.h /** @file parallel/losertree.h
* @brief Many generic loser tree variants. * @brief Many generic loser tree variants.
* This file is a GNU parallel extension to the Standard C++ Library. * This file is a GNU parallel extension to the Standard C++ Library.
*/ */
// Written by Johannes Singler. // Written by Johannes Singler.
...@@ -49,13 +49,13 @@ namespace __gnu_parallel ...@@ -49,13 +49,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_EXPLICIT #if _GLIBCXX_LOSER_TREE_EXPLICIT
/** @brief Guarded loser tree, copying the whole element into the /** @brief Guarded loser tree, copying the whole element into the
* tree structure. * tree structure.
* *
* Guarding is done explicitly through two flags per element, inf * Guarding is done explicitly through two flags per element, inf
* and sup This is a quite slow variant. * and sup This is a quite slow variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTreeExplicit class LoserTreeExplicit
{ {
private: private:
...@@ -76,26 +76,25 @@ namespace __gnu_parallel ...@@ -76,26 +76,25 @@ namespace __gnu_parallel
Comparator comp; Comparator comp;
public: public:
inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp) inline
LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
: comp(_comp)
{ {
size = _size; size = _size;
offset = size; offset = size;
losers = new Loser[size]; losers = new Loser[size];
for (unsigned int l = 0; l < size; l++) for (unsigned int l = 0; l < size; l++)
{ {
//losers[l].key = ... stays unset //losers[l].key = ... stays unset
losers[l].inf = true; losers[l].inf = true;
losers[l].sup = false; losers[l].sup = false;
//losers[l].source = -1; //sentinel //losers[l].source = -1; //sentinel
} }
} }
inline ~LoserTreeExplicit() inline ~LoserTreeExplicit()
{ delete[] losers; } { delete[] losers; }
inline void
print() { }
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -105,16 +104,17 @@ namespace __gnu_parallel ...@@ -105,16 +104,17 @@ namespace __gnu_parallel
{ {
bool inf = false; bool inf = false;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{ {
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup) if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
{ && comp(losers[pos].key, key)) || losers[pos].inf || sup)
// The other one is smaller. {
std::swap(losers[pos].key, key); // The other one is smaller.
std::swap(losers[pos].inf, inf); std::swap(losers[pos].key, key);
std::swap(losers[pos].sup, sup); std::swap(losers[pos].inf, inf);
std::swap(losers[pos].source, source); std::swap(losers[pos].sup, sup);
} std::swap(losers[pos].source, source);
} }
}
losers[0].key = key; losers[0].key = key;
losers[0].inf = inf; losers[0].inf = inf;
...@@ -131,19 +131,19 @@ namespace __gnu_parallel ...@@ -131,19 +131,19 @@ namespace __gnu_parallel
bool inf = false; bool inf = false;
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
&& comp(losers[pos].key, key)) && comp(losers[pos].key, key))
|| losers[pos].inf || sup) || losers[pos].inf || sup)
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].key, key); std::swap(losers[pos].key, key);
std::swap(losers[pos].inf, inf); std::swap(losers[pos].inf, inf);
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
} }
} }
losers[0].key = key; losers[0].key = key;
losers[0].inf = inf; losers[0].inf = inf;
...@@ -156,19 +156,19 @@ namespace __gnu_parallel ...@@ -156,19 +156,19 @@ namespace __gnu_parallel
{ {
bool inf = false; bool inf = false;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{ {
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup &&
((comp(losers[pos].key, key)) || ((comp(losers[pos].key, key)) ||
(!comp(key, losers[pos].key) && losers[pos].source < source))) (!comp(key, losers[pos].key) && losers[pos].source < source)))
|| losers[pos].inf || sup) || losers[pos].inf || sup)
{ {
// Take next key. // Take next key.
std::swap(losers[pos].key, key); std::swap(losers[pos].key, key);
std::swap(losers[pos].inf, inf); std::swap(losers[pos].inf, inf);
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
} }
} }
losers[0].key = key; losers[0].key = key;
losers[0].inf = inf; losers[0].inf = inf;
...@@ -185,18 +185,18 @@ namespace __gnu_parallel ...@@ -185,18 +185,18 @@ namespace __gnu_parallel
bool inf = false; bool inf = false;
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{ {
if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
&& ((comp(losers[pos].key, key)) || && ((comp(losers[pos].key, key)) ||
(!comp(key, losers[pos].key) && losers[pos].source < source))) (!comp(key, losers[pos].key) && losers[pos].source < source)))
|| losers[pos].inf || sup) || losers[pos].inf || sup)
{ {
std::swap(losers[pos].key, key); std::swap(losers[pos].key, key);
std::swap(losers[pos].inf, inf); std::swap(losers[pos].inf, inf);
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
} }
} }
losers[0].key = key; losers[0].key = key;
losers[0].inf = inf; losers[0].inf = inf;
...@@ -209,14 +209,14 @@ namespace __gnu_parallel ...@@ -209,14 +209,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE #if _GLIBCXX_LOSER_TREE
/** @brief Guarded loser tree, either copying the whole element into /** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index. * the tree structure, or looking up the element via the index.
* *
* Guarding is done explicitly through one flag sup per element, * Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This * inf is not needed due to a better initialization routine. This
* is a well-performing variant. * is a well-performing variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTree class LoserTree
{ {
private: private:
...@@ -240,22 +240,14 @@ namespace __gnu_parallel ...@@ -240,22 +240,14 @@ namespace __gnu_parallel
// Next greater power of 2. // Next greater power of 2.
k = 1 << (log2(ik - 1) + 1); k = 1 << (log2(ik - 1) + 1);
offset = k; offset = k;
losers = new Loser[k * 2]; losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
for (unsigned int i = ik - 1; i < k; i++) for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true; losers[i + k].sup = true;
} }
inline ~LoserTree() inline ~LoserTree()
{ delete[] losers; } { delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].key,
losers[i].source, losers[i].sup);
}
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -267,33 +259,34 @@ namespace __gnu_parallel ...@@ -267,33 +259,34 @@ namespace __gnu_parallel
losers[pos].sup = sup; losers[pos].sup = sup;
losers[pos].source = source; losers[pos].source = source;
losers[pos].key = key; new(&(losers[pos].key)) T(key);
} }
unsigned int unsigned int
init_winner (unsigned int root) init_winner (unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup || if (losers[right].sup ||
(!losers[left].sup && !comp(losers[right].key, losers[left].key))) (!losers[left].sup
{ && !comp(losers[right].key, losers[left].key)))
// Left one is less or equal. {
losers[root] = losers[right]; // Left one is less or equal.
return left; losers[root] = losers[right];
} return left;
else }
{ // Right one is less. else
losers[root] = losers[left]; { // Right one is less.
return right; losers[root] = losers[left];
} return right;
} }
}
} }
inline void inline void
...@@ -306,16 +299,16 @@ namespace __gnu_parallel ...@@ -306,16 +299,16 @@ namespace __gnu_parallel
{ {
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if (sup || (!losers[pos].sup && comp(losers[pos].key, key))) if (sup || (!losers[pos].sup && comp(losers[pos].key, key)))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].key, key); std::swap(losers[pos].key, key);
} }
} }
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -330,27 +323,28 @@ namespace __gnu_parallel ...@@ -330,27 +323,28 @@ namespace __gnu_parallel
init_winner_stable (unsigned int root) init_winner_stable (unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup || if (losers[right].sup
(!losers[left].sup && !comp(losers[right].key, losers[left].key))) || (!losers[left].sup
{ && !comp(losers[right].key, losers[left].key)))
// Left one is less or equal. {
losers[root] = losers[right]; // Left one is less or equal.
return left; losers[root] = losers[right];
} return left;
else }
{ else
// Right one is less. {
losers[root] = losers[left]; // Right one is less.
return right; losers[root] = losers[left];
} return right;
} }
}
} }
inline void inline void
...@@ -363,19 +357,20 @@ namespace __gnu_parallel ...@@ -363,19 +357,20 @@ namespace __gnu_parallel
{ {
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted, ties are broken by source. // The smaller one gets promoted, ties are broken by source.
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || if ( (sup && (!losers[pos].sup || losers[pos].source < source))
(!sup && !losers[pos].sup && || (!sup && !losers[pos].sup
((comp(losers[pos].key, key)) || && ((comp(losers[pos].key, key))
(!comp(key, losers[pos].key) && losers[pos].source < source)))) || (!comp(key, losers[pos].key)
{ && losers[pos].source < source))))
// The other one is smaller. {
std::swap(losers[pos].sup, sup); // The other one is smaller.
std::swap(losers[pos].source, source); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].key, key); std::swap(losers[pos].source, source);
} std::swap(losers[pos].key, key);
} }
}
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -387,14 +382,14 @@ namespace __gnu_parallel ...@@ -387,14 +382,14 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_REFERENCE #if _GLIBCXX_LOSER_TREE_REFERENCE
/** @brief Guarded loser tree, either copying the whole element into /** @brief Guarded loser tree, either copying the whole element into
* the tree structure, or looking up the element via the index. * the tree structure, or looking up the element via the index.
* *
* Guarding is done explicitly through one flag sup per element, * Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. This * inf is not needed due to a better initialization routine. This
* is a well-performing variant. * is a well-performing variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTreeReference class LoserTreeReference
{ {
#undef COPY #undef COPY
...@@ -423,7 +418,9 @@ namespace __gnu_parallel ...@@ -423,7 +418,9 @@ namespace __gnu_parallel
Comparator comp; Comparator comp;
public: public:
inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) inline
LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{ {
ik = _k; ik = _k;
...@@ -435,7 +432,7 @@ namespace __gnu_parallel ...@@ -435,7 +432,7 @@ namespace __gnu_parallel
keys = new T[ik]; keys = new T[ik];
#endif #endif
for (unsigned int i = ik - 1; i < k; i++) for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true; losers[i + k].sup = true;
} }
inline ~LoserTreeReference() inline ~LoserTreeReference()
...@@ -446,13 +443,6 @@ namespace __gnu_parallel ...@@ -446,13 +443,6 @@ namespace __gnu_parallel
#endif #endif
} }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, KEY(i), losers[i].source, losers[i].sup);
}
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -471,27 +461,27 @@ namespace __gnu_parallel ...@@ -471,27 +461,27 @@ namespace __gnu_parallel
init_winner(unsigned int root) init_winner(unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup || if ( losers[right].sup ||
(!losers[left].sup && !comp(KEY(right), KEY(left)))) (!losers[left].sup && !comp(KEY(right), KEY(left))))
{ {
// Left one is less or equal. // Left one is less or equal.
losers[root] = losers[right]; losers[root] = losers[right];
return left; return left;
} }
else else
{ {
// Right one is less. // Right one is less.
losers[root] = losers[left]; losers[root] = losers[left];
return right; return right;
} }
} }
} }
inline void inline void
...@@ -505,18 +495,18 @@ namespace __gnu_parallel ...@@ -505,18 +495,18 @@ namespace __gnu_parallel
{ {
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source)))) if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source))))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
#ifdef COPY #ifdef COPY
std::swap(KEY(pos), KEY_SOURCE(source)); std::swap(KEY(pos), KEY_SOURCE(source));
#endif #endif
} }
} }
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -533,27 +523,27 @@ namespace __gnu_parallel ...@@ -533,27 +523,27 @@ namespace __gnu_parallel
init_winner_stable(unsigned int root) init_winner_stable(unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup if (losers[right].sup
|| (!losers[left].sup && !comp(KEY(right), KEY(left)))) || (!losers[left].sup && !comp(KEY(right), KEY(left))))
{ {
// Left one is less or equal. // Left one is less or equal.
losers[root] = losers[right]; losers[root] = losers[right];
return left; return left;
} }
else else
{ {
// Right one is less. // Right one is less.
losers[root] = losers[left]; losers[root] = losers[left];
return right; return right;
} }
} }
} }
inline void inline void
...@@ -565,21 +555,22 @@ namespace __gnu_parallel ...@@ -565,21 +555,22 @@ namespace __gnu_parallel
{ {
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted, ties are broken by source. // The smaller one gets promoted, ties are broken by source.
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup && (!sup && !losers[pos].sup &&
((comp(KEY(pos), KEY_SOURCE(source))) || ((comp(KEY(pos), KEY_SOURCE(source))) ||
(!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source)))) (!comp(KEY_SOURCE(source), KEY(pos))
{ && losers[pos].source < source))))
// The other one is smaller. {
std::swap(losers[pos].sup, sup); // The other one is smaller.
std::swap(losers[pos].source, source); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source);
#ifdef COPY #ifdef COPY
std::swap(KEY(pos), KEY_SOURCE(source)); std::swap(KEY(pos), KEY_SOURCE(source));
#endif #endif
} }
} }
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -595,13 +586,13 @@ namespace __gnu_parallel ...@@ -595,13 +586,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER #if _GLIBCXX_LOSER_TREE_POINTER
/** @brief Guarded loser tree, either copying the whole element into /** @brief Guarded loser tree, either copying the whole element into
the tree structure, or looking up the element via the index. the tree structure, or looking up the element via the index.
* Guarding is done explicitly through one flag sup per element, * Guarding is done explicitly through one flag sup per element,
* inf is not needed due to a better initialization routine. * inf is not needed due to a better initialization routine.
* This is a well-performing variant. * This is a well-performing variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTreePointer class LoserTreePointer
{ {
private: private:
...@@ -617,7 +608,9 @@ namespace __gnu_parallel ...@@ -617,7 +608,9 @@ namespace __gnu_parallel
Comparator comp; Comparator comp;
public: public:
inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) inline
LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{ {
ik = _k; ik = _k;
...@@ -626,19 +619,12 @@ namespace __gnu_parallel ...@@ -626,19 +619,12 @@ namespace __gnu_parallel
offset = k; offset = k;
losers = new Loser[k * 2]; losers = new Loser[k * 2];
for (unsigned int i = ik - 1; i < k; i++) for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true; losers[i + k].sup = true;
} }
inline ~LoserTreePointer() inline ~LoserTreePointer()
{ delete[] losers; } { delete[] losers; }
void
print()
{
for (unsigned int i = 0; i < (k * 2); i++)
printf("%d %d from %d, %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
}
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -657,49 +643,50 @@ namespace __gnu_parallel ...@@ -657,49 +643,50 @@ namespace __gnu_parallel
init_winner(unsigned int root) init_winner(unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if ( losers[right].sup || if (losers[right].sup
(!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp))) || (!losers[left].sup
{ && !comp(*losers[right].keyp, *losers[left].keyp)))
// Left one is less or equal. {
losers[root] = losers[right]; // Left one is less or equal.
return left; losers[root] = losers[right];
} return left;
else }
{ else
// Right one is less. {
losers[root] = losers[left]; // Right one is less.
return right; losers[root] = losers[left];
} return right;
} }
}
} }
inline void inline void
init() init()
{ losers[0] = losers[init_winner(1)]; } { losers[0] = losers[init_winner(1)]; }
inline void inline void
delete_min_insert(const T& key, bool sup) delete_min_insert(const T& key, bool sup)
{ {
const T* keyp = &key; const T* keyp = &key;
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp))) if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp)))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].keyp, keyp); std::swap(losers[pos].keyp, keyp);
} }
} }
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -714,28 +701,28 @@ namespace __gnu_parallel ...@@ -714,28 +701,28 @@ namespace __gnu_parallel
init_winner_stable(unsigned int root) init_winner_stable(unsigned int root)
{ {
if (root >= k) if (root >= k)
{ {
return root; return root;
} }
else else
{ {
unsigned int left = init_winner (2 * root); unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1); unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup if (losers[right].sup
|| (!losers[left].sup && !comp(*losers[right].keyp, || (!losers[left].sup && !comp(*losers[right].keyp,
*losers[left].keyp))) *losers[left].keyp)))
{ {
// Left one is less or equal. // Left one is less or equal.
losers[root] = losers[right]; losers[root] = losers[right];
return left; return left;
} }
else else
{ {
// Right one is less. // Right one is less.
losers[root] = losers[left]; losers[root] = losers[left];
return right; return right;
} }
} }
} }
inline void inline void
...@@ -748,20 +735,20 @@ namespace __gnu_parallel ...@@ -748,20 +735,20 @@ namespace __gnu_parallel
const T* keyp = &key; const T* keyp = &key;
int source = losers[0].source; int source = losers[0].source;
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2) for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted, ties are broken by source. // The smaller one gets promoted, ties are broken by source.
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) || if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup && (!sup && !losers[pos].sup &&
((comp(*losers[pos].keyp, *keyp)) || ((comp(*losers[pos].keyp, *keyp)) ||
(!comp(*keyp, *losers[pos].keyp) (!comp(*keyp, *losers[pos].keyp)
&& losers[pos].source < source)))) && losers[pos].source < source))))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].sup, sup); std::swap(losers[pos].sup, sup);
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].keyp, keyp); std::swap(losers[pos].keyp, keyp);
} }
} }
losers[0].sup = sup; losers[0].sup = sup;
losers[0].source = source; losers[0].source = source;
...@@ -773,13 +760,13 @@ namespace __gnu_parallel ...@@ -773,13 +760,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_UNGUARDED #if _GLIBCXX_LOSER_TREE_UNGUARDED
/** @brief Unguarded loser tree, copying the whole element into the /** @brief Unguarded loser tree, copying the whole element into the
* tree structure. * tree structure.
* *
* No guarding is done, therefore not a single input sequence must * No guarding is done, therefore not a single input sequence must
* run empty. This is a very fast variant. * run empty. This is a very fast variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTreeUnguarded class LoserTreeUnguarded
{ {
private: private:
...@@ -798,18 +785,20 @@ namespace __gnu_parallel ...@@ -798,18 +785,20 @@ namespace __gnu_parallel
map(unsigned int root, unsigned int begin, unsigned int end) map(unsigned int root, unsigned int begin, unsigned int end)
{ {
if (begin + 1 == end) if (begin + 1 == end)
mapping[begin] = root; mapping[begin] = root;
else else
{ {
// Next greater or equal power of 2. // Next greater or equal power of 2.
unsigned int left = 1 << (log2(end - begin - 1)); unsigned int left = 1 << (log2(end - begin - 1));
map(root * 2, begin, begin + left); map(root * 2, begin, begin + left);
map(root * 2 + 1, begin + left, end); map(root * 2 + 1, begin + left, end);
} }
} }
public: public:
inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) inline
LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
: comp(_comp)
{ {
ik = _k; ik = _k;
// Next greater or equal power of 2. // Next greater or equal power of 2.
...@@ -826,13 +815,6 @@ namespace __gnu_parallel ...@@ -826,13 +815,6 @@ namespace __gnu_parallel
delete[] mapping; delete[] mapping;
} }
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, losers[i].key, losers[i].source);
}
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -849,26 +831,27 @@ namespace __gnu_parallel ...@@ -849,26 +831,27 @@ namespace __gnu_parallel
init_winner(unsigned int root, unsigned int begin, unsigned int end) init_winner(unsigned int root, unsigned int begin, unsigned int end)
{ {
if (begin + 1 == end) if (begin + 1 == end)
return mapping[begin]; return mapping[begin];
else else
{ {
// Next greater or equal power of 2. // Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1)); unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division); unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end); unsigned int right =
if (!comp(losers[right].key, losers[left].key)) init_winner(2 * root + 1, begin + division, end);
{ if (!comp(losers[right].key, losers[left].key))
// Left one is less or equal. {
losers[root] = losers[right]; // Left one is less or equal.
return left; losers[root] = losers[right];
} return left;
else }
{ else
// Right one is less. {
losers[root] = losers[left]; // Right one is less.
return right; losers[root] = losers[left];
} return right;
} }
}
} }
inline void inline void
...@@ -883,15 +866,15 @@ namespace __gnu_parallel ...@@ -883,15 +866,15 @@ namespace __gnu_parallel
T& keyr = losers[0].key; T& keyr = losers[0].key;
int& source = losers[0].source; int& source = losers[0].source;
for (int pos = mapping[source] / 2; pos > 0; pos /= 2) for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if (comp(losers[pos].key, keyr)) if (comp(losers[pos].key, keyr))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].key, keyr); std::swap(losers[pos].key, keyr);
} }
} }
} }
inline void inline void
...@@ -909,16 +892,17 @@ namespace __gnu_parallel ...@@ -909,16 +892,17 @@ namespace __gnu_parallel
T& keyr = losers[0].key; T& keyr = losers[0].key;
int& source = losers[0].source; int& source = losers[0].source;
for (int pos = mapping[source] / 2; pos > 0; pos /= 2) for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted, ties are broken by source. // The smaller one gets promoted, ties are broken by source.
if (comp(losers[pos].key, keyr) if (comp(losers[pos].key, keyr)
|| (!comp(keyr, losers[pos].key) && losers[pos].source < source)) || (!comp(keyr, losers[pos].key)
{ && losers[pos].source < source))
// The other one is smaller. {
std::swap(losers[pos].source, source); // The other one is smaller.
std::swap(losers[pos].key, keyr); std::swap(losers[pos].source, source);
} std::swap(losers[pos].key, keyr);
} }
}
} }
}; };
...@@ -926,13 +910,13 @@ namespace __gnu_parallel ...@@ -926,13 +910,13 @@ namespace __gnu_parallel
#if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED #if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
/** @brief Unguarded loser tree, keeping only pointers to the /** @brief Unguarded loser tree, keeping only pointers to the
* elements in the tree structure. * elements in the tree structure.
* *
* No guarding is done, therefore not a single input sequence must * No guarding is done, therefore not a single input sequence must
* run empty. This is a very fast variant. * run empty. This is a very fast variant.
*/ */
template<typename T, typename Comparator = std::less<T> > template<typename T, typename Comparator = std::less<T> >
class LoserTreePointerUnguarded class LoserTreePointerUnguarded
{ {
private: private:
...@@ -950,18 +934,21 @@ namespace __gnu_parallel ...@@ -950,18 +934,21 @@ namespace __gnu_parallel
void map(unsigned int root, unsigned int begin, unsigned int end) void map(unsigned int root, unsigned int begin, unsigned int end)
{ {
if (begin + 1 == end) if (begin + 1 == end)
mapping[begin] = root; mapping[begin] = root;
else else
{ {
// Next greater or equal power of 2. // Next greater or equal power of 2.
unsigned int left = 1 << (log2(end - begin - 1)); unsigned int left = 1 << (log2(end - begin - 1));
map(root * 2, begin, begin + left); map(root * 2, begin, begin + left);
map(root * 2 + 1, begin + left, end); map(root * 2 + 1, begin + left, end);
} }
} }
public: public:
inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp) inline
LoserTreePointerUnguarded(unsigned int _k,
Comparator _comp = std::less<T>())
: comp(_comp)
{ {
ik = _k; ik = _k;
...@@ -979,13 +966,6 @@ namespace __gnu_parallel ...@@ -979,13 +966,6 @@ namespace __gnu_parallel
delete[] mapping; delete[] mapping;
} }
void
print()
{
for (unsigned int i = 0; i < k + ik; i++)
printf("%d %d from %d\n", i, *losers[i].keyp, losers[i].source);
}
inline int inline int
get_min_source() get_min_source()
{ return losers[0].source; } { return losers[0].source; }
...@@ -1002,26 +982,27 @@ namespace __gnu_parallel ...@@ -1002,26 +982,27 @@ namespace __gnu_parallel
init_winner(unsigned int root, unsigned int begin, unsigned int end) init_winner(unsigned int root, unsigned int begin, unsigned int end)
{ {
if (begin + 1 == end) if (begin + 1 == end)
return mapping[begin]; return mapping[begin];
else else
{ {
// Next greater or equal power of 2. // Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1)); unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division); unsigned int left = init_winner(2 * root, begin, begin + division);
unsigned int right = init_winner(2 * root + 1, begin + division, end); unsigned int right
if (!comp(*losers[right].keyp, *losers[left].keyp)) = init_winner(2 * root + 1, begin + division, end);
{ if (!comp(*losers[right].keyp, *losers[left].keyp))
// Left one is less or equal. {
losers[root] = losers[right]; // Left one is less or equal.
return left; losers[root] = losers[right];
} return left;
else }
{ else
// Right one is less. {
losers[root] = losers[left]; // Right one is less.
return right; losers[root] = losers[left];
} return right;
} }
}
} }
inline void inline void
...@@ -1036,15 +1017,15 @@ namespace __gnu_parallel ...@@ -1036,15 +1017,15 @@ namespace __gnu_parallel
const T* keyp = &key; const T* keyp = &key;
int& source = losers[0].source; int& source = losers[0].source;
for (int pos = mapping[source] / 2; pos > 0; pos /= 2) for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted. // The smaller one gets promoted.
if (comp(*losers[pos].keyp, *keyp)) if (comp(*losers[pos].keyp, *keyp))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].keyp, keyp); std::swap(losers[pos].keyp, keyp);
} }
} }
losers[0].keyp = keyp; losers[0].keyp = keyp;
} }
...@@ -1063,23 +1044,23 @@ namespace __gnu_parallel ...@@ -1063,23 +1044,23 @@ namespace __gnu_parallel
int& source = losers[0].source; int& source = losers[0].source;
const T* keyp = &key; const T* keyp = &key;
for (int pos = mapping[source] / 2; pos > 0; pos /= 2) for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
{ {
// The smaller one gets promoted, ties are broken by source. // The smaller one gets promoted, ties are broken by source.
if (comp(*losers[pos].keyp, *keyp) if (comp(*losers[pos].keyp, *keyp)
|| (!comp(*keyp, *losers[pos].keyp) || (!comp(*keyp, *losers[pos].keyp)
&& losers[pos].source < source)) && losers[pos].source < source))
{ {
// The other one is smaller. // The other one is smaller.
std::swap(losers[pos].source, source); std::swap(losers[pos].source, source);
std::swap(losers[pos].keyp, keyp); std::swap(losers[pos].keyp, keyp);
} }
} }
losers[0].keyp = keyp; losers[0].keyp = keyp;
} }
}; };
#endif #endif
template<typename _ValueTp, class Comparator> template<typename _ValueTp, class Comparator>
struct loser_tree_traits struct loser_tree_traits
{ {
#if _GLIBCXX_LOSER_TREE #if _GLIBCXX_LOSER_TREE
...@@ -1093,7 +1074,7 @@ namespace __gnu_parallel ...@@ -1093,7 +1074,7 @@ namespace __gnu_parallel
#endif #endif
}; };
template<typename _ValueTp, class Comparator> template<typename _ValueTp, class Comparator>
struct loser_tree_unguarded_traits struct loser_tree_unguarded_traits
{ {
#if _GLIBCXX_LOSER_TREE_UNGUARDED #if _GLIBCXX_LOSER_TREE_UNGUARDED
......
...@@ -29,16 +29,16 @@ ...@@ -29,16 +29,16 @@
// Public License. // Public License.
/** @file parallel/multiway_merge.h /** @file parallel/multiway_merge.h
* @brief Implementation of sequential and parallel multiway merge. * @brief Implementation of sequential and parallel multiway merge.
* *
* Explanations on the high-speed merging routines in the appendix of * Explanations on the high-speed merging routines in the appendix of
* *
* P. Sanders. * P. Sanders.
* Fast priority queues for cached memory. * Fast priority queues for cached memory.
* ACM Journal of Experimental Algorithmics, 5, 2000. * ACM Journal of Experimental Algorithmics, 5, 2000.
* *
* This file is a GNU parallel extension to the Standard C++ Library. * This file is a GNU parallel extension to the Standard C++ Library.
*/ */
// Written by Johannes Singler. // Written by Johannes Singler.
...@@ -62,25 +62,25 @@ ...@@ -62,25 +62,25 @@
// XXX need iterator typedefs // XXX need iterator typedefs
namespace __gnu_parallel namespace __gnu_parallel
{ {
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator; class guarded_iterator;
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2); guarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2); guarded_iterator<RandomAccessIterator, Comparator>& bi2);
/** @brief Iterator wrapper supporting an implicit supremum at the end /** @brief Iterator wrapper supporting an implicit supremum at the end
of the sequence, dominating all comparisons. of the sequence, dominating all comparisons.
* Deriving from RandomAccessIterator is not possible since * Deriving from RandomAccessIterator is not possible since
* RandomAccessIterator need not be a class. * RandomAccessIterator need not be a class.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator class guarded_iterator
{ {
private: private:
...@@ -95,17 +95,17 @@ namespace __gnu_parallel ...@@ -95,17 +95,17 @@ namespace __gnu_parallel
public: public:
/** @brief Constructor. Sets iterator to beginning of sequence. /** @brief Constructor. Sets iterator to beginning of sequence.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param comp Comparator provided for associated overloaded * @param comp Comparator provided for associated overloaded
* compare operators. */ * compare operators. */
inline guarded_iterator(RandomAccessIterator begin, inline guarded_iterator(RandomAccessIterator begin,
RandomAccessIterator end, Comparator& comp) RandomAccessIterator end, Comparator& comp)
: current(begin), end(end), comp(comp) : current(begin), end(end), comp(comp)
{ } { }
/** @brief Pre-increment operator. /** @brief Pre-increment operator.
* @return This. */ * @return This. */
inline guarded_iterator<RandomAccessIterator, Comparator>& inline guarded_iterator<RandomAccessIterator, Comparator>&
operator++() operator++()
{ {
...@@ -114,31 +114,35 @@ namespace __gnu_parallel ...@@ -114,31 +114,35 @@ namespace __gnu_parallel
} }
/** @brief Dereference operator. /** @brief Dereference operator.
* @return Referenced element. */ * @return Referenced element. */
inline typename std::iterator_traits<RandomAccessIterator>::value_type inline typename std::iterator_traits<RandomAccessIterator>::value_type
operator*() operator*()
{ return *current; } { return *current; }
/** @brief Convert to wrapped iterator. /** @brief Convert to wrapped iterator.
* @return Wrapped iterator. */ * @return Wrapped iterator. */
inline operator RandomAccessIterator() inline operator RandomAccessIterator()
{ return current; } { return current; }
friend bool friend bool
operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2); operator< <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool friend bool
operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2); operator<= <RandomAccessIterator, Comparator>(
guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
}; };
/** @brief Compare two elements referenced by guarded iterators. /** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator. * @param bi1 First iterator.
* @param bi2 Second iterator. * @param bi2 Second iterator.
* @return @c True if less. */ * @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2) guarded_iterator<RandomAccessIterator, Comparator>& bi2)
{ {
if (bi1.current == bi1.end) //bi1 is sup if (bi1.current == bi1.end) //bi1 is sup
return bi2.current == bi2.end; //bi2 is not sup return bi2.current == bi2.end; //bi2 is not sup
...@@ -147,14 +151,14 @@ namespace __gnu_parallel ...@@ -147,14 +151,14 @@ namespace __gnu_parallel
return (bi1.comp)(*bi1, *bi2); //normal compare return (bi1.comp)(*bi1, *bi2); //normal compare
} }
/** @brief Compare two elements referenced by guarded iterators. /** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator. * @param bi1 First iterator.
* @param bi2 Second iterator. * @param bi2 Second iterator.
* @return @c True if less equal. */ * @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2) guarded_iterator<RandomAccessIterator, Comparator>& bi2)
{ {
if (bi2.current == bi2.end) //bi1 is sup if (bi2.current == bi2.end) //bi1 is sup
return bi1.current != bi1.end; //bi2 is not sup return bi1.current != bi1.end; //bi2 is not sup
...@@ -163,20 +167,20 @@ namespace __gnu_parallel ...@@ -163,20 +167,20 @@ namespace __gnu_parallel
return !(bi1.comp)(*bi2, *bi1); //normal compare return !(bi1.comp)(*bi2, *bi1); //normal compare
} }
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator; class unguarded_iterator;
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2); unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2); unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator class unguarded_iterator
{ {
private: private:
...@@ -187,16 +191,16 @@ namespace __gnu_parallel ...@@ -187,16 +191,16 @@ namespace __gnu_parallel
public: public:
/** @brief Constructor. Sets iterator to beginning of sequence. /** @brief Constructor. Sets iterator to beginning of sequence.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end Unused, only for compatibility. * @param end Unused, only for compatibility.
* @param comp Unused, only for compatibility. */ * @param comp Unused, only for compatibility. */
inline unguarded_iterator(RandomAccessIterator begin, inline unguarded_iterator(RandomAccessIterator begin,
RandomAccessIterator end, Comparator& comp) RandomAccessIterator end, Comparator& comp)
: current(begin), comp(comp) : current(begin), comp(comp)
{ } { }
/** @brief Pre-increment operator. /** @brief Pre-increment operator.
* @return This. */ * @return This. */
inline unguarded_iterator<RandomAccessIterator, Comparator>& inline unguarded_iterator<RandomAccessIterator, Comparator>&
operator++() operator++()
{ {
...@@ -205,77 +209,85 @@ namespace __gnu_parallel ...@@ -205,77 +209,85 @@ namespace __gnu_parallel
} }
/** @brief Dereference operator. /** @brief Dereference operator.
* @return Referenced element. */ * @return Referenced element. */
inline typename std::iterator_traits<RandomAccessIterator>::value_type inline typename std::iterator_traits<RandomAccessIterator>::value_type
operator*() operator*()
{ return *current; } { return *current; }
/** @brief Convert to wrapped iterator. /** @brief Convert to wrapped iterator.
* @return Wrapped iterator. */ * @return Wrapped iterator. */
inline inline
operator RandomAccessIterator() operator RandomAccessIterator()
{ return current; } { return current; }
friend bool friend bool
operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2); operator< <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool friend bool
operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2); operator<= <RandomAccessIterator, Comparator>(
unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
}; };
/** @brief Compare two elements referenced by unguarded iterators. /** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator. * @param bi1 First iterator.
* @param bi2 Second iterator. * @param bi2 Second iterator.
* @return @c True if less. */ * @return @c True if less. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2) unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
{ {
// Normal compare. // Normal compare.
return (bi1.comp)(*bi1, *bi2); return (bi1.comp)(*bi1, *bi2);
} }
/** @brief Compare two elements referenced by unguarded iterators. /** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator. * @param bi1 First iterator.
* @param bi2 Second iterator. * @param bi2 Second iterator.
* @return @c True if less equal. */ * @return @c True if less equal. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline bool inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2) unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
{ {
// Normal compare. // Normal compare.
return !(bi1.comp)(*bi2, *bi1); return !(bi1.comp)(*bi2, *bi1);
} }
/** Prepare a set of sequences to be merged without a (end) guard /** Prepare a set of sequences to be merged without a (end) guard
* @param seqs_begin * @param seqs_begin
* @param seqs_end * @param seqs_end
* @param comp * @param comp
* @param min_sequence * @param min_sequence
* @param stable * @param stable
* @pre (seqs_end - seqs_begin > 0) */ * @pre (seqs_end - seqs_begin > 0) */
template<typename RandomAccessIteratorIterator, typename Comparator> template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type typename std::iterator_traits<
typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
::first_type>::difference_type
prepare_unguarded(RandomAccessIteratorIterator seqs_begin, prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end, Comparator comp, RandomAccessIteratorIterator seqs_end, Comparator comp,
int& min_sequence, bool stable) int& min_sequence, bool stable)
{ {
_GLIBCXX_CALL(seqs_end - seqs_begin) _GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type; difference_type;
if ((*seqs_begin).first == (*seqs_begin).second) if ((*seqs_begin).first == (*seqs_begin).second)
{ {
// Empty sequence found, it's the first one. // Empty sequence found, it's the first one.
min_sequence = 0; min_sequence = 0;
return -1; return -1;
} }
// Last element in sequence. // Last element in sequence.
...@@ -283,20 +295,20 @@ namespace __gnu_parallel ...@@ -283,20 +295,20 @@ namespace __gnu_parallel
min_sequence = 0; min_sequence = 0;
for (RandomAccessIteratorIterator s = seqs_begin + 1; s != seqs_end; s++) for (RandomAccessIteratorIterator s = seqs_begin + 1; s != seqs_end; s++)
{ {
if ((*s).first == (*s).second) if ((*s).first == (*s).second)
{ {
// Empty sequence found. // Empty sequence found.
min_sequence = static_cast<int>(s - seqs_begin); min_sequence = static_cast<int>(s - seqs_begin);
return -1; return -1;
} }
// Last element in sequence. // Last element in sequence.
const value_type& v = *((*s).second - 1); const value_type& v = *((*s).second - 1);
if (comp(v, min)) //strictly smaller if (comp(v, min)) //strictly smaller
{ {
min = v; min = v;
min_sequence = static_cast<int>(s - seqs_begin); min_sequence = static_cast<int>(s - seqs_begin);
} }
} }
difference_type overhang_size = 0; difference_type overhang_size = 0;
...@@ -304,93 +316,108 @@ namespace __gnu_parallel ...@@ -304,93 +316,108 @@ namespace __gnu_parallel
int s = 0; int s = 0;
for (s = 0; s <= min_sequence; s++) for (s = 0; s <= min_sequence; s++)
{ {
RandomAccessIterator1 split; RandomAccessIterator1 split;
if (stable) if (stable)
split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second, split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second,
min, comp); min, comp);
else else
split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second,
min, comp); min, comp);
overhang_size += seqs_begin[s].second - split; overhang_size += seqs_begin[s].second - split;
} }
for (; s < (seqs_end - seqs_begin); s++) for (; s < (seqs_end - seqs_begin); s++)
{ {
RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp); RandomAccessIterator1 split = std::lower_bound(
overhang_size += seqs_begin[s].second - split; seqs_begin[s].first, seqs_begin[s].second, min, comp);
overhang_size += seqs_begin[s].second - split;
} }
// So many elements will be left over afterwards. // So many elements will be left over afterwards.
return overhang_size; return overhang_size;
} }
/** Prepare a set of sequences to be merged with a (end) guard (sentinel) /** Prepare a set of sequences to be merged with a (end) guard (sentinel)
* @param seqs_begin * @param seqs_begin
* @param seqs_end * @param seqs_end
* @param comp */ * @param comp */
template<typename RandomAccessIteratorIterator, typename Comparator> template<typename RandomAccessIteratorIterator, typename Comparator>
typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type typename std::iterator_traits<typename std::iterator_traits<
RandomAccessIteratorIterator>::value_type::first_type>::difference_type
prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin, prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end, RandomAccessIteratorIterator seqs_end,
Comparator comp) Comparator comp)
{ {
_GLIBCXX_CALL(seqs_end - seqs_begin) _GLIBCXX_CALL(seqs_end - seqs_begin)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>
::value_type
value_type; value_type;
typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type typedef typename std::iterator_traits<RandomAccessIterator1>
::difference_type
difference_type; difference_type;
// Last element in sequence. // Last element in sequence.
value_type* max = NULL; value_type* max = NULL;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{ {
if ((*s).first == (*s).second) if ((*s).first == (*s).second)
continue; continue;
// Last element in sequence. // Last element in sequence.
value_type& v = *((*s).second - 1); value_type& v = *((*s).second - 1);
// Strictly greater. // Strictly greater.
if (!max || comp(*max, v)) if (!max || comp(*max, v))
max = &v; max = &v;
} }
difference_type overhang_size = 0; difference_type overhang_size = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{ {
RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second, RandomAccessIterator1 split =
*max, comp); std::lower_bound((*s).first, (*s).second, *max, comp);
overhang_size += (*s).second - split; overhang_size += (*s).second - split;
// Set sentinel. // Set sentinel.
*((*s).second) = *max; *((*s).second) = *max;
} }
// So many elements will be left over afterwards. // So many elements will be left over afterwards.
return overhang_size; return overhang_size;
} }
/** @brief Highly efficient 3-way merging procedure. /** @brief Highly efficient 3-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator out output sequence. * @param target Begin iterator out output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Unused, stable anyway. * @param stable Unused, stable anyway.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_3_variant(
RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length); _GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -405,25 +432,25 @@ namespace __gnu_parallel ...@@ -405,25 +432,25 @@ namespace __gnu_parallel
if (seq0 <= seq1) if (seq0 <= seq1)
{ {
if (seq1 <= seq2) if (seq1 <= seq2)
goto s012; goto s012;
else else
if (seq2 < seq0) if (seq2 < seq0)
goto s201; goto s201;
else else
goto s021; goto s021;
} }
else else
{ {
if (seq1 <= seq2) if (seq1 <= seq2)
{ {
if (seq0 <= seq2) if (seq0 <= seq2)
goto s102; goto s102;
else else
goto s120; goto s120;
} }
else else
goto s210; goto s210;
} }
#define Merge3Case(a,b,c,c0,c1) \ #define Merge3Case(a,b,c,c0,c1) \
...@@ -456,14 +483,23 @@ namespace __gnu_parallel ...@@ -456,14 +483,23 @@ namespace __gnu_parallel
return target; return target;
} }
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length); _GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -472,7 +508,8 @@ namespace __gnu_parallel ...@@ -472,7 +508,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end; RandomAccessIterator3 target_end;
// Stable anyway. // Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0; difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
...@@ -480,16 +517,17 @@ namespace __gnu_parallel ...@@ -480,16 +517,17 @@ namespace __gnu_parallel
if (overhang != -1) if (overhang != -1)
{ {
difference_type unguarded_length = std::min(length, total_length - overhang); difference_type unguarded_length =
target_end = multiway_merge_3_variant<unguarded_iterator> std::min(length, total_length - overhang);
(seqs_begin, seqs_end, target, comp, unguarded_length, stable); target_end = multiway_merge_3_variant<unguarded_iterator>
overhang = length - unguarded_length; (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
} }
else else
{ {
// Empty sequence found. // Empty sequence found.
overhang = length; overhang = length;
target_end = target; target_end = target;
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
...@@ -500,23 +538,23 @@ namespace __gnu_parallel ...@@ -500,23 +538,23 @@ namespace __gnu_parallel
switch (min_seq) switch (min_seq)
{ {
case 0: case 0:
// Iterators will be advanced accordingly. // Iterators will be advanced accordingly.
target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second, target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second,
seqs_begin[2].first, seqs_begin[2].second, seqs_begin[2].first, seqs_begin[2].second,
target_end, overhang, comp); target_end, overhang, comp);
break; break;
case 1: case 1:
target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
seqs_begin[2].first, seqs_begin[2].second, seqs_begin[2].first, seqs_begin[2].second,
target_end, overhang, comp); target_end, overhang, comp);
break; break;
case 2: case 2:
target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second, target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
seqs_begin[1].first, seqs_begin[1].second, seqs_begin[1].first, seqs_begin[1].second,
target_end, overhang, comp); target_end, overhang, comp);
break; break;
default: default:
_GLIBCXX_PARALLEL_ASSERT(false); _GLIBCXX_PARALLEL_ASSERT(false);
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
...@@ -527,22 +565,31 @@ namespace __gnu_parallel ...@@ -527,22 +565,31 @@ namespace __gnu_parallel
return target_end; return target_end;
} }
/** @brief Highly efficient 4-way merging procedure. /** @brief Highly efficient 4-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Unused, stable anyway. * @param stable Unused, stable anyway.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
template<typename RAI, typename C> class iterator,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length); _GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -561,26 +608,26 @@ namespace __gnu_parallel ...@@ -561,26 +608,26 @@ namespace __gnu_parallel
if (seq0 <= seq1) if (seq0 <= seq1)
{ {
if (seq1 <= seq2) if (seq1 <= seq2)
Decision(0,1,2,3) Decision(0,1,2,3)
else else
if (seq2 < seq0) if (seq2 < seq0)
Decision(2,0,1,3) Decision(2,0,1,3)
else else
Decision(0,2,1,3) Decision(0,2,1,3)
} }
else else
{ {
if (seq1 <= seq2) if (seq1 <= seq2)
{ {
if (seq0 <= seq2) if (seq0 <= seq2)
Decision(1,0,2,3) Decision(1,0,2,3)
else else
Decision(1,2,0,3) Decision(1,2,0,3)
} }
else else
Decision(2,1,0,3) Decision(2,1,0,3)
} }
#define Merge4Case(a,b,c,d,c0,c1,c2) \ #define Merge4Case(a,b,c,d,c0,c1,c2) \
s ## a ## b ## c ## d: \ s ## a ## b ## c ## d: \
...@@ -633,14 +680,23 @@ namespace __gnu_parallel ...@@ -633,14 +680,23 @@ namespace __gnu_parallel
return target; return target;
} }
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length); _GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -649,7 +705,8 @@ namespace __gnu_parallel ...@@ -649,7 +705,8 @@ namespace __gnu_parallel
RandomAccessIterator3 target_end; RandomAccessIterator3 target_end;
// Stable anyway. // Stable anyway.
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true); difference_type overhang =
prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0; difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
...@@ -657,16 +714,17 @@ namespace __gnu_parallel ...@@ -657,16 +714,17 @@ namespace __gnu_parallel
if (overhang != -1) if (overhang != -1)
{ {
difference_type unguarded_length = std::min(length, total_length - overhang); difference_type unguarded_length =
target_end = multiway_merge_4_variant<unguarded_iterator> std::min(length, total_length - overhang);
(seqs_begin, seqs_end, target, comp, unguarded_length, stable); target_end = multiway_merge_4_variant<unguarded_iterator>
overhang = length - unguarded_length; (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
} }
else else
{ {
// Empty sequence found. // Empty sequence found.
overhang = length; overhang = length;
target_end = target; target_end = target;
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
...@@ -674,10 +732,13 @@ namespace __gnu_parallel ...@@ -674,10 +732,13 @@ namespace __gnu_parallel
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp)); _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
#endif #endif
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end); std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
one_missing(seqs_begin, seqs_end);
one_missing.erase(one_missing.begin() + min_seq); //remove one_missing.erase(one_missing.begin() + min_seq); //remove
target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable); target_end = multiway_merge_3_variant<guarded_iterator>(
one_missing.begin(), one_missing.end(),
target_end, comp, overhang, stable);
// Insert back again. // Insert back again.
one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]); one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
...@@ -692,26 +753,34 @@ namespace __gnu_parallel ...@@ -692,26 +753,34 @@ namespace __gnu_parallel
return target_end; return target_end;
} }
/** @brief Basic multi-way merging procedure. /** @brief Basic multi-way merging procedure.
* *
* The head elements are kept in a sorted array, new heads are * The head elements are kept in a sorted array, new heads are
* inserted linearly. * inserted linearly.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -719,7 +788,8 @@ namespace __gnu_parallel ...@@ -719,7 +788,8 @@ namespace __gnu_parallel
// Num remaining pieces. // Num remaining pieces.
int k = static_cast<int>(seqs_end - seqs_begin), nrp; int k = static_cast<int>(seqs_end - seqs_begin), nrp;
value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k)); value_type* pl = static_cast<value_type*>(
::operator new(sizeof(value_type) * k));
int* source = new int[k]; int* source = new int[k];
difference_type total_length = 0; difference_type total_length = 0;
...@@ -730,137 +800,138 @@ namespace __gnu_parallel ...@@ -730,137 +800,138 @@ namespace __gnu_parallel
nrp = 0; nrp = 0;
for (int pi = 0; pi < k; pi++) for (int pi = 0; pi < k; pi++)
{ {
if (STOPS(pi) != POS(pi)) if (STOPS(pi) != POS(pi))
{ {
pl[nrp] = *(POS(pi)); pl[nrp] = *(POS(pi));
source[nrp] = pi; source[nrp] = pi;
nrp++; nrp++;
total_length += LENGTH(seqs_begin[pi]); total_length += LENGTH(seqs_begin[pi]);
} }
} }
if (stable) if (stable)
{ {
for (int k = 0; k < nrp - 1; k++) for (int k = 0; k < nrp - 1; k++)
for (int pi = nrp - 1; pi > k; pi--) for (int pi = nrp - 1; pi > k; pi--)
if (comp(pl[pi], pl[pi - 1]) || if (comp(pl[pi], pl[pi - 1]) ||
(!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1])) (!comp(pl[pi - 1], pl[pi]) && source[pi] < source[pi - 1]))
{ {
std::swap(pl[pi - 1], pl[pi]); std::swap(pl[pi - 1], pl[pi]);
std::swap(source[pi - 1], source[pi]); std::swap(source[pi - 1], source[pi]);
} }
} }
else else
{ {
for (int k = 0; k < nrp - 1; k++) for (int k = 0; k < nrp - 1; k++)
for (int pi = nrp - 1; pi > k; pi--) for (int pi = nrp - 1; pi > k; pi--)
if (comp(pl[pi], pl[pi-1])) if (comp(pl[pi], pl[pi-1]))
{ {
std::swap(pl[pi-1], pl[pi]); std::swap(pl[pi-1], pl[pi]);
std::swap(source[pi-1], source[pi]); std::swap(source[pi-1], source[pi]);
} }
} }
// Iterate. // Iterate.
if (stable) if (stable)
{ {
int j; int j;
while (nrp > 0 && length > 0) while (nrp > 0 && length > 0)
{ {
if (source[0] < source[1]) if (source[0] < source[1])
{ {
// pl[0] <= pl[1] // pl[0] <= pl[1]
while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0) while ((nrp == 1 || !(comp(pl[1], pl[0]))) && length > 0)
{ {
*target = pl[0]; *target = pl[0];
++target; ++target;
++POS(source[0]); ++POS(source[0]);
length--; length--;
if (POS(source[0]) == STOPS(source[0])) if (POS(source[0]) == STOPS(source[0]))
{ {
// Move everything to the left. // Move everything to the left.
for (int s = 0; s < nrp - 1; s++) for (int s = 0; s < nrp - 1; s++)
{ {
pl[s] = pl[s + 1]; pl[s] = pl[s + 1];
source[s] = source[s + 1]; source[s] = source[s + 1];
} }
nrp--; nrp--;
break; break;
} }
else else
pl[0] = *(POS(source[0])); pl[0] = *(POS(source[0]));
} }
} }
else else
{ {
// pl[0] < pl[1] // pl[0] < pl[1]
while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0) while ((nrp == 1 || comp(pl[0], pl[1])) && length > 0)
{ {
*target = pl[0]; *target = pl[0];
++target; ++target;
++POS(source[0]); ++POS(source[0]);
length--; length--;
if (POS(source[0]) == STOPS(source[0])) if (POS(source[0]) == STOPS(source[0]))
{ {
for (int s = 0; s < nrp - 1; s++) for (int s = 0; s < nrp - 1; s++)
{ {
pl[s] = pl[s + 1]; pl[s] = pl[s + 1];
source[s] = source[s + 1]; source[s] = source[s + 1];
} }
nrp--; nrp--;
break; break;
} }
else else
pl[0] = *(POS(source[0])); pl[0] = *(POS(source[0]));
} }
} }
// Sink down. // Sink down.
j = 1; j = 1;
while ((j < nrp) && (comp(pl[j], pl[j - 1]) || while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
(!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1])))) (!comp(pl[j - 1], pl[j])
{ && (source[j] < source[j - 1]))))
std::swap(pl[j - 1], pl[j]); {
std::swap(source[j - 1], source[j]); std::swap(pl[j - 1], pl[j]);
j++; std::swap(source[j - 1], source[j]);
} j++;
} }
}
} }
else else
{ {
int j; int j;
while (nrp > 0 && length > 0) while (nrp > 0 && length > 0)
{ {
// pl[0] <= pl[1] // pl[0] <= pl[1]
while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0) while (nrp == 1 || (!comp(pl[1], pl[0])) && length > 0)
{ {
*target = pl[0]; *target = pl[0];
++target; ++target;
++POS(source[0]); ++POS(source[0]);
length--; length--;
if (POS(source[0]) == STOPS(source[0])) if (POS(source[0]) == STOPS(source[0]))
{ {
for (int s = 0; s < (nrp - 1); s++) for (int s = 0; s < (nrp - 1); s++)
{ {
pl[s] = pl[s + 1]; pl[s] = pl[s + 1];
source[s] = source[s + 1]; source[s] = source[s + 1];
} }
nrp--; nrp--;
break; break;
} }
else else
pl[0] = *(POS(source[0])); pl[0] = *(POS(source[0]));
} }
// Sink down. // Sink down.
j = 1; j = 1;
while ((j < nrp) && comp(pl[j], pl[j - 1])) while ((j < nrp) && comp(pl[j], pl[j - 1]))
{ {
std::swap(pl[j - 1], pl[j]); std::swap(pl[j - 1], pl[j]);
std::swap(source[j - 1], source[j]); std::swap(source[j - 1], source[j]);
j++; j++;
} }
} }
} }
delete[] pl; delete[] pl;
...@@ -869,26 +940,36 @@ namespace __gnu_parallel ...@@ -869,26 +940,36 @@ namespace __gnu_parallel
return target; return target;
} }
/** @brief Multi-way merging procedure for a high branching factor, /** @brief Multi-way merging procedure for a high branching factor,
* guarded case. * guarded case.
* *
* The head elements are kept in a loser tree. * The head elements are kept in a loser tree.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -941,64 +1022,73 @@ namespace __gnu_parallel ...@@ -941,64 +1022,73 @@ namespace __gnu_parallel
if (stable) if (stable)
{ {
for (difference_type i = 0; i < total_length; i++) for (difference_type i = 0; i < total_length; i++)
{ {
// Take out. // Take out.
source = lt.get_min_source(); source = lt.get_min_source();
*(target++) = *(seqs_begin[source].first++); *(target++) = *(seqs_begin[source].first++);
// Feed. // Feed.
if (seqs_begin[source].first == seqs_begin[source].second) if (seqs_begin[source].first == seqs_begin[source].second)
lt.delete_min_insert_stable(*arbitrary_element, true); lt.delete_min_insert_stable(*arbitrary_element, true);
else else
// Replace from same source. // Replace from same source.
lt.delete_min_insert_stable(*seqs_begin[source].first, false); lt.delete_min_insert_stable(*seqs_begin[source].first, false);
} }
} }
else else
{ {
for (difference_type i = 0; i < total_length; i++) for (difference_type i = 0; i < total_length; i++)
{ {
//take out //take out
source = lt.get_min_source(); source = lt.get_min_source();
*(target++) = *(seqs_begin[source].first++); *(target++) = *(seqs_begin[source].first++);
// Feed. // Feed.
if (seqs_begin[source].first == seqs_begin[source].second) if (seqs_begin[source].first == seqs_begin[source].second)
lt.delete_min_insert(*arbitrary_element, true); lt.delete_min_insert(*arbitrary_element, true);
else else
// Replace from same source. // Replace from same source.
lt.delete_min_insert(*seqs_begin[source].first, false); lt.delete_min_insert(*seqs_begin[source].first, false);
} }
} }
return target; return target;
} }
/** @brief Multi-way merging procedure for a high branching factor, /** @brief Multi-way merging procedure for a high branching factor,
* unguarded case. * unguarded case.
* *
* The head elements are kept in a loser tree. * The head elements are kept in a loser tree.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence. * @return End iterator of output sequence.
* @pre No input will run out of elements during the merge. * @pre No input will run out of elements during the merge.
*/ */
template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename LT,
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp, typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -1012,14 +1102,14 @@ namespace __gnu_parallel ...@@ -1012,14 +1102,14 @@ namespace __gnu_parallel
for (int t = 0; t < k; t++) for (int t = 0; t < k; t++)
{ {
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second); _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second);
#endif #endif
if (stable) if (stable)
lt.insert_start_stable(*seqs_begin[t].first, t, false); lt.insert_start_stable(*seqs_begin[t].first, t, false);
else else
lt.insert_start(*seqs_begin[t].first, t, false); lt.insert_start(*seqs_begin[t].first, t, false);
total_length += LENGTH(seqs_begin[t]); total_length += LENGTH(seqs_begin[t]);
} }
if (stable) if (stable)
...@@ -1038,68 +1128,84 @@ namespace __gnu_parallel ...@@ -1038,68 +1128,84 @@ namespace __gnu_parallel
if (stable) if (stable)
{ {
RandomAccessIterator3 target_end = target + length; RandomAccessIterator3 target_end = target + length;
while (target < target_end) while (target < target_end)
{ {
// Take out. // Take out.
source = lt.get_min_source(); source = lt.get_min_source();
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1))); _GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif #endif
*(target++) = *(seqs_begin[source].first++); *(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1)); _GLIBCXX_PARALLEL_ASSERT(
i++; (seqs_begin[source].first != seqs_begin[source].second)
|| (i == length - 1));
i++;
#endif #endif
// Feed. // Feed.
// Replace from same source. // Replace from same source.
lt.delete_min_insert_stable(*seqs_begin[source].first, false); lt.delete_min_insert_stable(*seqs_begin[source].first, false);
} }
} }
else else
{ {
RandomAccessIterator3 target_end = target + length; RandomAccessIterator3 target_end = target + length;
while (target < target_end) while (target < target_end)
{ {
// Take out. // Take out.
source = lt.get_min_source(); source = lt.get_min_source();
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1))) if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
printf(" %i %i %i\n", length, i, source); printf(" %i %i %i\n", length, i, source);
_GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1))); _GLIBCXX_PARALLEL_ASSERT(i == 0
|| !comp(*(seqs_begin[source].first), *(target - 1)));
#endif #endif
*(target++) = *(seqs_begin[source].first++); *(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1))) if (!((seqs_begin[source].first != seqs_begin[source].second)
printf(" %i %i %i\n", length, i, source); || (i >= length - 1)))
_GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)); printf(" %i %i %i\n", length, i, source);
i++; _GLIBCXX_PARALLEL_ASSERT(
(seqs_begin[source].first != seqs_begin[source].second)
|| (i >= length - 1));
i++;
#endif #endif
// Feed. // Feed.
// Replace from same source. // Replace from same source.
lt.delete_min_insert(*seqs_begin[source].first, false); lt.delete_min_insert(*seqs_begin[source].first, false);
} }
} }
return target; return target;
} }
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -1107,7 +1213,7 @@ namespace __gnu_parallel ...@@ -1107,7 +1213,7 @@ namespace __gnu_parallel
int min_seq; int min_seq;
RandomAccessIterator3 target_end; RandomAccessIterator3 target_end;
difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, difference_type overhang = prepare_unguarded(seqs_begin, seqs_end,
comp, min_seq, stable); comp, min_seq, stable);
difference_type total_length = 0; difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
...@@ -1115,17 +1221,18 @@ namespace __gnu_parallel ...@@ -1115,17 +1221,18 @@ namespace __gnu_parallel
if (overhang != -1) if (overhang != -1)
{ {
difference_type unguarded_length = std::min(length, total_length - overhang); difference_type unguarded_length =
target_end = multiway_merge_loser_tree_unguarded std::min(length, total_length - overhang);
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT> target_end = multiway_merge_loser_tree_unguarded
(seqs_begin, seqs_end, target, comp, unguarded_length, stable); <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
overhang = length - unguarded_length; (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
} }
else else
{ {
// Empty sequence found. // Empty sequence found.
overhang = length; overhang = length;
target_end = target; target_end = target;
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
...@@ -1145,34 +1252,43 @@ namespace __gnu_parallel ...@@ -1145,34 +1252,43 @@ namespace __gnu_parallel
return target_end; return target_end;
} }
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable) multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp,
_DifferenceTp length, bool stable)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type; typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
RandomAccessIterator1;
RandomAccessIterator3 target_end; RandomAccessIterator3 target_end;
difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp); difference_type overhang =
prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type total_length = 0; difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++) for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{ {
total_length += LENGTH(*s); total_length += LENGTH(*s);
// Sentinel spot. // Sentinel spot.
(*s).second++; (*s).second++;
} }
difference_type unguarded_length = std::min(length, total_length - overhang); difference_type unguarded_length =
std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT> <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable); (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
...@@ -1184,14 +1300,17 @@ namespace __gnu_parallel ...@@ -1184,14 +1300,17 @@ namespace __gnu_parallel
#endif #endif
// Copy rest stable. // Copy rest stable.
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++) for (RandomAccessIteratorIterator s = seqs_begin;
s != seqs_end && overhang > 0; s++)
{ {
// Restore. // Restore.
(*s).second--; (*s).second--;
difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s)); difference_type local_length =
target_end = std::copy((*s).first, (*s).first + local_length, target_end); std::min<difference_type>(overhang, LENGTH(*s));
(*s).first += local_length; target_end = std::copy((*s).first, (*s).first + local_length,
overhang -= local_length; target_end);
(*s).first += local_length;
overhang -= local_length;
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
...@@ -1203,25 +1322,35 @@ namespace __gnu_parallel ...@@ -1203,25 +1322,35 @@ namespace __gnu_parallel
return target_end; return target_end;
} }
/** @brief Sequential multi-way merging switch. /** @brief Sequential multi-way merging switch.
* *
* The decision is based on the branching factor and runtime settings. * The decision is based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @param sentinel The sequences have a sentinel element. * @param sentinel The sequences have a sentinel element.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag) multiway_merge(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
RandomAccessIterator3 target,
Comparator comp, _DifferenceTp length,
bool stable, bool sentinel,
sequential_tag)
{ {
_GLIBCXX_CALL(length) _GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type typedef typename std::iterator_traits<RandomAccessIteratorIterator>
::value_type::first_type
RandomAccessIterator1; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type; value_type;
...@@ -1234,7 +1363,8 @@ namespace __gnu_parallel ...@@ -1234,7 +1363,8 @@ namespace __gnu_parallel
RandomAccessIterator3 return_target = target; RandomAccessIterator3 return_target = target;
int k = static_cast<int>(seqs_end - seqs_begin); int k = static_cast<int>(seqs_end - seqs_begin);
Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm; Settings::MultiwayMergeAlgorithm mwma =
Settings::multiway_merge_algorithm;
if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL) if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
mwma = Settings::LOSER_TREE_COMBINED; mwma = Settings::LOSER_TREE_COMBINED;
...@@ -1242,75 +1372,126 @@ namespace __gnu_parallel ...@@ -1242,75 +1372,126 @@ namespace __gnu_parallel
switch (k) switch (k)
{ {
case 0: case 0:
break; break;
case 1: case 1:
return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target); return_target = std::copy(seqs_begin[0].first,
seqs_begin[0].first += length; seqs_begin[0].first + length,
break; target);
seqs_begin[0].first += length;
break;
case 2: case 2:
return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp); return_target = merge_advance(seqs_begin[0].first,
break; seqs_begin[0].second,
seqs_begin[1].first,
seqs_begin[1].second,
target, length, comp);
break;
case 3: case 3:
switch (mwma) switch (mwma)
{ {
case Settings::LOSER_TREE_COMBINED: case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_3_combined(seqs_begin,
break; seqs_end,
case Settings::LOSER_TREE_SENTINEL: target,
return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); comp, length, stable);
break; break;
default: case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_3_variant<unguarded_iterator>(
break; seqs_begin,
} seqs_end,
break; target,
comp, length, stable);
break;
default:
return_target = multiway_merge_3_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
case 4: case 4:
switch (mwma) switch (mwma)
{ {
case Settings::LOSER_TREE_COMBINED: case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_4_combined(
break; seqs_begin,
case Settings::LOSER_TREE_SENTINEL: seqs_end,
return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); target,
break; comp, length, stable);
default: break;
return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable); case Settings::LOSER_TREE_SENTINEL:
break; return_target = multiway_merge_4_variant<unguarded_iterator>(
} seqs_begin,
break; seqs_end,
target,
comp, length, stable);
break;
default:
return_target = multiway_merge_4_variant<guarded_iterator>(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
}
break;
default: default:
{ {
switch (mwma) switch (mwma)
{ {
case Settings::BUBBLE: case Settings::BUBBLE:
return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_bubble(
break; seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#if _GLIBCXX_LOSER_TREE_EXPLICIT #if _GLIBCXX_LOSER_TREE_EXPLICIT
case Settings::LOSER_TREE_EXPLICIT: case Settings::LOSER_TREE_EXPLICIT:
return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_loser_tree<
break; LoserTreeExplicit<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif #endif
#if _GLIBCXX_LOSER_TREE #if _GLIBCXX_LOSER_TREE
case Settings::LOSER_TREE: case Settings::LOSER_TREE:
return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_loser_tree<
break; LoserTree<value_type, Comparator> >(
seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif #endif
#if _GLIBCXX_LOSER_TREE_COMBINED #if _GLIBCXX_LOSER_TREE_COMBINED
case Settings::LOSER_TREE_COMBINED: case Settings::LOSER_TREE_COMBINED:
return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_loser_tree_combined(
break; seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif #endif
#if _GLIBCXX_LOSER_TREE_SENTINEL #if _GLIBCXX_LOSER_TREE_SENTINEL
case Settings::LOSER_TREE_SENTINEL: case Settings::LOSER_TREE_SENTINEL:
return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable); return_target = multiway_merge_loser_tree_sentinel(
break; seqs_begin,
seqs_end,
target,
comp, length, stable);
break;
#endif #endif
default: default:
// multiway_merge algorithm not implemented. // multiway_merge algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0); _GLIBCXX_PARALLEL_ASSERT(0);
break; break;
} }
} }
} }
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp)); _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
...@@ -1319,214 +1500,268 @@ namespace __gnu_parallel ...@@ -1319,214 +1500,268 @@ namespace __gnu_parallel
return return_target; return return_target;
} }
/** @brief Parallel multi-way merge routine. /** @brief Parallel multi-way merge routine.
* *
* The decision if based on the branching factor and runtime settings. * The decision if based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator out output sequence. * @param target Begin iterator out output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @param sentinel Ignored. * @param sentinel Ignored.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel) parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
{ RandomAccessIteratorIterator seqs_end,
_GLIBCXX_CALL(length) RandomAccessIterator3 target,
Comparator comp,
typedef _DifferenceTp difference_type; _DifferenceTp length, bool stable, bool sentinel)
typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
#if _GLIBCXX_ASSERTIONS
for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
_GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
#endif
// k sequences.
int k = static_cast<int>(seqs_end - seqs_begin);
difference_type total_length = 0;
for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
total_length += LENGTH(*raii);
_GLIBCXX_CALL(total_length)
if (total_length == 0 || k == 0)
return target;
thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
bool tight = (total_length == length);
// Thread t will have to merge pieces[iam][0..k - 1]
std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
pieces[s].resize(k);
difference_type num_samples = Settings::merge_oversampling * num_threads;
if (Settings::multiway_merge_splitting == Settings::SAMPLING)
{
value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
// Sample.
for (int s = 0; s < k; s++)
for (int i = 0; (difference_type)i < num_samples; i++)
{
difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
}
if (stable)
__gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
else
__gnu_sequential::sort(samples, samples + (num_samples * k), comp);
for (int slab = 0; slab < num_threads; slab++)
// For each slab / processor.
for (int seq = 0; seq < k; seq++)
{
// For each sequence.
if (slab > 0)
pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
else
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
if ((slab + 1) < num_threads)
pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
else
pieces[slab][seq].second = LENGTH(seqs_begin[seq]); //absolute ending
}
delete[] samples;
}
else
{
// (Settings::multiway_merge_splitting == Settings::EXACT).
std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
copy(seqs_begin, seqs_end, se.begin());
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
equally_split(length, num_threads, borders);
for (int s = 0; s < (num_threads - 1); s++)
{
offsets[s].resize(k);
multiseq_partition(se.begin(), se.end(), borders[s + 1],
offsets[s].begin(), comp);
// Last one also needed and available.
if (!tight)
{
offsets[num_threads - 1].resize(k);
multiseq_partition(se.begin(), se.end(),
difference_type(length),
offsets[num_threads - 1].begin(), comp);
}
}
for (int slab = 0; slab < num_threads; slab++)
{
// For each slab / processor.
for (int seq = 0; seq < k; seq++)
{
// For each sequence.
if (slab == 0)
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
else
pieces[slab][seq].first = pieces[slab - 1][seq].second;
if (!tight || slab < (num_threads - 1))
pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
else
{
// slab == num_threads - 1
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
}
}
}
delete[] offsets;
}
# pragma omp parallel num_threads(num_threads)
{ {
thread_index_t iam = omp_get_thread_num(); _GLIBCXX_CALL(length)
difference_type target_position = 0; typedef _DifferenceTp difference_type;
typedef typename std::iterator_traits<RandomAccessIteratorIterator>
for (int c = 0; c < k; c++) ::value_type::first_type
target_position += pieces[iam][c].first; RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
if (k > 2) value_type;
{
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k]; // k sequences.
int k = static_cast<int>(seqs_end - seqs_begin);
difference_type local_length = 0;
for (int s = 0; s < k; s++) difference_type total_length = 0;
{ for (RandomAccessIteratorIterator raii = seqs_begin;
chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second); raii != seqs_end; raii++)
local_length += LENGTH(chunks[s]); total_length += LENGTH(*raii);
}
_GLIBCXX_CALL(total_length)
multiway_merge(chunks, chunks + k, target + target_position, comp,
std::min(local_length, length - target_position), if (total_length == 0 || k == 0)
stable, false, sequential_tag()); return target;
delete[] chunks; bool tight = (total_length == length);
}
else if (k == 2) std::vector<std::pair<difference_type, difference_type> >* pieces;
{
RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first; thread_index_t num_threads = static_cast<thread_index_t>(
merge_advance(begin0, std::min<difference_type>(get_max_threads(), total_length));
seqs_begin[0].first + pieces[iam][0].second,
begin1, # pragma omp parallel num_threads (num_threads)
seqs_begin[1].first + pieces[iam][1].second, {
target + target_position, # pragma omp single
(pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first), {
comp); num_threads = omp_get_num_threads();
} // Thread t will have to merge pieces[iam][0..k - 1]
} pieces = new std::vector<
std::pair<difference_type, difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
pieces[s].resize(k);
difference_type num_samples =
Settings::merge_oversampling * num_threads;
if (Settings::multiway_merge_splitting == Settings::SAMPLING)
{
value_type* samples = static_cast<value_type*>(
::operator new(sizeof(value_type) * k * num_samples));
// Sample.
for (int s = 0; s < k; s++)
for (int i = 0; (difference_type)i < num_samples; i++)
{
difference_type sample_index =
static_cast<difference_type>(
LENGTH(seqs_begin[s]) * (double(i + 1) /
(num_samples + 1)) * (double(length)
/ total_length));
samples[s * num_samples + i] =
seqs_begin[s].first[sample_index];
}
if (stable)
__gnu_sequential::stable_sort(
samples, samples + (num_samples * k), comp);
else
__gnu_sequential::sort(
samples, samples + (num_samples * k), comp);
for (int slab = 0; slab < num_threads; slab++)
// For each slab / processor.
for (int seq = 0; seq < k; seq++)
{
// For each sequence.
if (slab > 0)
pieces[slab][seq].first =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * slab / num_threads],
comp)
- seqs_begin[seq].first;
else
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
if ((slab + 1) < num_threads)
pieces[slab][seq].second =
std::upper_bound(
seqs_begin[seq].first,
seqs_begin[seq].second,
samples[num_samples * k * (slab + 1) /
num_threads], comp)
- seqs_begin[seq].first;
else
pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
}
delete[] samples;
}
else
{
// (Settings::multiway_merge_splitting == Settings::EXACT).
std::vector<RandomAccessIterator1>* offsets =
new std::vector<RandomAccessIterator1>[num_threads];
std::vector<
std::pair<RandomAccessIterator1, RandomAccessIterator1>
> se(k);
copy(seqs_begin, seqs_end, se.begin());
difference_type* borders =
new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
for (int s = 0; s < (num_threads - 1); s++)
{
offsets[s].resize(k);
multiseq_partition(
se.begin(), se.end(), borders[s + 1],
offsets[s].begin(), comp);
// Last one also needed and available.
if (!tight)
{
offsets[num_threads - 1].resize(k);
multiseq_partition(se.begin(), se.end(),
difference_type(length),
offsets[num_threads - 1].begin(), comp);
}
}
for (int slab = 0; slab < num_threads; slab++)
{
// For each slab / processor.
for (int seq = 0; seq < k; seq++)
{
// For each sequence.
if (slab == 0)
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
else
pieces[slab][seq].first =
pieces[slab - 1][seq].second;
if (!tight || slab < (num_threads - 1))
pieces[slab][seq].second =
offsets[slab][seq] - seqs_begin[seq].first;
else
{
// slab == num_threads - 1
pieces[slab][seq].second =
LENGTH(seqs_begin[seq]);
}
}
}
delete[] offsets;
}
} //single
thread_index_t iam = omp_get_thread_num();
difference_type target_position = 0;
for (int c = 0; c < k; c++)
target_position += pieces[iam][c].first;
if (k > 2)
{
std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
= new
std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
difference_type local_length = 0;
for (int s = 0; s < k; s++)
{
chunks[s] = std::make_pair(
seqs_begin[s].first + pieces[iam][s].first,
seqs_begin[s].first + pieces[iam][s].second);
local_length += LENGTH(chunks[s]);
}
multiway_merge(
chunks, chunks + k, target + target_position, comp,
std::min(local_length, length - target_position),
stable, false, sequential_tag());
delete[] chunks;
}
else if (k == 2)
{
RandomAccessIterator1
begin0 = seqs_begin[0].first + pieces[iam][0].first,
begin1 = seqs_begin[1].first + pieces[iam][1].first;
merge_advance(begin0,
seqs_begin[0].first + pieces[iam][0].second,
begin1,
seqs_begin[1].first + pieces[iam][1].second,
target + target_position,
(pieces[iam][0].second - pieces[iam][0].first) +
(pieces[iam][1].second - pieces[iam][1].first),
comp);
}
} //parallel
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp)); _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
#endif #endif
// Update ends of sequences. // Update ends of sequences.
for (int s = 0; s < k; s++) for (int s = 0; s < k; s++)
seqs_begin[s].first += pieces[num_threads - 1][s].second; seqs_begin[s].first += pieces[num_threads - 1][s].second;
delete[] pieces; delete[] pieces;
return target + length; return target + length;
} }
/** /**
* @brief Multi-way merging front-end. * @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge(RandomAccessIteratorPairIterator seqs_begin, multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end, RandomAccessIteratorPairIterator seqs_end,
RandomAccessIterator3 target, Comparator comp, RandomAccessIterator3 target, Comparator comp,
_DifferenceTp length, bool stable) _DifferenceTp length, bool stable)
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
_GLIBCXX_CALL(seqs_end - seqs_begin) _GLIBCXX_CALL(seqs_end - seqs_begin)
...@@ -1535,33 +1770,43 @@ namespace __gnu_parallel ...@@ -1535,33 +1770,43 @@ namespace __gnu_parallel
return target; return target;
RandomAccessIterator3 target_end; RandomAccessIterator3 target_end;
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) if (_GLIBCXX_PARALLEL_CONDITION(
target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false); ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
target_end = parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, false);
else else
target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag()); target_end = multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, false, sequential_tag());
return target_end; return target_end;
} }
/** @brief Multi-way merging front-end. /** @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence. * @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence. * @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator of output sequence. * @param target Begin iterator of output sequence.
* @param comp Comparator. * @param comp Comparator.
* @param length Maximum length to merge. * @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty. * @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence. * @return End iterator of output sequence.
* @pre For each @c i, @c seqs_begin[i].second must be the end * @pre For each @c i, @c seqs_begin[i].second must be the end
* marker of the sequence, but must also reference one more sentinel * marker of the sequence, but must also reference one more sentinel
* element. */ * element. */
template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator> template<
typename RandomAccessIteratorPairIterator,
typename RandomAccessIterator3,
typename _DifferenceTp,
typename Comparator>
RandomAccessIterator3 RandomAccessIterator3
multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin, multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end, RandomAccessIteratorPairIterator seqs_end,
RandomAccessIterator3 target, RandomAccessIterator3 target,
Comparator comp, Comparator comp,
_DifferenceTp length, _DifferenceTp length,
bool stable) bool stable)
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
...@@ -1570,10 +1815,16 @@ namespace __gnu_parallel ...@@ -1570,10 +1815,16 @@ namespace __gnu_parallel
_GLIBCXX_CALL(seqs_end - seqs_begin) _GLIBCXX_CALL(seqs_end - seqs_begin)
if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n))) if (_GLIBCXX_PARALLEL_CONDITION(
return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true); ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
&& ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
return parallel_multiway_merge(
seqs_begin, seqs_end,
target, comp, static_cast<difference_type>(length), stable, true);
else else
return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag()); return multiway_merge(
seqs_begin, seqs_end,
target, comp, length, stable, true, sequential_tag());
} }
} }
......
...@@ -48,8 +48,8 @@ ...@@ -48,8 +48,8 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Subsequence description. */ /** @brief Subsequence description. */
template<typename _DifferenceTp> template<typename _DifferenceTp>
struct Piece struct Piece
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
...@@ -61,16 +61,19 @@ namespace __gnu_parallel ...@@ -61,16 +61,19 @@ namespace __gnu_parallel
difference_type end; difference_type end;
}; };
/** @brief Data accessed by all threads. /** @brief Data accessed by all threads.
* *
* PMWMS = parallel multiway mergesort */ * PMWMS = parallel multiway mergesort */
template<typename RandomAccessIterator> template<typename RandomAccessIterator>
struct PMWMSSortingData struct PMWMSSortingData
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
/** @brief Number of threads involved. */
thread_index_t num_threads;
/** @brief Input begin. */ /** @brief Input begin. */
RandomAccessIterator source; RandomAccessIterator source;
...@@ -105,62 +108,55 @@ namespace __gnu_parallel ...@@ -105,62 +108,55 @@ namespace __gnu_parallel
/** @brief Pieces of data to merge @c [thread][sequence] */ /** @brief Pieces of data to merge @c [thread][sequence] */
std::vector<Piece<difference_type> >* pieces; std::vector<Piece<difference_type> >* pieces;
};
/** @brief Thread local data for PMWMS. */
template<typename RandomAccessIterator>
struct PMWMSSorterPU
{
/** @brief Total number of thread involved. */
thread_index_t num_threads;
/** @brief Number of owning thread. */
thread_index_t iam;
/** @brief Stable sorting desired. */ /** @brief Stable sorting desired. */
bool stable; bool stable;
/** @brief Pointer to global data. */ };
PMWMSSortingData<RandomAccessIterator>* sd;
}; /**
* @brief Select samples from a sequence.
/** * @param sd Pointer to algorithm data. Result will be placed in
* @brief Select samples from a sequence. * @c sd->samples.
* @param d Pointer to thread-local data. Result will be placed in * @param num_samples Number of samples to select.
* @c d->ds->samples. */
* @param num_samples Number of samples to select. template<typename RandomAccessIterator, typename _DifferenceTp>
*/
template<typename RandomAccessIterator, typename _DifferenceTp>
inline void inline void
determine_samples(PMWMSSorterPU<RandomAccessIterator>* d, determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
_DifferenceTp& num_samples) _DifferenceTp& num_samples)
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd; thread_index_t iam = omp_get_thread_num();
num_samples = Settings::sort_mwms_oversampling * d->num_threads - 1; num_samples =
Settings::sort_mwms_oversampling * sd->num_threads - 1;
difference_type* es = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_samples + 2))); difference_type* es = new difference_type[num_samples + 2];
equally_split(sd->starts[d->iam + 1] - sd->starts[d->iam], num_samples + 1, es); equally_split(sd->starts[iam + 1] - sd->starts[iam],
num_samples + 1, es);
for (difference_type i = 0; i < num_samples; i++) for (difference_type i = 0; i < num_samples; i++)
sd->samples[d->iam * num_samples + i] = sd->source[sd->starts[d->iam] + es[i + 1]]; sd->samples[iam * num_samples + i] =
sd->source[sd->starts[iam] + es[i + 1]];
delete[] es;
} }
/** @brief PMWMS code executed by each thread. /** @brief PMWMS code executed by each thread.
* @param d Pointer to thread-local data. * @param sd Pointer to algorithm data.
* @param comp Comparator. * @param comp Comparator.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_mwms_pu(PMWMSSorterPU<RandomAccessIterator>* d, parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
Comparator& comp) Comparator& comp)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
PMWMSSortingData<RandomAccessIterator>* sd = d->sd; thread_index_t iam = omp_get_thread_num();
thread_index_t iam = d->iam;
// Length of this thread's chunk, before merging. // Length of this thread's chunk, before merging.
difference_type length_local = sd->starts[iam + 1] - sd->starts[iam]; difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
...@@ -174,161 +170,168 @@ namespace __gnu_parallel ...@@ -174,161 +170,168 @@ namespace __gnu_parallel
typedef value_type* SortingPlacesIterator; typedef value_type* SortingPlacesIterator;
// Sort in temporary storage, leave space for sentinel. // Sort in temporary storage, leave space for sentinel.
sd->sorting_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * (length_local + 1))); sd->sorting_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * (length_local + 1)));
// Copy there. // Copy there.
std::uninitialized_copy(sd->source + sd->starts[iam], sd->source + sd->starts[iam] + length_local, sd->sorting_places[iam]); std::uninitialized_copy(sd->source + sd->starts[iam],
sd->source + sd->starts[iam] + length_local,
sd->sorting_places[iam]);
#endif #endif
// Sort locally. // Sort locally.
if (d->stable) if (sd->stable)
__gnu_sequential::stable_sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp); __gnu_sequential::stable_sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
comp);
else else
__gnu_sequential::sort(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp); __gnu_sequential::sort(sd->sorting_places[iam],
sd->sorting_places[iam] + length_local,
#if _GLIBCXX_ASSERTIONS comp);
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->sorting_places[iam], sd->sorting_places[iam] + length_local, comp));
#endif
// Invariant: locally sorted subsequence in sd->sorting_places[iam], // Invariant: locally sorted subsequence in sd->sorting_places[iam],
// sd->sorting_places[iam] + length_local. // sd->sorting_places[iam] + length_local.
if (Settings::sort_splitting == Settings::SAMPLING) if (Settings::sort_splitting == Settings::SAMPLING)
{ {
difference_type num_samples; difference_type num_samples;
determine_samples(d, num_samples); determine_samples(sd, num_samples);
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
__gnu_sequential::sort(sd->samples, __gnu_sequential::sort(sd->samples,
sd->samples + (num_samples * d->num_threads), sd->samples + (num_samples * sd->num_threads),
comp); comp);
#pragma omp barrier # pragma omp barrier
for (int s = 0; s < d->num_threads; s++) for (int s = 0; s < sd->num_threads; s++)
{ {
// For each sequence. // For each sequence.
if (num_samples * iam > 0) if (num_samples * iam > 0)
sd->pieces[iam][s].begin = std::lower_bound(sd->sorting_places[s], sd->pieces[iam][s].begin =
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], std::lower_bound(sd->sorting_places[s],
sd->samples[num_samples * iam], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
comp) sd->samples[num_samples * iam],
- sd->sorting_places[s]; comp)
else - sd->sorting_places[s];
// Absolute beginning. else
sd->pieces[iam][s].begin = 0; // Absolute beginning.
sd->pieces[iam][s].begin = 0;
if ((num_samples * (iam + 1)) < (num_samples * d->num_threads))
sd->pieces[iam][s].end = std::lower_bound(sd->sorting_places[s], if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s], sd->samples[num_samples * (iam + 1)], comp) sd->pieces[iam][s].end =
- sd->sorting_places[s]; std::lower_bound(sd->sorting_places[s],
else sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s],
// Absolute end. sd->samples[num_samples * (iam + 1)], comp)
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s]; - sd->sorting_places[s];
} else
// Absolute end.
sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
}
} }
else if (Settings::sort_splitting == Settings::EXACT) else if (Settings::sort_splitting == Settings::EXACT)
{ {
#pragma omp barrier # pragma omp barrier
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads); std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
for (int s = 0; s < d->num_threads; s++) seqs(sd->num_threads);
seqs[s] = std::make_pair(sd->sorting_places[s], sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]); for (int s = 0; s < sd->num_threads; s++)
seqs[s] = std::make_pair(sd->sorting_places[s],
std::vector<SortingPlacesIterator> offsets(d->num_threads); sd->sorting_places[s] + sd->starts[s + 1] - sd->starts[s]);
// If not last thread. std::vector<SortingPlacesIterator> offsets(sd->num_threads);
if (iam < d->num_threads - 1)
multiseq_partition(seqs.begin(), seqs.end(), sd->starts[iam + 1], offsets.begin(), comp); // if not last thread
if (iam < sd->num_threads - 1)
for (int seq = 0; seq < d->num_threads; seq++) multiseq_partition(seqs.begin(), seqs.end(),
{ sd->starts[iam + 1], offsets.begin(), comp);
// For each sequence.
if (iam < (d->num_threads - 1)) for (int seq = 0; seq < sd->num_threads; seq++)
sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first; {
else // for each sequence
// Absolute end of this sequence. if (iam < (sd->num_threads - 1))
sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq]; sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
} else
// very end of this sequence
#pragma omp barrier sd->pieces[iam][seq].end = sd->starts[seq + 1] - sd->starts[seq];
}
for (int seq = 0; seq < d->num_threads; seq++)
{ # pragma omp barrier
// For each sequence.
if (iam > 0) for (int seq = 0; seq < sd->num_threads; seq++)
sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end; {
else // For each sequence.
// Absolute beginning. if (iam > 0)
sd->pieces[iam][seq].begin = 0; sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
} else
// Absolute beginning.
sd->pieces[iam][seq].begin = 0;
}
} }
// Offset from target begin, length after merging. // Offset from target begin, length after merging.
difference_type offset = 0, length_am = 0; difference_type offset = 0, length_am = 0;
for (int s = 0; s < d->num_threads; s++) for (int s = 0; s < sd->num_threads; s++)
{ {
length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin; length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
offset += sd->pieces[iam][s].begin; offset += sd->pieces[iam][s].begin;
} }
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Merge to temporary storage, uninitialized creation not possible // Merge to temporary storage, uninitialized creation not possible
// since there is no multiway_merge calling the placement new // since there is no multiway_merge calling the placement new
// instead of the assignment operator. // instead of the assignment operator.
sd->merging_places[iam] = sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * length_am)); sd->merging_places[iam] = sd->temporaries[iam] =
static_cast<value_type*>(
::operator new(sizeof(value_type) * length_am));
#else #else
// Merge directly to target. // Merge directly to target.
sd->merging_places[iam] = sd->source + offset; sd->merging_places[iam] = sd->source + offset;
#endif #endif
std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> > seqs(d->num_threads); std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
seqs(sd->num_threads);
for (int s = 0; s < d->num_threads; s++) for (int s = 0; s < sd->num_threads; s++)
{ {
seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin, sd->sorting_places[s] + sd->pieces[iam][s].end); seqs[s] = std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
sd->sorting_places[s] + sd->pieces[iam][s].end);
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(seqs[s].first, seqs[s].second, comp));
#endif
} }
multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, d->stable, false, sequential_tag()); multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp, length_am, sd->stable, false, sequential_tag());
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(sd->merging_places[iam], sd->merging_places[iam] + length_am, comp));
#endif
# pragma omp barrier # pragma omp barrier
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
// Write back. // Write back.
std::copy(sd->merging_places[iam], sd->merging_places[iam] + length_am, std::copy(sd->merging_places[iam],
sd->source + offset); sd->merging_places[iam] + length_am,
sd->source + offset);
#endif #endif
delete[] sd->temporaries[iam]; delete[] sd->temporaries[iam];
} }
/** @brief PMWMS main call. /** @brief PMWMS main call.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param comp Comparator. * @param comp Comparator.
* @param n Length of sequence. * @param n Length of sequence.
* @param num_threads Number of threads to use. * @param num_threads Number of threads to use.
* @param stable Stable sorting. * @param stable Stable sorting.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end, parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
Comparator comp, Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads, bool stable) int num_threads,
bool stable)
{ {
_GLIBCXX_CALL(n) _GLIBCXX_CALL(n)
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
...@@ -336,75 +339,75 @@ namespace __gnu_parallel ...@@ -336,75 +339,75 @@ namespace __gnu_parallel
if (n <= 1) if (n <= 1)
return; return;
// At least one element per thread. // at least one element per thread
if (num_threads > n) if (num_threads > n)
num_threads = static_cast<thread_index_t>(n); num_threads = static_cast<thread_index_t>(n);
// shared variables
PMWMSSortingData<RandomAccessIterator> sd; PMWMSSortingData<RandomAccessIterator> sd;
difference_type* starts;
sd.source = begin; # pragma omp parallel num_threads(num_threads)
sd.temporaries = new value_type*[num_threads]; {
num_threads = omp_get_num_threads(); //no more threads than requested
# pragma omp single
{
sd.num_threads = num_threads;
sd.source = begin;
sd.temporaries = new value_type*[num_threads];
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST #if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
sd.sorting_places = new RandomAccessIterator[num_threads]; sd.sorting_places = new RandomAccessIterator[num_threads];
sd.merging_places = new value_type*[num_threads]; sd.merging_places = new value_type*[num_threads];
#else #else
sd.sorting_places = new value_type*[num_threads]; sd.sorting_places = new value_type*[num_threads];
sd.merging_places = new RandomAccessIterator[num_threads]; sd.merging_places = new RandomAccessIterator[num_threads];
#endif #endif
if (Settings::sort_splitting == Settings::SAMPLING) if (Settings::sort_splitting == Settings::SAMPLING)
{ {
unsigned int sz = Settings::sort_mwms_oversampling * num_threads - 1; unsigned int size =
sz *= num_threads; (Settings::sort_mwms_oversampling * num_threads - 1) * num_threads;
sd.samples = static_cast<value_type*>(
// Equivalent to value_type[sz], without need of default construction. ::operator new(size * sizeof(value_type)));
sz *= sizeof(value_type); }
sd.samples = static_cast<value_type*>(::operator new(sz)); else
} sd.samples = NULL;
else
sd.samples = NULL; sd.offsets = new difference_type[num_threads - 1];
sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
sd.offsets = new difference_type[num_threads - 1]; for (int s = 0; s < num_threads; s++)
sd.pieces = new std::vector<Piece<difference_type> >[num_threads]; sd.pieces[s].resize(num_threads);
for (int s = 0; s < num_threads; s++) starts = sd.starts = new difference_type[num_threads + 1];
sd.pieces[s].resize(num_threads); sd.stable = stable;
PMWMSSorterPU<RandomAccessIterator>* pus = new PMWMSSorterPU<RandomAccessIterator>[num_threads];
difference_type* starts = sd.starts = new difference_type[num_threads + 1]; difference_type chunk_length = n / num_threads;
difference_type split = n % num_threads;
difference_type chunk_length = n / num_threads; difference_type pos = 0;
difference_type split = n % num_threads; for (int i = 0; i < num_threads; i++)
difference_type start = 0; {
for (int i = 0; i < num_threads; i++) starts[i] = pos;
{ pos += (i < split) ? (chunk_length + 1) : chunk_length;
starts[i] = start; }
start += (i < split) ? (chunk_length + 1) : chunk_length; starts[num_threads] = pos;
pus[i].num_threads = num_threads; }
pus[i].iam = i;
pus[i].sd = &sd; // Now sort in parallel.
pus[i].stable = stable; parallel_sort_mwms_pu(&sd, comp);
} } //parallel
starts[num_threads] = start;
// Now sort in parallel.
#pragma omp parallel num_threads(num_threads)
parallel_sort_mwms_pu(&(pus[omp_get_thread_num()]), comp);
// XXX sd as RAII
delete[] starts; delete[] starts;
delete[] sd.temporaries; delete[] sd.temporaries;
delete[] sd.sorting_places; delete[] sd.sorting_places;
delete[] sd.merging_places; delete[] sd.merging_places;
if (Settings::sort_splitting == Settings::SAMPLING) if (Settings::sort_splitting == Settings::SAMPLING)
delete[] sd.samples; delete[] sd.samples;
delete[] sd.offsets; delete[] sd.offsets;
delete[] sd.pieces; delete[] sd.pieces;
delete[] pus;
} }
} //namespace __gnu_parallel
}
#endif #endif
...@@ -43,54 +43,71 @@ ...@@ -43,54 +43,71 @@
#include <parallel/settings.h> #include <parallel/settings.h>
#include <parallel/basic_iterator.h> #include <parallel/basic_iterator.h>
#include <parallel/base.h>
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Embarrassingly parallel algorithm for random access /** @brief Embarrassingly parallel algorithm for random access
* iterators, using an OpenMP for loop. * iterators, using an OpenMP for loop.
* *
* @param begin Begin iterator of element sequence. * @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence. * @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding * @param o User-supplied functor (comparator, predicate, adding
* functor, etc.). * functor, etc.).
* @param f Functor to "process" an element with op (depends on * @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...). * desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already * @param r Functor to "add" a single result to the already
* processed elements (depends on functionality). * processed elements (depends on functionality).
* @param base Base value for reduction. * @param base Base value for reduction.
* @param output Pointer to position where final result is written to * @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for * @param bound Maximum number of elements processed (e. g. for
* std::count_n()). * std::count_n()).
* @return User-supplied functor (that may contain a part of the result). * @return User-supplied functor (that may contain a part of the result).
*/ */
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op Op
for_each_template_random_access_omp_loop(RandomAccessIterator begin, RandomAccessIterator end, Op o, Fu& f, Red r, Result base, Result& output, typename std::iterator_traits<RandomAccessIterator>::difference_type bound) for_each_template_random_access_omp_loop(
RandomAccessIterator begin,
RandomAccessIterator end,
Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{ {
typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type; typedef typename
std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : static_cast<thread_index_t>((end - begin));
Result *thread_results = new Result[num_threads];
difference_type length = end - begin; difference_type length = end - begin;
thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++) Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{ {
thread_results[i] = r(thread_results[i], f(o, begin+i)); # pragma omp single
} {
num_threads = omp_get_num_threads();
#pragma omp parallel num_threads(num_threads) thread_results = new Result[num_threads];
{
#pragma omp for schedule(dynamic, Settings::workstealing_chunk_size) for (thread_index_t i = 0; i < num_threads; i++)
for (difference_type pos = 0; pos < length; pos++) thread_results[i] = Result();
{ }
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
} thread_index_t iam = omp_get_thread_num();
}
# pragma omp for schedule(dynamic, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++) for (thread_index_t i = 0; i < num_threads; i++)
{ output = r(output, thread_results[i]);
output = r(output, thread_results[i]);
}
delete [] thread_results; delete [] thread_results;
...@@ -100,6 +117,7 @@ namespace __gnu_parallel ...@@ -100,6 +117,7 @@ namespace __gnu_parallel
return o; return o;
} }
} // end namespace } // end namespace
#endif #endif
...@@ -64,39 +64,50 @@ namespace __gnu_parallel ...@@ -64,39 +64,50 @@ namespace __gnu_parallel
* std::count_n()). * std::count_n()).
* @return User-supplied functor (that may contain a part of the result). * @return User-supplied functor (that may contain a part of the result).
*/ */
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> template<typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op Op
for_each_template_random_access_omp_loop_static(RandomAccessIterator begin, for_each_template_random_access_omp_loop_static(
RandomAccessIterator end, RandomAccessIterator begin,
Op o, Fu& f, Red r, RandomAccessIterator end,
Result base, Result& output, Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound) typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef typename
typedef typename traits_type::difference_type difference_type; std::iterator_traits<RandomAccessIterator>::difference_type
difference_type;
thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
Result *thread_results = new Result[num_threads];
difference_type length = end - begin; difference_type length = end - begin;
thread_index_t num_threads =
std::min<difference_type>(get_max_threads(), length);
for (thread_index_t i = 0; i < num_threads; i++) Result *thread_results;
# pragma omp parallel num_threads(num_threads)
{ {
thread_results[i] = r(thread_results[i], f(o, begin+i)); # pragma omp single
} {
num_threads = omp_get_num_threads();
#pragma omp parallel num_threads(num_threads) thread_results = new Result[num_threads];
{
#pragma omp for schedule(static, Settings::workstealing_chunk_size) for (thread_index_t i = 0; i < num_threads; i++)
for (difference_type pos = 0; pos < length; pos++) thread_results[i] = Result();
{ }
thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
} thread_index_t iam = omp_get_thread_num();
}
# pragma omp for schedule(static, Settings::workstealing_chunk_size)
for (difference_type pos = 0; pos < length; pos++)
thread_results[iam] =
r(thread_results[iam], f(o, begin+pos));
} //parallel
for (thread_index_t i = 0; i < num_threads; i++) for (thread_index_t i = 0; i < num_threads; i++)
{ output = r(output, thread_results[i]);
output = r(output, thread_results[i]);
}
delete [] thread_results; delete [] thread_results;
...@@ -106,6 +117,7 @@ namespace __gnu_parallel ...@@ -106,6 +117,7 @@ namespace __gnu_parallel
return o; return o;
} }
} // end namespace } // end namespace
#endif #endif
...@@ -41,69 +41,80 @@ ...@@ -41,69 +41,80 @@
#include <omp.h> #include <omp.h>
#include <parallel/settings.h> #include <parallel/settings.h>
#include <parallel/base.h>
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Embarrassingly parallel algorithm for random access /** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting * iterators, using hand-crafted parallelization by equal splitting
* the work. * the work.
* *
* @param begin Begin iterator of element sequence. * @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence. * @param end End iterator of element sequence.
* @param o User-supplied functor (comparator, predicate, adding * @param o User-supplied functor (comparator, predicate, adding
* functor, ...) * functor, ...)
* @param f Functor to "process" an element with op (depends on * @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...). * desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already * @param r Functor to "add" a single result to the already
* processed elements (depends on functionality). * processed elements (depends on functionality).
* @param base Base value for reduction. * @param base Base value for reduction.
* @param output Pointer to position where final result is written to * @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for * @param bound Maximum number of elements processed (e. g. for
* std::count_n()). * std::count_n()).
* @return User-supplied functor (that may contain a part of the result). * @return User-supplied functor (that may contain a part of the result).
*/ */
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op Op
for_each_template_random_access_ed(RandomAccessIterator begin, for_each_template_random_access_ed(
RandomAccessIterator end, Op o, Fu& f, RandomAccessIterator begin,
Red r, Result base, Result& output, RandomAccessIterator end,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound) Op o, Fu& f, Red r, Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::
difference_type bound)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin; const difference_type length = end - begin;
const difference_type settings_threads = static_cast<difference_type>(get_max_threads()); Result *thread_results;
const difference_type dmin = settings_threads < length ? settings_threads : length;
const difference_type dmax = dmin > 1 ? dmin : 1;
thread_index_t num_threads = static_cast<thread_index_t>(dmax); thread_index_t num_threads =
__gnu_parallel::min<difference_type>(get_max_threads(), length);
# pragma omp parallel num_threads(num_threads)
{
# pragma omp single
{
num_threads = omp_get_num_threads();
thread_results = new Result[num_threads];
}
Result *thread_results = new Result[num_threads]; thread_index_t iam = omp_get_thread_num();
#pragma omp parallel num_threads(num_threads) // Neutral element.
{ Result reduct = Result();
// Neutral element.
Result reduct = Result();
thread_index_t p = num_threads; difference_type
thread_index_t iam = omp_get_thread_num(); start = equally_split_point(length, num_threads, iam),
difference_type start = iam * length / p; stop = equally_split_point(length, num_threads, iam + 1);
difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
if (start < limit) if (start < stop)
{ {
reduct = f(o, begin + start); reduct = f(o, begin + start);
start++; ++start;
} }
for (; start < limit; start++) for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start)); reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct; thread_results[iam] = reduct;
} } //parallel
for (thread_index_t i = 0; i < num_threads; i++) for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]); output = r(output, thread_results[i]);
......
...@@ -48,130 +48,156 @@ namespace __gnu_parallel ...@@ -48,130 +48,156 @@ namespace __gnu_parallel
{ {
// Problem: there is no 0-element given. // Problem: there is no 0-element given.
/** @brief Base case prefix sum routine.
 *
 *  Walks the input range sequentially.  For every input element the
 *  running value is updated to bin_op(value, element) and that updated
 *  value is written to the output sequence.
 *
 *  @param begin Begin iterator of input sequence.
 *  @param end End iterator of input sequence.
 *  @param result Begin iterator of output sequence.
 *  @param bin_op Associative binary function.
 *  @param value Start value.  Must be passed since the neutral
 *  element is unknown in general.
 *  @return End iterator of output sequence (equals @c result unchanged
 *  when the input range is empty). */
template<
    typename InputIterator,
    typename OutputIterator,
    typename BinaryOperation>
  inline OutputIterator
  parallel_partial_sum_basecase(
      InputIterator begin, InputIterator end,
      OutputIterator result, BinaryOperation bin_op,
      typename std::iterator_traits<InputIterator>::value_type value)
  {
    // An empty range falls straight through the loop, so no separate
    // begin == end check is needed.
    for (; begin != end; ++begin, ++result)
      {
        value = bin_op(value, *begin);
        *result = value;
      }
    return result;
  }
/** @brief Parallel partial sum implementation, two-phase approach, /** @brief Parallel partial sum implementation, two-phase approach,
no recursion. no recursion.
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param result Begin iterator of output sequence. * @param result Begin iterator of output sequence.
* @param bin_op Associative binary function. * @param bin_op Associative binary function.
* @param n Length of sequence. * @param n Length of sequence.
* @param num_threads Number of threads to use. * @param num_threads Number of threads to use.
* @return End iterator of output sequence. * @return End iterator of output sequence.
*/ */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation> template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator OutputIterator
parallel_partial_sum_linear(InputIterator begin, InputIterator end, parallel_partial_sum_linear(
OutputIterator result, BinaryOperation bin_op, InputIterator begin, InputIterator end,
typename std::iterator_traits<InputIterator>::difference_type n, int num_threads) OutputIterator result, BinaryOperation bin_op,
typename std::iterator_traits<InputIterator>::difference_type n)
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
if (num_threads > (n - 1)) thread_index_t num_threads =
num_threads = static_cast<thread_index_t>(n - 1); std::min<difference_type>(get_max_threads(), n - 1);
if (num_threads < 2) if (num_threads < 2)
{ {
*result = *begin; *result = *begin;
return parallel_partial_sum_basecase(begin + 1, end, result + 1, bin_op, *begin); return parallel_partial_sum_basecase(
begin + 1, end, result + 1, bin_op, *begin);
} }
difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 2))); difference_type* borders;
value_type* sums;
if (Settings::partial_sum_dilatation == 1.0f) # pragma omp parallel num_threads(num_threads)
equally_split(n, num_threads + 1, borders);
else
{ {
difference_type chunk_length = (int)((double)n / ((double)num_threads + Settings::partial_sum_dilatation)), borderstart = n - num_threads * chunk_length; # pragma omp single
borders[0] = 0; {
for (int i = 1; i < (num_threads + 1); i++) num_threads = omp_get_num_threads();
{
borders[i] = borderstart; borders = new difference_type[num_threads + 2];
borderstart += chunk_length;
} if (Settings::partial_sum_dilatation == 1.0f)
borders[num_threads + 1] = n; equally_split(n, num_threads + 1, borders);
} else
{
value_type* sums = static_cast<value_type*>(::operator new(sizeof(value_type) * num_threads)); difference_type chunk_length =
OutputIterator target_end; ((double)n /
((double)num_threads + Settings::partial_sum_dilatation)),
#pragma omp parallel num_threads(num_threads) borderstart = n - num_threads * chunk_length;
{ borders[0] = 0;
int id = omp_get_thread_num(); for (int i = 1; i < (num_threads + 1); i++)
if (id == 0) {
{ borders[i] = borderstart;
*result = *begin; borderstart += chunk_length;
parallel_partial_sum_basecase(begin + 1, begin + borders[1], }
result + 1, bin_op, *begin); borders[num_threads + 1] = n;
sums[0] = *(result + borders[1] - 1); }
}
else sums = static_cast<value_type*>(
{ ::operator new(sizeof(value_type) * num_threads));
sums[id] = std::accumulate(begin + borders[id] + 1, OutputIterator target_end;
begin + borders[id + 1], } //single
*(begin + borders[id]),
bin_op, __gnu_parallel::sequential_tag()); int iam = omp_get_thread_num();
} if (iam == 0)
{
#pragma omp barrier *result = *begin;
parallel_partial_sum_basecase(begin + 1, begin + borders[1],
#pragma omp single result + 1, bin_op, *begin);
parallel_partial_sum_basecase(sums + 1, sums + num_threads, sums + 1, sums[0] = *(result + borders[1] - 1);
bin_op, sums[0]); }
else
#pragma omp barrier {
sums[iam] = std::accumulate(begin + borders[iam] + 1,
// Still same team. begin + borders[iam + 1],
parallel_partial_sum_basecase(begin + borders[id + 1], *(begin + borders[iam]),
begin + borders[id + 2], bin_op, __gnu_parallel::sequential_tag());
result + borders[id + 1], bin_op, }
sums[id]);
} # pragma omp barrier
delete [] sums; # pragma omp single
parallel_partial_sum_basecase(
sums + 1, sums + num_threads, sums + 1, bin_op, sums[0]);
# pragma omp barrier
// Still same team.
parallel_partial_sum_basecase(begin + borders[iam + 1],
begin + borders[iam + 2],
result + borders[iam + 1], bin_op,
sums[iam]);
} //parallel
delete[] sums;
delete[] borders;
return result + n; return result + n;
} }
/** @brief Parallel partial sum front-end. /** @brief Parallel partial sum front-end.
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param result Begin iterator of output sequence. * @param result Begin iterator of output sequence.
* @param bin_op Associative binary function. * @param bin_op Associative binary function.
* @return End iterator of output sequence. */ * @return End iterator of output sequence. */
template<typename InputIterator, typename OutputIterator, typename BinaryOperation> template<
typename InputIterator,
typename OutputIterator,
typename BinaryOperation>
OutputIterator OutputIterator
parallel_partial_sum(InputIterator begin, InputIterator end, parallel_partial_sum(InputIterator begin, InputIterator end,
OutputIterator result, BinaryOperation bin_op) OutputIterator result, BinaryOperation bin_op)
{ {
_GLIBCXX_CALL(begin - end); _GLIBCXX_CALL(begin - end)
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -179,18 +205,15 @@ namespace __gnu_parallel ...@@ -179,18 +205,15 @@ namespace __gnu_parallel
difference_type n = end - begin; difference_type n = end - begin;
int num_threads = get_max_threads();
switch (Settings::partial_sum_algorithm) switch (Settings::partial_sum_algorithm)
{ {
case Settings::LINEAR: case Settings::LINEAR:
// Need an initial offset. // Need an initial offset.
return parallel_partial_sum_linear(begin, end, result, bin_op, return parallel_partial_sum_linear(begin, end, result, bin_op, n);
n, num_threads);
default: default:
// Partial_sum algorithm not implemented. // Partial_sum algorithm not implemented.
_GLIBCXX_PARALLEL_ASSERT(0); _GLIBCXX_PARALLEL_ASSERT(0);
return result + n; return result + n;
} }
} }
} }
......
...@@ -45,21 +45,21 @@ ...@@ -45,21 +45,21 @@
#include <bits/stl_algo.h> #include <bits/stl_algo.h>
#include <parallel/parallel.h> #include <parallel/parallel.h>
/** @brief Decide whether to declare certain variable volatile in this file. */ /** @brief Decide whether to declare certain variables volatile. */
#define _GLIBCXX_VOLATILE volatile #define _GLIBCXX_VOLATILE volatile
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Parallel implementation of std::partition. /** @brief Parallel implementation of std::partition.
* @param begin Begin iterator of input sequence to split. * @param begin Begin iterator of input sequence to split.
* @param end End iterator of input sequence to split. * @param end End iterator of input sequence to split.
* @param pred Partition predicate, possibly including some kind of pivot. * @param pred Partition predicate, possibly including some kind of pivot.
* @param max_num_threads Maximum number of threads to use for this task. * @param num_threads Maximum number of threads to use for this task.
* @return Number of elements not fulfilling the predicate. */ * @return Number of elements not fulfilling the predicate. */
template<typename RandomAccessIterator, typename Predicate> template<typename RandomAccessIterator, typename Predicate>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type typename std::iterator_traits<RandomAccessIterator>::difference_type
parallel_partition(RandomAccessIterator begin, RandomAccessIterator end, parallel_partition(RandomAccessIterator begin, RandomAccessIterator end,
Predicate pred, thread_index_t max_num_threads) Predicate pred, thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -74,212 +74,238 @@ namespace __gnu_parallel ...@@ -74,212 +74,238 @@ namespace __gnu_parallel
_GLIBCXX_VOLATILE difference_type leftover_left, leftover_right; _GLIBCXX_VOLATILE difference_type leftover_left, leftover_right;
_GLIBCXX_VOLATILE difference_type leftnew, rightnew; _GLIBCXX_VOLATILE difference_type leftnew, rightnew;
bool* reserved_left, * reserved_right; bool* reserved_left = NULL, * reserved_right = NULL;
reserved_left = new bool[max_num_threads];
reserved_right = new bool[max_num_threads];
difference_type chunk_size; difference_type chunk_size;
if (Settings::partition_chunk_share > 0.0)
chunk_size = std::max((difference_type)Settings::partition_chunk_size, (difference_type)((double)n * Settings::partition_chunk_share / (double)max_num_threads));
else
chunk_size = Settings::partition_chunk_size;
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
// At least good for two processors. //at least two chunks per thread
while (right - left + 1 >= 2 * max_num_threads * chunk_size) if(right - left + 1 >= 2 * num_threads * chunk_size)
# pragma omp parallel num_threads(num_threads)
{ {
difference_type num_chunks = (right - left + 1) / chunk_size; # pragma omp single
thread_index_t num_threads = (int)std::min((difference_type)max_num_threads, num_chunks / 2); {
num_threads = omp_get_num_threads();
for (int r = 0; r < num_threads; r++) reserved_left = new bool[num_threads];
{ reserved_right = new bool[num_threads];
reserved_left[r] = false;
reserved_right[r] = false; if (Settings::partition_chunk_share > 0.0)
} chunk_size = std::max<difference_type>(
leftover_left = 0; Settings::partition_chunk_size,
leftover_right = 0; (double)n * Settings::partition_chunk_share /
(double)num_threads);
#pragma omp parallel num_threads(num_threads) else
{ chunk_size = Settings::partition_chunk_size;
// Private. }
difference_type thread_left, thread_left_border, thread_right, thread_right_border;
thread_left = left + 1; while (right - left + 1 >= 2 * num_threads * chunk_size)
{
// Just to satisfy the condition below. # pragma omp single
thread_left_border = thread_left - 1; {
thread_right = n - 1; difference_type num_chunks = (right - left + 1) / chunk_size;
thread_right_border = thread_right + 1;
for (int r = 0; r < num_threads; r++)
bool iam_finished = false; {
while (!iam_finished) reserved_left[r] = false;
{ reserved_right[r] = false;
if (thread_left > thread_left_border) }
{ leftover_left = 0;
omp_set_lock(&result_lock); leftover_right = 0;
if (left + (chunk_size - 1) > right) } //implicit barrier
iam_finished = true;
else // Private.
{ difference_type thread_left, thread_left_border,
thread_left = left; thread_right, thread_right_border;
thread_left_border = left + (chunk_size - 1); thread_left = left + 1;
left += chunk_size;
} // Just to satisfy the condition below.
omp_unset_lock(&result_lock); thread_left_border = thread_left - 1;
} thread_right = n - 1;
thread_right_border = thread_right + 1;
if (thread_right < thread_right_border)
{ bool iam_finished = false;
omp_set_lock(&result_lock); while (!iam_finished)
if (left > right - (chunk_size - 1)) {
iam_finished = true; if (thread_left > thread_left_border)
else {
{ omp_set_lock(&result_lock);
thread_right = right; if (left + (chunk_size - 1) > right)
thread_right_border = right - (chunk_size - 1); iam_finished = true;
right -= chunk_size; else
} {
omp_unset_lock(&result_lock); thread_left = left;
} thread_left_border = left + (chunk_size - 1);
left += chunk_size;
if (iam_finished) }
break; omp_unset_lock(&result_lock);
}
// Swap as usual.
while (thread_left < thread_right) if (thread_right < thread_right_border)
{ {
while (pred(begin[thread_left]) && thread_left <= thread_left_border) omp_set_lock(&result_lock);
thread_left++; if (left > right - (chunk_size - 1))
while (!pred(begin[thread_right]) && thread_right >= thread_right_border) iam_finished = true;
thread_right--; else
{
if (thread_left > thread_left_border || thread_right < thread_right_border) thread_right = right;
// Fetch new chunk(s). thread_right_border = right - (chunk_size - 1);
break; right -= chunk_size;
}
std::swap(begin[thread_left], begin[thread_right]); omp_unset_lock(&result_lock);
thread_left++; }
thread_right--;
} if (iam_finished)
} break;
// Now swap the leftover chunks to the right places. // Swap as usual.
if (thread_left <= thread_left_border) while (thread_left < thread_right)
#pragma omp atomic {
leftover_left++; while (pred(begin[thread_left])
if (thread_right >= thread_right_border) && thread_left <= thread_left_border)
#pragma omp atomic thread_left++;
leftover_right++; while (!pred(begin[thread_right])
&& thread_right >= thread_right_border)
#pragma omp barrier thread_right--;
#pragma omp single if (thread_left > thread_left_border
{ || thread_right < thread_right_border)
leftnew = left - leftover_left * chunk_size; // Fetch new chunk(s).
rightnew = right + leftover_right * chunk_size; break;
}
std::swap(begin[thread_left], begin[thread_right]);
#pragma omp barrier thread_left++;
thread_right--;
// <=> thread_left_border + (chunk_size - 1) >= leftnew }
if (thread_left <= thread_left_border }
&& thread_left_border >= leftnew)
{ // Now swap the leftover chunks to the right places.
// Chunk already in place, reserve spot. if (thread_left <= thread_left_border)
reserved_left[(left - (thread_left_border + 1)) / chunk_size] = true; # pragma omp atomic
} leftover_left++;
if (thread_right >= thread_right_border)
// <=> thread_right_border - (chunk_size - 1) <= rightnew # pragma omp atomic
if (thread_right >= thread_right_border leftover_right++;
&& thread_right_border <= rightnew)
{ # pragma omp barrier
// Chunk already in place, reserve spot.
reserved_right[((thread_right_border - 1) - right) / chunk_size] = true; # pragma omp single
} {
leftnew = left - leftover_left * chunk_size;
#pragma omp barrier rightnew = right + leftover_right * chunk_size;
}
if (thread_left <= thread_left_border && thread_left_border < leftnew)
{ # pragma omp barrier
// Find spot and swap.
difference_type swapstart = -1; // <=> thread_left_border + (chunk_size - 1) >= leftnew
omp_set_lock(&result_lock); if (thread_left <= thread_left_border
for (int r = 0; r < leftover_left; r++) && thread_left_border >= leftnew)
{
// Chunk already in place, reserve spot.
reserved_left[(left - (thread_left_border + 1)) / chunk_size]
= true;
}
// <=> thread_right_border - (chunk_size - 1) <= rightnew
if (thread_right >= thread_right_border
&& thread_right_border <= rightnew)
{
// Chunk already in place, reserve spot.
reserved_right
[((thread_right_border - 1) - right) / chunk_size]
= true;
}
# pragma omp barrier
if (thread_left <= thread_left_border
&& thread_left_border < leftnew)
{
// Find spot and swap.
difference_type swapstart = -1;
omp_set_lock(&result_lock);
for (int r = 0; r < leftover_left; r++)
if (!reserved_left[r]) if (!reserved_left[r])
{ {
reserved_left[r] = true; reserved_left[r] = true;
swapstart = left - (r + 1) * chunk_size; swapstart = left - (r + 1) * chunk_size;
break; break;
} }
omp_unset_lock(&result_lock); omp_unset_lock(&result_lock);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1); _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif #endif
std::swap_ranges(begin + thread_left_border - (chunk_size - 1), begin + thread_left_border + 1, begin + swapstart); std::swap_ranges(
} begin + thread_left_border - (chunk_size - 1),
begin + thread_left_border + 1,
if (thread_right >= thread_right_border begin + swapstart);
&& thread_right_border > rightnew) }
{
// Find spot and swap if (thread_right >= thread_right_border
difference_type swapstart = -1; && thread_right_border > rightnew)
omp_set_lock(&result_lock); {
for (int r = 0; r < leftover_right; r++) // Find spot and swap
if (!reserved_right[r]) difference_type swapstart = -1;
{ omp_set_lock(&result_lock);
reserved_right[r] = true; for (int r = 0; r < leftover_right; r++)
swapstart = right + r * chunk_size + 1; if (!reserved_right[r])
break; {
} reserved_right[r] = true;
omp_unset_lock(&result_lock); swapstart = right + r * chunk_size + 1;
break;
}
omp_unset_lock(&result_lock);
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(swapstart != -1); _GLIBCXX_PARALLEL_ASSERT(swapstart != -1);
#endif #endif
std::swap_ranges(begin + thread_right_border, begin + thread_right_border + chunk_size, begin + swapstart); std::swap_ranges(begin + thread_right_border,
} begin + thread_right_border + chunk_size,
begin + swapstart);
}
#if _GLIBCXX_ASSERTIONS #if _GLIBCXX_ASSERTIONS
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
{ {
for (int r = 0; r < leftover_left; r++) for (int r = 0; r < leftover_left; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_left[r]); _GLIBCXX_PARALLEL_ASSERT(reserved_left[r]);
for (int r = 0; r < leftover_right; r++) for (int r = 0; r < leftover_right; r++)
_GLIBCXX_PARALLEL_ASSERT(reserved_right[r]); _GLIBCXX_PARALLEL_ASSERT(reserved_right[r]);
} }
#pragma omp barrier # pragma omp barrier
#endif #endif
#pragma omp barrier # pragma omp barrier
left = leftnew;
right = rightnew; left = leftnew;
} right = rightnew;
} // end "recursion" }
# pragma omp flush(left, right)
} // end "recursion" //parallel
difference_type final_left = left, final_right = right; difference_type final_left = left, final_right = right;
while (final_left < final_right) while (final_left < final_right)
{ {
// Go right until key is greater than or equal to pivot. // Go right until key is greater than or equal to pivot.
while (pred(begin[final_left]) && final_left < final_right) while (pred(begin[final_left]) && final_left < final_right)
final_left++; final_left++;
// Go left until key is less than pivot. // Go left until key is less than pivot.
while (!pred(begin[final_right]) && final_left < final_right) while (!pred(begin[final_right]) && final_left < final_right)
final_right--; final_right--;
if (final_left == final_right) if (final_left == final_right)
break; break;
std::swap(begin[final_left], begin[final_right]); std::swap(begin[final_left], begin[final_right]);
final_left++; final_left++;
final_right--; final_right--;
} }
// All elements on the left side are < piv, all elements on the // All elements on the left side are < piv, all elements on the
...@@ -298,14 +324,14 @@ namespace __gnu_parallel ...@@ -298,14 +324,14 @@ namespace __gnu_parallel
return final_left + 1; return final_left + 1;
} }
/** /**
* @brief Parallel implementation of std::nth_element(). * @brief Parallel implementation of std::nth_element().
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param nth Iterator of element that must be in position afterwards. * @param nth Iterator of element that must be in position afterwards.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param comp Comparator. * @param comp Comparator.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
void void
parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth, parallel_nth_element(RandomAccessIterator begin, RandomAccessIterator nth,
RandomAccessIterator end, Comparator comp) RandomAccessIterator end, Comparator comp)
...@@ -324,65 +350,65 @@ namespace __gnu_parallel ...@@ -324,65 +350,65 @@ namespace __gnu_parallel
// Break if input range is too small. // Break if input range is too small.
while (static_cast<sequence_index_t>(end - begin) >= minimum_length) while (static_cast<sequence_index_t>(end - begin) >= minimum_length)
{ {
difference_type n = end - begin; difference_type n = end - begin;
RandomAccessIterator pivot_pos = begin + rng(n); RandomAccessIterator pivot_pos = begin + rng(n);
// Swap pivot_pos value to end. // Swap pivot_pos value to end.
if (pivot_pos != (end - 1)) if (pivot_pos != (end - 1))
std::swap(*pivot_pos, *(end - 1)); std::swap(*pivot_pos, *(end - 1));
pivot_pos = end - 1; pivot_pos = end - 1;
// XXX Comparator must have first_value_type, second_value_type, result_type // XXX Comparator must have first_value_type, second_value_type, result_type
// Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> > // Comparator == __gnu_parallel::lexicographic<S, int, __gnu_parallel::less<S, S> >
// pivot_pos == std::pair<S, int>* // pivot_pos == std::pair<S, int>*
// XXX binder2nd only for RandomAccessIterators?? // XXX binder2nd only for RandomAccessIterators??
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos); __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, *pivot_pos);
// Divide, leave pivot unchanged in last place. // Divide, leave pivot unchanged in last place.
RandomAccessIterator split_pos1, split_pos2; RandomAccessIterator split_pos1, split_pos2;
split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads()); split_pos1 = begin + parallel_partition(begin, end - 1, pred, get_max_threads());
// Left side: < pivot_pos; right side: >= pivot_pos // Left side: < pivot_pos; right side: >= pivot_pos
// Swap pivot back to middle. // Swap pivot back to middle.
if (split_pos1 != pivot_pos) if (split_pos1 != pivot_pos)
std::swap(*split_pos1, *pivot_pos); std::swap(*split_pos1, *pivot_pos);
pivot_pos = split_pos1; pivot_pos = split_pos1;
// In case all elements are equal, split_pos1 == 0 // In case all elements are equal, split_pos1 == 0
if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7)) if ((split_pos1 + 1 - begin) < (n >> 7) || (end - split_pos1) < (n >> 7))
{ {
// Very unequal split, one part smaller than one 128th // Very unequal split, one part smaller than one 128th
// elements not strictly larger than the pivot. // elements not strictly larger than the pivot.
__gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos)); __gnu_parallel::unary_negate<__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>, value_type> pred(__gnu_parallel::binder1st<Comparator, value_type, value_type, bool>(comp, *pivot_pos));
// Find other end of pivot-equal range. // Find other end of pivot-equal range.
split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred); split_pos2 = __gnu_sequential::partition(split_pos1 + 1, end, pred);
} }
else else
// Only skip the pivot. // Only skip the pivot.
split_pos2 = split_pos1 + 1; split_pos2 = split_pos1 + 1;
// Compare iterators. // Compare iterators.
if (split_pos2 <= nth) if (split_pos2 <= nth)
begin = split_pos2; begin = split_pos2;
else if (nth < split_pos1) else if (nth < split_pos1)
end = split_pos1; end = split_pos1;
else else
break; break;
} }
// Only at most Settings::partition_minimal_n elements left. // Only at most Settings::partition_minimal_n elements left.
__gnu_sequential::sort(begin, end, comp); __gnu_sequential::sort(begin, end, comp);
} }
/** @brief Parallel implementation of std::partial_sort(). /** @brief Parallel implementation of std::partial_sort().
* @param begin Begin iterator of input sequence. * @param begin Begin iterator of input sequence.
* @param middle Sort until this position. * @param middle Sort until this position.
* @param end End iterator of input sequence. * @param end End iterator of input sequence.
* @param comp Comparator. */ * @param comp Comparator. */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
void void
parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp) parallel_partial_sort(RandomAccessIterator begin, RandomAccessIterator middle, RandomAccessIterator end, Comparator comp)
{ {
...@@ -390,7 +416,7 @@ namespace __gnu_parallel ...@@ -390,7 +416,7 @@ namespace __gnu_parallel
std::sort(begin, middle, comp); std::sort(begin, middle, comp);
} }
} //namespace __gnu_parallel } //namespace __gnu_parallel
#undef _GLIBCXX_VOLATILE #undef _GLIBCXX_VOLATILE
......
...@@ -53,11 +53,17 @@ namespace __gnu_parallel ...@@ -53,11 +53,17 @@ namespace __gnu_parallel
* this part. * this part.
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline typename std::iterator_traits<RandomAccessIterator>::difference_type inline
parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type
Comparator comp, parallel_sort_qs_divide(
typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank, RandomAccessIterator begin,
typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads) RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type
pivot_rank,
typename std::iterator_traits<RandomAccessIterator>::difference_type
num_samples,
thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -65,20 +71,24 @@ namespace __gnu_parallel ...@@ -65,20 +71,24 @@ namespace __gnu_parallel
difference_type n = end - begin; difference_type n = end - begin;
num_samples = std::min(num_samples, n); num_samples = std::min(num_samples, n);
value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
// Allocate uninitialized, to avoid default constructor.
value_type* samples = static_cast<value_type*>(
operator new(num_samples * sizeof(value_type)));
for (difference_type s = 0; s < num_samples; s++) for (difference_type s = 0; s < num_samples; s++)
{ {
const unsigned long long index = static_cast<unsigned long long>(s) const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples; * n / num_samples;
samples[s] = begin[index]; new(samples + s) value_type(begin[index]);
} }
__gnu_sequential::sort(samples, samples + num_samples, comp); __gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n]; value_type& pivot = samples[pivot_rank * num_samples / n];
__gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot); __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads); difference_type split = parallel_partition(begin, end, pred, num_threads);
return split; return split;
...@@ -93,7 +103,10 @@ namespace __gnu_parallel ...@@ -93,7 +103,10 @@ namespace __gnu_parallel
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads) parallel_sort_qs_conquer(RandomAccessIterator begin,
RandomAccessIterator end,
Comparator comp,
thread_index_t num_threads)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -101,8 +114,8 @@ namespace __gnu_parallel ...@@ -101,8 +114,8 @@ namespace __gnu_parallel
if (num_threads <= 1) if (num_threads <= 1)
{ {
__gnu_sequential::sort(begin, end, comp); __gnu_sequential::sort(begin, end, comp);
return; return;
} }
difference_type n = end - begin, pivot_rank; difference_type n = end - begin, pivot_rank;
...@@ -110,24 +123,27 @@ namespace __gnu_parallel ...@@ -110,24 +123,27 @@ namespace __gnu_parallel
if (n <= 1) if (n <= 1)
return; return;
thread_index_t num_processors_left; thread_index_t num_threads_left;
if ((num_threads % 2) == 1) if ((num_threads % 2) == 1)
num_processors_left = num_threads / 2 + 1; num_threads_left = num_threads / 2 + 1;
else else
num_processors_left = num_threads / 2; num_threads_left = num_threads / 2;
pivot_rank = n * num_processors_left / num_threads; pivot_rank = n * num_threads_left / num_threads;
difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank, difference_type split = parallel_sort_qs_divide(
Settings::sort_qs_num_samples_preset, num_threads); begin, end, comp, pivot_rank,
Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections #pragma omp parallel sections
{ {
#pragma omp section #pragma omp section
parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left); parallel_sort_qs_conquer(begin, begin + split,
comp, num_threads_left);
#pragma omp section #pragma omp section
parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left); parallel_sort_qs_conquer(begin + split, end,
comp, num_threads - num_threads_left);
} }
} }
...@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads); ...@@ -143,9 +159,12 @@ Settings::sort_qs_num_samples_preset, num_threads);
*/ */
template<typename RandomAccessIterator, typename Comparator> template<typename RandomAccessIterator, typename Comparator>
inline void inline void
parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end, parallel_sort_qs(
Comparator comp, RandomAccessIterator begin,
typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads) RandomAccessIterator end,
Comparator comp,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
int num_threads)
{ {
_GLIBCXX_CALL(n) _GLIBCXX_CALL(n)
...@@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads); ...@@ -165,12 +184,9 @@ Settings::sort_qs_num_samples_preset, num_threads);
// Hard to avoid. // Hard to avoid.
omp_set_num_threads(num_threads); omp_set_num_threads(num_threads);
bool old_nested = (omp_get_nested() != 0);
omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads); parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
omp_set_nested(old_nested);
} }
} //namespace __gnu_parallel } //namespace __gnu_parallel
#endif #endif
...@@ -45,16 +45,16 @@ ...@@ -45,16 +45,16 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Type to hold the index of a bin. /** @brief Type to hold the index of a bin.
* *
* Since many variables of this type are allocated, it should be * Since many variables of this type are allocated, it should be
* chosen as small as possible. * chosen as small as possible.
*/ */
typedef unsigned short bin_index; typedef unsigned short bin_index;
/** @brief Data known to every thread participating in /** @brief Data known to every thread participating in
__gnu_parallel::parallel_random_shuffle(). */ __gnu_parallel::parallel_random_shuffle(). */
template<typename RandomAccessIterator> template<typename RandomAccessIterator>
struct DRandomShufflingGlobalData struct DRandomShufflingGlobalData
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
...@@ -90,18 +90,15 @@ namespace __gnu_parallel ...@@ -90,18 +90,15 @@ namespace __gnu_parallel
: source(_source) { } : source(_source) { }
}; };
/** @brief Local data for a thread participating in /** @brief Local data for a thread participating in
__gnu_parallel::parallel_random_shuffle(). __gnu_parallel::parallel_random_shuffle().
*/ */
template<typename RandomAccessIterator, typename RandomNumberGenerator> template<typename RandomAccessIterator, typename RandomNumberGenerator>
struct DRSSorterPU struct DRSSorterPU
{ {
/** @brief Number of threads participating in total. */ /** @brief Number of threads participating in total. */
int num_threads; int num_threads;
/** @brief Number of owning thread. */
int iam;
/** @brief Begin index for bins taken care of by this thread. */ /** @brief Begin index for bins taken care of by this thread. */
bin_index bins_begin; bin_index bins_begin;
...@@ -115,29 +112,29 @@ namespace __gnu_parallel ...@@ -115,29 +112,29 @@ namespace __gnu_parallel
DRandomShufflingGlobalData<RandomAccessIterator>* sd; DRandomShufflingGlobalData<RandomAccessIterator>* sd;
}; };
/** @brief Generate a random number in @c [0,2^logp). /** @brief Generate a random number in @c [0,2^logp).
* @param logp Logarithm (base 2) of the upper range bound. * @param logp Logarithm (base 2) of the upper range bound.
* @param rng Random number generator to use. * @param rng Random number generator to use.
*/ */
template<typename RandomNumberGenerator> template<typename RandomNumberGenerator>
inline int inline int
random_number_pow2(int logp, RandomNumberGenerator& rng) random_number_pow2(int logp, RandomNumberGenerator& rng)
{ return rng.genrand_bits(logp); } { return rng.genrand_bits(logp); }
/** @brief Random shuffle code executed by each thread. /** @brief Random shuffle code executed by each thread.
* @param pus Array of thread-local data records. */ * @param pus Array of thread-local data records. */
template<typename RandomAccessIterator, typename RandomNumberGenerator> template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void inline void
parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
RandomNumberGenerator>* pus) RandomNumberGenerator>* pus)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()]; thread_index_t iam = omp_get_thread_num();
DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd; DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
thread_index_t iam = d->iam;
// Indexing: dist[bin][processor] // Indexing: dist[bin][processor]
difference_type length = sd->starts[iam + 1] - sd->starts[iam]; difference_type length = sd->starts[iam + 1] - sd->starts[iam];
...@@ -156,35 +153,35 @@ namespace __gnu_parallel ...@@ -156,35 +153,35 @@ namespace __gnu_parallel
// First main loop. // First main loop.
for (difference_type i = 0; i < length; i++) for (difference_type i = 0; i < length; i++)
{ {
bin_index oracle = random_number_pow2(num_bits, rng); bin_index oracle = random_number_pow2(num_bits, rng);
oracles[i] = oracle; oracles[i] = oracle;
// To allow prefix (partial) sum. // To allow prefix (partial) sum.
dist[oracle + 1]++; dist[oracle + 1]++;
} }
for (bin_index b = 0; b < sd->num_bins + 1; b++) for (bin_index b = 0; b < sd->num_bins + 1; b++)
sd->dist[b][iam + 1] = dist[b]; sd->dist[b][iam + 1] = dist[b];
#pragma omp barrier # pragma omp barrier
#pragma omp single # pragma omp single
{ {
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
// total number of items in bin s // total number of items in bin s
for (bin_index s = 0; s < sd->num_bins; s++) for (bin_index s = 0; s < sd->num_bins; s++)
__gnu_sequential::partial_sum(sd->dist[s + 1], __gnu_sequential::partial_sum(sd->dist[s + 1],
sd->dist[s + 1] + d->num_threads + 1, sd->dist[s + 1] + d->num_threads + 1,
sd->dist[s + 1]); sd->dist[s + 1]);
} }
#pragma omp barrier # pragma omp barrier
sequence_index_t offset = 0, global_offset = 0; sequence_index_t offset = 0, global_offset = 0;
for (bin_index s = 0; s < d->bins_begin; s++) for (bin_index s = 0; s < d->bins_begin; s++)
global_offset += sd->dist[s + 1][d->num_threads]; global_offset += sd->dist[s + 1][d->num_threads];
#pragma omp barrier # pragma omp barrier
for (bin_index s = d->bins_begin; s < d->bins_end; s++) for (bin_index s = d->bins_begin; s < d->bins_end; s++)
{ {
...@@ -193,9 +190,10 @@ namespace __gnu_parallel ...@@ -193,9 +190,10 @@ namespace __gnu_parallel
offset = sd->dist[s + 1][d->num_threads]; offset = sd->dist[s + 1][d->num_threads];
} }
sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset)); sd->temporaries[iam] = static_cast<value_type*>(
::operator new(sizeof(value_type) * offset));
#pragma omp barrier # pragma omp barrier
// Draw local copies to avoid false sharing. // Draw local copies to avoid false sharing.
for (bin_index b = 0; b < sd->num_bins + 1; b++) for (bin_index b = 0; b < sd->num_bins + 1; b++)
...@@ -211,11 +209,11 @@ namespace __gnu_parallel ...@@ -211,11 +209,11 @@ namespace __gnu_parallel
// Distribute according to oracles, second main loop. // Distribute according to oracles, second main loop.
for (difference_type i = 0; i < length; i++) for (difference_type i = 0; i < length; i++)
{ {
bin_index target_bin = oracles[i]; bin_index target_bin = oracles[i];
thread_index_t target_p = bin_proc[target_bin]; thread_index_t target_p = bin_proc[target_bin];
// Last column [d->num_threads] stays unchanged. // Last column [d->num_threads] stays unchanged.
temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start); temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
} }
delete[] oracles; delete[] oracles;
...@@ -223,23 +221,27 @@ namespace __gnu_parallel ...@@ -223,23 +221,27 @@ namespace __gnu_parallel
delete[] bin_proc; delete[] bin_proc;
delete[] temporaries; delete[] temporaries;
#pragma omp barrier # pragma omp barrier
// Shuffle bins internally. // Shuffle bins internally.
for (bin_index b = d->bins_begin; b < d->bins_end; b++) for (bin_index b = d->bins_begin; b < d->bins_end; b++)
{ {
value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), value_type* begin =
* end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; sd->temporaries[iam] +
sequential_random_shuffle(begin, end, rng); ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); * end =
sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
sequential_random_shuffle(begin, end, rng);
std::copy(begin, end, sd->source + global_offset +
((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
} }
delete[] sd->temporaries[iam]; delete[] sd->temporaries[iam];
} }
/** @brief Round up to the next greater power of 2. /** @brief Round up to the next greater power of 2.
* @param x Integer to round up */ * @param x Integer to round up */
template<typename T> template<typename T>
T T
round_up_to_pow2(T x) round_up_to_pow2(T x)
{ {
...@@ -249,16 +251,21 @@ namespace __gnu_parallel ...@@ -249,16 +251,21 @@ namespace __gnu_parallel
return (T)1 << (log2(x - 1) + 1); return (T)1 << (log2(x - 1) + 1);
} }
/** @brief Main parallel random shuffle step. /** @brief Main parallel random shuffle step.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param n Length of sequence. * @param n Length of sequence.
* @param num_threads Number of threads to use. * @param num_threads Number of threads to use.
* @param rng Random number generator to use. * @param rng Random number generator to use.
*/ */
template<typename RandomAccessIterator, typename RandomNumberGenerator> template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void inline void
parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng) parallel_random_shuffle_drs(
RandomAccessIterator begin,
RandomAccessIterator end,
typename std::iterator_traits<RandomAccessIterator>::difference_type n,
thread_index_t num_threads,
RandomNumberGenerator& rng)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -275,87 +282,99 @@ namespace __gnu_parallel ...@@ -275,87 +282,99 @@ namespace __gnu_parallel
// Try the L1 cache first. // Try the L1 cache first.
// Must fit into L1. // Must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); num_bins_cache = std::max<difference_type>(
1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache); num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2 // No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size. // Power of 2 and at least one element per bin, at most the TLB size.
num_bins = std::min(n, (difference_type)num_bins_cache); num_bins = std::min<difference_type>(n, num_bins_cache);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin. // 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif #endif
num_bins = round_up_to_pow2(num_bins); num_bins = round_up_to_pow2(num_bins);
if (num_bins < num_bins_cache) if (num_bins < num_bins_cache)
{ {
#endif #endif
// Now try the L2 cache // Now try the L2 cache
// Must fit into L2 // Must fit into L2
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
num_bins_cache = round_up_to_pow2(num_bins_cache); 1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); // No more buckets than TLB entries, power of 2.
// Power of 2 and at least one element per bin, at most the TLB size. num_bins = static_cast<bin_index>(
std::min(n, static_cast<difference_type>(num_bins_cache)));
// Power of 2 and at least one element per bin, at most the TLB size.
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin. // 2 TLB entries needed per bin.
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); num_bins = std::min(
static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
#endif #endif
num_bins = round_up_to_pow2(num_bins); num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
} }
#endif #endif
num_threads = std::min((bin_index)num_threads, (bin_index)num_bins); num_threads = std::min<bin_index>(num_threads, num_bins);
if (num_threads <= 1) if (num_threads <= 1)
return sequential_random_shuffle(begin, end, rng); return sequential_random_shuffle(begin, end, rng);
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin); DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
DRSSorterPU<RandomAccessIterator, random_number >* pus;
difference_type* starts;
DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads]; # pragma omp parallel num_threads(num_threads)
sd.temporaries = new value_type*[num_threads];
//sd.oracles = new bin_index[n];
sd.dist = new difference_type*[num_bins + 1];
sd.bin_proc = new thread_index_t[num_bins];
for (bin_index b = 0; b < num_bins + 1; b++)
sd.dist[b] = new difference_type[num_threads + 1];
for (bin_index b = 0; b < (num_bins + 1); b++)
{ {
sd.dist[0][0] = 0; # pragma omp single
sd.dist[b][0] = 0; {
} pus = new DRSSorterPU<RandomAccessIterator, random_number>
difference_type* starts = sd.starts = new difference_type[num_threads + 1]; [num_threads];
int bin_cursor = 0;
sd.num_bins = num_bins; sd.temporaries = new value_type*[num_threads];
sd.num_bits = log2(num_bins); sd.dist = new difference_type*[num_bins + 1];
sd.bin_proc = new thread_index_t[num_bins];
difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0; for (bin_index b = 0; b < num_bins + 1; b++)
int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads; sd.dist[b] = new difference_type[num_threads + 1];
for (int i = 0; i < num_threads; i++) for (bin_index b = 0; b < (num_bins + 1); b++)
{ {
starts[i] = start; sd.dist[0][0] = 0;
start += (i < split) ? (chunk_length + 1) : chunk_length; sd.dist[b][0] = 0;
int j = pus[i].bins_begin = bin_cursor; }
starts = sd.starts = new difference_type[num_threads + 1];
// Range of bins for this processor. int bin_cursor = 0;
bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length; sd.num_bins = num_bins;
pus[i].bins_end = bin_cursor; sd.num_bits = log2(num_bins);
for (; j < bin_cursor; j++)
sd.bin_proc[j] = i; difference_type chunk_length = n / num_threads,
pus[i].num_threads = num_threads; split = n % num_threads, start = 0;
pus[i].iam = i; difference_type bin_chunk_length = num_bins / num_threads,
pus[i].seed = rng(std::numeric_limits<uint32>::max()); bin_split = num_bins % num_threads;
pus[i].sd = &sd; for (thread_index_t i = 0; i < num_threads; i++)
} {
starts[num_threads] = start; starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
// Now shuffle in parallel. int j = pus[i].bins_begin = bin_cursor;
#pragma omp parallel num_threads(num_threads)
parallel_random_shuffle_drs_pu(pus); // Range of bins for this processor.
bin_cursor += (i < bin_split) ?
(bin_chunk_length + 1) : bin_chunk_length;
pus[i].bins_end = bin_cursor;
for (; j < bin_cursor; j++)
sd.bin_proc[j] = i;
pus[i].num_threads = num_threads;
pus[i].seed = rng(std::numeric_limits<uint32>::max());
pus[i].sd = &sd;
}
starts[num_threads] = start;
} //single
// Now shuffle in parallel.
parallel_random_shuffle_drs_pu(pus);
}
delete[] starts; delete[] starts;
delete[] sd.bin_proc; delete[] sd.bin_proc;
...@@ -367,16 +386,16 @@ namespace __gnu_parallel ...@@ -367,16 +386,16 @@ namespace __gnu_parallel
delete[] pus; delete[] pus;
} }
/** @brief Sequential cache-efficient random shuffle. /** @brief Sequential cache-efficient random shuffle.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param rng Random number generator to use. * @param rng Random number generator to use.
*/ */
template<typename RandomAccessIterator, typename RandomNumberGenerator> template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void inline void
sequential_random_shuffle(RandomAccessIterator begin, sequential_random_shuffle(RandomAccessIterator begin,
RandomAccessIterator end, RandomAccessIterator end,
RandomNumberGenerator& rng) RandomNumberGenerator& rng)
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
...@@ -388,7 +407,9 @@ namespace __gnu_parallel ...@@ -388,7 +407,9 @@ namespace __gnu_parallel
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
// Try the L1 cache first, must fit into L1. // Try the L1 cache first, must fit into L1.
num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); num_bins_cache =
std::max<difference_type>
(1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache); num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2 // No more buckets than TLB entries, power of 2
...@@ -403,19 +424,23 @@ namespace __gnu_parallel ...@@ -403,19 +424,23 @@ namespace __gnu_parallel
if (num_bins < num_bins_cache) if (num_bins < num_bins_cache)
{ {
#endif #endif
// Now try the L2 cache, must fit into L2. // Now try the L2 cache, must fit into L2.
num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); num_bins_cache =
num_bins_cache = round_up_to_pow2(num_bins_cache); static_cast<bin_index>(std::max<difference_type>(
1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2 // No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size. // Power of 2 and at least one element per bin, at most the TLB size.
num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); num_bins = static_cast<bin_index>
(std::min(n, static_cast<difference_type>(num_bins_cache)));
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin // 2 TLB entries needed per bin
num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); num_bins =
std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif #endif
num_bins = round_up_to_pow2(num_bins); num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
} }
#endif #endif
...@@ -424,58 +449,62 @@ namespace __gnu_parallel ...@@ -424,58 +449,62 @@ namespace __gnu_parallel
if (num_bins > 1) if (num_bins > 1)
{ {
value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n)); value_type* target = static_cast<value_type*>(
bin_index* oracles = new bin_index[n]; ::operator new(sizeof(value_type) * n));
difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1]; bin_index* oracles = new bin_index[n];
difference_type* dist0 = new difference_type[num_bins + 1],
for (int b = 0; b < num_bins + 1; b++) * dist1 = new difference_type[num_bins + 1];
dist0[b] = 0;
for (int b = 0; b < num_bins + 1; b++)
random_number bitrng(rng(0xFFFFFFFF)); dist0[b] = 0;
for (difference_type i = 0; i < n; i++) random_number bitrng(rng(0xFFFFFFFF));
{
bin_index oracle = random_number_pow2(num_bits, bitrng); for (difference_type i = 0; i < n; i++)
oracles[i] = oracle; {
bin_index oracle = random_number_pow2(num_bits, bitrng);
// To allow prefix (partial) sum. oracles[i] = oracle;
dist0[oracle + 1]++;
} // To allow prefix (partial) sum.
dist0[oracle + 1]++;
// Sum up bins. }
__gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
// Sum up bins.
for (int b = 0; b < num_bins + 1; b++) __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
dist1[b] = dist0[b];
for (int b = 0; b < num_bins + 1; b++)
// Distribute according to oracles. dist1[b] = dist0[b];
for (difference_type i = 0; i < n; i++)
target[(dist0[oracles[i]])++] = *(begin + i); // Distribute according to oracles.
for (difference_type i = 0; i < n; i++)
for (int b = 0; b < num_bins; b++) target[(dist0[oracles[i]])++] = *(begin + i);
{
sequential_random_shuffle(target + dist1[b], target + dist1[b + 1], for (int b = 0; b < num_bins; b++)
rng); {
} sequential_random_shuffle(target + dist1[b],
target + dist1[b + 1],
delete[] dist0; rng);
delete[] dist1; }
delete[] oracles;
delete[] target; delete[] dist0;
delete[] dist1;
delete[] oracles;
delete[] target;
} }
else else
__gnu_sequential::random_shuffle(begin, end, rng); __gnu_sequential::random_shuffle(begin, end, rng);
} }
/** @brief Parallel random public call. /** @brief Parallel random public call.
* @param begin Begin iterator of sequence. * @param begin Begin iterator of sequence.
* @param end End iterator of sequence. * @param end End iterator of sequence.
* @param rng Random number generator to use. * @param rng Random number generator to use.
*/ */
template<typename RandomAccessIterator, typename RandomNumberGenerator> template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void inline void
parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, parallel_random_shuffle(RandomAccessIterator begin,
RandomNumberGenerator rng = random_number()) RandomAccessIterator end,
RandomNumberGenerator rng = random_number())
{ {
typedef std::iterator_traits<RandomAccessIterator> traits_type; typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
......
...@@ -53,10 +53,10 @@ namespace __gnu_parallel ...@@ -53,10 +53,10 @@ namespace __gnu_parallel
* @param length Length of sequence to search for. * @param length Length of sequence to search for.
* @param advances Returned offsets. * @param advances Returned offsets.
*/ */
template<typename RandomAccessIterator, typename _DifferenceTp> template<typename RandomAccessIterator, typename _DifferenceTp>
void void
calc_borders(RandomAccessIterator elements, _DifferenceTp length, calc_borders(RandomAccessIterator elements, _DifferenceTp length,
_DifferenceTp* off) _DifferenceTp* off)
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
...@@ -66,9 +66,9 @@ namespace __gnu_parallel ...@@ -66,9 +66,9 @@ namespace __gnu_parallel
difference_type k = 0; difference_type k = 0;
for (difference_type j = 2; j <= length; j++) for (difference_type j = 2; j <= length; j++)
{ {
while ((k >= 0) && !(elements[k] == elements[j-1])) while ((k >= 0) && !(elements[k] == elements[j-1]))
k = off[k]; k = off[k];
off[j] = ++k; off[j] = ++k;
} }
} }
...@@ -81,11 +81,14 @@ namespace __gnu_parallel ...@@ -81,11 +81,14 @@ namespace __gnu_parallel
* @param end2 End iterator of second sequence. * @param end2 End iterator of second sequence.
* @param pred Find predicate. * @param pred Find predicate.
* @return Place of finding in first sequences. */ * @return Place of finding in first sequences. */
template<typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename Pred> template<
typename _RandomAccessIterator1,
typename _RandomAccessIterator2,
typename Pred>
_RandomAccessIterator1 _RandomAccessIterator1
search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1, search_template(_RandomAccessIterator1 begin1, _RandomAccessIterator1 end1,
_RandomAccessIterator2 begin2, _RandomAccessIterator2 end2, _RandomAccessIterator2 begin2, _RandomAccessIterator2 end2,
Pred pred) Pred pred)
{ {
typedef std::iterator_traits<_RandomAccessIterator1> traits_type; typedef std::iterator_traits<_RandomAccessIterator1> traits_type;
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
...@@ -103,60 +106,71 @@ namespace __gnu_parallel ...@@ -103,60 +106,71 @@ namespace __gnu_parallel
// Where is first occurrence of pattern? defaults to end. // Where is first occurrence of pattern? defaults to end.
difference_type result = (end1 - begin1); difference_type result = (end1 - begin1);
difference_type *splitters;
// Pattern too long. // Pattern too long.
if (input_length < 0) if (input_length < 0)
return end1; return end1;
thread_index_t num_threads = std::max<difference_type>(1, std::min<difference_type>(input_length, __gnu_parallel::get_max_threads()));
omp_lock_t result_lock; omp_lock_t result_lock;
omp_init_lock(&result_lock); omp_init_lock(&result_lock);
difference_type borders[num_threads + 1]; thread_index_t num_threads =
__gnu_parallel::equally_split(input_length, num_threads, borders); std::max<difference_type>(1,
std::min<difference_type>(input_length, get_max_threads()));
difference_type advances[pattern_length]; difference_type advances[pattern_length];
calc_borders(begin2, pattern_length, advances); calc_borders(begin2, pattern_length, advances);
#pragma omp parallel num_threads(num_threads) # pragma omp parallel num_threads(num_threads)
{ {
thread_index_t iam = omp_get_thread_num(); # pragma omp single
{
difference_type start = borders[iam], stop = borders[iam + 1]; num_threads = omp_get_num_threads();
splitters = new difference_type[num_threads + 1];
difference_type pos_in_pattern = 0; equally_split(input_length, num_threads, splitters);
bool found_pattern = false; }
while (start <= stop && !found_pattern) thread_index_t iam = omp_get_thread_num();
{
// Get new value of result. difference_type start = splitters[iam], stop = splitters[iam + 1];
#pragma omp flush(result)
// No chance for this thread to find first occurrence. difference_type pos_in_pattern = 0;
if (result < start) bool found_pattern = false;
break;
while (pred(begin1[start + pos_in_pattern], begin2[pos_in_pattern])) while (start <= stop && !found_pattern)
{ {
++pos_in_pattern; // Get new value of result.
if (pos_in_pattern == pattern_length) #pragma omp flush(result)
{ // No chance for this thread to find first occurrence.
// Found new candidate for result. if (result < start)
omp_set_lock(&result_lock); break;
result = std::min(result, start); while (pred(begin1[start + pos_in_pattern],
omp_unset_lock(&result_lock); begin2[pos_in_pattern]))
{
found_pattern = true; ++pos_in_pattern;
break; if (pos_in_pattern == pattern_length)
} {
} // Found new candidate for result.
// Make safe jump. omp_set_lock(&result_lock);
start += (pos_in_pattern - advances[pos_in_pattern]); result = std::min(result, start);
pos_in_pattern = (advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern]; omp_unset_lock(&result_lock);
}
} found_pattern = true;
break;
}
}
// Make safe jump.
start += (pos_in_pattern - advances[pos_in_pattern]);
pos_in_pattern =
(advances[pos_in_pattern] < 0) ? 0 : advances[pos_in_pattern];
}
} //parallel
omp_destroy_lock(&result_lock); omp_destroy_lock(&result_lock);
delete[] splitters;
// Return iterator on found element. // Return iterator on found element.
return (begin1 + result); return (begin1 + result);
} }
......
...@@ -47,28 +47,31 @@ ...@@ -47,28 +47,31 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
template<typename InputIterator, typename OutputIterator> template<typename InputIterator, typename OutputIterator>
inline OutputIterator inline OutputIterator
copy_tail(std::pair<InputIterator, InputIterator> b, copy_tail(std::pair<InputIterator, InputIterator> b,
std::pair<InputIterator, InputIterator> e, OutputIterator r) std::pair<InputIterator, InputIterator> e, OutputIterator r)
{ {
if (b.first != e.first) if (b.first != e.first)
{ {
do do
{ {
*r++ = *b.first++; *r++ = *b.first++;
} }
while (b.first != e.first); while (b.first != e.first);
} }
else else
{ {
while (b.second != e.second) while (b.second != e.second)
*r++ = *b.second++; *r++ = *b.second++;
} }
return r; return r;
} }
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct symmetric_difference_func struct symmetric_difference_func
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
...@@ -80,55 +83,56 @@ namespace __gnu_parallel ...@@ -80,55 +83,56 @@ namespace __gnu_parallel
Comparator comp; Comparator comp;
inline OutputIterator invoke(InputIterator a, InputIterator b, inline OutputIterator invoke(InputIterator a, InputIterator b,
InputIterator c, InputIterator d, InputIterator c, InputIterator d,
OutputIterator r) const OutputIterator r) const
{ {
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ {
*r = *a; *r = *a;
++a; ++a;
++r; ++r;
} }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ {
*r = *c; *r = *c;
++c; ++c;
++r; ++r;
} }
else else
{ {
++a; ++a;
++c; ++c;
} }
} }
return std::copy(c, d, std::copy(a, b, r)); return std::copy(c, d, std::copy(a, b, r));
} }
inline difference_type inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{ {
difference_type counter = 0; difference_type counter = 0;
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ {
++a; ++a;
++counter; ++counter;
} }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ {
++c; ++c;
++counter; ++counter;
} }
else else
{ {
++a; ++a;
++c; ++c;
} }
} }
return counter + (b - a) + (d - c); return counter + (b - a) + (d - c);
} }
...@@ -144,7 +148,10 @@ namespace __gnu_parallel ...@@ -144,7 +148,10 @@ namespace __gnu_parallel
}; };
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct difference_func struct difference_func
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
...@@ -157,44 +164,45 @@ namespace __gnu_parallel ...@@ -157,44 +164,45 @@ namespace __gnu_parallel
inline OutputIterator inline OutputIterator
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d, invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
OutputIterator r) const OutputIterator r) const
{ {
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ {
*r = *a; *r = *a;
++a; ++a;
++r; ++r;
} }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ ++c; } { ++c; }
else else
{ {
++a; ++a;
++c; ++c;
} }
} }
return std::copy(a, b, r); return std::copy(a, b, r);
} }
inline difference_type inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{ {
difference_type counter = 0; difference_type counter = 0;
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ {
++a; ++a;
++counter; ++counter;
} }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ ++c; } { ++c; }
else else
{ ++a; ++c; } { ++a; ++c; }
} }
return counter + (b - a); return counter + (b - a);
} }
...@@ -209,7 +217,10 @@ namespace __gnu_parallel ...@@ -209,7 +217,10 @@ namespace __gnu_parallel
}; };
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
struct intersection_func struct intersection_func
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
...@@ -222,44 +233,45 @@ namespace __gnu_parallel ...@@ -222,44 +233,45 @@ namespace __gnu_parallel
inline OutputIterator inline OutputIterator
invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d, invoke(InputIterator a, InputIterator b, InputIterator c, InputIterator d,
OutputIterator r) const OutputIterator r) const
{ {
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ ++a; } { ++a; }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ ++c; } { ++c; }
else else
{ {
*r = *a; *r = *a;
++a; ++a;
++c; ++c;
++r; ++r;
} }
} }
return r; return r;
} }
inline difference_type inline difference_type
count(InputIterator a, InputIterator b, InputIterator c, InputIterator d) const count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const
{ {
difference_type counter = 0; difference_type counter = 0;
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ ++a; } { ++a; }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ ++c; } { ++c; }
else else
{ {
++a; ++a;
++c; ++c;
++counter; ++counter;
} }
} }
return counter; return counter;
} }
...@@ -273,10 +285,11 @@ namespace __gnu_parallel ...@@ -273,10 +285,11 @@ namespace __gnu_parallel
{ return out; } { return out; }
}; };
template<class InputIterator, class OutputIterator, class Comparator> template<class InputIterator, class OutputIterator, class Comparator>
struct union_func struct union_func
{ {
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; typedef typename std::iterator_traits<InputIterator>::difference_type
difference_type;
union_func(Comparator c) : comp(c) {} union_func(Comparator c) : comp(c) {}
...@@ -284,50 +297,50 @@ namespace __gnu_parallel ...@@ -284,50 +297,50 @@ namespace __gnu_parallel
inline OutputIterator inline OutputIterator
invoke(InputIterator a, const InputIterator b, InputIterator c, invoke(InputIterator a, const InputIterator b, InputIterator c,
const InputIterator d, OutputIterator r) const const InputIterator d, OutputIterator r) const
{ {
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ {
*r = *a; *r = *a;
++a; ++a;
} }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ {
*r = *c; *r = *c;
++c; ++c;
} }
else else
{ {
*r = *a; *r = *a;
++a; ++a;
++c; ++c;
} }
++r; ++r;
} }
return std::copy(c, d, std::copy(a, b, r)); return std::copy(c, d, std::copy(a, b, r));
} }
inline difference_type inline difference_type
count(InputIterator a, const InputIterator b, InputIterator c, count(InputIterator a, InputIterator b, InputIterator c, InputIterator d)
const InputIterator d) const const
{ {
difference_type counter = 0; difference_type counter = 0;
while (a != b && c != d) while (a != b && c != d)
{ {
if (comp(*a, *c)) if (comp(*a, *c))
{ ++a; } { ++a; }
else if (comp(*c, *a)) else if (comp(*c, *a))
{ ++c; } { ++c; }
else else
{ {
++a; ++a;
++c; ++c;
} }
++counter; ++counter;
} }
counter += (b - a); counter += (b - a);
counter += (d - c); counter += (d - c);
...@@ -343,11 +356,14 @@ namespace __gnu_parallel ...@@ -343,11 +356,14 @@ namespace __gnu_parallel
{ return std::copy(a, b, out); } { return std::copy(a, b, out); }
}; };
template<typename InputIterator, typename OutputIterator, typename Operation> template<
typename InputIterator,
typename OutputIterator,
typename Operation>
OutputIterator OutputIterator
parallel_set_operation(InputIterator begin1, InputIterator end1, parallel_set_operation(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2, InputIterator begin2, InputIterator end2,
OutputIterator result, Operation op) OutputIterator result, Operation op)
{ {
_GLIBCXX_CALL((end1 - begin1) + (end2 - begin2)) _GLIBCXX_CALL((end1 - begin1) + (end2 - begin2))
...@@ -355,7 +371,6 @@ namespace __gnu_parallel ...@@ -355,7 +371,6 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
typedef typename std::pair<InputIterator, InputIterator> iterator_pair; typedef typename std::pair<InputIterator, InputIterator> iterator_pair;
if (begin1 == end1) if (begin1 == end1)
return op.first_empty(begin2, end2, result); return op.first_empty(begin2, end2, result);
...@@ -364,152 +379,174 @@ namespace __gnu_parallel ...@@ -364,152 +379,174 @@ namespace __gnu_parallel
const difference_type size = (end1 - begin1) + (end2 - begin2); const difference_type size = (end1 - begin1) + (end2 - begin2);
thread_index_t num_threads = std::min<difference_type>(std::min(end1 - begin1, end2 - begin2), get_max_threads()); const iterator_pair sequence[ 2 ] =
{ std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
difference_type borders[num_threads + 2];
equally_split(size, num_threads + 1, borders);
const iterator_pair sequence[ 2 ] = { std::make_pair(begin1, end1), std::make_pair(begin2, end2) } ;
iterator_pair block_begins[num_threads + 1];
// Very start.
block_begins[0] = std::make_pair(begin1, begin2);
difference_type length[num_threads];
OutputIterator return_value = result; OutputIterator return_value = result;
difference_type *borders;
iterator_pair *block_begins;
difference_type* lengths;
#pragma omp parallel num_threads(num_threads) thread_index_t num_threads =
{ std::min<difference_type>(get_max_threads(),
// Result from multiseq_partition. std::min(end1 - begin1, end2 - begin2));
InputIterator offset[2];
const int iam = omp_get_thread_num(); # pragma omp parallel num_threads(num_threads)
{
const difference_type rank = borders[iam + 1]; # pragma omp single
{
multiseq_partition(sequence, sequence + 2, rank, offset, op.comp); num_threads = omp_get_num_threads();
// allowed to read? borders = new difference_type[num_threads + 2];
// together equally_split(size, num_threads + 1, borders);
// *(offset[ 0 ] - 1) == *offset[ 1 ] block_begins = new iterator_pair[num_threads + 1];
if (offset[ 0 ] != begin1 && offset[ 1 ] != end2 // Very start.
&& !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ]) block_begins[0] = std::make_pair(begin1, begin2);
&& !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1))) lengths = new difference_type[num_threads];
{ } //single
// Avoid split between globally equal elements: move one to
// front in first sequence. thread_index_t iam = omp_get_thread_num();
--offset[ 0 ];
} // Result from multiseq_partition.
InputIterator offset[2];
iterator_pair block_end = block_begins[ iam + 1 ] = iterator_pair(offset[ 0 ], offset[ 1 ]); const difference_type rank = borders[iam + 1];
// Make sure all threads have their block_begin result written out. multiseq_partition(sequence, sequence + 2, rank, offset, op.comp);
#pragma omp barrier
// allowed to read?
iterator_pair block_begin = block_begins[ iam ]; // together
// *(offset[ 0 ] - 1) == *offset[ 1 ]
// Begin working for the first block, while the others except if (offset[ 0 ] != begin1 && offset[ 1 ] != end2
// the last start to count. && !op.comp(*(offset[ 0 ] - 1), *offset[ 1 ])
if (iam == 0) && !op.comp(*offset[ 1 ], *(offset[ 0 ] - 1)))
{ {
// The first thread can copy already. // Avoid split between globally equal elements: move one to
length[ iam ] = op.invoke(block_begin.first, block_end.first, block_begin.second, block_end.second, result) - result; // front in first sequence.
} --offset[ 0 ];
else }
{
length[ iam ] = op.count(block_begin.first, block_end.first, iterator_pair block_end = block_begins[ iam + 1 ] =
block_begin.second, block_end.second); iterator_pair(offset[ 0 ], offset[ 1 ]);
}
// Make sure all threads have their block_begin result written out.
// Make sure everyone wrote their lengths. # pragma omp barrier
#pragma omp barrier
iterator_pair block_begin = block_begins[ iam ];
OutputIterator r = result;
// Begin working for the first block, while the others except
if (iam == 0) // the last start to count.
{ if (iam == 0)
// Do the last block. {
for (int i = 0; i < num_threads; ++i) // The first thread can copy already.
r += length[i]; lengths[ iam ] = op.invoke(block_begin.first, block_end.first,
block_begin.second, block_end.second,
block_begin = block_begins[num_threads]; result)
- result;
// Return the result iterator of the last block. }
return_value = op.invoke(block_begin.first, end1, block_begin.second, end2, r); else
{
} lengths[ iam ] = op.count(block_begin.first, block_end.first,
else block_begin.second, block_end.second);
{ }
for (int i = 0; i < iam; ++i)
r += length[ i ]; // Make sure everyone wrote their lengths.
# pragma omp barrier
// Reset begins for copy pass.
op.invoke(block_begin.first, block_end.first, OutputIterator r = result;
block_begin.second, block_end.second, r);
} if (iam == 0)
} {
// Do the last block.
for (int i = 0; i < num_threads; ++i)
r += lengths[i];
block_begin = block_begins[num_threads];
// Return the result iterator of the last block.
return_value = op.invoke(
block_begin.first, end1, block_begin.second, end2, r);
}
else
{
for (int i = 0; i < iam; ++i)
r += lengths[ i ];
// Reset begins for copy pass.
op.invoke(block_begin.first, block_end.first,
block_begin.second, block_end.second, r);
}
}
return return_value; return return_value;
} }
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator OutputIterator
parallel_set_union(InputIterator begin1, InputIterator end1, parallel_set_union(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2, InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp) OutputIterator result, Comparator comp)
{ {
return parallel_set_operation(begin1, end1, begin2, end2, result, return parallel_set_operation(begin1, end1, begin2, end2, result,
union_func< InputIterator, OutputIterator, Comparator>(comp)); union_func< InputIterator, OutputIterator, Comparator>(comp));
} }
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator OutputIterator
parallel_set_intersection(InputIterator begin1, InputIterator end1, parallel_set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2, InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp) OutputIterator result, Comparator comp)
{ {
return parallel_set_operation(begin1, end1, begin2, end2, result, return parallel_set_operation(begin1, end1, begin2, end2, result,
intersection_func<InputIterator, OutputIterator, Comparator>(comp)); intersection_func<InputIterator, OutputIterator, Comparator>(comp));
} }
template<typename InputIterator, typename OutputIterator> template<typename InputIterator, typename OutputIterator>
OutputIterator OutputIterator
set_intersection(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result) set_intersection(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result)
{ {
typedef std::iterator_traits<InputIterator> traits_type; typedef std::iterator_traits<InputIterator> traits_type;
typedef typename traits_type::value_type value_type; typedef typename traits_type::value_type value_type;
return set_intersection(begin1, end1, begin2, end2, result, return set_intersection(begin1, end1, begin2, end2, result,
std::less<value_type>()); std::less<value_type>());
} }
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator OutputIterator
parallel_set_difference(InputIterator begin1, InputIterator end1, parallel_set_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2, InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp) OutputIterator result, Comparator comp)
{ {
return parallel_set_operation(begin1, end1, begin2, end2, result, return parallel_set_operation(begin1, end1, begin2, end2, result,
difference_func<InputIterator, OutputIterator, Comparator>(comp)); difference_func<InputIterator, OutputIterator, Comparator>(comp));
} }
template<typename InputIterator, typename OutputIterator, typename Comparator> template<
typename InputIterator,
typename OutputIterator,
typename Comparator>
OutputIterator OutputIterator
parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1, InputIterator begin2, InputIterator end2, OutputIterator result, Comparator comp) parallel_set_symmetric_difference(InputIterator begin1, InputIterator end1,
InputIterator begin2, InputIterator end2,
OutputIterator result, Comparator comp)
{ {
return parallel_set_operation(begin1, end1, begin2, end2, result, return parallel_set_operation(begin1, end1, begin2, end2, result,
symmetric_difference_func<InputIterator, OutputIterator, Comparator>(comp)); symmetric_difference_func<InputIterator, OutputIterator, Comparator>
(comp));
} }
} }
#endif // _GLIBCXX_SET_ALGORITHM_ #endif // _GLIBCXX_SET_ALGORITHM_
...@@ -44,16 +44,19 @@ ...@@ -44,16 +44,19 @@
namespace __gnu_parallel namespace __gnu_parallel
{ {
/** @brief Parallel std::unique_copy(), without explicit equality predicate. /** @brief Parallel std::unique_copy(), w/o explicit equality predicate.
* @param first Begin iterator of input sequence. * @param first Begin iterator of input sequence.
* @param last End iterator of input sequence. * @param last End iterator of input sequence.
* @param result Begin iterator of result sequence. * @param result Begin iterator of result sequence.
* @param binary_pred Equality predicate. * @param binary_pred Equality predicate.
* @return End iterator of result sequence. */ * @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator, class BinaryPredicate> template<
typename InputIterator,
class OutputIterator,
class BinaryPredicate>
inline OutputIterator inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last, parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result, BinaryPredicate binary_pred) OutputIterator result, BinaryPredicate binary_pred)
{ {
_GLIBCXX_CALL(last - first) _GLIBCXX_CALL(last - first)
...@@ -62,126 +65,136 @@ namespace __gnu_parallel ...@@ -62,126 +65,136 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
difference_type size = last - first; difference_type size = last - first;
int num_threads = __gnu_parallel::get_max_threads();
difference_type counter[num_threads + 1];
if (size == 0) if (size == 0)
return result; return result;
// Let the first thread process two parts. // Let the first thread process two parts.
difference_type borders[num_threads + 2]; difference_type *counter;
__gnu_parallel::equally_split(size, num_threads + 1, borders); difference_type *borders;
thread_index_t num_threads = get_max_threads();
// First part contains at least one element. // First part contains at least one element.
#pragma omp parallel num_threads(num_threads) # pragma omp parallel num_threads(num_threads)
{ {
int iam = omp_get_thread_num(); # pragma omp single
{
difference_type begin, end; num_threads = omp_get_num_threads();
borders = new difference_type[num_threads + 2];
// Check for length without duplicates equally_split(size, num_threads + 1, borders);
// Needed for position in output counter = new difference_type[num_threads + 1];
difference_type i = 0; }
OutputIterator out = result;
if (iam == 0) thread_index_t iam = omp_get_thread_num();
{
begin = borders[0] + 1; // == 1 difference_type begin, end;
end = borders[iam + 1];
// Check for length without duplicates
i++; // Needed for position in output
new (static_cast<void *>(&*out)) value_type(*first); difference_type i = 0;
out++; OutputIterator out = result;
for (InputIterator iter = first + begin; iter < first + end; ++iter) if (iam == 0)
{ {
if (!binary_pred(*iter, *(iter-1))) begin = borders[0] + 1; // == 1
{ end = borders[iam + 1];
i++;
new (static_cast<void *>(&*out)) value_type(*iter); i++;
out++; new (static_cast<void *>(&*out)) value_type(*first);
} out++;
}
} for (InputIterator iter = first + begin; iter < first + end; ++iter)
{
if (!binary_pred(*iter, *(iter-1)))
{
i++;
new (static_cast<void *>(&*out)) value_type(*iter);
out++;
}
}
}
else else
{ {
begin = borders[iam]; //one part begin = borders[iam]; //one part
end = borders[iam + 1]; end = borders[iam + 1];
for (InputIterator iter = first + begin; iter < first + end; ++iter) for (InputIterator iter = first + begin; iter < first + end; ++iter)
{ {
if (!binary_pred(*iter, *(iter-1))) if (!binary_pred(*iter, *(iter-1)))
{ {
i++; i++;
} }
} }
} }
counter[iam] = i; counter[iam] = i;
// Last part still untouched. // Last part still untouched.
difference_type begin_output; difference_type begin_output;
#pragma omp barrier # pragma omp barrier
// Store result in output on calculated positions. // Store result in output on calculated positions.
begin_output = 0; begin_output = 0;
if (iam == 0) if (iam == 0)
{ {
for (int t = 0; t < num_threads; t++) for (int t = 0; t < num_threads; t++)
begin_output += counter[t]; begin_output += counter[t];
i = 0; i = 0;
OutputIterator iter_out = result + begin_output; OutputIterator iter_out = result + begin_output;
begin = borders[num_threads]; begin = borders[num_threads];
end = size; end = size;
for (InputIterator iter = first + begin; iter < first + end; ++iter) for (InputIterator iter = first + begin; iter < first + end; ++iter)
{ {
if (iter == first || !binary_pred(*iter, *(iter-1))) if (iter == first || !binary_pred(*iter, *(iter-1)))
{ {
i++; i++;
new (static_cast<void *>(&*iter_out)) value_type(*iter); new (static_cast<void *>(&*iter_out)) value_type(*iter);
iter_out++; iter_out++;
} }
} }
counter[num_threads] = i; counter[num_threads] = i;
} }
else else
{ {
for (int t = 0; t < iam; t++) for (int t = 0; t < iam; t++)
begin_output += counter[t]; begin_output += counter[t];
OutputIterator iter_out = result + begin_output; OutputIterator iter_out = result + begin_output;
for (InputIterator iter = first + begin; iter < first + end; ++iter) for (InputIterator iter = first + begin; iter < first + end; ++iter)
{ {
if (!binary_pred(*iter, *(iter-1))) if (!binary_pred(*iter, *(iter-1)))
{ {
new (static_cast<void *> (&*iter_out)) value_type(*iter); new (static_cast<void *> (&*iter_out)) value_type(*iter);
iter_out++; iter_out++;
} }
} }
} }
} }
difference_type end_output = 0; difference_type end_output = 0;
for (int t = 0; t < num_threads + 1; t++) for (int t = 0; t < num_threads + 1; t++)
end_output += counter[t]; end_output += counter[t];
delete[] borders;
return result + end_output; return result + end_output;
} }
/** @brief Parallel std::unique_copy(), without explicit equality predicate /** @brief Parallel std::unique_copy(), without explicit equality predicate
* @param first Begin iterator of input sequence. * @param first Begin iterator of input sequence.
* @param last End iterator of input sequence. * @param last End iterator of input sequence.
* @param result Begin iterator of result sequence. * @param result Begin iterator of result sequence.
* @return End iterator of result sequence. */ * @return End iterator of result sequence. */
template<typename InputIterator, class OutputIterator> template<typename InputIterator, class OutputIterator>
inline OutputIterator inline OutputIterator
parallel_unique_copy(InputIterator first, InputIterator last, parallel_unique_copy(InputIterator first, InputIterator last,
OutputIterator result) OutputIterator result)
{ {
typedef typename std::iterator_traits<InputIterator>::value_type value_type; typedef typename std::iterator_traits<InputIterator>::value_type value_type;
......
...@@ -55,8 +55,8 @@ namespace __gnu_parallel ...@@ -55,8 +55,8 @@ namespace __gnu_parallel
#define _GLIBCXX_JOB_VOLATILE volatile #define _GLIBCXX_JOB_VOLATILE volatile
/** @brief One job for a certain thread. */ /** @brief One job for a certain thread. */
template<typename _DifferenceTp> template<typename _DifferenceTp>
struct Job struct Job
{ {
typedef _DifferenceTp difference_type; typedef _DifferenceTp difference_type;
...@@ -78,31 +78,38 @@ namespace __gnu_parallel ...@@ -78,31 +78,38 @@ namespace __gnu_parallel
_GLIBCXX_JOB_VOLATILE difference_type load; _GLIBCXX_JOB_VOLATILE difference_type load;
}; };
/** @brief Work stealing algorithm for random access iterators. /** @brief Work stealing algorithm for random access iterators.
* *
* Uses O(1) additional memory. Synchronization at job lists is * Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations. * done with atomic operations.
* @param begin Begin iterator of element sequence. * @param begin Begin iterator of element sequence.
* @param end End iterator of element sequence. * @param end End iterator of element sequence.
* @param op User-supplied functor (comparator, predicate, adding * @param op User-supplied functor (comparator, predicate, adding
* functor, ...). * functor, ...).
* @param f Functor to "process" an element with op (depends on * @param f Functor to "process" an element with op (depends on
* desired functionality, e. g. for std::for_each(), ...). * desired functionality, e. g. for std::for_each(), ...).
* @param r Functor to "add" a single result to the already * @param r Functor to "add" a single result to the already
* processed elements (depends on functionality). * processed elements (depends on functionality).
* @param base Base value for reduction. * @param base Base value for reduction.
* @param output Pointer to position where final result is written to * @param output Pointer to position where final result is written to
* @param bound Maximum number of elements processed (e. g. for * @param bound Maximum number of elements processed (e. g. for
* std::count_n()). * std::count_n()).
* @return User-supplied functor (that may contain a part of the result). * @return User-supplied functor (that may contain a part of the result).
*/ */
template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result> template<
typename RandomAccessIterator,
typename Op,
typename Fu,
typename Red,
typename Result>
Op Op
for_each_template_random_access_workstealing(RandomAccessIterator begin, for_each_template_random_access_workstealing(
RandomAccessIterator end, RandomAccessIterator begin,
Op op, Fu& f, Red r, RandomAccessIterator end,
Result base, Result& output, Op op, Fu& f, Red r,
typename std::iterator_traits<RandomAccessIterator>::difference_type bound) Result base, Result& output,
typename std::iterator_traits<RandomAccessIterator>::difference_type
bound)
{ {
_GLIBCXX_CALL(end - begin) _GLIBCXX_CALL(end - begin)
...@@ -110,182 +117,187 @@ namespace __gnu_parallel ...@@ -110,182 +117,187 @@ namespace __gnu_parallel
typedef typename traits_type::difference_type difference_type; typedef typename traits_type::difference_type difference_type;
difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size); difference_type chunk_size =
static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs? // How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound; difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line. // To avoid false sharing in a cache line.
const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1; const int stride =
Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working. // Total number of threads currently working.
thread_index_t busy = 0; thread_index_t busy = 0;
thread_index_t num_threads = get_max_threads();
difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin; Job<difference_type> *job;
omp_lock_t output_lock; omp_lock_t output_lock;
omp_init_lock(&output_lock); omp_init_lock(&output_lock);
// No more threads than jobs, at least one thread.
difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
num_threads = static_cast<thread_index_t>(num_threads_max);
// Create job description array.
Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
// Write base value to output. // Write base value to output.
output = base; output = base;
#pragma omp parallel shared(busy) num_threads(num_threads) // No more threads than jobs, at least one thread.
{ thread_index_t num_threads =
// Initialization phase. __gnu_parallel::max<thread_index_t>(1,
__gnu_parallel::min<difference_type>(length, get_max_threads()));
// Flags for every thread if it is doing productive work.
bool iam_working = false; # pragma omp parallel shared(busy) num_threads(num_threads)
{
// Thread id.
thread_index_t iam = omp_get_thread_num(); # pragma omp single
{
// This job. num_threads = omp_get_num_threads();
Job<difference_type>& my_job = job[iam * stride];
// Create job description array.
// Random number (for work stealing). job = new Job<difference_type>[num_threads * stride];
thread_index_t victim; }
// Local value for reduction. // Initialization phase.
Result result = Result();
// Flags for every thread if it is doing productive work.
// Number of elements to steal in one attempt. bool iam_working = false;
difference_type steal;
// Thread id.
// Every thread has its own random number generator (modulo num_threads). thread_index_t iam = omp_get_thread_num();
random_number rand_gen(iam, num_threads);
// This job.
#pragma omp atomic Job<difference_type>& my_job = job[iam * stride];
// This thread is currently working.
busy++; // Random number (for work stealing).
thread_index_t victim;
iam_working = true;
// Local value for reduction.
// How many jobs per thread? last thread gets the rest. Result result = Result();
my_job.first = static_cast<difference_type>(iam * (length / num_threads));
// Number of elements to steal in one attempt.
my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1); difference_type steal;
my_job.load = my_job.last - my_job.first + 1;
// Every thread has its own random number generator
// Init result with first value (to have a base value for reduction). // (modulo num_threads).
if (my_job.first <= my_job.last) random_number rand_gen(iam, num_threads);
{
// Cannot use volatile variable directly. // This thread is currently working.
difference_type my_first = my_job.first; # pragma omp atomic
result = f(op, begin + my_first); busy++;
my_job.first++;
my_job.load--; iam_working = true;
}
// How many jobs per thread? last thread gets the rest.
RandomAccessIterator current; my_job.first =
static_cast<difference_type>(iam * (length / num_threads));
#pragma omp barrier
my_job.last = (iam == (num_threads - 1)) ?
// Actual work phase (length - 1) : ((iam + 1) * (length / num_threads) - 1);
// Work on own or stolen start my_job.load = my_job.last - my_job.first + 1;
while (busy > 0)
{ // Init result with first value (to have a base value for reduction).
// Work until no productive thread left. if (my_job.first <= my_job.last)
#pragma omp flush(busy) {
// Cannot use volatile variable directly.
// Thread has own work to do difference_type my_first = my_job.first;
while (my_job.first <= my_job.last) result = f(op, begin + my_first);
{ my_job.first++;
// fetch-and-add call my_job.load--;
// Reserve current job block (size chunk_size) in my queue. }
difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
RandomAccessIterator current;
// Update load, to make the three values consistent,
// first might have been changed in the meantime # pragma omp barrier
my_job.load = my_job.last - my_job.first + 1;
for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++) // Actual work phase
{ // Work on own or stolen start
// Yes: process it! while (busy > 0)
current = begin + current_job; {
current_job++; // Work until no productive thread left.
# pragma omp flush(busy)
// Do actual work.
result = r(result, f(op, current)); // Thread has own work to do
} while (my_job.first <= my_job.last)
{
#pragma omp flush(busy) // fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
} difference_type current_job =
fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// After reaching this point, a thread's job list is empty.
if (iam_working) // Update load, to make the three values consistent,
{ // first might have been changed in the meantime
#pragma omp atomic my_job.load = my_job.last - my_job.first + 1;
// This thread no longer has work. for (difference_type job_counter = 0;
busy--; job_counter < chunk_size && current_job <= my_job.last;
job_counter++)
iam_working = false; {
} // Yes: process it!
current = begin + current_job;
difference_type supposed_first, supposed_last, supposed_load; current_job++;
do
{ // Do actual work.
// Find random nonempty deque (not own) and do consistency check. result = r(result, f(op, current));
yield(); }
#pragma omp flush(busy)
victim = rand_gen(); # pragma omp flush(busy)
supposed_first = job[victim * stride].first; }
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load; // After reaching this point, a thread's job list is empty.
} if (iam_working)
while (busy > 0 {
&& ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last))); // This thread no longer has work.
# pragma omp atomic
if (busy == 0) busy--;
break;
iam_working = false;
if (supposed_load > 0) }
{
// Has work and work to do. difference_type supposed_first, supposed_last, supposed_load;
// Number of elements to steal (at least one). do
steal = (supposed_load < 2) ? 1 : supposed_load / 2; {
// Find random nonempty deque (not own), do consistency check.
// Protects against stealing threads yield();
// omp_set_lock(&(job[victim * stride].lock)); # pragma omp flush(busy)
victim = rand_gen();
// Push victim's start forward. supposed_first = job[victim * stride].first;
difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal); supposed_last = job[victim * stride].last;
difference_type stolen_try = stolen_first + steal - difference_type(1); supposed_load = job[victim * stride].load;
}
// Protects against working thread while (busy > 0
// omp_unset_lock(&(job[victim * stride].lock)); && ((supposed_load <= 0)
|| ((supposed_first + supposed_load - 1) != supposed_last)));
my_job.first = stolen_first;
if (busy == 0)
// Avoid std::min dependencies. break;
my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
if (supposed_load > 0)
my_job.load = my_job.last - my_job.first + 1; {
// Has work and work to do.
//omp_unset_lock(&(my_job.lock)); // Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
#pragma omp atomic
// Has potential work again. // Push victim's start forward.
busy++; difference_type stolen_first =
iam_working = true; fetch_and_add<difference_type>(
&(job[victim * stride].first), steal);
#pragma omp flush(busy) difference_type stolen_try =
} stolen_first + steal - difference_type(1);
#pragma omp flush(busy)
} // end while busy > 0 my_job.first = stolen_first;
// Add accumulated result to output. my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
omp_set_lock(&output_lock); my_job.load = my_job.last - my_job.first + 1;
output = r(output, result);
omp_unset_lock(&output_lock); // Has potential work again.
# pragma omp atomic
//omp_destroy_lock(&(my_job.lock)); busy++;
} iam_working = true;
# pragma omp flush(busy)
}
# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
}
delete[] job; delete[] job;
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment